NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=ResNet50 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=image Channels=3 Height=224 Width=224
Conv FromTensor=image ToTensor=sevenDS ToChannels=64 FilterH=7 FilterW=7 StrideH=2 StrideW=2 PaddingH=3 PaddingW=3 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=sevenDS ToTensor=bn1 Epsilon=0.00001
Activation FromTensor=bn1 ToTensor=relu1 Kind=ReLU Param=0
Pooling FromTensor=relu1 ToTensor=pool1 Kind=Max3x3Stride2 PaddingH=1 PaddingW=1
Conv FromTensor=pool1 ToTensor=one1 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one1 ToTensor=bn2 Epsilon=0.00001
Conv FromTensor=pool1 ToTensor=one2 ToChannels=64 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one2 ToTensor=bn3 Epsilon=0.00001
Activation FromTensor=bn3 ToTensor=relu2 Kind=ReLU Param=0
Conv FromTensor=relu2 ToTensor=three1 ToChannels=64 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three1 ToTensor=bn4 Epsilon=0.00001
Activation FromTensor=bn4 ToTensor=relu3 Kind=ReLU Param=0
Conv FromTensor=relu3 ToTensor=one3 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one3 ToTensor=bn5 Epsilon=0.00001
Add FromTensor1=bn2 FromTensor2=bn5 ToTensor=add1
Activation FromTensor=add1 ToTensor=relu4 Kind=ReLU Param=0
Conv FromTensor=relu4 ToTensor=one4 ToChannels=64 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one4 ToTensor=bn6 Epsilon=0.00001
Activation FromTensor=bn6 ToTensor=relu5 Kind=ReLU Param=0
Conv FromTensor=relu5 ToTensor=three2 ToChannels=64 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three2 ToTensor=bn7 Epsilon=0.00001
Activation FromTensor=bn7 ToTensor=relu6 Kind=ReLU Param=0
Conv FromTensor=relu6 ToTensor=one5 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one5 ToTensor=bn8 Epsilon=0.00001
Add FromTensor1=relu4 FromTensor2=bn8 ToTensor=add2
Activation FromTensor=add2 ToTensor=relu7 Kind=ReLU Param=0
Conv FromTensor=relu7 ToTensor=one6 ToChannels=64 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one6 ToTensor=bn9 Epsilon=0.00001
Activation FromTensor=bn9 ToTensor=relu8 Kind=ReLU Param=0
Conv FromTensor=relu8 ToTensor=three3 ToChannels=64 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three3 ToTensor=bn10 Epsilon=0.00001
Activation FromTensor=bn10 ToTensor=relu9 Kind=ReLU Param=0
Conv FromTensor=relu9 ToTensor=one7 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one7 ToTensor=bn11 Epsilon=0.00001
Add FromTensor1=relu7 FromTensor2=bn11 ToTensor=add3
Activation FromTensor=add3 ToTensor=relu10 Kind=ReLU Param=0
Conv FromTensor=relu10 ToTensor=oneDS1 ToChannels=512 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS1 ToTensor=bn12 Epsilon=0.00001
Conv FromTensor=relu10 ToTensor=oneDS2 ToChannels=128 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS2 ToTensor=bn13 Epsilon=0.00001
Activation FromTensor=bn13 ToTensor=relu11 Kind=ReLU Param=0
Conv FromTensor=relu11 ToTensor=three4 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three4 ToTensor=bn14 Epsilon=0.00001
Activation FromTensor=bn14 ToTensor=relu12 Kind=ReLU Param=0
Conv FromTensor=relu12 ToTensor=one8 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one8 ToTensor=bn15 Epsilon=0.00001
Add FromTensor1=bn12 FromTensor2=bn15 ToTensor=add4
Activation FromTensor=add4 ToTensor=relu13 Kind=ReLU Param=0
Conv FromTensor=relu13 ToTensor=one9 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one9 ToTensor=bn16 Epsilon=0.00001
Activation FromTensor=bn16 ToTensor=relu14 Kind=ReLU Param=0
Conv FromTensor=relu14 ToTensor=three5 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three5 ToTensor=bn17 Epsilon=0.00001
Activation FromTensor=bn17 ToTensor=relu15 Kind=ReLU Param=0
Conv FromTensor=relu15 ToTensor=one10 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one10 ToTensor=bn18 Epsilon=0.00001
Add FromTensor1=relu13 FromTensor2=bn18 ToTensor=add5
Activation FromTensor=add5 ToTensor=relu16 Kind=ReLU Param=0
Conv FromTensor=relu16 ToTensor=one11 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one11 ToTensor=bn19 Epsilon=0.00001
Activation FromTensor=bn19 ToTensor=relu17 Kind=ReLU Param=0
Conv FromTensor=relu17 ToTensor=three6 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three6 ToTensor=bn20 Epsilon=0.00001
Activation FromTensor=bn20 ToTensor=relu18 Kind=ReLU Param=0
Conv FromTensor=relu18 ToTensor=one12 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one12 ToTensor=bn21 Epsilon=0.00001
Add FromTensor1=relu16 FromTensor2=bn21 ToTensor=add6
Activation FromTensor=add6 ToTensor=relu19 Kind=ReLU Param=0
Conv FromTensor=relu19 ToTensor=one13 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one13 ToTensor=bn22 Epsilon=0.00001
Activation FromTensor=bn22 ToTensor=relu20 Kind=ReLU Param=0
Conv FromTensor=relu20 ToTensor=three7 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three7 ToTensor=bn23 Epsilon=0.00001
Activation FromTensor=bn23 ToTensor=relu21 Kind=ReLU Param=0
Conv FromTensor=relu21 ToTensor=one14 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one14 ToTensor=bn24 Epsilon=0.00001
Add FromTensor1=relu19 FromTensor2=bn24 ToTensor=add7
Activation FromTensor=add7 ToTensor=relu22 Kind=ReLU Param=0
Conv FromTensor=relu22 ToTensor=oneDS3 ToChannels=1024 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS3 ToTensor=bn25 Epsilon=0.00001
Conv FromTensor=relu22 ToTensor=oneDS4 ToChannels=256 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS4 ToTensor=bn26 Epsilon=0.00001
Activation FromTensor=bn26 ToTensor=relu23 Kind=ReLU Param=0
Conv FromTensor=relu23 ToTensor=three8 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three8 ToTensor=bn27 Epsilon=0.00001
Activation FromTensor=bn27 ToTensor=relu24 Kind=ReLU Param=0
Conv FromTensor=relu24 ToTensor=one15 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one15 ToTensor=bn28 Epsilon=0.00001
Add FromTensor1=bn25 FromTensor2=bn28 ToTensor=add8
Activation FromTensor=add8 ToTensor=relu25 Kind=ReLU Param=0
Conv FromTensor=relu25 ToTensor=one16 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one16 ToTensor=bn29 Epsilon=0.00001
Activation FromTensor=bn29 ToTensor=relu26 Kind=ReLU Param=0
Conv FromTensor=relu26 ToTensor=three9 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three9 ToTensor=bn30 Epsilon=0.00001
Activation FromTensor=bn30 ToTensor=relu27 Kind=ReLU Param=0
Conv FromTensor=relu27 ToTensor=one17 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one17 ToTensor=bn31 Epsilon=0.00001
Add FromTensor1=relu25 FromTensor2=bn31 ToTensor=add9
Activation FromTensor=add9 ToTensor=relu28 Kind=ReLU Param=0
Conv FromTensor=relu28 ToTensor=one18 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one18 ToTensor=bn32 Epsilon=0.00001
Activation FromTensor=bn32 ToTensor=relu29 Kind=ReLU Param=0
Conv FromTensor=relu29 ToTensor=three10 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three10 ToTensor=bn33 Epsilon=0.00001
Activation FromTensor=bn33 ToTensor=relu30 Kind=ReLU Param=0
Conv FromTensor=relu30 ToTensor=one19 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one19 ToTensor=bn34 Epsilon=0.00001
Add FromTensor1=relu28 FromTensor2=bn34 ToTensor=add10
Activation FromTensor=add10 ToTensor=relu31 Kind=ReLU Param=0
Conv FromTensor=relu31 ToTensor=one20 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one20 ToTensor=bn35 Epsilon=0.00001
Activation FromTensor=bn35 ToTensor=relu32 Kind=ReLU Param=0
Conv FromTensor=relu32 ToTensor=three11 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three11 ToTensor=bn36 Epsilon=0.00001
Activation FromTensor=bn36 ToTensor=relu33 Kind=ReLU Param=0
Conv FromTensor=relu33 ToTensor=one21 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one21 ToTensor=bn37 Epsilon=0.00001
Add FromTensor1=relu31 FromTensor2=bn37 ToTensor=add11
Activation FromTensor=add11 ToTensor=relu34 Kind=ReLU Param=0
Conv FromTensor=relu34 ToTensor=one22 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one22 ToTensor=bn38 Epsilon=0.00001
Activation FromTensor=bn38 ToTensor=relu35 Kind=ReLU Param=0
Conv FromTensor=relu35 ToTensor=three12 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three12 ToTensor=bn39 Epsilon=0.00001
Activation FromTensor=bn39 ToTensor=relu36 Kind=ReLU Param=0
Conv FromTensor=relu36 ToTensor=one23 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one23 ToTensor=bn40 Epsilon=0.00001
Add FromTensor1=relu34 FromTensor2=bn40 ToTensor=add12
Activation FromTensor=add12 ToTensor=relu37 Kind=ReLU Param=0
Conv FromTensor=relu37 ToTensor=one24 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one24 ToTensor=bn41 Epsilon=0.00001
Activation FromTensor=bn41 ToTensor=relu38 Kind=ReLU Param=0
Conv FromTensor=relu38 ToTensor=three13 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three13 ToTensor=bn42 Epsilon=0.00001
Activation FromTensor=bn42 ToTensor=relu39 Kind=ReLU Param=0
Conv FromTensor=relu39 ToTensor=one25 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one25 ToTensor=bn43 Epsilon=0.00001
Add FromTensor1=relu37 FromTensor2=bn43 ToTensor=add13
Activation FromTensor=add13 ToTensor=relu40 Kind=ReLU Param=0
Conv FromTensor=relu40 ToTensor=oneDS5 ToChannels=2048 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS5 ToTensor=bn44 Epsilon=0.00001
Conv FromTensor=relu40 ToTensor=oneDS6 ToChannels=512 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS6 ToTensor=bn45 Epsilon=0.00001
Activation FromTensor=bn45 ToTensor=relu41 Kind=ReLU Param=0
Conv FromTensor=relu41 ToTensor=three14 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three14 ToTensor=bn46 Epsilon=0.00001
Activation FromTensor=bn46 ToTensor=relu42 Kind=ReLU Param=0
Conv FromTensor=relu42 ToTensor=one26 ToChannels=2048 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one26 ToTensor=bn47 Epsilon=0.00001
Add FromTensor1=bn44 FromTensor2=bn47 ToTensor=add14
Activation FromTensor=add14 ToTensor=relu43 Kind=ReLU Param=0
Conv FromTensor=relu43 ToTensor=one27 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one27 ToTensor=bn48 Epsilon=0.00001
Activation FromTensor=bn48 ToTensor=relu44 Kind=ReLU Param=0
Conv FromTensor=relu44 ToTensor=three15 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three15 ToTensor=bn49 Epsilon=0.00001
Activation FromTensor=bn49 ToTensor=relu45 Kind=ReLU Param=0
Conv FromTensor=relu45 ToTensor=one28 ToChannels=2048 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one28 ToTensor=bn50 Epsilon=0.00001
Add FromTensor1=relu43 FromTensor2=bn50 ToTensor=add15
Activation FromTensor=add15 ToTensor=relu46 Kind=ReLU Param=0
Conv FromTensor=relu46 ToTensor=one29 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one29 ToTensor=bn51 Epsilon=0.00001
Activation FromTensor=bn51 ToTensor=relu47 Kind=ReLU Param=0
Conv FromTensor=relu47 ToTensor=three16 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=three16 ToTensor=bn52 Epsilon=0.00001
Activation FromTensor=bn52 ToTensor=relu48 Kind=ReLU Param=0
Conv FromTensor=relu48 ToTensor=one30 ToChannels=2048 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one30 ToTensor=bn53 Epsilon=0.00001
Add FromTensor1=relu46 FromTensor2=bn53 ToTensor=add16
Activation FromTensor=add16 ToTensor=relu49 Kind=ReLU Param=0
Pooling FromTensor=relu49 ToTensor=pool2 Kind=AvgGlobal PaddingH=0 PaddingW=0
FullyConnected FromTensor=pool2 ToTensor=fc ToChannels=1000
Softmax FromTensor=fc ToTensor=prob
Output FromTensor=prob

Top || Output ResNet50.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
// size_t size = sizeof(ResNet50Params);
// ResNet50Params* to = malloc(size);
// FILE* from = fopen("ParamsFile", "rb"); // Binary mode: raw floats.
// fread(to, size, 1, from);
// fclose(from);
//
// Be careful to match endianness (and floating point format).

// Forward declaration of the trained-parameter struct. Its fields are
// arrays of float, one per parameter tensor; the struct definition
// appears later in this header.
typedef struct ResNet50Params ResNet50Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
// ResNet50Params* params = malloc(sizeof(ResNet50Params));
//
// ... Load params (read from a file, perhaps) ...
//
// ResNet50Net* net; // For example, 4 threads:
// char* err = ResNet50NetCreate(&net, params, 4);
// free(params);
//
// if (err) { // Nonzero err indicates failure; net is unmodified.
//     printf("%s\n", err); // Explain the failure, add a newline.
//     free(err); // Free the error string to avoid a memory leak.
//     exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Perform all inference that depends on net ...
//
// ResNet50NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

// Handle type for a Net. Callers obtain a ResNet50Net* from
// ResNet50NetCreate and release it with ResNet50NetDestroy.
typedef struct ResNet50Net ResNet50Net;

// Creates a Net from the trained parameters.
//
// First argument: receives the new Net on success; it is left
// unmodified on failure.
// Second argument: the trained parameters. The struct is not modified
// and is no longer needed once the Net has been created.
// threads: number of temporary threads used to create the Net (these
// threads are not used for inference).
//
// Returns zero on success. On failure returns a nonzero error string
// that the caller must free() to avoid a memory leak.
char* ResNet50NetCreate(
ResNet50Net**,
ResNet50Params*,
ptrdiff_t threads
);

// Destroys a Net to free its memory. Call this only once all
// inference that depends on the Net is complete.
void ResNet50NetDestroy(ResNet50Net*);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
// ResNet50Net* net;
//
// ... Create net ...
//
// ResNet50Engine* engine; // For example, 4 inference threads:
// char* err = ResNet50EngineCreate(&engine, net, 4);
//
// if (err) { // Nonzero err means failure; engine is unmodified.
//     printf("%s\n", err); // Explain the failure, add a newline.
//     free(err); // Free the error string to avoid a memory leak.
//
//     ... Destroy net ...
//
//     exit(1); // Exit, or propagate the failure some other way.
// }
//
// ... Use the POSIX threads API to adjust engine's threads ...
// ... Use engine to perform inference (dependent on net) ...
//
// ResNet50EngineDestroy(engine); // Terminate threads, free memory.
//
// ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
// pthread_t thread; // The first thread has index 0:
// char* err = ResNet50EnginePthreadT(engine, 0, &thread);
//
// assert(!err); // Can only fail if the thread index is invalid.
//
// pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
// float* imageData = malloc(sizeof(float)*3*224*224);
// float* probData = malloc(sizeof(float)*1000*1*1);
//
// for (...) { // Reuse the input and output tensors.
//
//     ... Write the input floats ...
//
//     ResNet50EngineInference( // This function cannot fail.
//         engine, // Pass an Engine as the first argument.
//         imageData, // The tensor arguments are sorted by name.
//         probData
//     );
//
//     ... Read the output floats ...
//
// }
//
// free(imageData);
// free(probData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

// Handle type for an Engine. Callers obtain a ResNet50Engine* from
// ResNet50EngineCreate and release it with ResNet50EngineDestroy.
typedef struct ResNet50Engine ResNet50Engine;

// Creates an Engine that performs inference using the given Net.
//
// First argument: receives the new Engine on success; it is left
// unmodified on failure.
// Second argument: the Net the Engine will point to. Any number of
// Engines can share the same Net because the Net is never modified.
// threads: number of inference threads. For best performance this
// should not exceed the number of CPU cores.
//
// Returns zero on success. On failure returns a nonzero error string
// that the caller must free() to avoid a memory leak.
char* ResNet50EngineCreate(
ResNet50Engine**,
ResNet50Net*,
ptrdiff_t threads
);

// Retrieves the pthread_t identifier of one of the Engine's threads so
// that the POSIX threads API can be used to adjust it (for example, to
// set its CPU affinity mask).
//
// threadIdx: index of the thread; an Engine with N threads has indices
// 0, 1, 2, ..., N-1.
// to: receives the pthread_t associated with threadIdx.
//
// Returns zero on success. Can only fail if the thread index is
// invalid.
// NOTE(review): a nonzero return is presumably an error string that
// must be freed, as with the Create functions -- confirm against the
// implementation.
char* ResNet50EnginePthreadT(
ResNet50Engine*,
ptrdiff_t threadIdx,
pthread_t* to
);

// Performs one inference. This function cannot fail.
//
// imageData: input tensor, 3x224x224 floats in CHW format, fully
// packed (no padding), owned by the caller.
// probData: output tensor, 1000x1x1 floats in CHW format, fully
// packed, owned by the caller; written by this call.
//
// The tensor parameters are ordered by name (bytewise lexical sort).
void ResNet50EngineInference(
ResNet50Engine*,
float* imageData,
float* probData
);

// Terminates the Engine's threads and frees its memory. The Net used
// by the Engine is destroyed separately (ResNet50NetDestroy).
void ResNet50EngineDestroy(ResNet50Engine*);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

struct ResNet50Params {
float bn10Means[64]; // 1x64x1x1
float bn10Scales[64]; // 1x64x1x1
float bn10Shifts[64]; // 1x64x1x1
float bn10Variances[64]; // 1x64x1x1
float bn11Means[256]; // 1x256x1x1
float bn11Scales[256]; // 1x256x1x1
float bn11Shifts[256]; // 1x256x1x1
float bn11Variances[256]; // 1x256x1x1
float bn12Means[512]; // 1x512x1x1
float bn12Scales[512]; // 1x512x1x1
float bn12Shifts[512]; // 1x512x1x1
float bn12Variances[512]; // 1x512x1x1
float bn13Means[128]; // 1x128x1x1
float bn13Scales[128]; // 1x128x1x1
float bn13Shifts[128]; // 1x128x1x1
float bn13Variances[128]; // 1x128x1x1
float bn14Means[128]; // 1x128x1x1
float bn14Scales[128]; // 1x128x1x1
float bn14Shifts[128]; // 1x128x1x1
float bn14Variances[128]; // 1x128x1x1
float bn15Means[512]; // 1x512x1x1
float bn15Scales[512]; // 1x512x1x1
float bn15Shifts[512]; // 1x512x1x1
float bn15Variances[512]; // 1x512x1x1
float bn16Means[128]; // 1x128x1x1
float bn16Scales[128]; // 1x128x1x1
float bn16Shifts[128]; // 1x128x1x1
float bn16Variances[128]; // 1x128x1x1
float bn17Means[128]; // 1x128x1x1
float bn17Scales[128]; // 1x128x1x1
float bn17Shifts[128]; // 1x128x1x1
float bn17Variances[128]; // 1x128x1x1
float bn18Means[512]; // 1x512x1x1
float bn18Scales[512]; // 1x512x1x1
float bn18Shifts[512]; // 1x512x1x1
float bn18Variances[512]; // 1x512x1x1
float bn19Means[128]; // 1x128x1x1
float bn19Scales[128]; // 1x128x1x1
float bn19Shifts[128]; // 1x128x1x1
float bn19Variances[128]; // 1x128x1x1
float bn1Means[64]; // 1x64x1x1
float bn1Scales[64]; // 1x64x1x1
float bn1Shifts[64]; // 1x64x1x1
float bn1Variances[64]; // 1x64x1x1
float bn20Means[128]; // 1x128x1x1
float bn20Scales[128]; // 1x128x1x1
float bn20Shifts[128]; // 1x128x1x1
float bn20Variances[128]; // 1x128x1x1
float bn21Means[512]; // 1x512x1x1
float bn21Scales[512]; // 1x512x1x1
float bn21Shifts[512]; // 1x512x1x1
float bn21Variances[512]; // 1x512x1x1
float bn22Means[128]; // 1x128x1x1
float bn22Scales[128]; // 1x128x1x1
float bn22Shifts[128]; // 1x128x1x1
float bn22Variances[128]; // 1x128x1x1
float bn23Means[128]; // 1x128x1x1
float bn23Scales[128]; // 1x128x1x1
float bn23Shifts[128]; // 1x128x1x1
float bn23Variances[128]; // 1x128x1x1
float bn24Means[512]; // 1x512x1x1
float bn24Scales[512]; // 1x512x1x1
float bn24Shifts[512]; // 1x512x1x1
float bn24Variances[512]; // 1x512x1x1
float bn25Means[1024]; // 1x1024x1x1
float bn25Scales[1024]; // 1x1024x1x1
float bn25Shifts[1024]; // 1x1024x1x1
float bn25Variances[1024]; // 1x1024x1x1
float bn26Means[256]; // 1x256x1x1
float bn26Scales[256]; // 1x256x1x1
float bn26Shifts[256]; // 1x256x1x1
float bn26Variances[256]; // 1x256x1x1
float bn27Means[256]; // 1x256x1x1
float bn27Scales[256]; // 1x256x1x1
float bn27Shifts[256]; // 1x256x1x1
float bn27Variances[256]; // 1x256x1x1
float bn28Means[1024]; // 1x1024x1x1
float bn28Scales[1024]; // 1x1024x1x1
float bn28Shifts[1024]; // 1x1024x1x1
float bn28Variances[1024]; // 1x1024x1x1
float bn29Means[256]; // 1x256x1x1
float bn29Scales[256]; // 1x256x1x1
float bn29Shifts[256]; // 1x256x1x1
float bn29Variances[256]; // 1x256x1x1
float bn2Means[256]; // 1x256x1x1
float bn2Scales[256]; // 1x256x1x1
float bn2Shifts[256]; // 1x256x1x1
float bn2Variances[256]; // 1x256x1x1
float bn30Means[256]; // 1x256x1x1
float bn30Scales[256]; // 1x256x1x1
float bn30Shifts[256]; // 1x256x1x1
float bn30Variances[256]; // 1x256x1x1
float bn31Means[1024]; // 1x1024x1x1
float bn31Scales[1024]; // 1x1024x1x1
float bn31Shifts[1024]; // 1x1024x1x1
float bn31Variances[1024]; // 1x1024x1x1
float bn32Means[256]; // 1x256x1x1
float bn32Scales[256]; // 1x256x1x1
float bn32Shifts[256]; // 1x256x1x1
float bn32Variances[256]; // 1x256x1x1
float bn33Means[256]; // 1x256x1x1
float bn33Scales[256]; // 1x256x1x1
float bn33Shifts[256]; // 1x256x1x1
float bn33Variances[256]; // 1x256x1x1
float bn34Means[1024]; // 1x1024x1x1
float bn34Scales[1024]; // 1x1024x1x1
float bn34Shifts[1024]; // 1x1024x1x1
float bn34Variances[1024]; // 1x1024x1x1
float bn35Means[256]; // 1x256x1x1
float bn35Scales[256]; // 1x256x1x1
float bn35Shifts[256]; // 1x256x1x1
float bn35Variances[256]; // 1x256x1x1
float bn36Means[256]; // 1x256x1x1
float bn36Scales[256]; // 1x256x1x1
float bn36Shifts[256]; // 1x256x1x1
float bn36Variances[256]; // 1x256x1x1
float bn37Means[1024]; // 1x1024x1x1
float bn37Scales[1024]; // 1x1024x1x1
float bn37Shifts[1024]; // 1x1024x1x1
float bn37Variances[1024]; // 1x1024x1x1
float bn38Means[256]; // 1x256x1x1
float bn38Scales[256]; // 1x256x1x1
float bn38Shifts[256]; // 1x256x1x1
float bn38Variances[256]; // 1x256x1x1
float bn39Means[256]; // 1x256x1x1
float bn39Scales[256]; // 1x256x1x1
float bn39Shifts[256]; // 1x256x1x1
float bn39Variances[256]; // 1x256x1x1
float bn3Means[64]; // 1x64x1x1
float bn3Scales[64]; // 1x64x1x1
float bn3Shifts[64]; // 1x64x1x1
float bn3Variances[64]; // 1x64x1x1
float bn40Means[1024]; // 1x1024x1x1
float bn40Scales[1024]; // 1x1024x1x1
float bn40Shifts[1024]; // 1x1024x1x1
float bn40Variances[1024]; // 1x1024x1x1
float bn41Means[256]; // 1x256x1x1
float bn41Scales[256]; // 1x256x1x1
float bn41Shifts[256]; // 1x256x1x1
float bn41Variances[256]; // 1x256x1x1
float bn42Means[256]; // 1x256x1x1
float bn42Scales[256]; // 1x256x1x1
float bn42Shifts[256]; // 1x256x1x1
float bn42Variances[256]; // 1x256x1x1
float bn43Means[1024]; // 1x1024x1x1
float bn43Scales[1024]; // 1x1024x1x1
float bn43Shifts[1024]; // 1x1024x1x1
float bn43Variances[1024]; // 1x1024x1x1
float bn44Means[2048]; // 1x2048x1x1
float bn44Scales[2048]; // 1x2048x1x1
float bn44Shifts[2048]; // 1x2048x1x1
float bn44Variances[2048]; // 1x2048x1x1
float bn45Means[512]; // 1x512x1x1
float bn45Scales[512]; // 1x512x1x1
float bn45Shifts[512]; // 1x512x1x1
float bn45Variances[512]; // 1x512x1x1
float bn46Means[512]; // 1x512x1x1
float bn46Scales[512]; // 1x512x1x1
float bn46Shifts[512]; // 1x512x1x1
float bn46Variances[512]; // 1x512x1x1
float bn47Means[2048]; // 1x2048x1x1
float bn47Scales[2048]; // 1x2048x1x1
float bn47Shifts[2048]; // 1x2048x1x1
float bn47Variances[2048]; // 1x2048x1x1
float bn48Means[512]; // 1x512x1x1
float bn48Scales[512]; // 1x512x1x1
float bn48Shifts[512]; // 1x512x1x1
float bn48Variances[512]; // 1x512x1x1
float bn49Means[512]; // 1x512x1x1
float bn49Scales[512]; // 1x512x1x1
float bn49Shifts[512]; // 1x512x1x1
float bn49Variances[512]; // 1x512x1x1
float bn4Means[64]; // 1x64x1x1
float bn4Scales[64]; // 1x64x1x1
float bn4Shifts[64]; // 1x64x1x1
float bn4Variances[64]; // 1x64x1x1
float bn50Means[2048]; // 1x2048x1x1
float bn50Scales[2048]; // 1x2048x1x1
float bn50Shifts[2048]; // 1x2048x1x1
float bn50Variances[2048]; // 1x2048x1x1
float bn51Means[512]; // 1x512x1x1
float bn51Scales[512]; // 1x512x1x1
float bn51Shifts[512]; // 1x512x1x1
float bn51Variances[512]; // 1x512x1x1
float bn52Means[512]; // 1x512x1x1
float bn52Scales[512]; // 1x512x1x1
float bn52Shifts[512]; // 1x512x1x1
float bn52Variances[512]; // 1x512x1x1
float bn53Means[2048]; // 1x2048x1x1
float bn53Scales[2048]; // 1x2048x1x1
float bn53Shifts[2048]; // 1x2048x1x1
float bn53Variances[2048]; // 1x2048x1x1
float bn5Means[256]; // 1x256x1x1
float bn5Scales[256]; // 1x256x1x1
float bn5Shifts[256]; // 1x256x1x1
float bn5Variances[256]; // 1x256x1x1
float bn6Means[64]; // 1x64x1x1
float bn6Scales[64]; // 1x64x1x1
float bn6Shifts[64]; // 1x64x1x1
float bn6Variances[64]; // 1x64x1x1
float bn7Means[64]; // 1x64x1x1
float bn7Scales[64]; // 1x64x1x1
float bn7Shifts[64]; // 1x64x1x1
float bn7Variances[64]; // 1x64x1x1
float bn8Means[256]; // 1x256x1x1
float bn8Scales[256]; // 1x256x1x1
float bn8Shifts[256]; // 1x256x1x1
float bn8Variances[256]; // 1x256x1x1
float bn9Means[64]; // 1x64x1x1
float bn9Scales[64]; // 1x64x1x1
float bn9Shifts[64]; // 1x64x1x1
float bn9Variances[64]; // 1x64x1x1
float fcBiases[1000]; // 1x1000x1x1
float fcWeights[2048000]; // 1000x2048x1x1
float one10Biases[512]; // 1x512x1x1
float one10Weights[65536]; // 512x128x1x1
float one11Biases[128]; // 1x128x1x1
float one11Weights[65536]; // 128x512x1x1
float one12Biases[512]; // 1x512x1x1
float one12Weights[65536]; // 512x128x1x1
float one13Biases[128]; // 1x128x1x1
float one13Weights[65536]; // 128x512x1x1
float one14Biases[512]; // 1x512x1x1
float one14Weights[65536]; // 512x128x1x1
float one15Biases[1024]; // 1x1024x1x1
float one15Weights[262144]; // 1024x256x1x1
float one16Biases[256]; // 1x256x1x1
float one16Weights[262144]; // 256x1024x1x1
float one17Biases[1024]; // 1x1024x1x1
float one17Weights[262144]; // 1024x256x1x1
float one18Biases[256]; // 1x256x1x1
float one18Weights[262144]; // 256x1024x1x1
float one19Biases[1024]; // 1x1024x1x1
float one19Weights[262144]; // 1024x256x1x1
float one1Biases[256]; // 1x256x1x1
float one1Weights[16384]; // 256x64x1x1
float one20Biases[256]; // 1x256x1x1
float one20Weights[262144]; // 256x1024x1x1
float one21Biases[1024]; // 1x1024x1x1
float one21Weights[262144]; // 1024x256x1x1
float one22Biases[256]; // 1x256x1x1
float one22Weights[262144]; // 256x1024x1x1
float one23Biases[1024]; // 1x1024x1x1
float one23Weights[262144]; // 1024x256x1x1
float one24Biases[256]; // 1x256x1x1
float one24Weights[262144]; // 256x1024x1x1
float one25Biases[1024]; // 1x1024x1x1
float one25Weights[262144]; // 1024x256x1x1
float one26Biases[2048]; // 1x2048x1x1
float one26Weights[1048576]; // 2048x512x1x1
float one27Biases[512]; // 1x512x1x1
float one27Weights[1048576]; // 512x2048x1x1
float one28Biases[2048]; // 1x2048x1x1
float one28Weights[1048576]; // 2048x512x1x1
float one29Biases[512]; // 1x512x1x1
float one29Weights[1048576]; // 512x2048x1x1
float one2Biases[64]; // 1x64x1x1
float one2Weights[4096]; // 64x64x1x1
float one30Biases[2048]; // 1x2048x1x1
float one30Weights[1048576]; // 2048x512x1x1
float one3Biases[256]; // 1x256x1x1
float one3Weights[16384]; // 256x64x1x1
float one4Biases[64]; // 1x64x1x1
float one4Weights[16384]; // 64x256x1x1
float one5Biases[256]; // 1x256x1x1
float one5Weights[16384]; // 256x64x1x1
float one6Biases[64]; // 1x64x1x1
float one6Weights[16384]; // 64x256x1x1
float one7Biases[256]; // 1x256x1x1
float one7Weights[16384]; // 256x64x1x1
float one8Biases[512]; // 1x512x1x1
float one8Weights[65536]; // 512x128x1x1
float one9Biases[128]; // 1x128x1x1
float one9Weights[65536]; // 128x512x1x1
float oneDS1Biases[512]; // 1x512x1x1
float oneDS1Weights[131072]; // 512x256x1x1
float oneDS2Biases[128]; // 1x128x1x1
float oneDS2Weights[32768]; // 128x256x1x1
float oneDS3Biases[1024]; // 1x1024x1x1
float oneDS3Weights[524288]; // 1024x512x1x1
float oneDS4Biases[256]; // 1x256x1x1
float oneDS4Weights[131072]; // 256x512x1x1
float oneDS5Biases[2048]; // 1x2048x1x1
float oneDS5Weights[2097152]; // 2048x1024x1x1
float oneDS6Biases[512]; // 1x512x1x1
float oneDS6Weights[524288]; // 512x1024x1x1
float sevenDSBiases[64]; // 1x64x1x1
float sevenDSWeights[9408]; // 64x3x7x7
float three10Biases[256]; // 1x256x1x1
float three10Weights[589824]; // 256x256x3x3
float three11Biases[256]; // 1x256x1x1
float three11Weights[589824]; // 256x256x3x3
float three12Biases[256]; // 1x256x1x1
float three12Weights[589824]; // 256x256x3x3
float three13Biases[256]; // 1x256x1x1
float three13Weights[589824]; // 256x256x3x3
float three14Biases[512]; // 1x512x1x1
float three14Weights[2359296]; // 512x512x3x3
float three15Biases[512]; // 1x512x1x1
float three15Weights[2359296]; // 512x512x3x3
float three16Biases[512]; // 1x512x1x1
float three16Weights[2359296]; // 512x512x3x3
float three1Biases[64]; // 1x64x1x1
float three1Weights[36864]; // 64x64x3x3
float three2Biases[64]; // 1x64x1x1
float three2Weights[36864]; // 64x64x3x3
float three3Biases[64]; // 1x64x1x1
float three3Weights[36864]; // 64x64x3x3
float three4Biases[128]; // 1x128x1x1
float three4Weights[147456]; // 128x128x3x3
float three5Biases[128]; // 1x128x1x1
float three5Weights[147456]; // 128x128x3x3
float three6Biases[128]; // 1x128x1x1
float three6Weights[147456]; // 128x128x3x3
float three7Biases[128]; // 1x128x1x1
float three7Weights[147456]; // 128x128x3x3
float three8Biases[256]; // 1x256x1x1
float three8Weights[589824]; // 256x256x3x3
float three9Biases[256]; // 1x256x1x1
float three9Weights[589824]; // 256x256x3x3
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output ResNet50.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f ResNet50.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "ResNet50.h"

// Formats an error report into a freshly malloc'd 276-byte buffer and
// returns it. The text is "ResNet50: line <lineNum1>: " followed by the
// printf-style expansion of format1 and its variadic arguments, truncated
// by vsnprintf to whatever space is left in the buffer.
static char* ResNet50Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
    enum { kCapacity = 276 };
    char* text = malloc(kCapacity);
    // The prefix length tells us where the caller's message begins.
    int used = sprintf(text, "ResNet50: line %td: ", lineNum1);
    va_list args;
    va_start(args, format1);
    vsnprintf(text+used, kCapacity-used, format1, args);
    va_end(args);
    return text;
}

// Forward declarations for the thread-team types whose struct bodies follow.
typedef struct ResNet50ThreaderTask1 ResNet50ThreaderTask1;
// Function invoked by a worker for each point of a task: it receives the
// task itself and a pointer to an int64_t array holding the point
// (ResNet50ThreaderMain1 passes a private 4-element copy).
typedef void (*ResNet50ThreaderCallee1)(ResNet50ThreaderTask1*, int64_t*);
typedef struct ResNet50ThreaderHub1 ResNet50ThreaderHub1;
typedef struct ResNet50ThreaderNode1 ResNet50ThreaderNode1;
typedef struct ResNet50ThreaderUnwind1 ResNet50ThreaderUnwind1;
typedef struct ResNet50ThreaderTeam1 ResNet50ThreaderTeam1;

// Description of one unit of parallel work handed to ResNet50ThreaderDo1.
struct ResNet50ThreaderTask1 {
// Function called once per point of the iteration space.
ResNet50ThreaderCallee1 callee1;
// Opaque pointer; not read by the threader code visible here
// (presumably context for callee1 -- confirm against the callees).
void* any1;
// Number of leading entries of hull1 that are in use.
ptrdiff_t nd1;
// Per-dimension extents; ResNet50ThreaderDo1 multiplies the first nd1
// entries together to get the total number of points.
int64_t hull1[4];
};

// State shared by the whole team and used to detect task completion.
struct ResNet50ThreaderHub1 {
// Locked by ResNet50ThreaderDo1 and ResNet50ThreaderMain1 around every
// access to the fields below.
pthread_mutex_t mut1;
// Signaled by the worker that brings pending1 to zero;
// ResNet50ThreaderDo1 waits on it.
pthread_cond_t cond1;
// Set to the thread count when a task is posted and decremented by each
// worker once it has finished scanning for work.
ptrdiff_t pending1;
// Saved scan position in status1: word index ...
ptrdiff_t offset1;
// ... and bit mask applied to that word.
long mask1;
// Bitset (flexible array member). ResNet50ThreaderDo1 sets every word to
// all ones; a worker clears a node's bit after draining that node's points.
long status1[];
};

// Per-thread state; one node per worker thread, 64-byte aligned.
struct ResNet50ThreaderNode1 {
// Locked around accesses to np1, pt1 and task1.
pthread_mutex_t mut2;
// Number of points still to be run from this node's share. A negative
// value (set by ResNet50ThreaderDestroy1) tells the worker to exit.
int64_t np1;
// Next point to run; advanced with ResNet50ThreaderInc1 as points are taken.
int64_t pt1[4];
// Task currently posted to this node, or null when there is none.
ResNet50ThreaderTask1* task1;
// Signaled after task1 has been set so a waiting worker wakes up.
pthread_cond_t cond2;
// Back pointer to the owning team.
ResNet50ThreaderTeam1* team1;
// Handle of the worker thread created for this node.
pthread_t thr1;
} __attribute__((aligned(64)));

// Record of how far team construction got, consumed by
// ResNet50ThreaderDestroy1 to undo exactly what was set up.
struct ResNet50ThreaderUnwind1 {
// Number of leading nodes whose threads were created and must be joined.
ptrdiff_t join1;
// Number of leading nodes whose condition variables must be destroyed.
ptrdiff_t nodeConds1;
// Number of leading nodes whose mutexes must be destroyed.
ptrdiff_t nodeMuts1;
// Nonzero once the hub condition variable has been initialized.
ptrdiff_t hubCond1;
// Nonzero once the hub mutex has been initialized.
ptrdiff_t hubMut1;
// Unaligned pointer returned by malloc for the node array (passed to free).
void* nodes1;
// Unaligned pointer returned by malloc for the hub (passed to free).
void* hub1;
};

// A team of worker threads plus the bookkeeping needed to tear it down.
struct ResNet50ThreaderTeam1 {
// Number of threads (and nodes) in the team.
ptrdiff_t nt1;
// 64-byte-aligned hub, located inside the allocation held in unwind1.hub1.
ResNet50ThreaderHub1* hub2;
// 64-byte-aligned array of nt1 nodes, located inside the allocation held
// in unwind1.nodes1.
ResNet50ThreaderNode1* nodes2;
// Construction progress used by ResNet50ThreaderDestroy1.
ResNet50ThreaderUnwind1 unwind1;
};

// Advances the point pt2 by one step inside the box described by hull2,
// treating index 0 as the fastest-moving coordinate. A coordinate that
// reaches its extent wraps to zero and the step carries into the next
// coordinate; if every one of the nd2 coordinates wraps, the point ends
// up at all zeros.
static void ResNet50ThreaderInc1(
    ptrdiff_t nd2,
    int64_t*restrict hull2,
    int64_t*restrict pt2
) {
    ptrdiff_t axis = 0;
    while (axis < nd2) {
        int64_t bumped = pt2[axis]+1;
        if (bumped != hull2[axis]) {
            // No wrap on this axis: store and stop carrying.
            pt2[axis] = bumped;
            return;
        }
        pt2[axis] = 0;
        ++axis;
    }
}

// Writes the linear offset val1 into pt3 as mixed-radix digits, using the
// extents in hull3 as the radices (index 0 is the least significant
// digit). Digits are produced until the value is exhausted; any remaining
// coordinates up to nd3 are set to zero.
static void ResNet50ThreaderPut1(
    ptrdiff_t nd3,
    int64_t*restrict hull3,
    int64_t*restrict pt3,
    int64_t val1
) {
    ptrdiff_t axis = 0;
    while (axis < nd3 && val1 != 0) {
        int64_t radix = hull3[axis];
        int64_t quotient = val1/radix;
        // Remainder of the division is this axis's digit.
        pt3[axis] = val1-quotient*radix;
        val1 = quotient;
        ++axis;
    }
    while (axis < nd3) {
        pt3[axis] = 0;
        ++axis;
    }
}

// Adds the mixed-radix number plus1 (and an incoming carry2) to the point
// pt4 in place, coordinate by coordinate, with the extents in hull4 as the
// radices. A coordinate whose sum reaches its extent is reduced by that
// extent once and a carry of one is passed to the next coordinate.
static void ResNet50ThreaderAdd1(
    ptrdiff_t nd4,
    int64_t*restrict hull4,
    int64_t*restrict pt4,
    int64_t*restrict plus1,
    int64_t carry2
) {
    for (ptrdiff_t axis = 0; axis < nd4; ++axis) {
        int64_t radix = hull4[axis];
        int64_t total = pt4[axis]+plus1[axis]+carry2;
        carry2 = total >= radix;
        pt4[axis] = carry2 ? total-radix : total;
    }
}

// Entry point of every worker thread (passed to pthread_create).
// arg1 is the thread's own ResNet50ThreaderNode1. The thread sleeps on its
// node until a task is posted, runs the points assigned to its node, then
// runs leftover points of other nodes that the hub bitset still marks as
// unfinished, and finally reports completion through hub->pending1.
// Returns 0 when the node's point count is found negative (shutdown).
// Every pthread call is retried in a loop until it returns zero.
static void* ResNet50ThreaderMain1(void* arg1) {
ResNet50ThreaderNode1* node1 = arg1;
ResNet50ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
ResNet50ThreaderHub1* hub3 = team2->hub2;
ResNet50ThreaderNode1* nodes3 = team2->nodes2;
// Index of this node within the team's node array.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
// Invariant at the top of each iteration: node1->mut2 is held.
for (; ; ) {
ResNet50ThreaderTask1* task2 = node1->task1;
if (!task2) {
// Nothing posted yet: sleep until signaled, then re-check.
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// Negative count is the shutdown request from ResNet50ThreaderDestroy1.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Consume the posting so the next loop iteration waits for a new one.
node1->task1 = 0;
ResNet50ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Run this node's own points. Each point is claimed (copied, count
// decremented, node point advanced) under the node mutex, and the callee
// runs with the mutex released so other threads can claim from this node.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
ResNet50ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// Own share is exhausted: clear this node's bit in the hub bitset.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
// Scan the bitset (starting at the shared offset/mask) for nodes whose bit
// is still set and help run their remaining points.
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
// No candidate in this word: move to the next word, unmasked.
++offset2;
mask2 = -1;
continue;
}
// Bit index of the lowest candidate in this word.
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Bit nt2 corresponds to no node and is never cleared by this function,
// so reaching it means the end of the bitset. Restart once from the
// beginning; a second arrival without finding work ends the scan.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Isolate the chosen bit and remove it from the saved scan mask.
hand1 &= -hand1;
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
ResNet50ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Same claim/run loop as above, but against the other node's state.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
ResNet50ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// That node has no points left: clear its bit and resume scanning from
// the current shared position.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// This worker is done with the task; the last one to finish wakes the
// thread waiting in ResNet50ThreaderDo1.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
// Re-establish the loop invariant before waiting for the next task.
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Tears down a team, including one that was only partially constructed.
// The counts and flags in team3->unwind1 say how many threads to stop and
// join, how many node condition variables and mutexes to destroy, and
// whether the hub's condition variable and mutex exist. The node array,
// the hub and the team itself are then freed. A null team3 is a no-op.
// Every pthread call is retried until it returns zero.
static void ResNet50ThreaderDestroy1(ResNet50ThreaderTeam1* team3) {
    if (!team3) return;
    ResNet50ThreaderNode1* const base = team3->nodes2;
    const ResNet50ThreaderUnwind1* const undo = &team3->unwind1;

    // Ask every running worker to exit: a negative point count together
    // with a non-null task pointer makes the worker return.
    const ptrdiff_t running = undo->join1;
    for (ptrdiff_t i = 0; i != running; ++i) {
        ResNet50ThreaderNode1* const cur = base+i;
        while (__builtin_expect(pthread_mutex_lock(&cur->mut2), 0)) {}
        cur->np1 = -1;
        cur->task1 = (ResNet50ThreaderTask1*)1;
        while (__builtin_expect(pthread_mutex_unlock(&cur->mut2), 0)) {}
        while (__builtin_expect(pthread_cond_signal(&cur->cond2), 0)) {}
    }
    for (ptrdiff_t i = 0; i != running; ++i) {
        while (__builtin_expect(pthread_join(base[i].thr1, 0), 0)) {}
    }

    // Destroy the per-node synchronization objects that were initialized.
    const ptrdiff_t conds = undo->nodeConds1;
    for (ptrdiff_t i = 0; i != conds; ++i) {
        while (__builtin_expect(pthread_cond_destroy(&base[i].cond2), 0)) {}
    }
    const ptrdiff_t muts = undo->nodeMuts1;
    for (ptrdiff_t i = 0; i != muts; ++i) {
        while (__builtin_expect(pthread_mutex_destroy(&base[i].mut2), 0)) {}
    }

    // Then the hub's, if they were initialized.
    ResNet50ThreaderHub1* const hub = team3->hub2;
    if (undo->hubCond1) {
        while (__builtin_expect(pthread_cond_destroy(&hub->cond1), 0)) {}
    }
    if (undo->hubMut1) {
        while (__builtin_expect(pthread_mutex_destroy(&hub->mut1), 0)) {}
    }

    // Release the raw allocations, the team last.
    free(undo->nodes1);
    free(undo->hub1);
    free(team3);
}

// Final construction step: for each of the nt7 nodes, initializes the
// node's mutex and condition variable and starts its worker thread.
// On the first failure, records in team8->unwind1 how many mutexes,
// condition variables and threads exist so far and returns an error
// message built by ResNet50Errmsg1. On success records nt7 for all three
// counts and returns 0.
static char* ResNet50ThreaderCreate1Up4(ResNet50ThreaderTeam1* team8, ptrdiff_t nt7) {
    ResNet50ThreaderNode1* const base = team8->nodes2;
    ResNet50ThreaderUnwind1* const undo = &team8->unwind1;
    for (ptrdiff_t idx = 0; idx != nt7; ++idx) {
        ResNet50ThreaderNode1* const cur = base+idx;

        const int mutRc = pthread_mutex_init(&cur->mut2, 0);
        if (__builtin_expect(mutRc, 0)) {
            char* report = ResNet50Errmsg1(__LINE__, "errno %d", mutRc);
            // Nothing of this node exists yet.
            undo->nodeMuts1 = idx;
            undo->nodeConds1 = idx;
            undo->join1 = idx;
            return report;
        }
        cur->task1 = 0;

        const int condRc = pthread_cond_init(&cur->cond2, 0);
        if (__builtin_expect(condRc, 0)) {
            char* report = ResNet50Errmsg1(__LINE__, "errno %d", condRc);
            // This node has a mutex but no condition variable.
            undo->nodeMuts1 = idx+1;
            undo->nodeConds1 = idx;
            undo->join1 = idx;
            return report;
        }
        cur->team1 = team8;

        const int thrRc = pthread_create(&cur->thr1, 0, ResNet50ThreaderMain1, cur);
        if (__builtin_expect(thrRc, 0)) {
            char* report = ResNet50Errmsg1(__LINE__, "errno %d", thrRc);
            // This node has both sync objects but no thread to join.
            undo->nodeMuts1 = idx+1;
            undo->nodeConds1 = idx+1;
            undo->join1 = idx;
            return report;
        }
    }
    undo->nodeMuts1 = nt7;
    undo->nodeConds1 = nt7;
    undo->join1 = nt7;
    return 0;
}

// Third construction step: initializes the hub's mutex and condition
// variable, flagging each in team7->unwind1 once it exists, then continues
// with ResNet50ThreaderCreate1Up4. Returns an error message from
// ResNet50Errmsg1 if either initialization fails, otherwise whatever the
// next step returns.
static char* ResNet50ThreaderCreate1Up3(ResNet50ThreaderTeam1* team7, ptrdiff_t nt6) {
    ResNet50ThreaderHub1* const hub = team7->hub2;

    const int mutRc = pthread_mutex_init(&hub->mut1, 0);
    if (__builtin_expect(mutRc != 0, 0)) {
        return ResNet50Errmsg1(__LINE__, "errno %d", mutRc);
    }
    team7->unwind1.hubMut1 = 1;

    const int condRc = pthread_cond_init(&hub->cond1, 0);
    if (__builtin_expect(condRc != 0, 0)) {
        return ResNet50Errmsg1(__LINE__, "errno %d", condRc);
    }
    team7->unwind1.hubCond1 = 1;

    return ResNet50ThreaderCreate1Up4(team7, nt6);
}

// Second construction step: allocates the node array for nt5 threads with
// 63 bytes of slack, stores the raw pointer in team6->unwind1.nodes1 (for
// free) and the 64-byte-aligned pointer in team6->nodes2, then continues
// with ResNet50ThreaderCreate1Up3.
// Returns a "too many threads" message if the array size does not fit in
// size_t, an "errno" message if malloc fails, otherwise whatever the next
// step returns.
static char* ResNet50ThreaderCreate1Up2(ResNet50ThreaderTeam1* team6, ptrdiff_t nt5) {
size_t size2 = nt5*sizeof(ResNet50ThreaderNode1);
if (__builtin_expect(size2/sizeof(ResNet50ThreaderNode1) != (size_t)nt5, 0)) {
return ResNet50Errmsg1(__LINE__, "too many threads");
}
// The alignment slack is added to the request below; make sure that
// addition cannot wrap around and yield an undersized allocation.
if (__builtin_expect(size2 > SIZE_MAX-63, 0)) {
return ResNet50Errmsg1(__LINE__, "too many threads");
}
void* addr3 = malloc(size2+63);
if (__builtin_expect(!addr3, 0)) {
return ResNet50Errmsg1(__LINE__, "errno %d", errno);
}
team6->unwind1.nodes1 = addr3;
// Round the raw address up to the next multiple of 64.
team6->nodes2 = (void*)(((size_t)addr3+63)&-64);
return ResNet50ThreaderCreate1Up3(team6, nt5);
}

// First construction step: records the thread count, then allocates the
// hub with room for its status bitset (one long per sizeof(long)*8
// threads, plus one extra word). The size is rounded up to a multiple of
// 64 and 63 bytes of slack are added so the hub can be placed on a
// 64-byte boundary. The raw pointer goes to team5->unwind1.hub1 (for
// free), the aligned one to team5->hub2. Continues with
// ResNet50ThreaderCreate1Up2; returns an "errno" message if malloc fails.
static char* ResNet50ThreaderCreate1Up1(ResNet50ThreaderTeam1* team5, ptrdiff_t nt4) {
    team5->nt1 = nt4;

    const size_t bitsPerWord = sizeof(long)*8;
    const size_t words = (size_t)nt4/bitsPerWord+1;
    size_t bytes = sizeof(ResNet50ThreaderHub1)+sizeof(long)*words;
    bytes = (bytes+63)&~(size_t)63;

    void* raw = malloc(bytes+63);
    if (__builtin_expect(raw == 0, 0)) {
        return ResNet50Errmsg1(__LINE__, "errno %d", errno);
    }
    team5->unwind1.hub1 = raw;
    team5->hub2 = (void*)(((size_t)raw+63)&~(size_t)63);
    return ResNet50ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads.
// On success stores the new team in *team4 and returns 0. On failure
// returns an error message built by ResNet50Errmsg1, leaves *team4
// untouched, and releases anything that was partially constructed.
// nt3 must be at least 1.
static char* ResNet50ThreaderCreate1(ResNet50ThreaderTeam1** team4, ptrdiff_t nt3) {
    if (__builtin_expect(nt3 < 1, 0)) {
        return ResNet50Errmsg1(__LINE__, "too few threads");
    }
    // Zero-filled so the unwind record starts out describing "nothing built".
    ResNet50ThreaderTeam1* fresh = calloc(1, sizeof(ResNet50ThreaderTeam1));
    if (__builtin_expect(fresh == 0, 0)) {
        return ResNet50Errmsg1(__LINE__, "errno %d", errno);
    }
    char* failure = ResNet50ThreaderCreate1Up1(fresh, nt3);
    if (__builtin_expect(failure != 0, 0)) {
        ResNet50ThreaderDestroy1(fresh);
        return failure;
    }
    *team4 = fresh;
    return failure;
}

// Copies the pthread_t of the worker at position idx1 in team9 into *thr2.
// Returns 0 on success, or an error message built by ResNet50Errmsg1 when
// idx1 is outside [0, team9->nt1).
static char* ResNet50ThreaderPthreadT1(
    pthread_t* thr2,
    ResNet50ThreaderTeam1* team9,
    ptrdiff_t idx1
) {
    const int inRange = idx1 >= 0 && idx1 < team9->nt1;
    if (__builtin_expect(!inRange, 0)) {
        return ResNet50Errmsg1(__LINE__, "bad thread idx");
    }
    ResNet50ThreaderNode1* const picked = team9->nodes2+idx1;
    *thr2 = picked->thr1;
    return 0;
}

// Runs task3 on team10 and returns once every worker has reported
// completion. The iteration space is the box given by the first nd1
// entries of task3->hull1; its points are split into contiguous runs, one
// per node, and the workers are woken. Does nothing if nd1 < 1.
// Every pthread call is retried until it returns zero.
static void ResNet50ThreaderDo1(ResNet50ThreaderTeam1* team10, ResNet50ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points: product of the extents in use.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
// Every node gets each1 points; the first more1 nodes get one extra.
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// each1 expressed as a point offset, used to step from one node's
// starting point to the next.
int64_t plus2[4];
ResNet50ThreaderPut1(nd6, task3->hull1, plus2, each1);
// Starting point of the node currently being set up.
int64_t pt6[4] = {0};
ResNet50ThreaderHub1* hub6 = team10->hub2;
// The hub mutex is taken before any worker is woken and is only released
// inside the wait below, so a worker that finishes early blocks on the hub
// lock until the hub fields have been initialized.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
ResNet50ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
// 1 for the nodes that take an extra point, else 0.
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the starting point by the number of points just handed out.
ResNet50ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the shared scan position and mark every status bit as set.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Sleep until each of the nt8 workers has decremented pending1.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

static __m512 ResNet50Exp1(__m512 x1) {
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f));
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

// Softmax over 1000 contiguous floats.
// tensors155[0] is the input and tensors155[1] the output; both are
// addressed as 62 full 16-float vectors followed by one 8-float tail
// (vector index 62, lane mask 255). team90 is unused.
// Three passes: (1) find the maximum, (2) write exp(x - max) to the output
// while accumulating the sum, (3) scale the output by 1/sum.
static void ResNet50Softmax1(ResNet50ThreaderTeam1* team90, char** tensors155) {
    (void)team90;
    char*restrict src = tensors155[0];
    char*restrict dst = tensors155[1];
    const ptrdiff_t vecBytes = 64;
    const ptrdiff_t fullVecs = 62;

    // Lane shuffles used to fold a vector down to lane 0: upper half onto
    // lower half, then 4 lanes, 2 lanes, 1 lane; lane0 broadcasts the result.
    const __m512i foldHalf = _mm512_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m512i foldQuarter = _mm512_setr_epi32(4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m512i foldPair = _mm512_setr_epi32(2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m512i foldOne = _mm512_setr_epi32(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m512i lane0 = _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

    // Pass 1: running maxima in 16 independent accumulators.
    __m512 peak[16];
    for (ptrdiff_t k = 0; k < 16; ++k) {
        peak[k] = _mm512_maskz_loadu_ps(65535, src+vecBytes*k);
    }
    for (ptrdiff_t group = 1; group <= 2; ++group) {
        for (ptrdiff_t k = 0; k < 16; ++k) {
            __m512 item = _mm512_maskz_loadu_ps(65535, src+vecBytes*k+vecBytes*16*group);
            peak[k] = _mm512_max_ps(peak[k], item);
        }
    }
    // Last group holds only 14 full vectors (48..61) ...
    for (ptrdiff_t k = 0; k < 14; ++k) {
        __m512 item = _mm512_maskz_loadu_ps(65535, src+vecBytes*k+vecBytes*16*3);
        peak[k] = _mm512_max_ps(peak[k], item);
    }
    // ... plus the 8-lane tail, merged into the last accumulator.
    __m512 tailIn = _mm512_maskz_loadu_ps(255, src+vecBytes*fullVecs);
    peak[15] = _mm512_mask_max_ps(peak[15], 255, peak[15], tailIn);
    // Pairwise tree over the accumulators: 16 -> 8 -> 4 -> 2 -> 1.
    for (ptrdiff_t span = 8; span >= 1; span >>= 1) {
        for (ptrdiff_t k = 0; k < span; ++k) {
            peak[k] = _mm512_max_ps(peak[k], peak[k+span]);
        }
    }
    // Fold the 16 lanes down to lane 0 and broadcast.
    __m512 top = peak[0];
    top = _mm512_mask_max_ps(top, 255, top, _mm512_permutexvar_ps(foldHalf, top));
    top = _mm512_mask_max_ps(top, 15, top, _mm512_permutexvar_ps(foldQuarter, top));
    top = _mm512_mask_max_ps(top, 3, top, _mm512_permutexvar_ps(foldPair, top));
    top = _mm512_mask_max_ps(top, 1, top, _mm512_permutexvar_ps(foldOne, top));
    top = _mm512_permutexvar_ps(lane0, top);

    // Pass 2: exponentials and their sum, walking from the tail vector
    // down to vector 0 with a single accumulator.
    __m512 total = _mm512_setzero_ps();
    const __m512 shift = _mm512_sub_ps(total, top);
    __m512 tailExp = _mm512_maskz_loadu_ps(255, src+vecBytes*fullVecs);
    tailExp = ResNet50Exp1(_mm512_add_ps(shift, tailExp));
    total = _mm512_mask_add_ps(total, 255, total, tailExp);
    _mm512_mask_storeu_ps(dst+vecBytes*fullVecs, 255, tailExp);
    for (ptrdiff_t v = fullVecs-1; v >= 0; --v) {
        __m512 item = _mm512_maskz_loadu_ps(65535, src+vecBytes*v);
        item = ResNet50Exp1(_mm512_add_ps(shift, item));
        total = _mm512_add_ps(total, item);
        _mm512_mask_storeu_ps(dst+vecBytes*v, 65535, item);
    }
    // Fold the lane sums down to lane 0 and broadcast.
    total = _mm512_mask_add_ps(total, 255, total, _mm512_permutexvar_ps(foldHalf, total));
    total = _mm512_mask_add_ps(total, 15, total, _mm512_permutexvar_ps(foldQuarter, total));
    total = _mm512_mask_add_ps(total, 3, total, _mm512_permutexvar_ps(foldPair, total));
    total = _mm512_mask_add_ps(total, 1, total, _mm512_permutexvar_ps(foldOne, total));
    total = _mm512_permutexvar_ps(lane0, total);

    // Pass 3: normalize the stored exponentials in place.
    const __m512 inverse = _mm512_div_ps(_mm512_set1_ps(1e+00f), total);
    for (ptrdiff_t v = 0; v < fullVecs; ++v) {
        __m512 item = _mm512_maskz_loadu_ps(65535, dst+vecBytes*v);
        item = _mm512_mul_ps(inverse, item);
        _mm512_mask_storeu_ps(dst+vecBytes*v, 65535, item);
    }
    __m512 tailOut = _mm512_maskz_loadu_ps(255, dst+vecBytes*fullVecs);
    tailOut = _mm512_mul_ps(inverse, tailOut);
    _mm512_mask_storeu_ps(dst+vecBytes*fullVecs, 255, tailOut);
}

static __m512 ResNet50Rsqrt1(__m512 x2) {
__m512 y2 = _mm512_rsqrt14_ps(x2);
__m512 z1 = _mm512_mul_ps(x2, y2);
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f));
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f));
return _mm512_mul_ps(a1, b1);
}

static void ResNet50BnSimplify1(
float*restrict means1,
float*restrict variances1,
float*restrict scales1,
float*restrict shifts1,
char*restrict mas1
) {
__m512 eps1 = _mm512_set1_ps(1e-05f);
__m512i xlo1 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi1 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
__m512 va1 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*0);
__m512 va2 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*1);
__m512 va3 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*2);
__m512 va4 = _mm512_loadu_ps(variances1+(ptrdiff_t)16*3);
__m512 rcp1 = ResNet50Rsqrt1(_mm512_add_ps(eps1, va1));
__m512 rcp2 = ResNet50Rsqrt1(_mm512_add_ps(eps1, va2));
__m512 rcp3 = ResNet50Rsqrt1(_mm512_add_ps(eps1, va3));
__m512 rcp4 = ResNet50Rsqrt1(_mm512_add_ps(eps1, va4));
__m512 sc1 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*0);
__m512 sc2 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*1);
__m512 sc3 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*2);
__m512 sc4 = _mm512_loadu_ps(scales1+(ptrdiff_t)16*3);
__m512 mul1 = _mm512_mul_ps(rcp1, sc1);
__m512 mul2 = _mm512_mul_ps(rcp2, sc2);
__m512 mul3 = _mm512_mul_ps(rcp3, sc3);
__m512 mul4 = _mm512_mul_ps(rcp4, sc4);
__m512 me1 = _mm512_loadu_ps(means1+(ptrdiff_t)16*0);
__m512 me2 = _mm512_loadu_ps(means1+(ptrdiff_t)16*1);
__m512 me3 = _mm512_loadu_ps(means1+(ptrdiff_t)16*2);
__m512 me4 = _mm512_loadu_ps(means1+(ptrdiff_t)16*3);
__m512 sh1 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*0);
__m512 sh2 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*1);
__m512 sh3 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*2);
__m512 sh4 = _mm512_loadu_ps(shifts1+(ptrdiff_t)16*3);
__m512 add1 = _mm512_fnmadd_ps(me1, mul1, sh1);
__m512 add2 = _mm512_fnmadd_ps(me2, mul2, sh2);
__m512 add3 = _mm512_fnmadd_ps(me3, mul3, sh3);
__m512 add4 = _mm512_fnmadd_ps(me4, mul4, sh4);
__m512 lo1 = _mm512_permutex2var_ps(mul1, xlo1, add1);
__m512 lo2 = _mm512_permutex2var_ps(mul2, xlo1, add2);
__m512 lo3 = _mm512_permutex2var_ps(mul3, xlo1, add3);
__m512 lo4 = _mm512_permutex2var_ps(mul4, xlo1, add4);
__m512 hi1 = _mm512_permutex2var_ps(mul1, xhi1, add1);
__m512 hi2 = _mm512_permutex2var_ps(mul2, xhi1, add2);
__m512 hi3 = _mm512_permutex2var_ps(mul3, xhi1, add3);
__m512 hi4 = _mm512_permutex2var_ps(mul4, xhi1, add4);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*0, lo1);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*1, hi1);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*2, lo2);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*3, hi2);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*4, lo3);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*5, hi3);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*6, lo4);
_mm512_storeu_ps(mas1+(ptrdiff_t)64*7, hi4);
}

static void ResNet50BnSimplify2(
float*restrict means2,
float*restrict variances2,
float*restrict scales2,
float*restrict shifts2,
char*restrict mas3
) {
__m512 eps2 = _mm512_set1_ps(1e-05f);
__m512i xlo2 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi2 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i11 = 0; i11 < 3; ++i11) {
__m512 va5 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*0+(ptrdiff_t)80*i11);
__m512 va6 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*1+(ptrdiff_t)80*i11);
__m512 va7 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*2+(ptrdiff_t)80*i11);
__m512 va8 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*3+(ptrdiff_t)80*i11);
__m512 va9 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*4+(ptrdiff_t)80*i11);
__m512 rcp5 = ResNet50Rsqrt1(_mm512_add_ps(eps2, va5));
__m512 rcp6 = ResNet50Rsqrt1(_mm512_add_ps(eps2, va6));
__m512 rcp7 = ResNet50Rsqrt1(_mm512_add_ps(eps2, va7));
__m512 rcp8 = ResNet50Rsqrt1(_mm512_add_ps(eps2, va8));
__m512 rcp9 = ResNet50Rsqrt1(_mm512_add_ps(eps2, va9));
__m512 sc5 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*0+(ptrdiff_t)80*i11);
__m512 sc6 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*1+(ptrdiff_t)80*i11);
__m512 sc7 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*2+(ptrdiff_t)80*i11);
__m512 sc8 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*3+(ptrdiff_t)80*i11);
__m512 sc9 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*4+(ptrdiff_t)80*i11);
__m512 mul5 = _mm512_mul_ps(rcp5, sc5);
__m512 mul6 = _mm512_mul_ps(rcp6, sc6);
__m512 mul7 = _mm512_mul_ps(rcp7, sc7);
__m512 mul8 = _mm512_mul_ps(rcp8, sc8);
__m512 mul9 = _mm512_mul_ps(rcp9, sc9);
__m512 me5 = _mm512_loadu_ps(means2+(ptrdiff_t)16*0+(ptrdiff_t)80*i11);
__m512 me6 = _mm512_loadu_ps(means2+(ptrdiff_t)16*1+(ptrdiff_t)80*i11);
__m512 me7 = _mm512_loadu_ps(means2+(ptrdiff_t)16*2+(ptrdiff_t)80*i11);
__m512 me8 = _mm512_loadu_ps(means2+(ptrdiff_t)16*3+(ptrdiff_t)80*i11);
__m512 me9 = _mm512_loadu_ps(means2+(ptrdiff_t)16*4+(ptrdiff_t)80*i11);
__m512 sh5 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*0+(ptrdiff_t)80*i11);
__m512 sh6 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*1+(ptrdiff_t)80*i11);
__m512 sh7 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*2+(ptrdiff_t)80*i11);
__m512 sh8 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*3+(ptrdiff_t)80*i11);
__m512 sh9 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*4+(ptrdiff_t)80*i11);
__m512 add5 = _mm512_fnmadd_ps(me5, mul5, sh5);
__m512 add6 = _mm512_fnmadd_ps(me6, mul6, sh6);
__m512 add7 = _mm512_fnmadd_ps(me7, mul7, sh7);
__m512 add8 = _mm512_fnmadd_ps(me8, mul8, sh8);
__m512 add9 = _mm512_fnmadd_ps(me9, mul9, sh9);
__m512 lo5 = _mm512_permutex2var_ps(mul5, xlo2, add5);
__m512 lo6 = _mm512_permutex2var_ps(mul6, xlo2, add6);
__m512 lo7 = _mm512_permutex2var_ps(mul7, xlo2, add7);
__m512 lo8 = _mm512_permutex2var_ps(mul8, xlo2, add8);
__m512 lo9 = _mm512_permutex2var_ps(mul9, xlo2, add9);
__m512 hi5 = _mm512_permutex2var_ps(mul5, xhi2, add5);
__m512 hi6 = _mm512_permutex2var_ps(mul6, xhi2, add6);
__m512 hi7 = _mm512_permutex2var_ps(mul7, xhi2, add7);
__m512 hi8 = _mm512_permutex2var_ps(mul8, xhi2, add8);
__m512 hi9 = _mm512_permutex2var_ps(mul9, xhi2, add9);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*0+(ptrdiff_t)640*i11, lo5);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*1+(ptrdiff_t)640*i11, hi5);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*2+(ptrdiff_t)640*i11, lo6);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*3+(ptrdiff_t)640*i11, hi6);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*4+(ptrdiff_t)640*i11, lo7);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*5+(ptrdiff_t)640*i11, hi7);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*6+(ptrdiff_t)640*i11, lo8);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*7+(ptrdiff_t)640*i11, hi8);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*8+(ptrdiff_t)640*i11, lo9);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*9+(ptrdiff_t)640*i11, hi9);
}
__m512 va10 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*0+(ptrdiff_t)80*3);
__m512 rcp10 = ResNet50Rsqrt1(_mm512_add_ps(eps2, va10));
__m512 sc10 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*0+(ptrdiff_t)80*3);
__m512 mul10 = _mm512_mul_ps(rcp10, sc10);
__m512 me10 = _mm512_loadu_ps(means2+(ptrdiff_t)16*0+(ptrdiff_t)80*3);
__m512 sh10 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*0+(ptrdiff_t)80*3);
__m512 add10 = _mm512_fnmadd_ps(me10, mul10, sh10);
__m512 lo10 = _mm512_permutex2var_ps(mul10, xlo2, add10);
__m512 hi10 = _mm512_permutex2var_ps(mul10, xhi2, add10);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*0+(ptrdiff_t)640*3, lo10);
_mm512_storeu_ps(mas3+(ptrdiff_t)64*1+(ptrdiff_t)640*3, hi10);
}

static void ResNet50BnSimplify3(
float*restrict means3,
float*restrict variances3,
float*restrict scales3,
float*restrict shifts3,
char*restrict mas6
) {
__m512 eps3 = _mm512_set1_ps(1e-05f);
__m512i xlo3 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi3 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i29 = 0; i29 < 6; ++i29) {
__m512 va11 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*0+(ptrdiff_t)80*i29);
__m512 va12 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*1+(ptrdiff_t)80*i29);
__m512 va13 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*2+(ptrdiff_t)80*i29);
__m512 va14 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*3+(ptrdiff_t)80*i29);
__m512 va15 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*4+(ptrdiff_t)80*i29);
__m512 rcp11 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va11));
__m512 rcp12 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va12));
__m512 rcp13 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va13));
__m512 rcp14 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va14));
__m512 rcp15 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va15));
__m512 sc11 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*0+(ptrdiff_t)80*i29);
__m512 sc12 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*1+(ptrdiff_t)80*i29);
__m512 sc13 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*2+(ptrdiff_t)80*i29);
__m512 sc14 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*3+(ptrdiff_t)80*i29);
__m512 sc15 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*4+(ptrdiff_t)80*i29);
__m512 mul11 = _mm512_mul_ps(rcp11, sc11);
__m512 mul12 = _mm512_mul_ps(rcp12, sc12);
__m512 mul13 = _mm512_mul_ps(rcp13, sc13);
__m512 mul14 = _mm512_mul_ps(rcp14, sc14);
__m512 mul15 = _mm512_mul_ps(rcp15, sc15);
__m512 me11 = _mm512_loadu_ps(means3+(ptrdiff_t)16*0+(ptrdiff_t)80*i29);
__m512 me12 = _mm512_loadu_ps(means3+(ptrdiff_t)16*1+(ptrdiff_t)80*i29);
__m512 me13 = _mm512_loadu_ps(means3+(ptrdiff_t)16*2+(ptrdiff_t)80*i29);
__m512 me14 = _mm512_loadu_ps(means3+(ptrdiff_t)16*3+(ptrdiff_t)80*i29);
__m512 me15 = _mm512_loadu_ps(means3+(ptrdiff_t)16*4+(ptrdiff_t)80*i29);
__m512 sh11 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*0+(ptrdiff_t)80*i29);
__m512 sh12 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*1+(ptrdiff_t)80*i29);
__m512 sh13 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*2+(ptrdiff_t)80*i29);
__m512 sh14 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*3+(ptrdiff_t)80*i29);
__m512 sh15 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*4+(ptrdiff_t)80*i29);
__m512 add11 = _mm512_fnmadd_ps(me11, mul11, sh11);
__m512 add12 = _mm512_fnmadd_ps(me12, mul12, sh12);
__m512 add13 = _mm512_fnmadd_ps(me13, mul13, sh13);
__m512 add14 = _mm512_fnmadd_ps(me14, mul14, sh14);
__m512 add15 = _mm512_fnmadd_ps(me15, mul15, sh15);
__m512 lo11 = _mm512_permutex2var_ps(mul11, xlo3, add11);
__m512 lo12 = _mm512_permutex2var_ps(mul12, xlo3, add12);
__m512 lo13 = _mm512_permutex2var_ps(mul13, xlo3, add13);
__m512 lo14 = _mm512_permutex2var_ps(mul14, xlo3, add14);
__m512 lo15 = _mm512_permutex2var_ps(mul15, xlo3, add15);
__m512 hi11 = _mm512_permutex2var_ps(mul11, xhi3, add11);
__m512 hi12 = _mm512_permutex2var_ps(mul12, xhi3, add12);
__m512 hi13 = _mm512_permutex2var_ps(mul13, xhi3, add13);
__m512 hi14 = _mm512_permutex2var_ps(mul14, xhi3, add14);
__m512 hi15 = _mm512_permutex2var_ps(mul15, xhi3, add15);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*0+(ptrdiff_t)640*i29, lo11);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*1+(ptrdiff_t)640*i29, hi11);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*2+(ptrdiff_t)640*i29, lo12);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*3+(ptrdiff_t)640*i29, hi12);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*4+(ptrdiff_t)640*i29, lo13);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*5+(ptrdiff_t)640*i29, hi13);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*6+(ptrdiff_t)640*i29, lo14);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*7+(ptrdiff_t)640*i29, hi14);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*8+(ptrdiff_t)640*i29, lo15);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*9+(ptrdiff_t)640*i29, hi15);
}
__m512 va16 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*0+(ptrdiff_t)80*6);
__m512 va17 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*1+(ptrdiff_t)80*6);
__m512 rcp16 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va16));
__m512 rcp17 = ResNet50Rsqrt1(_mm512_add_ps(eps3, va17));
__m512 sc16 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*0+(ptrdiff_t)80*6);
__m512 sc17 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*1+(ptrdiff_t)80*6);
__m512 mul16 = _mm512_mul_ps(rcp16, sc16);
__m512 mul17 = _mm512_mul_ps(rcp17, sc17);
__m512 me16 = _mm512_loadu_ps(means3+(ptrdiff_t)16*0+(ptrdiff_t)80*6);
__m512 me17 = _mm512_loadu_ps(means3+(ptrdiff_t)16*1+(ptrdiff_t)80*6);
__m512 sh16 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*0+(ptrdiff_t)80*6);
__m512 sh17 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*1+(ptrdiff_t)80*6);
__m512 add16 = _mm512_fnmadd_ps(me16, mul16, sh16);
__m512 add17 = _mm512_fnmadd_ps(me17, mul17, sh17);
__m512 lo16 = _mm512_permutex2var_ps(mul16, xlo3, add16);
__m512 lo17 = _mm512_permutex2var_ps(mul17, xlo3, add17);
__m512 hi16 = _mm512_permutex2var_ps(mul16, xhi3, add16);
__m512 hi17 = _mm512_permutex2var_ps(mul17, xhi3, add17);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*0+(ptrdiff_t)640*6, lo16);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*1+(ptrdiff_t)640*6, hi16);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*2+(ptrdiff_t)640*6, lo17);
_mm512_storeu_ps(mas6+(ptrdiff_t)64*3+(ptrdiff_t)640*6, hi17);
}

static void ResNet50BnSimplify4(
float*restrict means4,
float*restrict variances4,
float*restrict scales4,
float*restrict shifts4,
char*restrict mas7
) {
__m512 eps4 = _mm512_set1_ps(1e-05f);
__m512i xlo4 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi4 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i30 = 0; i30 < 1; ++i30) {
__m512 va18 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 va19 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 va20 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 va21 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 va22 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 rcp18 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va18));
__m512 rcp19 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va19));
__m512 rcp20 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va20));
__m512 rcp21 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va21));
__m512 rcp22 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va22));
__m512 sc18 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 sc19 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 sc20 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 sc21 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 sc22 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 mul18 = _mm512_mul_ps(rcp18, sc18);
__m512 mul19 = _mm512_mul_ps(rcp19, sc19);
__m512 mul20 = _mm512_mul_ps(rcp20, sc20);
__m512 mul21 = _mm512_mul_ps(rcp21, sc21);
__m512 mul22 = _mm512_mul_ps(rcp22, sc22);
__m512 me18 = _mm512_loadu_ps(means4+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 me19 = _mm512_loadu_ps(means4+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 me20 = _mm512_loadu_ps(means4+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 me21 = _mm512_loadu_ps(means4+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 me22 = _mm512_loadu_ps(means4+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 sh18 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 sh19 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 sh20 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 sh21 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 sh22 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 add18 = _mm512_fnmadd_ps(me18, mul18, sh18);
__m512 add19 = _mm512_fnmadd_ps(me19, mul19, sh19);
__m512 add20 = _mm512_fnmadd_ps(me20, mul20, sh20);
__m512 add21 = _mm512_fnmadd_ps(me21, mul21, sh21);
__m512 add22 = _mm512_fnmadd_ps(me22, mul22, sh22);
__m512 lo18 = _mm512_permutex2var_ps(mul18, xlo4, add18);
__m512 lo19 = _mm512_permutex2var_ps(mul19, xlo4, add19);
__m512 lo20 = _mm512_permutex2var_ps(mul20, xlo4, add20);
__m512 lo21 = _mm512_permutex2var_ps(mul21, xlo4, add21);
__m512 lo22 = _mm512_permutex2var_ps(mul22, xlo4, add22);
__m512 hi18 = _mm512_permutex2var_ps(mul18, xhi4, add18);
__m512 hi19 = _mm512_permutex2var_ps(mul19, xhi4, add19);
__m512 hi20 = _mm512_permutex2var_ps(mul20, xhi4, add20);
__m512 hi21 = _mm512_permutex2var_ps(mul21, xhi4, add21);
__m512 hi22 = _mm512_permutex2var_ps(mul22, xhi4, add22);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*0+(ptrdiff_t)640*i30, lo18);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*1+(ptrdiff_t)640*i30, hi18);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*2+(ptrdiff_t)640*i30, lo19);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*3+(ptrdiff_t)640*i30, hi19);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*4+(ptrdiff_t)640*i30, lo20);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*5+(ptrdiff_t)640*i30, hi20);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*6+(ptrdiff_t)640*i30, lo21);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*7+(ptrdiff_t)640*i30, hi21);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*8+(ptrdiff_t)640*i30, lo22);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*9+(ptrdiff_t)640*i30, hi22);
}
__m512 va23 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 va24 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*1+(ptrdiff_t)80*1);
__m512 va25 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*2+(ptrdiff_t)80*1);
__m512 rcp23 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va23));
__m512 rcp24 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va24));
__m512 rcp25 = ResNet50Rsqrt1(_mm512_add_ps(eps4, va25));
__m512 sc23 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 sc24 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*1+(ptrdiff_t)80*1);
__m512 sc25 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*2+(ptrdiff_t)80*1);
__m512 mul23 = _mm512_mul_ps(rcp23, sc23);
__m512 mul24 = _mm512_mul_ps(rcp24, sc24);
__m512 mul25 = _mm512_mul_ps(rcp25, sc25);
__m512 me23 = _mm512_loadu_ps(means4+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 me24 = _mm512_loadu_ps(means4+(ptrdiff_t)16*1+(ptrdiff_t)80*1);
__m512 me25 = _mm512_loadu_ps(means4+(ptrdiff_t)16*2+(ptrdiff_t)80*1);
__m512 sh23 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 sh24 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*1+(ptrdiff_t)80*1);
__m512 sh25 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*2+(ptrdiff_t)80*1);
__m512 add23 = _mm512_fnmadd_ps(me23, mul23, sh23);
__m512 add24 = _mm512_fnmadd_ps(me24, mul24, sh24);
__m512 add25 = _mm512_fnmadd_ps(me25, mul25, sh25);
__m512 lo23 = _mm512_permutex2var_ps(mul23, xlo4, add23);
__m512 lo24 = _mm512_permutex2var_ps(mul24, xlo4, add24);
__m512 lo25 = _mm512_permutex2var_ps(mul25, xlo4, add25);
__m512 hi23 = _mm512_permutex2var_ps(mul23, xhi4, add23);
__m512 hi24 = _mm512_permutex2var_ps(mul24, xhi4, add24);
__m512 hi25 = _mm512_permutex2var_ps(mul25, xhi4, add25);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*0+(ptrdiff_t)640*1, lo23);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*1+(ptrdiff_t)640*1, hi23);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*2+(ptrdiff_t)640*1, lo24);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*3+(ptrdiff_t)640*1, hi24);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*4+(ptrdiff_t)640*1, lo25);
_mm512_storeu_ps(mas7+(ptrdiff_t)64*5+(ptrdiff_t)640*1, hi25);
}

static void ResNet50BnSimplify5(
float*restrict means5,
float*restrict variances5,
float*restrict scales5,
float*restrict shifts5,
char*restrict mas10
) {
__m512 eps5 = _mm512_set1_ps(1e-05f);
__m512i xlo5 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi5 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i48 = 0; i48 < 12; ++i48) {
__m512 va26 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*0+(ptrdiff_t)80*i48);
__m512 va27 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*1+(ptrdiff_t)80*i48);
__m512 va28 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*2+(ptrdiff_t)80*i48);
__m512 va29 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*3+(ptrdiff_t)80*i48);
__m512 va30 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*4+(ptrdiff_t)80*i48);
__m512 rcp26 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va26));
__m512 rcp27 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va27));
__m512 rcp28 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va28));
__m512 rcp29 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va29));
__m512 rcp30 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va30));
__m512 sc26 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*0+(ptrdiff_t)80*i48);
__m512 sc27 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*1+(ptrdiff_t)80*i48);
__m512 sc28 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*2+(ptrdiff_t)80*i48);
__m512 sc29 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*3+(ptrdiff_t)80*i48);
__m512 sc30 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*4+(ptrdiff_t)80*i48);
__m512 mul26 = _mm512_mul_ps(rcp26, sc26);
__m512 mul27 = _mm512_mul_ps(rcp27, sc27);
__m512 mul28 = _mm512_mul_ps(rcp28, sc28);
__m512 mul29 = _mm512_mul_ps(rcp29, sc29);
__m512 mul30 = _mm512_mul_ps(rcp30, sc30);
__m512 me26 = _mm512_loadu_ps(means5+(ptrdiff_t)16*0+(ptrdiff_t)80*i48);
__m512 me27 = _mm512_loadu_ps(means5+(ptrdiff_t)16*1+(ptrdiff_t)80*i48);
__m512 me28 = _mm512_loadu_ps(means5+(ptrdiff_t)16*2+(ptrdiff_t)80*i48);
__m512 me29 = _mm512_loadu_ps(means5+(ptrdiff_t)16*3+(ptrdiff_t)80*i48);
__m512 me30 = _mm512_loadu_ps(means5+(ptrdiff_t)16*4+(ptrdiff_t)80*i48);
__m512 sh26 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*0+(ptrdiff_t)80*i48);
__m512 sh27 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*1+(ptrdiff_t)80*i48);
__m512 sh28 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*2+(ptrdiff_t)80*i48);
__m512 sh29 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*3+(ptrdiff_t)80*i48);
__m512 sh30 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*4+(ptrdiff_t)80*i48);
__m512 add26 = _mm512_fnmadd_ps(me26, mul26, sh26);
__m512 add27 = _mm512_fnmadd_ps(me27, mul27, sh27);
__m512 add28 = _mm512_fnmadd_ps(me28, mul28, sh28);
__m512 add29 = _mm512_fnmadd_ps(me29, mul29, sh29);
__m512 add30 = _mm512_fnmadd_ps(me30, mul30, sh30);
__m512 lo26 = _mm512_permutex2var_ps(mul26, xlo5, add26);
__m512 lo27 = _mm512_permutex2var_ps(mul27, xlo5, add27);
__m512 lo28 = _mm512_permutex2var_ps(mul28, xlo5, add28);
__m512 lo29 = _mm512_permutex2var_ps(mul29, xlo5, add29);
__m512 lo30 = _mm512_permutex2var_ps(mul30, xlo5, add30);
__m512 hi26 = _mm512_permutex2var_ps(mul26, xhi5, add26);
__m512 hi27 = _mm512_permutex2var_ps(mul27, xhi5, add27);
__m512 hi28 = _mm512_permutex2var_ps(mul28, xhi5, add28);
__m512 hi29 = _mm512_permutex2var_ps(mul29, xhi5, add29);
__m512 hi30 = _mm512_permutex2var_ps(mul30, xhi5, add30);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*0+(ptrdiff_t)640*i48, lo26);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*1+(ptrdiff_t)640*i48, hi26);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*2+(ptrdiff_t)640*i48, lo27);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*3+(ptrdiff_t)640*i48, hi27);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*4+(ptrdiff_t)640*i48, lo28);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*5+(ptrdiff_t)640*i48, hi28);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*6+(ptrdiff_t)640*i48, lo29);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*7+(ptrdiff_t)640*i48, hi29);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*8+(ptrdiff_t)640*i48, lo30);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*9+(ptrdiff_t)640*i48, hi30);
}
__m512 va31 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*0+(ptrdiff_t)80*12);
__m512 va32 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*1+(ptrdiff_t)80*12);
__m512 va33 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*2+(ptrdiff_t)80*12);
__m512 va34 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*3+(ptrdiff_t)80*12);
__m512 rcp31 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va31));
__m512 rcp32 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va32));
__m512 rcp33 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va33));
__m512 rcp34 = ResNet50Rsqrt1(_mm512_add_ps(eps5, va34));
__m512 sc31 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*0+(ptrdiff_t)80*12);
__m512 sc32 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*1+(ptrdiff_t)80*12);
__m512 sc33 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*2+(ptrdiff_t)80*12);
__m512 sc34 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*3+(ptrdiff_t)80*12);
__m512 mul31 = _mm512_mul_ps(rcp31, sc31);
__m512 mul32 = _mm512_mul_ps(rcp32, sc32);
__m512 mul33 = _mm512_mul_ps(rcp33, sc33);
__m512 mul34 = _mm512_mul_ps(rcp34, sc34);
__m512 me31 = _mm512_loadu_ps(means5+(ptrdiff_t)16*0+(ptrdiff_t)80*12);
__m512 me32 = _mm512_loadu_ps(means5+(ptrdiff_t)16*1+(ptrdiff_t)80*12);
__m512 me33 = _mm512_loadu_ps(means5+(ptrdiff_t)16*2+(ptrdiff_t)80*12);
__m512 me34 = _mm512_loadu_ps(means5+(ptrdiff_t)16*3+(ptrdiff_t)80*12);
__m512 sh31 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*0+(ptrdiff_t)80*12);
__m512 sh32 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*1+(ptrdiff_t)80*12);
__m512 sh33 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*2+(ptrdiff_t)80*12);
__m512 sh34 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*3+(ptrdiff_t)80*12);
__m512 add31 = _mm512_fnmadd_ps(me31, mul31, sh31);
__m512 add32 = _mm512_fnmadd_ps(me32, mul32, sh32);
__m512 add33 = _mm512_fnmadd_ps(me33, mul33, sh33);
__m512 add34 = _mm512_fnmadd_ps(me34, mul34, sh34);
__m512 lo31 = _mm512_permutex2var_ps(mul31, xlo5, add31);
__m512 lo32 = _mm512_permutex2var_ps(mul32, xlo5, add32);
__m512 lo33 = _mm512_permutex2var_ps(mul33, xlo5, add33);
__m512 lo34 = _mm512_permutex2var_ps(mul34, xlo5, add34);
__m512 hi31 = _mm512_permutex2var_ps(mul31, xhi5, add31);
__m512 hi32 = _mm512_permutex2var_ps(mul32, xhi5, add32);
__m512 hi33 = _mm512_permutex2var_ps(mul33, xhi5, add33);
__m512 hi34 = _mm512_permutex2var_ps(mul34, xhi5, add34);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*0+(ptrdiff_t)640*12, lo31);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*1+(ptrdiff_t)640*12, hi31);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*2+(ptrdiff_t)640*12, lo32);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*3+(ptrdiff_t)640*12, hi32);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*4+(ptrdiff_t)640*12, lo33);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*5+(ptrdiff_t)640*12, hi33);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*6+(ptrdiff_t)640*12, lo34);
_mm512_storeu_ps(mas10+(ptrdiff_t)64*7+(ptrdiff_t)640*12, hi34);
}

static void ResNet50BnSimplify6(
float*restrict means6,
float*restrict variances6,
float*restrict scales6,
float*restrict shifts6,
char*restrict mas13
) {
__m512 eps6 = _mm512_set1_ps(1e-05f);
__m512i xlo6 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi6 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i66 = 0; i66 < 25; ++i66) {
__m512 va35 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*0+(ptrdiff_t)80*i66);
__m512 va36 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*1+(ptrdiff_t)80*i66);
__m512 va37 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*2+(ptrdiff_t)80*i66);
__m512 va38 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*3+(ptrdiff_t)80*i66);
__m512 va39 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*4+(ptrdiff_t)80*i66);
__m512 rcp35 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va35));
__m512 rcp36 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va36));
__m512 rcp37 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va37));
__m512 rcp38 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va38));
__m512 rcp39 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va39));
__m512 sc35 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*0+(ptrdiff_t)80*i66);
__m512 sc36 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*1+(ptrdiff_t)80*i66);
__m512 sc37 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*2+(ptrdiff_t)80*i66);
__m512 sc38 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*3+(ptrdiff_t)80*i66);
__m512 sc39 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*4+(ptrdiff_t)80*i66);
__m512 mul35 = _mm512_mul_ps(rcp35, sc35);
__m512 mul36 = _mm512_mul_ps(rcp36, sc36);
__m512 mul37 = _mm512_mul_ps(rcp37, sc37);
__m512 mul38 = _mm512_mul_ps(rcp38, sc38);
__m512 mul39 = _mm512_mul_ps(rcp39, sc39);
__m512 me35 = _mm512_loadu_ps(means6+(ptrdiff_t)16*0+(ptrdiff_t)80*i66);
__m512 me36 = _mm512_loadu_ps(means6+(ptrdiff_t)16*1+(ptrdiff_t)80*i66);
__m512 me37 = _mm512_loadu_ps(means6+(ptrdiff_t)16*2+(ptrdiff_t)80*i66);
__m512 me38 = _mm512_loadu_ps(means6+(ptrdiff_t)16*3+(ptrdiff_t)80*i66);
__m512 me39 = _mm512_loadu_ps(means6+(ptrdiff_t)16*4+(ptrdiff_t)80*i66);
__m512 sh35 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*0+(ptrdiff_t)80*i66);
__m512 sh36 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*1+(ptrdiff_t)80*i66);
__m512 sh37 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*2+(ptrdiff_t)80*i66);
__m512 sh38 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*3+(ptrdiff_t)80*i66);
__m512 sh39 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*4+(ptrdiff_t)80*i66);
__m512 add35 = _mm512_fnmadd_ps(me35, mul35, sh35);
__m512 add36 = _mm512_fnmadd_ps(me36, mul36, sh36);
__m512 add37 = _mm512_fnmadd_ps(me37, mul37, sh37);
__m512 add38 = _mm512_fnmadd_ps(me38, mul38, sh38);
__m512 add39 = _mm512_fnmadd_ps(me39, mul39, sh39);
__m512 lo35 = _mm512_permutex2var_ps(mul35, xlo6, add35);
__m512 lo36 = _mm512_permutex2var_ps(mul36, xlo6, add36);
__m512 lo37 = _mm512_permutex2var_ps(mul37, xlo6, add37);
__m512 lo38 = _mm512_permutex2var_ps(mul38, xlo6, add38);
__m512 lo39 = _mm512_permutex2var_ps(mul39, xlo6, add39);
__m512 hi35 = _mm512_permutex2var_ps(mul35, xhi6, add35);
__m512 hi36 = _mm512_permutex2var_ps(mul36, xhi6, add36);
__m512 hi37 = _mm512_permutex2var_ps(mul37, xhi6, add37);
__m512 hi38 = _mm512_permutex2var_ps(mul38, xhi6, add38);
__m512 hi39 = _mm512_permutex2var_ps(mul39, xhi6, add39);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*0+(ptrdiff_t)640*i66, lo35);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*1+(ptrdiff_t)640*i66, hi35);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*2+(ptrdiff_t)640*i66, lo36);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*3+(ptrdiff_t)640*i66, hi36);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*4+(ptrdiff_t)640*i66, lo37);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*5+(ptrdiff_t)640*i66, hi37);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*6+(ptrdiff_t)640*i66, lo38);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*7+(ptrdiff_t)640*i66, hi38);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*8+(ptrdiff_t)640*i66, lo39);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*9+(ptrdiff_t)640*i66, hi39);
}
__m512 va40 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*0+(ptrdiff_t)80*25);
__m512 va41 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*1+(ptrdiff_t)80*25);
__m512 va42 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*2+(ptrdiff_t)80*25);
__m512 rcp40 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va40));
__m512 rcp41 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va41));
__m512 rcp42 = ResNet50Rsqrt1(_mm512_add_ps(eps6, va42));
__m512 sc40 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*0+(ptrdiff_t)80*25);
__m512 sc41 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*1+(ptrdiff_t)80*25);
__m512 sc42 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*2+(ptrdiff_t)80*25);
__m512 mul40 = _mm512_mul_ps(rcp40, sc40);
__m512 mul41 = _mm512_mul_ps(rcp41, sc41);
__m512 mul42 = _mm512_mul_ps(rcp42, sc42);
__m512 me40 = _mm512_loadu_ps(means6+(ptrdiff_t)16*0+(ptrdiff_t)80*25);
__m512 me41 = _mm512_loadu_ps(means6+(ptrdiff_t)16*1+(ptrdiff_t)80*25);
__m512 me42 = _mm512_loadu_ps(means6+(ptrdiff_t)16*2+(ptrdiff_t)80*25);
__m512 sh40 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*0+(ptrdiff_t)80*25);
__m512 sh41 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*1+(ptrdiff_t)80*25);
__m512 sh42 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*2+(ptrdiff_t)80*25);
__m512 add40 = _mm512_fnmadd_ps(me40, mul40, sh40);
__m512 add41 = _mm512_fnmadd_ps(me41, mul41, sh41);
__m512 add42 = _mm512_fnmadd_ps(me42, mul42, sh42);
__m512 lo40 = _mm512_permutex2var_ps(mul40, xlo6, add40);
__m512 lo41 = _mm512_permutex2var_ps(mul41, xlo6, add41);
__m512 lo42 = _mm512_permutex2var_ps(mul42, xlo6, add42);
__m512 hi40 = _mm512_permutex2var_ps(mul40, xhi6, add40);
__m512 hi41 = _mm512_permutex2var_ps(mul41, xhi6, add41);
__m512 hi42 = _mm512_permutex2var_ps(mul42, xhi6, add42);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*0+(ptrdiff_t)640*25, lo40);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*1+(ptrdiff_t)640*25, hi40);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*2+(ptrdiff_t)640*25, lo41);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*3+(ptrdiff_t)640*25, hi41);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*4+(ptrdiff_t)640*25, lo42);
_mm512_storeu_ps(mas13+(ptrdiff_t)64*5+(ptrdiff_t)640*25, hi42);
}

// Worker for one task index of the "Glopl1" stage (presumably the global
// average pool of the graph -- confirm against the generator).
//
// task154->any1 is read as an array of byte pointers: entry 0 is the source,
// entry 1 the destination. pt82[0] selects a 40960-byte source slice and a
// 512-byte destination slice.
//
// Every loop step consumes two 49-float groups (16+16+16+1 lanes each) that
// start 320 bytes apart, reduces each group to a single sum, and parks the two
// sums in the next free lane pair of a 16-lane staging vector. After eight
// steps the staging vector is scaled by 2.0408163e-02f (about 1/49) and written
// out as 16 floats.
static void ResNet50Glopl1Callee1(ResNet50ThreaderTask1* task154, int64_t* pt82) {
    char** tensors = task154->any1;
    const ptrdiff_t slice = pt82[0];
    char*restrict src = tensors[0]+(ptrdiff_t)40960*slice;
    char*restrict dst = tensors[1]+(ptrdiff_t)512*slice;

    // Loop-invariant constants. The interleave tables take lane 0 (resp. lane 1)
    // of the first operand for even result lanes and of the second operand for
    // odd result lanes.
    const __m512i takeLane0 = _mm512_set_epi32(16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0);
    const __m512i takeLane1 = _mm512_set_epi32(17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1);
    const __m512 scale = _mm512_set1_ps(2.0408163e-02f);

    __m512 staged = _mm512_setzero_ps();
    // Lanes of `staged` that the current step may still overwrite.
    __mmask16 freeLanes = 65535;

    for (ptrdiff_t step = 0; step < 64; ++step) {
        char* firstGroup = src+(ptrdiff_t)640*step;
        char* secondGroup = firstGroup+(ptrdiff_t)320;

        // First group: 49 floats folded into one vector of partial sums.
        __m512 p0 = _mm512_maskz_loadu_ps(65535, firstGroup+(ptrdiff_t)0);
        __m512 p1 = _mm512_maskz_loadu_ps(65535, firstGroup+(ptrdiff_t)64);
        __m512 p2 = _mm512_maskz_loadu_ps(65535, firstGroup+(ptrdiff_t)128);
        __m512 p3 = _mm512_maskz_loadu_ps(1, firstGroup+(ptrdiff_t)192);
        // Second group, same shape.
        __m512 q0 = _mm512_maskz_loadu_ps(65535, secondGroup+(ptrdiff_t)0);
        __m512 q1 = _mm512_maskz_loadu_ps(65535, secondGroup+(ptrdiff_t)64);
        __m512 q2 = _mm512_maskz_loadu_ps(65535, secondGroup+(ptrdiff_t)128);
        __m512 q3 = _mm512_maskz_loadu_ps(1, secondGroup+(ptrdiff_t)192);

        p0 = _mm512_mask_add_ps(p0, 65535, p0, p2);
        q0 = _mm512_mask_add_ps(q0, 65535, q0, q2);
        // Only lane 0 of the fourth vector carries data.
        p1 = _mm512_mask_add_ps(p1, 1, p1, p3);
        q1 = _mm512_mask_add_ps(q1, 1, q1, q3);
        p0 = _mm512_mask_add_ps(p0, 65535, p0, p1);
        q0 = _mm512_mask_add_ps(q0, 65535, q0, q1);

        // 16 -> 8 lanes: add the upper 256 bits onto the lower 256 bits.
        __m512 pUp = _mm512_shuffle_f32x4(p0, p0, 238);
        __m512 qUp = _mm512_shuffle_f32x4(q0, q0, 238);
        p0 = _mm512_mask_add_ps(p0, 255, p0, pUp);
        q0 = _mm512_mask_add_ps(q0, 255, q0, qUp);
        // 8 -> 4 lanes: add the second 128-bit block onto the first.
        pUp = _mm512_shuffle_f32x4(p0, p0, 1);
        qUp = _mm512_shuffle_f32x4(q0, q0, 1);
        p0 = _mm512_mask_add_ps(p0, 15, p0, pUp);
        q0 = _mm512_mask_add_ps(q0, 15, q0, qUp);
        // 4 -> 2 lanes: add elements 2,3 onto elements 0,1.
        pUp = _mm512_shuffle_ps(p0, p0, 238);
        qUp = _mm512_shuffle_ps(q0, q0, 238);
        p0 = _mm512_mask_add_ps(p0, 3, p0, pUp);
        q0 = _mm512_mask_add_ps(q0, 3, q0, qUp);

        // 2 -> 1 lane, interleaving both groups: even lanes hold the first
        // group's total, odd lanes the second group's total.
        __m512 lane1Pairs = _mm512_permutex2var_ps(p0, takeLane1, q0);
        __m512 totals = _mm512_permutex2var_ps(p0, takeLane0, q0);
        totals = _mm512_mask_add_ps(totals, 65535, totals, lane1Pairs);

        // Overwrite every still-free lane, then retire the lowest free pair so
        // that this step's two totals stay in place.
        staged = _mm512_mask_mov_ps(staged, freeLanes, totals);
        freeLanes &= freeLanes<<2;
        if (__builtin_expect(!freeLanes, 0)) {
            // Eight steps collected: scale and flush 16 results. At this point
            // step is 7, 15, 23, ... so 2*step-14 is the float index of the
            // first result of the batch.
            freeLanes = 65535;
            staged = _mm512_mul_ps(staged, scale);
            _mm512_mask_storeu_ps(dst+(ptrdiff_t)4*((ptrdiff_t)2*step-14), 65535, staged);
        }
    }
}

// Hands ResNet50Glopl1Callee1 to ResNet50ThreaderDo1 on the given team.
// The task descriptor carries the tensor pointer array plus nd1 = 1 and
// hull1[0] = 16 (presumably a one-dimensional range of 16 task indices --
// confirm against the threader implementation).
static void ResNet50Glopl1(ResNet50ThreaderTeam1* team87, char** tensors149) {
    ResNet50ThreaderTask1 job;
    job.nd1 = 1;
    job.hull1[0] = 16;
    job.any1 = tensors149;
    job.callee1 = ResNet50Glopl1Callee1;
    ResNet50ThreaderDo1(team87, &job);
}

// Worker for one task point of the "Thrpl1" stage: a max reduction over a
// 3-row by 3-column window that advances two rows and two columns per output
// (presumably the Max3x3Stride2 pooling of the graph -- confirm against the
// generator).
//
// task12->any1 is read as an array of byte pointers: entry 0 is the source,
// entry 1 the destination. pt11[0..2] are the task coordinates used to offset
// both pointers.
//
// Layout as used by this code: one source row is 448 bytes (three 128-byte
// vector pairs plus one 64-byte vector = 112 floats), one destination row is
// 224 bytes (three full vectors plus 8 lanes = 56 floats). 56 destination rows
// are produced per call.
//
// Horizontal step: for output lane i the code takes the max of source columns
// 2i-1, 2i and 2i+1. The column 2i-1 of lane 0 comes from lane 15 of the
// previous vector pair (carried in in2 / in7); for the very first vector of a
// row that term is skipped via mask 65534.
static void ResNet50Thrpl1Callee1(ResNet50ThreaderTask1* task12, int64_t* pt11) {
char** tensors10 = task12->any1;
ptrdiff_t b43 = pt11[0];
ptrdiff_t e5 = pt11[1];
ptrdiff_t c4 = pt11[2];
// The source pointer is rewound by one 448-byte row so that offsets 0, 448 and
// 896 address the three rows of a window.
char*restrict ptr1 = tensors10[0]-(ptrdiff_t)448+(ptrdiff_t)50176*b43+(ptrdiff_t)448*e5+(ptrdiff_t)50240*c4;
char*restrict ptr2 = tensors10[1]+(ptrdiff_t)12544*b43+(ptrdiff_t)224*e5+(ptrdiff_t)12608*c4;
// Single-iteration loop emitted by the generator; i10 is always 0.
for (ptrdiff_t i10 = 0; i10 < 1; ++i10) {
// ---- Output row 0: only the rows at +448 and +896 are read (the row at +0
// is not touched here; presumably it is vertical padding -- confirm).
// First vector pair of the row (source columns 0..31).
__m512 in1 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 in2 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 dat894 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 dat895 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
// Vertical max of the two rows.
in1 = _mm512_max_ps(in1, dat894);
in2 = _mm512_max_ps(in2, dat895);
// Index tables over the 32 floats of a vector pair (indices 16..31 select the
// second operand): pm57 = even columns, pm58 = odd columns, pm59 = odd columns
// shifted up one lane with lane 0 taking index 31.
__m512i pm57 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pm58 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm59 = _mm512_set_epi32(29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, 31);
__m512 out1 = _mm512_permutex2var_ps(in1, pm57, in2);
__m512 pack263 = _mm512_permutex2var_ps(in1, pm58, in2);
__m512 pack264 = _mm512_permutex2var_ps(in1, pm59, in2);
out1 = _mm512_mask_max_ps(out1, 65535, out1, pack263);
// Mask 65534 leaves lane 0 alone: there is no column to the left of column 0.
out1 = _mm512_mask_max_ps(out1, 65534, out1, pack264);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*0, 65535, out1);
// Middle vector pairs of row 0.
for (ptrdiff_t k44 = 1; k44 < 3; ++k44) {
__m512 in3 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 in4 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 dat896 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 dat897 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
in3 = _mm512_max_ps(in3, dat896);
in4 = _mm512_max_ps(in4, dat897);
// Lane 15 of the second operand is swapped for lane 15 of the previous pair,
// so index 31 in pm59 yields the column just left of this pair.
__m512 blend1 = _mm512_mask_mov_ps(in4, 32768, in2);
__m512 out2 = _mm512_permutex2var_ps(in3, pm57, in4);
__m512 pack265 = _mm512_permutex2var_ps(in3, pm58, in4);
__m512 pack266 = _mm512_permutex2var_ps(in3, pm59, blend1);
out2 = _mm512_mask_max_ps(out2, 65535, out2, pack265);
out2 = _mm512_mask_max_ps(out2, 65535, out2, pack266);
// Carry this pair's second vector into the next iteration.
in2 = in4;
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*k44, 65535, out2);
}
// Final single vector of row 0 (source columns 96..111): 8 outputs.
__m512 in5 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*3);
__m512 dat898 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*3);
in5 = _mm512_max_ps(in5, dat898);
__m512 blend2 = _mm512_mask_mov_ps(in5, 32768, in2);
// Single-source permutes use only the low four index bits, so index 31 reads
// lane 15 (the carried left neighbour in blend2).
__m512 out3 = _mm512_permutexvar_ps(pm57, in5);
__m512 pack267 = _mm512_permutexvar_ps(pm58, in5);
__m512 pack268 = _mm512_permutexvar_ps(pm59, blend2);
out3 = _mm512_mask_max_ps(out3, 255, out3, pack267);
out3 = _mm512_mask_max_ps(out3, 255, out3, pack268);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*3, 255, out3);
// ---- Output rows 1..55: each reads three source rows (+0, +448, +896) and
// advances by 896 source bytes (two rows) per output row.
for (ptrdiff_t j6 = 1; j6 < 56; ++j6) {
__m512 in6 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 in7 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)64+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat899 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat901 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat900 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat902 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
// Vertical max of the three rows.
in6 = _mm512_max_ps(in6, dat899);
in7 = _mm512_max_ps(in7, dat901);
in6 = _mm512_max_ps(in6, dat900);
in7 = _mm512_max_ps(in7, dat902);
// Same index tables as pm57/pm58/pm59 above.
__m512i pm60 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pm61 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm62 = _mm512_set_epi32(29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, 31);
__m512 out4 = _mm512_permutex2var_ps(in6, pm60, in7);
__m512 pack269 = _mm512_permutex2var_ps(in6, pm61, in7);
__m512 pack270 = _mm512_permutex2var_ps(in6, pm62, in7);
out4 = _mm512_mask_max_ps(out4, 65535, out4, pack269);
// Lane 0 has no left neighbour at the start of a row.
out4 = _mm512_mask_max_ps(out4, 65534, out4, pack270);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*0, 65535, out4);
// Middle vector pairs of the row.
for (ptrdiff_t k45 = 1; k45 < 3; ++k45) {
__m512 in8 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 in9 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)64+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat903 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat905 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat904 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat906 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
in8 = _mm512_max_ps(in8, dat903);
in9 = _mm512_max_ps(in9, dat905);
in8 = _mm512_max_ps(in8, dat904);
in9 = _mm512_max_ps(in9, dat906);
// Bring in lane 15 of the previous pair as the left neighbour for lane 0.
__m512 blend3 = _mm512_mask_mov_ps(in9, 32768, in7);
__m512 out5 = _mm512_permutex2var_ps(in8, pm60, in9);
__m512 pack271 = _mm512_permutex2var_ps(in8, pm61, in9);
__m512 pack272 = _mm512_permutex2var_ps(in8, pm62, blend3);
out5 = _mm512_mask_max_ps(out5, 65535, out5, pack271);
out5 = _mm512_mask_max_ps(out5, 65535, out5, pack272);
in7 = in9;
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*k45, 65535, out5);
}
// Final single vector of the row: 8 outputs.
__m512 in10 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
__m512 dat907 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
__m512 dat908 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
in10 = _mm512_max_ps(in10, dat907);
in10 = _mm512_max_ps(in10, dat908);
__m512 blend4 = _mm512_mask_mov_ps(in10, 32768, in7);
__m512 out6 = _mm512_permutexvar_ps(pm60, in10);
__m512 pack273 = _mm512_permutexvar_ps(pm61, in10);
__m512 pack274 = _mm512_permutexvar_ps(pm62, blend4);
out6 = _mm512_mask_max_ps(out6, 255, out6, pack273);
out6 = _mm512_mask_max_ps(out6, 255, out6, pack274);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*3, 255, out6);
}
}
}

// Hands ResNet50Thrpl1Callee1 to ResNet50ThreaderDo1 on the given team.
// The task descriptor carries the tensor pointer array plus nd1 = 3 and
// hull1 = {1, 1, 64} (presumably a three-dimensional task range in which only
// the last coordinate varies -- confirm against the threader implementation).
static void ResNet50Thrpl1(ResNet50ThreaderTeam1* team18, char** tensors9) {
    ResNet50ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[2] = 64;
    job.hull1[1] = 1;
    job.hull1[0] = 1;
    job.any1 = tensors9;
    job.callee1 = ResNet50Thrpl1Callee1;
    ResNet50ThreaderDo1(team18, &job);
}

// Worker for one task index of the "FcArrange1" stage: repacks float32 weights
// into 16-bit halves and copies the matching biases.
//
// task156->any1 is read as an array of byte pointers: entry 0 = source weights,
// entry 1 = source biases, entry 2 = destination buffer (packed weights first,
// biases starting at byte 4096000). pt83[0] is the tile index.
//
// Source weight rows are 8192 bytes apart. Each packed 64-byte record holds two
// consecutive source rows' 16-float slices, converted to half precision with
// round-to-nearest: the first row in the low 256 bits, the second in the high
// 256 bits.
//
// Tiles below 62 cover 16 source rows (8 records per 16-column step, 128 steps).
// Any other tile covers 8 source rows and walks the columns 32 at a time
// (64 steps), emitting the four records of the first 16 columns followed by the
// four records of the next 16 columns.
static void ResNet50FcArrange1Callee1(ResNet50ThreaderTask1* task156, int64_t* pt83) {
    char** tensors = task156->any1;
    const ptrdiff_t tile = pt83[0];
    char*restrict srcWeights = tensors[0]+(ptrdiff_t)131072*tile;
    char*restrict srcBiases = tensors[1]+(ptrdiff_t)64*tile;
    char*restrict dstWeights = tensors[2]+(ptrdiff_t)65536*tile;
    char*restrict dstBiases = tensors[2]+(ptrdiff_t)4096000+(ptrdiff_t)64*tile;

    if (tile >= 62) {
        // Short tile: 8 rows, 8 biases.
        for (ptrdiff_t step = 0; step < 64; ++step) {
            for (ptrdiff_t half = 0; half < 2; ++half) {
                for (ptrdiff_t pair = 0; pair < 4; ++pair) {
                    char* evenRow = srcWeights+(ptrdiff_t)64*half+(ptrdiff_t)16384*pair+(ptrdiff_t)128*step;
                    char* oddRow = evenRow+(ptrdiff_t)8192;
                    __m512 lo = _mm512_maskz_loadu_ps(65535, evenRow);
                    __m512 hi = _mm512_maskz_loadu_ps(65535, oddRow);
                    __m256i loHalf = _mm512_cvtps_ph(lo, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
                    __m256i hiHalf = _mm512_cvtps_ph(hi, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
                    __m512i record = _mm512_inserti64x4(_mm512_castsi256_si512(loHalf), hiHalf, 1);
                    _mm512_mask_storeu_epi32(dstWeights+(ptrdiff_t)256*half+(ptrdiff_t)64*pair+(ptrdiff_t)512*step, 65535, record);
                }
            }
        }
        __m512 shortBias = _mm512_maskz_loadu_ps(255, srcBiases);
        _mm512_mask_storeu_ps(dstBiases, 255, shortBias);
        return;
    }

    // Full tile: 16 rows, 16 biases.
    for (ptrdiff_t step = 0; step < 128; ++step) {
        for (ptrdiff_t pair = 0; pair < 8; ++pair) {
            char* evenRow = srcWeights+(ptrdiff_t)16384*pair+(ptrdiff_t)64*step;
            char* oddRow = evenRow+(ptrdiff_t)8192;
            __m512 lo = _mm512_maskz_loadu_ps(65535, evenRow);
            __m512 hi = _mm512_maskz_loadu_ps(65535, oddRow);
            __m256i loHalf = _mm512_cvtps_ph(lo, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
            __m256i hiHalf = _mm512_cvtps_ph(hi, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
            __m512i record = _mm512_inserti64x4(_mm512_castsi256_si512(loHalf), hiHalf, 1);
            _mm512_mask_storeu_epi32(dstWeights+(ptrdiff_t)64*pair+(ptrdiff_t)512*step, 65535, record);
        }
    }
    __m512 fullBias = _mm512_maskz_loadu_ps(65535, srcBiases);
    _mm512_mask_storeu_ps(dstBiases, 65535, fullBias);
}

// Hands ResNet50FcArrange1Callee1 to ResNet50ThreaderDo1 on the given team.
// The task descriptor carries the tensor pointer array plus nd1 = 1 and
// hull1[0] = 63 (presumably a one-dimensional range of 63 tiles -- confirm
// against the threader implementation).
static void ResNet50FcArrange1(ResNet50ThreaderTeam1* team88, char** tensors151) {
    ResNet50ThreaderTask1 job;
    job.nd1 = 1;
    job.hull1[0] = 63;
    job.any1 = tensors151;
    job.callee1 = ResNet50FcArrange1Callee1;
    ResNet50ThreaderDo1(team88, &job);
}

// Worker for one task index of the "FcApply1" stage: multiplies packed
// half-precision weights with a float32 input vector and adds biases
// (presumably the fully connected layer of the graph -- confirm against the
// generator).
//
// task158->any1 is read as an array of byte pointers: entry 0 = packed weights
// with biases starting at byte 4096000, entry 1 = input data, entry 2 = output.
// pt84[0] is the tile index.
//
// Tiles below 62 produce 16 outputs; any other tile produces 8 outputs. In both
// cases 128 steps of 16 input floats are consumed.
static void ResNet50FcApply1Callee1(ResNet50ThreaderTask1* task158, int64_t* pt84) {
char** tensors154 = task158->any1;
ptrdiff_t t36 = pt84[0];
char*restrict wtPtr27 = tensors154[0]+(ptrdiff_t)65536*t36;
char*restrict biasPtr26 = tensors154[0]+(ptrdiff_t)4096000+(ptrdiff_t)64*t36;
char*restrict datPtr50 = tensors154[1];
char*restrict datPtr51 = tensors154[2]+(ptrdiff_t)64*t36;
// ---- Full tile: 16 outputs.
if (t36 < 62) {
// Single-iteration loop emitted by the generator; i91 is always 0.
for (ptrdiff_t i91 = 0; i91 < 1; ++i91) {
// One accumulator per output; each lane holds a partial dot product.
__m512 sum837 = _mm512_setzero_ps();
__m512 sum838 = _mm512_setzero_ps();
__m512 sum839 = _mm512_setzero_ps();
__m512 sum840 = _mm512_setzero_ps();
__m512 sum841 = _mm512_setzero_ps();
__m512 sum842 = _mm512_setzero_ps();
__m512 sum843 = _mm512_setzero_ps();
__m512 sum844 = _mm512_setzero_ps();
__m512 sum845 = _mm512_setzero_ps();
__m512 sum846 = _mm512_setzero_ps();
__m512 sum847 = _mm512_setzero_ps();
__m512 sum848 = _mm512_setzero_ps();
__m512 sum849 = _mm512_setzero_ps();
__m512 sum850 = _mm512_setzero_ps();
__m512 sum851 = _mm512_setzero_ps();
__m512 sum852 = _mm512_setzero_ps();
// Each step reads 16 input floats and 8 x 64 bytes of weights. Every 64-byte
// record holds two accumulators' weights as 16-bit halves (low 256 bits, then
// high 256 bits), widened to float32 before the fused multiply-add.
for (ptrdiff_t j81 = 0; j81 < 128; ++j81) {
__m512i wts1 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)0+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512 dat2555 = _mm512_maskz_loadu_ps(65535, datPtr50+(ptrdiff_t)0+(ptrdiff_t)64*j81);
__m512i wts2 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)64+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512i wts3 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)128+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512i wts4 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)192+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512 wtLo17 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts1));
__m512 wtHi17 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts1, 1));
__m512 wtLo18 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts2));
__m512 wtHi18 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts2, 1));
__m512 wtLo19 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts3));
__m512 wtHi19 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts3, 1));
__m512 wtLo20 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts4));
__m512 wtHi20 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts4, 1));
sum837 = _mm512_fmadd_ps(wtLo17, dat2555, sum837);
sum838 = _mm512_fmadd_ps(wtHi17, dat2555, sum838);
sum839 = _mm512_fmadd_ps(wtLo18, dat2555, sum839);
sum840 = _mm512_fmadd_ps(wtHi18, dat2555, sum840);
sum841 = _mm512_fmadd_ps(wtLo19, dat2555, sum841);
sum842 = _mm512_fmadd_ps(wtHi19, dat2555, sum842);
sum843 = _mm512_fmadd_ps(wtLo20, dat2555, sum843);
sum844 = _mm512_fmadd_ps(wtHi20, dat2555, sum844);
__m512i wts5 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)256+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512i wts6 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)320+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512i wts7 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)384+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512i wts8 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)448+(ptrdiff_t)65536*i91+(ptrdiff_t)512*j81);
__m512 wtLo21 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts5));
__m512 wtHi21 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts5, 1));
__m512 wtLo22 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts6));
__m512 wtHi22 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts6, 1));
__m512 wtLo23 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts7));
__m512 wtHi23 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts7, 1));
__m512 wtLo24 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts8));
__m512 wtHi24 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts8, 1));
sum845 = _mm512_fmadd_ps(wtLo21, dat2555, sum845);
sum846 = _mm512_fmadd_ps(wtHi21, dat2555, sum846);
sum847 = _mm512_fmadd_ps(wtLo22, dat2555, sum847);
sum848 = _mm512_fmadd_ps(wtHi22, dat2555, sum848);
sum849 = _mm512_fmadd_ps(wtLo23, dat2555, sum849);
sum850 = _mm512_fmadd_ps(wtHi23, dat2555, sum850);
sum851 = _mm512_fmadd_ps(wtLo24, dat2555, sum851);
sum852 = _mm512_fmadd_ps(wtHi24, dat2555, sum852);
}
__m512 bias12 = _mm512_maskz_loadu_ps(65535, biasPtr26+(ptrdiff_t)0+(ptrdiff_t)64*i91);
// Transpose-and-add reduction: 16 accumulators of 16 partial sums collapse
// into one vector whose lane k is the complete sum of accumulator sum(837+k).
// pm1* interleave single lanes of two vectors; pm4* interleave 128-bit blocks.
__m512i pm1Lo1 = _mm512_set_epi32(30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0);
__m512i pm1Hi1 = _mm512_set_epi32(31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1);
__m512i pm4Lo1 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i pm4Hi1 = _mm512_set_epi32(31, 30, 29, 28, 15, 14, 13, 12, 23, 22, 21, 20, 7, 6, 5, 4);
// Stage 1 (even accumulators): fold the upper 256 bits onto the lower 256 bits
// while pairing accumulator k with accumulator k+8.
__m512 upper4 = _mm512_shuffle_f32x4(sum837, sum845, 238);
__m512 upper5 = _mm512_shuffle_f32x4(sum841, sum849, 238);
sum837 = _mm512_shuffle_f32x4(sum837, sum845, 68);
sum841 = _mm512_shuffle_f32x4(sum841, sum849, 68);
sum837 = _mm512_add_ps(sum837, upper4);
sum841 = _mm512_add_ps(sum841, upper5);
__m512 upper7 = _mm512_shuffle_f32x4(sum839, sum847, 238);
__m512 upper8 = _mm512_shuffle_f32x4(sum843, sum851, 238);
sum839 = _mm512_shuffle_f32x4(sum839, sum847, 68);
sum843 = _mm512_shuffle_f32x4(sum843, sum851, 68);
sum839 = _mm512_add_ps(sum839, upper7);
sum843 = _mm512_add_ps(sum843, upper8);
// Stage 2 (even accumulators): fold adjacent 128-bit blocks while pairing
// accumulator k with accumulator k+4.
__m512 upper3 = _mm512_permutex2var_ps(sum837, pm4Hi1, sum841);
__m512 upper6 = _mm512_permutex2var_ps(sum839, pm4Hi1, sum843);
sum837 = _mm512_permutex2var_ps(sum837, pm4Lo1, sum841);
sum839 = _mm512_permutex2var_ps(sum839, pm4Lo1, sum843);
sum837 = _mm512_add_ps(sum837, upper3);
sum839 = _mm512_add_ps(sum839, upper6);
// Stages 1 and 2 repeated for the odd accumulators.
__m512 upper11 = _mm512_shuffle_f32x4(sum838, sum846, 238);
__m512 upper12 = _mm512_shuffle_f32x4(sum842, sum850, 238);
sum838 = _mm512_shuffle_f32x4(sum838, sum846, 68);
sum842 = _mm512_shuffle_f32x4(sum842, sum850, 68);
sum838 = _mm512_add_ps(sum838, upper11);
sum842 = _mm512_add_ps(sum842, upper12);
__m512 upper14 = _mm512_shuffle_f32x4(sum840, sum848, 238);
__m512 upper15 = _mm512_shuffle_f32x4(sum844, sum852, 238);
sum840 = _mm512_shuffle_f32x4(sum840, sum848, 68);
sum844 = _mm512_shuffle_f32x4(sum844, sum852, 68);
sum840 = _mm512_add_ps(sum840, upper14);
sum844 = _mm512_add_ps(sum844, upper15);
__m512 upper10 = _mm512_permutex2var_ps(sum838, pm4Hi1, sum842);
__m512 upper13 = _mm512_permutex2var_ps(sum840, pm4Hi1, sum844);
sum838 = _mm512_permutex2var_ps(sum838, pm4Lo1, sum842);
sum840 = _mm512_permutex2var_ps(sum840, pm4Lo1, sum844);
sum838 = _mm512_add_ps(sum838, upper10);
sum840 = _mm512_add_ps(sum840, upper13);
// Stage 3: within each 128-bit block, fold element pairs while pairing
// accumulator k with accumulator k+2.
__m512 upper2 = _mm512_shuffle_ps(sum837, sum839, 238);
__m512 upper9 = _mm512_shuffle_ps(sum838, sum840, 238);
sum837 = _mm512_shuffle_ps(sum837, sum839, 68);
sum838 = _mm512_shuffle_ps(sum838, sum840, 68);
sum837 = _mm512_add_ps(sum837, upper2);
sum838 = _mm512_add_ps(sum838, upper9);
// Stage 4: fold the last lane pair while interleaving even and odd
// accumulators, leaving the results in accumulator order.
__m512 upper1 = _mm512_permutex2var_ps(sum837, pm1Hi1, sum838);
sum837 = _mm512_permutex2var_ps(sum837, pm1Lo1, sum838);
sum837 = _mm512_add_ps(sum837, upper1);
sum837 = _mm512_add_ps(sum837, bias12);
_mm512_mask_storeu_ps(datPtr51+(ptrdiff_t)0+(ptrdiff_t)64*i91, 65535, sum837);
}
return;
}
// ---- Short tile: 8 outputs. Weight records advance 256 bytes per step
// (4 records x 2 accumulators).
// Single-iteration loop emitted by the generator; i92 is always 0.
for (ptrdiff_t i92 = 0; i92 < 1; ++i92) {
__m512 sum853 = _mm512_setzero_ps();
__m512 sum854 = _mm512_setzero_ps();
__m512 sum855 = _mm512_setzero_ps();
__m512 sum856 = _mm512_setzero_ps();
__m512 sum857 = _mm512_setzero_ps();
__m512 sum858 = _mm512_setzero_ps();
__m512 sum859 = _mm512_setzero_ps();
__m512 sum860 = _mm512_setzero_ps();
for (ptrdiff_t j82 = 0; j82 < 128; ++j82) {
__m512i wts9 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)0+(ptrdiff_t)65536*i92+(ptrdiff_t)256*j82);
__m512 dat2556 = _mm512_maskz_loadu_ps(65535, datPtr50+(ptrdiff_t)0+(ptrdiff_t)64*j82);
__m512i wts10 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)64+(ptrdiff_t)65536*i92+(ptrdiff_t)256*j82);
__m512i wts11 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)128+(ptrdiff_t)65536*i92+(ptrdiff_t)256*j82);
__m512i wts12 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)192+(ptrdiff_t)65536*i92+(ptrdiff_t)256*j82);
__m512 wtLo25 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts9));
__m512 wtHi25 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts9, 1));
__m512 wtLo26 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts10));
__m512 wtHi26 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts10, 1));
__m512 wtLo27 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts11));
__m512 wtHi27 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts11, 1));
__m512 wtLo28 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts12));
__m512 wtHi28 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts12, 1));
sum853 = _mm512_fmadd_ps(wtLo25, dat2556, sum853);
sum854 = _mm512_fmadd_ps(wtHi25, dat2556, sum854);
sum855 = _mm512_fmadd_ps(wtLo26, dat2556, sum855);
sum856 = _mm512_fmadd_ps(wtHi26, dat2556, sum856);
sum857 = _mm512_fmadd_ps(wtLo27, dat2556, sum857);
sum858 = _mm512_fmadd_ps(wtHi27, dat2556, sum858);
sum859 = _mm512_fmadd_ps(wtLo28, dat2556, sum859);
sum860 = _mm512_fmadd_ps(wtHi28, dat2556, sum860);
}
// Only 8 biases are read for the short tile.
__m512 bias13 = _mm512_maskz_loadu_ps(255, biasPtr26+(ptrdiff_t)0+(ptrdiff_t)32*i92);
// Same reduction idea with 8 accumulators: lanes 0..7 of the final vector hold
// the complete sums of sum853..sum860 in order. The last step uses
// single-source permutes, so lanes 8..15 merely repeat lanes 0..7 and are
// masked out of the store.
__m512i pmEven1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmOdd1 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm4Lo2 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i pm4Hi2 = _mm512_set_epi32(31, 30, 29, 28, 15, 14, 13, 12, 23, 22, 21, 20, 7, 6, 5, 4);
// Fold 256-bit halves, pairing accumulator k with accumulator k+4.
__m512 upper18 = _mm512_shuffle_f32x4(sum853, sum857, 238);
__m512 upper19 = _mm512_shuffle_f32x4(sum855, sum859, 238);
sum853 = _mm512_shuffle_f32x4(sum853, sum857, 68);
sum855 = _mm512_shuffle_f32x4(sum855, sum859, 68);
sum853 = _mm512_add_ps(sum853, upper18);
sum855 = _mm512_add_ps(sum855, upper19);
__m512 upper21 = _mm512_shuffle_f32x4(sum854, sum858, 238);
__m512 upper22 = _mm512_shuffle_f32x4(sum856, sum860, 238);
sum854 = _mm512_shuffle_f32x4(sum854, sum858, 68);
sum856 = _mm512_shuffle_f32x4(sum856, sum860, 68);
sum854 = _mm512_add_ps(sum854, upper21);
sum856 = _mm512_add_ps(sum856, upper22);
// Fold adjacent 128-bit blocks, pairing accumulator k with accumulator k+2.
__m512 upper17 = _mm512_permutex2var_ps(sum853, pm4Hi2, sum855);
__m512 upper20 = _mm512_permutex2var_ps(sum854, pm4Hi2, sum856);
sum853 = _mm512_permutex2var_ps(sum853, pm4Lo2, sum855);
sum854 = _mm512_permutex2var_ps(sum854, pm4Lo2, sum856);
sum853 = _mm512_add_ps(sum853, upper17);
sum854 = _mm512_add_ps(sum854, upper20);
// Fold element pairs inside each 128-bit block, merging even and odd
// accumulators into one vector.
__m512 upper16 = _mm512_shuffle_ps(sum853, sum854, 238);
sum853 = _mm512_shuffle_ps(sum853, sum854, 68);
sum853 = _mm512_add_ps(sum853, upper16);
// Final fold of neighbouring lanes.
__m512 upper23 = _mm512_permutexvar_ps(pmOdd1, sum853);
sum853 = _mm512_permutexvar_ps(pmEven1, sum853);
sum853 = _mm512_add_ps(sum853, upper23);
sum853 = _mm512_add_ps(sum853, bias13);
_mm512_mask_storeu_ps(datPtr51+(ptrdiff_t)0+(ptrdiff_t)32*i92, 255, sum853);
}
}

// Hands ResNet50FcApply1Callee1 to ResNet50ThreaderDo1 on the given team.
// The task descriptor carries the tensor pointer array plus nd1 = 1 and
// hull1[0] = 63 (presumably a one-dimensional range of 63 tiles -- confirm
// against the threader implementation).
static void ResNet50FcApply1(ResNet50ThreaderTeam1* team89, char** tensors153) {
    ResNet50ThreaderTask1 job;
    job.nd1 = 1;
    job.hull1[0] = 63;
    job.any1 = tensors153;
    job.callee1 = ResNet50FcApply1Callee1;
    ResNet50ThreaderDo1(team89, &job);
}

static void ResNet50OneArrangeWts1Callee1(ResNet50ThreaderTask1* task14, int64_t* pt12) {
char** tensors12 = task14->any1;
ptrdiff_t b44 = pt12[0];
char*restrict wtPtr2 = tensors12[0]+(ptrdiff_t)3340*0+(ptrdiff_t)81920*0;
char*restrict biasPtr2 = tensors12[1]+(ptrdiff_t)1280*0;
char*restrict bnPtr2 = tensors12[2]+(ptrdiff_t)8*320*0;
char*restrict wtPtr3 = tensors12[3]+(ptrdiff_t)3340*0+(ptrdiff_t)81920*0;
char*restrict biasPtr3 = tensors12[4]+(ptrdiff_t)1280*0;
char*restrict bnPtr3 = tensors12[5]+(ptrdiff_t)8*320*0;
char*restrict arranged1 = tensors12[6]+(ptrdiff_t)1070080*0+(ptrdiff_t)83200*0;
ptrdiff_t ii1 = 1;
for (ptrdiff_t i12 = 0; i12 < ii1; ++i12) {
ptrdiff_t j7 = 10*b44;
ptrdiff_t jj19 = j7+10;
for (; j7 < jj19; ++j7) {
if (j7 < 16) {
ptrdiff_t k46 = 0+16*(j7-0);
ptrdiff_t l9 = (size_t)(0+k46)/6;
ptrdiff_t cut1 = (size_t)(0+k46)%6;
switch (cut1) {
case 0:;
case 2: {
__m512 sum2 = _mm512_maskz_loadu_ps(65535, biasPtr2+1280*i12+4*k46);
__m512i pmMul2 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd2 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo1 = _mm512_loadu_ps(bnPtr2+(ptrdiff_t)8*(k46+320*i12));
__m512 masHi1 = _mm512_maskz_loadu_ps(65535, bnPtr2+(ptrdiff_t)8*(k46+320*i12)+(ptrdiff_t)64);
__m512 postMul4 = _mm512_permutex2var_ps(masLo1, pmMul2, masHi1);
__m512 postAdd2 = _mm512_permutex2var_ps(masLo1, pmAdd2, masHi1);
sum2 = _mm512_fmadd_ps(sum2, postMul4, postAdd2);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)0, 63>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)1536, 4032>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)3072, 65535-(4095>>cut1), sum2);
ptrdiff_t c5 = 0;
for (; c5 != 4; ++c5) {
__m512 wt15 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)0);
__m512 wt16 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)256);
__m512 wt17 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)512);
__m512 wt18 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)768);
__m512 wt19 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)1024);
__m512 wt20 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)1280);
__m512 wt21 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)1536);
__m512 wt22 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)1792);
__m512 wt23 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)2048);
__m512 wt24 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)2304);
__m512 wt25 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)2560);
__m512 wt26 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)2816);
__m512 wt27 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)3072);
__m512 wt28 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)3328);
__m512 wt29 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)3584);
__m512 wt30 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c5+(ptrdiff_t)3840);
__m512 tmp1 = _mm512_unpacklo_ps(wt15, wt16);
__m512 tmp2 = _mm512_unpackhi_ps(wt15, wt16);
__m512 tmp3 = _mm512_unpacklo_ps(wt17, wt18);
__m512 tmp4 = _mm512_unpackhi_ps(wt17, wt18);
__m512 tmp5 = _mm512_unpacklo_ps(wt19, wt20);
__m512 tmp6 = _mm512_unpackhi_ps(wt19, wt20);
__m512 tmp7 = _mm512_unpacklo_ps(wt21, wt22);
__m512 tmp8 = _mm512_unpackhi_ps(wt21, wt22);
__m512 tmp9 = _mm512_unpacklo_ps(wt23, wt24);
__m512 tmp10 = _mm512_unpackhi_ps(wt23, wt24);
__m512 tmp11 = _mm512_unpacklo_ps(wt25, wt26);
__m512 tmp12 = _mm512_unpackhi_ps(wt25, wt26);
__m512 tmp13 = _mm512_unpacklo_ps(wt27, wt28);
__m512 tmp14 = _mm512_unpackhi_ps(wt27, wt28);
__m512 tmp15 = _mm512_unpacklo_ps(wt29, wt30);
__m512 tmp16 = _mm512_unpackhi_ps(wt29, wt30);
__m512 tmp17 = _mm512_shuffle_ps(tmp1, tmp3, 68);
__m512 tmp18 = _mm512_shuffle_ps(tmp1, tmp3, 238);
__m512 tmp19 = _mm512_shuffle_ps(tmp2, tmp4, 68);
__m512 tmp20 = _mm512_shuffle_ps(tmp2, tmp4, 238);
__m512 tmp21 = _mm512_shuffle_ps(tmp5, tmp7, 68);
__m512 tmp22 = _mm512_shuffle_ps(tmp5, tmp7, 238);
__m512 tmp23 = _mm512_shuffle_ps(tmp6, tmp8, 68);
__m512 tmp24 = _mm512_shuffle_ps(tmp6, tmp8, 238);
__m512 tmp25 = _mm512_shuffle_ps(tmp9, tmp11, 68);
__m512 tmp26 = _mm512_shuffle_ps(tmp9, tmp11, 238);
__m512 tmp27 = _mm512_shuffle_ps(tmp10, tmp12, 68);
__m512 tmp28 = _mm512_shuffle_ps(tmp10, tmp12, 238);
__m512 tmp29 = _mm512_shuffle_ps(tmp13, tmp15, 68);
__m512 tmp30 = _mm512_shuffle_ps(tmp13, tmp15, 238);
__m512 tmp31 = _mm512_shuffle_ps(tmp14, tmp16, 68);
__m512 tmp32 = _mm512_shuffle_ps(tmp14, tmp16, 238);
__m512 tmp33 = _mm512_shuffle_f32x4(tmp17, tmp21, 136);
__m512 tmp34 = _mm512_shuffle_f32x4(tmp17, tmp21, 221);
__m512 tmp35 = _mm512_shuffle_f32x4(tmp18, tmp22, 136);
__m512 tmp36 = _mm512_shuffle_f32x4(tmp18, tmp22, 221);
__m512 tmp37 = _mm512_shuffle_f32x4(tmp19, tmp23, 136);
__m512 tmp38 = _mm512_shuffle_f32x4(tmp19, tmp23, 221);
__m512 tmp39 = _mm512_shuffle_f32x4(tmp20, tmp24, 136);
__m512 tmp40 = _mm512_shuffle_f32x4(tmp20, tmp24, 221);
__m512 tmp41 = _mm512_shuffle_f32x4(tmp25, tmp29, 136);
__m512 tmp42 = _mm512_shuffle_f32x4(tmp25, tmp29, 221);
__m512 tmp43 = _mm512_shuffle_f32x4(tmp26, tmp30, 136);
__m512 tmp44 = _mm512_shuffle_f32x4(tmp26, tmp30, 221);
__m512 tmp45 = _mm512_shuffle_f32x4(tmp27, tmp31, 136);
__m512 tmp46 = _mm512_shuffle_f32x4(tmp27, tmp31, 221);
__m512 tmp47 = _mm512_shuffle_f32x4(tmp28, tmp32, 136);
__m512 tmp48 = _mm512_shuffle_f32x4(tmp28, tmp32, 221);
wt15 = _mm512_shuffle_f32x4(tmp33, tmp41, 136);
wt23 = _mm512_shuffle_f32x4(tmp33, tmp41, 221);
wt16 = _mm512_shuffle_f32x4(tmp35, tmp43, 136);
wt24 = _mm512_shuffle_f32x4(tmp35, tmp43, 221);
wt17 = _mm512_shuffle_f32x4(tmp37, tmp45, 136);
wt25 = _mm512_shuffle_f32x4(tmp37, tmp45, 221);
wt18 = _mm512_shuffle_f32x4(tmp39, tmp47, 136);
wt26 = _mm512_shuffle_f32x4(tmp39, tmp47, 221);
wt19 = _mm512_shuffle_f32x4(tmp34, tmp42, 136);
wt27 = _mm512_shuffle_f32x4(tmp34, tmp42, 221);
wt20 = _mm512_shuffle_f32x4(tmp36, tmp44, 136);
wt28 = _mm512_shuffle_f32x4(tmp36, tmp44, 221);
wt21 = _mm512_shuffle_f32x4(tmp38, tmp46, 136);
wt29 = _mm512_shuffle_f32x4(tmp38, tmp46, 221);
wt22 = _mm512_shuffle_f32x4(tmp40, tmp48, 136);
wt30 = _mm512_shuffle_f32x4(tmp40, tmp48, 221);
wt15 = _mm512_mul_ps(wt15, postMul4);
wt16 = _mm512_mul_ps(wt16, postMul4);
wt17 = _mm512_mul_ps(wt17, postMul4);
wt18 = _mm512_mul_ps(wt18, postMul4);
wt19 = _mm512_mul_ps(wt19, postMul4);
wt20 = _mm512_mul_ps(wt20, postMul4);
wt21 = _mm512_mul_ps(wt21, postMul4);
wt22 = _mm512_mul_ps(wt22, postMul4);
wt23 = _mm512_mul_ps(wt23, postMul4);
wt24 = _mm512_mul_ps(wt24, postMul4);
wt25 = _mm512_mul_ps(wt25, postMul4);
wt26 = _mm512_mul_ps(wt26, postMul4);
wt27 = _mm512_mul_ps(wt27, postMul4);
wt28 = _mm512_mul_ps(wt28, postMul4);
wt29 = _mm512_mul_ps(wt29, postMul4);
wt30 = _mm512_mul_ps(wt30, postMul4);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)0, 63>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)0, 63>>cut1, wt16);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)0, 63>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)0, 63>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)0, 63>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)0, 63>>cut1, wt20);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)0, 63>>cut1, wt21);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)0, 63>>cut1, wt22);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)0, 63>>cut1, wt23);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)0, 63>>cut1, wt24);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)0, 63>>cut1, wt25);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)0, 63>>cut1, wt26);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)0, 63>>cut1, wt27);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)0, 63>>cut1, wt28);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)0, 63>>cut1, wt29);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)0, 63>>cut1, wt30);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt16);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt20);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt21);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt22);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt23);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt24);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt25);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt26);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt27);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt28);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt29);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt30);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt15);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt16);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt17);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt18);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt19);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt20);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt21);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt22);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt23);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt24);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt25);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt26);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt27);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt28);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt29);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt30);
}
break;
}
default: {
cut1 = 4;
__m512 sum3 = _mm512_maskz_loadu_ps(65535, biasPtr2+1280*i12+4*k46);
__m512i pmMul3 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd3 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo2 = _mm512_loadu_ps(bnPtr2+(ptrdiff_t)8*(k46+320*i12));
__m512 masHi2 = _mm512_maskz_loadu_ps(65535, bnPtr2+(ptrdiff_t)8*(k46+320*i12)+(ptrdiff_t)64);
__m512 postMul5 = _mm512_permutex2var_ps(masLo2, pmMul3, masHi2);
__m512 postAdd3 = _mm512_permutex2var_ps(masLo2, pmAdd3, masHi2);
sum3 = _mm512_fmadd_ps(sum3, postMul5, postAdd3);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)0, 63>>cut1, sum3);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)1536, 4032>>cut1, sum3);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)3072, 258048>>cut1, sum3);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*0+(ptrdiff_t)4608, 65535-(262143>>cut1), sum3);
ptrdiff_t c6 = 0;
for (; c6 != 4; ++c6) {
__m512 wt31 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)0);
__m512 wt32 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)256);
__m512 wt33 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)512);
__m512 wt34 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)768);
__m512 wt35 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)1024);
__m512 wt36 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)1280);
__m512 wt37 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)1536);
__m512 wt38 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)1792);
__m512 wt39 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)2048);
__m512 wt40 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)2304);
__m512 wt41 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)2560);
__m512 wt42 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)2816);
__m512 wt43 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)3072);
__m512 wt44 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)3328);
__m512 wt45 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)3584);
__m512 wt46 = _mm512_maskz_loadu_ps(65535, wtPtr2+81920*i12+256*k46+64*c6+(ptrdiff_t)3840);
__m512 tmp49 = _mm512_unpacklo_ps(wt31, wt32);
__m512 tmp50 = _mm512_unpackhi_ps(wt31, wt32);
__m512 tmp51 = _mm512_unpacklo_ps(wt33, wt34);
__m512 tmp52 = _mm512_unpackhi_ps(wt33, wt34);
__m512 tmp53 = _mm512_unpacklo_ps(wt35, wt36);
__m512 tmp54 = _mm512_unpackhi_ps(wt35, wt36);
__m512 tmp55 = _mm512_unpacklo_ps(wt37, wt38);
__m512 tmp56 = _mm512_unpackhi_ps(wt37, wt38);
__m512 tmp57 = _mm512_unpacklo_ps(wt39, wt40);
__m512 tmp58 = _mm512_unpackhi_ps(wt39, wt40);
__m512 tmp59 = _mm512_unpacklo_ps(wt41, wt42);
__m512 tmp60 = _mm512_unpackhi_ps(wt41, wt42);
__m512 tmp61 = _mm512_unpacklo_ps(wt43, wt44);
__m512 tmp62 = _mm512_unpackhi_ps(wt43, wt44);
__m512 tmp63 = _mm512_unpacklo_ps(wt45, wt46);
__m512 tmp64 = _mm512_unpackhi_ps(wt45, wt46);
__m512 tmp65 = _mm512_shuffle_ps(tmp49, tmp51, 68);
__m512 tmp66 = _mm512_shuffle_ps(tmp49, tmp51, 238);
__m512 tmp67 = _mm512_shuffle_ps(tmp50, tmp52, 68);
__m512 tmp68 = _mm512_shuffle_ps(tmp50, tmp52, 238);
__m512 tmp69 = _mm512_shuffle_ps(tmp53, tmp55, 68);
__m512 tmp70 = _mm512_shuffle_ps(tmp53, tmp55, 238);
__m512 tmp71 = _mm512_shuffle_ps(tmp54, tmp56, 68);
__m512 tmp72 = _mm512_shuffle_ps(tmp54, tmp56, 238);
__m512 tmp73 = _mm512_shuffle_ps(tmp57, tmp59, 68);
__m512 tmp74 = _mm512_shuffle_ps(tmp57, tmp59, 238);
__m512 tmp75 = _mm512_shuffle_ps(tmp58, tmp60, 68);
__m512 tmp76 = _mm512_shuffle_ps(tmp58, tmp60, 238);
__m512 tmp77 = _mm512_shuffle_ps(tmp61, tmp63, 68);
__m512 tmp78 = _mm512_shuffle_ps(tmp61, tmp63, 238);
__m512 tmp79 = _mm512_shuffle_ps(tmp62, tmp64, 68);
__m512 tmp80 = _mm512_shuffle_ps(tmp62, tmp64, 238);
__m512 tmp81 = _mm512_shuffle_f32x4(tmp65, tmp69, 136);
__m512 tmp82 = _mm512_shuffle_f32x4(tmp65, tmp69, 221);
__m512 tmp83 = _mm512_shuffle_f32x4(tmp66, tmp70, 136);
__m512 tmp84 = _mm512_shuffle_f32x4(tmp66, tmp70, 221);
__m512 tmp85 = _mm512_shuffle_f32x4(tmp67, tmp71, 136);
__m512 tmp86 = _mm512_shuffle_f32x4(tmp67, tmp71, 221);
__m512 tmp87 = _mm512_shuffle_f32x4(tmp68, tmp72, 136);
__m512 tmp88 = _mm512_shuffle_f32x4(tmp68, tmp72, 221);
__m512 tmp89 = _mm512_shuffle_f32x4(tmp73, tmp77, 136);
__m512 tmp90 = _mm512_shuffle_f32x4(tmp73, tmp77, 221);
__m512 tmp91 = _mm512_shuffle_f32x4(tmp74, tmp78, 136);
__m512 tmp92 = _mm512_shuffle_f32x4(tmp74, tmp78, 221);
__m512 tmp93 = _mm512_shuffle_f32x4(tmp75, tmp79, 136);
__m512 tmp94 = _mm512_shuffle_f32x4(tmp75, tmp79, 221);
__m512 tmp95 = _mm512_shuffle_f32x4(tmp76, tmp80, 136);
__m512 tmp96 = _mm512_shuffle_f32x4(tmp76, tmp80, 221);
wt31 = _mm512_shuffle_f32x4(tmp81, tmp89, 136);
wt39 = _mm512_shuffle_f32x4(tmp81, tmp89, 221);
wt32 = _mm512_shuffle_f32x4(tmp83, tmp91, 136);
wt40 = _mm512_shuffle_f32x4(tmp83, tmp91, 221);
wt33 = _mm512_shuffle_f32x4(tmp85, tmp93, 136);
wt41 = _mm512_shuffle_f32x4(tmp85, tmp93, 221);
wt34 = _mm512_shuffle_f32x4(tmp87, tmp95, 136);
wt42 = _mm512_shuffle_f32x4(tmp87, tmp95, 221);
wt35 = _mm512_shuffle_f32x4(tmp82, tmp90, 136);
wt43 = _mm512_shuffle_f32x4(tmp82, tmp90, 221);
wt36 = _mm512_shuffle_f32x4(tmp84, tmp92, 136);
wt44 = _mm512_shuffle_f32x4(tmp84, tmp92, 221);
wt37 = _mm512_shuffle_f32x4(tmp86, tmp94, 136);
wt45 = _mm512_shuffle_f32x4(tmp86, tmp94, 221);
wt38 = _mm512_shuffle_f32x4(tmp88, tmp96, 136);
wt46 = _mm512_shuffle_f32x4(tmp88, tmp96, 221);
wt31 = _mm512_mul_ps(wt31, postMul5);
wt32 = _mm512_mul_ps(wt32, postMul5);
wt33 = _mm512_mul_ps(wt33, postMul5);
wt34 = _mm512_mul_ps(wt34, postMul5);
wt35 = _mm512_mul_ps(wt35, postMul5);
wt36 = _mm512_mul_ps(wt36, postMul5);
wt37 = _mm512_mul_ps(wt37, postMul5);
wt38 = _mm512_mul_ps(wt38, postMul5);
wt39 = _mm512_mul_ps(wt39, postMul5);
wt40 = _mm512_mul_ps(wt40, postMul5);
wt41 = _mm512_mul_ps(wt41, postMul5);
wt42 = _mm512_mul_ps(wt42, postMul5);
wt43 = _mm512_mul_ps(wt43, postMul5);
wt44 = _mm512_mul_ps(wt44, postMul5);
wt45 = _mm512_mul_ps(wt45, postMul5);
wt46 = _mm512_mul_ps(wt46, postMul5);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)0, 63>>cut1, wt31);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)0, 63>>cut1, wt32);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)0, 63>>cut1, wt33);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)0, 63>>cut1, wt34);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)0, 63>>cut1, wt35);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)0, 63>>cut1, wt36);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)0, 63>>cut1, wt37);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)0, 63>>cut1, wt38);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)0, 63>>cut1, wt39);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)0, 63>>cut1, wt40);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)0, 63>>cut1, wt41);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)0, 63>>cut1, wt42);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)0, 63>>cut1, wt43);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)0, 63>>cut1, wt44);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)0, 63>>cut1, wt45);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)0, 63>>cut1, wt46);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt31);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt32);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt33);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt34);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt35);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt36);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt37);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt38);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt39);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt40);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt41);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt42);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt43);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt44);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt45);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt46);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt31);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt32);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt33);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt34);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt35);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt36);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt37);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt38);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt39);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt40);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt41);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt42);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt43);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt44);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt45);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt46);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt31);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt32);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt33);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt34);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt35);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt36);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt37);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt38);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt39);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt40);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt41);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt42);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt43);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt44);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt45);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt46);
}
}
}
} else if (j7 < 19) {
ptrdiff_t k48 = 0+16*(j7-16);
ptrdiff_t l11 = (size_t)(256+k48)/6;
ptrdiff_t cut3 = (size_t)(256+k48)%6;
switch (cut3) {
case 0:;
case 2: {
__m512 sum5 = _mm512_maskz_loadu_ps(65535, biasPtr3+1280*i12+4*k48);
__m512i pmMul4 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd4 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo3 = _mm512_loadu_ps(bnPtr3+(ptrdiff_t)8*(k48+320*i12));
__m512 masHi3 = _mm512_maskz_loadu_ps(65535, bnPtr3+(ptrdiff_t)8*(k48+320*i12)+(ptrdiff_t)64);
__m512 postMul7 = _mm512_permutex2var_ps(masLo3, pmMul4, masHi3);
__m512 postAdd5 = _mm512_permutex2var_ps(masLo3, pmAdd4, masHi3);
sum5 = _mm512_fmadd_ps(sum5, postMul7, postAdd5);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)0, 63>>cut3, sum5);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)1536, 4032>>cut3, sum5);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)3072, 65535-(4095>>cut3), sum5);
ptrdiff_t c8 = 0;
for (; c8 != 4; ++c8) {
__m512 wt63 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)0);
__m512 wt64 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)256);
__m512 wt65 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)512);
__m512 wt66 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)768);
__m512 wt67 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)1024);
__m512 wt68 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)1280);
__m512 wt69 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)1536);
__m512 wt70 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)1792);
__m512 wt71 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)2048);
__m512 wt72 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)2304);
__m512 wt73 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)2560);
__m512 wt74 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)2816);
__m512 wt75 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)3072);
__m512 wt76 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)3328);
__m512 wt77 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)3584);
__m512 wt78 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c8+(ptrdiff_t)3840);
__m512 tmp97 = _mm512_unpacklo_ps(wt63, wt64);
__m512 tmp98 = _mm512_unpackhi_ps(wt63, wt64);
__m512 tmp99 = _mm512_unpacklo_ps(wt65, wt66);
__m512 tmp100 = _mm512_unpackhi_ps(wt65, wt66);
__m512 tmp101 = _mm512_unpacklo_ps(wt67, wt68);
__m512 tmp102 = _mm512_unpackhi_ps(wt67, wt68);
__m512 tmp103 = _mm512_unpacklo_ps(wt69, wt70);
__m512 tmp104 = _mm512_unpackhi_ps(wt69, wt70);
__m512 tmp105 = _mm512_unpacklo_ps(wt71, wt72);
__m512 tmp106 = _mm512_unpackhi_ps(wt71, wt72);
__m512 tmp107 = _mm512_unpacklo_ps(wt73, wt74);
__m512 tmp108 = _mm512_unpackhi_ps(wt73, wt74);
__m512 tmp109 = _mm512_unpacklo_ps(wt75, wt76);
__m512 tmp110 = _mm512_unpackhi_ps(wt75, wt76);
__m512 tmp111 = _mm512_unpacklo_ps(wt77, wt78);
__m512 tmp112 = _mm512_unpackhi_ps(wt77, wt78);
__m512 tmp113 = _mm512_shuffle_ps(tmp97, tmp99, 68);
__m512 tmp114 = _mm512_shuffle_ps(tmp97, tmp99, 238);
__m512 tmp115 = _mm512_shuffle_ps(tmp98, tmp100, 68);
__m512 tmp116 = _mm512_shuffle_ps(tmp98, tmp100, 238);
__m512 tmp117 = _mm512_shuffle_ps(tmp101, tmp103, 68);
__m512 tmp118 = _mm512_shuffle_ps(tmp101, tmp103, 238);
__m512 tmp119 = _mm512_shuffle_ps(tmp102, tmp104, 68);
__m512 tmp120 = _mm512_shuffle_ps(tmp102, tmp104, 238);
__m512 tmp121 = _mm512_shuffle_ps(tmp105, tmp107, 68);
__m512 tmp122 = _mm512_shuffle_ps(tmp105, tmp107, 238);
__m512 tmp123 = _mm512_shuffle_ps(tmp106, tmp108, 68);
__m512 tmp124 = _mm512_shuffle_ps(tmp106, tmp108, 238);
__m512 tmp125 = _mm512_shuffle_ps(tmp109, tmp111, 68);
__m512 tmp126 = _mm512_shuffle_ps(tmp109, tmp111, 238);
__m512 tmp127 = _mm512_shuffle_ps(tmp110, tmp112, 68);
__m512 tmp128 = _mm512_shuffle_ps(tmp110, tmp112, 238);
__m512 tmp129 = _mm512_shuffle_f32x4(tmp113, tmp117, 136);
__m512 tmp130 = _mm512_shuffle_f32x4(tmp113, tmp117, 221);
__m512 tmp131 = _mm512_shuffle_f32x4(tmp114, tmp118, 136);
__m512 tmp132 = _mm512_shuffle_f32x4(tmp114, tmp118, 221);
__m512 tmp133 = _mm512_shuffle_f32x4(tmp115, tmp119, 136);
__m512 tmp134 = _mm512_shuffle_f32x4(tmp115, tmp119, 221);
__m512 tmp135 = _mm512_shuffle_f32x4(tmp116, tmp120, 136);
__m512 tmp136 = _mm512_shuffle_f32x4(tmp116, tmp120, 221);
__m512 tmp137 = _mm512_shuffle_f32x4(tmp121, tmp125, 136);
__m512 tmp138 = _mm512_shuffle_f32x4(tmp121, tmp125, 221);
__m512 tmp139 = _mm512_shuffle_f32x4(tmp122, tmp126, 136);
__m512 tmp140 = _mm512_shuffle_f32x4(tmp122, tmp126, 221);
__m512 tmp141 = _mm512_shuffle_f32x4(tmp123, tmp127, 136);
__m512 tmp142 = _mm512_shuffle_f32x4(tmp123, tmp127, 221);
__m512 tmp143 = _mm512_shuffle_f32x4(tmp124, tmp128, 136);
__m512 tmp144 = _mm512_shuffle_f32x4(tmp124, tmp128, 221);
wt63 = _mm512_shuffle_f32x4(tmp129, tmp137, 136);
wt71 = _mm512_shuffle_f32x4(tmp129, tmp137, 221);
wt64 = _mm512_shuffle_f32x4(tmp131, tmp139, 136);
wt72 = _mm512_shuffle_f32x4(tmp131, tmp139, 221);
wt65 = _mm512_shuffle_f32x4(tmp133, tmp141, 136);
wt73 = _mm512_shuffle_f32x4(tmp133, tmp141, 221);
wt66 = _mm512_shuffle_f32x4(tmp135, tmp143, 136);
wt74 = _mm512_shuffle_f32x4(tmp135, tmp143, 221);
wt67 = _mm512_shuffle_f32x4(tmp130, tmp138, 136);
wt75 = _mm512_shuffle_f32x4(tmp130, tmp138, 221);
wt68 = _mm512_shuffle_f32x4(tmp132, tmp140, 136);
wt76 = _mm512_shuffle_f32x4(tmp132, tmp140, 221);
wt69 = _mm512_shuffle_f32x4(tmp134, tmp142, 136);
wt77 = _mm512_shuffle_f32x4(tmp134, tmp142, 221);
wt70 = _mm512_shuffle_f32x4(tmp136, tmp144, 136);
wt78 = _mm512_shuffle_f32x4(tmp136, tmp144, 221);
wt63 = _mm512_mul_ps(wt63, postMul7);
wt64 = _mm512_mul_ps(wt64, postMul7);
wt65 = _mm512_mul_ps(wt65, postMul7);
wt66 = _mm512_mul_ps(wt66, postMul7);
wt67 = _mm512_mul_ps(wt67, postMul7);
wt68 = _mm512_mul_ps(wt68, postMul7);
wt69 = _mm512_mul_ps(wt69, postMul7);
wt70 = _mm512_mul_ps(wt70, postMul7);
wt71 = _mm512_mul_ps(wt71, postMul7);
wt72 = _mm512_mul_ps(wt72, postMul7);
wt73 = _mm512_mul_ps(wt73, postMul7);
wt74 = _mm512_mul_ps(wt74, postMul7);
wt75 = _mm512_mul_ps(wt75, postMul7);
wt76 = _mm512_mul_ps(wt76, postMul7);
wt77 = _mm512_mul_ps(wt77, postMul7);
wt78 = _mm512_mul_ps(wt78, postMul7);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c8)+(ptrdiff_t)0, 63>>cut3, wt63);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c8)+(ptrdiff_t)0, 63>>cut3, wt64);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c8)+(ptrdiff_t)0, 63>>cut3, wt65);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c8)+(ptrdiff_t)0, 63>>cut3, wt66);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c8)+(ptrdiff_t)0, 63>>cut3, wt67);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c8)+(ptrdiff_t)0, 63>>cut3, wt68);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c8)+(ptrdiff_t)0, 63>>cut3, wt69);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c8)+(ptrdiff_t)0, 63>>cut3, wt70);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c8)+(ptrdiff_t)0, 63>>cut3, wt71);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c8)+(ptrdiff_t)0, 63>>cut3, wt72);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c8)+(ptrdiff_t)0, 63>>cut3, wt73);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c8)+(ptrdiff_t)0, 63>>cut3, wt74);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c8)+(ptrdiff_t)0, 63>>cut3, wt75);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c8)+(ptrdiff_t)0, 63>>cut3, wt76);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c8)+(ptrdiff_t)0, 63>>cut3, wt77);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c8)+(ptrdiff_t)0, 63>>cut3, wt78);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt63);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt64);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt65);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt66);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt67);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt68);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt69);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt70);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt71);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt72);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt73);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt74);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt75);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt76);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt77);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c8)+(ptrdiff_t)1536, 4032>>cut3, wt78);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt63);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt64);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt65);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt66);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt67);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt68);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt69);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt70);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt71);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt72);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt73);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt74);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt75);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt76);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt77);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c8)+(ptrdiff_t)3072, 65535-(4095>>cut3), wt78);
}
break;
}
default: {
cut3 = 4;
__m512 sum6 = _mm512_maskz_loadu_ps(65535, biasPtr3+1280*i12+4*k48);
__m512i pmMul5 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd5 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo4 = _mm512_loadu_ps(bnPtr3+(ptrdiff_t)8*(k48+320*i12));
__m512 masHi4 = _mm512_maskz_loadu_ps(65535, bnPtr3+(ptrdiff_t)8*(k48+320*i12)+(ptrdiff_t)64);
__m512 postMul8 = _mm512_permutex2var_ps(masLo4, pmMul5, masHi4);
__m512 postAdd6 = _mm512_permutex2var_ps(masLo4, pmAdd5, masHi4);
sum6 = _mm512_fmadd_ps(sum6, postMul8, postAdd6);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)0, 63>>cut3, sum6);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)1536, 4032>>cut3, sum6);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)3072, 258048>>cut3, sum6);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*0+(ptrdiff_t)4608, 65535-(262143>>cut3), sum6);
ptrdiff_t c9 = 0;
for (; c9 != 4; ++c9) {
__m512 wt79 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)0);
__m512 wt80 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)256);
__m512 wt81 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)512);
__m512 wt82 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)768);
__m512 wt83 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)1024);
__m512 wt84 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)1280);
__m512 wt85 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)1536);
__m512 wt86 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)1792);
__m512 wt87 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)2048);
__m512 wt88 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)2304);
__m512 wt89 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)2560);
__m512 wt90 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)2816);
__m512 wt91 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)3072);
__m512 wt92 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)3328);
__m512 wt93 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)3584);
__m512 wt94 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k48+64*c9+(ptrdiff_t)3840);
__m512 tmp145 = _mm512_unpacklo_ps(wt79, wt80);
__m512 tmp146 = _mm512_unpackhi_ps(wt79, wt80);
__m512 tmp147 = _mm512_unpacklo_ps(wt81, wt82);
__m512 tmp148 = _mm512_unpackhi_ps(wt81, wt82);
__m512 tmp149 = _mm512_unpacklo_ps(wt83, wt84);
__m512 tmp150 = _mm512_unpackhi_ps(wt83, wt84);
__m512 tmp151 = _mm512_unpacklo_ps(wt85, wt86);
__m512 tmp152 = _mm512_unpackhi_ps(wt85, wt86);
__m512 tmp153 = _mm512_unpacklo_ps(wt87, wt88);
__m512 tmp154 = _mm512_unpackhi_ps(wt87, wt88);
__m512 tmp155 = _mm512_unpacklo_ps(wt89, wt90);
__m512 tmp156 = _mm512_unpackhi_ps(wt89, wt90);
__m512 tmp157 = _mm512_unpacklo_ps(wt91, wt92);
__m512 tmp158 = _mm512_unpackhi_ps(wt91, wt92);
__m512 tmp159 = _mm512_unpacklo_ps(wt93, wt94);
__m512 tmp160 = _mm512_unpackhi_ps(wt93, wt94);
__m512 tmp161 = _mm512_shuffle_ps(tmp145, tmp147, 68);
__m512 tmp162 = _mm512_shuffle_ps(tmp145, tmp147, 238);
__m512 tmp163 = _mm512_shuffle_ps(tmp146, tmp148, 68);
__m512 tmp164 = _mm512_shuffle_ps(tmp146, tmp148, 238);
__m512 tmp165 = _mm512_shuffle_ps(tmp149, tmp151, 68);
__m512 tmp166 = _mm512_shuffle_ps(tmp149, tmp151, 238);
__m512 tmp167 = _mm512_shuffle_ps(tmp150, tmp152, 68);
__m512 tmp168 = _mm512_shuffle_ps(tmp150, tmp152, 238);
__m512 tmp169 = _mm512_shuffle_ps(tmp153, tmp155, 68);
__m512 tmp170 = _mm512_shuffle_ps(tmp153, tmp155, 238);
__m512 tmp171 = _mm512_shuffle_ps(tmp154, tmp156, 68);
__m512 tmp172 = _mm512_shuffle_ps(tmp154, tmp156, 238);
__m512 tmp173 = _mm512_shuffle_ps(tmp157, tmp159, 68);
__m512 tmp174 = _mm512_shuffle_ps(tmp157, tmp159, 238);
__m512 tmp175 = _mm512_shuffle_ps(tmp158, tmp160, 68);
__m512 tmp176 = _mm512_shuffle_ps(tmp158, tmp160, 238);
__m512 tmp177 = _mm512_shuffle_f32x4(tmp161, tmp165, 136);
__m512 tmp178 = _mm512_shuffle_f32x4(tmp161, tmp165, 221);
__m512 tmp179 = _mm512_shuffle_f32x4(tmp162, tmp166, 136);
__m512 tmp180 = _mm512_shuffle_f32x4(tmp162, tmp166, 221);
__m512 tmp181 = _mm512_shuffle_f32x4(tmp163, tmp167, 136);
__m512 tmp182 = _mm512_shuffle_f32x4(tmp163, tmp167, 221);
__m512 tmp183 = _mm512_shuffle_f32x4(tmp164, tmp168, 136);
__m512 tmp184 = _mm512_shuffle_f32x4(tmp164, tmp168, 221);
__m512 tmp185 = _mm512_shuffle_f32x4(tmp169, tmp173, 136);
__m512 tmp186 = _mm512_shuffle_f32x4(tmp169, tmp173, 221);
__m512 tmp187 = _mm512_shuffle_f32x4(tmp170, tmp174, 136);
__m512 tmp188 = _mm512_shuffle_f32x4(tmp170, tmp174, 221);
__m512 tmp189 = _mm512_shuffle_f32x4(tmp171, tmp175, 136);
__m512 tmp190 = _mm512_shuffle_f32x4(tmp171, tmp175, 221);
__m512 tmp191 = _mm512_shuffle_f32x4(tmp172, tmp176, 136);
__m512 tmp192 = _mm512_shuffle_f32x4(tmp172, tmp176, 221);
wt79 = _mm512_shuffle_f32x4(tmp177, tmp185, 136);
wt87 = _mm512_shuffle_f32x4(tmp177, tmp185, 221);
wt80 = _mm512_shuffle_f32x4(tmp179, tmp187, 136);
wt88 = _mm512_shuffle_f32x4(tmp179, tmp187, 221);
wt81 = _mm512_shuffle_f32x4(tmp181, tmp189, 136);
wt89 = _mm512_shuffle_f32x4(tmp181, tmp189, 221);
wt82 = _mm512_shuffle_f32x4(tmp183, tmp191, 136);
wt90 = _mm512_shuffle_f32x4(tmp183, tmp191, 221);
wt83 = _mm512_shuffle_f32x4(tmp178, tmp186, 136);
wt91 = _mm512_shuffle_f32x4(tmp178, tmp186, 221);
wt84 = _mm512_shuffle_f32x4(tmp180, tmp188, 136);
wt92 = _mm512_shuffle_f32x4(tmp180, tmp188, 221);
wt85 = _mm512_shuffle_f32x4(tmp182, tmp190, 136);
wt93 = _mm512_shuffle_f32x4(tmp182, tmp190, 221);
wt86 = _mm512_shuffle_f32x4(tmp184, tmp192, 136);
wt94 = _mm512_shuffle_f32x4(tmp184, tmp192, 221);
wt79 = _mm512_mul_ps(wt79, postMul8);
wt80 = _mm512_mul_ps(wt80, postMul8);
wt81 = _mm512_mul_ps(wt81, postMul8);
wt82 = _mm512_mul_ps(wt82, postMul8);
wt83 = _mm512_mul_ps(wt83, postMul8);
wt84 = _mm512_mul_ps(wt84, postMul8);
wt85 = _mm512_mul_ps(wt85, postMul8);
wt86 = _mm512_mul_ps(wt86, postMul8);
wt87 = _mm512_mul_ps(wt87, postMul8);
wt88 = _mm512_mul_ps(wt88, postMul8);
wt89 = _mm512_mul_ps(wt89, postMul8);
wt90 = _mm512_mul_ps(wt90, postMul8);
wt91 = _mm512_mul_ps(wt91, postMul8);
wt92 = _mm512_mul_ps(wt92, postMul8);
wt93 = _mm512_mul_ps(wt93, postMul8);
wt94 = _mm512_mul_ps(wt94, postMul8);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c9)+(ptrdiff_t)0, 63>>cut3, wt79);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c9)+(ptrdiff_t)0, 63>>cut3, wt80);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c9)+(ptrdiff_t)0, 63>>cut3, wt81);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c9)+(ptrdiff_t)0, 63>>cut3, wt82);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c9)+(ptrdiff_t)0, 63>>cut3, wt83);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c9)+(ptrdiff_t)0, 63>>cut3, wt84);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c9)+(ptrdiff_t)0, 63>>cut3, wt85);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c9)+(ptrdiff_t)0, 63>>cut3, wt86);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c9)+(ptrdiff_t)0, 63>>cut3, wt87);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c9)+(ptrdiff_t)0, 63>>cut3, wt88);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c9)+(ptrdiff_t)0, 63>>cut3, wt89);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c9)+(ptrdiff_t)0, 63>>cut3, wt90);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c9)+(ptrdiff_t)0, 63>>cut3, wt91);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c9)+(ptrdiff_t)0, 63>>cut3, wt92);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c9)+(ptrdiff_t)0, 63>>cut3, wt93);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c9)+(ptrdiff_t)0, 63>>cut3, wt94);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt79);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt80);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt81);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt82);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt83);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt84);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt85);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt86);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt87);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt88);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt89);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt90);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt91);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt92);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt93);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c9)+(ptrdiff_t)1536, 4032>>cut3, wt94);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt79);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt80);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt81);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt82);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt83);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt84);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt85);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt86);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt87);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt88);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt89);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt90);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt91);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt92);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt93);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c9)+(ptrdiff_t)3072, 258048>>cut3, wt94);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(1+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt79);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(2+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt80);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(3+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt81);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(4+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt82);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(5+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt83);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(6+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt84);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(7+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt85);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(8+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt86);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(9+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt87);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(10+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt88);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(11+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt89);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(12+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt90);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(13+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt91);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(14+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt92);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(15+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt93);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l11+4*cut3+24*(16+16*c9)+(ptrdiff_t)4608, 65535-(262143>>cut3), wt94);
}
}
}
} else {
ptrdiff_t k47 = 48;
ptrdiff_t l10 = (size_t)(256+k47)/6;
ptrdiff_t cut2 = (size_t)(256+k47)%6;
__m512 sum4 = _mm512_maskz_loadu_ps(65535, biasPtr3+1280*i12+4*k47);
__m512i pmMul6 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd6 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo5 = _mm512_loadu_ps(bnPtr3+(ptrdiff_t)8*(k47+320*i12));
__m512 masHi5 = _mm512_maskz_loadu_ps(65535, bnPtr3+(ptrdiff_t)8*(k47+320*i12)+(ptrdiff_t)64);
__m512 postMul6 = _mm512_permutex2var_ps(masLo5, pmMul6, masHi5);
__m512 postAdd4 = _mm512_permutex2var_ps(masLo5, pmAdd6, masHi5);
sum4 = _mm512_fmadd_ps(sum4, postMul6, postAdd4);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*0+(ptrdiff_t)0, 63>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*0+(ptrdiff_t)1536, 4032>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*0+(ptrdiff_t)3072, 258048>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*0+(ptrdiff_t)4608, 65535-(262143>>cut2), sum4);
ptrdiff_t c7 = 0;
for (; c7 != 4; ++c7) {
__m512 wt47 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)0);
__m512 wt48 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)256);
__m512 wt49 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)512);
__m512 wt50 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)768);
__m512 wt51 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)1024);
__m512 wt52 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)1280);
__m512 wt53 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)1536);
__m512 wt54 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)1792);
__m512 wt55 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)2048);
__m512 wt56 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)2304);
__m512 wt57 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)2560);
__m512 wt58 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)2816);
__m512 wt59 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)3072);
__m512 wt60 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)3328);
__m512 wt61 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)3584);
__m512 wt62 = _mm512_maskz_loadu_ps(65535, wtPtr3+81920*i12+256*k47+64*c7+(ptrdiff_t)3840);
__m512 tmp193 = _mm512_unpacklo_ps(wt47, wt48);
__m512 tmp194 = _mm512_unpackhi_ps(wt47, wt48);
__m512 tmp195 = _mm512_unpacklo_ps(wt49, wt50);
__m512 tmp196 = _mm512_unpackhi_ps(wt49, wt50);
__m512 tmp197 = _mm512_unpacklo_ps(wt51, wt52);
__m512 tmp198 = _mm512_unpackhi_ps(wt51, wt52);
__m512 tmp199 = _mm512_unpacklo_ps(wt53, wt54);
__m512 tmp200 = _mm512_unpackhi_ps(wt53, wt54);
__m512 tmp201 = _mm512_unpacklo_ps(wt55, wt56);
__m512 tmp202 = _mm512_unpackhi_ps(wt55, wt56);
__m512 tmp203 = _mm512_unpacklo_ps(wt57, wt58);
__m512 tmp204 = _mm512_unpackhi_ps(wt57, wt58);
__m512 tmp205 = _mm512_unpacklo_ps(wt59, wt60);
__m512 tmp206 = _mm512_unpackhi_ps(wt59, wt60);
__m512 tmp207 = _mm512_unpacklo_ps(wt61, wt62);
__m512 tmp208 = _mm512_unpackhi_ps(wt61, wt62);
__m512 tmp209 = _mm512_shuffle_ps(tmp193, tmp195, 68);
__m512 tmp210 = _mm512_shuffle_ps(tmp193, tmp195, 238);
__m512 tmp211 = _mm512_shuffle_ps(tmp194, tmp196, 68);
__m512 tmp212 = _mm512_shuffle_ps(tmp194, tmp196, 238);
__m512 tmp213 = _mm512_shuffle_ps(tmp197, tmp199, 68);
__m512 tmp214 = _mm512_shuffle_ps(tmp197, tmp199, 238);
__m512 tmp215 = _mm512_shuffle_ps(tmp198, tmp200, 68);
__m512 tmp216 = _mm512_shuffle_ps(tmp198, tmp200, 238);
__m512 tmp217 = _mm512_shuffle_ps(tmp201, tmp203, 68);
__m512 tmp218 = _mm512_shuffle_ps(tmp201, tmp203, 238);
__m512 tmp219 = _mm512_shuffle_ps(tmp202, tmp204, 68);
__m512 tmp220 = _mm512_shuffle_ps(tmp202, tmp204, 238);
__m512 tmp221 = _mm512_shuffle_ps(tmp205, tmp207, 68);
__m512 tmp222 = _mm512_shuffle_ps(tmp205, tmp207, 238);
__m512 tmp223 = _mm512_shuffle_ps(tmp206, tmp208, 68);
__m512 tmp224 = _mm512_shuffle_ps(tmp206, tmp208, 238);
__m512 tmp225 = _mm512_shuffle_f32x4(tmp209, tmp213, 136);
__m512 tmp226 = _mm512_shuffle_f32x4(tmp209, tmp213, 221);
__m512 tmp227 = _mm512_shuffle_f32x4(tmp210, tmp214, 136);
__m512 tmp228 = _mm512_shuffle_f32x4(tmp210, tmp214, 221);
__m512 tmp229 = _mm512_shuffle_f32x4(tmp211, tmp215, 136);
__m512 tmp230 = _mm512_shuffle_f32x4(tmp211, tmp215, 221);
__m512 tmp231 = _mm512_shuffle_f32x4(tmp212, tmp216, 136);
__m512 tmp232 = _mm512_shuffle_f32x4(tmp212, tmp216, 221);
__m512 tmp233 = _mm512_shuffle_f32x4(tmp217, tmp221, 136);
__m512 tmp234 = _mm512_shuffle_f32x4(tmp217, tmp221, 221);
__m512 tmp235 = _mm512_shuffle_f32x4(tmp218, tmp222, 136);
__m512 tmp236 = _mm512_shuffle_f32x4(tmp218, tmp222, 221);
__m512 tmp237 = _mm512_shuffle_f32x4(tmp219, tmp223, 136);
__m512 tmp238 = _mm512_shuffle_f32x4(tmp219, tmp223, 221);
__m512 tmp239 = _mm512_shuffle_f32x4(tmp220, tmp224, 136);
__m512 tmp240 = _mm512_shuffle_f32x4(tmp220, tmp224, 221);
wt47 = _mm512_shuffle_f32x4(tmp225, tmp233, 136);
wt55 = _mm512_shuffle_f32x4(tmp225, tmp233, 221);
wt48 = _mm512_shuffle_f32x4(tmp227, tmp235, 136);
wt56 = _mm512_shuffle_f32x4(tmp227, tmp235, 221);
wt49 = _mm512_shuffle_f32x4(tmp229, tmp237, 136);
wt57 = _mm512_shuffle_f32x4(tmp229, tmp237, 221);
wt50 = _mm512_shuffle_f32x4(tmp231, tmp239, 136);
wt58 = _mm512_shuffle_f32x4(tmp231, tmp239, 221);
wt51 = _mm512_shuffle_f32x4(tmp226, tmp234, 136);
wt59 = _mm512_shuffle_f32x4(tmp226, tmp234, 221);
wt52 = _mm512_shuffle_f32x4(tmp228, tmp236, 136);
wt60 = _mm512_shuffle_f32x4(tmp228, tmp236, 221);
wt53 = _mm512_shuffle_f32x4(tmp230, tmp238, 136);
wt61 = _mm512_shuffle_f32x4(tmp230, tmp238, 221);
wt54 = _mm512_shuffle_f32x4(tmp232, tmp240, 136);
wt62 = _mm512_shuffle_f32x4(tmp232, tmp240, 221);
wt47 = _mm512_mul_ps(wt47, postMul6);
wt48 = _mm512_mul_ps(wt48, postMul6);
wt49 = _mm512_mul_ps(wt49, postMul6);
wt50 = _mm512_mul_ps(wt50, postMul6);
wt51 = _mm512_mul_ps(wt51, postMul6);
wt52 = _mm512_mul_ps(wt52, postMul6);
wt53 = _mm512_mul_ps(wt53, postMul6);
wt54 = _mm512_mul_ps(wt54, postMul6);
wt55 = _mm512_mul_ps(wt55, postMul6);
wt56 = _mm512_mul_ps(wt56, postMul6);
wt57 = _mm512_mul_ps(wt57, postMul6);
wt58 = _mm512_mul_ps(wt58, postMul6);
wt59 = _mm512_mul_ps(wt59, postMul6);
wt60 = _mm512_mul_ps(wt60, postMul6);
wt61 = _mm512_mul_ps(wt61, postMul6);
wt62 = _mm512_mul_ps(wt62, postMul6);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)0, 63>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)0, 63>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)0, 63>>cut2, wt49);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)0, 63>>cut2, wt50);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)0, 63>>cut2, wt51);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)0, 63>>cut2, wt52);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)0, 63>>cut2, wt53);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)0, 63>>cut2, wt54);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)0, 63>>cut2, wt55);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)0, 63>>cut2, wt56);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)0, 63>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)0, 63>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)0, 63>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)0, 63>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)0, 63>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)0, 63>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt49);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt50);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt51);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt52);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt53);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt54);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt55);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt56);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt49);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt50);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt51);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt52);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt53);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt54);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt55);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt56);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)3072, 258048>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(1+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt47);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(2+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt48);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(3+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt49);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(4+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt50);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(5+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt51);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(6+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt52);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(7+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt53);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(8+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt54);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(9+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt55);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(10+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt56);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(11+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt57);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(12+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt58);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(13+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt59);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(14+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt60);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(15+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt61);
_mm512_mask_storeu_ps(arranged1+83200*i12+1560*l10+4*cut2+8*(16+16*c7)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt62);
}
}
}
}
}

/*
 * Dispatches ResNet50OneArrangeWts1Callee1 through the threader.
 *
 * team19    - threader team handed straight to ResNet50ThreaderDo1.
 * tensors11 - buffer table forwarded untouched to the callee via task.any1.
 *
 * The task is described as 3-dimensional with hull {2, 1, 1}
 * (presumably the per-dimension extent of the work grid - confirm
 * against the threader implementation).
 */
static void ResNet50OneArrangeWts1(ResNet50ThreaderTeam1* team19, char** tensors11) {
    static const int64_t extents[3] = {2, 1, 1};
    ResNet50ThreaderTask1 job;
    job.any1 = tensors11;
    job.callee1 = ResNet50OneArrangeWts1Callee1;
    job.nd1 = 3;
    for (int dim = 0; dim < 3; ++dim) {
        job.hull1[dim] = extents[dim];
    }
    ResNet50ThreaderDo1(team19, &job);
}

/*
 * Worker that re-lays-out float data from a plane-major buffer into a
 * tile-major buffer.
 *
 * task16->any1 is read as a char** table: entry 0 is the source buffer,
 * entry 1 the destination buffer.  pt13[1] selects the work slot.
 *
 * Each slot covers tiles 2*slot .. 2*slot+1; slot 23 additionally takes
 * one more tile (46..48), and iteration never proceeds past tile 48.
 * A tile is 256 bytes (four 16-float vectors).  For each of 64 planes
 * (presumably input channels - confirm against the caller) the tile is
 * read at a plane stride of 12608 bytes and written so that the 64
 * planes of one tile are contiguous (tile stride 16384, plane stride 256).
 */
static void ResNet50OneArrangeDats1Callee1(ResNet50ThreaderTask1* task16, int64_t* pt13) {
    char** bufs = task16->any1;
    const ptrdiff_t slot = pt13[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];
    ptrdiff_t tile = 2*slot;
    const ptrdiff_t lastTile = tile+(slot < 23 ? 1 : 2);
    while (tile != 49) {
        for (ptrdiff_t plane = 0; plane < 64; ++plane) {
            const char* from = src+256*tile+12608*plane;
            char* to = dst+16384*tile+256*plane;
            /* Load the whole 256-byte tile before writing any of it. */
            __m512 v0 = _mm512_maskz_loadu_ps(0xFFFF, from);
            __m512 v1 = _mm512_maskz_loadu_ps(0xFFFF, from+64);
            __m512 v2 = _mm512_maskz_loadu_ps(0xFFFF, from+128);
            __m512 v3 = _mm512_maskz_loadu_ps(0xFFFF, from+192);
            _mm512_mask_storeu_ps(to, 0xFFFF, v0);
            _mm512_mask_storeu_ps(to+64, 0xFFFF, v1);
            _mm512_mask_storeu_ps(to+128, 0xFFFF, v2);
            _mm512_mask_storeu_ps(to+192, 0xFFFF, v3);
        }
        /* Stop once this slot's final tile has been handled. */
        if (tile >= lastTile) break;
        ++tile;
    }
}

/*
 * Dispatches ResNet50OneArrangeDats1Callee1 through the threader.
 *
 * team20    - threader team handed straight to ResNet50ThreaderDo1.
 * tensors13 - buffer table forwarded untouched to the callee via task.any1.
 *
 * The task is described as 4-dimensional with hull {1, 24, 1, 1}; the
 * callee reads only coordinate 1, whose 24 values match its slot logic
 * (presumably hull gives the per-dimension extent - confirm against the
 * threader implementation).
 */
static void ResNet50OneArrangeDats1(ResNet50ThreaderTeam1* team20, char** tensors13) {
    static const int64_t extents[4] = {1, 24, 1, 1};
    ResNet50ThreaderTask1 job;
    job.any1 = tensors13;
    job.callee1 = ResNet50OneArrangeDats1Callee1;
    job.nd1 = 4;
    for (int dim = 0; dim < 4; ++dim) {
        job.hull1[dim] = extents[dim];
    }
    ResNet50ThreaderDo1(team20, &job);
}

/*
 * Worker computing one block of a 64-term multiply-accumulate over
 * arranged weights and arranged data.
 *
 * task18->any1 is read as a void*[]; element 0 is a char** table holding
 *   [0] arranged weights, [1] arranged data, [2] output buffer.
 * pt14[1] selects the 256-byte data tile (64 floats); pt14[0] selects a
 * slab of weight groups starting at group 9*pt14[0].
 *
 * Weight groups are 1560 bytes apart.  Groups 0..52 each produce 6 output
 * planes: the group starts with 6 seed floats (presumably folded bias -
 * confirm against the arrange-weights routine) followed by 64 records of
 * 6 floats.  Group 53 produces only 2 output planes and uses 2-float
 * records.  Output planes are 12608 bytes apart; a full group advances
 * the output by 6 planes (75648 bytes).
 *
 * A slab processes at most 9 groups and returns; the slab that reaches
 * group 53 finishes with the 2-plane tail.  Tile index 49 does nothing.
 */
static void ResNet50OneApply1Callee1(ResNet50ThreaderTask1* task18, int64_t* pt14) {
    void** ctx = task18->any1;
    char** bufs = ctx[0];
    const ptrdiff_t tile = pt14[1];
    const ptrdiff_t slab = pt14[0];
    char*restrict wts = bufs[0];
    char*restrict dats = bufs[1];
    char*restrict out = bufs[2];
    if (tile == 49) return;

    const char* xTile = dats+16384*tile;   /* 64 records of 256 bytes for this tile */
    char* yTile = out+256*tile;            /* this tile's position inside each output plane */
    ptrdiff_t grp = 9*slab;
    const ptrdiff_t grpLast = grp+8;

    for (; grp != 53; ++grp) {
        const float* w = (const float*)(wts+1560*grp);
        char* y = yTile+75648*grp;

        /* Six output rows x four vectors, every lane seeded with the row's leading float. */
        __m512 r0a = _mm512_set1_ps(w[0]), r0b = r0a, r0c = r0a, r0d = r0a;
        __m512 r1a = _mm512_set1_ps(w[1]), r1b = r1a, r1c = r1a, r1d = r1a;
        __m512 r2a = _mm512_set1_ps(w[2]), r2b = r2a, r2c = r2a, r2d = r2a;
        __m512 r3a = _mm512_set1_ps(w[3]), r3b = r3a, r3c = r3a, r3d = r3a;
        __m512 r4a = _mm512_set1_ps(w[4]), r4b = r4a, r4c = r4a, r4d = r4a;
        __m512 r5a = _mm512_set1_ps(w[5]), r5b = r5a, r5c = r5a, r5d = r5a;

        for (ptrdiff_t s = 0; s < 64; ++s) {
            const char* x = xTile+256*s;
            const float* ws = w+6*(s+1);   /* skip the 6 seed floats */
            const __m512 xa = _mm512_loadu_ps(x);
            const __m512 xb = _mm512_loadu_ps(x+64);
            const __m512 xc = _mm512_loadu_ps(x+128);
            const __m512 xd = _mm512_loadu_ps(x+192);
            __m512 b;

            b = _mm512_set1_ps(ws[0]);
            r0a = _mm512_fmadd_ps(b, xa, r0a);
            r0b = _mm512_fmadd_ps(b, xb, r0b);
            r0c = _mm512_fmadd_ps(b, xc, r0c);
            r0d = _mm512_fmadd_ps(b, xd, r0d);

            b = _mm512_set1_ps(ws[1]);
            r1a = _mm512_fmadd_ps(b, xa, r1a);
            r1b = _mm512_fmadd_ps(b, xb, r1b);
            r1c = _mm512_fmadd_ps(b, xc, r1c);
            r1d = _mm512_fmadd_ps(b, xd, r1d);

            b = _mm512_set1_ps(ws[2]);
            r2a = _mm512_fmadd_ps(b, xa, r2a);
            r2b = _mm512_fmadd_ps(b, xb, r2b);
            r2c = _mm512_fmadd_ps(b, xc, r2c);
            r2d = _mm512_fmadd_ps(b, xd, r2d);

            b = _mm512_set1_ps(ws[3]);
            r3a = _mm512_fmadd_ps(b, xa, r3a);
            r3b = _mm512_fmadd_ps(b, xb, r3b);
            r3c = _mm512_fmadd_ps(b, xc, r3c);
            r3d = _mm512_fmadd_ps(b, xd, r3d);

            b = _mm512_set1_ps(ws[4]);
            r4a = _mm512_fmadd_ps(b, xa, r4a);
            r4b = _mm512_fmadd_ps(b, xb, r4b);
            r4c = _mm512_fmadd_ps(b, xc, r4c);
            r4d = _mm512_fmadd_ps(b, xd, r4d);

            b = _mm512_set1_ps(ws[5]);
            r5a = _mm512_fmadd_ps(b, xa, r5a);
            r5b = _mm512_fmadd_ps(b, xb, r5b);
            r5c = _mm512_fmadd_ps(b, xc, r5c);
            r5d = _mm512_fmadd_ps(b, xd, r5d);
        }

        /* Row r lands 12608*r bytes into this group's output span. */
        _mm512_mask_storeu_ps(y+0*12608+0, 0xFFFF, r0a);
        _mm512_mask_storeu_ps(y+0*12608+64, 0xFFFF, r0b);
        _mm512_mask_storeu_ps(y+0*12608+128, 0xFFFF, r0c);
        _mm512_mask_storeu_ps(y+0*12608+192, 0xFFFF, r0d);
        _mm512_mask_storeu_ps(y+1*12608+0, 0xFFFF, r1a);
        _mm512_mask_storeu_ps(y+1*12608+64, 0xFFFF, r1b);
        _mm512_mask_storeu_ps(y+1*12608+128, 0xFFFF, r1c);
        _mm512_mask_storeu_ps(y+1*12608+192, 0xFFFF, r1d);
        _mm512_mask_storeu_ps(y+2*12608+0, 0xFFFF, r2a);
        _mm512_mask_storeu_ps(y+2*12608+64, 0xFFFF, r2b);
        _mm512_mask_storeu_ps(y+2*12608+128, 0xFFFF, r2c);
        _mm512_mask_storeu_ps(y+2*12608+192, 0xFFFF, r2d);
        _mm512_mask_storeu_ps(y+3*12608+0, 0xFFFF, r3a);
        _mm512_mask_storeu_ps(y+3*12608+64, 0xFFFF, r3b);
        _mm512_mask_storeu_ps(y+3*12608+128, 0xFFFF, r3c);
        _mm512_mask_storeu_ps(y+3*12608+192, 0xFFFF, r3d);
        _mm512_mask_storeu_ps(y+4*12608+0, 0xFFFF, r4a);
        _mm512_mask_storeu_ps(y+4*12608+64, 0xFFFF, r4b);
        _mm512_mask_storeu_ps(y+4*12608+128, 0xFFFF, r4c);
        _mm512_mask_storeu_ps(y+4*12608+192, 0xFFFF, r4d);
        _mm512_mask_storeu_ps(y+5*12608+0, 0xFFFF, r5a);
        _mm512_mask_storeu_ps(y+5*12608+64, 0xFFFF, r5b);
        _mm512_mask_storeu_ps(y+5*12608+128, 0xFFFF, r5c);
        _mm512_mask_storeu_ps(y+5*12608+192, 0xFFFF, r5d);

        /* A slab ends after its ninth group; only a slab that walks into group 53 continues to the tail. */
        if (grp >= grpLast) return;
    }

    /* Tail group (grp == 53 here): two output rows, 2-float weight records. */
    const float* wTail = (const float*)(wts+1560*grp);
    char* yTail = yTile+75648*grp;
    __m512 t0a = _mm512_set1_ps(wTail[0]), t0b = t0a, t0c = t0a, t0d = t0a;
    __m512 t1a = _mm512_set1_ps(wTail[1]), t1b = t1a, t1c = t1a, t1d = t1a;

    for (ptrdiff_t s = 0; s < 64; ++s) {
        const char* x = xTile+256*s;
        const float* ws = wTail+2*(s+1);   /* skip the 2 seed floats */
        const __m512 xa = _mm512_loadu_ps(x);
        const __m512 xb = _mm512_loadu_ps(x+64);
        const __m512 xc = _mm512_loadu_ps(x+128);
        const __m512 xd = _mm512_loadu_ps(x+192);
        __m512 b;

        b = _mm512_set1_ps(ws[0]);
        t0a = _mm512_fmadd_ps(b, xa, t0a);
        t0b = _mm512_fmadd_ps(b, xb, t0b);
        t0c = _mm512_fmadd_ps(b, xc, t0c);
        t0d = _mm512_fmadd_ps(b, xd, t0d);

        b = _mm512_set1_ps(ws[1]);
        t1a = _mm512_fmadd_ps(b, xa, t1a);
        t1b = _mm512_fmadd_ps(b, xb, t1b);
        t1c = _mm512_fmadd_ps(b, xc, t1c);
        t1d = _mm512_fmadd_ps(b, xd, t1d);
    }

    _mm512_mask_storeu_ps(yTail+0, 0xFFFF, t0a);
    _mm512_mask_storeu_ps(yTail+64, 0xFFFF, t0b);
    _mm512_mask_storeu_ps(yTail+128, 0xFFFF, t0c);
    _mm512_mask_storeu_ps(yTail+192, 0xFFFF, t0d);
    _mm512_mask_storeu_ps(yTail+12608+0, 0xFFFF, t1a);
    _mm512_mask_storeu_ps(yTail+12608+64, 0xFFFF, t1b);
    _mm512_mask_storeu_ps(yTail+12608+128, 0xFFFF, t1c);
    _mm512_mask_storeu_ps(yTail+12608+192, 0xFFFF, t1d);
}

/*
 * Dispatches ResNet50OneApply1Callee1 through the threader.
 *
 * team21    - threader team handed straight to ResNet50ThreaderDo1.
 * tensors15 - buffer table; wrapped as element 0 of a two-slot void*
 *             array (the second slot is null) that the callee receives
 *             via task.any1.
 *
 * The task is described as 3-dimensional with hull {6, 49, 1}; the callee
 * reads coordinate 0 as its weight-group slab and coordinate 1 as its
 * data tile (presumably hull gives the per-dimension extent - confirm
 * against the threader implementation).
 */
static void ResNet50OneApply1(ResNet50ThreaderTeam1* team21, char** tensors15) {
    static const int64_t extents[3] = {6, 49, 1};
    void* ctx[] = {tensors15, 0};
    ResNet50ThreaderTask1 job;
    job.any1 = ctx;
    job.callee1 = ResNet50OneApply1Callee1;
    job.nd1 = 3;
    for (int dim = 0; dim < 3; ++dim) {
        job.hull1[dim] = extents[dim];
    }
    ResNet50ThreaderDo1(team21, &job);
}

static void ResNet50OneArrangeWts2Callee1(ResNet50ThreaderTask1* task28, int64_t* pt19) {
char** tensors26 = task28->any1;
ptrdiff_t b48 = pt19[0];
char*restrict wtPtr5 = tensors26[0]+(ptrdiff_t)3340*0+(ptrdiff_t)65536*0;
char*restrict biasPtr5 = tensors26[1]+(ptrdiff_t)1024*0;
char*restrict bnPtr5 = tensors26[2]+(ptrdiff_t)8*256*0;
char*restrict arranged3 = tensors26[3]+(ptrdiff_t)856064*0+(ptrdiff_t)66560*0;
ptrdiff_t ii4 = 1;
for (ptrdiff_t i19 = 0; i19 < ii4; ++i19) {
ptrdiff_t j14 = 8*b48;
ptrdiff_t jj23 = j14+8;
for (; j14 < jj23; ++j14) {
if (j14 < 15) {
ptrdiff_t k72 = 0+16*(j14-0);
ptrdiff_t l24 = (size_t)(0+k72)/6;
ptrdiff_t cut6 = (size_t)(0+k72)%6;
switch (cut6) {
case 0:;
case 2: {
__m512 sum80 = _mm512_maskz_loadu_ps(65535, biasPtr5+1024*i19+4*k72);
__m512i pmMul8 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd8 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo6 = _mm512_loadu_ps(bnPtr5+(ptrdiff_t)8*(k72+256*i19));
__m512 masHi6 = _mm512_maskz_loadu_ps(65535, bnPtr5+(ptrdiff_t)8*(k72+256*i19)+(ptrdiff_t)64);
__m512 postMul15 = _mm512_permutex2var_ps(masLo6, pmMul8, masHi6);
__m512 postAdd9 = _mm512_permutex2var_ps(masLo6, pmAdd8, masHi6);
sum80 = _mm512_fmadd_ps(sum80, postMul15, postAdd9);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)0, 63>>cut6, sum80);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)1536, 4032>>cut6, sum80);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)3072, 65535-(4095>>cut6), sum80);
ptrdiff_t c13 = 0;
for (; c13 != 4; ++c13) {
__m512 wt123 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)0);
__m512 wt124 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)256);
__m512 wt125 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)512);
__m512 wt126 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)768);
__m512 wt127 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)1024);
__m512 wt128 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)1280);
__m512 wt129 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)1536);
__m512 wt130 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)1792);
__m512 wt131 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)2048);
__m512 wt132 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)2304);
__m512 wt133 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)2560);
__m512 wt134 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)2816);
__m512 wt135 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)3072);
__m512 wt136 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)3328);
__m512 wt137 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)3584);
__m512 wt138 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c13+(ptrdiff_t)3840);
__m512 tmp5253 = _mm512_unpacklo_ps(wt123, wt124);
__m512 tmp5254 = _mm512_unpackhi_ps(wt123, wt124);
__m512 tmp5255 = _mm512_unpacklo_ps(wt125, wt126);
__m512 tmp5256 = _mm512_unpackhi_ps(wt125, wt126);
__m512 tmp5257 = _mm512_unpacklo_ps(wt127, wt128);
__m512 tmp5258 = _mm512_unpackhi_ps(wt127, wt128);
__m512 tmp5259 = _mm512_unpacklo_ps(wt129, wt130);
__m512 tmp5260 = _mm512_unpackhi_ps(wt129, wt130);
__m512 tmp5261 = _mm512_unpacklo_ps(wt131, wt132);
__m512 tmp5262 = _mm512_unpackhi_ps(wt131, wt132);
__m512 tmp5263 = _mm512_unpacklo_ps(wt133, wt134);
__m512 tmp5264 = _mm512_unpackhi_ps(wt133, wt134);
__m512 tmp5265 = _mm512_unpacklo_ps(wt135, wt136);
__m512 tmp5266 = _mm512_unpackhi_ps(wt135, wt136);
__m512 tmp5267 = _mm512_unpacklo_ps(wt137, wt138);
__m512 tmp5268 = _mm512_unpackhi_ps(wt137, wt138);
__m512 tmp5269 = _mm512_shuffle_ps(tmp5253, tmp5255, 68);
__m512 tmp5270 = _mm512_shuffle_ps(tmp5253, tmp5255, 238);
__m512 tmp5271 = _mm512_shuffle_ps(tmp5254, tmp5256, 68);
__m512 tmp5272 = _mm512_shuffle_ps(tmp5254, tmp5256, 238);
__m512 tmp5273 = _mm512_shuffle_ps(tmp5257, tmp5259, 68);
__m512 tmp5274 = _mm512_shuffle_ps(tmp5257, tmp5259, 238);
__m512 tmp5275 = _mm512_shuffle_ps(tmp5258, tmp5260, 68);
__m512 tmp5276 = _mm512_shuffle_ps(tmp5258, tmp5260, 238);
__m512 tmp5277 = _mm512_shuffle_ps(tmp5261, tmp5263, 68);
__m512 tmp5278 = _mm512_shuffle_ps(tmp5261, tmp5263, 238);
__m512 tmp5279 = _mm512_shuffle_ps(tmp5262, tmp5264, 68);
__m512 tmp5280 = _mm512_shuffle_ps(tmp5262, tmp5264, 238);
__m512 tmp5281 = _mm512_shuffle_ps(tmp5265, tmp5267, 68);
__m512 tmp5282 = _mm512_shuffle_ps(tmp5265, tmp5267, 238);
__m512 tmp5283 = _mm512_shuffle_ps(tmp5266, tmp5268, 68);
__m512 tmp5284 = _mm512_shuffle_ps(tmp5266, tmp5268, 238);
__m512 tmp5285 = _mm512_shuffle_f32x4(tmp5269, tmp5273, 136);
__m512 tmp5286 = _mm512_shuffle_f32x4(tmp5269, tmp5273, 221);
__m512 tmp5287 = _mm512_shuffle_f32x4(tmp5270, tmp5274, 136);
__m512 tmp5288 = _mm512_shuffle_f32x4(tmp5270, tmp5274, 221);
__m512 tmp5289 = _mm512_shuffle_f32x4(tmp5271, tmp5275, 136);
__m512 tmp5290 = _mm512_shuffle_f32x4(tmp5271, tmp5275, 221);
__m512 tmp5291 = _mm512_shuffle_f32x4(tmp5272, tmp5276, 136);
__m512 tmp5292 = _mm512_shuffle_f32x4(tmp5272, tmp5276, 221);
__m512 tmp5293 = _mm512_shuffle_f32x4(tmp5277, tmp5281, 136);
__m512 tmp5294 = _mm512_shuffle_f32x4(tmp5277, tmp5281, 221);
__m512 tmp5295 = _mm512_shuffle_f32x4(tmp5278, tmp5282, 136);
__m512 tmp5296 = _mm512_shuffle_f32x4(tmp5278, tmp5282, 221);
__m512 tmp5297 = _mm512_shuffle_f32x4(tmp5279, tmp5283, 136);
__m512 tmp5298 = _mm512_shuffle_f32x4(tmp5279, tmp5283, 221);
__m512 tmp5299 = _mm512_shuffle_f32x4(tmp5280, tmp5284, 136);
__m512 tmp5300 = _mm512_shuffle_f32x4(tmp5280, tmp5284, 221);
wt123 = _mm512_shuffle_f32x4(tmp5285, tmp5293, 136);
wt131 = _mm512_shuffle_f32x4(tmp5285, tmp5293, 221);
wt124 = _mm512_shuffle_f32x4(tmp5287, tmp5295, 136);
wt132 = _mm512_shuffle_f32x4(tmp5287, tmp5295, 221);
wt125 = _mm512_shuffle_f32x4(tmp5289, tmp5297, 136);
wt133 = _mm512_shuffle_f32x4(tmp5289, tmp5297, 221);
wt126 = _mm512_shuffle_f32x4(tmp5291, tmp5299, 136);
wt134 = _mm512_shuffle_f32x4(tmp5291, tmp5299, 221);
wt127 = _mm512_shuffle_f32x4(tmp5286, tmp5294, 136);
wt135 = _mm512_shuffle_f32x4(tmp5286, tmp5294, 221);
wt128 = _mm512_shuffle_f32x4(tmp5288, tmp5296, 136);
wt136 = _mm512_shuffle_f32x4(tmp5288, tmp5296, 221);
wt129 = _mm512_shuffle_f32x4(tmp5290, tmp5298, 136);
wt137 = _mm512_shuffle_f32x4(tmp5290, tmp5298, 221);
wt130 = _mm512_shuffle_f32x4(tmp5292, tmp5300, 136);
wt138 = _mm512_shuffle_f32x4(tmp5292, tmp5300, 221);
wt123 = _mm512_mul_ps(wt123, postMul15);
wt124 = _mm512_mul_ps(wt124, postMul15);
wt125 = _mm512_mul_ps(wt125, postMul15);
wt126 = _mm512_mul_ps(wt126, postMul15);
wt127 = _mm512_mul_ps(wt127, postMul15);
wt128 = _mm512_mul_ps(wt128, postMul15);
wt129 = _mm512_mul_ps(wt129, postMul15);
wt130 = _mm512_mul_ps(wt130, postMul15);
wt131 = _mm512_mul_ps(wt131, postMul15);
wt132 = _mm512_mul_ps(wt132, postMul15);
wt133 = _mm512_mul_ps(wt133, postMul15);
wt134 = _mm512_mul_ps(wt134, postMul15);
wt135 = _mm512_mul_ps(wt135, postMul15);
wt136 = _mm512_mul_ps(wt136, postMul15);
wt137 = _mm512_mul_ps(wt137, postMul15);
wt138 = _mm512_mul_ps(wt138, postMul15);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c13)+(ptrdiff_t)0, 63>>cut6, wt123);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c13)+(ptrdiff_t)0, 63>>cut6, wt124);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c13)+(ptrdiff_t)0, 63>>cut6, wt125);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c13)+(ptrdiff_t)0, 63>>cut6, wt126);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c13)+(ptrdiff_t)0, 63>>cut6, wt127);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c13)+(ptrdiff_t)0, 63>>cut6, wt128);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c13)+(ptrdiff_t)0, 63>>cut6, wt129);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c13)+(ptrdiff_t)0, 63>>cut6, wt130);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c13)+(ptrdiff_t)0, 63>>cut6, wt131);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c13)+(ptrdiff_t)0, 63>>cut6, wt132);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c13)+(ptrdiff_t)0, 63>>cut6, wt133);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c13)+(ptrdiff_t)0, 63>>cut6, wt134);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c13)+(ptrdiff_t)0, 63>>cut6, wt135);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c13)+(ptrdiff_t)0, 63>>cut6, wt136);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c13)+(ptrdiff_t)0, 63>>cut6, wt137);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c13)+(ptrdiff_t)0, 63>>cut6, wt138);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt123);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt124);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt125);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt126);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt127);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt128);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt129);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt130);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt131);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt132);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt133);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt134);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt135);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt136);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt137);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c13)+(ptrdiff_t)1536, 4032>>cut6, wt138);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt123);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt124);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt125);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt126);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt127);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt128);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt129);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt130);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt131);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt132);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt133);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt134);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt135);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt136);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt137);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c13)+(ptrdiff_t)3072, 65535-(4095>>cut6), wt138);
}
break;
}
default: {
cut6 = 4;
__m512 sum81 = _mm512_maskz_loadu_ps(65535, biasPtr5+1024*i19+4*k72);
__m512i pmMul9 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd9 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo7 = _mm512_loadu_ps(bnPtr5+(ptrdiff_t)8*(k72+256*i19));
__m512 masHi7 = _mm512_maskz_loadu_ps(65535, bnPtr5+(ptrdiff_t)8*(k72+256*i19)+(ptrdiff_t)64);
__m512 postMul16 = _mm512_permutex2var_ps(masLo7, pmMul9, masHi7);
__m512 postAdd10 = _mm512_permutex2var_ps(masLo7, pmAdd9, masHi7);
sum81 = _mm512_fmadd_ps(sum81, postMul16, postAdd10);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)0, 63>>cut6, sum81);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)1536, 4032>>cut6, sum81);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)3072, 258048>>cut6, sum81);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*0+(ptrdiff_t)4608, 65535-(262143>>cut6), sum81);
ptrdiff_t c14 = 0;
for (; c14 != 4; ++c14) {
__m512 wt139 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)0);
__m512 wt140 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)256);
__m512 wt141 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)512);
__m512 wt142 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)768);
__m512 wt143 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)1024);
__m512 wt144 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)1280);
__m512 wt145 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)1536);
__m512 wt146 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)1792);
__m512 wt147 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)2048);
__m512 wt148 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)2304);
__m512 wt149 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)2560);
__m512 wt150 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)2816);
__m512 wt151 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)3072);
__m512 wt152 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)3328);
__m512 wt153 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)3584);
__m512 wt154 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k72+64*c14+(ptrdiff_t)3840);
__m512 tmp5301 = _mm512_unpacklo_ps(wt139, wt140);
__m512 tmp5302 = _mm512_unpackhi_ps(wt139, wt140);
__m512 tmp5303 = _mm512_unpacklo_ps(wt141, wt142);
__m512 tmp5304 = _mm512_unpackhi_ps(wt141, wt142);
__m512 tmp5305 = _mm512_unpacklo_ps(wt143, wt144);
__m512 tmp5306 = _mm512_unpackhi_ps(wt143, wt144);
__m512 tmp5307 = _mm512_unpacklo_ps(wt145, wt146);
__m512 tmp5308 = _mm512_unpackhi_ps(wt145, wt146);
__m512 tmp5309 = _mm512_unpacklo_ps(wt147, wt148);
__m512 tmp5310 = _mm512_unpackhi_ps(wt147, wt148);
__m512 tmp5311 = _mm512_unpacklo_ps(wt149, wt150);
__m512 tmp5312 = _mm512_unpackhi_ps(wt149, wt150);
__m512 tmp5313 = _mm512_unpacklo_ps(wt151, wt152);
__m512 tmp5314 = _mm512_unpackhi_ps(wt151, wt152);
__m512 tmp5315 = _mm512_unpacklo_ps(wt153, wt154);
__m512 tmp5316 = _mm512_unpackhi_ps(wt153, wt154);
__m512 tmp5317 = _mm512_shuffle_ps(tmp5301, tmp5303, 68);
__m512 tmp5318 = _mm512_shuffle_ps(tmp5301, tmp5303, 238);
__m512 tmp5319 = _mm512_shuffle_ps(tmp5302, tmp5304, 68);
__m512 tmp5320 = _mm512_shuffle_ps(tmp5302, tmp5304, 238);
__m512 tmp5321 = _mm512_shuffle_ps(tmp5305, tmp5307, 68);
__m512 tmp5322 = _mm512_shuffle_ps(tmp5305, tmp5307, 238);
__m512 tmp5323 = _mm512_shuffle_ps(tmp5306, tmp5308, 68);
__m512 tmp5324 = _mm512_shuffle_ps(tmp5306, tmp5308, 238);
__m512 tmp5325 = _mm512_shuffle_ps(tmp5309, tmp5311, 68);
__m512 tmp5326 = _mm512_shuffle_ps(tmp5309, tmp5311, 238);
__m512 tmp5327 = _mm512_shuffle_ps(tmp5310, tmp5312, 68);
__m512 tmp5328 = _mm512_shuffle_ps(tmp5310, tmp5312, 238);
__m512 tmp5329 = _mm512_shuffle_ps(tmp5313, tmp5315, 68);
__m512 tmp5330 = _mm512_shuffle_ps(tmp5313, tmp5315, 238);
__m512 tmp5331 = _mm512_shuffle_ps(tmp5314, tmp5316, 68);
__m512 tmp5332 = _mm512_shuffle_ps(tmp5314, tmp5316, 238);
__m512 tmp5333 = _mm512_shuffle_f32x4(tmp5317, tmp5321, 136);
__m512 tmp5334 = _mm512_shuffle_f32x4(tmp5317, tmp5321, 221);
__m512 tmp5335 = _mm512_shuffle_f32x4(tmp5318, tmp5322, 136);
__m512 tmp5336 = _mm512_shuffle_f32x4(tmp5318, tmp5322, 221);
__m512 tmp5337 = _mm512_shuffle_f32x4(tmp5319, tmp5323, 136);
__m512 tmp5338 = _mm512_shuffle_f32x4(tmp5319, tmp5323, 221);
__m512 tmp5339 = _mm512_shuffle_f32x4(tmp5320, tmp5324, 136);
__m512 tmp5340 = _mm512_shuffle_f32x4(tmp5320, tmp5324, 221);
__m512 tmp5341 = _mm512_shuffle_f32x4(tmp5325, tmp5329, 136);
__m512 tmp5342 = _mm512_shuffle_f32x4(tmp5325, tmp5329, 221);
__m512 tmp5343 = _mm512_shuffle_f32x4(tmp5326, tmp5330, 136);
__m512 tmp5344 = _mm512_shuffle_f32x4(tmp5326, tmp5330, 221);
__m512 tmp5345 = _mm512_shuffle_f32x4(tmp5327, tmp5331, 136);
__m512 tmp5346 = _mm512_shuffle_f32x4(tmp5327, tmp5331, 221);
__m512 tmp5347 = _mm512_shuffle_f32x4(tmp5328, tmp5332, 136);
__m512 tmp5348 = _mm512_shuffle_f32x4(tmp5328, tmp5332, 221);
wt139 = _mm512_shuffle_f32x4(tmp5333, tmp5341, 136);
wt147 = _mm512_shuffle_f32x4(tmp5333, tmp5341, 221);
wt140 = _mm512_shuffle_f32x4(tmp5335, tmp5343, 136);
wt148 = _mm512_shuffle_f32x4(tmp5335, tmp5343, 221);
wt141 = _mm512_shuffle_f32x4(tmp5337, tmp5345, 136);
wt149 = _mm512_shuffle_f32x4(tmp5337, tmp5345, 221);
wt142 = _mm512_shuffle_f32x4(tmp5339, tmp5347, 136);
wt150 = _mm512_shuffle_f32x4(tmp5339, tmp5347, 221);
wt143 = _mm512_shuffle_f32x4(tmp5334, tmp5342, 136);
wt151 = _mm512_shuffle_f32x4(tmp5334, tmp5342, 221);
wt144 = _mm512_shuffle_f32x4(tmp5336, tmp5344, 136);
wt152 = _mm512_shuffle_f32x4(tmp5336, tmp5344, 221);
wt145 = _mm512_shuffle_f32x4(tmp5338, tmp5346, 136);
wt153 = _mm512_shuffle_f32x4(tmp5338, tmp5346, 221);
wt146 = _mm512_shuffle_f32x4(tmp5340, tmp5348, 136);
wt154 = _mm512_shuffle_f32x4(tmp5340, tmp5348, 221);
wt139 = _mm512_mul_ps(wt139, postMul16);
wt140 = _mm512_mul_ps(wt140, postMul16);
wt141 = _mm512_mul_ps(wt141, postMul16);
wt142 = _mm512_mul_ps(wt142, postMul16);
wt143 = _mm512_mul_ps(wt143, postMul16);
wt144 = _mm512_mul_ps(wt144, postMul16);
wt145 = _mm512_mul_ps(wt145, postMul16);
wt146 = _mm512_mul_ps(wt146, postMul16);
wt147 = _mm512_mul_ps(wt147, postMul16);
wt148 = _mm512_mul_ps(wt148, postMul16);
wt149 = _mm512_mul_ps(wt149, postMul16);
wt150 = _mm512_mul_ps(wt150, postMul16);
wt151 = _mm512_mul_ps(wt151, postMul16);
wt152 = _mm512_mul_ps(wt152, postMul16);
wt153 = _mm512_mul_ps(wt153, postMul16);
wt154 = _mm512_mul_ps(wt154, postMul16);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c14)+(ptrdiff_t)0, 63>>cut6, wt139);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c14)+(ptrdiff_t)0, 63>>cut6, wt140);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c14)+(ptrdiff_t)0, 63>>cut6, wt141);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c14)+(ptrdiff_t)0, 63>>cut6, wt142);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c14)+(ptrdiff_t)0, 63>>cut6, wt143);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c14)+(ptrdiff_t)0, 63>>cut6, wt144);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c14)+(ptrdiff_t)0, 63>>cut6, wt145);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c14)+(ptrdiff_t)0, 63>>cut6, wt146);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c14)+(ptrdiff_t)0, 63>>cut6, wt147);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c14)+(ptrdiff_t)0, 63>>cut6, wt148);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c14)+(ptrdiff_t)0, 63>>cut6, wt149);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c14)+(ptrdiff_t)0, 63>>cut6, wt150);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c14)+(ptrdiff_t)0, 63>>cut6, wt151);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c14)+(ptrdiff_t)0, 63>>cut6, wt152);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c14)+(ptrdiff_t)0, 63>>cut6, wt153);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c14)+(ptrdiff_t)0, 63>>cut6, wt154);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt139);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt140);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt141);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt142);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt143);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt144);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt145);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt146);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt147);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt148);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt149);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt150);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt151);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt152);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt153);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c14)+(ptrdiff_t)1536, 4032>>cut6, wt154);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt139);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt140);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt141);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt142);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt143);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt144);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt145);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt146);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt147);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt148);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt149);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt150);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt151);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt152);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt153);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c14)+(ptrdiff_t)3072, 258048>>cut6, wt154);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(1+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt139);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(2+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt140);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(3+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt141);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(4+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt142);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(5+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt143);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(6+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt144);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(7+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt145);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(8+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt146);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(9+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt147);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(10+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt148);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(11+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt149);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(12+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt150);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(13+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt151);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(14+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt152);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(15+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt153);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l24+4*cut6+24*(16+16*c14)+(ptrdiff_t)4608, 65535-(262143>>cut6), wt154);
}
}
}
} else {
ptrdiff_t k71 = 240;
ptrdiff_t l23 = (size_t)(0+k71)/6;
ptrdiff_t cut5 = (size_t)(0+k71)%6;
__m512 sum79 = _mm512_maskz_loadu_ps(65535, biasPtr5+1024*i19+4*k71);
__m512i pmMul10 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd10 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo8 = _mm512_loadu_ps(bnPtr5+(ptrdiff_t)8*(k71+256*i19));
__m512 masHi8 = _mm512_maskz_loadu_ps(65535, bnPtr5+(ptrdiff_t)8*(k71+256*i19)+(ptrdiff_t)64);
__m512 postMul14 = _mm512_permutex2var_ps(masLo8, pmMul10, masHi8);
__m512 postAdd8 = _mm512_permutex2var_ps(masLo8, pmAdd10, masHi8);
sum79 = _mm512_fmadd_ps(sum79, postMul14, postAdd8);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*0+(ptrdiff_t)0, 63>>cut5, sum79);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*0+(ptrdiff_t)1536, 4032>>cut5, sum79);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*0+(ptrdiff_t)3072, 65535-(4095>>cut5), sum79);
ptrdiff_t c12 = 0;
for (; c12 != 4; ++c12) {
__m512 wt107 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)0);
__m512 wt108 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)256);
__m512 wt109 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)512);
__m512 wt110 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)768);
__m512 wt111 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)1024);
__m512 wt112 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)1280);
__m512 wt113 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)1536);
__m512 wt114 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)1792);
__m512 wt115 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)2048);
__m512 wt116 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)2304);
__m512 wt117 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)2560);
__m512 wt118 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)2816);
__m512 wt119 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)3072);
__m512 wt120 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)3328);
__m512 wt121 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)3584);
__m512 wt122 = _mm512_maskz_loadu_ps(65535, wtPtr5+65536*i19+256*k71+64*c12+(ptrdiff_t)3840);
__m512 tmp5349 = _mm512_unpacklo_ps(wt107, wt108);
__m512 tmp5350 = _mm512_unpackhi_ps(wt107, wt108);
__m512 tmp5351 = _mm512_unpacklo_ps(wt109, wt110);
__m512 tmp5352 = _mm512_unpackhi_ps(wt109, wt110);
__m512 tmp5353 = _mm512_unpacklo_ps(wt111, wt112);
__m512 tmp5354 = _mm512_unpackhi_ps(wt111, wt112);
__m512 tmp5355 = _mm512_unpacklo_ps(wt113, wt114);
__m512 tmp5356 = _mm512_unpackhi_ps(wt113, wt114);
__m512 tmp5357 = _mm512_unpacklo_ps(wt115, wt116);
__m512 tmp5358 = _mm512_unpackhi_ps(wt115, wt116);
__m512 tmp5359 = _mm512_unpacklo_ps(wt117, wt118);
__m512 tmp5360 = _mm512_unpackhi_ps(wt117, wt118);
__m512 tmp5361 = _mm512_unpacklo_ps(wt119, wt120);
__m512 tmp5362 = _mm512_unpackhi_ps(wt119, wt120);
__m512 tmp5363 = _mm512_unpacklo_ps(wt121, wt122);
__m512 tmp5364 = _mm512_unpackhi_ps(wt121, wt122);
__m512 tmp5365 = _mm512_shuffle_ps(tmp5349, tmp5351, 68);
__m512 tmp5366 = _mm512_shuffle_ps(tmp5349, tmp5351, 238);
__m512 tmp5367 = _mm512_shuffle_ps(tmp5350, tmp5352, 68);
__m512 tmp5368 = _mm512_shuffle_ps(tmp5350, tmp5352, 238);
__m512 tmp5369 = _mm512_shuffle_ps(tmp5353, tmp5355, 68);
__m512 tmp5370 = _mm512_shuffle_ps(tmp5353, tmp5355, 238);
__m512 tmp5371 = _mm512_shuffle_ps(tmp5354, tmp5356, 68);
__m512 tmp5372 = _mm512_shuffle_ps(tmp5354, tmp5356, 238);
__m512 tmp5373 = _mm512_shuffle_ps(tmp5357, tmp5359, 68);
__m512 tmp5374 = _mm512_shuffle_ps(tmp5357, tmp5359, 238);
__m512 tmp5375 = _mm512_shuffle_ps(tmp5358, tmp5360, 68);
__m512 tmp5376 = _mm512_shuffle_ps(tmp5358, tmp5360, 238);
__m512 tmp5377 = _mm512_shuffle_ps(tmp5361, tmp5363, 68);
__m512 tmp5378 = _mm512_shuffle_ps(tmp5361, tmp5363, 238);
__m512 tmp5379 = _mm512_shuffle_ps(tmp5362, tmp5364, 68);
__m512 tmp5380 = _mm512_shuffle_ps(tmp5362, tmp5364, 238);
__m512 tmp5381 = _mm512_shuffle_f32x4(tmp5365, tmp5369, 136);
__m512 tmp5382 = _mm512_shuffle_f32x4(tmp5365, tmp5369, 221);
__m512 tmp5383 = _mm512_shuffle_f32x4(tmp5366, tmp5370, 136);
__m512 tmp5384 = _mm512_shuffle_f32x4(tmp5366, tmp5370, 221);
__m512 tmp5385 = _mm512_shuffle_f32x4(tmp5367, tmp5371, 136);
__m512 tmp5386 = _mm512_shuffle_f32x4(tmp5367, tmp5371, 221);
__m512 tmp5387 = _mm512_shuffle_f32x4(tmp5368, tmp5372, 136);
__m512 tmp5388 = _mm512_shuffle_f32x4(tmp5368, tmp5372, 221);
__m512 tmp5389 = _mm512_shuffle_f32x4(tmp5373, tmp5377, 136);
__m512 tmp5390 = _mm512_shuffle_f32x4(tmp5373, tmp5377, 221);
__m512 tmp5391 = _mm512_shuffle_f32x4(tmp5374, tmp5378, 136);
__m512 tmp5392 = _mm512_shuffle_f32x4(tmp5374, tmp5378, 221);
__m512 tmp5393 = _mm512_shuffle_f32x4(tmp5375, tmp5379, 136);
__m512 tmp5394 = _mm512_shuffle_f32x4(tmp5375, tmp5379, 221);
__m512 tmp5395 = _mm512_shuffle_f32x4(tmp5376, tmp5380, 136);
__m512 tmp5396 = _mm512_shuffle_f32x4(tmp5376, tmp5380, 221);
wt107 = _mm512_shuffle_f32x4(tmp5381, tmp5389, 136);
wt115 = _mm512_shuffle_f32x4(tmp5381, tmp5389, 221);
wt108 = _mm512_shuffle_f32x4(tmp5383, tmp5391, 136);
wt116 = _mm512_shuffle_f32x4(tmp5383, tmp5391, 221);
wt109 = _mm512_shuffle_f32x4(tmp5385, tmp5393, 136);
wt117 = _mm512_shuffle_f32x4(tmp5385, tmp5393, 221);
wt110 = _mm512_shuffle_f32x4(tmp5387, tmp5395, 136);
wt118 = _mm512_shuffle_f32x4(tmp5387, tmp5395, 221);
wt111 = _mm512_shuffle_f32x4(tmp5382, tmp5390, 136);
wt119 = _mm512_shuffle_f32x4(tmp5382, tmp5390, 221);
wt112 = _mm512_shuffle_f32x4(tmp5384, tmp5392, 136);
wt120 = _mm512_shuffle_f32x4(tmp5384, tmp5392, 221);
wt113 = _mm512_shuffle_f32x4(tmp5386, tmp5394, 136);
wt121 = _mm512_shuffle_f32x4(tmp5386, tmp5394, 221);
wt114 = _mm512_shuffle_f32x4(tmp5388, tmp5396, 136);
wt122 = _mm512_shuffle_f32x4(tmp5388, tmp5396, 221);
wt107 = _mm512_mul_ps(wt107, postMul14);
wt108 = _mm512_mul_ps(wt108, postMul14);
wt109 = _mm512_mul_ps(wt109, postMul14);
wt110 = _mm512_mul_ps(wt110, postMul14);
wt111 = _mm512_mul_ps(wt111, postMul14);
wt112 = _mm512_mul_ps(wt112, postMul14);
wt113 = _mm512_mul_ps(wt113, postMul14);
wt114 = _mm512_mul_ps(wt114, postMul14);
wt115 = _mm512_mul_ps(wt115, postMul14);
wt116 = _mm512_mul_ps(wt116, postMul14);
wt117 = _mm512_mul_ps(wt117, postMul14);
wt118 = _mm512_mul_ps(wt118, postMul14);
wt119 = _mm512_mul_ps(wt119, postMul14);
wt120 = _mm512_mul_ps(wt120, postMul14);
wt121 = _mm512_mul_ps(wt121, postMul14);
wt122 = _mm512_mul_ps(wt122, postMul14);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)0, 63>>cut5, wt107);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)0, 63>>cut5, wt108);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)0, 63>>cut5, wt109);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)0, 63>>cut5, wt110);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)0, 63>>cut5, wt111);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)0, 63>>cut5, wt112);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)0, 63>>cut5, wt113);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)0, 63>>cut5, wt114);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)0, 63>>cut5, wt115);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)0, 63>>cut5, wt116);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)0, 63>>cut5, wt117);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)0, 63>>cut5, wt118);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)0, 63>>cut5, wt119);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)0, 63>>cut5, wt120);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)0, 63>>cut5, wt121);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)0, 63>>cut5, wt122);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt107);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt108);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt109);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt110);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt111);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt112);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt113);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt114);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt115);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt116);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt117);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt118);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt119);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt120);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt121);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)1536, 4032>>cut5, wt122);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(1+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt107);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(2+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt108);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(3+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt109);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(4+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt110);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(5+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt111);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(6+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt112);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(7+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt113);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(8+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt114);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(9+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt115);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(10+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt116);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(11+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt117);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(12+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt118);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(13+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt119);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(14+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt120);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(15+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt121);
_mm512_mask_storeu_ps(arranged3+66560*i19+1560*l23+4*cut5+16*(16+16*c12)+(ptrdiff_t)3072, 65535-(4095>>cut5), wt122);
}
}
}
}
}

/*
 * Dispatch the weight-arranging callee over the thread team.
 *
 * team26    - thread team handed to ResNet50ThreaderDo1.
 * tensors25 - tensor pointer table, forwarded untouched to the callee
 *             through the task's `any1` slot.
 *
 * The task describes a 3-dimensional iteration hull of extents {2, 1, 1}.
 */
static void ResNet50OneArrangeWts2(ResNet50ThreaderTeam1* team26, char** tensors25) {
	ResNet50ThreaderTask1 job;
	/* Iteration space first, then the payload. */
	job.nd1 = 3;
	job.hull1[0] = 2;
	job.hull1[1] = 1;
	job.hull1[2] = 1;
	job.any1 = tensors25;
	job.callee1 = ResNet50OneArrangeWts2Callee1;
	ResNet50ThreaderDo1(team26, &job);
}

/*
 * Re-lay-out one chunk of the source tensor into the "arranged" buffer.
 *
 * task30 - task whose `any1` is a table of byte pointers:
 *          entry 0 is the source, entry 1 is the destination.
 * pt20   - coordinates of this work item; only pt20[1] (the chunk index)
 *          is read.
 *
 * A chunk covers two consecutive 256-byte column blocks (three for chunk
 * 23, so that blocks 0..48 are covered by chunks 0..23).  For every block
 * and every one of 64 rows, 4 vectors of 16 floats are copied from
 *     src + 256*block + 12608*row
 * to
 *     dst + 16384*block + 256*row
 * i.e. the row/block nesting order is swapped in the destination.
 * NOTE(review): rows are presumably input channels and blocks are groups
 * of 64 pixels - confirm against the generator.
 */
static void ResNet50OneArrangeDats2Callee1(ResNet50ThreaderTask1* task30, int64_t* pt20) {
	char** bufs = task30->any1;
	const ptrdiff_t chunk = pt20[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	ptrdiff_t blk = 2*chunk;
	/* Last block (inclusive) handled by this chunk. */
	const ptrdiff_t lastBlk = blk+(chunk < 23 ? 1 : 2);
	while (blk != 49) {
		for (ptrdiff_t row = 0; row < 64; ++row) {
			char*restrict from = src+256*blk+12608*row;
			char*restrict to = dst+16384*blk+256*row;
			__m512 vec[4];
			/* All four loads are issued before any store. */
			for (int q = 0; q < 4; ++q) {
				vec[q] = _mm512_maskz_loadu_ps(65535, from+64*q);
			}
			for (int q = 0; q < 4; ++q) {
				_mm512_mask_storeu_ps(to+64*q, 65535, vec[q]);
			}
		}
		if (blk >= lastBlk) break;
		++blk;
	}
}

/*
 * Dispatch the data-arranging callee over the thread team.
 *
 * team27    - thread team handed to ResNet50ThreaderDo1.
 * tensors27 - tensor pointer table, forwarded untouched to the callee
 *             through the task's `any1` slot.
 *
 * The task describes a 4-dimensional iteration hull of extents
 * {1, 24, 1, 1}; the callee reads coordinate 1 as its chunk index.
 */
static void ResNet50OneArrangeDats2(ResNet50ThreaderTeam1* team27, char** tensors27) {
	ResNet50ThreaderTask1 job;
	/* Iteration space first, then the payload. */
	job.nd1 = 4;
	job.hull1[0] = 1;
	job.hull1[1] = 24;
	job.hull1[2] = 1;
	job.hull1[3] = 1;
	job.any1 = tensors27;
	job.callee1 = ResNet50OneArrangeDats2Callee1;
	ResNet50ThreaderDo1(team27, &job);
}

/*
 * Multiply arranged weights by arranged data for one work item, add a
 * second tensor element-wise, clamp at zero and store the result.
 *
 * task32 - task whose `any1` points at a pair; pair[0] is a table of byte
 *          pointers: [0] arranged weights, [1] arranged data,
 *          [2] tensor added to the sums, [3] output tensor.
 * pt21   - work-item coordinates: pt21[0] selects a range of weight
 *          groups, pt21[1] selects the 64-float column block.
 *
 * Weight layout as read here: each group occupies 1560 bytes; a full
 * group is 65 records of 6 floats (24 bytes), the final group (index 42)
 * is 65 records of 4 floats (16 bytes).  Record 0 seeds the accumulators
 * (presumably the folded bias written by the arrange-weights step -
 * confirm), records 1..64 are multiplied with the 64 data rows.
 *
 * Parts 0..3 process 8 full groups each and return; part 4 processes
 * groups 32..41 and then falls through to the 4-wide tail group.
 * Output/addend planes are 12608 bytes apart, 6 planes (75648 bytes) per
 * group.
 *
 * All inner loops have fixed trip counts so the compiler can unroll them.
 */
static void ResNet50OneApply2Callee1(ResNet50ThreaderTask1* task32, int64_t* pt21) {
	void** args = task32->any1;
	char** bufs = args[0];
	const ptrdiff_t col = pt21[1];
	const ptrdiff_t part = pt21[0];
	char*restrict wts = bufs[0];
	char*restrict dats = bufs[1];
	char*restrict resid = bufs[2];
	char*restrict out = bufs[3];
	/* Column index 49 is one past the last block: nothing to do. */
	if (col == 49) return;
	ptrdiff_t grp = 8*part;
	const ptrdiff_t lastGrp = grp+(part < 4 ? 7 : 10);
	for (; grp != 42; ++grp) {
		/* acc[plane][vector]: 6 output planes x 4 vectors of 16 floats. */
		__m512 acc[6][4];
		for (int ch = 0; ch < 6; ++ch) {
			const __m512 seed = _mm512_set1_ps(*(float*)(wts+1560*grp+4*ch));
			for (int q = 0; q < 4; ++q) {
				acc[ch][q] = seed;
			}
		}
		for (ptrdiff_t s = 0; s < 64; ++s) {
			__m512 in[4];
			for (int q = 0; q < 4; ++q) {
				in[q] = _mm512_loadu_ps(dats+16384*col+256*s+64*q);
			}
			for (int ch = 0; ch < 6; ++ch) {
				/* Record s+1 of the group; record 0 held the seed. */
				const __m512 wt = _mm512_set1_ps(*(float*)(wts+1560*grp+24*s+24+4*ch));
				for (int q = 0; q < 4; ++q) {
					acc[ch][q] = _mm512_fmadd_ps(wt, in[q], acc[ch][q]);
				}
			}
		}
		/* Per plane: add the second tensor, clamp at zero, store. */
		for (int ch = 0; ch < 6; ++ch) {
			const ptrdiff_t off = 256*col+75648*grp+12608*ch;
			for (int q = 0; q < 4; ++q) {
				acc[ch][q] = _mm512_add_ps(acc[ch][q], _mm512_maskz_loadu_ps(65535, resid+off+64*q));
			}
			for (int q = 0; q < 4; ++q) {
				acc[ch][q] = _mm512_max_ps(_mm512_setzero_ps(), acc[ch][q]);
			}
			for (int q = 0; q < 4; ++q) {
				_mm512_mask_storeu_ps(out+off+64*q, 65535, acc[ch][q]);
			}
		}
		if (grp >= lastGrp) return;
	}
	/* Tail group (grp == 42 here): 4 planes, 16-byte weight records. */
	{
		__m512 acc[4][4];
		for (int ch = 0; ch < 4; ++ch) {
			const __m512 seed = _mm512_set1_ps(*(float*)(wts+1560*grp+4*ch));
			for (int q = 0; q < 4; ++q) {
				acc[ch][q] = seed;
			}
		}
		for (ptrdiff_t s = 0; s < 64; ++s) {
			__m512 in[4];
			for (int q = 0; q < 4; ++q) {
				in[q] = _mm512_loadu_ps(dats+16384*col+256*s+64*q);
			}
			for (int ch = 0; ch < 4; ++ch) {
				const __m512 wt = _mm512_set1_ps(*(float*)(wts+1560*grp+16*s+16+4*ch));
				for (int q = 0; q < 4; ++q) {
					acc[ch][q] = _mm512_fmadd_ps(wt, in[q], acc[ch][q]);
				}
			}
		}
		for (int ch = 0; ch < 4; ++ch) {
			const ptrdiff_t off = 256*col+75648*grp+12608*ch;
			for (int q = 0; q < 4; ++q) {
				acc[ch][q] = _mm512_add_ps(acc[ch][q], _mm512_maskz_loadu_ps(65535, resid+off+64*q));
			}
			for (int q = 0; q < 4; ++q) {
				acc[ch][q] = _mm512_max_ps(_mm512_setzero_ps(), acc[ch][q]);
			}
			for (int q = 0; q < 4; ++q) {
				_mm512_mask_storeu_ps(out+off+64*q, 65535, acc[ch][q]);
			}
		}
	}
}

/*
 * Launch the ResNet50OneApply2Callee1 kernel on a threader team.
 *
 * team28    - threader team handed unchanged to ResNet50ThreaderDo1.
 * tensors29 - tensor pointer table; stored as element 0 of the context
 *             array that the task carries to the callee.
 *
 * The task is described as a 3-dimensional hull of extents 5 x 49 x 1.
 */
static void ResNet50OneApply2(ResNet50ThreaderTeam1* team28, char** tensors29) {
	/* Extents of the iteration hull, one entry per task dimension. */
	const int extents[3] = {5, 49, 1};

	/* Context passed through task.any1: the tensor table followed by a null slot. */
	void* ctx[2];
	ctx[0] = tensors29;
	ctx[1] = 0;

	ResNet50ThreaderTask1 job;
	job.nd1 = 3;
	for (int d = 0; d < 3; ++d) {
		job.hull1[d] = extents[d];
	}
	job.any1 = ctx;
	job.callee1 = ResNet50OneApply2Callee1;

	/* NOTE(review): ctx and job are locals; this presumably relies on
	 * ResNet50ThreaderDo1 not returning until the callee is finished with
	 * them - confirm against the threader implementation. */
	ResNet50ThreaderDo1(team28, &job);
}

static void ResNet50OneArrangeWts3Callee1(ResNet50ThreaderTask1* task34, int64_t* pt22) {
char** tensors32 = task34->any1;
ptrdiff_t b49 = pt22[0];
char*restrict wtPtr6 = tensors32[0]+(ptrdiff_t)3340*0+(ptrdiff_t)65536*0;
char*restrict biasPtr6 = tensors32[1]+(ptrdiff_t)256*0;
char*restrict bnPtr6 = tensors32[2]+(ptrdiff_t)8*64*0;
char*restrict arranged5 = tensors32[3]+(ptrdiff_t)214016*0+(ptrdiff_t)65792*0;
ptrdiff_t ii7 = 1;
for (ptrdiff_t i22 = 0; i22 < ii7; ++i22) {
ptrdiff_t j17 = 2*b49;
ptrdiff_t jj26 = j17+2;
for (; j17 < jj26; ++j17) {
if (j17 < 3) {
ptrdiff_t k76 = 0+16*(j17-0);
ptrdiff_t l26 = (size_t)(0+k76)/6;
ptrdiff_t cut8 = (size_t)(0+k76)%6;
switch (cut8) {
case 0:;
case 2: {
__m512 sum123 = _mm512_maskz_loadu_ps(65535, biasPtr6+256*i22+4*k76);
__m512i pmMul11 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd11 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo9 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k76+64*i22));
__m512 masHi9 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k76+64*i22)+(ptrdiff_t)64);
__m512 postMul18 = _mm512_permutex2var_ps(masLo9, pmMul11, masHi9);
__m512 postAdd12 = _mm512_permutex2var_ps(masLo9, pmAdd11, masHi9);
sum123 = _mm512_fmadd_ps(sum123, postMul18, postAdd12);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)0, 63>>cut8, sum123);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)6144, 4032>>cut8, sum123);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)12288, 65535-(4095>>cut8), sum123);
ptrdiff_t c17 = 0;
for (; c17 != 16; ++c17) {
__m512 wt181 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)0);
__m512 wt182 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)1024);
__m512 wt183 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)2048);
__m512 wt184 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)3072);
__m512 wt185 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)4096);
__m512 wt186 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)5120);
__m512 wt187 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)6144);
__m512 wt188 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)7168);
__m512 wt189 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)8192);
__m512 wt190 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)9216);
__m512 wt191 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)10240);
__m512 wt192 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)11264);
__m512 wt193 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)12288);
__m512 wt194 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)13312);
__m512 wt195 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)14336);
__m512 wt196 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c17+(ptrdiff_t)15360);
__m512 tmp5397 = _mm512_unpacklo_ps(wt181, wt182);
__m512 tmp5398 = _mm512_unpackhi_ps(wt181, wt182);
__m512 tmp5399 = _mm512_unpacklo_ps(wt183, wt184);
__m512 tmp5400 = _mm512_unpackhi_ps(wt183, wt184);
__m512 tmp5401 = _mm512_unpacklo_ps(wt185, wt186);
__m512 tmp5402 = _mm512_unpackhi_ps(wt185, wt186);
__m512 tmp5403 = _mm512_unpacklo_ps(wt187, wt188);
__m512 tmp5404 = _mm512_unpackhi_ps(wt187, wt188);
__m512 tmp5405 = _mm512_unpacklo_ps(wt189, wt190);
__m512 tmp5406 = _mm512_unpackhi_ps(wt189, wt190);
__m512 tmp5407 = _mm512_unpacklo_ps(wt191, wt192);
__m512 tmp5408 = _mm512_unpackhi_ps(wt191, wt192);
__m512 tmp5409 = _mm512_unpacklo_ps(wt193, wt194);
__m512 tmp5410 = _mm512_unpackhi_ps(wt193, wt194);
__m512 tmp5411 = _mm512_unpacklo_ps(wt195, wt196);
__m512 tmp5412 = _mm512_unpackhi_ps(wt195, wt196);
__m512 tmp5413 = _mm512_shuffle_ps(tmp5397, tmp5399, 68);
__m512 tmp5414 = _mm512_shuffle_ps(tmp5397, tmp5399, 238);
__m512 tmp5415 = _mm512_shuffle_ps(tmp5398, tmp5400, 68);
__m512 tmp5416 = _mm512_shuffle_ps(tmp5398, tmp5400, 238);
__m512 tmp5417 = _mm512_shuffle_ps(tmp5401, tmp5403, 68);
__m512 tmp5418 = _mm512_shuffle_ps(tmp5401, tmp5403, 238);
__m512 tmp5419 = _mm512_shuffle_ps(tmp5402, tmp5404, 68);
__m512 tmp5420 = _mm512_shuffle_ps(tmp5402, tmp5404, 238);
__m512 tmp5421 = _mm512_shuffle_ps(tmp5405, tmp5407, 68);
__m512 tmp5422 = _mm512_shuffle_ps(tmp5405, tmp5407, 238);
__m512 tmp5423 = _mm512_shuffle_ps(tmp5406, tmp5408, 68);
__m512 tmp5424 = _mm512_shuffle_ps(tmp5406, tmp5408, 238);
__m512 tmp5425 = _mm512_shuffle_ps(tmp5409, tmp5411, 68);
__m512 tmp5426 = _mm512_shuffle_ps(tmp5409, tmp5411, 238);
__m512 tmp5427 = _mm512_shuffle_ps(tmp5410, tmp5412, 68);
__m512 tmp5428 = _mm512_shuffle_ps(tmp5410, tmp5412, 238);
__m512 tmp5429 = _mm512_shuffle_f32x4(tmp5413, tmp5417, 136);
__m512 tmp5430 = _mm512_shuffle_f32x4(tmp5413, tmp5417, 221);
__m512 tmp5431 = _mm512_shuffle_f32x4(tmp5414, tmp5418, 136);
__m512 tmp5432 = _mm512_shuffle_f32x4(tmp5414, tmp5418, 221);
__m512 tmp5433 = _mm512_shuffle_f32x4(tmp5415, tmp5419, 136);
__m512 tmp5434 = _mm512_shuffle_f32x4(tmp5415, tmp5419, 221);
__m512 tmp5435 = _mm512_shuffle_f32x4(tmp5416, tmp5420, 136);
__m512 tmp5436 = _mm512_shuffle_f32x4(tmp5416, tmp5420, 221);
__m512 tmp5437 = _mm512_shuffle_f32x4(tmp5421, tmp5425, 136);
__m512 tmp5438 = _mm512_shuffle_f32x4(tmp5421, tmp5425, 221);
__m512 tmp5439 = _mm512_shuffle_f32x4(tmp5422, tmp5426, 136);
__m512 tmp5440 = _mm512_shuffle_f32x4(tmp5422, tmp5426, 221);
__m512 tmp5441 = _mm512_shuffle_f32x4(tmp5423, tmp5427, 136);
__m512 tmp5442 = _mm512_shuffle_f32x4(tmp5423, tmp5427, 221);
__m512 tmp5443 = _mm512_shuffle_f32x4(tmp5424, tmp5428, 136);
__m512 tmp5444 = _mm512_shuffle_f32x4(tmp5424, tmp5428, 221);
wt181 = _mm512_shuffle_f32x4(tmp5429, tmp5437, 136);
wt189 = _mm512_shuffle_f32x4(tmp5429, tmp5437, 221);
wt182 = _mm512_shuffle_f32x4(tmp5431, tmp5439, 136);
wt190 = _mm512_shuffle_f32x4(tmp5431, tmp5439, 221);
wt183 = _mm512_shuffle_f32x4(tmp5433, tmp5441, 136);
wt191 = _mm512_shuffle_f32x4(tmp5433, tmp5441, 221);
wt184 = _mm512_shuffle_f32x4(tmp5435, tmp5443, 136);
wt192 = _mm512_shuffle_f32x4(tmp5435, tmp5443, 221);
wt185 = _mm512_shuffle_f32x4(tmp5430, tmp5438, 136);
wt193 = _mm512_shuffle_f32x4(tmp5430, tmp5438, 221);
wt186 = _mm512_shuffle_f32x4(tmp5432, tmp5440, 136);
wt194 = _mm512_shuffle_f32x4(tmp5432, tmp5440, 221);
wt187 = _mm512_shuffle_f32x4(tmp5434, tmp5442, 136);
wt195 = _mm512_shuffle_f32x4(tmp5434, tmp5442, 221);
wt188 = _mm512_shuffle_f32x4(tmp5436, tmp5444, 136);
wt196 = _mm512_shuffle_f32x4(tmp5436, tmp5444, 221);
wt181 = _mm512_mul_ps(wt181, postMul18);
wt182 = _mm512_mul_ps(wt182, postMul18);
wt183 = _mm512_mul_ps(wt183, postMul18);
wt184 = _mm512_mul_ps(wt184, postMul18);
wt185 = _mm512_mul_ps(wt185, postMul18);
wt186 = _mm512_mul_ps(wt186, postMul18);
wt187 = _mm512_mul_ps(wt187, postMul18);
wt188 = _mm512_mul_ps(wt188, postMul18);
wt189 = _mm512_mul_ps(wt189, postMul18);
wt190 = _mm512_mul_ps(wt190, postMul18);
wt191 = _mm512_mul_ps(wt191, postMul18);
wt192 = _mm512_mul_ps(wt192, postMul18);
wt193 = _mm512_mul_ps(wt193, postMul18);
wt194 = _mm512_mul_ps(wt194, postMul18);
wt195 = _mm512_mul_ps(wt195, postMul18);
wt196 = _mm512_mul_ps(wt196, postMul18);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c17)+(ptrdiff_t)0, 63>>cut8, wt181);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c17)+(ptrdiff_t)0, 63>>cut8, wt182);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c17)+(ptrdiff_t)0, 63>>cut8, wt183);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c17)+(ptrdiff_t)0, 63>>cut8, wt184);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c17)+(ptrdiff_t)0, 63>>cut8, wt185);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c17)+(ptrdiff_t)0, 63>>cut8, wt186);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c17)+(ptrdiff_t)0, 63>>cut8, wt187);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c17)+(ptrdiff_t)0, 63>>cut8, wt188);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c17)+(ptrdiff_t)0, 63>>cut8, wt189);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c17)+(ptrdiff_t)0, 63>>cut8, wt190);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c17)+(ptrdiff_t)0, 63>>cut8, wt191);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c17)+(ptrdiff_t)0, 63>>cut8, wt192);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c17)+(ptrdiff_t)0, 63>>cut8, wt193);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c17)+(ptrdiff_t)0, 63>>cut8, wt194);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c17)+(ptrdiff_t)0, 63>>cut8, wt195);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c17)+(ptrdiff_t)0, 63>>cut8, wt196);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt181);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt182);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt183);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt184);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt185);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt186);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt187);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt188);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt189);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt190);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt191);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt192);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt193);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt194);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt195);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c17)+(ptrdiff_t)6144, 4032>>cut8, wt196);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt181);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt182);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt183);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt184);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt185);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt186);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt187);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt188);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt189);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt190);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt191);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt192);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt193);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt194);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt195);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c17)+(ptrdiff_t)12288, 65535-(4095>>cut8), wt196);
}
break;
}
default: {
cut8 = 4;
__m512 sum124 = _mm512_maskz_loadu_ps(65535, biasPtr6+256*i22+4*k76);
__m512i pmMul12 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd12 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo10 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k76+64*i22));
__m512 masHi10 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k76+64*i22)+(ptrdiff_t)64);
__m512 postMul19 = _mm512_permutex2var_ps(masLo10, pmMul12, masHi10);
__m512 postAdd13 = _mm512_permutex2var_ps(masLo10, pmAdd12, masHi10);
sum124 = _mm512_fmadd_ps(sum124, postMul19, postAdd13);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)0, 63>>cut8, sum124);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)6144, 4032>>cut8, sum124);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)12288, 258048>>cut8, sum124);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*0+(ptrdiff_t)18432, 65535-(262143>>cut8), sum124);
ptrdiff_t c18 = 0;
for (; c18 != 16; ++c18) {
__m512 wt197 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)0);
__m512 wt198 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)1024);
__m512 wt199 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)2048);
__m512 wt200 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)3072);
__m512 wt201 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)4096);
__m512 wt202 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)5120);
__m512 wt203 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)6144);
__m512 wt204 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)7168);
__m512 wt205 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)8192);
__m512 wt206 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)9216);
__m512 wt207 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)10240);
__m512 wt208 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)11264);
__m512 wt209 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)12288);
__m512 wt210 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)13312);
__m512 wt211 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)14336);
__m512 wt212 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k76+64*c18+(ptrdiff_t)15360);
__m512 tmp5445 = _mm512_unpacklo_ps(wt197, wt198);
__m512 tmp5446 = _mm512_unpackhi_ps(wt197, wt198);
__m512 tmp5447 = _mm512_unpacklo_ps(wt199, wt200);
__m512 tmp5448 = _mm512_unpackhi_ps(wt199, wt200);
__m512 tmp5449 = _mm512_unpacklo_ps(wt201, wt202);
__m512 tmp5450 = _mm512_unpackhi_ps(wt201, wt202);
__m512 tmp5451 = _mm512_unpacklo_ps(wt203, wt204);
__m512 tmp5452 = _mm512_unpackhi_ps(wt203, wt204);
__m512 tmp5453 = _mm512_unpacklo_ps(wt205, wt206);
__m512 tmp5454 = _mm512_unpackhi_ps(wt205, wt206);
__m512 tmp5455 = _mm512_unpacklo_ps(wt207, wt208);
__m512 tmp5456 = _mm512_unpackhi_ps(wt207, wt208);
__m512 tmp5457 = _mm512_unpacklo_ps(wt209, wt210);
__m512 tmp5458 = _mm512_unpackhi_ps(wt209, wt210);
__m512 tmp5459 = _mm512_unpacklo_ps(wt211, wt212);
__m512 tmp5460 = _mm512_unpackhi_ps(wt211, wt212);
__m512 tmp5461 = _mm512_shuffle_ps(tmp5445, tmp5447, 68);
__m512 tmp5462 = _mm512_shuffle_ps(tmp5445, tmp5447, 238);
__m512 tmp5463 = _mm512_shuffle_ps(tmp5446, tmp5448, 68);
__m512 tmp5464 = _mm512_shuffle_ps(tmp5446, tmp5448, 238);
__m512 tmp5465 = _mm512_shuffle_ps(tmp5449, tmp5451, 68);
__m512 tmp5466 = _mm512_shuffle_ps(tmp5449, tmp5451, 238);
__m512 tmp5467 = _mm512_shuffle_ps(tmp5450, tmp5452, 68);
__m512 tmp5468 = _mm512_shuffle_ps(tmp5450, tmp5452, 238);
__m512 tmp5469 = _mm512_shuffle_ps(tmp5453, tmp5455, 68);
__m512 tmp5470 = _mm512_shuffle_ps(tmp5453, tmp5455, 238);
__m512 tmp5471 = _mm512_shuffle_ps(tmp5454, tmp5456, 68);
__m512 tmp5472 = _mm512_shuffle_ps(tmp5454, tmp5456, 238);
__m512 tmp5473 = _mm512_shuffle_ps(tmp5457, tmp5459, 68);
__m512 tmp5474 = _mm512_shuffle_ps(tmp5457, tmp5459, 238);
__m512 tmp5475 = _mm512_shuffle_ps(tmp5458, tmp5460, 68);
__m512 tmp5476 = _mm512_shuffle_ps(tmp5458, tmp5460, 238);
__m512 tmp5477 = _mm512_shuffle_f32x4(tmp5461, tmp5465, 136);
__m512 tmp5478 = _mm512_shuffle_f32x4(tmp5461, tmp5465, 221);
__m512 tmp5479 = _mm512_shuffle_f32x4(tmp5462, tmp5466, 136);
__m512 tmp5480 = _mm512_shuffle_f32x4(tmp5462, tmp5466, 221);
__m512 tmp5481 = _mm512_shuffle_f32x4(tmp5463, tmp5467, 136);
__m512 tmp5482 = _mm512_shuffle_f32x4(tmp5463, tmp5467, 221);
__m512 tmp5483 = _mm512_shuffle_f32x4(tmp5464, tmp5468, 136);
__m512 tmp5484 = _mm512_shuffle_f32x4(tmp5464, tmp5468, 221);
__m512 tmp5485 = _mm512_shuffle_f32x4(tmp5469, tmp5473, 136);
__m512 tmp5486 = _mm512_shuffle_f32x4(tmp5469, tmp5473, 221);
__m512 tmp5487 = _mm512_shuffle_f32x4(tmp5470, tmp5474, 136);
__m512 tmp5488 = _mm512_shuffle_f32x4(tmp5470, tmp5474, 221);
__m512 tmp5489 = _mm512_shuffle_f32x4(tmp5471, tmp5475, 136);
__m512 tmp5490 = _mm512_shuffle_f32x4(tmp5471, tmp5475, 221);
__m512 tmp5491 = _mm512_shuffle_f32x4(tmp5472, tmp5476, 136);
__m512 tmp5492 = _mm512_shuffle_f32x4(tmp5472, tmp5476, 221);
wt197 = _mm512_shuffle_f32x4(tmp5477, tmp5485, 136);
wt205 = _mm512_shuffle_f32x4(tmp5477, tmp5485, 221);
wt198 = _mm512_shuffle_f32x4(tmp5479, tmp5487, 136);
wt206 = _mm512_shuffle_f32x4(tmp5479, tmp5487, 221);
wt199 = _mm512_shuffle_f32x4(tmp5481, tmp5489, 136);
wt207 = _mm512_shuffle_f32x4(tmp5481, tmp5489, 221);
wt200 = _mm512_shuffle_f32x4(tmp5483, tmp5491, 136);
wt208 = _mm512_shuffle_f32x4(tmp5483, tmp5491, 221);
wt201 = _mm512_shuffle_f32x4(tmp5478, tmp5486, 136);
wt209 = _mm512_shuffle_f32x4(tmp5478, tmp5486, 221);
wt202 = _mm512_shuffle_f32x4(tmp5480, tmp5488, 136);
wt210 = _mm512_shuffle_f32x4(tmp5480, tmp5488, 221);
wt203 = _mm512_shuffle_f32x4(tmp5482, tmp5490, 136);
wt211 = _mm512_shuffle_f32x4(tmp5482, tmp5490, 221);
wt204 = _mm512_shuffle_f32x4(tmp5484, tmp5492, 136);
wt212 = _mm512_shuffle_f32x4(tmp5484, tmp5492, 221);
wt197 = _mm512_mul_ps(wt197, postMul19);
wt198 = _mm512_mul_ps(wt198, postMul19);
wt199 = _mm512_mul_ps(wt199, postMul19);
wt200 = _mm512_mul_ps(wt200, postMul19);
wt201 = _mm512_mul_ps(wt201, postMul19);
wt202 = _mm512_mul_ps(wt202, postMul19);
wt203 = _mm512_mul_ps(wt203, postMul19);
wt204 = _mm512_mul_ps(wt204, postMul19);
wt205 = _mm512_mul_ps(wt205, postMul19);
wt206 = _mm512_mul_ps(wt206, postMul19);
wt207 = _mm512_mul_ps(wt207, postMul19);
wt208 = _mm512_mul_ps(wt208, postMul19);
wt209 = _mm512_mul_ps(wt209, postMul19);
wt210 = _mm512_mul_ps(wt210, postMul19);
wt211 = _mm512_mul_ps(wt211, postMul19);
wt212 = _mm512_mul_ps(wt212, postMul19);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c18)+(ptrdiff_t)0, 63>>cut8, wt197);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c18)+(ptrdiff_t)0, 63>>cut8, wt198);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c18)+(ptrdiff_t)0, 63>>cut8, wt199);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c18)+(ptrdiff_t)0, 63>>cut8, wt200);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c18)+(ptrdiff_t)0, 63>>cut8, wt201);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c18)+(ptrdiff_t)0, 63>>cut8, wt202);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c18)+(ptrdiff_t)0, 63>>cut8, wt203);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c18)+(ptrdiff_t)0, 63>>cut8, wt204);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c18)+(ptrdiff_t)0, 63>>cut8, wt205);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c18)+(ptrdiff_t)0, 63>>cut8, wt206);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c18)+(ptrdiff_t)0, 63>>cut8, wt207);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c18)+(ptrdiff_t)0, 63>>cut8, wt208);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c18)+(ptrdiff_t)0, 63>>cut8, wt209);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c18)+(ptrdiff_t)0, 63>>cut8, wt210);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c18)+(ptrdiff_t)0, 63>>cut8, wt211);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c18)+(ptrdiff_t)0, 63>>cut8, wt212);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt197);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt198);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt199);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt200);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt201);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt202);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt203);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt204);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt205);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt206);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt207);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt208);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt209);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt210);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt211);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c18)+(ptrdiff_t)6144, 4032>>cut8, wt212);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt197);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt198);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt199);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt200);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt201);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt202);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt203);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt204);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt205);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt206);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt207);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt208);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt209);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt210);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt211);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c18)+(ptrdiff_t)12288, 258048>>cut8, wt212);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(1+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt197);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(2+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt198);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(3+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt199);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(4+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt200);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(5+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt201);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(6+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt202);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(7+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt203);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(8+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt204);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(9+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt205);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(10+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt206);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(11+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt207);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(12+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt208);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(13+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt209);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(14+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt210);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(15+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt211);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l26+4*cut8+24*(16+16*c18)+(ptrdiff_t)18432, 65535-(262143>>cut8), wt212);
}
}
}
} else {
ptrdiff_t k75 = 48;
ptrdiff_t l25 = (size_t)(0+k75)/6;
ptrdiff_t cut7 = (size_t)(0+k75)%6;
__m512 sum122 = _mm512_maskz_loadu_ps(65535, biasPtr6+256*i22+4*k75);
__m512i pmMul13 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd13 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo11 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k75+64*i22));
__m512 masHi11 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k75+64*i22)+(ptrdiff_t)64);
__m512 postMul17 = _mm512_permutex2var_ps(masLo11, pmMul13, masHi11);
__m512 postAdd11 = _mm512_permutex2var_ps(masLo11, pmAdd13, masHi11);
sum122 = _mm512_fmadd_ps(sum122, postMul17, postAdd11);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*0+(ptrdiff_t)0, 63>>cut7, sum122);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*0+(ptrdiff_t)6144, 4032>>cut7, sum122);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*0+(ptrdiff_t)12288, 65535-(4095>>cut7), sum122);
ptrdiff_t c16 = 0;
for (; c16 != 16; ++c16) {
__m512 wt165 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)0);
__m512 wt166 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)1024);
__m512 wt167 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)2048);
__m512 wt168 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)3072);
__m512 wt169 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)4096);
__m512 wt170 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)5120);
__m512 wt171 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)6144);
__m512 wt172 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)7168);
__m512 wt173 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)8192);
__m512 wt174 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)9216);
__m512 wt175 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)10240);
__m512 wt176 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)11264);
__m512 wt177 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)12288);
__m512 wt178 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)13312);
__m512 wt179 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)14336);
__m512 wt180 = _mm512_maskz_loadu_ps(65535, wtPtr6+65536*i22+1024*k75+64*c16+(ptrdiff_t)15360);
__m512 tmp5493 = _mm512_unpacklo_ps(wt165, wt166);
__m512 tmp5494 = _mm512_unpackhi_ps(wt165, wt166);
__m512 tmp5495 = _mm512_unpacklo_ps(wt167, wt168);
__m512 tmp5496 = _mm512_unpackhi_ps(wt167, wt168);
__m512 tmp5497 = _mm512_unpacklo_ps(wt169, wt170);
__m512 tmp5498 = _mm512_unpackhi_ps(wt169, wt170);
__m512 tmp5499 = _mm512_unpacklo_ps(wt171, wt172);
__m512 tmp5500 = _mm512_unpackhi_ps(wt171, wt172);
__m512 tmp5501 = _mm512_unpacklo_ps(wt173, wt174);
__m512 tmp5502 = _mm512_unpackhi_ps(wt173, wt174);
__m512 tmp5503 = _mm512_unpacklo_ps(wt175, wt176);
__m512 tmp5504 = _mm512_unpackhi_ps(wt175, wt176);
__m512 tmp5505 = _mm512_unpacklo_ps(wt177, wt178);
__m512 tmp5506 = _mm512_unpackhi_ps(wt177, wt178);
__m512 tmp5507 = _mm512_unpacklo_ps(wt179, wt180);
__m512 tmp5508 = _mm512_unpackhi_ps(wt179, wt180);
__m512 tmp5509 = _mm512_shuffle_ps(tmp5493, tmp5495, 68);
__m512 tmp5510 = _mm512_shuffle_ps(tmp5493, tmp5495, 238);
__m512 tmp5511 = _mm512_shuffle_ps(tmp5494, tmp5496, 68);
__m512 tmp5512 = _mm512_shuffle_ps(tmp5494, tmp5496, 238);
__m512 tmp5513 = _mm512_shuffle_ps(tmp5497, tmp5499, 68);
__m512 tmp5514 = _mm512_shuffle_ps(tmp5497, tmp5499, 238);
__m512 tmp5515 = _mm512_shuffle_ps(tmp5498, tmp5500, 68);
__m512 tmp5516 = _mm512_shuffle_ps(tmp5498, tmp5500, 238);
__m512 tmp5517 = _mm512_shuffle_ps(tmp5501, tmp5503, 68);
__m512 tmp5518 = _mm512_shuffle_ps(tmp5501, tmp5503, 238);
__m512 tmp5519 = _mm512_shuffle_ps(tmp5502, tmp5504, 68);
__m512 tmp5520 = _mm512_shuffle_ps(tmp5502, tmp5504, 238);
__m512 tmp5521 = _mm512_shuffle_ps(tmp5505, tmp5507, 68);
__m512 tmp5522 = _mm512_shuffle_ps(tmp5505, tmp5507, 238);
__m512 tmp5523 = _mm512_shuffle_ps(tmp5506, tmp5508, 68);
__m512 tmp5524 = _mm512_shuffle_ps(tmp5506, tmp5508, 238);
__m512 tmp5525 = _mm512_shuffle_f32x4(tmp5509, tmp5513, 136);
__m512 tmp5526 = _mm512_shuffle_f32x4(tmp5509, tmp5513, 221);
__m512 tmp5527 = _mm512_shuffle_f32x4(tmp5510, tmp5514, 136);
__m512 tmp5528 = _mm512_shuffle_f32x4(tmp5510, tmp5514, 221);
__m512 tmp5529 = _mm512_shuffle_f32x4(tmp5511, tmp5515, 136);
__m512 tmp5530 = _mm512_shuffle_f32x4(tmp5511, tmp5515, 221);
__m512 tmp5531 = _mm512_shuffle_f32x4(tmp5512, tmp5516, 136);
__m512 tmp5532 = _mm512_shuffle_f32x4(tmp5512, tmp5516, 221);
__m512 tmp5533 = _mm512_shuffle_f32x4(tmp5517, tmp5521, 136);
__m512 tmp5534 = _mm512_shuffle_f32x4(tmp5517, tmp5521, 221);
__m512 tmp5535 = _mm512_shuffle_f32x4(tmp5518, tmp5522, 136);
__m512 tmp5536 = _mm512_shuffle_f32x4(tmp5518, tmp5522, 221);
__m512 tmp5537 = _mm512_shuffle_f32x4(tmp5519, tmp5523, 136);
__m512 tmp5538 = _mm512_shuffle_f32x4(tmp5519, tmp5523, 221);
__m512 tmp5539 = _mm512_shuffle_f32x4(tmp5520, tmp5524, 136);
__m512 tmp5540 = _mm512_shuffle_f32x4(tmp5520, tmp5524, 221);
wt165 = _mm512_shuffle_f32x4(tmp5525, tmp5533, 136);
wt173 = _mm512_shuffle_f32x4(tmp5525, tmp5533, 221);
wt166 = _mm512_shuffle_f32x4(tmp5527, tmp5535, 136);
wt174 = _mm512_shuffle_f32x4(tmp5527, tmp5535, 221);
wt167 = _mm512_shuffle_f32x4(tmp5529, tmp5537, 136);
wt175 = _mm512_shuffle_f32x4(tmp5529, tmp5537, 221);
wt168 = _mm512_shuffle_f32x4(tmp5531, tmp5539, 136);
wt176 = _mm512_shuffle_f32x4(tmp5531, tmp5539, 221);
wt169 = _mm512_shuffle_f32x4(tmp5526, tmp5534, 136);
wt177 = _mm512_shuffle_f32x4(tmp5526, tmp5534, 221);
wt170 = _mm512_shuffle_f32x4(tmp5528, tmp5536, 136);
wt178 = _mm512_shuffle_f32x4(tmp5528, tmp5536, 221);
wt171 = _mm512_shuffle_f32x4(tmp5530, tmp5538, 136);
wt179 = _mm512_shuffle_f32x4(tmp5530, tmp5538, 221);
wt172 = _mm512_shuffle_f32x4(tmp5532, tmp5540, 136);
wt180 = _mm512_shuffle_f32x4(tmp5532, tmp5540, 221);
wt165 = _mm512_mul_ps(wt165, postMul17);
wt166 = _mm512_mul_ps(wt166, postMul17);
wt167 = _mm512_mul_ps(wt167, postMul17);
wt168 = _mm512_mul_ps(wt168, postMul17);
wt169 = _mm512_mul_ps(wt169, postMul17);
wt170 = _mm512_mul_ps(wt170, postMul17);
wt171 = _mm512_mul_ps(wt171, postMul17);
wt172 = _mm512_mul_ps(wt172, postMul17);
wt173 = _mm512_mul_ps(wt173, postMul17);
wt174 = _mm512_mul_ps(wt174, postMul17);
wt175 = _mm512_mul_ps(wt175, postMul17);
wt176 = _mm512_mul_ps(wt176, postMul17);
wt177 = _mm512_mul_ps(wt177, postMul17);
wt178 = _mm512_mul_ps(wt178, postMul17);
wt179 = _mm512_mul_ps(wt179, postMul17);
wt180 = _mm512_mul_ps(wt180, postMul17);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)0, 63>>cut7, wt165);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)0, 63>>cut7, wt166);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)0, 63>>cut7, wt167);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)0, 63>>cut7, wt168);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)0, 63>>cut7, wt169);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)0, 63>>cut7, wt170);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)0, 63>>cut7, wt171);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)0, 63>>cut7, wt172);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)0, 63>>cut7, wt173);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)0, 63>>cut7, wt174);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)0, 63>>cut7, wt175);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)0, 63>>cut7, wt176);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)0, 63>>cut7, wt177);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)0, 63>>cut7, wt178);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)0, 63>>cut7, wt179);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)0, 63>>cut7, wt180);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt165);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt166);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt167);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt168);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt169);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt170);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt171);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt172);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt173);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt174);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt175);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt176);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt177);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt178);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt179);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt180);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(1+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt165);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(2+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt166);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(3+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt167);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(4+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt168);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(5+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt169);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(6+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt170);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(7+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt171);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(8+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt172);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(9+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt173);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(10+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt174);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(11+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt175);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(12+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt176);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(13+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt177);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(14+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt178);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(15+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt179);
_mm512_mask_storeu_ps(arranged5+65792*i22+6168*l25+4*cut7+16*(16+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt180);
}
}
}
}
}

// Runs ResNet50OneArrangeWts3Callee1 on the thread team over a 3-dimensional
// index space with extents {2, 1, 1}. `tensors31` is handed to the callee
// unchanged through the task's `any1` slot.
static void ResNet50OneArrangeWts3(ResNet50ThreaderTeam1* team29, char** tensors31) {
    static const int extents[3] = {2, 1, 1};
    ResNet50ThreaderTask1 job;
    job.any1 = tensors31;
    job.callee1 = ResNet50OneArrangeWts3Callee1;
    job.nd1 = 3;
    for (int axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    ResNet50ThreaderDo1(team29, &job);
}

// Thread-team callee that re-lays out one 256-byte tile of the source tensor.
//
//   task36->any1 : char** table; [0] is the source buffer, [1] the destination.
//   pt23[0]      : selects which block of 128 consecutive rows to copy
//                  (rows 128*pt23[0] .. 128*pt23[0]+127).
//   pt23[1]      : tile index; the value 49 copies nothing.
//
// For every selected row, 256 bytes (four 64-byte vectors) are read from
//   src + 256*tile + 12608*row
// and written to
//   dst + 65536*tile + 256*row
// i.e. the source is row-major with a 12608-byte row pitch and the
// destination is tile-major with a 65536-byte tile pitch.
// NOTE(review): rows presumably correspond to input channels - confirm
// against the graph definition.
static void ResNet50OneArrangeDats3Callee1(ResNet50ThreaderTask1* task36, int64_t* pt23) {
    char** table = task36->any1;
    const ptrdiff_t half = pt23[0];
    const ptrdiff_t tile = pt23[1];
    char*restrict src = table[0];
    char*restrict dst = table[1];
    // Tile index 49 is treated as "past the end": no work is done.
    if (tile == 49) {
        return;
    }
    const ptrdiff_t rowBegin = 128*half;
    const ptrdiff_t rowEnd = rowBegin+128;
    for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
        const char* from = src+256*tile+12608*row;
        char* to = dst+65536*tile+256*row;
        // Mask 65535 enables all 16 lanes, so each access moves a full vector.
        __m512 q0 = _mm512_maskz_loadu_ps(65535, from);
        __m512 q1 = _mm512_maskz_loadu_ps(65535, from+64);
        __m512 q2 = _mm512_maskz_loadu_ps(65535, from+128);
        __m512 q3 = _mm512_maskz_loadu_ps(65535, from+192);
        _mm512_mask_storeu_ps(to, 65535, q0);
        _mm512_mask_storeu_ps(to+64, 65535, q1);
        _mm512_mask_storeu_ps(to+128, 65535, q2);
        _mm512_mask_storeu_ps(to+192, 65535, q3);
    }
}

// Runs ResNet50OneArrangeDats3Callee1 on the thread team over a 4-dimensional
// index space with extents {2, 49, 1, 1}. `tensors33` is handed to the callee
// unchanged through the task's `any1` slot.
static void ResNet50OneArrangeDats3(ResNet50ThreaderTeam1* team30, char** tensors33) {
    static const int extents[4] = {2, 49, 1, 1};
    ResNet50ThreaderTask1 job;
    job.any1 = tensors33;
    job.callee1 = ResNet50OneArrangeDats3Callee1;
    job.nd1 = 4;
    for (int axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    ResNet50ThreaderDo1(team30, &job);
}

// Thread-team callee: multiply-accumulate kernel for the third "One" stage.
//
//   task38->any1 : void*[]; element [0] is a char** tensor table. Element [1]
//                  is not read by this function.
//                  table[0] = arranged weights, table[1] = arranged data,
//                  table[2] = output buffer.
//   pt24[0] (w35): selects the starting weight group, k78 = 2*w35.
//   pt24[1] (d7) : selects the 256-byte data tile j19 (64 floats).
//
// Weight layout as read here: groups are 6168 bytes apart. Within a group,
// record 0 seeds the accumulators and records 1..256 are the per-step weights.
// Records are 24 bytes (6 floats) for groups 0..9 and 16 bytes (4 floats) for
// group 10.
// NOTE(review): record 0 is presumably the folded bias - confirm against the
// matching ArrangeWts routine.
//
// Each group produces one output row per float in its record; rows are 12608
// bytes apart in the output and every result is clamped below at zero before
// being stored.
static void ResNet50OneApply3Callee1(ResNet50ThreaderTask1* task38, int64_t* pt24) {
void** pair8 = task38->any1;
char** tensors36 = pair8[0];
// e11 and g12 are fixed at 0, so the base pointers below are simply the table entries.
ptrdiff_t e11 = 0;
ptrdiff_t g12 = 0;
ptrdiff_t d7 = pt24[1];
ptrdiff_t w35 = pt24[0];
char*restrict arrangedWts3 = tensors36[0]+214016*e11+(ptrdiff_t)65792*1*g12;
char*restrict arrangedDats3 = tensors36[1]+10474240*e11+(ptrdiff_t)3211264*1*g12;
char*restrict datPtr11 = tensors36[2]+(ptrdiff_t)806912*1*g12;
ptrdiff_t ii9 = 1;
for (ptrdiff_t i24 = 0; i24 < ii9; ++i24) {
ptrdiff_t j19 = 1*d7;
ptrdiff_t jj28 = j19+0;
for (; j19 != 49; ++j19) {
// For w35 < 4 the loop below handles groups k78 and k78+1 and then returns.
// For w35 == 4 it handles groups 8 and 9, leaves the loop with k78 == 10,
// and falls through to the 4-wide tail further down.
ptrdiff_t k78 = 2*w35;
ptrdiff_t kk30 = k78+(w35 < 4 ? 1 : 2);
for (; k78 != 10; ++k78) {
// s17 == -1 makes 24*s17+24 == 0, i.e. the six floats of record 0 of this group.
ptrdiff_t s17 = -1;
__m512 sum125 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)24));
__m512 sum129 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)28));
__m512 sum133 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)32));
__m512 sum137 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)36));
__m512 sum141 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)40));
__m512 sum145 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)44));
// Each output row uses four accumulators (one per 16-float quarter of the tile),
// all seeded with the same broadcast value.
__m512 sum126 = sum125;
__m512 sum127 = sum125;
__m512 sum128 = sum125;
__m512 sum130 = sum129;
__m512 sum131 = sum129;
__m512 sum132 = sum129;
__m512 sum134 = sum133;
__m512 sum135 = sum133;
__m512 sum136 = sum133;
__m512 sum138 = sum137;
__m512 sum139 = sum137;
__m512 sum140 = sum137;
__m512 sum142 = sum141;
__m512 sum143 = sum141;
__m512 sum144 = sum141;
__m512 sum146 = sum145;
__m512 sum147 = sum145;
__m512 sum148 = sum145;
// 256 accumulation steps: load the 64 data floats for step s17 once, then
// reuse them against six broadcast weights (6 rows x 4 vectors = 24 FMAs).
for (s17 = 0; s17 < 256; ++s17) {
__m512 dat1287 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s17+(ptrdiff_t)0);
__m512 dat1288 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s17+(ptrdiff_t)64);
__m512 dat1289 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s17+(ptrdiff_t)128);
__m512 dat1290 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s17+(ptrdiff_t)192);
__m512 wt213 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)24));
sum125 = _mm512_fmadd_ps(wt213, dat1287, sum125);
sum126 = _mm512_fmadd_ps(wt213, dat1288, sum126);
sum127 = _mm512_fmadd_ps(wt213, dat1289, sum127);
sum128 = _mm512_fmadd_ps(wt213, dat1290, sum128);
__m512 wt214 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)28));
sum129 = _mm512_fmadd_ps(wt214, dat1287, sum129);
sum130 = _mm512_fmadd_ps(wt214, dat1288, sum130);
sum131 = _mm512_fmadd_ps(wt214, dat1289, sum131);
sum132 = _mm512_fmadd_ps(wt214, dat1290, sum132);
__m512 wt215 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)32));
sum133 = _mm512_fmadd_ps(wt215, dat1287, sum133);
sum134 = _mm512_fmadd_ps(wt215, dat1288, sum134);
sum135 = _mm512_fmadd_ps(wt215, dat1289, sum135);
sum136 = _mm512_fmadd_ps(wt215, dat1290, sum136);
__m512 wt216 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)36));
sum137 = _mm512_fmadd_ps(wt216, dat1287, sum137);
sum138 = _mm512_fmadd_ps(wt216, dat1288, sum138);
sum139 = _mm512_fmadd_ps(wt216, dat1289, sum139);
sum140 = _mm512_fmadd_ps(wt216, dat1290, sum140);
__m512 wt217 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)40));
sum141 = _mm512_fmadd_ps(wt217, dat1287, sum141);
sum142 = _mm512_fmadd_ps(wt217, dat1288, sum142);
sum143 = _mm512_fmadd_ps(wt217, dat1289, sum143);
sum144 = _mm512_fmadd_ps(wt217, dat1290, sum144);
__m512 wt218 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+24*s17+(ptrdiff_t)44));
sum145 = _mm512_fmadd_ps(wt218, dat1287, sum145);
sum146 = _mm512_fmadd_ps(wt218, dat1288, sum146);
sum147 = _mm512_fmadd_ps(wt218, dat1289, sum147);
sum148 = _mm512_fmadd_ps(wt218, dat1290, sum148);
}
// Clamp at zero (max(0, x)) and store. Output row r of this group lives at
// 75648*k78 + 12608*r (75648 == 6*12608); mask 65535 writes all 16 lanes.
sum125 = _mm512_max_ps(_mm512_setzero_ps(), sum125);
sum126 = _mm512_max_ps(_mm512_setzero_ps(), sum126);
sum127 = _mm512_max_ps(_mm512_setzero_ps(), sum127);
sum128 = _mm512_max_ps(_mm512_setzero_ps(), sum128);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)0, 65535, sum125);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)64, 65535, sum126);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)128, 65535, sum127);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)192, 65535, sum128);
sum129 = _mm512_max_ps(_mm512_setzero_ps(), sum129);
sum130 = _mm512_max_ps(_mm512_setzero_ps(), sum130);
sum131 = _mm512_max_ps(_mm512_setzero_ps(), sum131);
sum132 = _mm512_max_ps(_mm512_setzero_ps(), sum132);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12608, 65535, sum129);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12672, 65535, sum130);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12736, 65535, sum131);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12800, 65535, sum132);
sum133 = _mm512_max_ps(_mm512_setzero_ps(), sum133);
sum134 = _mm512_max_ps(_mm512_setzero_ps(), sum134);
sum135 = _mm512_max_ps(_mm512_setzero_ps(), sum135);
sum136 = _mm512_max_ps(_mm512_setzero_ps(), sum136);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25216, 65535, sum133);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25280, 65535, sum134);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25344, 65535, sum135);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25408, 65535, sum136);
sum137 = _mm512_max_ps(_mm512_setzero_ps(), sum137);
sum138 = _mm512_max_ps(_mm512_setzero_ps(), sum138);
sum139 = _mm512_max_ps(_mm512_setzero_ps(), sum139);
sum140 = _mm512_max_ps(_mm512_setzero_ps(), sum140);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)37824, 65535, sum137);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)37888, 65535, sum138);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)37952, 65535, sum139);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)38016, 65535, sum140);
sum141 = _mm512_max_ps(_mm512_setzero_ps(), sum141);
sum142 = _mm512_max_ps(_mm512_setzero_ps(), sum142);
sum143 = _mm512_max_ps(_mm512_setzero_ps(), sum143);
sum144 = _mm512_max_ps(_mm512_setzero_ps(), sum144);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)50432, 65535, sum141);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)50496, 65535, sum142);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)50560, 65535, sum143);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)50624, 65535, sum144);
sum145 = _mm512_max_ps(_mm512_setzero_ps(), sum145);
sum146 = _mm512_max_ps(_mm512_setzero_ps(), sum146);
sum147 = _mm512_max_ps(_mm512_setzero_ps(), sum147);
sum148 = _mm512_max_ps(_mm512_setzero_ps(), sum148);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)63040, 65535, sum145);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)63104, 65535, sum146);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)63168, 65535, sum147);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)63232, 65535, sum148);
// End of this task's share of 6-wide groups (taken only when w35 < 4; see above).
if (k78 >= kk30) return;
}
// Tail: reached only with k78 == 10. This last group has 4 output rows, so
// its weight records are 16 bytes apart instead of 24. s18 == -1 again
// addresses record 0 of the group (16*s18+16 == 0).
ptrdiff_t s18 = -1;
__m512 sum149 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)16));
__m512 sum153 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)20));
__m512 sum157 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)24));
__m512 sum161 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)28));
__m512 sum150 = sum149;
__m512 sum151 = sum149;
__m512 sum152 = sum149;
__m512 sum154 = sum153;
__m512 sum155 = sum153;
__m512 sum156 = sum153;
__m512 sum158 = sum157;
__m512 sum159 = sum157;
__m512 sum160 = sum157;
__m512 sum162 = sum161;
__m512 sum163 = sum161;
__m512 sum164 = sum161;
// Same 256-step accumulation as above, with four weights per step (16 FMAs).
for (s18 = 0; s18 < 256; ++s18) {
__m512 dat1291 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s18+(ptrdiff_t)0);
__m512 dat1292 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s18+(ptrdiff_t)64);
__m512 dat1293 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s18+(ptrdiff_t)128);
__m512 dat1294 = _mm512_loadu_ps(arrangedDats3+3211264*i24+65536*j19+256*s18+(ptrdiff_t)192);
__m512 wt219 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)16));
sum149 = _mm512_fmadd_ps(wt219, dat1291, sum149);
sum150 = _mm512_fmadd_ps(wt219, dat1292, sum150);
sum151 = _mm512_fmadd_ps(wt219, dat1293, sum151);
sum152 = _mm512_fmadd_ps(wt219, dat1294, sum152);
__m512 wt220 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)20));
sum153 = _mm512_fmadd_ps(wt220, dat1291, sum153);
sum154 = _mm512_fmadd_ps(wt220, dat1292, sum154);
sum155 = _mm512_fmadd_ps(wt220, dat1293, sum155);
sum156 = _mm512_fmadd_ps(wt220, dat1294, sum156);
__m512 wt221 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)24));
sum157 = _mm512_fmadd_ps(wt221, dat1291, sum157);
sum158 = _mm512_fmadd_ps(wt221, dat1292, sum158);
sum159 = _mm512_fmadd_ps(wt221, dat1293, sum159);
sum160 = _mm512_fmadd_ps(wt221, dat1294, sum160);
__m512 wt222 = _mm512_set1_ps(*(float*)(arrangedWts3+65792*i24+6168*k78+16*s18+(ptrdiff_t)28));
sum161 = _mm512_fmadd_ps(wt222, dat1291, sum161);
sum162 = _mm512_fmadd_ps(wt222, dat1292, sum162);
sum163 = _mm512_fmadd_ps(wt222, dat1293, sum163);
sum164 = _mm512_fmadd_ps(wt222, dat1294, sum164);
}
// Clamp at zero and store the four tail rows, again 12608 bytes apart.
sum149 = _mm512_max_ps(_mm512_setzero_ps(), sum149);
sum150 = _mm512_max_ps(_mm512_setzero_ps(), sum150);
sum151 = _mm512_max_ps(_mm512_setzero_ps(), sum151);
sum152 = _mm512_max_ps(_mm512_setzero_ps(), sum152);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)0, 65535, sum149);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)64, 65535, sum150);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)128, 65535, sum151);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)192, 65535, sum152);
sum153 = _mm512_max_ps(_mm512_setzero_ps(), sum153);
sum154 = _mm512_max_ps(_mm512_setzero_ps(), sum154);
sum155 = _mm512_max_ps(_mm512_setzero_ps(), sum155);
sum156 = _mm512_max_ps(_mm512_setzero_ps(), sum156);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12608, 65535, sum153);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12672, 65535, sum154);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12736, 65535, sum155);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)12800, 65535, sum156);
sum157 = _mm512_max_ps(_mm512_setzero_ps(), sum157);
sum158 = _mm512_max_ps(_mm512_setzero_ps(), sum158);
sum159 = _mm512_max_ps(_mm512_setzero_ps(), sum159);
sum160 = _mm512_max_ps(_mm512_setzero_ps(), sum160);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25216, 65535, sum157);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25280, 65535, sum158);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25344, 65535, sum159);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)25408, 65535, sum160);
sum161 = _mm512_max_ps(_mm512_setzero_ps(), sum161);
sum162 = _mm512_max_ps(_mm512_setzero_ps(), sum162);
sum163 = _mm512_max_ps(_mm512_setzero_ps(), sum163);
sum164 = _mm512_max_ps(_mm512_setzero_ps(), sum164);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)37824, 65535, sum161);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)37888, 65535, sum162);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)37952, 65535, sum163);
_mm512_mask_storeu_ps(datPtr11+806912*i24+256*j19+75648*k78+(ptrdiff_t)38016, 65535, sum164);
// jj28 equals the starting j19, so this is always taken on the first pass:
// each call processes exactly one data tile.
if (j19 >= jj28) return;
}
}
}

// Runs ResNet50OneApply3Callee1 on the thread team over a 3-dimensional index
// space with extents {5, 49, 1}. The callee receives a two-slot pointer
// bundle through `any1`: slot 0 is `tensors35`, slot 1 is null.
static void ResNet50OneApply3(ResNet50ThreaderTeam1* team31, char** tensors35) {
    static const int extents[3] = {5, 49, 1};
    void* bundle[2];
    bundle[0] = tensors35;
    bundle[1] = 0;
    ResNet50ThreaderTask1 job;
    job.any1 = bundle;
    job.callee1 = ResNet50OneApply3Callee1;
    job.nd1 = 3;
    for (int axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    ResNet50ThreaderDo1(team31, &job);
}

static void ResNet50OneArrangeWts4Callee1(ResNet50ThreaderTask1* task48, int64_t* pt29) {
char** tensors46 = task48->any1;
ptrdiff_t b53 = pt29[0];
char*restrict wtPtr8 = tensors46[0]+(ptrdiff_t)3340*0+(ptrdiff_t)655360*0;
char*restrict biasPtr8 = tensors46[1]+(ptrdiff_t)2560*0;
char*restrict bnPtr8 = tensors46[2]+(ptrdiff_t)8*640*0;
char*restrict wtPtr9 = tensors46[3]+(ptrdiff_t)3340*0+(ptrdiff_t)655360*0;
char*restrict biasPtr9 = tensors46[4]+(ptrdiff_t)2560*0;
char*restrict bnPtr9 = tensors46[5]+(ptrdiff_t)8*640*0;
char*restrict arranged7 = tensors46[6]+(ptrdiff_t)2140160*0+(ptrdiff_t)657920*0;
ptrdiff_t ii10 = 1;
for (ptrdiff_t i31 = 0; i31 < ii10; ++i31) {
ptrdiff_t j24 = 2*b53;
ptrdiff_t jj30 = j24+2;
for (; j24 < jj30; ++j24) {
if (j24 < 32) {
ptrdiff_t k99 = 0+16*(j24-0);
ptrdiff_t l38 = (size_t)(0+k99)/6;
ptrdiff_t cut10 = (size_t)(0+k99)%6;
switch (cut10) {
case 0:;
case 2: {
__m512 sum205 = _mm512_maskz_loadu_ps(65535, biasPtr8+2560*i31+4*k99);
__m512i pmMul15 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd15 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo12 = _mm512_loadu_ps(bnPtr8+(ptrdiff_t)8*(k99+640*i31));
__m512 masHi12 = _mm512_maskz_loadu_ps(65535, bnPtr8+(ptrdiff_t)8*(k99+640*i31)+(ptrdiff_t)64);
__m512 postMul25 = _mm512_permutex2var_ps(masLo12, pmMul15, masHi12);
__m512 postAdd15 = _mm512_permutex2var_ps(masLo12, pmAdd15, masHi12);
sum205 = _mm512_fmadd_ps(sum205, postMul25, postAdd15);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)0, 63>>cut10, sum205);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)6144, 4032>>cut10, sum205);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)12288, 65535-(4095>>cut10), sum205);
ptrdiff_t c21 = 0;
for (; c21 != 16; ++c21) {
__m512 wt227 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)0);
__m512 wt228 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)1024);
__m512 wt229 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)2048);
__m512 wt230 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)3072);
__m512 wt231 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)4096);
__m512 wt232 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)5120);
__m512 wt233 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)6144);
__m512 wt234 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)7168);
__m512 wt235 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)8192);
__m512 wt236 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)9216);
__m512 wt237 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)10240);
__m512 wt238 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)11264);
__m512 wt239 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)12288);
__m512 wt240 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)13312);
__m512 wt241 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)14336);
__m512 wt242 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c21+(ptrdiff_t)15360);
__m512 tmp10553 = _mm512_unpacklo_ps(wt227, wt228);
__m512 tmp10554 = _mm512_unpackhi_ps(wt227, wt228);
__m512 tmp10555 = _mm512_unpacklo_ps(wt229, wt230);
__m512 tmp10556 = _mm512_unpackhi_ps(wt229, wt230);
__m512 tmp10557 = _mm512_unpacklo_ps(wt231, wt232);
__m512 tmp10558 = _mm512_unpackhi_ps(wt231, wt232);
__m512 tmp10559 = _mm512_unpacklo_ps(wt233, wt234);
__m512 tmp10560 = _mm512_unpackhi_ps(wt233, wt234);
__m512 tmp10561 = _mm512_unpacklo_ps(wt235, wt236);
__m512 tmp10562 = _mm512_unpackhi_ps(wt235, wt236);
__m512 tmp10563 = _mm512_unpacklo_ps(wt237, wt238);
__m512 tmp10564 = _mm512_unpackhi_ps(wt237, wt238);
__m512 tmp10565 = _mm512_unpacklo_ps(wt239, wt240);
__m512 tmp10566 = _mm512_unpackhi_ps(wt239, wt240);
__m512 tmp10567 = _mm512_unpacklo_ps(wt241, wt242);
__m512 tmp10568 = _mm512_unpackhi_ps(wt241, wt242);
__m512 tmp10569 = _mm512_shuffle_ps(tmp10553, tmp10555, 68);
__m512 tmp10570 = _mm512_shuffle_ps(tmp10553, tmp10555, 238);
__m512 tmp10571 = _mm512_shuffle_ps(tmp10554, tmp10556, 68);
__m512 tmp10572 = _mm512_shuffle_ps(tmp10554, tmp10556, 238);
__m512 tmp10573 = _mm512_shuffle_ps(tmp10557, tmp10559, 68);
__m512 tmp10574 = _mm512_shuffle_ps(tmp10557, tmp10559, 238);
__m512 tmp10575 = _mm512_shuffle_ps(tmp10558, tmp10560, 68);
__m512 tmp10576 = _mm512_shuffle_ps(tmp10558, tmp10560, 238);
__m512 tmp10577 = _mm512_shuffle_ps(tmp10561, tmp10563, 68);
__m512 tmp10578 = _mm512_shuffle_ps(tmp10561, tmp10563, 238);
__m512 tmp10579 = _mm512_shuffle_ps(tmp10562, tmp10564, 68);
__m512 tmp10580 = _mm512_shuffle_ps(tmp10562, tmp10564, 238);
__m512 tmp10581 = _mm512_shuffle_ps(tmp10565, tmp10567, 68);
__m512 tmp10582 = _mm512_shuffle_ps(tmp10565, tmp10567, 238);
__m512 tmp10583 = _mm512_shuffle_ps(tmp10566, tmp10568, 68);
__m512 tmp10584 = _mm512_shuffle_ps(tmp10566, tmp10568, 238);
__m512 tmp10585 = _mm512_shuffle_f32x4(tmp10569, tmp10573, 136);
__m512 tmp10586 = _mm512_shuffle_f32x4(tmp10569, tmp10573, 221);
__m512 tmp10587 = _mm512_shuffle_f32x4(tmp10570, tmp10574, 136);
__m512 tmp10588 = _mm512_shuffle_f32x4(tmp10570, tmp10574, 221);
__m512 tmp10589 = _mm512_shuffle_f32x4(tmp10571, tmp10575, 136);
__m512 tmp10590 = _mm512_shuffle_f32x4(tmp10571, tmp10575, 221);
__m512 tmp10591 = _mm512_shuffle_f32x4(tmp10572, tmp10576, 136);
__m512 tmp10592 = _mm512_shuffle_f32x4(tmp10572, tmp10576, 221);
__m512 tmp10593 = _mm512_shuffle_f32x4(tmp10577, tmp10581, 136);
__m512 tmp10594 = _mm512_shuffle_f32x4(tmp10577, tmp10581, 221);
__m512 tmp10595 = _mm512_shuffle_f32x4(tmp10578, tmp10582, 136);
__m512 tmp10596 = _mm512_shuffle_f32x4(tmp10578, tmp10582, 221);
__m512 tmp10597 = _mm512_shuffle_f32x4(tmp10579, tmp10583, 136);
__m512 tmp10598 = _mm512_shuffle_f32x4(tmp10579, tmp10583, 221);
__m512 tmp10599 = _mm512_shuffle_f32x4(tmp10580, tmp10584, 136);
__m512 tmp10600 = _mm512_shuffle_f32x4(tmp10580, tmp10584, 221);
wt227 = _mm512_shuffle_f32x4(tmp10585, tmp10593, 136);
wt235 = _mm512_shuffle_f32x4(tmp10585, tmp10593, 221);
wt228 = _mm512_shuffle_f32x4(tmp10587, tmp10595, 136);
wt236 = _mm512_shuffle_f32x4(tmp10587, tmp10595, 221);
wt229 = _mm512_shuffle_f32x4(tmp10589, tmp10597, 136);
wt237 = _mm512_shuffle_f32x4(tmp10589, tmp10597, 221);
wt230 = _mm512_shuffle_f32x4(tmp10591, tmp10599, 136);
wt238 = _mm512_shuffle_f32x4(tmp10591, tmp10599, 221);
wt231 = _mm512_shuffle_f32x4(tmp10586, tmp10594, 136);
wt239 = _mm512_shuffle_f32x4(tmp10586, tmp10594, 221);
wt232 = _mm512_shuffle_f32x4(tmp10588, tmp10596, 136);
wt240 = _mm512_shuffle_f32x4(tmp10588, tmp10596, 221);
wt233 = _mm512_shuffle_f32x4(tmp10590, tmp10598, 136);
wt241 = _mm512_shuffle_f32x4(tmp10590, tmp10598, 221);
wt234 = _mm512_shuffle_f32x4(tmp10592, tmp10600, 136);
wt242 = _mm512_shuffle_f32x4(tmp10592, tmp10600, 221);
wt227 = _mm512_mul_ps(wt227, postMul25);
wt228 = _mm512_mul_ps(wt228, postMul25);
wt229 = _mm512_mul_ps(wt229, postMul25);
wt230 = _mm512_mul_ps(wt230, postMul25);
wt231 = _mm512_mul_ps(wt231, postMul25);
wt232 = _mm512_mul_ps(wt232, postMul25);
wt233 = _mm512_mul_ps(wt233, postMul25);
wt234 = _mm512_mul_ps(wt234, postMul25);
wt235 = _mm512_mul_ps(wt235, postMul25);
wt236 = _mm512_mul_ps(wt236, postMul25);
wt237 = _mm512_mul_ps(wt237, postMul25);
wt238 = _mm512_mul_ps(wt238, postMul25);
wt239 = _mm512_mul_ps(wt239, postMul25);
wt240 = _mm512_mul_ps(wt240, postMul25);
wt241 = _mm512_mul_ps(wt241, postMul25);
wt242 = _mm512_mul_ps(wt242, postMul25);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c21)+(ptrdiff_t)0, 63>>cut10, wt227);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c21)+(ptrdiff_t)0, 63>>cut10, wt228);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c21)+(ptrdiff_t)0, 63>>cut10, wt229);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c21)+(ptrdiff_t)0, 63>>cut10, wt230);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c21)+(ptrdiff_t)0, 63>>cut10, wt231);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c21)+(ptrdiff_t)0, 63>>cut10, wt232);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c21)+(ptrdiff_t)0, 63>>cut10, wt233);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c21)+(ptrdiff_t)0, 63>>cut10, wt234);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c21)+(ptrdiff_t)0, 63>>cut10, wt235);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c21)+(ptrdiff_t)0, 63>>cut10, wt236);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c21)+(ptrdiff_t)0, 63>>cut10, wt237);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c21)+(ptrdiff_t)0, 63>>cut10, wt238);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c21)+(ptrdiff_t)0, 63>>cut10, wt239);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c21)+(ptrdiff_t)0, 63>>cut10, wt240);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c21)+(ptrdiff_t)0, 63>>cut10, wt241);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c21)+(ptrdiff_t)0, 63>>cut10, wt242);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt227);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt228);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt229);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt230);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt231);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt232);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt233);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt234);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt235);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt236);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt237);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt238);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt239);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt240);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt241);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt242);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt227);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt228);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt229);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt230);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt231);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt232);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt233);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt234);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt235);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt236);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt237);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt238);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt239);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt240);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt241);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt242);
}
break;
}
// Fallback arm of the switch on cut10. It overwrites cut10 with 4, so every
// shifted store mask below is effectively a constant. The arm (1) combines the
// bias vector with the (multiplier, addend) pairs read from bnPtr8, (2) walks
// 16 weight tiles of 16x16 floats, transposing and scaling each one, and
// (3) scatters every resulting vector into arranged7 using lane masks.
// NOTE(review): biasPtr8, bnPtr8, wtPtr8, arranged7, i31, k99 and l38 are all
// declared outside this chunk; the offset arithmetic reads as byte offsets
// (char* style pointers) -- confirm against the declarations.
default: {
cut10 = 4;
// Resulting lane masks with cut10 == 4:
//   63>>4                = 0x0003 -> lanes 0-1
//   4032>>4              = 0x00FC -> lanes 2-7
//   258048>>4            = 0x3F00 -> lanes 8-13
//   65535-(262143>>4)    = 0xC000 -> lanes 14-15
// The four masks are disjoint and together cover all 16 lanes exactly once.
// Load all 16 lanes (mask 65535 selects every lane).
__m512 sum206 = _mm512_maskz_loadu_ps(65535, biasPtr8+2560*i31+4*k99);
// Index vectors for permutex2var (set_epi32 lists the highest element first):
// pmMul16 selects elements 0,2,...,30 and pmAdd16 selects elements 1,3,...,31
// of the 32-float concatenation masLo13:masHi13.
__m512i pmMul16 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd16 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Two consecutive 16-float loads = 32 floats holding 16 interleaved pairs.
__m512 masLo13 = _mm512_loadu_ps(bnPtr8+(ptrdiff_t)8*(k99+640*i31));
__m512 masHi13 = _mm512_maskz_loadu_ps(65535, bnPtr8+(ptrdiff_t)8*(k99+640*i31)+(ptrdiff_t)64);
// De-interleave the pairs: even entries become the multiplier vector, odd
// entries the addend vector.
// presumably these are per-channel batch-norm scale/shift values (going by the
// bnPtr8 name) -- verify against the code that fills bnPtr8.
__m512 postMul26 = _mm512_permutex2var_ps(masLo13, pmMul16, masHi13);
__m512 postAdd16 = _mm512_permutex2var_ps(masLo13, pmAdd16, masHi13);
// sum206 = sum206 * postMul26 + postAdd16
sum206 = _mm512_fmadd_ps(sum206, postMul26, postAdd16);
// Row 0 (the 24*0 term): the same vector is stored four times, at bases 6144
// apart, each time with a different lane mask, so each lane is written once.
// A masked store places lane j at base + 4*j bytes. Assuming byte offsets, and
// since 6168 = 6144 + 24, lanes 0-1 fill the last two floats of a 24-byte row
// in group l38, lanes 2-7 and 8-13 fill complete 24-byte rows in groups l38+1
// and l38+2, and lanes 14-15 fill the first two floats of the row in group
// l38+3 -- TODO confirm the pointer type of arranged7.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)0, 63>>cut10, sum206);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)6144, 4032>>cut10, sum206);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)12288, 258048>>cut10, sum206);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)18432, 65535-(262143>>cut10), sum206);
// 16 iterations; each one processes one 16x16 tile of floats.
ptrdiff_t c22 = 0;
for (; c22 != 16; ++c22) {
// Load the tile: 16 full vectors whose addresses are 1024 apart; 64*c22
// advances by one vector width per iteration (if the offsets are bytes).
__m512 wt243 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)0);
__m512 wt244 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)1024);
__m512 wt245 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)2048);
__m512 wt246 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)3072);
__m512 wt247 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)4096);
__m512 wt248 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)5120);
__m512 wt249 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)6144);
__m512 wt250 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)7168);
__m512 wt251 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)8192);
__m512 wt252 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)9216);
__m512 wt253 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)10240);
__m512 wt254 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)11264);
__m512 wt255 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)12288);
__m512 wt256 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)13312);
__m512 wt257 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)14336);
__m512 wt258 = _mm512_maskz_loadu_ps(65535, wtPtr8+655360*i31+1024*k99+64*c22+(ptrdiff_t)15360);
// 16x16 transpose, stage 1: interleave adjacent row pairs inside each
// 128-bit lane (unpacklo takes elements 0-1, unpackhi elements 2-3).
__m512 tmp10601 = _mm512_unpacklo_ps(wt243, wt244);
__m512 tmp10602 = _mm512_unpackhi_ps(wt243, wt244);
__m512 tmp10603 = _mm512_unpacklo_ps(wt245, wt246);
__m512 tmp10604 = _mm512_unpackhi_ps(wt245, wt246);
__m512 tmp10605 = _mm512_unpacklo_ps(wt247, wt248);
__m512 tmp10606 = _mm512_unpackhi_ps(wt247, wt248);
__m512 tmp10607 = _mm512_unpacklo_ps(wt249, wt250);
__m512 tmp10608 = _mm512_unpackhi_ps(wt249, wt250);
__m512 tmp10609 = _mm512_unpacklo_ps(wt251, wt252);
__m512 tmp10610 = _mm512_unpackhi_ps(wt251, wt252);
__m512 tmp10611 = _mm512_unpacklo_ps(wt253, wt254);
__m512 tmp10612 = _mm512_unpackhi_ps(wt253, wt254);
__m512 tmp10613 = _mm512_unpacklo_ps(wt255, wt256);
__m512 tmp10614 = _mm512_unpackhi_ps(wt255, wt256);
__m512 tmp10615 = _mm512_unpacklo_ps(wt257, wt258);
__m512 tmp10616 = _mm512_unpackhi_ps(wt257, wt258);
// Stage 2: combine two row pairs so each 128-bit lane holds one column value
// from four consecutive rows (68 = low halves of both operands, 238 = high
// halves of both operands).
__m512 tmp10617 = _mm512_shuffle_ps(tmp10601, tmp10603, 68);
__m512 tmp10618 = _mm512_shuffle_ps(tmp10601, tmp10603, 238);
__m512 tmp10619 = _mm512_shuffle_ps(tmp10602, tmp10604, 68);
__m512 tmp10620 = _mm512_shuffle_ps(tmp10602, tmp10604, 238);
__m512 tmp10621 = _mm512_shuffle_ps(tmp10605, tmp10607, 68);
__m512 tmp10622 = _mm512_shuffle_ps(tmp10605, tmp10607, 238);
__m512 tmp10623 = _mm512_shuffle_ps(tmp10606, tmp10608, 68);
__m512 tmp10624 = _mm512_shuffle_ps(tmp10606, tmp10608, 238);
__m512 tmp10625 = _mm512_shuffle_ps(tmp10609, tmp10611, 68);
__m512 tmp10626 = _mm512_shuffle_ps(tmp10609, tmp10611, 238);
__m512 tmp10627 = _mm512_shuffle_ps(tmp10610, tmp10612, 68);
__m512 tmp10628 = _mm512_shuffle_ps(tmp10610, tmp10612, 238);
__m512 tmp10629 = _mm512_shuffle_ps(tmp10613, tmp10615, 68);
__m512 tmp10630 = _mm512_shuffle_ps(tmp10613, tmp10615, 238);
__m512 tmp10631 = _mm512_shuffle_ps(tmp10614, tmp10616, 68);
__m512 tmp10632 = _mm512_shuffle_ps(tmp10614, tmp10616, 238);
// Stage 3: regroup whole 128-bit lanes (136 = lanes 0 and 2 of each operand,
// 221 = lanes 1 and 3 of each operand), first across rows 0-7 / 8-15 ...
__m512 tmp10633 = _mm512_shuffle_f32x4(tmp10617, tmp10621, 136);
__m512 tmp10634 = _mm512_shuffle_f32x4(tmp10617, tmp10621, 221);
__m512 tmp10635 = _mm512_shuffle_f32x4(tmp10618, tmp10622, 136);
__m512 tmp10636 = _mm512_shuffle_f32x4(tmp10618, tmp10622, 221);
__m512 tmp10637 = _mm512_shuffle_f32x4(tmp10619, tmp10623, 136);
__m512 tmp10638 = _mm512_shuffle_f32x4(tmp10619, tmp10623, 221);
__m512 tmp10639 = _mm512_shuffle_f32x4(tmp10620, tmp10624, 136);
__m512 tmp10640 = _mm512_shuffle_f32x4(tmp10620, tmp10624, 221);
__m512 tmp10641 = _mm512_shuffle_f32x4(tmp10625, tmp10629, 136);
__m512 tmp10642 = _mm512_shuffle_f32x4(tmp10625, tmp10629, 221);
__m512 tmp10643 = _mm512_shuffle_f32x4(tmp10626, tmp10630, 136);
__m512 tmp10644 = _mm512_shuffle_f32x4(tmp10626, tmp10630, 221);
__m512 tmp10645 = _mm512_shuffle_f32x4(tmp10627, tmp10631, 136);
__m512 tmp10646 = _mm512_shuffle_f32x4(tmp10627, tmp10631, 221);
__m512 tmp10647 = _mm512_shuffle_f32x4(tmp10628, tmp10632, 136);
__m512 tmp10648 = _mm512_shuffle_f32x4(tmp10628, tmp10632, 221);
// ... then across all 16 rows. Afterwards wt243..wt258 hold tile columns
// 0..15 in that order: lane i of each vector comes from the i-th loaded row.
wt243 = _mm512_shuffle_f32x4(tmp10633, tmp10641, 136);
wt251 = _mm512_shuffle_f32x4(tmp10633, tmp10641, 221);
wt244 = _mm512_shuffle_f32x4(tmp10635, tmp10643, 136);
wt252 = _mm512_shuffle_f32x4(tmp10635, tmp10643, 221);
wt245 = _mm512_shuffle_f32x4(tmp10637, tmp10645, 136);
wt253 = _mm512_shuffle_f32x4(tmp10637, tmp10645, 221);
wt246 = _mm512_shuffle_f32x4(tmp10639, tmp10647, 136);
wt254 = _mm512_shuffle_f32x4(tmp10639, tmp10647, 221);
wt247 = _mm512_shuffle_f32x4(tmp10634, tmp10642, 136);
wt255 = _mm512_shuffle_f32x4(tmp10634, tmp10642, 221);
wt248 = _mm512_shuffle_f32x4(tmp10636, tmp10644, 136);
wt256 = _mm512_shuffle_f32x4(tmp10636, tmp10644, 221);
wt249 = _mm512_shuffle_f32x4(tmp10638, tmp10646, 136);
wt257 = _mm512_shuffle_f32x4(tmp10638, tmp10646, 221);
wt250 = _mm512_shuffle_f32x4(tmp10640, tmp10648, 136);
wt258 = _mm512_shuffle_f32x4(tmp10640, tmp10648, 221);
// Scale every column vector lane-wise by the multiplier vector; lane i uses
// postMul26[i], the same lane association as sum206 above.
wt243 = _mm512_mul_ps(wt243, postMul26);
wt244 = _mm512_mul_ps(wt244, postMul26);
wt245 = _mm512_mul_ps(wt245, postMul26);
wt246 = _mm512_mul_ps(wt246, postMul26);
wt247 = _mm512_mul_ps(wt247, postMul26);
wt248 = _mm512_mul_ps(wt248, postMul26);
wt249 = _mm512_mul_ps(wt249, postMul26);
wt250 = _mm512_mul_ps(wt250, postMul26);
wt251 = _mm512_mul_ps(wt251, postMul26);
wt252 = _mm512_mul_ps(wt252, postMul26);
wt253 = _mm512_mul_ps(wt253, postMul26);
wt254 = _mm512_mul_ps(wt254, postMul26);
wt255 = _mm512_mul_ps(wt255, postMul26);
wt256 = _mm512_mul_ps(wt256, postMul26);
wt257 = _mm512_mul_ps(wt257, postMul26);
wt258 = _mm512_mul_ps(wt258, postMul26);
// Scatter: column j (wt243+j) goes to row 1+j+16*c22 (rows are 24 apart; row 0
// was written before the loop). First destination: lanes 0-1.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)0, 63>>cut10, wt243);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)0, 63>>cut10, wt244);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)0, 63>>cut10, wt245);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)0, 63>>cut10, wt246);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)0, 63>>cut10, wt247);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)0, 63>>cut10, wt248);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)0, 63>>cut10, wt249);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)0, 63>>cut10, wt250);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)0, 63>>cut10, wt251);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)0, 63>>cut10, wt252);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)0, 63>>cut10, wt253);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)0, 63>>cut10, wt254);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)0, 63>>cut10, wt255);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)0, 63>>cut10, wt256);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)0, 63>>cut10, wt257);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)0, 63>>cut10, wt258);
// Second destination (+6144): lanes 2-7.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt243);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt244);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt245);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt246);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt247);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt248);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt249);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt250);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt251);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt252);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt253);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt254);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt255);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt256);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt257);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt258);
// Third destination (+12288): lanes 8-13.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt243);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt244);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt245);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt246);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt247);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt248);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt249);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt250);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt251);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt252);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt253);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt254);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt255);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt256);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt257);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt258);
// Fourth destination (+18432): lanes 14-15.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt243);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt244);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt245);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt246);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt247);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt248);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt249);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt250);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt251);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt252);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt253);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt254);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt255);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt256);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt257);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt258);
}
}
}
} else if (j24 < 39) {
ptrdiff_t k101 = 0+16*(j24-32);
ptrdiff_t l40 = (size_t)(512+k101)/6;
ptrdiff_t cut12 = (size_t)(512+k101)%6;
switch (cut12) {
// Shared arm for cut12 == 0 and cut12 == 2: `case 0:;` is an empty statement
// that falls through into `case 2`. Unlike the default arm, cut12 keeps its
// run-time value here, so the lane masks depend on it:
//   cut12 == 0: 63 -> lanes 0-5, 4032 -> lanes 6-11, 65535-4095 -> lanes 12-15
//   cut12 == 2: 15 -> lanes 0-3, 1008 -> lanes 4-9,  65535-1023 -> lanes 10-15
// In both cases the three masks are disjoint and cover all 16 lanes, so each
// vector is split across three destinations that are 6144 apart.
// NOTE(review): biasPtr9, bnPtr9, wtPtr9, arranged7 and i31 are declared
// outside this chunk; the offset arithmetic reads as byte offsets (char* style
// pointers) -- confirm against the declarations.
case 0:;
case 2: {
// Load all 16 lanes (mask 65535 selects every lane).
__m512 sum208 = _mm512_maskz_loadu_ps(65535, biasPtr9+2560*i31+4*k101);
// Index vectors for permutex2var (set_epi32 lists the highest element first):
// pmMul17 selects elements 0,2,...,30 and pmAdd17 selects elements 1,3,...,31
// of the 32-float concatenation masLo14:masHi14.
__m512i pmMul17 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd17 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Two consecutive 16-float loads = 32 floats holding 16 interleaved pairs.
__m512 masLo14 = _mm512_loadu_ps(bnPtr9+(ptrdiff_t)8*(k101+640*i31));
__m512 masHi14 = _mm512_maskz_loadu_ps(65535, bnPtr9+(ptrdiff_t)8*(k101+640*i31)+(ptrdiff_t)64);
// De-interleave the pairs: even entries become the multiplier vector, odd
// entries the addend vector.
// presumably these are per-channel batch-norm scale/shift values (going by the
// bnPtr9 name) -- verify against the code that fills bnPtr9.
__m512 postMul28 = _mm512_permutex2var_ps(masLo14, pmMul17, masHi14);
__m512 postAdd18 = _mm512_permutex2var_ps(masLo14, pmAdd17, masHi14);
// sum208 = sum208 * postMul28 + postAdd18
sum208 = _mm512_fmadd_ps(sum208, postMul28, postAdd18);
// Row 0 (the 24*0 term): the same vector is stored three times with the
// disjoint masks listed above, so each lane is written exactly once.
// A masked store places lane j at base + 4*j bytes. Assuming byte offsets, and
// since 6168 = 6144 + 24, the first store fills floats cut12..5 of a 24-byte
// row in group l40 and the later stores continue at float 0 of the same row
// in groups l40+1 and l40+2 -- TODO confirm the pointer type of arranged7.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)0, 63>>cut12, sum208);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)6144, 4032>>cut12, sum208);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)12288, 65535-(4095>>cut12), sum208);
// 16 iterations; each one processes one 16x16 tile of floats.
ptrdiff_t c24 = 0;
for (; c24 != 16; ++c24) {
// Load the tile: 16 full vectors whose addresses are 1024 apart; 64*c24
// advances by one vector width per iteration (if the offsets are bytes).
__m512 wt275 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)0);
__m512 wt276 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)1024);
__m512 wt277 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)2048);
__m512 wt278 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)3072);
__m512 wt279 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)4096);
__m512 wt280 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)5120);
__m512 wt281 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)6144);
__m512 wt282 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)7168);
__m512 wt283 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)8192);
__m512 wt284 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)9216);
__m512 wt285 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)10240);
__m512 wt286 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)11264);
__m512 wt287 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)12288);
__m512 wt288 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)13312);
__m512 wt289 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)14336);
__m512 wt290 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c24+(ptrdiff_t)15360);
// 16x16 transpose, stage 1: interleave adjacent row pairs inside each
// 128-bit lane (unpacklo takes elements 0-1, unpackhi elements 2-3).
__m512 tmp10649 = _mm512_unpacklo_ps(wt275, wt276);
__m512 tmp10650 = _mm512_unpackhi_ps(wt275, wt276);
__m512 tmp10651 = _mm512_unpacklo_ps(wt277, wt278);
__m512 tmp10652 = _mm512_unpackhi_ps(wt277, wt278);
__m512 tmp10653 = _mm512_unpacklo_ps(wt279, wt280);
__m512 tmp10654 = _mm512_unpackhi_ps(wt279, wt280);
__m512 tmp10655 = _mm512_unpacklo_ps(wt281, wt282);
__m512 tmp10656 = _mm512_unpackhi_ps(wt281, wt282);
__m512 tmp10657 = _mm512_unpacklo_ps(wt283, wt284);
__m512 tmp10658 = _mm512_unpackhi_ps(wt283, wt284);
__m512 tmp10659 = _mm512_unpacklo_ps(wt285, wt286);
__m512 tmp10660 = _mm512_unpackhi_ps(wt285, wt286);
__m512 tmp10661 = _mm512_unpacklo_ps(wt287, wt288);
__m512 tmp10662 = _mm512_unpackhi_ps(wt287, wt288);
__m512 tmp10663 = _mm512_unpacklo_ps(wt289, wt290);
__m512 tmp10664 = _mm512_unpackhi_ps(wt289, wt290);
// Stage 2: combine two row pairs so each 128-bit lane holds one column value
// from four consecutive rows (68 = low halves of both operands, 238 = high
// halves of both operands).
__m512 tmp10665 = _mm512_shuffle_ps(tmp10649, tmp10651, 68);
__m512 tmp10666 = _mm512_shuffle_ps(tmp10649, tmp10651, 238);
__m512 tmp10667 = _mm512_shuffle_ps(tmp10650, tmp10652, 68);
__m512 tmp10668 = _mm512_shuffle_ps(tmp10650, tmp10652, 238);
__m512 tmp10669 = _mm512_shuffle_ps(tmp10653, tmp10655, 68);
__m512 tmp10670 = _mm512_shuffle_ps(tmp10653, tmp10655, 238);
__m512 tmp10671 = _mm512_shuffle_ps(tmp10654, tmp10656, 68);
__m512 tmp10672 = _mm512_shuffle_ps(tmp10654, tmp10656, 238);
__m512 tmp10673 = _mm512_shuffle_ps(tmp10657, tmp10659, 68);
__m512 tmp10674 = _mm512_shuffle_ps(tmp10657, tmp10659, 238);
__m512 tmp10675 = _mm512_shuffle_ps(tmp10658, tmp10660, 68);
__m512 tmp10676 = _mm512_shuffle_ps(tmp10658, tmp10660, 238);
__m512 tmp10677 = _mm512_shuffle_ps(tmp10661, tmp10663, 68);
__m512 tmp10678 = _mm512_shuffle_ps(tmp10661, tmp10663, 238);
__m512 tmp10679 = _mm512_shuffle_ps(tmp10662, tmp10664, 68);
__m512 tmp10680 = _mm512_shuffle_ps(tmp10662, tmp10664, 238);
// Stage 3: regroup whole 128-bit lanes (136 = lanes 0 and 2 of each operand,
// 221 = lanes 1 and 3 of each operand), first across rows 0-7 / 8-15 ...
__m512 tmp10681 = _mm512_shuffle_f32x4(tmp10665, tmp10669, 136);
__m512 tmp10682 = _mm512_shuffle_f32x4(tmp10665, tmp10669, 221);
__m512 tmp10683 = _mm512_shuffle_f32x4(tmp10666, tmp10670, 136);
__m512 tmp10684 = _mm512_shuffle_f32x4(tmp10666, tmp10670, 221);
__m512 tmp10685 = _mm512_shuffle_f32x4(tmp10667, tmp10671, 136);
__m512 tmp10686 = _mm512_shuffle_f32x4(tmp10667, tmp10671, 221);
__m512 tmp10687 = _mm512_shuffle_f32x4(tmp10668, tmp10672, 136);
__m512 tmp10688 = _mm512_shuffle_f32x4(tmp10668, tmp10672, 221);
__m512 tmp10689 = _mm512_shuffle_f32x4(tmp10673, tmp10677, 136);
__m512 tmp10690 = _mm512_shuffle_f32x4(tmp10673, tmp10677, 221);
__m512 tmp10691 = _mm512_shuffle_f32x4(tmp10674, tmp10678, 136);
__m512 tmp10692 = _mm512_shuffle_f32x4(tmp10674, tmp10678, 221);
__m512 tmp10693 = _mm512_shuffle_f32x4(tmp10675, tmp10679, 136);
__m512 tmp10694 = _mm512_shuffle_f32x4(tmp10675, tmp10679, 221);
__m512 tmp10695 = _mm512_shuffle_f32x4(tmp10676, tmp10680, 136);
__m512 tmp10696 = _mm512_shuffle_f32x4(tmp10676, tmp10680, 221);
// ... then across all 16 rows. Afterwards wt275..wt290 hold tile columns
// 0..15 in that order: lane i of each vector comes from the i-th loaded row.
wt275 = _mm512_shuffle_f32x4(tmp10681, tmp10689, 136);
wt283 = _mm512_shuffle_f32x4(tmp10681, tmp10689, 221);
wt276 = _mm512_shuffle_f32x4(tmp10683, tmp10691, 136);
wt284 = _mm512_shuffle_f32x4(tmp10683, tmp10691, 221);
wt277 = _mm512_shuffle_f32x4(tmp10685, tmp10693, 136);
wt285 = _mm512_shuffle_f32x4(tmp10685, tmp10693, 221);
wt278 = _mm512_shuffle_f32x4(tmp10687, tmp10695, 136);
wt286 = _mm512_shuffle_f32x4(tmp10687, tmp10695, 221);
wt279 = _mm512_shuffle_f32x4(tmp10682, tmp10690, 136);
wt287 = _mm512_shuffle_f32x4(tmp10682, tmp10690, 221);
wt280 = _mm512_shuffle_f32x4(tmp10684, tmp10692, 136);
wt288 = _mm512_shuffle_f32x4(tmp10684, tmp10692, 221);
wt281 = _mm512_shuffle_f32x4(tmp10686, tmp10694, 136);
wt289 = _mm512_shuffle_f32x4(tmp10686, tmp10694, 221);
wt282 = _mm512_shuffle_f32x4(tmp10688, tmp10696, 136);
wt290 = _mm512_shuffle_f32x4(tmp10688, tmp10696, 221);
// Scale every column vector lane-wise by the multiplier vector; lane i uses
// postMul28[i], the same lane association as sum208 above.
wt275 = _mm512_mul_ps(wt275, postMul28);
wt276 = _mm512_mul_ps(wt276, postMul28);
wt277 = _mm512_mul_ps(wt277, postMul28);
wt278 = _mm512_mul_ps(wt278, postMul28);
wt279 = _mm512_mul_ps(wt279, postMul28);
wt280 = _mm512_mul_ps(wt280, postMul28);
wt281 = _mm512_mul_ps(wt281, postMul28);
wt282 = _mm512_mul_ps(wt282, postMul28);
wt283 = _mm512_mul_ps(wt283, postMul28);
wt284 = _mm512_mul_ps(wt284, postMul28);
wt285 = _mm512_mul_ps(wt285, postMul28);
wt286 = _mm512_mul_ps(wt286, postMul28);
wt287 = _mm512_mul_ps(wt287, postMul28);
wt288 = _mm512_mul_ps(wt288, postMul28);
wt289 = _mm512_mul_ps(wt289, postMul28);
wt290 = _mm512_mul_ps(wt290, postMul28);
// Scatter: column j (wt275+j) goes to row 1+j+16*c24 (rows are 24 apart; row 0
// was written before the loop). First destination: the lowest 6-cut12 lanes.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c24)+(ptrdiff_t)0, 63>>cut12, wt275);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c24)+(ptrdiff_t)0, 63>>cut12, wt276);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c24)+(ptrdiff_t)0, 63>>cut12, wt277);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c24)+(ptrdiff_t)0, 63>>cut12, wt278);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c24)+(ptrdiff_t)0, 63>>cut12, wt279);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c24)+(ptrdiff_t)0, 63>>cut12, wt280);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c24)+(ptrdiff_t)0, 63>>cut12, wt281);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c24)+(ptrdiff_t)0, 63>>cut12, wt282);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c24)+(ptrdiff_t)0, 63>>cut12, wt283);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c24)+(ptrdiff_t)0, 63>>cut12, wt284);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c24)+(ptrdiff_t)0, 63>>cut12, wt285);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c24)+(ptrdiff_t)0, 63>>cut12, wt286);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c24)+(ptrdiff_t)0, 63>>cut12, wt287);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c24)+(ptrdiff_t)0, 63>>cut12, wt288);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c24)+(ptrdiff_t)0, 63>>cut12, wt289);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c24)+(ptrdiff_t)0, 63>>cut12, wt290);
// Second destination (+6144): the next six lanes.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt275);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt276);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt277);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt278);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt279);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt280);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt281);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt282);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt283);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt284);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt285);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt286);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt287);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt288);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt289);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c24)+(ptrdiff_t)6144, 4032>>cut12, wt290);
// Third destination (+12288): all remaining high lanes.
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt275);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt276);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt277);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt278);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt279);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt280);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt281);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt282);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt283);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt284);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt285);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt286);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt287);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt288);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt289);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt290);
}
break;
}
default: {
cut12 = 4;
__m512 sum209 = _mm512_maskz_loadu_ps(65535, biasPtr9+2560*i31+4*k101);
__m512i pmMul18 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd18 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo15 = _mm512_loadu_ps(bnPtr9+(ptrdiff_t)8*(k101+640*i31));
__m512 masHi15 = _mm512_maskz_loadu_ps(65535, bnPtr9+(ptrdiff_t)8*(k101+640*i31)+(ptrdiff_t)64);
__m512 postMul29 = _mm512_permutex2var_ps(masLo15, pmMul18, masHi15);
__m512 postAdd19 = _mm512_permutex2var_ps(masLo15, pmAdd18, masHi15);
sum209 = _mm512_fmadd_ps(sum209, postMul29, postAdd19);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)0, 63>>cut12, sum209);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)6144, 4032>>cut12, sum209);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)12288, 258048>>cut12, sum209);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*0+(ptrdiff_t)18432, 65535-(262143>>cut12), sum209);
ptrdiff_t c25 = 0;
for (; c25 != 16; ++c25) {
__m512 wt291 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)0);
__m512 wt292 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)1024);
__m512 wt293 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)2048);
__m512 wt294 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)3072);
__m512 wt295 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)4096);
__m512 wt296 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)5120);
__m512 wt297 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)6144);
__m512 wt298 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)7168);
__m512 wt299 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)8192);
__m512 wt300 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)9216);
__m512 wt301 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)10240);
__m512 wt302 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)11264);
__m512 wt303 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)12288);
__m512 wt304 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)13312);
__m512 wt305 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)14336);
__m512 wt306 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k101+64*c25+(ptrdiff_t)15360);
__m512 tmp10697 = _mm512_unpacklo_ps(wt291, wt292);
__m512 tmp10698 = _mm512_unpackhi_ps(wt291, wt292);
__m512 tmp10699 = _mm512_unpacklo_ps(wt293, wt294);
__m512 tmp10700 = _mm512_unpackhi_ps(wt293, wt294);
__m512 tmp10701 = _mm512_unpacklo_ps(wt295, wt296);
__m512 tmp10702 = _mm512_unpackhi_ps(wt295, wt296);
__m512 tmp10703 = _mm512_unpacklo_ps(wt297, wt298);
__m512 tmp10704 = _mm512_unpackhi_ps(wt297, wt298);
__m512 tmp10705 = _mm512_unpacklo_ps(wt299, wt300);
__m512 tmp10706 = _mm512_unpackhi_ps(wt299, wt300);
__m512 tmp10707 = _mm512_unpacklo_ps(wt301, wt302);
__m512 tmp10708 = _mm512_unpackhi_ps(wt301, wt302);
__m512 tmp10709 = _mm512_unpacklo_ps(wt303, wt304);
__m512 tmp10710 = _mm512_unpackhi_ps(wt303, wt304);
__m512 tmp10711 = _mm512_unpacklo_ps(wt305, wt306);
__m512 tmp10712 = _mm512_unpackhi_ps(wt305, wt306);
__m512 tmp10713 = _mm512_shuffle_ps(tmp10697, tmp10699, 68);
__m512 tmp10714 = _mm512_shuffle_ps(tmp10697, tmp10699, 238);
__m512 tmp10715 = _mm512_shuffle_ps(tmp10698, tmp10700, 68);
__m512 tmp10716 = _mm512_shuffle_ps(tmp10698, tmp10700, 238);
__m512 tmp10717 = _mm512_shuffle_ps(tmp10701, tmp10703, 68);
__m512 tmp10718 = _mm512_shuffle_ps(tmp10701, tmp10703, 238);
__m512 tmp10719 = _mm512_shuffle_ps(tmp10702, tmp10704, 68);
__m512 tmp10720 = _mm512_shuffle_ps(tmp10702, tmp10704, 238);
__m512 tmp10721 = _mm512_shuffle_ps(tmp10705, tmp10707, 68);
__m512 tmp10722 = _mm512_shuffle_ps(tmp10705, tmp10707, 238);
__m512 tmp10723 = _mm512_shuffle_ps(tmp10706, tmp10708, 68);
__m512 tmp10724 = _mm512_shuffle_ps(tmp10706, tmp10708, 238);
__m512 tmp10725 = _mm512_shuffle_ps(tmp10709, tmp10711, 68);
__m512 tmp10726 = _mm512_shuffle_ps(tmp10709, tmp10711, 238);
__m512 tmp10727 = _mm512_shuffle_ps(tmp10710, tmp10712, 68);
__m512 tmp10728 = _mm512_shuffle_ps(tmp10710, tmp10712, 238);
__m512 tmp10729 = _mm512_shuffle_f32x4(tmp10713, tmp10717, 136);
__m512 tmp10730 = _mm512_shuffle_f32x4(tmp10713, tmp10717, 221);
__m512 tmp10731 = _mm512_shuffle_f32x4(tmp10714, tmp10718, 136);
__m512 tmp10732 = _mm512_shuffle_f32x4(tmp10714, tmp10718, 221);
__m512 tmp10733 = _mm512_shuffle_f32x4(tmp10715, tmp10719, 136);
__m512 tmp10734 = _mm512_shuffle_f32x4(tmp10715, tmp10719, 221);
__m512 tmp10735 = _mm512_shuffle_f32x4(tmp10716, tmp10720, 136);
__m512 tmp10736 = _mm512_shuffle_f32x4(tmp10716, tmp10720, 221);
__m512 tmp10737 = _mm512_shuffle_f32x4(tmp10721, tmp10725, 136);
__m512 tmp10738 = _mm512_shuffle_f32x4(tmp10721, tmp10725, 221);
__m512 tmp10739 = _mm512_shuffle_f32x4(tmp10722, tmp10726, 136);
__m512 tmp10740 = _mm512_shuffle_f32x4(tmp10722, tmp10726, 221);
__m512 tmp10741 = _mm512_shuffle_f32x4(tmp10723, tmp10727, 136);
__m512 tmp10742 = _mm512_shuffle_f32x4(tmp10723, tmp10727, 221);
__m512 tmp10743 = _mm512_shuffle_f32x4(tmp10724, tmp10728, 136);
__m512 tmp10744 = _mm512_shuffle_f32x4(tmp10724, tmp10728, 221);
wt291 = _mm512_shuffle_f32x4(tmp10729, tmp10737, 136);
wt299 = _mm512_shuffle_f32x4(tmp10729, tmp10737, 221);
wt292 = _mm512_shuffle_f32x4(tmp10731, tmp10739, 136);
wt300 = _mm512_shuffle_f32x4(tmp10731, tmp10739, 221);
wt293 = _mm512_shuffle_f32x4(tmp10733, tmp10741, 136);
wt301 = _mm512_shuffle_f32x4(tmp10733, tmp10741, 221);
wt294 = _mm512_shuffle_f32x4(tmp10735, tmp10743, 136);
wt302 = _mm512_shuffle_f32x4(tmp10735, tmp10743, 221);
wt295 = _mm512_shuffle_f32x4(tmp10730, tmp10738, 136);
wt303 = _mm512_shuffle_f32x4(tmp10730, tmp10738, 221);
wt296 = _mm512_shuffle_f32x4(tmp10732, tmp10740, 136);
wt304 = _mm512_shuffle_f32x4(tmp10732, tmp10740, 221);
wt297 = _mm512_shuffle_f32x4(tmp10734, tmp10742, 136);
wt305 = _mm512_shuffle_f32x4(tmp10734, tmp10742, 221);
wt298 = _mm512_shuffle_f32x4(tmp10736, tmp10744, 136);
wt306 = _mm512_shuffle_f32x4(tmp10736, tmp10744, 221);
wt291 = _mm512_mul_ps(wt291, postMul29);
wt292 = _mm512_mul_ps(wt292, postMul29);
wt293 = _mm512_mul_ps(wt293, postMul29);
wt294 = _mm512_mul_ps(wt294, postMul29);
wt295 = _mm512_mul_ps(wt295, postMul29);
wt296 = _mm512_mul_ps(wt296, postMul29);
wt297 = _mm512_mul_ps(wt297, postMul29);
wt298 = _mm512_mul_ps(wt298, postMul29);
wt299 = _mm512_mul_ps(wt299, postMul29);
wt300 = _mm512_mul_ps(wt300, postMul29);
wt301 = _mm512_mul_ps(wt301, postMul29);
wt302 = _mm512_mul_ps(wt302, postMul29);
wt303 = _mm512_mul_ps(wt303, postMul29);
wt304 = _mm512_mul_ps(wt304, postMul29);
wt305 = _mm512_mul_ps(wt305, postMul29);
wt306 = _mm512_mul_ps(wt306, postMul29);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)0, 63>>cut12, wt291);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)0, 63>>cut12, wt292);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)0, 63>>cut12, wt293);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)0, 63>>cut12, wt294);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)0, 63>>cut12, wt295);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)0, 63>>cut12, wt296);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)0, 63>>cut12, wt297);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)0, 63>>cut12, wt298);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)0, 63>>cut12, wt299);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)0, 63>>cut12, wt300);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)0, 63>>cut12, wt301);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)0, 63>>cut12, wt302);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)0, 63>>cut12, wt303);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)0, 63>>cut12, wt304);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)0, 63>>cut12, wt305);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)0, 63>>cut12, wt306);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt291);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt292);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt293);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt294);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt295);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt296);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt297);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt298);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt299);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt300);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt301);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt302);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt303);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt304);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt305);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt306);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt291);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt292);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt293);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt294);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt295);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt296);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt297);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt298);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt299);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt300);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt301);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt302);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt303);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt304);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt305);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)12288, 258048>>cut12, wt306);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt291);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt292);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt293);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt294);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt295);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt296);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt297);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt298);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt299);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt300);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt301);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt302);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt303);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt304);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt305);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt306);
}
}
}
} else {
ptrdiff_t k100 = 112;
ptrdiff_t l39 = (size_t)(512+k100)/6;
ptrdiff_t cut11 = (size_t)(512+k100)%6;
__m512 sum207 = _mm512_maskz_loadu_ps(65535, biasPtr9+2560*i31+4*k100);
__m512i pmMul19 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd19 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo16 = _mm512_loadu_ps(bnPtr9+(ptrdiff_t)8*(k100+640*i31));
__m512 masHi16 = _mm512_maskz_loadu_ps(65535, bnPtr9+(ptrdiff_t)8*(k100+640*i31)+(ptrdiff_t)64);
__m512 postMul27 = _mm512_permutex2var_ps(masLo16, pmMul19, masHi16);
__m512 postAdd17 = _mm512_permutex2var_ps(masLo16, pmAdd19, masHi16);
sum207 = _mm512_fmadd_ps(sum207, postMul27, postAdd17);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*0+(ptrdiff_t)0, 63>>cut11, sum207);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*0+(ptrdiff_t)6144, 4032>>cut11, sum207);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*0+(ptrdiff_t)12288, 65535-(4095>>cut11), sum207);
ptrdiff_t c23 = 0;
for (; c23 != 16; ++c23) {
__m512 wt259 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)0);
__m512 wt260 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)1024);
__m512 wt261 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)2048);
__m512 wt262 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)3072);
__m512 wt263 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)4096);
__m512 wt264 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)5120);
__m512 wt265 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)6144);
__m512 wt266 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)7168);
__m512 wt267 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)8192);
__m512 wt268 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)9216);
__m512 wt269 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)10240);
__m512 wt270 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)11264);
__m512 wt271 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)12288);
__m512 wt272 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)13312);
__m512 wt273 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)14336);
__m512 wt274 = _mm512_maskz_loadu_ps(65535, wtPtr9+655360*i31+1024*k100+64*c23+(ptrdiff_t)15360);
__m512 tmp10745 = _mm512_unpacklo_ps(wt259, wt260);
__m512 tmp10746 = _mm512_unpackhi_ps(wt259, wt260);
__m512 tmp10747 = _mm512_unpacklo_ps(wt261, wt262);
__m512 tmp10748 = _mm512_unpackhi_ps(wt261, wt262);
__m512 tmp10749 = _mm512_unpacklo_ps(wt263, wt264);
__m512 tmp10750 = _mm512_unpackhi_ps(wt263, wt264);
__m512 tmp10751 = _mm512_unpacklo_ps(wt265, wt266);
__m512 tmp10752 = _mm512_unpackhi_ps(wt265, wt266);
__m512 tmp10753 = _mm512_unpacklo_ps(wt267, wt268);
__m512 tmp10754 = _mm512_unpackhi_ps(wt267, wt268);
__m512 tmp10755 = _mm512_unpacklo_ps(wt269, wt270);
__m512 tmp10756 = _mm512_unpackhi_ps(wt269, wt270);
__m512 tmp10757 = _mm512_unpacklo_ps(wt271, wt272);
__m512 tmp10758 = _mm512_unpackhi_ps(wt271, wt272);
__m512 tmp10759 = _mm512_unpacklo_ps(wt273, wt274);
__m512 tmp10760 = _mm512_unpackhi_ps(wt273, wt274);
__m512 tmp10761 = _mm512_shuffle_ps(tmp10745, tmp10747, 68);
__m512 tmp10762 = _mm512_shuffle_ps(tmp10745, tmp10747, 238);
__m512 tmp10763 = _mm512_shuffle_ps(tmp10746, tmp10748, 68);
__m512 tmp10764 = _mm512_shuffle_ps(tmp10746, tmp10748, 238);
__m512 tmp10765 = _mm512_shuffle_ps(tmp10749, tmp10751, 68);
__m512 tmp10766 = _mm512_shuffle_ps(tmp10749, tmp10751, 238);
__m512 tmp10767 = _mm512_shuffle_ps(tmp10750, tmp10752, 68);
__m512 tmp10768 = _mm512_shuffle_ps(tmp10750, tmp10752, 238);
__m512 tmp10769 = _mm512_shuffle_ps(tmp10753, tmp10755, 68);
__m512 tmp10770 = _mm512_shuffle_ps(tmp10753, tmp10755, 238);
__m512 tmp10771 = _mm512_shuffle_ps(tmp10754, tmp10756, 68);
__m512 tmp10772 = _mm512_shuffle_ps(tmp10754, tmp10756, 238);
__m512 tmp10773 = _mm512_shuffle_ps(tmp10757, tmp10759, 68);
__m512 tmp10774 = _mm512_shuffle_ps(tmp10757, tmp10759, 238);
__m512 tmp10775 = _mm512_shuffle_ps(tmp10758, tmp10760, 68);
__m512 tmp10776 = _mm512_shuffle_ps(tmp10758, tmp10760, 238);
__m512 tmp10777 = _mm512_shuffle_f32x4(tmp10761, tmp10765, 136);
__m512 tmp10778 = _mm512_shuffle_f32x4(tmp10761, tmp10765, 221);
__m512 tmp10779 = _mm512_shuffle_f32x4(tmp10762, tmp10766, 136);
__m512 tmp10780 = _mm512_shuffle_f32x4(tmp10762, tmp10766, 221);
__m512 tmp10781 = _mm512_shuffle_f32x4(tmp10763, tmp10767, 136);
__m512 tmp10782 = _mm512_shuffle_f32x4(tmp10763, tmp10767, 221);
__m512 tmp10783 = _mm512_shuffle_f32x4(tmp10764, tmp10768, 136);
__m512 tmp10784 = _mm512_shuffle_f32x4(tmp10764, tmp10768, 221);
__m512 tmp10785 = _mm512_shuffle_f32x4(tmp10769, tmp10773, 136);
__m512 tmp10786 = _mm512_shuffle_f32x4(tmp10769, tmp10773, 221);
__m512 tmp10787 = _mm512_shuffle_f32x4(tmp10770, tmp10774, 136);
__m512 tmp10788 = _mm512_shuffle_f32x4(tmp10770, tmp10774, 221);
__m512 tmp10789 = _mm512_shuffle_f32x4(tmp10771, tmp10775, 136);
__m512 tmp10790 = _mm512_shuffle_f32x4(tmp10771, tmp10775, 221);
__m512 tmp10791 = _mm512_shuffle_f32x4(tmp10772, tmp10776, 136);
__m512 tmp10792 = _mm512_shuffle_f32x4(tmp10772, tmp10776, 221);
wt259 = _mm512_shuffle_f32x4(tmp10777, tmp10785, 136);
wt267 = _mm512_shuffle_f32x4(tmp10777, tmp10785, 221);
wt260 = _mm512_shuffle_f32x4(tmp10779, tmp10787, 136);
wt268 = _mm512_shuffle_f32x4(tmp10779, tmp10787, 221);
wt261 = _mm512_shuffle_f32x4(tmp10781, tmp10789, 136);
wt269 = _mm512_shuffle_f32x4(tmp10781, tmp10789, 221);
wt262 = _mm512_shuffle_f32x4(tmp10783, tmp10791, 136);
wt270 = _mm512_shuffle_f32x4(tmp10783, tmp10791, 221);
wt263 = _mm512_shuffle_f32x4(tmp10778, tmp10786, 136);
wt271 = _mm512_shuffle_f32x4(tmp10778, tmp10786, 221);
wt264 = _mm512_shuffle_f32x4(tmp10780, tmp10788, 136);
wt272 = _mm512_shuffle_f32x4(tmp10780, tmp10788, 221);
wt265 = _mm512_shuffle_f32x4(tmp10782, tmp10790, 136);
wt273 = _mm512_shuffle_f32x4(tmp10782, tmp10790, 221);
wt266 = _mm512_shuffle_f32x4(tmp10784, tmp10792, 136);
wt274 = _mm512_shuffle_f32x4(tmp10784, tmp10792, 221);
wt259 = _mm512_mul_ps(wt259, postMul27);
wt260 = _mm512_mul_ps(wt260, postMul27);
wt261 = _mm512_mul_ps(wt261, postMul27);
wt262 = _mm512_mul_ps(wt262, postMul27);
wt263 = _mm512_mul_ps(wt263, postMul27);
wt264 = _mm512_mul_ps(wt264, postMul27);
wt265 = _mm512_mul_ps(wt265, postMul27);
wt266 = _mm512_mul_ps(wt266, postMul27);
wt267 = _mm512_mul_ps(wt267, postMul27);
wt268 = _mm512_mul_ps(wt268, postMul27);
wt269 = _mm512_mul_ps(wt269, postMul27);
wt270 = _mm512_mul_ps(wt270, postMul27);
wt271 = _mm512_mul_ps(wt271, postMul27);
wt272 = _mm512_mul_ps(wt272, postMul27);
wt273 = _mm512_mul_ps(wt273, postMul27);
wt274 = _mm512_mul_ps(wt274, postMul27);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(1+16*c23)+(ptrdiff_t)0, 63>>cut11, wt259);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(2+16*c23)+(ptrdiff_t)0, 63>>cut11, wt260);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(3+16*c23)+(ptrdiff_t)0, 63>>cut11, wt261);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(4+16*c23)+(ptrdiff_t)0, 63>>cut11, wt262);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(5+16*c23)+(ptrdiff_t)0, 63>>cut11, wt263);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(6+16*c23)+(ptrdiff_t)0, 63>>cut11, wt264);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(7+16*c23)+(ptrdiff_t)0, 63>>cut11, wt265);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(8+16*c23)+(ptrdiff_t)0, 63>>cut11, wt266);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(9+16*c23)+(ptrdiff_t)0, 63>>cut11, wt267);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(10+16*c23)+(ptrdiff_t)0, 63>>cut11, wt268);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(11+16*c23)+(ptrdiff_t)0, 63>>cut11, wt269);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(12+16*c23)+(ptrdiff_t)0, 63>>cut11, wt270);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(13+16*c23)+(ptrdiff_t)0, 63>>cut11, wt271);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(14+16*c23)+(ptrdiff_t)0, 63>>cut11, wt272);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(15+16*c23)+(ptrdiff_t)0, 63>>cut11, wt273);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(16+16*c23)+(ptrdiff_t)0, 63>>cut11, wt274);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(1+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt259);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(2+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt260);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(3+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt261);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(4+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt262);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(5+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt263);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(6+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt264);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(7+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt265);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(8+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt266);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(9+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt267);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(10+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt268);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(11+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt269);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(12+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt270);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(13+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt271);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(14+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt272);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(15+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt273);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+24*(16+16*c23)+(ptrdiff_t)6144, 4032>>cut11, wt274);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(1+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt259);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(2+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt260);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(3+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt261);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(4+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt262);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(5+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt263);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(6+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt264);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(7+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt265);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(8+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt266);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(9+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt267);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(10+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt268);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(11+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt269);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(12+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt270);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(13+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt271);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(14+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt272);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(15+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt273);
_mm512_mask_storeu_ps(arranged7+657920*i31+6168*l39+4*cut11+16*(16+16*c23)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt274);
}
}
}
}
}

/*
 * Hands the weight-arrangement worker to the threader.
 *
 * team36    - threader team passed straight through to ResNet50ThreaderDo1.
 * tensors45 - pointer array stored in the task's `any1` slot; the worker
 *             reads its buffers from there.
 *
 * The task is declared with 3 dimensions and extents 20 x 1 x 1.
 * NOTE(review): presumably the threader invokes the callee once per point
 * of that index space -- confirm against ResNet50ThreaderDo1.
 */
static void ResNet50OneArrangeWts4(ResNet50ThreaderTeam1* team36, char** tensors45) {
	ResNet50ThreaderTask1 wtsJob;
	/* Iteration space first ... */
	wtsJob.nd1 = 3;
	wtsJob.hull1[0] = 20;
	wtsJob.hull1[1] = 1;
	wtsJob.hull1[2] = 1;
	/* ... then the worker and its opaque argument. */
	wtsJob.any1 = tensors45;
	wtsJob.callee1 = ResNet50OneArrangeWts4Callee1;
	ResNet50ThreaderDo1(team36, &wtsJob);
}

/*
 * Worker for the data-arrangement step: copies every second float of every
 * second source row into a packed destination buffer.
 *
 * task50 - task descriptor; `any1` holds a char*[]: [0] is the source
 *          buffer, [1] is the destination buffer.
 * pt30   - task coordinates. pt30[0] selects a run of 128 consecutive
 *          plane indices (128*pt30[0] .. 128*pt30[0]+127); pt30[1] selects
 *          a band of 4 source rows.
 *          NOTE(review): the planes are presumably channels -- confirm.
 *
 * For each plane index k in the run, the source is read at
 *   src + 224*(4*band) + 12608*k
 * and 256 bytes are written at
 *   dst + 65536*band + 256*k.
 * Two source rows are consumed per call (byte offsets 0 and 448 within the
 * band). From each row, two 64-byte vectors are produced, holding the
 * even-indexed floats of the row.
 *
 * The generated switch/goto wrapper this replaces executed its body exactly
 * once per call, so the logic is written here as one straight loop.
 */
static void ResNet50OneArrangeDats4Callee1(ResNet50ThreaderTask1* task50, int64_t* pt30) {
	char** bufs = task50->any1;
	const ptrdiff_t slab = pt30[0];
	const ptrdiff_t band = pt30[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];

	/* Lane i of a two-source permute with this index picks element 2*i of
	   the 32-float concatenation (first operand, then second operand). */
	const __m512i evenLanes = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);

	const ptrdiff_t srcBand = 224*(4*band);
	const ptrdiff_t dstBand = 65536*band;
	const ptrdiff_t first = 128*slab;
	const ptrdiff_t limit = first+128;

	for (ptrdiff_t k = first; k < limit; ++k) {
		char* in = src+srcBand+12608*k;
		char* out = dst+dstBand+256*k;

		/* First source row. Mask 32767 loads 15 floats: the 16th is never
		   selected by the even-lane permute, so it is not read. */
		__m512 rowA0 = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)0);
		__m512 rowA1 = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)64);
		__m512 rowA2 = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)128);
		/* Mask 127 loads only 7 floats; the remaining lanes are zeroed, so
		   the upper four lanes of the second packed vector come out as 0. */
		__m512 rowA3 = _mm512_maskz_loadu_ps(127, in+(ptrdiff_t)192);

		/* Second source row used, 448 bytes further on (one 224-byte row
		   is skipped in between). */
		__m512 rowB0 = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)448);
		__m512 rowB1 = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)512);
		__m512 rowB2 = _mm512_maskz_loadu_ps(32767, in+(ptrdiff_t)576);
		__m512 rowB3 = _mm512_maskz_loadu_ps(127, in+(ptrdiff_t)640);

		__m512 packA0 = _mm512_permutex2var_ps(rowA0, evenLanes, rowA1);
		__m512 packA1 = _mm512_permutex2var_ps(rowA2, evenLanes, rowA3);
		__m512 packB0 = _mm512_permutex2var_ps(rowB0, evenLanes, rowB1);
		__m512 packB1 = _mm512_permutex2var_ps(rowB2, evenLanes, rowB3);

		_mm512_storeu_ps(out+(ptrdiff_t)0, packA0);
		_mm512_storeu_ps(out+(ptrdiff_t)64, packA1);
		_mm512_storeu_ps(out+(ptrdiff_t)128, packB0);
		_mm512_storeu_ps(out+(ptrdiff_t)192, packB1);
	}
}

/*
 * Hands the data-arrangement worker to the threader.
 *
 * team37    - threader team passed straight through to ResNet50ThreaderDo1.
 * tensors47 - pointer array stored in the task's `any1` slot; the worker
 *             takes its source buffer from entry 0 and its destination
 *             buffer from entry 1.
 *
 * The task is declared with 4 dimensions and extents 2 x 14 x 1 x 1. The
 * worker reads only the first two coordinates: the first picks a run of
 * 128 plane indices, the second a band of source rows.
 * NOTE(review): presumably the threader invokes the callee once per point
 * of that index space -- confirm against ResNet50ThreaderDo1.
 */
static void ResNet50OneArrangeDats4(ResNet50ThreaderTeam1* team37, char** tensors47) {
	ResNet50ThreaderTask1 datsJob;
	/* Iteration space first ... */
	datsJob.nd1 = 4;
	datsJob.hull1[0] = 2;
	datsJob.hull1[1] = 14;
	datsJob.hull1[2] = 1;
	datsJob.hull1[3] = 1;
	/* ... then the worker and its opaque argument. */
	datsJob.any1 = tensors47;
	datsJob.callee1 = ResNet50OneArrangeDats4Callee1;
	ResNet50ThreaderDo1(team37, &datsJob);
}

// Per-point worker for ResNet50OneApply4: multiplies pre-arranged weights by
// pre-arranged data with AVX-512 FMAs and writes the results to the output
// tensor.
//
// task52 - threader task; its any1 field is a pointer pair whose first element
//          is the tensor pointer array (tensors[0] = arranged weights,
//          tensors[1] = arranged data, tensors[2] = output).
// pt31   - coordinates of the point to process: pt31[0] selects the weight
//          group pair (w47), pt31[1] selects the data slice (d10).
//          NOTE(review): the driver ResNet50OneApply4 declares a 53 x 14 x 1
//          hull, so presumably 0 <= w47 <= 52 and 0 <= d10 <= 13 - confirm
//          against the threader.
//
// For w47 < 52 the call handles two groups of 6 weight rows (k103 = 2*w47 and
// 2*w47+1) and returns. For w47 == 52 it handles groups 104 and 105 and then a
// final group of 4 weight rows at k103 == 106.
static void ResNet50OneApply4Callee1(ResNet50ThreaderTask1* task52, int64_t* pt31) {
void** pair12 = task52->any1;
char** tensors50 = pair12[0];
// e15 and g17 are fixed at zero here, so every term scaled by them below vanishes.
ptrdiff_t e15 = 0;
ptrdiff_t g17 = 0;
ptrdiff_t d10 = pt31[1];
ptrdiff_t w47 = pt31[0];
char*restrict arrangedWts4 = tensors50[0]+2140160*e15+(ptrdiff_t)657920*1*g17;
char*restrict arrangedDats4 = tensors50[1]+2992640*e15+(ptrdiff_t)917504*1*g17;
char*restrict datPtr15 = tensors50[2]+(ptrdiff_t)2007040*1*g17;
ptrdiff_t ii12 = 1;
for (ptrdiff_t i33 = 0; i33 < ii12; ++i33) {
// j26 indexes a 65536-byte slice of the arranged data; jj32 equals j26, so
// exactly one slice is processed per call.
ptrdiff_t j26 = 1*d10;
ptrdiff_t jj32 = j26+0;
// Output row offset: each slice covers two 112-byte output rows.
ptrdiff_t h39 = 0+((size_t)j26-0)/1*2;
// The modulus is 1, so the selector is always 0 and only `default` is taken.
switch (((size_t)j26-0)%1) {
default: {
wrap4:;
// k103 walks weight records of 6168 bytes; kk33 is the last group this
// point is responsible for inside the loop.
ptrdiff_t k103 = 2*w47;
ptrdiff_t kk33 = k103+(w47 < 52 ? 1 : 2);
for (; k103 != 106; ++k103) {
// With s22 == -1 the term 24*s22+24 is 0, so the six initial values are read
// from the first 24 bytes of the weight record.
// NOTE(review): presumably the folded bias values - confirm against the
// weight-arrangement stage.
ptrdiff_t s22 = -1;
__m512 sum210 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)24));
__m512 sum214 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)28));
__m512 sum218 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)32));
__m512 sum222 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)36));
__m512 sum226 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)40));
__m512 sum230 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)44));
// Each of the six initial values seeds four accumulators (one per data vector),
// giving a 6 x 4 block of 24 accumulators.
__m512 sum211 = sum210;
__m512 sum212 = sum210;
__m512 sum213 = sum210;
__m512 sum215 = sum214;
__m512 sum216 = sum214;
__m512 sum217 = sum214;
__m512 sum219 = sum218;
__m512 sum220 = sum218;
__m512 sum221 = sum218;
__m512 sum223 = sum222;
__m512 sum224 = sum222;
__m512 sum225 = sum222;
__m512 sum227 = sum226;
__m512 sum228 = sum226;
__m512 sum229 = sum226;
__m512 sum231 = sum230;
__m512 sum232 = sum230;
__m512 sum233 = sum230;
// 256 accumulation steps: each step loads four 16-float data vectors (256 bytes)
// and six scalar weights (24 bytes), broadcasting each weight across a vector.
for (s22 = 0; s22 < 256; ++s22) {
__m512 dat1657 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s22+(ptrdiff_t)0);
__m512 dat1658 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s22+(ptrdiff_t)64);
__m512 dat1659 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s22+(ptrdiff_t)128);
__m512 dat1660 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s22+(ptrdiff_t)192);
__m512 wt307 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)24));
sum210 = _mm512_fmadd_ps(wt307, dat1657, sum210);
sum211 = _mm512_fmadd_ps(wt307, dat1658, sum211);
sum212 = _mm512_fmadd_ps(wt307, dat1659, sum212);
sum213 = _mm512_fmadd_ps(wt307, dat1660, sum213);
__m512 wt308 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)28));
sum214 = _mm512_fmadd_ps(wt308, dat1657, sum214);
sum215 = _mm512_fmadd_ps(wt308, dat1658, sum215);
sum216 = _mm512_fmadd_ps(wt308, dat1659, sum216);
sum217 = _mm512_fmadd_ps(wt308, dat1660, sum217);
__m512 wt309 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)32));
sum218 = _mm512_fmadd_ps(wt309, dat1657, sum218);
sum219 = _mm512_fmadd_ps(wt309, dat1658, sum219);
sum220 = _mm512_fmadd_ps(wt309, dat1659, sum220);
sum221 = _mm512_fmadd_ps(wt309, dat1660, sum221);
__m512 wt310 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)36));
sum222 = _mm512_fmadd_ps(wt310, dat1657, sum222);
sum223 = _mm512_fmadd_ps(wt310, dat1658, sum223);
sum224 = _mm512_fmadd_ps(wt310, dat1659, sum224);
sum225 = _mm512_fmadd_ps(wt310, dat1660, sum225);
__m512 wt311 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)40));
sum226 = _mm512_fmadd_ps(wt311, dat1657, sum226);
sum227 = _mm512_fmadd_ps(wt311, dat1658, sum227);
sum228 = _mm512_fmadd_ps(wt311, dat1659, sum228);
sum229 = _mm512_fmadd_ps(wt311, dat1660, sum229);
__m512 wt312 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+24*s22+(ptrdiff_t)44));
sum230 = _mm512_fmadd_ps(wt312, dat1657, sum230);
sum231 = _mm512_fmadd_ps(wt312, dat1658, sum231);
sum232 = _mm512_fmadd_ps(wt312, dat1659, sum232);
sum233 = _mm512_fmadd_ps(wt312, dat1660, sum233);
}
// Write-back: the six accumulator rows go to six output planes spaced 3136 bytes
// apart (18816 bytes per k103 group). Within a plane, two consecutive 112-byte
// rows are written, each as 16 floats (mask 65535) followed by 12 floats
// (mask 4095).
__m512 dat1661 = sum211;
__m512 dat1662 = sum213;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)0, 65535, sum210);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)64, 4095, dat1661);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)112, 65535, sum212);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)176, 4095, dat1662);
__m512 dat1663 = sum215;
__m512 dat1664 = sum217;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3136, 65535, sum214);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3200, 4095, dat1663);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3248, 65535, sum216);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3312, 4095, dat1664);
__m512 dat1665 = sum219;
__m512 dat1666 = sum221;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6272, 65535, sum218);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6336, 4095, dat1665);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6384, 65535, sum220);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6448, 4095, dat1666);
__m512 dat1667 = sum223;
__m512 dat1668 = sum225;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9408, 65535, sum222);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9472, 4095, dat1667);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9520, 65535, sum224);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9584, 4095, dat1668);
__m512 dat1669 = sum227;
__m512 dat1670 = sum229;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)12544, 65535, sum226);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)12608, 4095, dat1669);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)12656, 65535, sum228);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)12720, 4095, dat1670);
__m512 dat1671 = sum231;
__m512 dat1672 = sum233;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)15680, 65535, sum230);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)15744, 4095, dat1671);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)15792, 65535, sum232);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)15856, 4095, dat1672);
// Done once this point's last in-loop group has been written. For w47 == 52,
// kk33 is 106, so the loop instead ends via k103 == 106 and control falls
// through to the 4-row tail below.
if (k103 >= kk33) return;
}
// Tail group (reached only with k103 == 106): same scheme as above but with
// four weight rows per step, so the per-step weight stride is 16 bytes and the
// initial values sit in the first 16 bytes of the record.
ptrdiff_t s23 = -1;
__m512 sum234 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)16));
__m512 sum238 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)20));
__m512 sum242 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)24));
__m512 sum246 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)28));
__m512 sum235 = sum234;
__m512 sum236 = sum234;
__m512 sum237 = sum234;
__m512 sum239 = sum238;
__m512 sum240 = sum238;
__m512 sum241 = sum238;
__m512 sum243 = sum242;
__m512 sum244 = sum242;
__m512 sum245 = sum242;
__m512 sum247 = sum246;
__m512 sum248 = sum246;
__m512 sum249 = sum246;
// Same 256-step accumulation over the same data slice, 4 x 4 accumulators.
for (s23 = 0; s23 < 256; ++s23) {
__m512 dat1673 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s23+(ptrdiff_t)0);
__m512 dat1674 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s23+(ptrdiff_t)64);
__m512 dat1675 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s23+(ptrdiff_t)128);
__m512 dat1676 = _mm512_loadu_ps(arrangedDats4+917504*i33+65536*j26+256*s23+(ptrdiff_t)192);
__m512 wt313 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)16));
sum234 = _mm512_fmadd_ps(wt313, dat1673, sum234);
sum235 = _mm512_fmadd_ps(wt313, dat1674, sum235);
sum236 = _mm512_fmadd_ps(wt313, dat1675, sum236);
sum237 = _mm512_fmadd_ps(wt313, dat1676, sum237);
__m512 wt314 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)20));
sum238 = _mm512_fmadd_ps(wt314, dat1673, sum238);
sum239 = _mm512_fmadd_ps(wt314, dat1674, sum239);
sum240 = _mm512_fmadd_ps(wt314, dat1675, sum240);
sum241 = _mm512_fmadd_ps(wt314, dat1676, sum241);
__m512 wt315 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)24));
sum242 = _mm512_fmadd_ps(wt315, dat1673, sum242);
sum243 = _mm512_fmadd_ps(wt315, dat1674, sum243);
sum244 = _mm512_fmadd_ps(wt315, dat1675, sum244);
sum245 = _mm512_fmadd_ps(wt315, dat1676, sum245);
__m512 wt316 = _mm512_set1_ps(*(float*)(arrangedWts4+657920*i33+6168*k103+16*s23+(ptrdiff_t)28));
sum246 = _mm512_fmadd_ps(wt316, dat1673, sum246);
sum247 = _mm512_fmadd_ps(wt316, dat1674, sum247);
sum248 = _mm512_fmadd_ps(wt316, dat1675, sum248);
sum249 = _mm512_fmadd_ps(wt316, dat1676, sum249);
}
// Write-back for the four tail planes, using the same two-row 16+12 float
// layout as the main loop.
__m512 dat1677 = sum235;
__m512 dat1678 = sum237;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)0, 65535, sum234);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)64, 4095, dat1677);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)112, 65535, sum236);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)176, 4095, dat1678);
__m512 dat1679 = sum239;
__m512 dat1680 = sum241;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3136, 65535, sum238);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3200, 4095, dat1679);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3248, 65535, sum240);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)3312, 4095, dat1680);
__m512 dat1681 = sum243;
__m512 dat1682 = sum245;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6272, 65535, sum242);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6336, 4095, dat1681);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6384, 65535, sum244);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)6448, 4095, dat1682);
__m512 dat1683 = sum247;
__m512 dat1684 = sum249;
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9408, 65535, sum246);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9472, 4095, dat1683);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9520, 65535, sum248);
_mm512_mask_storeu_ps(datPtr15+2007040*i33+112*h39+18816*k103+(ptrdiff_t)9584, 4095, dat1684);
// jj32 was set equal to j26 and neither has changed, so this return is always
// taken; the slice-advance and `goto wrap4` below are not reached in this
// instantiation.
if (j26 >= jj32) return;
if (j26 >= 13) break;
++j26;
h39 += 2;
goto wrap4;
}
}
j26 = 14;
}
}

// Builds a threader task for ResNet50OneApply4Callee1 and hands it to
// ResNet50ThreaderDo1 on the supplied team.
//
// team38    - thread team the task is submitted to.
// tensors49 - tensor pointer array; passed to the callee as the first element
//             of a two-slot pointer pair (the second slot is null).
//
// The task describes a 3-dimensional hull with extents 53 x 14 x 1.
static void ResNet50OneApply4(ResNet50ThreaderTeam1* team38, char** tensors49) {
    const int extents[3] = {53, 14, 1};
    void* args[2];
    args[0] = tensors49;
    args[1] = 0;
    ResNet50ThreaderTask1 job;
    job.nd1 = 3;
    for (int axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    job.any1 = args;
    job.callee1 = ResNet50OneApply4Callee1;
    ResNet50ThreaderDo1(team38, &job);
}

static void ResNet50OneArrangeWts5Callee1(ResNet50ThreaderTask1* task62, int64_t* pt36) {
char** tensors60 = task62->any1;
ptrdiff_t b57 = pt36[0];
char*restrict wtPtr11 = tensors60[0]+(ptrdiff_t)3340*0+(ptrdiff_t)262144*0;
char*restrict biasPtr11 = tensors60[1]+(ptrdiff_t)2048*0;
char*restrict bnPtr11 = tensors60[2]+(ptrdiff_t)8*512*0;
char*restrict arranged9 = tensors60[3]+(ptrdiff_t)1712128*0+(ptrdiff_t)264192*0;
ptrdiff_t ii13 = 1;
for (ptrdiff_t i38 = 0; i38 < ii13; ++i38) {
ptrdiff_t j31 = 4*b57;
ptrdiff_t jj34 = j31+4;
for (; j31 < jj34; ++j31) {
if (j31 < 31) {
ptrdiff_t k117 = 0+16*(j31-0);
ptrdiff_t l49 = (size_t)(0+k117)/6;
ptrdiff_t cut15 = (size_t)(0+k117)%6;
switch (cut15) {
case 0:;
case 2: {
__m512 sum279 = _mm512_maskz_loadu_ps(65535, biasPtr11+2048*i38+4*k117);
__m512i pmMul21 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd21 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo17 = _mm512_loadu_ps(bnPtr11+(ptrdiff_t)8*(k117+512*i38));
__m512 masHi17 = _mm512_maskz_loadu_ps(65535, bnPtr11+(ptrdiff_t)8*(k117+512*i38)+(ptrdiff_t)64);
__m512 postMul36 = _mm512_permutex2var_ps(masLo17, pmMul21, masHi17);
__m512 postAdd22 = _mm512_permutex2var_ps(masLo17, pmAdd21, masHi17);
sum279 = _mm512_fmadd_ps(sum279, postMul36, postAdd22);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)0, 63>>cut15, sum279);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)3072, 4032>>cut15, sum279);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)6144, 65535-(4095>>cut15), sum279);
ptrdiff_t c29 = 0;
for (; c29 != 8; ++c29) {
__m512 wt337 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)0);
__m512 wt338 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)512);
__m512 wt339 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)1024);
__m512 wt340 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)1536);
__m512 wt341 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)2048);
__m512 wt342 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)2560);
__m512 wt343 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)3072);
__m512 wt344 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)3584);
__m512 wt345 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)4096);
__m512 wt346 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)4608);
__m512 wt347 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)5120);
__m512 wt348 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)5632);
__m512 wt349 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)6144);
__m512 wt350 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)6656);
__m512 wt351 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)7168);
__m512 wt352 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c29+(ptrdiff_t)7680);
__m512 tmp13411 = _mm512_unpacklo_ps(wt337, wt338);
__m512 tmp13412 = _mm512_unpackhi_ps(wt337, wt338);
__m512 tmp13413 = _mm512_unpacklo_ps(wt339, wt340);
__m512 tmp13414 = _mm512_unpackhi_ps(wt339, wt340);
__m512 tmp13415 = _mm512_unpacklo_ps(wt341, wt342);
__m512 tmp13416 = _mm512_unpackhi_ps(wt341, wt342);
__m512 tmp13417 = _mm512_unpacklo_ps(wt343, wt344);
__m512 tmp13418 = _mm512_unpackhi_ps(wt343, wt344);
__m512 tmp13419 = _mm512_unpacklo_ps(wt345, wt346);
__m512 tmp13420 = _mm512_unpackhi_ps(wt345, wt346);
__m512 tmp13421 = _mm512_unpacklo_ps(wt347, wt348);
__m512 tmp13422 = _mm512_unpackhi_ps(wt347, wt348);
__m512 tmp13423 = _mm512_unpacklo_ps(wt349, wt350);
__m512 tmp13424 = _mm512_unpackhi_ps(wt349, wt350);
__m512 tmp13425 = _mm512_unpacklo_ps(wt351, wt352);
__m512 tmp13426 = _mm512_unpackhi_ps(wt351, wt352);
__m512 tmp13427 = _mm512_shuffle_ps(tmp13411, tmp13413, 68);
__m512 tmp13428 = _mm512_shuffle_ps(tmp13411, tmp13413, 238);
__m512 tmp13429 = _mm512_shuffle_ps(tmp13412, tmp13414, 68);
__m512 tmp13430 = _mm512_shuffle_ps(tmp13412, tmp13414, 238);
__m512 tmp13431 = _mm512_shuffle_ps(tmp13415, tmp13417, 68);
__m512 tmp13432 = _mm512_shuffle_ps(tmp13415, tmp13417, 238);
__m512 tmp13433 = _mm512_shuffle_ps(tmp13416, tmp13418, 68);
__m512 tmp13434 = _mm512_shuffle_ps(tmp13416, tmp13418, 238);
__m512 tmp13435 = _mm512_shuffle_ps(tmp13419, tmp13421, 68);
__m512 tmp13436 = _mm512_shuffle_ps(tmp13419, tmp13421, 238);
__m512 tmp13437 = _mm512_shuffle_ps(tmp13420, tmp13422, 68);
__m512 tmp13438 = _mm512_shuffle_ps(tmp13420, tmp13422, 238);
__m512 tmp13439 = _mm512_shuffle_ps(tmp13423, tmp13425, 68);
__m512 tmp13440 = _mm512_shuffle_ps(tmp13423, tmp13425, 238);
__m512 tmp13441 = _mm512_shuffle_ps(tmp13424, tmp13426, 68);
__m512 tmp13442 = _mm512_shuffle_ps(tmp13424, tmp13426, 238);
__m512 tmp13443 = _mm512_shuffle_f32x4(tmp13427, tmp13431, 136);
__m512 tmp13444 = _mm512_shuffle_f32x4(tmp13427, tmp13431, 221);
__m512 tmp13445 = _mm512_shuffle_f32x4(tmp13428, tmp13432, 136);
__m512 tmp13446 = _mm512_shuffle_f32x4(tmp13428, tmp13432, 221);
__m512 tmp13447 = _mm512_shuffle_f32x4(tmp13429, tmp13433, 136);
__m512 tmp13448 = _mm512_shuffle_f32x4(tmp13429, tmp13433, 221);
__m512 tmp13449 = _mm512_shuffle_f32x4(tmp13430, tmp13434, 136);
__m512 tmp13450 = _mm512_shuffle_f32x4(tmp13430, tmp13434, 221);
__m512 tmp13451 = _mm512_shuffle_f32x4(tmp13435, tmp13439, 136);
__m512 tmp13452 = _mm512_shuffle_f32x4(tmp13435, tmp13439, 221);
__m512 tmp13453 = _mm512_shuffle_f32x4(tmp13436, tmp13440, 136);
__m512 tmp13454 = _mm512_shuffle_f32x4(tmp13436, tmp13440, 221);
__m512 tmp13455 = _mm512_shuffle_f32x4(tmp13437, tmp13441, 136);
__m512 tmp13456 = _mm512_shuffle_f32x4(tmp13437, tmp13441, 221);
__m512 tmp13457 = _mm512_shuffle_f32x4(tmp13438, tmp13442, 136);
__m512 tmp13458 = _mm512_shuffle_f32x4(tmp13438, tmp13442, 221);
wt337 = _mm512_shuffle_f32x4(tmp13443, tmp13451, 136);
wt345 = _mm512_shuffle_f32x4(tmp13443, tmp13451, 221);
wt338 = _mm512_shuffle_f32x4(tmp13445, tmp13453, 136);
wt346 = _mm512_shuffle_f32x4(tmp13445, tmp13453, 221);
wt339 = _mm512_shuffle_f32x4(tmp13447, tmp13455, 136);
wt347 = _mm512_shuffle_f32x4(tmp13447, tmp13455, 221);
wt340 = _mm512_shuffle_f32x4(tmp13449, tmp13457, 136);
wt348 = _mm512_shuffle_f32x4(tmp13449, tmp13457, 221);
wt341 = _mm512_shuffle_f32x4(tmp13444, tmp13452, 136);
wt349 = _mm512_shuffle_f32x4(tmp13444, tmp13452, 221);
wt342 = _mm512_shuffle_f32x4(tmp13446, tmp13454, 136);
wt350 = _mm512_shuffle_f32x4(tmp13446, tmp13454, 221);
wt343 = _mm512_shuffle_f32x4(tmp13448, tmp13456, 136);
wt351 = _mm512_shuffle_f32x4(tmp13448, tmp13456, 221);
wt344 = _mm512_shuffle_f32x4(tmp13450, tmp13458, 136);
wt352 = _mm512_shuffle_f32x4(tmp13450, tmp13458, 221);
wt337 = _mm512_mul_ps(wt337, postMul36);
wt338 = _mm512_mul_ps(wt338, postMul36);
wt339 = _mm512_mul_ps(wt339, postMul36);
wt340 = _mm512_mul_ps(wt340, postMul36);
wt341 = _mm512_mul_ps(wt341, postMul36);
wt342 = _mm512_mul_ps(wt342, postMul36);
wt343 = _mm512_mul_ps(wt343, postMul36);
wt344 = _mm512_mul_ps(wt344, postMul36);
wt345 = _mm512_mul_ps(wt345, postMul36);
wt346 = _mm512_mul_ps(wt346, postMul36);
wt347 = _mm512_mul_ps(wt347, postMul36);
wt348 = _mm512_mul_ps(wt348, postMul36);
wt349 = _mm512_mul_ps(wt349, postMul36);
wt350 = _mm512_mul_ps(wt350, postMul36);
wt351 = _mm512_mul_ps(wt351, postMul36);
wt352 = _mm512_mul_ps(wt352, postMul36);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c29)+(ptrdiff_t)0, 63>>cut15, wt337);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c29)+(ptrdiff_t)0, 63>>cut15, wt338);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c29)+(ptrdiff_t)0, 63>>cut15, wt339);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c29)+(ptrdiff_t)0, 63>>cut15, wt340);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c29)+(ptrdiff_t)0, 63>>cut15, wt341);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c29)+(ptrdiff_t)0, 63>>cut15, wt342);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c29)+(ptrdiff_t)0, 63>>cut15, wt343);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c29)+(ptrdiff_t)0, 63>>cut15, wt344);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c29)+(ptrdiff_t)0, 63>>cut15, wt345);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c29)+(ptrdiff_t)0, 63>>cut15, wt346);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c29)+(ptrdiff_t)0, 63>>cut15, wt347);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c29)+(ptrdiff_t)0, 63>>cut15, wt348);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c29)+(ptrdiff_t)0, 63>>cut15, wt349);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c29)+(ptrdiff_t)0, 63>>cut15, wt350);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c29)+(ptrdiff_t)0, 63>>cut15, wt351);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c29)+(ptrdiff_t)0, 63>>cut15, wt352);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt337);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt338);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt339);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt340);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt341);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt342);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt343);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt344);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt345);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt346);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt347);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt348);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt349);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt350);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt351);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c29)+(ptrdiff_t)3072, 4032>>cut15, wt352);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt337);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt338);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt339);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt340);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt341);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt342);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt343);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt344);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt345);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt346);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt347);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt348);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt349);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt350);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt351);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c29)+(ptrdiff_t)6144, 65535-(4095>>cut15), wt352);
}
break;
}
default: {
cut15 = 4;
__m512 sum280 = _mm512_maskz_loadu_ps(65535, biasPtr11+2048*i38+4*k117);
__m512i pmMul22 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd22 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo18 = _mm512_loadu_ps(bnPtr11+(ptrdiff_t)8*(k117+512*i38));
__m512 masHi18 = _mm512_maskz_loadu_ps(65535, bnPtr11+(ptrdiff_t)8*(k117+512*i38)+(ptrdiff_t)64);
__m512 postMul37 = _mm512_permutex2var_ps(masLo18, pmMul22, masHi18);
__m512 postAdd23 = _mm512_permutex2var_ps(masLo18, pmAdd22, masHi18);
sum280 = _mm512_fmadd_ps(sum280, postMul37, postAdd23);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)0, 63>>cut15, sum280);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)3072, 4032>>cut15, sum280);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)6144, 258048>>cut15, sum280);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*0+(ptrdiff_t)9216, 65535-(262143>>cut15), sum280);
ptrdiff_t c30 = 0;
for (; c30 != 8; ++c30) {
__m512 wt353 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)0);
__m512 wt354 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)512);
__m512 wt355 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)1024);
__m512 wt356 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)1536);
__m512 wt357 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)2048);
__m512 wt358 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)2560);
__m512 wt359 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)3072);
__m512 wt360 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)3584);
__m512 wt361 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)4096);
__m512 wt362 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)4608);
__m512 wt363 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)5120);
__m512 wt364 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)5632);
__m512 wt365 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)6144);
__m512 wt366 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)6656);
__m512 wt367 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)7168);
__m512 wt368 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k117+64*c30+(ptrdiff_t)7680);
__m512 tmp13459 = _mm512_unpacklo_ps(wt353, wt354);
__m512 tmp13460 = _mm512_unpackhi_ps(wt353, wt354);
__m512 tmp13461 = _mm512_unpacklo_ps(wt355, wt356);
__m512 tmp13462 = _mm512_unpackhi_ps(wt355, wt356);
__m512 tmp13463 = _mm512_unpacklo_ps(wt357, wt358);
__m512 tmp13464 = _mm512_unpackhi_ps(wt357, wt358);
__m512 tmp13465 = _mm512_unpacklo_ps(wt359, wt360);
__m512 tmp13466 = _mm512_unpackhi_ps(wt359, wt360);
__m512 tmp13467 = _mm512_unpacklo_ps(wt361, wt362);
__m512 tmp13468 = _mm512_unpackhi_ps(wt361, wt362);
__m512 tmp13469 = _mm512_unpacklo_ps(wt363, wt364);
__m512 tmp13470 = _mm512_unpackhi_ps(wt363, wt364);
__m512 tmp13471 = _mm512_unpacklo_ps(wt365, wt366);
__m512 tmp13472 = _mm512_unpackhi_ps(wt365, wt366);
__m512 tmp13473 = _mm512_unpacklo_ps(wt367, wt368);
__m512 tmp13474 = _mm512_unpackhi_ps(wt367, wt368);
__m512 tmp13475 = _mm512_shuffle_ps(tmp13459, tmp13461, 68);
__m512 tmp13476 = _mm512_shuffle_ps(tmp13459, tmp13461, 238);
__m512 tmp13477 = _mm512_shuffle_ps(tmp13460, tmp13462, 68);
__m512 tmp13478 = _mm512_shuffle_ps(tmp13460, tmp13462, 238);
__m512 tmp13479 = _mm512_shuffle_ps(tmp13463, tmp13465, 68);
__m512 tmp13480 = _mm512_shuffle_ps(tmp13463, tmp13465, 238);
__m512 tmp13481 = _mm512_shuffle_ps(tmp13464, tmp13466, 68);
__m512 tmp13482 = _mm512_shuffle_ps(tmp13464, tmp13466, 238);
__m512 tmp13483 = _mm512_shuffle_ps(tmp13467, tmp13469, 68);
__m512 tmp13484 = _mm512_shuffle_ps(tmp13467, tmp13469, 238);
__m512 tmp13485 = _mm512_shuffle_ps(tmp13468, tmp13470, 68);
__m512 tmp13486 = _mm512_shuffle_ps(tmp13468, tmp13470, 238);
__m512 tmp13487 = _mm512_shuffle_ps(tmp13471, tmp13473, 68);
__m512 tmp13488 = _mm512_shuffle_ps(tmp13471, tmp13473, 238);
__m512 tmp13489 = _mm512_shuffle_ps(tmp13472, tmp13474, 68);
__m512 tmp13490 = _mm512_shuffle_ps(tmp13472, tmp13474, 238);
__m512 tmp13491 = _mm512_shuffle_f32x4(tmp13475, tmp13479, 136);
__m512 tmp13492 = _mm512_shuffle_f32x4(tmp13475, tmp13479, 221);
__m512 tmp13493 = _mm512_shuffle_f32x4(tmp13476, tmp13480, 136);
__m512 tmp13494 = _mm512_shuffle_f32x4(tmp13476, tmp13480, 221);
__m512 tmp13495 = _mm512_shuffle_f32x4(tmp13477, tmp13481, 136);
__m512 tmp13496 = _mm512_shuffle_f32x4(tmp13477, tmp13481, 221);
__m512 tmp13497 = _mm512_shuffle_f32x4(tmp13478, tmp13482, 136);
__m512 tmp13498 = _mm512_shuffle_f32x4(tmp13478, tmp13482, 221);
__m512 tmp13499 = _mm512_shuffle_f32x4(tmp13483, tmp13487, 136);
__m512 tmp13500 = _mm512_shuffle_f32x4(tmp13483, tmp13487, 221);
__m512 tmp13501 = _mm512_shuffle_f32x4(tmp13484, tmp13488, 136);
__m512 tmp13502 = _mm512_shuffle_f32x4(tmp13484, tmp13488, 221);
__m512 tmp13503 = _mm512_shuffle_f32x4(tmp13485, tmp13489, 136);
__m512 tmp13504 = _mm512_shuffle_f32x4(tmp13485, tmp13489, 221);
__m512 tmp13505 = _mm512_shuffle_f32x4(tmp13486, tmp13490, 136);
__m512 tmp13506 = _mm512_shuffle_f32x4(tmp13486, tmp13490, 221);
wt353 = _mm512_shuffle_f32x4(tmp13491, tmp13499, 136);
wt361 = _mm512_shuffle_f32x4(tmp13491, tmp13499, 221);
wt354 = _mm512_shuffle_f32x4(tmp13493, tmp13501, 136);
wt362 = _mm512_shuffle_f32x4(tmp13493, tmp13501, 221);
wt355 = _mm512_shuffle_f32x4(tmp13495, tmp13503, 136);
wt363 = _mm512_shuffle_f32x4(tmp13495, tmp13503, 221);
wt356 = _mm512_shuffle_f32x4(tmp13497, tmp13505, 136);
wt364 = _mm512_shuffle_f32x4(tmp13497, tmp13505, 221);
wt357 = _mm512_shuffle_f32x4(tmp13492, tmp13500, 136);
wt365 = _mm512_shuffle_f32x4(tmp13492, tmp13500, 221);
wt358 = _mm512_shuffle_f32x4(tmp13494, tmp13502, 136);
wt366 = _mm512_shuffle_f32x4(tmp13494, tmp13502, 221);
wt359 = _mm512_shuffle_f32x4(tmp13496, tmp13504, 136);
wt367 = _mm512_shuffle_f32x4(tmp13496, tmp13504, 221);
wt360 = _mm512_shuffle_f32x4(tmp13498, tmp13506, 136);
wt368 = _mm512_shuffle_f32x4(tmp13498, tmp13506, 221);
wt353 = _mm512_mul_ps(wt353, postMul37);
wt354 = _mm512_mul_ps(wt354, postMul37);
wt355 = _mm512_mul_ps(wt355, postMul37);
wt356 = _mm512_mul_ps(wt356, postMul37);
wt357 = _mm512_mul_ps(wt357, postMul37);
wt358 = _mm512_mul_ps(wt358, postMul37);
wt359 = _mm512_mul_ps(wt359, postMul37);
wt360 = _mm512_mul_ps(wt360, postMul37);
wt361 = _mm512_mul_ps(wt361, postMul37);
wt362 = _mm512_mul_ps(wt362, postMul37);
wt363 = _mm512_mul_ps(wt363, postMul37);
wt364 = _mm512_mul_ps(wt364, postMul37);
wt365 = _mm512_mul_ps(wt365, postMul37);
wt366 = _mm512_mul_ps(wt366, postMul37);
wt367 = _mm512_mul_ps(wt367, postMul37);
wt368 = _mm512_mul_ps(wt368, postMul37);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c30)+(ptrdiff_t)0, 63>>cut15, wt353);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c30)+(ptrdiff_t)0, 63>>cut15, wt354);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c30)+(ptrdiff_t)0, 63>>cut15, wt355);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c30)+(ptrdiff_t)0, 63>>cut15, wt356);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c30)+(ptrdiff_t)0, 63>>cut15, wt357);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c30)+(ptrdiff_t)0, 63>>cut15, wt358);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c30)+(ptrdiff_t)0, 63>>cut15, wt359);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c30)+(ptrdiff_t)0, 63>>cut15, wt360);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c30)+(ptrdiff_t)0, 63>>cut15, wt361);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c30)+(ptrdiff_t)0, 63>>cut15, wt362);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c30)+(ptrdiff_t)0, 63>>cut15, wt363);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c30)+(ptrdiff_t)0, 63>>cut15, wt364);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c30)+(ptrdiff_t)0, 63>>cut15, wt365);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c30)+(ptrdiff_t)0, 63>>cut15, wt366);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c30)+(ptrdiff_t)0, 63>>cut15, wt367);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c30)+(ptrdiff_t)0, 63>>cut15, wt368);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt353);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt354);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt355);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt356);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt357);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt358);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt359);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt360);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt361);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt362);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt363);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt364);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt365);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt366);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt367);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c30)+(ptrdiff_t)3072, 4032>>cut15, wt368);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt353);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt354);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt355);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt356);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt357);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt358);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt359);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt360);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt361);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt362);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt363);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt364);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt365);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt366);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt367);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c30)+(ptrdiff_t)6144, 258048>>cut15, wt368);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(1+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt353);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(2+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt354);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(3+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt355);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(4+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt356);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(5+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt357);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(6+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt358);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(7+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt359);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(8+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt360);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(9+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt361);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(10+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt362);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(11+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt363);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(12+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt364);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(13+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt365);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(14+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt366);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(15+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt367);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l49+4*cut15+24*(16+16*c30)+(ptrdiff_t)9216, 65535-(262143>>cut15), wt368);
}
}
}
} else {
ptrdiff_t k116 = 496;
ptrdiff_t l48 = (size_t)(0+k116)/6;
ptrdiff_t cut14 = (size_t)(0+k116)%6;
__m512 sum278 = _mm512_maskz_loadu_ps(65535, biasPtr11+2048*i38+4*k116);
__m512i pmMul23 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd23 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo19 = _mm512_loadu_ps(bnPtr11+(ptrdiff_t)8*(k116+512*i38));
__m512 masHi19 = _mm512_maskz_loadu_ps(65535, bnPtr11+(ptrdiff_t)8*(k116+512*i38)+(ptrdiff_t)64);
__m512 postMul35 = _mm512_permutex2var_ps(masLo19, pmMul23, masHi19);
__m512 postAdd21 = _mm512_permutex2var_ps(masLo19, pmAdd23, masHi19);
sum278 = _mm512_fmadd_ps(sum278, postMul35, postAdd21);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*0+(ptrdiff_t)0, 63>>cut14, sum278);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*0+(ptrdiff_t)3072, 4032>>cut14, sum278);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*0+(ptrdiff_t)6144, 258048>>cut14, sum278);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*0+(ptrdiff_t)9216, 65535-(262143>>cut14), sum278);
ptrdiff_t c28 = 0;
for (; c28 != 8; ++c28) {
__m512 wt321 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)0);
__m512 wt322 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)512);
__m512 wt323 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)1024);
__m512 wt324 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)1536);
__m512 wt325 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)2048);
__m512 wt326 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)2560);
__m512 wt327 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)3072);
__m512 wt328 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)3584);
__m512 wt329 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)4096);
__m512 wt330 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)4608);
__m512 wt331 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)5120);
__m512 wt332 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)5632);
__m512 wt333 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)6144);
__m512 wt334 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)6656);
__m512 wt335 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)7168);
__m512 wt336 = _mm512_maskz_loadu_ps(65535, wtPtr11+262144*i38+512*k116+64*c28+(ptrdiff_t)7680);
__m512 tmp13507 = _mm512_unpacklo_ps(wt321, wt322);
__m512 tmp13508 = _mm512_unpackhi_ps(wt321, wt322);
__m512 tmp13509 = _mm512_unpacklo_ps(wt323, wt324);
__m512 tmp13510 = _mm512_unpackhi_ps(wt323, wt324);
__m512 tmp13511 = _mm512_unpacklo_ps(wt325, wt326);
__m512 tmp13512 = _mm512_unpackhi_ps(wt325, wt326);
__m512 tmp13513 = _mm512_unpacklo_ps(wt327, wt328);
__m512 tmp13514 = _mm512_unpackhi_ps(wt327, wt328);
__m512 tmp13515 = _mm512_unpacklo_ps(wt329, wt330);
__m512 tmp13516 = _mm512_unpackhi_ps(wt329, wt330);
__m512 tmp13517 = _mm512_unpacklo_ps(wt331, wt332);
__m512 tmp13518 = _mm512_unpackhi_ps(wt331, wt332);
__m512 tmp13519 = _mm512_unpacklo_ps(wt333, wt334);
__m512 tmp13520 = _mm512_unpackhi_ps(wt333, wt334);
__m512 tmp13521 = _mm512_unpacklo_ps(wt335, wt336);
__m512 tmp13522 = _mm512_unpackhi_ps(wt335, wt336);
__m512 tmp13523 = _mm512_shuffle_ps(tmp13507, tmp13509, 68);
__m512 tmp13524 = _mm512_shuffle_ps(tmp13507, tmp13509, 238);
__m512 tmp13525 = _mm512_shuffle_ps(tmp13508, tmp13510, 68);
__m512 tmp13526 = _mm512_shuffle_ps(tmp13508, tmp13510, 238);
__m512 tmp13527 = _mm512_shuffle_ps(tmp13511, tmp13513, 68);
__m512 tmp13528 = _mm512_shuffle_ps(tmp13511, tmp13513, 238);
__m512 tmp13529 = _mm512_shuffle_ps(tmp13512, tmp13514, 68);
__m512 tmp13530 = _mm512_shuffle_ps(tmp13512, tmp13514, 238);
__m512 tmp13531 = _mm512_shuffle_ps(tmp13515, tmp13517, 68);
__m512 tmp13532 = _mm512_shuffle_ps(tmp13515, tmp13517, 238);
__m512 tmp13533 = _mm512_shuffle_ps(tmp13516, tmp13518, 68);
__m512 tmp13534 = _mm512_shuffle_ps(tmp13516, tmp13518, 238);
__m512 tmp13535 = _mm512_shuffle_ps(tmp13519, tmp13521, 68);
__m512 tmp13536 = _mm512_shuffle_ps(tmp13519, tmp13521, 238);
__m512 tmp13537 = _mm512_shuffle_ps(tmp13520, tmp13522, 68);
__m512 tmp13538 = _mm512_shuffle_ps(tmp13520, tmp13522, 238);
__m512 tmp13539 = _mm512_shuffle_f32x4(tmp13523, tmp13527, 136);
__m512 tmp13540 = _mm512_shuffle_f32x4(tmp13523, tmp13527, 221);
__m512 tmp13541 = _mm512_shuffle_f32x4(tmp13524, tmp13528, 136);
__m512 tmp13542 = _mm512_shuffle_f32x4(tmp13524, tmp13528, 221);
__m512 tmp13543 = _mm512_shuffle_f32x4(tmp13525, tmp13529, 136);
__m512 tmp13544 = _mm512_shuffle_f32x4(tmp13525, tmp13529, 221);
__m512 tmp13545 = _mm512_shuffle_f32x4(tmp13526, tmp13530, 136);
__m512 tmp13546 = _mm512_shuffle_f32x4(tmp13526, tmp13530, 221);
__m512 tmp13547 = _mm512_shuffle_f32x4(tmp13531, tmp13535, 136);
__m512 tmp13548 = _mm512_shuffle_f32x4(tmp13531, tmp13535, 221);
__m512 tmp13549 = _mm512_shuffle_f32x4(tmp13532, tmp13536, 136);
__m512 tmp13550 = _mm512_shuffle_f32x4(tmp13532, tmp13536, 221);
__m512 tmp13551 = _mm512_shuffle_f32x4(tmp13533, tmp13537, 136);
__m512 tmp13552 = _mm512_shuffle_f32x4(tmp13533, tmp13537, 221);
__m512 tmp13553 = _mm512_shuffle_f32x4(tmp13534, tmp13538, 136);
__m512 tmp13554 = _mm512_shuffle_f32x4(tmp13534, tmp13538, 221);
wt321 = _mm512_shuffle_f32x4(tmp13539, tmp13547, 136);
wt329 = _mm512_shuffle_f32x4(tmp13539, tmp13547, 221);
wt322 = _mm512_shuffle_f32x4(tmp13541, tmp13549, 136);
wt330 = _mm512_shuffle_f32x4(tmp13541, tmp13549, 221);
wt323 = _mm512_shuffle_f32x4(tmp13543, tmp13551, 136);
wt331 = _mm512_shuffle_f32x4(tmp13543, tmp13551, 221);
wt324 = _mm512_shuffle_f32x4(tmp13545, tmp13553, 136);
wt332 = _mm512_shuffle_f32x4(tmp13545, tmp13553, 221);
wt325 = _mm512_shuffle_f32x4(tmp13540, tmp13548, 136);
wt333 = _mm512_shuffle_f32x4(tmp13540, tmp13548, 221);
wt326 = _mm512_shuffle_f32x4(tmp13542, tmp13550, 136);
wt334 = _mm512_shuffle_f32x4(tmp13542, tmp13550, 221);
wt327 = _mm512_shuffle_f32x4(tmp13544, tmp13552, 136);
wt335 = _mm512_shuffle_f32x4(tmp13544, tmp13552, 221);
wt328 = _mm512_shuffle_f32x4(tmp13546, tmp13554, 136);
wt336 = _mm512_shuffle_f32x4(tmp13546, tmp13554, 221);
wt321 = _mm512_mul_ps(wt321, postMul35);
wt322 = _mm512_mul_ps(wt322, postMul35);
wt323 = _mm512_mul_ps(wt323, postMul35);
wt324 = _mm512_mul_ps(wt324, postMul35);
wt325 = _mm512_mul_ps(wt325, postMul35);
wt326 = _mm512_mul_ps(wt326, postMul35);
wt327 = _mm512_mul_ps(wt327, postMul35);
wt328 = _mm512_mul_ps(wt328, postMul35);
wt329 = _mm512_mul_ps(wt329, postMul35);
wt330 = _mm512_mul_ps(wt330, postMul35);
wt331 = _mm512_mul_ps(wt331, postMul35);
wt332 = _mm512_mul_ps(wt332, postMul35);
wt333 = _mm512_mul_ps(wt333, postMul35);
wt334 = _mm512_mul_ps(wt334, postMul35);
wt335 = _mm512_mul_ps(wt335, postMul35);
wt336 = _mm512_mul_ps(wt336, postMul35);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(1+16*c28)+(ptrdiff_t)0, 63>>cut14, wt321);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(2+16*c28)+(ptrdiff_t)0, 63>>cut14, wt322);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(3+16*c28)+(ptrdiff_t)0, 63>>cut14, wt323);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(4+16*c28)+(ptrdiff_t)0, 63>>cut14, wt324);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(5+16*c28)+(ptrdiff_t)0, 63>>cut14, wt325);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(6+16*c28)+(ptrdiff_t)0, 63>>cut14, wt326);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(7+16*c28)+(ptrdiff_t)0, 63>>cut14, wt327);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(8+16*c28)+(ptrdiff_t)0, 63>>cut14, wt328);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(9+16*c28)+(ptrdiff_t)0, 63>>cut14, wt329);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(10+16*c28)+(ptrdiff_t)0, 63>>cut14, wt330);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(11+16*c28)+(ptrdiff_t)0, 63>>cut14, wt331);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(12+16*c28)+(ptrdiff_t)0, 63>>cut14, wt332);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(13+16*c28)+(ptrdiff_t)0, 63>>cut14, wt333);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(14+16*c28)+(ptrdiff_t)0, 63>>cut14, wt334);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(15+16*c28)+(ptrdiff_t)0, 63>>cut14, wt335);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(16+16*c28)+(ptrdiff_t)0, 63>>cut14, wt336);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(1+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt321);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(2+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt322);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(3+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt323);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(4+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt324);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(5+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt325);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(6+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt326);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(7+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt327);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(8+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt328);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(9+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt329);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(10+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt330);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(11+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt331);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(12+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt332);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(13+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt333);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(14+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt334);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(15+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt335);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(16+16*c28)+(ptrdiff_t)3072, 4032>>cut14, wt336);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(1+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt321);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(2+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt322);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(3+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt323);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(4+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt324);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(5+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt325);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(6+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt326);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(7+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt327);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(8+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt328);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(9+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt329);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(10+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt330);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(11+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt331);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(12+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt332);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(13+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt333);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(14+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt334);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(15+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt335);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+24*(16+16*c28)+(ptrdiff_t)6144, 258048>>cut14, wt336);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(1+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt321);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(2+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt322);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(3+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt323);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(4+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt324);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(5+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt325);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(6+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt326);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(7+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt327);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(8+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt328);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(9+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt329);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(10+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt330);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(11+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt331);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(12+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt332);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(13+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt333);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(14+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt334);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(15+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt335);
_mm512_mask_storeu_ps(arranged9+264192*i38+3096*l48+4*cut14+8*(16+16*c28)+(ptrdiff_t)9216, 65535-(262143>>cut14), wt336);
}
}
}
}
}

/*
 * Builds a threader task for ResNet50OneArrangeWts5Callee1 and hands it to
 * ResNet50ThreaderDo1 on the given team.
 *
 * team43:    threader team forwarded unchanged to ResNet50ThreaderDo1.
 * tensors59: pointer array stored in the task's `any1` field; the callee
 *            reads it back from there.
 *
 * The task is declared with nd1 = 3 and hull extents {8, 1, 1}.
 * NOTE(review): presumably the threader invokes the callee once per point of
 * that hull (8 invocations) -- confirm against ResNet50ThreaderDo1.
 */
static void ResNet50OneArrangeWts5(ResNet50ThreaderTeam1* team43, char** tensors59) {
    static const int hullExtents[3] = {8, 1, 1};
    ResNet50ThreaderTask1 job;
    int axis;

    job.nd1 = 3;
    for (axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = hullExtents[axis];
    }
    job.any1 = tensors59;
    job.callee1 = ResNet50OneArrangeWts5Callee1;

    ResNet50ThreaderDo1(team43, &job);
}

/*
 * Threader callee: copies one slice of the source tensor into the arranged
 * data buffer, 16 floats (64 bytes) per vector.
 *
 * task64: task descriptor; its `any1` field is a char** where entry [0] is
 *         the source buffer and entry [1] is the arranged destination.
 * pt37:   coordinates of this invocation; only pt37[1] (the slice index) is
 *         read.
 *
 * Layout, as fixed by the byte offsets below:
 *   - source rows are 3136 bytes apart and there are 128 of them
 *     (presumably one row per input channel -- TODO confirm);
 *   - slices 0..11 are 256 bytes wide (four vectors) and land at
 *     dst + 32768*slice + 256*row;
 *   - slice 12 is the 64-byte remainder (one vector) and lands at
 *     dst + 32768*slice + 64*row.
 * Every load/store uses a full 16-lane mask.
 */
static void ResNet50OneArrangeDats5Callee1(ResNet50ThreaderTask1* task64, int64_t* pt37) {
    char** bufs = task64->any1;
    const ptrdiff_t slice = pt37[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];
    const __mmask16 allLanes = 65535;
    ptrdiff_t row;

    if (slice != 12) {
        /* Full-width slice: move four consecutive vectors per row. */
        for (row = 0; row < 128; ++row) {
            const ptrdiff_t srcOff = 256*slice+3136*row;
            const ptrdiff_t dstOff = 32768*slice+256*row;
            /* All four loads are issued before any store, as in the generated original. */
            __m512 v0 = _mm512_maskz_loadu_ps(allLanes, src+srcOff+(ptrdiff_t)0);
            __m512 v1 = _mm512_maskz_loadu_ps(allLanes, src+srcOff+(ptrdiff_t)64);
            __m512 v2 = _mm512_maskz_loadu_ps(allLanes, src+srcOff+(ptrdiff_t)128);
            __m512 v3 = _mm512_maskz_loadu_ps(allLanes, src+srcOff+(ptrdiff_t)192);
            _mm512_mask_storeu_ps(dst+dstOff+(ptrdiff_t)0, allLanes, v0);
            _mm512_mask_storeu_ps(dst+dstOff+(ptrdiff_t)64, allLanes, v1);
            _mm512_mask_storeu_ps(dst+dstOff+(ptrdiff_t)128, allLanes, v2);
            _mm512_mask_storeu_ps(dst+dstOff+(ptrdiff_t)192, allLanes, v3);
        }
        return;
    }

    /* Remainder slice (index 12): a single vector per row, packed 64 bytes apart. */
    for (row = 0; row < 128; ++row) {
        __m512 v = _mm512_maskz_loadu_ps(allLanes, src+256*slice+3136*row+(ptrdiff_t)0);
        _mm512_mask_storeu_ps(dst+32768*slice+64*row+(ptrdiff_t)0, allLanes, v);
    }
}

/*
 * Builds a threader task for ResNet50OneArrangeDats5Callee1 and hands it to
 * ResNet50ThreaderDo1 on the given team.
 *
 * team44:    threader team forwarded unchanged to ResNet50ThreaderDo1.
 * tensors61: pointer array stored in the task's `any1` field; the callee
 *            reads entries [0] (source) and [1] (arranged destination).
 *
 * The task is declared with nd1 = 4 and hull extents {1, 13, 1, 1}. The
 * callee reads coordinate [1], whose extent of 13 matches its slice
 * handling (indices 0..11 full-width, index 12 the remainder).
 */
static void ResNet50OneArrangeDats5(ResNet50ThreaderTeam1* team44, char** tensors61) {
    static const int hullExtents[4] = {1, 13, 1, 1};
    ResNet50ThreaderTask1 job;
    int axis;

    job.nd1 = 4;
    for (axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = hullExtents[axis];
    }
    job.any1 = tensors61;
    job.callee1 = ResNet50OneArrangeDats5Callee1;

    ResNet50ThreaderDo1(team44, &job);
}

static void ResNet50OneApply5Callee1(ResNet50ThreaderTask1* task66, int64_t* pt38) {
void** pair16 = task66->any1;
char** tensors64 = pair16[0];
ptrdiff_t e19 = 0;
ptrdiff_t g22 = 0;
ptrdiff_t d13 = pt38[1];
ptrdiff_t w55 = pt38[0];
char*restrict arrangedWts5 = tensors64[0]+1712128*e19+(ptrdiff_t)264192*1*g22;
char*restrict arrangedDats5 = tensors64[1]+2618560*e19+(ptrdiff_t)401408*1*g22;
char*restrict datPtr19 = tensors64[2]+(ptrdiff_t)1605632*1*g22;
char*restrict datPtr20 = tensors64[3]+(ptrdiff_t)1605632*1*g22;
ptrdiff_t ii15 = 1;
for (ptrdiff_t i40 = 0; i40 < ii15; ++i40) {
ptrdiff_t j33 = 1*d13;
ptrdiff_t jj36 = j33+0;
for (; j33 != 12; ++j33) {
ptrdiff_t k120 = 4*w55;
ptrdiff_t kk37 = k120+(w55 < 20 ? 3 : 5);
for (; k120 != 85; ++k120) {
ptrdiff_t s26 = -1;
__m512 sum281 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)24));
__m512 sum285 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)28));
__m512 sum289 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)32));
__m512 sum293 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)36));
__m512 sum297 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)40));
__m512 sum301 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)44));
__m512 sum282 = sum281;
__m512 sum283 = sum281;
__m512 sum284 = sum281;
__m512 sum286 = sum285;
__m512 sum287 = sum285;
__m512 sum288 = sum285;
__m512 sum290 = sum289;
__m512 sum291 = sum289;
__m512 sum292 = sum289;
__m512 sum294 = sum293;
__m512 sum295 = sum293;
__m512 sum296 = sum293;
__m512 sum298 = sum297;
__m512 sum299 = sum297;
__m512 sum300 = sum297;
__m512 sum302 = sum301;
__m512 sum303 = sum301;
__m512 sum304 = sum301;
for (s26 = 0; s26 < 128; ++s26) {
__m512 dat1916 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s26+(ptrdiff_t)0);
__m512 dat1917 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s26+(ptrdiff_t)64);
__m512 dat1918 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s26+(ptrdiff_t)128);
__m512 dat1919 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s26+(ptrdiff_t)192);
__m512 wt369 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)24));
sum281 = _mm512_fmadd_ps(wt369, dat1916, sum281);
sum282 = _mm512_fmadd_ps(wt369, dat1917, sum282);
sum283 = _mm512_fmadd_ps(wt369, dat1918, sum283);
sum284 = _mm512_fmadd_ps(wt369, dat1919, sum284);
__m512 wt370 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)28));
sum285 = _mm512_fmadd_ps(wt370, dat1916, sum285);
sum286 = _mm512_fmadd_ps(wt370, dat1917, sum286);
sum287 = _mm512_fmadd_ps(wt370, dat1918, sum287);
sum288 = _mm512_fmadd_ps(wt370, dat1919, sum288);
__m512 wt371 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)32));
sum289 = _mm512_fmadd_ps(wt371, dat1916, sum289);
sum290 = _mm512_fmadd_ps(wt371, dat1917, sum290);
sum291 = _mm512_fmadd_ps(wt371, dat1918, sum291);
sum292 = _mm512_fmadd_ps(wt371, dat1919, sum292);
__m512 wt372 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)36));
sum293 = _mm512_fmadd_ps(wt372, dat1916, sum293);
sum294 = _mm512_fmadd_ps(wt372, dat1917, sum294);
sum295 = _mm512_fmadd_ps(wt372, dat1918, sum295);
sum296 = _mm512_fmadd_ps(wt372, dat1919, sum296);
__m512 wt373 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)40));
sum297 = _mm512_fmadd_ps(wt373, dat1916, sum297);
sum298 = _mm512_fmadd_ps(wt373, dat1917, sum298);
sum299 = _mm512_fmadd_ps(wt373, dat1918, sum299);
sum300 = _mm512_fmadd_ps(wt373, dat1919, sum300);
__m512 wt374 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+24*s26+(ptrdiff_t)44));
sum301 = _mm512_fmadd_ps(wt374, dat1916, sum301);
sum302 = _mm512_fmadd_ps(wt374, dat1917, sum302);
sum303 = _mm512_fmadd_ps(wt374, dat1918, sum303);
sum304 = _mm512_fmadd_ps(wt374, dat1919, sum304);
}
sum281 = _mm512_add_ps(sum281, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)0));
sum282 = _mm512_add_ps(sum282, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)64));
sum283 = _mm512_add_ps(sum283, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)128));
sum284 = _mm512_add_ps(sum284, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)192));
sum281 = _mm512_max_ps(_mm512_setzero_ps(), sum281);
sum282 = _mm512_max_ps(_mm512_setzero_ps(), sum282);
sum283 = _mm512_max_ps(_mm512_setzero_ps(), sum283);
sum284 = _mm512_max_ps(_mm512_setzero_ps(), sum284);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)0, 65535, sum281);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)64, 65535, sum282);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)128, 65535, sum283);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)192, 65535, sum284);
sum285 = _mm512_add_ps(sum285, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3136));
sum286 = _mm512_add_ps(sum286, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3200));
sum287 = _mm512_add_ps(sum287, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3264));
sum288 = _mm512_add_ps(sum288, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3328));
sum285 = _mm512_max_ps(_mm512_setzero_ps(), sum285);
sum286 = _mm512_max_ps(_mm512_setzero_ps(), sum286);
sum287 = _mm512_max_ps(_mm512_setzero_ps(), sum287);
sum288 = _mm512_max_ps(_mm512_setzero_ps(), sum288);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3136, 65535, sum285);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3200, 65535, sum286);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3264, 65535, sum287);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3328, 65535, sum288);
sum289 = _mm512_add_ps(sum289, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6272));
sum290 = _mm512_add_ps(sum290, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6336));
sum291 = _mm512_add_ps(sum291, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6400));
sum292 = _mm512_add_ps(sum292, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6464));
sum289 = _mm512_max_ps(_mm512_setzero_ps(), sum289);
sum290 = _mm512_max_ps(_mm512_setzero_ps(), sum290);
sum291 = _mm512_max_ps(_mm512_setzero_ps(), sum291);
sum292 = _mm512_max_ps(_mm512_setzero_ps(), sum292);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6272, 65535, sum289);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6336, 65535, sum290);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6400, 65535, sum291);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)6464, 65535, sum292);
sum293 = _mm512_add_ps(sum293, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9408));
sum294 = _mm512_add_ps(sum294, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9472));
sum295 = _mm512_add_ps(sum295, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9536));
sum296 = _mm512_add_ps(sum296, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9600));
sum293 = _mm512_max_ps(_mm512_setzero_ps(), sum293);
sum294 = _mm512_max_ps(_mm512_setzero_ps(), sum294);
sum295 = _mm512_max_ps(_mm512_setzero_ps(), sum295);
sum296 = _mm512_max_ps(_mm512_setzero_ps(), sum296);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9408, 65535, sum293);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9472, 65535, sum294);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9536, 65535, sum295);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)9600, 65535, sum296);
sum297 = _mm512_add_ps(sum297, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12544));
sum298 = _mm512_add_ps(sum298, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12608));
sum299 = _mm512_add_ps(sum299, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12672));
sum300 = _mm512_add_ps(sum300, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12736));
sum297 = _mm512_max_ps(_mm512_setzero_ps(), sum297);
sum298 = _mm512_max_ps(_mm512_setzero_ps(), sum298);
sum299 = _mm512_max_ps(_mm512_setzero_ps(), sum299);
sum300 = _mm512_max_ps(_mm512_setzero_ps(), sum300);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12544, 65535, sum297);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12608, 65535, sum298);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12672, 65535, sum299);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)12736, 65535, sum300);
sum301 = _mm512_add_ps(sum301, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15680));
sum302 = _mm512_add_ps(sum302, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15744));
sum303 = _mm512_add_ps(sum303, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15808));
sum304 = _mm512_add_ps(sum304, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15872));
sum301 = _mm512_max_ps(_mm512_setzero_ps(), sum301);
sum302 = _mm512_max_ps(_mm512_setzero_ps(), sum302);
sum303 = _mm512_max_ps(_mm512_setzero_ps(), sum303);
sum304 = _mm512_max_ps(_mm512_setzero_ps(), sum304);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15680, 65535, sum301);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15744, 65535, sum302);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15808, 65535, sum303);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)15872, 65535, sum304);
if (k120 >= kk37) return;
}
ptrdiff_t s27 = -1;
__m512 sum305 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+8*s27+(ptrdiff_t)8));
__m512 sum309 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+8*s27+(ptrdiff_t)12));
__m512 sum306 = sum305;
__m512 sum307 = sum305;
__m512 sum308 = sum305;
__m512 sum310 = sum309;
__m512 sum311 = sum309;
__m512 sum312 = sum309;
for (s27 = 0; s27 < 128; ++s27) {
__m512 dat1920 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s27+(ptrdiff_t)0);
__m512 dat1921 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s27+(ptrdiff_t)64);
__m512 dat1922 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s27+(ptrdiff_t)128);
__m512 dat1923 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+256*s27+(ptrdiff_t)192);
__m512 wt375 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+8*s27+(ptrdiff_t)8));
sum305 = _mm512_fmadd_ps(wt375, dat1920, sum305);
sum306 = _mm512_fmadd_ps(wt375, dat1921, sum306);
sum307 = _mm512_fmadd_ps(wt375, dat1922, sum307);
sum308 = _mm512_fmadd_ps(wt375, dat1923, sum308);
__m512 wt376 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k120+8*s27+(ptrdiff_t)12));
sum309 = _mm512_fmadd_ps(wt376, dat1920, sum309);
sum310 = _mm512_fmadd_ps(wt376, dat1921, sum310);
sum311 = _mm512_fmadd_ps(wt376, dat1922, sum311);
sum312 = _mm512_fmadd_ps(wt376, dat1923, sum312);
}
sum305 = _mm512_add_ps(sum305, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)0));
sum306 = _mm512_add_ps(sum306, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)64));
sum307 = _mm512_add_ps(sum307, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)128));
sum308 = _mm512_add_ps(sum308, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)192));
sum305 = _mm512_max_ps(_mm512_setzero_ps(), sum305);
sum306 = _mm512_max_ps(_mm512_setzero_ps(), sum306);
sum307 = _mm512_max_ps(_mm512_setzero_ps(), sum307);
sum308 = _mm512_max_ps(_mm512_setzero_ps(), sum308);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)0, 65535, sum305);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)64, 65535, sum306);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)128, 65535, sum307);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)192, 65535, sum308);
sum309 = _mm512_add_ps(sum309, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3136));
sum310 = _mm512_add_ps(sum310, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3200));
sum311 = _mm512_add_ps(sum311, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3264));
sum312 = _mm512_add_ps(sum312, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3328));
sum309 = _mm512_max_ps(_mm512_setzero_ps(), sum309);
sum310 = _mm512_max_ps(_mm512_setzero_ps(), sum310);
sum311 = _mm512_max_ps(_mm512_setzero_ps(), sum311);
sum312 = _mm512_max_ps(_mm512_setzero_ps(), sum312);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3136, 65535, sum309);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3200, 65535, sum310);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3264, 65535, sum311);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k120+(ptrdiff_t)3328, 65535, sum312);
if (j33 >= jj36) return;
}
ptrdiff_t k121 = 4*w55;
ptrdiff_t kk38 = k121+(w55 < 20 ? 3 : 5);
for (; k121 != 85; ++k121) {
ptrdiff_t s28 = -1;
__m512 sum313 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)24));
__m512 sum314 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)28));
__m512 sum315 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)32));
__m512 sum316 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)36));
__m512 sum317 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)40));
__m512 sum318 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)44));
for (s28 = 0; s28 < 128; ++s28) {
__m512 dat1924 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+64*s28+(ptrdiff_t)0);
__m512 wt377 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)24));
sum313 = _mm512_fmadd_ps(wt377, dat1924, sum313);
__m512 wt378 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)28));
sum314 = _mm512_fmadd_ps(wt378, dat1924, sum314);
__m512 wt379 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)32));
sum315 = _mm512_fmadd_ps(wt379, dat1924, sum315);
__m512 wt380 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)36));
sum316 = _mm512_fmadd_ps(wt380, dat1924, sum316);
__m512 wt381 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)40));
sum317 = _mm512_fmadd_ps(wt381, dat1924, sum317);
__m512 wt382 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+24*s28+(ptrdiff_t)44));
sum318 = _mm512_fmadd_ps(wt382, dat1924, sum318);
}
sum313 = _mm512_add_ps(sum313, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)0));
sum313 = _mm512_max_ps(_mm512_setzero_ps(), sum313);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)0, 65535, sum313);
sum314 = _mm512_add_ps(sum314, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)3136));
sum314 = _mm512_max_ps(_mm512_setzero_ps(), sum314);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)3136, 65535, sum314);
sum315 = _mm512_add_ps(sum315, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)6272));
sum315 = _mm512_max_ps(_mm512_setzero_ps(), sum315);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)6272, 65535, sum315);
sum316 = _mm512_add_ps(sum316, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)9408));
sum316 = _mm512_max_ps(_mm512_setzero_ps(), sum316);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)9408, 65535, sum316);
sum317 = _mm512_add_ps(sum317, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)12544));
sum317 = _mm512_max_ps(_mm512_setzero_ps(), sum317);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)12544, 65535, sum317);
sum318 = _mm512_add_ps(sum318, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)15680));
sum318 = _mm512_max_ps(_mm512_setzero_ps(), sum318);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)15680, 65535, sum318);
if (k121 >= kk38) return;
}
ptrdiff_t s29 = -1;
__m512 sum319 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+8*s29+(ptrdiff_t)8));
__m512 sum320 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+8*s29+(ptrdiff_t)12));
for (s29 = 0; s29 < 128; ++s29) {
__m512 dat1925 = _mm512_loadu_ps(arrangedDats5+401408*i40+32768*j33+64*s29+(ptrdiff_t)0);
__m512 wt383 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+8*s29+(ptrdiff_t)8));
sum319 = _mm512_fmadd_ps(wt383, dat1925, sum319);
__m512 wt384 = _mm512_set1_ps(*(float*)(arrangedWts5+264192*i40+3096*k121+8*s29+(ptrdiff_t)12));
sum320 = _mm512_fmadd_ps(wt384, dat1925, sum320);
}
sum319 = _mm512_add_ps(sum319, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)0));
sum319 = _mm512_max_ps(_mm512_setzero_ps(), sum319);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)0, 65535, sum319);
sum320 = _mm512_add_ps(sum320, _mm512_maskz_loadu_ps(65535, datPtr19+1605632*i40+256*j33+18816*k121+(ptrdiff_t)3136));
sum320 = _mm512_max_ps(_mm512_setzero_ps(), sum320);
_mm512_mask_storeu_ps(datPtr20+1605632*i40+256*j33+18816*k121+(ptrdiff_t)3136, 65535, sum320);
}
}

/*
 * Dispatch ResNet50OneApply5Callee1 through the threader.
 *
 * team45    - threader team handed unchanged to ResNet50ThreaderDo1.
 * tensors63 - tensor pointer array; stored in slot 0 of a two-slot
 *             pointer pair (slot 1 is null) that the task carries in any1.
 *
 * The task is declared with nd1 = 3 and hull1 = {21, 13, 1}.
 * NOTE(review): presumably hull1[d] is the extent of task dimension d as
 * interpreted by the threader -- confirm against ResNet50ThreaderDo1.
 * The pair and the task live on this function's stack, so this relies on
 * ResNet50ThreaderDo1 not using them after it returns -- TODO confirm.
 */
static void ResNet50OneApply5(ResNet50ThreaderTeam1* team45, char** tensors63) {
    ResNet50ThreaderTask1 job;
    void* payload[2];

    /* Argument pair for the callee: tensor array first, null second. */
    payload[0] = tensors63;
    payload[1] = 0;

    /* Three-dimensional task hull. */
    job.nd1 = 3;
    job.hull1[2] = 1;
    job.hull1[1] = 13;
    job.hull1[0] = 21;

    job.any1 = payload;
    job.callee1 = ResNet50OneApply5Callee1;

    ResNet50ThreaderDo1(team45, &job);
}

static void ResNet50OneArrangeWts6Callee1(ResNet50ThreaderTask1* task68, int64_t* pt39) {
char** tensors66 = task68->any1;
ptrdiff_t b58 = pt39[0];
char*restrict wtPtr12 = tensors66[0]+(ptrdiff_t)3340*0+(ptrdiff_t)262144*0;
char*restrict biasPtr12 = tensors66[1]+(ptrdiff_t)512*0;
char*restrict bnPtr12 = tensors66[2]+(ptrdiff_t)8*128*0;
char*restrict arranged11 = tensors66[3]+(ptrdiff_t)428032*0+(ptrdiff_t)262656*0;
ptrdiff_t ii16 = 1;
for (ptrdiff_t i41 = 0; i41 < ii16; ++i41) {
ptrdiff_t j34 = 1*b58;
ptrdiff_t jj37 = j34+1;
for (; j34 < jj37; ++j34) {
if (j34 < 7) {
ptrdiff_t k123 = 0+16*(j34-0);
ptrdiff_t l51 = (size_t)(0+k123)/6;
ptrdiff_t cut17 = (size_t)(0+k123)%6;
switch (cut17) {
case 0:;
case 2: {
__m512 sum322 = _mm512_maskz_loadu_ps(65535, biasPtr12+512*i41+4*k123);
__m512i pmMul24 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd24 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo20 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k123+128*i41));
__m512 masHi20 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k123+128*i41)+(ptrdiff_t)64);
__m512 postMul39 = _mm512_permutex2var_ps(masLo20, pmMul24, masHi20);
__m512 postAdd25 = _mm512_permutex2var_ps(masLo20, pmAdd24, masHi20);
sum322 = _mm512_fmadd_ps(sum322, postMul39, postAdd25);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)0, 63>>cut17, sum322);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)12288, 4032>>cut17, sum322);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)24576, 65535-(4095>>cut17), sum322);
ptrdiff_t c33 = 0;
for (; c33 != 32; ++c33) {
__m512 wt401 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)0);
__m512 wt402 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)2048);
__m512 wt403 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)4096);
__m512 wt404 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)6144);
__m512 wt405 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)8192);
__m512 wt406 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)10240);
__m512 wt407 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)12288);
__m512 wt408 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)14336);
__m512 wt409 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)16384);
__m512 wt410 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)18432);
__m512 wt411 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)20480);
__m512 wt412 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)22528);
__m512 wt413 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)24576);
__m512 wt414 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)26624);
__m512 wt415 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)28672);
__m512 wt416 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c33+(ptrdiff_t)30720);
__m512 tmp13555 = _mm512_unpacklo_ps(wt401, wt402);
__m512 tmp13556 = _mm512_unpackhi_ps(wt401, wt402);
__m512 tmp13557 = _mm512_unpacklo_ps(wt403, wt404);
__m512 tmp13558 = _mm512_unpackhi_ps(wt403, wt404);
__m512 tmp13559 = _mm512_unpacklo_ps(wt405, wt406);
__m512 tmp13560 = _mm512_unpackhi_ps(wt405, wt406);
__m512 tmp13561 = _mm512_unpacklo_ps(wt407, wt408);
__m512 tmp13562 = _mm512_unpackhi_ps(wt407, wt408);
__m512 tmp13563 = _mm512_unpacklo_ps(wt409, wt410);
__m512 tmp13564 = _mm512_unpackhi_ps(wt409, wt410);
__m512 tmp13565 = _mm512_unpacklo_ps(wt411, wt412);
__m512 tmp13566 = _mm512_unpackhi_ps(wt411, wt412);
__m512 tmp13567 = _mm512_unpacklo_ps(wt413, wt414);
__m512 tmp13568 = _mm512_unpackhi_ps(wt413, wt414);
__m512 tmp13569 = _mm512_unpacklo_ps(wt415, wt416);
__m512 tmp13570 = _mm512_unpackhi_ps(wt415, wt416);
__m512 tmp13571 = _mm512_shuffle_ps(tmp13555, tmp13557, 68);
__m512 tmp13572 = _mm512_shuffle_ps(tmp13555, tmp13557, 238);
__m512 tmp13573 = _mm512_shuffle_ps(tmp13556, tmp13558, 68);
__m512 tmp13574 = _mm512_shuffle_ps(tmp13556, tmp13558, 238);
__m512 tmp13575 = _mm512_shuffle_ps(tmp13559, tmp13561, 68);
__m512 tmp13576 = _mm512_shuffle_ps(tmp13559, tmp13561, 238);
__m512 tmp13577 = _mm512_shuffle_ps(tmp13560, tmp13562, 68);
__m512 tmp13578 = _mm512_shuffle_ps(tmp13560, tmp13562, 238);
__m512 tmp13579 = _mm512_shuffle_ps(tmp13563, tmp13565, 68);
__m512 tmp13580 = _mm512_shuffle_ps(tmp13563, tmp13565, 238);
__m512 tmp13581 = _mm512_shuffle_ps(tmp13564, tmp13566, 68);
__m512 tmp13582 = _mm512_shuffle_ps(tmp13564, tmp13566, 238);
__m512 tmp13583 = _mm512_shuffle_ps(tmp13567, tmp13569, 68);
__m512 tmp13584 = _mm512_shuffle_ps(tmp13567, tmp13569, 238);
__m512 tmp13585 = _mm512_shuffle_ps(tmp13568, tmp13570, 68);
__m512 tmp13586 = _mm512_shuffle_ps(tmp13568, tmp13570, 238);
__m512 tmp13587 = _mm512_shuffle_f32x4(tmp13571, tmp13575, 136);
__m512 tmp13588 = _mm512_shuffle_f32x4(tmp13571, tmp13575, 221);
__m512 tmp13589 = _mm512_shuffle_f32x4(tmp13572, tmp13576, 136);
__m512 tmp13590 = _mm512_shuffle_f32x4(tmp13572, tmp13576, 221);
__m512 tmp13591 = _mm512_shuffle_f32x4(tmp13573, tmp13577, 136);
__m512 tmp13592 = _mm512_shuffle_f32x4(tmp13573, tmp13577, 221);
__m512 tmp13593 = _mm512_shuffle_f32x4(tmp13574, tmp13578, 136);
__m512 tmp13594 = _mm512_shuffle_f32x4(tmp13574, tmp13578, 221);
__m512 tmp13595 = _mm512_shuffle_f32x4(tmp13579, tmp13583, 136);
__m512 tmp13596 = _mm512_shuffle_f32x4(tmp13579, tmp13583, 221);
__m512 tmp13597 = _mm512_shuffle_f32x4(tmp13580, tmp13584, 136);
__m512 tmp13598 = _mm512_shuffle_f32x4(tmp13580, tmp13584, 221);
__m512 tmp13599 = _mm512_shuffle_f32x4(tmp13581, tmp13585, 136);
__m512 tmp13600 = _mm512_shuffle_f32x4(tmp13581, tmp13585, 221);
__m512 tmp13601 = _mm512_shuffle_f32x4(tmp13582, tmp13586, 136);
__m512 tmp13602 = _mm512_shuffle_f32x4(tmp13582, tmp13586, 221);
wt401 = _mm512_shuffle_f32x4(tmp13587, tmp13595, 136);
wt409 = _mm512_shuffle_f32x4(tmp13587, tmp13595, 221);
wt402 = _mm512_shuffle_f32x4(tmp13589, tmp13597, 136);
wt410 = _mm512_shuffle_f32x4(tmp13589, tmp13597, 221);
wt403 = _mm512_shuffle_f32x4(tmp13591, tmp13599, 136);
wt411 = _mm512_shuffle_f32x4(tmp13591, tmp13599, 221);
wt404 = _mm512_shuffle_f32x4(tmp13593, tmp13601, 136);
wt412 = _mm512_shuffle_f32x4(tmp13593, tmp13601, 221);
wt405 = _mm512_shuffle_f32x4(tmp13588, tmp13596, 136);
wt413 = _mm512_shuffle_f32x4(tmp13588, tmp13596, 221);
wt406 = _mm512_shuffle_f32x4(tmp13590, tmp13598, 136);
wt414 = _mm512_shuffle_f32x4(tmp13590, tmp13598, 221);
wt407 = _mm512_shuffle_f32x4(tmp13592, tmp13600, 136);
wt415 = _mm512_shuffle_f32x4(tmp13592, tmp13600, 221);
wt408 = _mm512_shuffle_f32x4(tmp13594, tmp13602, 136);
wt416 = _mm512_shuffle_f32x4(tmp13594, tmp13602, 221);
wt401 = _mm512_mul_ps(wt401, postMul39);
wt402 = _mm512_mul_ps(wt402, postMul39);
wt403 = _mm512_mul_ps(wt403, postMul39);
wt404 = _mm512_mul_ps(wt404, postMul39);
wt405 = _mm512_mul_ps(wt405, postMul39);
wt406 = _mm512_mul_ps(wt406, postMul39);
wt407 = _mm512_mul_ps(wt407, postMul39);
wt408 = _mm512_mul_ps(wt408, postMul39);
wt409 = _mm512_mul_ps(wt409, postMul39);
wt410 = _mm512_mul_ps(wt410, postMul39);
wt411 = _mm512_mul_ps(wt411, postMul39);
wt412 = _mm512_mul_ps(wt412, postMul39);
wt413 = _mm512_mul_ps(wt413, postMul39);
wt414 = _mm512_mul_ps(wt414, postMul39);
wt415 = _mm512_mul_ps(wt415, postMul39);
wt416 = _mm512_mul_ps(wt416, postMul39);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c33)+(ptrdiff_t)0, 63>>cut17, wt401);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c33)+(ptrdiff_t)0, 63>>cut17, wt402);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c33)+(ptrdiff_t)0, 63>>cut17, wt403);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c33)+(ptrdiff_t)0, 63>>cut17, wt404);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c33)+(ptrdiff_t)0, 63>>cut17, wt405);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c33)+(ptrdiff_t)0, 63>>cut17, wt406);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c33)+(ptrdiff_t)0, 63>>cut17, wt407);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c33)+(ptrdiff_t)0, 63>>cut17, wt408);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c33)+(ptrdiff_t)0, 63>>cut17, wt409);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c33)+(ptrdiff_t)0, 63>>cut17, wt410);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c33)+(ptrdiff_t)0, 63>>cut17, wt411);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c33)+(ptrdiff_t)0, 63>>cut17, wt412);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c33)+(ptrdiff_t)0, 63>>cut17, wt413);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c33)+(ptrdiff_t)0, 63>>cut17, wt414);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c33)+(ptrdiff_t)0, 63>>cut17, wt415);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c33)+(ptrdiff_t)0, 63>>cut17, wt416);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt401);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt402);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt403);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt404);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt405);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt406);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt407);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt408);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt409);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt410);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt411);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt412);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt413);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt414);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt415);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c33)+(ptrdiff_t)12288, 4032>>cut17, wt416);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt401);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt402);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt403);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt404);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt405);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt406);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt407);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt408);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt409);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt410);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt411);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt412);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt413);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt414);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt415);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c33)+(ptrdiff_t)24576, 65535-(4095>>cut17), wt416);
}
break;
}
default: {
cut17 = 4;
__m512 sum323 = _mm512_maskz_loadu_ps(65535, biasPtr12+512*i41+4*k123);
__m512i pmMul25 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd25 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo21 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k123+128*i41));
__m512 masHi21 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k123+128*i41)+(ptrdiff_t)64);
__m512 postMul40 = _mm512_permutex2var_ps(masLo21, pmMul25, masHi21);
__m512 postAdd26 = _mm512_permutex2var_ps(masLo21, pmAdd25, masHi21);
sum323 = _mm512_fmadd_ps(sum323, postMul40, postAdd26);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)0, 63>>cut17, sum323);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)12288, 4032>>cut17, sum323);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)24576, 258048>>cut17, sum323);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*0+(ptrdiff_t)36864, 65535-(262143>>cut17), sum323);
ptrdiff_t c34 = 0;
for (; c34 != 32; ++c34) {
__m512 wt417 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)0);
__m512 wt418 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)2048);
__m512 wt419 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)4096);
__m512 wt420 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)6144);
__m512 wt421 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)8192);
__m512 wt422 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)10240);
__m512 wt423 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)12288);
__m512 wt424 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)14336);
__m512 wt425 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)16384);
__m512 wt426 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)18432);
__m512 wt427 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)20480);
__m512 wt428 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)22528);
__m512 wt429 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)24576);
__m512 wt430 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)26624);
__m512 wt431 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)28672);
__m512 wt432 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k123+64*c34+(ptrdiff_t)30720);
__m512 tmp13603 = _mm512_unpacklo_ps(wt417, wt418);
__m512 tmp13604 = _mm512_unpackhi_ps(wt417, wt418);
__m512 tmp13605 = _mm512_unpacklo_ps(wt419, wt420);
__m512 tmp13606 = _mm512_unpackhi_ps(wt419, wt420);
__m512 tmp13607 = _mm512_unpacklo_ps(wt421, wt422);
__m512 tmp13608 = _mm512_unpackhi_ps(wt421, wt422);
__m512 tmp13609 = _mm512_unpacklo_ps(wt423, wt424);
__m512 tmp13610 = _mm512_unpackhi_ps(wt423, wt424);
__m512 tmp13611 = _mm512_unpacklo_ps(wt425, wt426);
__m512 tmp13612 = _mm512_unpackhi_ps(wt425, wt426);
__m512 tmp13613 = _mm512_unpacklo_ps(wt427, wt428);
__m512 tmp13614 = _mm512_unpackhi_ps(wt427, wt428);
__m512 tmp13615 = _mm512_unpacklo_ps(wt429, wt430);
__m512 tmp13616 = _mm512_unpackhi_ps(wt429, wt430);
__m512 tmp13617 = _mm512_unpacklo_ps(wt431, wt432);
__m512 tmp13618 = _mm512_unpackhi_ps(wt431, wt432);
__m512 tmp13619 = _mm512_shuffle_ps(tmp13603, tmp13605, 68);
__m512 tmp13620 = _mm512_shuffle_ps(tmp13603, tmp13605, 238);
__m512 tmp13621 = _mm512_shuffle_ps(tmp13604, tmp13606, 68);
__m512 tmp13622 = _mm512_shuffle_ps(tmp13604, tmp13606, 238);
__m512 tmp13623 = _mm512_shuffle_ps(tmp13607, tmp13609, 68);
__m512 tmp13624 = _mm512_shuffle_ps(tmp13607, tmp13609, 238);
__m512 tmp13625 = _mm512_shuffle_ps(tmp13608, tmp13610, 68);
__m512 tmp13626 = _mm512_shuffle_ps(tmp13608, tmp13610, 238);
__m512 tmp13627 = _mm512_shuffle_ps(tmp13611, tmp13613, 68);
__m512 tmp13628 = _mm512_shuffle_ps(tmp13611, tmp13613, 238);
__m512 tmp13629 = _mm512_shuffle_ps(tmp13612, tmp13614, 68);
__m512 tmp13630 = _mm512_shuffle_ps(tmp13612, tmp13614, 238);
__m512 tmp13631 = _mm512_shuffle_ps(tmp13615, tmp13617, 68);
__m512 tmp13632 = _mm512_shuffle_ps(tmp13615, tmp13617, 238);
__m512 tmp13633 = _mm512_shuffle_ps(tmp13616, tmp13618, 68);
__m512 tmp13634 = _mm512_shuffle_ps(tmp13616, tmp13618, 238);
__m512 tmp13635 = _mm512_shuffle_f32x4(tmp13619, tmp13623, 136);
__m512 tmp13636 = _mm512_shuffle_f32x4(tmp13619, tmp13623, 221);
__m512 tmp13637 = _mm512_shuffle_f32x4(tmp13620, tmp13624, 136);
__m512 tmp13638 = _mm512_shuffle_f32x4(tmp13620, tmp13624, 221);
__m512 tmp13639 = _mm512_shuffle_f32x4(tmp13621, tmp13625, 136);
__m512 tmp13640 = _mm512_shuffle_f32x4(tmp13621, tmp13625, 221);
__m512 tmp13641 = _mm512_shuffle_f32x4(tmp13622, tmp13626, 136);
__m512 tmp13642 = _mm512_shuffle_f32x4(tmp13622, tmp13626, 221);
__m512 tmp13643 = _mm512_shuffle_f32x4(tmp13627, tmp13631, 136);
__m512 tmp13644 = _mm512_shuffle_f32x4(tmp13627, tmp13631, 221);
__m512 tmp13645 = _mm512_shuffle_f32x4(tmp13628, tmp13632, 136);
__m512 tmp13646 = _mm512_shuffle_f32x4(tmp13628, tmp13632, 221);
__m512 tmp13647 = _mm512_shuffle_f32x4(tmp13629, tmp13633, 136);
__m512 tmp13648 = _mm512_shuffle_f32x4(tmp13629, tmp13633, 221);
__m512 tmp13649 = _mm512_shuffle_f32x4(tmp13630, tmp13634, 136);
__m512 tmp13650 = _mm512_shuffle_f32x4(tmp13630, tmp13634, 221);
wt417 = _mm512_shuffle_f32x4(tmp13635, tmp13643, 136);
wt425 = _mm512_shuffle_f32x4(tmp13635, tmp13643, 221);
wt418 = _mm512_shuffle_f32x4(tmp13637, tmp13645, 136);
wt426 = _mm512_shuffle_f32x4(tmp13637, tmp13645, 221);
wt419 = _mm512_shuffle_f32x4(tmp13639, tmp13647, 136);
wt427 = _mm512_shuffle_f32x4(tmp13639, tmp13647, 221);
wt420 = _mm512_shuffle_f32x4(tmp13641, tmp13649, 136);
wt428 = _mm512_shuffle_f32x4(tmp13641, tmp13649, 221);
wt421 = _mm512_shuffle_f32x4(tmp13636, tmp13644, 136);
wt429 = _mm512_shuffle_f32x4(tmp13636, tmp13644, 221);
wt422 = _mm512_shuffle_f32x4(tmp13638, tmp13646, 136);
wt430 = _mm512_shuffle_f32x4(tmp13638, tmp13646, 221);
wt423 = _mm512_shuffle_f32x4(tmp13640, tmp13648, 136);
wt431 = _mm512_shuffle_f32x4(tmp13640, tmp13648, 221);
wt424 = _mm512_shuffle_f32x4(tmp13642, tmp13650, 136);
wt432 = _mm512_shuffle_f32x4(tmp13642, tmp13650, 221);
wt417 = _mm512_mul_ps(wt417, postMul40);
wt418 = _mm512_mul_ps(wt418, postMul40);
wt419 = _mm512_mul_ps(wt419, postMul40);
wt420 = _mm512_mul_ps(wt420, postMul40);
wt421 = _mm512_mul_ps(wt421, postMul40);
wt422 = _mm512_mul_ps(wt422, postMul40);
wt423 = _mm512_mul_ps(wt423, postMul40);
wt424 = _mm512_mul_ps(wt424, postMul40);
wt425 = _mm512_mul_ps(wt425, postMul40);
wt426 = _mm512_mul_ps(wt426, postMul40);
wt427 = _mm512_mul_ps(wt427, postMul40);
wt428 = _mm512_mul_ps(wt428, postMul40);
wt429 = _mm512_mul_ps(wt429, postMul40);
wt430 = _mm512_mul_ps(wt430, postMul40);
wt431 = _mm512_mul_ps(wt431, postMul40);
wt432 = _mm512_mul_ps(wt432, postMul40);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c34)+(ptrdiff_t)0, 63>>cut17, wt417);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c34)+(ptrdiff_t)0, 63>>cut17, wt418);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c34)+(ptrdiff_t)0, 63>>cut17, wt419);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c34)+(ptrdiff_t)0, 63>>cut17, wt420);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c34)+(ptrdiff_t)0, 63>>cut17, wt421);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c34)+(ptrdiff_t)0, 63>>cut17, wt422);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c34)+(ptrdiff_t)0, 63>>cut17, wt423);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c34)+(ptrdiff_t)0, 63>>cut17, wt424);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c34)+(ptrdiff_t)0, 63>>cut17, wt425);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c34)+(ptrdiff_t)0, 63>>cut17, wt426);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c34)+(ptrdiff_t)0, 63>>cut17, wt427);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c34)+(ptrdiff_t)0, 63>>cut17, wt428);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c34)+(ptrdiff_t)0, 63>>cut17, wt429);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c34)+(ptrdiff_t)0, 63>>cut17, wt430);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c34)+(ptrdiff_t)0, 63>>cut17, wt431);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c34)+(ptrdiff_t)0, 63>>cut17, wt432);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt417);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt418);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt419);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt420);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt421);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt422);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt423);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt424);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt425);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt426);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt427);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt428);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt429);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt430);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt431);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c34)+(ptrdiff_t)12288, 4032>>cut17, wt432);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt417);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt418);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt419);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt420);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt421);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt422);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt423);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt424);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt425);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt426);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt427);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt428);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt429);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt430);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt431);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c34)+(ptrdiff_t)24576, 258048>>cut17, wt432);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(1+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt417);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(2+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt418);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(3+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt419);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(4+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt420);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(5+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt421);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(6+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt422);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(7+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt423);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(8+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt424);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(9+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt425);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(10+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt426);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(11+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt427);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(12+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt428);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(13+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt429);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(14+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt430);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(15+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt431);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l51+4*cut17+24*(16+16*c34)+(ptrdiff_t)36864, 65535-(262143>>cut17), wt432);
}
}
}
} else {
ptrdiff_t k122 = 112;
ptrdiff_t l50 = (size_t)(0+k122)/6;
ptrdiff_t cut16 = (size_t)(0+k122)%6;
__m512 sum321 = _mm512_maskz_loadu_ps(65535, biasPtr12+512*i41+4*k122);
__m512i pmMul26 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd26 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo22 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k122+128*i41));
__m512 masHi22 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k122+128*i41)+(ptrdiff_t)64);
__m512 postMul38 = _mm512_permutex2var_ps(masLo22, pmMul26, masHi22);
__m512 postAdd24 = _mm512_permutex2var_ps(masLo22, pmAdd26, masHi22);
sum321 = _mm512_fmadd_ps(sum321, postMul38, postAdd24);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*0+(ptrdiff_t)0, 63>>cut16, sum321);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*0+(ptrdiff_t)12288, 4032>>cut16, sum321);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*0+(ptrdiff_t)24576, 258048>>cut16, sum321);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*0+(ptrdiff_t)36864, 65535-(262143>>cut16), sum321);
ptrdiff_t c32 = 0;
for (; c32 != 32; ++c32) {
__m512 wt385 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)0);
__m512 wt386 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)2048);
__m512 wt387 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)4096);
__m512 wt388 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)6144);
__m512 wt389 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)8192);
__m512 wt390 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)10240);
__m512 wt391 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)12288);
__m512 wt392 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)14336);
__m512 wt393 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)16384);
__m512 wt394 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)18432);
__m512 wt395 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)20480);
__m512 wt396 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)22528);
__m512 wt397 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)24576);
__m512 wt398 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)26624);
__m512 wt399 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)28672);
__m512 wt400 = _mm512_maskz_loadu_ps(65535, wtPtr12+262144*i41+2048*k122+64*c32+(ptrdiff_t)30720);
__m512 tmp13651 = _mm512_unpacklo_ps(wt385, wt386);
__m512 tmp13652 = _mm512_unpackhi_ps(wt385, wt386);
__m512 tmp13653 = _mm512_unpacklo_ps(wt387, wt388);
__m512 tmp13654 = _mm512_unpackhi_ps(wt387, wt388);
__m512 tmp13655 = _mm512_unpacklo_ps(wt389, wt390);
__m512 tmp13656 = _mm512_unpackhi_ps(wt389, wt390);
__m512 tmp13657 = _mm512_unpacklo_ps(wt391, wt392);
__m512 tmp13658 = _mm512_unpackhi_ps(wt391, wt392);
__m512 tmp13659 = _mm512_unpacklo_ps(wt393, wt394);
__m512 tmp13660 = _mm512_unpackhi_ps(wt393, wt394);
__m512 tmp13661 = _mm512_unpacklo_ps(wt395, wt396);
__m512 tmp13662 = _mm512_unpackhi_ps(wt395, wt396);
__m512 tmp13663 = _mm512_unpacklo_ps(wt397, wt398);
__m512 tmp13664 = _mm512_unpackhi_ps(wt397, wt398);
__m512 tmp13665 = _mm512_unpacklo_ps(wt399, wt400);
__m512 tmp13666 = _mm512_unpackhi_ps(wt399, wt400);
__m512 tmp13667 = _mm512_shuffle_ps(tmp13651, tmp13653, 68);
__m512 tmp13668 = _mm512_shuffle_ps(tmp13651, tmp13653, 238);
__m512 tmp13669 = _mm512_shuffle_ps(tmp13652, tmp13654, 68);
__m512 tmp13670 = _mm512_shuffle_ps(tmp13652, tmp13654, 238);
__m512 tmp13671 = _mm512_shuffle_ps(tmp13655, tmp13657, 68);
__m512 tmp13672 = _mm512_shuffle_ps(tmp13655, tmp13657, 238);
__m512 tmp13673 = _mm512_shuffle_ps(tmp13656, tmp13658, 68);
__m512 tmp13674 = _mm512_shuffle_ps(tmp13656, tmp13658, 238);
__m512 tmp13675 = _mm512_shuffle_ps(tmp13659, tmp13661, 68);
__m512 tmp13676 = _mm512_shuffle_ps(tmp13659, tmp13661, 238);
__m512 tmp13677 = _mm512_shuffle_ps(tmp13660, tmp13662, 68);
__m512 tmp13678 = _mm512_shuffle_ps(tmp13660, tmp13662, 238);
__m512 tmp13679 = _mm512_shuffle_ps(tmp13663, tmp13665, 68);
__m512 tmp13680 = _mm512_shuffle_ps(tmp13663, tmp13665, 238);
__m512 tmp13681 = _mm512_shuffle_ps(tmp13664, tmp13666, 68);
__m512 tmp13682 = _mm512_shuffle_ps(tmp13664, tmp13666, 238);
__m512 tmp13683 = _mm512_shuffle_f32x4(tmp13667, tmp13671, 136);
__m512 tmp13684 = _mm512_shuffle_f32x4(tmp13667, tmp13671, 221);
__m512 tmp13685 = _mm512_shuffle_f32x4(tmp13668, tmp13672, 136);
__m512 tmp13686 = _mm512_shuffle_f32x4(tmp13668, tmp13672, 221);
__m512 tmp13687 = _mm512_shuffle_f32x4(tmp13669, tmp13673, 136);
__m512 tmp13688 = _mm512_shuffle_f32x4(tmp13669, tmp13673, 221);
__m512 tmp13689 = _mm512_shuffle_f32x4(tmp13670, tmp13674, 136);
__m512 tmp13690 = _mm512_shuffle_f32x4(tmp13670, tmp13674, 221);
__m512 tmp13691 = _mm512_shuffle_f32x4(tmp13675, tmp13679, 136);
__m512 tmp13692 = _mm512_shuffle_f32x4(tmp13675, tmp13679, 221);
__m512 tmp13693 = _mm512_shuffle_f32x4(tmp13676, tmp13680, 136);
__m512 tmp13694 = _mm512_shuffle_f32x4(tmp13676, tmp13680, 221);
__m512 tmp13695 = _mm512_shuffle_f32x4(tmp13677, tmp13681, 136);
__m512 tmp13696 = _mm512_shuffle_f32x4(tmp13677, tmp13681, 221);
__m512 tmp13697 = _mm512_shuffle_f32x4(tmp13678, tmp13682, 136);
__m512 tmp13698 = _mm512_shuffle_f32x4(tmp13678, tmp13682, 221);
wt385 = _mm512_shuffle_f32x4(tmp13683, tmp13691, 136);
wt393 = _mm512_shuffle_f32x4(tmp13683, tmp13691, 221);
wt386 = _mm512_shuffle_f32x4(tmp13685, tmp13693, 136);
wt394 = _mm512_shuffle_f32x4(tmp13685, tmp13693, 221);
wt387 = _mm512_shuffle_f32x4(tmp13687, tmp13695, 136);
wt395 = _mm512_shuffle_f32x4(tmp13687, tmp13695, 221);
wt388 = _mm512_shuffle_f32x4(tmp13689, tmp13697, 136);
wt396 = _mm512_shuffle_f32x4(tmp13689, tmp13697, 221);
wt389 = _mm512_shuffle_f32x4(tmp13684, tmp13692, 136);
wt397 = _mm512_shuffle_f32x4(tmp13684, tmp13692, 221);
wt390 = _mm512_shuffle_f32x4(tmp13686, tmp13694, 136);
wt398 = _mm512_shuffle_f32x4(tmp13686, tmp13694, 221);
wt391 = _mm512_shuffle_f32x4(tmp13688, tmp13696, 136);
wt399 = _mm512_shuffle_f32x4(tmp13688, tmp13696, 221);
wt392 = _mm512_shuffle_f32x4(tmp13690, tmp13698, 136);
wt400 = _mm512_shuffle_f32x4(tmp13690, tmp13698, 221);
wt385 = _mm512_mul_ps(wt385, postMul38);
wt386 = _mm512_mul_ps(wt386, postMul38);
wt387 = _mm512_mul_ps(wt387, postMul38);
wt388 = _mm512_mul_ps(wt388, postMul38);
wt389 = _mm512_mul_ps(wt389, postMul38);
wt390 = _mm512_mul_ps(wt390, postMul38);
wt391 = _mm512_mul_ps(wt391, postMul38);
wt392 = _mm512_mul_ps(wt392, postMul38);
wt393 = _mm512_mul_ps(wt393, postMul38);
wt394 = _mm512_mul_ps(wt394, postMul38);
wt395 = _mm512_mul_ps(wt395, postMul38);
wt396 = _mm512_mul_ps(wt396, postMul38);
wt397 = _mm512_mul_ps(wt397, postMul38);
wt398 = _mm512_mul_ps(wt398, postMul38);
wt399 = _mm512_mul_ps(wt399, postMul38);
wt400 = _mm512_mul_ps(wt400, postMul38);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(1+16*c32)+(ptrdiff_t)0, 63>>cut16, wt385);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(2+16*c32)+(ptrdiff_t)0, 63>>cut16, wt386);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(3+16*c32)+(ptrdiff_t)0, 63>>cut16, wt387);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(4+16*c32)+(ptrdiff_t)0, 63>>cut16, wt388);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(5+16*c32)+(ptrdiff_t)0, 63>>cut16, wt389);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(6+16*c32)+(ptrdiff_t)0, 63>>cut16, wt390);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(7+16*c32)+(ptrdiff_t)0, 63>>cut16, wt391);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(8+16*c32)+(ptrdiff_t)0, 63>>cut16, wt392);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(9+16*c32)+(ptrdiff_t)0, 63>>cut16, wt393);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(10+16*c32)+(ptrdiff_t)0, 63>>cut16, wt394);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(11+16*c32)+(ptrdiff_t)0, 63>>cut16, wt395);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(12+16*c32)+(ptrdiff_t)0, 63>>cut16, wt396);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(13+16*c32)+(ptrdiff_t)0, 63>>cut16, wt397);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(14+16*c32)+(ptrdiff_t)0, 63>>cut16, wt398);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(15+16*c32)+(ptrdiff_t)0, 63>>cut16, wt399);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(16+16*c32)+(ptrdiff_t)0, 63>>cut16, wt400);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(1+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt385);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(2+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt386);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(3+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt387);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(4+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt388);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(5+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt389);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(6+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt390);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(7+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt391);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(8+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt392);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(9+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt393);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(10+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt394);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(11+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt395);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(12+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt396);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(13+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt397);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(14+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt398);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(15+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt399);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(16+16*c32)+(ptrdiff_t)12288, 4032>>cut16, wt400);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(1+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt385);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(2+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt386);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(3+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt387);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(4+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt388);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(5+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt389);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(6+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt390);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(7+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt391);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(8+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt392);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(9+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt393);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(10+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt394);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(11+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt395);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(12+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt396);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(13+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt397);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(14+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt398);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(15+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt399);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+24*(16+16*c32)+(ptrdiff_t)24576, 258048>>cut16, wt400);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(1+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt385);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(2+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt386);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(3+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt387);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(4+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt388);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(5+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt389);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(6+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt390);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(7+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt391);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(8+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt392);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(9+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt393);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(10+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt394);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(11+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt395);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(12+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt396);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(13+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt397);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(14+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt398);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(15+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt399);
_mm512_mask_storeu_ps(arranged11+262656*i41+12312*l50+4*cut16+8*(16+16*c32)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt400);
}
}
}
}
}

// Dispatches the weight-arrangement callee over a thread team.
//
// Fills a task descriptor with ResNet50OneArrangeWts6Callee1 as the work
// function, tensors65 as its opaque argument, and a 3-dimensional hull of
// extents 8 x 1 x 1, then hands it to ResNet50ThreaderDo1.
//
//   team46    - thread team passed through to ResNet50ThreaderDo1.
//   tensors65 - pointer table stored in the task's any1 field for the callee.
//
// NOTE(review): presumably the threader invokes the callee once per point of
// the hull -- confirm against ResNet50ThreaderDo1.
static void ResNet50OneArrangeWts6(ResNet50ThreaderTeam1* team46, char** tensors65) {
    // Per-axis extents of the iteration space.
    static const int extents[3] = {8, 1, 1};
    ResNet50ThreaderTask1 job;
    job.nd1 = 3;
    for (int axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    job.any1 = tensors65;
    job.callee1 = ResNet50OneArrangeWts6Callee1;
    ResNet50ThreaderDo1(team46, &job);
}

// Copies one tile of the source buffer into the arranged buffer.
//
//   task70 - task descriptor; its any1 field is read as a char* table where
//            entry 0 is the source and entry 1 is the destination.
//   pt40   - work coordinates:
//            pt40[0] picks a run of 128 consecutive source rows
//                    (rows 128*pt40[0] .. 128*pt40[0]+127, 3136 bytes apart);
//            pt40[1] picks the column tile inside each row.
//
// For any pt40[1] other than 12, a 256-byte tile (four 64-byte vectors) is
// copied from byte offset 256*pt40[1] of every row and written with a
// 256-byte row stride inside the destination region at 131072*pt40[1].
// For pt40[1] == 12 only the trailing 64 bytes of each row remain
// (12*256 + 64 == 3136), and they are written with a 64-byte row stride.
//
// NOTE(review): 3136 bytes is 784 floats, presumably one 28x28 plane per
// row -- confirm against the tensor layout.
static void ResNet50OneArrangeDats6Callee1(ResNet50ThreaderTask1* task70, int64_t* pt40) {
    enum { kAllLanes = 0xFFFF };  // full 16-lane mask
    char** bufs = task70->any1;
    const ptrdiff_t slab = pt40[0];
    const ptrdiff_t tile = pt40[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];
    const ptrdiff_t rowBegin = 128*slab;
    const ptrdiff_t rowEnd = rowBegin+128;

    if (tile != 12) {
        // Full-width tile: four vectors per row, all loaded before any store.
        for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
            char* in = src+256*tile+3136*row;
            char* out = dst+131072*tile+256*row;
            __m512 v0 = _mm512_maskz_loadu_ps(kAllLanes, in);
            __m512 v1 = _mm512_maskz_loadu_ps(kAllLanes, in+64);
            __m512 v2 = _mm512_maskz_loadu_ps(kAllLanes, in+128);
            __m512 v3 = _mm512_maskz_loadu_ps(kAllLanes, in+192);
            _mm512_mask_storeu_ps(out, kAllLanes, v0);
            _mm512_mask_storeu_ps(out+64, kAllLanes, v1);
            _mm512_mask_storeu_ps(out+128, kAllLanes, v2);
            _mm512_mask_storeu_ps(out+192, kAllLanes, v3);
        }
        return;
    }

    // Remainder tile: a single vector per row, packed 64 bytes apart.
    for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
        __m512 v = _mm512_maskz_loadu_ps(kAllLanes, src+256*tile+3136*row);
        _mm512_mask_storeu_ps(dst+131072*tile+64*row, kAllLanes, v);
    }
}

// Dispatches the data-arrangement callee over a thread team.
//
// Fills a task descriptor with ResNet50OneArrangeDats6Callee1 as the work
// function, tensors67 as its opaque argument, and a 4-dimensional hull of
// extents 4 x 13 x 1 x 1, then hands it to ResNet50ThreaderDo1.
//
//   team47    - thread team passed through to ResNet50ThreaderDo1.
//   tensors67 - pointer table stored in the task's any1 field for the callee.
//
// NOTE(review): presumably the threader invokes the callee once per point of
// the hull -- confirm against ResNet50ThreaderDo1.
static void ResNet50OneArrangeDats6(ResNet50ThreaderTeam1* team47, char** tensors67) {
    // Per-axis extents of the iteration space.
    static const int extents[4] = {4, 13, 1, 1};
    ResNet50ThreaderTask1 job;
    job.nd1 = 4;
    for (int axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    job.any1 = tensors67;
    job.callee1 = ResNet50OneArrangeDats6Callee1;
    ResNet50ThreaderDo1(team47, &job);
}

static void ResNet50OneApply6Callee1(ResNet50ThreaderTask1* task72, int64_t* pt41) {
void** pair18 = task72->any1;
char** tensors70 = pair18[0];
ptrdiff_t e20 = 0;
ptrdiff_t g23 = 0;
ptrdiff_t d14 = pt41[1];
ptrdiff_t w56 = pt41[0];
char*restrict arrangedWts6 = tensors70[0]+428032*e20+(ptrdiff_t)262656*1*g23;
char*restrict arrangedDats6 = tensors70[1]+2618560*e20+(ptrdiff_t)1605632*1*g23;
char*restrict datPtr22 = tensors70[2]+(ptrdiff_t)401408*1*g23;
ptrdiff_t ii18 = 1;
for (ptrdiff_t i43 = 0; i43 < ii18; ++i43) {
ptrdiff_t j36 = 1*d14;
ptrdiff_t jj39 = j36+0;
for (; j36 != 12; ++j36) {
ptrdiff_t k126 = 1*w56;
ptrdiff_t kk41 = k126+0;
for (; k126 != 21; ++k126) {
ptrdiff_t s31 = -1;
__m512 sum324 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)24));
__m512 sum328 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)28));
__m512 sum332 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)32));
__m512 sum336 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)36));
__m512 sum340 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)40));
__m512 sum344 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)44));
__m512 sum325 = sum324;
__m512 sum326 = sum324;
__m512 sum327 = sum324;
__m512 sum329 = sum328;
__m512 sum330 = sum328;
__m512 sum331 = sum328;
__m512 sum333 = sum332;
__m512 sum334 = sum332;
__m512 sum335 = sum332;
__m512 sum337 = sum336;
__m512 sum338 = sum336;
__m512 sum339 = sum336;
__m512 sum341 = sum340;
__m512 sum342 = sum340;
__m512 sum343 = sum340;
__m512 sum345 = sum344;
__m512 sum346 = sum344;
__m512 sum347 = sum344;
for (s31 = 0; s31 < 512; ++s31) {
__m512 dat1931 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s31+(ptrdiff_t)0);
__m512 dat1932 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s31+(ptrdiff_t)64);
__m512 dat1933 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s31+(ptrdiff_t)128);
__m512 dat1934 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s31+(ptrdiff_t)192);
__m512 wt433 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)24));
sum324 = _mm512_fmadd_ps(wt433, dat1931, sum324);
sum325 = _mm512_fmadd_ps(wt433, dat1932, sum325);
sum326 = _mm512_fmadd_ps(wt433, dat1933, sum326);
sum327 = _mm512_fmadd_ps(wt433, dat1934, sum327);
__m512 wt434 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)28));
sum328 = _mm512_fmadd_ps(wt434, dat1931, sum328);
sum329 = _mm512_fmadd_ps(wt434, dat1932, sum329);
sum330 = _mm512_fmadd_ps(wt434, dat1933, sum330);
sum331 = _mm512_fmadd_ps(wt434, dat1934, sum331);
__m512 wt435 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)32));
sum332 = _mm512_fmadd_ps(wt435, dat1931, sum332);
sum333 = _mm512_fmadd_ps(wt435, dat1932, sum333);
sum334 = _mm512_fmadd_ps(wt435, dat1933, sum334);
sum335 = _mm512_fmadd_ps(wt435, dat1934, sum335);
__m512 wt436 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)36));
sum336 = _mm512_fmadd_ps(wt436, dat1931, sum336);
sum337 = _mm512_fmadd_ps(wt436, dat1932, sum337);
sum338 = _mm512_fmadd_ps(wt436, dat1933, sum338);
sum339 = _mm512_fmadd_ps(wt436, dat1934, sum339);
__m512 wt437 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)40));
sum340 = _mm512_fmadd_ps(wt437, dat1931, sum340);
sum341 = _mm512_fmadd_ps(wt437, dat1932, sum341);
sum342 = _mm512_fmadd_ps(wt437, dat1933, sum342);
sum343 = _mm512_fmadd_ps(wt437, dat1934, sum343);
__m512 wt438 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+24*s31+(ptrdiff_t)44));
sum344 = _mm512_fmadd_ps(wt438, dat1931, sum344);
sum345 = _mm512_fmadd_ps(wt438, dat1932, sum345);
sum346 = _mm512_fmadd_ps(wt438, dat1933, sum346);
sum347 = _mm512_fmadd_ps(wt438, dat1934, sum347);
}
sum324 = _mm512_max_ps(_mm512_setzero_ps(), sum324);
sum325 = _mm512_max_ps(_mm512_setzero_ps(), sum325);
sum326 = _mm512_max_ps(_mm512_setzero_ps(), sum326);
sum327 = _mm512_max_ps(_mm512_setzero_ps(), sum327);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)0, 65535, sum324);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)64, 65535, sum325);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)128, 65535, sum326);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)192, 65535, sum327);
sum328 = _mm512_max_ps(_mm512_setzero_ps(), sum328);
sum329 = _mm512_max_ps(_mm512_setzero_ps(), sum329);
sum330 = _mm512_max_ps(_mm512_setzero_ps(), sum330);
sum331 = _mm512_max_ps(_mm512_setzero_ps(), sum331);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3136, 65535, sum328);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3200, 65535, sum329);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3264, 65535, sum330);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3328, 65535, sum331);
sum332 = _mm512_max_ps(_mm512_setzero_ps(), sum332);
sum333 = _mm512_max_ps(_mm512_setzero_ps(), sum333);
sum334 = _mm512_max_ps(_mm512_setzero_ps(), sum334);
sum335 = _mm512_max_ps(_mm512_setzero_ps(), sum335);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)6272, 65535, sum332);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)6336, 65535, sum333);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)6400, 65535, sum334);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)6464, 65535, sum335);
sum336 = _mm512_max_ps(_mm512_setzero_ps(), sum336);
sum337 = _mm512_max_ps(_mm512_setzero_ps(), sum337);
sum338 = _mm512_max_ps(_mm512_setzero_ps(), sum338);
sum339 = _mm512_max_ps(_mm512_setzero_ps(), sum339);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)9408, 65535, sum336);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)9472, 65535, sum337);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)9536, 65535, sum338);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)9600, 65535, sum339);
sum340 = _mm512_max_ps(_mm512_setzero_ps(), sum340);
sum341 = _mm512_max_ps(_mm512_setzero_ps(), sum341);
sum342 = _mm512_max_ps(_mm512_setzero_ps(), sum342);
sum343 = _mm512_max_ps(_mm512_setzero_ps(), sum343);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)12544, 65535, sum340);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)12608, 65535, sum341);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)12672, 65535, sum342);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)12736, 65535, sum343);
sum344 = _mm512_max_ps(_mm512_setzero_ps(), sum344);
sum345 = _mm512_max_ps(_mm512_setzero_ps(), sum345);
sum346 = _mm512_max_ps(_mm512_setzero_ps(), sum346);
sum347 = _mm512_max_ps(_mm512_setzero_ps(), sum347);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)15680, 65535, sum344);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)15744, 65535, sum345);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)15808, 65535, sum346);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)15872, 65535, sum347);
if (k126 >= kk41) return;
}
ptrdiff_t s32 = -1;
__m512 sum348 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+8*s32+(ptrdiff_t)8));
__m512 sum352 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+8*s32+(ptrdiff_t)12));
__m512 sum349 = sum348;
__m512 sum350 = sum348;
__m512 sum351 = sum348;
__m512 sum353 = sum352;
__m512 sum354 = sum352;
__m512 sum355 = sum352;
for (s32 = 0; s32 < 512; ++s32) {
__m512 dat1935 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s32+(ptrdiff_t)0);
__m512 dat1936 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s32+(ptrdiff_t)64);
__m512 dat1937 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s32+(ptrdiff_t)128);
__m512 dat1938 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+256*s32+(ptrdiff_t)192);
__m512 wt439 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+8*s32+(ptrdiff_t)8));
sum348 = _mm512_fmadd_ps(wt439, dat1935, sum348);
sum349 = _mm512_fmadd_ps(wt439, dat1936, sum349);
sum350 = _mm512_fmadd_ps(wt439, dat1937, sum350);
sum351 = _mm512_fmadd_ps(wt439, dat1938, sum351);
__m512 wt440 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k126+8*s32+(ptrdiff_t)12));
sum352 = _mm512_fmadd_ps(wt440, dat1935, sum352);
sum353 = _mm512_fmadd_ps(wt440, dat1936, sum353);
sum354 = _mm512_fmadd_ps(wt440, dat1937, sum354);
sum355 = _mm512_fmadd_ps(wt440, dat1938, sum355);
}
sum348 = _mm512_max_ps(_mm512_setzero_ps(), sum348);
sum349 = _mm512_max_ps(_mm512_setzero_ps(), sum349);
sum350 = _mm512_max_ps(_mm512_setzero_ps(), sum350);
sum351 = _mm512_max_ps(_mm512_setzero_ps(), sum351);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)0, 65535, sum348);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)64, 65535, sum349);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)128, 65535, sum350);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)192, 65535, sum351);
sum352 = _mm512_max_ps(_mm512_setzero_ps(), sum352);
sum353 = _mm512_max_ps(_mm512_setzero_ps(), sum353);
sum354 = _mm512_max_ps(_mm512_setzero_ps(), sum354);
sum355 = _mm512_max_ps(_mm512_setzero_ps(), sum355);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3136, 65535, sum352);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3200, 65535, sum353);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3264, 65535, sum354);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k126+(ptrdiff_t)3328, 65535, sum355);
if (j36 >= jj39) return;
}
ptrdiff_t k127 = 1*w56;
ptrdiff_t kk42 = k127+0;
for (; k127 != 21; ++k127) {
ptrdiff_t s33 = -1;
__m512 sum356 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)24));
__m512 sum357 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)28));
__m512 sum358 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)32));
__m512 sum359 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)36));
__m512 sum360 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)40));
__m512 sum361 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)44));
for (s33 = 0; s33 < 512; ++s33) {
__m512 dat1939 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+64*s33+(ptrdiff_t)0);
__m512 wt441 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)24));
sum356 = _mm512_fmadd_ps(wt441, dat1939, sum356);
__m512 wt442 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)28));
sum357 = _mm512_fmadd_ps(wt442, dat1939, sum357);
__m512 wt443 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)32));
sum358 = _mm512_fmadd_ps(wt443, dat1939, sum358);
__m512 wt444 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)36));
sum359 = _mm512_fmadd_ps(wt444, dat1939, sum359);
__m512 wt445 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)40));
sum360 = _mm512_fmadd_ps(wt445, dat1939, sum360);
__m512 wt446 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+24*s33+(ptrdiff_t)44));
sum361 = _mm512_fmadd_ps(wt446, dat1939, sum361);
}
sum356 = _mm512_max_ps(_mm512_setzero_ps(), sum356);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)0, 65535, sum356);
sum357 = _mm512_max_ps(_mm512_setzero_ps(), sum357);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)3136, 65535, sum357);
sum358 = _mm512_max_ps(_mm512_setzero_ps(), sum358);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)6272, 65535, sum358);
sum359 = _mm512_max_ps(_mm512_setzero_ps(), sum359);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)9408, 65535, sum359);
sum360 = _mm512_max_ps(_mm512_setzero_ps(), sum360);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)12544, 65535, sum360);
sum361 = _mm512_max_ps(_mm512_setzero_ps(), sum361);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)15680, 65535, sum361);
if (k127 >= kk42) return;
}
ptrdiff_t s34 = -1;
__m512 sum362 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+8*s34+(ptrdiff_t)8));
__m512 sum363 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+8*s34+(ptrdiff_t)12));
for (s34 = 0; s34 < 512; ++s34) {
__m512 dat1940 = _mm512_loadu_ps(arrangedDats6+1605632*i43+131072*j36+64*s34+(ptrdiff_t)0);
__m512 wt447 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+8*s34+(ptrdiff_t)8));
sum362 = _mm512_fmadd_ps(wt447, dat1940, sum362);
__m512 wt448 = _mm512_set1_ps(*(float*)(arrangedWts6+262656*i43+12312*k127+8*s34+(ptrdiff_t)12));
sum363 = _mm512_fmadd_ps(wt448, dat1940, sum363);
}
sum362 = _mm512_max_ps(_mm512_setzero_ps(), sum362);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)0, 65535, sum362);
sum363 = _mm512_max_ps(_mm512_setzero_ps(), sum363);
_mm512_mask_storeu_ps(datPtr22+401408*i43+256*j36+18816*k127+(ptrdiff_t)3136, 65535, sum363);
}
}

/*
 * Hands the ResNet50OneApply6Callee1 kernel to the threader.
 *
 * team48    - thread team forwarded unchanged to ResNet50ThreaderDo1.
 * tensors69 - tensor pointer table; it is wrapped in a two-slot argument
 *             array (second slot null) that the task carries in any1.
 *
 * The task is described as 3-dimensional with hull extents {22, 13, 1}.
 * NOTE(review): presumably the hull is the index space over which the
 * threader invokes the callee - confirm against ResNet50ThreaderDo1.
 */
static void ResNet50OneApply6(ResNet50ThreaderTeam1* team48, char** tensors69) {
	/* Argument bundle published to the callee through job.any1. */
	void* args[2];
	ResNet50ThreaderTask1 job;

	args[0] = tensors69;
	args[1] = 0;

	/* Hull extents, one entry per dimension counted by nd1. */
	job.nd1 = 3;
	job.hull1[0] = 22;
	job.hull1[1] = 13;
	job.hull1[2] = 1;

	job.any1 = args;
	job.callee1 = ResNet50OneApply6Callee1;

	/*
	 * NOTE(review): args and job live in this stack frame, so this
	 * presumably relies on ResNet50ThreaderDo1 being done with them
	 * before it returns - confirm.
	 */
	ResNet50ThreaderDo1(team48, &job);
}

static void ResNet50OneArrangeWts7Callee1(ResNet50ThreaderTask1* task82, int64_t* pt46) {
char** tensors80 = task82->any1;
ptrdiff_t b62 = pt46[0];
char*restrict wtPtr14 = tensors80[0]+(ptrdiff_t)3340*0+(ptrdiff_t)2621440*0;
char*restrict biasPtr14 = tensors80[1]+(ptrdiff_t)5120*0;
char*restrict bnPtr14 = tensors80[2]+(ptrdiff_t)8*1280*0;
char*restrict wtPtr15 = tensors80[3]+(ptrdiff_t)3340*0+(ptrdiff_t)2621440*0;
char*restrict biasPtr15 = tensors80[4]+(ptrdiff_t)5120*0;
char*restrict bnPtr15 = tensors80[5]+(ptrdiff_t)8*1280*0;
char*restrict arranged13 = tensors80[6]+(ptrdiff_t)4280320*0+(ptrdiff_t)2626560*0;
ptrdiff_t ii19 = 1;
for (ptrdiff_t i49 = 0; i49 < ii19; ++i49) {
ptrdiff_t j41 = 1*b62;
ptrdiff_t jj41 = j41+1;
for (; j41 < jj41; ++j41) {
if (j41 < 64) {
ptrdiff_t k140 = 0+16*(j41-0);
ptrdiff_t l59 = (size_t)(0+k140)/6;
ptrdiff_t cut19 = (size_t)(0+k140)%6;
switch (cut19) {
case 0:;
case 2: {
__m512 sum392 = _mm512_maskz_loadu_ps(65535, biasPtr14+5120*i49+4*k140);
__m512i pmMul28 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd28 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo23 = _mm512_loadu_ps(bnPtr14+(ptrdiff_t)8*(k140+1280*i49));
__m512 masHi23 = _mm512_maskz_loadu_ps(65535, bnPtr14+(ptrdiff_t)8*(k140+1280*i49)+(ptrdiff_t)64);
__m512 postMul46 = _mm512_permutex2var_ps(masLo23, pmMul28, masHi23);
__m512 postAdd28 = _mm512_permutex2var_ps(masLo23, pmAdd28, masHi23);
sum392 = _mm512_fmadd_ps(sum392, postMul46, postAdd28);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)0, 63>>cut19, sum392);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)12288, 4032>>cut19, sum392);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)24576, 65535-(4095>>cut19), sum392);
ptrdiff_t c37 = 0;
for (; c37 != 32; ++c37) {
__m512 wt453 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)0);
__m512 wt454 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)2048);
__m512 wt455 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)4096);
__m512 wt456 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)6144);
__m512 wt457 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)8192);
__m512 wt458 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)10240);
__m512 wt459 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)12288);
__m512 wt460 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)14336);
__m512 wt461 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)16384);
__m512 wt462 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)18432);
__m512 wt463 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)20480);
__m512 wt464 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)22528);
__m512 wt465 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)24576);
__m512 wt466 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)26624);
__m512 wt467 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)28672);
__m512 wt468 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c37+(ptrdiff_t)30720);
__m512 tmp16317 = _mm512_unpacklo_ps(wt453, wt454);
__m512 tmp16318 = _mm512_unpackhi_ps(wt453, wt454);
__m512 tmp16319 = _mm512_unpacklo_ps(wt455, wt456);
__m512 tmp16320 = _mm512_unpackhi_ps(wt455, wt456);
__m512 tmp16321 = _mm512_unpacklo_ps(wt457, wt458);
__m512 tmp16322 = _mm512_unpackhi_ps(wt457, wt458);
__m512 tmp16323 = _mm512_unpacklo_ps(wt459, wt460);
__m512 tmp16324 = _mm512_unpackhi_ps(wt459, wt460);
__m512 tmp16325 = _mm512_unpacklo_ps(wt461, wt462);
__m512 tmp16326 = _mm512_unpackhi_ps(wt461, wt462);
__m512 tmp16327 = _mm512_unpacklo_ps(wt463, wt464);
__m512 tmp16328 = _mm512_unpackhi_ps(wt463, wt464);
__m512 tmp16329 = _mm512_unpacklo_ps(wt465, wt466);
__m512 tmp16330 = _mm512_unpackhi_ps(wt465, wt466);
__m512 tmp16331 = _mm512_unpacklo_ps(wt467, wt468);
__m512 tmp16332 = _mm512_unpackhi_ps(wt467, wt468);
__m512 tmp16333 = _mm512_shuffle_ps(tmp16317, tmp16319, 68);
__m512 tmp16334 = _mm512_shuffle_ps(tmp16317, tmp16319, 238);
__m512 tmp16335 = _mm512_shuffle_ps(tmp16318, tmp16320, 68);
__m512 tmp16336 = _mm512_shuffle_ps(tmp16318, tmp16320, 238);
__m512 tmp16337 = _mm512_shuffle_ps(tmp16321, tmp16323, 68);
__m512 tmp16338 = _mm512_shuffle_ps(tmp16321, tmp16323, 238);
__m512 tmp16339 = _mm512_shuffle_ps(tmp16322, tmp16324, 68);
__m512 tmp16340 = _mm512_shuffle_ps(tmp16322, tmp16324, 238);
__m512 tmp16341 = _mm512_shuffle_ps(tmp16325, tmp16327, 68);
__m512 tmp16342 = _mm512_shuffle_ps(tmp16325, tmp16327, 238);
__m512 tmp16343 = _mm512_shuffle_ps(tmp16326, tmp16328, 68);
__m512 tmp16344 = _mm512_shuffle_ps(tmp16326, tmp16328, 238);
__m512 tmp16345 = _mm512_shuffle_ps(tmp16329, tmp16331, 68);
__m512 tmp16346 = _mm512_shuffle_ps(tmp16329, tmp16331, 238);
__m512 tmp16347 = _mm512_shuffle_ps(tmp16330, tmp16332, 68);
__m512 tmp16348 = _mm512_shuffle_ps(tmp16330, tmp16332, 238);
__m512 tmp16349 = _mm512_shuffle_f32x4(tmp16333, tmp16337, 136);
__m512 tmp16350 = _mm512_shuffle_f32x4(tmp16333, tmp16337, 221);
__m512 tmp16351 = _mm512_shuffle_f32x4(tmp16334, tmp16338, 136);
__m512 tmp16352 = _mm512_shuffle_f32x4(tmp16334, tmp16338, 221);
__m512 tmp16353 = _mm512_shuffle_f32x4(tmp16335, tmp16339, 136);
__m512 tmp16354 = _mm512_shuffle_f32x4(tmp16335, tmp16339, 221);
__m512 tmp16355 = _mm512_shuffle_f32x4(tmp16336, tmp16340, 136);
__m512 tmp16356 = _mm512_shuffle_f32x4(tmp16336, tmp16340, 221);
__m512 tmp16357 = _mm512_shuffle_f32x4(tmp16341, tmp16345, 136);
__m512 tmp16358 = _mm512_shuffle_f32x4(tmp16341, tmp16345, 221);
__m512 tmp16359 = _mm512_shuffle_f32x4(tmp16342, tmp16346, 136);
__m512 tmp16360 = _mm512_shuffle_f32x4(tmp16342, tmp16346, 221);
__m512 tmp16361 = _mm512_shuffle_f32x4(tmp16343, tmp16347, 136);
__m512 tmp16362 = _mm512_shuffle_f32x4(tmp16343, tmp16347, 221);
__m512 tmp16363 = _mm512_shuffle_f32x4(tmp16344, tmp16348, 136);
__m512 tmp16364 = _mm512_shuffle_f32x4(tmp16344, tmp16348, 221);
wt453 = _mm512_shuffle_f32x4(tmp16349, tmp16357, 136);
wt461 = _mm512_shuffle_f32x4(tmp16349, tmp16357, 221);
wt454 = _mm512_shuffle_f32x4(tmp16351, tmp16359, 136);
wt462 = _mm512_shuffle_f32x4(tmp16351, tmp16359, 221);
wt455 = _mm512_shuffle_f32x4(tmp16353, tmp16361, 136);
wt463 = _mm512_shuffle_f32x4(tmp16353, tmp16361, 221);
wt456 = _mm512_shuffle_f32x4(tmp16355, tmp16363, 136);
wt464 = _mm512_shuffle_f32x4(tmp16355, tmp16363, 221);
wt457 = _mm512_shuffle_f32x4(tmp16350, tmp16358, 136);
wt465 = _mm512_shuffle_f32x4(tmp16350, tmp16358, 221);
wt458 = _mm512_shuffle_f32x4(tmp16352, tmp16360, 136);
wt466 = _mm512_shuffle_f32x4(tmp16352, tmp16360, 221);
wt459 = _mm512_shuffle_f32x4(tmp16354, tmp16362, 136);
wt467 = _mm512_shuffle_f32x4(tmp16354, tmp16362, 221);
wt460 = _mm512_shuffle_f32x4(tmp16356, tmp16364, 136);
wt468 = _mm512_shuffle_f32x4(tmp16356, tmp16364, 221);
wt453 = _mm512_mul_ps(wt453, postMul46);
wt454 = _mm512_mul_ps(wt454, postMul46);
wt455 = _mm512_mul_ps(wt455, postMul46);
wt456 = _mm512_mul_ps(wt456, postMul46);
wt457 = _mm512_mul_ps(wt457, postMul46);
wt458 = _mm512_mul_ps(wt458, postMul46);
wt459 = _mm512_mul_ps(wt459, postMul46);
wt460 = _mm512_mul_ps(wt460, postMul46);
wt461 = _mm512_mul_ps(wt461, postMul46);
wt462 = _mm512_mul_ps(wt462, postMul46);
wt463 = _mm512_mul_ps(wt463, postMul46);
wt464 = _mm512_mul_ps(wt464, postMul46);
wt465 = _mm512_mul_ps(wt465, postMul46);
wt466 = _mm512_mul_ps(wt466, postMul46);
wt467 = _mm512_mul_ps(wt467, postMul46);
wt468 = _mm512_mul_ps(wt468, postMul46);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c37)+(ptrdiff_t)0, 63>>cut19, wt453);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c37)+(ptrdiff_t)0, 63>>cut19, wt454);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c37)+(ptrdiff_t)0, 63>>cut19, wt455);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c37)+(ptrdiff_t)0, 63>>cut19, wt456);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c37)+(ptrdiff_t)0, 63>>cut19, wt457);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c37)+(ptrdiff_t)0, 63>>cut19, wt458);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c37)+(ptrdiff_t)0, 63>>cut19, wt459);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c37)+(ptrdiff_t)0, 63>>cut19, wt460);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c37)+(ptrdiff_t)0, 63>>cut19, wt461);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c37)+(ptrdiff_t)0, 63>>cut19, wt462);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c37)+(ptrdiff_t)0, 63>>cut19, wt463);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c37)+(ptrdiff_t)0, 63>>cut19, wt464);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c37)+(ptrdiff_t)0, 63>>cut19, wt465);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c37)+(ptrdiff_t)0, 63>>cut19, wt466);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c37)+(ptrdiff_t)0, 63>>cut19, wt467);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c37)+(ptrdiff_t)0, 63>>cut19, wt468);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt453);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt454);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt455);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt456);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt457);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt458);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt459);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt460);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt461);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt462);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt463);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt464);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt465);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt466);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt467);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c37)+(ptrdiff_t)12288, 4032>>cut19, wt468);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt453);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt454);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt455);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt456);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt457);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt458);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt459);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt460);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt461);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt462);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt463);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt464);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt465);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt466);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt467);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c37)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt468);
}
break;
}
default: {
cut19 = 4;
__m512 sum393 = _mm512_maskz_loadu_ps(65535, biasPtr14+5120*i49+4*k140);
__m512i pmMul29 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd29 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo24 = _mm512_loadu_ps(bnPtr14+(ptrdiff_t)8*(k140+1280*i49));
__m512 masHi24 = _mm512_maskz_loadu_ps(65535, bnPtr14+(ptrdiff_t)8*(k140+1280*i49)+(ptrdiff_t)64);
__m512 postMul47 = _mm512_permutex2var_ps(masLo24, pmMul29, masHi24);
__m512 postAdd29 = _mm512_permutex2var_ps(masLo24, pmAdd29, masHi24);
sum393 = _mm512_fmadd_ps(sum393, postMul47, postAdd29);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)0, 63>>cut19, sum393);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)12288, 4032>>cut19, sum393);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)24576, 258048>>cut19, sum393);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*0+(ptrdiff_t)36864, 65535-(262143>>cut19), sum393);
ptrdiff_t c38 = 0;
for (; c38 != 32; ++c38) {
__m512 wt469 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)0);
__m512 wt470 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)2048);
__m512 wt471 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)4096);
__m512 wt472 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)6144);
__m512 wt473 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)8192);
__m512 wt474 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)10240);
__m512 wt475 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)12288);
__m512 wt476 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)14336);
__m512 wt477 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)16384);
__m512 wt478 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)18432);
__m512 wt479 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)20480);
__m512 wt480 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)22528);
__m512 wt481 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)24576);
__m512 wt482 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)26624);
__m512 wt483 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)28672);
__m512 wt484 = _mm512_maskz_loadu_ps(65535, wtPtr14+2621440*i49+2048*k140+64*c38+(ptrdiff_t)30720);
__m512 tmp16365 = _mm512_unpacklo_ps(wt469, wt470);
__m512 tmp16366 = _mm512_unpackhi_ps(wt469, wt470);
__m512 tmp16367 = _mm512_unpacklo_ps(wt471, wt472);
__m512 tmp16368 = _mm512_unpackhi_ps(wt471, wt472);
__m512 tmp16369 = _mm512_unpacklo_ps(wt473, wt474);
__m512 tmp16370 = _mm512_unpackhi_ps(wt473, wt474);
__m512 tmp16371 = _mm512_unpacklo_ps(wt475, wt476);
__m512 tmp16372 = _mm512_unpackhi_ps(wt475, wt476);
__m512 tmp16373 = _mm512_unpacklo_ps(wt477, wt478);
__m512 tmp16374 = _mm512_unpackhi_ps(wt477, wt478);
__m512 tmp16375 = _mm512_unpacklo_ps(wt479, wt480);
__m512 tmp16376 = _mm512_unpackhi_ps(wt479, wt480);
__m512 tmp16377 = _mm512_unpacklo_ps(wt481, wt482);
__m512 tmp16378 = _mm512_unpackhi_ps(wt481, wt482);
__m512 tmp16379 = _mm512_unpacklo_ps(wt483, wt484);
__m512 tmp16380 = _mm512_unpackhi_ps(wt483, wt484);
__m512 tmp16381 = _mm512_shuffle_ps(tmp16365, tmp16367, 68);
__m512 tmp16382 = _mm512_shuffle_ps(tmp16365, tmp16367, 238);
__m512 tmp16383 = _mm512_shuffle_ps(tmp16366, tmp16368, 68);
__m512 tmp16384 = _mm512_shuffle_ps(tmp16366, tmp16368, 238);
__m512 tmp16385 = _mm512_shuffle_ps(tmp16369, tmp16371, 68);
__m512 tmp16386 = _mm512_shuffle_ps(tmp16369, tmp16371, 238);
__m512 tmp16387 = _mm512_shuffle_ps(tmp16370, tmp16372, 68);
__m512 tmp16388 = _mm512_shuffle_ps(tmp16370, tmp16372, 238);
__m512 tmp16389 = _mm512_shuffle_ps(tmp16373, tmp16375, 68);
__m512 tmp16390 = _mm512_shuffle_ps(tmp16373, tmp16375, 238);
__m512 tmp16391 = _mm512_shuffle_ps(tmp16374, tmp16376, 68);
__m512 tmp16392 = _mm512_shuffle_ps(tmp16374, tmp16376, 238);
__m512 tmp16393 = _mm512_shuffle_ps(tmp16377, tmp16379, 68);
__m512 tmp16394 = _mm512_shuffle_ps(tmp16377, tmp16379, 238);
__m512 tmp16395 = _mm512_shuffle_ps(tmp16378, tmp16380, 68);
__m512 tmp16396 = _mm512_shuffle_ps(tmp16378, tmp16380, 238);
__m512 tmp16397 = _mm512_shuffle_f32x4(tmp16381, tmp16385, 136);
__m512 tmp16398 = _mm512_shuffle_f32x4(tmp16381, tmp16385, 221);
__m512 tmp16399 = _mm512_shuffle_f32x4(tmp16382, tmp16386, 136);
__m512 tmp16400 = _mm512_shuffle_f32x4(tmp16382, tmp16386, 221);
__m512 tmp16401 = _mm512_shuffle_f32x4(tmp16383, tmp16387, 136);
__m512 tmp16402 = _mm512_shuffle_f32x4(tmp16383, tmp16387, 221);
__m512 tmp16403 = _mm512_shuffle_f32x4(tmp16384, tmp16388, 136);
__m512 tmp16404 = _mm512_shuffle_f32x4(tmp16384, tmp16388, 221);
__m512 tmp16405 = _mm512_shuffle_f32x4(tmp16389, tmp16393, 136);
__m512 tmp16406 = _mm512_shuffle_f32x4(tmp16389, tmp16393, 221);
__m512 tmp16407 = _mm512_shuffle_f32x4(tmp16390, tmp16394, 136);
__m512 tmp16408 = _mm512_shuffle_f32x4(tmp16390, tmp16394, 221);
__m512 tmp16409 = _mm512_shuffle_f32x4(tmp16391, tmp16395, 136);
__m512 tmp16410 = _mm512_shuffle_f32x4(tmp16391, tmp16395, 221);
__m512 tmp16411 = _mm512_shuffle_f32x4(tmp16392, tmp16396, 136);
__m512 tmp16412 = _mm512_shuffle_f32x4(tmp16392, tmp16396, 221);
wt469 = _mm512_shuffle_f32x4(tmp16397, tmp16405, 136);
wt477 = _mm512_shuffle_f32x4(tmp16397, tmp16405, 221);
wt470 = _mm512_shuffle_f32x4(tmp16399, tmp16407, 136);
wt478 = _mm512_shuffle_f32x4(tmp16399, tmp16407, 221);
wt471 = _mm512_shuffle_f32x4(tmp16401, tmp16409, 136);
wt479 = _mm512_shuffle_f32x4(tmp16401, tmp16409, 221);
wt472 = _mm512_shuffle_f32x4(tmp16403, tmp16411, 136);
wt480 = _mm512_shuffle_f32x4(tmp16403, tmp16411, 221);
wt473 = _mm512_shuffle_f32x4(tmp16398, tmp16406, 136);
wt481 = _mm512_shuffle_f32x4(tmp16398, tmp16406, 221);
wt474 = _mm512_shuffle_f32x4(tmp16400, tmp16408, 136);
wt482 = _mm512_shuffle_f32x4(tmp16400, tmp16408, 221);
wt475 = _mm512_shuffle_f32x4(tmp16402, tmp16410, 136);
wt483 = _mm512_shuffle_f32x4(tmp16402, tmp16410, 221);
wt476 = _mm512_shuffle_f32x4(tmp16404, tmp16412, 136);
wt484 = _mm512_shuffle_f32x4(tmp16404, tmp16412, 221);
wt469 = _mm512_mul_ps(wt469, postMul47);
wt470 = _mm512_mul_ps(wt470, postMul47);
wt471 = _mm512_mul_ps(wt471, postMul47);
wt472 = _mm512_mul_ps(wt472, postMul47);
wt473 = _mm512_mul_ps(wt473, postMul47);
wt474 = _mm512_mul_ps(wt474, postMul47);
wt475 = _mm512_mul_ps(wt475, postMul47);
wt476 = _mm512_mul_ps(wt476, postMul47);
wt477 = _mm512_mul_ps(wt477, postMul47);
wt478 = _mm512_mul_ps(wt478, postMul47);
wt479 = _mm512_mul_ps(wt479, postMul47);
wt480 = _mm512_mul_ps(wt480, postMul47);
wt481 = _mm512_mul_ps(wt481, postMul47);
wt482 = _mm512_mul_ps(wt482, postMul47);
wt483 = _mm512_mul_ps(wt483, postMul47);
wt484 = _mm512_mul_ps(wt484, postMul47);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c38)+(ptrdiff_t)0, 63>>cut19, wt469);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c38)+(ptrdiff_t)0, 63>>cut19, wt470);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c38)+(ptrdiff_t)0, 63>>cut19, wt471);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c38)+(ptrdiff_t)0, 63>>cut19, wt472);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c38)+(ptrdiff_t)0, 63>>cut19, wt473);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c38)+(ptrdiff_t)0, 63>>cut19, wt474);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c38)+(ptrdiff_t)0, 63>>cut19, wt475);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c38)+(ptrdiff_t)0, 63>>cut19, wt476);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c38)+(ptrdiff_t)0, 63>>cut19, wt477);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c38)+(ptrdiff_t)0, 63>>cut19, wt478);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c38)+(ptrdiff_t)0, 63>>cut19, wt479);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c38)+(ptrdiff_t)0, 63>>cut19, wt480);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c38)+(ptrdiff_t)0, 63>>cut19, wt481);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c38)+(ptrdiff_t)0, 63>>cut19, wt482);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c38)+(ptrdiff_t)0, 63>>cut19, wt483);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c38)+(ptrdiff_t)0, 63>>cut19, wt484);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt469);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt470);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt471);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt472);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt473);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt474);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt475);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt476);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt477);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt478);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt479);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt480);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt481);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt482);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt483);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c38)+(ptrdiff_t)12288, 4032>>cut19, wt484);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt469);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt470);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt471);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt472);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt473);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt474);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt475);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt476);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt477);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt478);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt479);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt480);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt481);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt482);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt483);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c38)+(ptrdiff_t)24576, 258048>>cut19, wt484);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(1+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt469);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(2+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt470);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(3+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt471);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(4+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt472);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(5+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt473);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(6+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt474);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(7+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt475);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(8+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt476);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(9+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt477);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(10+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt478);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(11+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt479);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(12+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt480);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(13+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt481);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(14+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt482);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(15+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt483);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l59+4*cut19+24*(16+16*c38)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt484);
}
}
}
} else if (j41 < 79) {
// k142: first of the 16 consecutive indices handled for this j41 (0 when j41 == 64,
// advancing by 16 per j41 step). Presumably an output-channel index -- confirm.
ptrdiff_t k142 = 0+16*(j41-64);
// The destination is organised in groups of 6 indices. l61 is the group that index
// 1024+k142 falls into and cut21 is its position inside that group; the (size_t) cast
// makes the division and the modulo unsigned.
ptrdiff_t l61 = (size_t)(1024+k142)/6;
ptrdiff_t cut21 = (size_t)(1024+k142)%6;
switch (cut21) {
case 0:;
// Remainder 0 or 2 (the `case 0:;` label just above falls through to here): the 16 lanes
// starting at position cut21 of 6-wide group l61 spill over exactly three consecutive
// groups (6-cut21, 6 and 4+cut21 lanes respectively).
// NOTE(review): all pointer offsets below look like byte offsets (4 per float, 64 per
// __m512); that assumes arranged13/wtPtr15/biasPtr15/bnPtr15 are byte pointers, which is
// declared outside this span -- confirm.
case 2: {
// 16 values from biasPtr15 for indices k142..k142+15 (mask 65535 = all 16 lanes).
__m512 sum395 = _mm512_maskz_loadu_ps(65535, biasPtr15+5120*i49+4*k142);
// Index vectors for _mm512_permutex2var_ps: elements 0,2,..,30 (even) and 1,3,..,31 (odd)
// of the 32-float concatenation masLo25:masHi25.
__m512i pmMul30 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd30 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// 32 consecutive floats from bnPtr15: two values per index for the 16 indices.
__m512 masLo25 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k142+1280*i49));
__m512 masHi25 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k142+1280*i49)+(ptrdiff_t)64);
// De-interleave the pairs: postMul49 gets the even-position values, postAdd31 the odd ones.
__m512 postMul49 = _mm512_permutex2var_ps(masLo25, pmMul30, masHi25);
__m512 postAdd31 = _mm512_permutex2var_ps(masLo25, pmAdd30, masHi25);
// sum395 = sum395*postMul49 + postAdd31. Presumably the batch-norm scale/shift folded into
// the bias (the names suggest so) -- confirm.
sum395 = _mm512_fmadd_ps(sum395, postMul49, postAdd31);
// Row 0 (24*0) of each group receives the combined value. Groups are 12312 bytes apart, so
// with the leading lanes masked off, the +12288 store starts exactly at group l61+1 and the
// +24576 store at group l61+2:
//   63>>cut21            -> lanes 0..5-cut21          -> slots cut21..5 of group l61
//   4032>>cut21          -> lanes 6-cut21..11-cut21   -> slots 0..5 of group l61+1
//   65535-(4095>>cut21)  -> lanes 12-cut21..15        -> slots 0..3+cut21 of group l61+2
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)0, 63>>cut21, sum395);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)12288, 4032>>cut21, sum395);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)24576, 65535-(4095>>cut21), sum395);
// 32 iterations x 16 rows = 512 weight rows following row 0; 24 bytes * 513 rows = 12312,
// which matches the group stride used above.
ptrdiff_t c40 = 0;
for (; c40 != 32; ++c40) {
// Load a 16x16 tile: one vector per index k142+r (2048 bytes apart), each holding the 16
// consecutive floats at byte offset 64*c40 within that index's data.
__m512 wt501 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)0);
__m512 wt502 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)2048);
__m512 wt503 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)4096);
__m512 wt504 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)6144);
__m512 wt505 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)8192);
__m512 wt506 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)10240);
__m512 wt507 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)12288);
__m512 wt508 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)14336);
__m512 wt509 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)16384);
__m512 wt510 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)18432);
__m512 wt511 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)20480);
__m512 wt512 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)22528);
__m512 wt513 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)24576);
__m512 wt514 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)26624);
__m512 wt515 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)28672);
__m512 wt516 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c40+(ptrdiff_t)30720);
// Transpose stage 1: interleave adjacent row pairs inside every 128-bit lane.
__m512 tmp16413 = _mm512_unpacklo_ps(wt501, wt502);
__m512 tmp16414 = _mm512_unpackhi_ps(wt501, wt502);
__m512 tmp16415 = _mm512_unpacklo_ps(wt503, wt504);
__m512 tmp16416 = _mm512_unpackhi_ps(wt503, wt504);
__m512 tmp16417 = _mm512_unpacklo_ps(wt505, wt506);
__m512 tmp16418 = _mm512_unpackhi_ps(wt505, wt506);
__m512 tmp16419 = _mm512_unpacklo_ps(wt507, wt508);
__m512 tmp16420 = _mm512_unpackhi_ps(wt507, wt508);
__m512 tmp16421 = _mm512_unpacklo_ps(wt509, wt510);
__m512 tmp16422 = _mm512_unpackhi_ps(wt509, wt510);
__m512 tmp16423 = _mm512_unpacklo_ps(wt511, wt512);
__m512 tmp16424 = _mm512_unpackhi_ps(wt511, wt512);
__m512 tmp16425 = _mm512_unpacklo_ps(wt513, wt514);
__m512 tmp16426 = _mm512_unpackhi_ps(wt513, wt514);
__m512 tmp16427 = _mm512_unpacklo_ps(wt515, wt516);
__m512 tmp16428 = _mm512_unpackhi_ps(wt515, wt516);
// Stage 2: merge the pairs (68 = low halves, 238 = high halves); each 128-bit lane now
// holds one tile column for four consecutive rows.
__m512 tmp16429 = _mm512_shuffle_ps(tmp16413, tmp16415, 68);
__m512 tmp16430 = _mm512_shuffle_ps(tmp16413, tmp16415, 238);
__m512 tmp16431 = _mm512_shuffle_ps(tmp16414, tmp16416, 68);
__m512 tmp16432 = _mm512_shuffle_ps(tmp16414, tmp16416, 238);
__m512 tmp16433 = _mm512_shuffle_ps(tmp16417, tmp16419, 68);
__m512 tmp16434 = _mm512_shuffle_ps(tmp16417, tmp16419, 238);
__m512 tmp16435 = _mm512_shuffle_ps(tmp16418, tmp16420, 68);
__m512 tmp16436 = _mm512_shuffle_ps(tmp16418, tmp16420, 238);
__m512 tmp16437 = _mm512_shuffle_ps(tmp16421, tmp16423, 68);
__m512 tmp16438 = _mm512_shuffle_ps(tmp16421, tmp16423, 238);
__m512 tmp16439 = _mm512_shuffle_ps(tmp16422, tmp16424, 68);
__m512 tmp16440 = _mm512_shuffle_ps(tmp16422, tmp16424, 238);
__m512 tmp16441 = _mm512_shuffle_ps(tmp16425, tmp16427, 68);
__m512 tmp16442 = _mm512_shuffle_ps(tmp16425, tmp16427, 238);
__m512 tmp16443 = _mm512_shuffle_ps(tmp16426, tmp16428, 68);
__m512 tmp16444 = _mm512_shuffle_ps(tmp16426, tmp16428, 238);
// Stage 3: regroup whole 128-bit lanes (136 = lanes 0 and 2 of each operand, 221 = lanes 1
// and 3), bringing rows 0-7 (and separately rows 8-15) of a column into the same vector.
__m512 tmp16445 = _mm512_shuffle_f32x4(tmp16429, tmp16433, 136);
__m512 tmp16446 = _mm512_shuffle_f32x4(tmp16429, tmp16433, 221);
__m512 tmp16447 = _mm512_shuffle_f32x4(tmp16430, tmp16434, 136);
__m512 tmp16448 = _mm512_shuffle_f32x4(tmp16430, tmp16434, 221);
__m512 tmp16449 = _mm512_shuffle_f32x4(tmp16431, tmp16435, 136);
__m512 tmp16450 = _mm512_shuffle_f32x4(tmp16431, tmp16435, 221);
__m512 tmp16451 = _mm512_shuffle_f32x4(tmp16432, tmp16436, 136);
__m512 tmp16452 = _mm512_shuffle_f32x4(tmp16432, tmp16436, 221);
__m512 tmp16453 = _mm512_shuffle_f32x4(tmp16437, tmp16441, 136);
__m512 tmp16454 = _mm512_shuffle_f32x4(tmp16437, tmp16441, 221);
__m512 tmp16455 = _mm512_shuffle_f32x4(tmp16438, tmp16442, 136);
__m512 tmp16456 = _mm512_shuffle_f32x4(tmp16438, tmp16442, 221);
__m512 tmp16457 = _mm512_shuffle_f32x4(tmp16439, tmp16443, 136);
__m512 tmp16458 = _mm512_shuffle_f32x4(tmp16439, tmp16443, 221);
__m512 tmp16459 = _mm512_shuffle_f32x4(tmp16440, tmp16444, 136);
__m512 tmp16460 = _mm512_shuffle_f32x4(tmp16440, tmp16444, 221);
// Stage 4: final lane regroup. Afterwards wt501..wt516 hold tile columns 0..15 in order,
// and lane i of every vector belongs to index k142+i.
wt501 = _mm512_shuffle_f32x4(tmp16445, tmp16453, 136);
wt509 = _mm512_shuffle_f32x4(tmp16445, tmp16453, 221);
wt502 = _mm512_shuffle_f32x4(tmp16447, tmp16455, 136);
wt510 = _mm512_shuffle_f32x4(tmp16447, tmp16455, 221);
wt503 = _mm512_shuffle_f32x4(tmp16449, tmp16457, 136);
wt511 = _mm512_shuffle_f32x4(tmp16449, tmp16457, 221);
wt504 = _mm512_shuffle_f32x4(tmp16451, tmp16459, 136);
wt512 = _mm512_shuffle_f32x4(tmp16451, tmp16459, 221);
wt505 = _mm512_shuffle_f32x4(tmp16446, tmp16454, 136);
wt513 = _mm512_shuffle_f32x4(tmp16446, tmp16454, 221);
wt506 = _mm512_shuffle_f32x4(tmp16448, tmp16456, 136);
wt514 = _mm512_shuffle_f32x4(tmp16448, tmp16456, 221);
wt507 = _mm512_shuffle_f32x4(tmp16450, tmp16458, 136);
wt515 = _mm512_shuffle_f32x4(tmp16450, tmp16458, 221);
wt508 = _mm512_shuffle_f32x4(tmp16452, tmp16460, 136);
wt516 = _mm512_shuffle_f32x4(tmp16452, tmp16460, 221);
// Scale every transposed vector lane-wise by postMul49 (lane i pairs with index k142+i).
wt501 = _mm512_mul_ps(wt501, postMul49);
wt502 = _mm512_mul_ps(wt502, postMul49);
wt503 = _mm512_mul_ps(wt503, postMul49);
wt504 = _mm512_mul_ps(wt504, postMul49);
wt505 = _mm512_mul_ps(wt505, postMul49);
wt506 = _mm512_mul_ps(wt506, postMul49);
wt507 = _mm512_mul_ps(wt507, postMul49);
wt508 = _mm512_mul_ps(wt508, postMul49);
wt509 = _mm512_mul_ps(wt509, postMul49);
wt510 = _mm512_mul_ps(wt510, postMul49);
wt511 = _mm512_mul_ps(wt511, postMul49);
wt512 = _mm512_mul_ps(wt512, postMul49);
wt513 = _mm512_mul_ps(wt513, postMul49);
wt514 = _mm512_mul_ps(wt514, postMul49);
wt515 = _mm512_mul_ps(wt515, postMul49);
wt516 = _mm512_mul_ps(wt516, postMul49);
// Tile column j (0..15) goes to row 1+j+16*c40 of the groups. First the lanes that still
// fit into group l61 (slots cut21..5).
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c40)+(ptrdiff_t)0, 63>>cut21, wt501);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c40)+(ptrdiff_t)0, 63>>cut21, wt502);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c40)+(ptrdiff_t)0, 63>>cut21, wt503);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c40)+(ptrdiff_t)0, 63>>cut21, wt504);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c40)+(ptrdiff_t)0, 63>>cut21, wt505);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c40)+(ptrdiff_t)0, 63>>cut21, wt506);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c40)+(ptrdiff_t)0, 63>>cut21, wt507);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c40)+(ptrdiff_t)0, 63>>cut21, wt508);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c40)+(ptrdiff_t)0, 63>>cut21, wt509);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c40)+(ptrdiff_t)0, 63>>cut21, wt510);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c40)+(ptrdiff_t)0, 63>>cut21, wt511);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c40)+(ptrdiff_t)0, 63>>cut21, wt512);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c40)+(ptrdiff_t)0, 63>>cut21, wt513);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c40)+(ptrdiff_t)0, 63>>cut21, wt514);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c40)+(ptrdiff_t)0, 63>>cut21, wt515);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c40)+(ptrdiff_t)0, 63>>cut21, wt516);
// Next six lanes: all six slots of group l61+1.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt501);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt502);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt503);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt504);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt505);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt506);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt507);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt508);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt509);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt510);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt511);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt512);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt513);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt514);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt515);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c40)+(ptrdiff_t)12288, 4032>>cut21, wt516);
// Remaining 4+cut21 lanes: leading slots of group l61+2.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt501);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt502);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt503);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt504);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt505);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt506);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt507);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt508);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt509);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt510);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt511);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt512);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt513);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt514);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt515);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt516);
}
break;
}
// Remaining remainder. k142 is a multiple of 16, so 1024+k142 is even and its remainder
// modulo 6 can only be 0, 2 or 4; with 0 and 2 handled by the labels above, only 4 reaches
// this label. Starting at slot 4, the 16 lanes spread over four consecutive groups
// (2, 6, 6 and 2 lanes).
// NOTE(review): all pointer offsets below look like byte offsets (4 per float, 64 per
// __m512); that assumes arranged13/wtPtr15/biasPtr15/bnPtr15 are byte pointers, which is
// declared outside this span -- confirm.
default: {
// Restates the only value that can reach this label (see above).
cut21 = 4;
// 16 values from biasPtr15 for indices k142..k142+15 (mask 65535 = all 16 lanes).
__m512 sum396 = _mm512_maskz_loadu_ps(65535, biasPtr15+5120*i49+4*k142);
// Index vectors for _mm512_permutex2var_ps: elements 0,2,..,30 (even) and 1,3,..,31 (odd)
// of the 32-float concatenation masLo26:masHi26.
__m512i pmMul31 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd31 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// 32 consecutive floats from bnPtr15: two values per index for the 16 indices.
__m512 masLo26 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k142+1280*i49));
__m512 masHi26 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k142+1280*i49)+(ptrdiff_t)64);
// De-interleave the pairs: postMul50 gets the even-position values, postAdd32 the odd ones.
__m512 postMul50 = _mm512_permutex2var_ps(masLo26, pmMul31, masHi26);
__m512 postAdd32 = _mm512_permutex2var_ps(masLo26, pmAdd31, masHi26);
// sum396 = sum396*postMul50 + postAdd32. Presumably the batch-norm scale/shift folded into
// the bias (the names suggest so) -- confirm.
sum396 = _mm512_fmadd_ps(sum396, postMul50, postAdd32);
// Row 0 (24*0) of each group receives the combined value. Groups are 12312 bytes apart, so
// with the leading lanes masked off each later store starts exactly at the next group:
//   63>>4 = 3                -> lanes 0..1   -> slots 4..5 of group l61
//   4032>>4 = 252            -> lanes 2..7   -> slots 0..5 of group l61+1
//   258048>>4 = 16128        -> lanes 8..13  -> slots 0..5 of group l61+2
//   65535-(262143>>4) = 49152 -> lanes 14..15 -> slots 0..1 of group l61+3
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)0, 63>>cut21, sum396);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)12288, 4032>>cut21, sum396);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)24576, 258048>>cut21, sum396);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*0+(ptrdiff_t)36864, 65535-(262143>>cut21), sum396);
// 32 iterations x 16 rows = 512 weight rows following row 0; 24 bytes * 513 rows = 12312,
// which matches the group stride used above.
ptrdiff_t c41 = 0;
for (; c41 != 32; ++c41) {
// Load a 16x16 tile: one vector per index k142+r (2048 bytes apart), each holding the 16
// consecutive floats at byte offset 64*c41 within that index's data.
__m512 wt517 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)0);
__m512 wt518 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)2048);
__m512 wt519 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)4096);
__m512 wt520 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)6144);
__m512 wt521 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)8192);
__m512 wt522 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)10240);
__m512 wt523 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)12288);
__m512 wt524 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)14336);
__m512 wt525 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)16384);
__m512 wt526 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)18432);
__m512 wt527 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)20480);
__m512 wt528 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)22528);
__m512 wt529 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)24576);
__m512 wt530 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)26624);
__m512 wt531 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)28672);
__m512 wt532 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k142+64*c41+(ptrdiff_t)30720);
// Transpose stage 1: interleave adjacent row pairs inside every 128-bit lane.
__m512 tmp16461 = _mm512_unpacklo_ps(wt517, wt518);
__m512 tmp16462 = _mm512_unpackhi_ps(wt517, wt518);
__m512 tmp16463 = _mm512_unpacklo_ps(wt519, wt520);
__m512 tmp16464 = _mm512_unpackhi_ps(wt519, wt520);
__m512 tmp16465 = _mm512_unpacklo_ps(wt521, wt522);
__m512 tmp16466 = _mm512_unpackhi_ps(wt521, wt522);
__m512 tmp16467 = _mm512_unpacklo_ps(wt523, wt524);
__m512 tmp16468 = _mm512_unpackhi_ps(wt523, wt524);
__m512 tmp16469 = _mm512_unpacklo_ps(wt525, wt526);
__m512 tmp16470 = _mm512_unpackhi_ps(wt525, wt526);
__m512 tmp16471 = _mm512_unpacklo_ps(wt527, wt528);
__m512 tmp16472 = _mm512_unpackhi_ps(wt527, wt528);
__m512 tmp16473 = _mm512_unpacklo_ps(wt529, wt530);
__m512 tmp16474 = _mm512_unpackhi_ps(wt529, wt530);
__m512 tmp16475 = _mm512_unpacklo_ps(wt531, wt532);
__m512 tmp16476 = _mm512_unpackhi_ps(wt531, wt532);
// Stage 2: merge the pairs (68 = low halves, 238 = high halves); each 128-bit lane now
// holds one tile column for four consecutive rows.
__m512 tmp16477 = _mm512_shuffle_ps(tmp16461, tmp16463, 68);
__m512 tmp16478 = _mm512_shuffle_ps(tmp16461, tmp16463, 238);
__m512 tmp16479 = _mm512_shuffle_ps(tmp16462, tmp16464, 68);
__m512 tmp16480 = _mm512_shuffle_ps(tmp16462, tmp16464, 238);
__m512 tmp16481 = _mm512_shuffle_ps(tmp16465, tmp16467, 68);
__m512 tmp16482 = _mm512_shuffle_ps(tmp16465, tmp16467, 238);
__m512 tmp16483 = _mm512_shuffle_ps(tmp16466, tmp16468, 68);
__m512 tmp16484 = _mm512_shuffle_ps(tmp16466, tmp16468, 238);
__m512 tmp16485 = _mm512_shuffle_ps(tmp16469, tmp16471, 68);
__m512 tmp16486 = _mm512_shuffle_ps(tmp16469, tmp16471, 238);
__m512 tmp16487 = _mm512_shuffle_ps(tmp16470, tmp16472, 68);
__m512 tmp16488 = _mm512_shuffle_ps(tmp16470, tmp16472, 238);
__m512 tmp16489 = _mm512_shuffle_ps(tmp16473, tmp16475, 68);
__m512 tmp16490 = _mm512_shuffle_ps(tmp16473, tmp16475, 238);
__m512 tmp16491 = _mm512_shuffle_ps(tmp16474, tmp16476, 68);
__m512 tmp16492 = _mm512_shuffle_ps(tmp16474, tmp16476, 238);
// Stage 3: regroup whole 128-bit lanes (136 = lanes 0 and 2 of each operand, 221 = lanes 1
// and 3), bringing rows 0-7 (and separately rows 8-15) of a column into the same vector.
__m512 tmp16493 = _mm512_shuffle_f32x4(tmp16477, tmp16481, 136);
__m512 tmp16494 = _mm512_shuffle_f32x4(tmp16477, tmp16481, 221);
__m512 tmp16495 = _mm512_shuffle_f32x4(tmp16478, tmp16482, 136);
__m512 tmp16496 = _mm512_shuffle_f32x4(tmp16478, tmp16482, 221);
__m512 tmp16497 = _mm512_shuffle_f32x4(tmp16479, tmp16483, 136);
__m512 tmp16498 = _mm512_shuffle_f32x4(tmp16479, tmp16483, 221);
__m512 tmp16499 = _mm512_shuffle_f32x4(tmp16480, tmp16484, 136);
__m512 tmp16500 = _mm512_shuffle_f32x4(tmp16480, tmp16484, 221);
__m512 tmp16501 = _mm512_shuffle_f32x4(tmp16485, tmp16489, 136);
__m512 tmp16502 = _mm512_shuffle_f32x4(tmp16485, tmp16489, 221);
__m512 tmp16503 = _mm512_shuffle_f32x4(tmp16486, tmp16490, 136);
__m512 tmp16504 = _mm512_shuffle_f32x4(tmp16486, tmp16490, 221);
__m512 tmp16505 = _mm512_shuffle_f32x4(tmp16487, tmp16491, 136);
__m512 tmp16506 = _mm512_shuffle_f32x4(tmp16487, tmp16491, 221);
__m512 tmp16507 = _mm512_shuffle_f32x4(tmp16488, tmp16492, 136);
__m512 tmp16508 = _mm512_shuffle_f32x4(tmp16488, tmp16492, 221);
// Stage 4: final lane regroup. Afterwards wt517..wt532 hold tile columns 0..15 in order,
// and lane i of every vector belongs to index k142+i.
wt517 = _mm512_shuffle_f32x4(tmp16493, tmp16501, 136);
wt525 = _mm512_shuffle_f32x4(tmp16493, tmp16501, 221);
wt518 = _mm512_shuffle_f32x4(tmp16495, tmp16503, 136);
wt526 = _mm512_shuffle_f32x4(tmp16495, tmp16503, 221);
wt519 = _mm512_shuffle_f32x4(tmp16497, tmp16505, 136);
wt527 = _mm512_shuffle_f32x4(tmp16497, tmp16505, 221);
wt520 = _mm512_shuffle_f32x4(tmp16499, tmp16507, 136);
wt528 = _mm512_shuffle_f32x4(tmp16499, tmp16507, 221);
wt521 = _mm512_shuffle_f32x4(tmp16494, tmp16502, 136);
wt529 = _mm512_shuffle_f32x4(tmp16494, tmp16502, 221);
wt522 = _mm512_shuffle_f32x4(tmp16496, tmp16504, 136);
wt530 = _mm512_shuffle_f32x4(tmp16496, tmp16504, 221);
wt523 = _mm512_shuffle_f32x4(tmp16498, tmp16506, 136);
wt531 = _mm512_shuffle_f32x4(tmp16498, tmp16506, 221);
wt524 = _mm512_shuffle_f32x4(tmp16500, tmp16508, 136);
wt532 = _mm512_shuffle_f32x4(tmp16500, tmp16508, 221);
// Scale every transposed vector lane-wise by postMul50 (lane i pairs with index k142+i).
wt517 = _mm512_mul_ps(wt517, postMul50);
wt518 = _mm512_mul_ps(wt518, postMul50);
wt519 = _mm512_mul_ps(wt519, postMul50);
wt520 = _mm512_mul_ps(wt520, postMul50);
wt521 = _mm512_mul_ps(wt521, postMul50);
wt522 = _mm512_mul_ps(wt522, postMul50);
wt523 = _mm512_mul_ps(wt523, postMul50);
wt524 = _mm512_mul_ps(wt524, postMul50);
wt525 = _mm512_mul_ps(wt525, postMul50);
wt526 = _mm512_mul_ps(wt526, postMul50);
wt527 = _mm512_mul_ps(wt527, postMul50);
wt528 = _mm512_mul_ps(wt528, postMul50);
wt529 = _mm512_mul_ps(wt529, postMul50);
wt530 = _mm512_mul_ps(wt530, postMul50);
wt531 = _mm512_mul_ps(wt531, postMul50);
wt532 = _mm512_mul_ps(wt532, postMul50);
// Tile column j (0..15) goes to row 1+j+16*c41 of the groups. First lanes 0..1 into
// slots 4..5 of group l61.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c41)+(ptrdiff_t)0, 63>>cut21, wt517);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c41)+(ptrdiff_t)0, 63>>cut21, wt518);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c41)+(ptrdiff_t)0, 63>>cut21, wt519);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c41)+(ptrdiff_t)0, 63>>cut21, wt520);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c41)+(ptrdiff_t)0, 63>>cut21, wt521);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c41)+(ptrdiff_t)0, 63>>cut21, wt522);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c41)+(ptrdiff_t)0, 63>>cut21, wt523);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c41)+(ptrdiff_t)0, 63>>cut21, wt524);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c41)+(ptrdiff_t)0, 63>>cut21, wt525);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c41)+(ptrdiff_t)0, 63>>cut21, wt526);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c41)+(ptrdiff_t)0, 63>>cut21, wt527);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c41)+(ptrdiff_t)0, 63>>cut21, wt528);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c41)+(ptrdiff_t)0, 63>>cut21, wt529);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c41)+(ptrdiff_t)0, 63>>cut21, wt530);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c41)+(ptrdiff_t)0, 63>>cut21, wt531);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c41)+(ptrdiff_t)0, 63>>cut21, wt532);
// Lanes 2..7: all six slots of group l61+1.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt517);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt518);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt519);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt520);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt521);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt522);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt523);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt524);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt525);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt526);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt527);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt528);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt529);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt530);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt531);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c41)+(ptrdiff_t)12288, 4032>>cut21, wt532);
// Lanes 8..13: all six slots of group l61+2.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt517);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt518);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt519);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt520);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt521);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt522);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt523);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt524);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt525);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt526);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt527);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt528);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt529);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt530);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt531);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c41)+(ptrdiff_t)24576, 258048>>cut21, wt532);
// Lanes 14..15: slots 0..1 of group l61+3.
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(1+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt517);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(2+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt518);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(3+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt519);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(4+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt520);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(5+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt521);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(6+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt522);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(7+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt523);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(8+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt524);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(9+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt525);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(10+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt526);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(11+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt527);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(12+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt528);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(13+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt529);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(14+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt530);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(15+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt531);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l61+4*cut21+24*(16+16*c41)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt532);
}
}
}
} else {
ptrdiff_t k141 = 240;
ptrdiff_t l60 = (size_t)(1024+k141)/6;
ptrdiff_t cut20 = (size_t)(1024+k141)%6;
__m512 sum394 = _mm512_maskz_loadu_ps(65535, biasPtr15+5120*i49+4*k141);
__m512i pmMul32 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd32 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo27 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k141+1280*i49));
__m512 masHi27 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k141+1280*i49)+(ptrdiff_t)64);
__m512 postMul48 = _mm512_permutex2var_ps(masLo27, pmMul32, masHi27);
__m512 postAdd30 = _mm512_permutex2var_ps(masLo27, pmAdd32, masHi27);
sum394 = _mm512_fmadd_ps(sum394, postMul48, postAdd30);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*0+(ptrdiff_t)0, 63>>cut20, sum394);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*0+(ptrdiff_t)12288, 4032>>cut20, sum394);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*0+(ptrdiff_t)24576, 258048>>cut20, sum394);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*0+(ptrdiff_t)36864, 65535-(262143>>cut20), sum394);
ptrdiff_t c39 = 0;
for (; c39 != 32; ++c39) {
__m512 wt485 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)0);
__m512 wt486 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)2048);
__m512 wt487 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)4096);
__m512 wt488 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)6144);
__m512 wt489 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)8192);
__m512 wt490 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)10240);
__m512 wt491 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)12288);
__m512 wt492 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)14336);
__m512 wt493 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)16384);
__m512 wt494 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)18432);
__m512 wt495 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)20480);
__m512 wt496 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)22528);
__m512 wt497 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)24576);
__m512 wt498 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)26624);
__m512 wt499 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)28672);
__m512 wt500 = _mm512_maskz_loadu_ps(65535, wtPtr15+2621440*i49+2048*k141+64*c39+(ptrdiff_t)30720);
__m512 tmp16509 = _mm512_unpacklo_ps(wt485, wt486);
__m512 tmp16510 = _mm512_unpackhi_ps(wt485, wt486);
__m512 tmp16511 = _mm512_unpacklo_ps(wt487, wt488);
__m512 tmp16512 = _mm512_unpackhi_ps(wt487, wt488);
__m512 tmp16513 = _mm512_unpacklo_ps(wt489, wt490);
__m512 tmp16514 = _mm512_unpackhi_ps(wt489, wt490);
__m512 tmp16515 = _mm512_unpacklo_ps(wt491, wt492);
__m512 tmp16516 = _mm512_unpackhi_ps(wt491, wt492);
__m512 tmp16517 = _mm512_unpacklo_ps(wt493, wt494);
__m512 tmp16518 = _mm512_unpackhi_ps(wt493, wt494);
__m512 tmp16519 = _mm512_unpacklo_ps(wt495, wt496);
__m512 tmp16520 = _mm512_unpackhi_ps(wt495, wt496);
__m512 tmp16521 = _mm512_unpacklo_ps(wt497, wt498);
__m512 tmp16522 = _mm512_unpackhi_ps(wt497, wt498);
__m512 tmp16523 = _mm512_unpacklo_ps(wt499, wt500);
__m512 tmp16524 = _mm512_unpackhi_ps(wt499, wt500);
__m512 tmp16525 = _mm512_shuffle_ps(tmp16509, tmp16511, 68);
__m512 tmp16526 = _mm512_shuffle_ps(tmp16509, tmp16511, 238);
__m512 tmp16527 = _mm512_shuffle_ps(tmp16510, tmp16512, 68);
__m512 tmp16528 = _mm512_shuffle_ps(tmp16510, tmp16512, 238);
__m512 tmp16529 = _mm512_shuffle_ps(tmp16513, tmp16515, 68);
__m512 tmp16530 = _mm512_shuffle_ps(tmp16513, tmp16515, 238);
__m512 tmp16531 = _mm512_shuffle_ps(tmp16514, tmp16516, 68);
__m512 tmp16532 = _mm512_shuffle_ps(tmp16514, tmp16516, 238);
__m512 tmp16533 = _mm512_shuffle_ps(tmp16517, tmp16519, 68);
__m512 tmp16534 = _mm512_shuffle_ps(tmp16517, tmp16519, 238);
__m512 tmp16535 = _mm512_shuffle_ps(tmp16518, tmp16520, 68);
__m512 tmp16536 = _mm512_shuffle_ps(tmp16518, tmp16520, 238);
__m512 tmp16537 = _mm512_shuffle_ps(tmp16521, tmp16523, 68);
__m512 tmp16538 = _mm512_shuffle_ps(tmp16521, tmp16523, 238);
__m512 tmp16539 = _mm512_shuffle_ps(tmp16522, tmp16524, 68);
__m512 tmp16540 = _mm512_shuffle_ps(tmp16522, tmp16524, 238);
__m512 tmp16541 = _mm512_shuffle_f32x4(tmp16525, tmp16529, 136);
__m512 tmp16542 = _mm512_shuffle_f32x4(tmp16525, tmp16529, 221);
__m512 tmp16543 = _mm512_shuffle_f32x4(tmp16526, tmp16530, 136);
__m512 tmp16544 = _mm512_shuffle_f32x4(tmp16526, tmp16530, 221);
__m512 tmp16545 = _mm512_shuffle_f32x4(tmp16527, tmp16531, 136);
__m512 tmp16546 = _mm512_shuffle_f32x4(tmp16527, tmp16531, 221);
__m512 tmp16547 = _mm512_shuffle_f32x4(tmp16528, tmp16532, 136);
__m512 tmp16548 = _mm512_shuffle_f32x4(tmp16528, tmp16532, 221);
__m512 tmp16549 = _mm512_shuffle_f32x4(tmp16533, tmp16537, 136);
__m512 tmp16550 = _mm512_shuffle_f32x4(tmp16533, tmp16537, 221);
__m512 tmp16551 = _mm512_shuffle_f32x4(tmp16534, tmp16538, 136);
__m512 tmp16552 = _mm512_shuffle_f32x4(tmp16534, tmp16538, 221);
__m512 tmp16553 = _mm512_shuffle_f32x4(tmp16535, tmp16539, 136);
__m512 tmp16554 = _mm512_shuffle_f32x4(tmp16535, tmp16539, 221);
__m512 tmp16555 = _mm512_shuffle_f32x4(tmp16536, tmp16540, 136);
__m512 tmp16556 = _mm512_shuffle_f32x4(tmp16536, tmp16540, 221);
wt485 = _mm512_shuffle_f32x4(tmp16541, tmp16549, 136);
wt493 = _mm512_shuffle_f32x4(tmp16541, tmp16549, 221);
wt486 = _mm512_shuffle_f32x4(tmp16543, tmp16551, 136);
wt494 = _mm512_shuffle_f32x4(tmp16543, tmp16551, 221);
wt487 = _mm512_shuffle_f32x4(tmp16545, tmp16553, 136);
wt495 = _mm512_shuffle_f32x4(tmp16545, tmp16553, 221);
wt488 = _mm512_shuffle_f32x4(tmp16547, tmp16555, 136);
wt496 = _mm512_shuffle_f32x4(tmp16547, tmp16555, 221);
wt489 = _mm512_shuffle_f32x4(tmp16542, tmp16550, 136);
wt497 = _mm512_shuffle_f32x4(tmp16542, tmp16550, 221);
wt490 = _mm512_shuffle_f32x4(tmp16544, tmp16552, 136);
wt498 = _mm512_shuffle_f32x4(tmp16544, tmp16552, 221);
wt491 = _mm512_shuffle_f32x4(tmp16546, tmp16554, 136);
wt499 = _mm512_shuffle_f32x4(tmp16546, tmp16554, 221);
wt492 = _mm512_shuffle_f32x4(tmp16548, tmp16556, 136);
wt500 = _mm512_shuffle_f32x4(tmp16548, tmp16556, 221);
wt485 = _mm512_mul_ps(wt485, postMul48);
wt486 = _mm512_mul_ps(wt486, postMul48);
wt487 = _mm512_mul_ps(wt487, postMul48);
wt488 = _mm512_mul_ps(wt488, postMul48);
wt489 = _mm512_mul_ps(wt489, postMul48);
wt490 = _mm512_mul_ps(wt490, postMul48);
wt491 = _mm512_mul_ps(wt491, postMul48);
wt492 = _mm512_mul_ps(wt492, postMul48);
wt493 = _mm512_mul_ps(wt493, postMul48);
wt494 = _mm512_mul_ps(wt494, postMul48);
wt495 = _mm512_mul_ps(wt495, postMul48);
wt496 = _mm512_mul_ps(wt496, postMul48);
wt497 = _mm512_mul_ps(wt497, postMul48);
wt498 = _mm512_mul_ps(wt498, postMul48);
wt499 = _mm512_mul_ps(wt499, postMul48);
wt500 = _mm512_mul_ps(wt500, postMul48);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(1+16*c39)+(ptrdiff_t)0, 63>>cut20, wt485);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(2+16*c39)+(ptrdiff_t)0, 63>>cut20, wt486);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(3+16*c39)+(ptrdiff_t)0, 63>>cut20, wt487);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(4+16*c39)+(ptrdiff_t)0, 63>>cut20, wt488);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(5+16*c39)+(ptrdiff_t)0, 63>>cut20, wt489);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(6+16*c39)+(ptrdiff_t)0, 63>>cut20, wt490);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(7+16*c39)+(ptrdiff_t)0, 63>>cut20, wt491);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(8+16*c39)+(ptrdiff_t)0, 63>>cut20, wt492);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(9+16*c39)+(ptrdiff_t)0, 63>>cut20, wt493);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(10+16*c39)+(ptrdiff_t)0, 63>>cut20, wt494);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(11+16*c39)+(ptrdiff_t)0, 63>>cut20, wt495);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(12+16*c39)+(ptrdiff_t)0, 63>>cut20, wt496);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(13+16*c39)+(ptrdiff_t)0, 63>>cut20, wt497);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(14+16*c39)+(ptrdiff_t)0, 63>>cut20, wt498);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(15+16*c39)+(ptrdiff_t)0, 63>>cut20, wt499);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(16+16*c39)+(ptrdiff_t)0, 63>>cut20, wt500);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(1+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt485);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(2+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt486);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(3+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt487);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(4+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt488);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(5+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt489);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(6+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt490);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(7+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt491);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(8+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt492);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(9+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt493);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(10+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt494);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(11+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt495);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(12+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt496);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(13+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt497);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(14+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt498);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(15+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt499);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(16+16*c39)+(ptrdiff_t)12288, 4032>>cut20, wt500);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(1+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt485);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(2+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt486);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(3+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt487);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(4+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt488);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(5+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt489);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(6+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt490);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(7+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt491);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(8+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt492);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(9+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt493);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(10+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt494);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(11+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt495);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(12+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt496);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(13+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt497);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(14+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt498);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(15+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt499);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+24*(16+16*c39)+(ptrdiff_t)24576, 258048>>cut20, wt500);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(1+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt485);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(2+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt486);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(3+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt487);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(4+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt488);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(5+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt489);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(6+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt490);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(7+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt491);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(8+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt492);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(9+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt493);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(10+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt494);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(11+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt495);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(12+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt496);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(13+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt497);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(14+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt498);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(15+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt499);
_mm512_mask_storeu_ps(arranged13+2626560*i49+12312*l60+4*cut20+8*(16+16*c39)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt500);
}
}
}
}
}

/*
 * Launches ResNet50OneArrangeWts7Callee1 through ResNet50ThreaderDo1.
 *
 * team53    - threader team handed straight to ResNet50ThreaderDo1.
 * tensors79 - tensor pointer table; stored in the task's `any1` slot so the
 *             callee can retrieve it.
 *
 * The task is declared as 3-dimensional with hull extents 80 x 1 x 1.
 */
static void ResNet50OneArrangeWts7(ResNet50ThreaderTeam1* team53, char** tensors79) {
    static const int extents[3] = {80, 1, 1};
    ResNet50ThreaderTask1 job;
    int axis;
    job.nd1 = 3;
    for (axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    job.any1 = tensors79;
    job.callee1 = ResNet50OneArrangeWts7Callee1;
    ResNet50ThreaderDo1(team53, &job);
}

/*
 * Task body for the "OneArrangeDats7" step.
 *
 * pt47[0] picks a slice of 128 consecutive channels (3136 source bytes each)
 * and pt47[1] picks a band of source rows. For every channel in the slice it
 * reads every other 112-byte source row of the band, keeps the even-indexed
 * floats of that row (elements 0, 2, ..., 30, with elements past 26 read as
 * zero by the load masks), and writes one 64-byte vector per row into the
 * arranged buffer.
 *
 * Bands 0..2 cover four such rows starting at source row 8*band and use a
 * 256-byte per-channel stride in the destination. Any other band value is
 * treated as band 3, which covers two rows starting at source row 24 and uses
 * a 128-byte per-channel stride.
 *
 * task84->any1 is the tensor table: [0] source data, [1] arranged destination.
 */
static void ResNet50OneArrangeDats7Callee1(ResNet50ThreaderTask1* task84, int64_t* pt47) {
    char** bufs = task84->any1;
    const ptrdiff_t slice = pt47[0];
    const ptrdiff_t band = pt47[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];

    /* Geometry of the selected band. */
    const int fullBand = band < 3;
    const ptrdiff_t bandIdx = fullBand ? band : 3;
    const ptrdiff_t rowCount = fullBand ? 4 : 2;
    const ptrdiff_t srcBand = 112*(8*bandIdx);
    const ptrdiff_t dstBand = 131072*bandIdx;
    const ptrdiff_t dstStep = 64*rowCount;

    /* Selects elements 0,2,..,14 of the first operand and 0,2,..,14 of the second. */
    const __m512i evenLanes = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);

    const ptrdiff_t chEnd = 128*slice+128;
    for (ptrdiff_t ch = 128*slice; ch < chEnd; ++ch) {
        const char* in = src+srcBand+3136*ch;
        char* out = dst+dstBand+dstStep*ch;
        __m512 packed[4];
        ptrdiff_t row;
        /* Gather first, then scatter, so every load precedes every store. */
        for (row = 0; row < rowCount; ++row) {
            __m512 head = _mm512_maskz_loadu_ps(32767, in+224*row);
            __m512 tail = _mm512_maskz_loadu_ps(2047, in+224*row+64);
            packed[row] = _mm512_permutex2var_ps(head, evenLanes, tail);
        }
        for (row = 0; row < rowCount; ++row) {
            _mm512_storeu_ps(out+64*row, packed[row]);
        }
    }
}

/*
 * Launches ResNet50OneArrangeDats7Callee1 through ResNet50ThreaderDo1.
 *
 * team54    - threader team handed straight to ResNet50ThreaderDo1.
 * tensors81 - tensor pointer table; stored in the task's `any1` slot so the
 *             callee can retrieve it.
 *
 * The task is declared as 4-dimensional with hull extents 4 x 4 x 1 x 1.
 * The callee reads the first two coordinates: a 128-channel slice index and
 * a row-band index.
 */
static void ResNet50OneArrangeDats7(ResNet50ThreaderTeam1* team54, char** tensors81) {
    static const int extents[4] = {4, 4, 1, 1};
    ResNet50ThreaderTask1 job;
    int axis;
    job.nd1 = 4;
    for (axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    job.any1 = tensors81;
    job.callee1 = ResNet50OneArrangeDats7Callee1;
    ResNet50ThreaderDo1(team54, &job);
}

// Task body for one (w64, d17) grid point of the "OneApply7" step.
//
// pt48[0] (w64) selects a 12312-byte record in the arranged-weights buffer and
// pt48[1] (d17) selects a 131072-byte slab in the arranged-data buffer.
// task86->any1 is a pointer pair; its first entry is the tensor table:
//   [0] arranged weights, [1] arranged data, [2] destination.
//
// For the selected record the kernel seeds each accumulator from the floats at
// the start of the record, adds 512 steps of (broadcast weight * data vector),
// and writes the low 14 lanes of every accumulator to the destination.
// Records 0..212 produce 6 outputs each (24-byte weight step); record 213
// produces 2 outputs (8-byte weight step).
// NOTE(review): the seed floats are presumably the (batch-norm-folded) bias --
// confirm against the matching weight-arrangement step.
static void ResNet50OneApply7Callee1(ResNet50ThreaderTask1* task86, int64_t* pt48) {
void** pair22 = task86->any1;
char** tensors84 = pair22[0];
// e24 and g28 are fixed at 0 here, so the base offsets below all vanish.
ptrdiff_t e24 = 0;
ptrdiff_t g28 = 0;
ptrdiff_t d17 = pt48[1];
ptrdiff_t w64 = pt48[0];
char*restrict arrangedWts7 = tensors84[0]+4280320*e24+(ptrdiff_t)2626560*1*g28;
char*restrict arrangedDats7 = tensors84[1]+748160*e24+(ptrdiff_t)458752*1*g28;
char*restrict datPtr26 = tensors84[2]+(ptrdiff_t)1064960*1*g28;
ptrdiff_t ii21 = 1;
for (ptrdiff_t i51 = 0; i51 < ii21; ++i51) {
ptrdiff_t j43 = 1*d17;
// jj43 equals the starting j43, so exactly one data slab is handled per call.
ptrdiff_t jj43 = j43+0;
// Slabs 0..2: four 64-byte data vectors per step (256-byte step stride),
// written to four destination rows starting at row h51 = 4*j43.
if (j43 < 3) {
ptrdiff_t h51 = 0+((size_t)j43-0)/1*4;
switch (((size_t)j43-0)%1) {
default: {
wrap6:;
ptrdiff_t k145 = 1*w64;
// kk46 equals the starting k145, so the loop body below runs at most once
// and then returns; it is skipped entirely when w64 == 213.
ptrdiff_t kk46 = k145+0;
for (; k145 != 213; ++k145) {
// With s38 == -1 the offsets 24*s38+24.. resolve to the first six floats
// of the record: one seed value per output, shared by its four row sums.
ptrdiff_t s38 = -1;
__m512 sum397 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)24));
__m512 sum401 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)28));
__m512 sum405 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)32));
__m512 sum409 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)36));
__m512 sum413 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)40));
__m512 sum417 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)44));
__m512 sum398 = sum397;
__m512 sum399 = sum397;
__m512 sum400 = sum397;
__m512 sum402 = sum401;
__m512 sum403 = sum401;
__m512 sum404 = sum401;
__m512 sum406 = sum405;
__m512 sum407 = sum405;
__m512 sum408 = sum405;
__m512 sum410 = sum409;
__m512 sum411 = sum409;
__m512 sum412 = sum409;
__m512 sum414 = sum413;
__m512 sum415 = sum413;
__m512 sum416 = sum413;
__m512 sum418 = sum417;
__m512 sum419 = sum417;
__m512 sum420 = sum417;
// 512 accumulation steps: each step loads four data vectors once and
// reuses them for all six outputs (6 broadcast weights x 4 FMAs).
for (s38 = 0; s38 < 512; ++s38) {
__m512 dat2185 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s38+(ptrdiff_t)0);
__m512 dat2186 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s38+(ptrdiff_t)64);
__m512 dat2187 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s38+(ptrdiff_t)128);
__m512 dat2188 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s38+(ptrdiff_t)192);
__m512 wt533 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)24));
sum397 = _mm512_fmadd_ps(wt533, dat2185, sum397);
sum398 = _mm512_fmadd_ps(wt533, dat2186, sum398);
sum399 = _mm512_fmadd_ps(wt533, dat2187, sum399);
sum400 = _mm512_fmadd_ps(wt533, dat2188, sum400);
__m512 wt534 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)28));
sum401 = _mm512_fmadd_ps(wt534, dat2185, sum401);
sum402 = _mm512_fmadd_ps(wt534, dat2186, sum402);
sum403 = _mm512_fmadd_ps(wt534, dat2187, sum403);
sum404 = _mm512_fmadd_ps(wt534, dat2188, sum404);
__m512 wt535 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)32));
sum405 = _mm512_fmadd_ps(wt535, dat2185, sum405);
sum406 = _mm512_fmadd_ps(wt535, dat2186, sum406);
sum407 = _mm512_fmadd_ps(wt535, dat2187, sum407);
sum408 = _mm512_fmadd_ps(wt535, dat2188, sum408);
__m512 wt536 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)36));
sum409 = _mm512_fmadd_ps(wt536, dat2185, sum409);
sum410 = _mm512_fmadd_ps(wt536, dat2186, sum410);
sum411 = _mm512_fmadd_ps(wt536, dat2187, sum411);
sum412 = _mm512_fmadd_ps(wt536, dat2188, sum412);
__m512 wt537 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)40));
sum413 = _mm512_fmadd_ps(wt537, dat2185, sum413);
sum414 = _mm512_fmadd_ps(wt537, dat2186, sum414);
sum415 = _mm512_fmadd_ps(wt537, dat2187, sum415);
sum416 = _mm512_fmadd_ps(wt537, dat2188, sum416);
__m512 wt538 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+24*s38+(ptrdiff_t)44));
sum417 = _mm512_fmadd_ps(wt538, dat2185, sum417);
sum418 = _mm512_fmadd_ps(wt538, dat2186, sum418);
sum419 = _mm512_fmadd_ps(wt538, dat2187, sum419);
sum420 = _mm512_fmadd_ps(wt538, dat2188, sum420);
}
// Write-back: mask 16383 stores the low 14 lanes; rows are 56 bytes apart
// and consecutive outputs are 832 bytes apart in the destination.
__m512 dat2189 = sum397;
__m512 dat2190 = sum398;
__m512 dat2191 = sum399;
__m512 dat2192 = sum400;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)0, 16383, dat2189);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)56, 16383, dat2190);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)112, 16383, dat2191);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)168, 16383, dat2192);
__m512 dat2193 = sum401;
__m512 dat2194 = sum402;
__m512 dat2195 = sum403;
__m512 dat2196 = sum404;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)832, 16383, dat2193);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)888, 16383, dat2194);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)944, 16383, dat2195);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)1000, 16383, dat2196);
__m512 dat2197 = sum405;
__m512 dat2198 = sum406;
__m512 dat2199 = sum407;
__m512 dat2200 = sum408;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)1664, 16383, dat2197);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)1720, 16383, dat2198);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)1776, 16383, dat2199);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)1832, 16383, dat2200);
__m512 dat2201 = sum409;
__m512 dat2202 = sum410;
__m512 dat2203 = sum411;
__m512 dat2204 = sum412;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)2496, 16383, dat2201);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)2552, 16383, dat2202);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)2608, 16383, dat2203);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)2664, 16383, dat2204);
__m512 dat2205 = sum413;
__m512 dat2206 = sum414;
__m512 dat2207 = sum415;
__m512 dat2208 = sum416;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)3328, 16383, dat2205);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)3384, 16383, dat2206);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)3440, 16383, dat2207);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)3496, 16383, dat2208);
__m512 dat2209 = sum417;
__m512 dat2210 = sum418;
__m512 dat2211 = sum419;
__m512 dat2212 = sum420;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)4160, 16383, dat2209);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)4216, 16383, dat2210);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)4272, 16383, dat2211);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)4328, 16383, dat2212);
// Always true on the first pass because kk46 is the starting k145.
if (k145 >= kk46) return;
}
// Reached only with k145 == 213: the final record holds two outputs, with
// an 8-byte weight step and seeds at the record's first two floats.
ptrdiff_t s39 = -1;
__m512 sum421 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+8*s39+(ptrdiff_t)8));
__m512 sum425 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+8*s39+(ptrdiff_t)12));
__m512 sum422 = sum421;
__m512 sum423 = sum421;
__m512 sum424 = sum421;
__m512 sum426 = sum425;
__m512 sum427 = sum425;
__m512 sum428 = sum425;
for (s39 = 0; s39 < 512; ++s39) {
__m512 dat2213 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s39+(ptrdiff_t)0);
__m512 dat2214 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s39+(ptrdiff_t)64);
__m512 dat2215 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s39+(ptrdiff_t)128);
__m512 dat2216 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+256*s39+(ptrdiff_t)192);
__m512 wt539 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+8*s39+(ptrdiff_t)8));
sum421 = _mm512_fmadd_ps(wt539, dat2213, sum421);
sum422 = _mm512_fmadd_ps(wt539, dat2214, sum422);
sum423 = _mm512_fmadd_ps(wt539, dat2215, sum423);
sum424 = _mm512_fmadd_ps(wt539, dat2216, sum424);
__m512 wt540 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k145+8*s39+(ptrdiff_t)12));
sum425 = _mm512_fmadd_ps(wt540, dat2213, sum425);
sum426 = _mm512_fmadd_ps(wt540, dat2214, sum426);
sum427 = _mm512_fmadd_ps(wt540, dat2215, sum427);
sum428 = _mm512_fmadd_ps(wt540, dat2216, sum428);
}
__m512 dat2217 = sum421;
__m512 dat2218 = sum422;
__m512 dat2219 = sum423;
__m512 dat2220 = sum424;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)0, 16383, dat2217);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)56, 16383, dat2218);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)112, 16383, dat2219);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)168, 16383, dat2220);
__m512 dat2221 = sum425;
__m512 dat2222 = sum426;
__m512 dat2223 = sum427;
__m512 dat2224 = sum428;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)832, 16383, dat2221);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)888, 16383, dat2222);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)944, 16383, dat2223);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h51+4992*k145+(ptrdiff_t)1000, 16383, dat2224);
// jj43 == j43 here (j43 has not been advanced), so this always returns;
// the advance-and-wrap lines below are not reached with these bounds.
if (j43 >= jj43) return;
if (j43 >= 2) break;
++j43;
h51 += 4;
goto wrap6;
}
}
j43 = 3;
}
// Slab 3 (any d17 >= 3 is forced to 3): two 64-byte data vectors per step
// (128-byte step stride), written to two destination rows starting at row 12.
ptrdiff_t h52 = 12;
switch (j43) {
default: {
j43 = 3;
ptrdiff_t k146 = 1*w64;
// Same single-pass structure as above: kk47 is the starting k146.
ptrdiff_t kk47 = k146+0;
for (; k146 != 213; ++k146) {
// Six outputs, each with two row accumulators seeded from the record start.
ptrdiff_t s40 = -1;
__m512 sum429 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)24));
__m512 sum431 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)28));
__m512 sum433 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)32));
__m512 sum435 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)36));
__m512 sum437 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)40));
__m512 sum439 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)44));
__m512 sum430 = sum429;
__m512 sum432 = sum431;
__m512 sum434 = sum433;
__m512 sum436 = sum435;
__m512 sum438 = sum437;
__m512 sum440 = sum439;
for (s40 = 0; s40 < 512; ++s40) {
__m512 dat2225 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+128*s40+(ptrdiff_t)0);
__m512 dat2226 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+128*s40+(ptrdiff_t)64);
__m512 wt541 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)24));
sum429 = _mm512_fmadd_ps(wt541, dat2225, sum429);
sum430 = _mm512_fmadd_ps(wt541, dat2226, sum430);
__m512 wt542 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)28));
sum431 = _mm512_fmadd_ps(wt542, dat2225, sum431);
sum432 = _mm512_fmadd_ps(wt542, dat2226, sum432);
__m512 wt543 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)32));
sum433 = _mm512_fmadd_ps(wt543, dat2225, sum433);
sum434 = _mm512_fmadd_ps(wt543, dat2226, sum434);
__m512 wt544 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)36));
sum435 = _mm512_fmadd_ps(wt544, dat2225, sum435);
sum436 = _mm512_fmadd_ps(wt544, dat2226, sum436);
__m512 wt545 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)40));
sum437 = _mm512_fmadd_ps(wt545, dat2225, sum437);
sum438 = _mm512_fmadd_ps(wt545, dat2226, sum438);
__m512 wt546 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+24*s40+(ptrdiff_t)44));
sum439 = _mm512_fmadd_ps(wt546, dat2225, sum439);
sum440 = _mm512_fmadd_ps(wt546, dat2226, sum440);
}
// Write-back of two rows per output, same 56-byte row / 832-byte output spacing.
__m512 dat2227 = sum429;
__m512 dat2228 = sum430;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)0, 16383, dat2227);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)56, 16383, dat2228);
__m512 dat2229 = sum431;
__m512 dat2230 = sum432;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)832, 16383, dat2229);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)888, 16383, dat2230);
__m512 dat2231 = sum433;
__m512 dat2232 = sum434;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)1664, 16383, dat2231);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)1720, 16383, dat2232);
__m512 dat2233 = sum435;
__m512 dat2234 = sum436;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)2496, 16383, dat2233);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)2552, 16383, dat2234);
__m512 dat2235 = sum437;
__m512 dat2236 = sum438;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)3328, 16383, dat2235);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)3384, 16383, dat2236);
__m512 dat2237 = sum439;
__m512 dat2238 = sum440;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)4160, 16383, dat2237);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)4216, 16383, dat2238);
if (k146 >= kk47) return;
}
// k146 == 213: two-output final record for the two-row slab.
ptrdiff_t s41 = -1;
__m512 sum441 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+8*s41+(ptrdiff_t)8));
__m512 sum443 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+8*s41+(ptrdiff_t)12));
__m512 sum442 = sum441;
__m512 sum444 = sum443;
for (s41 = 0; s41 < 512; ++s41) {
__m512 dat2239 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+128*s41+(ptrdiff_t)0);
__m512 dat2240 = _mm512_loadu_ps(arrangedDats7+458752*i51+131072*j43+128*s41+(ptrdiff_t)64);
__m512 wt547 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+8*s41+(ptrdiff_t)8));
sum441 = _mm512_fmadd_ps(wt547, dat2239, sum441);
sum442 = _mm512_fmadd_ps(wt547, dat2240, sum442);
__m512 wt548 = _mm512_set1_ps(*(float*)(arrangedWts7+2626560*i51+12312*k146+8*s41+(ptrdiff_t)12));
sum443 = _mm512_fmadd_ps(wt548, dat2239, sum443);
sum444 = _mm512_fmadd_ps(wt548, dat2240, sum444);
}
__m512 dat2241 = sum441;
__m512 dat2242 = sum442;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)0, 16383, dat2241);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)56, 16383, dat2242);
__m512 dat2243 = sum443;
__m512 dat2244 = sum444;
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)832, 16383, dat2243);
_mm512_mask_storeu_ps(datPtr26+1064960*i51+56*h52+4992*k146+(ptrdiff_t)888, 16383, dat2244);
if (j43 >= jj43) return;
}
}
j43 = 4;
}
}

/*
 * Launches ResNet50OneApply7Callee1 through ResNet50ThreaderDo1.
 *
 * team55    - threader team handed straight to ResNet50ThreaderDo1.
 * tensors83 - tensor pointer table; wrapped as the first entry of a
 *             two-element pointer pair (second entry null) and stored in the
 *             task's `any1` slot.
 *
 * The task is declared as 3-dimensional with hull extents 214 x 4 x 1.
 * The callee reads the first coordinate as a weight-record index, with
 * index 213 handled as a shorter final record, and the second as a
 * data-slab index.
 */
static void ResNet50OneApply7(ResNet50ThreaderTeam1* team55, char** tensors83) {
    static const int extents[3] = {214, 4, 1};
    void* pair21[] = {tensors83, 0};
    ResNet50ThreaderTask1 job;
    int axis;
    job.nd1 = 3;
    for (axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    job.any1 = pair21;
    job.callee1 = ResNet50OneApply7Callee1;
    ResNet50ThreaderDo1(team55, &job);
}

static void ResNet50OneArrangeWts8Callee1(ResNet50ThreaderTask1* task96, int64_t* pt53) {
char** tensors94 = task96->any1;
ptrdiff_t b66 = pt53[0];
char*restrict wtPtr17 = tensors94[0]+(ptrdiff_t)3340*0+(ptrdiff_t)1048576*0;
char*restrict biasPtr17 = tensors94[1]+(ptrdiff_t)4096*0;
char*restrict bnPtr17 = tensors94[2]+(ptrdiff_t)8*1024*0;
char*restrict arranged15 = tensors94[3]+(ptrdiff_t)3424256*0+(ptrdiff_t)1052672*0;
ptrdiff_t ii22 = 1;
for (ptrdiff_t i56 = 0; i56 < ii22; ++i56) {
ptrdiff_t j48 = 2*b66;
ptrdiff_t jj45 = j48+2;
for (; j48 < jj45; ++j48) {
if (j48 < 63) {
ptrdiff_t k154 = 0+16*(j48-0);
ptrdiff_t l67 = (size_t)(0+k154)/6;
ptrdiff_t cut24 = (size_t)(0+k154)%6;
switch (cut24) {
case 0:;
case 2: {
__m512 sum482 = _mm512_maskz_loadu_ps(65535, biasPtr17+4096*i56+4*k154);
__m512i pmMul34 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd34 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo28 = _mm512_loadu_ps(bnPtr17+(ptrdiff_t)8*(k154+1024*i56));
__m512 masHi28 = _mm512_maskz_loadu_ps(65535, bnPtr17+(ptrdiff_t)8*(k154+1024*i56)+(ptrdiff_t)64);
__m512 postMul57 = _mm512_permutex2var_ps(masLo28, pmMul34, masHi28);
__m512 postAdd35 = _mm512_permutex2var_ps(masLo28, pmAdd34, masHi28);
sum482 = _mm512_fmadd_ps(sum482, postMul57, postAdd35);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)0, 63>>cut24, sum482);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)6144, 4032>>cut24, sum482);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)12288, 65535-(4095>>cut24), sum482);
ptrdiff_t c45 = 0;
for (; c45 != 16; ++c45) {
__m512 wt569 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)0);
__m512 wt570 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)1024);
__m512 wt571 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)2048);
__m512 wt572 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)3072);
__m512 wt573 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)4096);
__m512 wt574 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)5120);
__m512 wt575 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)6144);
__m512 wt576 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)7168);
__m512 wt577 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)8192);
__m512 wt578 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)9216);
__m512 wt579 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)10240);
__m512 wt580 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)11264);
__m512 wt581 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)12288);
__m512 wt582 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)13312);
__m512 wt583 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)14336);
__m512 wt584 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c45+(ptrdiff_t)15360);
__m512 tmp17773 = _mm512_unpacklo_ps(wt569, wt570);
__m512 tmp17774 = _mm512_unpackhi_ps(wt569, wt570);
__m512 tmp17775 = _mm512_unpacklo_ps(wt571, wt572);
__m512 tmp17776 = _mm512_unpackhi_ps(wt571, wt572);
__m512 tmp17777 = _mm512_unpacklo_ps(wt573, wt574);
__m512 tmp17778 = _mm512_unpackhi_ps(wt573, wt574);
__m512 tmp17779 = _mm512_unpacklo_ps(wt575, wt576);
__m512 tmp17780 = _mm512_unpackhi_ps(wt575, wt576);
__m512 tmp17781 = _mm512_unpacklo_ps(wt577, wt578);
__m512 tmp17782 = _mm512_unpackhi_ps(wt577, wt578);
__m512 tmp17783 = _mm512_unpacklo_ps(wt579, wt580);
__m512 tmp17784 = _mm512_unpackhi_ps(wt579, wt580);
__m512 tmp17785 = _mm512_unpacklo_ps(wt581, wt582);
__m512 tmp17786 = _mm512_unpackhi_ps(wt581, wt582);
__m512 tmp17787 = _mm512_unpacklo_ps(wt583, wt584);
__m512 tmp17788 = _mm512_unpackhi_ps(wt583, wt584);
__m512 tmp17789 = _mm512_shuffle_ps(tmp17773, tmp17775, 68);
__m512 tmp17790 = _mm512_shuffle_ps(tmp17773, tmp17775, 238);
__m512 tmp17791 = _mm512_shuffle_ps(tmp17774, tmp17776, 68);
__m512 tmp17792 = _mm512_shuffle_ps(tmp17774, tmp17776, 238);
__m512 tmp17793 = _mm512_shuffle_ps(tmp17777, tmp17779, 68);
__m512 tmp17794 = _mm512_shuffle_ps(tmp17777, tmp17779, 238);
__m512 tmp17795 = _mm512_shuffle_ps(tmp17778, tmp17780, 68);
__m512 tmp17796 = _mm512_shuffle_ps(tmp17778, tmp17780, 238);
__m512 tmp17797 = _mm512_shuffle_ps(tmp17781, tmp17783, 68);
__m512 tmp17798 = _mm512_shuffle_ps(tmp17781, tmp17783, 238);
__m512 tmp17799 = _mm512_shuffle_ps(tmp17782, tmp17784, 68);
__m512 tmp17800 = _mm512_shuffle_ps(tmp17782, tmp17784, 238);
__m512 tmp17801 = _mm512_shuffle_ps(tmp17785, tmp17787, 68);
__m512 tmp17802 = _mm512_shuffle_ps(tmp17785, tmp17787, 238);
__m512 tmp17803 = _mm512_shuffle_ps(tmp17786, tmp17788, 68);
__m512 tmp17804 = _mm512_shuffle_ps(tmp17786, tmp17788, 238);
__m512 tmp17805 = _mm512_shuffle_f32x4(tmp17789, tmp17793, 136);
__m512 tmp17806 = _mm512_shuffle_f32x4(tmp17789, tmp17793, 221);
__m512 tmp17807 = _mm512_shuffle_f32x4(tmp17790, tmp17794, 136);
__m512 tmp17808 = _mm512_shuffle_f32x4(tmp17790, tmp17794, 221);
__m512 tmp17809 = _mm512_shuffle_f32x4(tmp17791, tmp17795, 136);
__m512 tmp17810 = _mm512_shuffle_f32x4(tmp17791, tmp17795, 221);
__m512 tmp17811 = _mm512_shuffle_f32x4(tmp17792, tmp17796, 136);
__m512 tmp17812 = _mm512_shuffle_f32x4(tmp17792, tmp17796, 221);
__m512 tmp17813 = _mm512_shuffle_f32x4(tmp17797, tmp17801, 136);
__m512 tmp17814 = _mm512_shuffle_f32x4(tmp17797, tmp17801, 221);
__m512 tmp17815 = _mm512_shuffle_f32x4(tmp17798, tmp17802, 136);
__m512 tmp17816 = _mm512_shuffle_f32x4(tmp17798, tmp17802, 221);
__m512 tmp17817 = _mm512_shuffle_f32x4(tmp17799, tmp17803, 136);
__m512 tmp17818 = _mm512_shuffle_f32x4(tmp17799, tmp17803, 221);
__m512 tmp17819 = _mm512_shuffle_f32x4(tmp17800, tmp17804, 136);
__m512 tmp17820 = _mm512_shuffle_f32x4(tmp17800, tmp17804, 221);
wt569 = _mm512_shuffle_f32x4(tmp17805, tmp17813, 136);
wt577 = _mm512_shuffle_f32x4(tmp17805, tmp17813, 221);
wt570 = _mm512_shuffle_f32x4(tmp17807, tmp17815, 136);
wt578 = _mm512_shuffle_f32x4(tmp17807, tmp17815, 221);
wt571 = _mm512_shuffle_f32x4(tmp17809, tmp17817, 136);
wt579 = _mm512_shuffle_f32x4(tmp17809, tmp17817, 221);
wt572 = _mm512_shuffle_f32x4(tmp17811, tmp17819, 136);
wt580 = _mm512_shuffle_f32x4(tmp17811, tmp17819, 221);
wt573 = _mm512_shuffle_f32x4(tmp17806, tmp17814, 136);
wt581 = _mm512_shuffle_f32x4(tmp17806, tmp17814, 221);
wt574 = _mm512_shuffle_f32x4(tmp17808, tmp17816, 136);
wt582 = _mm512_shuffle_f32x4(tmp17808, tmp17816, 221);
wt575 = _mm512_shuffle_f32x4(tmp17810, tmp17818, 136);
wt583 = _mm512_shuffle_f32x4(tmp17810, tmp17818, 221);
wt576 = _mm512_shuffle_f32x4(tmp17812, tmp17820, 136);
wt584 = _mm512_shuffle_f32x4(tmp17812, tmp17820, 221);
wt569 = _mm512_mul_ps(wt569, postMul57);
wt570 = _mm512_mul_ps(wt570, postMul57);
wt571 = _mm512_mul_ps(wt571, postMul57);
wt572 = _mm512_mul_ps(wt572, postMul57);
wt573 = _mm512_mul_ps(wt573, postMul57);
wt574 = _mm512_mul_ps(wt574, postMul57);
wt575 = _mm512_mul_ps(wt575, postMul57);
wt576 = _mm512_mul_ps(wt576, postMul57);
wt577 = _mm512_mul_ps(wt577, postMul57);
wt578 = _mm512_mul_ps(wt578, postMul57);
wt579 = _mm512_mul_ps(wt579, postMul57);
wt580 = _mm512_mul_ps(wt580, postMul57);
wt581 = _mm512_mul_ps(wt581, postMul57);
wt582 = _mm512_mul_ps(wt582, postMul57);
wt583 = _mm512_mul_ps(wt583, postMul57);
wt584 = _mm512_mul_ps(wt584, postMul57);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c45)+(ptrdiff_t)0, 63>>cut24, wt569);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c45)+(ptrdiff_t)0, 63>>cut24, wt570);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c45)+(ptrdiff_t)0, 63>>cut24, wt571);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c45)+(ptrdiff_t)0, 63>>cut24, wt572);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c45)+(ptrdiff_t)0, 63>>cut24, wt573);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c45)+(ptrdiff_t)0, 63>>cut24, wt574);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c45)+(ptrdiff_t)0, 63>>cut24, wt575);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c45)+(ptrdiff_t)0, 63>>cut24, wt576);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c45)+(ptrdiff_t)0, 63>>cut24, wt577);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c45)+(ptrdiff_t)0, 63>>cut24, wt578);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c45)+(ptrdiff_t)0, 63>>cut24, wt579);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c45)+(ptrdiff_t)0, 63>>cut24, wt580);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c45)+(ptrdiff_t)0, 63>>cut24, wt581);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c45)+(ptrdiff_t)0, 63>>cut24, wt582);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c45)+(ptrdiff_t)0, 63>>cut24, wt583);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c45)+(ptrdiff_t)0, 63>>cut24, wt584);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt569);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt570);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt571);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt572);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt573);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt574);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt575);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt576);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt577);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt578);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt579);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt580);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt581);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt582);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt583);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c45)+(ptrdiff_t)6144, 4032>>cut24, wt584);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt569);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt570);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt571);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt572);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt573);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt574);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt575);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt576);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt577);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt578);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt579);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt580);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt581);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt582);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt583);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c45)+(ptrdiff_t)12288, 65535-(4095>>cut24), wt584);
}
break;
}
default: {
cut24 = 4;
__m512 sum483 = _mm512_maskz_loadu_ps(65535, biasPtr17+4096*i56+4*k154);
__m512i pmMul35 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd35 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo29 = _mm512_loadu_ps(bnPtr17+(ptrdiff_t)8*(k154+1024*i56));
__m512 masHi29 = _mm512_maskz_loadu_ps(65535, bnPtr17+(ptrdiff_t)8*(k154+1024*i56)+(ptrdiff_t)64);
__m512 postMul58 = _mm512_permutex2var_ps(masLo29, pmMul35, masHi29);
__m512 postAdd36 = _mm512_permutex2var_ps(masLo29, pmAdd35, masHi29);
sum483 = _mm512_fmadd_ps(sum483, postMul58, postAdd36);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)0, 63>>cut24, sum483);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)6144, 4032>>cut24, sum483);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)12288, 258048>>cut24, sum483);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*0+(ptrdiff_t)18432, 65535-(262143>>cut24), sum483);
ptrdiff_t c46 = 0;
for (; c46 != 16; ++c46) {
__m512 wt585 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)0);
__m512 wt586 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)1024);
__m512 wt587 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)2048);
__m512 wt588 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)3072);
__m512 wt589 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)4096);
__m512 wt590 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)5120);
__m512 wt591 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)6144);
__m512 wt592 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)7168);
__m512 wt593 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)8192);
__m512 wt594 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)9216);
__m512 wt595 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)10240);
__m512 wt596 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)11264);
__m512 wt597 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)12288);
__m512 wt598 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)13312);
__m512 wt599 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)14336);
__m512 wt600 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k154+64*c46+(ptrdiff_t)15360);
__m512 tmp17821 = _mm512_unpacklo_ps(wt585, wt586);
__m512 tmp17822 = _mm512_unpackhi_ps(wt585, wt586);
__m512 tmp17823 = _mm512_unpacklo_ps(wt587, wt588);
__m512 tmp17824 = _mm512_unpackhi_ps(wt587, wt588);
__m512 tmp17825 = _mm512_unpacklo_ps(wt589, wt590);
__m512 tmp17826 = _mm512_unpackhi_ps(wt589, wt590);
__m512 tmp17827 = _mm512_unpacklo_ps(wt591, wt592);
__m512 tmp17828 = _mm512_unpackhi_ps(wt591, wt592);
__m512 tmp17829 = _mm512_unpacklo_ps(wt593, wt594);
__m512 tmp17830 = _mm512_unpackhi_ps(wt593, wt594);
__m512 tmp17831 = _mm512_unpacklo_ps(wt595, wt596);
__m512 tmp17832 = _mm512_unpackhi_ps(wt595, wt596);
__m512 tmp17833 = _mm512_unpacklo_ps(wt597, wt598);
__m512 tmp17834 = _mm512_unpackhi_ps(wt597, wt598);
__m512 tmp17835 = _mm512_unpacklo_ps(wt599, wt600);
__m512 tmp17836 = _mm512_unpackhi_ps(wt599, wt600);
__m512 tmp17837 = _mm512_shuffle_ps(tmp17821, tmp17823, 68);
__m512 tmp17838 = _mm512_shuffle_ps(tmp17821, tmp17823, 238);
__m512 tmp17839 = _mm512_shuffle_ps(tmp17822, tmp17824, 68);
__m512 tmp17840 = _mm512_shuffle_ps(tmp17822, tmp17824, 238);
__m512 tmp17841 = _mm512_shuffle_ps(tmp17825, tmp17827, 68);
__m512 tmp17842 = _mm512_shuffle_ps(tmp17825, tmp17827, 238);
__m512 tmp17843 = _mm512_shuffle_ps(tmp17826, tmp17828, 68);
__m512 tmp17844 = _mm512_shuffle_ps(tmp17826, tmp17828, 238);
__m512 tmp17845 = _mm512_shuffle_ps(tmp17829, tmp17831, 68);
__m512 tmp17846 = _mm512_shuffle_ps(tmp17829, tmp17831, 238);
__m512 tmp17847 = _mm512_shuffle_ps(tmp17830, tmp17832, 68);
__m512 tmp17848 = _mm512_shuffle_ps(tmp17830, tmp17832, 238);
__m512 tmp17849 = _mm512_shuffle_ps(tmp17833, tmp17835, 68);
__m512 tmp17850 = _mm512_shuffle_ps(tmp17833, tmp17835, 238);
__m512 tmp17851 = _mm512_shuffle_ps(tmp17834, tmp17836, 68);
__m512 tmp17852 = _mm512_shuffle_ps(tmp17834, tmp17836, 238);
__m512 tmp17853 = _mm512_shuffle_f32x4(tmp17837, tmp17841, 136);
__m512 tmp17854 = _mm512_shuffle_f32x4(tmp17837, tmp17841, 221);
__m512 tmp17855 = _mm512_shuffle_f32x4(tmp17838, tmp17842, 136);
__m512 tmp17856 = _mm512_shuffle_f32x4(tmp17838, tmp17842, 221);
__m512 tmp17857 = _mm512_shuffle_f32x4(tmp17839, tmp17843, 136);
__m512 tmp17858 = _mm512_shuffle_f32x4(tmp17839, tmp17843, 221);
__m512 tmp17859 = _mm512_shuffle_f32x4(tmp17840, tmp17844, 136);
__m512 tmp17860 = _mm512_shuffle_f32x4(tmp17840, tmp17844, 221);
__m512 tmp17861 = _mm512_shuffle_f32x4(tmp17845, tmp17849, 136);
__m512 tmp17862 = _mm512_shuffle_f32x4(tmp17845, tmp17849, 221);
__m512 tmp17863 = _mm512_shuffle_f32x4(tmp17846, tmp17850, 136);
__m512 tmp17864 = _mm512_shuffle_f32x4(tmp17846, tmp17850, 221);
__m512 tmp17865 = _mm512_shuffle_f32x4(tmp17847, tmp17851, 136);
__m512 tmp17866 = _mm512_shuffle_f32x4(tmp17847, tmp17851, 221);
__m512 tmp17867 = _mm512_shuffle_f32x4(tmp17848, tmp17852, 136);
__m512 tmp17868 = _mm512_shuffle_f32x4(tmp17848, tmp17852, 221);
wt585 = _mm512_shuffle_f32x4(tmp17853, tmp17861, 136);
wt593 = _mm512_shuffle_f32x4(tmp17853, tmp17861, 221);
wt586 = _mm512_shuffle_f32x4(tmp17855, tmp17863, 136);
wt594 = _mm512_shuffle_f32x4(tmp17855, tmp17863, 221);
wt587 = _mm512_shuffle_f32x4(tmp17857, tmp17865, 136);
wt595 = _mm512_shuffle_f32x4(tmp17857, tmp17865, 221);
wt588 = _mm512_shuffle_f32x4(tmp17859, tmp17867, 136);
wt596 = _mm512_shuffle_f32x4(tmp17859, tmp17867, 221);
wt589 = _mm512_shuffle_f32x4(tmp17854, tmp17862, 136);
wt597 = _mm512_shuffle_f32x4(tmp17854, tmp17862, 221);
wt590 = _mm512_shuffle_f32x4(tmp17856, tmp17864, 136);
wt598 = _mm512_shuffle_f32x4(tmp17856, tmp17864, 221);
wt591 = _mm512_shuffle_f32x4(tmp17858, tmp17866, 136);
wt599 = _mm512_shuffle_f32x4(tmp17858, tmp17866, 221);
wt592 = _mm512_shuffle_f32x4(tmp17860, tmp17868, 136);
wt600 = _mm512_shuffle_f32x4(tmp17860, tmp17868, 221);
wt585 = _mm512_mul_ps(wt585, postMul58);
wt586 = _mm512_mul_ps(wt586, postMul58);
wt587 = _mm512_mul_ps(wt587, postMul58);
wt588 = _mm512_mul_ps(wt588, postMul58);
wt589 = _mm512_mul_ps(wt589, postMul58);
wt590 = _mm512_mul_ps(wt590, postMul58);
wt591 = _mm512_mul_ps(wt591, postMul58);
wt592 = _mm512_mul_ps(wt592, postMul58);
wt593 = _mm512_mul_ps(wt593, postMul58);
wt594 = _mm512_mul_ps(wt594, postMul58);
wt595 = _mm512_mul_ps(wt595, postMul58);
wt596 = _mm512_mul_ps(wt596, postMul58);
wt597 = _mm512_mul_ps(wt597, postMul58);
wt598 = _mm512_mul_ps(wt598, postMul58);
wt599 = _mm512_mul_ps(wt599, postMul58);
wt600 = _mm512_mul_ps(wt600, postMul58);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c46)+(ptrdiff_t)0, 63>>cut24, wt585);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c46)+(ptrdiff_t)0, 63>>cut24, wt586);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c46)+(ptrdiff_t)0, 63>>cut24, wt587);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c46)+(ptrdiff_t)0, 63>>cut24, wt588);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c46)+(ptrdiff_t)0, 63>>cut24, wt589);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c46)+(ptrdiff_t)0, 63>>cut24, wt590);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c46)+(ptrdiff_t)0, 63>>cut24, wt591);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c46)+(ptrdiff_t)0, 63>>cut24, wt592);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c46)+(ptrdiff_t)0, 63>>cut24, wt593);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c46)+(ptrdiff_t)0, 63>>cut24, wt594);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c46)+(ptrdiff_t)0, 63>>cut24, wt595);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c46)+(ptrdiff_t)0, 63>>cut24, wt596);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c46)+(ptrdiff_t)0, 63>>cut24, wt597);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c46)+(ptrdiff_t)0, 63>>cut24, wt598);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c46)+(ptrdiff_t)0, 63>>cut24, wt599);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c46)+(ptrdiff_t)0, 63>>cut24, wt600);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt585);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt586);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt587);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt588);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt589);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt590);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt591);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt592);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt593);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt594);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt595);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt596);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt597);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt598);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt599);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c46)+(ptrdiff_t)6144, 4032>>cut24, wt600);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt585);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt586);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt587);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt588);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt589);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt590);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt591);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt592);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt593);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt594);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt595);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt596);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt597);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt598);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt599);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c46)+(ptrdiff_t)12288, 258048>>cut24, wt600);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(1+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt585);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(2+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt586);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(3+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt587);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(4+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt588);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(5+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt589);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(6+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt590);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(7+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt591);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(8+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt592);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(9+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt593);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(10+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt594);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(11+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt595);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(12+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt596);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(13+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt597);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(14+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt598);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(15+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt599);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l67+4*cut24+24*(16+16*c46)+(ptrdiff_t)18432, 65535-(262143>>cut24), wt600);
}
}
}
} else {
ptrdiff_t k153 = 1008;
ptrdiff_t l66 = (size_t)(0+k153)/6;
ptrdiff_t cut23 = (size_t)(0+k153)%6;
__m512 sum481 = _mm512_maskz_loadu_ps(65535, biasPtr17+4096*i56+4*k153);
__m512i pmMul36 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd36 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo30 = _mm512_loadu_ps(bnPtr17+(ptrdiff_t)8*(k153+1024*i56));
__m512 masHi30 = _mm512_maskz_loadu_ps(65535, bnPtr17+(ptrdiff_t)8*(k153+1024*i56)+(ptrdiff_t)64);
__m512 postMul56 = _mm512_permutex2var_ps(masLo30, pmMul36, masHi30);
__m512 postAdd34 = _mm512_permutex2var_ps(masLo30, pmAdd36, masHi30);
sum481 = _mm512_fmadd_ps(sum481, postMul56, postAdd34);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*0+(ptrdiff_t)0, 63>>cut23, sum481);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*0+(ptrdiff_t)6144, 4032>>cut23, sum481);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*0+(ptrdiff_t)12288, 65535-(4095>>cut23), sum481);
ptrdiff_t c44 = 0;
for (; c44 != 16; ++c44) {
__m512 wt553 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)0);
__m512 wt554 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)1024);
__m512 wt555 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)2048);
__m512 wt556 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)3072);
__m512 wt557 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)4096);
__m512 wt558 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)5120);
__m512 wt559 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)6144);
__m512 wt560 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)7168);
__m512 wt561 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)8192);
__m512 wt562 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)9216);
__m512 wt563 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)10240);
__m512 wt564 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)11264);
__m512 wt565 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)12288);
__m512 wt566 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)13312);
__m512 wt567 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)14336);
__m512 wt568 = _mm512_maskz_loadu_ps(65535, wtPtr17+1048576*i56+1024*k153+64*c44+(ptrdiff_t)15360);
__m512 tmp17869 = _mm512_unpacklo_ps(wt553, wt554);
__m512 tmp17870 = _mm512_unpackhi_ps(wt553, wt554);
__m512 tmp17871 = _mm512_unpacklo_ps(wt555, wt556);
__m512 tmp17872 = _mm512_unpackhi_ps(wt555, wt556);
__m512 tmp17873 = _mm512_unpacklo_ps(wt557, wt558);
__m512 tmp17874 = _mm512_unpackhi_ps(wt557, wt558);
__m512 tmp17875 = _mm512_unpacklo_ps(wt559, wt560);
__m512 tmp17876 = _mm512_unpackhi_ps(wt559, wt560);
__m512 tmp17877 = _mm512_unpacklo_ps(wt561, wt562);
__m512 tmp17878 = _mm512_unpackhi_ps(wt561, wt562);
__m512 tmp17879 = _mm512_unpacklo_ps(wt563, wt564);
__m512 tmp17880 = _mm512_unpackhi_ps(wt563, wt564);
__m512 tmp17881 = _mm512_unpacklo_ps(wt565, wt566);
__m512 tmp17882 = _mm512_unpackhi_ps(wt565, wt566);
__m512 tmp17883 = _mm512_unpacklo_ps(wt567, wt568);
__m512 tmp17884 = _mm512_unpackhi_ps(wt567, wt568);
__m512 tmp17885 = _mm512_shuffle_ps(tmp17869, tmp17871, 68);
__m512 tmp17886 = _mm512_shuffle_ps(tmp17869, tmp17871, 238);
__m512 tmp17887 = _mm512_shuffle_ps(tmp17870, tmp17872, 68);
__m512 tmp17888 = _mm512_shuffle_ps(tmp17870, tmp17872, 238);
__m512 tmp17889 = _mm512_shuffle_ps(tmp17873, tmp17875, 68);
__m512 tmp17890 = _mm512_shuffle_ps(tmp17873, tmp17875, 238);
__m512 tmp17891 = _mm512_shuffle_ps(tmp17874, tmp17876, 68);
__m512 tmp17892 = _mm512_shuffle_ps(tmp17874, tmp17876, 238);
__m512 tmp17893 = _mm512_shuffle_ps(tmp17877, tmp17879, 68);
__m512 tmp17894 = _mm512_shuffle_ps(tmp17877, tmp17879, 238);
__m512 tmp17895 = _mm512_shuffle_ps(tmp17878, tmp17880, 68);
__m512 tmp17896 = _mm512_shuffle_ps(tmp17878, tmp17880, 238);
__m512 tmp17897 = _mm512_shuffle_ps(tmp17881, tmp17883, 68);
__m512 tmp17898 = _mm512_shuffle_ps(tmp17881, tmp17883, 238);
__m512 tmp17899 = _mm512_shuffle_ps(tmp17882, tmp17884, 68);
__m512 tmp17900 = _mm512_shuffle_ps(tmp17882, tmp17884, 238);
__m512 tmp17901 = _mm512_shuffle_f32x4(tmp17885, tmp17889, 136);
__m512 tmp17902 = _mm512_shuffle_f32x4(tmp17885, tmp17889, 221);
__m512 tmp17903 = _mm512_shuffle_f32x4(tmp17886, tmp17890, 136);
__m512 tmp17904 = _mm512_shuffle_f32x4(tmp17886, tmp17890, 221);
__m512 tmp17905 = _mm512_shuffle_f32x4(tmp17887, tmp17891, 136);
__m512 tmp17906 = _mm512_shuffle_f32x4(tmp17887, tmp17891, 221);
__m512 tmp17907 = _mm512_shuffle_f32x4(tmp17888, tmp17892, 136);
__m512 tmp17908 = _mm512_shuffle_f32x4(tmp17888, tmp17892, 221);
__m512 tmp17909 = _mm512_shuffle_f32x4(tmp17893, tmp17897, 136);
__m512 tmp17910 = _mm512_shuffle_f32x4(tmp17893, tmp17897, 221);
__m512 tmp17911 = _mm512_shuffle_f32x4(tmp17894, tmp17898, 136);
__m512 tmp17912 = _mm512_shuffle_f32x4(tmp17894, tmp17898, 221);
__m512 tmp17913 = _mm512_shuffle_f32x4(tmp17895, tmp17899, 136);
__m512 tmp17914 = _mm512_shuffle_f32x4(tmp17895, tmp17899, 221);
__m512 tmp17915 = _mm512_shuffle_f32x4(tmp17896, tmp17900, 136);
__m512 tmp17916 = _mm512_shuffle_f32x4(tmp17896, tmp17900, 221);
wt553 = _mm512_shuffle_f32x4(tmp17901, tmp17909, 136);
wt561 = _mm512_shuffle_f32x4(tmp17901, tmp17909, 221);
wt554 = _mm512_shuffle_f32x4(tmp17903, tmp17911, 136);
wt562 = _mm512_shuffle_f32x4(tmp17903, tmp17911, 221);
wt555 = _mm512_shuffle_f32x4(tmp17905, tmp17913, 136);
wt563 = _mm512_shuffle_f32x4(tmp17905, tmp17913, 221);
wt556 = _mm512_shuffle_f32x4(tmp17907, tmp17915, 136);
wt564 = _mm512_shuffle_f32x4(tmp17907, tmp17915, 221);
wt557 = _mm512_shuffle_f32x4(tmp17902, tmp17910, 136);
wt565 = _mm512_shuffle_f32x4(tmp17902, tmp17910, 221);
wt558 = _mm512_shuffle_f32x4(tmp17904, tmp17912, 136);
wt566 = _mm512_shuffle_f32x4(tmp17904, tmp17912, 221);
wt559 = _mm512_shuffle_f32x4(tmp17906, tmp17914, 136);
wt567 = _mm512_shuffle_f32x4(tmp17906, tmp17914, 221);
wt560 = _mm512_shuffle_f32x4(tmp17908, tmp17916, 136);
wt568 = _mm512_shuffle_f32x4(tmp17908, tmp17916, 221);
wt553 = _mm512_mul_ps(wt553, postMul56);
wt554 = _mm512_mul_ps(wt554, postMul56);
wt555 = _mm512_mul_ps(wt555, postMul56);
wt556 = _mm512_mul_ps(wt556, postMul56);
wt557 = _mm512_mul_ps(wt557, postMul56);
wt558 = _mm512_mul_ps(wt558, postMul56);
wt559 = _mm512_mul_ps(wt559, postMul56);
wt560 = _mm512_mul_ps(wt560, postMul56);
wt561 = _mm512_mul_ps(wt561, postMul56);
wt562 = _mm512_mul_ps(wt562, postMul56);
wt563 = _mm512_mul_ps(wt563, postMul56);
wt564 = _mm512_mul_ps(wt564, postMul56);
wt565 = _mm512_mul_ps(wt565, postMul56);
wt566 = _mm512_mul_ps(wt566, postMul56);
wt567 = _mm512_mul_ps(wt567, postMul56);
wt568 = _mm512_mul_ps(wt568, postMul56);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(1+16*c44)+(ptrdiff_t)0, 63>>cut23, wt553);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(2+16*c44)+(ptrdiff_t)0, 63>>cut23, wt554);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(3+16*c44)+(ptrdiff_t)0, 63>>cut23, wt555);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(4+16*c44)+(ptrdiff_t)0, 63>>cut23, wt556);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(5+16*c44)+(ptrdiff_t)0, 63>>cut23, wt557);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(6+16*c44)+(ptrdiff_t)0, 63>>cut23, wt558);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(7+16*c44)+(ptrdiff_t)0, 63>>cut23, wt559);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(8+16*c44)+(ptrdiff_t)0, 63>>cut23, wt560);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(9+16*c44)+(ptrdiff_t)0, 63>>cut23, wt561);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(10+16*c44)+(ptrdiff_t)0, 63>>cut23, wt562);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(11+16*c44)+(ptrdiff_t)0, 63>>cut23, wt563);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(12+16*c44)+(ptrdiff_t)0, 63>>cut23, wt564);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(13+16*c44)+(ptrdiff_t)0, 63>>cut23, wt565);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(14+16*c44)+(ptrdiff_t)0, 63>>cut23, wt566);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(15+16*c44)+(ptrdiff_t)0, 63>>cut23, wt567);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(16+16*c44)+(ptrdiff_t)0, 63>>cut23, wt568);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(1+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt553);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(2+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt554);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(3+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt555);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(4+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt556);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(5+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt557);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(6+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt558);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(7+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt559);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(8+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt560);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(9+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt561);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(10+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt562);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(11+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt563);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(12+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt564);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(13+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt565);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(14+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt566);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(15+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt567);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+24*(16+16*c44)+(ptrdiff_t)6144, 4032>>cut23, wt568);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(1+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt553);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(2+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt554);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(3+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt555);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(4+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt556);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(5+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt557);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(6+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt558);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(7+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt559);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(8+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt560);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(9+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt561);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(10+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt562);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(11+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt563);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(12+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt564);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(13+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt565);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(14+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt566);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(15+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt567);
_mm512_mask_storeu_ps(arranged15+1052672*i56+6168*l66+4*cut23+16*(16+16*c44)+(ptrdiff_t)12288, 65535-(4095>>cut23), wt568);
}
}
}
}
}

static void ResNet50OneArrangeWts8(ResNet50ThreaderTeam1* team60, char** tensors93) {
    // Dispatches ResNet50OneArrangeWts8Callee1 over a 3-dimensional
    // 32 x 1 x 1 hull on the given team, passing the tensor array
    // through to the callee unchanged.
    static const int extents[3] = {32, 1, 1};
    ResNet50ThreaderTask1 job;
    job.nd1 = 3;
    for (int axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    job.any1 = tensors93;
    job.callee1 = ResNet50OneArrangeWts8Callee1;
    ResNet50ThreaderDo1(team60, &job);
}

static void ResNet50OneArrangeDats8Callee1(ResNet50ThreaderTask1* task98, int64_t* pt54) {
    // Copies one tile of the source tensor (tensors[0]) into the arranged
    // buffer (tensors[1]).
    //   pt54[0] picks a run of 128 consecutive source rows (832 bytes apart).
    //   pt54[1] picks the column block inside each row: every value other
    //           than 3 copies a full 256-byte block (four 16-float vectors);
    //           the value 3 copies only the low 4 floats of the block.
    char** bufs = task98->any1;
    const ptrdiff_t rowBlock = pt54[0];
    const ptrdiff_t colBlock = pt54[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];
    const ptrdiff_t rowBegin = 128*rowBlock;
    const ptrdiff_t rowEnd = rowBegin+128;
    if (colBlock == 3) {
        // Narrow block: 4 lanes per row, packed 64 bytes apart in the destination.
        for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
            __m512 tail = _mm512_maskz_loadu_ps(15, src+256*colBlock+832*row);
            _mm512_mask_storeu_ps(dst+65536*colBlock+64*row, 15, tail);
        }
        return;
    }
    // Full block: 64 floats per row, packed 256 bytes apart in the destination.
    for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
        char* in = src+256*colBlock+832*row;
        char* out = dst+65536*colBlock+256*row;
        __m512 v0 = _mm512_maskz_loadu_ps(65535, in);
        __m512 v1 = _mm512_maskz_loadu_ps(65535, in+64);
        __m512 v2 = _mm512_maskz_loadu_ps(65535, in+128);
        __m512 v3 = _mm512_maskz_loadu_ps(65535, in+192);
        _mm512_mask_storeu_ps(out, 65535, v0);
        _mm512_mask_storeu_ps(out+64, 65535, v1);
        _mm512_mask_storeu_ps(out+128, 65535, v2);
        _mm512_mask_storeu_ps(out+192, 65535, v3);
    }
}

static void ResNet50OneArrangeDats8(ResNet50ThreaderTeam1* team61, char** tensors95) {
    // Dispatches ResNet50OneArrangeDats8Callee1 over a 4-dimensional
    // 2 x 4 x 1 x 1 hull on the given team, passing the tensor array
    // through to the callee unchanged.
    static const int extents[4] = {2, 4, 1, 1};
    ResNet50ThreaderTask1 job;
    job.nd1 = 4;
    for (int axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    job.any1 = tensors95;
    job.callee1 = ResNet50OneArrangeDats8Callee1;
    ResNet50ThreaderDo1(team61, &job);
}

// Worker for one point of the Apply8 task grid.
//
// For the tile selected by pt55 it computes, per output row,
//   out = max(0, seed + sum over s in [0,256) of wt[s]*dat[s] + addend)
// where seed and wt come from arrangedWts8 (tensors[0]), dat comes from
// arrangedDats8 (tensors[1]), addend is read from tensors[2] and the result
// is written to tensors[3].
//
// task100->any1 points to a pair whose first element is the char** tensor array.
// pt55[0] (w69) selects which groups of output rows to process.
// pt55[1] (d20) selects the 256-byte column block inside each 832-byte row;
//               d20 == 3 selects the narrow 4-float remainder block.
// NOTE(review): the 832-byte rows look like one output channel each and the
// seed values look like (batch-norm folded) biases -- confirm against the
// ArrangeWts8 producer, which is outside this block.
static void ResNet50OneApply8Callee1(ResNet50ThreaderTask1* task100, int64_t* pt55) {
void** pair26 = task100->any1;
char** tensors98 = pair26[0];
// e28 and g33 are fixed at 0 here, so every offset scaled by them below vanishes.
ptrdiff_t e28 = 0;
ptrdiff_t g33 = 0;
ptrdiff_t d20 = pt55[1];
ptrdiff_t w69 = pt55[0];
char*restrict arrangedWts8 = tensors98[0]+3424256*e28+(ptrdiff_t)1052672*1*g33;
char*restrict arrangedDats8 = tensors98[1]+694720*e28+(ptrdiff_t)212992*1*g33;
// datPtr30 is only read (the addend); datPtr31 is only written (the result).
char*restrict datPtr30 = tensors98[2]+(ptrdiff_t)851968*1*g33;
char*restrict datPtr31 = tensors98[3]+(ptrdiff_t)851968*1*g33;
// Single-trip loop: ii24 is 1, so i58 is always 0.
ptrdiff_t ii24 = 1;
for (ptrdiff_t i58 = 0; i58 < ii24; ++i58) {
ptrdiff_t j50 = 1*d20;
ptrdiff_t jj47 = j50+0;
// Full-width path, entered when d20 != 3: processes one 256-byte column
// block (four 16-float vectors per row) and returns from inside the loop.
for (; j50 != 3; ++j50) {
// With w69 < 84 two six-row groups are processed (k157 = 2*w69 and 2*w69+1)
// before the early return below fires. With w69 == 84 groups 168 and 169 are
// processed, the loop ends at k157 == 170 and control falls through to the
// four-row remainder group.
ptrdiff_t k157 = 2*w69;
ptrdiff_t kk53 = k157+(w69 < 84 ? 1 : 2);
for (; k157 != 170; ++k157) {
// s45 == -1 makes 24*s45+24 == 0, so these loads read the six seed floats
// stored at the start of the 6168-byte group, ahead of the per-s weights.
ptrdiff_t s45 = -1;
__m512 sum484 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)24));
__m512 sum488 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)28));
__m512 sum492 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)32));
__m512 sum496 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)36));
__m512 sum500 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)40));
__m512 sum504 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)44));
// Each row gets four accumulators (one per 16-float vector of the column
// block), all starting from that row's seed.
__m512 sum485 = sum484;
__m512 sum486 = sum484;
__m512 sum487 = sum484;
__m512 sum489 = sum488;
__m512 sum490 = sum488;
__m512 sum491 = sum488;
__m512 sum493 = sum492;
__m512 sum494 = sum492;
__m512 sum495 = sum492;
__m512 sum497 = sum496;
__m512 sum498 = sum496;
__m512 sum499 = sum496;
__m512 sum501 = sum500;
__m512 sum502 = sum500;
__m512 sum503 = sum500;
__m512 sum505 = sum504;
__m512 sum506 = sum504;
__m512 sum507 = sum504;
// Accumulate over s45 = 0..255: load four data vectors once, then for each
// of the six rows broadcast its weight and fuse-multiply-add into that
// row's four accumulators.
for (s45 = 0; s45 < 256; ++s45) {
__m512 dat2313 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s45+(ptrdiff_t)0);
__m512 dat2314 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s45+(ptrdiff_t)64);
__m512 dat2315 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s45+(ptrdiff_t)128);
__m512 dat2316 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s45+(ptrdiff_t)192);
__m512 wt601 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)24));
sum484 = _mm512_fmadd_ps(wt601, dat2313, sum484);
sum485 = _mm512_fmadd_ps(wt601, dat2314, sum485);
sum486 = _mm512_fmadd_ps(wt601, dat2315, sum486);
sum487 = _mm512_fmadd_ps(wt601, dat2316, sum487);
__m512 wt602 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)28));
sum488 = _mm512_fmadd_ps(wt602, dat2313, sum488);
sum489 = _mm512_fmadd_ps(wt602, dat2314, sum489);
sum490 = _mm512_fmadd_ps(wt602, dat2315, sum490);
sum491 = _mm512_fmadd_ps(wt602, dat2316, sum491);
__m512 wt603 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)32));
sum492 = _mm512_fmadd_ps(wt603, dat2313, sum492);
sum493 = _mm512_fmadd_ps(wt603, dat2314, sum493);
sum494 = _mm512_fmadd_ps(wt603, dat2315, sum494);
sum495 = _mm512_fmadd_ps(wt603, dat2316, sum495);
__m512 wt604 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)36));
sum496 = _mm512_fmadd_ps(wt604, dat2313, sum496);
sum497 = _mm512_fmadd_ps(wt604, dat2314, sum497);
sum498 = _mm512_fmadd_ps(wt604, dat2315, sum498);
sum499 = _mm512_fmadd_ps(wt604, dat2316, sum499);
__m512 wt605 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)40));
sum500 = _mm512_fmadd_ps(wt605, dat2313, sum500);
sum501 = _mm512_fmadd_ps(wt605, dat2314, sum501);
sum502 = _mm512_fmadd_ps(wt605, dat2315, sum502);
sum503 = _mm512_fmadd_ps(wt605, dat2316, sum503);
__m512 wt606 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+24*s45+(ptrdiff_t)44));
sum504 = _mm512_fmadd_ps(wt606, dat2313, sum504);
sum505 = _mm512_fmadd_ps(wt606, dat2314, sum505);
sum506 = _mm512_fmadd_ps(wt606, dat2315, sum506);
sum507 = _mm512_fmadd_ps(wt606, dat2316, sum507);
}
// Epilogue, repeated for each of the six rows (832 bytes apart, 4992 bytes
// per group): add the values read from datPtr30, clamp negatives to zero,
// store to the same offsets in datPtr31.
sum484 = _mm512_add_ps(sum484, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)0));
sum485 = _mm512_add_ps(sum485, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)64));
sum486 = _mm512_add_ps(sum486, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)128));
sum487 = _mm512_add_ps(sum487, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)192));
sum484 = _mm512_max_ps(_mm512_setzero_ps(), sum484);
sum485 = _mm512_max_ps(_mm512_setzero_ps(), sum485);
sum486 = _mm512_max_ps(_mm512_setzero_ps(), sum486);
sum487 = _mm512_max_ps(_mm512_setzero_ps(), sum487);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)0, 65535, sum484);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)64, 65535, sum485);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)128, 65535, sum486);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)192, 65535, sum487);
sum488 = _mm512_add_ps(sum488, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)832));
sum489 = _mm512_add_ps(sum489, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)896));
sum490 = _mm512_add_ps(sum490, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)960));
sum491 = _mm512_add_ps(sum491, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1024));
sum488 = _mm512_max_ps(_mm512_setzero_ps(), sum488);
sum489 = _mm512_max_ps(_mm512_setzero_ps(), sum489);
sum490 = _mm512_max_ps(_mm512_setzero_ps(), sum490);
sum491 = _mm512_max_ps(_mm512_setzero_ps(), sum491);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)832, 65535, sum488);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)896, 65535, sum489);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)960, 65535, sum490);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1024, 65535, sum491);
sum492 = _mm512_add_ps(sum492, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1664));
sum493 = _mm512_add_ps(sum493, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1728));
sum494 = _mm512_add_ps(sum494, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1792));
sum495 = _mm512_add_ps(sum495, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1856));
sum492 = _mm512_max_ps(_mm512_setzero_ps(), sum492);
sum493 = _mm512_max_ps(_mm512_setzero_ps(), sum493);
sum494 = _mm512_max_ps(_mm512_setzero_ps(), sum494);
sum495 = _mm512_max_ps(_mm512_setzero_ps(), sum495);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1664, 65535, sum492);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1728, 65535, sum493);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1792, 65535, sum494);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1856, 65535, sum495);
sum496 = _mm512_add_ps(sum496, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2496));
sum497 = _mm512_add_ps(sum497, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2560));
sum498 = _mm512_add_ps(sum498, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2624));
sum499 = _mm512_add_ps(sum499, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2688));
sum496 = _mm512_max_ps(_mm512_setzero_ps(), sum496);
sum497 = _mm512_max_ps(_mm512_setzero_ps(), sum497);
sum498 = _mm512_max_ps(_mm512_setzero_ps(), sum498);
sum499 = _mm512_max_ps(_mm512_setzero_ps(), sum499);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2496, 65535, sum496);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2560, 65535, sum497);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2624, 65535, sum498);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2688, 65535, sum499);
sum500 = _mm512_add_ps(sum500, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)3328));
sum501 = _mm512_add_ps(sum501, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)3392));
sum502 = _mm512_add_ps(sum502, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)3456));
sum503 = _mm512_add_ps(sum503, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)3520));
sum500 = _mm512_max_ps(_mm512_setzero_ps(), sum500);
sum501 = _mm512_max_ps(_mm512_setzero_ps(), sum501);
sum502 = _mm512_max_ps(_mm512_setzero_ps(), sum502);
sum503 = _mm512_max_ps(_mm512_setzero_ps(), sum503);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)3328, 65535, sum500);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)3392, 65535, sum501);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)3456, 65535, sum502);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)3520, 65535, sum503);
sum504 = _mm512_add_ps(sum504, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)4160));
sum505 = _mm512_add_ps(sum505, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)4224));
sum506 = _mm512_add_ps(sum506, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)4288));
sum507 = _mm512_add_ps(sum507, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)4352));
sum504 = _mm512_max_ps(_mm512_setzero_ps(), sum504);
sum505 = _mm512_max_ps(_mm512_setzero_ps(), sum505);
sum506 = _mm512_max_ps(_mm512_setzero_ps(), sum506);
sum507 = _mm512_max_ps(_mm512_setzero_ps(), sum507);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)4160, 65535, sum504);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)4224, 65535, sum505);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)4288, 65535, sum506);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)4352, 65535, sum507);
// Ends this work item once the last group assigned to w69 is done. For
// w69 == 84 the bound kk53 is 170 and is never reached inside the loop.
if (k157 >= kk53) return;
}
// Remainder group, reached only with k157 == 170: four rows instead of six.
// Weights here are packed with a 16-byte stride per s, and s46 == -1 again
// addresses the four seed floats at the start of the group.
ptrdiff_t s46 = -1;
__m512 sum508 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)16));
__m512 sum512 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)20));
__m512 sum516 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)24));
__m512 sum520 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)28));
__m512 sum509 = sum508;
__m512 sum510 = sum508;
__m512 sum511 = sum508;
__m512 sum513 = sum512;
__m512 sum514 = sum512;
__m512 sum515 = sum512;
__m512 sum517 = sum516;
__m512 sum518 = sum516;
__m512 sum519 = sum516;
__m512 sum521 = sum520;
__m512 sum522 = sum520;
__m512 sum523 = sum520;
for (s46 = 0; s46 < 256; ++s46) {
__m512 dat2317 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s46+(ptrdiff_t)0);
__m512 dat2318 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s46+(ptrdiff_t)64);
__m512 dat2319 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s46+(ptrdiff_t)128);
__m512 dat2320 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+256*s46+(ptrdiff_t)192);
__m512 wt607 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)16));
sum508 = _mm512_fmadd_ps(wt607, dat2317, sum508);
sum509 = _mm512_fmadd_ps(wt607, dat2318, sum509);
sum510 = _mm512_fmadd_ps(wt607, dat2319, sum510);
sum511 = _mm512_fmadd_ps(wt607, dat2320, sum511);
__m512 wt608 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)20));
sum512 = _mm512_fmadd_ps(wt608, dat2317, sum512);
sum513 = _mm512_fmadd_ps(wt608, dat2318, sum513);
sum514 = _mm512_fmadd_ps(wt608, dat2319, sum514);
sum515 = _mm512_fmadd_ps(wt608, dat2320, sum515);
__m512 wt609 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)24));
sum516 = _mm512_fmadd_ps(wt609, dat2317, sum516);
sum517 = _mm512_fmadd_ps(wt609, dat2318, sum517);
sum518 = _mm512_fmadd_ps(wt609, dat2319, sum518);
sum519 = _mm512_fmadd_ps(wt609, dat2320, sum519);
__m512 wt610 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k157+16*s46+(ptrdiff_t)28));
sum520 = _mm512_fmadd_ps(wt610, dat2317, sum520);
sum521 = _mm512_fmadd_ps(wt610, dat2318, sum521);
sum522 = _mm512_fmadd_ps(wt610, dat2319, sum522);
sum523 = _mm512_fmadd_ps(wt610, dat2320, sum523);
}
// Same add / clamp-at-zero / store epilogue for the four remainder rows.
sum508 = _mm512_add_ps(sum508, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)0));
sum509 = _mm512_add_ps(sum509, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)64));
sum510 = _mm512_add_ps(sum510, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)128));
sum511 = _mm512_add_ps(sum511, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)192));
sum508 = _mm512_max_ps(_mm512_setzero_ps(), sum508);
sum509 = _mm512_max_ps(_mm512_setzero_ps(), sum509);
sum510 = _mm512_max_ps(_mm512_setzero_ps(), sum510);
sum511 = _mm512_max_ps(_mm512_setzero_ps(), sum511);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)0, 65535, sum508);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)64, 65535, sum509);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)128, 65535, sum510);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)192, 65535, sum511);
sum512 = _mm512_add_ps(sum512, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)832));
sum513 = _mm512_add_ps(sum513, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)896));
sum514 = _mm512_add_ps(sum514, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)960));
sum515 = _mm512_add_ps(sum515, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1024));
sum512 = _mm512_max_ps(_mm512_setzero_ps(), sum512);
sum513 = _mm512_max_ps(_mm512_setzero_ps(), sum513);
sum514 = _mm512_max_ps(_mm512_setzero_ps(), sum514);
sum515 = _mm512_max_ps(_mm512_setzero_ps(), sum515);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)832, 65535, sum512);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)896, 65535, sum513);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)960, 65535, sum514);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1024, 65535, sum515);
sum516 = _mm512_add_ps(sum516, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1664));
sum517 = _mm512_add_ps(sum517, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1728));
sum518 = _mm512_add_ps(sum518, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1792));
sum519 = _mm512_add_ps(sum519, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)1856));
sum516 = _mm512_max_ps(_mm512_setzero_ps(), sum516);
sum517 = _mm512_max_ps(_mm512_setzero_ps(), sum517);
sum518 = _mm512_max_ps(_mm512_setzero_ps(), sum518);
sum519 = _mm512_max_ps(_mm512_setzero_ps(), sum519);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1664, 65535, sum516);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1728, 65535, sum517);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1792, 65535, sum518);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)1856, 65535, sum519);
sum520 = _mm512_add_ps(sum520, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2496));
sum521 = _mm512_add_ps(sum521, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2560));
sum522 = _mm512_add_ps(sum522, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2624));
sum523 = _mm512_add_ps(sum523, _mm512_maskz_loadu_ps(65535, datPtr30+851968*i58+256*j50+4992*k157+(ptrdiff_t)2688));
sum520 = _mm512_max_ps(_mm512_setzero_ps(), sum520);
sum521 = _mm512_max_ps(_mm512_setzero_ps(), sum521);
sum522 = _mm512_max_ps(_mm512_setzero_ps(), sum522);
sum523 = _mm512_max_ps(_mm512_setzero_ps(), sum523);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2496, 65535, sum520);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2560, 65535, sum521);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2624, 65535, sum522);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k157+(ptrdiff_t)2688, 65535, sum523);
// jj47 equals the initial j50, so this always returns on the first pass.
if (j50 >= jj47) return;
}
// Narrow path, reached only when d20 == 3 (the j50 loop body never ran, so
// j50 is still 3). Same computation with one vector per row; arranged data is
// packed 64 bytes per s, and only the low 4 lanes (mask 15) are read from
// datPtr30 and written to datPtr31. Group selection via w69 works as above.
ptrdiff_t k158 = 2*w69;
ptrdiff_t kk54 = k158+(w69 < 84 ? 1 : 2);
for (; k158 != 170; ++k158) {
ptrdiff_t s47 = -1;
__m512 sum524 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)24));
__m512 sum525 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)28));
__m512 sum526 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)32));
__m512 sum527 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)36));
__m512 sum528 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)40));
__m512 sum529 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)44));
for (s47 = 0; s47 < 256; ++s47) {
// Unmasked 64-byte load; only the low 4 lanes of the results are stored below.
__m512 dat2321 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+64*s47+(ptrdiff_t)0);
__m512 wt611 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)24));
sum524 = _mm512_fmadd_ps(wt611, dat2321, sum524);
__m512 wt612 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)28));
sum525 = _mm512_fmadd_ps(wt612, dat2321, sum525);
__m512 wt613 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)32));
sum526 = _mm512_fmadd_ps(wt613, dat2321, sum526);
__m512 wt614 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)36));
sum527 = _mm512_fmadd_ps(wt614, dat2321, sum527);
__m512 wt615 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)40));
sum528 = _mm512_fmadd_ps(wt615, dat2321, sum528);
__m512 wt616 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+24*s47+(ptrdiff_t)44));
sum529 = _mm512_fmadd_ps(wt616, dat2321, sum529);
}
sum524 = _mm512_add_ps(sum524, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)0));
sum524 = _mm512_max_ps(_mm512_setzero_ps(), sum524);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)0, 15, sum524);
sum525 = _mm512_add_ps(sum525, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)832));
sum525 = _mm512_max_ps(_mm512_setzero_ps(), sum525);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)832, 15, sum525);
sum526 = _mm512_add_ps(sum526, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)1664));
sum526 = _mm512_max_ps(_mm512_setzero_ps(), sum526);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)1664, 15, sum526);
sum527 = _mm512_add_ps(sum527, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)2496));
sum527 = _mm512_max_ps(_mm512_setzero_ps(), sum527);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)2496, 15, sum527);
sum528 = _mm512_add_ps(sum528, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)3328));
sum528 = _mm512_max_ps(_mm512_setzero_ps(), sum528);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)3328, 15, sum528);
sum529 = _mm512_add_ps(sum529, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)4160));
sum529 = _mm512_max_ps(_mm512_setzero_ps(), sum529);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)4160, 15, sum529);
if (k158 >= kk54) return;
}
// Narrow-path remainder group (k158 == 170): four rows, 16-byte weight stride.
ptrdiff_t s48 = -1;
__m512 sum530 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)16));
__m512 sum531 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)20));
__m512 sum532 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)24));
__m512 sum533 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)28));
for (s48 = 0; s48 < 256; ++s48) {
__m512 dat2322 = _mm512_loadu_ps(arrangedDats8+212992*i58+65536*j50+64*s48+(ptrdiff_t)0);
__m512 wt617 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)16));
sum530 = _mm512_fmadd_ps(wt617, dat2322, sum530);
__m512 wt618 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)20));
sum531 = _mm512_fmadd_ps(wt618, dat2322, sum531);
__m512 wt619 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)24));
sum532 = _mm512_fmadd_ps(wt619, dat2322, sum532);
__m512 wt620 = _mm512_set1_ps(*(float*)(arrangedWts8+1052672*i58+6168*k158+16*s48+(ptrdiff_t)28));
sum533 = _mm512_fmadd_ps(wt620, dat2322, sum533);
}
sum530 = _mm512_add_ps(sum530, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)0));
sum530 = _mm512_max_ps(_mm512_setzero_ps(), sum530);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)0, 15, sum530);
sum531 = _mm512_add_ps(sum531, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)832));
sum531 = _mm512_max_ps(_mm512_setzero_ps(), sum531);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)832, 15, sum531);
sum532 = _mm512_add_ps(sum532, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)1664));
sum532 = _mm512_max_ps(_mm512_setzero_ps(), sum532);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)1664, 15, sum532);
sum533 = _mm512_add_ps(sum533, _mm512_maskz_loadu_ps(15, datPtr30+851968*i58+256*j50+4992*k158+(ptrdiff_t)2496));
sum533 = _mm512_max_ps(_mm512_setzero_ps(), sum533);
_mm512_mask_storeu_ps(datPtr31+851968*i58+256*j50+4992*k158+(ptrdiff_t)2496, 15, sum533);
}
}

/*
 * Launcher for the "OneApply8" stage.
 *
 * Fills in a ResNet50ThreaderTask1 describing a 3-dimensional hull of
 * 85 x 4 x 1 with ResNet50OneApply8Callee1 as the callee, then hands the
 * task to ResNet50ThreaderDo1 on the supplied team.
 *
 * team62    - thread team passed straight through to ResNet50ThreaderDo1.
 * tensors97 - tensor pointer table; stored as the first slot of the
 *             two-element argument array given to the callee via any1
 *             (the second slot is a null pointer).
 *
 * NOTE(review): both the task and the argument array are stack locals, so
 * this presumes ResNet50ThreaderDo1 no longer uses them once it returns --
 * confirm against the threader implementation.
 */
static void ResNet50OneApply8(ResNet50ThreaderTeam1* team62, char** tensors97) {
    ResNet50ThreaderTask1 job;
    void* args[] = {tensors97, 0};

    /* Iteration space: three dimensions, extents 85, 4 and 1. */
    job.nd1 = 3;
    job.hull1[0] = 85;
    job.hull1[1] = 4;
    job.hull1[2] = 1;

    /* Worker entry point and its opaque argument. */
    job.any1 = args;
    job.callee1 = ResNet50OneApply8Callee1;

    ResNet50ThreaderDo1(team62, &job);
}

static void ResNet50OneArrangeWts9Callee1(ResNet50ThreaderTask1* task102, int64_t* pt56) {
char** tensors100 = task102->any1;
ptrdiff_t b67 = pt56[0];
char*restrict wtPtr18 = tensors100[0]+(ptrdiff_t)3340*0+(ptrdiff_t)1048576*0;
char*restrict biasPtr18 = tensors100[1]+(ptrdiff_t)1024*0;
char*restrict bnPtr18 = tensors100[2]+(ptrdiff_t)8*256*0;
char*restrict arranged17 = tensors100[3]+(ptrdiff_t)856064*0+(ptrdiff_t)1049600*0;
ptrdiff_t ii25 = 1;
for (ptrdiff_t i59 = 0; i59 < ii25; ++i59) {
ptrdiff_t j51 = 1*b67;
ptrdiff_t jj48 = j51+1;
for (; j51 < jj48; ++j51) {
if (j51 < 15) {
ptrdiff_t k160 = 0+16*(j51-0);
ptrdiff_t l69 = (size_t)(0+k160)/6;
ptrdiff_t cut26 = (size_t)(0+k160)%6;
switch (cut26) {
case 0:;
case 2: {
__m512 sum535 = _mm512_maskz_loadu_ps(65535, biasPtr18+1024*i59+4*k160);
__m512i pmMul37 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd37 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo31 = _mm512_loadu_ps(bnPtr18+(ptrdiff_t)8*(k160+256*i59));
__m512 masHi31 = _mm512_maskz_loadu_ps(65535, bnPtr18+(ptrdiff_t)8*(k160+256*i59)+(ptrdiff_t)64);
__m512 postMul60 = _mm512_permutex2var_ps(masLo31, pmMul37, masHi31);
__m512 postAdd38 = _mm512_permutex2var_ps(masLo31, pmAdd37, masHi31);
sum535 = _mm512_fmadd_ps(sum535, postMul60, postAdd38);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)0, 63>>cut26, sum535);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)24576, 4032>>cut26, sum535);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)49152, 65535-(4095>>cut26), sum535);
ptrdiff_t c49 = 0;
for (; c49 != 64; ++c49) {
__m512 wt637 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)0);
__m512 wt638 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)4096);
__m512 wt639 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)8192);
__m512 wt640 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)12288);
__m512 wt641 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)16384);
__m512 wt642 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)20480);
__m512 wt643 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)24576);
__m512 wt644 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)28672);
__m512 wt645 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)32768);
__m512 wt646 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)36864);
__m512 wt647 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)40960);
__m512 wt648 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)45056);
__m512 wt649 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)49152);
__m512 wt650 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)53248);
__m512 wt651 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)57344);
__m512 wt652 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c49+(ptrdiff_t)61440);
__m512 tmp17917 = _mm512_unpacklo_ps(wt637, wt638);
__m512 tmp17918 = _mm512_unpackhi_ps(wt637, wt638);
__m512 tmp17919 = _mm512_unpacklo_ps(wt639, wt640);
__m512 tmp17920 = _mm512_unpackhi_ps(wt639, wt640);
__m512 tmp17921 = _mm512_unpacklo_ps(wt641, wt642);
__m512 tmp17922 = _mm512_unpackhi_ps(wt641, wt642);
__m512 tmp17923 = _mm512_unpacklo_ps(wt643, wt644);
__m512 tmp17924 = _mm512_unpackhi_ps(wt643, wt644);
__m512 tmp17925 = _mm512_unpacklo_ps(wt645, wt646);
__m512 tmp17926 = _mm512_unpackhi_ps(wt645, wt646);
__m512 tmp17927 = _mm512_unpacklo_ps(wt647, wt648);
__m512 tmp17928 = _mm512_unpackhi_ps(wt647, wt648);
__m512 tmp17929 = _mm512_unpacklo_ps(wt649, wt650);
__m512 tmp17930 = _mm512_unpackhi_ps(wt649, wt650);
__m512 tmp17931 = _mm512_unpacklo_ps(wt651, wt652);
__m512 tmp17932 = _mm512_unpackhi_ps(wt651, wt652);
__m512 tmp17933 = _mm512_shuffle_ps(tmp17917, tmp17919, 68);
__m512 tmp17934 = _mm512_shuffle_ps(tmp17917, tmp17919, 238);
__m512 tmp17935 = _mm512_shuffle_ps(tmp17918, tmp17920, 68);
__m512 tmp17936 = _mm512_shuffle_ps(tmp17918, tmp17920, 238);
__m512 tmp17937 = _mm512_shuffle_ps(tmp17921, tmp17923, 68);
__m512 tmp17938 = _mm512_shuffle_ps(tmp17921, tmp17923, 238);
__m512 tmp17939 = _mm512_shuffle_ps(tmp17922, tmp17924, 68);
__m512 tmp17940 = _mm512_shuffle_ps(tmp17922, tmp17924, 238);
__m512 tmp17941 = _mm512_shuffle_ps(tmp17925, tmp17927, 68);
__m512 tmp17942 = _mm512_shuffle_ps(tmp17925, tmp17927, 238);
__m512 tmp17943 = _mm512_shuffle_ps(tmp17926, tmp17928, 68);
__m512 tmp17944 = _mm512_shuffle_ps(tmp17926, tmp17928, 238);
__m512 tmp17945 = _mm512_shuffle_ps(tmp17929, tmp17931, 68);
__m512 tmp17946 = _mm512_shuffle_ps(tmp17929, tmp17931, 238);
__m512 tmp17947 = _mm512_shuffle_ps(tmp17930, tmp17932, 68);
__m512 tmp17948 = _mm512_shuffle_ps(tmp17930, tmp17932, 238);
__m512 tmp17949 = _mm512_shuffle_f32x4(tmp17933, tmp17937, 136);
__m512 tmp17950 = _mm512_shuffle_f32x4(tmp17933, tmp17937, 221);
__m512 tmp17951 = _mm512_shuffle_f32x4(tmp17934, tmp17938, 136);
__m512 tmp17952 = _mm512_shuffle_f32x4(tmp17934, tmp17938, 221);
__m512 tmp17953 = _mm512_shuffle_f32x4(tmp17935, tmp17939, 136);
__m512 tmp17954 = _mm512_shuffle_f32x4(tmp17935, tmp17939, 221);
__m512 tmp17955 = _mm512_shuffle_f32x4(tmp17936, tmp17940, 136);
__m512 tmp17956 = _mm512_shuffle_f32x4(tmp17936, tmp17940, 221);
__m512 tmp17957 = _mm512_shuffle_f32x4(tmp17941, tmp17945, 136);
__m512 tmp17958 = _mm512_shuffle_f32x4(tmp17941, tmp17945, 221);
__m512 tmp17959 = _mm512_shuffle_f32x4(tmp17942, tmp17946, 136);
__m512 tmp17960 = _mm512_shuffle_f32x4(tmp17942, tmp17946, 221);
__m512 tmp17961 = _mm512_shuffle_f32x4(tmp17943, tmp17947, 136);
__m512 tmp17962 = _mm512_shuffle_f32x4(tmp17943, tmp17947, 221);
__m512 tmp17963 = _mm512_shuffle_f32x4(tmp17944, tmp17948, 136);
__m512 tmp17964 = _mm512_shuffle_f32x4(tmp17944, tmp17948, 221);
wt637 = _mm512_shuffle_f32x4(tmp17949, tmp17957, 136);
wt645 = _mm512_shuffle_f32x4(tmp17949, tmp17957, 221);
wt638 = _mm512_shuffle_f32x4(tmp17951, tmp17959, 136);
wt646 = _mm512_shuffle_f32x4(tmp17951, tmp17959, 221);
wt639 = _mm512_shuffle_f32x4(tmp17953, tmp17961, 136);
wt647 = _mm512_shuffle_f32x4(tmp17953, tmp17961, 221);
wt640 = _mm512_shuffle_f32x4(tmp17955, tmp17963, 136);
wt648 = _mm512_shuffle_f32x4(tmp17955, tmp17963, 221);
wt641 = _mm512_shuffle_f32x4(tmp17950, tmp17958, 136);
wt649 = _mm512_shuffle_f32x4(tmp17950, tmp17958, 221);
wt642 = _mm512_shuffle_f32x4(tmp17952, tmp17960, 136);
wt650 = _mm512_shuffle_f32x4(tmp17952, tmp17960, 221);
wt643 = _mm512_shuffle_f32x4(tmp17954, tmp17962, 136);
wt651 = _mm512_shuffle_f32x4(tmp17954, tmp17962, 221);
wt644 = _mm512_shuffle_f32x4(tmp17956, tmp17964, 136);
wt652 = _mm512_shuffle_f32x4(tmp17956, tmp17964, 221);
wt637 = _mm512_mul_ps(wt637, postMul60);
wt638 = _mm512_mul_ps(wt638, postMul60);
wt639 = _mm512_mul_ps(wt639, postMul60);
wt640 = _mm512_mul_ps(wt640, postMul60);
wt641 = _mm512_mul_ps(wt641, postMul60);
wt642 = _mm512_mul_ps(wt642, postMul60);
wt643 = _mm512_mul_ps(wt643, postMul60);
wt644 = _mm512_mul_ps(wt644, postMul60);
wt645 = _mm512_mul_ps(wt645, postMul60);
wt646 = _mm512_mul_ps(wt646, postMul60);
wt647 = _mm512_mul_ps(wt647, postMul60);
wt648 = _mm512_mul_ps(wt648, postMul60);
wt649 = _mm512_mul_ps(wt649, postMul60);
wt650 = _mm512_mul_ps(wt650, postMul60);
wt651 = _mm512_mul_ps(wt651, postMul60);
wt652 = _mm512_mul_ps(wt652, postMul60);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c49)+(ptrdiff_t)0, 63>>cut26, wt637);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c49)+(ptrdiff_t)0, 63>>cut26, wt638);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c49)+(ptrdiff_t)0, 63>>cut26, wt639);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c49)+(ptrdiff_t)0, 63>>cut26, wt640);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c49)+(ptrdiff_t)0, 63>>cut26, wt641);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c49)+(ptrdiff_t)0, 63>>cut26, wt642);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c49)+(ptrdiff_t)0, 63>>cut26, wt643);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c49)+(ptrdiff_t)0, 63>>cut26, wt644);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c49)+(ptrdiff_t)0, 63>>cut26, wt645);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c49)+(ptrdiff_t)0, 63>>cut26, wt646);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c49)+(ptrdiff_t)0, 63>>cut26, wt647);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c49)+(ptrdiff_t)0, 63>>cut26, wt648);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c49)+(ptrdiff_t)0, 63>>cut26, wt649);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c49)+(ptrdiff_t)0, 63>>cut26, wt650);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c49)+(ptrdiff_t)0, 63>>cut26, wt651);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c49)+(ptrdiff_t)0, 63>>cut26, wt652);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt637);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt638);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt639);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt640);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt641);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt642);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt643);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt644);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt645);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt646);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt647);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt648);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt649);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt650);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt651);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c49)+(ptrdiff_t)24576, 4032>>cut26, wt652);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt637);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt638);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt639);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt640);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt641);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt642);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt643);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt644);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt645);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt646);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt647);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt648);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt649);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt650);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt651);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c49)+(ptrdiff_t)49152, 65535-(4095>>cut26), wt652);
}
break;
}
default: {
cut26 = 4;
__m512 sum536 = _mm512_maskz_loadu_ps(65535, biasPtr18+1024*i59+4*k160);
__m512i pmMul38 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd38 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo32 = _mm512_loadu_ps(bnPtr18+(ptrdiff_t)8*(k160+256*i59));
__m512 masHi32 = _mm512_maskz_loadu_ps(65535, bnPtr18+(ptrdiff_t)8*(k160+256*i59)+(ptrdiff_t)64);
__m512 postMul61 = _mm512_permutex2var_ps(masLo32, pmMul38, masHi32);
__m512 postAdd39 = _mm512_permutex2var_ps(masLo32, pmAdd38, masHi32);
sum536 = _mm512_fmadd_ps(sum536, postMul61, postAdd39);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)0, 63>>cut26, sum536);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)24576, 4032>>cut26, sum536);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)49152, 258048>>cut26, sum536);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*0+(ptrdiff_t)73728, 65535-(262143>>cut26), sum536);
ptrdiff_t c50 = 0;
for (; c50 != 64; ++c50) {
__m512 wt653 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)0);
__m512 wt654 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)4096);
__m512 wt655 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)8192);
__m512 wt656 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)12288);
__m512 wt657 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)16384);
__m512 wt658 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)20480);
__m512 wt659 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)24576);
__m512 wt660 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)28672);
__m512 wt661 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)32768);
__m512 wt662 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)36864);
__m512 wt663 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)40960);
__m512 wt664 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)45056);
__m512 wt665 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)49152);
__m512 wt666 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)53248);
__m512 wt667 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)57344);
__m512 wt668 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k160+64*c50+(ptrdiff_t)61440);
__m512 tmp17965 = _mm512_unpacklo_ps(wt653, wt654);
__m512 tmp17966 = _mm512_unpackhi_ps(wt653, wt654);
__m512 tmp17967 = _mm512_unpacklo_ps(wt655, wt656);
__m512 tmp17968 = _mm512_unpackhi_ps(wt655, wt656);
__m512 tmp17969 = _mm512_unpacklo_ps(wt657, wt658);
__m512 tmp17970 = _mm512_unpackhi_ps(wt657, wt658);
__m512 tmp17971 = _mm512_unpacklo_ps(wt659, wt660);
__m512 tmp17972 = _mm512_unpackhi_ps(wt659, wt660);
__m512 tmp17973 = _mm512_unpacklo_ps(wt661, wt662);
__m512 tmp17974 = _mm512_unpackhi_ps(wt661, wt662);
__m512 tmp17975 = _mm512_unpacklo_ps(wt663, wt664);
__m512 tmp17976 = _mm512_unpackhi_ps(wt663, wt664);
__m512 tmp17977 = _mm512_unpacklo_ps(wt665, wt666);
__m512 tmp17978 = _mm512_unpackhi_ps(wt665, wt666);
__m512 tmp17979 = _mm512_unpacklo_ps(wt667, wt668);
__m512 tmp17980 = _mm512_unpackhi_ps(wt667, wt668);
__m512 tmp17981 = _mm512_shuffle_ps(tmp17965, tmp17967, 68);
__m512 tmp17982 = _mm512_shuffle_ps(tmp17965, tmp17967, 238);
__m512 tmp17983 = _mm512_shuffle_ps(tmp17966, tmp17968, 68);
__m512 tmp17984 = _mm512_shuffle_ps(tmp17966, tmp17968, 238);
__m512 tmp17985 = _mm512_shuffle_ps(tmp17969, tmp17971, 68);
__m512 tmp17986 = _mm512_shuffle_ps(tmp17969, tmp17971, 238);
__m512 tmp17987 = _mm512_shuffle_ps(tmp17970, tmp17972, 68);
__m512 tmp17988 = _mm512_shuffle_ps(tmp17970, tmp17972, 238);
__m512 tmp17989 = _mm512_shuffle_ps(tmp17973, tmp17975, 68);
__m512 tmp17990 = _mm512_shuffle_ps(tmp17973, tmp17975, 238);
__m512 tmp17991 = _mm512_shuffle_ps(tmp17974, tmp17976, 68);
__m512 tmp17992 = _mm512_shuffle_ps(tmp17974, tmp17976, 238);
__m512 tmp17993 = _mm512_shuffle_ps(tmp17977, tmp17979, 68);
__m512 tmp17994 = _mm512_shuffle_ps(tmp17977, tmp17979, 238);
__m512 tmp17995 = _mm512_shuffle_ps(tmp17978, tmp17980, 68);
__m512 tmp17996 = _mm512_shuffle_ps(tmp17978, tmp17980, 238);
__m512 tmp17997 = _mm512_shuffle_f32x4(tmp17981, tmp17985, 136);
__m512 tmp17998 = _mm512_shuffle_f32x4(tmp17981, tmp17985, 221);
__m512 tmp17999 = _mm512_shuffle_f32x4(tmp17982, tmp17986, 136);
__m512 tmp18000 = _mm512_shuffle_f32x4(tmp17982, tmp17986, 221);
__m512 tmp18001 = _mm512_shuffle_f32x4(tmp17983, tmp17987, 136);
__m512 tmp18002 = _mm512_shuffle_f32x4(tmp17983, tmp17987, 221);
__m512 tmp18003 = _mm512_shuffle_f32x4(tmp17984, tmp17988, 136);
__m512 tmp18004 = _mm512_shuffle_f32x4(tmp17984, tmp17988, 221);
__m512 tmp18005 = _mm512_shuffle_f32x4(tmp17989, tmp17993, 136);
__m512 tmp18006 = _mm512_shuffle_f32x4(tmp17989, tmp17993, 221);
__m512 tmp18007 = _mm512_shuffle_f32x4(tmp17990, tmp17994, 136);
__m512 tmp18008 = _mm512_shuffle_f32x4(tmp17990, tmp17994, 221);
__m512 tmp18009 = _mm512_shuffle_f32x4(tmp17991, tmp17995, 136);
__m512 tmp18010 = _mm512_shuffle_f32x4(tmp17991, tmp17995, 221);
__m512 tmp18011 = _mm512_shuffle_f32x4(tmp17992, tmp17996, 136);
__m512 tmp18012 = _mm512_shuffle_f32x4(tmp17992, tmp17996, 221);
wt653 = _mm512_shuffle_f32x4(tmp17997, tmp18005, 136);
wt661 = _mm512_shuffle_f32x4(tmp17997, tmp18005, 221);
wt654 = _mm512_shuffle_f32x4(tmp17999, tmp18007, 136);
wt662 = _mm512_shuffle_f32x4(tmp17999, tmp18007, 221);
wt655 = _mm512_shuffle_f32x4(tmp18001, tmp18009, 136);
wt663 = _mm512_shuffle_f32x4(tmp18001, tmp18009, 221);
wt656 = _mm512_shuffle_f32x4(tmp18003, tmp18011, 136);
wt664 = _mm512_shuffle_f32x4(tmp18003, tmp18011, 221);
wt657 = _mm512_shuffle_f32x4(tmp17998, tmp18006, 136);
wt665 = _mm512_shuffle_f32x4(tmp17998, tmp18006, 221);
wt658 = _mm512_shuffle_f32x4(tmp18000, tmp18008, 136);
wt666 = _mm512_shuffle_f32x4(tmp18000, tmp18008, 221);
wt659 = _mm512_shuffle_f32x4(tmp18002, tmp18010, 136);
wt667 = _mm512_shuffle_f32x4(tmp18002, tmp18010, 221);
wt660 = _mm512_shuffle_f32x4(tmp18004, tmp18012, 136);
wt668 = _mm512_shuffle_f32x4(tmp18004, tmp18012, 221);
wt653 = _mm512_mul_ps(wt653, postMul61);
wt654 = _mm512_mul_ps(wt654, postMul61);
wt655 = _mm512_mul_ps(wt655, postMul61);
wt656 = _mm512_mul_ps(wt656, postMul61);
wt657 = _mm512_mul_ps(wt657, postMul61);
wt658 = _mm512_mul_ps(wt658, postMul61);
wt659 = _mm512_mul_ps(wt659, postMul61);
wt660 = _mm512_mul_ps(wt660, postMul61);
wt661 = _mm512_mul_ps(wt661, postMul61);
wt662 = _mm512_mul_ps(wt662, postMul61);
wt663 = _mm512_mul_ps(wt663, postMul61);
wt664 = _mm512_mul_ps(wt664, postMul61);
wt665 = _mm512_mul_ps(wt665, postMul61);
wt666 = _mm512_mul_ps(wt666, postMul61);
wt667 = _mm512_mul_ps(wt667, postMul61);
wt668 = _mm512_mul_ps(wt668, postMul61);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c50)+(ptrdiff_t)0, 63>>cut26, wt653);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c50)+(ptrdiff_t)0, 63>>cut26, wt654);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c50)+(ptrdiff_t)0, 63>>cut26, wt655);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c50)+(ptrdiff_t)0, 63>>cut26, wt656);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c50)+(ptrdiff_t)0, 63>>cut26, wt657);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c50)+(ptrdiff_t)0, 63>>cut26, wt658);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c50)+(ptrdiff_t)0, 63>>cut26, wt659);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c50)+(ptrdiff_t)0, 63>>cut26, wt660);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c50)+(ptrdiff_t)0, 63>>cut26, wt661);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c50)+(ptrdiff_t)0, 63>>cut26, wt662);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c50)+(ptrdiff_t)0, 63>>cut26, wt663);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c50)+(ptrdiff_t)0, 63>>cut26, wt664);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c50)+(ptrdiff_t)0, 63>>cut26, wt665);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c50)+(ptrdiff_t)0, 63>>cut26, wt666);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c50)+(ptrdiff_t)0, 63>>cut26, wt667);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c50)+(ptrdiff_t)0, 63>>cut26, wt668);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt653);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt654);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt655);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt656);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt657);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt658);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt659);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt660);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt661);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt662);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt663);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt664);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt665);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt666);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt667);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c50)+(ptrdiff_t)24576, 4032>>cut26, wt668);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt653);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt654);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt655);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt656);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt657);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt658);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt659);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt660);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt661);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt662);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt663);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt664);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt665);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt666);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt667);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c50)+(ptrdiff_t)49152, 258048>>cut26, wt668);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(1+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt653);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(2+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt654);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(3+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt655);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(4+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt656);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(5+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt657);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(6+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt658);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(7+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt659);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(8+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt660);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(9+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt661);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(10+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt662);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(11+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt663);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(12+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt664);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(13+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt665);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(14+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt666);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(15+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt667);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l69+4*cut26+24*(16+16*c50)+(ptrdiff_t)73728, 65535-(262143>>cut26), wt668);
}
}
}
} else {
ptrdiff_t k159 = 240;
ptrdiff_t l68 = (size_t)(0+k159)/6;
ptrdiff_t cut25 = (size_t)(0+k159)%6;
__m512 sum534 = _mm512_maskz_loadu_ps(65535, biasPtr18+1024*i59+4*k159);
__m512i pmMul39 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd39 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo33 = _mm512_loadu_ps(bnPtr18+(ptrdiff_t)8*(k159+256*i59));
__m512 masHi33 = _mm512_maskz_loadu_ps(65535, bnPtr18+(ptrdiff_t)8*(k159+256*i59)+(ptrdiff_t)64);
__m512 postMul59 = _mm512_permutex2var_ps(masLo33, pmMul39, masHi33);
__m512 postAdd37 = _mm512_permutex2var_ps(masLo33, pmAdd39, masHi33);
sum534 = _mm512_fmadd_ps(sum534, postMul59, postAdd37);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*0+(ptrdiff_t)0, 63>>cut25, sum534);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*0+(ptrdiff_t)24576, 4032>>cut25, sum534);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*0+(ptrdiff_t)49152, 65535-(4095>>cut25), sum534);
ptrdiff_t c48 = 0;
for (; c48 != 64; ++c48) {
__m512 wt621 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)0);
__m512 wt622 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)4096);
__m512 wt623 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)8192);
__m512 wt624 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)12288);
__m512 wt625 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)16384);
__m512 wt626 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)20480);
__m512 wt627 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)24576);
__m512 wt628 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)28672);
__m512 wt629 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)32768);
__m512 wt630 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)36864);
__m512 wt631 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)40960);
__m512 wt632 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)45056);
__m512 wt633 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)49152);
__m512 wt634 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)53248);
__m512 wt635 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)57344);
__m512 wt636 = _mm512_maskz_loadu_ps(65535, wtPtr18+1048576*i59+4096*k159+64*c48+(ptrdiff_t)61440);
__m512 tmp18013 = _mm512_unpacklo_ps(wt621, wt622);
__m512 tmp18014 = _mm512_unpackhi_ps(wt621, wt622);
__m512 tmp18015 = _mm512_unpacklo_ps(wt623, wt624);
__m512 tmp18016 = _mm512_unpackhi_ps(wt623, wt624);
__m512 tmp18017 = _mm512_unpacklo_ps(wt625, wt626);
__m512 tmp18018 = _mm512_unpackhi_ps(wt625, wt626);
__m512 tmp18019 = _mm512_unpacklo_ps(wt627, wt628);
__m512 tmp18020 = _mm512_unpackhi_ps(wt627, wt628);
__m512 tmp18021 = _mm512_unpacklo_ps(wt629, wt630);
__m512 tmp18022 = _mm512_unpackhi_ps(wt629, wt630);
__m512 tmp18023 = _mm512_unpacklo_ps(wt631, wt632);
__m512 tmp18024 = _mm512_unpackhi_ps(wt631, wt632);
__m512 tmp18025 = _mm512_unpacklo_ps(wt633, wt634);
__m512 tmp18026 = _mm512_unpackhi_ps(wt633, wt634);
__m512 tmp18027 = _mm512_unpacklo_ps(wt635, wt636);
__m512 tmp18028 = _mm512_unpackhi_ps(wt635, wt636);
__m512 tmp18029 = _mm512_shuffle_ps(tmp18013, tmp18015, 68);
__m512 tmp18030 = _mm512_shuffle_ps(tmp18013, tmp18015, 238);
__m512 tmp18031 = _mm512_shuffle_ps(tmp18014, tmp18016, 68);
__m512 tmp18032 = _mm512_shuffle_ps(tmp18014, tmp18016, 238);
__m512 tmp18033 = _mm512_shuffle_ps(tmp18017, tmp18019, 68);
__m512 tmp18034 = _mm512_shuffle_ps(tmp18017, tmp18019, 238);
__m512 tmp18035 = _mm512_shuffle_ps(tmp18018, tmp18020, 68);
__m512 tmp18036 = _mm512_shuffle_ps(tmp18018, tmp18020, 238);
__m512 tmp18037 = _mm512_shuffle_ps(tmp18021, tmp18023, 68);
__m512 tmp18038 = _mm512_shuffle_ps(tmp18021, tmp18023, 238);
__m512 tmp18039 = _mm512_shuffle_ps(tmp18022, tmp18024, 68);
__m512 tmp18040 = _mm512_shuffle_ps(tmp18022, tmp18024, 238);
__m512 tmp18041 = _mm512_shuffle_ps(tmp18025, tmp18027, 68);
__m512 tmp18042 = _mm512_shuffle_ps(tmp18025, tmp18027, 238);
__m512 tmp18043 = _mm512_shuffle_ps(tmp18026, tmp18028, 68);
__m512 tmp18044 = _mm512_shuffle_ps(tmp18026, tmp18028, 238);
__m512 tmp18045 = _mm512_shuffle_f32x4(tmp18029, tmp18033, 136);
__m512 tmp18046 = _mm512_shuffle_f32x4(tmp18029, tmp18033, 221);
__m512 tmp18047 = _mm512_shuffle_f32x4(tmp18030, tmp18034, 136);
__m512 tmp18048 = _mm512_shuffle_f32x4(tmp18030, tmp18034, 221);
__m512 tmp18049 = _mm512_shuffle_f32x4(tmp18031, tmp18035, 136);
__m512 tmp18050 = _mm512_shuffle_f32x4(tmp18031, tmp18035, 221);
__m512 tmp18051 = _mm512_shuffle_f32x4(tmp18032, tmp18036, 136);
__m512 tmp18052 = _mm512_shuffle_f32x4(tmp18032, tmp18036, 221);
__m512 tmp18053 = _mm512_shuffle_f32x4(tmp18037, tmp18041, 136);
__m512 tmp18054 = _mm512_shuffle_f32x4(tmp18037, tmp18041, 221);
__m512 tmp18055 = _mm512_shuffle_f32x4(tmp18038, tmp18042, 136);
__m512 tmp18056 = _mm512_shuffle_f32x4(tmp18038, tmp18042, 221);
__m512 tmp18057 = _mm512_shuffle_f32x4(tmp18039, tmp18043, 136);
__m512 tmp18058 = _mm512_shuffle_f32x4(tmp18039, tmp18043, 221);
__m512 tmp18059 = _mm512_shuffle_f32x4(tmp18040, tmp18044, 136);
__m512 tmp18060 = _mm512_shuffle_f32x4(tmp18040, tmp18044, 221);
wt621 = _mm512_shuffle_f32x4(tmp18045, tmp18053, 136);
wt629 = _mm512_shuffle_f32x4(tmp18045, tmp18053, 221);
wt622 = _mm512_shuffle_f32x4(tmp18047, tmp18055, 136);
wt630 = _mm512_shuffle_f32x4(tmp18047, tmp18055, 221);
wt623 = _mm512_shuffle_f32x4(tmp18049, tmp18057, 136);
wt631 = _mm512_shuffle_f32x4(tmp18049, tmp18057, 221);
wt624 = _mm512_shuffle_f32x4(tmp18051, tmp18059, 136);
wt632 = _mm512_shuffle_f32x4(tmp18051, tmp18059, 221);
wt625 = _mm512_shuffle_f32x4(tmp18046, tmp18054, 136);
wt633 = _mm512_shuffle_f32x4(tmp18046, tmp18054, 221);
wt626 = _mm512_shuffle_f32x4(tmp18048, tmp18056, 136);
wt634 = _mm512_shuffle_f32x4(tmp18048, tmp18056, 221);
wt627 = _mm512_shuffle_f32x4(tmp18050, tmp18058, 136);
wt635 = _mm512_shuffle_f32x4(tmp18050, tmp18058, 221);
wt628 = _mm512_shuffle_f32x4(tmp18052, tmp18060, 136);
wt636 = _mm512_shuffle_f32x4(tmp18052, tmp18060, 221);
wt621 = _mm512_mul_ps(wt621, postMul59);
wt622 = _mm512_mul_ps(wt622, postMul59);
wt623 = _mm512_mul_ps(wt623, postMul59);
wt624 = _mm512_mul_ps(wt624, postMul59);
wt625 = _mm512_mul_ps(wt625, postMul59);
wt626 = _mm512_mul_ps(wt626, postMul59);
wt627 = _mm512_mul_ps(wt627, postMul59);
wt628 = _mm512_mul_ps(wt628, postMul59);
wt629 = _mm512_mul_ps(wt629, postMul59);
wt630 = _mm512_mul_ps(wt630, postMul59);
wt631 = _mm512_mul_ps(wt631, postMul59);
wt632 = _mm512_mul_ps(wt632, postMul59);
wt633 = _mm512_mul_ps(wt633, postMul59);
wt634 = _mm512_mul_ps(wt634, postMul59);
wt635 = _mm512_mul_ps(wt635, postMul59);
wt636 = _mm512_mul_ps(wt636, postMul59);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(1+16*c48)+(ptrdiff_t)0, 63>>cut25, wt621);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(2+16*c48)+(ptrdiff_t)0, 63>>cut25, wt622);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(3+16*c48)+(ptrdiff_t)0, 63>>cut25, wt623);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(4+16*c48)+(ptrdiff_t)0, 63>>cut25, wt624);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(5+16*c48)+(ptrdiff_t)0, 63>>cut25, wt625);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(6+16*c48)+(ptrdiff_t)0, 63>>cut25, wt626);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(7+16*c48)+(ptrdiff_t)0, 63>>cut25, wt627);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(8+16*c48)+(ptrdiff_t)0, 63>>cut25, wt628);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(9+16*c48)+(ptrdiff_t)0, 63>>cut25, wt629);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(10+16*c48)+(ptrdiff_t)0, 63>>cut25, wt630);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(11+16*c48)+(ptrdiff_t)0, 63>>cut25, wt631);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(12+16*c48)+(ptrdiff_t)0, 63>>cut25, wt632);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(13+16*c48)+(ptrdiff_t)0, 63>>cut25, wt633);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(14+16*c48)+(ptrdiff_t)0, 63>>cut25, wt634);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(15+16*c48)+(ptrdiff_t)0, 63>>cut25, wt635);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(16+16*c48)+(ptrdiff_t)0, 63>>cut25, wt636);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(1+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt621);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(2+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt622);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(3+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt623);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(4+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt624);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(5+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt625);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(6+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt626);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(7+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt627);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(8+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt628);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(9+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt629);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(10+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt630);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(11+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt631);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(12+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt632);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(13+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt633);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(14+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt634);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(15+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt635);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+24*(16+16*c48)+(ptrdiff_t)24576, 4032>>cut25, wt636);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(1+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt621);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(2+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt622);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(3+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt623);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(4+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt624);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(5+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt625);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(6+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt626);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(7+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt627);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(8+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt628);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(9+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt629);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(10+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt630);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(11+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt631);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(12+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt632);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(13+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt633);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(14+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt634);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(15+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt635);
_mm512_mask_storeu_ps(arranged17+1049600*i59+24600*l68+4*cut25+16*(16+16*c48)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt636);
}
}
}
}
}

// Launches the weight-arrangement worker (ResNet50OneArrangeWts9Callee1)
// through the threader.
//
// team63    - thread team handed unchanged to ResNet50ThreaderDo1.
// tensors99 - pointer array forwarded to the worker via the task's any1 field.
//
// The task describes a 3-dimensional hull of 16 x 1 x 1.
// NOTE(review): presumably the threader calls the worker once per point of
// that hull - confirm against ResNet50ThreaderDo1.
static void ResNet50OneArrangeWts9(ResNet50ThreaderTeam1* team63, char** tensors99) {
	ResNet50ThreaderTask1 job;
	// Shape of the iteration space.
	job.nd1 = 3;
	job.hull1[0] = 16;
	job.hull1[1] = 1;
	job.hull1[2] = 1;
	// Worker routine and its opaque argument.
	job.callee1 = ResNet50OneArrangeWts9Callee1;
	job.any1 = tensors99;
	ResNet50ThreaderDo1(team63, &job);
}

// Worker that copies one (row block, column slice) tile of the input buffer
// into the slice-major "arranged" buffer.
//
// task104 - task record; any1 holds a char** whose entry 0 is the source
//           buffer and entry 1 is the destination buffer.
// pt57    - task coordinates: pt57[0] selects a block of 128 consecutive
//           source rows, pt57[1] selects the column slice.
//
// Source rows are 832 bytes apart and a slice starts 256 bytes (64 floats)
// further into the row for each slice index. In the destination, slices are
// 262144 bytes apart.
// NOTE(review): rows look like input channels (1024 in total across the
// 8 blocks the dispatcher requests) - confirm against the layer definition.
static void ResNet50OneArrangeDats9Callee1(ResNet50ThreaderTask1* task104, int64_t* pt57) {
	char** bufs = task104->any1;
	const ptrdiff_t rowBlock = pt57[0];
	const ptrdiff_t slice = pt57[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	const ptrdiff_t rowBegin = 128*rowBlock;
	const ptrdiff_t rowEnd = rowBegin+128;

	if (slice != 3) {
		// Full-width slice: four 16-float vectors per row, packed into a
		// 256-byte destination record.
		for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
			char* from = src+256*slice+832*row;
			char* to = dst+262144*slice+256*row;
			__m512 v0 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)0);
			__m512 v1 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)64);
			__m512 v2 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)128);
			__m512 v3 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)192);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)0, 65535, v0);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)64, 65535, v1);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)128, 65535, v2);
			_mm512_mask_storeu_ps(to+(ptrdiff_t)192, 65535, v3);
		}
		return;
	}

	// Slice 3 is the narrow tail: only the lowest 4 lanes are copied, and
	// each row occupies a 64-byte destination record.
	for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
		__m512 tail = _mm512_maskz_loadu_ps(15, src+256*slice+832*row);
		_mm512_mask_storeu_ps(dst+262144*slice+64*row, 15, tail);
	}
}

// Launches the data-arrangement worker (ResNet50OneArrangeDats9Callee1)
// through the threader.
//
// team64     - thread team handed unchanged to ResNet50ThreaderDo1.
// tensors101 - pointer array forwarded to the worker via the task's any1 field.
//
// The task describes a 4-dimensional hull of 8 x 4 x 1 x 1. The worker reads
// coordinate 0 as a 128-row block index and coordinate 1 as a column-slice
// index.
// NOTE(review): presumably the threader calls the worker once per point of
// that hull - confirm against ResNet50ThreaderDo1.
static void ResNet50OneArrangeDats9(ResNet50ThreaderTeam1* team64, char** tensors101) {
	ResNet50ThreaderTask1 job;
	// Shape of the iteration space.
	job.nd1 = 4;
	job.hull1[0] = 8;
	job.hull1[1] = 4;
	job.hull1[2] = 1;
	job.hull1[3] = 1;
	// Worker routine and its opaque argument.
	job.callee1 = ResNet50OneArrangeDats9Callee1;
	job.any1 = tensors101;
	ResNet50ThreaderDo1(team64, &job);
}

// Worker that computes one output tile: (group of output rows) x (column slice).
//
// task106 - task record; any1 holds a void*[] whose entry 0 is a char** with
//           [0] the arranged weights, [1] the arranged data and [2] the output
//           buffer. Any further entries of the pair are not read here.
// pt58    - task coordinates: pt58[0] picks the output-row group (w70),
//           pt58[1] picks the column slice (d21).
//
// Four code paths exist, and exactly one runs per call (each ends in a return
// or falls off the end of the function):
//   slice != 3, group != 42 : 6 output rows x 4 vectors (64 floats)
//   slice != 3, group == 42 : 4 output rows x 4 vectors
//   slice == 3, group != 42 : 6 output rows x 1 vector, low 4 lanes stored
//   slice == 3, group == 42 : 4 output rows x 1 vector, low 4 lanes stored
//
// Every path starts its accumulators from the first record of the weight group
// (offset 0 within the group), then runs 1024 fused multiply-adds, clamps
// negatives to zero and stores.
// NOTE(review): the offset-0 record is presumably the folded bias and the
// 1024 steps presumably the input channels - confirm against the weight
// arrangement routine.
static void ResNet50OneApply9Callee1(ResNet50ThreaderTask1* task106, int64_t* pt58) {
void** pair28 = task106->any1;
char** tensors104 = pair28[0];
// e29 and g34 are fixed at 0 here, so the offsets scaled by them vanish.
ptrdiff_t e29 = 0;
ptrdiff_t g34 = 0;
ptrdiff_t d21 = pt58[1];
ptrdiff_t w70 = pt58[0];
char*restrict arrangedWts9 = tensors104[0]+856064*e29+(ptrdiff_t)1049600*1*g34;
char*restrict arrangedDats9 = tensors104[1]+694720*e29+(ptrdiff_t)851968*1*g34;
char*restrict datPtr33 = tensors104[2]+(ptrdiff_t)212992*1*g34;
ptrdiff_t ii27 = 1;
for (ptrdiff_t i61 = 0; i61 < ii27; ++i61) {
// jj50 equals the starting slice, so the slice loop body runs at most once;
// it is skipped entirely when the starting slice is 3 (the narrow tail slice).
ptrdiff_t j53 = 1*d21;
ptrdiff_t jj50 = j53+0;
for (; j53 != 3; ++j53) {
// Same single-pass pattern for the row group: skipped when the group is 42.
ptrdiff_t k163 = 1*w70;
ptrdiff_t kk57 = k163+0;
for (; k163 != 42; ++k163) {
// ---- Path 1: full slice, 6-row group ----
// With s50 == -1 the offsets below resolve to bytes 0..20 of the group:
// six floats, one initial value per output row.
ptrdiff_t s50 = -1;
__m512 sum537 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)24));
__m512 sum541 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)28));
__m512 sum545 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)32));
__m512 sum549 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)36));
__m512 sum553 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)40));
__m512 sum557 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)44));
// Each output row gets four accumulators (one per data vector), all
// starting from that row's initial value.
__m512 sum538 = sum537;
__m512 sum539 = sum537;
__m512 sum540 = sum537;
__m512 sum542 = sum541;
__m512 sum543 = sum541;
__m512 sum544 = sum541;
__m512 sum546 = sum545;
__m512 sum547 = sum545;
__m512 sum548 = sum545;
__m512 sum550 = sum549;
__m512 sum551 = sum549;
__m512 sum552 = sum549;
__m512 sum554 = sum553;
__m512 sum555 = sum553;
__m512 sum556 = sum553;
__m512 sum558 = sum557;
__m512 sum559 = sum557;
__m512 sum560 = sum557;
// Per step: load four data vectors (a 256-byte record), then for each of
// the six rows broadcast one weight and accumulate weight*data.
for (s50 = 0; s50 < 1024; ++s50) {
__m512 dat2328 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s50+(ptrdiff_t)0);
__m512 dat2329 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s50+(ptrdiff_t)64);
__m512 dat2330 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s50+(ptrdiff_t)128);
__m512 dat2331 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s50+(ptrdiff_t)192);
__m512 wt669 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)24));
sum537 = _mm512_fmadd_ps(wt669, dat2328, sum537);
sum538 = _mm512_fmadd_ps(wt669, dat2329, sum538);
sum539 = _mm512_fmadd_ps(wt669, dat2330, sum539);
sum540 = _mm512_fmadd_ps(wt669, dat2331, sum540);
__m512 wt670 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)28));
sum541 = _mm512_fmadd_ps(wt670, dat2328, sum541);
sum542 = _mm512_fmadd_ps(wt670, dat2329, sum542);
sum543 = _mm512_fmadd_ps(wt670, dat2330, sum543);
sum544 = _mm512_fmadd_ps(wt670, dat2331, sum544);
__m512 wt671 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)32));
sum545 = _mm512_fmadd_ps(wt671, dat2328, sum545);
sum546 = _mm512_fmadd_ps(wt671, dat2329, sum546);
sum547 = _mm512_fmadd_ps(wt671, dat2330, sum547);
sum548 = _mm512_fmadd_ps(wt671, dat2331, sum548);
__m512 wt672 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)36));
sum549 = _mm512_fmadd_ps(wt672, dat2328, sum549);
sum550 = _mm512_fmadd_ps(wt672, dat2329, sum550);
sum551 = _mm512_fmadd_ps(wt672, dat2330, sum551);
sum552 = _mm512_fmadd_ps(wt672, dat2331, sum552);
__m512 wt673 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)40));
sum553 = _mm512_fmadd_ps(wt673, dat2328, sum553);
sum554 = _mm512_fmadd_ps(wt673, dat2329, sum554);
sum555 = _mm512_fmadd_ps(wt673, dat2330, sum555);
sum556 = _mm512_fmadd_ps(wt673, dat2331, sum556);
__m512 wt674 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+24*s50+(ptrdiff_t)44));
sum557 = _mm512_fmadd_ps(wt674, dat2328, sum557);
sum558 = _mm512_fmadd_ps(wt674, dat2329, sum558);
sum559 = _mm512_fmadd_ps(wt674, dat2330, sum559);
sum560 = _mm512_fmadd_ps(wt674, dat2331, sum560);
}
// Clamp negatives to zero and store. Output rows are 832 bytes apart, a
// 6-row group spans 4992 bytes, and the slice starts 256*j53 bytes into
// each row.
sum537 = _mm512_max_ps(_mm512_setzero_ps(), sum537);
sum538 = _mm512_max_ps(_mm512_setzero_ps(), sum538);
sum539 = _mm512_max_ps(_mm512_setzero_ps(), sum539);
sum540 = _mm512_max_ps(_mm512_setzero_ps(), sum540);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)0, 65535, sum537);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)64, 65535, sum538);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)128, 65535, sum539);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)192, 65535, sum540);
sum541 = _mm512_max_ps(_mm512_setzero_ps(), sum541);
sum542 = _mm512_max_ps(_mm512_setzero_ps(), sum542);
sum543 = _mm512_max_ps(_mm512_setzero_ps(), sum543);
sum544 = _mm512_max_ps(_mm512_setzero_ps(), sum544);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)832, 65535, sum541);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)896, 65535, sum542);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)960, 65535, sum543);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1024, 65535, sum544);
sum545 = _mm512_max_ps(_mm512_setzero_ps(), sum545);
sum546 = _mm512_max_ps(_mm512_setzero_ps(), sum546);
sum547 = _mm512_max_ps(_mm512_setzero_ps(), sum547);
sum548 = _mm512_max_ps(_mm512_setzero_ps(), sum548);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1664, 65535, sum545);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1728, 65535, sum546);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1792, 65535, sum547);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1856, 65535, sum548);
sum549 = _mm512_max_ps(_mm512_setzero_ps(), sum549);
sum550 = _mm512_max_ps(_mm512_setzero_ps(), sum550);
sum551 = _mm512_max_ps(_mm512_setzero_ps(), sum551);
sum552 = _mm512_max_ps(_mm512_setzero_ps(), sum552);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2496, 65535, sum549);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2560, 65535, sum550);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2624, 65535, sum551);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2688, 65535, sum552);
sum553 = _mm512_max_ps(_mm512_setzero_ps(), sum553);
sum554 = _mm512_max_ps(_mm512_setzero_ps(), sum554);
sum555 = _mm512_max_ps(_mm512_setzero_ps(), sum555);
sum556 = _mm512_max_ps(_mm512_setzero_ps(), sum556);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)3328, 65535, sum553);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)3392, 65535, sum554);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)3456, 65535, sum555);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)3520, 65535, sum556);
sum557 = _mm512_max_ps(_mm512_setzero_ps(), sum557);
sum558 = _mm512_max_ps(_mm512_setzero_ps(), sum558);
sum559 = _mm512_max_ps(_mm512_setzero_ps(), sum559);
sum560 = _mm512_max_ps(_mm512_setzero_ps(), sum560);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)4160, 65535, sum557);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)4224, 65535, sum558);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)4288, 65535, sum559);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)4352, 65535, sum560);
// kk57 is the starting group, so this is always true here: one group per call.
if (k163 >= kk57) return;
}
// ---- Path 2: full slice, final group (k163 == 42) with 4 output rows ----
// Weight records in this group are 16 bytes (4 floats) instead of 24.
// With s51 == -1 the offsets resolve to bytes 0..12 of the group.
ptrdiff_t s51 = -1;
__m512 sum561 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)16));
__m512 sum565 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)20));
__m512 sum569 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)24));
__m512 sum573 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)28));
__m512 sum562 = sum561;
__m512 sum563 = sum561;
__m512 sum564 = sum561;
__m512 sum566 = sum565;
__m512 sum567 = sum565;
__m512 sum568 = sum565;
__m512 sum570 = sum569;
__m512 sum571 = sum569;
__m512 sum572 = sum569;
__m512 sum574 = sum573;
__m512 sum575 = sum573;
__m512 sum576 = sum573;
for (s51 = 0; s51 < 1024; ++s51) {
__m512 dat2332 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s51+(ptrdiff_t)0);
__m512 dat2333 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s51+(ptrdiff_t)64);
__m512 dat2334 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s51+(ptrdiff_t)128);
__m512 dat2335 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+256*s51+(ptrdiff_t)192);
__m512 wt675 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)16));
sum561 = _mm512_fmadd_ps(wt675, dat2332, sum561);
sum562 = _mm512_fmadd_ps(wt675, dat2333, sum562);
sum563 = _mm512_fmadd_ps(wt675, dat2334, sum563);
sum564 = _mm512_fmadd_ps(wt675, dat2335, sum564);
__m512 wt676 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)20));
sum565 = _mm512_fmadd_ps(wt676, dat2332, sum565);
sum566 = _mm512_fmadd_ps(wt676, dat2333, sum566);
sum567 = _mm512_fmadd_ps(wt676, dat2334, sum567);
sum568 = _mm512_fmadd_ps(wt676, dat2335, sum568);
__m512 wt677 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)24));
sum569 = _mm512_fmadd_ps(wt677, dat2332, sum569);
sum570 = _mm512_fmadd_ps(wt677, dat2333, sum570);
sum571 = _mm512_fmadd_ps(wt677, dat2334, sum571);
sum572 = _mm512_fmadd_ps(wt677, dat2335, sum572);
__m512 wt678 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k163+16*s51+(ptrdiff_t)28));
sum573 = _mm512_fmadd_ps(wt678, dat2332, sum573);
sum574 = _mm512_fmadd_ps(wt678, dat2333, sum574);
sum575 = _mm512_fmadd_ps(wt678, dat2334, sum575);
sum576 = _mm512_fmadd_ps(wt678, dat2335, sum576);
}
sum561 = _mm512_max_ps(_mm512_setzero_ps(), sum561);
sum562 = _mm512_max_ps(_mm512_setzero_ps(), sum562);
sum563 = _mm512_max_ps(_mm512_setzero_ps(), sum563);
sum564 = _mm512_max_ps(_mm512_setzero_ps(), sum564);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)0, 65535, sum561);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)64, 65535, sum562);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)128, 65535, sum563);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)192, 65535, sum564);
sum565 = _mm512_max_ps(_mm512_setzero_ps(), sum565);
sum566 = _mm512_max_ps(_mm512_setzero_ps(), sum566);
sum567 = _mm512_max_ps(_mm512_setzero_ps(), sum567);
sum568 = _mm512_max_ps(_mm512_setzero_ps(), sum568);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)832, 65535, sum565);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)896, 65535, sum566);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)960, 65535, sum567);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1024, 65535, sum568);
sum569 = _mm512_max_ps(_mm512_setzero_ps(), sum569);
sum570 = _mm512_max_ps(_mm512_setzero_ps(), sum570);
sum571 = _mm512_max_ps(_mm512_setzero_ps(), sum571);
sum572 = _mm512_max_ps(_mm512_setzero_ps(), sum572);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1664, 65535, sum569);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1728, 65535, sum570);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1792, 65535, sum571);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)1856, 65535, sum572);
sum573 = _mm512_max_ps(_mm512_setzero_ps(), sum573);
sum574 = _mm512_max_ps(_mm512_setzero_ps(), sum574);
sum575 = _mm512_max_ps(_mm512_setzero_ps(), sum575);
sum576 = _mm512_max_ps(_mm512_setzero_ps(), sum576);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2496, 65535, sum573);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2560, 65535, sum574);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2624, 65535, sum575);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k163+(ptrdiff_t)2688, 65535, sum576);
// jj50 is the starting slice, so this is always true here: one slice per call.
if (j53 >= jj50) return;
}
// From here on j53 == 3: the narrow tail slice. Arranged data records are
// 64 bytes (one vector) per step, and only the low 4 lanes are stored.
ptrdiff_t k164 = 1*w70;
ptrdiff_t kk58 = k164+0;
for (; k164 != 42; ++k164) {
// ---- Path 3: tail slice, 6-row group ----
ptrdiff_t s52 = -1;
__m512 sum577 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)24));
__m512 sum578 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)28));
__m512 sum579 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)32));
__m512 sum580 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)36));
__m512 sum581 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)40));
__m512 sum582 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)44));
for (s52 = 0; s52 < 1024; ++s52) {
// Full 16-lane load; lanes above the low 4 are computed but never stored.
__m512 dat2336 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+64*s52+(ptrdiff_t)0);
__m512 wt679 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)24));
sum577 = _mm512_fmadd_ps(wt679, dat2336, sum577);
__m512 wt680 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)28));
sum578 = _mm512_fmadd_ps(wt680, dat2336, sum578);
__m512 wt681 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)32));
sum579 = _mm512_fmadd_ps(wt681, dat2336, sum579);
__m512 wt682 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)36));
sum580 = _mm512_fmadd_ps(wt682, dat2336, sum580);
__m512 wt683 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)40));
sum581 = _mm512_fmadd_ps(wt683, dat2336, sum581);
__m512 wt684 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+24*s52+(ptrdiff_t)44));
sum582 = _mm512_fmadd_ps(wt684, dat2336, sum582);
}
// Mask 15 writes only the lowest 4 lanes of each row.
sum577 = _mm512_max_ps(_mm512_setzero_ps(), sum577);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)0, 15, sum577);
sum578 = _mm512_max_ps(_mm512_setzero_ps(), sum578);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)832, 15, sum578);
sum579 = _mm512_max_ps(_mm512_setzero_ps(), sum579);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)1664, 15, sum579);
sum580 = _mm512_max_ps(_mm512_setzero_ps(), sum580);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)2496, 15, sum580);
sum581 = _mm512_max_ps(_mm512_setzero_ps(), sum581);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)3328, 15, sum581);
sum582 = _mm512_max_ps(_mm512_setzero_ps(), sum582);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)4160, 15, sum582);
// Always true on the first pass (kk58 is the starting group).
if (k164 >= kk58) return;
}
// ---- Path 4: tail slice, final group (k164 == 42) with 4 output rows ----
ptrdiff_t s53 = -1;
__m512 sum583 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)16));
__m512 sum584 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)20));
__m512 sum585 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)24));
__m512 sum586 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)28));
for (s53 = 0; s53 < 1024; ++s53) {
__m512 dat2337 = _mm512_loadu_ps(arrangedDats9+851968*i61+262144*j53+64*s53+(ptrdiff_t)0);
__m512 wt685 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)16));
sum583 = _mm512_fmadd_ps(wt685, dat2337, sum583);
__m512 wt686 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)20));
sum584 = _mm512_fmadd_ps(wt686, dat2337, sum584);
__m512 wt687 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)24));
sum585 = _mm512_fmadd_ps(wt687, dat2337, sum585);
__m512 wt688 = _mm512_set1_ps(*(float*)(arrangedWts9+1049600*i61+24600*k164+16*s53+(ptrdiff_t)28));
sum586 = _mm512_fmadd_ps(wt688, dat2337, sum586);
}
sum583 = _mm512_max_ps(_mm512_setzero_ps(), sum583);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)0, 15, sum583);
sum584 = _mm512_max_ps(_mm512_setzero_ps(), sum584);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)832, 15, sum584);
sum585 = _mm512_max_ps(_mm512_setzero_ps(), sum585);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)1664, 15, sum585);
sum586 = _mm512_max_ps(_mm512_setzero_ps(), sum586);
_mm512_mask_storeu_ps(datPtr33+212992*i61+256*j53+4992*k164+(ptrdiff_t)2496, 15, sum586);
}
}

// Launches the compute worker (ResNet50OneApply9Callee1) through the threader.
//
// team65     - thread team handed unchanged to ResNet50ThreaderDo1.
// tensors103 - pointer array; it is wrapped as entry 0 of a two-element
//              void* pair (entry 1 is null) and that pair is what the worker
//              receives via the task's any1 field.
//
// The task describes a 3-dimensional hull of 43 x 4 x 1. The worker reads
// coordinate 0 as the output-row group and coordinate 1 as the column slice.
// NOTE(review): the pair lives on this function's stack, so this relies on
// ResNet50ThreaderDo1 not returning before the workers finish - confirm.
static void ResNet50OneApply9(ResNet50ThreaderTeam1* team65, char** tensors103) {
	void* args[2] = {tensors103, 0};
	ResNet50ThreaderTask1 job;
	// Shape of the iteration space.
	job.nd1 = 3;
	job.hull1[0] = 43;
	job.hull1[1] = 4;
	job.hull1[2] = 1;
	// Worker routine and its opaque argument.
	job.callee1 = ResNet50OneApply9Callee1;
	job.any1 = args;
	ResNet50ThreaderDo1(team65, &job);
}

static void ResNet50OneArrangeWts10Callee1(ResNet50ThreaderTask1* task116, int64_t* pt63) {
char** tensors114 = task116->any1;
ptrdiff_t b71 = pt63[0];
char*restrict wtPtr20 = tensors114[0]+(ptrdiff_t)3340*0+(ptrdiff_t)10485760*0;
char*restrict biasPtr20 = tensors114[1]+(ptrdiff_t)10240*0;
char*restrict bnPtr20 = tensors114[2]+(ptrdiff_t)8*2560*0;
char*restrict wtPtr21 = tensors114[3]+(ptrdiff_t)3340*0+(ptrdiff_t)10485760*0;
char*restrict biasPtr21 = tensors114[4]+(ptrdiff_t)10240*0;
char*restrict bnPtr21 = tensors114[5]+(ptrdiff_t)8*2560*0;
char*restrict arranged19 = tensors114[6]+(ptrdiff_t)8560640*0+(ptrdiff_t)10496000*0;
ptrdiff_t ii28 = 1;
for (ptrdiff_t i67 = 0; i67 < ii28; ++i67) {
ptrdiff_t j58 = 1*b71;
ptrdiff_t jj52 = j58+1;
for (; j58 < jj52; ++j58) {
if (j58 < 128) {
ptrdiff_t k171 = 0+16*(j58-0);
ptrdiff_t l74 = (size_t)(0+k171)/6;
ptrdiff_t cut28 = (size_t)(0+k171)%6;
switch (cut28) {
case 0:;
case 2: {
__m512 sum623 = _mm512_maskz_loadu_ps(65535, biasPtr20+10240*i67+4*k171);
__m512i pmMul41 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd41 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo34 = _mm512_loadu_ps(bnPtr20+(ptrdiff_t)8*(k171+2560*i67));
__m512 masHi34 = _mm512_maskz_loadu_ps(65535, bnPtr20+(ptrdiff_t)8*(k171+2560*i67)+(ptrdiff_t)64);
__m512 postMul67 = _mm512_permutex2var_ps(masLo34, pmMul41, masHi34);
__m512 postAdd41 = _mm512_permutex2var_ps(masLo34, pmAdd41, masHi34);
sum623 = _mm512_fmadd_ps(sum623, postMul67, postAdd41);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)0, 63>>cut28, sum623);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)24576, 4032>>cut28, sum623);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)49152, 65535-(4095>>cut28), sum623);
ptrdiff_t c53 = 0;
for (; c53 != 64; ++c53) {
__m512 wt693 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)0);
__m512 wt694 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)4096);
__m512 wt695 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)8192);
__m512 wt696 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)12288);
__m512 wt697 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)16384);
__m512 wt698 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)20480);
__m512 wt699 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)24576);
__m512 wt700 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)28672);
__m512 wt701 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)32768);
__m512 wt702 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)36864);
__m512 wt703 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)40960);
__m512 wt704 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)45056);
__m512 wt705 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)49152);
__m512 wt706 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)53248);
__m512 wt707 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)57344);
__m512 wt708 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c53+(ptrdiff_t)61440);
__m512 tmp19277 = _mm512_unpacklo_ps(wt693, wt694);
__m512 tmp19278 = _mm512_unpackhi_ps(wt693, wt694);
__m512 tmp19279 = _mm512_unpacklo_ps(wt695, wt696);
__m512 tmp19280 = _mm512_unpackhi_ps(wt695, wt696);
__m512 tmp19281 = _mm512_unpacklo_ps(wt697, wt698);
__m512 tmp19282 = _mm512_unpackhi_ps(wt697, wt698);
__m512 tmp19283 = _mm512_unpacklo_ps(wt699, wt700);
__m512 tmp19284 = _mm512_unpackhi_ps(wt699, wt700);
__m512 tmp19285 = _mm512_unpacklo_ps(wt701, wt702);
__m512 tmp19286 = _mm512_unpackhi_ps(wt701, wt702);
__m512 tmp19287 = _mm512_unpacklo_ps(wt703, wt704);
__m512 tmp19288 = _mm512_unpackhi_ps(wt703, wt704);
__m512 tmp19289 = _mm512_unpacklo_ps(wt705, wt706);
__m512 tmp19290 = _mm512_unpackhi_ps(wt705, wt706);
__m512 tmp19291 = _mm512_unpacklo_ps(wt707, wt708);
__m512 tmp19292 = _mm512_unpackhi_ps(wt707, wt708);
__m512 tmp19293 = _mm512_shuffle_ps(tmp19277, tmp19279, 68);
__m512 tmp19294 = _mm512_shuffle_ps(tmp19277, tmp19279, 238);
__m512 tmp19295 = _mm512_shuffle_ps(tmp19278, tmp19280, 68);
__m512 tmp19296 = _mm512_shuffle_ps(tmp19278, tmp19280, 238);
__m512 tmp19297 = _mm512_shuffle_ps(tmp19281, tmp19283, 68);
__m512 tmp19298 = _mm512_shuffle_ps(tmp19281, tmp19283, 238);
__m512 tmp19299 = _mm512_shuffle_ps(tmp19282, tmp19284, 68);
__m512 tmp19300 = _mm512_shuffle_ps(tmp19282, tmp19284, 238);
__m512 tmp19301 = _mm512_shuffle_ps(tmp19285, tmp19287, 68);
__m512 tmp19302 = _mm512_shuffle_ps(tmp19285, tmp19287, 238);
__m512 tmp19303 = _mm512_shuffle_ps(tmp19286, tmp19288, 68);
__m512 tmp19304 = _mm512_shuffle_ps(tmp19286, tmp19288, 238);
__m512 tmp19305 = _mm512_shuffle_ps(tmp19289, tmp19291, 68);
__m512 tmp19306 = _mm512_shuffle_ps(tmp19289, tmp19291, 238);
__m512 tmp19307 = _mm512_shuffle_ps(tmp19290, tmp19292, 68);
__m512 tmp19308 = _mm512_shuffle_ps(tmp19290, tmp19292, 238);
__m512 tmp19309 = _mm512_shuffle_f32x4(tmp19293, tmp19297, 136);
__m512 tmp19310 = _mm512_shuffle_f32x4(tmp19293, tmp19297, 221);
__m512 tmp19311 = _mm512_shuffle_f32x4(tmp19294, tmp19298, 136);
__m512 tmp19312 = _mm512_shuffle_f32x4(tmp19294, tmp19298, 221);
__m512 tmp19313 = _mm512_shuffle_f32x4(tmp19295, tmp19299, 136);
__m512 tmp19314 = _mm512_shuffle_f32x4(tmp19295, tmp19299, 221);
__m512 tmp19315 = _mm512_shuffle_f32x4(tmp19296, tmp19300, 136);
__m512 tmp19316 = _mm512_shuffle_f32x4(tmp19296, tmp19300, 221);
__m512 tmp19317 = _mm512_shuffle_f32x4(tmp19301, tmp19305, 136);
__m512 tmp19318 = _mm512_shuffle_f32x4(tmp19301, tmp19305, 221);
__m512 tmp19319 = _mm512_shuffle_f32x4(tmp19302, tmp19306, 136);
__m512 tmp19320 = _mm512_shuffle_f32x4(tmp19302, tmp19306, 221);
__m512 tmp19321 = _mm512_shuffle_f32x4(tmp19303, tmp19307, 136);
__m512 tmp19322 = _mm512_shuffle_f32x4(tmp19303, tmp19307, 221);
__m512 tmp19323 = _mm512_shuffle_f32x4(tmp19304, tmp19308, 136);
__m512 tmp19324 = _mm512_shuffle_f32x4(tmp19304, tmp19308, 221);
wt693 = _mm512_shuffle_f32x4(tmp19309, tmp19317, 136);
wt701 = _mm512_shuffle_f32x4(tmp19309, tmp19317, 221);
wt694 = _mm512_shuffle_f32x4(tmp19311, tmp19319, 136);
wt702 = _mm512_shuffle_f32x4(tmp19311, tmp19319, 221);
wt695 = _mm512_shuffle_f32x4(tmp19313, tmp19321, 136);
wt703 = _mm512_shuffle_f32x4(tmp19313, tmp19321, 221);
wt696 = _mm512_shuffle_f32x4(tmp19315, tmp19323, 136);
wt704 = _mm512_shuffle_f32x4(tmp19315, tmp19323, 221);
wt697 = _mm512_shuffle_f32x4(tmp19310, tmp19318, 136);
wt705 = _mm512_shuffle_f32x4(tmp19310, tmp19318, 221);
wt698 = _mm512_shuffle_f32x4(tmp19312, tmp19320, 136);
wt706 = _mm512_shuffle_f32x4(tmp19312, tmp19320, 221);
wt699 = _mm512_shuffle_f32x4(tmp19314, tmp19322, 136);
wt707 = _mm512_shuffle_f32x4(tmp19314, tmp19322, 221);
wt700 = _mm512_shuffle_f32x4(tmp19316, tmp19324, 136);
wt708 = _mm512_shuffle_f32x4(tmp19316, tmp19324, 221);
wt693 = _mm512_mul_ps(wt693, postMul67);
wt694 = _mm512_mul_ps(wt694, postMul67);
wt695 = _mm512_mul_ps(wt695, postMul67);
wt696 = _mm512_mul_ps(wt696, postMul67);
wt697 = _mm512_mul_ps(wt697, postMul67);
wt698 = _mm512_mul_ps(wt698, postMul67);
wt699 = _mm512_mul_ps(wt699, postMul67);
wt700 = _mm512_mul_ps(wt700, postMul67);
wt701 = _mm512_mul_ps(wt701, postMul67);
wt702 = _mm512_mul_ps(wt702, postMul67);
wt703 = _mm512_mul_ps(wt703, postMul67);
wt704 = _mm512_mul_ps(wt704, postMul67);
wt705 = _mm512_mul_ps(wt705, postMul67);
wt706 = _mm512_mul_ps(wt706, postMul67);
wt707 = _mm512_mul_ps(wt707, postMul67);
wt708 = _mm512_mul_ps(wt708, postMul67);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c53)+(ptrdiff_t)0, 63>>cut28, wt693);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c53)+(ptrdiff_t)0, 63>>cut28, wt694);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c53)+(ptrdiff_t)0, 63>>cut28, wt695);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c53)+(ptrdiff_t)0, 63>>cut28, wt696);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c53)+(ptrdiff_t)0, 63>>cut28, wt697);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c53)+(ptrdiff_t)0, 63>>cut28, wt698);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c53)+(ptrdiff_t)0, 63>>cut28, wt699);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c53)+(ptrdiff_t)0, 63>>cut28, wt700);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c53)+(ptrdiff_t)0, 63>>cut28, wt701);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c53)+(ptrdiff_t)0, 63>>cut28, wt702);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c53)+(ptrdiff_t)0, 63>>cut28, wt703);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c53)+(ptrdiff_t)0, 63>>cut28, wt704);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c53)+(ptrdiff_t)0, 63>>cut28, wt705);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c53)+(ptrdiff_t)0, 63>>cut28, wt706);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c53)+(ptrdiff_t)0, 63>>cut28, wt707);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c53)+(ptrdiff_t)0, 63>>cut28, wt708);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt693);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt694);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt695);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt696);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt697);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt698);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt699);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt700);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt701);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt702);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt703);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt704);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt705);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt706);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt707);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c53)+(ptrdiff_t)24576, 4032>>cut28, wt708);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt693);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt694);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt695);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt696);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt697);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt698);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt699);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt700);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt701);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt702);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt703);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt704);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt705);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt706);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt707);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c53)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt708);
}
break;
}
default: {
cut28 = 4;
__m512 sum624 = _mm512_maskz_loadu_ps(65535, biasPtr20+10240*i67+4*k171);
__m512i pmMul42 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd42 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo35 = _mm512_loadu_ps(bnPtr20+(ptrdiff_t)8*(k171+2560*i67));
__m512 masHi35 = _mm512_maskz_loadu_ps(65535, bnPtr20+(ptrdiff_t)8*(k171+2560*i67)+(ptrdiff_t)64);
__m512 postMul68 = _mm512_permutex2var_ps(masLo35, pmMul42, masHi35);
__m512 postAdd42 = _mm512_permutex2var_ps(masLo35, pmAdd42, masHi35);
sum624 = _mm512_fmadd_ps(sum624, postMul68, postAdd42);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)0, 63>>cut28, sum624);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)24576, 4032>>cut28, sum624);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)49152, 258048>>cut28, sum624);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*0+(ptrdiff_t)73728, 65535-(262143>>cut28), sum624);
ptrdiff_t c54 = 0;
for (; c54 != 64; ++c54) {
__m512 wt709 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)0);
__m512 wt710 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)4096);
__m512 wt711 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)8192);
__m512 wt712 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)12288);
__m512 wt713 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)16384);
__m512 wt714 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)20480);
__m512 wt715 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)24576);
__m512 wt716 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)28672);
__m512 wt717 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)32768);
__m512 wt718 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)36864);
__m512 wt719 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)40960);
__m512 wt720 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)45056);
__m512 wt721 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)49152);
__m512 wt722 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)53248);
__m512 wt723 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)57344);
__m512 wt724 = _mm512_maskz_loadu_ps(65535, wtPtr20+10485760*i67+4096*k171+64*c54+(ptrdiff_t)61440);
__m512 tmp19325 = _mm512_unpacklo_ps(wt709, wt710);
__m512 tmp19326 = _mm512_unpackhi_ps(wt709, wt710);
__m512 tmp19327 = _mm512_unpacklo_ps(wt711, wt712);
__m512 tmp19328 = _mm512_unpackhi_ps(wt711, wt712);
__m512 tmp19329 = _mm512_unpacklo_ps(wt713, wt714);
__m512 tmp19330 = _mm512_unpackhi_ps(wt713, wt714);
__m512 tmp19331 = _mm512_unpacklo_ps(wt715, wt716);
__m512 tmp19332 = _mm512_unpackhi_ps(wt715, wt716);
__m512 tmp19333 = _mm512_unpacklo_ps(wt717, wt718);
__m512 tmp19334 = _mm512_unpackhi_ps(wt717, wt718);
__m512 tmp19335 = _mm512_unpacklo_ps(wt719, wt720);
__m512 tmp19336 = _mm512_unpackhi_ps(wt719, wt720);
__m512 tmp19337 = _mm512_unpacklo_ps(wt721, wt722);
__m512 tmp19338 = _mm512_unpackhi_ps(wt721, wt722);
__m512 tmp19339 = _mm512_unpacklo_ps(wt723, wt724);
__m512 tmp19340 = _mm512_unpackhi_ps(wt723, wt724);
__m512 tmp19341 = _mm512_shuffle_ps(tmp19325, tmp19327, 68);
__m512 tmp19342 = _mm512_shuffle_ps(tmp19325, tmp19327, 238);
__m512 tmp19343 = _mm512_shuffle_ps(tmp19326, tmp19328, 68);
__m512 tmp19344 = _mm512_shuffle_ps(tmp19326, tmp19328, 238);
__m512 tmp19345 = _mm512_shuffle_ps(tmp19329, tmp19331, 68);
__m512 tmp19346 = _mm512_shuffle_ps(tmp19329, tmp19331, 238);
__m512 tmp19347 = _mm512_shuffle_ps(tmp19330, tmp19332, 68);
__m512 tmp19348 = _mm512_shuffle_ps(tmp19330, tmp19332, 238);
__m512 tmp19349 = _mm512_shuffle_ps(tmp19333, tmp19335, 68);
__m512 tmp19350 = _mm512_shuffle_ps(tmp19333, tmp19335, 238);
__m512 tmp19351 = _mm512_shuffle_ps(tmp19334, tmp19336, 68);
__m512 tmp19352 = _mm512_shuffle_ps(tmp19334, tmp19336, 238);
__m512 tmp19353 = _mm512_shuffle_ps(tmp19337, tmp19339, 68);
__m512 tmp19354 = _mm512_shuffle_ps(tmp19337, tmp19339, 238);
__m512 tmp19355 = _mm512_shuffle_ps(tmp19338, tmp19340, 68);
__m512 tmp19356 = _mm512_shuffle_ps(tmp19338, tmp19340, 238);
__m512 tmp19357 = _mm512_shuffle_f32x4(tmp19341, tmp19345, 136);
__m512 tmp19358 = _mm512_shuffle_f32x4(tmp19341, tmp19345, 221);
__m512 tmp19359 = _mm512_shuffle_f32x4(tmp19342, tmp19346, 136);
__m512 tmp19360 = _mm512_shuffle_f32x4(tmp19342, tmp19346, 221);
__m512 tmp19361 = _mm512_shuffle_f32x4(tmp19343, tmp19347, 136);
__m512 tmp19362 = _mm512_shuffle_f32x4(tmp19343, tmp19347, 221);
__m512 tmp19363 = _mm512_shuffle_f32x4(tmp19344, tmp19348, 136);
__m512 tmp19364 = _mm512_shuffle_f32x4(tmp19344, tmp19348, 221);
__m512 tmp19365 = _mm512_shuffle_f32x4(tmp19349, tmp19353, 136);
__m512 tmp19366 = _mm512_shuffle_f32x4(tmp19349, tmp19353, 221);
__m512 tmp19367 = _mm512_shuffle_f32x4(tmp19350, tmp19354, 136);
__m512 tmp19368 = _mm512_shuffle_f32x4(tmp19350, tmp19354, 221);
__m512 tmp19369 = _mm512_shuffle_f32x4(tmp19351, tmp19355, 136);
__m512 tmp19370 = _mm512_shuffle_f32x4(tmp19351, tmp19355, 221);
__m512 tmp19371 = _mm512_shuffle_f32x4(tmp19352, tmp19356, 136);
__m512 tmp19372 = _mm512_shuffle_f32x4(tmp19352, tmp19356, 221);
wt709 = _mm512_shuffle_f32x4(tmp19357, tmp19365, 136);
wt717 = _mm512_shuffle_f32x4(tmp19357, tmp19365, 221);
wt710 = _mm512_shuffle_f32x4(tmp19359, tmp19367, 136);
wt718 = _mm512_shuffle_f32x4(tmp19359, tmp19367, 221);
wt711 = _mm512_shuffle_f32x4(tmp19361, tmp19369, 136);
wt719 = _mm512_shuffle_f32x4(tmp19361, tmp19369, 221);
wt712 = _mm512_shuffle_f32x4(tmp19363, tmp19371, 136);
wt720 = _mm512_shuffle_f32x4(tmp19363, tmp19371, 221);
wt713 = _mm512_shuffle_f32x4(tmp19358, tmp19366, 136);
wt721 = _mm512_shuffle_f32x4(tmp19358, tmp19366, 221);
wt714 = _mm512_shuffle_f32x4(tmp19360, tmp19368, 136);
wt722 = _mm512_shuffle_f32x4(tmp19360, tmp19368, 221);
wt715 = _mm512_shuffle_f32x4(tmp19362, tmp19370, 136);
wt723 = _mm512_shuffle_f32x4(tmp19362, tmp19370, 221);
wt716 = _mm512_shuffle_f32x4(tmp19364, tmp19372, 136);
wt724 = _mm512_shuffle_f32x4(tmp19364, tmp19372, 221);
wt709 = _mm512_mul_ps(wt709, postMul68);
wt710 = _mm512_mul_ps(wt710, postMul68);
wt711 = _mm512_mul_ps(wt711, postMul68);
wt712 = _mm512_mul_ps(wt712, postMul68);
wt713 = _mm512_mul_ps(wt713, postMul68);
wt714 = _mm512_mul_ps(wt714, postMul68);
wt715 = _mm512_mul_ps(wt715, postMul68);
wt716 = _mm512_mul_ps(wt716, postMul68);
wt717 = _mm512_mul_ps(wt717, postMul68);
wt718 = _mm512_mul_ps(wt718, postMul68);
wt719 = _mm512_mul_ps(wt719, postMul68);
wt720 = _mm512_mul_ps(wt720, postMul68);
wt721 = _mm512_mul_ps(wt721, postMul68);
wt722 = _mm512_mul_ps(wt722, postMul68);
wt723 = _mm512_mul_ps(wt723, postMul68);
wt724 = _mm512_mul_ps(wt724, postMul68);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c54)+(ptrdiff_t)0, 63>>cut28, wt709);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c54)+(ptrdiff_t)0, 63>>cut28, wt710);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c54)+(ptrdiff_t)0, 63>>cut28, wt711);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c54)+(ptrdiff_t)0, 63>>cut28, wt712);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c54)+(ptrdiff_t)0, 63>>cut28, wt713);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c54)+(ptrdiff_t)0, 63>>cut28, wt714);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c54)+(ptrdiff_t)0, 63>>cut28, wt715);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c54)+(ptrdiff_t)0, 63>>cut28, wt716);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c54)+(ptrdiff_t)0, 63>>cut28, wt717);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c54)+(ptrdiff_t)0, 63>>cut28, wt718);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c54)+(ptrdiff_t)0, 63>>cut28, wt719);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c54)+(ptrdiff_t)0, 63>>cut28, wt720);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c54)+(ptrdiff_t)0, 63>>cut28, wt721);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c54)+(ptrdiff_t)0, 63>>cut28, wt722);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c54)+(ptrdiff_t)0, 63>>cut28, wt723);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c54)+(ptrdiff_t)0, 63>>cut28, wt724);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt709);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt710);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt711);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt712);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt713);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt714);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt715);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt716);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt717);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt718);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt719);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt720);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt721);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt722);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt723);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c54)+(ptrdiff_t)24576, 4032>>cut28, wt724);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt709);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt710);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt711);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt712);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt713);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt714);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt715);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt716);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt717);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt718);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt719);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt720);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt721);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt722);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt723);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c54)+(ptrdiff_t)49152, 258048>>cut28, wt724);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(1+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt709);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(2+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt710);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(3+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt711);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(4+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt712);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(5+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt713);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(6+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt714);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(7+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt715);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(8+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt716);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(9+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt717);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(10+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt718);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(11+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt719);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(12+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt720);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(13+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt721);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(14+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt722);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(15+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt723);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l74+4*cut28+24*(16+16*c54)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt724);
}
}
}
} else if (j58 < 159) {
ptrdiff_t k173 = 0+16*(j58-128);
ptrdiff_t l76 = (size_t)(2048+k173)/6;
ptrdiff_t cut30 = (size_t)(2048+k173)%6;
switch (cut30) {
case 0:;
case 2: {
__m512 sum626 = _mm512_maskz_loadu_ps(65535, biasPtr21+10240*i67+4*k173);
__m512i pmMul43 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd43 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo36 = _mm512_loadu_ps(bnPtr21+(ptrdiff_t)8*(k173+2560*i67));
__m512 masHi36 = _mm512_maskz_loadu_ps(65535, bnPtr21+(ptrdiff_t)8*(k173+2560*i67)+(ptrdiff_t)64);
__m512 postMul70 = _mm512_permutex2var_ps(masLo36, pmMul43, masHi36);
__m512 postAdd44 = _mm512_permutex2var_ps(masLo36, pmAdd43, masHi36);
sum626 = _mm512_fmadd_ps(sum626, postMul70, postAdd44);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)0, 63>>cut30, sum626);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)24576, 4032>>cut30, sum626);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)49152, 65535-(4095>>cut30), sum626);
ptrdiff_t c56 = 0;
for (; c56 != 64; ++c56) {
__m512 wt741 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)0);
__m512 wt742 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)4096);
__m512 wt743 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)8192);
__m512 wt744 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)12288);
__m512 wt745 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)16384);
__m512 wt746 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)20480);
__m512 wt747 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)24576);
__m512 wt748 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)28672);
__m512 wt749 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)32768);
__m512 wt750 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)36864);
__m512 wt751 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)40960);
__m512 wt752 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)45056);
__m512 wt753 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)49152);
__m512 wt754 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)53248);
__m512 wt755 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)57344);
__m512 wt756 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c56+(ptrdiff_t)61440);
__m512 tmp19373 = _mm512_unpacklo_ps(wt741, wt742);
__m512 tmp19374 = _mm512_unpackhi_ps(wt741, wt742);
__m512 tmp19375 = _mm512_unpacklo_ps(wt743, wt744);
__m512 tmp19376 = _mm512_unpackhi_ps(wt743, wt744);
__m512 tmp19377 = _mm512_unpacklo_ps(wt745, wt746);
__m512 tmp19378 = _mm512_unpackhi_ps(wt745, wt746);
__m512 tmp19379 = _mm512_unpacklo_ps(wt747, wt748);
__m512 tmp19380 = _mm512_unpackhi_ps(wt747, wt748);
__m512 tmp19381 = _mm512_unpacklo_ps(wt749, wt750);
__m512 tmp19382 = _mm512_unpackhi_ps(wt749, wt750);
__m512 tmp19383 = _mm512_unpacklo_ps(wt751, wt752);
__m512 tmp19384 = _mm512_unpackhi_ps(wt751, wt752);
__m512 tmp19385 = _mm512_unpacklo_ps(wt753, wt754);
__m512 tmp19386 = _mm512_unpackhi_ps(wt753, wt754);
__m512 tmp19387 = _mm512_unpacklo_ps(wt755, wt756);
__m512 tmp19388 = _mm512_unpackhi_ps(wt755, wt756);
__m512 tmp19389 = _mm512_shuffle_ps(tmp19373, tmp19375, 68);
__m512 tmp19390 = _mm512_shuffle_ps(tmp19373, tmp19375, 238);
__m512 tmp19391 = _mm512_shuffle_ps(tmp19374, tmp19376, 68);
__m512 tmp19392 = _mm512_shuffle_ps(tmp19374, tmp19376, 238);
__m512 tmp19393 = _mm512_shuffle_ps(tmp19377, tmp19379, 68);
__m512 tmp19394 = _mm512_shuffle_ps(tmp19377, tmp19379, 238);
__m512 tmp19395 = _mm512_shuffle_ps(tmp19378, tmp19380, 68);
__m512 tmp19396 = _mm512_shuffle_ps(tmp19378, tmp19380, 238);
__m512 tmp19397 = _mm512_shuffle_ps(tmp19381, tmp19383, 68);
__m512 tmp19398 = _mm512_shuffle_ps(tmp19381, tmp19383, 238);
__m512 tmp19399 = _mm512_shuffle_ps(tmp19382, tmp19384, 68);
__m512 tmp19400 = _mm512_shuffle_ps(tmp19382, tmp19384, 238);
__m512 tmp19401 = _mm512_shuffle_ps(tmp19385, tmp19387, 68);
__m512 tmp19402 = _mm512_shuffle_ps(tmp19385, tmp19387, 238);
__m512 tmp19403 = _mm512_shuffle_ps(tmp19386, tmp19388, 68);
__m512 tmp19404 = _mm512_shuffle_ps(tmp19386, tmp19388, 238);
__m512 tmp19405 = _mm512_shuffle_f32x4(tmp19389, tmp19393, 136);
__m512 tmp19406 = _mm512_shuffle_f32x4(tmp19389, tmp19393, 221);
__m512 tmp19407 = _mm512_shuffle_f32x4(tmp19390, tmp19394, 136);
__m512 tmp19408 = _mm512_shuffle_f32x4(tmp19390, tmp19394, 221);
__m512 tmp19409 = _mm512_shuffle_f32x4(tmp19391, tmp19395, 136);
__m512 tmp19410 = _mm512_shuffle_f32x4(tmp19391, tmp19395, 221);
__m512 tmp19411 = _mm512_shuffle_f32x4(tmp19392, tmp19396, 136);
__m512 tmp19412 = _mm512_shuffle_f32x4(tmp19392, tmp19396, 221);
__m512 tmp19413 = _mm512_shuffle_f32x4(tmp19397, tmp19401, 136);
__m512 tmp19414 = _mm512_shuffle_f32x4(tmp19397, tmp19401, 221);
__m512 tmp19415 = _mm512_shuffle_f32x4(tmp19398, tmp19402, 136);
__m512 tmp19416 = _mm512_shuffle_f32x4(tmp19398, tmp19402, 221);
__m512 tmp19417 = _mm512_shuffle_f32x4(tmp19399, tmp19403, 136);
__m512 tmp19418 = _mm512_shuffle_f32x4(tmp19399, tmp19403, 221);
__m512 tmp19419 = _mm512_shuffle_f32x4(tmp19400, tmp19404, 136);
__m512 tmp19420 = _mm512_shuffle_f32x4(tmp19400, tmp19404, 221);
wt741 = _mm512_shuffle_f32x4(tmp19405, tmp19413, 136);
wt749 = _mm512_shuffle_f32x4(tmp19405, tmp19413, 221);
wt742 = _mm512_shuffle_f32x4(tmp19407, tmp19415, 136);
wt750 = _mm512_shuffle_f32x4(tmp19407, tmp19415, 221);
wt743 = _mm512_shuffle_f32x4(tmp19409, tmp19417, 136);
wt751 = _mm512_shuffle_f32x4(tmp19409, tmp19417, 221);
wt744 = _mm512_shuffle_f32x4(tmp19411, tmp19419, 136);
wt752 = _mm512_shuffle_f32x4(tmp19411, tmp19419, 221);
wt745 = _mm512_shuffle_f32x4(tmp19406, tmp19414, 136);
wt753 = _mm512_shuffle_f32x4(tmp19406, tmp19414, 221);
wt746 = _mm512_shuffle_f32x4(tmp19408, tmp19416, 136);
wt754 = _mm512_shuffle_f32x4(tmp19408, tmp19416, 221);
wt747 = _mm512_shuffle_f32x4(tmp19410, tmp19418, 136);
wt755 = _mm512_shuffle_f32x4(tmp19410, tmp19418, 221);
wt748 = _mm512_shuffle_f32x4(tmp19412, tmp19420, 136);
wt756 = _mm512_shuffle_f32x4(tmp19412, tmp19420, 221);
wt741 = _mm512_mul_ps(wt741, postMul70);
wt742 = _mm512_mul_ps(wt742, postMul70);
wt743 = _mm512_mul_ps(wt743, postMul70);
wt744 = _mm512_mul_ps(wt744, postMul70);
wt745 = _mm512_mul_ps(wt745, postMul70);
wt746 = _mm512_mul_ps(wt746, postMul70);
wt747 = _mm512_mul_ps(wt747, postMul70);
wt748 = _mm512_mul_ps(wt748, postMul70);
wt749 = _mm512_mul_ps(wt749, postMul70);
wt750 = _mm512_mul_ps(wt750, postMul70);
wt751 = _mm512_mul_ps(wt751, postMul70);
wt752 = _mm512_mul_ps(wt752, postMul70);
wt753 = _mm512_mul_ps(wt753, postMul70);
wt754 = _mm512_mul_ps(wt754, postMul70);
wt755 = _mm512_mul_ps(wt755, postMul70);
wt756 = _mm512_mul_ps(wt756, postMul70);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(1+16*c56)+(ptrdiff_t)0, 63>>cut30, wt741);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(2+16*c56)+(ptrdiff_t)0, 63>>cut30, wt742);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(3+16*c56)+(ptrdiff_t)0, 63>>cut30, wt743);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(4+16*c56)+(ptrdiff_t)0, 63>>cut30, wt744);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(5+16*c56)+(ptrdiff_t)0, 63>>cut30, wt745);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(6+16*c56)+(ptrdiff_t)0, 63>>cut30, wt746);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(7+16*c56)+(ptrdiff_t)0, 63>>cut30, wt747);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(8+16*c56)+(ptrdiff_t)0, 63>>cut30, wt748);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(9+16*c56)+(ptrdiff_t)0, 63>>cut30, wt749);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(10+16*c56)+(ptrdiff_t)0, 63>>cut30, wt750);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(11+16*c56)+(ptrdiff_t)0, 63>>cut30, wt751);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(12+16*c56)+(ptrdiff_t)0, 63>>cut30, wt752);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(13+16*c56)+(ptrdiff_t)0, 63>>cut30, wt753);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(14+16*c56)+(ptrdiff_t)0, 63>>cut30, wt754);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(15+16*c56)+(ptrdiff_t)0, 63>>cut30, wt755);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(16+16*c56)+(ptrdiff_t)0, 63>>cut30, wt756);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(1+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt741);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(2+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt742);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(3+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt743);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(4+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt744);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(5+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt745);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(6+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt746);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(7+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt747);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(8+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt748);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(9+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt749);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(10+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt750);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(11+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt751);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(12+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt752);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(13+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt753);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(14+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt754);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(15+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt755);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(16+16*c56)+(ptrdiff_t)24576, 4032>>cut30, wt756);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(1+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt741);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(2+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt742);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(3+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt743);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(4+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt744);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(5+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt745);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(6+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt746);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(7+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt747);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(8+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt748);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(9+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt749);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(10+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt750);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(11+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt751);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(12+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt752);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(13+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt753);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(14+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt754);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(15+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt755);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(16+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt756);
}
break;
}
default: {
// Default case prologue: fold the bias for 16 consecutive k values through the
// per-lane multiply/add pair and scatter the result into slot 0 of arranged19.
// NOTE(review): all offsets look like byte offsets (4 per float), which presumes
// the pointers are char-typed -- confirm at their declarations.
// This case pins cut30 to 4; the store offsets and lane masks below derive from it.
cut30 = 4;
// Load 16 floats from biasPtr21 (mask 65535 enables all 16 lanes).
__m512 sum627 = _mm512_maskz_loadu_ps(65535, biasPtr21+10240*i67+4*k173);
// _mm512_set_epi32 lists elements from lane 15 down to lane 0, so pmMul44 holds
// the even indices 0..30 and pmAdd44 the odd indices 1..31 in lane order.
__m512i pmMul44 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd44 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Load 32 consecutive floats from bnPtr21 as two vectors (8 bytes per k value).
__m512 masLo37 = _mm512_loadu_ps(bnPtr21+(ptrdiff_t)8*(k173+2560*i67));
__m512 masHi37 = _mm512_maskz_loadu_ps(65535, bnPtr21+(ptrdiff_t)8*(k173+2560*i67)+(ptrdiff_t)64);
// De-interleave the 32 floats: even-indexed elements become the multiplier,
// odd-indexed elements become the addend.
__m512 postMul71 = _mm512_permutex2var_ps(masLo37, pmMul44, masHi37);
__m512 postAdd45 = _mm512_permutex2var_ps(masLo37, pmAdd44, masHi37);
// sum627 = sum627*postMul71 + postAdd45
sum627 = _mm512_fmadd_ps(sum627, postMul71, postAdd45);
// Scatter the 16 lanes with four masked stores whose targets are 24576 bytes apart.
// With cut30 == 4 the masks evaluate to 3, 252, 16128 and 49152, i.e. lanes
// 0-1, 2-7, 8-13 and 14-15 respectively; together they cover every lane once.
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)0, 63>>cut30, sum627);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)24576, 4032>>cut30, sum627);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)49152, 258048>>cut30, sum627);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*0+(ptrdiff_t)73728, 65535-(262143>>cut30), sum627);
// Weight rearrangement loop for the default case: 64 iterations, each moving a
// 16x16 tile of floats from wtPtr21 into arranged19.
ptrdiff_t c57 = 0;
for (; c57 != 64; ++c57) {
// Load 16 vectors of 16 floats; successive vectors are 4096 bytes apart and
// each iteration advances 64 bytes (16 floats) along them.
__m512 wt757 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)0);
__m512 wt758 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)4096);
__m512 wt759 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)8192);
__m512 wt760 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)12288);
__m512 wt761 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)16384);
__m512 wt762 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)20480);
__m512 wt763 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)24576);
__m512 wt764 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)28672);
__m512 wt765 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)32768);
__m512 wt766 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)36864);
__m512 wt767 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)40960);
__m512 wt768 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)45056);
__m512 wt769 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)49152);
__m512 wt770 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)53248);
__m512 wt771 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)57344);
__m512 wt772 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k173+64*c57+(ptrdiff_t)61440);
// Transpose stage 1: interleave the elements of adjacent vector pairs within
// each 128-bit lane (low halves / high halves).
__m512 tmp19421 = _mm512_unpacklo_ps(wt757, wt758);
__m512 tmp19422 = _mm512_unpackhi_ps(wt757, wt758);
__m512 tmp19423 = _mm512_unpacklo_ps(wt759, wt760);
__m512 tmp19424 = _mm512_unpackhi_ps(wt759, wt760);
__m512 tmp19425 = _mm512_unpacklo_ps(wt761, wt762);
__m512 tmp19426 = _mm512_unpackhi_ps(wt761, wt762);
__m512 tmp19427 = _mm512_unpacklo_ps(wt763, wt764);
__m512 tmp19428 = _mm512_unpackhi_ps(wt763, wt764);
__m512 tmp19429 = _mm512_unpacklo_ps(wt765, wt766);
__m512 tmp19430 = _mm512_unpackhi_ps(wt765, wt766);
__m512 tmp19431 = _mm512_unpacklo_ps(wt767, wt768);
__m512 tmp19432 = _mm512_unpackhi_ps(wt767, wt768);
__m512 tmp19433 = _mm512_unpacklo_ps(wt769, wt770);
__m512 tmp19434 = _mm512_unpackhi_ps(wt769, wt770);
__m512 tmp19435 = _mm512_unpacklo_ps(wt771, wt772);
__m512 tmp19436 = _mm512_unpackhi_ps(wt771, wt772);
// Transpose stage 2: combine pairs into groups of four vectors. Immediate 68
// takes elements 0,1 of each operand per lane; 238 takes elements 2,3.
__m512 tmp19437 = _mm512_shuffle_ps(tmp19421, tmp19423, 68);
__m512 tmp19438 = _mm512_shuffle_ps(tmp19421, tmp19423, 238);
__m512 tmp19439 = _mm512_shuffle_ps(tmp19422, tmp19424, 68);
__m512 tmp19440 = _mm512_shuffle_ps(tmp19422, tmp19424, 238);
__m512 tmp19441 = _mm512_shuffle_ps(tmp19425, tmp19427, 68);
__m512 tmp19442 = _mm512_shuffle_ps(tmp19425, tmp19427, 238);
__m512 tmp19443 = _mm512_shuffle_ps(tmp19426, tmp19428, 68);
__m512 tmp19444 = _mm512_shuffle_ps(tmp19426, tmp19428, 238);
__m512 tmp19445 = _mm512_shuffle_ps(tmp19429, tmp19431, 68);
__m512 tmp19446 = _mm512_shuffle_ps(tmp19429, tmp19431, 238);
__m512 tmp19447 = _mm512_shuffle_ps(tmp19430, tmp19432, 68);
__m512 tmp19448 = _mm512_shuffle_ps(tmp19430, tmp19432, 238);
__m512 tmp19449 = _mm512_shuffle_ps(tmp19433, tmp19435, 68);
__m512 tmp19450 = _mm512_shuffle_ps(tmp19433, tmp19435, 238);
__m512 tmp19451 = _mm512_shuffle_ps(tmp19434, tmp19436, 68);
__m512 tmp19452 = _mm512_shuffle_ps(tmp19434, tmp19436, 238);
// Transpose stage 3: move whole 128-bit lanes. Immediate 136 takes lanes 0 and 2
// of each operand; 221 takes lanes 1 and 3.
__m512 tmp19453 = _mm512_shuffle_f32x4(tmp19437, tmp19441, 136);
__m512 tmp19454 = _mm512_shuffle_f32x4(tmp19437, tmp19441, 221);
__m512 tmp19455 = _mm512_shuffle_f32x4(tmp19438, tmp19442, 136);
__m512 tmp19456 = _mm512_shuffle_f32x4(tmp19438, tmp19442, 221);
__m512 tmp19457 = _mm512_shuffle_f32x4(tmp19439, tmp19443, 136);
__m512 tmp19458 = _mm512_shuffle_f32x4(tmp19439, tmp19443, 221);
__m512 tmp19459 = _mm512_shuffle_f32x4(tmp19440, tmp19444, 136);
__m512 tmp19460 = _mm512_shuffle_f32x4(tmp19440, tmp19444, 221);
__m512 tmp19461 = _mm512_shuffle_f32x4(tmp19445, tmp19449, 136);
__m512 tmp19462 = _mm512_shuffle_f32x4(tmp19445, tmp19449, 221);
__m512 tmp19463 = _mm512_shuffle_f32x4(tmp19446, tmp19450, 136);
__m512 tmp19464 = _mm512_shuffle_f32x4(tmp19446, tmp19450, 221);
__m512 tmp19465 = _mm512_shuffle_f32x4(tmp19447, tmp19451, 136);
__m512 tmp19466 = _mm512_shuffle_f32x4(tmp19447, tmp19451, 221);
__m512 tmp19467 = _mm512_shuffle_f32x4(tmp19448, tmp19452, 136);
__m512 tmp19468 = _mm512_shuffle_f32x4(tmp19448, tmp19452, 221);
// Transpose stage 4: final lane shuffle. Afterwards wt757..wt772 hold columns
// 0..15 of the loaded 16x16 tile, with lane j taken from the j-th loaded vector.
wt757 = _mm512_shuffle_f32x4(tmp19453, tmp19461, 136);
wt765 = _mm512_shuffle_f32x4(tmp19453, tmp19461, 221);
wt758 = _mm512_shuffle_f32x4(tmp19455, tmp19463, 136);
wt766 = _mm512_shuffle_f32x4(tmp19455, tmp19463, 221);
wt759 = _mm512_shuffle_f32x4(tmp19457, tmp19465, 136);
wt767 = _mm512_shuffle_f32x4(tmp19457, tmp19465, 221);
wt760 = _mm512_shuffle_f32x4(tmp19459, tmp19467, 136);
wt768 = _mm512_shuffle_f32x4(tmp19459, tmp19467, 221);
wt761 = _mm512_shuffle_f32x4(tmp19454, tmp19462, 136);
wt769 = _mm512_shuffle_f32x4(tmp19454, tmp19462, 221);
wt762 = _mm512_shuffle_f32x4(tmp19456, tmp19464, 136);
wt770 = _mm512_shuffle_f32x4(tmp19456, tmp19464, 221);
wt763 = _mm512_shuffle_f32x4(tmp19458, tmp19466, 136);
wt771 = _mm512_shuffle_f32x4(tmp19458, tmp19466, 221);
wt764 = _mm512_shuffle_f32x4(tmp19460, tmp19468, 136);
wt772 = _mm512_shuffle_f32x4(tmp19460, tmp19468, 221);
// Scale every transposed vector lane-wise by postMul71.
wt757 = _mm512_mul_ps(wt757, postMul71);
wt758 = _mm512_mul_ps(wt758, postMul71);
wt759 = _mm512_mul_ps(wt759, postMul71);
wt760 = _mm512_mul_ps(wt760, postMul71);
wt761 = _mm512_mul_ps(wt761, postMul71);
wt762 = _mm512_mul_ps(wt762, postMul71);
wt763 = _mm512_mul_ps(wt763, postMul71);
wt764 = _mm512_mul_ps(wt764, postMul71);
wt765 = _mm512_mul_ps(wt765, postMul71);
wt766 = _mm512_mul_ps(wt766, postMul71);
wt767 = _mm512_mul_ps(wt767, postMul71);
wt768 = _mm512_mul_ps(wt768, postMul71);
wt769 = _mm512_mul_ps(wt769, postMul71);
wt770 = _mm512_mul_ps(wt770, postMul71);
wt771 = _mm512_mul_ps(wt771, postMul71);
wt772 = _mm512_mul_ps(wt772, postMul71);
// Scatter: the t-th vector (t = 1..16) targets 24-byte slot t+16*c57 of arranged19.
// Each vector is written by four masked stores, one per 24576-byte step.
// First store group: mask 63>>cut30 (lanes 0-1 when cut30 == 4).
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(1+16*c57)+(ptrdiff_t)0, 63>>cut30, wt757);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(2+16*c57)+(ptrdiff_t)0, 63>>cut30, wt758);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(3+16*c57)+(ptrdiff_t)0, 63>>cut30, wt759);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(4+16*c57)+(ptrdiff_t)0, 63>>cut30, wt760);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(5+16*c57)+(ptrdiff_t)0, 63>>cut30, wt761);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(6+16*c57)+(ptrdiff_t)0, 63>>cut30, wt762);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(7+16*c57)+(ptrdiff_t)0, 63>>cut30, wt763);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(8+16*c57)+(ptrdiff_t)0, 63>>cut30, wt764);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(9+16*c57)+(ptrdiff_t)0, 63>>cut30, wt765);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(10+16*c57)+(ptrdiff_t)0, 63>>cut30, wt766);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(11+16*c57)+(ptrdiff_t)0, 63>>cut30, wt767);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(12+16*c57)+(ptrdiff_t)0, 63>>cut30, wt768);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(13+16*c57)+(ptrdiff_t)0, 63>>cut30, wt769);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(14+16*c57)+(ptrdiff_t)0, 63>>cut30, wt770);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(15+16*c57)+(ptrdiff_t)0, 63>>cut30, wt771);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(16+16*c57)+(ptrdiff_t)0, 63>>cut30, wt772);
// Second store group, +24576 bytes: mask 4032>>cut30 (lanes 2-7 when cut30 == 4).
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(1+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt757);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(2+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt758);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(3+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt759);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(4+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt760);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(5+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt761);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(6+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt762);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(7+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt763);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(8+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt764);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(9+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt765);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(10+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt766);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(11+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt767);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(12+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt768);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(13+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt769);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(14+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt770);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(15+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt771);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(16+16*c57)+(ptrdiff_t)24576, 4032>>cut30, wt772);
// Third store group, +49152 bytes: mask 258048>>cut30 (lanes 8-13 when cut30 == 4).
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(1+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt757);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(2+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt758);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(3+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt759);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(4+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt760);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(5+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt761);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(6+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt762);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(7+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt763);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(8+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt764);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(9+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt765);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(10+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt766);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(11+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt767);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(12+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt768);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(13+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt769);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(14+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt770);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(15+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt771);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(16+16*c57)+(ptrdiff_t)49152, 258048>>cut30, wt772);
// Fourth store group, +73728 bytes: mask 65535-(262143>>cut30), i.e. the remaining
// top lanes (14-15 when cut30 == 4).
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(1+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt757);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(2+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt758);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(3+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt759);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(4+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt760);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(5+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt761);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(6+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt762);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(7+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt763);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(8+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt764);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(9+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt765);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(10+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt766);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(11+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt767);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(12+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt768);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(13+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt769);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(14+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt770);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(15+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt771);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l76+4*cut30+24*(16+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt772);
}
}
}
} else {
// Else-branch prologue: handle the fixed position k172 = 496. Fold its 16 bias
// values through the per-lane multiply/add pair and scatter them into slot 0
// of arranged19.
// NOTE(review): all offsets look like byte offsets (4 per float), which presumes
// the pointers are char-typed -- confirm at their declarations.
ptrdiff_t k172 = 496;
// Split position 2048+k172 = 2544 into a group of six: l75 = 424, cut29 = 0.
ptrdiff_t l75 = (size_t)(2048+k172)/6;
ptrdiff_t cut29 = (size_t)(2048+k172)%6;
// Load 16 floats from biasPtr21 (mask 65535 enables all 16 lanes).
__m512 sum625 = _mm512_maskz_loadu_ps(65535, biasPtr21+10240*i67+4*k172);
// _mm512_set_epi32 lists elements from lane 15 down to lane 0, so pmMul45 holds
// the even indices 0..30 and pmAdd45 the odd indices 1..31 in lane order.
__m512i pmMul45 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd45 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Load 32 consecutive floats from bnPtr21 as two vectors (8 bytes per k value).
__m512 masLo38 = _mm512_loadu_ps(bnPtr21+(ptrdiff_t)8*(k172+2560*i67));
__m512 masHi38 = _mm512_maskz_loadu_ps(65535, bnPtr21+(ptrdiff_t)8*(k172+2560*i67)+(ptrdiff_t)64);
// De-interleave: even-indexed elements become the multiplier, odd-indexed the addend.
__m512 postMul69 = _mm512_permutex2var_ps(masLo38, pmMul45, masHi38);
__m512 postAdd43 = _mm512_permutex2var_ps(masLo38, pmAdd45, masHi38);
// sum625 = sum625*postMul69 + postAdd43
sum625 = _mm512_fmadd_ps(sum625, postMul69, postAdd43);
// Scatter the 16 lanes with three masked stores whose targets are 24576 bytes apart.
// With cut29 == 0 the masks are 63, 4032 and 61440 (lanes 0-5, 6-11, 12-15).
// Slot index 0 uses the same 24-byte slot stride as every other store to
// arranged19; the third store previously spelled it 16*0 (same value, wrong stride).
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*0+(ptrdiff_t)0, 63>>cut29, sum625);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*0+(ptrdiff_t)24576, 4032>>cut29, sum625);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*0+(ptrdiff_t)49152, 65535-(4095>>cut29), sum625);
ptrdiff_t c55 = 0;
for (; c55 != 64; ++c55) {
__m512 wt725 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)0);
__m512 wt726 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)4096);
__m512 wt727 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)8192);
__m512 wt728 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)12288);
__m512 wt729 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)16384);
__m512 wt730 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)20480);
__m512 wt731 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)24576);
__m512 wt732 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)28672);
__m512 wt733 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)32768);
__m512 wt734 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)36864);
__m512 wt735 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)40960);
__m512 wt736 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)45056);
__m512 wt737 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)49152);
__m512 wt738 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)53248);
__m512 wt739 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)57344);
__m512 wt740 = _mm512_maskz_loadu_ps(65535, wtPtr21+10485760*i67+4096*k172+64*c55+(ptrdiff_t)61440);
__m512 tmp19469 = _mm512_unpacklo_ps(wt725, wt726);
__m512 tmp19470 = _mm512_unpackhi_ps(wt725, wt726);
__m512 tmp19471 = _mm512_unpacklo_ps(wt727, wt728);
__m512 tmp19472 = _mm512_unpackhi_ps(wt727, wt728);
__m512 tmp19473 = _mm512_unpacklo_ps(wt729, wt730);
__m512 tmp19474 = _mm512_unpackhi_ps(wt729, wt730);
__m512 tmp19475 = _mm512_unpacklo_ps(wt731, wt732);
__m512 tmp19476 = _mm512_unpackhi_ps(wt731, wt732);
__m512 tmp19477 = _mm512_unpacklo_ps(wt733, wt734);
__m512 tmp19478 = _mm512_unpackhi_ps(wt733, wt734);
__m512 tmp19479 = _mm512_unpacklo_ps(wt735, wt736);
__m512 tmp19480 = _mm512_unpackhi_ps(wt735, wt736);
__m512 tmp19481 = _mm512_unpacklo_ps(wt737, wt738);
__m512 tmp19482 = _mm512_unpackhi_ps(wt737, wt738);
__m512 tmp19483 = _mm512_unpacklo_ps(wt739, wt740);
__m512 tmp19484 = _mm512_unpackhi_ps(wt739, wt740);
__m512 tmp19485 = _mm512_shuffle_ps(tmp19469, tmp19471, 68);
__m512 tmp19486 = _mm512_shuffle_ps(tmp19469, tmp19471, 238);
__m512 tmp19487 = _mm512_shuffle_ps(tmp19470, tmp19472, 68);
__m512 tmp19488 = _mm512_shuffle_ps(tmp19470, tmp19472, 238);
__m512 tmp19489 = _mm512_shuffle_ps(tmp19473, tmp19475, 68);
__m512 tmp19490 = _mm512_shuffle_ps(tmp19473, tmp19475, 238);
__m512 tmp19491 = _mm512_shuffle_ps(tmp19474, tmp19476, 68);
__m512 tmp19492 = _mm512_shuffle_ps(tmp19474, tmp19476, 238);
__m512 tmp19493 = _mm512_shuffle_ps(tmp19477, tmp19479, 68);
__m512 tmp19494 = _mm512_shuffle_ps(tmp19477, tmp19479, 238);
__m512 tmp19495 = _mm512_shuffle_ps(tmp19478, tmp19480, 68);
__m512 tmp19496 = _mm512_shuffle_ps(tmp19478, tmp19480, 238);
__m512 tmp19497 = _mm512_shuffle_ps(tmp19481, tmp19483, 68);
__m512 tmp19498 = _mm512_shuffle_ps(tmp19481, tmp19483, 238);
__m512 tmp19499 = _mm512_shuffle_ps(tmp19482, tmp19484, 68);
__m512 tmp19500 = _mm512_shuffle_ps(tmp19482, tmp19484, 238);
__m512 tmp19501 = _mm512_shuffle_f32x4(tmp19485, tmp19489, 136);
__m512 tmp19502 = _mm512_shuffle_f32x4(tmp19485, tmp19489, 221);
__m512 tmp19503 = _mm512_shuffle_f32x4(tmp19486, tmp19490, 136);
__m512 tmp19504 = _mm512_shuffle_f32x4(tmp19486, tmp19490, 221);
__m512 tmp19505 = _mm512_shuffle_f32x4(tmp19487, tmp19491, 136);
__m512 tmp19506 = _mm512_shuffle_f32x4(tmp19487, tmp19491, 221);
__m512 tmp19507 = _mm512_shuffle_f32x4(tmp19488, tmp19492, 136);
__m512 tmp19508 = _mm512_shuffle_f32x4(tmp19488, tmp19492, 221);
__m512 tmp19509 = _mm512_shuffle_f32x4(tmp19493, tmp19497, 136);
__m512 tmp19510 = _mm512_shuffle_f32x4(tmp19493, tmp19497, 221);
__m512 tmp19511 = _mm512_shuffle_f32x4(tmp19494, tmp19498, 136);
__m512 tmp19512 = _mm512_shuffle_f32x4(tmp19494, tmp19498, 221);
__m512 tmp19513 = _mm512_shuffle_f32x4(tmp19495, tmp19499, 136);
__m512 tmp19514 = _mm512_shuffle_f32x4(tmp19495, tmp19499, 221);
__m512 tmp19515 = _mm512_shuffle_f32x4(tmp19496, tmp19500, 136);
__m512 tmp19516 = _mm512_shuffle_f32x4(tmp19496, tmp19500, 221);
wt725 = _mm512_shuffle_f32x4(tmp19501, tmp19509, 136);
wt733 = _mm512_shuffle_f32x4(tmp19501, tmp19509, 221);
wt726 = _mm512_shuffle_f32x4(tmp19503, tmp19511, 136);
wt734 = _mm512_shuffle_f32x4(tmp19503, tmp19511, 221);
wt727 = _mm512_shuffle_f32x4(tmp19505, tmp19513, 136);
wt735 = _mm512_shuffle_f32x4(tmp19505, tmp19513, 221);
wt728 = _mm512_shuffle_f32x4(tmp19507, tmp19515, 136);
wt736 = _mm512_shuffle_f32x4(tmp19507, tmp19515, 221);
wt729 = _mm512_shuffle_f32x4(tmp19502, tmp19510, 136);
wt737 = _mm512_shuffle_f32x4(tmp19502, tmp19510, 221);
wt730 = _mm512_shuffle_f32x4(tmp19504, tmp19512, 136);
wt738 = _mm512_shuffle_f32x4(tmp19504, tmp19512, 221);
wt731 = _mm512_shuffle_f32x4(tmp19506, tmp19514, 136);
wt739 = _mm512_shuffle_f32x4(tmp19506, tmp19514, 221);
wt732 = _mm512_shuffle_f32x4(tmp19508, tmp19516, 136);
wt740 = _mm512_shuffle_f32x4(tmp19508, tmp19516, 221);
wt725 = _mm512_mul_ps(wt725, postMul69);
wt726 = _mm512_mul_ps(wt726, postMul69);
wt727 = _mm512_mul_ps(wt727, postMul69);
wt728 = _mm512_mul_ps(wt728, postMul69);
wt729 = _mm512_mul_ps(wt729, postMul69);
wt730 = _mm512_mul_ps(wt730, postMul69);
wt731 = _mm512_mul_ps(wt731, postMul69);
wt732 = _mm512_mul_ps(wt732, postMul69);
wt733 = _mm512_mul_ps(wt733, postMul69);
wt734 = _mm512_mul_ps(wt734, postMul69);
wt735 = _mm512_mul_ps(wt735, postMul69);
wt736 = _mm512_mul_ps(wt736, postMul69);
wt737 = _mm512_mul_ps(wt737, postMul69);
wt738 = _mm512_mul_ps(wt738, postMul69);
wt739 = _mm512_mul_ps(wt739, postMul69);
wt740 = _mm512_mul_ps(wt740, postMul69);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(1+16*c55)+(ptrdiff_t)0, 63>>cut29, wt725);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(2+16*c55)+(ptrdiff_t)0, 63>>cut29, wt726);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(3+16*c55)+(ptrdiff_t)0, 63>>cut29, wt727);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(4+16*c55)+(ptrdiff_t)0, 63>>cut29, wt728);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(5+16*c55)+(ptrdiff_t)0, 63>>cut29, wt729);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(6+16*c55)+(ptrdiff_t)0, 63>>cut29, wt730);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(7+16*c55)+(ptrdiff_t)0, 63>>cut29, wt731);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(8+16*c55)+(ptrdiff_t)0, 63>>cut29, wt732);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(9+16*c55)+(ptrdiff_t)0, 63>>cut29, wt733);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(10+16*c55)+(ptrdiff_t)0, 63>>cut29, wt734);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(11+16*c55)+(ptrdiff_t)0, 63>>cut29, wt735);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(12+16*c55)+(ptrdiff_t)0, 63>>cut29, wt736);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(13+16*c55)+(ptrdiff_t)0, 63>>cut29, wt737);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(14+16*c55)+(ptrdiff_t)0, 63>>cut29, wt738);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(15+16*c55)+(ptrdiff_t)0, 63>>cut29, wt739);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(16+16*c55)+(ptrdiff_t)0, 63>>cut29, wt740);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(1+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt725);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(2+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt726);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(3+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt727);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(4+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt728);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(5+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt729);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(6+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt730);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(7+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt731);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(8+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt732);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(9+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt733);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(10+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt734);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(11+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt735);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(12+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt736);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(13+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt737);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(14+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt738);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(15+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt739);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+24*(16+16*c55)+(ptrdiff_t)24576, 4032>>cut29, wt740);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(1+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt725);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(2+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt726);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(3+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt727);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(4+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt728);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(5+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt729);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(6+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt730);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(7+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt731);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(8+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt732);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(9+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt733);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(10+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt734);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(11+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt735);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(12+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt736);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(13+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt737);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(14+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt738);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(15+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt739);
_mm512_mask_storeu_ps(arranged19+10496000*i67+24600*l75+4*cut29+16*(16+16*c55)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt740);
}
}
}
}
}

/*
 * Dispatches ResNet50OneArrangeWts10Callee1 through the threader.
 *
 * team70      - thread team handed unchanged to ResNet50ThreaderDo1.
 * tensors113  - tensor pointer array; passed to the callee via task.any1.
 *
 * The task is described as a 3-dimensional hull of extent 160 x 1 x 1.
 * NOTE(review): presumably ResNet50ThreaderDo1 invokes the callee once per
 * hull point and returns only after all of them finish - confirm against
 * the threader implementation.
 */
static void ResNet50OneArrangeWts10(ResNet50ThreaderTeam1* team70, char** tensors113) {
    enum { kDims = 3, kExtent0 = 160 };
    ResNet50ThreaderTask1 job;
    job.nd1 = kDims;
    job.hull1[0] = kExtent0;
    /* Every remaining hull dimension is degenerate (extent 1). */
    for (int axis = 1; axis < kDims; ++axis) {
        job.hull1[axis] = 1;
    }
    job.any1 = tensors113;
    job.callee1 = ResNet50OneArrangeWts10Callee1;
    ResNet50ThreaderDo1(team70, &job);
}

/*
 * Threader callee for the "OneArrangeDats10" step: repacks one slice of the
 * source tensor into the arranged-data buffer.
 *
 * task118 - task descriptor; task118->any1 is a char* array where element 0
 *           is the source buffer and element 1 is the destination buffer.
 * pt64    - task coordinates; only pt64[0] (the slice index) is read.
 *
 * Slice s covers the 128 consecutive records [128*s, 128*s+128). For each
 * record (832 source bytes, 256 destination bytes) the code reads 13 floats
 * at byte offsets 0, 112, ..., 672, keeps the even-indexed elements of each
 * read, and packs two such 7-element groups per 16-lane vector (lanes 0-6
 * and lanes 7-13). The seventh read fills lanes 0-6 of the fourth vector.
 * NOTE(review): this looks like a stride-2 spatial subsample of a plane with
 * 56-byte rows (7x7 output per channel) - confirm against the layer config.
 */
static void ResNet50OneArrangeDats10Callee1(ResNet50ThreaderTask1* task118, int64_t* pt64) {
    char** bufs = task118->any1;
    const ptrdiff_t slice = pt64[0];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];

    /* Lane i <- element 2*i (indices wrap modulo 16 inside the permute). */
    const __m512i pickEven = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    /* Lane 7+i <- element 2*i, so the even elements land in lanes 7..13. */
    const __m512i pickEvenUpper = _mm512_set_epi32(16, 14, 12, 10, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0);
    const __mmask16 first13 = 8191;      /* lanes 0..12 */
    const __mmask16 lanes7to13 = 16256;  /* lanes 7..13 */

    const ptrdiff_t recBegin = 128*slice;
    const ptrdiff_t recEnd = recBegin+128;
    for (ptrdiff_t rec = recBegin; rec < recEnd; ++rec) {
        const char* in = src+832*rec;
        char* out = dst+256*rec;

        /* Gather the seven reads first; nothing is stored until all are done. */
        __m512 g0 = _mm512_permutexvar_ps(pickEven, _mm512_maskz_loadu_ps(first13, in));
        __m512 g1 = _mm512_permutexvar_ps(pickEvenUpper, _mm512_maskz_loadu_ps(first13, in+112));
        __m512 v0 = _mm512_mask_mov_ps(g0, lanes7to13, g1);

        __m512 g2 = _mm512_permutexvar_ps(pickEven, _mm512_maskz_loadu_ps(first13, in+224));
        __m512 g3 = _mm512_permutexvar_ps(pickEvenUpper, _mm512_maskz_loadu_ps(first13, in+336));
        __m512 v1 = _mm512_mask_mov_ps(g2, lanes7to13, g3);

        __m512 g4 = _mm512_permutexvar_ps(pickEven, _mm512_maskz_loadu_ps(first13, in+448));
        __m512 g5 = _mm512_permutexvar_ps(pickEvenUpper, _mm512_maskz_loadu_ps(first13, in+560));
        __m512 v2 = _mm512_mask_mov_ps(g4, lanes7to13, g5);

        __m512 v3 = _mm512_permutexvar_ps(pickEven, _mm512_maskz_loadu_ps(first13, in+672));

        /* Four full 64-byte vectors per record. */
        _mm512_storeu_ps(out, v0);
        _mm512_storeu_ps(out+64, v1);
        _mm512_storeu_ps(out+128, v2);
        _mm512_storeu_ps(out+192, v3);
    }
}

/*
 * Dispatches ResNet50OneArrangeDats10Callee1 through the threader.
 *
 * team71      - thread team handed unchanged to ResNet50ThreaderDo1.
 * tensors115  - tensor pointer array; passed to the callee via task.any1
 *               (the callee reads element 0 as source, element 1 as
 *               destination).
 *
 * The hull is 4-dimensional with extent 8 x 1 x 1 x 1; the callee turns each
 * index along the first dimension into a block of 128 records, so the eight
 * tasks together cover 1024 records.
 */
static void ResNet50OneArrangeDats10(ResNet50ThreaderTeam1* team71, char** tensors115) {
    enum { kDims = 4, kSlices = 8 };
    ResNet50ThreaderTask1 job;
    job.nd1 = kDims;
    job.hull1[0] = kSlices;
    /* Only the first dimension is actually split; the rest have extent 1. */
    for (int axis = 1; axis < kDims; ++axis) {
        job.hull1[axis] = 1;
    }
    job.any1 = tensors115;
    job.callee1 = ResNet50OneArrangeDats10Callee1;
    ResNet50ThreaderDo1(team71, &job);
}

// Threader callee for the "OneApply10" step: multiplies the arranged weights
// by the arranged data and writes the resulting output planes.
//
// task120 - task descriptor; task120->any1 is the two-element pointer array
//           built by ResNet50OneApply10, whose element 0 is the tensor
//           pointer array (only element 0 is read here).
// pt65    - task coordinates; only pt65[0] (the output-channel group index,
//           w75) is read.
//
// tensors118[0] = arranged weights, tensors118[1] = arranged data,
// tensors118[2] = output buffer.
//
// Control flow per call:
//   * w75 != 426: the k175 loop body runs exactly once (it returns at its
//     end because kk63 == w75), producing 6 output channels.
//   * w75 == 426: the loop is skipped and the tail after it produces the
//     final 4 output channels.
// Each output channel is written as 7 masked stores of 7 floats, 28 bytes
// apart; consecutive channels are 320 bytes apart in the output buffer.
static void ResNet50OneApply10Callee1(ResNet50ThreaderTask1* task120, int64_t* pt65) {
void** pair32 = task120->any1;
char** tensors118 = pair32[0];
// e33, g39 and d24 are fixed at 0 in this instantiation, so every term they
// scale below contributes nothing to the addresses.
ptrdiff_t e33 = 0;
ptrdiff_t g39 = 0;
ptrdiff_t d24 = 0;
ptrdiff_t w75 = pt65[0];
char*restrict arrangedWts10 = tensors118[0]+8560640*e33+(ptrdiff_t)10496000*1*g39;
char*restrict arrangedDats10 = tensors118[1]+213760*e33+(ptrdiff_t)262144*1*g39;
char*restrict datPtr37 = tensors118[2]+(ptrdiff_t)819200*1*g39;
ptrdiff_t ii30 = 1;
// Single-iteration loop (ii30 == 1); i69 is always 0.
for (ptrdiff_t i69 = 0; i69 < ii30; ++i69) {
ptrdiff_t j60 = 1*d24;
ptrdiff_t h57 = 0;
switch (j60) {
default: {
j60 = 0;
ptrdiff_t k175 = 1*w75;
ptrdiff_t kk63 = k175+0;
// Full-width path: one group of 6 output channels per weight record
// (24600 bytes per group, 24 bytes = 6 floats per s step).
for (; k175 != 426; ++k175) {
// With s57 == -1 the expression 24*s57+24 is 0, so the accumulators are
// seeded from the first 6 floats of this group's weight record.
// NOTE(review): presumably these are the per-channel bias terms written by
// the weight-arrangement step - confirm.
ptrdiff_t s57 = -1;
__m512 sum628 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+24*s57+(ptrdiff_t)24));
__m512 sum632 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+24*s57+(ptrdiff_t)28));
__m512 sum636 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+24*s57+(ptrdiff_t)32));
__m512 sum640 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+24*s57+(ptrdiff_t)36));
__m512 sum644 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+24*s57+(ptrdiff_t)40));
__m512 sum648 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+24*s57+(ptrdiff_t)44));
// Four accumulators per output channel (one per data vector), all starting
// from that channel's seed value.
__m512 sum629 = sum628;
__m512 sum630 = sum628;
__m512 sum631 = sum628;
__m512 sum633 = sum632;
__m512 sum634 = sum632;
__m512 sum635 = sum632;
__m512 sum637 = sum636;
__m512 sum638 = sum636;
__m512 sum639 = sum636;
__m512 sum641 = sum640;
__m512 sum642 = sum640;
__m512 sum643 = sum640;
__m512 sum645 = sum644;
__m512 sum646 = sum644;
__m512 sum647 = sum644;
__m512 sum649 = sum648;
__m512 sum650 = sum648;
__m512 sum651 = sum648;
// Reduction over 1024 steps: each step loads 4 data vectors (256 bytes) and
// broadcasts 6 weight scalars, giving 24 fused multiply-adds.
for (s57 = 0; s57 < 1024; ++s57) {
__m512 dat2415 = _mm512_loadu_ps(arrangedDats10+262144*i69+262144*j60+256*s57+(ptrdiff_t)0);
__m512 dat2416 = _mm512_loadu_ps(arrangedDats10+262144*i69+262144*j60+256*s57+(ptrdiff_t)64);
__m512 dat2417 = _mm512_loadu_ps(arrangedDats10+262144*i69+262144*j60+256*s57+(ptrdiff_t)128);
__m512 dat2418 = _mm512_loadu_ps(arrangedDats10+262144*i69+262144*j60+256*s57+(ptrdiff_t)192);
__m512 wt773 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+24*s57+(ptrdiff_t)24));
sum628 = _mm512_fmadd_ps(wt773, dat2415, sum628);
sum629 = _mm512_fmadd_ps(wt773, dat2416, sum629);
sum630 = _mm512_fmadd_ps(wt773, dat2417, sum630);
sum631 = _mm512_fmadd_ps(wt773, dat2418, sum631);
__m512 wt774 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+24*s57+(ptrdiff_t)28));
sum632 = _mm512_fmadd_ps(wt774, dat2415, sum632);
sum633 = _mm512_fmadd_ps(wt774, dat2416, sum633);
sum634 = _mm512_fmadd_ps(wt774, dat2417, sum634);
sum635 = _mm512_fmadd_ps(wt774, dat2418, sum635);
__m512 wt775 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+24*s57+(ptrdiff_t)32));
sum636 = _mm512_fmadd_ps(wt775, dat2415, sum636);
sum637 = _mm512_fmadd_ps(wt775, dat2416, sum637);
sum638 = _mm512_fmadd_ps(wt775, dat2417, sum638);
sum639 = _mm512_fmadd_ps(wt775, dat2418, sum639);
__m512 wt776 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+24*s57+(ptrdiff_t)36));
sum640 = _mm512_fmadd_ps(wt776, dat2415, sum640);
sum641 = _mm512_fmadd_ps(wt776, dat2416, sum641);
sum642 = _mm512_fmadd_ps(wt776, dat2417, sum642);
sum643 = _mm512_fmadd_ps(wt776, dat2418, sum643);
__m512 wt777 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+24*s57+(ptrdiff_t)40));
sum644 = _mm512_fmadd_ps(wt777, dat2415, sum644);
sum645 = _mm512_fmadd_ps(wt777, dat2416, sum645);
sum646 = _mm512_fmadd_ps(wt777, dat2417, sum646);
sum647 = _mm512_fmadd_ps(wt777, dat2418, sum647);
__m512 wt778 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+24*s57+(ptrdiff_t)44));
sum648 = _mm512_fmadd_ps(wt778, dat2415, sum648);
sum649 = _mm512_fmadd_ps(wt778, dat2416, sum649);
sum650 = _mm512_fmadd_ps(wt778, dat2417, sum650);
sum651 = _mm512_fmadd_ps(wt778, dat2418, sum651);
}
// Write-back, channel 0 of the group (byte offset 0). Each accumulator holds
// two 7-float groups: lanes 0-6 are stored directly; alignr by 7 with both
// operands equal rotates lanes 7-13 down to lanes 0-6 for the next store.
// Mask 127 limits every store to 7 floats. The fourth accumulator supplies
// only its lanes 0-6, for 7 stores in total.
__m512 dat2419 = sum628;
__m512i via1 = _mm512_castps_si512(sum628);
via1 = _mm512_alignr_epi32(via1, via1, 7);
__m512 dat2420 = _mm512_castsi512_ps(via1);
__m512 dat2421 = sum629;
__m512i via2 = _mm512_castps_si512(sum629);
via2 = _mm512_alignr_epi32(via2, via2, 7);
__m512 dat2422 = _mm512_castsi512_ps(via2);
__m512 dat2423 = sum630;
__m512i via3 = _mm512_castps_si512(sum630);
via3 = _mm512_alignr_epi32(via3, via3, 7);
__m512 dat2424 = _mm512_castsi512_ps(via3);
__m512 dat2425 = sum631;
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)0, 127, dat2419);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)28, 127, dat2420);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)56, 127, dat2421);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)84, 127, dat2422);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)112, 127, dat2423);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)140, 127, dat2424);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)168, 127, dat2425);
// Channel 1 of the group (byte offset 320); same unpack-and-store pattern.
__m512 dat2426 = sum632;
__m512i via4 = _mm512_castps_si512(sum632);
via4 = _mm512_alignr_epi32(via4, via4, 7);
__m512 dat2427 = _mm512_castsi512_ps(via4);
__m512 dat2428 = sum633;
__m512i via5 = _mm512_castps_si512(sum633);
via5 = _mm512_alignr_epi32(via5, via5, 7);
__m512 dat2429 = _mm512_castsi512_ps(via5);
__m512 dat2430 = sum634;
__m512i via6 = _mm512_castps_si512(sum634);
via6 = _mm512_alignr_epi32(via6, via6, 7);
__m512 dat2431 = _mm512_castsi512_ps(via6);
__m512 dat2432 = sum635;
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)320, 127, dat2426);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)348, 127, dat2427);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)376, 127, dat2428);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)404, 127, dat2429);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)432, 127, dat2430);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)460, 127, dat2431);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)488, 127, dat2432);
// Channel 2 of the group (byte offset 640).
__m512 dat2433 = sum636;
__m512i via7 = _mm512_castps_si512(sum636);
via7 = _mm512_alignr_epi32(via7, via7, 7);
__m512 dat2434 = _mm512_castsi512_ps(via7);
__m512 dat2435 = sum637;
__m512i via8 = _mm512_castps_si512(sum637);
via8 = _mm512_alignr_epi32(via8, via8, 7);
__m512 dat2436 = _mm512_castsi512_ps(via8);
__m512 dat2437 = sum638;
__m512i via9 = _mm512_castps_si512(sum638);
via9 = _mm512_alignr_epi32(via9, via9, 7);
__m512 dat2438 = _mm512_castsi512_ps(via9);
__m512 dat2439 = sum639;
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)640, 127, dat2433);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)668, 127, dat2434);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)696, 127, dat2435);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)724, 127, dat2436);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)752, 127, dat2437);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)780, 127, dat2438);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)808, 127, dat2439);
// Channel 3 of the group (byte offset 960).
__m512 dat2440 = sum640;
__m512i via10 = _mm512_castps_si512(sum640);
via10 = _mm512_alignr_epi32(via10, via10, 7);
__m512 dat2441 = _mm512_castsi512_ps(via10);
__m512 dat2442 = sum641;
__m512i via11 = _mm512_castps_si512(sum641);
via11 = _mm512_alignr_epi32(via11, via11, 7);
__m512 dat2443 = _mm512_castsi512_ps(via11);
__m512 dat2444 = sum642;
__m512i via12 = _mm512_castps_si512(sum642);
via12 = _mm512_alignr_epi32(via12, via12, 7);
__m512 dat2445 = _mm512_castsi512_ps(via12);
__m512 dat2446 = sum643;
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)960, 127, dat2440);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)988, 127, dat2441);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1016, 127, dat2442);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1044, 127, dat2443);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1072, 127, dat2444);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1100, 127, dat2445);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1128, 127, dat2446);
// Channel 4 of the group (byte offset 1280).
__m512 dat2447 = sum644;
__m512i via13 = _mm512_castps_si512(sum644);
via13 = _mm512_alignr_epi32(via13, via13, 7);
__m512 dat2448 = _mm512_castsi512_ps(via13);
__m512 dat2449 = sum645;
__m512i via14 = _mm512_castps_si512(sum645);
via14 = _mm512_alignr_epi32(via14, via14, 7);
__m512 dat2450 = _mm512_castsi512_ps(via14);
__m512 dat2451 = sum646;
__m512i via15 = _mm512_castps_si512(sum646);
via15 = _mm512_alignr_epi32(via15, via15, 7);
__m512 dat2452 = _mm512_castsi512_ps(via15);
__m512 dat2453 = sum647;
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1280, 127, dat2447);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1308, 127, dat2448);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1336, 127, dat2449);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1364, 127, dat2450);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1392, 127, dat2451);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1420, 127, dat2452);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1448, 127, dat2453);
// Channel 5 of the group (byte offset 1600).
__m512 dat2454 = sum648;
__m512i via16 = _mm512_castps_si512(sum648);
via16 = _mm512_alignr_epi32(via16, via16, 7);
__m512 dat2455 = _mm512_castsi512_ps(via16);
__m512 dat2456 = sum649;
__m512i via17 = _mm512_castps_si512(sum649);
via17 = _mm512_alignr_epi32(via17, via17, 7);
__m512 dat2457 = _mm512_castsi512_ps(via17);
__m512 dat2458 = sum650;
__m512i via18 = _mm512_castps_si512(sum650);
via18 = _mm512_alignr_epi32(via18, via18, 7);
__m512 dat2459 = _mm512_castsi512_ps(via18);
__m512 dat2460 = sum651;
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1600, 127, dat2454);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1628, 127, dat2455);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1656, 127, dat2456);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1684, 127, dat2457);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1712, 127, dat2458);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1740, 127, dat2459);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1768, 127, dat2460);
// kk63 equals the starting k175, so this is always true on the first pass:
// a task that entered the loop handles exactly one group and leaves.
if (k175 >= kk63) return;
}
// Tail path, reached only when the loop was skipped (k175 == 426): the last
// group has 4 output channels, so the weight record advances 16 bytes
// (4 floats) per s step. With s58 == -1, 16*s58+16 is 0 and the accumulators
// are again seeded from the first floats of the record.
ptrdiff_t s58 = -1;
__m512 sum652 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+16*s58+(ptrdiff_t)16));
__m512 sum656 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+16*s58+(ptrdiff_t)20));
__m512 sum660 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+16*s58+(ptrdiff_t)24));
__m512 sum664 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+16*s58+(ptrdiff_t)28));
__m512 sum653 = sum652;
__m512 sum654 = sum652;
__m512 sum655 = sum652;
__m512 sum657 = sum656;
__m512 sum658 = sum656;
__m512 sum659 = sum656;
__m512 sum661 = sum660;
__m512 sum662 = sum660;
__m512 sum663 = sum660;
__m512 sum665 = sum664;
__m512 sum666 = sum664;
__m512 sum667 = sum664;
// Same 1024-step reduction as above, with 4 weight scalars per step
// (16 fused multiply-adds).
for (s58 = 0; s58 < 1024; ++s58) {
__m512 dat2461 = _mm512_loadu_ps(arrangedDats10+262144*i69+262144*j60+256*s58+(ptrdiff_t)0);
__m512 dat2462 = _mm512_loadu_ps(arrangedDats10+262144*i69+262144*j60+256*s58+(ptrdiff_t)64);
__m512 dat2463 = _mm512_loadu_ps(arrangedDats10+262144*i69+262144*j60+256*s58+(ptrdiff_t)128);
__m512 dat2464 = _mm512_loadu_ps(arrangedDats10+262144*i69+262144*j60+256*s58+(ptrdiff_t)192);
__m512 wt779 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+16*s58+(ptrdiff_t)16));
sum652 = _mm512_fmadd_ps(wt779, dat2461, sum652);
sum653 = _mm512_fmadd_ps(wt779, dat2462, sum653);
sum654 = _mm512_fmadd_ps(wt779, dat2463, sum654);
sum655 = _mm512_fmadd_ps(wt779, dat2464, sum655);
__m512 wt780 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+16*s58+(ptrdiff_t)20));
sum656 = _mm512_fmadd_ps(wt780, dat2461, sum656);
sum657 = _mm512_fmadd_ps(wt780, dat2462, sum657);
sum658 = _mm512_fmadd_ps(wt780, dat2463, sum658);
sum659 = _mm512_fmadd_ps(wt780, dat2464, sum659);
__m512 wt781 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+16*s58+(ptrdiff_t)24));
sum660 = _mm512_fmadd_ps(wt781, dat2461, sum660);
sum661 = _mm512_fmadd_ps(wt781, dat2462, sum661);
sum662 = _mm512_fmadd_ps(wt781, dat2463, sum662);
sum663 = _mm512_fmadd_ps(wt781, dat2464, sum663);
__m512 wt782 = _mm512_set1_ps(*(float*)(arrangedWts10+10496000*i69+24600*k175+16*s58+(ptrdiff_t)28));
sum664 = _mm512_fmadd_ps(wt782, dat2461, sum664);
sum665 = _mm512_fmadd_ps(wt782, dat2462, sum665);
sum666 = _mm512_fmadd_ps(wt782, dat2463, sum666);
sum667 = _mm512_fmadd_ps(wt782, dat2464, sum667);
}
// Tail write-back, channel 0 (byte offset 0); same unpack-and-store pattern
// as the full-width path.
__m512 dat2465 = sum652;
__m512i via19 = _mm512_castps_si512(sum652);
via19 = _mm512_alignr_epi32(via19, via19, 7);
__m512 dat2466 = _mm512_castsi512_ps(via19);
__m512 dat2467 = sum653;
__m512i via20 = _mm512_castps_si512(sum653);
via20 = _mm512_alignr_epi32(via20, via20, 7);
__m512 dat2468 = _mm512_castsi512_ps(via20);
__m512 dat2469 = sum654;
__m512i via21 = _mm512_castps_si512(sum654);
via21 = _mm512_alignr_epi32(via21, via21, 7);
__m512 dat2470 = _mm512_castsi512_ps(via21);
__m512 dat2471 = sum655;
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)0, 127, dat2465);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)28, 127, dat2466);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)56, 127, dat2467);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)84, 127, dat2468);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)112, 127, dat2469);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)140, 127, dat2470);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)168, 127, dat2471);
// Tail channel 1 (byte offset 320).
__m512 dat2472 = sum656;
__m512i via22 = _mm512_castps_si512(sum656);
via22 = _mm512_alignr_epi32(via22, via22, 7);
__m512 dat2473 = _mm512_castsi512_ps(via22);
__m512 dat2474 = sum657;
__m512i via23 = _mm512_castps_si512(sum657);
via23 = _mm512_alignr_epi32(via23, via23, 7);
__m512 dat2475 = _mm512_castsi512_ps(via23);
__m512 dat2476 = sum658;
__m512i via24 = _mm512_castps_si512(sum658);
via24 = _mm512_alignr_epi32(via24, via24, 7);
__m512 dat2477 = _mm512_castsi512_ps(via24);
__m512 dat2478 = sum659;
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)320, 127, dat2472);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)348, 127, dat2473);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)376, 127, dat2474);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)404, 127, dat2475);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)432, 127, dat2476);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)460, 127, dat2477);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)488, 127, dat2478);
// Tail channel 2 (byte offset 640).
__m512 dat2479 = sum660;
__m512i via25 = _mm512_castps_si512(sum660);
via25 = _mm512_alignr_epi32(via25, via25, 7);
__m512 dat2480 = _mm512_castsi512_ps(via25);
__m512 dat2481 = sum661;
__m512i via26 = _mm512_castps_si512(sum661);
via26 = _mm512_alignr_epi32(via26, via26, 7);
__m512 dat2482 = _mm512_castsi512_ps(via26);
__m512 dat2483 = sum662;
__m512i via27 = _mm512_castps_si512(sum662);
via27 = _mm512_alignr_epi32(via27, via27, 7);
__m512 dat2484 = _mm512_castsi512_ps(via27);
__m512 dat2485 = sum663;
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)640, 127, dat2479);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)668, 127, dat2480);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)696, 127, dat2481);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)724, 127, dat2482);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)752, 127, dat2483);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)780, 127, dat2484);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)808, 127, dat2485);
// Tail channel 3 (byte offset 960).
__m512 dat2486 = sum664;
__m512i via28 = _mm512_castps_si512(sum664);
via28 = _mm512_alignr_epi32(via28, via28, 7);
__m512 dat2487 = _mm512_castsi512_ps(via28);
__m512 dat2488 = sum665;
__m512i via29 = _mm512_castps_si512(sum665);
via29 = _mm512_alignr_epi32(via29, via29, 7);
__m512 dat2489 = _mm512_castsi512_ps(via29);
__m512 dat2490 = sum666;
__m512i via30 = _mm512_castps_si512(sum666);
via30 = _mm512_alignr_epi32(via30, via30, 7);
__m512 dat2491 = _mm512_castsi512_ps(via30);
__m512 dat2492 = sum667;
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)960, 127, dat2486);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)988, 127, dat2487);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1016, 127, dat2488);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1044, 127, dat2489);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1072, 127, dat2490);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1100, 127, dat2491);
_mm512_mask_storeu_ps(datPtr37+819200*i69+28*h57+1920*k175+(ptrdiff_t)1128, 127, dat2492);
}
}
// This assignment has no further effect: the enclosing loop ends after this
// single iteration.
j60 = 1;
}
}

/*
 * Dispatches ResNet50OneApply10Callee1 through the threader.
 *
 * team72      - thread team handed unchanged to ResNet50ThreaderDo1.
 * tensors117  - tensor pointer array; the callee reads element 0 as arranged
 *               weights, element 1 as arranged data and element 2 as output.
 *
 * The callee receives a two-slot pointer bundle rather than the tensor array
 * directly: slot 0 is the tensor array, slot 1 is null (the callee does not
 * read it). The hull is 427 x 1 x 1: in the callee, indices 0..425 each
 * handle a group of 6 output channels and index 426 handles the last 4.
 *
 * NOTE(review): the bundle and the task live on this stack frame, so this
 * relies on ResNet50ThreaderDo1 not touching them after it returns - confirm
 * against the threader implementation.
 */
static void ResNet50OneApply10(ResNet50ThreaderTeam1* team72, char** tensors117) {
    enum { kDims = 3, kGroups = 427 };
    void* bundle[2];
    bundle[0] = tensors117;
    bundle[1] = 0;

    ResNet50ThreaderTask1 job;
    job.nd1 = kDims;
    job.hull1[0] = kGroups;
    /* Only the first dimension is actually split; the rest have extent 1. */
    for (int axis = 1; axis < kDims; ++axis) {
        job.hull1[axis] = 1;
    }
    job.any1 = bundle;
    job.callee1 = ResNet50OneApply10Callee1;
    ResNet50ThreaderDo1(team72, &job);
}

static void ResNet50OneArrangeWts11Callee1(ResNet50ThreaderTask1* task130, int64_t* pt70) {
char** tensors128 = task130->any1;
ptrdiff_t b74 = pt70[0];
char*restrict wtPtr23 = tensors128[0]+(ptrdiff_t)3340*0+(ptrdiff_t)4194304*0;
char*restrict biasPtr23 = tensors128[1]+(ptrdiff_t)8192*0;
char*restrict bnPtr23 = tensors128[2]+(ptrdiff_t)8*2048*0;
char*restrict arranged21 = tensors128[3]+(ptrdiff_t)6848512*0+(ptrdiff_t)4202496*0;
ptrdiff_t ii31 = 1;
for (ptrdiff_t i74 = 0; i74 < ii31; ++i74) {
ptrdiff_t j65 = 1*b74;
ptrdiff_t jj54 = j65+1;
for (; j65 < jj54; ++j65) {
if (j65 < 127) {
ptrdiff_t k181 = 0+16*(j65-0);
ptrdiff_t l80 = (size_t)(0+k181)/6;
ptrdiff_t cut33 = (size_t)(0+k181)%6;
switch (cut33) {
case 0:;
case 2: {
__m512 sum685 = _mm512_maskz_loadu_ps(65535, biasPtr23+8192*i74+4*k181);
__m512i pmMul47 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd47 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo39 = _mm512_loadu_ps(bnPtr23+(ptrdiff_t)8*(k181+2048*i74));
__m512 masHi39 = _mm512_maskz_loadu_ps(65535, bnPtr23+(ptrdiff_t)8*(k181+2048*i74)+(ptrdiff_t)64);
__m512 postMul78 = _mm512_permutex2var_ps(masLo39, pmMul47, masHi39);
__m512 postAdd48 = _mm512_permutex2var_ps(masLo39, pmAdd47, masHi39);
sum685 = _mm512_fmadd_ps(sum685, postMul78, postAdd48);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*0+(ptrdiff_t)0, 63>>cut33, sum685);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*0+(ptrdiff_t)12288, 4032>>cut33, sum685);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*0+(ptrdiff_t)24576, 65535-(4095>>cut33), sum685);
ptrdiff_t c60 = 0;
for (; c60 != 32; ++c60) {
__m512 wt803 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)0);
__m512 wt804 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)2048);
__m512 wt805 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)4096);
__m512 wt806 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)6144);
__m512 wt807 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)8192);
__m512 wt808 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)10240);
__m512 wt809 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)12288);
__m512 wt810 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)14336);
__m512 wt811 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)16384);
__m512 wt812 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)18432);
__m512 wt813 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)20480);
__m512 wt814 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)22528);
__m512 wt815 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)24576);
__m512 wt816 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)26624);
__m512 wt817 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)28672);
__m512 wt818 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c60+(ptrdiff_t)30720);
__m512 tmp19733 = _mm512_unpacklo_ps(wt803, wt804);
__m512 tmp19734 = _mm512_unpackhi_ps(wt803, wt804);
__m512 tmp19735 = _mm512_unpacklo_ps(wt805, wt806);
__m512 tmp19736 = _mm512_unpackhi_ps(wt805, wt806);
__m512 tmp19737 = _mm512_unpacklo_ps(wt807, wt808);
__m512 tmp19738 = _mm512_unpackhi_ps(wt807, wt808);
__m512 tmp19739 = _mm512_unpacklo_ps(wt809, wt810);
__m512 tmp19740 = _mm512_unpackhi_ps(wt809, wt810);
__m512 tmp19741 = _mm512_unpacklo_ps(wt811, wt812);
__m512 tmp19742 = _mm512_unpackhi_ps(wt811, wt812);
__m512 tmp19743 = _mm512_unpacklo_ps(wt813, wt814);
__m512 tmp19744 = _mm512_unpackhi_ps(wt813, wt814);
__m512 tmp19745 = _mm512_unpacklo_ps(wt815, wt816);
__m512 tmp19746 = _mm512_unpackhi_ps(wt815, wt816);
__m512 tmp19747 = _mm512_unpacklo_ps(wt817, wt818);
__m512 tmp19748 = _mm512_unpackhi_ps(wt817, wt818);
__m512 tmp19749 = _mm512_shuffle_ps(tmp19733, tmp19735, 68);
__m512 tmp19750 = _mm512_shuffle_ps(tmp19733, tmp19735, 238);
__m512 tmp19751 = _mm512_shuffle_ps(tmp19734, tmp19736, 68);
__m512 tmp19752 = _mm512_shuffle_ps(tmp19734, tmp19736, 238);
__m512 tmp19753 = _mm512_shuffle_ps(tmp19737, tmp19739, 68);
__m512 tmp19754 = _mm512_shuffle_ps(tmp19737, tmp19739, 238);
__m512 tmp19755 = _mm512_shuffle_ps(tmp19738, tmp19740, 68);
__m512 tmp19756 = _mm512_shuffle_ps(tmp19738, tmp19740, 238);
__m512 tmp19757 = _mm512_shuffle_ps(tmp19741, tmp19743, 68);
__m512 tmp19758 = _mm512_shuffle_ps(tmp19741, tmp19743, 238);
__m512 tmp19759 = _mm512_shuffle_ps(tmp19742, tmp19744, 68);
__m512 tmp19760 = _mm512_shuffle_ps(tmp19742, tmp19744, 238);
__m512 tmp19761 = _mm512_shuffle_ps(tmp19745, tmp19747, 68);
__m512 tmp19762 = _mm512_shuffle_ps(tmp19745, tmp19747, 238);
__m512 tmp19763 = _mm512_shuffle_ps(tmp19746, tmp19748, 68);
__m512 tmp19764 = _mm512_shuffle_ps(tmp19746, tmp19748, 238);
__m512 tmp19765 = _mm512_shuffle_f32x4(tmp19749, tmp19753, 136);
__m512 tmp19766 = _mm512_shuffle_f32x4(tmp19749, tmp19753, 221);
__m512 tmp19767 = _mm512_shuffle_f32x4(tmp19750, tmp19754, 136);
__m512 tmp19768 = _mm512_shuffle_f32x4(tmp19750, tmp19754, 221);
__m512 tmp19769 = _mm512_shuffle_f32x4(tmp19751, tmp19755, 136);
__m512 tmp19770 = _mm512_shuffle_f32x4(tmp19751, tmp19755, 221);
__m512 tmp19771 = _mm512_shuffle_f32x4(tmp19752, tmp19756, 136);
__m512 tmp19772 = _mm512_shuffle_f32x4(tmp19752, tmp19756, 221);
__m512 tmp19773 = _mm512_shuffle_f32x4(tmp19757, tmp19761, 136);
__m512 tmp19774 = _mm512_shuffle_f32x4(tmp19757, tmp19761, 221);
__m512 tmp19775 = _mm512_shuffle_f32x4(tmp19758, tmp19762, 136);
__m512 tmp19776 = _mm512_shuffle_f32x4(tmp19758, tmp19762, 221);
__m512 tmp19777 = _mm512_shuffle_f32x4(tmp19759, tmp19763, 136);
__m512 tmp19778 = _mm512_shuffle_f32x4(tmp19759, tmp19763, 221);
__m512 tmp19779 = _mm512_shuffle_f32x4(tmp19760, tmp19764, 136);
__m512 tmp19780 = _mm512_shuffle_f32x4(tmp19760, tmp19764, 221);
wt803 = _mm512_shuffle_f32x4(tmp19765, tmp19773, 136);
wt811 = _mm512_shuffle_f32x4(tmp19765, tmp19773, 221);
wt804 = _mm512_shuffle_f32x4(tmp19767, tmp19775, 136);
wt812 = _mm512_shuffle_f32x4(tmp19767, tmp19775, 221);
wt805 = _mm512_shuffle_f32x4(tmp19769, tmp19777, 136);
wt813 = _mm512_shuffle_f32x4(tmp19769, tmp19777, 221);
wt806 = _mm512_shuffle_f32x4(tmp19771, tmp19779, 136);
wt814 = _mm512_shuffle_f32x4(tmp19771, tmp19779, 221);
wt807 = _mm512_shuffle_f32x4(tmp19766, tmp19774, 136);
wt815 = _mm512_shuffle_f32x4(tmp19766, tmp19774, 221);
wt808 = _mm512_shuffle_f32x4(tmp19768, tmp19776, 136);
wt816 = _mm512_shuffle_f32x4(tmp19768, tmp19776, 221);
wt809 = _mm512_shuffle_f32x4(tmp19770, tmp19778, 136);
wt817 = _mm512_shuffle_f32x4(tmp19770, tmp19778, 221);
wt810 = _mm512_shuffle_f32x4(tmp19772, tmp19780, 136);
wt818 = _mm512_shuffle_f32x4(tmp19772, tmp19780, 221);
wt803 = _mm512_mul_ps(wt803, postMul78);
wt804 = _mm512_mul_ps(wt804, postMul78);
wt805 = _mm512_mul_ps(wt805, postMul78);
wt806 = _mm512_mul_ps(wt806, postMul78);
wt807 = _mm512_mul_ps(wt807, postMul78);
wt808 = _mm512_mul_ps(wt808, postMul78);
wt809 = _mm512_mul_ps(wt809, postMul78);
wt810 = _mm512_mul_ps(wt810, postMul78);
wt811 = _mm512_mul_ps(wt811, postMul78);
wt812 = _mm512_mul_ps(wt812, postMul78);
wt813 = _mm512_mul_ps(wt813, postMul78);
wt814 = _mm512_mul_ps(wt814, postMul78);
wt815 = _mm512_mul_ps(wt815, postMul78);
wt816 = _mm512_mul_ps(wt816, postMul78);
wt817 = _mm512_mul_ps(wt817, postMul78);
wt818 = _mm512_mul_ps(wt818, postMul78);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(1+16*c60)+(ptrdiff_t)0, 63>>cut33, wt803);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(2+16*c60)+(ptrdiff_t)0, 63>>cut33, wt804);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(3+16*c60)+(ptrdiff_t)0, 63>>cut33, wt805);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(4+16*c60)+(ptrdiff_t)0, 63>>cut33, wt806);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(5+16*c60)+(ptrdiff_t)0, 63>>cut33, wt807);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(6+16*c60)+(ptrdiff_t)0, 63>>cut33, wt808);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(7+16*c60)+(ptrdiff_t)0, 63>>cut33, wt809);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(8+16*c60)+(ptrdiff_t)0, 63>>cut33, wt810);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(9+16*c60)+(ptrdiff_t)0, 63>>cut33, wt811);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(10+16*c60)+(ptrdiff_t)0, 63>>cut33, wt812);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(11+16*c60)+(ptrdiff_t)0, 63>>cut33, wt813);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(12+16*c60)+(ptrdiff_t)0, 63>>cut33, wt814);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(13+16*c60)+(ptrdiff_t)0, 63>>cut33, wt815);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(14+16*c60)+(ptrdiff_t)0, 63>>cut33, wt816);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(15+16*c60)+(ptrdiff_t)0, 63>>cut33, wt817);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(16+16*c60)+(ptrdiff_t)0, 63>>cut33, wt818);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(1+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt803);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(2+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt804);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(3+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt805);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(4+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt806);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(5+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt807);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(6+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt808);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(7+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt809);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(8+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt810);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(9+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt811);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(10+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt812);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(11+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt813);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(12+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt814);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(13+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt815);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(14+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt816);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(15+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt817);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(16+16*c60)+(ptrdiff_t)12288, 4032>>cut33, wt818);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(1+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt803);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(2+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt804);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(3+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt805);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(4+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt806);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(5+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt807);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(6+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt808);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(7+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt809);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(8+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt810);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(9+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt811);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(10+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt812);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(11+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt813);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(12+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt814);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(13+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt815);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(14+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt816);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(15+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt817);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(16+16*c60)+(ptrdiff_t)24576, 65535-(4095>>cut33), wt818);
}
break;
}
default: {
cut33 = 4;
__m512 sum686 = _mm512_maskz_loadu_ps(65535, biasPtr23+8192*i74+4*k181);
__m512i pmMul48 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd48 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo40 = _mm512_loadu_ps(bnPtr23+(ptrdiff_t)8*(k181+2048*i74));
__m512 masHi40 = _mm512_maskz_loadu_ps(65535, bnPtr23+(ptrdiff_t)8*(k181+2048*i74)+(ptrdiff_t)64);
__m512 postMul79 = _mm512_permutex2var_ps(masLo40, pmMul48, masHi40);
__m512 postAdd49 = _mm512_permutex2var_ps(masLo40, pmAdd48, masHi40);
sum686 = _mm512_fmadd_ps(sum686, postMul79, postAdd49);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*0+(ptrdiff_t)0, 63>>cut33, sum686);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*0+(ptrdiff_t)12288, 4032>>cut33, sum686);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*0+(ptrdiff_t)24576, 258048>>cut33, sum686);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*0+(ptrdiff_t)36864, 65535-(262143>>cut33), sum686);
ptrdiff_t c61 = 0;
for (; c61 != 32; ++c61) {
__m512 wt819 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)0);
__m512 wt820 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)2048);
__m512 wt821 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)4096);
__m512 wt822 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)6144);
__m512 wt823 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)8192);
__m512 wt824 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)10240);
__m512 wt825 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)12288);
__m512 wt826 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)14336);
__m512 wt827 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)16384);
__m512 wt828 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)18432);
__m512 wt829 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)20480);
__m512 wt830 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)22528);
__m512 wt831 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)24576);
__m512 wt832 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)26624);
__m512 wt833 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)28672);
__m512 wt834 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k181+64*c61+(ptrdiff_t)30720);
__m512 tmp19781 = _mm512_unpacklo_ps(wt819, wt820);
__m512 tmp19782 = _mm512_unpackhi_ps(wt819, wt820);
__m512 tmp19783 = _mm512_unpacklo_ps(wt821, wt822);
__m512 tmp19784 = _mm512_unpackhi_ps(wt821, wt822);
__m512 tmp19785 = _mm512_unpacklo_ps(wt823, wt824);
__m512 tmp19786 = _mm512_unpackhi_ps(wt823, wt824);
__m512 tmp19787 = _mm512_unpacklo_ps(wt825, wt826);
__m512 tmp19788 = _mm512_unpackhi_ps(wt825, wt826);
__m512 tmp19789 = _mm512_unpacklo_ps(wt827, wt828);
__m512 tmp19790 = _mm512_unpackhi_ps(wt827, wt828);
__m512 tmp19791 = _mm512_unpacklo_ps(wt829, wt830);
__m512 tmp19792 = _mm512_unpackhi_ps(wt829, wt830);
__m512 tmp19793 = _mm512_unpacklo_ps(wt831, wt832);
__m512 tmp19794 = _mm512_unpackhi_ps(wt831, wt832);
__m512 tmp19795 = _mm512_unpacklo_ps(wt833, wt834);
__m512 tmp19796 = _mm512_unpackhi_ps(wt833, wt834);
__m512 tmp19797 = _mm512_shuffle_ps(tmp19781, tmp19783, 68);
__m512 tmp19798 = _mm512_shuffle_ps(tmp19781, tmp19783, 238);
__m512 tmp19799 = _mm512_shuffle_ps(tmp19782, tmp19784, 68);
__m512 tmp19800 = _mm512_shuffle_ps(tmp19782, tmp19784, 238);
__m512 tmp19801 = _mm512_shuffle_ps(tmp19785, tmp19787, 68);
__m512 tmp19802 = _mm512_shuffle_ps(tmp19785, tmp19787, 238);
__m512 tmp19803 = _mm512_shuffle_ps(tmp19786, tmp19788, 68);
__m512 tmp19804 = _mm512_shuffle_ps(tmp19786, tmp19788, 238);
__m512 tmp19805 = _mm512_shuffle_ps(tmp19789, tmp19791, 68);
__m512 tmp19806 = _mm512_shuffle_ps(tmp19789, tmp19791, 238);
__m512 tmp19807 = _mm512_shuffle_ps(tmp19790, tmp19792, 68);
__m512 tmp19808 = _mm512_shuffle_ps(tmp19790, tmp19792, 238);
__m512 tmp19809 = _mm512_shuffle_ps(tmp19793, tmp19795, 68);
__m512 tmp19810 = _mm512_shuffle_ps(tmp19793, tmp19795, 238);
__m512 tmp19811 = _mm512_shuffle_ps(tmp19794, tmp19796, 68);
__m512 tmp19812 = _mm512_shuffle_ps(tmp19794, tmp19796, 238);
__m512 tmp19813 = _mm512_shuffle_f32x4(tmp19797, tmp19801, 136);
__m512 tmp19814 = _mm512_shuffle_f32x4(tmp19797, tmp19801, 221);
__m512 tmp19815 = _mm512_shuffle_f32x4(tmp19798, tmp19802, 136);
__m512 tmp19816 = _mm512_shuffle_f32x4(tmp19798, tmp19802, 221);
__m512 tmp19817 = _mm512_shuffle_f32x4(tmp19799, tmp19803, 136);
__m512 tmp19818 = _mm512_shuffle_f32x4(tmp19799, tmp19803, 221);
__m512 tmp19819 = _mm512_shuffle_f32x4(tmp19800, tmp19804, 136);
__m512 tmp19820 = _mm512_shuffle_f32x4(tmp19800, tmp19804, 221);
__m512 tmp19821 = _mm512_shuffle_f32x4(tmp19805, tmp19809, 136);
__m512 tmp19822 = _mm512_shuffle_f32x4(tmp19805, tmp19809, 221);
__m512 tmp19823 = _mm512_shuffle_f32x4(tmp19806, tmp19810, 136);
__m512 tmp19824 = _mm512_shuffle_f32x4(tmp19806, tmp19810, 221);
__m512 tmp19825 = _mm512_shuffle_f32x4(tmp19807, tmp19811, 136);
__m512 tmp19826 = _mm512_shuffle_f32x4(tmp19807, tmp19811, 221);
__m512 tmp19827 = _mm512_shuffle_f32x4(tmp19808, tmp19812, 136);
__m512 tmp19828 = _mm512_shuffle_f32x4(tmp19808, tmp19812, 221);
wt819 = _mm512_shuffle_f32x4(tmp19813, tmp19821, 136);
wt827 = _mm512_shuffle_f32x4(tmp19813, tmp19821, 221);
wt820 = _mm512_shuffle_f32x4(tmp19815, tmp19823, 136);
wt828 = _mm512_shuffle_f32x4(tmp19815, tmp19823, 221);
wt821 = _mm512_shuffle_f32x4(tmp19817, tmp19825, 136);
wt829 = _mm512_shuffle_f32x4(tmp19817, tmp19825, 221);
wt822 = _mm512_shuffle_f32x4(tmp19819, tmp19827, 136);
wt830 = _mm512_shuffle_f32x4(tmp19819, tmp19827, 221);
wt823 = _mm512_shuffle_f32x4(tmp19814, tmp19822, 136);
wt831 = _mm512_shuffle_f32x4(tmp19814, tmp19822, 221);
wt824 = _mm512_shuffle_f32x4(tmp19816, tmp19824, 136);
wt832 = _mm512_shuffle_f32x4(tmp19816, tmp19824, 221);
wt825 = _mm512_shuffle_f32x4(tmp19818, tmp19826, 136);
wt833 = _mm512_shuffle_f32x4(tmp19818, tmp19826, 221);
wt826 = _mm512_shuffle_f32x4(tmp19820, tmp19828, 136);
wt834 = _mm512_shuffle_f32x4(tmp19820, tmp19828, 221);
wt819 = _mm512_mul_ps(wt819, postMul79);
wt820 = _mm512_mul_ps(wt820, postMul79);
wt821 = _mm512_mul_ps(wt821, postMul79);
wt822 = _mm512_mul_ps(wt822, postMul79);
wt823 = _mm512_mul_ps(wt823, postMul79);
wt824 = _mm512_mul_ps(wt824, postMul79);
wt825 = _mm512_mul_ps(wt825, postMul79);
wt826 = _mm512_mul_ps(wt826, postMul79);
wt827 = _mm512_mul_ps(wt827, postMul79);
wt828 = _mm512_mul_ps(wt828, postMul79);
wt829 = _mm512_mul_ps(wt829, postMul79);
wt830 = _mm512_mul_ps(wt830, postMul79);
wt831 = _mm512_mul_ps(wt831, postMul79);
wt832 = _mm512_mul_ps(wt832, postMul79);
wt833 = _mm512_mul_ps(wt833, postMul79);
wt834 = _mm512_mul_ps(wt834, postMul79);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(1+16*c61)+(ptrdiff_t)0, 63>>cut33, wt819);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(2+16*c61)+(ptrdiff_t)0, 63>>cut33, wt820);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(3+16*c61)+(ptrdiff_t)0, 63>>cut33, wt821);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(4+16*c61)+(ptrdiff_t)0, 63>>cut33, wt822);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(5+16*c61)+(ptrdiff_t)0, 63>>cut33, wt823);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(6+16*c61)+(ptrdiff_t)0, 63>>cut33, wt824);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(7+16*c61)+(ptrdiff_t)0, 63>>cut33, wt825);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(8+16*c61)+(ptrdiff_t)0, 63>>cut33, wt826);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(9+16*c61)+(ptrdiff_t)0, 63>>cut33, wt827);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(10+16*c61)+(ptrdiff_t)0, 63>>cut33, wt828);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(11+16*c61)+(ptrdiff_t)0, 63>>cut33, wt829);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(12+16*c61)+(ptrdiff_t)0, 63>>cut33, wt830);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(13+16*c61)+(ptrdiff_t)0, 63>>cut33, wt831);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(14+16*c61)+(ptrdiff_t)0, 63>>cut33, wt832);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(15+16*c61)+(ptrdiff_t)0, 63>>cut33, wt833);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(16+16*c61)+(ptrdiff_t)0, 63>>cut33, wt834);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(1+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt819);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(2+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt820);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(3+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt821);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(4+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt822);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(5+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt823);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(6+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt824);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(7+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt825);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(8+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt826);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(9+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt827);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(10+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt828);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(11+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt829);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(12+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt830);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(13+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt831);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(14+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt832);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(15+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt833);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(16+16*c61)+(ptrdiff_t)12288, 4032>>cut33, wt834);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(1+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt819);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(2+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt820);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(3+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt821);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(4+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt822);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(5+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt823);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(6+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt824);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(7+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt825);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(8+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt826);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(9+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt827);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(10+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt828);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(11+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt829);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(12+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt830);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(13+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt831);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(14+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt832);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(15+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt833);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(16+16*c61)+(ptrdiff_t)24576, 258048>>cut33, wt834);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(1+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt819);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(2+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt820);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(3+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt821);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(4+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt822);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(5+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt823);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(6+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt824);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(7+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt825);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(8+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt826);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(9+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt827);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(10+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt828);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(11+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt829);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(12+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt830);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(13+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt831);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(14+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt832);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(15+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt833);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l80+4*cut33+24*(16+16*c61)+(ptrdiff_t)36864, 65535-(262143>>cut33), wt834);
}
}
}
} else {
ptrdiff_t k180 = 2032;
ptrdiff_t l79 = (size_t)(0+k180)/6;
ptrdiff_t cut32 = (size_t)(0+k180)%6;
__m512 sum684 = _mm512_maskz_loadu_ps(65535, biasPtr23+8192*i74+4*k180);
__m512i pmMul49 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd49 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo41 = _mm512_loadu_ps(bnPtr23+(ptrdiff_t)8*(k180+2048*i74));
__m512 masHi41 = _mm512_maskz_loadu_ps(65535, bnPtr23+(ptrdiff_t)8*(k180+2048*i74)+(ptrdiff_t)64);
__m512 postMul77 = _mm512_permutex2var_ps(masLo41, pmMul49, masHi41);
__m512 postAdd47 = _mm512_permutex2var_ps(masLo41, pmAdd49, masHi41);
sum684 = _mm512_fmadd_ps(sum684, postMul77, postAdd47);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*0+(ptrdiff_t)0, 63>>cut32, sum684);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*0+(ptrdiff_t)12288, 4032>>cut32, sum684);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*0+(ptrdiff_t)24576, 258048>>cut32, sum684);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*0+(ptrdiff_t)36864, 65535-(262143>>cut32), sum684);
ptrdiff_t c59 = 0;
for (; c59 != 32; ++c59) {
__m512 wt787 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)0);
__m512 wt788 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)2048);
__m512 wt789 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)4096);
__m512 wt790 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)6144);
__m512 wt791 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)8192);
__m512 wt792 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)10240);
__m512 wt793 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)12288);
__m512 wt794 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)14336);
__m512 wt795 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)16384);
__m512 wt796 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)18432);
__m512 wt797 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)20480);
__m512 wt798 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)22528);
__m512 wt799 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)24576);
__m512 wt800 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)26624);
__m512 wt801 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)28672);
__m512 wt802 = _mm512_maskz_loadu_ps(65535, wtPtr23+4194304*i74+2048*k180+64*c59+(ptrdiff_t)30720);
__m512 tmp19829 = _mm512_unpacklo_ps(wt787, wt788);
__m512 tmp19830 = _mm512_unpackhi_ps(wt787, wt788);
__m512 tmp19831 = _mm512_unpacklo_ps(wt789, wt790);
__m512 tmp19832 = _mm512_unpackhi_ps(wt789, wt790);
__m512 tmp19833 = _mm512_unpacklo_ps(wt791, wt792);
__m512 tmp19834 = _mm512_unpackhi_ps(wt791, wt792);
__m512 tmp19835 = _mm512_unpacklo_ps(wt793, wt794);
__m512 tmp19836 = _mm512_unpackhi_ps(wt793, wt794);
__m512 tmp19837 = _mm512_unpacklo_ps(wt795, wt796);
__m512 tmp19838 = _mm512_unpackhi_ps(wt795, wt796);
__m512 tmp19839 = _mm512_unpacklo_ps(wt797, wt798);
__m512 tmp19840 = _mm512_unpackhi_ps(wt797, wt798);
__m512 tmp19841 = _mm512_unpacklo_ps(wt799, wt800);
__m512 tmp19842 = _mm512_unpackhi_ps(wt799, wt800);
__m512 tmp19843 = _mm512_unpacklo_ps(wt801, wt802);
__m512 tmp19844 = _mm512_unpackhi_ps(wt801, wt802);
__m512 tmp19845 = _mm512_shuffle_ps(tmp19829, tmp19831, 68);
__m512 tmp19846 = _mm512_shuffle_ps(tmp19829, tmp19831, 238);
__m512 tmp19847 = _mm512_shuffle_ps(tmp19830, tmp19832, 68);
__m512 tmp19848 = _mm512_shuffle_ps(tmp19830, tmp19832, 238);
__m512 tmp19849 = _mm512_shuffle_ps(tmp19833, tmp19835, 68);
__m512 tmp19850 = _mm512_shuffle_ps(tmp19833, tmp19835, 238);
__m512 tmp19851 = _mm512_shuffle_ps(tmp19834, tmp19836, 68);
__m512 tmp19852 = _mm512_shuffle_ps(tmp19834, tmp19836, 238);
__m512 tmp19853 = _mm512_shuffle_ps(tmp19837, tmp19839, 68);
__m512 tmp19854 = _mm512_shuffle_ps(tmp19837, tmp19839, 238);
__m512 tmp19855 = _mm512_shuffle_ps(tmp19838, tmp19840, 68);
__m512 tmp19856 = _mm512_shuffle_ps(tmp19838, tmp19840, 238);
__m512 tmp19857 = _mm512_shuffle_ps(tmp19841, tmp19843, 68);
__m512 tmp19858 = _mm512_shuffle_ps(tmp19841, tmp19843, 238);
__m512 tmp19859 = _mm512_shuffle_ps(tmp19842, tmp19844, 68);
__m512 tmp19860 = _mm512_shuffle_ps(tmp19842, tmp19844, 238);
__m512 tmp19861 = _mm512_shuffle_f32x4(tmp19845, tmp19849, 136);
__m512 tmp19862 = _mm512_shuffle_f32x4(tmp19845, tmp19849, 221);
__m512 tmp19863 = _mm512_shuffle_f32x4(tmp19846, tmp19850, 136);
__m512 tmp19864 = _mm512_shuffle_f32x4(tmp19846, tmp19850, 221);
__m512 tmp19865 = _mm512_shuffle_f32x4(tmp19847, tmp19851, 136);
__m512 tmp19866 = _mm512_shuffle_f32x4(tmp19847, tmp19851, 221);
__m512 tmp19867 = _mm512_shuffle_f32x4(tmp19848, tmp19852, 136);
__m512 tmp19868 = _mm512_shuffle_f32x4(tmp19848, tmp19852, 221);
__m512 tmp19869 = _mm512_shuffle_f32x4(tmp19853, tmp19857, 136);
__m512 tmp19870 = _mm512_shuffle_f32x4(tmp19853, tmp19857, 221);
__m512 tmp19871 = _mm512_shuffle_f32x4(tmp19854, tmp19858, 136);
__m512 tmp19872 = _mm512_shuffle_f32x4(tmp19854, tmp19858, 221);
__m512 tmp19873 = _mm512_shuffle_f32x4(tmp19855, tmp19859, 136);
__m512 tmp19874 = _mm512_shuffle_f32x4(tmp19855, tmp19859, 221);
__m512 tmp19875 = _mm512_shuffle_f32x4(tmp19856, tmp19860, 136);
__m512 tmp19876 = _mm512_shuffle_f32x4(tmp19856, tmp19860, 221);
wt787 = _mm512_shuffle_f32x4(tmp19861, tmp19869, 136);
wt795 = _mm512_shuffle_f32x4(tmp19861, tmp19869, 221);
wt788 = _mm512_shuffle_f32x4(tmp19863, tmp19871, 136);
wt796 = _mm512_shuffle_f32x4(tmp19863, tmp19871, 221);
wt789 = _mm512_shuffle_f32x4(tmp19865, tmp19873, 136);
wt797 = _mm512_shuffle_f32x4(tmp19865, tmp19873, 221);
wt790 = _mm512_shuffle_f32x4(tmp19867, tmp19875, 136);
wt798 = _mm512_shuffle_f32x4(tmp19867, tmp19875, 221);
wt791 = _mm512_shuffle_f32x4(tmp19862, tmp19870, 136);
wt799 = _mm512_shuffle_f32x4(tmp19862, tmp19870, 221);
wt792 = _mm512_shuffle_f32x4(tmp19864, tmp19872, 136);
wt800 = _mm512_shuffle_f32x4(tmp19864, tmp19872, 221);
wt793 = _mm512_shuffle_f32x4(tmp19866, tmp19874, 136);
wt801 = _mm512_shuffle_f32x4(tmp19866, tmp19874, 221);
wt794 = _mm512_shuffle_f32x4(tmp19868, tmp19876, 136);
wt802 = _mm512_shuffle_f32x4(tmp19868, tmp19876, 221);
wt787 = _mm512_mul_ps(wt787, postMul77);
wt788 = _mm512_mul_ps(wt788, postMul77);
wt789 = _mm512_mul_ps(wt789, postMul77);
wt790 = _mm512_mul_ps(wt790, postMul77);
wt791 = _mm512_mul_ps(wt791, postMul77);
wt792 = _mm512_mul_ps(wt792, postMul77);
wt793 = _mm512_mul_ps(wt793, postMul77);
wt794 = _mm512_mul_ps(wt794, postMul77);
wt795 = _mm512_mul_ps(wt795, postMul77);
wt796 = _mm512_mul_ps(wt796, postMul77);
wt797 = _mm512_mul_ps(wt797, postMul77);
wt798 = _mm512_mul_ps(wt798, postMul77);
wt799 = _mm512_mul_ps(wt799, postMul77);
wt800 = _mm512_mul_ps(wt800, postMul77);
wt801 = _mm512_mul_ps(wt801, postMul77);
wt802 = _mm512_mul_ps(wt802, postMul77);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(1+16*c59)+(ptrdiff_t)0, 63>>cut32, wt787);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(2+16*c59)+(ptrdiff_t)0, 63>>cut32, wt788);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(3+16*c59)+(ptrdiff_t)0, 63>>cut32, wt789);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(4+16*c59)+(ptrdiff_t)0, 63>>cut32, wt790);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(5+16*c59)+(ptrdiff_t)0, 63>>cut32, wt791);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(6+16*c59)+(ptrdiff_t)0, 63>>cut32, wt792);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(7+16*c59)+(ptrdiff_t)0, 63>>cut32, wt793);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(8+16*c59)+(ptrdiff_t)0, 63>>cut32, wt794);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(9+16*c59)+(ptrdiff_t)0, 63>>cut32, wt795);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(10+16*c59)+(ptrdiff_t)0, 63>>cut32, wt796);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(11+16*c59)+(ptrdiff_t)0, 63>>cut32, wt797);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(12+16*c59)+(ptrdiff_t)0, 63>>cut32, wt798);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(13+16*c59)+(ptrdiff_t)0, 63>>cut32, wt799);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(14+16*c59)+(ptrdiff_t)0, 63>>cut32, wt800);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(15+16*c59)+(ptrdiff_t)0, 63>>cut32, wt801);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(16+16*c59)+(ptrdiff_t)0, 63>>cut32, wt802);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(1+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt787);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(2+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt788);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(3+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt789);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(4+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt790);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(5+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt791);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(6+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt792);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(7+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt793);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(8+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt794);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(9+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt795);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(10+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt796);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(11+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt797);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(12+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt798);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(13+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt799);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(14+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt800);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(15+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt801);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(16+16*c59)+(ptrdiff_t)12288, 4032>>cut32, wt802);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(1+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt787);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(2+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt788);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(3+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt789);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(4+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt790);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(5+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt791);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(6+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt792);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(7+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt793);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(8+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt794);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(9+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt795);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(10+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt796);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(11+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt797);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(12+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt798);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(13+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt799);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(14+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt800);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(15+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt801);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+24*(16+16*c59)+(ptrdiff_t)24576, 258048>>cut32, wt802);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(1+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt787);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(2+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt788);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(3+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt789);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(4+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt790);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(5+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt791);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(6+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt792);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(7+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt793);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(8+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt794);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(9+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt795);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(10+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt796);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(11+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt797);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(12+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt798);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(13+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt799);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(14+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt800);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(15+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt801);
_mm512_mask_storeu_ps(arranged21+4202496*i74+12312*l79+4*cut32+8*(16+16*c59)+(ptrdiff_t)36864, 65535-(262143>>cut32), wt802);
}
}
}
}
}

// Builds the task descriptor for the weight-arrangement callee and hands it,
// together with the team, to ResNet50ThreaderDo1.
//
// team77     - thread team forwarded unchanged to ResNet50ThreaderDo1.
// tensors127 - buffer array forwarded unchanged to the callee through any1.
//
// The task grid has three dimensions with extents 128 x 1 x 1.
static void ResNet50OneArrangeWts11(ResNet50ThreaderTeam1* team77, char** tensors127) {
	ResNet50ThreaderTask1 job;
	// Grid shape first, payload and entry point afterwards; the fields are
	// independent so the order of these assignments does not matter.
	job.nd1 = 3;
	job.hull1[2] = 1;
	job.hull1[1] = 1;
	job.hull1[0] = 128;
	job.any1 = tensors127;
	job.callee1 = ResNet50OneArrangeWts11Callee1;
	ResNet50ThreaderDo1(team77, &job);
}

// Copies one 128-row slice of the source tensor into the arranged buffer.
//
// task132 - task descriptor; any1 is read as an array of buffers where
//           element 0 is the source and element 1 is the destination.
// pt71    - grid coordinate; only pt71[0] is read and selects the slice, so
//           this call handles rows 128*pt71[0] .. 128*pt71[0]+127.
//
// Every row contributes 49 floats: three full 16-lane vectors plus lane 0 of
// a fourth. Rows sit 320 bytes apart in the source and 256 bytes apart in the
// destination.
static void ResNet50OneArrangeDats11Callee1(ResNet50ThreaderTask1* task132, int64_t* pt71) {
	char** bufs = task132->any1;
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	const ptrdiff_t slice = pt71[0];
	const ptrdiff_t firstRow = 128*slice;
	const ptrdiff_t endRow = firstRow+128;
	for (ptrdiff_t row = firstRow; row < endRow; ++row) {
		const char* in = src+320*row;
		char* out = dst+256*row;
		// Read the whole row before writing any of it.
		__m512 part0 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)0);
		__m512 part1 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)64);
		__m512 part2 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)128);
		// Mask 1: only the 49th float of the row is touched here.
		__m512 part3 = _mm512_maskz_loadu_ps(1, in+(ptrdiff_t)192);
		_mm512_mask_storeu_ps(out+(ptrdiff_t)0, 65535, part0);
		_mm512_mask_storeu_ps(out+(ptrdiff_t)64, 65535, part1);
		_mm512_mask_storeu_ps(out+(ptrdiff_t)128, 65535, part2);
		_mm512_mask_storeu_ps(out+(ptrdiff_t)192, 1, part3);
	}
}

// Builds the task descriptor for the data-arrangement callee and hands it,
// together with the team, to ResNet50ThreaderDo1.
//
// team78     - thread team forwarded unchanged to ResNet50ThreaderDo1.
// tensors129 - buffer array forwarded unchanged to the callee through any1.
//
// The task grid has four dimensions with extents 4 x 1 x 1 x 1; the callee
// processes 128 rows per grid point along the first dimension.
static void ResNet50OneArrangeDats11(ResNet50ThreaderTeam1* team78, char** tensors129) {
	ResNet50ThreaderTask1 job;
	// Grid shape first, payload and entry point afterwards; the fields are
	// independent so the order of these assignments does not matter.
	job.nd1 = 4;
	job.hull1[3] = 1;
	job.hull1[2] = 1;
	job.hull1[1] = 1;
	job.hull1[0] = 4;
	job.any1 = tensors129;
	job.callee1 = ResNet50OneArrangeDats11Callee1;
	ResNet50ThreaderDo1(team78, &job);
}

// Computes one group of output rows: a 512-step multiply-accumulate of
// arranged weights against arranged data, followed by an element-wise add of
// a second tensor, a clamp at zero, and a masked store.
//
// task134 - task descriptor; any1 is read as an array of pointers and only
//           element 0 (the buffer array) is used in this function.
// pt72    - grid coordinate; only pt72[0] is read and selects the group.
//
// Buffer array layout as used below:
//   [0] arranged weights (read)    [1] arranged data (read)
//   [2] addend tensor (read)       [3] output tensor (written)
//
// Each call handles exactly one group. Groups 0..340 are six rows wide; group
// 341 is two rows wide (341*6+2 = 2048 rows in total). The dispatcher in
// ResNet50OneApply11 uses an extent of 342 along the first grid dimension.
// Every output row is 49 floats (16+16+16+1 lanes) and rows are 320 bytes
// apart in both the addend and the output tensor.
static void ResNet50OneApply11Callee1(ResNet50ThreaderTask1* task134, int64_t* pt72) {
void** pair36 = task134->any1;
char** tensors132 = pair36[0];
// These three indices are fixed at zero here, so every term scaled by them
// below vanishes.
ptrdiff_t e37 = 0;
ptrdiff_t g44 = 0;
ptrdiff_t d27 = 0;
ptrdiff_t w79 = pt72[0];
char*restrict arrangedWts11 = tensors132[0]+6848512*e37+(ptrdiff_t)4202496*1*g44;
char*restrict arrangedDats11 = tensors132[1]+213760*e37+(ptrdiff_t)131072*1*g44;
char*restrict datPtr41 = tensors132[2]+(ptrdiff_t)655360*1*g44;
char*restrict datPtr42 = tensors132[3]+(ptrdiff_t)655360*1*g44;
ptrdiff_t ii33 = 1;
for (ptrdiff_t i76 = 0; i76 < ii33; ++i76) {
ptrdiff_t j67 = 1*d27;
ptrdiff_t k183 = 1*w79;
// kk66 equals the starting group index, so the check at the bottom of the
// loop body returns after the first iteration.
ptrdiff_t kk66 = k183+0;
// Six-row groups. Skipped entirely when the group index is 341, in which
// case control falls through to the two-row tail below.
for (; k183 != 341; ++k183) {
// With s62 == -1 the expressions below address bytes 0..20 of the group's
// 12312-byte weight record, i.e. the six floats stored ahead of the per-step
// weights (presumably the per-row bias written by the arrange step -- TODO
// confirm against ResNet50OneArrangeWts11Callee1).
ptrdiff_t s62 = -1;
__m512 sum687 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+24*s62+(ptrdiff_t)24));
__m512 sum691 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+24*s62+(ptrdiff_t)28));
__m512 sum695 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+24*s62+(ptrdiff_t)32));
__m512 sum699 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+24*s62+(ptrdiff_t)36));
__m512 sum703 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+24*s62+(ptrdiff_t)40));
__m512 sum707 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+24*s62+(ptrdiff_t)44));
// Each row owns four accumulators (one per data vector); all four start from
// the same broadcast value.
__m512 sum688 = sum687;
__m512 sum689 = sum687;
__m512 sum690 = sum687;
__m512 sum692 = sum691;
__m512 sum693 = sum691;
__m512 sum694 = sum691;
__m512 sum696 = sum695;
__m512 sum697 = sum695;
__m512 sum698 = sum695;
__m512 sum700 = sum699;
__m512 sum701 = sum699;
__m512 sum702 = sum699;
__m512 sum704 = sum703;
__m512 sum705 = sum703;
__m512 sum706 = sum703;
__m512 sum708 = sum707;
__m512 sum709 = sum707;
__m512 sum710 = sum707;
// Reduction: 512 steps. Each step loads four data vectors (256 bytes per
// step) and six scalar weights (24 bytes per step), and updates all 24
// accumulators with one fused multiply-add each.
for (s62 = 0; s62 < 512; ++s62) {
__m512 dat2506 = _mm512_loadu_ps(arrangedDats11+131072*i76+131072*j67+256*s62+(ptrdiff_t)0);
__m512 dat2507 = _mm512_loadu_ps(arrangedDats11+131072*i76+131072*j67+256*s62+(ptrdiff_t)64);
__m512 dat2508 = _mm512_loadu_ps(arrangedDats11+131072*i76+131072*j67+256*s62+(ptrdiff_t)128);
// Full 16-lane load; only lane 0 of the accumulators fed by this vector is
// ever stored (mask 1 in the epilogue).
__m512 dat2509 = _mm512_loadu_ps(arrangedDats11+131072*i76+131072*j67+256*s62+(ptrdiff_t)192);
__m512 wt835 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+24*s62+(ptrdiff_t)24));
sum687 = _mm512_fmadd_ps(wt835, dat2506, sum687);
sum688 = _mm512_fmadd_ps(wt835, dat2507, sum688);
sum689 = _mm512_fmadd_ps(wt835, dat2508, sum689);
sum690 = _mm512_fmadd_ps(wt835, dat2509, sum690);
__m512 wt836 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+24*s62+(ptrdiff_t)28));
sum691 = _mm512_fmadd_ps(wt836, dat2506, sum691);
sum692 = _mm512_fmadd_ps(wt836, dat2507, sum692);
sum693 = _mm512_fmadd_ps(wt836, dat2508, sum693);
sum694 = _mm512_fmadd_ps(wt836, dat2509, sum694);
__m512 wt837 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+24*s62+(ptrdiff_t)32));
sum695 = _mm512_fmadd_ps(wt837, dat2506, sum695);
sum696 = _mm512_fmadd_ps(wt837, dat2507, sum696);
sum697 = _mm512_fmadd_ps(wt837, dat2508, sum697);
sum698 = _mm512_fmadd_ps(wt837, dat2509, sum698);
__m512 wt838 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+24*s62+(ptrdiff_t)36));
sum699 = _mm512_fmadd_ps(wt838, dat2506, sum699);
sum700 = _mm512_fmadd_ps(wt838, dat2507, sum700);
sum701 = _mm512_fmadd_ps(wt838, dat2508, sum701);
sum702 = _mm512_fmadd_ps(wt838, dat2509, sum702);
__m512 wt839 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+24*s62+(ptrdiff_t)40));
sum703 = _mm512_fmadd_ps(wt839, dat2506, sum703);
sum704 = _mm512_fmadd_ps(wt839, dat2507, sum704);
sum705 = _mm512_fmadd_ps(wt839, dat2508, sum705);
sum706 = _mm512_fmadd_ps(wt839, dat2509, sum706);
__m512 wt840 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+24*s62+(ptrdiff_t)44));
sum707 = _mm512_fmadd_ps(wt840, dat2506, sum707);
sum708 = _mm512_fmadd_ps(wt840, dat2507, sum708);
sum709 = _mm512_fmadd_ps(wt840, dat2508, sum709);
sum710 = _mm512_fmadd_ps(wt840, dat2509, sum710);
}
// Epilogue, repeated once per row of the group: add the matching 49 floats of
// the addend tensor, clamp at zero (ReLU), and store 49 floats to the output.
// A group spans 1920 bytes (six rows of 320 bytes).
// Row 0 of the group.
sum687 = _mm512_add_ps(sum687, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)0));
sum688 = _mm512_add_ps(sum688, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)64));
sum689 = _mm512_add_ps(sum689, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)128));
sum690 = _mm512_add_ps(sum690, _mm512_maskz_loadu_ps(1, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)192));
sum687 = _mm512_max_ps(_mm512_setzero_ps(), sum687);
sum688 = _mm512_max_ps(_mm512_setzero_ps(), sum688);
sum689 = _mm512_max_ps(_mm512_setzero_ps(), sum689);
sum690 = _mm512_max_ps(_mm512_setzero_ps(), sum690);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)0, 65535, sum687);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)64, 65535, sum688);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)128, 65535, sum689);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)192, 1, sum690);
// Row 1 of the group.
sum691 = _mm512_add_ps(sum691, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)320));
sum692 = _mm512_add_ps(sum692, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)384));
sum693 = _mm512_add_ps(sum693, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)448));
sum694 = _mm512_add_ps(sum694, _mm512_maskz_loadu_ps(1, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)512));
sum691 = _mm512_max_ps(_mm512_setzero_ps(), sum691);
sum692 = _mm512_max_ps(_mm512_setzero_ps(), sum692);
sum693 = _mm512_max_ps(_mm512_setzero_ps(), sum693);
sum694 = _mm512_max_ps(_mm512_setzero_ps(), sum694);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)320, 65535, sum691);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)384, 65535, sum692);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)448, 65535, sum693);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)512, 1, sum694);
// Row 2 of the group.
sum695 = _mm512_add_ps(sum695, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)640));
sum696 = _mm512_add_ps(sum696, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)704));
sum697 = _mm512_add_ps(sum697, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)768));
sum698 = _mm512_add_ps(sum698, _mm512_maskz_loadu_ps(1, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)832));
sum695 = _mm512_max_ps(_mm512_setzero_ps(), sum695);
sum696 = _mm512_max_ps(_mm512_setzero_ps(), sum696);
sum697 = _mm512_max_ps(_mm512_setzero_ps(), sum697);
sum698 = _mm512_max_ps(_mm512_setzero_ps(), sum698);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)640, 65535, sum695);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)704, 65535, sum696);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)768, 65535, sum697);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)832, 1, sum698);
// Row 3 of the group.
sum699 = _mm512_add_ps(sum699, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)960));
sum700 = _mm512_add_ps(sum700, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)1024));
sum701 = _mm512_add_ps(sum701, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)1088));
sum702 = _mm512_add_ps(sum702, _mm512_maskz_loadu_ps(1, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)1152));
sum699 = _mm512_max_ps(_mm512_setzero_ps(), sum699);
sum700 = _mm512_max_ps(_mm512_setzero_ps(), sum700);
sum701 = _mm512_max_ps(_mm512_setzero_ps(), sum701);
sum702 = _mm512_max_ps(_mm512_setzero_ps(), sum702);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)960, 65535, sum699);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)1024, 65535, sum700);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)1088, 65535, sum701);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)1152, 1, sum702);
// Row 4 of the group.
sum703 = _mm512_add_ps(sum703, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)1280));
sum704 = _mm512_add_ps(sum704, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)1344));
sum705 = _mm512_add_ps(sum705, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)1408));
sum706 = _mm512_add_ps(sum706, _mm512_maskz_loadu_ps(1, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)1472));
sum703 = _mm512_max_ps(_mm512_setzero_ps(), sum703);
sum704 = _mm512_max_ps(_mm512_setzero_ps(), sum704);
sum705 = _mm512_max_ps(_mm512_setzero_ps(), sum705);
sum706 = _mm512_max_ps(_mm512_setzero_ps(), sum706);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)1280, 65535, sum703);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)1344, 65535, sum704);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)1408, 65535, sum705);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)1472, 1, sum706);
// Row 5 of the group.
sum707 = _mm512_add_ps(sum707, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)1600));
sum708 = _mm512_add_ps(sum708, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)1664));
sum709 = _mm512_add_ps(sum709, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)1728));
sum710 = _mm512_add_ps(sum710, _mm512_maskz_loadu_ps(1, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)1792));
sum707 = _mm512_max_ps(_mm512_setzero_ps(), sum707);
sum708 = _mm512_max_ps(_mm512_setzero_ps(), sum708);
sum709 = _mm512_max_ps(_mm512_setzero_ps(), sum709);
sum710 = _mm512_max_ps(_mm512_setzero_ps(), sum710);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)1600, 65535, sum707);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)1664, 65535, sum708);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)1728, 65535, sum709);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)1792, 1, sum710);
// Always true on the first pass (k183 still equals kk66), so a six-row call
// ends here and never reaches the tail.
if (k183 >= kk66) return;
}
// Tail group (reached only when the loop above was skipped, i.e. group 341):
// two rows wide, so the weight record uses an 8-byte step instead of 24.
// With s63 == -1 the two initial values come from bytes 0 and 4 of the record.
ptrdiff_t s63 = -1;
__m512 sum711 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+8*s63+(ptrdiff_t)8));
__m512 sum715 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+8*s63+(ptrdiff_t)12));
__m512 sum712 = sum711;
__m512 sum713 = sum711;
__m512 sum714 = sum711;
__m512 sum716 = sum715;
__m512 sum717 = sum715;
__m512 sum718 = sum715;
// Same 512-step reduction as above, with two weights and eight accumulators.
for (s63 = 0; s63 < 512; ++s63) {
__m512 dat2510 = _mm512_loadu_ps(arrangedDats11+131072*i76+131072*j67+256*s63+(ptrdiff_t)0);
__m512 dat2511 = _mm512_loadu_ps(arrangedDats11+131072*i76+131072*j67+256*s63+(ptrdiff_t)64);
__m512 dat2512 = _mm512_loadu_ps(arrangedDats11+131072*i76+131072*j67+256*s63+(ptrdiff_t)128);
__m512 dat2513 = _mm512_loadu_ps(arrangedDats11+131072*i76+131072*j67+256*s63+(ptrdiff_t)192);
__m512 wt841 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+8*s63+(ptrdiff_t)8));
sum711 = _mm512_fmadd_ps(wt841, dat2510, sum711);
sum712 = _mm512_fmadd_ps(wt841, dat2511, sum712);
sum713 = _mm512_fmadd_ps(wt841, dat2512, sum713);
sum714 = _mm512_fmadd_ps(wt841, dat2513, sum714);
__m512 wt842 = _mm512_set1_ps(*(float*)(arrangedWts11+4202496*i76+12312*k183+8*s63+(ptrdiff_t)12));
sum715 = _mm512_fmadd_ps(wt842, dat2510, sum715);
sum716 = _mm512_fmadd_ps(wt842, dat2511, sum716);
sum717 = _mm512_fmadd_ps(wt842, dat2512, sum717);
sum718 = _mm512_fmadd_ps(wt842, dat2513, sum718);
}
// Tail epilogue, row 0: add, clamp at zero, store 49 floats.
sum711 = _mm512_add_ps(sum711, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)0));
sum712 = _mm512_add_ps(sum712, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)64));
sum713 = _mm512_add_ps(sum713, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)128));
sum714 = _mm512_add_ps(sum714, _mm512_maskz_loadu_ps(1, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)192));
sum711 = _mm512_max_ps(_mm512_setzero_ps(), sum711);
sum712 = _mm512_max_ps(_mm512_setzero_ps(), sum712);
sum713 = _mm512_max_ps(_mm512_setzero_ps(), sum713);
sum714 = _mm512_max_ps(_mm512_setzero_ps(), sum714);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)0, 65535, sum711);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)64, 65535, sum712);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)128, 65535, sum713);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)192, 1, sum714);
// Tail epilogue, row 1.
sum715 = _mm512_add_ps(sum715, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)320));
sum716 = _mm512_add_ps(sum716, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)384));
sum717 = _mm512_add_ps(sum717, _mm512_maskz_loadu_ps(65535, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)448));
sum718 = _mm512_add_ps(sum718, _mm512_maskz_loadu_ps(1, datPtr41+655360*i76+256*j67+1920*k183+(ptrdiff_t)512));
sum715 = _mm512_max_ps(_mm512_setzero_ps(), sum715);
sum716 = _mm512_max_ps(_mm512_setzero_ps(), sum716);
sum717 = _mm512_max_ps(_mm512_setzero_ps(), sum717);
sum718 = _mm512_max_ps(_mm512_setzero_ps(), sum718);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)320, 65535, sum715);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)384, 65535, sum716);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)448, 65535, sum717);
_mm512_mask_storeu_ps(datPtr42+655360*i76+256*j67+1920*k183+(ptrdiff_t)512, 1, sum718);
}
}

// Builds the task descriptor for the apply callee and hands it, together with
// the team, to ResNet50ThreaderDo1.
//
// team79     - thread team forwarded unchanged to ResNet50ThreaderDo1.
// tensors131 - buffer array; passed to the callee as element 0 of a
//              two-element pointer pair whose second element is null.
//
// The task grid has three dimensions with extents 342 x 1 x 1.
static void ResNet50OneApply11(ResNet50ThreaderTeam1* team79, char** tensors131) {
	// The pair lives on this stack frame and is handed to the callee via any1.
	void* bundle[2];
	bundle[0] = tensors131;
	bundle[1] = 0;
	ResNet50ThreaderTask1 job;
	job.nd1 = 3;
	job.hull1[2] = 1;
	job.hull1[1] = 1;
	job.hull1[0] = 342;
	job.any1 = bundle;
	job.callee1 = ResNet50OneApply11Callee1;
	ResNet50ThreaderDo1(team79, &job);
}

static void ResNet50OneArrangeWts12Callee1(ResNet50ThreaderTask1* task136, int64_t* pt73) {
char** tensors134 = task136->any1;
ptrdiff_t b75 = pt73[0];
ptrdiff_t e38 = pt73[2];
if (e38 < 2) {
char*restrict wtPtr24 = tensors134[0]+(ptrdiff_t)3340*e38+(ptrdiff_t)4194304*0;
char*restrict biasPtr24 = tensors134[1]+(ptrdiff_t)2048*0;
char*restrict bnPtr24 = tensors134[2]+(ptrdiff_t)8*512*0;
char*restrict arranged23 = tensors134[3]+(ptrdiff_t)1712128*e38+(ptrdiff_t)1712128*0;
ptrdiff_t ii34 = 1;
for (ptrdiff_t i77 = 0; i77 < ii34; ++i77) {
ptrdiff_t j68 = 1*b75;
ptrdiff_t jj55 = j68+1;
for (; j68 < jj55; ++j68) {
if (j68 < 31) {
ptrdiff_t k185 = 0+16*(j68-0);
ptrdiff_t l82 = (size_t)(0+k185)/6;
ptrdiff_t cut35 = (size_t)(0+k185)%6;
switch (cut35) {
case 0:;
case 2: {
__m512 sum720;
if (!e38) {
sum720 = _mm512_maskz_loadu_ps(65535, biasPtr24+2048*i77+4*k185);
} else {
sum720 = _mm512_setzero_ps();
}
__m512i pmMul50 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd50 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo42 = _mm512_loadu_ps(bnPtr24+(ptrdiff_t)8*(k185+512*i77));
__m512 masHi42 = _mm512_maskz_loadu_ps(65535, bnPtr24+(ptrdiff_t)8*(k185+512*i77)+(ptrdiff_t)64);
__m512 postMul81 = _mm512_permutex2var_ps(masLo42, pmMul50, masHi42);
__m512 postAdd51 = _mm512_permutex2var_ps(masLo42, pmAdd50, masHi42);
if (!e38) sum720 = _mm512_fmadd_ps(sum720, postMul81, postAdd51);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*0+(ptrdiff_t)0, 63>>cut35, sum720);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*0+(ptrdiff_t)20040, 4032>>cut35, sum720);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*0+(ptrdiff_t)40080, 65535-(4095>>cut35), sum720);
ptrdiff_t c63 = 0;
for (; c63 != 52; ++c63) {
__m512 wt875 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)0);
__m512 wt876 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)8192);
__m512 wt877 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)16384);
__m512 wt878 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)24576);
__m512 wt879 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)32768);
__m512 wt880 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)40960);
__m512 wt881 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)49152);
__m512 wt882 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)57344);
__m512 wt883 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)65536);
__m512 wt884 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)73728);
__m512 wt885 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)81920);
__m512 wt886 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)90112);
__m512 wt887 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)98304);
__m512 wt888 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)106496);
__m512 wt889 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)114688);
__m512 wt890 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)122880);
__m512 tmp19877 = _mm512_unpacklo_ps(wt875, wt876);
__m512 tmp19878 = _mm512_unpackhi_ps(wt875, wt876);
__m512 tmp19879 = _mm512_unpacklo_ps(wt877, wt878);
__m512 tmp19880 = _mm512_unpackhi_ps(wt877, wt878);
__m512 tmp19881 = _mm512_unpacklo_ps(wt879, wt880);
__m512 tmp19882 = _mm512_unpackhi_ps(wt879, wt880);
__m512 tmp19883 = _mm512_unpacklo_ps(wt881, wt882);
__m512 tmp19884 = _mm512_unpackhi_ps(wt881, wt882);
__m512 tmp19885 = _mm512_unpacklo_ps(wt883, wt884);
__m512 tmp19886 = _mm512_unpackhi_ps(wt883, wt884);
__m512 tmp19887 = _mm512_unpacklo_ps(wt885, wt886);
__m512 tmp19888 = _mm512_unpackhi_ps(wt885, wt886);
__m512 tmp19889 = _mm512_unpacklo_ps(wt887, wt888);
__m512 tmp19890 = _mm512_unpackhi_ps(wt887, wt888);
__m512 tmp19891 = _mm512_unpacklo_ps(wt889, wt890);
__m512 tmp19892 = _mm512_unpackhi_ps(wt889, wt890);
__m512 tmp19893 = _mm512_shuffle_ps(tmp19877, tmp19879, 68);
__m512 tmp19894 = _mm512_shuffle_ps(tmp19877, tmp19879, 238);
__m512 tmp19895 = _mm512_shuffle_ps(tmp19878, tmp19880, 68);
__m512 tmp19896 = _mm512_shuffle_ps(tmp19878, tmp19880, 238);
__m512 tmp19897 = _mm512_shuffle_ps(tmp19881, tmp19883, 68);
__m512 tmp19898 = _mm512_shuffle_ps(tmp19881, tmp19883, 238);
__m512 tmp19899 = _mm512_shuffle_ps(tmp19882, tmp19884, 68);
__m512 tmp19900 = _mm512_shuffle_ps(tmp19882, tmp19884, 238);
__m512 tmp19901 = _mm512_shuffle_ps(tmp19885, tmp19887, 68);
__m512 tmp19902 = _mm512_shuffle_ps(tmp19885, tmp19887, 238);
__m512 tmp19903 = _mm512_shuffle_ps(tmp19886, tmp19888, 68);
__m512 tmp19904 = _mm512_shuffle_ps(tmp19886, tmp19888, 238);
__m512 tmp19905 = _mm512_shuffle_ps(tmp19889, tmp19891, 68);
__m512 tmp19906 = _mm512_shuffle_ps(tmp19889, tmp19891, 238);
__m512 tmp19907 = _mm512_shuffle_ps(tmp19890, tmp19892, 68);
__m512 tmp19908 = _mm512_shuffle_ps(tmp19890, tmp19892, 238);
__m512 tmp19909 = _mm512_shuffle_f32x4(tmp19893, tmp19897, 136);
__m512 tmp19910 = _mm512_shuffle_f32x4(tmp19893, tmp19897, 221);
__m512 tmp19911 = _mm512_shuffle_f32x4(tmp19894, tmp19898, 136);
__m512 tmp19912 = _mm512_shuffle_f32x4(tmp19894, tmp19898, 221);
__m512 tmp19913 = _mm512_shuffle_f32x4(tmp19895, tmp19899, 136);
__m512 tmp19914 = _mm512_shuffle_f32x4(tmp19895, tmp19899, 221);
__m512 tmp19915 = _mm512_shuffle_f32x4(tmp19896, tmp19900, 136);
__m512 tmp19916 = _mm512_shuffle_f32x4(tmp19896, tmp19900, 221);
__m512 tmp19917 = _mm512_shuffle_f32x4(tmp19901, tmp19905, 136);
__m512 tmp19918 = _mm512_shuffle_f32x4(tmp19901, tmp19905, 221);
__m512 tmp19919 = _mm512_shuffle_f32x4(tmp19902, tmp19906, 136);
__m512 tmp19920 = _mm512_shuffle_f32x4(tmp19902, tmp19906, 221);
__m512 tmp19921 = _mm512_shuffle_f32x4(tmp19903, tmp19907, 136);
__m512 tmp19922 = _mm512_shuffle_f32x4(tmp19903, tmp19907, 221);
__m512 tmp19923 = _mm512_shuffle_f32x4(tmp19904, tmp19908, 136);
__m512 tmp19924 = _mm512_shuffle_f32x4(tmp19904, tmp19908, 221);
wt875 = _mm512_shuffle_f32x4(tmp19909, tmp19917, 136);
wt883 = _mm512_shuffle_f32x4(tmp19909, tmp19917, 221);
wt876 = _mm512_shuffle_f32x4(tmp19911, tmp19919, 136);
wt884 = _mm512_shuffle_f32x4(tmp19911, tmp19919, 221);
wt877 = _mm512_shuffle_f32x4(tmp19913, tmp19921, 136);
wt885 = _mm512_shuffle_f32x4(tmp19913, tmp19921, 221);
wt878 = _mm512_shuffle_f32x4(tmp19915, tmp19923, 136);
wt886 = _mm512_shuffle_f32x4(tmp19915, tmp19923, 221);
wt879 = _mm512_shuffle_f32x4(tmp19910, tmp19918, 136);
wt887 = _mm512_shuffle_f32x4(tmp19910, tmp19918, 221);
wt880 = _mm512_shuffle_f32x4(tmp19912, tmp19920, 136);
wt888 = _mm512_shuffle_f32x4(tmp19912, tmp19920, 221);
wt881 = _mm512_shuffle_f32x4(tmp19914, tmp19922, 136);
wt889 = _mm512_shuffle_f32x4(tmp19914, tmp19922, 221);
wt882 = _mm512_shuffle_f32x4(tmp19916, tmp19924, 136);
wt890 = _mm512_shuffle_f32x4(tmp19916, tmp19924, 221);
wt875 = _mm512_mul_ps(wt875, postMul81);
wt876 = _mm512_mul_ps(wt876, postMul81);
wt877 = _mm512_mul_ps(wt877, postMul81);
wt878 = _mm512_mul_ps(wt878, postMul81);
wt879 = _mm512_mul_ps(wt879, postMul81);
wt880 = _mm512_mul_ps(wt880, postMul81);
wt881 = _mm512_mul_ps(wt881, postMul81);
wt882 = _mm512_mul_ps(wt882, postMul81);
wt883 = _mm512_mul_ps(wt883, postMul81);
wt884 = _mm512_mul_ps(wt884, postMul81);
wt885 = _mm512_mul_ps(wt885, postMul81);
wt886 = _mm512_mul_ps(wt886, postMul81);
wt887 = _mm512_mul_ps(wt887, postMul81);
wt888 = _mm512_mul_ps(wt888, postMul81);
wt889 = _mm512_mul_ps(wt889, postMul81);
wt890 = _mm512_mul_ps(wt890, postMul81);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c63)+(ptrdiff_t)0, 63>>cut35, wt875);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c63)+(ptrdiff_t)0, 63>>cut35, wt876);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c63)+(ptrdiff_t)0, 63>>cut35, wt877);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(4+16*c63)+(ptrdiff_t)0, 63>>cut35, wt878);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(5+16*c63)+(ptrdiff_t)0, 63>>cut35, wt879);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(6+16*c63)+(ptrdiff_t)0, 63>>cut35, wt880);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(7+16*c63)+(ptrdiff_t)0, 63>>cut35, wt881);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(8+16*c63)+(ptrdiff_t)0, 63>>cut35, wt882);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(9+16*c63)+(ptrdiff_t)0, 63>>cut35, wt883);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(10+16*c63)+(ptrdiff_t)0, 63>>cut35, wt884);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(11+16*c63)+(ptrdiff_t)0, 63>>cut35, wt885);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(12+16*c63)+(ptrdiff_t)0, 63>>cut35, wt886);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(13+16*c63)+(ptrdiff_t)0, 63>>cut35, wt887);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(14+16*c63)+(ptrdiff_t)0, 63>>cut35, wt888);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(15+16*c63)+(ptrdiff_t)0, 63>>cut35, wt889);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(16+16*c63)+(ptrdiff_t)0, 63>>cut35, wt890);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt875);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt876);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt877);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(4+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt878);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(5+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt879);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(6+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt880);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(7+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt881);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(8+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt882);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(9+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt883);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(10+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt884);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(11+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt885);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(12+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt886);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(13+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt887);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(14+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt888);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(15+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt889);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(16+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt890);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt875);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt876);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt877);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(4+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt878);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(5+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt879);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(6+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt880);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(7+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt881);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(8+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt882);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(9+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt883);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(10+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt884);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(11+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt885);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(12+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt886);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(13+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt887);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(14+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt888);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(15+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt889);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(16+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt890);
}
__m512 wt891 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)0);
__m512 wt892 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)8192);
__m512 wt893 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)16384);
__m512 wt894 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)24576);
__m512 wt895 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)32768);
__m512 wt896 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)40960);
__m512 wt897 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)49152);
__m512 wt898 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)57344);
__m512 wt899 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)65536);
__m512 wt900 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)73728);
__m512 wt901 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)81920);
__m512 wt902 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)90112);
__m512 wt903 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)98304);
__m512 wt904 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)106496);
__m512 wt905 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)114688);
__m512 wt906 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c63+(ptrdiff_t)122880);
__m512 tmp19925 = _mm512_unpacklo_ps(wt891, wt892);
__m512 tmp19926 = _mm512_unpackhi_ps(wt891, wt892);
__m512 tmp19927 = _mm512_unpacklo_ps(wt893, wt894);
__m512 tmp19928 = _mm512_unpackhi_ps(wt893, wt894);
__m512 tmp19929 = _mm512_unpacklo_ps(wt895, wt896);
__m512 tmp19930 = _mm512_unpackhi_ps(wt895, wt896);
__m512 tmp19931 = _mm512_unpacklo_ps(wt897, wt898);
__m512 tmp19932 = _mm512_unpackhi_ps(wt897, wt898);
__m512 tmp19933 = _mm512_unpacklo_ps(wt899, wt900);
__m512 tmp19934 = _mm512_unpackhi_ps(wt899, wt900);
__m512 tmp19935 = _mm512_unpacklo_ps(wt901, wt902);
__m512 tmp19936 = _mm512_unpackhi_ps(wt901, wt902);
__m512 tmp19937 = _mm512_unpacklo_ps(wt903, wt904);
__m512 tmp19938 = _mm512_unpackhi_ps(wt903, wt904);
__m512 tmp19939 = _mm512_unpacklo_ps(wt905, wt906);
__m512 tmp19940 = _mm512_unpackhi_ps(wt905, wt906);
__m512 tmp19941 = _mm512_shuffle_ps(tmp19925, tmp19927, 68);
__m512 tmp19942 = _mm512_shuffle_ps(tmp19925, tmp19927, 238);
__m512 tmp19943 = _mm512_shuffle_ps(tmp19926, tmp19928, 68);
__m512 tmp19944 = _mm512_shuffle_ps(tmp19929, tmp19931, 68);
__m512 tmp19945 = _mm512_shuffle_ps(tmp19929, tmp19931, 238);
__m512 tmp19946 = _mm512_shuffle_ps(tmp19930, tmp19932, 68);
__m512 tmp19947 = _mm512_shuffle_ps(tmp19933, tmp19935, 68);
__m512 tmp19948 = _mm512_shuffle_ps(tmp19933, tmp19935, 238);
__m512 tmp19949 = _mm512_shuffle_ps(tmp19934, tmp19936, 68);
__m512 tmp19950 = _mm512_shuffle_ps(tmp19937, tmp19939, 68);
__m512 tmp19951 = _mm512_shuffle_ps(tmp19937, tmp19939, 238);
__m512 tmp19952 = _mm512_shuffle_ps(tmp19938, tmp19940, 68);
__m512 tmp19953 = _mm512_shuffle_f32x4(tmp19941, tmp19944, 136);
__m512 tmp19954 = _mm512_shuffle_f32x4(tmp19942, tmp19945, 136);
__m512 tmp19955 = _mm512_shuffle_f32x4(tmp19943, tmp19946, 136);
__m512 tmp19956 = _mm512_shuffle_f32x4(tmp19947, tmp19950, 136);
__m512 tmp19957 = _mm512_shuffle_f32x4(tmp19948, tmp19951, 136);
__m512 tmp19958 = _mm512_shuffle_f32x4(tmp19949, tmp19952, 136);
wt891 = _mm512_shuffle_f32x4(tmp19953, tmp19956, 136);
wt892 = _mm512_shuffle_f32x4(tmp19954, tmp19957, 136);
wt893 = _mm512_shuffle_f32x4(tmp19955, tmp19958, 136);
wt891 = _mm512_mul_ps(wt891, postMul81);
wt892 = _mm512_mul_ps(wt892, postMul81);
wt893 = _mm512_mul_ps(wt893, postMul81);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c63)+(ptrdiff_t)0, 63>>cut35, wt891);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c63)+(ptrdiff_t)0, 63>>cut35, wt892);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c63)+(ptrdiff_t)0, 63>>cut35, wt893);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt891);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt892);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c63)+(ptrdiff_t)20040, 4032>>cut35, wt893);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt891);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt892);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c63)+(ptrdiff_t)40080, 65535-(4095>>cut35), wt893);
break;
}
default: {
cut35 = 4;
__m512 sum721;
if (!e38) {
sum721 = _mm512_maskz_loadu_ps(65535, biasPtr24+2048*i77+4*k185);
} else {
sum721 = _mm512_setzero_ps();
}
__m512i pmMul51 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd51 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo43 = _mm512_loadu_ps(bnPtr24+(ptrdiff_t)8*(k185+512*i77));
__m512 masHi43 = _mm512_maskz_loadu_ps(65535, bnPtr24+(ptrdiff_t)8*(k185+512*i77)+(ptrdiff_t)64);
__m512 postMul82 = _mm512_permutex2var_ps(masLo43, pmMul51, masHi43);
__m512 postAdd52 = _mm512_permutex2var_ps(masLo43, pmAdd51, masHi43);
if (!e38) sum721 = _mm512_fmadd_ps(sum721, postMul82, postAdd52);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*0+(ptrdiff_t)0, 63>>cut35, sum721);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*0+(ptrdiff_t)20040, 4032>>cut35, sum721);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*0+(ptrdiff_t)40080, 258048>>cut35, sum721);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*0+(ptrdiff_t)60120, 65535-(262143>>cut35), sum721);
ptrdiff_t c64 = 0;
for (; c64 != 52; ++c64) {
__m512 wt907 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)0);
__m512 wt908 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)8192);
__m512 wt909 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)16384);
__m512 wt910 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)24576);
__m512 wt911 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)32768);
__m512 wt912 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)40960);
__m512 wt913 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)49152);
__m512 wt914 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)57344);
__m512 wt915 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)65536);
__m512 wt916 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)73728);
__m512 wt917 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)81920);
__m512 wt918 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)90112);
__m512 wt919 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)98304);
__m512 wt920 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)106496);
__m512 wt921 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)114688);
__m512 wt922 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)122880);
__m512 tmp19959 = _mm512_unpacklo_ps(wt907, wt908);
__m512 tmp19960 = _mm512_unpackhi_ps(wt907, wt908);
__m512 tmp19961 = _mm512_unpacklo_ps(wt909, wt910);
__m512 tmp19962 = _mm512_unpackhi_ps(wt909, wt910);
__m512 tmp19963 = _mm512_unpacklo_ps(wt911, wt912);
__m512 tmp19964 = _mm512_unpackhi_ps(wt911, wt912);
__m512 tmp19965 = _mm512_unpacklo_ps(wt913, wt914);
__m512 tmp19966 = _mm512_unpackhi_ps(wt913, wt914);
__m512 tmp19967 = _mm512_unpacklo_ps(wt915, wt916);
__m512 tmp19968 = _mm512_unpackhi_ps(wt915, wt916);
__m512 tmp19969 = _mm512_unpacklo_ps(wt917, wt918);
__m512 tmp19970 = _mm512_unpackhi_ps(wt917, wt918);
__m512 tmp19971 = _mm512_unpacklo_ps(wt919, wt920);
__m512 tmp19972 = _mm512_unpackhi_ps(wt919, wt920);
__m512 tmp19973 = _mm512_unpacklo_ps(wt921, wt922);
__m512 tmp19974 = _mm512_unpackhi_ps(wt921, wt922);
__m512 tmp19975 = _mm512_shuffle_ps(tmp19959, tmp19961, 68);
__m512 tmp19976 = _mm512_shuffle_ps(tmp19959, tmp19961, 238);
__m512 tmp19977 = _mm512_shuffle_ps(tmp19960, tmp19962, 68);
__m512 tmp19978 = _mm512_shuffle_ps(tmp19960, tmp19962, 238);
__m512 tmp19979 = _mm512_shuffle_ps(tmp19963, tmp19965, 68);
__m512 tmp19980 = _mm512_shuffle_ps(tmp19963, tmp19965, 238);
__m512 tmp19981 = _mm512_shuffle_ps(tmp19964, tmp19966, 68);
__m512 tmp19982 = _mm512_shuffle_ps(tmp19964, tmp19966, 238);
__m512 tmp19983 = _mm512_shuffle_ps(tmp19967, tmp19969, 68);
__m512 tmp19984 = _mm512_shuffle_ps(tmp19967, tmp19969, 238);
__m512 tmp19985 = _mm512_shuffle_ps(tmp19968, tmp19970, 68);
__m512 tmp19986 = _mm512_shuffle_ps(tmp19968, tmp19970, 238);
__m512 tmp19987 = _mm512_shuffle_ps(tmp19971, tmp19973, 68);
__m512 tmp19988 = _mm512_shuffle_ps(tmp19971, tmp19973, 238);
__m512 tmp19989 = _mm512_shuffle_ps(tmp19972, tmp19974, 68);
__m512 tmp19990 = _mm512_shuffle_ps(tmp19972, tmp19974, 238);
__m512 tmp19991 = _mm512_shuffle_f32x4(tmp19975, tmp19979, 136);
__m512 tmp19992 = _mm512_shuffle_f32x4(tmp19975, tmp19979, 221);
__m512 tmp19993 = _mm512_shuffle_f32x4(tmp19976, tmp19980, 136);
__m512 tmp19994 = _mm512_shuffle_f32x4(tmp19976, tmp19980, 221);
__m512 tmp19995 = _mm512_shuffle_f32x4(tmp19977, tmp19981, 136);
__m512 tmp19996 = _mm512_shuffle_f32x4(tmp19977, tmp19981, 221);
__m512 tmp19997 = _mm512_shuffle_f32x4(tmp19978, tmp19982, 136);
__m512 tmp19998 = _mm512_shuffle_f32x4(tmp19978, tmp19982, 221);
__m512 tmp19999 = _mm512_shuffle_f32x4(tmp19983, tmp19987, 136);
__m512 tmp20000 = _mm512_shuffle_f32x4(tmp19983, tmp19987, 221);
__m512 tmp20001 = _mm512_shuffle_f32x4(tmp19984, tmp19988, 136);
__m512 tmp20002 = _mm512_shuffle_f32x4(tmp19984, tmp19988, 221);
__m512 tmp20003 = _mm512_shuffle_f32x4(tmp19985, tmp19989, 136);
__m512 tmp20004 = _mm512_shuffle_f32x4(tmp19985, tmp19989, 221);
__m512 tmp20005 = _mm512_shuffle_f32x4(tmp19986, tmp19990, 136);
__m512 tmp20006 = _mm512_shuffle_f32x4(tmp19986, tmp19990, 221);
wt907 = _mm512_shuffle_f32x4(tmp19991, tmp19999, 136);
wt915 = _mm512_shuffle_f32x4(tmp19991, tmp19999, 221);
wt908 = _mm512_shuffle_f32x4(tmp19993, tmp20001, 136);
wt916 = _mm512_shuffle_f32x4(tmp19993, tmp20001, 221);
wt909 = _mm512_shuffle_f32x4(tmp19995, tmp20003, 136);
wt917 = _mm512_shuffle_f32x4(tmp19995, tmp20003, 221);
wt910 = _mm512_shuffle_f32x4(tmp19997, tmp20005, 136);
wt918 = _mm512_shuffle_f32x4(tmp19997, tmp20005, 221);
wt911 = _mm512_shuffle_f32x4(tmp19992, tmp20000, 136);
wt919 = _mm512_shuffle_f32x4(tmp19992, tmp20000, 221);
wt912 = _mm512_shuffle_f32x4(tmp19994, tmp20002, 136);
wt920 = _mm512_shuffle_f32x4(tmp19994, tmp20002, 221);
wt913 = _mm512_shuffle_f32x4(tmp19996, tmp20004, 136);
wt921 = _mm512_shuffle_f32x4(tmp19996, tmp20004, 221);
wt914 = _mm512_shuffle_f32x4(tmp19998, tmp20006, 136);
wt922 = _mm512_shuffle_f32x4(tmp19998, tmp20006, 221);
wt907 = _mm512_mul_ps(wt907, postMul82);
wt908 = _mm512_mul_ps(wt908, postMul82);
wt909 = _mm512_mul_ps(wt909, postMul82);
wt910 = _mm512_mul_ps(wt910, postMul82);
wt911 = _mm512_mul_ps(wt911, postMul82);
wt912 = _mm512_mul_ps(wt912, postMul82);
wt913 = _mm512_mul_ps(wt913, postMul82);
wt914 = _mm512_mul_ps(wt914, postMul82);
wt915 = _mm512_mul_ps(wt915, postMul82);
wt916 = _mm512_mul_ps(wt916, postMul82);
wt917 = _mm512_mul_ps(wt917, postMul82);
wt918 = _mm512_mul_ps(wt918, postMul82);
wt919 = _mm512_mul_ps(wt919, postMul82);
wt920 = _mm512_mul_ps(wt920, postMul82);
wt921 = _mm512_mul_ps(wt921, postMul82);
wt922 = _mm512_mul_ps(wt922, postMul82);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c64)+(ptrdiff_t)0, 63>>cut35, wt907);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c64)+(ptrdiff_t)0, 63>>cut35, wt908);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c64)+(ptrdiff_t)0, 63>>cut35, wt909);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(4+16*c64)+(ptrdiff_t)0, 63>>cut35, wt910);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(5+16*c64)+(ptrdiff_t)0, 63>>cut35, wt911);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(6+16*c64)+(ptrdiff_t)0, 63>>cut35, wt912);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(7+16*c64)+(ptrdiff_t)0, 63>>cut35, wt913);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(8+16*c64)+(ptrdiff_t)0, 63>>cut35, wt914);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(9+16*c64)+(ptrdiff_t)0, 63>>cut35, wt915);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(10+16*c64)+(ptrdiff_t)0, 63>>cut35, wt916);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(11+16*c64)+(ptrdiff_t)0, 63>>cut35, wt917);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(12+16*c64)+(ptrdiff_t)0, 63>>cut35, wt918);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(13+16*c64)+(ptrdiff_t)0, 63>>cut35, wt919);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(14+16*c64)+(ptrdiff_t)0, 63>>cut35, wt920);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(15+16*c64)+(ptrdiff_t)0, 63>>cut35, wt921);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(16+16*c64)+(ptrdiff_t)0, 63>>cut35, wt922);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt907);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt908);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt909);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(4+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt910);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(5+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt911);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(6+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt912);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(7+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt913);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(8+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt914);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(9+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt915);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(10+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt916);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(11+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt917);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(12+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt918);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(13+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt919);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(14+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt920);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(15+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt921);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(16+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt922);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt907);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt908);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt909);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(4+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt910);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(5+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt911);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(6+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt912);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(7+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt913);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(8+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt914);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(9+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt915);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(10+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt916);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(11+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt917);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(12+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt918);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(13+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt919);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(14+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt920);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(15+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt921);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(16+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt922);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt907);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt908);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt909);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(4+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt910);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(5+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt911);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(6+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt912);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(7+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt913);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(8+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt914);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(9+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt915);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(10+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt916);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(11+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt917);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(12+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt918);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(13+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt919);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(14+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt920);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(15+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt921);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(16+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt922);
}
__m512 wt923 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)0);
__m512 wt924 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)8192);
__m512 wt925 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)16384);
__m512 wt926 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)24576);
__m512 wt927 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)32768);
__m512 wt928 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)40960);
__m512 wt929 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)49152);
__m512 wt930 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)57344);
__m512 wt931 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)65536);
__m512 wt932 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)73728);
__m512 wt933 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)81920);
__m512 wt934 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)90112);
__m512 wt935 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)98304);
__m512 wt936 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)106496);
__m512 wt937 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)114688);
__m512 wt938 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k185+64*c64+(ptrdiff_t)122880);
__m512 tmp20007 = _mm512_unpacklo_ps(wt923, wt924);
__m512 tmp20008 = _mm512_unpackhi_ps(wt923, wt924);
__m512 tmp20009 = _mm512_unpacklo_ps(wt925, wt926);
__m512 tmp20010 = _mm512_unpackhi_ps(wt925, wt926);
__m512 tmp20011 = _mm512_unpacklo_ps(wt927, wt928);
__m512 tmp20012 = _mm512_unpackhi_ps(wt927, wt928);
__m512 tmp20013 = _mm512_unpacklo_ps(wt929, wt930);
__m512 tmp20014 = _mm512_unpackhi_ps(wt929, wt930);
__m512 tmp20015 = _mm512_unpacklo_ps(wt931, wt932);
__m512 tmp20016 = _mm512_unpackhi_ps(wt931, wt932);
__m512 tmp20017 = _mm512_unpacklo_ps(wt933, wt934);
__m512 tmp20018 = _mm512_unpackhi_ps(wt933, wt934);
__m512 tmp20019 = _mm512_unpacklo_ps(wt935, wt936);
__m512 tmp20020 = _mm512_unpackhi_ps(wt935, wt936);
__m512 tmp20021 = _mm512_unpacklo_ps(wt937, wt938);
__m512 tmp20022 = _mm512_unpackhi_ps(wt937, wt938);
__m512 tmp20023 = _mm512_shuffle_ps(tmp20007, tmp20009, 68);
__m512 tmp20024 = _mm512_shuffle_ps(tmp20007, tmp20009, 238);
__m512 tmp20025 = _mm512_shuffle_ps(tmp20008, tmp20010, 68);
__m512 tmp20026 = _mm512_shuffle_ps(tmp20011, tmp20013, 68);
__m512 tmp20027 = _mm512_shuffle_ps(tmp20011, tmp20013, 238);
__m512 tmp20028 = _mm512_shuffle_ps(tmp20012, tmp20014, 68);
__m512 tmp20029 = _mm512_shuffle_ps(tmp20015, tmp20017, 68);
__m512 tmp20030 = _mm512_shuffle_ps(tmp20015, tmp20017, 238);
__m512 tmp20031 = _mm512_shuffle_ps(tmp20016, tmp20018, 68);
__m512 tmp20032 = _mm512_shuffle_ps(tmp20019, tmp20021, 68);
__m512 tmp20033 = _mm512_shuffle_ps(tmp20019, tmp20021, 238);
__m512 tmp20034 = _mm512_shuffle_ps(tmp20020, tmp20022, 68);
__m512 tmp20035 = _mm512_shuffle_f32x4(tmp20023, tmp20026, 136);
__m512 tmp20036 = _mm512_shuffle_f32x4(tmp20024, tmp20027, 136);
__m512 tmp20037 = _mm512_shuffle_f32x4(tmp20025, tmp20028, 136);
__m512 tmp20038 = _mm512_shuffle_f32x4(tmp20029, tmp20032, 136);
__m512 tmp20039 = _mm512_shuffle_f32x4(tmp20030, tmp20033, 136);
__m512 tmp20040 = _mm512_shuffle_f32x4(tmp20031, tmp20034, 136);
wt923 = _mm512_shuffle_f32x4(tmp20035, tmp20038, 136);
wt924 = _mm512_shuffle_f32x4(tmp20036, tmp20039, 136);
wt925 = _mm512_shuffle_f32x4(tmp20037, tmp20040, 136);
wt923 = _mm512_mul_ps(wt923, postMul82);
wt924 = _mm512_mul_ps(wt924, postMul82);
wt925 = _mm512_mul_ps(wt925, postMul82);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c64)+(ptrdiff_t)0, 63>>cut35, wt923);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c64)+(ptrdiff_t)0, 63>>cut35, wt924);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c64)+(ptrdiff_t)0, 63>>cut35, wt925);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt923);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt924);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c64)+(ptrdiff_t)20040, 4032>>cut35, wt925);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt923);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt924);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c64)+(ptrdiff_t)40080, 258048>>cut35, wt925);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(1+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt923);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(2+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt924);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l82+4*cut35+24*(3+16*c64)+(ptrdiff_t)60120, 65535-(262143>>cut35), wt925);
}
}
} else {
ptrdiff_t k184 = 496;
ptrdiff_t l81 = (size_t)(0+k184)/6;
ptrdiff_t cut34 = (size_t)(0+k184)%6;
__m512 sum719;
if (!e38) {
sum719 = _mm512_maskz_loadu_ps(65535, biasPtr24+2048*i77+4*k184);
} else {
sum719 = _mm512_setzero_ps();
}
__m512i pmMul52 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd52 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo44 = _mm512_loadu_ps(bnPtr24+(ptrdiff_t)8*(k184+512*i77));
__m512 masHi44 = _mm512_maskz_loadu_ps(65535, bnPtr24+(ptrdiff_t)8*(k184+512*i77)+(ptrdiff_t)64);
__m512 postMul80 = _mm512_permutex2var_ps(masLo44, pmMul52, masHi44);
__m512 postAdd50 = _mm512_permutex2var_ps(masLo44, pmAdd52, masHi44);
if (!e38) sum719 = _mm512_fmadd_ps(sum719, postMul80, postAdd50);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*0+(ptrdiff_t)0, 63>>cut34, sum719);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*0+(ptrdiff_t)20040, 4032>>cut34, sum719);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*0+(ptrdiff_t)40080, 258048>>cut34, sum719);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*0+(ptrdiff_t)60120, 65535-(262143>>cut34), sum719);
ptrdiff_t c62 = 0;
for (; c62 != 52; ++c62) {
__m512 wt843 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)0);
__m512 wt844 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)8192);
__m512 wt845 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)16384);
__m512 wt846 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)24576);
__m512 wt847 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)32768);
__m512 wt848 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)40960);
__m512 wt849 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)49152);
__m512 wt850 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)57344);
__m512 wt851 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)65536);
__m512 wt852 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)73728);
__m512 wt853 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)81920);
__m512 wt854 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)90112);
__m512 wt855 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)98304);
__m512 wt856 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)106496);
__m512 wt857 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)114688);
__m512 wt858 = _mm512_maskz_loadu_ps(65535, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)122880);
__m512 tmp20041 = _mm512_unpacklo_ps(wt843, wt844);
__m512 tmp20042 = _mm512_unpackhi_ps(wt843, wt844);
__m512 tmp20043 = _mm512_unpacklo_ps(wt845, wt846);
__m512 tmp20044 = _mm512_unpackhi_ps(wt845, wt846);
__m512 tmp20045 = _mm512_unpacklo_ps(wt847, wt848);
__m512 tmp20046 = _mm512_unpackhi_ps(wt847, wt848);
__m512 tmp20047 = _mm512_unpacklo_ps(wt849, wt850);
__m512 tmp20048 = _mm512_unpackhi_ps(wt849, wt850);
__m512 tmp20049 = _mm512_unpacklo_ps(wt851, wt852);
__m512 tmp20050 = _mm512_unpackhi_ps(wt851, wt852);
__m512 tmp20051 = _mm512_unpacklo_ps(wt853, wt854);
__m512 tmp20052 = _mm512_unpackhi_ps(wt853, wt854);
__m512 tmp20053 = _mm512_unpacklo_ps(wt855, wt856);
__m512 tmp20054 = _mm512_unpackhi_ps(wt855, wt856);
__m512 tmp20055 = _mm512_unpacklo_ps(wt857, wt858);
__m512 tmp20056 = _mm512_unpackhi_ps(wt857, wt858);
__m512 tmp20057 = _mm512_shuffle_ps(tmp20041, tmp20043, 68);
__m512 tmp20058 = _mm512_shuffle_ps(tmp20041, tmp20043, 238);
__m512 tmp20059 = _mm512_shuffle_ps(tmp20042, tmp20044, 68);
__m512 tmp20060 = _mm512_shuffle_ps(tmp20042, tmp20044, 238);
__m512 tmp20061 = _mm512_shuffle_ps(tmp20045, tmp20047, 68);
__m512 tmp20062 = _mm512_shuffle_ps(tmp20045, tmp20047, 238);
__m512 tmp20063 = _mm512_shuffle_ps(tmp20046, tmp20048, 68);
__m512 tmp20064 = _mm512_shuffle_ps(tmp20046, tmp20048, 238);
__m512 tmp20065 = _mm512_shuffle_ps(tmp20049, tmp20051, 68);
__m512 tmp20066 = _mm512_shuffle_ps(tmp20049, tmp20051, 238);
__m512 tmp20067 = _mm512_shuffle_ps(tmp20050, tmp20052, 68);
__m512 tmp20068 = _mm512_shuffle_ps(tmp20050, tmp20052, 238);
__m512 tmp20069 = _mm512_shuffle_ps(tmp20053, tmp20055, 68);
__m512 tmp20070 = _mm512_shuffle_ps(tmp20053, tmp20055, 238);
__m512 tmp20071 = _mm512_shuffle_ps(tmp20054, tmp20056, 68);
__m512 tmp20072 = _mm512_shuffle_ps(tmp20054, tmp20056, 238);
__m512 tmp20073 = _mm512_shuffle_f32x4(tmp20057, tmp20061, 136);
__m512 tmp20074 = _mm512_shuffle_f32x4(tmp20057, tmp20061, 221);
__m512 tmp20075 = _mm512_shuffle_f32x4(tmp20058, tmp20062, 136);
__m512 tmp20076 = _mm512_shuffle_f32x4(tmp20058, tmp20062, 221);
__m512 tmp20077 = _mm512_shuffle_f32x4(tmp20059, tmp20063, 136);
__m512 tmp20078 = _mm512_shuffle_f32x4(tmp20059, tmp20063, 221);
__m512 tmp20079 = _mm512_shuffle_f32x4(tmp20060, tmp20064, 136);
__m512 tmp20080 = _mm512_shuffle_f32x4(tmp20060, tmp20064, 221);
__m512 tmp20081 = _mm512_shuffle_f32x4(tmp20065, tmp20069, 136);
__m512 tmp20082 = _mm512_shuffle_f32x4(tmp20065, tmp20069, 221);
__m512 tmp20083 = _mm512_shuffle_f32x4(tmp20066, tmp20070, 136);
__m512 tmp20084 = _mm512_shuffle_f32x4(tmp20066, tmp20070, 221);
__m512 tmp20085 = _mm512_shuffle_f32x4(tmp20067, tmp20071, 136);
__m512 tmp20086 = _mm512_shuffle_f32x4(tmp20067, tmp20071, 221);
__m512 tmp20087 = _mm512_shuffle_f32x4(tmp20068, tmp20072, 136);
__m512 tmp20088 = _mm512_shuffle_f32x4(tmp20068, tmp20072, 221);
wt843 = _mm512_shuffle_f32x4(tmp20073, tmp20081, 136);
wt851 = _mm512_shuffle_f32x4(tmp20073, tmp20081, 221);
wt844 = _mm512_shuffle_f32x4(tmp20075, tmp20083, 136);
wt852 = _mm512_shuffle_f32x4(tmp20075, tmp20083, 221);
wt845 = _mm512_shuffle_f32x4(tmp20077, tmp20085, 136);
wt853 = _mm512_shuffle_f32x4(tmp20077, tmp20085, 221);
wt846 = _mm512_shuffle_f32x4(tmp20079, tmp20087, 136);
wt854 = _mm512_shuffle_f32x4(tmp20079, tmp20087, 221);
wt847 = _mm512_shuffle_f32x4(tmp20074, tmp20082, 136);
wt855 = _mm512_shuffle_f32x4(tmp20074, tmp20082, 221);
wt848 = _mm512_shuffle_f32x4(tmp20076, tmp20084, 136);
wt856 = _mm512_shuffle_f32x4(tmp20076, tmp20084, 221);
wt849 = _mm512_shuffle_f32x4(tmp20078, tmp20086, 136);
wt857 = _mm512_shuffle_f32x4(tmp20078, tmp20086, 221);
wt850 = _mm512_shuffle_f32x4(tmp20080, tmp20088, 136);
wt858 = _mm512_shuffle_f32x4(tmp20080, tmp20088, 221);
wt843 = _mm512_mul_ps(wt843, postMul80);
wt844 = _mm512_mul_ps(wt844, postMul80);
wt845 = _mm512_mul_ps(wt845, postMul80);
wt846 = _mm512_mul_ps(wt846, postMul80);
wt847 = _mm512_mul_ps(wt847, postMul80);
wt848 = _mm512_mul_ps(wt848, postMul80);
wt849 = _mm512_mul_ps(wt849, postMul80);
wt850 = _mm512_mul_ps(wt850, postMul80);
wt851 = _mm512_mul_ps(wt851, postMul80);
wt852 = _mm512_mul_ps(wt852, postMul80);
wt853 = _mm512_mul_ps(wt853, postMul80);
wt854 = _mm512_mul_ps(wt854, postMul80);
wt855 = _mm512_mul_ps(wt855, postMul80);
wt856 = _mm512_mul_ps(wt856, postMul80);
wt857 = _mm512_mul_ps(wt857, postMul80);
wt858 = _mm512_mul_ps(wt858, postMul80);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(1+16*c62)+(ptrdiff_t)0, 63>>cut34, wt843);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(2+16*c62)+(ptrdiff_t)0, 63>>cut34, wt844);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(3+16*c62)+(ptrdiff_t)0, 63>>cut34, wt845);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(4+16*c62)+(ptrdiff_t)0, 63>>cut34, wt846);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(5+16*c62)+(ptrdiff_t)0, 63>>cut34, wt847);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(6+16*c62)+(ptrdiff_t)0, 63>>cut34, wt848);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(7+16*c62)+(ptrdiff_t)0, 63>>cut34, wt849);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(8+16*c62)+(ptrdiff_t)0, 63>>cut34, wt850);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(9+16*c62)+(ptrdiff_t)0, 63>>cut34, wt851);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(10+16*c62)+(ptrdiff_t)0, 63>>cut34, wt852);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(11+16*c62)+(ptrdiff_t)0, 63>>cut34, wt853);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(12+16*c62)+(ptrdiff_t)0, 63>>cut34, wt854);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(13+16*c62)+(ptrdiff_t)0, 63>>cut34, wt855);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(14+16*c62)+(ptrdiff_t)0, 63>>cut34, wt856);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(15+16*c62)+(ptrdiff_t)0, 63>>cut34, wt857);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(16+16*c62)+(ptrdiff_t)0, 63>>cut34, wt858);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(1+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt843);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(2+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt844);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(3+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt845);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(4+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt846);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(5+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt847);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(6+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt848);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(7+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt849);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(8+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt850);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(9+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt851);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(10+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt852);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(11+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt853);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(12+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt854);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(13+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt855);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(14+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt856);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(15+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt857);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(16+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt858);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(1+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt843);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(2+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt844);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(3+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt845);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(4+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt846);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(5+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt847);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(6+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt848);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(7+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt849);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(8+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt850);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(9+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt851);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(10+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt852);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(11+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt853);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(12+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt854);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(13+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt855);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(14+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt856);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(15+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt857);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(16+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt858);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(1+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt843);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(2+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt844);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(3+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt845);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(4+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt846);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(5+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt847);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(6+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt848);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(7+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt849);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(8+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt850);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(9+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt851);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(10+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt852);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(11+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt853);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(12+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt854);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(13+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt855);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(14+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt856);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(15+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt857);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(16+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt858);
}
__m512 wt859 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)0);
__m512 wt860 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)8192);
__m512 wt861 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)16384);
__m512 wt862 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)24576);
__m512 wt863 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)32768);
__m512 wt864 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)40960);
__m512 wt865 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)49152);
__m512 wt866 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)57344);
__m512 wt867 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)65536);
__m512 wt868 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)73728);
__m512 wt869 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)81920);
__m512 wt870 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)90112);
__m512 wt871 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)98304);
__m512 wt872 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)106496);
__m512 wt873 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)114688);
__m512 wt874 = _mm512_maskz_loadu_ps(7, wtPtr24+4194304*i77+8192*k184+64*c62+(ptrdiff_t)122880);
__m512 tmp20089 = _mm512_unpacklo_ps(wt859, wt860);
__m512 tmp20090 = _mm512_unpackhi_ps(wt859, wt860);
__m512 tmp20091 = _mm512_unpacklo_ps(wt861, wt862);
__m512 tmp20092 = _mm512_unpackhi_ps(wt861, wt862);
__m512 tmp20093 = _mm512_unpacklo_ps(wt863, wt864);
__m512 tmp20094 = _mm512_unpackhi_ps(wt863, wt864);
__m512 tmp20095 = _mm512_unpacklo_ps(wt865, wt866);
__m512 tmp20096 = _mm512_unpackhi_ps(wt865, wt866);
__m512 tmp20097 = _mm512_unpacklo_ps(wt867, wt868);
__m512 tmp20098 = _mm512_unpackhi_ps(wt867, wt868);
__m512 tmp20099 = _mm512_unpacklo_ps(wt869, wt870);
__m512 tmp20100 = _mm512_unpackhi_ps(wt869, wt870);
__m512 tmp20101 = _mm512_unpacklo_ps(wt871, wt872);
__m512 tmp20102 = _mm512_unpackhi_ps(wt871, wt872);
__m512 tmp20103 = _mm512_unpacklo_ps(wt873, wt874);
__m512 tmp20104 = _mm512_unpackhi_ps(wt873, wt874);
__m512 tmp20105 = _mm512_shuffle_ps(tmp20089, tmp20091, 68);
__m512 tmp20106 = _mm512_shuffle_ps(tmp20089, tmp20091, 238);
__m512 tmp20107 = _mm512_shuffle_ps(tmp20090, tmp20092, 68);
__m512 tmp20108 = _mm512_shuffle_ps(tmp20093, tmp20095, 68);
__m512 tmp20109 = _mm512_shuffle_ps(tmp20093, tmp20095, 238);
__m512 tmp20110 = _mm512_shuffle_ps(tmp20094, tmp20096, 68);
__m512 tmp20111 = _mm512_shuffle_ps(tmp20097, tmp20099, 68);
__m512 tmp20112 = _mm512_shuffle_ps(tmp20097, tmp20099, 238);
__m512 tmp20113 = _mm512_shuffle_ps(tmp20098, tmp20100, 68);
__m512 tmp20114 = _mm512_shuffle_ps(tmp20101, tmp20103, 68);
__m512 tmp20115 = _mm512_shuffle_ps(tmp20101, tmp20103, 238);
__m512 tmp20116 = _mm512_shuffle_ps(tmp20102, tmp20104, 68);
__m512 tmp20117 = _mm512_shuffle_f32x4(tmp20105, tmp20108, 136);
__m512 tmp20118 = _mm512_shuffle_f32x4(tmp20106, tmp20109, 136);
__m512 tmp20119 = _mm512_shuffle_f32x4(tmp20107, tmp20110, 136);
__m512 tmp20120 = _mm512_shuffle_f32x4(tmp20111, tmp20114, 136);
__m512 tmp20121 = _mm512_shuffle_f32x4(tmp20112, tmp20115, 136);
__m512 tmp20122 = _mm512_shuffle_f32x4(tmp20113, tmp20116, 136);
wt859 = _mm512_shuffle_f32x4(tmp20117, tmp20120, 136);
wt860 = _mm512_shuffle_f32x4(tmp20118, tmp20121, 136);
wt861 = _mm512_shuffle_f32x4(tmp20119, tmp20122, 136);
wt859 = _mm512_mul_ps(wt859, postMul80);
wt860 = _mm512_mul_ps(wt860, postMul80);
wt861 = _mm512_mul_ps(wt861, postMul80);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(1+16*c62)+(ptrdiff_t)0, 63>>cut34, wt859);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(2+16*c62)+(ptrdiff_t)0, 63>>cut34, wt860);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(3+16*c62)+(ptrdiff_t)0, 63>>cut34, wt861);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(1+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt859);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(2+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt860);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(3+16*c62)+(ptrdiff_t)20040, 4032>>cut34, wt861);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(1+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt859);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(2+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt860);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+24*(3+16*c62)+(ptrdiff_t)40080, 258048>>cut34, wt861);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(1+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt859);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(2+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt860);
_mm512_mask_storeu_ps(arranged23+1712128*i77+20064*l81+4*cut34+8*(3+16*c62)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt861);
}
}
}
return;
}
char*restrict wtPtr25 = tensors134[0]+(ptrdiff_t)3340*2+(ptrdiff_t)4194304*0;
char*restrict bnPtr25 = tensors134[2]+(ptrdiff_t)8*512*0;
char*restrict arranged24 = tensors134[3]+(ptrdiff_t)1712128*2+(ptrdiff_t)776192*0;
ptrdiff_t ii35 = 1;
for (ptrdiff_t i78 = 0; i78 < ii35; ++i78) {
ptrdiff_t j69 = 1*b75;
ptrdiff_t jj56 = j69+1;
for (; j69 < jj56; ++j69) {
if (j69 < 31) {
ptrdiff_t k187 = 0+16*(j69-0);
ptrdiff_t l84 = (size_t)(0+k187)/6;
ptrdiff_t cut37 = (size_t)(0+k187)%6;
switch (cut37) {
case 0:;
case 2: {
__m512 sum723 = _mm512_setzero_ps();
__m512i pmMul53 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd53 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo45 = _mm512_loadu_ps(bnPtr25+(ptrdiff_t)8*(k187+512*i78));
__m512 masHi45 = _mm512_maskz_loadu_ps(65535, bnPtr25+(ptrdiff_t)8*(k187+512*i78)+(ptrdiff_t)64);
__m512 postMul84 = _mm512_permutex2var_ps(masLo45, pmMul53, masHi45);
__m512 postAdd54 = _mm512_permutex2var_ps(masLo45, pmAdd53, masHi45);
(void)postAdd54;
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*0+(ptrdiff_t)0, 63>>cut37, sum723);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*0+(ptrdiff_t)9072, 4032>>cut37, sum723);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*0+(ptrdiff_t)18144, 65535-(4095>>cut37), sum723);
ptrdiff_t c66 = 0;
for (; c66 != 23; ++c66) {
__m512 wt971 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)0);
__m512 wt972 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)8192);
__m512 wt973 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)16384);
__m512 wt974 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)24576);
__m512 wt975 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)32768);
__m512 wt976 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)40960);
__m512 wt977 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)49152);
__m512 wt978 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)57344);
__m512 wt979 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)65536);
__m512 wt980 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)73728);
__m512 wt981 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)81920);
__m512 wt982 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)90112);
__m512 wt983 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)98304);
__m512 wt984 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)106496);
__m512 wt985 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)114688);
__m512 wt986 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)122880);
__m512 tmp20123 = _mm512_unpacklo_ps(wt971, wt972);
__m512 tmp20124 = _mm512_unpackhi_ps(wt971, wt972);
__m512 tmp20125 = _mm512_unpacklo_ps(wt973, wt974);
__m512 tmp20126 = _mm512_unpackhi_ps(wt973, wt974);
__m512 tmp20127 = _mm512_unpacklo_ps(wt975, wt976);
__m512 tmp20128 = _mm512_unpackhi_ps(wt975, wt976);
__m512 tmp20129 = _mm512_unpacklo_ps(wt977, wt978);
__m512 tmp20130 = _mm512_unpackhi_ps(wt977, wt978);
__m512 tmp20131 = _mm512_unpacklo_ps(wt979, wt980);
__m512 tmp20132 = _mm512_unpackhi_ps(wt979, wt980);
__m512 tmp20133 = _mm512_unpacklo_ps(wt981, wt982);
__m512 tmp20134 = _mm512_unpackhi_ps(wt981, wt982);
__m512 tmp20135 = _mm512_unpacklo_ps(wt983, wt984);
__m512 tmp20136 = _mm512_unpackhi_ps(wt983, wt984);
__m512 tmp20137 = _mm512_unpacklo_ps(wt985, wt986);
__m512 tmp20138 = _mm512_unpackhi_ps(wt985, wt986);
__m512 tmp20139 = _mm512_shuffle_ps(tmp20123, tmp20125, 68);
__m512 tmp20140 = _mm512_shuffle_ps(tmp20123, tmp20125, 238);
__m512 tmp20141 = _mm512_shuffle_ps(tmp20124, tmp20126, 68);
__m512 tmp20142 = _mm512_shuffle_ps(tmp20124, tmp20126, 238);
__m512 tmp20143 = _mm512_shuffle_ps(tmp20127, tmp20129, 68);
__m512 tmp20144 = _mm512_shuffle_ps(tmp20127, tmp20129, 238);
__m512 tmp20145 = _mm512_shuffle_ps(tmp20128, tmp20130, 68);
__m512 tmp20146 = _mm512_shuffle_ps(tmp20128, tmp20130, 238);
__m512 tmp20147 = _mm512_shuffle_ps(tmp20131, tmp20133, 68);
__m512 tmp20148 = _mm512_shuffle_ps(tmp20131, tmp20133, 238);
__m512 tmp20149 = _mm512_shuffle_ps(tmp20132, tmp20134, 68);
__m512 tmp20150 = _mm512_shuffle_ps(tmp20132, tmp20134, 238);
__m512 tmp20151 = _mm512_shuffle_ps(tmp20135, tmp20137, 68);
__m512 tmp20152 = _mm512_shuffle_ps(tmp20135, tmp20137, 238);
__m512 tmp20153 = _mm512_shuffle_ps(tmp20136, tmp20138, 68);
__m512 tmp20154 = _mm512_shuffle_ps(tmp20136, tmp20138, 238);
__m512 tmp20155 = _mm512_shuffle_f32x4(tmp20139, tmp20143, 136);
__m512 tmp20156 = _mm512_shuffle_f32x4(tmp20139, tmp20143, 221);
__m512 tmp20157 = _mm512_shuffle_f32x4(tmp20140, tmp20144, 136);
__m512 tmp20158 = _mm512_shuffle_f32x4(tmp20140, tmp20144, 221);
__m512 tmp20159 = _mm512_shuffle_f32x4(tmp20141, tmp20145, 136);
__m512 tmp20160 = _mm512_shuffle_f32x4(tmp20141, tmp20145, 221);
__m512 tmp20161 = _mm512_shuffle_f32x4(tmp20142, tmp20146, 136);
__m512 tmp20162 = _mm512_shuffle_f32x4(tmp20142, tmp20146, 221);
__m512 tmp20163 = _mm512_shuffle_f32x4(tmp20147, tmp20151, 136);
__m512 tmp20164 = _mm512_shuffle_f32x4(tmp20147, tmp20151, 221);
__m512 tmp20165 = _mm512_shuffle_f32x4(tmp20148, tmp20152, 136);
__m512 tmp20166 = _mm512_shuffle_f32x4(tmp20148, tmp20152, 221);
__m512 tmp20167 = _mm512_shuffle_f32x4(tmp20149, tmp20153, 136);
__m512 tmp20168 = _mm512_shuffle_f32x4(tmp20149, tmp20153, 221);
__m512 tmp20169 = _mm512_shuffle_f32x4(tmp20150, tmp20154, 136);
__m512 tmp20170 = _mm512_shuffle_f32x4(tmp20150, tmp20154, 221);
wt971 = _mm512_shuffle_f32x4(tmp20155, tmp20163, 136);
wt979 = _mm512_shuffle_f32x4(tmp20155, tmp20163, 221);
wt972 = _mm512_shuffle_f32x4(tmp20157, tmp20165, 136);
wt980 = _mm512_shuffle_f32x4(tmp20157, tmp20165, 221);
wt973 = _mm512_shuffle_f32x4(tmp20159, tmp20167, 136);
wt981 = _mm512_shuffle_f32x4(tmp20159, tmp20167, 221);
wt974 = _mm512_shuffle_f32x4(tmp20161, tmp20169, 136);
wt982 = _mm512_shuffle_f32x4(tmp20161, tmp20169, 221);
wt975 = _mm512_shuffle_f32x4(tmp20156, tmp20164, 136);
wt983 = _mm512_shuffle_f32x4(tmp20156, tmp20164, 221);
wt976 = _mm512_shuffle_f32x4(tmp20158, tmp20166, 136);
wt984 = _mm512_shuffle_f32x4(tmp20158, tmp20166, 221);
wt977 = _mm512_shuffle_f32x4(tmp20160, tmp20168, 136);
wt985 = _mm512_shuffle_f32x4(tmp20160, tmp20168, 221);
wt978 = _mm512_shuffle_f32x4(tmp20162, tmp20170, 136);
wt986 = _mm512_shuffle_f32x4(tmp20162, tmp20170, 221);
wt971 = _mm512_mul_ps(wt971, postMul84);
wt972 = _mm512_mul_ps(wt972, postMul84);
wt973 = _mm512_mul_ps(wt973, postMul84);
wt974 = _mm512_mul_ps(wt974, postMul84);
wt975 = _mm512_mul_ps(wt975, postMul84);
wt976 = _mm512_mul_ps(wt976, postMul84);
wt977 = _mm512_mul_ps(wt977, postMul84);
wt978 = _mm512_mul_ps(wt978, postMul84);
wt979 = _mm512_mul_ps(wt979, postMul84);
wt980 = _mm512_mul_ps(wt980, postMul84);
wt981 = _mm512_mul_ps(wt981, postMul84);
wt982 = _mm512_mul_ps(wt982, postMul84);
wt983 = _mm512_mul_ps(wt983, postMul84);
wt984 = _mm512_mul_ps(wt984, postMul84);
wt985 = _mm512_mul_ps(wt985, postMul84);
wt986 = _mm512_mul_ps(wt986, postMul84);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c66)+(ptrdiff_t)0, 63>>cut37, wt971);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c66)+(ptrdiff_t)0, 63>>cut37, wt972);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c66)+(ptrdiff_t)0, 63>>cut37, wt973);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c66)+(ptrdiff_t)0, 63>>cut37, wt974);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c66)+(ptrdiff_t)0, 63>>cut37, wt975);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c66)+(ptrdiff_t)0, 63>>cut37, wt976);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c66)+(ptrdiff_t)0, 63>>cut37, wt977);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c66)+(ptrdiff_t)0, 63>>cut37, wt978);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c66)+(ptrdiff_t)0, 63>>cut37, wt979);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c66)+(ptrdiff_t)0, 63>>cut37, wt980);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(11+16*c66)+(ptrdiff_t)0, 63>>cut37, wt981);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(12+16*c66)+(ptrdiff_t)0, 63>>cut37, wt982);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(13+16*c66)+(ptrdiff_t)0, 63>>cut37, wt983);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(14+16*c66)+(ptrdiff_t)0, 63>>cut37, wt984);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(15+16*c66)+(ptrdiff_t)0, 63>>cut37, wt985);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(16+16*c66)+(ptrdiff_t)0, 63>>cut37, wt986);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt971);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt972);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt973);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt974);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt975);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt976);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt977);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt978);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt979);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt980);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(11+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt981);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(12+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt982);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(13+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt983);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(14+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt984);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(15+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt985);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(16+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt986);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt971);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt972);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt973);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt974);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt975);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt976);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt977);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt978);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt979);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt980);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(11+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt981);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(12+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt982);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(13+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt983);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(14+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt984);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(15+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt985);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(16+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt986);
}
// Remainder pass of this case: only lanes 0-9 of every row are read (mask 1023,
// the other lanes are zeroed). Sixteen rows are loaded, 8192 apart, starting at
// wtPtr25+4194304*i78+8192*k187+64*c66.
// NOTE(review): c66, wtPtr25, i78 and k187 are declared above this span; c66 is
// presumably the final value of the loop counter that closes just above - confirm.
__m512 wt987 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)0);
__m512 wt988 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)8192);
__m512 wt989 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)16384);
__m512 wt990 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)24576);
__m512 wt991 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)32768);
__m512 wt992 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)40960);
__m512 wt993 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)49152);
__m512 wt994 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)57344);
__m512 wt995 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)65536);
__m512 wt996 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)73728);
__m512 wt997 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)81920);
__m512 wt998 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)90112);
__m512 wt999 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)98304);
__m512 wt1000 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)106496);
__m512 wt1001 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)114688);
__m512 wt1002 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c66+(ptrdiff_t)122880);
// 16x16 transpose, stage 1: interleave the 32-bit elements of adjacent row pairs.
__m512 tmp20171 = _mm512_unpacklo_ps(wt987, wt988);
__m512 tmp20172 = _mm512_unpackhi_ps(wt987, wt988);
__m512 tmp20173 = _mm512_unpacklo_ps(wt989, wt990);
__m512 tmp20174 = _mm512_unpackhi_ps(wt989, wt990);
__m512 tmp20175 = _mm512_unpacklo_ps(wt991, wt992);
__m512 tmp20176 = _mm512_unpackhi_ps(wt991, wt992);
__m512 tmp20177 = _mm512_unpacklo_ps(wt993, wt994);
__m512 tmp20178 = _mm512_unpackhi_ps(wt993, wt994);
__m512 tmp20179 = _mm512_unpacklo_ps(wt995, wt996);
__m512 tmp20180 = _mm512_unpackhi_ps(wt995, wt996);
__m512 tmp20181 = _mm512_unpacklo_ps(wt997, wt998);
__m512 tmp20182 = _mm512_unpackhi_ps(wt997, wt998);
__m512 tmp20183 = _mm512_unpacklo_ps(wt999, wt1000);
__m512 tmp20184 = _mm512_unpackhi_ps(wt999, wt1000);
__m512 tmp20185 = _mm512_unpacklo_ps(wt1001, wt1002);
__m512 tmp20186 = _mm512_unpackhi_ps(wt1001, wt1002);
// Stage 2: merge pairs of stage-1 results so that every 128-bit lane holds one
// column's elements from four consecutive rows. Immediate 68 takes the low two
// elements of each operand per 128-bit lane, 238 takes the high two.
__m512 tmp20187 = _mm512_shuffle_ps(tmp20171, tmp20173, 68);
__m512 tmp20188 = _mm512_shuffle_ps(tmp20171, tmp20173, 238);
__m512 tmp20189 = _mm512_shuffle_ps(tmp20172, tmp20174, 68);
__m512 tmp20190 = _mm512_shuffle_ps(tmp20172, tmp20174, 238);
__m512 tmp20191 = _mm512_shuffle_ps(tmp20175, tmp20177, 68);
__m512 tmp20192 = _mm512_shuffle_ps(tmp20175, tmp20177, 238);
__m512 tmp20193 = _mm512_shuffle_ps(tmp20176, tmp20178, 68);
__m512 tmp20194 = _mm512_shuffle_ps(tmp20176, tmp20178, 238);
__m512 tmp20195 = _mm512_shuffle_ps(tmp20179, tmp20181, 68);
__m512 tmp20196 = _mm512_shuffle_ps(tmp20179, tmp20181, 238);
__m512 tmp20197 = _mm512_shuffle_ps(tmp20180, tmp20182, 68);
__m512 tmp20198 = _mm512_shuffle_ps(tmp20180, tmp20182, 238);
__m512 tmp20199 = _mm512_shuffle_ps(tmp20183, tmp20185, 68);
__m512 tmp20200 = _mm512_shuffle_ps(tmp20183, tmp20185, 238);
__m512 tmp20201 = _mm512_shuffle_ps(tmp20184, tmp20186, 68);
__m512 tmp20202 = _mm512_shuffle_ps(tmp20184, tmp20186, 238);
// Stage 3: gather 128-bit lanes across eight rows. Immediate 136 picks lanes 0
// and 2 of each operand, 221 picks lanes 1 and 3.
__m512 tmp20203 = _mm512_shuffle_f32x4(tmp20187, tmp20191, 136);
__m512 tmp20204 = _mm512_shuffle_f32x4(tmp20187, tmp20191, 221);
__m512 tmp20205 = _mm512_shuffle_f32x4(tmp20188, tmp20192, 136);
__m512 tmp20206 = _mm512_shuffle_f32x4(tmp20188, tmp20192, 221);
__m512 tmp20207 = _mm512_shuffle_f32x4(tmp20189, tmp20193, 136);
__m512 tmp20208 = _mm512_shuffle_f32x4(tmp20189, tmp20193, 221);
__m512 tmp20209 = _mm512_shuffle_f32x4(tmp20190, tmp20194, 136);
__m512 tmp20210 = _mm512_shuffle_f32x4(tmp20190, tmp20194, 221);
__m512 tmp20211 = _mm512_shuffle_f32x4(tmp20195, tmp20199, 136);
__m512 tmp20212 = _mm512_shuffle_f32x4(tmp20195, tmp20199, 221);
__m512 tmp20213 = _mm512_shuffle_f32x4(tmp20196, tmp20200, 136);
__m512 tmp20214 = _mm512_shuffle_f32x4(tmp20196, tmp20200, 221);
__m512 tmp20215 = _mm512_shuffle_f32x4(tmp20197, tmp20201, 136);
__m512 tmp20216 = _mm512_shuffle_f32x4(tmp20197, tmp20201, 221);
__m512 tmp20217 = _mm512_shuffle_f32x4(tmp20198, tmp20202, 136);
__m512 tmp20218 = _mm512_shuffle_f32x4(tmp20198, tmp20202, 221);
// Stage 4: final 128-bit lane gather across all sixteen rows. wt987..wt994 become
// columns 0..7 and wt995/wt996 columns 8/9, with lane r taken from row r.
// Columns 10..15 were masked out by the loads, so they are not assembled.
wt987 = _mm512_shuffle_f32x4(tmp20203, tmp20211, 136);
wt995 = _mm512_shuffle_f32x4(tmp20203, tmp20211, 221);
wt988 = _mm512_shuffle_f32x4(tmp20205, tmp20213, 136);
wt996 = _mm512_shuffle_f32x4(tmp20205, tmp20213, 221);
wt989 = _mm512_shuffle_f32x4(tmp20207, tmp20215, 136);
wt990 = _mm512_shuffle_f32x4(tmp20209, tmp20217, 136);
wt991 = _mm512_shuffle_f32x4(tmp20204, tmp20212, 136);
wt992 = _mm512_shuffle_f32x4(tmp20206, tmp20214, 136);
wt993 = _mm512_shuffle_f32x4(tmp20208, tmp20216, 136);
wt994 = _mm512_shuffle_f32x4(tmp20210, tmp20218, 136);
// Scale every column vector lane-wise by postMul84 (defined outside this span).
wt987 = _mm512_mul_ps(wt987, postMul84);
wt988 = _mm512_mul_ps(wt988, postMul84);
wt989 = _mm512_mul_ps(wt989, postMul84);
wt990 = _mm512_mul_ps(wt990, postMul84);
wt991 = _mm512_mul_ps(wt991, postMul84);
wt992 = _mm512_mul_ps(wt992, postMul84);
wt993 = _mm512_mul_ps(wt993, postMul84);
wt994 = _mm512_mul_ps(wt994, postMul84);
wt995 = _mm512_mul_ps(wt995, postMul84);
wt996 = _mm512_mul_ps(wt996, postMul84);
// Write the ten columns to slots 1+16*c66 .. 10+16*c66 of arranged24 (slots are
// 24 apart). Each vector is stored three times, at bases 9072 apart, under masks
// that split the 16 lanes into three disjoint ranges, so every lane is written
// exactly once. The address is advanced by 4*cut37 while the masks are shifted
// right by cut37 lanes.
// First lane range: mask 63>>cut37, base offset 0.
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c66)+(ptrdiff_t)0, 63>>cut37, wt987);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c66)+(ptrdiff_t)0, 63>>cut37, wt988);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c66)+(ptrdiff_t)0, 63>>cut37, wt989);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c66)+(ptrdiff_t)0, 63>>cut37, wt990);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c66)+(ptrdiff_t)0, 63>>cut37, wt991);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c66)+(ptrdiff_t)0, 63>>cut37, wt992);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c66)+(ptrdiff_t)0, 63>>cut37, wt993);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c66)+(ptrdiff_t)0, 63>>cut37, wt994);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c66)+(ptrdiff_t)0, 63>>cut37, wt995);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c66)+(ptrdiff_t)0, 63>>cut37, wt996);
// Second lane range: mask 4032>>cut37, base offset 9072.
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt987);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt988);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt989);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt990);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt991);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt992);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt993);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt994);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt995);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c66)+(ptrdiff_t)9072, 4032>>cut37, wt996);
// Remaining lanes: mask 65535-(4095>>cut37), base offset 18144.
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt987);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt988);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt989);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt990);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt991);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt992);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt993);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt994);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt995);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c66)+(ptrdiff_t)18144, 65535-(4095>>cut37), wt996);
// Leave the enclosing switch.
break;
}
default: {
// Default case setup. cut37 shifts the lane split used by the masked stores:
// addresses are advanced by 4*cut37 and masks are shifted right by cut37 lanes.
cut37 = 4;
// All-zero vector used to clear slot 0 below.
__m512 sum724 = _mm512_setzero_ps();
// _mm512_set_epi32 lists elements from lane 15 down to lane 0, so lane i of
// pmMul54 is 2*i (even indices) and lane i of pmAdd54 is 2*i+1 (odd indices).
__m512i pmMul54 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd54 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Load two vectors of 16 floats from bnPtr25 at offset 8*(k187+512*i78) and 64
// further on (mask 65535 selects all 16 lanes).
// NOTE(review): these are 32 consecutive floats only if bnPtr25 is a byte
// pointer; its declaration is outside this span - confirm.
__m512 masLo46 = _mm512_loadu_ps(bnPtr25+(ptrdiff_t)8*(k187+512*i78));
__m512 masHi46 = _mm512_maskz_loadu_ps(65535, bnPtr25+(ptrdiff_t)8*(k187+512*i78)+(ptrdiff_t)64);
// De-interleave the 32 loaded values: indices 0..15 select from masLo46 and
// 16..31 from masHi46, so postMul85 receives the even-indexed elements and
// postAdd55 the odd-indexed ones.
__m512 postMul85 = _mm512_permutex2var_ps(masLo46, pmMul54, masHi46);
__m512 postAdd55 = _mm512_permutex2var_ps(masLo46, pmAdd54, masHi46);
// postAdd55 is not used in the visible part of this case; the cast to void
// marks it as intentionally unused.
(void)postAdd55;
// Zero slot 0 (the 24*0 term) in four destinations 9072 apart. With cut37 == 4
// the masks evaluate to 3, 252, 16128 and 49152, i.e. lanes 0-1, 2-7, 8-13 and
// 14-15: a disjoint split that covers all 16 lanes.
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*0+(ptrdiff_t)0, 63>>cut37, sum724);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*0+(ptrdiff_t)9072, 4032>>cut37, sum724);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*0+(ptrdiff_t)18144, 258048>>cut37, sum724);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*0+(ptrdiff_t)27216, 65535-(262143>>cut37), sum724);
// Main loop of the default case: 23 iterations, each handling a full 16-column
// tile (offset 64*c67) of sixteen rows. c67 is declared outside the for
// statement, so it stays in scope (equal to 23) for the code after the loop.
ptrdiff_t c67 = 0;
for (; c67 != 23; ++c67) {
// Load sixteen rows, 8192 apart, starting at wtPtr25+4194304*i78+8192*k187+64*c67.
// Mask 65535 selects all 16 lanes of each row.
__m512 wt1003 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)0);
__m512 wt1004 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)8192);
__m512 wt1005 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)16384);
__m512 wt1006 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)24576);
__m512 wt1007 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)32768);
__m512 wt1008 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)40960);
__m512 wt1009 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)49152);
__m512 wt1010 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)57344);
__m512 wt1011 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)65536);
__m512 wt1012 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)73728);
__m512 wt1013 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)81920);
__m512 wt1014 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)90112);
__m512 wt1015 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)98304);
__m512 wt1016 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)106496);
__m512 wt1017 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)114688);
__m512 wt1018 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)122880);
// 16x16 transpose, stage 1: interleave the 32-bit elements of adjacent row pairs.
__m512 tmp20219 = _mm512_unpacklo_ps(wt1003, wt1004);
__m512 tmp20220 = _mm512_unpackhi_ps(wt1003, wt1004);
__m512 tmp20221 = _mm512_unpacklo_ps(wt1005, wt1006);
__m512 tmp20222 = _mm512_unpackhi_ps(wt1005, wt1006);
__m512 tmp20223 = _mm512_unpacklo_ps(wt1007, wt1008);
__m512 tmp20224 = _mm512_unpackhi_ps(wt1007, wt1008);
__m512 tmp20225 = _mm512_unpacklo_ps(wt1009, wt1010);
__m512 tmp20226 = _mm512_unpackhi_ps(wt1009, wt1010);
__m512 tmp20227 = _mm512_unpacklo_ps(wt1011, wt1012);
__m512 tmp20228 = _mm512_unpackhi_ps(wt1011, wt1012);
__m512 tmp20229 = _mm512_unpacklo_ps(wt1013, wt1014);
__m512 tmp20230 = _mm512_unpackhi_ps(wt1013, wt1014);
__m512 tmp20231 = _mm512_unpacklo_ps(wt1015, wt1016);
__m512 tmp20232 = _mm512_unpackhi_ps(wt1015, wt1016);
__m512 tmp20233 = _mm512_unpacklo_ps(wt1017, wt1018);
__m512 tmp20234 = _mm512_unpackhi_ps(wt1017, wt1018);
// Stage 2: merge pairs of stage-1 results so that every 128-bit lane holds one
// column's elements from four consecutive rows. Immediate 68 takes the low two
// elements of each operand per 128-bit lane, 238 takes the high two.
__m512 tmp20235 = _mm512_shuffle_ps(tmp20219, tmp20221, 68);
__m512 tmp20236 = _mm512_shuffle_ps(tmp20219, tmp20221, 238);
__m512 tmp20237 = _mm512_shuffle_ps(tmp20220, tmp20222, 68);
__m512 tmp20238 = _mm512_shuffle_ps(tmp20220, tmp20222, 238);
__m512 tmp20239 = _mm512_shuffle_ps(tmp20223, tmp20225, 68);
__m512 tmp20240 = _mm512_shuffle_ps(tmp20223, tmp20225, 238);
__m512 tmp20241 = _mm512_shuffle_ps(tmp20224, tmp20226, 68);
__m512 tmp20242 = _mm512_shuffle_ps(tmp20224, tmp20226, 238);
__m512 tmp20243 = _mm512_shuffle_ps(tmp20227, tmp20229, 68);
__m512 tmp20244 = _mm512_shuffle_ps(tmp20227, tmp20229, 238);
__m512 tmp20245 = _mm512_shuffle_ps(tmp20228, tmp20230, 68);
__m512 tmp20246 = _mm512_shuffle_ps(tmp20228, tmp20230, 238);
__m512 tmp20247 = _mm512_shuffle_ps(tmp20231, tmp20233, 68);
__m512 tmp20248 = _mm512_shuffle_ps(tmp20231, tmp20233, 238);
__m512 tmp20249 = _mm512_shuffle_ps(tmp20232, tmp20234, 68);
__m512 tmp20250 = _mm512_shuffle_ps(tmp20232, tmp20234, 238);
// Stage 3: gather 128-bit lanes across eight rows. Immediate 136 picks lanes 0
// and 2 of each operand, 221 picks lanes 1 and 3.
__m512 tmp20251 = _mm512_shuffle_f32x4(tmp20235, tmp20239, 136);
__m512 tmp20252 = _mm512_shuffle_f32x4(tmp20235, tmp20239, 221);
__m512 tmp20253 = _mm512_shuffle_f32x4(tmp20236, tmp20240, 136);
__m512 tmp20254 = _mm512_shuffle_f32x4(tmp20236, tmp20240, 221);
__m512 tmp20255 = _mm512_shuffle_f32x4(tmp20237, tmp20241, 136);
__m512 tmp20256 = _mm512_shuffle_f32x4(tmp20237, tmp20241, 221);
__m512 tmp20257 = _mm512_shuffle_f32x4(tmp20238, tmp20242, 136);
__m512 tmp20258 = _mm512_shuffle_f32x4(tmp20238, tmp20242, 221);
__m512 tmp20259 = _mm512_shuffle_f32x4(tmp20243, tmp20247, 136);
__m512 tmp20260 = _mm512_shuffle_f32x4(tmp20243, tmp20247, 221);
__m512 tmp20261 = _mm512_shuffle_f32x4(tmp20244, tmp20248, 136);
__m512 tmp20262 = _mm512_shuffle_f32x4(tmp20244, tmp20248, 221);
__m512 tmp20263 = _mm512_shuffle_f32x4(tmp20245, tmp20249, 136);
__m512 tmp20264 = _mm512_shuffle_f32x4(tmp20245, tmp20249, 221);
__m512 tmp20265 = _mm512_shuffle_f32x4(tmp20246, tmp20250, 136);
__m512 tmp20266 = _mm512_shuffle_f32x4(tmp20246, tmp20250, 221);
// Stage 4: final 128-bit lane gather across all sixteen rows. Afterwards
// wt1003..wt1018 hold columns 0..15 of the tile, with lane r taken from row r.
wt1003 = _mm512_shuffle_f32x4(tmp20251, tmp20259, 136);
wt1011 = _mm512_shuffle_f32x4(tmp20251, tmp20259, 221);
wt1004 = _mm512_shuffle_f32x4(tmp20253, tmp20261, 136);
wt1012 = _mm512_shuffle_f32x4(tmp20253, tmp20261, 221);
wt1005 = _mm512_shuffle_f32x4(tmp20255, tmp20263, 136);
wt1013 = _mm512_shuffle_f32x4(tmp20255, tmp20263, 221);
wt1006 = _mm512_shuffle_f32x4(tmp20257, tmp20265, 136);
wt1014 = _mm512_shuffle_f32x4(tmp20257, tmp20265, 221);
wt1007 = _mm512_shuffle_f32x4(tmp20252, tmp20260, 136);
wt1015 = _mm512_shuffle_f32x4(tmp20252, tmp20260, 221);
wt1008 = _mm512_shuffle_f32x4(tmp20254, tmp20262, 136);
wt1016 = _mm512_shuffle_f32x4(tmp20254, tmp20262, 221);
wt1009 = _mm512_shuffle_f32x4(tmp20256, tmp20264, 136);
wt1017 = _mm512_shuffle_f32x4(tmp20256, tmp20264, 221);
wt1010 = _mm512_shuffle_f32x4(tmp20258, tmp20266, 136);
wt1018 = _mm512_shuffle_f32x4(tmp20258, tmp20266, 221);
// Scale every column vector lane-wise by postMul85 (defined before this loop).
wt1003 = _mm512_mul_ps(wt1003, postMul85);
wt1004 = _mm512_mul_ps(wt1004, postMul85);
wt1005 = _mm512_mul_ps(wt1005, postMul85);
wt1006 = _mm512_mul_ps(wt1006, postMul85);
wt1007 = _mm512_mul_ps(wt1007, postMul85);
wt1008 = _mm512_mul_ps(wt1008, postMul85);
wt1009 = _mm512_mul_ps(wt1009, postMul85);
wt1010 = _mm512_mul_ps(wt1010, postMul85);
wt1011 = _mm512_mul_ps(wt1011, postMul85);
wt1012 = _mm512_mul_ps(wt1012, postMul85);
wt1013 = _mm512_mul_ps(wt1013, postMul85);
wt1014 = _mm512_mul_ps(wt1014, postMul85);
wt1015 = _mm512_mul_ps(wt1015, postMul85);
wt1016 = _mm512_mul_ps(wt1016, postMul85);
wt1017 = _mm512_mul_ps(wt1017, postMul85);
wt1018 = _mm512_mul_ps(wt1018, postMul85);
// Write the sixteen columns to slots 1+16*c67 .. 16+16*c67 of arranged24 (slots
// are 24 apart). Each vector is stored four times, at bases 9072 apart, under
// masks that split the 16 lanes into four disjoint ranges, so every lane is
// written exactly once. cut37 is set to 4 before this loop, which makes the
// ranges lanes 0-1, 2-7, 8-13 and 14-15.
// First lane range: mask 63>>cut37, base offset 0.
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1003);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1004);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1005);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1006);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1007);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1008);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1009);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1010);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1011);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1012);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(11+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1013);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(12+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1014);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(13+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1015);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(14+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1016);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(15+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1017);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(16+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1018);
// Second lane range: mask 4032>>cut37, base offset 9072.
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1003);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1004);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1005);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1006);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1007);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1008);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1009);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1010);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1011);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1012);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(11+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1013);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(12+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1014);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(13+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1015);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(14+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1016);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(15+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1017);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(16+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1018);
// Third lane range: mask 258048>>cut37, base offset 18144.
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1003);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1004);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1005);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1006);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1007);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1008);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1009);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1010);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1011);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1012);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(11+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1013);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(12+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1014);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(13+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1015);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(14+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1016);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(15+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1017);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(16+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1018);
// Remaining lanes: mask 65535-(262143>>cut37), base offset 27216.
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1003);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1004);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1005);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1006);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1007);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1008);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1009);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1010);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1011);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1012);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(11+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1013);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(12+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1014);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(13+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1015);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(14+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1016);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(15+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1017);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(16+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1018);
}
__m512 wt1019 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)0);
__m512 wt1020 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)8192);
__m512 wt1021 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)16384);
__m512 wt1022 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)24576);
__m512 wt1023 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)32768);
__m512 wt1024 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)40960);
__m512 wt1025 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)49152);
__m512 wt1026 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)57344);
__m512 wt1027 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)65536);
__m512 wt1028 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)73728);
__m512 wt1029 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)81920);
__m512 wt1030 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)90112);
__m512 wt1031 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)98304);
__m512 wt1032 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)106496);
__m512 wt1033 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)114688);
__m512 wt1034 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k187+64*c67+(ptrdiff_t)122880);
__m512 tmp20267 = _mm512_unpacklo_ps(wt1019, wt1020);
__m512 tmp20268 = _mm512_unpackhi_ps(wt1019, wt1020);
__m512 tmp20269 = _mm512_unpacklo_ps(wt1021, wt1022);
__m512 tmp20270 = _mm512_unpackhi_ps(wt1021, wt1022);
__m512 tmp20271 = _mm512_unpacklo_ps(wt1023, wt1024);
__m512 tmp20272 = _mm512_unpackhi_ps(wt1023, wt1024);
__m512 tmp20273 = _mm512_unpacklo_ps(wt1025, wt1026);
__m512 tmp20274 = _mm512_unpackhi_ps(wt1025, wt1026);
__m512 tmp20275 = _mm512_unpacklo_ps(wt1027, wt1028);
__m512 tmp20276 = _mm512_unpackhi_ps(wt1027, wt1028);
__m512 tmp20277 = _mm512_unpacklo_ps(wt1029, wt1030);
__m512 tmp20278 = _mm512_unpackhi_ps(wt1029, wt1030);
__m512 tmp20279 = _mm512_unpacklo_ps(wt1031, wt1032);
__m512 tmp20280 = _mm512_unpackhi_ps(wt1031, wt1032);
__m512 tmp20281 = _mm512_unpacklo_ps(wt1033, wt1034);
__m512 tmp20282 = _mm512_unpackhi_ps(wt1033, wt1034);
__m512 tmp20283 = _mm512_shuffle_ps(tmp20267, tmp20269, 68);
__m512 tmp20284 = _mm512_shuffle_ps(tmp20267, tmp20269, 238);
__m512 tmp20285 = _mm512_shuffle_ps(tmp20268, tmp20270, 68);
__m512 tmp20286 = _mm512_shuffle_ps(tmp20268, tmp20270, 238);
__m512 tmp20287 = _mm512_shuffle_ps(tmp20271, tmp20273, 68);
__m512 tmp20288 = _mm512_shuffle_ps(tmp20271, tmp20273, 238);
__m512 tmp20289 = _mm512_shuffle_ps(tmp20272, tmp20274, 68);
__m512 tmp20290 = _mm512_shuffle_ps(tmp20272, tmp20274, 238);
__m512 tmp20291 = _mm512_shuffle_ps(tmp20275, tmp20277, 68);
__m512 tmp20292 = _mm512_shuffle_ps(tmp20275, tmp20277, 238);
__m512 tmp20293 = _mm512_shuffle_ps(tmp20276, tmp20278, 68);
__m512 tmp20294 = _mm512_shuffle_ps(tmp20276, tmp20278, 238);
__m512 tmp20295 = _mm512_shuffle_ps(tmp20279, tmp20281, 68);
__m512 tmp20296 = _mm512_shuffle_ps(tmp20279, tmp20281, 238);
__m512 tmp20297 = _mm512_shuffle_ps(tmp20280, tmp20282, 68);
__m512 tmp20298 = _mm512_shuffle_ps(tmp20280, tmp20282, 238);
__m512 tmp20299 = _mm512_shuffle_f32x4(tmp20283, tmp20287, 136);
__m512 tmp20300 = _mm512_shuffle_f32x4(tmp20283, tmp20287, 221);
__m512 tmp20301 = _mm512_shuffle_f32x4(tmp20284, tmp20288, 136);
__m512 tmp20302 = _mm512_shuffle_f32x4(tmp20284, tmp20288, 221);
__m512 tmp20303 = _mm512_shuffle_f32x4(tmp20285, tmp20289, 136);
__m512 tmp20304 = _mm512_shuffle_f32x4(tmp20285, tmp20289, 221);
__m512 tmp20305 = _mm512_shuffle_f32x4(tmp20286, tmp20290, 136);
__m512 tmp20306 = _mm512_shuffle_f32x4(tmp20286, tmp20290, 221);
__m512 tmp20307 = _mm512_shuffle_f32x4(tmp20291, tmp20295, 136);
__m512 tmp20308 = _mm512_shuffle_f32x4(tmp20291, tmp20295, 221);
__m512 tmp20309 = _mm512_shuffle_f32x4(tmp20292, tmp20296, 136);
__m512 tmp20310 = _mm512_shuffle_f32x4(tmp20292, tmp20296, 221);
__m512 tmp20311 = _mm512_shuffle_f32x4(tmp20293, tmp20297, 136);
__m512 tmp20312 = _mm512_shuffle_f32x4(tmp20293, tmp20297, 221);
__m512 tmp20313 = _mm512_shuffle_f32x4(tmp20294, tmp20298, 136);
__m512 tmp20314 = _mm512_shuffle_f32x4(tmp20294, tmp20298, 221);
wt1019 = _mm512_shuffle_f32x4(tmp20299, tmp20307, 136);
wt1027 = _mm512_shuffle_f32x4(tmp20299, tmp20307, 221);
wt1020 = _mm512_shuffle_f32x4(tmp20301, tmp20309, 136);
wt1028 = _mm512_shuffle_f32x4(tmp20301, tmp20309, 221);
wt1021 = _mm512_shuffle_f32x4(tmp20303, tmp20311, 136);
wt1022 = _mm512_shuffle_f32x4(tmp20305, tmp20313, 136);
wt1023 = _mm512_shuffle_f32x4(tmp20300, tmp20308, 136);
wt1024 = _mm512_shuffle_f32x4(tmp20302, tmp20310, 136);
wt1025 = _mm512_shuffle_f32x4(tmp20304, tmp20312, 136);
wt1026 = _mm512_shuffle_f32x4(tmp20306, tmp20314, 136);
wt1019 = _mm512_mul_ps(wt1019, postMul85);
wt1020 = _mm512_mul_ps(wt1020, postMul85);
wt1021 = _mm512_mul_ps(wt1021, postMul85);
wt1022 = _mm512_mul_ps(wt1022, postMul85);
wt1023 = _mm512_mul_ps(wt1023, postMul85);
wt1024 = _mm512_mul_ps(wt1024, postMul85);
wt1025 = _mm512_mul_ps(wt1025, postMul85);
wt1026 = _mm512_mul_ps(wt1026, postMul85);
wt1027 = _mm512_mul_ps(wt1027, postMul85);
wt1028 = _mm512_mul_ps(wt1028, postMul85);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1019);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1020);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1021);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1022);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1023);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1024);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1025);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1026);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1027);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c67)+(ptrdiff_t)0, 63>>cut37, wt1028);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1019);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1020);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1021);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1022);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1023);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1024);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1025);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1026);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1027);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c67)+(ptrdiff_t)9072, 4032>>cut37, wt1028);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1019);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1020);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1021);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1022);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1023);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1024);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1025);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1026);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1027);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c67)+(ptrdiff_t)18144, 258048>>cut37, wt1028);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(1+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1019);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(2+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1020);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(3+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1021);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(4+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1022);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(5+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1023);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(6+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1024);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(7+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1025);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(8+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1026);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(9+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1027);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l84+4*cut37+24*(10+16*c67)+(ptrdiff_t)27216, 65535-(262143>>cut37), wt1028);
}
}
} else {
ptrdiff_t k186 = 496;
ptrdiff_t l83 = (size_t)(0+k186)/6;
ptrdiff_t cut36 = (size_t)(0+k186)%6;
__m512 sum722 = _mm512_setzero_ps();
__m512i pmMul55 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd55 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo47 = _mm512_loadu_ps(bnPtr25+(ptrdiff_t)8*(k186+512*i78));
__m512 masHi47 = _mm512_maskz_loadu_ps(65535, bnPtr25+(ptrdiff_t)8*(k186+512*i78)+(ptrdiff_t)64);
__m512 postMul83 = _mm512_permutex2var_ps(masLo47, pmMul55, masHi47);
__m512 postAdd53 = _mm512_permutex2var_ps(masLo47, pmAdd55, masHi47);
(void)postAdd53;
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*0+(ptrdiff_t)0, 63>>cut36, sum722);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*0+(ptrdiff_t)9072, 4032>>cut36, sum722);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*0+(ptrdiff_t)18144, 258048>>cut36, sum722);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*0+(ptrdiff_t)27216, 65535-(262143>>cut36), sum722);
ptrdiff_t c65 = 0;
for (; c65 != 23; ++c65) {
__m512 wt939 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)0);
__m512 wt940 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)8192);
__m512 wt941 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)16384);
__m512 wt942 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)24576);
__m512 wt943 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)32768);
__m512 wt944 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)40960);
__m512 wt945 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)49152);
__m512 wt946 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)57344);
__m512 wt947 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)65536);
__m512 wt948 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)73728);
__m512 wt949 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)81920);
__m512 wt950 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)90112);
__m512 wt951 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)98304);
__m512 wt952 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)106496);
__m512 wt953 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)114688);
__m512 wt954 = _mm512_maskz_loadu_ps(65535, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)122880);
__m512 tmp20315 = _mm512_unpacklo_ps(wt939, wt940);
__m512 tmp20316 = _mm512_unpackhi_ps(wt939, wt940);
__m512 tmp20317 = _mm512_unpacklo_ps(wt941, wt942);
__m512 tmp20318 = _mm512_unpackhi_ps(wt941, wt942);
__m512 tmp20319 = _mm512_unpacklo_ps(wt943, wt944);
__m512 tmp20320 = _mm512_unpackhi_ps(wt943, wt944);
__m512 tmp20321 = _mm512_unpacklo_ps(wt945, wt946);
__m512 tmp20322 = _mm512_unpackhi_ps(wt945, wt946);
__m512 tmp20323 = _mm512_unpacklo_ps(wt947, wt948);
__m512 tmp20324 = _mm512_unpackhi_ps(wt947, wt948);
__m512 tmp20325 = _mm512_unpacklo_ps(wt949, wt950);
__m512 tmp20326 = _mm512_unpackhi_ps(wt949, wt950);
__m512 tmp20327 = _mm512_unpacklo_ps(wt951, wt952);
__m512 tmp20328 = _mm512_unpackhi_ps(wt951, wt952);
__m512 tmp20329 = _mm512_unpacklo_ps(wt953, wt954);
__m512 tmp20330 = _mm512_unpackhi_ps(wt953, wt954);
__m512 tmp20331 = _mm512_shuffle_ps(tmp20315, tmp20317, 68);
__m512 tmp20332 = _mm512_shuffle_ps(tmp20315, tmp20317, 238);
__m512 tmp20333 = _mm512_shuffle_ps(tmp20316, tmp20318, 68);
__m512 tmp20334 = _mm512_shuffle_ps(tmp20316, tmp20318, 238);
__m512 tmp20335 = _mm512_shuffle_ps(tmp20319, tmp20321, 68);
__m512 tmp20336 = _mm512_shuffle_ps(tmp20319, tmp20321, 238);
__m512 tmp20337 = _mm512_shuffle_ps(tmp20320, tmp20322, 68);
__m512 tmp20338 = _mm512_shuffle_ps(tmp20320, tmp20322, 238);
__m512 tmp20339 = _mm512_shuffle_ps(tmp20323, tmp20325, 68);
__m512 tmp20340 = _mm512_shuffle_ps(tmp20323, tmp20325, 238);
__m512 tmp20341 = _mm512_shuffle_ps(tmp20324, tmp20326, 68);
__m512 tmp20342 = _mm512_shuffle_ps(tmp20324, tmp20326, 238);
__m512 tmp20343 = _mm512_shuffle_ps(tmp20327, tmp20329, 68);
__m512 tmp20344 = _mm512_shuffle_ps(tmp20327, tmp20329, 238);
__m512 tmp20345 = _mm512_shuffle_ps(tmp20328, tmp20330, 68);
__m512 tmp20346 = _mm512_shuffle_ps(tmp20328, tmp20330, 238);
__m512 tmp20347 = _mm512_shuffle_f32x4(tmp20331, tmp20335, 136);
__m512 tmp20348 = _mm512_shuffle_f32x4(tmp20331, tmp20335, 221);
__m512 tmp20349 = _mm512_shuffle_f32x4(tmp20332, tmp20336, 136);
__m512 tmp20350 = _mm512_shuffle_f32x4(tmp20332, tmp20336, 221);
__m512 tmp20351 = _mm512_shuffle_f32x4(tmp20333, tmp20337, 136);
__m512 tmp20352 = _mm512_shuffle_f32x4(tmp20333, tmp20337, 221);
__m512 tmp20353 = _mm512_shuffle_f32x4(tmp20334, tmp20338, 136);
__m512 tmp20354 = _mm512_shuffle_f32x4(tmp20334, tmp20338, 221);
__m512 tmp20355 = _mm512_shuffle_f32x4(tmp20339, tmp20343, 136);
__m512 tmp20356 = _mm512_shuffle_f32x4(tmp20339, tmp20343, 221);
__m512 tmp20357 = _mm512_shuffle_f32x4(tmp20340, tmp20344, 136);
__m512 tmp20358 = _mm512_shuffle_f32x4(tmp20340, tmp20344, 221);
__m512 tmp20359 = _mm512_shuffle_f32x4(tmp20341, tmp20345, 136);
__m512 tmp20360 = _mm512_shuffle_f32x4(tmp20341, tmp20345, 221);
__m512 tmp20361 = _mm512_shuffle_f32x4(tmp20342, tmp20346, 136);
__m512 tmp20362 = _mm512_shuffle_f32x4(tmp20342, tmp20346, 221);
wt939 = _mm512_shuffle_f32x4(tmp20347, tmp20355, 136);
wt947 = _mm512_shuffle_f32x4(tmp20347, tmp20355, 221);
wt940 = _mm512_shuffle_f32x4(tmp20349, tmp20357, 136);
wt948 = _mm512_shuffle_f32x4(tmp20349, tmp20357, 221);
wt941 = _mm512_shuffle_f32x4(tmp20351, tmp20359, 136);
wt949 = _mm512_shuffle_f32x4(tmp20351, tmp20359, 221);
wt942 = _mm512_shuffle_f32x4(tmp20353, tmp20361, 136);
wt950 = _mm512_shuffle_f32x4(tmp20353, tmp20361, 221);
wt943 = _mm512_shuffle_f32x4(tmp20348, tmp20356, 136);
wt951 = _mm512_shuffle_f32x4(tmp20348, tmp20356, 221);
wt944 = _mm512_shuffle_f32x4(tmp20350, tmp20358, 136);
wt952 = _mm512_shuffle_f32x4(tmp20350, tmp20358, 221);
wt945 = _mm512_shuffle_f32x4(tmp20352, tmp20360, 136);
wt953 = _mm512_shuffle_f32x4(tmp20352, tmp20360, 221);
wt946 = _mm512_shuffle_f32x4(tmp20354, tmp20362, 136);
wt954 = _mm512_shuffle_f32x4(tmp20354, tmp20362, 221);
wt939 = _mm512_mul_ps(wt939, postMul83);
wt940 = _mm512_mul_ps(wt940, postMul83);
wt941 = _mm512_mul_ps(wt941, postMul83);
wt942 = _mm512_mul_ps(wt942, postMul83);
wt943 = _mm512_mul_ps(wt943, postMul83);
wt944 = _mm512_mul_ps(wt944, postMul83);
wt945 = _mm512_mul_ps(wt945, postMul83);
wt946 = _mm512_mul_ps(wt946, postMul83);
wt947 = _mm512_mul_ps(wt947, postMul83);
wt948 = _mm512_mul_ps(wt948, postMul83);
wt949 = _mm512_mul_ps(wt949, postMul83);
wt950 = _mm512_mul_ps(wt950, postMul83);
wt951 = _mm512_mul_ps(wt951, postMul83);
wt952 = _mm512_mul_ps(wt952, postMul83);
wt953 = _mm512_mul_ps(wt953, postMul83);
wt954 = _mm512_mul_ps(wt954, postMul83);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(1+16*c65)+(ptrdiff_t)0, 63>>cut36, wt939);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(2+16*c65)+(ptrdiff_t)0, 63>>cut36, wt940);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(3+16*c65)+(ptrdiff_t)0, 63>>cut36, wt941);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(4+16*c65)+(ptrdiff_t)0, 63>>cut36, wt942);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(5+16*c65)+(ptrdiff_t)0, 63>>cut36, wt943);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(6+16*c65)+(ptrdiff_t)0, 63>>cut36, wt944);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(7+16*c65)+(ptrdiff_t)0, 63>>cut36, wt945);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(8+16*c65)+(ptrdiff_t)0, 63>>cut36, wt946);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(9+16*c65)+(ptrdiff_t)0, 63>>cut36, wt947);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(10+16*c65)+(ptrdiff_t)0, 63>>cut36, wt948);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(11+16*c65)+(ptrdiff_t)0, 63>>cut36, wt949);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(12+16*c65)+(ptrdiff_t)0, 63>>cut36, wt950);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(13+16*c65)+(ptrdiff_t)0, 63>>cut36, wt951);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(14+16*c65)+(ptrdiff_t)0, 63>>cut36, wt952);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(15+16*c65)+(ptrdiff_t)0, 63>>cut36, wt953);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(16+16*c65)+(ptrdiff_t)0, 63>>cut36, wt954);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(1+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt939);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(2+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt940);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(3+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt941);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(4+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt942);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(5+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt943);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(6+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt944);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(7+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt945);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(8+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt946);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(9+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt947);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(10+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt948);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(11+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt949);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(12+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt950);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(13+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt951);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(14+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt952);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(15+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt953);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(16+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt954);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(1+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt939);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(2+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt940);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(3+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt941);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(4+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt942);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(5+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt943);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(6+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt944);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(7+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt945);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(8+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt946);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(9+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt947);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(10+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt948);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(11+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt949);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(12+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt950);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(13+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt951);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(14+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt952);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(15+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt953);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(16+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt954);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(1+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt939);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(2+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt940);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(3+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt941);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(4+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt942);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(5+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt943);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(6+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt944);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(7+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt945);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(8+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt946);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(9+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt947);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(10+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt948);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(11+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt949);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(12+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt950);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(13+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt951);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(14+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt952);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(15+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt953);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(16+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt954);
}
__m512 wt955 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)0);
__m512 wt956 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)8192);
__m512 wt957 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)16384);
__m512 wt958 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)24576);
__m512 wt959 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)32768);
__m512 wt960 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)40960);
__m512 wt961 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)49152);
__m512 wt962 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)57344);
__m512 wt963 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)65536);
__m512 wt964 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)73728);
__m512 wt965 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)81920);
__m512 wt966 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)90112);
__m512 wt967 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)98304);
__m512 wt968 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)106496);
__m512 wt969 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)114688);
__m512 wt970 = _mm512_maskz_loadu_ps(1023, wtPtr25+4194304*i78+8192*k186+64*c65+(ptrdiff_t)122880);
__m512 tmp20363 = _mm512_unpacklo_ps(wt955, wt956);
__m512 tmp20364 = _mm512_unpackhi_ps(wt955, wt956);
__m512 tmp20365 = _mm512_unpacklo_ps(wt957, wt958);
__m512 tmp20366 = _mm512_unpackhi_ps(wt957, wt958);
__m512 tmp20367 = _mm512_unpacklo_ps(wt959, wt960);
__m512 tmp20368 = _mm512_unpackhi_ps(wt959, wt960);
__m512 tmp20369 = _mm512_unpacklo_ps(wt961, wt962);
__m512 tmp20370 = _mm512_unpackhi_ps(wt961, wt962);
__m512 tmp20371 = _mm512_unpacklo_ps(wt963, wt964);
__m512 tmp20372 = _mm512_unpackhi_ps(wt963, wt964);
__m512 tmp20373 = _mm512_unpacklo_ps(wt965, wt966);
__m512 tmp20374 = _mm512_unpackhi_ps(wt965, wt966);
__m512 tmp20375 = _mm512_unpacklo_ps(wt967, wt968);
__m512 tmp20376 = _mm512_unpackhi_ps(wt967, wt968);
__m512 tmp20377 = _mm512_unpacklo_ps(wt969, wt970);
__m512 tmp20378 = _mm512_unpackhi_ps(wt969, wt970);
__m512 tmp20379 = _mm512_shuffle_ps(tmp20363, tmp20365, 68);
__m512 tmp20380 = _mm512_shuffle_ps(tmp20363, tmp20365, 238);
__m512 tmp20381 = _mm512_shuffle_ps(tmp20364, tmp20366, 68);
__m512 tmp20382 = _mm512_shuffle_ps(tmp20364, tmp20366, 238);
__m512 tmp20383 = _mm512_shuffle_ps(tmp20367, tmp20369, 68);
__m512 tmp20384 = _mm512_shuffle_ps(tmp20367, tmp20369, 238);
__m512 tmp20385 = _mm512_shuffle_ps(tmp20368, tmp20370, 68);
__m512 tmp20386 = _mm512_shuffle_ps(tmp20368, tmp20370, 238);
__m512 tmp20387 = _mm512_shuffle_ps(tmp20371, tmp20373, 68);
__m512 tmp20388 = _mm512_shuffle_ps(tmp20371, tmp20373, 238);
__m512 tmp20389 = _mm512_shuffle_ps(tmp20372, tmp20374, 68);
__m512 tmp20390 = _mm512_shuffle_ps(tmp20372, tmp20374, 238);
__m512 tmp20391 = _mm512_shuffle_ps(tmp20375, tmp20377, 68);
__m512 tmp20392 = _mm512_shuffle_ps(tmp20375, tmp20377, 238);
__m512 tmp20393 = _mm512_shuffle_ps(tmp20376, tmp20378, 68);
__m512 tmp20394 = _mm512_shuffle_ps(tmp20376, tmp20378, 238);
__m512 tmp20395 = _mm512_shuffle_f32x4(tmp20379, tmp20383, 136);
__m512 tmp20396 = _mm512_shuffle_f32x4(tmp20379, tmp20383, 221);
__m512 tmp20397 = _mm512_shuffle_f32x4(tmp20380, tmp20384, 136);
__m512 tmp20398 = _mm512_shuffle_f32x4(tmp20380, tmp20384, 221);
__m512 tmp20399 = _mm512_shuffle_f32x4(tmp20381, tmp20385, 136);
__m512 tmp20400 = _mm512_shuffle_f32x4(tmp20381, tmp20385, 221);
__m512 tmp20401 = _mm512_shuffle_f32x4(tmp20382, tmp20386, 136);
__m512 tmp20402 = _mm512_shuffle_f32x4(tmp20382, tmp20386, 221);
__m512 tmp20403 = _mm512_shuffle_f32x4(tmp20387, tmp20391, 136);
__m512 tmp20404 = _mm512_shuffle_f32x4(tmp20387, tmp20391, 221);
__m512 tmp20405 = _mm512_shuffle_f32x4(tmp20388, tmp20392, 136);
__m512 tmp20406 = _mm512_shuffle_f32x4(tmp20388, tmp20392, 221);
__m512 tmp20407 = _mm512_shuffle_f32x4(tmp20389, tmp20393, 136);
__m512 tmp20408 = _mm512_shuffle_f32x4(tmp20389, tmp20393, 221);
__m512 tmp20409 = _mm512_shuffle_f32x4(tmp20390, tmp20394, 136);
__m512 tmp20410 = _mm512_shuffle_f32x4(tmp20390, tmp20394, 221);
wt955 = _mm512_shuffle_f32x4(tmp20395, tmp20403, 136);
wt963 = _mm512_shuffle_f32x4(tmp20395, tmp20403, 221);
wt956 = _mm512_shuffle_f32x4(tmp20397, tmp20405, 136);
wt964 = _mm512_shuffle_f32x4(tmp20397, tmp20405, 221);
wt957 = _mm512_shuffle_f32x4(tmp20399, tmp20407, 136);
wt958 = _mm512_shuffle_f32x4(tmp20401, tmp20409, 136);
wt959 = _mm512_shuffle_f32x4(tmp20396, tmp20404, 136);
wt960 = _mm512_shuffle_f32x4(tmp20398, tmp20406, 136);
wt961 = _mm512_shuffle_f32x4(tmp20400, tmp20408, 136);
wt962 = _mm512_shuffle_f32x4(tmp20402, tmp20410, 136);
wt955 = _mm512_mul_ps(wt955, postMul83);
wt956 = _mm512_mul_ps(wt956, postMul83);
wt957 = _mm512_mul_ps(wt957, postMul83);
wt958 = _mm512_mul_ps(wt958, postMul83);
wt959 = _mm512_mul_ps(wt959, postMul83);
wt960 = _mm512_mul_ps(wt960, postMul83);
wt961 = _mm512_mul_ps(wt961, postMul83);
wt962 = _mm512_mul_ps(wt962, postMul83);
wt963 = _mm512_mul_ps(wt963, postMul83);
wt964 = _mm512_mul_ps(wt964, postMul83);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(1+16*c65)+(ptrdiff_t)0, 63>>cut36, wt955);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(2+16*c65)+(ptrdiff_t)0, 63>>cut36, wt956);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(3+16*c65)+(ptrdiff_t)0, 63>>cut36, wt957);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(4+16*c65)+(ptrdiff_t)0, 63>>cut36, wt958);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(5+16*c65)+(ptrdiff_t)0, 63>>cut36, wt959);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(6+16*c65)+(ptrdiff_t)0, 63>>cut36, wt960);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(7+16*c65)+(ptrdiff_t)0, 63>>cut36, wt961);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(8+16*c65)+(ptrdiff_t)0, 63>>cut36, wt962);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(9+16*c65)+(ptrdiff_t)0, 63>>cut36, wt963);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(10+16*c65)+(ptrdiff_t)0, 63>>cut36, wt964);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(1+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt955);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(2+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt956);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(3+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt957);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(4+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt958);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(5+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt959);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(6+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt960);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(7+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt961);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(8+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt962);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(9+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt963);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(10+16*c65)+(ptrdiff_t)9072, 4032>>cut36, wt964);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(1+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt955);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(2+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt956);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(3+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt957);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(4+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt958);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(5+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt959);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(6+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt960);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(7+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt961);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(8+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt962);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(9+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt963);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+24*(10+16*c65)+(ptrdiff_t)18144, 258048>>cut36, wt964);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(1+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt955);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(2+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt956);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(3+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt957);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(4+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt958);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(5+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt959);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(6+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt960);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(7+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt961);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(8+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt962);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(9+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt963);
_mm512_mask_storeu_ps(arranged24+776192*i78+9096*l83+4*cut36+8*(10+16*c65)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt964);
}
}
}
}

// Launches ResNet50OneArrangeWts12Callee1 on the given team over a
// three-dimensional task hull of extents 32 x 1 x 3, handing the callee the
// tensor pointer table through the task's any1 field.
static void ResNet50OneArrangeWts12(ResNet50ThreaderTeam1* team80, char** tensors133) {
    static const int extents[3] = {32, 1, 3};
    ResNet50ThreaderTask1 job;
    int dim;
    job.nd1 = 3;
    for (dim = 0; dim < 3; ++dim) {
        job.hull1[dim] = extents[dim];
    }
    job.any1 = tensors133;
    job.callee1 = ResNet50OneArrangeWts12Callee1;
    ResNet50ThreaderDo1(team80, &job);
}

// Task callee that repacks rows of the source tensor (tensors[0], 320-byte row
// stride) into the arranged buffer (tensors[1], 256-byte row stride).
//   pt74[3] selects the slice: values below 2 use that slice index directly,
//           anything else is handled as slice 2.
//   pt74[0] selects the portion of the slice: for slices 0/1 the rows start at
//           417*pt74[0] and cover 417 rows (418 when pt74[0] >= 1); for slice 2
//           they start at 189*pt74[0] and cover 189 rows.
// Per row, three full 16-float vectors plus lane 0 of a fourth are copied.
static void ResNet50OneArrangeDats12Callee1(ResNet50ThreaderTask1* task138, int64_t* pt74) {
    char** bufs = task138->any1;
    const ptrdiff_t part = pt74[0];
    const ptrdiff_t slice = pt74[3];
    const int lastSlice = !(slice < 2);
    const ptrdiff_t sliceIdx = lastSlice ? (ptrdiff_t)2 : slice;
    char*restrict src = bufs[0]+(ptrdiff_t)267200*sliceIdx;
    char*restrict dst = bufs[1]+(ptrdiff_t)213760*sliceIdx;
    const __mmask16 allLanes = 65535;
    const __mmask16 firstLane = 1;
    ptrdiff_t row;
    ptrdiff_t stop;
    if (lastSlice) {
        row = 189*part;
        stop = row+189;
    } else {
        row = 417*part;
        stop = row+(part < 1 ? 417 : 418);
    }
    while (row < stop) {
        const char* from = src+320*row;
        char* to = dst+256*row;
        // Read the whole row first, then write it, as the generated code does.
        __m512 v0 = _mm512_maskz_loadu_ps(allLanes, from);
        __m512 v1 = _mm512_maskz_loadu_ps(allLanes, from+(ptrdiff_t)64);
        __m512 v2 = _mm512_maskz_loadu_ps(allLanes, from+(ptrdiff_t)128);
        __m512 v3 = _mm512_maskz_loadu_ps(firstLane, from+(ptrdiff_t)192);
        _mm512_mask_storeu_ps(to, allLanes, v0);
        _mm512_mask_storeu_ps(to+(ptrdiff_t)64, allLanes, v1);
        _mm512_mask_storeu_ps(to+(ptrdiff_t)128, allLanes, v2);
        _mm512_mask_storeu_ps(to+(ptrdiff_t)192, firstLane, v3);
        ++row;
    }
}

// Launches ResNet50OneArrangeDats12Callee1 on the given team over a
// four-dimensional task hull of extents 2 x 1 x 1 x 3, handing the callee the
// tensor pointer table through the task's any1 field.
static void ResNet50OneArrangeDats12(ResNet50ThreaderTeam1* team81, char** tensors135) {
    static const int extents[4] = {2, 1, 1, 3};
    ResNet50ThreaderTask1 job;
    int dim;
    job.nd1 = 4;
    for (dim = 0; dim < 4; ++dim) {
        job.hull1[dim] = extents[dim];
    }
    job.any1 = tensors135;
    job.callee1 = ResNet50OneArrangeDats12Callee1;
    ResNet50ThreaderDo1(team81, &job);
}

// Task callee for the first "apply" pass: multiply-accumulates arranged weights
// against arranged data and overwrites the corresponding output rows.
//   task140->any1 is read as a pair; pair38[0] is the tensor pointer table with
//   [0] = arranged weights, [1] = arranged data, [2] = output (per the local names).
//   pt75[0] selects the output row group k190 handled by this call.
// e40, g45 and d28 are fixed at 0 here, so every offset scaled by them is zero.
static void ResNet50OneApply12Callee1(ResNet50ThreaderTask1* task140, int64_t* pt75) {
void** pair38 = task140->any1;
char** tensors138 = pair38[0];
ptrdiff_t e40 = 0;
ptrdiff_t g45 = 0;
ptrdiff_t d28 = 0;
ptrdiff_t w80 = pt75[0];
char*restrict arrangedWts12 = tensors138[0]+1712128*e40+(ptrdiff_t)1712128*1*g45;
char*restrict arrangedDats12 = tensors138[1]+213760*e40+(ptrdiff_t)213760*1*g45;
char*restrict datPtr45 = tensors138[2]+(ptrdiff_t)163840*1*g45;
ptrdiff_t ii38 = 1;
for (ptrdiff_t i81 = 0; i81 < ii38; ++i81) {
ptrdiff_t j72 = 1*d28;
ptrdiff_t k190 = 1*w80;
// kk69 equals the starting k190, so the "k190 >= kk69" check at the bottom of
// the loop returns after the first pass. The loop body is skipped only when
// k190 starts at 85, in which case the two-row tail below runs instead.
ptrdiff_t kk69 = k190+0;
for (; k190 != 85; ++k190) {
// With s65 == -1 the 24*s65+24 term is 0: the six accumulator rows start from
// the six floats at the head of this 20064-byte weight group
// (presumably per-row biases -- confirm against the weight arranger).
ptrdiff_t s65 = -1;
__m512 sum725 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+24*s65+(ptrdiff_t)24));
__m512 sum729 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+24*s65+(ptrdiff_t)28));
__m512 sum733 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+24*s65+(ptrdiff_t)32));
__m512 sum737 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+24*s65+(ptrdiff_t)36));
__m512 sum741 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+24*s65+(ptrdiff_t)40));
__m512 sum745 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+24*s65+(ptrdiff_t)44));
// Each row has four vector accumulators, all starting from the same broadcast value.
__m512 sum726 = sum725;
__m512 sum727 = sum725;
__m512 sum728 = sum725;
__m512 sum730 = sum729;
__m512 sum731 = sum729;
__m512 sum732 = sum729;
__m512 sum734 = sum733;
__m512 sum735 = sum733;
__m512 sum736 = sum733;
__m512 sum738 = sum737;
__m512 sum739 = sum737;
__m512 sum740 = sum737;
__m512 sum742 = sum741;
__m512 sum743 = sum741;
__m512 sum744 = sum741;
__m512 sum746 = sum745;
__m512 sum747 = sum745;
__m512 sum748 = sum745;
// 835 reduction steps: each loads four 16-float data vectors (256-byte stride)
// and six broadcast weights (24-byte stride), then performs 6x4 fused multiply-adds.
for (s65 = 0; s65 < 835; ++s65) {
__m512 dat2522 = _mm512_loadu_ps(arrangedDats12+213760*i81+213760*j72+256*s65+(ptrdiff_t)0);
__m512 dat2523 = _mm512_loadu_ps(arrangedDats12+213760*i81+213760*j72+256*s65+(ptrdiff_t)64);
__m512 dat2524 = _mm512_loadu_ps(arrangedDats12+213760*i81+213760*j72+256*s65+(ptrdiff_t)128);
// Unmasked 16-lane load; only lane 0 of the sums fed by it is stored below.
__m512 dat2525 = _mm512_loadu_ps(arrangedDats12+213760*i81+213760*j72+256*s65+(ptrdiff_t)192);
__m512 wt1035 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+24*s65+(ptrdiff_t)24));
sum725 = _mm512_fmadd_ps(wt1035, dat2522, sum725);
sum726 = _mm512_fmadd_ps(wt1035, dat2523, sum726);
sum727 = _mm512_fmadd_ps(wt1035, dat2524, sum727);
sum728 = _mm512_fmadd_ps(wt1035, dat2525, sum728);
__m512 wt1036 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+24*s65+(ptrdiff_t)28));
sum729 = _mm512_fmadd_ps(wt1036, dat2522, sum729);
sum730 = _mm512_fmadd_ps(wt1036, dat2523, sum730);
sum731 = _mm512_fmadd_ps(wt1036, dat2524, sum731);
sum732 = _mm512_fmadd_ps(wt1036, dat2525, sum732);
__m512 wt1037 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+24*s65+(ptrdiff_t)32));
sum733 = _mm512_fmadd_ps(wt1037, dat2522, sum733);
sum734 = _mm512_fmadd_ps(wt1037, dat2523, sum734);
sum735 = _mm512_fmadd_ps(wt1037, dat2524, sum735);
sum736 = _mm512_fmadd_ps(wt1037, dat2525, sum736);
__m512 wt1038 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+24*s65+(ptrdiff_t)36));
sum737 = _mm512_fmadd_ps(wt1038, dat2522, sum737);
sum738 = _mm512_fmadd_ps(wt1038, dat2523, sum738);
sum739 = _mm512_fmadd_ps(wt1038, dat2524, sum739);
sum740 = _mm512_fmadd_ps(wt1038, dat2525, sum740);
__m512 wt1039 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+24*s65+(ptrdiff_t)40));
sum741 = _mm512_fmadd_ps(wt1039, dat2522, sum741);
sum742 = _mm512_fmadd_ps(wt1039, dat2523, sum742);
sum743 = _mm512_fmadd_ps(wt1039, dat2524, sum743);
sum744 = _mm512_fmadd_ps(wt1039, dat2525, sum744);
__m512 wt1040 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+24*s65+(ptrdiff_t)44));
sum745 = _mm512_fmadd_ps(wt1040, dat2522, sum745);
sum746 = _mm512_fmadd_ps(wt1040, dat2523, sum746);
sum747 = _mm512_fmadd_ps(wt1040, dat2524, sum747);
sum748 = _mm512_fmadd_ps(wt1040, dat2525, sum748);
}
// Write the six rows to consecutive 320-byte output slots (1920 bytes per
// group). The fourth vector of each row is stored with mask 1, so only lane 0
// is written: 49 floats per row in total.
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)0, 65535, sum725);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)64, 65535, sum726);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)128, 65535, sum727);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)192, 1, sum728);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)320, 65535, sum729);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)384, 65535, sum730);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)448, 65535, sum731);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)512, 1, sum732);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)640, 65535, sum733);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)704, 65535, sum734);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)768, 65535, sum735);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)832, 1, sum736);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)960, 65535, sum737);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)1024, 65535, sum738);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)1088, 65535, sum739);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)1152, 1, sum740);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)1280, 65535, sum741);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)1344, 65535, sum742);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)1408, 65535, sum743);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)1472, 1, sum744);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)1600, 65535, sum745);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)1664, 65535, sum746);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)1728, 65535, sum747);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)1792, 1, sum748);
if (k190 >= kk69) return;
}
// Reached only when k190 == 85: a two-row tail group. Its weights are read
// with an 8-byte stride, and 8*s66+8 is 0 for s66 == -1, so the two
// accumulator rows start from the first two floats of the group.
ptrdiff_t s66 = -1;
__m512 sum749 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+8*s66+(ptrdiff_t)8));
__m512 sum753 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+8*s66+(ptrdiff_t)12));
__m512 sum750 = sum749;
__m512 sum751 = sum749;
__m512 sum752 = sum749;
__m512 sum754 = sum753;
__m512 sum755 = sum753;
__m512 sum756 = sum753;
for (s66 = 0; s66 < 835; ++s66) {
__m512 dat2526 = _mm512_loadu_ps(arrangedDats12+213760*i81+213760*j72+256*s66+(ptrdiff_t)0);
__m512 dat2527 = _mm512_loadu_ps(arrangedDats12+213760*i81+213760*j72+256*s66+(ptrdiff_t)64);
__m512 dat2528 = _mm512_loadu_ps(arrangedDats12+213760*i81+213760*j72+256*s66+(ptrdiff_t)128);
__m512 dat2529 = _mm512_loadu_ps(arrangedDats12+213760*i81+213760*j72+256*s66+(ptrdiff_t)192);
__m512 wt1041 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+8*s66+(ptrdiff_t)8));
sum749 = _mm512_fmadd_ps(wt1041, dat2526, sum749);
sum750 = _mm512_fmadd_ps(wt1041, dat2527, sum750);
sum751 = _mm512_fmadd_ps(wt1041, dat2528, sum751);
sum752 = _mm512_fmadd_ps(wt1041, dat2529, sum752);
__m512 wt1042 = _mm512_set1_ps(*(float*)(arrangedWts12+1712128*i81+20064*k190+8*s66+(ptrdiff_t)12));
sum753 = _mm512_fmadd_ps(wt1042, dat2526, sum753);
sum754 = _mm512_fmadd_ps(wt1042, dat2527, sum754);
sum755 = _mm512_fmadd_ps(wt1042, dat2528, sum755);
sum756 = _mm512_fmadd_ps(wt1042, dat2529, sum756);
}
// Store the two tail rows with the same 3-full-vectors-plus-one-lane pattern.
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)0, 65535, sum749);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)64, 65535, sum750);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)128, 65535, sum751);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)192, 1, sum752);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)320, 65535, sum753);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)384, 65535, sum754);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)448, 65535, sum755);
_mm512_mask_storeu_ps(datPtr45+163840*i81+256*j72+1920*k190+(ptrdiff_t)512, 1, sum756);
}
}

// Task callee for an accumulating "apply" pass: same multiply-accumulate shape
// as a 6-row-by-4-vector kernel with a 2-row tail, but the computed sums are
// added to the values already present in the output before being stored back.
//   task141->any1 is read as a pair; pair39[0] is the tensor pointer table with
//   [0] = arranged weights, [1] = arranged data, [2] = output (per the local
//   names), and pair39[1] carries the integer e41 that selects which
//   1712128-byte weight slice and 213760-byte data slice to use.
//   pt76[0] selects the output row group k191 handled by this call.
// g46 and d29 are fixed at 0 here, so every offset scaled by them is zero.
static void ResNet50OneApply12Callee2(ResNet50ThreaderTask1* task141, int64_t* pt76) {
void** pair39 = task141->any1;
char** tensors139 = pair39[0];
// The second pair entry is an integer smuggled through a pointer slot.
ptrdiff_t e41 = (ptrdiff_t)pair39[1];
ptrdiff_t g46 = 0;
ptrdiff_t d29 = 0;
ptrdiff_t w81 = pt76[0];
char*restrict arrangedWts13 = tensors139[0]+1712128*e41+(ptrdiff_t)1712128*1*g46;
char*restrict arrangedDats13 = tensors139[1]+213760*e41+(ptrdiff_t)213760*1*g46;
char*restrict datPtr46 = tensors139[2]+(ptrdiff_t)163840*1*g46;
ptrdiff_t ii39 = 1;
for (ptrdiff_t i82 = 0; i82 < ii39; ++i82) {
ptrdiff_t j73 = 1*d29;
ptrdiff_t k191 = 1*w81;
// kk70 equals the starting k191, so the "k191 >= kk70" check at the bottom of
// the loop returns after the first pass. The loop body is skipped only when
// k191 starts at 85, in which case the two-row tail below runs instead.
ptrdiff_t kk70 = k191+0;
for (; k191 != 85; ++k191) {
// With s67 == -1 the 24*s67+24 term is 0: the six accumulator rows start from
// the six floats at the head of this 20064-byte weight group
// (NOTE(review): what those head values hold for this slice is decided by the
// weight arranger, which is not visible here).
ptrdiff_t s67 = -1;
__m512 sum757 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+24*s67+(ptrdiff_t)24));
__m512 sum761 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+24*s67+(ptrdiff_t)28));
__m512 sum765 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+24*s67+(ptrdiff_t)32));
__m512 sum769 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+24*s67+(ptrdiff_t)36));
__m512 sum773 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+24*s67+(ptrdiff_t)40));
__m512 sum777 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+24*s67+(ptrdiff_t)44));
// Each row has four vector accumulators, all starting from the same broadcast value.
__m512 sum758 = sum757;
__m512 sum759 = sum757;
__m512 sum760 = sum757;
__m512 sum762 = sum761;
__m512 sum763 = sum761;
__m512 sum764 = sum761;
__m512 sum766 = sum765;
__m512 sum767 = sum765;
__m512 sum768 = sum765;
__m512 sum770 = sum769;
__m512 sum771 = sum769;
__m512 sum772 = sum769;
__m512 sum774 = sum773;
__m512 sum775 = sum773;
__m512 sum776 = sum773;
__m512 sum778 = sum777;
__m512 sum779 = sum777;
__m512 sum780 = sum777;
// 835 reduction steps: each loads four 16-float data vectors (256-byte stride)
// and six broadcast weights (24-byte stride), then performs 6x4 fused multiply-adds.
for (s67 = 0; s67 < 835; ++s67) {
__m512 dat2530 = _mm512_loadu_ps(arrangedDats13+213760*i82+213760*j73+256*s67+(ptrdiff_t)0);
__m512 dat2531 = _mm512_loadu_ps(arrangedDats13+213760*i82+213760*j73+256*s67+(ptrdiff_t)64);
__m512 dat2532 = _mm512_loadu_ps(arrangedDats13+213760*i82+213760*j73+256*s67+(ptrdiff_t)128);
// Unmasked 16-lane load; only lane 0 of the sums fed by it is stored below.
__m512 dat2533 = _mm512_loadu_ps(arrangedDats13+213760*i82+213760*j73+256*s67+(ptrdiff_t)192);
__m512 wt1043 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+24*s67+(ptrdiff_t)24));
sum757 = _mm512_fmadd_ps(wt1043, dat2530, sum757);
sum758 = _mm512_fmadd_ps(wt1043, dat2531, sum758);
sum759 = _mm512_fmadd_ps(wt1043, dat2532, sum759);
sum760 = _mm512_fmadd_ps(wt1043, dat2533, sum760);
__m512 wt1044 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+24*s67+(ptrdiff_t)28));
sum761 = _mm512_fmadd_ps(wt1044, dat2530, sum761);
sum762 = _mm512_fmadd_ps(wt1044, dat2531, sum762);
sum763 = _mm512_fmadd_ps(wt1044, dat2532, sum763);
sum764 = _mm512_fmadd_ps(wt1044, dat2533, sum764);
__m512 wt1045 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+24*s67+(ptrdiff_t)32));
sum765 = _mm512_fmadd_ps(wt1045, dat2530, sum765);
sum766 = _mm512_fmadd_ps(wt1045, dat2531, sum766);
sum767 = _mm512_fmadd_ps(wt1045, dat2532, sum767);
sum768 = _mm512_fmadd_ps(wt1045, dat2533, sum768);
__m512 wt1046 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+24*s67+(ptrdiff_t)36));
sum769 = _mm512_fmadd_ps(wt1046, dat2530, sum769);
sum770 = _mm512_fmadd_ps(wt1046, dat2531, sum770);
sum771 = _mm512_fmadd_ps(wt1046, dat2532, sum771);
sum772 = _mm512_fmadd_ps(wt1046, dat2533, sum772);
__m512 wt1047 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+24*s67+(ptrdiff_t)40));
sum773 = _mm512_fmadd_ps(wt1047, dat2530, sum773);
sum774 = _mm512_fmadd_ps(wt1047, dat2531, sum774);
sum775 = _mm512_fmadd_ps(wt1047, dat2532, sum775);
sum776 = _mm512_fmadd_ps(wt1047, dat2533, sum776);
__m512 wt1048 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+24*s67+(ptrdiff_t)44));
sum777 = _mm512_fmadd_ps(wt1048, dat2530, sum777);
sum778 = _mm512_fmadd_ps(wt1048, dat2531, sum778);
sum779 = _mm512_fmadd_ps(wt1048, dat2532, sum779);
sum780 = _mm512_fmadd_ps(wt1048, dat2533, sum780);
}
// Per row: add the values currently in the output (masked loads, so unread
// lanes contribute zero), then store back to the same 320-byte slot. The
// fourth vector uses mask 1, so only lane 0 is read and written.
sum757 = _mm512_add_ps(sum757, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)0));
sum758 = _mm512_add_ps(sum758, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)64));
sum759 = _mm512_add_ps(sum759, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)128));
sum760 = _mm512_add_ps(sum760, _mm512_maskz_loadu_ps(1, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)192));
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)0, 65535, sum757);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)64, 65535, sum758);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)128, 65535, sum759);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)192, 1, sum760);
sum761 = _mm512_add_ps(sum761, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)320));
sum762 = _mm512_add_ps(sum762, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)384));
sum763 = _mm512_add_ps(sum763, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)448));
sum764 = _mm512_add_ps(sum764, _mm512_maskz_loadu_ps(1, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)512));
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)320, 65535, sum761);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)384, 65535, sum762);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)448, 65535, sum763);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)512, 1, sum764);
sum765 = _mm512_add_ps(sum765, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)640));
sum766 = _mm512_add_ps(sum766, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)704));
sum767 = _mm512_add_ps(sum767, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)768));
sum768 = _mm512_add_ps(sum768, _mm512_maskz_loadu_ps(1, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)832));
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)640, 65535, sum765);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)704, 65535, sum766);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)768, 65535, sum767);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)832, 1, sum768);
sum769 = _mm512_add_ps(sum769, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)960));
sum770 = _mm512_add_ps(sum770, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1024));
sum771 = _mm512_add_ps(sum771, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1088));
sum772 = _mm512_add_ps(sum772, _mm512_maskz_loadu_ps(1, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1152));
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)960, 65535, sum769);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1024, 65535, sum770);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1088, 65535, sum771);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1152, 1, sum772);
sum773 = _mm512_add_ps(sum773, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1280));
sum774 = _mm512_add_ps(sum774, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1344));
sum775 = _mm512_add_ps(sum775, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1408));
sum776 = _mm512_add_ps(sum776, _mm512_maskz_loadu_ps(1, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1472));
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1280, 65535, sum773);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1344, 65535, sum774);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1408, 65535, sum775);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1472, 1, sum776);
sum777 = _mm512_add_ps(sum777, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1600));
sum778 = _mm512_add_ps(sum778, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1664));
sum779 = _mm512_add_ps(sum779, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1728));
sum780 = _mm512_add_ps(sum780, _mm512_maskz_loadu_ps(1, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1792));
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1600, 65535, sum777);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1664, 65535, sum778);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1728, 65535, sum779);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)1792, 1, sum780);
if (k191 >= kk70) return;
}
// Reached only when k191 == 85: a two-row tail group. Its weights are read
// with an 8-byte stride, and 8*s68+8 is 0 for s68 == -1, so the two
// accumulator rows start from the first two floats of the group.
ptrdiff_t s68 = -1;
__m512 sum781 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+8*s68+(ptrdiff_t)8));
__m512 sum785 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+8*s68+(ptrdiff_t)12));
__m512 sum782 = sum781;
__m512 sum783 = sum781;
__m512 sum784 = sum781;
__m512 sum786 = sum785;
__m512 sum787 = sum785;
__m512 sum788 = sum785;
for (s68 = 0; s68 < 835; ++s68) {
__m512 dat2534 = _mm512_loadu_ps(arrangedDats13+213760*i82+213760*j73+256*s68+(ptrdiff_t)0);
__m512 dat2535 = _mm512_loadu_ps(arrangedDats13+213760*i82+213760*j73+256*s68+(ptrdiff_t)64);
__m512 dat2536 = _mm512_loadu_ps(arrangedDats13+213760*i82+213760*j73+256*s68+(ptrdiff_t)128);
__m512 dat2537 = _mm512_loadu_ps(arrangedDats13+213760*i82+213760*j73+256*s68+(ptrdiff_t)192);
__m512 wt1049 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+8*s68+(ptrdiff_t)8));
sum781 = _mm512_fmadd_ps(wt1049, dat2534, sum781);
sum782 = _mm512_fmadd_ps(wt1049, dat2535, sum782);
sum783 = _mm512_fmadd_ps(wt1049, dat2536, sum783);
sum784 = _mm512_fmadd_ps(wt1049, dat2537, sum784);
__m512 wt1050 = _mm512_set1_ps(*(float*)(arrangedWts13+1712128*i82+20064*k191+8*s68+(ptrdiff_t)12));
sum785 = _mm512_fmadd_ps(wt1050, dat2534, sum785);
sum786 = _mm512_fmadd_ps(wt1050, dat2535, sum786);
sum787 = _mm512_fmadd_ps(wt1050, dat2536, sum787);
sum788 = _mm512_fmadd_ps(wt1050, dat2537, sum788);
}
// Accumulate into and store back the two tail rows, same pattern as above.
sum781 = _mm512_add_ps(sum781, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)0));
sum782 = _mm512_add_ps(sum782, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)64));
sum783 = _mm512_add_ps(sum783, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)128));
sum784 = _mm512_add_ps(sum784, _mm512_maskz_loadu_ps(1, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)192));
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)0, 65535, sum781);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)64, 65535, sum782);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)128, 65535, sum783);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)192, 1, sum784);
sum785 = _mm512_add_ps(sum785, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)320));
sum786 = _mm512_add_ps(sum786, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)384));
sum787 = _mm512_add_ps(sum787, _mm512_maskz_loadu_ps(65535, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)448));
sum788 = _mm512_add_ps(sum788, _mm512_maskz_loadu_ps(1, datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)512));
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)320, 65535, sum785);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)384, 65535, sum786);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)448, 65535, sum787);
_mm512_mask_storeu_ps(datPtr46+163840*i82+256*j73+1920*k191+(ptrdiff_t)512, 1, sum788);
}
}

static void ResNet50OneApply12Callee3(ResNet50ThreaderTask1* task142, int64_t* pt77) {
void** pair40 = task142->any1;
char** tensors140 = pair40[0];
ptrdiff_t e43 = 2;
ptrdiff_t g47 = 0;
ptrdiff_t d30 = 0;
ptrdiff_t w82 = pt77[0];
char*restrict arrangedWts14 = tensors140[0]+1712128*e43+(ptrdiff_t)776192*1*g47;
char*restrict arrangedDats14 = tensors140[1]+213760*e43+(ptrdiff_t)96768*1*g47;
char*restrict datPtr47 = tensors140[2]+(ptrdiff_t)163840*1*g47;
ptrdiff_t ii40 = 1;
for (ptrdiff_t i83 = 0; i83 < ii40; ++i83) {
ptrdiff_t j74 = 1*d30;
ptrdiff_t k192 = 2*w82;
ptrdiff_t kk71 = k192+1;
for (; k192 != 85; ++k192) {
ptrdiff_t s69 = -1;
__m512 sum789 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+24*s69+(ptrdiff_t)24));
__m512 sum793 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+24*s69+(ptrdiff_t)28));
__m512 sum797 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+24*s69+(ptrdiff_t)32));
__m512 sum801 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+24*s69+(ptrdiff_t)36));
__m512 sum805 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+24*s69+(ptrdiff_t)40));
__m512 sum809 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+24*s69+(ptrdiff_t)44));
__m512 sum790 = sum789;
__m512 sum791 = sum789;
__m512 sum792 = sum789;
__m512 sum794 = sum793;
__m512 sum795 = sum793;
__m512 sum796 = sum793;
__m512 sum798 = sum797;
__m512 sum799 = sum797;
__m512 sum800 = sum797;
__m512 sum802 = sum801;
__m512 sum803 = sum801;
__m512 sum804 = sum801;
__m512 sum806 = sum805;
__m512 sum807 = sum805;
__m512 sum808 = sum805;
__m512 sum810 = sum809;
__m512 sum811 = sum809;
__m512 sum812 = sum809;
for (s69 = 0; s69 < 378; ++s69) {
__m512 dat2538 = _mm512_loadu_ps(arrangedDats14+96768*i83+96768*j74+256*s69+(ptrdiff_t)0);
__m512 dat2539 = _mm512_loadu_ps(arrangedDats14+96768*i83+96768*j74+256*s69+(ptrdiff_t)64);
__m512 dat2540 = _mm512_loadu_ps(arrangedDats14+96768*i83+96768*j74+256*s69+(ptrdiff_t)128);
__m512 dat2541 = _mm512_loadu_ps(arrangedDats14+96768*i83+96768*j74+256*s69+(ptrdiff_t)192);
__m512 wt1051 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+24*s69+(ptrdiff_t)24));
sum789 = _mm512_fmadd_ps(wt1051, dat2538, sum789);
sum790 = _mm512_fmadd_ps(wt1051, dat2539, sum790);
sum791 = _mm512_fmadd_ps(wt1051, dat2540, sum791);
sum792 = _mm512_fmadd_ps(wt1051, dat2541, sum792);
__m512 wt1052 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+24*s69+(ptrdiff_t)28));
sum793 = _mm512_fmadd_ps(wt1052, dat2538, sum793);
sum794 = _mm512_fmadd_ps(wt1052, dat2539, sum794);
sum795 = _mm512_fmadd_ps(wt1052, dat2540, sum795);
sum796 = _mm512_fmadd_ps(wt1052, dat2541, sum796);
__m512 wt1053 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+24*s69+(ptrdiff_t)32));
sum797 = _mm512_fmadd_ps(wt1053, dat2538, sum797);
sum798 = _mm512_fmadd_ps(wt1053, dat2539, sum798);
sum799 = _mm512_fmadd_ps(wt1053, dat2540, sum799);
sum800 = _mm512_fmadd_ps(wt1053, dat2541, sum800);
__m512 wt1054 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+24*s69+(ptrdiff_t)36));
sum801 = _mm512_fmadd_ps(wt1054, dat2538, sum801);
sum802 = _mm512_fmadd_ps(wt1054, dat2539, sum802);
sum803 = _mm512_fmadd_ps(wt1054, dat2540, sum803);
sum804 = _mm512_fmadd_ps(wt1054, dat2541, sum804);
__m512 wt1055 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+24*s69+(ptrdiff_t)40));
sum805 = _mm512_fmadd_ps(wt1055, dat2538, sum805);
sum806 = _mm512_fmadd_ps(wt1055, dat2539, sum806);
sum807 = _mm512_fmadd_ps(wt1055, dat2540, sum807);
sum808 = _mm512_fmadd_ps(wt1055, dat2541, sum808);
__m512 wt1056 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+24*s69+(ptrdiff_t)44));
sum809 = _mm512_fmadd_ps(wt1056, dat2538, sum809);
sum810 = _mm512_fmadd_ps(wt1056, dat2539, sum810);
sum811 = _mm512_fmadd_ps(wt1056, dat2540, sum811);
sum812 = _mm512_fmadd_ps(wt1056, dat2541, sum812);
}
sum789 = _mm512_add_ps(sum789, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)0));
sum790 = _mm512_add_ps(sum790, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)64));
sum791 = _mm512_add_ps(sum791, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)128));
sum792 = _mm512_add_ps(sum792, _mm512_maskz_loadu_ps(1, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)192));
sum789 = _mm512_max_ps(_mm512_setzero_ps(), sum789);
sum790 = _mm512_max_ps(_mm512_setzero_ps(), sum790);
sum791 = _mm512_max_ps(_mm512_setzero_ps(), sum791);
sum792 = _mm512_max_ps(_mm512_setzero_ps(), sum792);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)0, 65535, sum789);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)64, 65535, sum790);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)128, 65535, sum791);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)192, 1, sum792);
sum793 = _mm512_add_ps(sum793, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)320));
sum794 = _mm512_add_ps(sum794, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)384));
sum795 = _mm512_add_ps(sum795, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)448));
sum796 = _mm512_add_ps(sum796, _mm512_maskz_loadu_ps(1, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)512));
sum793 = _mm512_max_ps(_mm512_setzero_ps(), sum793);
sum794 = _mm512_max_ps(_mm512_setzero_ps(), sum794);
sum795 = _mm512_max_ps(_mm512_setzero_ps(), sum795);
sum796 = _mm512_max_ps(_mm512_setzero_ps(), sum796);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)320, 65535, sum793);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)384, 65535, sum794);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)448, 65535, sum795);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)512, 1, sum796);
sum797 = _mm512_add_ps(sum797, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)640));
sum798 = _mm512_add_ps(sum798, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)704));
sum799 = _mm512_add_ps(sum799, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)768));
sum800 = _mm512_add_ps(sum800, _mm512_maskz_loadu_ps(1, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)832));
sum797 = _mm512_max_ps(_mm512_setzero_ps(), sum797);
sum798 = _mm512_max_ps(_mm512_setzero_ps(), sum798);
sum799 = _mm512_max_ps(_mm512_setzero_ps(), sum799);
sum800 = _mm512_max_ps(_mm512_setzero_ps(), sum800);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)640, 65535, sum797);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)704, 65535, sum798);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)768, 65535, sum799);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)832, 1, sum800);
sum801 = _mm512_add_ps(sum801, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)960));
sum802 = _mm512_add_ps(sum802, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1024));
sum803 = _mm512_add_ps(sum803, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1088));
sum804 = _mm512_add_ps(sum804, _mm512_maskz_loadu_ps(1, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1152));
sum801 = _mm512_max_ps(_mm512_setzero_ps(), sum801);
sum802 = _mm512_max_ps(_mm512_setzero_ps(), sum802);
sum803 = _mm512_max_ps(_mm512_setzero_ps(), sum803);
sum804 = _mm512_max_ps(_mm512_setzero_ps(), sum804);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)960, 65535, sum801);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1024, 65535, sum802);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1088, 65535, sum803);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1152, 1, sum804);
sum805 = _mm512_add_ps(sum805, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1280));
sum806 = _mm512_add_ps(sum806, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1344));
sum807 = _mm512_add_ps(sum807, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1408));
sum808 = _mm512_add_ps(sum808, _mm512_maskz_loadu_ps(1, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1472));
sum805 = _mm512_max_ps(_mm512_setzero_ps(), sum805);
sum806 = _mm512_max_ps(_mm512_setzero_ps(), sum806);
sum807 = _mm512_max_ps(_mm512_setzero_ps(), sum807);
sum808 = _mm512_max_ps(_mm512_setzero_ps(), sum808);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1280, 65535, sum805);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1344, 65535, sum806);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1408, 65535, sum807);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1472, 1, sum808);
sum809 = _mm512_add_ps(sum809, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1600));
sum810 = _mm512_add_ps(sum810, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1664));
sum811 = _mm512_add_ps(sum811, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1728));
sum812 = _mm512_add_ps(sum812, _mm512_maskz_loadu_ps(1, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1792));
sum809 = _mm512_max_ps(_mm512_setzero_ps(), sum809);
sum810 = _mm512_max_ps(_mm512_setzero_ps(), sum810);
sum811 = _mm512_max_ps(_mm512_setzero_ps(), sum811);
sum812 = _mm512_max_ps(_mm512_setzero_ps(), sum812);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1600, 65535, sum809);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1664, 65535, sum810);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1728, 65535, sum811);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)1792, 1, sum812);
if (k192 >= kk71) return;
}
ptrdiff_t s70 = -1;
__m512 sum813 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+8*s70+(ptrdiff_t)8));
__m512 sum817 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+8*s70+(ptrdiff_t)12));
__m512 sum814 = sum813;
__m512 sum815 = sum813;
__m512 sum816 = sum813;
__m512 sum818 = sum817;
__m512 sum819 = sum817;
__m512 sum820 = sum817;
for (s70 = 0; s70 < 378; ++s70) {
__m512 dat2542 = _mm512_loadu_ps(arrangedDats14+96768*i83+96768*j74+256*s70+(ptrdiff_t)0);
__m512 dat2543 = _mm512_loadu_ps(arrangedDats14+96768*i83+96768*j74+256*s70+(ptrdiff_t)64);
__m512 dat2544 = _mm512_loadu_ps(arrangedDats14+96768*i83+96768*j74+256*s70+(ptrdiff_t)128);
__m512 dat2545 = _mm512_loadu_ps(arrangedDats14+96768*i83+96768*j74+256*s70+(ptrdiff_t)192);
__m512 wt1057 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+8*s70+(ptrdiff_t)8));
sum813 = _mm512_fmadd_ps(wt1057, dat2542, sum813);
sum814 = _mm512_fmadd_ps(wt1057, dat2543, sum814);
sum815 = _mm512_fmadd_ps(wt1057, dat2544, sum815);
sum816 = _mm512_fmadd_ps(wt1057, dat2545, sum816);
__m512 wt1058 = _mm512_set1_ps(*(float*)(arrangedWts14+776192*i83+9096*k192+8*s70+(ptrdiff_t)12));
sum817 = _mm512_fmadd_ps(wt1058, dat2542, sum817);
sum818 = _mm512_fmadd_ps(wt1058, dat2543, sum818);
sum819 = _mm512_fmadd_ps(wt1058, dat2544, sum819);
sum820 = _mm512_fmadd_ps(wt1058, dat2545, sum820);
}
sum813 = _mm512_add_ps(sum813, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)0));
sum814 = _mm512_add_ps(sum814, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)64));
sum815 = _mm512_add_ps(sum815, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)128));
sum816 = _mm512_add_ps(sum816, _mm512_maskz_loadu_ps(1, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)192));
sum813 = _mm512_max_ps(_mm512_setzero_ps(), sum813);
sum814 = _mm512_max_ps(_mm512_setzero_ps(), sum814);
sum815 = _mm512_max_ps(_mm512_setzero_ps(), sum815);
sum816 = _mm512_max_ps(_mm512_setzero_ps(), sum816);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)0, 65535, sum813);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)64, 65535, sum814);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)128, 65535, sum815);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)192, 1, sum816);
sum817 = _mm512_add_ps(sum817, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)320));
sum818 = _mm512_add_ps(sum818, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)384));
sum819 = _mm512_add_ps(sum819, _mm512_maskz_loadu_ps(65535, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)448));
sum820 = _mm512_add_ps(sum820, _mm512_maskz_loadu_ps(1, datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)512));
sum817 = _mm512_max_ps(_mm512_setzero_ps(), sum817);
sum818 = _mm512_max_ps(_mm512_setzero_ps(), sum818);
sum819 = _mm512_max_ps(_mm512_setzero_ps(), sum819);
sum820 = _mm512_max_ps(_mm512_setzero_ps(), sum820);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)320, 65535, sum817);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)384, 65535, sum818);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)448, 65535, sum819);
_mm512_mask_storeu_ps(datPtr47+163840*i83+256*j74+1920*k192+(ptrdiff_t)512, 1, sum820);
}
}

// Dispatches the three callees of this step to the thread team, one after
// another. Every callee receives the same two-slot argument array:
// slot 0 is the tensor table, slot 1 is a small integer tag (0, then 1, then 2)
// smuggled through a void*.
static void ResNet50OneApply12(ResNet50ThreaderTeam1* team82, char** tensors137) {
	void* args[2] = {tensors137, 0};

	// First callee: tag 0, hull 86 x 1 x 1.
	ResNet50ThreaderTask1 first;
	first.nd1 = 3;
	first.hull1[0] = 86;
	first.hull1[1] = 1;
	first.hull1[2] = 1;
	first.any1 = args;
	first.callee1 = ResNet50OneApply12Callee1;
	ResNet50ThreaderDo1(team82, &first);

	// Second callee: the tag range [1, 2) yields a single pass with tag 1.
	for (ptrdiff_t tag = 1; tag < 2; ++tag) {
		args[1] = (void*)tag;
		ResNet50ThreaderTask1 middle;
		middle.nd1 = 3;
		middle.hull1[0] = 86;
		middle.hull1[1] = 1;
		middle.hull1[2] = 1;
		middle.any1 = args;
		middle.callee1 = ResNet50OneApply12Callee2;
		ResNet50ThreaderDo1(team82, &middle);
	}

	// Third callee: tag 2, hull 43 x 1 x 1.
	args[1] = (void*)2;
	ResNet50ThreaderTask1 last;
	last.nd1 = 3;
	last.hull1[0] = 43;
	last.hull1[1] = 1;
	last.hull1[2] = 1;
	last.any1 = args;
	last.callee1 = ResNet50OneApply12Callee3;
	ResNet50ThreaderDo1(team82, &last);
}

// Thread-team callee that rearranges filter weights and bias values.
//
// task20->any1 is the tensor table: [0] raw weights, [1] bias, [2] interleaved
// (multiplier, addend) float pairs, [3] output buffer. pt15[0] selects a pair
// of j10 steps (2*pt15[0] and 2*pt15[0]+1). Each j10 step handles four filters
// of 2304 bytes each (64 slices of 9 floats), scales them by the per-filter
// multiplier from tensors18[2], pushes them through a fixed linear transform,
// converts the result to half precision and stores it after the first 256
// bytes of tensors18[3]. The first 256 bytes of tensors18[3] receive
// bias*multiplier+addend, four floats per j10 step.
// NOTE(review): the constants look like a Winograd-style 3x3 filter transform
// with batch-norm folded into the weights — confirm against the generator.
static void ResNet50ThreeArrangeFilts1Callee1(ResNet50ThreaderTask1* task20, int64_t* pt15) {
char** tensors18 = task20->any1;
ptrdiff_t b45 = pt15[0];
// g7 and e7 are fixed at zero in this instance, so every term scaled by them
// (and by i15 below) contributes nothing to the addresses.
ptrdiff_t g7 = 0;
ptrdiff_t e7 = 0;
// Output bias region: starts at the beginning of tensors18[3].
char*restrict bfPtr4 = tensors18[3]+256*e7;
// Output weight region: starts 256 bytes into tensors18[3].
char*restrict wfPtr4 = tensors18[3]+256+3244032*e7;
char*restrict wtPtr4 = tensors18[0]+14256*e7;
char*restrict biasPtr4 = tensors18[1];
char*restrict bnPtr4 = tensors18[2];
ptrdiff_t i15 = 1*g7;
// This call covers j10 = 2*b45 and 2*b45+1; jj22 marks the last step to run.
ptrdiff_t j10 = 2*b45;
ptrdiff_t jj22 = j10+1;
if (j10 < 16) {
for (; j10 != 16; ++j10) {
ptrdiff_t k51 = 0+1*j10;
ptrdiff_t cut4 = 0;
// Broadcast element [0] of the float pair for each of the four filters of
// this step; it is used as a multiplier on the raw weights below.
__m512 postMul9 = _mm512_set1_ps(((float*)bnPtr4+(ptrdiff_t)2*(0+64*i15+4*j10))[0]);
__m512 postMul10 = _mm512_set1_ps(((float*)bnPtr4+(ptrdiff_t)2*(1+64*i15+4*j10))[0]);
__m512 postMul11 = _mm512_set1_ps(((float*)bnPtr4+(ptrdiff_t)2*(2+64*i15+4*j10))[0]);
__m512 postMul12 = _mm512_set1_ps(((float*)bnPtr4+(ptrdiff_t)2*(3+64*i15+4*j10))[0]);
ptrdiff_t s12 = 0;
// One iteration per 36-byte slice (9 floats) of each of the four filters.
for (; s12 != 64; ++s12) {
// Mask 511 keeps the low 9 lanes and zeroes the rest; the four loads are
// 2304 bytes apart, i.e. one filter apart.
__m512 wt103 = _mm512_maskz_loadu_ps(511, wtPtr4+0+147456*i15+9216*j10+36*s12);
__m512 wt104 = _mm512_maskz_loadu_ps(511, wtPtr4+2304+147456*i15+9216*j10+36*s12);
__m512 wt105 = _mm512_maskz_loadu_ps(511, wtPtr4+4608+147456*i15+9216*j10+36*s12);
__m512 wt106 = _mm512_maskz_loadu_ps(511, wtPtr4+6912+147456*i15+9216*j10+36*s12);
// Apply the per-filter multiplier to the raw weights.
wt103 = _mm512_mul_ps(wt103, postMul9);
wt104 = _mm512_mul_ps(wt104, postMul10);
wt105 = _mm512_mul_ps(wt105, postMul11);
wt106 = _mm512_mul_ps(wt106, postMul12);
// Two rounds of two-source permutes interleave the four filters in groups of
// three values. Afterwards the low 12 lanes of in11/in12/in13 hold values
// 0-2 / 3-5 / 6-8 of filters 0,1,2,3 (three lanes per filter).
__m512i pm63 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm64 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp241 = _mm512_permutex2var_ps(wt103, pm63, wt105);
__m512 tmp242 = _mm512_permutex2var_ps(wt104, pm63, wt106);
__m512 tmp243 = _mm512_permutex2var_ps(wt103, pm64, wt105);
__m512 tmp244 = _mm512_permutex2var_ps(wt104, pm64, wt106);
__m512 in11 = _mm512_permutex2var_ps(tmp241, pm63, tmp242);
__m512 in12 = _mm512_permutex2var_ps(tmp241, pm64, tmp242);
__m512 in13 = _mm512_permutex2var_ps(tmp243, pm63, tmp244);
// First pass: expand the three inputs into eight linear combinations
// (in11, tmp248, tmp246, tmp249, tmp247, tmp245, tmp250, in13) using the
// coefficients 1, 2 and 4.
__m512 tmp245 = _mm512_fmadd_ps(in11, _mm512_set1_ps(4e+00f), in13);
__m512 tmp246 = _mm512_add_ps(in11, in13);
__m512 tmp247 = _mm512_fmadd_ps(in13, _mm512_set1_ps(4e+00f), in11);
__m512 tmp248 = _mm512_add_ps(in12, tmp246);
__m512 tmp249 = _mm512_fmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp247);
tmp247 = _mm512_fnmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp247);
__m512 tmp250 = _mm512_fnmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp245);
tmp245 = _mm512_fmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp245);
tmp246 = _mm512_sub_ps(tmp246, in12);
// Regroup the eight result vectors with an unpack / shuffle_ps /
// shuffle_f32x4 ladder.
// NOTE(review): this appears to be a transpose so the second pass can run
// along the other filter axis — confirm.
__m512 tmp267 = _mm512_unpacklo_ps(in11, tmp248);
__m512 tmp268 = _mm512_unpackhi_ps(in11, tmp248);
__m512 tmp269 = _mm512_unpacklo_ps(tmp246, tmp249);
__m512 tmp270 = _mm512_unpackhi_ps(tmp246, tmp249);
__m512 tmp271 = _mm512_unpacklo_ps(tmp247, tmp245);
__m512 tmp272 = _mm512_unpackhi_ps(tmp247, tmp245);
__m512 tmp273 = _mm512_unpacklo_ps(tmp250, in13);
__m512 tmp274 = _mm512_unpackhi_ps(tmp250, in13);
__m512 tmp275 = _mm512_shuffle_ps(tmp267, tmp269, 68);
__m512 tmp276 = _mm512_shuffle_ps(tmp267, tmp269, 238);
__m512 tmp277 = _mm512_shuffle_ps(tmp268, tmp270, 68);
__m512 tmp278 = _mm512_shuffle_ps(tmp268, tmp270, 238);
__m512 tmp279 = _mm512_shuffle_ps(tmp271, tmp273, 68);
__m512 tmp280 = _mm512_shuffle_ps(tmp271, tmp273, 238);
__m512 tmp281 = _mm512_shuffle_ps(tmp272, tmp274, 68);
__m512 tmp282 = _mm512_shuffle_ps(tmp272, tmp274, 238);
__m512 tmp283 = _mm512_shuffle_f32x4(tmp275, tmp279, 136);
__m512 tmp284 = _mm512_shuffle_f32x4(tmp275, tmp279, 221);
__m512 tmp285 = _mm512_shuffle_f32x4(tmp276, tmp280, 136);
__m512 tmp286 = _mm512_shuffle_f32x4(tmp276, tmp280, 221);
__m512 tmp287 = _mm512_shuffle_f32x4(tmp277, tmp281, 136);
__m512 tmp288 = _mm512_shuffle_f32x4(tmp277, tmp281, 221);
__m512 tmp289 = _mm512_shuffle_f32x4(tmp278, tmp282, 136);
__m512 tmp290 = _mm512_shuffle_f32x4(tmp278, tmp282, 221);
in11 = _mm512_shuffle_f32x4(tmp283, tmp283, 136);
__m512 tmp251 = _mm512_shuffle_f32x4(tmp283, tmp283, 221);
tmp248 = _mm512_shuffle_f32x4(tmp285, tmp285, 136);
__m512 tmp252 = _mm512_shuffle_f32x4(tmp285, tmp285, 221);
tmp246 = _mm512_shuffle_f32x4(tmp287, tmp287, 136);
__m512 tmp253 = _mm512_shuffle_f32x4(tmp287, tmp287, 221);
tmp249 = _mm512_shuffle_f32x4(tmp289, tmp289, 136);
__m512 tmp254 = _mm512_shuffle_f32x4(tmp289, tmp289, 221);
tmp247 = _mm512_shuffle_f32x4(tmp284, tmp284, 136);
tmp245 = _mm512_shuffle_f32x4(tmp286, tmp286, 136);
tmp250 = _mm512_shuffle_f32x4(tmp288, tmp288, 136);
in13 = _mm512_shuffle_f32x4(tmp290, tmp290, 136);
// Pack the regrouped data into two triples of vectors:
// (in11, tmp248, tmp246) and (tmp250, in13, tmp251).
in11 = _mm512_shuffle_f32x4(in11, tmp249, 68);
tmp248 = _mm512_shuffle_f32x4(tmp248, tmp247, 68);
tmp246 = _mm512_shuffle_f32x4(tmp246, tmp245, 68);
tmp250 = _mm512_shuffle_f32x4(tmp250, tmp252, 68);
in13 = _mm512_shuffle_f32x4(in13, tmp253, 68);
tmp251 = _mm512_shuffle_f32x4(tmp251, tmp254, 68);
// Second pass: the same 3-to-8 expansion as above, applied to both triples
// in lock-step (statements alternate between the two triples).
__m512 tmp255 = _mm512_fmadd_ps(in11, _mm512_set1_ps(4e+00f), tmp246);
__m512 tmp261 = _mm512_fmadd_ps(tmp250, _mm512_set1_ps(4e+00f), tmp251);
__m512 tmp256 = _mm512_add_ps(in11, tmp246);
__m512 tmp262 = _mm512_add_ps(tmp250, tmp251);
__m512 tmp257 = _mm512_fmadd_ps(tmp246, _mm512_set1_ps(4e+00f), in11);
__m512 tmp263 = _mm512_fmadd_ps(tmp251, _mm512_set1_ps(4e+00f), tmp250);
__m512 tmp258 = _mm512_add_ps(tmp248, tmp256);
__m512 tmp264 = _mm512_add_ps(in13, tmp262);
__m512 tmp259 = _mm512_fmadd_ps(tmp248, _mm512_set1_ps(2e+00f), tmp257);
__m512 tmp265 = _mm512_fmadd_ps(in13, _mm512_set1_ps(2e+00f), tmp263);
tmp257 = _mm512_fnmadd_ps(tmp248, _mm512_set1_ps(2e+00f), tmp257);
tmp263 = _mm512_fnmadd_ps(in13, _mm512_set1_ps(2e+00f), tmp263);
__m512 tmp260 = _mm512_fnmadd_ps(tmp248, _mm512_set1_ps(2e+00f), tmp255);
__m512 tmp266 = _mm512_fnmadd_ps(in13, _mm512_set1_ps(2e+00f), tmp261);
tmp255 = _mm512_fmadd_ps(tmp248, _mm512_set1_ps(2e+00f), tmp255);
tmp261 = _mm512_fmadd_ps(in13, _mm512_set1_ps(2e+00f), tmp261);
tmp256 = _mm512_sub_ps(tmp256, tmp248);
tmp262 = _mm512_sub_ps(tmp262, in13);
// Scale every lane by a fixed constant. Each constant vector repeats the same
// eight-value pattern in both 256-bit halves, and the same set of eight
// vectors is applied to each of the two triples' results.
in11 = _mm512_mul_ps(in11, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp258 = _mm512_mul_ps(tmp258, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp256 = _mm512_mul_ps(tmp256, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp259 = _mm512_mul_ps(tmp259, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp257 = _mm512_mul_ps(tmp257, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp255 = _mm512_mul_ps(tmp255, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp260 = _mm512_mul_ps(tmp260, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp246 = _mm512_mul_ps(tmp246, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp250 = _mm512_mul_ps(tmp250, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp264 = _mm512_mul_ps(tmp264, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp262 = _mm512_mul_ps(tmp262, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp265 = _mm512_mul_ps(tmp265, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp263 = _mm512_mul_ps(tmp263, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp261 = _mm512_mul_ps(tmp261, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp266 = _mm512_mul_ps(tmp266, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp251 = _mm512_mul_ps(tmp251, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
// Pair up the scaled vectors: immediate 68 joins the low 256-bit halves of the
// two operands, 238 joins their high halves, giving 16 output vectors.
__m512 out7 = _mm512_shuffle_f32x4(in11, tmp258, 68);
__m512 out11 = _mm512_shuffle_f32x4(in11, tmp258, 238);
__m512 out8 = _mm512_shuffle_f32x4(tmp256, tmp259, 68);
__m512 out12 = _mm512_shuffle_f32x4(tmp256, tmp259, 238);
__m512 out9 = _mm512_shuffle_f32x4(tmp257, tmp255, 68);
__m512 out13 = _mm512_shuffle_f32x4(tmp257, tmp255, 238);
__m512 out10 = _mm512_shuffle_f32x4(tmp260, tmp246, 68);
__m512 out14 = _mm512_shuffle_f32x4(tmp260, tmp246, 238);
__m512 out15 = _mm512_shuffle_f32x4(tmp250, tmp264, 68);
__m512 out19 = _mm512_shuffle_f32x4(tmp250, tmp264, 238);
__m512 out16 = _mm512_shuffle_f32x4(tmp262, tmp265, 68);
__m512 out20 = _mm512_shuffle_f32x4(tmp262, tmp265, 238);
__m512 out17 = _mm512_shuffle_f32x4(tmp263, tmp261, 68);
__m512 out21 = _mm512_shuffle_f32x4(tmp263, tmp261, 238);
__m512 out18 = _mm512_shuffle_f32x4(tmp266, tmp251, 68);
__m512 out22 = _mm512_shuffle_f32x4(tmp266, tmp251, 238);
// Byte offsets of the four 32-byte sub-slots within a 128-byte record.
// With cut4 == 0 these evaluate to 0, 32, 64 and 96.
ptrdiff_t off1 = 32*cut4;
ptrdiff_t off2 = (size_t)(cut4+1)/4*8192+(size_t)(cut4+1)%4*32;
ptrdiff_t off3 = (size_t)(cut4+2)/4*8192+(size_t)(cut4+2)%4*32;
ptrdiff_t off4 = (size_t)(cut4+3)/4*8192+(size_t)(cut4+3)%4*32;
// Convert each output vector from 16 floats to 16 half-precision values
// (round to nearest, exceptions suppressed); the 256-bit result is widened to
// a 512-bit register only so the masked 512-bit store below can be used.
__m512i wf33 = _mm512_castsi256_si512(_mm512_cvtps_ph(out7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf34 = _mm512_castsi256_si512(_mm512_cvtps_ph(out11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf35 = _mm512_castsi256_si512(_mm512_cvtps_ph(out15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf36 = _mm512_castsi256_si512(_mm512_cvtps_ph(out19, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf37 = _mm512_castsi256_si512(_mm512_cvtps_ph(out8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf38 = _mm512_castsi256_si512(_mm512_cvtps_ph(out12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf39 = _mm512_castsi256_si512(_mm512_cvtps_ph(out16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf40 = _mm512_castsi256_si512(_mm512_cvtps_ph(out20, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf41 = _mm512_castsi256_si512(_mm512_cvtps_ph(out9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf42 = _mm512_castsi256_si512(_mm512_cvtps_ph(out13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf43 = _mm512_castsi256_si512(_mm512_cvtps_ph(out17, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf44 = _mm512_castsi256_si512(_mm512_cvtps_ph(out21, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf45 = _mm512_castsi256_si512(_mm512_cvtps_ph(out10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf46 = _mm512_castsi256_si512(_mm512_cvtps_ph(out14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf47 = _mm512_castsi256_si512(_mm512_cvtps_ph(out18, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf48 = _mm512_castsi256_si512(_mm512_cvtps_ph(out22, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
// Mask 255 writes only the low eight 32-bit lanes (32 bytes = 16 halves).
// Four planes 131072 bytes apart each receive four 32-byte pieces at
// off1..off4, inside the 8192-byte block for k51 and the 128-byte record for s12.
_mm512_mask_storeu_epi32(wfPtr4+0+524288*i15+8192*k51+off1+128*s12, 255, wf33);
_mm512_mask_storeu_epi32(wfPtr4+0+524288*i15+8192*k51+off2+128*s12, 255, wf34);
_mm512_mask_storeu_epi32(wfPtr4+0+524288*i15+8192*k51+off3+128*s12, 255, wf35);
_mm512_mask_storeu_epi32(wfPtr4+0+524288*i15+8192*k51+off4+128*s12, 255, wf36);
_mm512_mask_storeu_epi32(wfPtr4+131072+524288*i15+8192*k51+off1+128*s12, 255, wf37);
_mm512_mask_storeu_epi32(wfPtr4+131072+524288*i15+8192*k51+off2+128*s12, 255, wf38);
_mm512_mask_storeu_epi32(wfPtr4+131072+524288*i15+8192*k51+off3+128*s12, 255, wf39);
_mm512_mask_storeu_epi32(wfPtr4+131072+524288*i15+8192*k51+off4+128*s12, 255, wf40);
_mm512_mask_storeu_epi32(wfPtr4+262144+524288*i15+8192*k51+off1+128*s12, 255, wf41);
_mm512_mask_storeu_epi32(wfPtr4+262144+524288*i15+8192*k51+off2+128*s12, 255, wf42);
_mm512_mask_storeu_epi32(wfPtr4+262144+524288*i15+8192*k51+off3+128*s12, 255, wf43);
_mm512_mask_storeu_epi32(wfPtr4+262144+524288*i15+8192*k51+off4+128*s12, 255, wf44);
_mm512_mask_storeu_epi32(wfPtr4+393216+524288*i15+8192*k51+off1+128*s12, 255, wf45);
_mm512_mask_storeu_epi32(wfPtr4+393216+524288*i15+8192*k51+off2+128*s12, 255, wf46);
_mm512_mask_storeu_epi32(wfPtr4+393216+524288*i15+8192*k51+off3+128*s12, 255, wf47);
_mm512_mask_storeu_epi32(wfPtr4+393216+524288*i15+8192*k51+off4+128*s12, 255, wf48);
}
// Bias for the four filters of this step. e7 is always 0 here, so the branch
// is always taken.
__m512 bias2 = _mm512_setzero_ps();
if (!e7) {
// Mask 15 loads four bias floats.
bias2 = _mm512_maskz_loadu_ps(15, biasPtr4-0+256*i15+16*j10);
// Even indices pick the multipliers and odd indices the addends out of the
// eight interleaved floats loaded into mas4.
__m512i pmMul7 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd7 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas4 = _mm512_maskz_loadu_ps(255, bnPtr4+(ptrdiff_t)8*(0+64*i15+4*j10));
__m512 postMul13 = _mm512_permutexvar_ps(pmMul7, mas4);
__m512 postAdd7 = _mm512_permutexvar_ps(pmAdd7, mas4);
// bias = bias * multiplier + addend
bias2 = _mm512_fmadd_ps(bias2, postMul13, postAdd7);
}
// Only the low four lanes (16 bytes) are written to the bias region.
_mm512_mask_storeu_ps(bfPtr4-0+256*i15+16*j10, 15, bias2);
// Stop once the second step of this call's pair has been processed.
if (j10 >= jj22) return;
}
}
}

// Runs the filter-arrangement callee on the thread team over an 8 x 1 x 1
// hull, passing the tensor table through unchanged as the task argument.
static void ResNet50ThreeArrangeFilts1(ResNet50ThreaderTeam1* team22, char** tensors17) {
	ResNet50ThreaderTask1 job;

	// Iteration space: three dimensions, only the first has extent > 1.
	job.nd1 = 3;
	job.hull1[0] = 8;
	job.hull1[1] = 1;
	job.hull1[2] = 1;

	// Work function and its opaque argument.
	job.any1 = tensors17;
	job.callee1 = ResNet50ThreeArrangeFilts1Callee1;

	ResNet50ThreaderDo1(team22, &job);
}

static void ResNet50ThreeArrangeDats1Callee1(ResNet50ThreaderTask1* task22, int64_t* pt16) {
char** tensors20 = task22->any1;
ptrdiff_t s13 = 0;
ptrdiff_t c11 = pt16[1];
ptrdiff_t g8 = 0;
ptrdiff_t e8 = 0;
char*restrict datPtr5 = tensors20[0]-228+4992768*e8;
char*restrict dfPtr4 = tensors20[1]+10137600*e8;
ptrdiff_t i16 = 1*g8;
ptrdiff_t j11 = 2*c11;
ptrdiff_t last3 = j11+(c11 < 7 ? 1 : 2);
if (j11 < 2) {
ptrdiff_t rel7 = j11-0;
ptrdiff_t base7 = 0;
if (rel7 < 1) {
ptrdiff_t h20 = base7+0;
ptrdiff_t w23 = 0;
ptrdiff_t k52 = 0;
for (; k52 != 32; ++k52) {
__m512 dat921 = _mm512_maskz_loadu_ps(8191, datPtr5+228+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat921 = _mm512_max_ps(_mm512_setzero_ps(), dat921);
__m512 dat922 = _mm512_maskz_loadu_ps(16383, datPtr5+272+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat922 = _mm512_max_ps(_mm512_setzero_ps(), dat922);
__m512i pm65 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in14 = _mm512_permutexvar_ps(pm65, dat921);
__m512i pm66 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in21 = _mm512_permutexvar_ps(pm66, dat922);
__m512 dat923 = _mm512_maskz_loadu_ps(8191, datPtr5+452+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat923 = _mm512_max_ps(_mm512_setzero_ps(), dat923);
__m512 dat924 = _mm512_maskz_loadu_ps(16383, datPtr5+496+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat924 = _mm512_max_ps(_mm512_setzero_ps(), dat924);
__m512 in15 = _mm512_permutexvar_ps(pm65, dat923);
__m512 in22 = _mm512_permutexvar_ps(pm66, dat924);
__m512 dat925 = _mm512_maskz_loadu_ps(8191, datPtr5+676+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat925 = _mm512_max_ps(_mm512_setzero_ps(), dat925);
__m512 dat926 = _mm512_maskz_loadu_ps(16383, datPtr5+720+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat926 = _mm512_max_ps(_mm512_setzero_ps(), dat926);
__m512 in16 = _mm512_permutexvar_ps(pm65, dat925);
__m512 in23 = _mm512_permutexvar_ps(pm66, dat926);
__m512 dat927 = _mm512_maskz_loadu_ps(8191, datPtr5+900+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat927 = _mm512_max_ps(_mm512_setzero_ps(), dat927);
__m512 dat928 = _mm512_maskz_loadu_ps(16383, datPtr5+944+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat928 = _mm512_max_ps(_mm512_setzero_ps(), dat928);
__m512 in17 = _mm512_permutexvar_ps(pm65, dat927);
__m512 in24 = _mm512_permutexvar_ps(pm66, dat928);
__m512 dat929 = _mm512_maskz_loadu_ps(8191, datPtr5+1124+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat929 = _mm512_max_ps(_mm512_setzero_ps(), dat929);
__m512 dat930 = _mm512_maskz_loadu_ps(16383, datPtr5+1168+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat930 = _mm512_max_ps(_mm512_setzero_ps(), dat930);
__m512 in18 = _mm512_permutexvar_ps(pm65, dat929);
__m512 in25 = _mm512_permutexvar_ps(pm66, dat930);
__m512 dat931 = _mm512_maskz_loadu_ps(8191, datPtr5+1348+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat931 = _mm512_max_ps(_mm512_setzero_ps(), dat931);
__m512 dat932 = _mm512_maskz_loadu_ps(16383, datPtr5+1392+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat932 = _mm512_max_ps(_mm512_setzero_ps(), dat932);
__m512 in19 = _mm512_permutexvar_ps(pm65, dat931);
__m512 in26 = _mm512_permutexvar_ps(pm66, dat932);
__m512 dat933 = _mm512_maskz_loadu_ps(8191, datPtr5+1572+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat933 = _mm512_max_ps(_mm512_setzero_ps(), dat933);
__m512 dat934 = _mm512_maskz_loadu_ps(16383, datPtr5+1616+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat934 = _mm512_max_ps(_mm512_setzero_ps(), dat934);
__m512 in20 = _mm512_permutexvar_ps(pm65, dat933);
__m512 in27 = _mm512_permutexvar_ps(pm66, dat934);
__m512 tmp291 = _mm512_add_ps(in14, in18);
__m512 tmp296 = _mm512_add_ps(in21, in25);
__m512 tmp292 = _mm512_sub_ps(in17, in15);
__m512 tmp297 = _mm512_sub_ps(in24, in22);
__m512 tmp293 = _mm512_add_ps(in15, in19);
__m512 tmp298 = _mm512_add_ps(in22, in26);
__m512 tmp294 = _mm512_sub_ps(_mm512_setzero_ps(), in19);
__m512 tmp299 = _mm512_sub_ps(_mm512_setzero_ps(), in26);
tmp291 = _mm512_fmadd_ps(in16, _mm512_set1_ps(-4.25e+00f), tmp291);
tmp296 = _mm512_fmadd_ps(in23, _mm512_set1_ps(-4.25e+00f), tmp296);
tmp293 = _mm512_fmadd_ps(in17, _mm512_set1_ps(-4.25e+00f), tmp293);
tmp298 = _mm512_fmadd_ps(in24, _mm512_set1_ps(-4.25e+00f), tmp298);
tmp294 = _mm512_fmadd_ps(tmp292, _mm512_set1_ps(5.25e+00f), tmp294);
tmp299 = _mm512_fmadd_ps(tmp297, _mm512_set1_ps(5.25e+00f), tmp299);
tmp292 = _mm512_fmadd_ps(in15, _mm512_set1_ps(2.5e-01f), in19);
tmp297 = _mm512_fmadd_ps(in22, _mm512_set1_ps(2.5e-01f), in26);
in15 = _mm512_fmadd_ps(in15, _mm512_set1_ps(4e+00f), in19);
in22 = _mm512_fmadd_ps(in22, _mm512_set1_ps(4e+00f), in26);
__m512 tmp295 = _mm512_sub_ps(tmp293, tmp291);
__m512 tmp300 = _mm512_sub_ps(tmp298, tmp296);
tmp293 = _mm512_add_ps(tmp291, tmp293);
tmp298 = _mm512_add_ps(tmp296, tmp298);
tmp291 = _mm512_fmadd_ps(in14, _mm512_set1_ps(2.5e-01f), in18);
tmp296 = _mm512_fmadd_ps(in21, _mm512_set1_ps(2.5e-01f), in25);
tmp292 = _mm512_fmadd_ps(in17, _mm512_set1_ps(-1.25e+00f), tmp292);
tmp297 = _mm512_fmadd_ps(in24, _mm512_set1_ps(-1.25e+00f), tmp297);
in17 = _mm512_fmadd_ps(in17, _mm512_set1_ps(-5e+00f), in15);
in24 = _mm512_fmadd_ps(in24, _mm512_set1_ps(-5e+00f), in22);
tmp291 = _mm512_fmadd_ps(in16, _mm512_set1_ps(-1.25e+00f), tmp291);
tmp296 = _mm512_fmadd_ps(in23, _mm512_set1_ps(-1.25e+00f), tmp296);
in19 = _mm512_fmadd_ps(tmp291, _mm512_set1_ps(2e+00f), tmp292);
in26 = _mm512_fmadd_ps(tmp296, _mm512_set1_ps(2e+00f), tmp297);
tmp292 = _mm512_fnmadd_ps(tmp291, _mm512_set1_ps(2e+00f), tmp292);
tmp297 = _mm512_fnmadd_ps(tmp296, _mm512_set1_ps(2e+00f), tmp297);
tmp291 = _mm512_fmadd_ps(in18, _mm512_set1_ps(2.5e-01f), in14);
tmp296 = _mm512_fmadd_ps(in25, _mm512_set1_ps(2.5e-01f), in21);
in14 = _mm512_sub_ps(in20, in14);
in21 = _mm512_sub_ps(in27, in21);
tmp291 = _mm512_fmadd_ps(in16, _mm512_set1_ps(-1.25e+00f), tmp291);
tmp296 = _mm512_fmadd_ps(in23, _mm512_set1_ps(-1.25e+00f), tmp296);
in16 = _mm512_sub_ps(in16, in18);
in23 = _mm512_sub_ps(in23, in25);
in16 = _mm512_fmadd_ps(in16, _mm512_set1_ps(5.25e+00f), in14);
in23 = _mm512_fmadd_ps(in23, _mm512_set1_ps(5.25e+00f), in21);
in15 = _mm512_fmadd_ps(tmp291, _mm512_set1_ps(2e+00f), in17);
in22 = _mm512_fmadd_ps(tmp296, _mm512_set1_ps(2e+00f), in24);
in17 = _mm512_fnmadd_ps(tmp291, _mm512_set1_ps(2e+00f), in17);
in24 = _mm512_fnmadd_ps(tmp296, _mm512_set1_ps(2e+00f), in24);
__m512 tmp309 = _mm512_unpacklo_ps(tmp294, tmp293);
__m512 tmp310 = _mm512_unpackhi_ps(tmp294, tmp293);
__m512 tmp311 = _mm512_unpacklo_ps(tmp295, in19);
__m512 tmp312 = _mm512_unpackhi_ps(tmp295, in19);
__m512 tmp313 = _mm512_unpacklo_ps(tmp292, in15);
__m512 tmp314 = _mm512_unpackhi_ps(tmp292, in15);
__m512 tmp315 = _mm512_unpacklo_ps(in17, in16);
__m512 tmp316 = _mm512_unpackhi_ps(in17, in16);
__m512 tmp317 = _mm512_unpacklo_ps(tmp299, tmp298);
__m512 tmp318 = _mm512_unpackhi_ps(tmp299, tmp298);
__m512 tmp319 = _mm512_unpacklo_ps(tmp300, in26);
__m512 tmp320 = _mm512_unpackhi_ps(tmp300, in26);
__m512 tmp321 = _mm512_unpacklo_ps(tmp297, in22);
__m512 tmp322 = _mm512_unpackhi_ps(tmp297, in22);
__m512 tmp323 = _mm512_unpacklo_ps(in24, in23);
__m512 tmp324 = _mm512_unpackhi_ps(in24, in23);
__m512 tmp325 = _mm512_shuffle_ps(tmp309, tmp311, 68);
__m512 tmp326 = _mm512_shuffle_ps(tmp309, tmp311, 238);
__m512 tmp327 = _mm512_shuffle_ps(tmp310, tmp312, 68);
__m512 tmp328 = _mm512_shuffle_ps(tmp310, tmp312, 238);
__m512 tmp329 = _mm512_shuffle_ps(tmp313, tmp315, 68);
__m512 tmp330 = _mm512_shuffle_ps(tmp313, tmp315, 238);
__m512 tmp331 = _mm512_shuffle_ps(tmp314, tmp316, 68);
__m512 tmp332 = _mm512_shuffle_ps(tmp314, tmp316, 238);
__m512 tmp333 = _mm512_shuffle_ps(tmp317, tmp319, 68);
__m512 tmp334 = _mm512_shuffle_ps(tmp317, tmp319, 238);
__m512 tmp335 = _mm512_shuffle_ps(tmp318, tmp320, 68);
__m512 tmp336 = _mm512_shuffle_ps(tmp318, tmp320, 238);
__m512 tmp337 = _mm512_shuffle_ps(tmp321, tmp323, 68);
__m512 tmp338 = _mm512_shuffle_ps(tmp321, tmp323, 238);
__m512 tmp339 = _mm512_shuffle_ps(tmp322, tmp324, 68);
__m512 tmp340 = _mm512_shuffle_ps(tmp322, tmp324, 238);
__m512 tmp341 = _mm512_shuffle_f32x4(tmp325, tmp329, 136);
__m512 tmp342 = _mm512_shuffle_f32x4(tmp325, tmp329, 221);
__m512 tmp343 = _mm512_shuffle_f32x4(tmp326, tmp330, 136);
__m512 tmp344 = _mm512_shuffle_f32x4(tmp326, tmp330, 221);
__m512 tmp345 = _mm512_shuffle_f32x4(tmp327, tmp331, 136);
__m512 tmp346 = _mm512_shuffle_f32x4(tmp327, tmp331, 221);
__m512 tmp347 = _mm512_shuffle_f32x4(tmp328, tmp332, 136);
__m512 tmp348 = _mm512_shuffle_f32x4(tmp328, tmp332, 221);
__m512 tmp349 = _mm512_shuffle_f32x4(tmp333, tmp337, 136);
__m512 tmp350 = _mm512_shuffle_f32x4(tmp333, tmp337, 221);
__m512 tmp351 = _mm512_shuffle_f32x4(tmp334, tmp338, 136);
__m512 tmp352 = _mm512_shuffle_f32x4(tmp334, tmp338, 221);
__m512 tmp353 = _mm512_shuffle_f32x4(tmp335, tmp339, 136);
__m512 tmp354 = _mm512_shuffle_f32x4(tmp335, tmp339, 221);
__m512 tmp355 = _mm512_shuffle_f32x4(tmp336, tmp340, 136);
__m512 tmp356 = _mm512_shuffle_f32x4(tmp336, tmp340, 221);
tmp294 = _mm512_shuffle_f32x4(tmp341, tmp349, 136);
tmp299 = _mm512_shuffle_f32x4(tmp341, tmp349, 221);
tmp293 = _mm512_shuffle_f32x4(tmp343, tmp351, 136);
tmp298 = _mm512_shuffle_f32x4(tmp343, tmp351, 221);
tmp295 = _mm512_shuffle_f32x4(tmp345, tmp353, 136);
tmp300 = _mm512_shuffle_f32x4(tmp345, tmp353, 221);
in19 = _mm512_shuffle_f32x4(tmp347, tmp355, 136);
in26 = _mm512_shuffle_f32x4(tmp347, tmp355, 221);
tmp292 = _mm512_shuffle_f32x4(tmp342, tmp350, 136);
tmp297 = _mm512_shuffle_f32x4(tmp342, tmp350, 221);
in15 = _mm512_shuffle_f32x4(tmp344, tmp352, 136);
in22 = _mm512_shuffle_f32x4(tmp344, tmp352, 221);
in17 = _mm512_shuffle_f32x4(tmp346, tmp354, 136);
in24 = _mm512_shuffle_f32x4(tmp346, tmp354, 221);
in16 = _mm512_shuffle_f32x4(tmp348, tmp356, 136);
in23 = _mm512_shuffle_f32x4(tmp348, tmp356, 221);
__m512 tmp301 = _mm512_add_ps(tmp293, in15);
__m512 tmp305 = _mm512_add_ps(tmp298, in22);
__m512 tmp302 = _mm512_sub_ps(tmp292, tmp295);
__m512 tmp306 = _mm512_sub_ps(tmp297, tmp300);
__m512 tmp303 = _mm512_add_ps(tmp295, in17);
__m512 tmp307 = _mm512_add_ps(tmp300, in24);
tmp294 = _mm512_sub_ps(tmp294, in17);
tmp299 = _mm512_sub_ps(tmp299, in24);
tmp301 = _mm512_fmadd_ps(in19, _mm512_set1_ps(-4.25e+00f), tmp301);
tmp305 = _mm512_fmadd_ps(in26, _mm512_set1_ps(-4.25e+00f), tmp305);
tmp303 = _mm512_fmadd_ps(tmp292, _mm512_set1_ps(-4.25e+00f), tmp303);
tmp307 = _mm512_fmadd_ps(tmp297, _mm512_set1_ps(-4.25e+00f), tmp307);
tmp294 = _mm512_fmadd_ps(tmp302, _mm512_set1_ps(5.25e+00f), tmp294);
tmp299 = _mm512_fmadd_ps(tmp306, _mm512_set1_ps(5.25e+00f), tmp299);
tmp302 = _mm512_fmadd_ps(tmp295, _mm512_set1_ps(2.5e-01f), in17);
tmp306 = _mm512_fmadd_ps(tmp300, _mm512_set1_ps(2.5e-01f), in24);
tmp295 = _mm512_fmadd_ps(tmp295, _mm512_set1_ps(4e+00f), in17);
tmp300 = _mm512_fmadd_ps(tmp300, _mm512_set1_ps(4e+00f), in24);
__m512 tmp304 = _mm512_sub_ps(tmp303, tmp301);
__m512 tmp308 = _mm512_sub_ps(tmp307, tmp305);
tmp303 = _mm512_add_ps(tmp301, tmp303);
tmp307 = _mm512_add_ps(tmp305, tmp307);
tmp301 = _mm512_fmadd_ps(tmp293, _mm512_set1_ps(2.5e-01f), in15);
tmp305 = _mm512_fmadd_ps(tmp298, _mm512_set1_ps(2.5e-01f), in22);
tmp302 = _mm512_fmadd_ps(tmp292, _mm512_set1_ps(-1.25e+00f), tmp302);
tmp306 = _mm512_fmadd_ps(tmp297, _mm512_set1_ps(-1.25e+00f), tmp306);
tmp292 = _mm512_fmadd_ps(tmp292, _mm512_set1_ps(-5e+00f), tmp295);
tmp297 = _mm512_fmadd_ps(tmp297, _mm512_set1_ps(-5e+00f), tmp300);
tmp301 = _mm512_fmadd_ps(in19, _mm512_set1_ps(-1.25e+00f), tmp301);
tmp305 = _mm512_fmadd_ps(in26, _mm512_set1_ps(-1.25e+00f), tmp305);
in17 = _mm512_fmadd_ps(tmp301, _mm512_set1_ps(2e+00f), tmp302);
in24 = _mm512_fmadd_ps(tmp305, _mm512_set1_ps(2e+00f), tmp306);
tmp302 = _mm512_fnmadd_ps(tmp301, _mm512_set1_ps(2e+00f), tmp302);
tmp306 = _mm512_fnmadd_ps(tmp305, _mm512_set1_ps(2e+00f), tmp306);
tmp301 = _mm512_fmadd_ps(in15, _mm512_set1_ps(2.5e-01f), tmp293);
tmp305 = _mm512_fmadd_ps(in22, _mm512_set1_ps(2.5e-01f), tmp298);
tmp293 = _mm512_sub_ps(in16, tmp293);
tmp298 = _mm512_sub_ps(in23, tmp298);
tmp301 = _mm512_fmadd_ps(in19, _mm512_set1_ps(-1.25e+00f), tmp301);
tmp305 = _mm512_fmadd_ps(in26, _mm512_set1_ps(-1.25e+00f), tmp305);
in19 = _mm512_sub_ps(in19, in15);
in26 = _mm512_sub_ps(in26, in22);
in19 = _mm512_fmadd_ps(in19, _mm512_set1_ps(5.25e+00f), tmp293);
in26 = _mm512_fmadd_ps(in26, _mm512_set1_ps(5.25e+00f), tmp298);
tmp295 = _mm512_fmadd_ps(tmp301, _mm512_set1_ps(2e+00f), tmp292);
tmp300 = _mm512_fmadd_ps(tmp305, _mm512_set1_ps(2e+00f), tmp297);
tmp292 = _mm512_fnmadd_ps(tmp301, _mm512_set1_ps(2e+00f), tmp292);
tmp297 = _mm512_fnmadd_ps(tmp305, _mm512_set1_ps(2e+00f), tmp297);
__m512 out23 = _mm512_shuffle_f32x4(tmp294, tmp303, 68);
__m512 out31 = _mm512_shuffle_f32x4(tmp294, tmp303, 238);
__m512 out24 = _mm512_shuffle_f32x4(tmp304, in17, 68);
__m512 out32 = _mm512_shuffle_f32x4(tmp304, in17, 238);
__m512 out25 = _mm512_shuffle_f32x4(tmp302, tmp295, 68);
__m512 out33 = _mm512_shuffle_f32x4(tmp302, tmp295, 238);
__m512 out26 = _mm512_shuffle_f32x4(tmp292, in19, 68);
__m512 out34 = _mm512_shuffle_f32x4(tmp292, in19, 238);
__m512 out27 = _mm512_shuffle_f32x4(tmp299, tmp307, 68);
__m512 out35 = _mm512_shuffle_f32x4(tmp299, tmp307, 238);
__m512 out28 = _mm512_shuffle_f32x4(tmp308, in24, 68);
__m512 out36 = _mm512_shuffle_f32x4(tmp308, in24, 238);
__m512 out29 = _mm512_shuffle_f32x4(tmp306, tmp300, 68);
__m512 out37 = _mm512_shuffle_f32x4(tmp306, tmp300, 238);
__m512 out30 = _mm512_shuffle_f32x4(tmp297, in26, 68);
__m512 out38 = _mm512_shuffle_f32x4(tmp297, in26, 238);
_mm512_storeu_ps(dfPtr4+0+1638400*i16+24576*j11+24576*s13+768*k52, out23);
_mm512_storeu_ps(dfPtr4+128+1638400*i16+24576*j11+24576*s13+768*k52, out31);
_mm512_storeu_ps(dfPtr4+64+1638400*i16+24576*j11+24576*s13+768*k52, out27);
_mm512_storeu_ps(dfPtr4+192+1638400*i16+24576*j11+24576*s13+768*k52, out35);
_mm512_storeu_ps(dfPtr4+409600+1638400*i16+24576*j11+24576*s13+768*k52, out24);
_mm512_storeu_ps(dfPtr4+409728+1638400*i16+24576*j11+24576*s13+768*k52, out32);
_mm512_storeu_ps(dfPtr4+409664+1638400*i16+24576*j11+24576*s13+768*k52, out28);
_mm512_storeu_ps(dfPtr4+409792+1638400*i16+24576*j11+24576*s13+768*k52, out36);
_mm512_storeu_ps(dfPtr4+819200+1638400*i16+24576*j11+24576*s13+768*k52, out25);
_mm512_storeu_ps(dfPtr4+819328+1638400*i16+24576*j11+24576*s13+768*k52, out33);
_mm512_storeu_ps(dfPtr4+819264+1638400*i16+24576*j11+24576*s13+768*k52, out29);
_mm512_storeu_ps(dfPtr4+819392+1638400*i16+24576*j11+24576*s13+768*k52, out37);
_mm512_storeu_ps(dfPtr4+1228800+1638400*i16+24576*j11+24576*s13+768*k52, out26);
_mm512_storeu_ps(dfPtr4+1228928+1638400*i16+24576*j11+24576*s13+768*k52, out34);
_mm512_storeu_ps(dfPtr4+1228864+1638400*i16+24576*j11+24576*s13+768*k52, out30);
_mm512_storeu_ps(dfPtr4+1228992+1638400*i16+24576*j11+24576*s13+768*k52, out38);
// Tile unit: load seven rows (two vectors per row) from datPtr5, clamp them at
// zero, run a fixed-coefficient combination over the rows, regroup the lanes,
// run the same combination again, and write 16 vectors to dfPtr4.
// Every statement comes in pairs: one for the first vector of a row, one for
// the second.
// NOTE(review): the constants (5.25, -4.25, 0.25, -1.25, 4, -5, 2) and the
// 8-lane windows that overlap by two elements look like a Winograd-style
// input transform with 8x8 tiles -- confirm against the generator.
//
// Row loads. Mask 16383 keeps lanes 0..13 and mask 8191 keeps lanes 0..12;
// the remaining lanes are zeroed by the maskz load. The constant term of the
// address grows by 224 from one row to the next (320, 544, ..., 1664 and
// 12836, 13060, ..., 14180). Each loaded vector is replaced by max(0, x).
__m512 dat935 = _mm512_maskz_loadu_ps(16383, datPtr5+320+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat935 = _mm512_max_ps(_mm512_setzero_ps(), dat935);
__m512 dat936 = _mm512_maskz_loadu_ps(8191, datPtr5+12836+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat936 = _mm512_max_ps(_mm512_setzero_ps(), dat936);
// pm67: lanes 0..7 take elements 0..7 and lanes 8..15 take elements 6..13,
// i.e. two 8-element windows that share elements 6 and 7.
__m512i pm67 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in28 = _mm512_permutexvar_ps(pm67, dat935);
// pm68: lane 0 takes element 15 (always zero here, because mask 8191 clears
// lanes 13..15), lanes 1..7 take elements 0..6, lanes 8..15 take elements 5..12.
__m512i pm68 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in35 = _mm512_permutexvar_ps(pm68, dat936);
__m512 dat937 = _mm512_maskz_loadu_ps(16383, datPtr5+544+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat937 = _mm512_max_ps(_mm512_setzero_ps(), dat937);
__m512 dat938 = _mm512_maskz_loadu_ps(8191, datPtr5+13060+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat938 = _mm512_max_ps(_mm512_setzero_ps(), dat938);
__m512 in29 = _mm512_permutexvar_ps(pm67, dat937);
__m512 in36 = _mm512_permutexvar_ps(pm68, dat938);
__m512 dat939 = _mm512_maskz_loadu_ps(16383, datPtr5+768+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat939 = _mm512_max_ps(_mm512_setzero_ps(), dat939);
__m512 dat940 = _mm512_maskz_loadu_ps(8191, datPtr5+13284+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat940 = _mm512_max_ps(_mm512_setzero_ps(), dat940);
__m512 in30 = _mm512_permutexvar_ps(pm67, dat939);
__m512 in37 = _mm512_permutexvar_ps(pm68, dat940);
__m512 dat941 = _mm512_maskz_loadu_ps(16383, datPtr5+992+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat941 = _mm512_max_ps(_mm512_setzero_ps(), dat941);
__m512 dat942 = _mm512_maskz_loadu_ps(8191, datPtr5+13508+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat942 = _mm512_max_ps(_mm512_setzero_ps(), dat942);
__m512 in31 = _mm512_permutexvar_ps(pm67, dat941);
__m512 in38 = _mm512_permutexvar_ps(pm68, dat942);
__m512 dat943 = _mm512_maskz_loadu_ps(16383, datPtr5+1216+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat943 = _mm512_max_ps(_mm512_setzero_ps(), dat943);
__m512 dat944 = _mm512_maskz_loadu_ps(8191, datPtr5+13732+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat944 = _mm512_max_ps(_mm512_setzero_ps(), dat944);
__m512 in32 = _mm512_permutexvar_ps(pm67, dat943);
__m512 in39 = _mm512_permutexvar_ps(pm68, dat944);
__m512 dat945 = _mm512_maskz_loadu_ps(16383, datPtr5+1440+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat945 = _mm512_max_ps(_mm512_setzero_ps(), dat945);
__m512 dat946 = _mm512_maskz_loadu_ps(8191, datPtr5+13956+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat946 = _mm512_max_ps(_mm512_setzero_ps(), dat946);
__m512 in33 = _mm512_permutexvar_ps(pm67, dat945);
__m512 in40 = _mm512_permutexvar_ps(pm68, dat946);
__m512 dat947 = _mm512_maskz_loadu_ps(16383, datPtr5+1664+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat947 = _mm512_max_ps(_mm512_setzero_ps(), dat947);
__m512 dat948 = _mm512_maskz_loadu_ps(8191, datPtr5+14180+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat948 = _mm512_max_ps(_mm512_setzero_ps(), dat948);
__m512 in34 = _mm512_permutexvar_ps(pm67, dat947);
__m512 in41 = _mm512_permutexvar_ps(pm68, dat948);
// First pass: combine the seven rows in28..in34 (and in35..in41) into eight
// vectors per column. Only seven rows are loaded; tmp360/tmp365 are seeded
// with 0 - in33 / 0 - in40, so a zero vector stands in for the row that is
// not loaded. Input registers are overwritten as soon as their last use has
// passed, so the statement order matters.
__m512 tmp357 = _mm512_add_ps(in28, in32);
__m512 tmp362 = _mm512_add_ps(in35, in39);
__m512 tmp358 = _mm512_sub_ps(in31, in29);
__m512 tmp363 = _mm512_sub_ps(in38, in36);
__m512 tmp359 = _mm512_add_ps(in29, in33);
__m512 tmp364 = _mm512_add_ps(in36, in40);
__m512 tmp360 = _mm512_sub_ps(_mm512_setzero_ps(), in33);
__m512 tmp365 = _mm512_sub_ps(_mm512_setzero_ps(), in40);
tmp357 = _mm512_fmadd_ps(in30, _mm512_set1_ps(-4.25e+00f), tmp357);
tmp362 = _mm512_fmadd_ps(in37, _mm512_set1_ps(-4.25e+00f), tmp362);
tmp359 = _mm512_fmadd_ps(in31, _mm512_set1_ps(-4.25e+00f), tmp359);
tmp364 = _mm512_fmadd_ps(in38, _mm512_set1_ps(-4.25e+00f), tmp364);
tmp360 = _mm512_fmadd_ps(tmp358, _mm512_set1_ps(5.25e+00f), tmp360);
tmp365 = _mm512_fmadd_ps(tmp363, _mm512_set1_ps(5.25e+00f), tmp365);
tmp358 = _mm512_fmadd_ps(in29, _mm512_set1_ps(2.5e-01f), in33);
tmp363 = _mm512_fmadd_ps(in36, _mm512_set1_ps(2.5e-01f), in40);
in29 = _mm512_fmadd_ps(in29, _mm512_set1_ps(4e+00f), in33);
in36 = _mm512_fmadd_ps(in36, _mm512_set1_ps(4e+00f), in40);
__m512 tmp361 = _mm512_sub_ps(tmp359, tmp357);
__m512 tmp366 = _mm512_sub_ps(tmp364, tmp362);
tmp359 = _mm512_add_ps(tmp357, tmp359);
tmp364 = _mm512_add_ps(tmp362, tmp364);
tmp357 = _mm512_fmadd_ps(in28, _mm512_set1_ps(2.5e-01f), in32);
tmp362 = _mm512_fmadd_ps(in35, _mm512_set1_ps(2.5e-01f), in39);
tmp358 = _mm512_fmadd_ps(in31, _mm512_set1_ps(-1.25e+00f), tmp358);
tmp363 = _mm512_fmadd_ps(in38, _mm512_set1_ps(-1.25e+00f), tmp363);
in31 = _mm512_fmadd_ps(in31, _mm512_set1_ps(-5e+00f), in29);
in38 = _mm512_fmadd_ps(in38, _mm512_set1_ps(-5e+00f), in36);
tmp357 = _mm512_fmadd_ps(in30, _mm512_set1_ps(-1.25e+00f), tmp357);
tmp362 = _mm512_fmadd_ps(in37, _mm512_set1_ps(-1.25e+00f), tmp362);
in33 = _mm512_fmadd_ps(tmp357, _mm512_set1_ps(2e+00f), tmp358);
in40 = _mm512_fmadd_ps(tmp362, _mm512_set1_ps(2e+00f), tmp363);
tmp358 = _mm512_fnmadd_ps(tmp357, _mm512_set1_ps(2e+00f), tmp358);
tmp363 = _mm512_fnmadd_ps(tmp362, _mm512_set1_ps(2e+00f), tmp363);
tmp357 = _mm512_fmadd_ps(in32, _mm512_set1_ps(2.5e-01f), in28);
tmp362 = _mm512_fmadd_ps(in39, _mm512_set1_ps(2.5e-01f), in35);
in28 = _mm512_sub_ps(in34, in28);
in35 = _mm512_sub_ps(in41, in35);
tmp357 = _mm512_fmadd_ps(in30, _mm512_set1_ps(-1.25e+00f), tmp357);
tmp362 = _mm512_fmadd_ps(in37, _mm512_set1_ps(-1.25e+00f), tmp362);
in30 = _mm512_sub_ps(in30, in32);
in37 = _mm512_sub_ps(in37, in39);
in30 = _mm512_fmadd_ps(in30, _mm512_set1_ps(5.25e+00f), in28);
in37 = _mm512_fmadd_ps(in37, _mm512_set1_ps(5.25e+00f), in35);
in29 = _mm512_fmadd_ps(tmp357, _mm512_set1_ps(2e+00f), in31);
in36 = _mm512_fmadd_ps(tmp362, _mm512_set1_ps(2e+00f), in38);
in31 = _mm512_fnmadd_ps(tmp357, _mm512_set1_ps(2e+00f), in31);
in38 = _mm512_fnmadd_ps(tmp362, _mm512_set1_ps(2e+00f), in38);
// Lane regrouping of the 16 first-pass vectors, stage 1: interleave 32-bit
// elements of neighbouring results with unpacklo/unpackhi.
// NOTE(review): stages 1-4 together have the shape of a 16x16 transpose --
// confirm.
__m512 tmp375 = _mm512_unpacklo_ps(tmp360, tmp359);
__m512 tmp376 = _mm512_unpackhi_ps(tmp360, tmp359);
__m512 tmp377 = _mm512_unpacklo_ps(tmp361, in33);
__m512 tmp378 = _mm512_unpackhi_ps(tmp361, in33);
__m512 tmp379 = _mm512_unpacklo_ps(tmp358, in29);
__m512 tmp380 = _mm512_unpackhi_ps(tmp358, in29);
__m512 tmp381 = _mm512_unpacklo_ps(in31, in30);
__m512 tmp382 = _mm512_unpackhi_ps(in31, in30);
__m512 tmp383 = _mm512_unpacklo_ps(tmp365, tmp364);
__m512 tmp384 = _mm512_unpackhi_ps(tmp365, tmp364);
__m512 tmp385 = _mm512_unpacklo_ps(tmp366, in40);
__m512 tmp386 = _mm512_unpackhi_ps(tmp366, in40);
__m512 tmp387 = _mm512_unpacklo_ps(tmp363, in36);
__m512 tmp388 = _mm512_unpackhi_ps(tmp363, in36);
__m512 tmp389 = _mm512_unpacklo_ps(in38, in37);
__m512 tmp390 = _mm512_unpackhi_ps(in38, in37);
// Stage 2: within each 128-bit lane, 68 takes the low element pair of both
// operands and 238 takes the high element pair.
__m512 tmp391 = _mm512_shuffle_ps(tmp375, tmp377, 68);
__m512 tmp392 = _mm512_shuffle_ps(tmp375, tmp377, 238);
__m512 tmp393 = _mm512_shuffle_ps(tmp376, tmp378, 68);
__m512 tmp394 = _mm512_shuffle_ps(tmp376, tmp378, 238);
__m512 tmp395 = _mm512_shuffle_ps(tmp379, tmp381, 68);
__m512 tmp396 = _mm512_shuffle_ps(tmp379, tmp381, 238);
__m512 tmp397 = _mm512_shuffle_ps(tmp380, tmp382, 68);
__m512 tmp398 = _mm512_shuffle_ps(tmp380, tmp382, 238);
__m512 tmp399 = _mm512_shuffle_ps(tmp383, tmp385, 68);
__m512 tmp400 = _mm512_shuffle_ps(tmp383, tmp385, 238);
__m512 tmp401 = _mm512_shuffle_ps(tmp384, tmp386, 68);
__m512 tmp402 = _mm512_shuffle_ps(tmp384, tmp386, 238);
__m512 tmp403 = _mm512_shuffle_ps(tmp387, tmp389, 68);
__m512 tmp404 = _mm512_shuffle_ps(tmp387, tmp389, 238);
__m512 tmp405 = _mm512_shuffle_ps(tmp388, tmp390, 68);
__m512 tmp406 = _mm512_shuffle_ps(tmp388, tmp390, 238);
// Stage 3: 128-bit lane shuffle; 136 gathers the even lanes (0 and 2) of both
// operands, 221 gathers the odd lanes (1 and 3).
__m512 tmp407 = _mm512_shuffle_f32x4(tmp391, tmp395, 136);
__m512 tmp408 = _mm512_shuffle_f32x4(tmp391, tmp395, 221);
__m512 tmp409 = _mm512_shuffle_f32x4(tmp392, tmp396, 136);
__m512 tmp410 = _mm512_shuffle_f32x4(tmp392, tmp396, 221);
__m512 tmp411 = _mm512_shuffle_f32x4(tmp393, tmp397, 136);
__m512 tmp412 = _mm512_shuffle_f32x4(tmp393, tmp397, 221);
__m512 tmp413 = _mm512_shuffle_f32x4(tmp394, tmp398, 136);
__m512 tmp414 = _mm512_shuffle_f32x4(tmp394, tmp398, 221);
__m512 tmp415 = _mm512_shuffle_f32x4(tmp399, tmp403, 136);
__m512 tmp416 = _mm512_shuffle_f32x4(tmp399, tmp403, 221);
__m512 tmp417 = _mm512_shuffle_f32x4(tmp400, tmp404, 136);
__m512 tmp418 = _mm512_shuffle_f32x4(tmp400, tmp404, 221);
__m512 tmp419 = _mm512_shuffle_f32x4(tmp401, tmp405, 136);
__m512 tmp420 = _mm512_shuffle_f32x4(tmp401, tmp405, 221);
__m512 tmp421 = _mm512_shuffle_f32x4(tmp402, tmp406, 136);
__m512 tmp422 = _mm512_shuffle_f32x4(tmp402, tmp406, 221);
// Stage 4: second 128-bit lane shuffle; the results are written back into the
// first-pass registers so the second pass can reuse the same names.
tmp360 = _mm512_shuffle_f32x4(tmp407, tmp415, 136);
tmp365 = _mm512_shuffle_f32x4(tmp407, tmp415, 221);
tmp359 = _mm512_shuffle_f32x4(tmp409, tmp417, 136);
tmp364 = _mm512_shuffle_f32x4(tmp409, tmp417, 221);
tmp361 = _mm512_shuffle_f32x4(tmp411, tmp419, 136);
tmp366 = _mm512_shuffle_f32x4(tmp411, tmp419, 221);
in33 = _mm512_shuffle_f32x4(tmp413, tmp421, 136);
in40 = _mm512_shuffle_f32x4(tmp413, tmp421, 221);
tmp358 = _mm512_shuffle_f32x4(tmp408, tmp416, 136);
tmp363 = _mm512_shuffle_f32x4(tmp408, tmp416, 221);
in29 = _mm512_shuffle_f32x4(tmp410, tmp418, 136);
in36 = _mm512_shuffle_f32x4(tmp410, tmp418, 221);
in31 = _mm512_shuffle_f32x4(tmp412, tmp420, 136);
in38 = _mm512_shuffle_f32x4(tmp412, tmp420, 221);
in30 = _mm512_shuffle_f32x4(tmp414, tmp422, 136);
in37 = _mm512_shuffle_f32x4(tmp414, tmp422, 221);
// Second pass: the same coefficient pattern as the first pass, now applied to
// the regrouped vectors. All eight operands come from registers here
// (tmp360/tmp365 are used as-is rather than replaced by zero).
__m512 tmp367 = _mm512_add_ps(tmp359, in29);
__m512 tmp371 = _mm512_add_ps(tmp364, in36);
__m512 tmp368 = _mm512_sub_ps(tmp358, tmp361);
__m512 tmp372 = _mm512_sub_ps(tmp363, tmp366);
__m512 tmp369 = _mm512_add_ps(tmp361, in31);
__m512 tmp373 = _mm512_add_ps(tmp366, in38);
tmp360 = _mm512_sub_ps(tmp360, in31);
tmp365 = _mm512_sub_ps(tmp365, in38);
tmp367 = _mm512_fmadd_ps(in33, _mm512_set1_ps(-4.25e+00f), tmp367);
tmp371 = _mm512_fmadd_ps(in40, _mm512_set1_ps(-4.25e+00f), tmp371);
tmp369 = _mm512_fmadd_ps(tmp358, _mm512_set1_ps(-4.25e+00f), tmp369);
tmp373 = _mm512_fmadd_ps(tmp363, _mm512_set1_ps(-4.25e+00f), tmp373);
tmp360 = _mm512_fmadd_ps(tmp368, _mm512_set1_ps(5.25e+00f), tmp360);
tmp365 = _mm512_fmadd_ps(tmp372, _mm512_set1_ps(5.25e+00f), tmp365);
tmp368 = _mm512_fmadd_ps(tmp361, _mm512_set1_ps(2.5e-01f), in31);
tmp372 = _mm512_fmadd_ps(tmp366, _mm512_set1_ps(2.5e-01f), in38);
tmp361 = _mm512_fmadd_ps(tmp361, _mm512_set1_ps(4e+00f), in31);
tmp366 = _mm512_fmadd_ps(tmp366, _mm512_set1_ps(4e+00f), in38);
__m512 tmp370 = _mm512_sub_ps(tmp369, tmp367);
__m512 tmp374 = _mm512_sub_ps(tmp373, tmp371);
tmp369 = _mm512_add_ps(tmp367, tmp369);
tmp373 = _mm512_add_ps(tmp371, tmp373);
tmp367 = _mm512_fmadd_ps(tmp359, _mm512_set1_ps(2.5e-01f), in29);
tmp371 = _mm512_fmadd_ps(tmp364, _mm512_set1_ps(2.5e-01f), in36);
tmp368 = _mm512_fmadd_ps(tmp358, _mm512_set1_ps(-1.25e+00f), tmp368);
tmp372 = _mm512_fmadd_ps(tmp363, _mm512_set1_ps(-1.25e+00f), tmp372);
tmp358 = _mm512_fmadd_ps(tmp358, _mm512_set1_ps(-5e+00f), tmp361);
tmp363 = _mm512_fmadd_ps(tmp363, _mm512_set1_ps(-5e+00f), tmp366);
tmp367 = _mm512_fmadd_ps(in33, _mm512_set1_ps(-1.25e+00f), tmp367);
tmp371 = _mm512_fmadd_ps(in40, _mm512_set1_ps(-1.25e+00f), tmp371);
in31 = _mm512_fmadd_ps(tmp367, _mm512_set1_ps(2e+00f), tmp368);
in38 = _mm512_fmadd_ps(tmp371, _mm512_set1_ps(2e+00f), tmp372);
tmp368 = _mm512_fnmadd_ps(tmp367, _mm512_set1_ps(2e+00f), tmp368);
tmp372 = _mm512_fnmadd_ps(tmp371, _mm512_set1_ps(2e+00f), tmp372);
tmp367 = _mm512_fmadd_ps(in29, _mm512_set1_ps(2.5e-01f), tmp359);
tmp371 = _mm512_fmadd_ps(in36, _mm512_set1_ps(2.5e-01f), tmp364);
tmp359 = _mm512_sub_ps(in30, tmp359);
tmp364 = _mm512_sub_ps(in37, tmp364);
tmp367 = _mm512_fmadd_ps(in33, _mm512_set1_ps(-1.25e+00f), tmp367);
tmp371 = _mm512_fmadd_ps(in40, _mm512_set1_ps(-1.25e+00f), tmp371);
in33 = _mm512_sub_ps(in33, in29);
in40 = _mm512_sub_ps(in40, in36);
in33 = _mm512_fmadd_ps(in33, _mm512_set1_ps(5.25e+00f), tmp359);
in40 = _mm512_fmadd_ps(in40, _mm512_set1_ps(5.25e+00f), tmp364);
tmp361 = _mm512_fmadd_ps(tmp367, _mm512_set1_ps(2e+00f), tmp358);
tmp366 = _mm512_fmadd_ps(tmp371, _mm512_set1_ps(2e+00f), tmp363);
tmp358 = _mm512_fnmadd_ps(tmp367, _mm512_set1_ps(2e+00f), tmp358);
tmp363 = _mm512_fnmadd_ps(tmp371, _mm512_set1_ps(2e+00f), tmp363);
// Pack pairs of second-pass results into output vectors: immediate 68
// concatenates the low 256 bits of both operands, 238 the high 256 bits.
__m512 out39 = _mm512_shuffle_f32x4(tmp360, tmp369, 68);
__m512 out47 = _mm512_shuffle_f32x4(tmp360, tmp369, 238);
__m512 out40 = _mm512_shuffle_f32x4(tmp370, in31, 68);
__m512 out48 = _mm512_shuffle_f32x4(tmp370, in31, 238);
__m512 out41 = _mm512_shuffle_f32x4(tmp368, tmp361, 68);
__m512 out49 = _mm512_shuffle_f32x4(tmp368, tmp361, 238);
__m512 out42 = _mm512_shuffle_f32x4(tmp358, in33, 68);
__m512 out50 = _mm512_shuffle_f32x4(tmp358, in33, 238);
__m512 out43 = _mm512_shuffle_f32x4(tmp365, tmp373, 68);
__m512 out51 = _mm512_shuffle_f32x4(tmp365, tmp373, 238);
__m512 out44 = _mm512_shuffle_f32x4(tmp374, in38, 68);
__m512 out52 = _mm512_shuffle_f32x4(tmp374, in38, 238);
__m512 out45 = _mm512_shuffle_f32x4(tmp372, tmp366, 68);
__m512 out53 = _mm512_shuffle_f32x4(tmp372, tmp366, 238);
__m512 out46 = _mm512_shuffle_f32x4(tmp363, in40, 68);
__m512 out54 = _mm512_shuffle_f32x4(tmp363, in40, 238);
// Sixteen unaligned stores to dfPtr4, in four groups whose constant offsets
// are 409600 apart (256, 409856, 819456, 1229056). Within a group the stores
// are issued in the order +0, +128, +64, +192 relative to the group base.
// NOTE(review): the 64-step offsets suggest dfPtr4 is a byte pointer -- confirm.
_mm512_storeu_ps(dfPtr4+256+1638400*i16+24576*j11+24576*s13+768*k52, out39);
_mm512_storeu_ps(dfPtr4+384+1638400*i16+24576*j11+24576*s13+768*k52, out47);
_mm512_storeu_ps(dfPtr4+320+1638400*i16+24576*j11+24576*s13+768*k52, out43);
_mm512_storeu_ps(dfPtr4+448+1638400*i16+24576*j11+24576*s13+768*k52, out51);
_mm512_storeu_ps(dfPtr4+409856+1638400*i16+24576*j11+24576*s13+768*k52, out40);
_mm512_storeu_ps(dfPtr4+409984+1638400*i16+24576*j11+24576*s13+768*k52, out48);
_mm512_storeu_ps(dfPtr4+409920+1638400*i16+24576*j11+24576*s13+768*k52, out44);
_mm512_storeu_ps(dfPtr4+410048+1638400*i16+24576*j11+24576*s13+768*k52, out52);
_mm512_storeu_ps(dfPtr4+819456+1638400*i16+24576*j11+24576*s13+768*k52, out41);
_mm512_storeu_ps(dfPtr4+819584+1638400*i16+24576*j11+24576*s13+768*k52, out49);
_mm512_storeu_ps(dfPtr4+819520+1638400*i16+24576*j11+24576*s13+768*k52, out45);
_mm512_storeu_ps(dfPtr4+819648+1638400*i16+24576*j11+24576*s13+768*k52, out53);
_mm512_storeu_ps(dfPtr4+1229056+1638400*i16+24576*j11+24576*s13+768*k52, out42);
_mm512_storeu_ps(dfPtr4+1229184+1638400*i16+24576*j11+24576*s13+768*k52, out50);
_mm512_storeu_ps(dfPtr4+1229120+1638400*i16+24576*j11+24576*s13+768*k52, out46);
_mm512_storeu_ps(dfPtr4+1229248+1638400*i16+24576*j11+24576*s13+768*k52, out54);
// Tile unit: load seven rows (two vectors per row) from datPtr5, clamp them at
// zero, run a fixed-coefficient combination over the rows, regroup the lanes,
// run the same combination again, and write 16 vectors to dfPtr4.
// Every statement comes in pairs: one for the first vector of a row, one for
// the second.
// NOTE(review): the constants (5.25, -4.25, 0.25, -1.25, 4, -5, 2) and the
// 8-lane windows that overlap by two elements look like a Winograd-style
// input transform with 8x8 tiles -- confirm against the generator.
//
// Row loads. Both vectors of a row use mask 16383, which keeps lanes 0..13 and
// zeroes lanes 14 and 15. The second vector's constant offset is 48 above the
// first's (12880 vs 12928), and both grow by 224 from one row to the next.
// Each loaded vector is replaced by max(0, x).
__m512 dat949 = _mm512_maskz_loadu_ps(16383, datPtr5+12880+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat949 = _mm512_max_ps(_mm512_setzero_ps(), dat949);
__m512 dat950 = _mm512_maskz_loadu_ps(16383, datPtr5+12928+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat950 = _mm512_max_ps(_mm512_setzero_ps(), dat950);
// pm69: lanes 0..7 take elements 0..7 and lanes 8..15 take elements 6..13,
// i.e. two 8-element windows that share elements 6 and 7. The same index
// vector is used for both vectors of every row in this unit.
__m512i pm69 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in42 = _mm512_permutexvar_ps(pm69, dat949);
__m512 in49 = _mm512_permutexvar_ps(pm69, dat950);
__m512 dat951 = _mm512_maskz_loadu_ps(16383, datPtr5+13104+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat951 = _mm512_max_ps(_mm512_setzero_ps(), dat951);
__m512 dat952 = _mm512_maskz_loadu_ps(16383, datPtr5+13152+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat952 = _mm512_max_ps(_mm512_setzero_ps(), dat952);
__m512 in43 = _mm512_permutexvar_ps(pm69, dat951);
__m512 in50 = _mm512_permutexvar_ps(pm69, dat952);
__m512 dat953 = _mm512_maskz_loadu_ps(16383, datPtr5+13328+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat953 = _mm512_max_ps(_mm512_setzero_ps(), dat953);
__m512 dat954 = _mm512_maskz_loadu_ps(16383, datPtr5+13376+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat954 = _mm512_max_ps(_mm512_setzero_ps(), dat954);
__m512 in44 = _mm512_permutexvar_ps(pm69, dat953);
__m512 in51 = _mm512_permutexvar_ps(pm69, dat954);
__m512 dat955 = _mm512_maskz_loadu_ps(16383, datPtr5+13552+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat955 = _mm512_max_ps(_mm512_setzero_ps(), dat955);
__m512 dat956 = _mm512_maskz_loadu_ps(16383, datPtr5+13600+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat956 = _mm512_max_ps(_mm512_setzero_ps(), dat956);
__m512 in45 = _mm512_permutexvar_ps(pm69, dat955);
__m512 in52 = _mm512_permutexvar_ps(pm69, dat956);
__m512 dat957 = _mm512_maskz_loadu_ps(16383, datPtr5+13776+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat957 = _mm512_max_ps(_mm512_setzero_ps(), dat957);
__m512 dat958 = _mm512_maskz_loadu_ps(16383, datPtr5+13824+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat958 = _mm512_max_ps(_mm512_setzero_ps(), dat958);
__m512 in46 = _mm512_permutexvar_ps(pm69, dat957);
__m512 in53 = _mm512_permutexvar_ps(pm69, dat958);
__m512 dat959 = _mm512_maskz_loadu_ps(16383, datPtr5+14000+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat959 = _mm512_max_ps(_mm512_setzero_ps(), dat959);
__m512 dat960 = _mm512_maskz_loadu_ps(16383, datPtr5+14048+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat960 = _mm512_max_ps(_mm512_setzero_ps(), dat960);
__m512 in47 = _mm512_permutexvar_ps(pm69, dat959);
__m512 in54 = _mm512_permutexvar_ps(pm69, dat960);
__m512 dat961 = _mm512_maskz_loadu_ps(16383, datPtr5+14224+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat961 = _mm512_max_ps(_mm512_setzero_ps(), dat961);
__m512 dat962 = _mm512_maskz_loadu_ps(16383, datPtr5+14272+806912*i16+224*h20+4*w23+806912*s13+25216*k52);
dat962 = _mm512_max_ps(_mm512_setzero_ps(), dat962);
__m512 in48 = _mm512_permutexvar_ps(pm69, dat961);
__m512 in55 = _mm512_permutexvar_ps(pm69, dat962);
// First pass: combine the seven rows in42..in48 (and in49..in55) into eight
// vectors per column. Only seven rows are loaded; tmp426/tmp431 are seeded
// with 0 - in47 / 0 - in54, so a zero vector stands in for the row that is
// not loaded. Input registers are overwritten as soon as their last use has
// passed, so the statement order matters.
__m512 tmp423 = _mm512_add_ps(in42, in46);
__m512 tmp428 = _mm512_add_ps(in49, in53);
__m512 tmp424 = _mm512_sub_ps(in45, in43);
__m512 tmp429 = _mm512_sub_ps(in52, in50);
__m512 tmp425 = _mm512_add_ps(in43, in47);
__m512 tmp430 = _mm512_add_ps(in50, in54);
__m512 tmp426 = _mm512_sub_ps(_mm512_setzero_ps(), in47);
__m512 tmp431 = _mm512_sub_ps(_mm512_setzero_ps(), in54);
tmp423 = _mm512_fmadd_ps(in44, _mm512_set1_ps(-4.25e+00f), tmp423);
tmp428 = _mm512_fmadd_ps(in51, _mm512_set1_ps(-4.25e+00f), tmp428);
tmp425 = _mm512_fmadd_ps(in45, _mm512_set1_ps(-4.25e+00f), tmp425);
tmp430 = _mm512_fmadd_ps(in52, _mm512_set1_ps(-4.25e+00f), tmp430);
tmp426 = _mm512_fmadd_ps(tmp424, _mm512_set1_ps(5.25e+00f), tmp426);
tmp431 = _mm512_fmadd_ps(tmp429, _mm512_set1_ps(5.25e+00f), tmp431);
tmp424 = _mm512_fmadd_ps(in43, _mm512_set1_ps(2.5e-01f), in47);
tmp429 = _mm512_fmadd_ps(in50, _mm512_set1_ps(2.5e-01f), in54);
in43 = _mm512_fmadd_ps(in43, _mm512_set1_ps(4e+00f), in47);
in50 = _mm512_fmadd_ps(in50, _mm512_set1_ps(4e+00f), in54);
__m512 tmp427 = _mm512_sub_ps(tmp425, tmp423);
__m512 tmp432 = _mm512_sub_ps(tmp430, tmp428);
tmp425 = _mm512_add_ps(tmp423, tmp425);
tmp430 = _mm512_add_ps(tmp428, tmp430);
tmp423 = _mm512_fmadd_ps(in42, _mm512_set1_ps(2.5e-01f), in46);
tmp428 = _mm512_fmadd_ps(in49, _mm512_set1_ps(2.5e-01f), in53);
tmp424 = _mm512_fmadd_ps(in45, _mm512_set1_ps(-1.25e+00f), tmp424);
tmp429 = _mm512_fmadd_ps(in52, _mm512_set1_ps(-1.25e+00f), tmp429);
in45 = _mm512_fmadd_ps(in45, _mm512_set1_ps(-5e+00f), in43);
in52 = _mm512_fmadd_ps(in52, _mm512_set1_ps(-5e+00f), in50);
tmp423 = _mm512_fmadd_ps(in44, _mm512_set1_ps(-1.25e+00f), tmp423);
tmp428 = _mm512_fmadd_ps(in51, _mm512_set1_ps(-1.25e+00f), tmp428);
in47 = _mm512_fmadd_ps(tmp423, _mm512_set1_ps(2e+00f), tmp424);
in54 = _mm512_fmadd_ps(tmp428, _mm512_set1_ps(2e+00f), tmp429);
tmp424 = _mm512_fnmadd_ps(tmp423, _mm512_set1_ps(2e+00f), tmp424);
tmp429 = _mm512_fnmadd_ps(tmp428, _mm512_set1_ps(2e+00f), tmp429);
tmp423 = _mm512_fmadd_ps(in46, _mm512_set1_ps(2.5e-01f), in42);
tmp428 = _mm512_fmadd_ps(in53, _mm512_set1_ps(2.5e-01f), in49);
in42 = _mm512_sub_ps(in48, in42);
in49 = _mm512_sub_ps(in55, in49);
tmp423 = _mm512_fmadd_ps(in44, _mm512_set1_ps(-1.25e+00f), tmp423);
tmp428 = _mm512_fmadd_ps(in51, _mm512_set1_ps(-1.25e+00f), tmp428);
in44 = _mm512_sub_ps(in44, in46);
in51 = _mm512_sub_ps(in51, in53);
in44 = _mm512_fmadd_ps(in44, _mm512_set1_ps(5.25e+00f), in42);
in51 = _mm512_fmadd_ps(in51, _mm512_set1_ps(5.25e+00f), in49);
in43 = _mm512_fmadd_ps(tmp423, _mm512_set1_ps(2e+00f), in45);
in50 = _mm512_fmadd_ps(tmp428, _mm512_set1_ps(2e+00f), in52);
in45 = _mm512_fnmadd_ps(tmp423, _mm512_set1_ps(2e+00f), in45);
in52 = _mm512_fnmadd_ps(tmp428, _mm512_set1_ps(2e+00f), in52);
// Lane regrouping of the 16 first-pass vectors, stage 1: interleave 32-bit
// elements of neighbouring results with unpacklo/unpackhi.
// NOTE(review): stages 1-4 together have the shape of a 16x16 transpose --
// confirm.
__m512 tmp441 = _mm512_unpacklo_ps(tmp426, tmp425);
__m512 tmp442 = _mm512_unpackhi_ps(tmp426, tmp425);
__m512 tmp443 = _mm512_unpacklo_ps(tmp427, in47);
__m512 tmp444 = _mm512_unpackhi_ps(tmp427, in47);
__m512 tmp445 = _mm512_unpacklo_ps(tmp424, in43);
__m512 tmp446 = _mm512_unpackhi_ps(tmp424, in43);
__m512 tmp447 = _mm512_unpacklo_ps(in45, in44);
__m512 tmp448 = _mm512_unpackhi_ps(in45, in44);
__m512 tmp449 = _mm512_unpacklo_ps(tmp431, tmp430);
__m512 tmp450 = _mm512_unpackhi_ps(tmp431, tmp430);
__m512 tmp451 = _mm512_unpacklo_ps(tmp432, in54);
__m512 tmp452 = _mm512_unpackhi_ps(tmp432, in54);
__m512 tmp453 = _mm512_unpacklo_ps(tmp429, in50);
__m512 tmp454 = _mm512_unpackhi_ps(tmp429, in50);
__m512 tmp455 = _mm512_unpacklo_ps(in52, in51);
__m512 tmp456 = _mm512_unpackhi_ps(in52, in51);
// Stage 2: within each 128-bit lane, 68 takes the low element pair of both
// operands and 238 takes the high element pair.
__m512 tmp457 = _mm512_shuffle_ps(tmp441, tmp443, 68);
__m512 tmp458 = _mm512_shuffle_ps(tmp441, tmp443, 238);
__m512 tmp459 = _mm512_shuffle_ps(tmp442, tmp444, 68);
__m512 tmp460 = _mm512_shuffle_ps(tmp442, tmp444, 238);
__m512 tmp461 = _mm512_shuffle_ps(tmp445, tmp447, 68);
__m512 tmp462 = _mm512_shuffle_ps(tmp445, tmp447, 238);
__m512 tmp463 = _mm512_shuffle_ps(tmp446, tmp448, 68);
__m512 tmp464 = _mm512_shuffle_ps(tmp446, tmp448, 238);
__m512 tmp465 = _mm512_shuffle_ps(tmp449, tmp451, 68);
__m512 tmp466 = _mm512_shuffle_ps(tmp449, tmp451, 238);
__m512 tmp467 = _mm512_shuffle_ps(tmp450, tmp452, 68);
__m512 tmp468 = _mm512_shuffle_ps(tmp450, tmp452, 238);
__m512 tmp469 = _mm512_shuffle_ps(tmp453, tmp455, 68);
__m512 tmp470 = _mm512_shuffle_ps(tmp453, tmp455, 238);
__m512 tmp471 = _mm512_shuffle_ps(tmp454, tmp456, 68);
__m512 tmp472 = _mm512_shuffle_ps(tmp454, tmp456, 238);
// Stage 3: 128-bit lane shuffle; 136 gathers the even lanes (0 and 2) of both
// operands, 221 gathers the odd lanes (1 and 3).
__m512 tmp473 = _mm512_shuffle_f32x4(tmp457, tmp461, 136);
__m512 tmp474 = _mm512_shuffle_f32x4(tmp457, tmp461, 221);
__m512 tmp475 = _mm512_shuffle_f32x4(tmp458, tmp462, 136);
__m512 tmp476 = _mm512_shuffle_f32x4(tmp458, tmp462, 221);
__m512 tmp477 = _mm512_shuffle_f32x4(tmp459, tmp463, 136);
__m512 tmp478 = _mm512_shuffle_f32x4(tmp459, tmp463, 221);
__m512 tmp479 = _mm512_shuffle_f32x4(tmp460, tmp464, 136);
__m512 tmp480 = _mm512_shuffle_f32x4(tmp460, tmp464, 221);
__m512 tmp481 = _mm512_shuffle_f32x4(tmp465, tmp469, 136);
__m512 tmp482 = _mm512_shuffle_f32x4(tmp465, tmp469, 221);
__m512 tmp483 = _mm512_shuffle_f32x4(tmp466, tmp470, 136);
__m512 tmp484 = _mm512_shuffle_f32x4(tmp466, tmp470, 221);
__m512 tmp485 = _mm512_shuffle_f32x4(tmp467, tmp471, 136);
__m512 tmp486 = _mm512_shuffle_f32x4(tmp467, tmp471, 221);
__m512 tmp487 = _mm512_shuffle_f32x4(tmp468, tmp472, 136);
__m512 tmp488 = _mm512_shuffle_f32x4(tmp468, tmp472, 221);
// Stage 4: second 128-bit lane shuffle; the results are written back into the
// first-pass registers so the second pass can reuse the same names.
tmp426 = _mm512_shuffle_f32x4(tmp473, tmp481, 136);
tmp431 = _mm512_shuffle_f32x4(tmp473, tmp481, 221);
tmp425 = _mm512_shuffle_f32x4(tmp475, tmp483, 136);
tmp430 = _mm512_shuffle_f32x4(tmp475, tmp483, 221);
tmp427 = _mm512_shuffle_f32x4(tmp477, tmp485, 136);
tmp432 = _mm512_shuffle_f32x4(tmp477, tmp485, 221);
in47 = _mm512_shuffle_f32x4(tmp479, tmp487, 136);
in54 = _mm512_shuffle_f32x4(tmp479, tmp487, 221);
tmp424 = _mm512_shuffle_f32x4(tmp474, tmp482, 136);
tmp429 = _mm512_shuffle_f32x4(tmp474, tmp482, 221);
in43 = _mm512_shuffle_f32x4(tmp476, tmp484, 136);
in50 = _mm512_shuffle_f32x4(tmp476, tmp484, 221);
in45 = _mm512_shuffle_f32x4(tmp478, tmp486, 136);
in52 = _mm512_shuffle_f32x4(tmp478, tmp486, 221);
in44 = _mm512_shuffle_f32x4(tmp480, tmp488, 136);
in51 = _mm512_shuffle_f32x4(tmp480, tmp488, 221);
// Second pass: the same coefficient pattern as the first pass, now applied to
// the regrouped vectors. All eight operands come from registers here
// (tmp426/tmp431 are used as-is rather than replaced by zero).
__m512 tmp433 = _mm512_add_ps(tmp425, in43);
__m512 tmp437 = _mm512_add_ps(tmp430, in50);
__m512 tmp434 = _mm512_sub_ps(tmp424, tmp427);
__m512 tmp438 = _mm512_sub_ps(tmp429, tmp432);
__m512 tmp435 = _mm512_add_ps(tmp427, in45);
__m512 tmp439 = _mm512_add_ps(tmp432, in52);
tmp426 = _mm512_sub_ps(tmp426, in45);
tmp431 = _mm512_sub_ps(tmp431, in52);
tmp433 = _mm512_fmadd_ps(in47, _mm512_set1_ps(-4.25e+00f), tmp433);
tmp437 = _mm512_fmadd_ps(in54, _mm512_set1_ps(-4.25e+00f), tmp437);
tmp435 = _mm512_fmadd_ps(tmp424, _mm512_set1_ps(-4.25e+00f), tmp435);
tmp439 = _mm512_fmadd_ps(tmp429, _mm512_set1_ps(-4.25e+00f), tmp439);
tmp426 = _mm512_fmadd_ps(tmp434, _mm512_set1_ps(5.25e+00f), tmp426);
tmp431 = _mm512_fmadd_ps(tmp438, _mm512_set1_ps(5.25e+00f), tmp431);
tmp434 = _mm512_fmadd_ps(tmp427, _mm512_set1_ps(2.5e-01f), in45);
tmp438 = _mm512_fmadd_ps(tmp432, _mm512_set1_ps(2.5e-01f), in52);
tmp427 = _mm512_fmadd_ps(tmp427, _mm512_set1_ps(4e+00f), in45);
tmp432 = _mm512_fmadd_ps(tmp432, _mm512_set1_ps(4e+00f), in52);
__m512 tmp436 = _mm512_sub_ps(tmp435, tmp433);
__m512 tmp440 = _mm512_sub_ps(tmp439, tmp437);
tmp435 = _mm512_add_ps(tmp433, tmp435);
tmp439 = _mm512_add_ps(tmp437, tmp439);
tmp433 = _mm512_fmadd_ps(tmp425, _mm512_set1_ps(2.5e-01f), in43);
tmp437 = _mm512_fmadd_ps(tmp430, _mm512_set1_ps(2.5e-01f), in50);
tmp434 = _mm512_fmadd_ps(tmp424, _mm512_set1_ps(-1.25e+00f), tmp434);
tmp438 = _mm512_fmadd_ps(tmp429, _mm512_set1_ps(-1.25e+00f), tmp438);
tmp424 = _mm512_fmadd_ps(tmp424, _mm512_set1_ps(-5e+00f), tmp427);
tmp429 = _mm512_fmadd_ps(tmp429, _mm512_set1_ps(-5e+00f), tmp432);
tmp433 = _mm512_fmadd_ps(in47, _mm512_set1_ps(-1.25e+00f), tmp433);
tmp437 = _mm512_fmadd_ps(in54, _mm512_set1_ps(-1.25e+00f), tmp437);
in45 = _mm512_fmadd_ps(tmp433, _mm512_set1_ps(2e+00f), tmp434);
in52 = _mm512_fmadd_ps(tmp437, _mm512_set1_ps(2e+00f), tmp438);
tmp434 = _mm512_fnmadd_ps(tmp433, _mm512_set1_ps(2e+00f), tmp434);
tmp438 = _mm512_fnmadd_ps(tmp437, _mm512_set1_ps(2e+00f), tmp438);
tmp433 = _mm512_fmadd_ps(in43, _mm512_set1_ps(2.5e-01f), tmp425);
tmp437 = _mm512_fmadd_ps(in50, _mm512_set1_ps(2.5e-01f), tmp430);
tmp425 = _mm512_sub_ps(in44, tmp425);
tmp430 = _mm512_sub_ps(in51, tmp430);
tmp433 = _mm512_fmadd_ps(in47, _mm512_set1_ps(-1.25e+00f), tmp433);
tmp437 = _mm512_fmadd_ps(in54, _mm512_set1_ps(-1.25e+00f), tmp437);
in47 = _mm512_sub_ps(in47, in43);
in54 = _mm512_sub_ps(in54, in50);
in47 = _mm512_fmadd_ps(in47, _mm512_set1_ps(5.25e+00f), tmp425);
in54 = _mm512_fmadd_ps(in54, _mm512_set1_ps(5.25e+00f), tmp430);
tmp427 = _mm512_fmadd_ps(tmp433, _mm512_set1_ps(2e+00f), tmp424);
tmp432 = _mm512_fmadd_ps(tmp437, _mm512_set1_ps(2e+00f), tmp429);
tmp424 = _mm512_fnmadd_ps(tmp433, _mm512_set1_ps(2e+00f), tmp424);
tmp429 = _mm512_fnmadd_ps(tmp437, _mm512_set1_ps(2e+00f), tmp429);
// Pack pairs of second-pass results into output vectors: immediate 68
// concatenates the low 256 bits of both operands, 238 the high 256 bits.
__m512 out55 = _mm512_shuffle_f32x4(tmp426, tmp435, 68);
__m512 out63 = _mm512_shuffle_f32x4(tmp426, tmp435, 238);
__m512 out56 = _mm512_shuffle_f32x4(tmp436, in45, 68);
__m512 out64 = _mm512_shuffle_f32x4(tmp436, in45, 238);
__m512 out57 = _mm512_shuffle_f32x4(tmp434, tmp427, 68);
__m512 out65 = _mm512_shuffle_f32x4(tmp434, tmp427, 238);
__m512 out58 = _mm512_shuffle_f32x4(tmp424, in47, 68);
__m512 out66 = _mm512_shuffle_f32x4(tmp424, in47, 238);
__m512 out59 = _mm512_shuffle_f32x4(tmp431, tmp439, 68);
__m512 out67 = _mm512_shuffle_f32x4(tmp431, tmp439, 238);
__m512 out60 = _mm512_shuffle_f32x4(tmp440, in52, 68);
__m512 out68 = _mm512_shuffle_f32x4(tmp440, in52, 238);
__m512 out61 = _mm512_shuffle_f32x4(tmp438, tmp432, 68);
__m512 out69 = _mm512_shuffle_f32x4(tmp438, tmp432, 238);
__m512 out62 = _mm512_shuffle_f32x4(tmp429, in54, 68);
__m512 out70 = _mm512_shuffle_f32x4(tmp429, in54, 238);
// Sixteen unaligned stores to dfPtr4, in four groups whose constant offsets
// are 409600 apart (512, 410112, 819712, 1229312). Within a group the stores
// are issued in the order +0, +128, +64, +192 relative to the group base.
// NOTE(review): the 64-step offsets suggest dfPtr4 is a byte pointer -- confirm.
_mm512_storeu_ps(dfPtr4+512+1638400*i16+24576*j11+24576*s13+768*k52, out55);
_mm512_storeu_ps(dfPtr4+640+1638400*i16+24576*j11+24576*s13+768*k52, out63);
_mm512_storeu_ps(dfPtr4+576+1638400*i16+24576*j11+24576*s13+768*k52, out59);
_mm512_storeu_ps(dfPtr4+704+1638400*i16+24576*j11+24576*s13+768*k52, out67);
_mm512_storeu_ps(dfPtr4+410112+1638400*i16+24576*j11+24576*s13+768*k52, out56);
_mm512_storeu_ps(dfPtr4+410240+1638400*i16+24576*j11+24576*s13+768*k52, out64);
_mm512_storeu_ps(dfPtr4+410176+1638400*i16+24576*j11+24576*s13+768*k52, out60);
_mm512_storeu_ps(dfPtr4+410304+1638400*i16+24576*j11+24576*s13+768*k52, out68);
_mm512_storeu_ps(dfPtr4+819712+1638400*i16+24576*j11+24576*s13+768*k52, out57);
_mm512_storeu_ps(dfPtr4+819840+1638400*i16+24576*j11+24576*s13+768*k52, out65);
_mm512_storeu_ps(dfPtr4+819776+1638400*i16+24576*j11+24576*s13+768*k52, out61);
_mm512_storeu_ps(dfPtr4+819904+1638400*i16+24576*j11+24576*s13+768*k52, out69);
_mm512_storeu_ps(dfPtr4+1229312+1638400*i16+24576*j11+24576*s13+768*k52, out58);
_mm512_storeu_ps(dfPtr4+1229440+1638400*i16+24576*j11+24576*s13+768*k52, out66);
_mm512_storeu_ps(dfPtr4+1229376+1638400*i16+24576*j11+24576*s13+768*k52, out62);
_mm512_storeu_ps(dfPtr4+1229504+1638400*i16+24576*j11+24576*s13+768*k52, out70);
}
if (j11 >= last3) return;
++j11;
rel7 = 1;
}
ptrdiff_t h21 = base7+0;
ptrdiff_t w24 = 36;
ptrdiff_t k53 = 0;
for (; k53 != 32; ++k53) {
// Load stage for the first register pair of this k53 iteration (in56..in62 and in63..in69).
// Seven address steps of 224 are read (offsets 224, 448, ..., 1568). At each step two
// zero-masking loads are issued 48 apart in the address expression:
//   mask 16383 (0x3FFF) keeps lanes 0-13, mask 511 (0x1FF) keeps lanes 0-8;
//   every other lane is set to zero by the maskz load.
// Each loaded vector is clamped with max(0, x), i.e. ReLU is applied on the fly.
// NOTE(review): the 224 step looks like a row pitch and the 2-lane overlap of the
// permuted windows like the overlap between neighbouring tiles of a 3x3 filter - confirm.
__m512 dat963 = _mm512_maskz_loadu_ps(16383, datPtr5+224+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat963 = _mm512_max_ps(_mm512_setzero_ps(), dat963);
__m512 dat964 = _mm512_maskz_loadu_ps(511, datPtr5+272+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat964 = _mm512_max_ps(_mm512_setzero_ps(), dat964);
// pm70: output lanes 0-7 take source lanes 0..7, output lanes 8-15 take source lanes 6..13
// (two 8-lane windows that overlap by two lanes).
__m512i pm70 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in56 = _mm512_permutexvar_ps(pm70, dat963);
// pm71: output lanes 0-7 take source lanes 0..7, lanes 8-10 take source lanes 6..8, and
// lanes 11-15 take source lane 15, which the 511 mask has already zeroed.
__m512i pm71 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in63 = _mm512_permutexvar_ps(pm71, dat964);
__m512 dat965 = _mm512_maskz_loadu_ps(16383, datPtr5+448+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat965 = _mm512_max_ps(_mm512_setzero_ps(), dat965);
__m512 dat966 = _mm512_maskz_loadu_ps(511, datPtr5+496+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat966 = _mm512_max_ps(_mm512_setzero_ps(), dat966);
__m512 in57 = _mm512_permutexvar_ps(pm70, dat965);
__m512 in64 = _mm512_permutexvar_ps(pm71, dat966);
__m512 dat967 = _mm512_maskz_loadu_ps(16383, datPtr5+672+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat967 = _mm512_max_ps(_mm512_setzero_ps(), dat967);
__m512 dat968 = _mm512_maskz_loadu_ps(511, datPtr5+720+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat968 = _mm512_max_ps(_mm512_setzero_ps(), dat968);
__m512 in58 = _mm512_permutexvar_ps(pm70, dat967);
__m512 in65 = _mm512_permutexvar_ps(pm71, dat968);
__m512 dat969 = _mm512_maskz_loadu_ps(16383, datPtr5+896+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat969 = _mm512_max_ps(_mm512_setzero_ps(), dat969);
__m512 dat970 = _mm512_maskz_loadu_ps(511, datPtr5+944+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat970 = _mm512_max_ps(_mm512_setzero_ps(), dat970);
__m512 in59 = _mm512_permutexvar_ps(pm70, dat969);
__m512 in66 = _mm512_permutexvar_ps(pm71, dat970);
__m512 dat971 = _mm512_maskz_loadu_ps(16383, datPtr5+1120+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat971 = _mm512_max_ps(_mm512_setzero_ps(), dat971);
__m512 dat972 = _mm512_maskz_loadu_ps(511, datPtr5+1168+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat972 = _mm512_max_ps(_mm512_setzero_ps(), dat972);
__m512 in60 = _mm512_permutexvar_ps(pm70, dat971);
__m512 in67 = _mm512_permutexvar_ps(pm71, dat972);
__m512 dat973 = _mm512_maskz_loadu_ps(16383, datPtr5+1344+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat973 = _mm512_max_ps(_mm512_setzero_ps(), dat973);
__m512 dat974 = _mm512_maskz_loadu_ps(511, datPtr5+1392+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat974 = _mm512_max_ps(_mm512_setzero_ps(), dat974);
__m512 in61 = _mm512_permutexvar_ps(pm70, dat973);
__m512 in68 = _mm512_permutexvar_ps(pm71, dat974);
__m512 dat975 = _mm512_maskz_loadu_ps(16383, datPtr5+1568+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat975 = _mm512_max_ps(_mm512_setzero_ps(), dat975);
__m512 dat976 = _mm512_maskz_loadu_ps(511, datPtr5+1616+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat976 = _mm512_max_ps(_mm512_setzero_ps(), dat976);
__m512 in62 = _mm512_permutexvar_ps(pm70, dat975);
__m512 in69 = _mm512_permutexvar_ps(pm71, dat976);
// First transform pass. The same linear combination is applied to two independent
// register sets, in56..in62 and in63..in69; statements for the two sets alternate.
// Only add/sub/fmadd/fnmadd are used, with the constants -4.25, 5.25, 0.25, 4, -5,
// -1.25 and 2. fnmadd(a, b, c) computes c - a*b.
// Each set has seven inputs. tmp492/tmp497 start as 0 - in61 / 0 - in68, so the term
// that an eighth input ahead of in56/in63 would contribute is taken as zero.
// NOTE(review): presumably zero padding at the tensor edge - confirm.
// The eight results per set end up, in order, in:
//   tmp492, tmp491, tmp493, in61, tmp490, in57, in59, in58
//   tmp497, tmp496, tmp498, in68, tmp495, in64, in66, in65
// NOTE(review): the coefficients match the 8-point Winograd F(6,3) input transform - confirm.
__m512 tmp489 = _mm512_add_ps(in56, in60);
__m512 tmp494 = _mm512_add_ps(in63, in67);
__m512 tmp490 = _mm512_sub_ps(in59, in57);
__m512 tmp495 = _mm512_sub_ps(in66, in64);
__m512 tmp491 = _mm512_add_ps(in57, in61);
__m512 tmp496 = _mm512_add_ps(in64, in68);
__m512 tmp492 = _mm512_sub_ps(_mm512_setzero_ps(), in61);
__m512 tmp497 = _mm512_sub_ps(_mm512_setzero_ps(), in68);
tmp489 = _mm512_fmadd_ps(in58, _mm512_set1_ps(-4.25e+00f), tmp489);
tmp494 = _mm512_fmadd_ps(in65, _mm512_set1_ps(-4.25e+00f), tmp494);
tmp491 = _mm512_fmadd_ps(in59, _mm512_set1_ps(-4.25e+00f), tmp491);
tmp496 = _mm512_fmadd_ps(in66, _mm512_set1_ps(-4.25e+00f), tmp496);
tmp492 = _mm512_fmadd_ps(tmp490, _mm512_set1_ps(5.25e+00f), tmp492);
tmp497 = _mm512_fmadd_ps(tmp495, _mm512_set1_ps(5.25e+00f), tmp497);
tmp490 = _mm512_fmadd_ps(in57, _mm512_set1_ps(2.5e-01f), in61);
tmp495 = _mm512_fmadd_ps(in64, _mm512_set1_ps(2.5e-01f), in68);
in57 = _mm512_fmadd_ps(in57, _mm512_set1_ps(4e+00f), in61);
in64 = _mm512_fmadd_ps(in64, _mm512_set1_ps(4e+00f), in68);
// Difference and sum of the two partial sums built above.
__m512 tmp493 = _mm512_sub_ps(tmp491, tmp489);
__m512 tmp498 = _mm512_sub_ps(tmp496, tmp494);
tmp491 = _mm512_add_ps(tmp489, tmp491);
tmp496 = _mm512_add_ps(tmp494, tmp496);
// tmp489/tmp494 are reused from here on as scratch for the next partial sums.
tmp489 = _mm512_fmadd_ps(in56, _mm512_set1_ps(2.5e-01f), in60);
tmp494 = _mm512_fmadd_ps(in63, _mm512_set1_ps(2.5e-01f), in67);
tmp490 = _mm512_fmadd_ps(in59, _mm512_set1_ps(-1.25e+00f), tmp490);
tmp495 = _mm512_fmadd_ps(in66, _mm512_set1_ps(-1.25e+00f), tmp495);
in59 = _mm512_fmadd_ps(in59, _mm512_set1_ps(-5e+00f), in57);
in66 = _mm512_fmadd_ps(in66, _mm512_set1_ps(-5e+00f), in64);
tmp489 = _mm512_fmadd_ps(in58, _mm512_set1_ps(-1.25e+00f), tmp489);
tmp494 = _mm512_fmadd_ps(in65, _mm512_set1_ps(-1.25e+00f), tmp494);
in61 = _mm512_fmadd_ps(tmp489, _mm512_set1_ps(2e+00f), tmp490);
in68 = _mm512_fmadd_ps(tmp494, _mm512_set1_ps(2e+00f), tmp495);
tmp490 = _mm512_fnmadd_ps(tmp489, _mm512_set1_ps(2e+00f), tmp490);
tmp495 = _mm512_fnmadd_ps(tmp494, _mm512_set1_ps(2e+00f), tmp495);
tmp489 = _mm512_fmadd_ps(in60, _mm512_set1_ps(2.5e-01f), in56);
tmp494 = _mm512_fmadd_ps(in67, _mm512_set1_ps(2.5e-01f), in63);
in56 = _mm512_sub_ps(in62, in56);
in63 = _mm512_sub_ps(in69, in63);
tmp489 = _mm512_fmadd_ps(in58, _mm512_set1_ps(-1.25e+00f), tmp489);
tmp494 = _mm512_fmadd_ps(in65, _mm512_set1_ps(-1.25e+00f), tmp494);
in58 = _mm512_sub_ps(in58, in60);
in65 = _mm512_sub_ps(in65, in67);
in58 = _mm512_fmadd_ps(in58, _mm512_set1_ps(5.25e+00f), in56);
in65 = _mm512_fmadd_ps(in65, _mm512_set1_ps(5.25e+00f), in63);
in57 = _mm512_fmadd_ps(tmp489, _mm512_set1_ps(2e+00f), in59);
in64 = _mm512_fmadd_ps(tmp494, _mm512_set1_ps(2e+00f), in66);
in59 = _mm512_fnmadd_ps(tmp489, _mm512_set1_ps(2e+00f), in59);
in66 = _mm512_fnmadd_ps(tmp494, _mm512_set1_ps(2e+00f), in66);
// Four-stage shuffle network over the sixteen first-pass results (eight per register set).
// NOTE(review): this appears to transpose the 8-wide windows so the second pass can run
// along the other tile axis - confirm against the generator.
// Stage 1: interleave 32-bit elements of consecutive result pairs within each 128-bit lane.
__m512 tmp507 = _mm512_unpacklo_ps(tmp492, tmp491);
__m512 tmp508 = _mm512_unpackhi_ps(tmp492, tmp491);
__m512 tmp509 = _mm512_unpacklo_ps(tmp493, in61);
__m512 tmp510 = _mm512_unpackhi_ps(tmp493, in61);
__m512 tmp511 = _mm512_unpacklo_ps(tmp490, in57);
__m512 tmp512 = _mm512_unpackhi_ps(tmp490, in57);
__m512 tmp513 = _mm512_unpacklo_ps(in59, in58);
__m512 tmp514 = _mm512_unpackhi_ps(in59, in58);
__m512 tmp515 = _mm512_unpacklo_ps(tmp497, tmp496);
__m512 tmp516 = _mm512_unpackhi_ps(tmp497, tmp496);
__m512 tmp517 = _mm512_unpacklo_ps(tmp498, in68);
__m512 tmp518 = _mm512_unpackhi_ps(tmp498, in68);
__m512 tmp519 = _mm512_unpacklo_ps(tmp495, in64);
__m512 tmp520 = _mm512_unpackhi_ps(tmp495, in64);
__m512 tmp521 = _mm512_unpacklo_ps(in66, in65);
__m512 tmp522 = _mm512_unpackhi_ps(in66, in65);
// Stage 2: per 128-bit lane, imm 68 takes the low 64 bits of both operands and
// imm 238 takes the high 64 bits of both operands.
__m512 tmp523 = _mm512_shuffle_ps(tmp507, tmp509, 68);
__m512 tmp524 = _mm512_shuffle_ps(tmp507, tmp509, 238);
__m512 tmp525 = _mm512_shuffle_ps(tmp508, tmp510, 68);
__m512 tmp526 = _mm512_shuffle_ps(tmp508, tmp510, 238);
__m512 tmp527 = _mm512_shuffle_ps(tmp511, tmp513, 68);
__m512 tmp528 = _mm512_shuffle_ps(tmp511, tmp513, 238);
__m512 tmp529 = _mm512_shuffle_ps(tmp512, tmp514, 68);
__m512 tmp530 = _mm512_shuffle_ps(tmp512, tmp514, 238);
__m512 tmp531 = _mm512_shuffle_ps(tmp515, tmp517, 68);
__m512 tmp532 = _mm512_shuffle_ps(tmp515, tmp517, 238);
__m512 tmp533 = _mm512_shuffle_ps(tmp516, tmp518, 68);
__m512 tmp534 = _mm512_shuffle_ps(tmp516, tmp518, 238);
__m512 tmp535 = _mm512_shuffle_ps(tmp519, tmp521, 68);
__m512 tmp536 = _mm512_shuffle_ps(tmp519, tmp521, 238);
__m512 tmp537 = _mm512_shuffle_ps(tmp520, tmp522, 68);
__m512 tmp538 = _mm512_shuffle_ps(tmp520, tmp522, 238);
// Stage 3: imm 136 gathers 128-bit lanes 0 and 2 of both operands, imm 221 gathers
// lanes 1 and 3. Operands here still come from the same register set.
__m512 tmp539 = _mm512_shuffle_f32x4(tmp523, tmp527, 136);
__m512 tmp540 = _mm512_shuffle_f32x4(tmp523, tmp527, 221);
__m512 tmp541 = _mm512_shuffle_f32x4(tmp524, tmp528, 136);
__m512 tmp542 = _mm512_shuffle_f32x4(tmp524, tmp528, 221);
__m512 tmp543 = _mm512_shuffle_f32x4(tmp525, tmp529, 136);
__m512 tmp544 = _mm512_shuffle_f32x4(tmp525, tmp529, 221);
__m512 tmp545 = _mm512_shuffle_f32x4(tmp526, tmp530, 136);
__m512 tmp546 = _mm512_shuffle_f32x4(tmp526, tmp530, 221);
__m512 tmp547 = _mm512_shuffle_f32x4(tmp531, tmp535, 136);
__m512 tmp548 = _mm512_shuffle_f32x4(tmp531, tmp535, 221);
__m512 tmp549 = _mm512_shuffle_f32x4(tmp532, tmp536, 136);
__m512 tmp550 = _mm512_shuffle_f32x4(tmp532, tmp536, 221);
__m512 tmp551 = _mm512_shuffle_f32x4(tmp533, tmp537, 136);
__m512 tmp552 = _mm512_shuffle_f32x4(tmp533, tmp537, 221);
__m512 tmp553 = _mm512_shuffle_f32x4(tmp534, tmp538, 136);
__m512 tmp554 = _mm512_shuffle_f32x4(tmp534, tmp538, 221);
// Stage 4: same 136/221 selection, now mixing the first set (tmp539..tmp546) with the
// second set (tmp547..tmp554). Results overwrite the first-pass register names.
tmp492 = _mm512_shuffle_f32x4(tmp539, tmp547, 136);
tmp497 = _mm512_shuffle_f32x4(tmp539, tmp547, 221);
tmp491 = _mm512_shuffle_f32x4(tmp541, tmp549, 136);
tmp496 = _mm512_shuffle_f32x4(tmp541, tmp549, 221);
tmp493 = _mm512_shuffle_f32x4(tmp543, tmp551, 136);
tmp498 = _mm512_shuffle_f32x4(tmp543, tmp551, 221);
in61 = _mm512_shuffle_f32x4(tmp545, tmp553, 136);
in68 = _mm512_shuffle_f32x4(tmp545, tmp553, 221);
tmp490 = _mm512_shuffle_f32x4(tmp540, tmp548, 136);
tmp495 = _mm512_shuffle_f32x4(tmp540, tmp548, 221);
in57 = _mm512_shuffle_f32x4(tmp542, tmp550, 136);
in64 = _mm512_shuffle_f32x4(tmp542, tmp550, 221);
in59 = _mm512_shuffle_f32x4(tmp544, tmp552, 136);
in66 = _mm512_shuffle_f32x4(tmp544, tmp552, 221);
in58 = _mm512_shuffle_f32x4(tmp546, tmp554, 136);
in65 = _mm512_shuffle_f32x4(tmp546, tmp554, 221);
// Second transform pass, run on the shuffled registers. It uses the same constants as
// the first pass (-4.25, 5.25, 0.25, 4, -5, -1.25, 2) and again alternates between two
// register sets. All eight inputs per set are live here:
//   tmp492, tmp491, tmp493, in61, tmp490, in57, in59, in58
//   tmp497, tmp496, tmp498, in68, tmp495, in64, in66, in65
// The eight results per set end up, in order, in:
//   tmp492, tmp501, tmp502, in59, tmp500, tmp493, tmp490, in61
//   tmp497, tmp505, tmp506, in66, tmp504, tmp498, tmp495, in68
__m512 tmp499 = _mm512_add_ps(tmp491, in57);
__m512 tmp503 = _mm512_add_ps(tmp496, in64);
__m512 tmp500 = _mm512_sub_ps(tmp490, tmp493);
__m512 tmp504 = _mm512_sub_ps(tmp495, tmp498);
__m512 tmp501 = _mm512_add_ps(tmp493, in59);
__m512 tmp505 = _mm512_add_ps(tmp498, in66);
tmp492 = _mm512_sub_ps(tmp492, in59);
tmp497 = _mm512_sub_ps(tmp497, in66);
tmp499 = _mm512_fmadd_ps(in61, _mm512_set1_ps(-4.25e+00f), tmp499);
tmp503 = _mm512_fmadd_ps(in68, _mm512_set1_ps(-4.25e+00f), tmp503);
tmp501 = _mm512_fmadd_ps(tmp490, _mm512_set1_ps(-4.25e+00f), tmp501);
tmp505 = _mm512_fmadd_ps(tmp495, _mm512_set1_ps(-4.25e+00f), tmp505);
tmp492 = _mm512_fmadd_ps(tmp500, _mm512_set1_ps(5.25e+00f), tmp492);
tmp497 = _mm512_fmadd_ps(tmp504, _mm512_set1_ps(5.25e+00f), tmp497);
tmp500 = _mm512_fmadd_ps(tmp493, _mm512_set1_ps(2.5e-01f), in59);
tmp504 = _mm512_fmadd_ps(tmp498, _mm512_set1_ps(2.5e-01f), in66);
tmp493 = _mm512_fmadd_ps(tmp493, _mm512_set1_ps(4e+00f), in59);
tmp498 = _mm512_fmadd_ps(tmp498, _mm512_set1_ps(4e+00f), in66);
// Difference and sum of the two partial sums built above.
__m512 tmp502 = _mm512_sub_ps(tmp501, tmp499);
__m512 tmp506 = _mm512_sub_ps(tmp505, tmp503);
tmp501 = _mm512_add_ps(tmp499, tmp501);
tmp505 = _mm512_add_ps(tmp503, tmp505);
// tmp499/tmp503 are reused from here on as scratch for the next partial sums.
tmp499 = _mm512_fmadd_ps(tmp491, _mm512_set1_ps(2.5e-01f), in57);
tmp503 = _mm512_fmadd_ps(tmp496, _mm512_set1_ps(2.5e-01f), in64);
tmp500 = _mm512_fmadd_ps(tmp490, _mm512_set1_ps(-1.25e+00f), tmp500);
tmp504 = _mm512_fmadd_ps(tmp495, _mm512_set1_ps(-1.25e+00f), tmp504);
tmp490 = _mm512_fmadd_ps(tmp490, _mm512_set1_ps(-5e+00f), tmp493);
tmp495 = _mm512_fmadd_ps(tmp495, _mm512_set1_ps(-5e+00f), tmp498);
tmp499 = _mm512_fmadd_ps(in61, _mm512_set1_ps(-1.25e+00f), tmp499);
tmp503 = _mm512_fmadd_ps(in68, _mm512_set1_ps(-1.25e+00f), tmp503);
in59 = _mm512_fmadd_ps(tmp499, _mm512_set1_ps(2e+00f), tmp500);
in66 = _mm512_fmadd_ps(tmp503, _mm512_set1_ps(2e+00f), tmp504);
tmp500 = _mm512_fnmadd_ps(tmp499, _mm512_set1_ps(2e+00f), tmp500);
tmp504 = _mm512_fnmadd_ps(tmp503, _mm512_set1_ps(2e+00f), tmp504);
tmp499 = _mm512_fmadd_ps(in57, _mm512_set1_ps(2.5e-01f), tmp491);
tmp503 = _mm512_fmadd_ps(in64, _mm512_set1_ps(2.5e-01f), tmp496);
tmp491 = _mm512_sub_ps(in58, tmp491);
tmp496 = _mm512_sub_ps(in65, tmp496);
tmp499 = _mm512_fmadd_ps(in61, _mm512_set1_ps(-1.25e+00f), tmp499);
tmp503 = _mm512_fmadd_ps(in68, _mm512_set1_ps(-1.25e+00f), tmp503);
in61 = _mm512_sub_ps(in61, in57);
in68 = _mm512_sub_ps(in68, in64);
in61 = _mm512_fmadd_ps(in61, _mm512_set1_ps(5.25e+00f), tmp491);
in68 = _mm512_fmadd_ps(in68, _mm512_set1_ps(5.25e+00f), tmp496);
tmp493 = _mm512_fmadd_ps(tmp499, _mm512_set1_ps(2e+00f), tmp490);
tmp498 = _mm512_fmadd_ps(tmp503, _mm512_set1_ps(2e+00f), tmp495);
tmp490 = _mm512_fnmadd_ps(tmp499, _mm512_set1_ps(2e+00f), tmp490);
tmp495 = _mm512_fnmadd_ps(tmp503, _mm512_set1_ps(2e+00f), tmp495);
// Pack the sixteen second-pass results pairwise and write them to dfPtr4.
// _mm512_shuffle_f32x4 with imm 68 concatenates the low 256 bits of both operands;
// imm 238 concatenates their high 256 bits. Each outNN therefore holds one half of
// two consecutive results of the same register set.
__m512 out71 = _mm512_shuffle_f32x4(tmp492, tmp501, 68);
__m512 out79 = _mm512_shuffle_f32x4(tmp492, tmp501, 238);
__m512 out72 = _mm512_shuffle_f32x4(tmp502, in59, 68);
__m512 out80 = _mm512_shuffle_f32x4(tmp502, in59, 238);
__m512 out73 = _mm512_shuffle_f32x4(tmp500, tmp493, 68);
__m512 out81 = _mm512_shuffle_f32x4(tmp500, tmp493, 238);
__m512 out74 = _mm512_shuffle_f32x4(tmp490, in61, 68);
__m512 out82 = _mm512_shuffle_f32x4(tmp490, in61, 238);
__m512 out75 = _mm512_shuffle_f32x4(tmp497, tmp505, 68);
__m512 out83 = _mm512_shuffle_f32x4(tmp497, tmp505, 238);
__m512 out76 = _mm512_shuffle_f32x4(tmp506, in66, 68);
__m512 out84 = _mm512_shuffle_f32x4(tmp506, in66, 238);
__m512 out77 = _mm512_shuffle_f32x4(tmp504, tmp498, 68);
__m512 out85 = _mm512_shuffle_f32x4(tmp504, tmp498, 238);
__m512 out78 = _mm512_shuffle_f32x4(tmp495, in68, 68);
__m512 out86 = _mm512_shuffle_f32x4(tmp495, in68, 238);
// Unaligned stores into four regions of dfPtr4 spaced 409600 apart
// (0, 409600, 819200, 1228800). Inside a region the four vectors sit at
// +0, +64, +128, +192 and are written in the order 0, 128, 64, 192.
// The destination advances by 768 per k53 step.
_mm512_storeu_ps(dfPtr4+0+1638400*i16+24576*j11+24576*s13+768*k53, out71);
_mm512_storeu_ps(dfPtr4+128+1638400*i16+24576*j11+24576*s13+768*k53, out79);
_mm512_storeu_ps(dfPtr4+64+1638400*i16+24576*j11+24576*s13+768*k53, out75);
_mm512_storeu_ps(dfPtr4+192+1638400*i16+24576*j11+24576*s13+768*k53, out83);
_mm512_storeu_ps(dfPtr4+409600+1638400*i16+24576*j11+24576*s13+768*k53, out72);
_mm512_storeu_ps(dfPtr4+409728+1638400*i16+24576*j11+24576*s13+768*k53, out80);
_mm512_storeu_ps(dfPtr4+409664+1638400*i16+24576*j11+24576*s13+768*k53, out76);
_mm512_storeu_ps(dfPtr4+409792+1638400*i16+24576*j11+24576*s13+768*k53, out84);
_mm512_storeu_ps(dfPtr4+819200+1638400*i16+24576*j11+24576*s13+768*k53, out73);
_mm512_storeu_ps(dfPtr4+819328+1638400*i16+24576*j11+24576*s13+768*k53, out81);
_mm512_storeu_ps(dfPtr4+819264+1638400*i16+24576*j11+24576*s13+768*k53, out77);
_mm512_storeu_ps(dfPtr4+819392+1638400*i16+24576*j11+24576*s13+768*k53, out85);
_mm512_storeu_ps(dfPtr4+1228800+1638400*i16+24576*j11+24576*s13+768*k53, out74);
_mm512_storeu_ps(dfPtr4+1228928+1638400*i16+24576*j11+24576*s13+768*k53, out82);
_mm512_storeu_ps(dfPtr4+1228864+1638400*i16+24576*j11+24576*s13+768*k53, out78);
_mm512_storeu_ps(dfPtr4+1228992+1638400*i16+24576*j11+24576*s13+768*k53, out86);
// Load stage for the second register pair of this k53 iteration.
// Set in70..in77: eight loads at offsets 1204, 1428, ..., 2772 (step 224), mask 8191
// (0x1FFF, lanes 0-12 kept, the rest zeroed), permuted with pm72.
// Set in78..in84: seven loads at offsets 12832, 13056, ..., 14176 (step 224), mask 16383
// (0x3FFF, lanes 0-13 kept), permuted with pm73.
// Each loaded vector is clamped with max(0, x), i.e. ReLU is applied on the fly.
// NOTE(review): 12832 = 224 + 12608, so the second set looks like the same rows as the
// 224-based loads in another plane/channel of datPtr5 - confirm.
__m512 dat977 = _mm512_maskz_loadu_ps(8191, datPtr5+1204+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat977 = _mm512_max_ps(_mm512_setzero_ps(), dat977);
// pm72: output lane 0 takes source lane 15 (zeroed by the 8191 mask), lanes 1-7 take
// source lanes 0..6, lanes 8-15 take source lanes 5..12.
// NOTE(review): the zero in lane 0 presumably stands for left-edge padding - confirm.
__m512i pm72 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in70 = _mm512_permutexvar_ps(pm72, dat977);
__m512 dat978 = _mm512_maskz_loadu_ps(8191, datPtr5+1428+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat978 = _mm512_max_ps(_mm512_setzero_ps(), dat978);
__m512 dat979 = _mm512_maskz_loadu_ps(16383, datPtr5+12832+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat979 = _mm512_max_ps(_mm512_setzero_ps(), dat979);
__m512 in71 = _mm512_permutexvar_ps(pm72, dat978);
// pm73: output lanes 0-7 take source lanes 0..7, lanes 8-15 take source lanes 6..13.
__m512i pm73 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in78 = _mm512_permutexvar_ps(pm73, dat979);
__m512 dat980 = _mm512_maskz_loadu_ps(8191, datPtr5+1652+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat980 = _mm512_max_ps(_mm512_setzero_ps(), dat980);
__m512 dat981 = _mm512_maskz_loadu_ps(16383, datPtr5+13056+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat981 = _mm512_max_ps(_mm512_setzero_ps(), dat981);
__m512 in72 = _mm512_permutexvar_ps(pm72, dat980);
__m512 in79 = _mm512_permutexvar_ps(pm73, dat981);
__m512 dat982 = _mm512_maskz_loadu_ps(8191, datPtr5+1876+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat982 = _mm512_max_ps(_mm512_setzero_ps(), dat982);
__m512 dat983 = _mm512_maskz_loadu_ps(16383, datPtr5+13280+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat983 = _mm512_max_ps(_mm512_setzero_ps(), dat983);
__m512 in73 = _mm512_permutexvar_ps(pm72, dat982);
__m512 in80 = _mm512_permutexvar_ps(pm73, dat983);
__m512 dat984 = _mm512_maskz_loadu_ps(8191, datPtr5+2100+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat984 = _mm512_max_ps(_mm512_setzero_ps(), dat984);
__m512 dat985 = _mm512_maskz_loadu_ps(16383, datPtr5+13504+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat985 = _mm512_max_ps(_mm512_setzero_ps(), dat985);
__m512 in74 = _mm512_permutexvar_ps(pm72, dat984);
__m512 in81 = _mm512_permutexvar_ps(pm73, dat985);
__m512 dat986 = _mm512_maskz_loadu_ps(8191, datPtr5+2324+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat986 = _mm512_max_ps(_mm512_setzero_ps(), dat986);
__m512 dat987 = _mm512_maskz_loadu_ps(16383, datPtr5+13728+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat987 = _mm512_max_ps(_mm512_setzero_ps(), dat987);
__m512 in75 = _mm512_permutexvar_ps(pm72, dat986);
__m512 in82 = _mm512_permutexvar_ps(pm73, dat987);
__m512 dat988 = _mm512_maskz_loadu_ps(8191, datPtr5+2548+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat988 = _mm512_max_ps(_mm512_setzero_ps(), dat988);
__m512 dat989 = _mm512_maskz_loadu_ps(16383, datPtr5+13952+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat989 = _mm512_max_ps(_mm512_setzero_ps(), dat989);
__m512 in76 = _mm512_permutexvar_ps(pm72, dat988);
__m512 in83 = _mm512_permutexvar_ps(pm73, dat989);
__m512 dat990 = _mm512_maskz_loadu_ps(8191, datPtr5+2772+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat990 = _mm512_max_ps(_mm512_setzero_ps(), dat990);
__m512 dat991 = _mm512_maskz_loadu_ps(16383, datPtr5+14176+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat991 = _mm512_max_ps(_mm512_setzero_ps(), dat991);
__m512 in77 = _mm512_permutexvar_ps(pm72, dat990);
__m512 in84 = _mm512_permutexvar_ps(pm73, dat991);
// First transform pass for the second register pair. Statements alternate between the
// set in70..in77 (eight inputs) and the set in78..in84 (seven inputs).
// Constants used: -4.25, 5.25, 0.25, 4, -5, -1.25 and 2. fnmadd(a, b, c) computes c - a*b.
// The two sets differ only in their leading term: the first set starts from
// in70 - in76, while the second starts from 0 - in83 because it has no eighth input.
// NOTE(review): the missing input is presumably a zero-padded edge row - confirm.
// The eight results per set end up, in order, in:
//   in70,   tmp557, tmp558, in76, tmp556, in72, in74, in73
//   tmp562, tmp561, tmp563, in83, tmp560, in79, in81, in80
__m512 tmp555 = _mm512_add_ps(in71, in75);
__m512 tmp559 = _mm512_add_ps(in78, in82);
__m512 tmp556 = _mm512_sub_ps(in74, in72);
__m512 tmp560 = _mm512_sub_ps(in81, in79);
__m512 tmp557 = _mm512_add_ps(in72, in76);
__m512 tmp561 = _mm512_add_ps(in79, in83);
in70 = _mm512_sub_ps(in70, in76);
__m512 tmp562 = _mm512_sub_ps(_mm512_setzero_ps(), in83);
tmp555 = _mm512_fmadd_ps(in73, _mm512_set1_ps(-4.25e+00f), tmp555);
tmp559 = _mm512_fmadd_ps(in80, _mm512_set1_ps(-4.25e+00f), tmp559);
tmp557 = _mm512_fmadd_ps(in74, _mm512_set1_ps(-4.25e+00f), tmp557);
tmp561 = _mm512_fmadd_ps(in81, _mm512_set1_ps(-4.25e+00f), tmp561);
in70 = _mm512_fmadd_ps(tmp556, _mm512_set1_ps(5.25e+00f), in70);
tmp562 = _mm512_fmadd_ps(tmp560, _mm512_set1_ps(5.25e+00f), tmp562);
tmp556 = _mm512_fmadd_ps(in72, _mm512_set1_ps(2.5e-01f), in76);
tmp560 = _mm512_fmadd_ps(in79, _mm512_set1_ps(2.5e-01f), in83);
in72 = _mm512_fmadd_ps(in72, _mm512_set1_ps(4e+00f), in76);
in79 = _mm512_fmadd_ps(in79, _mm512_set1_ps(4e+00f), in83);
// Difference and sum of the two partial sums built above.
__m512 tmp558 = _mm512_sub_ps(tmp557, tmp555);
__m512 tmp563 = _mm512_sub_ps(tmp561, tmp559);
tmp557 = _mm512_add_ps(tmp555, tmp557);
tmp561 = _mm512_add_ps(tmp559, tmp561);
// tmp555/tmp559 are reused from here on as scratch for the next partial sums.
tmp555 = _mm512_fmadd_ps(in71, _mm512_set1_ps(2.5e-01f), in75);
tmp559 = _mm512_fmadd_ps(in78, _mm512_set1_ps(2.5e-01f), in82);
tmp556 = _mm512_fmadd_ps(in74, _mm512_set1_ps(-1.25e+00f), tmp556);
tmp560 = _mm512_fmadd_ps(in81, _mm512_set1_ps(-1.25e+00f), tmp560);
in74 = _mm512_fmadd_ps(in74, _mm512_set1_ps(-5e+00f), in72);
in81 = _mm512_fmadd_ps(in81, _mm512_set1_ps(-5e+00f), in79);
tmp555 = _mm512_fmadd_ps(in73, _mm512_set1_ps(-1.25e+00f), tmp555);
tmp559 = _mm512_fmadd_ps(in80, _mm512_set1_ps(-1.25e+00f), tmp559);
in76 = _mm512_fmadd_ps(tmp555, _mm512_set1_ps(2e+00f), tmp556);
in83 = _mm512_fmadd_ps(tmp559, _mm512_set1_ps(2e+00f), tmp560);
tmp556 = _mm512_fnmadd_ps(tmp555, _mm512_set1_ps(2e+00f), tmp556);
tmp560 = _mm512_fnmadd_ps(tmp559, _mm512_set1_ps(2e+00f), tmp560);
tmp555 = _mm512_fmadd_ps(in75, _mm512_set1_ps(2.5e-01f), in71);
tmp559 = _mm512_fmadd_ps(in82, _mm512_set1_ps(2.5e-01f), in78);
in71 = _mm512_sub_ps(in77, in71);
in78 = _mm512_sub_ps(in84, in78);
tmp555 = _mm512_fmadd_ps(in73, _mm512_set1_ps(-1.25e+00f), tmp555);
tmp559 = _mm512_fmadd_ps(in80, _mm512_set1_ps(-1.25e+00f), tmp559);
in73 = _mm512_sub_ps(in73, in75);
in80 = _mm512_sub_ps(in80, in82);
in73 = _mm512_fmadd_ps(in73, _mm512_set1_ps(5.25e+00f), in71);
in80 = _mm512_fmadd_ps(in80, _mm512_set1_ps(5.25e+00f), in78);
in72 = _mm512_fmadd_ps(tmp555, _mm512_set1_ps(2e+00f), in74);
in79 = _mm512_fmadd_ps(tmp559, _mm512_set1_ps(2e+00f), in81);
in74 = _mm512_fnmadd_ps(tmp555, _mm512_set1_ps(2e+00f), in74);
in81 = _mm512_fnmadd_ps(tmp559, _mm512_set1_ps(2e+00f), in81);
// Four-stage shuffle network over the sixteen first-pass results (eight per register set).
// NOTE(review): this appears to transpose the 8-wide windows so the second pass can run
// along the other tile axis - confirm against the generator.
// Stage 1: interleave 32-bit elements of consecutive result pairs within each 128-bit lane.
__m512 tmp572 = _mm512_unpacklo_ps(in70, tmp557);
__m512 tmp573 = _mm512_unpackhi_ps(in70, tmp557);
__m512 tmp574 = _mm512_unpacklo_ps(tmp558, in76);
__m512 tmp575 = _mm512_unpackhi_ps(tmp558, in76);
__m512 tmp576 = _mm512_unpacklo_ps(tmp556, in72);
__m512 tmp577 = _mm512_unpackhi_ps(tmp556, in72);
__m512 tmp578 = _mm512_unpacklo_ps(in74, in73);
__m512 tmp579 = _mm512_unpackhi_ps(in74, in73);
__m512 tmp580 = _mm512_unpacklo_ps(tmp562, tmp561);
__m512 tmp581 = _mm512_unpackhi_ps(tmp562, tmp561);
__m512 tmp582 = _mm512_unpacklo_ps(tmp563, in83);
__m512 tmp583 = _mm512_unpackhi_ps(tmp563, in83);
__m512 tmp584 = _mm512_unpacklo_ps(tmp560, in79);
__m512 tmp585 = _mm512_unpackhi_ps(tmp560, in79);
__m512 tmp586 = _mm512_unpacklo_ps(in81, in80);
__m512 tmp587 = _mm512_unpackhi_ps(in81, in80);
// Stage 2: per 128-bit lane, imm 68 takes the low 64 bits of both operands and
// imm 238 takes the high 64 bits of both operands.
__m512 tmp588 = _mm512_shuffle_ps(tmp572, tmp574, 68);
__m512 tmp589 = _mm512_shuffle_ps(tmp572, tmp574, 238);
__m512 tmp590 = _mm512_shuffle_ps(tmp573, tmp575, 68);
__m512 tmp591 = _mm512_shuffle_ps(tmp573, tmp575, 238);
__m512 tmp592 = _mm512_shuffle_ps(tmp576, tmp578, 68);
__m512 tmp593 = _mm512_shuffle_ps(tmp576, tmp578, 238);
__m512 tmp594 = _mm512_shuffle_ps(tmp577, tmp579, 68);
__m512 tmp595 = _mm512_shuffle_ps(tmp577, tmp579, 238);
__m512 tmp596 = _mm512_shuffle_ps(tmp580, tmp582, 68);
__m512 tmp597 = _mm512_shuffle_ps(tmp580, tmp582, 238);
__m512 tmp598 = _mm512_shuffle_ps(tmp581, tmp583, 68);
__m512 tmp599 = _mm512_shuffle_ps(tmp581, tmp583, 238);
__m512 tmp600 = _mm512_shuffle_ps(tmp584, tmp586, 68);
__m512 tmp601 = _mm512_shuffle_ps(tmp584, tmp586, 238);
__m512 tmp602 = _mm512_shuffle_ps(tmp585, tmp587, 68);
__m512 tmp603 = _mm512_shuffle_ps(tmp585, tmp587, 238);
// Stage 3: imm 136 gathers 128-bit lanes 0 and 2 of both operands, imm 221 gathers
// lanes 1 and 3. Operands here still come from the same register set.
__m512 tmp604 = _mm512_shuffle_f32x4(tmp588, tmp592, 136);
__m512 tmp605 = _mm512_shuffle_f32x4(tmp588, tmp592, 221);
__m512 tmp606 = _mm512_shuffle_f32x4(tmp589, tmp593, 136);
__m512 tmp607 = _mm512_shuffle_f32x4(tmp589, tmp593, 221);
__m512 tmp608 = _mm512_shuffle_f32x4(tmp590, tmp594, 136);
__m512 tmp609 = _mm512_shuffle_f32x4(tmp590, tmp594, 221);
__m512 tmp610 = _mm512_shuffle_f32x4(tmp591, tmp595, 136);
__m512 tmp611 = _mm512_shuffle_f32x4(tmp591, tmp595, 221);
__m512 tmp612 = _mm512_shuffle_f32x4(tmp596, tmp600, 136);
__m512 tmp613 = _mm512_shuffle_f32x4(tmp596, tmp600, 221);
__m512 tmp614 = _mm512_shuffle_f32x4(tmp597, tmp601, 136);
__m512 tmp615 = _mm512_shuffle_f32x4(tmp597, tmp601, 221);
__m512 tmp616 = _mm512_shuffle_f32x4(tmp598, tmp602, 136);
__m512 tmp617 = _mm512_shuffle_f32x4(tmp598, tmp602, 221);
__m512 tmp618 = _mm512_shuffle_f32x4(tmp599, tmp603, 136);
__m512 tmp619 = _mm512_shuffle_f32x4(tmp599, tmp603, 221);
// Stage 4: same 136/221 selection, now mixing the first set (tmp604..tmp611) with the
// second set (tmp612..tmp619). Results overwrite the first-pass register names.
in70 = _mm512_shuffle_f32x4(tmp604, tmp612, 136);
tmp562 = _mm512_shuffle_f32x4(tmp604, tmp612, 221);
tmp557 = _mm512_shuffle_f32x4(tmp606, tmp614, 136);
tmp561 = _mm512_shuffle_f32x4(tmp606, tmp614, 221);
tmp558 = _mm512_shuffle_f32x4(tmp608, tmp616, 136);
tmp563 = _mm512_shuffle_f32x4(tmp608, tmp616, 221);
in76 = _mm512_shuffle_f32x4(tmp610, tmp618, 136);
in83 = _mm512_shuffle_f32x4(tmp610, tmp618, 221);
tmp556 = _mm512_shuffle_f32x4(tmp605, tmp613, 136);
tmp560 = _mm512_shuffle_f32x4(tmp605, tmp613, 221);
in72 = _mm512_shuffle_f32x4(tmp607, tmp615, 136);
in79 = _mm512_shuffle_f32x4(tmp607, tmp615, 221);
in74 = _mm512_shuffle_f32x4(tmp609, tmp617, 136);
in81 = _mm512_shuffle_f32x4(tmp609, tmp617, 221);
in73 = _mm512_shuffle_f32x4(tmp611, tmp619, 136);
in80 = _mm512_shuffle_f32x4(tmp611, tmp619, 221);
// Second transform pass for the second register pair, run on the shuffled registers.
// It uses the same constants as the first pass (-4.25, 5.25, 0.25, 4, -5, -1.25, 2) and
// alternates between two register sets. All eight inputs per set are live here:
//   in70,   tmp557, tmp558, in76, tmp556, in72, in74, in73
//   tmp562, tmp561, tmp563, in83, tmp560, in79, in81, in80
// The eight results per set end up, in order, in:
//   in70,   tmp566, tmp567, in74, tmp565, tmp558, tmp556, in76
//   tmp562, tmp570, tmp571, in81, tmp569, tmp563, tmp560, in83
__m512 tmp564 = _mm512_add_ps(tmp557, in72);
__m512 tmp568 = _mm512_add_ps(tmp561, in79);
__m512 tmp565 = _mm512_sub_ps(tmp556, tmp558);
__m512 tmp569 = _mm512_sub_ps(tmp560, tmp563);
__m512 tmp566 = _mm512_add_ps(tmp558, in74);
__m512 tmp570 = _mm512_add_ps(tmp563, in81);
in70 = _mm512_sub_ps(in70, in74);
tmp562 = _mm512_sub_ps(tmp562, in81);
tmp564 = _mm512_fmadd_ps(in76, _mm512_set1_ps(-4.25e+00f), tmp564);
tmp568 = _mm512_fmadd_ps(in83, _mm512_set1_ps(-4.25e+00f), tmp568);
tmp566 = _mm512_fmadd_ps(tmp556, _mm512_set1_ps(-4.25e+00f), tmp566);
tmp570 = _mm512_fmadd_ps(tmp560, _mm512_set1_ps(-4.25e+00f), tmp570);
in70 = _mm512_fmadd_ps(tmp565, _mm512_set1_ps(5.25e+00f), in70);
tmp562 = _mm512_fmadd_ps(tmp569, _mm512_set1_ps(5.25e+00f), tmp562);
tmp565 = _mm512_fmadd_ps(tmp558, _mm512_set1_ps(2.5e-01f), in74);
tmp569 = _mm512_fmadd_ps(tmp563, _mm512_set1_ps(2.5e-01f), in81);
tmp558 = _mm512_fmadd_ps(tmp558, _mm512_set1_ps(4e+00f), in74);
tmp563 = _mm512_fmadd_ps(tmp563, _mm512_set1_ps(4e+00f), in81);
// Difference and sum of the two partial sums built above.
__m512 tmp567 = _mm512_sub_ps(tmp566, tmp564);
__m512 tmp571 = _mm512_sub_ps(tmp570, tmp568);
tmp566 = _mm512_add_ps(tmp564, tmp566);
tmp570 = _mm512_add_ps(tmp568, tmp570);
// tmp564/tmp568 are reused from here on as scratch for the next partial sums.
tmp564 = _mm512_fmadd_ps(tmp557, _mm512_set1_ps(2.5e-01f), in72);
tmp568 = _mm512_fmadd_ps(tmp561, _mm512_set1_ps(2.5e-01f), in79);
tmp565 = _mm512_fmadd_ps(tmp556, _mm512_set1_ps(-1.25e+00f), tmp565);
tmp569 = _mm512_fmadd_ps(tmp560, _mm512_set1_ps(-1.25e+00f), tmp569);
tmp556 = _mm512_fmadd_ps(tmp556, _mm512_set1_ps(-5e+00f), tmp558);
tmp560 = _mm512_fmadd_ps(tmp560, _mm512_set1_ps(-5e+00f), tmp563);
tmp564 = _mm512_fmadd_ps(in76, _mm512_set1_ps(-1.25e+00f), tmp564);
tmp568 = _mm512_fmadd_ps(in83, _mm512_set1_ps(-1.25e+00f), tmp568);
in74 = _mm512_fmadd_ps(tmp564, _mm512_set1_ps(2e+00f), tmp565);
in81 = _mm512_fmadd_ps(tmp568, _mm512_set1_ps(2e+00f), tmp569);
tmp565 = _mm512_fnmadd_ps(tmp564, _mm512_set1_ps(2e+00f), tmp565);
tmp569 = _mm512_fnmadd_ps(tmp568, _mm512_set1_ps(2e+00f), tmp569);
tmp564 = _mm512_fmadd_ps(in72, _mm512_set1_ps(2.5e-01f), tmp557);
tmp568 = _mm512_fmadd_ps(in79, _mm512_set1_ps(2.5e-01f), tmp561);
tmp557 = _mm512_sub_ps(in73, tmp557);
tmp561 = _mm512_sub_ps(in80, tmp561);
tmp564 = _mm512_fmadd_ps(in76, _mm512_set1_ps(-1.25e+00f), tmp564);
tmp568 = _mm512_fmadd_ps(in83, _mm512_set1_ps(-1.25e+00f), tmp568);
in76 = _mm512_sub_ps(in76, in72);
in83 = _mm512_sub_ps(in83, in79);
in76 = _mm512_fmadd_ps(in76, _mm512_set1_ps(5.25e+00f), tmp557);
in83 = _mm512_fmadd_ps(in83, _mm512_set1_ps(5.25e+00f), tmp561);
tmp558 = _mm512_fmadd_ps(tmp564, _mm512_set1_ps(2e+00f), tmp556);
tmp563 = _mm512_fmadd_ps(tmp568, _mm512_set1_ps(2e+00f), tmp560);
tmp556 = _mm512_fnmadd_ps(tmp564, _mm512_set1_ps(2e+00f), tmp556);
tmp560 = _mm512_fnmadd_ps(tmp568, _mm512_set1_ps(2e+00f), tmp560);
// Pack the sixteen second-pass results pairwise and write them to dfPtr4.
// _mm512_shuffle_f32x4 with imm 68 concatenates the low 256 bits of both operands;
// imm 238 concatenates their high 256 bits. Each outNN therefore holds one half of
// two consecutive results of the same register set.
__m512 out87 = _mm512_shuffle_f32x4(in70, tmp566, 68);
__m512 out95 = _mm512_shuffle_f32x4(in70, tmp566, 238);
__m512 out88 = _mm512_shuffle_f32x4(tmp567, in74, 68);
__m512 out96 = _mm512_shuffle_f32x4(tmp567, in74, 238);
__m512 out89 = _mm512_shuffle_f32x4(tmp565, tmp558, 68);
__m512 out97 = _mm512_shuffle_f32x4(tmp565, tmp558, 238);
__m512 out90 = _mm512_shuffle_f32x4(tmp556, in76, 68);
__m512 out98 = _mm512_shuffle_f32x4(tmp556, in76, 238);
__m512 out91 = _mm512_shuffle_f32x4(tmp562, tmp570, 68);
__m512 out99 = _mm512_shuffle_f32x4(tmp562, tmp570, 238);
__m512 out92 = _mm512_shuffle_f32x4(tmp571, in81, 68);
__m512 out100 = _mm512_shuffle_f32x4(tmp571, in81, 238);
__m512 out93 = _mm512_shuffle_f32x4(tmp569, tmp563, 68);
__m512 out101 = _mm512_shuffle_f32x4(tmp569, tmp563, 238);
__m512 out94 = _mm512_shuffle_f32x4(tmp560, in83, 68);
__m512 out102 = _mm512_shuffle_f32x4(tmp560, in83, 238);
// Unaligned stores into four regions of dfPtr4 spaced 409600 apart
// (256, 409856, 819456, 1229056). Inside a region the four vectors sit at
// +0, +64, +128, +192 and are written in the order 0, 128, 64, 192.
// The destination advances by 768 per k53 step.
_mm512_storeu_ps(dfPtr4+256+1638400*i16+24576*j11+24576*s13+768*k53, out87);
_mm512_storeu_ps(dfPtr4+384+1638400*i16+24576*j11+24576*s13+768*k53, out95);
_mm512_storeu_ps(dfPtr4+320+1638400*i16+24576*j11+24576*s13+768*k53, out91);
_mm512_storeu_ps(dfPtr4+448+1638400*i16+24576*j11+24576*s13+768*k53, out99);
_mm512_storeu_ps(dfPtr4+409856+1638400*i16+24576*j11+24576*s13+768*k53, out88);
_mm512_storeu_ps(dfPtr4+409984+1638400*i16+24576*j11+24576*s13+768*k53, out96);
_mm512_storeu_ps(dfPtr4+409920+1638400*i16+24576*j11+24576*s13+768*k53, out92);
_mm512_storeu_ps(dfPtr4+410048+1638400*i16+24576*j11+24576*s13+768*k53, out100);
_mm512_storeu_ps(dfPtr4+819456+1638400*i16+24576*j11+24576*s13+768*k53, out89);
_mm512_storeu_ps(dfPtr4+819584+1638400*i16+24576*j11+24576*s13+768*k53, out97);
_mm512_storeu_ps(dfPtr4+819520+1638400*i16+24576*j11+24576*s13+768*k53, out93);
_mm512_storeu_ps(dfPtr4+819648+1638400*i16+24576*j11+24576*s13+768*k53, out101);
_mm512_storeu_ps(dfPtr4+1229056+1638400*i16+24576*j11+24576*s13+768*k53, out90);
_mm512_storeu_ps(dfPtr4+1229184+1638400*i16+24576*j11+24576*s13+768*k53, out98);
_mm512_storeu_ps(dfPtr4+1229120+1638400*i16+24576*j11+24576*s13+768*k53, out94);
_mm512_storeu_ps(dfPtr4+1229248+1638400*i16+24576*j11+24576*s13+768*k53, out102);
// Load stage for the third register pair of this k53 iteration.
// Set in85..in91: seven loads at offsets 12880, 13104, ..., 14224 (step 224), mask 511
// (0x1FF, lanes 0-8 kept, the rest zeroed), permuted with pm75.
// Set in92..in99: eight loads at offsets 13812, 14036, ..., 15380 (step 224), mask 8191
// (0x1FFF, lanes 0-12 kept), permuted with pm74.
// Each loaded vector is clamped with max(0, x), i.e. ReLU is applied on the fly.
__m512 dat992 = _mm512_maskz_loadu_ps(8191, datPtr5+13812+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat992 = _mm512_max_ps(_mm512_setzero_ps(), dat992);
// pm74: output lane 0 takes source lane 15 (zeroed by the 8191 mask), lanes 1-7 take
// source lanes 0..6, lanes 8-15 take source lanes 5..12.
__m512i pm74 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in92 = _mm512_permutexvar_ps(pm74, dat992);
__m512 dat993 = _mm512_maskz_loadu_ps(511, datPtr5+12880+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat993 = _mm512_max_ps(_mm512_setzero_ps(), dat993);
__m512 dat994 = _mm512_maskz_loadu_ps(8191, datPtr5+14036+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat994 = _mm512_max_ps(_mm512_setzero_ps(), dat994);
// pm75: output lanes 0-7 take source lanes 0..7, lanes 8-10 take source lanes 6..8, and
// lanes 11-15 take source lane 15, which the 511 mask has already zeroed.
__m512i pm75 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in85 = _mm512_permutexvar_ps(pm75, dat993);
__m512 in93 = _mm512_permutexvar_ps(pm74, dat994);
__m512 dat995 = _mm512_maskz_loadu_ps(511, datPtr5+13104+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat995 = _mm512_max_ps(_mm512_setzero_ps(), dat995);
__m512 dat996 = _mm512_maskz_loadu_ps(8191, datPtr5+14260+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat996 = _mm512_max_ps(_mm512_setzero_ps(), dat996);
__m512 in86 = _mm512_permutexvar_ps(pm75, dat995);
__m512 in94 = _mm512_permutexvar_ps(pm74, dat996);
__m512 dat997 = _mm512_maskz_loadu_ps(511, datPtr5+13328+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat997 = _mm512_max_ps(_mm512_setzero_ps(), dat997);
__m512 dat998 = _mm512_maskz_loadu_ps(8191, datPtr5+14484+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat998 = _mm512_max_ps(_mm512_setzero_ps(), dat998);
__m512 in87 = _mm512_permutexvar_ps(pm75, dat997);
__m512 in95 = _mm512_permutexvar_ps(pm74, dat998);
__m512 dat999 = _mm512_maskz_loadu_ps(511, datPtr5+13552+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat999 = _mm512_max_ps(_mm512_setzero_ps(), dat999);
__m512 dat1000 = _mm512_maskz_loadu_ps(8191, datPtr5+14708+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat1000 = _mm512_max_ps(_mm512_setzero_ps(), dat1000);
__m512 in88 = _mm512_permutexvar_ps(pm75, dat999);
__m512 in96 = _mm512_permutexvar_ps(pm74, dat1000);
__m512 dat1001 = _mm512_maskz_loadu_ps(511, datPtr5+13776+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat1001 = _mm512_max_ps(_mm512_setzero_ps(), dat1001);
__m512 dat1002 = _mm512_maskz_loadu_ps(8191, datPtr5+14932+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat1002 = _mm512_max_ps(_mm512_setzero_ps(), dat1002);
__m512 in89 = _mm512_permutexvar_ps(pm75, dat1001);
__m512 in97 = _mm512_permutexvar_ps(pm74, dat1002);
__m512 dat1003 = _mm512_maskz_loadu_ps(511, datPtr5+14000+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat1003 = _mm512_max_ps(_mm512_setzero_ps(), dat1003);
__m512 dat1004 = _mm512_maskz_loadu_ps(8191, datPtr5+15156+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat1004 = _mm512_max_ps(_mm512_setzero_ps(), dat1004);
__m512 in90 = _mm512_permutexvar_ps(pm75, dat1003);
__m512 in98 = _mm512_permutexvar_ps(pm74, dat1004);
__m512 dat1005 = _mm512_maskz_loadu_ps(511, datPtr5+14224+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat1005 = _mm512_max_ps(_mm512_setzero_ps(), dat1005);
__m512 dat1006 = _mm512_maskz_loadu_ps(8191, datPtr5+15380+806912*i16+224*h21+4*w24+806912*s13+25216*k53);
dat1006 = _mm512_max_ps(_mm512_setzero_ps(), dat1006);
__m512 in91 = _mm512_permutexvar_ps(pm75, dat1005);
__m512 in99 = _mm512_permutexvar_ps(pm74, dat1006);
__m512 tmp620 = _mm512_add_ps(in85, in89);
__m512 tmp625 = _mm512_add_ps(in93, in97);
__m512 tmp621 = _mm512_sub_ps(in88, in86);
__m512 tmp626 = _mm512_sub_ps(in96, in94);
__m512 tmp622 = _mm512_add_ps(in86, in90);
__m512 tmp627 = _mm512_add_ps(in94, in98);
__m512 tmp623 = _mm512_sub_ps(_mm512_setzero_ps(), in90);
in92 = _mm512_sub_ps(in92, in98);
tmp620 = _mm512_fmadd_ps(in87, _mm512_set1_ps(-4.25e+00f), tmp620);
tmp625 = _mm512_fmadd_ps(in95, _mm512_set1_ps(-4.25e+00f), tmp625);
tmp622 = _mm512_fmadd_ps(in88, _mm512_set1_ps(-4.25e+00f), tmp622);
tmp627 = _mm512_fmadd_ps(in96, _mm512_set1_ps(-4.25e+00f), tmp627);
tmp623 = _mm512_fmadd_ps(tmp621, _mm512_set1_ps(5.25e+00f), tmp623);
in92 = _mm512_fmadd_ps(tmp626, _mm512_set1_ps(5.25e+00f), in92);
tmp621 = _mm512_fmadd_ps(in86, _mm512_set1_ps(2.5e-01f), in90);
tmp626 = _mm512_fmadd_ps(in94, _mm512_set1_ps(2.5e-01f), in98);
in86 = _mm512_fmadd_ps(in86, _mm512_set1_ps(4e+00f), in90);
in94 = _mm512_fmadd_ps(in94, _mm512_set1_ps(4e+00f), in98);
__m512 tmp624 = _mm512_sub_ps(tmp622, tmp620);
__m512 tmp628 = _mm512_sub_ps(tmp627, tmp625);
tmp622 = _mm512_add_ps(tmp620, tmp622);
tmp627 = _mm512_add_ps(tmp625, tmp627);
tmp620 = _mm512_fmadd_ps(in85, _mm512_set1_ps(2.5e-01f), in89);
tmp625 = _mm512_fmadd_ps(in93, _mm512_set1_ps(2.5e-01f), in97);
tmp621 = _mm512_fmadd_ps(in88, _mm512_set1_ps(-1.25e+00f), tmp621);
tmp626 = _mm512_fmadd_ps(in96, _mm512_set1_ps(-1.25e+00f), tmp626);
in88 = _mm512_fmadd_ps(in88, _mm512_set1_ps(-5e+00f), in86);
in96 = _mm512_fmadd_ps(in96, _mm512_set1_ps(-5e+00f), in94);
tmp620 = _mm512_fmadd_ps(in87, _mm512_set1_ps(-1.25e+00f), tmp620);
tmp625 = _mm512_fmadd_ps(in95, _mm512_set1_ps(-1.25e+00f), tmp625);
in90 = _mm512_fmadd_ps(tmp620, _mm512_set1_ps(2e+00f), tmp621);
in98 = _mm512_fmadd_ps(tmp625, _mm512_set1_ps(2e+00f), tmp626);
tmp621 = _mm512_fnmadd_ps(tmp620, _mm512_set1_ps(2e+00f), tmp621);
tmp626 = _mm512_fnmadd_ps(tmp625, _mm512_set1_ps(2e+00f), tmp626);
tmp620 = _mm512_fmadd_ps(in89, _mm512_set1_ps(2.5e-01f), in85);
tmp625 = _mm512_fmadd_ps(in97, _mm512_set1_ps(2.5e-01f), in93);
in85 = _mm512_sub_ps(in91, in85);
in93 = _mm512_sub_ps(in99, in93);
tmp620 = _mm512_fmadd_ps(in87, _mm512_set1_ps(-1.25e+00f), tmp620);
tmp625 = _mm512_fmadd_ps(in95, _mm512_set1_ps(-1.25e+00f), tmp625);
in87 = _mm512_sub_ps(in87, in89);
in95 = _mm512_sub_ps(in95, in97);
in87 = _mm512_fmadd_ps(in87, _mm512_set1_ps(5.25e+00f), in85);
in95 = _mm512_fmadd_ps(in95, _mm512_set1_ps(5.25e+00f), in93);
in86 = _mm512_fmadd_ps(tmp620, _mm512_set1_ps(2e+00f), in88);
in94 = _mm512_fmadd_ps(tmp625, _mm512_set1_ps(2e+00f), in96);
in88 = _mm512_fnmadd_ps(tmp620, _mm512_set1_ps(2e+00f), in88);
in96 = _mm512_fnmadd_ps(tmp625, _mm512_set1_ps(2e+00f), in96);
__m512 tmp637 = _mm512_unpacklo_ps(tmp623, tmp622);
__m512 tmp638 = _mm512_unpackhi_ps(tmp623, tmp622);
__m512 tmp639 = _mm512_unpacklo_ps(tmp624, in90);
__m512 tmp640 = _mm512_unpackhi_ps(tmp624, in90);
__m512 tmp641 = _mm512_unpacklo_ps(tmp621, in86);
__m512 tmp642 = _mm512_unpackhi_ps(tmp621, in86);
__m512 tmp643 = _mm512_unpacklo_ps(in88, in87);
__m512 tmp644 = _mm512_unpackhi_ps(in88, in87);
__m512 tmp645 = _mm512_unpacklo_ps(in92, tmp627);
__m512 tmp646 = _mm512_unpackhi_ps(in92, tmp627);
__m512 tmp647 = _mm512_unpacklo_ps(tmp628, in98);
__m512 tmp648 = _mm512_unpackhi_ps(tmp628, in98);
__m512 tmp649 = _mm512_unpacklo_ps(tmp626, in94);
__m512 tmp650 = _mm512_unpackhi_ps(tmp626, in94);
__m512 tmp651 = _mm512_unpacklo_ps(in96, in95);
__m512 tmp652 = _mm512_unpackhi_ps(in96, in95);
__m512 tmp653 = _mm512_shuffle_ps(tmp637, tmp639, 68);
__m512 tmp654 = _mm512_shuffle_ps(tmp637, tmp639, 238);
__m512 tmp655 = _mm512_shuffle_ps(tmp638, tmp640, 68);
__m512 tmp656 = _mm512_shuffle_ps(tmp638, tmp640, 238);
__m512 tmp657 = _mm512_shuffle_ps(tmp641, tmp643, 68);
__m512 tmp658 = _mm512_shuffle_ps(tmp641, tmp643, 238);
__m512 tmp659 = _mm512_shuffle_ps(tmp642, tmp644, 68);
__m512 tmp660 = _mm512_shuffle_ps(tmp642, tmp644, 238);
__m512 tmp661 = _mm512_shuffle_ps(tmp645, tmp647, 68);
__m512 tmp662 = _mm512_shuffle_ps(tmp645, tmp647, 238);
__m512 tmp663 = _mm512_shuffle_ps(tmp646, tmp648, 68);
__m512 tmp664 = _mm512_shuffle_ps(tmp646, tmp648, 238);
__m512 tmp665 = _mm512_shuffle_ps(tmp649, tmp651, 68);
__m512 tmp666 = _mm512_shuffle_ps(tmp649, tmp651, 238);
__m512 tmp667 = _mm512_shuffle_ps(tmp650, tmp652, 68);
__m512 tmp668 = _mm512_shuffle_ps(tmp650, tmp652, 238);
__m512 tmp669 = _mm512_shuffle_f32x4(tmp653, tmp657, 136);
__m512 tmp670 = _mm512_shuffle_f32x4(tmp653, tmp657, 221);
__m512 tmp671 = _mm512_shuffle_f32x4(tmp654, tmp658, 136);
__m512 tmp672 = _mm512_shuffle_f32x4(tmp654, tmp658, 221);
__m512 tmp673 = _mm512_shuffle_f32x4(tmp655, tmp659, 136);
__m512 tmp674 = _mm512_shuffle_f32x4(tmp655, tmp659, 221);
__m512 tmp675 = _mm512_shuffle_f32x4(tmp656, tmp660, 136);
__m512 tmp676 = _mm512_shuffle_f32x4(tmp656, tmp660, 221);
__m512 tmp677 = _mm512_shuffle_f32x4(tmp661, tmp665, 136);
__m512 tmp678 = _mm512_shuffle_f32x4(tmp661, tmp665, 221);
__m512 tmp679 = _mm512_shuffle_f32x4(tmp662, tmp666, 136);
__m512 tmp680 = _mm512_shuffle_f32x4(tmp662, tmp666, 221);
__m512 tmp681 = _mm512_shuffle_f32x4(tmp663, tmp667, 136);
__m512 tmp682 = _mm512_shuffle_f32x4(tmp663, tmp667, 221);
__m512 tmp683 = _mm512_shuffle_f32x4(tmp664, tmp668, 136);
__m512 tmp684 = _mm512_shuffle_f32x4(tmp664, tmp668, 221);
tmp623 = _mm512_shuffle_f32x4(tmp669, tmp677, 136);
in92 = _mm512_shuffle_f32x4(tmp669, tmp677, 221);
tmp622 = _mm512_shuffle_f32x4(tmp671, tmp679, 136);
tmp627 = _mm512_shuffle_f32x4(tmp671, tmp679, 221);
tmp624 = _mm512_shuffle_f32x4(tmp673, tmp681, 136);
tmp628 = _mm512_shuffle_f32x4(tmp673, tmp681, 221);
in90 = _mm512_shuffle_f32x4(tmp675, tmp683, 136);
in98 = _mm512_shuffle_f32x4(tmp675, tmp683, 221);
tmp621 = _mm512_shuffle_f32x4(tmp670, tmp678, 136);
tmp626 = _mm512_shuffle_f32x4(tmp670, tmp678, 221);
in86 = _mm512_shuffle_f32x4(tmp672, tmp680, 136);
in94 = _mm512_shuffle_f32x4(tmp672, tmp680, 221);
in88 = _mm512_shuffle_f32x4(tmp674, tmp682, 136);
in96 = _mm512_shuffle_f32x4(tmp674, tmp682, 221);
in87 = _mm512_shuffle_f32x4(tmp676, tmp684, 136);
in95 = _mm512_shuffle_f32x4(tmp676, tmp684, 221);
__m512 tmp629 = _mm512_add_ps(tmp622, in86);
__m512 tmp633 = _mm512_add_ps(tmp627, in94);
__m512 tmp630 = _mm512_sub_ps(tmp621, tmp624);
__m512 tmp634 = _mm512_sub_ps(tmp626, tmp628);
__m512 tmp631 = _mm512_add_ps(tmp624, in88);
__m512 tmp635 = _mm512_add_ps(tmp628, in96);
tmp623 = _mm512_sub_ps(tmp623, in88);
in92 = _mm512_sub_ps(in92, in96);
tmp629 = _mm512_fmadd_ps(in90, _mm512_set1_ps(-4.25e+00f), tmp629);
tmp633 = _mm512_fmadd_ps(in98, _mm512_set1_ps(-4.25e+00f), tmp633);
tmp631 = _mm512_fmadd_ps(tmp621, _mm512_set1_ps(-4.25e+00f), tmp631);
tmp635 = _mm512_fmadd_ps(tmp626, _mm512_set1_ps(-4.25e+00f), tmp635);
tmp623 = _mm512_fmadd_ps(tmp630, _mm512_set1_ps(5.25e+00f), tmp623);
in92 = _mm512_fmadd_ps(tmp634, _mm512_set1_ps(5.25e+00f), in92);
tmp630 = _mm512_fmadd_ps(tmp624, _mm512_set1_ps(2.5e-01f), in88);
tmp634 = _mm512_fmadd_ps(tmp628, _mm512_set1_ps(2.5e-01f), in96);
tmp624 = _mm512_fmadd_ps(tmp624, _mm512_set1_ps(4e+00f), in88);
tmp628 = _mm512_fmadd_ps(tmp628, _mm512_set1_ps(4e+00f), in96);
__m512 tmp632 = _mm512_sub_ps(tmp631, tmp629);
__m512 tmp636 = _mm512_sub_ps(tmp635, tmp633);
tmp631 = _mm512_add_ps(tmp629, tmp631);
tmp635 = _mm512_add_ps(tmp633, tmp635);
tmp629 = _mm512_fmadd_ps(tmp622, _mm512_set1_ps(2.5e-01f), in86);
tmp633 = _mm512_fmadd_ps(tmp627, _mm512_set1_ps(2.5e-01f), in94);
tmp630 = _mm512_fmadd_ps(tmp621, _mm512_set1_ps(-1.25e+00f), tmp630);
tmp634 = _mm512_fmadd_ps(tmp626, _mm512_set1_ps(-1.25e+00f), tmp634);
tmp621 = _mm512_fmadd_ps(tmp621, _mm512_set1_ps(-5e+00f), tmp624);
tmp626 = _mm512_fmadd_ps(tmp626, _mm512_set1_ps(-5e+00f), tmp628);
tmp629 = _mm512_fmadd_ps(in90, _mm512_set1_ps(-1.25e+00f), tmp629);
tmp633 = _mm512_fmadd_ps(in98, _mm512_set1_ps(-1.25e+00f), tmp633);
in88 = _mm512_fmadd_ps(tmp629, _mm512_set1_ps(2e+00f), tmp630);
in96 = _mm512_fmadd_ps(tmp633, _mm512_set1_ps(2e+00f), tmp634);
tmp630 = _mm512_fnmadd_ps(tmp629, _mm512_set1_ps(2e+00f), tmp630);
tmp634 = _mm512_fnmadd_ps(tmp633, _mm512_set1_ps(2e+00f), tmp634);
tmp629 = _mm512_fmadd_ps(in86, _mm512_set1_ps(2.5e-01f), tmp622);
tmp633 = _mm512_fmadd_ps(in94, _mm512_set1_ps(2.5e-01f), tmp627);
tmp622 = _mm512_sub_ps(in87, tmp622);
tmp627 = _mm512_sub_ps(in95, tmp627);
tmp629 = _mm512_fmadd_ps(in90, _mm512_set1_ps(-1.25e+00f), tmp629);
tmp633 = _mm512_fmadd_ps(in98, _mm512_set1_ps(-1.25e+00f), tmp633);
in90 = _mm512_sub_ps(in90, in86);
in98 = _mm512_sub_ps(in98, in94);
in90 = _mm512_fmadd_ps(in90, _mm512_set1_ps(5.25e+00f), tmp622);
in98 = _mm512_fmadd_ps(in98, _mm512_set1_ps(5.25e+00f), tmp627);
tmp624 = _mm512_fmadd_ps(tmp629, _mm512_set1_ps(2e+00f), tmp621);
tmp628 = _mm512_fmadd_ps(tmp633, _mm512_set1_ps(2e+00f), tmp626);
tmp621 = _mm512_fnmadd_ps(tmp629, _mm512_set1_ps(2e+00f), tmp621);
tmp626 = _mm512_fnmadd_ps(tmp633, _mm512_set1_ps(2e+00f), tmp626);
__m512 out103 = _mm512_shuffle_f32x4(tmp623, tmp631, 68);
__m512 out111 = _mm512_shuffle_f32x4(tmp623, tmp631, 238);
__m512 out104 = _mm512_shuffle_f32x4(tmp632, in88, 68);
__m512 out112 = _mm512_shuffle_f32x4(tmp632, in88, 238);
__m512 out105 = _mm512_shuffle_f32x4(tmp630, tmp624, 68);
__m512 out113 = _mm512_shuffle_f32x4(tmp630, tmp624, 238);
__m512 out106 = _mm512_shuffle_f32x4(tmp621, in90, 68);
__m512 out114 = _mm512_shuffle_f32x4(tmp621, in90, 238);
__m512 out107 = _mm512_shuffle_f32x4(in92, tmp635, 68);
__m512 out115 = _mm512_shuffle_f32x4(in92, tmp635, 238);
__m512 out108 = _mm512_shuffle_f32x4(tmp636, in96, 68);
__m512 out116 = _mm512_shuffle_f32x4(tmp636, in96, 238);
__m512 out109 = _mm512_shuffle_f32x4(tmp634, tmp628, 68);
__m512 out117 = _mm512_shuffle_f32x4(tmp634, tmp628, 238);
__m512 out110 = _mm512_shuffle_f32x4(tmp626, in98, 68);
__m512 out118 = _mm512_shuffle_f32x4(tmp626, in98, 238);
_mm512_storeu_ps(dfPtr4+512+1638400*i16+24576*j11+24576*s13+768*k53, out103);
_mm512_storeu_ps(dfPtr4+640+1638400*i16+24576*j11+24576*s13+768*k53, out111);
_mm512_storeu_ps(dfPtr4+576+1638400*i16+24576*j11+24576*s13+768*k53, out107);
_mm512_storeu_ps(dfPtr4+704+1638400*i16+24576*j11+24576*s13+768*k53, out115);
_mm512_storeu_ps(dfPtr4+410112+1638400*i16+24576*j11+24576*s13+768*k53, out104);
_mm512_storeu_ps(dfPtr4+410240+1638400*i16+24576*j11+24576*s13+768*k53, out112);
_mm512_storeu_ps(dfPtr4+410176+1638400*i16+24576*j11+24576*s13+768*k53, out108);
_mm512_storeu_ps(dfPtr4+410304+1638400*i16+24576*j11+24576*s13+768*k53, out116);
_mm512_storeu_ps(dfPtr4+819712+1638400*i16+24576*j11+24576*s13+768*k53, out105);
_mm512_storeu_ps(dfPtr4+819840+1638400*i16+24576*j11+24576*s13+768*k53, out113);
_mm512_storeu_ps(dfPtr4+819776+1638400*i16+24576*j11+24576*s13+768*k53, out109);
_mm512_storeu_ps(dfPtr4+819904+1638400*i16+24576*j11+24576*s13+768*k53, out117);
_mm512_storeu_ps(dfPtr4+1229312+1638400*i16+24576*j11+24576*s13+768*k53, out106);
_mm512_storeu_ps(dfPtr4+1229440+1638400*i16+24576*j11+24576*s13+768*k53, out114);
_mm512_storeu_ps(dfPtr4+1229376+1638400*i16+24576*j11+24576*s13+768*k53, out110);
_mm512_storeu_ps(dfPtr4+1229504+1638400*i16+24576*j11+24576*s13+768*k53, out118);
}
if (j11 >= last3) return;
++j11;
j11 = 2;
}
if (j11 < 15) {
ptrdiff_t rel8 = (size_t)(j11-2)%5;
ptrdiff_t base8 = 6+(size_t)(j11-2)/5*18;
for (; ; rel8 = 0, base8 += 18) {
if (rel8 < 2) {
if (rel8 < 1) {
ptrdiff_t h22 = base8+0;
ptrdiff_t w25 = 12;
ptrdiff_t k54 = 0;
for (; k54 != 32; ++k54) {
__m512 dat1007 = _mm512_maskz_loadu_ps(16383, datPtr5+0+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1007 = _mm512_max_ps(_mm512_setzero_ps(), dat1007);
__m512 dat1008 = _mm512_maskz_loadu_ps(16383, datPtr5+48+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1008 = _mm512_max_ps(_mm512_setzero_ps(), dat1008);
__m512i pm76 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in100 = _mm512_permutexvar_ps(pm76, dat1007);
__m512 in108 = _mm512_permutexvar_ps(pm76, dat1008);
__m512 dat1009 = _mm512_maskz_loadu_ps(16383, datPtr5+224+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1009 = _mm512_max_ps(_mm512_setzero_ps(), dat1009);
__m512 dat1010 = _mm512_maskz_loadu_ps(16383, datPtr5+272+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1010 = _mm512_max_ps(_mm512_setzero_ps(), dat1010);
__m512 in101 = _mm512_permutexvar_ps(pm76, dat1009);
__m512 in109 = _mm512_permutexvar_ps(pm76, dat1010);
__m512 dat1011 = _mm512_maskz_loadu_ps(16383, datPtr5+448+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1011 = _mm512_max_ps(_mm512_setzero_ps(), dat1011);
__m512 dat1012 = _mm512_maskz_loadu_ps(16383, datPtr5+496+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1012 = _mm512_max_ps(_mm512_setzero_ps(), dat1012);
__m512 in102 = _mm512_permutexvar_ps(pm76, dat1011);
__m512 in110 = _mm512_permutexvar_ps(pm76, dat1012);
__m512 dat1013 = _mm512_maskz_loadu_ps(16383, datPtr5+672+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1013 = _mm512_max_ps(_mm512_setzero_ps(), dat1013);
__m512 dat1014 = _mm512_maskz_loadu_ps(16383, datPtr5+720+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1014 = _mm512_max_ps(_mm512_setzero_ps(), dat1014);
__m512 in103 = _mm512_permutexvar_ps(pm76, dat1013);
__m512 in111 = _mm512_permutexvar_ps(pm76, dat1014);
__m512 dat1015 = _mm512_maskz_loadu_ps(16383, datPtr5+896+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1015 = _mm512_max_ps(_mm512_setzero_ps(), dat1015);
__m512 dat1016 = _mm512_maskz_loadu_ps(16383, datPtr5+944+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1016 = _mm512_max_ps(_mm512_setzero_ps(), dat1016);
__m512 in104 = _mm512_permutexvar_ps(pm76, dat1015);
__m512 in112 = _mm512_permutexvar_ps(pm76, dat1016);
__m512 dat1017 = _mm512_maskz_loadu_ps(16383, datPtr5+1120+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1017 = _mm512_max_ps(_mm512_setzero_ps(), dat1017);
__m512 dat1018 = _mm512_maskz_loadu_ps(16383, datPtr5+1168+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1018 = _mm512_max_ps(_mm512_setzero_ps(), dat1018);
__m512 in105 = _mm512_permutexvar_ps(pm76, dat1017);
__m512 in113 = _mm512_permutexvar_ps(pm76, dat1018);
__m512 dat1019 = _mm512_maskz_loadu_ps(16383, datPtr5+1344+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1019 = _mm512_max_ps(_mm512_setzero_ps(), dat1019);
__m512 dat1020 = _mm512_maskz_loadu_ps(16383, datPtr5+1392+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1020 = _mm512_max_ps(_mm512_setzero_ps(), dat1020);
__m512 in106 = _mm512_permutexvar_ps(pm76, dat1019);
__m512 in114 = _mm512_permutexvar_ps(pm76, dat1020);
__m512 dat1021 = _mm512_maskz_loadu_ps(16383, datPtr5+1568+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1021 = _mm512_max_ps(_mm512_setzero_ps(), dat1021);
__m512 dat1022 = _mm512_maskz_loadu_ps(16383, datPtr5+1616+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1022 = _mm512_max_ps(_mm512_setzero_ps(), dat1022);
__m512 in107 = _mm512_permutexvar_ps(pm76, dat1021);
__m512 in115 = _mm512_permutexvar_ps(pm76, dat1022);
__m512 tmp685 = _mm512_add_ps(in101, in105);
__m512 tmp689 = _mm512_add_ps(in109, in113);
__m512 tmp686 = _mm512_sub_ps(in104, in102);
__m512 tmp690 = _mm512_sub_ps(in112, in110);
__m512 tmp687 = _mm512_add_ps(in102, in106);
__m512 tmp691 = _mm512_add_ps(in110, in114);
in100 = _mm512_sub_ps(in100, in106);
in108 = _mm512_sub_ps(in108, in114);
tmp685 = _mm512_fmadd_ps(in103, _mm512_set1_ps(-4.25e+00f), tmp685);
tmp689 = _mm512_fmadd_ps(in111, _mm512_set1_ps(-4.25e+00f), tmp689);
tmp687 = _mm512_fmadd_ps(in104, _mm512_set1_ps(-4.25e+00f), tmp687);
tmp691 = _mm512_fmadd_ps(in112, _mm512_set1_ps(-4.25e+00f), tmp691);
in100 = _mm512_fmadd_ps(tmp686, _mm512_set1_ps(5.25e+00f), in100);
in108 = _mm512_fmadd_ps(tmp690, _mm512_set1_ps(5.25e+00f), in108);
tmp686 = _mm512_fmadd_ps(in102, _mm512_set1_ps(2.5e-01f), in106);
tmp690 = _mm512_fmadd_ps(in110, _mm512_set1_ps(2.5e-01f), in114);
in102 = _mm512_fmadd_ps(in102, _mm512_set1_ps(4e+00f), in106);
in110 = _mm512_fmadd_ps(in110, _mm512_set1_ps(4e+00f), in114);
__m512 tmp688 = _mm512_sub_ps(tmp687, tmp685);
__m512 tmp692 = _mm512_sub_ps(tmp691, tmp689);
tmp687 = _mm512_add_ps(tmp685, tmp687);
tmp691 = _mm512_add_ps(tmp689, tmp691);
tmp685 = _mm512_fmadd_ps(in101, _mm512_set1_ps(2.5e-01f), in105);
tmp689 = _mm512_fmadd_ps(in109, _mm512_set1_ps(2.5e-01f), in113);
tmp686 = _mm512_fmadd_ps(in104, _mm512_set1_ps(-1.25e+00f), tmp686);
tmp690 = _mm512_fmadd_ps(in112, _mm512_set1_ps(-1.25e+00f), tmp690);
in104 = _mm512_fmadd_ps(in104, _mm512_set1_ps(-5e+00f), in102);
in112 = _mm512_fmadd_ps(in112, _mm512_set1_ps(-5e+00f), in110);
tmp685 = _mm512_fmadd_ps(in103, _mm512_set1_ps(-1.25e+00f), tmp685);
tmp689 = _mm512_fmadd_ps(in111, _mm512_set1_ps(-1.25e+00f), tmp689);
in106 = _mm512_fmadd_ps(tmp685, _mm512_set1_ps(2e+00f), tmp686);
in114 = _mm512_fmadd_ps(tmp689, _mm512_set1_ps(2e+00f), tmp690);
tmp686 = _mm512_fnmadd_ps(tmp685, _mm512_set1_ps(2e+00f), tmp686);
tmp690 = _mm512_fnmadd_ps(tmp689, _mm512_set1_ps(2e+00f), tmp690);
tmp685 = _mm512_fmadd_ps(in105, _mm512_set1_ps(2.5e-01f), in101);
tmp689 = _mm512_fmadd_ps(in113, _mm512_set1_ps(2.5e-01f), in109);
in101 = _mm512_sub_ps(in107, in101);
in109 = _mm512_sub_ps(in115, in109);
tmp685 = _mm512_fmadd_ps(in103, _mm512_set1_ps(-1.25e+00f), tmp685);
tmp689 = _mm512_fmadd_ps(in111, _mm512_set1_ps(-1.25e+00f), tmp689);
in103 = _mm512_sub_ps(in103, in105);
in111 = _mm512_sub_ps(in111, in113);
in103 = _mm512_fmadd_ps(in103, _mm512_set1_ps(5.25e+00f), in101);
in111 = _mm512_fmadd_ps(in111, _mm512_set1_ps(5.25e+00f), in109);
in102 = _mm512_fmadd_ps(tmp685, _mm512_set1_ps(2e+00f), in104);
in110 = _mm512_fmadd_ps(tmp689, _mm512_set1_ps(2e+00f), in112);
in104 = _mm512_fnmadd_ps(tmp685, _mm512_set1_ps(2e+00f), in104);
in112 = _mm512_fnmadd_ps(tmp689, _mm512_set1_ps(2e+00f), in112);
__m512 tmp701 = _mm512_unpacklo_ps(in100, tmp687);
__m512 tmp702 = _mm512_unpackhi_ps(in100, tmp687);
__m512 tmp703 = _mm512_unpacklo_ps(tmp688, in106);
__m512 tmp704 = _mm512_unpackhi_ps(tmp688, in106);
__m512 tmp705 = _mm512_unpacklo_ps(tmp686, in102);
__m512 tmp706 = _mm512_unpackhi_ps(tmp686, in102);
__m512 tmp707 = _mm512_unpacklo_ps(in104, in103);
__m512 tmp708 = _mm512_unpackhi_ps(in104, in103);
__m512 tmp709 = _mm512_unpacklo_ps(in108, tmp691);
__m512 tmp710 = _mm512_unpackhi_ps(in108, tmp691);
__m512 tmp711 = _mm512_unpacklo_ps(tmp692, in114);
__m512 tmp712 = _mm512_unpackhi_ps(tmp692, in114);
__m512 tmp713 = _mm512_unpacklo_ps(tmp690, in110);
__m512 tmp714 = _mm512_unpackhi_ps(tmp690, in110);
__m512 tmp715 = _mm512_unpacklo_ps(in112, in111);
__m512 tmp716 = _mm512_unpackhi_ps(in112, in111);
__m512 tmp717 = _mm512_shuffle_ps(tmp701, tmp703, 68);
__m512 tmp718 = _mm512_shuffle_ps(tmp701, tmp703, 238);
__m512 tmp719 = _mm512_shuffle_ps(tmp702, tmp704, 68);
__m512 tmp720 = _mm512_shuffle_ps(tmp702, tmp704, 238);
__m512 tmp721 = _mm512_shuffle_ps(tmp705, tmp707, 68);
__m512 tmp722 = _mm512_shuffle_ps(tmp705, tmp707, 238);
__m512 tmp723 = _mm512_shuffle_ps(tmp706, tmp708, 68);
__m512 tmp724 = _mm512_shuffle_ps(tmp706, tmp708, 238);
__m512 tmp725 = _mm512_shuffle_ps(tmp709, tmp711, 68);
__m512 tmp726 = _mm512_shuffle_ps(tmp709, tmp711, 238);
__m512 tmp727 = _mm512_shuffle_ps(tmp710, tmp712, 68);
__m512 tmp728 = _mm512_shuffle_ps(tmp710, tmp712, 238);
__m512 tmp729 = _mm512_shuffle_ps(tmp713, tmp715, 68);
__m512 tmp730 = _mm512_shuffle_ps(tmp713, tmp715, 238);
__m512 tmp731 = _mm512_shuffle_ps(tmp714, tmp716, 68);
__m512 tmp732 = _mm512_shuffle_ps(tmp714, tmp716, 238);
__m512 tmp733 = _mm512_shuffle_f32x4(tmp717, tmp721, 136);
__m512 tmp734 = _mm512_shuffle_f32x4(tmp717, tmp721, 221);
__m512 tmp735 = _mm512_shuffle_f32x4(tmp718, tmp722, 136);
__m512 tmp736 = _mm512_shuffle_f32x4(tmp718, tmp722, 221);
__m512 tmp737 = _mm512_shuffle_f32x4(tmp719, tmp723, 136);
__m512 tmp738 = _mm512_shuffle_f32x4(tmp719, tmp723, 221);
__m512 tmp739 = _mm512_shuffle_f32x4(tmp720, tmp724, 136);
__m512 tmp740 = _mm512_shuffle_f32x4(tmp720, tmp724, 221);
__m512 tmp741 = _mm512_shuffle_f32x4(tmp725, tmp729, 136);
__m512 tmp742 = _mm512_shuffle_f32x4(tmp725, tmp729, 221);
__m512 tmp743 = _mm512_shuffle_f32x4(tmp726, tmp730, 136);
__m512 tmp744 = _mm512_shuffle_f32x4(tmp726, tmp730, 221);
__m512 tmp745 = _mm512_shuffle_f32x4(tmp727, tmp731, 136);
__m512 tmp746 = _mm512_shuffle_f32x4(tmp727, tmp731, 221);
__m512 tmp747 = _mm512_shuffle_f32x4(tmp728, tmp732, 136);
__m512 tmp748 = _mm512_shuffle_f32x4(tmp728, tmp732, 221);
in100 = _mm512_shuffle_f32x4(tmp733, tmp741, 136);
in108 = _mm512_shuffle_f32x4(tmp733, tmp741, 221);
tmp687 = _mm512_shuffle_f32x4(tmp735, tmp743, 136);
tmp691 = _mm512_shuffle_f32x4(tmp735, tmp743, 221);
tmp688 = _mm512_shuffle_f32x4(tmp737, tmp745, 136);
tmp692 = _mm512_shuffle_f32x4(tmp737, tmp745, 221);
in106 = _mm512_shuffle_f32x4(tmp739, tmp747, 136);
in114 = _mm512_shuffle_f32x4(tmp739, tmp747, 221);
tmp686 = _mm512_shuffle_f32x4(tmp734, tmp742, 136);
tmp690 = _mm512_shuffle_f32x4(tmp734, tmp742, 221);
in102 = _mm512_shuffle_f32x4(tmp736, tmp744, 136);
in110 = _mm512_shuffle_f32x4(tmp736, tmp744, 221);
in104 = _mm512_shuffle_f32x4(tmp738, tmp746, 136);
in112 = _mm512_shuffle_f32x4(tmp738, tmp746, 221);
in103 = _mm512_shuffle_f32x4(tmp740, tmp748, 136);
in111 = _mm512_shuffle_f32x4(tmp740, tmp748, 221);
__m512 tmp693 = _mm512_add_ps(tmp687, in102);
__m512 tmp697 = _mm512_add_ps(tmp691, in110);
__m512 tmp694 = _mm512_sub_ps(tmp686, tmp688);
__m512 tmp698 = _mm512_sub_ps(tmp690, tmp692);
__m512 tmp695 = _mm512_add_ps(tmp688, in104);
__m512 tmp699 = _mm512_add_ps(tmp692, in112);
in100 = _mm512_sub_ps(in100, in104);
in108 = _mm512_sub_ps(in108, in112);
tmp693 = _mm512_fmadd_ps(in106, _mm512_set1_ps(-4.25e+00f), tmp693);
tmp697 = _mm512_fmadd_ps(in114, _mm512_set1_ps(-4.25e+00f), tmp697);
tmp695 = _mm512_fmadd_ps(tmp686, _mm512_set1_ps(-4.25e+00f), tmp695);
tmp699 = _mm512_fmadd_ps(tmp690, _mm512_set1_ps(-4.25e+00f), tmp699);
in100 = _mm512_fmadd_ps(tmp694, _mm512_set1_ps(5.25e+00f), in100);
in108 = _mm512_fmadd_ps(tmp698, _mm512_set1_ps(5.25e+00f), in108);
tmp694 = _mm512_fmadd_ps(tmp688, _mm512_set1_ps(2.5e-01f), in104);
tmp698 = _mm512_fmadd_ps(tmp692, _mm512_set1_ps(2.5e-01f), in112);
tmp688 = _mm512_fmadd_ps(tmp688, _mm512_set1_ps(4e+00f), in104);
tmp692 = _mm512_fmadd_ps(tmp692, _mm512_set1_ps(4e+00f), in112);
__m512 tmp696 = _mm512_sub_ps(tmp695, tmp693);
__m512 tmp700 = _mm512_sub_ps(tmp699, tmp697);
tmp695 = _mm512_add_ps(tmp693, tmp695);
tmp699 = _mm512_add_ps(tmp697, tmp699);
tmp693 = _mm512_fmadd_ps(tmp687, _mm512_set1_ps(2.5e-01f), in102);
tmp697 = _mm512_fmadd_ps(tmp691, _mm512_set1_ps(2.5e-01f), in110);
tmp694 = _mm512_fmadd_ps(tmp686, _mm512_set1_ps(-1.25e+00f), tmp694);
tmp698 = _mm512_fmadd_ps(tmp690, _mm512_set1_ps(-1.25e+00f), tmp698);
tmp686 = _mm512_fmadd_ps(tmp686, _mm512_set1_ps(-5e+00f), tmp688);
tmp690 = _mm512_fmadd_ps(tmp690, _mm512_set1_ps(-5e+00f), tmp692);
tmp693 = _mm512_fmadd_ps(in106, _mm512_set1_ps(-1.25e+00f), tmp693);
tmp697 = _mm512_fmadd_ps(in114, _mm512_set1_ps(-1.25e+00f), tmp697);
in104 = _mm512_fmadd_ps(tmp693, _mm512_set1_ps(2e+00f), tmp694);
in112 = _mm512_fmadd_ps(tmp697, _mm512_set1_ps(2e+00f), tmp698);
tmp694 = _mm512_fnmadd_ps(tmp693, _mm512_set1_ps(2e+00f), tmp694);
tmp698 = _mm512_fnmadd_ps(tmp697, _mm512_set1_ps(2e+00f), tmp698);
tmp693 = _mm512_fmadd_ps(in102, _mm512_set1_ps(2.5e-01f), tmp687);
tmp697 = _mm512_fmadd_ps(in110, _mm512_set1_ps(2.5e-01f), tmp691);
tmp687 = _mm512_sub_ps(in103, tmp687);
tmp691 = _mm512_sub_ps(in111, tmp691);
tmp693 = _mm512_fmadd_ps(in106, _mm512_set1_ps(-1.25e+00f), tmp693);
tmp697 = _mm512_fmadd_ps(in114, _mm512_set1_ps(-1.25e+00f), tmp697);
in106 = _mm512_sub_ps(in106, in102);
in114 = _mm512_sub_ps(in114, in110);
in106 = _mm512_fmadd_ps(in106, _mm512_set1_ps(5.25e+00f), tmp687);
in114 = _mm512_fmadd_ps(in114, _mm512_set1_ps(5.25e+00f), tmp691);
tmp688 = _mm512_fmadd_ps(tmp693, _mm512_set1_ps(2e+00f), tmp686);
tmp692 = _mm512_fmadd_ps(tmp697, _mm512_set1_ps(2e+00f), tmp690);
tmp686 = _mm512_fnmadd_ps(tmp693, _mm512_set1_ps(2e+00f), tmp686);
tmp690 = _mm512_fnmadd_ps(tmp697, _mm512_set1_ps(2e+00f), tmp690);
__m512 out119 = _mm512_shuffle_f32x4(in100, tmp695, 68);
__m512 out127 = _mm512_shuffle_f32x4(in100, tmp695, 238);
__m512 out120 = _mm512_shuffle_f32x4(tmp696, in104, 68);
__m512 out128 = _mm512_shuffle_f32x4(tmp696, in104, 238);
__m512 out121 = _mm512_shuffle_f32x4(tmp694, tmp688, 68);
__m512 out129 = _mm512_shuffle_f32x4(tmp694, tmp688, 238);
__m512 out122 = _mm512_shuffle_f32x4(tmp686, in106, 68);
__m512 out130 = _mm512_shuffle_f32x4(tmp686, in106, 238);
__m512 out123 = _mm512_shuffle_f32x4(in108, tmp699, 68);
__m512 out131 = _mm512_shuffle_f32x4(in108, tmp699, 238);
__m512 out124 = _mm512_shuffle_f32x4(tmp700, in112, 68);
__m512 out132 = _mm512_shuffle_f32x4(tmp700, in112, 238);
__m512 out125 = _mm512_shuffle_f32x4(tmp698, tmp692, 68);
__m512 out133 = _mm512_shuffle_f32x4(tmp698, tmp692, 238);
__m512 out126 = _mm512_shuffle_f32x4(tmp690, in114, 68);
__m512 out134 = _mm512_shuffle_f32x4(tmp690, in114, 238);
_mm512_storeu_ps(dfPtr4+0+1638400*i16+24576*j11+24576*s13+768*k54, out119);
_mm512_storeu_ps(dfPtr4+128+1638400*i16+24576*j11+24576*s13+768*k54, out127);
_mm512_storeu_ps(dfPtr4+64+1638400*i16+24576*j11+24576*s13+768*k54, out123);
_mm512_storeu_ps(dfPtr4+192+1638400*i16+24576*j11+24576*s13+768*k54, out131);
_mm512_storeu_ps(dfPtr4+409600+1638400*i16+24576*j11+24576*s13+768*k54, out120);
_mm512_storeu_ps(dfPtr4+409728+1638400*i16+24576*j11+24576*s13+768*k54, out128);
_mm512_storeu_ps(dfPtr4+409664+1638400*i16+24576*j11+24576*s13+768*k54, out124);
_mm512_storeu_ps(dfPtr4+409792+1638400*i16+24576*j11+24576*s13+768*k54, out132);
_mm512_storeu_ps(dfPtr4+819200+1638400*i16+24576*j11+24576*s13+768*k54, out121);
_mm512_storeu_ps(dfPtr4+819328+1638400*i16+24576*j11+24576*s13+768*k54, out129);
_mm512_storeu_ps(dfPtr4+819264+1638400*i16+24576*j11+24576*s13+768*k54, out125);
_mm512_storeu_ps(dfPtr4+819392+1638400*i16+24576*j11+24576*s13+768*k54, out133);
_mm512_storeu_ps(dfPtr4+1228800+1638400*i16+24576*j11+24576*s13+768*k54, out122);
_mm512_storeu_ps(dfPtr4+1228928+1638400*i16+24576*j11+24576*s13+768*k54, out130);
_mm512_storeu_ps(dfPtr4+1228864+1638400*i16+24576*j11+24576*s13+768*k54, out126);
_mm512_storeu_ps(dfPtr4+1228992+1638400*i16+24576*j11+24576*s13+768*k54, out134);
__m512 dat1023 = _mm512_maskz_loadu_ps(16383, datPtr5+96+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1023 = _mm512_max_ps(_mm512_setzero_ps(), dat1023);
__m512 dat1024 = _mm512_maskz_loadu_ps(16383, datPtr5+12608+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1024 = _mm512_max_ps(_mm512_setzero_ps(), dat1024);
__m512i pm77 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in116 = _mm512_permutexvar_ps(pm77, dat1023);
__m512 in124 = _mm512_permutexvar_ps(pm77, dat1024);
__m512 dat1025 = _mm512_maskz_loadu_ps(16383, datPtr5+320+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1025 = _mm512_max_ps(_mm512_setzero_ps(), dat1025);
__m512 dat1026 = _mm512_maskz_loadu_ps(16383, datPtr5+12832+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1026 = _mm512_max_ps(_mm512_setzero_ps(), dat1026);
__m512 in117 = _mm512_permutexvar_ps(pm77, dat1025);
__m512 in125 = _mm512_permutexvar_ps(pm77, dat1026);
__m512 dat1027 = _mm512_maskz_loadu_ps(16383, datPtr5+544+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1027 = _mm512_max_ps(_mm512_setzero_ps(), dat1027);
__m512 dat1028 = _mm512_maskz_loadu_ps(16383, datPtr5+13056+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1028 = _mm512_max_ps(_mm512_setzero_ps(), dat1028);
__m512 in118 = _mm512_permutexvar_ps(pm77, dat1027);
__m512 in126 = _mm512_permutexvar_ps(pm77, dat1028);
__m512 dat1029 = _mm512_maskz_loadu_ps(16383, datPtr5+768+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1029 = _mm512_max_ps(_mm512_setzero_ps(), dat1029);
__m512 dat1030 = _mm512_maskz_loadu_ps(16383, datPtr5+13280+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1030 = _mm512_max_ps(_mm512_setzero_ps(), dat1030);
__m512 in119 = _mm512_permutexvar_ps(pm77, dat1029);
__m512 in127 = _mm512_permutexvar_ps(pm77, dat1030);
__m512 dat1031 = _mm512_maskz_loadu_ps(16383, datPtr5+992+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1031 = _mm512_max_ps(_mm512_setzero_ps(), dat1031);
__m512 dat1032 = _mm512_maskz_loadu_ps(16383, datPtr5+13504+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1032 = _mm512_max_ps(_mm512_setzero_ps(), dat1032);
__m512 in120 = _mm512_permutexvar_ps(pm77, dat1031);
__m512 in128 = _mm512_permutexvar_ps(pm77, dat1032);
__m512 dat1033 = _mm512_maskz_loadu_ps(16383, datPtr5+1216+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1033 = _mm512_max_ps(_mm512_setzero_ps(), dat1033);
__m512 dat1034 = _mm512_maskz_loadu_ps(16383, datPtr5+13728+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1034 = _mm512_max_ps(_mm512_setzero_ps(), dat1034);
__m512 in121 = _mm512_permutexvar_ps(pm77, dat1033);
__m512 in129 = _mm512_permutexvar_ps(pm77, dat1034);
__m512 dat1035 = _mm512_maskz_loadu_ps(16383, datPtr5+1440+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1035 = _mm512_max_ps(_mm512_setzero_ps(), dat1035);
__m512 dat1036 = _mm512_maskz_loadu_ps(16383, datPtr5+13952+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1036 = _mm512_max_ps(_mm512_setzero_ps(), dat1036);
__m512 in122 = _mm512_permutexvar_ps(pm77, dat1035);
__m512 in130 = _mm512_permutexvar_ps(pm77, dat1036);
__m512 dat1037 = _mm512_maskz_loadu_ps(16383, datPtr5+1664+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1037 = _mm512_max_ps(_mm512_setzero_ps(), dat1037);
__m512 dat1038 = _mm512_maskz_loadu_ps(16383, datPtr5+14176+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1038 = _mm512_max_ps(_mm512_setzero_ps(), dat1038);
__m512 in123 = _mm512_permutexvar_ps(pm77, dat1037);
__m512 in131 = _mm512_permutexvar_ps(pm77, dat1038);
__m512 tmp749 = _mm512_add_ps(in117, in121);
__m512 tmp753 = _mm512_add_ps(in125, in129);
__m512 tmp750 = _mm512_sub_ps(in120, in118);
__m512 tmp754 = _mm512_sub_ps(in128, in126);
__m512 tmp751 = _mm512_add_ps(in118, in122);
__m512 tmp755 = _mm512_add_ps(in126, in130);
in116 = _mm512_sub_ps(in116, in122);
in124 = _mm512_sub_ps(in124, in130);
tmp749 = _mm512_fmadd_ps(in119, _mm512_set1_ps(-4.25e+00f), tmp749);
tmp753 = _mm512_fmadd_ps(in127, _mm512_set1_ps(-4.25e+00f), tmp753);
tmp751 = _mm512_fmadd_ps(in120, _mm512_set1_ps(-4.25e+00f), tmp751);
tmp755 = _mm512_fmadd_ps(in128, _mm512_set1_ps(-4.25e+00f), tmp755);
in116 = _mm512_fmadd_ps(tmp750, _mm512_set1_ps(5.25e+00f), in116);
in124 = _mm512_fmadd_ps(tmp754, _mm512_set1_ps(5.25e+00f), in124);
tmp750 = _mm512_fmadd_ps(in118, _mm512_set1_ps(2.5e-01f), in122);
tmp754 = _mm512_fmadd_ps(in126, _mm512_set1_ps(2.5e-01f), in130);
in118 = _mm512_fmadd_ps(in118, _mm512_set1_ps(4e+00f), in122);
in126 = _mm512_fmadd_ps(in126, _mm512_set1_ps(4e+00f), in130);
__m512 tmp752 = _mm512_sub_ps(tmp751, tmp749);
__m512 tmp756 = _mm512_sub_ps(tmp755, tmp753);
tmp751 = _mm512_add_ps(tmp749, tmp751);
tmp755 = _mm512_add_ps(tmp753, tmp755);
tmp749 = _mm512_fmadd_ps(in117, _mm512_set1_ps(2.5e-01f), in121);
tmp753 = _mm512_fmadd_ps(in125, _mm512_set1_ps(2.5e-01f), in129);
tmp750 = _mm512_fmadd_ps(in120, _mm512_set1_ps(-1.25e+00f), tmp750);
tmp754 = _mm512_fmadd_ps(in128, _mm512_set1_ps(-1.25e+00f), tmp754);
in120 = _mm512_fmadd_ps(in120, _mm512_set1_ps(-5e+00f), in118);
in128 = _mm512_fmadd_ps(in128, _mm512_set1_ps(-5e+00f), in126);
tmp749 = _mm512_fmadd_ps(in119, _mm512_set1_ps(-1.25e+00f), tmp749);
tmp753 = _mm512_fmadd_ps(in127, _mm512_set1_ps(-1.25e+00f), tmp753);
in122 = _mm512_fmadd_ps(tmp749, _mm512_set1_ps(2e+00f), tmp750);
in130 = _mm512_fmadd_ps(tmp753, _mm512_set1_ps(2e+00f), tmp754);
tmp750 = _mm512_fnmadd_ps(tmp749, _mm512_set1_ps(2e+00f), tmp750);
tmp754 = _mm512_fnmadd_ps(tmp753, _mm512_set1_ps(2e+00f), tmp754);
tmp749 = _mm512_fmadd_ps(in121, _mm512_set1_ps(2.5e-01f), in117);
tmp753 = _mm512_fmadd_ps(in129, _mm512_set1_ps(2.5e-01f), in125);
in117 = _mm512_sub_ps(in123, in117);
in125 = _mm512_sub_ps(in131, in125);
tmp749 = _mm512_fmadd_ps(in119, _mm512_set1_ps(-1.25e+00f), tmp749);
tmp753 = _mm512_fmadd_ps(in127, _mm512_set1_ps(-1.25e+00f), tmp753);
in119 = _mm512_sub_ps(in119, in121);
in127 = _mm512_sub_ps(in127, in129);
in119 = _mm512_fmadd_ps(in119, _mm512_set1_ps(5.25e+00f), in117);
in127 = _mm512_fmadd_ps(in127, _mm512_set1_ps(5.25e+00f), in125);
in118 = _mm512_fmadd_ps(tmp749, _mm512_set1_ps(2e+00f), in120);
in126 = _mm512_fmadd_ps(tmp753, _mm512_set1_ps(2e+00f), in128);
in120 = _mm512_fnmadd_ps(tmp749, _mm512_set1_ps(2e+00f), in120);
in128 = _mm512_fnmadd_ps(tmp753, _mm512_set1_ps(2e+00f), in128);
__m512 tmp765 = _mm512_unpacklo_ps(in116, tmp751);
__m512 tmp766 = _mm512_unpackhi_ps(in116, tmp751);
__m512 tmp767 = _mm512_unpacklo_ps(tmp752, in122);
__m512 tmp768 = _mm512_unpackhi_ps(tmp752, in122);
__m512 tmp769 = _mm512_unpacklo_ps(tmp750, in118);
__m512 tmp770 = _mm512_unpackhi_ps(tmp750, in118);
__m512 tmp771 = _mm512_unpacklo_ps(in120, in119);
__m512 tmp772 = _mm512_unpackhi_ps(in120, in119);
__m512 tmp773 = _mm512_unpacklo_ps(in124, tmp755);
__m512 tmp774 = _mm512_unpackhi_ps(in124, tmp755);
__m512 tmp775 = _mm512_unpacklo_ps(tmp756, in130);
__m512 tmp776 = _mm512_unpackhi_ps(tmp756, in130);
__m512 tmp777 = _mm512_unpacklo_ps(tmp754, in126);
__m512 tmp778 = _mm512_unpackhi_ps(tmp754, in126);
__m512 tmp779 = _mm512_unpacklo_ps(in128, in127);
__m512 tmp780 = _mm512_unpackhi_ps(in128, in127);
__m512 tmp781 = _mm512_shuffle_ps(tmp765, tmp767, 68);
__m512 tmp782 = _mm512_shuffle_ps(tmp765, tmp767, 238);
__m512 tmp783 = _mm512_shuffle_ps(tmp766, tmp768, 68);
__m512 tmp784 = _mm512_shuffle_ps(tmp766, tmp768, 238);
__m512 tmp785 = _mm512_shuffle_ps(tmp769, tmp771, 68);
__m512 tmp786 = _mm512_shuffle_ps(tmp769, tmp771, 238);
__m512 tmp787 = _mm512_shuffle_ps(tmp770, tmp772, 68);
__m512 tmp788 = _mm512_shuffle_ps(tmp770, tmp772, 238);
__m512 tmp789 = _mm512_shuffle_ps(tmp773, tmp775, 68);
__m512 tmp790 = _mm512_shuffle_ps(tmp773, tmp775, 238);
__m512 tmp791 = _mm512_shuffle_ps(tmp774, tmp776, 68);
__m512 tmp792 = _mm512_shuffle_ps(tmp774, tmp776, 238);
__m512 tmp793 = _mm512_shuffle_ps(tmp777, tmp779, 68);
__m512 tmp794 = _mm512_shuffle_ps(tmp777, tmp779, 238);
__m512 tmp795 = _mm512_shuffle_ps(tmp778, tmp780, 68);
__m512 tmp796 = _mm512_shuffle_ps(tmp778, tmp780, 238);
__m512 tmp797 = _mm512_shuffle_f32x4(tmp781, tmp785, 136);
__m512 tmp798 = _mm512_shuffle_f32x4(tmp781, tmp785, 221);
__m512 tmp799 = _mm512_shuffle_f32x4(tmp782, tmp786, 136);
__m512 tmp800 = _mm512_shuffle_f32x4(tmp782, tmp786, 221);
__m512 tmp801 = _mm512_shuffle_f32x4(tmp783, tmp787, 136);
__m512 tmp802 = _mm512_shuffle_f32x4(tmp783, tmp787, 221);
__m512 tmp803 = _mm512_shuffle_f32x4(tmp784, tmp788, 136);
__m512 tmp804 = _mm512_shuffle_f32x4(tmp784, tmp788, 221);
__m512 tmp805 = _mm512_shuffle_f32x4(tmp789, tmp793, 136);
__m512 tmp806 = _mm512_shuffle_f32x4(tmp789, tmp793, 221);
__m512 tmp807 = _mm512_shuffle_f32x4(tmp790, tmp794, 136);
__m512 tmp808 = _mm512_shuffle_f32x4(tmp790, tmp794, 221);
__m512 tmp809 = _mm512_shuffle_f32x4(tmp791, tmp795, 136);
__m512 tmp810 = _mm512_shuffle_f32x4(tmp791, tmp795, 221);
__m512 tmp811 = _mm512_shuffle_f32x4(tmp792, tmp796, 136);
__m512 tmp812 = _mm512_shuffle_f32x4(tmp792, tmp796, 221);
in116 = _mm512_shuffle_f32x4(tmp797, tmp805, 136);
in124 = _mm512_shuffle_f32x4(tmp797, tmp805, 221);
tmp751 = _mm512_shuffle_f32x4(tmp799, tmp807, 136);
tmp755 = _mm512_shuffle_f32x4(tmp799, tmp807, 221);
tmp752 = _mm512_shuffle_f32x4(tmp801, tmp809, 136);
tmp756 = _mm512_shuffle_f32x4(tmp801, tmp809, 221);
in122 = _mm512_shuffle_f32x4(tmp803, tmp811, 136);
in130 = _mm512_shuffle_f32x4(tmp803, tmp811, 221);
tmp750 = _mm512_shuffle_f32x4(tmp798, tmp806, 136);
tmp754 = _mm512_shuffle_f32x4(tmp798, tmp806, 221);
in118 = _mm512_shuffle_f32x4(tmp800, tmp808, 136);
in126 = _mm512_shuffle_f32x4(tmp800, tmp808, 221);
in120 = _mm512_shuffle_f32x4(tmp802, tmp810, 136);
in128 = _mm512_shuffle_f32x4(tmp802, tmp810, 221);
in119 = _mm512_shuffle_f32x4(tmp804, tmp812, 136);
in127 = _mm512_shuffle_f32x4(tmp804, tmp812, 221);
__m512 tmp757 = _mm512_add_ps(tmp751, in118);
__m512 tmp761 = _mm512_add_ps(tmp755, in126);
__m512 tmp758 = _mm512_sub_ps(tmp750, tmp752);
__m512 tmp762 = _mm512_sub_ps(tmp754, tmp756);
__m512 tmp759 = _mm512_add_ps(tmp752, in120);
__m512 tmp763 = _mm512_add_ps(tmp756, in128);
in116 = _mm512_sub_ps(in116, in120);
in124 = _mm512_sub_ps(in124, in128);
tmp757 = _mm512_fmadd_ps(in122, _mm512_set1_ps(-4.25e+00f), tmp757);
tmp761 = _mm512_fmadd_ps(in130, _mm512_set1_ps(-4.25e+00f), tmp761);
tmp759 = _mm512_fmadd_ps(tmp750, _mm512_set1_ps(-4.25e+00f), tmp759);
tmp763 = _mm512_fmadd_ps(tmp754, _mm512_set1_ps(-4.25e+00f), tmp763);
in116 = _mm512_fmadd_ps(tmp758, _mm512_set1_ps(5.25e+00f), in116);
in124 = _mm512_fmadd_ps(tmp762, _mm512_set1_ps(5.25e+00f), in124);
tmp758 = _mm512_fmadd_ps(tmp752, _mm512_set1_ps(2.5e-01f), in120);
tmp762 = _mm512_fmadd_ps(tmp756, _mm512_set1_ps(2.5e-01f), in128);
tmp752 = _mm512_fmadd_ps(tmp752, _mm512_set1_ps(4e+00f), in120);
tmp756 = _mm512_fmadd_ps(tmp756, _mm512_set1_ps(4e+00f), in128);
__m512 tmp760 = _mm512_sub_ps(tmp759, tmp757);
__m512 tmp764 = _mm512_sub_ps(tmp763, tmp761);
tmp759 = _mm512_add_ps(tmp757, tmp759);
tmp763 = _mm512_add_ps(tmp761, tmp763);
tmp757 = _mm512_fmadd_ps(tmp751, _mm512_set1_ps(2.5e-01f), in118);
tmp761 = _mm512_fmadd_ps(tmp755, _mm512_set1_ps(2.5e-01f), in126);
tmp758 = _mm512_fmadd_ps(tmp750, _mm512_set1_ps(-1.25e+00f), tmp758);
tmp762 = _mm512_fmadd_ps(tmp754, _mm512_set1_ps(-1.25e+00f), tmp762);
tmp750 = _mm512_fmadd_ps(tmp750, _mm512_set1_ps(-5e+00f), tmp752);
tmp754 = _mm512_fmadd_ps(tmp754, _mm512_set1_ps(-5e+00f), tmp756);
tmp757 = _mm512_fmadd_ps(in122, _mm512_set1_ps(-1.25e+00f), tmp757);
tmp761 = _mm512_fmadd_ps(in130, _mm512_set1_ps(-1.25e+00f), tmp761);
in120 = _mm512_fmadd_ps(tmp757, _mm512_set1_ps(2e+00f), tmp758);
in128 = _mm512_fmadd_ps(tmp761, _mm512_set1_ps(2e+00f), tmp762);
tmp758 = _mm512_fnmadd_ps(tmp757, _mm512_set1_ps(2e+00f), tmp758);
tmp762 = _mm512_fnmadd_ps(tmp761, _mm512_set1_ps(2e+00f), tmp762);
tmp757 = _mm512_fmadd_ps(in118, _mm512_set1_ps(2.5e-01f), tmp751);
tmp761 = _mm512_fmadd_ps(in126, _mm512_set1_ps(2.5e-01f), tmp755);
tmp751 = _mm512_sub_ps(in119, tmp751);
tmp755 = _mm512_sub_ps(in127, tmp755);
tmp757 = _mm512_fmadd_ps(in122, _mm512_set1_ps(-1.25e+00f), tmp757);
tmp761 = _mm512_fmadd_ps(in130, _mm512_set1_ps(-1.25e+00f), tmp761);
in122 = _mm512_sub_ps(in122, in118);
in130 = _mm512_sub_ps(in130, in126);
in122 = _mm512_fmadd_ps(in122, _mm512_set1_ps(5.25e+00f), tmp751);
in130 = _mm512_fmadd_ps(in130, _mm512_set1_ps(5.25e+00f), tmp755);
tmp752 = _mm512_fmadd_ps(tmp757, _mm512_set1_ps(2e+00f), tmp750);
tmp756 = _mm512_fmadd_ps(tmp761, _mm512_set1_ps(2e+00f), tmp754);
tmp750 = _mm512_fnmadd_ps(tmp757, _mm512_set1_ps(2e+00f), tmp750);
tmp754 = _mm512_fnmadd_ps(tmp761, _mm512_set1_ps(2e+00f), tmp754);
__m512 out135 = _mm512_shuffle_f32x4(in116, tmp759, 68);
__m512 out143 = _mm512_shuffle_f32x4(in116, tmp759, 238);
__m512 out136 = _mm512_shuffle_f32x4(tmp760, in120, 68);
__m512 out144 = _mm512_shuffle_f32x4(tmp760, in120, 238);
__m512 out137 = _mm512_shuffle_f32x4(tmp758, tmp752, 68);
__m512 out145 = _mm512_shuffle_f32x4(tmp758, tmp752, 238);
__m512 out138 = _mm512_shuffle_f32x4(tmp750, in122, 68);
__m512 out146 = _mm512_shuffle_f32x4(tmp750, in122, 238);
__m512 out139 = _mm512_shuffle_f32x4(in124, tmp763, 68);
__m512 out147 = _mm512_shuffle_f32x4(in124, tmp763, 238);
__m512 out140 = _mm512_shuffle_f32x4(tmp764, in128, 68);
__m512 out148 = _mm512_shuffle_f32x4(tmp764, in128, 238);
__m512 out141 = _mm512_shuffle_f32x4(tmp762, tmp756, 68);
__m512 out149 = _mm512_shuffle_f32x4(tmp762, tmp756, 238);
__m512 out142 = _mm512_shuffle_f32x4(tmp754, in130, 68);
__m512 out150 = _mm512_shuffle_f32x4(tmp754, in130, 238);
_mm512_storeu_ps(dfPtr4+256+1638400*i16+24576*j11+24576*s13+768*k54, out135);
_mm512_storeu_ps(dfPtr4+384+1638400*i16+24576*j11+24576*s13+768*k54, out143);
_mm512_storeu_ps(dfPtr4+320+1638400*i16+24576*j11+24576*s13+768*k54, out139);
_mm512_storeu_ps(dfPtr4+448+1638400*i16+24576*j11+24576*s13+768*k54, out147);
_mm512_storeu_ps(dfPtr4+409856+1638400*i16+24576*j11+24576*s13+768*k54, out136);
_mm512_storeu_ps(dfPtr4+409984+1638400*i16+24576*j11+24576*s13+768*k54, out144);
_mm512_storeu_ps(dfPtr4+409920+1638400*i16+24576*j11+24576*s13+768*k54, out140);
_mm512_storeu_ps(dfPtr4+410048+1638400*i16+24576*j11+24576*s13+768*k54, out148);
_mm512_storeu_ps(dfPtr4+819456+1638400*i16+24576*j11+24576*s13+768*k54, out137);
_mm512_storeu_ps(dfPtr4+819584+1638400*i16+24576*j11+24576*s13+768*k54, out145);
_mm512_storeu_ps(dfPtr4+819520+1638400*i16+24576*j11+24576*s13+768*k54, out141);
_mm512_storeu_ps(dfPtr4+819648+1638400*i16+24576*j11+24576*s13+768*k54, out149);
_mm512_storeu_ps(dfPtr4+1229056+1638400*i16+24576*j11+24576*s13+768*k54, out138);
_mm512_storeu_ps(dfPtr4+1229184+1638400*i16+24576*j11+24576*s13+768*k54, out146);
_mm512_storeu_ps(dfPtr4+1229120+1638400*i16+24576*j11+24576*s13+768*k54, out142);
_mm512_storeu_ps(dfPtr4+1229248+1638400*i16+24576*j11+24576*s13+768*k54, out150);
// ---- Tile unit stored at dfPtr4+512.. : 16 masked loads, two transform passes, 16 stores ----
// Loads: every load reads lanes 0..13 (mask 16383 == 0x3FFF; lanes 14..15 are zeroed) and the
// result is then replaced by its element-wise max with 0.0f (a ReLU-style clamp).
// Eight rows are read, 224 apart in datPtr5's address units (12656, 12880, ..., 14224), with
// two loads per row that are 48 units apart. The first load of each row feeds in132..in139,
// the second feeds in140..in147.
// NOTE(review): the 4*w25 term suggests datPtr5 is a byte pointer (224 = 56 floats,
// 48 = 12 floats) -- confirm against its declaration, which is outside this part of the file.
__m512 dat1039 = _mm512_maskz_loadu_ps(16383, datPtr5+12656+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1039 = _mm512_max_ps(_mm512_setzero_ps(), dat1039);
__m512 dat1040 = _mm512_maskz_loadu_ps(16383, datPtr5+12704+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1040 = _mm512_max_ps(_mm512_setzero_ps(), dat1040);
// pm78 keeps source lanes 0..7 in the low half and puts source lanes 6..13 in the high half,
// i.e. each permuted vector holds two 8-element windows that overlap by two elements.
__m512i pm78 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in132 = _mm512_permutexvar_ps(pm78, dat1039);
__m512 in140 = _mm512_permutexvar_ps(pm78, dat1040);
__m512 dat1041 = _mm512_maskz_loadu_ps(16383, datPtr5+12880+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1041 = _mm512_max_ps(_mm512_setzero_ps(), dat1041);
__m512 dat1042 = _mm512_maskz_loadu_ps(16383, datPtr5+12928+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1042 = _mm512_max_ps(_mm512_setzero_ps(), dat1042);
__m512 in133 = _mm512_permutexvar_ps(pm78, dat1041);
__m512 in141 = _mm512_permutexvar_ps(pm78, dat1042);
__m512 dat1043 = _mm512_maskz_loadu_ps(16383, datPtr5+13104+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1043 = _mm512_max_ps(_mm512_setzero_ps(), dat1043);
__m512 dat1044 = _mm512_maskz_loadu_ps(16383, datPtr5+13152+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1044 = _mm512_max_ps(_mm512_setzero_ps(), dat1044);
__m512 in134 = _mm512_permutexvar_ps(pm78, dat1043);
__m512 in142 = _mm512_permutexvar_ps(pm78, dat1044);
__m512 dat1045 = _mm512_maskz_loadu_ps(16383, datPtr5+13328+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1045 = _mm512_max_ps(_mm512_setzero_ps(), dat1045);
__m512 dat1046 = _mm512_maskz_loadu_ps(16383, datPtr5+13376+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1046 = _mm512_max_ps(_mm512_setzero_ps(), dat1046);
__m512 in135 = _mm512_permutexvar_ps(pm78, dat1045);
__m512 in143 = _mm512_permutexvar_ps(pm78, dat1046);
__m512 dat1047 = _mm512_maskz_loadu_ps(16383, datPtr5+13552+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1047 = _mm512_max_ps(_mm512_setzero_ps(), dat1047);
__m512 dat1048 = _mm512_maskz_loadu_ps(16383, datPtr5+13600+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1048 = _mm512_max_ps(_mm512_setzero_ps(), dat1048);
__m512 in136 = _mm512_permutexvar_ps(pm78, dat1047);
__m512 in144 = _mm512_permutexvar_ps(pm78, dat1048);
__m512 dat1049 = _mm512_maskz_loadu_ps(16383, datPtr5+13776+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1049 = _mm512_max_ps(_mm512_setzero_ps(), dat1049);
__m512 dat1050 = _mm512_maskz_loadu_ps(16383, datPtr5+13824+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1050 = _mm512_max_ps(_mm512_setzero_ps(), dat1050);
__m512 in137 = _mm512_permutexvar_ps(pm78, dat1049);
__m512 in145 = _mm512_permutexvar_ps(pm78, dat1050);
__m512 dat1051 = _mm512_maskz_loadu_ps(16383, datPtr5+14000+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1051 = _mm512_max_ps(_mm512_setzero_ps(), dat1051);
__m512 dat1052 = _mm512_maskz_loadu_ps(16383, datPtr5+14048+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1052 = _mm512_max_ps(_mm512_setzero_ps(), dat1052);
__m512 in138 = _mm512_permutexvar_ps(pm78, dat1051);
__m512 in146 = _mm512_permutexvar_ps(pm78, dat1052);
__m512 dat1053 = _mm512_maskz_loadu_ps(16383, datPtr5+14224+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1053 = _mm512_max_ps(_mm512_setzero_ps(), dat1053);
__m512 dat1054 = _mm512_maskz_loadu_ps(16383, datPtr5+14272+806912*i16+224*h22+4*w25+806912*s13+25216*k54);
dat1054 = _mm512_max_ps(_mm512_setzero_ps(), dat1054);
__m512 in139 = _mm512_permutexvar_ps(pm78, dat1053);
__m512 in147 = _mm512_permutexvar_ps(pm78, dat1054);
// Pass 1: fixed-coefficient combination of the eight row vectors. Statements alternate between
// the in132..in139 set and the in140..in147 set, which are treated identically.
// With d0..d7 = in132..in139 as loaded, the statements below produce (up to FMA rounding):
//   in132  =  d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp815 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
//   tmp816 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
//   in138  =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
//   tmp814 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
//   in134  =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
//   in136  = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
//   in135  = -d1 + 5.25*d3 - 5.25*d5 + d7
// NOTE(review): these coefficients look like the 8-point input transform of Winograd
// F(6x6, 3x3) -- confirm against the generator.
__m512 tmp813 = _mm512_add_ps(in133, in137);
__m512 tmp817 = _mm512_add_ps(in141, in145);
__m512 tmp814 = _mm512_sub_ps(in136, in134);
__m512 tmp818 = _mm512_sub_ps(in144, in142);
__m512 tmp815 = _mm512_add_ps(in134, in138);
__m512 tmp819 = _mm512_add_ps(in142, in146);
in132 = _mm512_sub_ps(in132, in138);
in140 = _mm512_sub_ps(in140, in146);
tmp813 = _mm512_fmadd_ps(in135, _mm512_set1_ps(-4.25e+00f), tmp813);
tmp817 = _mm512_fmadd_ps(in143, _mm512_set1_ps(-4.25e+00f), tmp817);
tmp815 = _mm512_fmadd_ps(in136, _mm512_set1_ps(-4.25e+00f), tmp815);
tmp819 = _mm512_fmadd_ps(in144, _mm512_set1_ps(-4.25e+00f), tmp819);
in132 = _mm512_fmadd_ps(tmp814, _mm512_set1_ps(5.25e+00f), in132);
in140 = _mm512_fmadd_ps(tmp818, _mm512_set1_ps(5.25e+00f), in140);
tmp814 = _mm512_fmadd_ps(in134, _mm512_set1_ps(2.5e-01f), in138);
tmp818 = _mm512_fmadd_ps(in142, _mm512_set1_ps(2.5e-01f), in146);
in134 = _mm512_fmadd_ps(in134, _mm512_set1_ps(4e+00f), in138);
in142 = _mm512_fmadd_ps(in142, _mm512_set1_ps(4e+00f), in146);
__m512 tmp816 = _mm512_sub_ps(tmp815, tmp813);
__m512 tmp820 = _mm512_sub_ps(tmp819, tmp817);
tmp815 = _mm512_add_ps(tmp813, tmp815);
tmp819 = _mm512_add_ps(tmp817, tmp819);
tmp813 = _mm512_fmadd_ps(in133, _mm512_set1_ps(2.5e-01f), in137);
tmp817 = _mm512_fmadd_ps(in141, _mm512_set1_ps(2.5e-01f), in145);
tmp814 = _mm512_fmadd_ps(in136, _mm512_set1_ps(-1.25e+00f), tmp814);
tmp818 = _mm512_fmadd_ps(in144, _mm512_set1_ps(-1.25e+00f), tmp818);
in136 = _mm512_fmadd_ps(in136, _mm512_set1_ps(-5e+00f), in134);
in144 = _mm512_fmadd_ps(in144, _mm512_set1_ps(-5e+00f), in142);
tmp813 = _mm512_fmadd_ps(in135, _mm512_set1_ps(-1.25e+00f), tmp813);
tmp817 = _mm512_fmadd_ps(in143, _mm512_set1_ps(-1.25e+00f), tmp817);
in138 = _mm512_fmadd_ps(tmp813, _mm512_set1_ps(2e+00f), tmp814);
in146 = _mm512_fmadd_ps(tmp817, _mm512_set1_ps(2e+00f), tmp818);
tmp814 = _mm512_fnmadd_ps(tmp813, _mm512_set1_ps(2e+00f), tmp814);
tmp818 = _mm512_fnmadd_ps(tmp817, _mm512_set1_ps(2e+00f), tmp818);
// in133/in141 are still the loaded values here; they are overwritten only two statements later.
tmp813 = _mm512_fmadd_ps(in137, _mm512_set1_ps(2.5e-01f), in133);
tmp817 = _mm512_fmadd_ps(in145, _mm512_set1_ps(2.5e-01f), in141);
in133 = _mm512_sub_ps(in139, in133);
in141 = _mm512_sub_ps(in147, in141);
tmp813 = _mm512_fmadd_ps(in135, _mm512_set1_ps(-1.25e+00f), tmp813);
tmp817 = _mm512_fmadd_ps(in143, _mm512_set1_ps(-1.25e+00f), tmp817);
in135 = _mm512_sub_ps(in135, in137);
in143 = _mm512_sub_ps(in143, in145);
in135 = _mm512_fmadd_ps(in135, _mm512_set1_ps(5.25e+00f), in133);
in143 = _mm512_fmadd_ps(in143, _mm512_set1_ps(5.25e+00f), in141);
in134 = _mm512_fmadd_ps(tmp813, _mm512_set1_ps(2e+00f), in136);
in142 = _mm512_fmadd_ps(tmp817, _mm512_set1_ps(2e+00f), in144);
in136 = _mm512_fnmadd_ps(tmp813, _mm512_set1_ps(2e+00f), in136);
in144 = _mm512_fnmadd_ps(tmp817, _mm512_set1_ps(2e+00f), in144);
// Transpose, step 1: unpacklo/unpackhi interleave pairs of pass-1 results within each
// 128-bit lane (operands are taken in pass-1 output order 0..7).
__m512 tmp829 = _mm512_unpacklo_ps(in132, tmp815);
__m512 tmp830 = _mm512_unpackhi_ps(in132, tmp815);
__m512 tmp831 = _mm512_unpacklo_ps(tmp816, in138);
__m512 tmp832 = _mm512_unpackhi_ps(tmp816, in138);
__m512 tmp833 = _mm512_unpacklo_ps(tmp814, in134);
__m512 tmp834 = _mm512_unpackhi_ps(tmp814, in134);
__m512 tmp835 = _mm512_unpacklo_ps(in136, in135);
__m512 tmp836 = _mm512_unpackhi_ps(in136, in135);
__m512 tmp837 = _mm512_unpacklo_ps(in140, tmp819);
__m512 tmp838 = _mm512_unpackhi_ps(in140, tmp819);
__m512 tmp839 = _mm512_unpacklo_ps(tmp820, in146);
__m512 tmp840 = _mm512_unpackhi_ps(tmp820, in146);
__m512 tmp841 = _mm512_unpacklo_ps(tmp818, in142);
__m512 tmp842 = _mm512_unpackhi_ps(tmp818, in142);
__m512 tmp843 = _mm512_unpacklo_ps(in144, in143);
__m512 tmp844 = _mm512_unpackhi_ps(in144, in143);
// Transpose, step 2: shuffle_ps with 68 / 238 takes the low / high element pair of both
// operands per 128-bit lane, completing a 4x4 transpose inside every 128-bit lane.
__m512 tmp845 = _mm512_shuffle_ps(tmp829, tmp831, 68);
__m512 tmp846 = _mm512_shuffle_ps(tmp829, tmp831, 238);
__m512 tmp847 = _mm512_shuffle_ps(tmp830, tmp832, 68);
__m512 tmp848 = _mm512_shuffle_ps(tmp830, tmp832, 238);
__m512 tmp849 = _mm512_shuffle_ps(tmp833, tmp835, 68);
__m512 tmp850 = _mm512_shuffle_ps(tmp833, tmp835, 238);
__m512 tmp851 = _mm512_shuffle_ps(tmp834, tmp836, 68);
__m512 tmp852 = _mm512_shuffle_ps(tmp834, tmp836, 238);
__m512 tmp853 = _mm512_shuffle_ps(tmp837, tmp839, 68);
__m512 tmp854 = _mm512_shuffle_ps(tmp837, tmp839, 238);
__m512 tmp855 = _mm512_shuffle_ps(tmp838, tmp840, 68);
__m512 tmp856 = _mm512_shuffle_ps(tmp838, tmp840, 238);
__m512 tmp857 = _mm512_shuffle_ps(tmp841, tmp843, 68);
__m512 tmp858 = _mm512_shuffle_ps(tmp841, tmp843, 238);
__m512 tmp859 = _mm512_shuffle_ps(tmp842, tmp844, 68);
__m512 tmp860 = _mm512_shuffle_ps(tmp842, tmp844, 238);
// Transpose, step 3: shuffle_f32x4 with 136 picks 128-bit lanes 0 and 2 of each operand,
// 221 picks lanes 1 and 3; this regroups the 4x4 pieces across 128-bit lanes.
__m512 tmp861 = _mm512_shuffle_f32x4(tmp845, tmp849, 136);
__m512 tmp862 = _mm512_shuffle_f32x4(tmp845, tmp849, 221);
__m512 tmp863 = _mm512_shuffle_f32x4(tmp846, tmp850, 136);
__m512 tmp864 = _mm512_shuffle_f32x4(tmp846, tmp850, 221);
__m512 tmp865 = _mm512_shuffle_f32x4(tmp847, tmp851, 136);
__m512 tmp866 = _mm512_shuffle_f32x4(tmp847, tmp851, 221);
__m512 tmp867 = _mm512_shuffle_f32x4(tmp848, tmp852, 136);
__m512 tmp868 = _mm512_shuffle_f32x4(tmp848, tmp852, 221);
__m512 tmp869 = _mm512_shuffle_f32x4(tmp853, tmp857, 136);
__m512 tmp870 = _mm512_shuffle_f32x4(tmp853, tmp857, 221);
__m512 tmp871 = _mm512_shuffle_f32x4(tmp854, tmp858, 136);
__m512 tmp872 = _mm512_shuffle_f32x4(tmp854, tmp858, 221);
__m512 tmp873 = _mm512_shuffle_f32x4(tmp855, tmp859, 136);
__m512 tmp874 = _mm512_shuffle_f32x4(tmp855, tmp859, 221);
__m512 tmp875 = _mm512_shuffle_f32x4(tmp856, tmp860, 136);
__m512 tmp876 = _mm512_shuffle_f32x4(tmp856, tmp860, 221);
// Transpose, step 4: after these shuffles in132, tmp815, tmp816, in138, tmp814, in134, in136,
// in135 hold columns 0..7 of the low-half windows (first-load window in lanes 0..7,
// second-load window in lanes 8..15); in140, tmp819, tmp820, in146, tmp818, in142, in144,
// in143 hold the same columns of the high-half windows.
in132 = _mm512_shuffle_f32x4(tmp861, tmp869, 136);
in140 = _mm512_shuffle_f32x4(tmp861, tmp869, 221);
tmp815 = _mm512_shuffle_f32x4(tmp863, tmp871, 136);
tmp819 = _mm512_shuffle_f32x4(tmp863, tmp871, 221);
tmp816 = _mm512_shuffle_f32x4(tmp865, tmp873, 136);
tmp820 = _mm512_shuffle_f32x4(tmp865, tmp873, 221);
in138 = _mm512_shuffle_f32x4(tmp867, tmp875, 136);
in146 = _mm512_shuffle_f32x4(tmp867, tmp875, 221);
tmp814 = _mm512_shuffle_f32x4(tmp862, tmp870, 136);
tmp818 = _mm512_shuffle_f32x4(tmp862, tmp870, 221);
in134 = _mm512_shuffle_f32x4(tmp864, tmp872, 136);
in142 = _mm512_shuffle_f32x4(tmp864, tmp872, 221);
in136 = _mm512_shuffle_f32x4(tmp866, tmp874, 136);
in144 = _mm512_shuffle_f32x4(tmp866, tmp874, 221);
in135 = _mm512_shuffle_f32x4(tmp868, tmp876, 136);
in143 = _mm512_shuffle_f32x4(tmp868, tmp876, 221);
// Pass 2: the same coefficient pattern as pass 1, applied to the transposed vectors
// (inputs 0..7 are in132, tmp815, tmp816, in138, tmp814, in134, in136, in135).
// Results 0..7 end up in in132, tmp823, tmp824, in136, tmp822, tmp816, tmp814, in138
// (and in140, tmp827, tmp828, in144, tmp826, tmp820, tmp818, in146 for the other set).
__m512 tmp821 = _mm512_add_ps(tmp815, in134);
__m512 tmp825 = _mm512_add_ps(tmp819, in142);
__m512 tmp822 = _mm512_sub_ps(tmp814, tmp816);
__m512 tmp826 = _mm512_sub_ps(tmp818, tmp820);
__m512 tmp823 = _mm512_add_ps(tmp816, in136);
__m512 tmp827 = _mm512_add_ps(tmp820, in144);
in132 = _mm512_sub_ps(in132, in136);
in140 = _mm512_sub_ps(in140, in144);
tmp821 = _mm512_fmadd_ps(in138, _mm512_set1_ps(-4.25e+00f), tmp821);
tmp825 = _mm512_fmadd_ps(in146, _mm512_set1_ps(-4.25e+00f), tmp825);
tmp823 = _mm512_fmadd_ps(tmp814, _mm512_set1_ps(-4.25e+00f), tmp823);
tmp827 = _mm512_fmadd_ps(tmp818, _mm512_set1_ps(-4.25e+00f), tmp827);
in132 = _mm512_fmadd_ps(tmp822, _mm512_set1_ps(5.25e+00f), in132);
in140 = _mm512_fmadd_ps(tmp826, _mm512_set1_ps(5.25e+00f), in140);
tmp822 = _mm512_fmadd_ps(tmp816, _mm512_set1_ps(2.5e-01f), in136);
tmp826 = _mm512_fmadd_ps(tmp820, _mm512_set1_ps(2.5e-01f), in144);
tmp816 = _mm512_fmadd_ps(tmp816, _mm512_set1_ps(4e+00f), in136);
tmp820 = _mm512_fmadd_ps(tmp820, _mm512_set1_ps(4e+00f), in144);
__m512 tmp824 = _mm512_sub_ps(tmp823, tmp821);
__m512 tmp828 = _mm512_sub_ps(tmp827, tmp825);
tmp823 = _mm512_add_ps(tmp821, tmp823);
tmp827 = _mm512_add_ps(tmp825, tmp827);
tmp821 = _mm512_fmadd_ps(tmp815, _mm512_set1_ps(2.5e-01f), in134);
tmp825 = _mm512_fmadd_ps(tmp819, _mm512_set1_ps(2.5e-01f), in142);
tmp822 = _mm512_fmadd_ps(tmp814, _mm512_set1_ps(-1.25e+00f), tmp822);
tmp826 = _mm512_fmadd_ps(tmp818, _mm512_set1_ps(-1.25e+00f), tmp826);
tmp814 = _mm512_fmadd_ps(tmp814, _mm512_set1_ps(-5e+00f), tmp816);
tmp818 = _mm512_fmadd_ps(tmp818, _mm512_set1_ps(-5e+00f), tmp820);
tmp821 = _mm512_fmadd_ps(in138, _mm512_set1_ps(-1.25e+00f), tmp821);
tmp825 = _mm512_fmadd_ps(in146, _mm512_set1_ps(-1.25e+00f), tmp825);
in136 = _mm512_fmadd_ps(tmp821, _mm512_set1_ps(2e+00f), tmp822);
in144 = _mm512_fmadd_ps(tmp825, _mm512_set1_ps(2e+00f), tmp826);
tmp822 = _mm512_fnmadd_ps(tmp821, _mm512_set1_ps(2e+00f), tmp822);
tmp826 = _mm512_fnmadd_ps(tmp825, _mm512_set1_ps(2e+00f), tmp826);
tmp821 = _mm512_fmadd_ps(in134, _mm512_set1_ps(2.5e-01f), tmp815);
tmp825 = _mm512_fmadd_ps(in142, _mm512_set1_ps(2.5e-01f), tmp819);
tmp815 = _mm512_sub_ps(in135, tmp815);
tmp819 = _mm512_sub_ps(in143, tmp819);
tmp821 = _mm512_fmadd_ps(in138, _mm512_set1_ps(-1.25e+00f), tmp821);
tmp825 = _mm512_fmadd_ps(in146, _mm512_set1_ps(-1.25e+00f), tmp825);
in138 = _mm512_sub_ps(in138, in134);
in146 = _mm512_sub_ps(in146, in142);
in138 = _mm512_fmadd_ps(in138, _mm512_set1_ps(5.25e+00f), tmp815);
in146 = _mm512_fmadd_ps(in146, _mm512_set1_ps(5.25e+00f), tmp819);
tmp816 = _mm512_fmadd_ps(tmp821, _mm512_set1_ps(2e+00f), tmp814);
tmp820 = _mm512_fmadd_ps(tmp825, _mm512_set1_ps(2e+00f), tmp818);
tmp814 = _mm512_fnmadd_ps(tmp821, _mm512_set1_ps(2e+00f), tmp814);
tmp818 = _mm512_fnmadd_ps(tmp825, _mm512_set1_ps(2e+00f), tmp818);
// Pack: shuffle_f32x4 with 68 concatenates the low 256 bits of both operands, 238 the high
// 256 bits. Each outNNN therefore carries two consecutive pass-2 results (0/1, 2/3, 4/5, 6/7)
// taken from the same 8-lane half.
__m512 out151 = _mm512_shuffle_f32x4(in132, tmp823, 68);
__m512 out159 = _mm512_shuffle_f32x4(in132, tmp823, 238);
__m512 out152 = _mm512_shuffle_f32x4(tmp824, in136, 68);
__m512 out160 = _mm512_shuffle_f32x4(tmp824, in136, 238);
__m512 out153 = _mm512_shuffle_f32x4(tmp822, tmp816, 68);
__m512 out161 = _mm512_shuffle_f32x4(tmp822, tmp816, 238);
__m512 out154 = _mm512_shuffle_f32x4(tmp814, in138, 68);
__m512 out162 = _mm512_shuffle_f32x4(tmp814, in138, 238);
__m512 out155 = _mm512_shuffle_f32x4(in140, tmp827, 68);
__m512 out163 = _mm512_shuffle_f32x4(in140, tmp827, 238);
__m512 out156 = _mm512_shuffle_f32x4(tmp828, in144, 68);
__m512 out164 = _mm512_shuffle_f32x4(tmp828, in144, 238);
__m512 out157 = _mm512_shuffle_f32x4(tmp826, tmp820, 68);
__m512 out165 = _mm512_shuffle_f32x4(tmp826, tmp820, 238);
__m512 out158 = _mm512_shuffle_f32x4(tmp818, in146, 68);
__m512 out166 = _mm512_shuffle_f32x4(tmp818, in146, 238);
// Stores: four groups whose base offsets are 409600 apart in dfPtr4's address units
// (512, 410112, 819712, 1229312), one group per result pair; within a group the four
// vectors land at +0, +64, +128, +192 (written in the order +0, +128, +64, +192).
// NOTE(review): if dfPtr4 is a byte pointer each group is one contiguous 256-byte run --
// confirm against its declaration.
_mm512_storeu_ps(dfPtr4+512+1638400*i16+24576*j11+24576*s13+768*k54, out151);
_mm512_storeu_ps(dfPtr4+640+1638400*i16+24576*j11+24576*s13+768*k54, out159);
_mm512_storeu_ps(dfPtr4+576+1638400*i16+24576*j11+24576*s13+768*k54, out155);
_mm512_storeu_ps(dfPtr4+704+1638400*i16+24576*j11+24576*s13+768*k54, out163);
_mm512_storeu_ps(dfPtr4+410112+1638400*i16+24576*j11+24576*s13+768*k54, out152);
_mm512_storeu_ps(dfPtr4+410240+1638400*i16+24576*j11+24576*s13+768*k54, out160);
_mm512_storeu_ps(dfPtr4+410176+1638400*i16+24576*j11+24576*s13+768*k54, out156);
_mm512_storeu_ps(dfPtr4+410304+1638400*i16+24576*j11+24576*s13+768*k54, out164);
_mm512_storeu_ps(dfPtr4+819712+1638400*i16+24576*j11+24576*s13+768*k54, out153);
_mm512_storeu_ps(dfPtr4+819840+1638400*i16+24576*j11+24576*s13+768*k54, out161);
_mm512_storeu_ps(dfPtr4+819776+1638400*i16+24576*j11+24576*s13+768*k54, out157);
_mm512_storeu_ps(dfPtr4+819904+1638400*i16+24576*j11+24576*s13+768*k54, out165);
_mm512_storeu_ps(dfPtr4+1229312+1638400*i16+24576*j11+24576*s13+768*k54, out154);
_mm512_storeu_ps(dfPtr4+1229440+1638400*i16+24576*j11+24576*s13+768*k54, out162);
_mm512_storeu_ps(dfPtr4+1229376+1638400*i16+24576*j11+24576*s13+768*k54, out158);
_mm512_storeu_ps(dfPtr4+1229504+1638400*i16+24576*j11+24576*s13+768*k54, out166);
}
// Return from the enclosing function once j11 has reached last3; otherwise advance to the
// next j11 (j11 scales the 24576*j11 term of the dfPtr4 store addresses above) and set rel8 to 1.
// NOTE(review): rel8's role is not visible in this part of the file -- presumably it selects
// which position-specific code path runs for the new j11; confirm at its other uses.
if (j11 >= last3) return;
++j11;
rel8 = 1;
}
// Indices for the k55 loop that follows: h23 (taken from base8) and w26 (fixed at 48) enter
// the datPtr5 load addresses there as 224*h23 + 4*w26, and k55 is that loop's counter
// (runs 0..31, scaling the 25216*k55 term).
// NOTE(review): h23/w26 look like a row and column origin within the input -- confirm.
ptrdiff_t h23 = base8+0;
ptrdiff_t w26 = 48;
ptrdiff_t k55 = 0;
for (; k55 != 32; ++k55) {
__m512 dat1055 = _mm512_maskz_loadu_ps(511, datPtr5+0+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1055 = _mm512_max_ps(_mm512_setzero_ps(), dat1055);
__m512 dat1056 = _mm512_maskz_loadu_ps(8191, datPtr5+1156+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1056 = _mm512_max_ps(_mm512_setzero_ps(), dat1056);
__m512i pm79 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in148 = _mm512_permutexvar_ps(pm79, dat1055);
__m512i pm80 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in156 = _mm512_permutexvar_ps(pm80, dat1056);
__m512 dat1057 = _mm512_maskz_loadu_ps(511, datPtr5+224+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1057 = _mm512_max_ps(_mm512_setzero_ps(), dat1057);
__m512 dat1058 = _mm512_maskz_loadu_ps(8191, datPtr5+1380+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1058 = _mm512_max_ps(_mm512_setzero_ps(), dat1058);
__m512 in149 = _mm512_permutexvar_ps(pm79, dat1057);
__m512 in157 = _mm512_permutexvar_ps(pm80, dat1058);
__m512 dat1059 = _mm512_maskz_loadu_ps(511, datPtr5+448+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1059 = _mm512_max_ps(_mm512_setzero_ps(), dat1059);
__m512 dat1060 = _mm512_maskz_loadu_ps(8191, datPtr5+1604+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1060 = _mm512_max_ps(_mm512_setzero_ps(), dat1060);
__m512 in150 = _mm512_permutexvar_ps(pm79, dat1059);
__m512 in158 = _mm512_permutexvar_ps(pm80, dat1060);
__m512 dat1061 = _mm512_maskz_loadu_ps(511, datPtr5+672+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1061 = _mm512_max_ps(_mm512_setzero_ps(), dat1061);
__m512 dat1062 = _mm512_maskz_loadu_ps(8191, datPtr5+1828+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1062 = _mm512_max_ps(_mm512_setzero_ps(), dat1062);
__m512 in151 = _mm512_permutexvar_ps(pm79, dat1061);
__m512 in159 = _mm512_permutexvar_ps(pm80, dat1062);
__m512 dat1063 = _mm512_maskz_loadu_ps(511, datPtr5+896+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1063 = _mm512_max_ps(_mm512_setzero_ps(), dat1063);
__m512 dat1064 = _mm512_maskz_loadu_ps(8191, datPtr5+2052+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1064 = _mm512_max_ps(_mm512_setzero_ps(), dat1064);
__m512 in152 = _mm512_permutexvar_ps(pm79, dat1063);
__m512 in160 = _mm512_permutexvar_ps(pm80, dat1064);
__m512 dat1065 = _mm512_maskz_loadu_ps(511, datPtr5+1120+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1065 = _mm512_max_ps(_mm512_setzero_ps(), dat1065);
__m512 dat1066 = _mm512_maskz_loadu_ps(8191, datPtr5+2276+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1066 = _mm512_max_ps(_mm512_setzero_ps(), dat1066);
__m512 in153 = _mm512_permutexvar_ps(pm79, dat1065);
__m512 in161 = _mm512_permutexvar_ps(pm80, dat1066);
__m512 dat1067 = _mm512_maskz_loadu_ps(511, datPtr5+1344+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1067 = _mm512_max_ps(_mm512_setzero_ps(), dat1067);
__m512 dat1068 = _mm512_maskz_loadu_ps(8191, datPtr5+2500+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1068 = _mm512_max_ps(_mm512_setzero_ps(), dat1068);
__m512 in154 = _mm512_permutexvar_ps(pm79, dat1067);
__m512 in162 = _mm512_permutexvar_ps(pm80, dat1068);
__m512 dat1069 = _mm512_maskz_loadu_ps(511, datPtr5+1568+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1069 = _mm512_max_ps(_mm512_setzero_ps(), dat1069);
__m512 dat1070 = _mm512_maskz_loadu_ps(8191, datPtr5+2724+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1070 = _mm512_max_ps(_mm512_setzero_ps(), dat1070);
__m512 in155 = _mm512_permutexvar_ps(pm79, dat1069);
__m512 in163 = _mm512_permutexvar_ps(pm80, dat1070);
__m512 tmp877 = _mm512_add_ps(in149, in153);
__m512 tmp881 = _mm512_add_ps(in157, in161);
__m512 tmp878 = _mm512_sub_ps(in152, in150);
__m512 tmp882 = _mm512_sub_ps(in160, in158);
__m512 tmp879 = _mm512_add_ps(in150, in154);
__m512 tmp883 = _mm512_add_ps(in158, in162);
in148 = _mm512_sub_ps(in148, in154);
in156 = _mm512_sub_ps(in156, in162);
tmp877 = _mm512_fmadd_ps(in151, _mm512_set1_ps(-4.25e+00f), tmp877);
tmp881 = _mm512_fmadd_ps(in159, _mm512_set1_ps(-4.25e+00f), tmp881);
tmp879 = _mm512_fmadd_ps(in152, _mm512_set1_ps(-4.25e+00f), tmp879);
tmp883 = _mm512_fmadd_ps(in160, _mm512_set1_ps(-4.25e+00f), tmp883);
in148 = _mm512_fmadd_ps(tmp878, _mm512_set1_ps(5.25e+00f), in148);
in156 = _mm512_fmadd_ps(tmp882, _mm512_set1_ps(5.25e+00f), in156);
tmp878 = _mm512_fmadd_ps(in150, _mm512_set1_ps(2.5e-01f), in154);
tmp882 = _mm512_fmadd_ps(in158, _mm512_set1_ps(2.5e-01f), in162);
in150 = _mm512_fmadd_ps(in150, _mm512_set1_ps(4e+00f), in154);
in158 = _mm512_fmadd_ps(in158, _mm512_set1_ps(4e+00f), in162);
__m512 tmp880 = _mm512_sub_ps(tmp879, tmp877);
__m512 tmp884 = _mm512_sub_ps(tmp883, tmp881);
tmp879 = _mm512_add_ps(tmp877, tmp879);
tmp883 = _mm512_add_ps(tmp881, tmp883);
tmp877 = _mm512_fmadd_ps(in149, _mm512_set1_ps(2.5e-01f), in153);
tmp881 = _mm512_fmadd_ps(in157, _mm512_set1_ps(2.5e-01f), in161);
tmp878 = _mm512_fmadd_ps(in152, _mm512_set1_ps(-1.25e+00f), tmp878);
tmp882 = _mm512_fmadd_ps(in160, _mm512_set1_ps(-1.25e+00f), tmp882);
in152 = _mm512_fmadd_ps(in152, _mm512_set1_ps(-5e+00f), in150);
in160 = _mm512_fmadd_ps(in160, _mm512_set1_ps(-5e+00f), in158);
tmp877 = _mm512_fmadd_ps(in151, _mm512_set1_ps(-1.25e+00f), tmp877);
tmp881 = _mm512_fmadd_ps(in159, _mm512_set1_ps(-1.25e+00f), tmp881);
in154 = _mm512_fmadd_ps(tmp877, _mm512_set1_ps(2e+00f), tmp878);
in162 = _mm512_fmadd_ps(tmp881, _mm512_set1_ps(2e+00f), tmp882);
tmp878 = _mm512_fnmadd_ps(tmp877, _mm512_set1_ps(2e+00f), tmp878);
tmp882 = _mm512_fnmadd_ps(tmp881, _mm512_set1_ps(2e+00f), tmp882);
tmp877 = _mm512_fmadd_ps(in153, _mm512_set1_ps(2.5e-01f), in149);
tmp881 = _mm512_fmadd_ps(in161, _mm512_set1_ps(2.5e-01f), in157);
in149 = _mm512_sub_ps(in155, in149);
in157 = _mm512_sub_ps(in163, in157);
tmp877 = _mm512_fmadd_ps(in151, _mm512_set1_ps(-1.25e+00f), tmp877);
tmp881 = _mm512_fmadd_ps(in159, _mm512_set1_ps(-1.25e+00f), tmp881);
in151 = _mm512_sub_ps(in151, in153);
in159 = _mm512_sub_ps(in159, in161);
in151 = _mm512_fmadd_ps(in151, _mm512_set1_ps(5.25e+00f), in149);
in159 = _mm512_fmadd_ps(in159, _mm512_set1_ps(5.25e+00f), in157);
in150 = _mm512_fmadd_ps(tmp877, _mm512_set1_ps(2e+00f), in152);
in158 = _mm512_fmadd_ps(tmp881, _mm512_set1_ps(2e+00f), in160);
in152 = _mm512_fnmadd_ps(tmp877, _mm512_set1_ps(2e+00f), in152);
in160 = _mm512_fnmadd_ps(tmp881, _mm512_set1_ps(2e+00f), in160);
__m512 tmp893 = _mm512_unpacklo_ps(in148, tmp879);
__m512 tmp894 = _mm512_unpackhi_ps(in148, tmp879);
__m512 tmp895 = _mm512_unpacklo_ps(tmp880, in154);
__m512 tmp896 = _mm512_unpackhi_ps(tmp880, in154);
__m512 tmp897 = _mm512_unpacklo_ps(tmp878, in150);
__m512 tmp898 = _mm512_unpackhi_ps(tmp878, in150);
__m512 tmp899 = _mm512_unpacklo_ps(in152, in151);
__m512 tmp900 = _mm512_unpackhi_ps(in152, in151);
__m512 tmp901 = _mm512_unpacklo_ps(in156, tmp883);
__m512 tmp902 = _mm512_unpackhi_ps(in156, tmp883);
__m512 tmp903 = _mm512_unpacklo_ps(tmp884, in162);
__m512 tmp904 = _mm512_unpackhi_ps(tmp884, in162);
__m512 tmp905 = _mm512_unpacklo_ps(tmp882, in158);
__m512 tmp906 = _mm512_unpackhi_ps(tmp882, in158);
__m512 tmp907 = _mm512_unpacklo_ps(in160, in159);
__m512 tmp908 = _mm512_unpackhi_ps(in160, in159);
__m512 tmp909 = _mm512_shuffle_ps(tmp893, tmp895, 68);
__m512 tmp910 = _mm512_shuffle_ps(tmp893, tmp895, 238);
__m512 tmp911 = _mm512_shuffle_ps(tmp894, tmp896, 68);
__m512 tmp912 = _mm512_shuffle_ps(tmp894, tmp896, 238);
__m512 tmp913 = _mm512_shuffle_ps(tmp897, tmp899, 68);
__m512 tmp914 = _mm512_shuffle_ps(tmp897, tmp899, 238);
__m512 tmp915 = _mm512_shuffle_ps(tmp898, tmp900, 68);
__m512 tmp916 = _mm512_shuffle_ps(tmp898, tmp900, 238);
__m512 tmp917 = _mm512_shuffle_ps(tmp901, tmp903, 68);
__m512 tmp918 = _mm512_shuffle_ps(tmp901, tmp903, 238);
__m512 tmp919 = _mm512_shuffle_ps(tmp902, tmp904, 68);
__m512 tmp920 = _mm512_shuffle_ps(tmp902, tmp904, 238);
__m512 tmp921 = _mm512_shuffle_ps(tmp905, tmp907, 68);
__m512 tmp922 = _mm512_shuffle_ps(tmp905, tmp907, 238);
__m512 tmp923 = _mm512_shuffle_ps(tmp906, tmp908, 68);
__m512 tmp924 = _mm512_shuffle_ps(tmp906, tmp908, 238);
__m512 tmp925 = _mm512_shuffle_f32x4(tmp909, tmp913, 136);
__m512 tmp926 = _mm512_shuffle_f32x4(tmp909, tmp913, 221);
__m512 tmp927 = _mm512_shuffle_f32x4(tmp910, tmp914, 136);
__m512 tmp928 = _mm512_shuffle_f32x4(tmp910, tmp914, 221);
__m512 tmp929 = _mm512_shuffle_f32x4(tmp911, tmp915, 136);
__m512 tmp930 = _mm512_shuffle_f32x4(tmp911, tmp915, 221);
__m512 tmp931 = _mm512_shuffle_f32x4(tmp912, tmp916, 136);
__m512 tmp932 = _mm512_shuffle_f32x4(tmp912, tmp916, 221);
__m512 tmp933 = _mm512_shuffle_f32x4(tmp917, tmp921, 136);
__m512 tmp934 = _mm512_shuffle_f32x4(tmp917, tmp921, 221);
__m512 tmp935 = _mm512_shuffle_f32x4(tmp918, tmp922, 136);
__m512 tmp936 = _mm512_shuffle_f32x4(tmp918, tmp922, 221);
__m512 tmp937 = _mm512_shuffle_f32x4(tmp919, tmp923, 136);
__m512 tmp938 = _mm512_shuffle_f32x4(tmp919, tmp923, 221);
__m512 tmp939 = _mm512_shuffle_f32x4(tmp920, tmp924, 136);
__m512 tmp940 = _mm512_shuffle_f32x4(tmp920, tmp924, 221);
in148 = _mm512_shuffle_f32x4(tmp925, tmp933, 136);
in156 = _mm512_shuffle_f32x4(tmp925, tmp933, 221);
tmp879 = _mm512_shuffle_f32x4(tmp927, tmp935, 136);
tmp883 = _mm512_shuffle_f32x4(tmp927, tmp935, 221);
tmp880 = _mm512_shuffle_f32x4(tmp929, tmp937, 136);
tmp884 = _mm512_shuffle_f32x4(tmp929, tmp937, 221);
in154 = _mm512_shuffle_f32x4(tmp931, tmp939, 136);
in162 = _mm512_shuffle_f32x4(tmp931, tmp939, 221);
tmp878 = _mm512_shuffle_f32x4(tmp926, tmp934, 136);
tmp882 = _mm512_shuffle_f32x4(tmp926, tmp934, 221);
in150 = _mm512_shuffle_f32x4(tmp928, tmp936, 136);
in158 = _mm512_shuffle_f32x4(tmp928, tmp936, 221);
in152 = _mm512_shuffle_f32x4(tmp930, tmp938, 136);
in160 = _mm512_shuffle_f32x4(tmp930, tmp938, 221);
in151 = _mm512_shuffle_f32x4(tmp932, tmp940, 136);
in159 = _mm512_shuffle_f32x4(tmp932, tmp940, 221);
__m512 tmp885 = _mm512_add_ps(tmp879, in150);
__m512 tmp889 = _mm512_add_ps(tmp883, in158);
__m512 tmp886 = _mm512_sub_ps(tmp878, tmp880);
__m512 tmp890 = _mm512_sub_ps(tmp882, tmp884);
__m512 tmp887 = _mm512_add_ps(tmp880, in152);
__m512 tmp891 = _mm512_add_ps(tmp884, in160);
in148 = _mm512_sub_ps(in148, in152);
in156 = _mm512_sub_ps(in156, in160);
tmp885 = _mm512_fmadd_ps(in154, _mm512_set1_ps(-4.25e+00f), tmp885);
tmp889 = _mm512_fmadd_ps(in162, _mm512_set1_ps(-4.25e+00f), tmp889);
tmp887 = _mm512_fmadd_ps(tmp878, _mm512_set1_ps(-4.25e+00f), tmp887);
tmp891 = _mm512_fmadd_ps(tmp882, _mm512_set1_ps(-4.25e+00f), tmp891);
in148 = _mm512_fmadd_ps(tmp886, _mm512_set1_ps(5.25e+00f), in148);
in156 = _mm512_fmadd_ps(tmp890, _mm512_set1_ps(5.25e+00f), in156);
tmp886 = _mm512_fmadd_ps(tmp880, _mm512_set1_ps(2.5e-01f), in152);
tmp890 = _mm512_fmadd_ps(tmp884, _mm512_set1_ps(2.5e-01f), in160);
tmp880 = _mm512_fmadd_ps(tmp880, _mm512_set1_ps(4e+00f), in152);
tmp884 = _mm512_fmadd_ps(tmp884, _mm512_set1_ps(4e+00f), in160);
__m512 tmp888 = _mm512_sub_ps(tmp887, tmp885);
__m512 tmp892 = _mm512_sub_ps(tmp891, tmp889);
tmp887 = _mm512_add_ps(tmp885, tmp887);
tmp891 = _mm512_add_ps(tmp889, tmp891);
tmp885 = _mm512_fmadd_ps(tmp879, _mm512_set1_ps(2.5e-01f), in150);
tmp889 = _mm512_fmadd_ps(tmp883, _mm512_set1_ps(2.5e-01f), in158);
tmp886 = _mm512_fmadd_ps(tmp878, _mm512_set1_ps(-1.25e+00f), tmp886);
tmp890 = _mm512_fmadd_ps(tmp882, _mm512_set1_ps(-1.25e+00f), tmp890);
tmp878 = _mm512_fmadd_ps(tmp878, _mm512_set1_ps(-5e+00f), tmp880);
tmp882 = _mm512_fmadd_ps(tmp882, _mm512_set1_ps(-5e+00f), tmp884);
tmp885 = _mm512_fmadd_ps(in154, _mm512_set1_ps(-1.25e+00f), tmp885);
tmp889 = _mm512_fmadd_ps(in162, _mm512_set1_ps(-1.25e+00f), tmp889);
in152 = _mm512_fmadd_ps(tmp885, _mm512_set1_ps(2e+00f), tmp886);
in160 = _mm512_fmadd_ps(tmp889, _mm512_set1_ps(2e+00f), tmp890);
tmp886 = _mm512_fnmadd_ps(tmp885, _mm512_set1_ps(2e+00f), tmp886);
tmp890 = _mm512_fnmadd_ps(tmp889, _mm512_set1_ps(2e+00f), tmp890);
tmp885 = _mm512_fmadd_ps(in150, _mm512_set1_ps(2.5e-01f), tmp879);
tmp889 = _mm512_fmadd_ps(in158, _mm512_set1_ps(2.5e-01f), tmp883);
tmp879 = _mm512_sub_ps(in151, tmp879);
tmp883 = _mm512_sub_ps(in159, tmp883);
tmp885 = _mm512_fmadd_ps(in154, _mm512_set1_ps(-1.25e+00f), tmp885);
tmp889 = _mm512_fmadd_ps(in162, _mm512_set1_ps(-1.25e+00f), tmp889);
in154 = _mm512_sub_ps(in154, in150);
in162 = _mm512_sub_ps(in162, in158);
in154 = _mm512_fmadd_ps(in154, _mm512_set1_ps(5.25e+00f), tmp879);
in162 = _mm512_fmadd_ps(in162, _mm512_set1_ps(5.25e+00f), tmp883);
tmp880 = _mm512_fmadd_ps(tmp885, _mm512_set1_ps(2e+00f), tmp878);
tmp884 = _mm512_fmadd_ps(tmp889, _mm512_set1_ps(2e+00f), tmp882);
tmp878 = _mm512_fnmadd_ps(tmp885, _mm512_set1_ps(2e+00f), tmp878);
tmp882 = _mm512_fnmadd_ps(tmp889, _mm512_set1_ps(2e+00f), tmp882);
__m512 out167 = _mm512_shuffle_f32x4(in148, tmp887, 68);
__m512 out175 = _mm512_shuffle_f32x4(in148, tmp887, 238);
__m512 out168 = _mm512_shuffle_f32x4(tmp888, in152, 68);
__m512 out176 = _mm512_shuffle_f32x4(tmp888, in152, 238);
__m512 out169 = _mm512_shuffle_f32x4(tmp886, tmp880, 68);
__m512 out177 = _mm512_shuffle_f32x4(tmp886, tmp880, 238);
__m512 out170 = _mm512_shuffle_f32x4(tmp878, in154, 68);
__m512 out178 = _mm512_shuffle_f32x4(tmp878, in154, 238);
__m512 out171 = _mm512_shuffle_f32x4(in156, tmp891, 68);
__m512 out179 = _mm512_shuffle_f32x4(in156, tmp891, 238);
__m512 out172 = _mm512_shuffle_f32x4(tmp892, in160, 68);
__m512 out180 = _mm512_shuffle_f32x4(tmp892, in160, 238);
__m512 out173 = _mm512_shuffle_f32x4(tmp890, tmp884, 68);
__m512 out181 = _mm512_shuffle_f32x4(tmp890, tmp884, 238);
__m512 out174 = _mm512_shuffle_f32x4(tmp882, in162, 68);
__m512 out182 = _mm512_shuffle_f32x4(tmp882, in162, 238);
_mm512_storeu_ps(dfPtr4+0+1638400*i16+24576*j11+24576*s13+768*k55, out167);
_mm512_storeu_ps(dfPtr4+128+1638400*i16+24576*j11+24576*s13+768*k55, out175);
_mm512_storeu_ps(dfPtr4+64+1638400*i16+24576*j11+24576*s13+768*k55, out171);
_mm512_storeu_ps(dfPtr4+192+1638400*i16+24576*j11+24576*s13+768*k55, out179);
_mm512_storeu_ps(dfPtr4+409600+1638400*i16+24576*j11+24576*s13+768*k55, out168);
_mm512_storeu_ps(dfPtr4+409728+1638400*i16+24576*j11+24576*s13+768*k55, out176);
_mm512_storeu_ps(dfPtr4+409664+1638400*i16+24576*j11+24576*s13+768*k55, out172);
_mm512_storeu_ps(dfPtr4+409792+1638400*i16+24576*j11+24576*s13+768*k55, out180);
_mm512_storeu_ps(dfPtr4+819200+1638400*i16+24576*j11+24576*s13+768*k55, out169);
_mm512_storeu_ps(dfPtr4+819328+1638400*i16+24576*j11+24576*s13+768*k55, out177);
_mm512_storeu_ps(dfPtr4+819264+1638400*i16+24576*j11+24576*s13+768*k55, out173);
_mm512_storeu_ps(dfPtr4+819392+1638400*i16+24576*j11+24576*s13+768*k55, out181);
_mm512_storeu_ps(dfPtr4+1228800+1638400*i16+24576*j11+24576*s13+768*k55, out170);
_mm512_storeu_ps(dfPtr4+1228928+1638400*i16+24576*j11+24576*s13+768*k55, out178);
_mm512_storeu_ps(dfPtr4+1228864+1638400*i16+24576*j11+24576*s13+768*k55, out174);
_mm512_storeu_ps(dfPtr4+1228992+1638400*i16+24576*j11+24576*s13+768*k55, out182);
// Load stage: builds the sixteen vectors in164..in171 and in172..in179 from
// datPtr5. Eight pairs of masked loads are issued; within each series the
// constant offset advances by 224 (1200, 1424, ... 2768 and 12608, 12832, ...
// 14176), the same factor that multiplies h23.
// NOTE(review): datPtr5's type is declared outside this view, so the unit of
// these offsets (bytes vs. elements) is not confirmed here.
//
// Mask 16383 (0x3FFF) loads lanes 0..13 and zeroes lanes 14..15.
// Mask 511 (0x1FF) loads lanes 0..8 and zeroes lanes 9..15.
// Every loaded vector is then max'ed against zero, so negative lanes become 0.
// NOTE(review): presumably this is a ReLU fused into the load - confirm against
// the input graph.
__m512 dat1071 = _mm512_maskz_loadu_ps(16383, datPtr5+1200+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1071 = _mm512_max_ps(_mm512_setzero_ps(), dat1071);
__m512 dat1072 = _mm512_maskz_loadu_ps(511, datPtr5+12608+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1072 = _mm512_max_ps(_mm512_setzero_ps(), dat1072);
// _mm512_set_epi32 lists indices from lane 15 down to lane 0.
// pm81: lanes 0..7 take source elements 0..7, lanes 8..15 take elements 6..13,
// i.e. two 8-wide windows that overlap by two elements.
__m512i pm81 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in164 = _mm512_permutexvar_ps(pm81, dat1071);
// pm82: lanes 0..7 take elements 0..7, lanes 8..10 take elements 6..8, and
// lanes 11..15 all take element 15. Element 15 was zeroed by the 511 mask, so
// those five lanes are 0.
__m512i pm82 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in172 = _mm512_permutexvar_ps(pm82, dat1072);
// The remaining seven pairs repeat the same load / clamp / permute pattern,
// reusing pm81 for the 16383-mask loads and pm82 for the 511-mask loads.
__m512 dat1073 = _mm512_maskz_loadu_ps(16383, datPtr5+1424+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1073 = _mm512_max_ps(_mm512_setzero_ps(), dat1073);
__m512 dat1074 = _mm512_maskz_loadu_ps(511, datPtr5+12832+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1074 = _mm512_max_ps(_mm512_setzero_ps(), dat1074);
__m512 in165 = _mm512_permutexvar_ps(pm81, dat1073);
__m512 in173 = _mm512_permutexvar_ps(pm82, dat1074);
__m512 dat1075 = _mm512_maskz_loadu_ps(16383, datPtr5+1648+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1075 = _mm512_max_ps(_mm512_setzero_ps(), dat1075);
__m512 dat1076 = _mm512_maskz_loadu_ps(511, datPtr5+13056+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1076 = _mm512_max_ps(_mm512_setzero_ps(), dat1076);
__m512 in166 = _mm512_permutexvar_ps(pm81, dat1075);
__m512 in174 = _mm512_permutexvar_ps(pm82, dat1076);
__m512 dat1077 = _mm512_maskz_loadu_ps(16383, datPtr5+1872+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1077 = _mm512_max_ps(_mm512_setzero_ps(), dat1077);
__m512 dat1078 = _mm512_maskz_loadu_ps(511, datPtr5+13280+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1078 = _mm512_max_ps(_mm512_setzero_ps(), dat1078);
__m512 in167 = _mm512_permutexvar_ps(pm81, dat1077);
__m512 in175 = _mm512_permutexvar_ps(pm82, dat1078);
__m512 dat1079 = _mm512_maskz_loadu_ps(16383, datPtr5+2096+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1079 = _mm512_max_ps(_mm512_setzero_ps(), dat1079);
__m512 dat1080 = _mm512_maskz_loadu_ps(511, datPtr5+13504+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1080 = _mm512_max_ps(_mm512_setzero_ps(), dat1080);
__m512 in168 = _mm512_permutexvar_ps(pm81, dat1079);
__m512 in176 = _mm512_permutexvar_ps(pm82, dat1080);
__m512 dat1081 = _mm512_maskz_loadu_ps(16383, datPtr5+2320+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1081 = _mm512_max_ps(_mm512_setzero_ps(), dat1081);
__m512 dat1082 = _mm512_maskz_loadu_ps(511, datPtr5+13728+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1082 = _mm512_max_ps(_mm512_setzero_ps(), dat1082);
__m512 in169 = _mm512_permutexvar_ps(pm81, dat1081);
__m512 in177 = _mm512_permutexvar_ps(pm82, dat1082);
__m512 dat1083 = _mm512_maskz_loadu_ps(16383, datPtr5+2544+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1083 = _mm512_max_ps(_mm512_setzero_ps(), dat1083);
__m512 dat1084 = _mm512_maskz_loadu_ps(511, datPtr5+13952+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1084 = _mm512_max_ps(_mm512_setzero_ps(), dat1084);
__m512 in170 = _mm512_permutexvar_ps(pm81, dat1083);
__m512 in178 = _mm512_permutexvar_ps(pm82, dat1084);
__m512 dat1085 = _mm512_maskz_loadu_ps(16383, datPtr5+2768+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1085 = _mm512_max_ps(_mm512_setzero_ps(), dat1085);
__m512 dat1086 = _mm512_maskz_loadu_ps(511, datPtr5+14176+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1086 = _mm512_max_ps(_mm512_setzero_ps(), dat1086);
__m512 in171 = _mm512_permutexvar_ps(pm81, dat1085);
__m512 in179 = _mm512_permutexvar_ps(pm82, dat1086);
// First arithmetic pass. Statements come in pairs: the first of each pair works
// on in164..in171, the second applies the identical step to in172..in179.
// Writing d0..d7 for the values of in164..in171 on entry, the eight results
// (exact up to floating-point rounding of the fused multiply-adds) are:
//   in164  =  d0          - 5.25*d2          + 5.25*d4          - d6
//   tmp943 =        d1 +       d2 - 4.25*d3 - 4.25*d4 +     d5 + d6
//   tmp944 =       -d1 +       d2 + 4.25*d3 - 4.25*d4 -     d5 + d6
//   in170  =  0.5*d1 + 0.25*d2 -  2.5*d3 - 1.25*d4 +   2*d5 + d6
//   tmp942 = -0.5*d1 + 0.25*d2 +  2.5*d3 - 1.25*d4 -   2*d5 + d6
//   in166  =    2*d1 +    4*d2 -  2.5*d3 -    5*d4 + 0.5*d5 + d6
//   in168  =   -2*d1 +    4*d2 +  2.5*d3 -    5*d4 - 0.5*d5 + d6
//   in167  =       -d1            + 5.25*d3          - 5.25*d5      + d7
// The second set lands in in172, tmp947, tmp948, in178, tmp946, in174, in176,
// in175 in the same order.
// Input registers are reused as scratch (for example in165 becomes d7 - d1),
// so the statement order below must not be changed.
// NOTE(review): these coefficients match the 8-point input transform used by
// Winograd F(6,3) convolution - confirm against the generator.
__m512 tmp941 = _mm512_add_ps(in165, in169);
__m512 tmp945 = _mm512_add_ps(in173, in177);
__m512 tmp942 = _mm512_sub_ps(in168, in166);
__m512 tmp946 = _mm512_sub_ps(in176, in174);
__m512 tmp943 = _mm512_add_ps(in166, in170);
__m512 tmp947 = _mm512_add_ps(in174, in178);
in164 = _mm512_sub_ps(in164, in170);
in172 = _mm512_sub_ps(in172, in178);
tmp941 = _mm512_fmadd_ps(in167, _mm512_set1_ps(-4.25e+00f), tmp941);
tmp945 = _mm512_fmadd_ps(in175, _mm512_set1_ps(-4.25e+00f), tmp945);
tmp943 = _mm512_fmadd_ps(in168, _mm512_set1_ps(-4.25e+00f), tmp943);
tmp947 = _mm512_fmadd_ps(in176, _mm512_set1_ps(-4.25e+00f), tmp947);
in164 = _mm512_fmadd_ps(tmp942, _mm512_set1_ps(5.25e+00f), in164);
in172 = _mm512_fmadd_ps(tmp946, _mm512_set1_ps(5.25e+00f), in172);
tmp942 = _mm512_fmadd_ps(in166, _mm512_set1_ps(2.5e-01f), in170);
tmp946 = _mm512_fmadd_ps(in174, _mm512_set1_ps(2.5e-01f), in178);
in166 = _mm512_fmadd_ps(in166, _mm512_set1_ps(4e+00f), in170);
in174 = _mm512_fmadd_ps(in174, _mm512_set1_ps(4e+00f), in178);
// Difference and sum of the same two partial sums give tmp944 and tmp943.
__m512 tmp944 = _mm512_sub_ps(tmp943, tmp941);
__m512 tmp948 = _mm512_sub_ps(tmp947, tmp945);
tmp943 = _mm512_add_ps(tmp941, tmp943);
tmp947 = _mm512_add_ps(tmp945, tmp947);
tmp941 = _mm512_fmadd_ps(in165, _mm512_set1_ps(2.5e-01f), in169);
tmp945 = _mm512_fmadd_ps(in173, _mm512_set1_ps(2.5e-01f), in177);
tmp942 = _mm512_fmadd_ps(in168, _mm512_set1_ps(-1.25e+00f), tmp942);
tmp946 = _mm512_fmadd_ps(in176, _mm512_set1_ps(-1.25e+00f), tmp946);
in168 = _mm512_fmadd_ps(in168, _mm512_set1_ps(-5e+00f), in166);
in176 = _mm512_fmadd_ps(in176, _mm512_set1_ps(-5e+00f), in174);
tmp941 = _mm512_fmadd_ps(in167, _mm512_set1_ps(-1.25e+00f), tmp941);
tmp945 = _mm512_fmadd_ps(in175, _mm512_set1_ps(-1.25e+00f), tmp945);
// fmadd / fnmadd with the same operands yield the +/- pair in170 and tmp942.
in170 = _mm512_fmadd_ps(tmp941, _mm512_set1_ps(2e+00f), tmp942);
in178 = _mm512_fmadd_ps(tmp945, _mm512_set1_ps(2e+00f), tmp946);
tmp942 = _mm512_fnmadd_ps(tmp941, _mm512_set1_ps(2e+00f), tmp942);
tmp946 = _mm512_fnmadd_ps(tmp945, _mm512_set1_ps(2e+00f), tmp946);
tmp941 = _mm512_fmadd_ps(in169, _mm512_set1_ps(2.5e-01f), in165);
tmp945 = _mm512_fmadd_ps(in177, _mm512_set1_ps(2.5e-01f), in173);
in165 = _mm512_sub_ps(in171, in165);
in173 = _mm512_sub_ps(in179, in173);
tmp941 = _mm512_fmadd_ps(in167, _mm512_set1_ps(-1.25e+00f), tmp941);
tmp945 = _mm512_fmadd_ps(in175, _mm512_set1_ps(-1.25e+00f), tmp945);
in167 = _mm512_sub_ps(in167, in169);
in175 = _mm512_sub_ps(in175, in177);
in167 = _mm512_fmadd_ps(in167, _mm512_set1_ps(5.25e+00f), in165);
in175 = _mm512_fmadd_ps(in175, _mm512_set1_ps(5.25e+00f), in173);
// Same +/- pairing again, producing in166 and in168.
in166 = _mm512_fmadd_ps(tmp941, _mm512_set1_ps(2e+00f), in168);
in174 = _mm512_fmadd_ps(tmp945, _mm512_set1_ps(2e+00f), in176);
in168 = _mm512_fnmadd_ps(tmp941, _mm512_set1_ps(2e+00f), in168);
in176 = _mm512_fnmadd_ps(tmp945, _mm512_set1_ps(2e+00f), in176);
// Register transpose of sixteen vectors, treated as two sets of eight rows:
//   set A rows 0..7: in164, tmp943, tmp944, in170, tmp942, in166, in168, in167
//   set B rows 0..7: in172, tmp947, tmp948, in178, tmp946, in174, in176, in175
// On exit the same sixteen names hold columns instead of rows:
//   - the k-th name of the set A list holds element k of A rows 0..7 in lanes
//     0..7 and element k of B rows 0..7 in lanes 8..15;
//   - the k-th name of the set B list holds element 8+k of A rows 0..7 in lanes
//     0..7 and element 8+k of B rows 0..7 in lanes 8..15.
//
// Step 1: unpacklo/unpackhi interleave adjacent row pairs inside each 128-bit
// group (lo: elements 0,1 of the group; hi: elements 2,3).
__m512 tmp957 = _mm512_unpacklo_ps(in164, tmp943);
__m512 tmp958 = _mm512_unpackhi_ps(in164, tmp943);
__m512 tmp959 = _mm512_unpacklo_ps(tmp944, in170);
__m512 tmp960 = _mm512_unpackhi_ps(tmp944, in170);
__m512 tmp961 = _mm512_unpacklo_ps(tmp942, in166);
__m512 tmp962 = _mm512_unpackhi_ps(tmp942, in166);
__m512 tmp963 = _mm512_unpacklo_ps(in168, in167);
__m512 tmp964 = _mm512_unpackhi_ps(in168, in167);
__m512 tmp965 = _mm512_unpacklo_ps(in172, tmp947);
__m512 tmp966 = _mm512_unpackhi_ps(in172, tmp947);
__m512 tmp967 = _mm512_unpacklo_ps(tmp948, in178);
__m512 tmp968 = _mm512_unpackhi_ps(tmp948, in178);
__m512 tmp969 = _mm512_unpacklo_ps(tmp946, in174);
__m512 tmp970 = _mm512_unpackhi_ps(tmp946, in174);
__m512 tmp971 = _mm512_unpacklo_ps(in176, in175);
__m512 tmp972 = _mm512_unpackhi_ps(in176, in175);
// Step 2: shuffle_ps with 68 (0x44) joins the low 64 bits of both operands'
// 128-bit groups, 238 (0xEE) joins the high 64 bits. Each 128-bit group now
// holds one element position taken from four consecutive rows.
__m512 tmp973 = _mm512_shuffle_ps(tmp957, tmp959, 68);
__m512 tmp974 = _mm512_shuffle_ps(tmp957, tmp959, 238);
__m512 tmp975 = _mm512_shuffle_ps(tmp958, tmp960, 68);
__m512 tmp976 = _mm512_shuffle_ps(tmp958, tmp960, 238);
__m512 tmp977 = _mm512_shuffle_ps(tmp961, tmp963, 68);
__m512 tmp978 = _mm512_shuffle_ps(tmp961, tmp963, 238);
__m512 tmp979 = _mm512_shuffle_ps(tmp962, tmp964, 68);
__m512 tmp980 = _mm512_shuffle_ps(tmp962, tmp964, 238);
__m512 tmp981 = _mm512_shuffle_ps(tmp965, tmp967, 68);
__m512 tmp982 = _mm512_shuffle_ps(tmp965, tmp967, 238);
__m512 tmp983 = _mm512_shuffle_ps(tmp966, tmp968, 68);
__m512 tmp984 = _mm512_shuffle_ps(tmp966, tmp968, 238);
__m512 tmp985 = _mm512_shuffle_ps(tmp969, tmp971, 68);
__m512 tmp986 = _mm512_shuffle_ps(tmp969, tmp971, 238);
__m512 tmp987 = _mm512_shuffle_ps(tmp970, tmp972, 68);
__m512 tmp988 = _mm512_shuffle_ps(tmp970, tmp972, 238);
// Step 3: shuffle_f32x4 with 136 (0x88) gathers 128-bit groups 0 and 2 of each
// operand, 221 (0xDD) gathers groups 1 and 3. This pairs rows 0..3 with rows
// 4..7 of the same set.
__m512 tmp989 = _mm512_shuffle_f32x4(tmp973, tmp977, 136);
__m512 tmp990 = _mm512_shuffle_f32x4(tmp973, tmp977, 221);
__m512 tmp991 = _mm512_shuffle_f32x4(tmp974, tmp978, 136);
__m512 tmp992 = _mm512_shuffle_f32x4(tmp974, tmp978, 221);
__m512 tmp993 = _mm512_shuffle_f32x4(tmp975, tmp979, 136);
__m512 tmp994 = _mm512_shuffle_f32x4(tmp975, tmp979, 221);
__m512 tmp995 = _mm512_shuffle_f32x4(tmp976, tmp980, 136);
__m512 tmp996 = _mm512_shuffle_f32x4(tmp976, tmp980, 221);
__m512 tmp997 = _mm512_shuffle_f32x4(tmp981, tmp985, 136);
__m512 tmp998 = _mm512_shuffle_f32x4(tmp981, tmp985, 221);
__m512 tmp999 = _mm512_shuffle_f32x4(tmp982, tmp986, 136);
__m512 tmp1000 = _mm512_shuffle_f32x4(tmp982, tmp986, 221);
__m512 tmp1001 = _mm512_shuffle_f32x4(tmp983, tmp987, 136);
__m512 tmp1002 = _mm512_shuffle_f32x4(tmp983, tmp987, 221);
__m512 tmp1003 = _mm512_shuffle_f32x4(tmp984, tmp988, 136);
__m512 tmp1004 = _mm512_shuffle_f32x4(tmp984, tmp988, 221);
// Step 4: the same 136 / 221 selection combines the set A half (first operand)
// with the set B half (second operand). The results overwrite the original
// row registers.
in164 = _mm512_shuffle_f32x4(tmp989, tmp997, 136);
in172 = _mm512_shuffle_f32x4(tmp989, tmp997, 221);
tmp943 = _mm512_shuffle_f32x4(tmp991, tmp999, 136);
tmp947 = _mm512_shuffle_f32x4(tmp991, tmp999, 221);
tmp944 = _mm512_shuffle_f32x4(tmp993, tmp1001, 136);
tmp948 = _mm512_shuffle_f32x4(tmp993, tmp1001, 221);
in170 = _mm512_shuffle_f32x4(tmp995, tmp1003, 136);
in178 = _mm512_shuffle_f32x4(tmp995, tmp1003, 221);
tmp942 = _mm512_shuffle_f32x4(tmp990, tmp998, 136);
tmp946 = _mm512_shuffle_f32x4(tmp990, tmp998, 221);
in166 = _mm512_shuffle_f32x4(tmp992, tmp1000, 136);
in174 = _mm512_shuffle_f32x4(tmp992, tmp1000, 221);
in168 = _mm512_shuffle_f32x4(tmp994, tmp1002, 136);
in176 = _mm512_shuffle_f32x4(tmp994, tmp1002, 221);
in167 = _mm512_shuffle_f32x4(tmp996, tmp1004, 136);
in175 = _mm512_shuffle_f32x4(tmp996, tmp1004, 221);
// Second arithmetic pass. Statements come in pairs: the first of each pair works
// on in164, tmp943, tmp944, in170, tmp942, in166, in168, in167; the second
// applies the identical step to in172, tmp947, tmp948, in178, tmp946, in174,
// in176, in175.
// Writing d0..d7 for the first set in the order just listed, the eight results
// (exact up to floating-point rounding of the fused multiply-adds) are:
//   in164  =  d0          - 5.25*d2          + 5.25*d4          - d6
//   tmp951 =        d1 +       d2 - 4.25*d3 - 4.25*d4 +     d5 + d6
//   tmp952 =       -d1 +       d2 + 4.25*d3 - 4.25*d4 -     d5 + d6
//   in168  =  0.5*d1 + 0.25*d2 -  2.5*d3 - 1.25*d4 +   2*d5 + d6
//   tmp950 = -0.5*d1 + 0.25*d2 +  2.5*d3 - 1.25*d4 -   2*d5 + d6
//   tmp944 =    2*d1 +    4*d2 -  2.5*d3 -    5*d4 + 0.5*d5 + d6
//   tmp942 =   -2*d1 +    4*d2 +  2.5*d3 -    5*d4 - 0.5*d5 + d6
//   in170  =       -d1            + 5.25*d3          - 5.25*d5      + d7
// The second set lands in in172, tmp955, tmp956, in176, tmp954, tmp948, tmp946,
// in178 in the same order.
// Input registers are reused as scratch (for example tmp943 becomes d7 - d1),
// so the statement order below must not be changed.
__m512 tmp949 = _mm512_add_ps(tmp943, in166);
__m512 tmp953 = _mm512_add_ps(tmp947, in174);
__m512 tmp950 = _mm512_sub_ps(tmp942, tmp944);
__m512 tmp954 = _mm512_sub_ps(tmp946, tmp948);
__m512 tmp951 = _mm512_add_ps(tmp944, in168);
__m512 tmp955 = _mm512_add_ps(tmp948, in176);
in164 = _mm512_sub_ps(in164, in168);
in172 = _mm512_sub_ps(in172, in176);
tmp949 = _mm512_fmadd_ps(in170, _mm512_set1_ps(-4.25e+00f), tmp949);
tmp953 = _mm512_fmadd_ps(in178, _mm512_set1_ps(-4.25e+00f), tmp953);
tmp951 = _mm512_fmadd_ps(tmp942, _mm512_set1_ps(-4.25e+00f), tmp951);
tmp955 = _mm512_fmadd_ps(tmp946, _mm512_set1_ps(-4.25e+00f), tmp955);
in164 = _mm512_fmadd_ps(tmp950, _mm512_set1_ps(5.25e+00f), in164);
in172 = _mm512_fmadd_ps(tmp954, _mm512_set1_ps(5.25e+00f), in172);
tmp950 = _mm512_fmadd_ps(tmp944, _mm512_set1_ps(2.5e-01f), in168);
tmp954 = _mm512_fmadd_ps(tmp948, _mm512_set1_ps(2.5e-01f), in176);
tmp944 = _mm512_fmadd_ps(tmp944, _mm512_set1_ps(4e+00f), in168);
tmp948 = _mm512_fmadd_ps(tmp948, _mm512_set1_ps(4e+00f), in176);
// Difference and sum of the same two partial sums give tmp952 and tmp951.
__m512 tmp952 = _mm512_sub_ps(tmp951, tmp949);
__m512 tmp956 = _mm512_sub_ps(tmp955, tmp953);
tmp951 = _mm512_add_ps(tmp949, tmp951);
tmp955 = _mm512_add_ps(tmp953, tmp955);
tmp949 = _mm512_fmadd_ps(tmp943, _mm512_set1_ps(2.5e-01f), in166);
tmp953 = _mm512_fmadd_ps(tmp947, _mm512_set1_ps(2.5e-01f), in174);
tmp950 = _mm512_fmadd_ps(tmp942, _mm512_set1_ps(-1.25e+00f), tmp950);
tmp954 = _mm512_fmadd_ps(tmp946, _mm512_set1_ps(-1.25e+00f), tmp954);
tmp942 = _mm512_fmadd_ps(tmp942, _mm512_set1_ps(-5e+00f), tmp944);
tmp946 = _mm512_fmadd_ps(tmp946, _mm512_set1_ps(-5e+00f), tmp948);
tmp949 = _mm512_fmadd_ps(in170, _mm512_set1_ps(-1.25e+00f), tmp949);
tmp953 = _mm512_fmadd_ps(in178, _mm512_set1_ps(-1.25e+00f), tmp953);
// fmadd / fnmadd with the same operands yield the +/- pair in168 and tmp950.
in168 = _mm512_fmadd_ps(tmp949, _mm512_set1_ps(2e+00f), tmp950);
in176 = _mm512_fmadd_ps(tmp953, _mm512_set1_ps(2e+00f), tmp954);
tmp950 = _mm512_fnmadd_ps(tmp949, _mm512_set1_ps(2e+00f), tmp950);
tmp954 = _mm512_fnmadd_ps(tmp953, _mm512_set1_ps(2e+00f), tmp954);
tmp949 = _mm512_fmadd_ps(in166, _mm512_set1_ps(2.5e-01f), tmp943);
tmp953 = _mm512_fmadd_ps(in174, _mm512_set1_ps(2.5e-01f), tmp947);
tmp943 = _mm512_sub_ps(in167, tmp943);
tmp947 = _mm512_sub_ps(in175, tmp947);
tmp949 = _mm512_fmadd_ps(in170, _mm512_set1_ps(-1.25e+00f), tmp949);
tmp953 = _mm512_fmadd_ps(in178, _mm512_set1_ps(-1.25e+00f), tmp953);
in170 = _mm512_sub_ps(in170, in166);
in178 = _mm512_sub_ps(in178, in174);
in170 = _mm512_fmadd_ps(in170, _mm512_set1_ps(5.25e+00f), tmp943);
in178 = _mm512_fmadd_ps(in178, _mm512_set1_ps(5.25e+00f), tmp947);
// Same +/- pairing again, producing tmp944 and tmp942.
tmp944 = _mm512_fmadd_ps(tmp949, _mm512_set1_ps(2e+00f), tmp942);
tmp948 = _mm512_fmadd_ps(tmp953, _mm512_set1_ps(2e+00f), tmp946);
tmp942 = _mm512_fnmadd_ps(tmp949, _mm512_set1_ps(2e+00f), tmp942);
tmp946 = _mm512_fnmadd_ps(tmp953, _mm512_set1_ps(2e+00f), tmp946);
// Output stage: pairs up the sixteen result vectors and writes them to dfPtr4.
// shuffle_f32x4 with 68 (0x44) yields [a lanes 0..7 | b lanes 0..7];
// with 238 (0xEE) it yields [a lanes 8..15 | b lanes 8..15].
// Each pair of vectors therefore becomes one "low halves" vector (out183..out190)
// and one "high halves" vector (out191..out198).
__m512 out183 = _mm512_shuffle_f32x4(in164, tmp951, 68);
__m512 out191 = _mm512_shuffle_f32x4(in164, tmp951, 238);
__m512 out184 = _mm512_shuffle_f32x4(tmp952, in168, 68);
__m512 out192 = _mm512_shuffle_f32x4(tmp952, in168, 238);
__m512 out185 = _mm512_shuffle_f32x4(tmp950, tmp944, 68);
__m512 out193 = _mm512_shuffle_f32x4(tmp950, tmp944, 238);
__m512 out186 = _mm512_shuffle_f32x4(tmp942, in170, 68);
__m512 out194 = _mm512_shuffle_f32x4(tmp942, in170, 238);
__m512 out187 = _mm512_shuffle_f32x4(in172, tmp955, 68);
__m512 out195 = _mm512_shuffle_f32x4(in172, tmp955, 238);
__m512 out188 = _mm512_shuffle_f32x4(tmp956, in176, 68);
__m512 out196 = _mm512_shuffle_f32x4(tmp956, in176, 238);
__m512 out189 = _mm512_shuffle_f32x4(tmp954, tmp948, 68);
__m512 out197 = _mm512_shuffle_f32x4(tmp954, tmp948, 238);
__m512 out190 = _mm512_shuffle_f32x4(tmp946, in178, 68);
__m512 out198 = _mm512_shuffle_f32x4(tmp946, in178, 238);
// Sixteen unaligned stores into four regions of dfPtr4 spaced 409600 apart
// (constant offsets 256, 409856, 819456, 1229056). Within each region four
// vectors are written at relative offsets 0, 128, 64, 192, in that statement
// order.
// NOTE(review): dfPtr4's type is declared outside this view; a spacing of 64
// would be exactly one 512-bit vector if the pointer is byte-typed - confirm.
_mm512_storeu_ps(dfPtr4+256+1638400*i16+24576*j11+24576*s13+768*k55, out183);
_mm512_storeu_ps(dfPtr4+384+1638400*i16+24576*j11+24576*s13+768*k55, out191);
_mm512_storeu_ps(dfPtr4+320+1638400*i16+24576*j11+24576*s13+768*k55, out187);
_mm512_storeu_ps(dfPtr4+448+1638400*i16+24576*j11+24576*s13+768*k55, out195);
_mm512_storeu_ps(dfPtr4+409856+1638400*i16+24576*j11+24576*s13+768*k55, out184);
_mm512_storeu_ps(dfPtr4+409984+1638400*i16+24576*j11+24576*s13+768*k55, out192);
_mm512_storeu_ps(dfPtr4+409920+1638400*i16+24576*j11+24576*s13+768*k55, out188);
_mm512_storeu_ps(dfPtr4+410048+1638400*i16+24576*j11+24576*s13+768*k55, out196);
_mm512_storeu_ps(dfPtr4+819456+1638400*i16+24576*j11+24576*s13+768*k55, out185);
_mm512_storeu_ps(dfPtr4+819584+1638400*i16+24576*j11+24576*s13+768*k55, out193);
_mm512_storeu_ps(dfPtr4+819520+1638400*i16+24576*j11+24576*s13+768*k55, out189);
_mm512_storeu_ps(dfPtr4+819648+1638400*i16+24576*j11+24576*s13+768*k55, out197);
_mm512_storeu_ps(dfPtr4+1229056+1638400*i16+24576*j11+24576*s13+768*k55, out186);
_mm512_storeu_ps(dfPtr4+1229184+1638400*i16+24576*j11+24576*s13+768*k55, out194);
_mm512_storeu_ps(dfPtr4+1229120+1638400*i16+24576*j11+24576*s13+768*k55, out190);
_mm512_storeu_ps(dfPtr4+1229248+1638400*i16+24576*j11+24576*s13+768*k55, out198);
__m512 dat1087 = _mm512_maskz_loadu_ps(8191, datPtr5+13764+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1087 = _mm512_max_ps(_mm512_setzero_ps(), dat1087);
__m512 dat1088 = _mm512_maskz_loadu_ps(16383, datPtr5+13808+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1088 = _mm512_max_ps(_mm512_setzero_ps(), dat1088);
__m512i pm83 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in180 = _mm512_permutexvar_ps(pm83, dat1087);
__m512i pm84 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in188 = _mm512_permutexvar_ps(pm84, dat1088);
__m512 dat1089 = _mm512_maskz_loadu_ps(8191, datPtr5+13988+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1089 = _mm512_max_ps(_mm512_setzero_ps(), dat1089);
__m512 dat1090 = _mm512_maskz_loadu_ps(16383, datPtr5+14032+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1090 = _mm512_max_ps(_mm512_setzero_ps(), dat1090);
__m512 in181 = _mm512_permutexvar_ps(pm83, dat1089);
__m512 in189 = _mm512_permutexvar_ps(pm84, dat1090);
__m512 dat1091 = _mm512_maskz_loadu_ps(8191, datPtr5+14212+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1091 = _mm512_max_ps(_mm512_setzero_ps(), dat1091);
__m512 dat1092 = _mm512_maskz_loadu_ps(16383, datPtr5+14256+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1092 = _mm512_max_ps(_mm512_setzero_ps(), dat1092);
__m512 in182 = _mm512_permutexvar_ps(pm83, dat1091);
__m512 in190 = _mm512_permutexvar_ps(pm84, dat1092);
__m512 dat1093 = _mm512_maskz_loadu_ps(8191, datPtr5+14436+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1093 = _mm512_max_ps(_mm512_setzero_ps(), dat1093);
__m512 dat1094 = _mm512_maskz_loadu_ps(16383, datPtr5+14480+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1094 = _mm512_max_ps(_mm512_setzero_ps(), dat1094);
__m512 in183 = _mm512_permutexvar_ps(pm83, dat1093);
__m512 in191 = _mm512_permutexvar_ps(pm84, dat1094);
__m512 dat1095 = _mm512_maskz_loadu_ps(8191, datPtr5+14660+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1095 = _mm512_max_ps(_mm512_setzero_ps(), dat1095);
__m512 dat1096 = _mm512_maskz_loadu_ps(16383, datPtr5+14704+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1096 = _mm512_max_ps(_mm512_setzero_ps(), dat1096);
__m512 in184 = _mm512_permutexvar_ps(pm83, dat1095);
__m512 in192 = _mm512_permutexvar_ps(pm84, dat1096);
__m512 dat1097 = _mm512_maskz_loadu_ps(8191, datPtr5+14884+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1097 = _mm512_max_ps(_mm512_setzero_ps(), dat1097);
__m512 dat1098 = _mm512_maskz_loadu_ps(16383, datPtr5+14928+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1098 = _mm512_max_ps(_mm512_setzero_ps(), dat1098);
__m512 in185 = _mm512_permutexvar_ps(pm83, dat1097);
__m512 in193 = _mm512_permutexvar_ps(pm84, dat1098);
__m512 dat1099 = _mm512_maskz_loadu_ps(8191, datPtr5+15108+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1099 = _mm512_max_ps(_mm512_setzero_ps(), dat1099);
__m512 dat1100 = _mm512_maskz_loadu_ps(16383, datPtr5+15152+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1100 = _mm512_max_ps(_mm512_setzero_ps(), dat1100);
__m512 in186 = _mm512_permutexvar_ps(pm83, dat1099);
__m512 in194 = _mm512_permutexvar_ps(pm84, dat1100);
__m512 dat1101 = _mm512_maskz_loadu_ps(8191, datPtr5+15332+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1101 = _mm512_max_ps(_mm512_setzero_ps(), dat1101);
__m512 dat1102 = _mm512_maskz_loadu_ps(16383, datPtr5+15376+806912*i16+224*h23+4*w26+806912*s13+25216*k55);
dat1102 = _mm512_max_ps(_mm512_setzero_ps(), dat1102);
__m512 in187 = _mm512_permutexvar_ps(pm83, dat1101);
__m512 in195 = _mm512_permutexvar_ps(pm84, dat1102);
__m512 tmp1005 = _mm512_add_ps(in181, in185);
__m512 tmp1009 = _mm512_add_ps(in189, in193);
__m512 tmp1006 = _mm512_sub_ps(in184, in182);
__m512 tmp1010 = _mm512_sub_ps(in192, in190);
__m512 tmp1007 = _mm512_add_ps(in182, in186);
__m512 tmp1011 = _mm512_add_ps(in190, in194);
in180 = _mm512_sub_ps(in180, in186);
in188 = _mm512_sub_ps(in188, in194);
tmp1005 = _mm512_fmadd_ps(in183, _mm512_set1_ps(-4.25e+00f), tmp1005);
tmp1009 = _mm512_fmadd_ps(in191, _mm512_set1_ps(-4.25e+00f), tmp1009);
tmp1007 = _mm512_fmadd_ps(in184, _mm512_set1_ps(-4.25e+00f), tmp1007);
tmp1011 = _mm512_fmadd_ps(in192, _mm512_set1_ps(-4.25e+00f), tmp1011);
in180 = _mm512_fmadd_ps(tmp1006, _mm512_set1_ps(5.25e+00f), in180);
in188 = _mm512_fmadd_ps(tmp1010, _mm512_set1_ps(5.25e+00f), in188);
tmp1006 = _mm512_fmadd_ps(in182, _mm512_set1_ps(2.5e-01f), in186);
tmp1010 = _mm512_fmadd_ps(in190, _mm512_set1_ps(2.5e-01f), in194);
in182 = _mm512_fmadd_ps(in182, _mm512_set1_ps(4e+00f), in186);
in190 = _mm512_fmadd_ps(in190, _mm512_set1_ps(4e+00f), in194);
__m512 tmp1008 = _mm512_sub_ps(tmp1007, tmp1005);
__m512 tmp1012 = _mm512_sub_ps(tmp1011, tmp1009);
tmp1007 = _mm512_add_ps(tmp1005, tmp1007);
tmp1011 = _mm512_add_ps(tmp1009, tmp1011);
tmp1005 = _mm512_fmadd_ps(in181, _mm512_set1_ps(2.5e-01f), in185);
tmp1009 = _mm512_fmadd_ps(in189, _mm512_set1_ps(2.5e-01f), in193);
tmp1006 = _mm512_fmadd_ps(in184, _mm512_set1_ps(-1.25e+00f), tmp1006);
tmp1010 = _mm512_fmadd_ps(in192, _mm512_set1_ps(-1.25e+00f), tmp1010);
in184 = _mm512_fmadd_ps(in184, _mm512_set1_ps(-5e+00f), in182);
in192 = _mm512_fmadd_ps(in192, _mm512_set1_ps(-5e+00f), in190);
tmp1005 = _mm512_fmadd_ps(in183, _mm512_set1_ps(-1.25e+00f), tmp1005);
tmp1009 = _mm512_fmadd_ps(in191, _mm512_set1_ps(-1.25e+00f), tmp1009);
in186 = _mm512_fmadd_ps(tmp1005, _mm512_set1_ps(2e+00f), tmp1006);
in194 = _mm512_fmadd_ps(tmp1009, _mm512_set1_ps(2e+00f), tmp1010);
tmp1006 = _mm512_fnmadd_ps(tmp1005, _mm512_set1_ps(2e+00f), tmp1006);
tmp1010 = _mm512_fnmadd_ps(tmp1009, _mm512_set1_ps(2e+00f), tmp1010);
tmp1005 = _mm512_fmadd_ps(in185, _mm512_set1_ps(2.5e-01f), in181);
tmp1009 = _mm512_fmadd_ps(in193, _mm512_set1_ps(2.5e-01f), in189);
in181 = _mm512_sub_ps(in187, in181);
in189 = _mm512_sub_ps(in195, in189);
tmp1005 = _mm512_fmadd_ps(in183, _mm512_set1_ps(-1.25e+00f), tmp1005);
tmp1009 = _mm512_fmadd_ps(in191, _mm512_set1_ps(-1.25e+00f), tmp1009);
in183 = _mm512_sub_ps(in183, in185);
in191 = _mm512_sub_ps(in191, in193);
in183 = _mm512_fmadd_ps(in183, _mm512_set1_ps(5.25e+00f), in181);
in191 = _mm512_fmadd_ps(in191, _mm512_set1_ps(5.25e+00f), in189);
in182 = _mm512_fmadd_ps(tmp1005, _mm512_set1_ps(2e+00f), in184);
in190 = _mm512_fmadd_ps(tmp1009, _mm512_set1_ps(2e+00f), in192);
in184 = _mm512_fnmadd_ps(tmp1005, _mm512_set1_ps(2e+00f), in184);
in192 = _mm512_fnmadd_ps(tmp1009, _mm512_set1_ps(2e+00f), in192);
__m512 tmp1021 = _mm512_unpacklo_ps(in180, tmp1007);
__m512 tmp1022 = _mm512_unpackhi_ps(in180, tmp1007);
__m512 tmp1023 = _mm512_unpacklo_ps(tmp1008, in186);
__m512 tmp1024 = _mm512_unpackhi_ps(tmp1008, in186);
__m512 tmp1025 = _mm512_unpacklo_ps(tmp1006, in182);
__m512 tmp1026 = _mm512_unpackhi_ps(tmp1006, in182);
__m512 tmp1027 = _mm512_unpacklo_ps(in184, in183);
__m512 tmp1028 = _mm512_unpackhi_ps(in184, in183);
__m512 tmp1029 = _mm512_unpacklo_ps(in188, tmp1011);
__m512 tmp1030 = _mm512_unpackhi_ps(in188, tmp1011);
__m512 tmp1031 = _mm512_unpacklo_ps(tmp1012, in194);
__m512 tmp1032 = _mm512_unpackhi_ps(tmp1012, in194);
__m512 tmp1033 = _mm512_unpacklo_ps(tmp1010, in190);
__m512 tmp1034 = _mm512_unpackhi_ps(tmp1010, in190);
__m512 tmp1035 = _mm512_unpacklo_ps(in192, in191);
__m512 tmp1036 = _mm512_unpackhi_ps(in192, in191);
__m512 tmp1037 = _mm512_shuffle_ps(tmp1021, tmp1023, 68);
__m512 tmp1038 = _mm512_shuffle_ps(tmp1021, tmp1023, 238);
__m512 tmp1039 = _mm512_shuffle_ps(tmp1022, tmp1024, 68);
__m512 tmp1040 = _mm512_shuffle_ps(tmp1022, tmp1024, 238);
__m512 tmp1041 = _mm512_shuffle_ps(tmp1025, tmp1027, 68);
__m512 tmp1042 = _mm512_shuffle_ps(tmp1025, tmp1027, 238);
__m512 tmp1043 = _mm512_shuffle_ps(tmp1026, tmp1028, 68);
__m512 tmp1044 = _mm512_shuffle_ps(tmp1026, tmp1028, 238);
__m512 tmp1045 = _mm512_shuffle_ps(tmp1029, tmp1031, 68);
__m512 tmp1046 = _mm512_shuffle_ps(tmp1029, tmp1031, 238);
__m512 tmp1047 = _mm512_shuffle_ps(tmp1030, tmp1032, 68);
__m512 tmp1048 = _mm512_shuffle_ps(tmp1030, tmp1032, 238);
__m512 tmp1049 = _mm512_shuffle_ps(tmp1033, tmp1035, 68);
__m512 tmp1050 = _mm512_shuffle_ps(tmp1033, tmp1035, 238);
__m512 tmp1051 = _mm512_shuffle_ps(tmp1034, tmp1036, 68);
__m512 tmp1052 = _mm512_shuffle_ps(tmp1034, tmp1036, 238);
__m512 tmp1053 = _mm512_shuffle_f32x4(tmp1037, tmp1041, 136);
__m512 tmp1054 = _mm512_shuffle_f32x4(tmp1037, tmp1041, 221);
__m512 tmp1055 = _mm512_shuffle_f32x4(tmp1038, tmp1042, 136);
__m512 tmp1056 = _mm512_shuffle_f32x4(tmp1038, tmp1042, 221);
__m512 tmp1057 = _mm512_shuffle_f32x4(tmp1039, tmp1043, 136);
__m512 tmp1058 = _mm512_shuffle_f32x4(tmp1039, tmp1043, 221);
__m512 tmp1059 = _mm512_shuffle_f32x4(tmp1040, tmp1044, 136);
__m512 tmp1060 = _mm512_shuffle_f32x4(tmp1040, tmp1044, 221);
__m512 tmp1061 = _mm512_shuffle_f32x4(tmp1045, tmp1049, 136);
__m512 tmp1062 = _mm512_shuffle_f32x4(tmp1045, tmp1049, 221);
__m512 tmp1063 = _mm512_shuffle_f32x4(tmp1046, tmp1050, 136);
__m512 tmp1064 = _mm512_shuffle_f32x4(tmp1046, tmp1050, 221);
__m512 tmp1065 = _mm512_shuffle_f32x4(tmp1047, tmp1051, 136);
__m512 tmp1066 = _mm512_shuffle_f32x4(tmp1047, tmp1051, 221);
__m512 tmp1067 = _mm512_shuffle_f32x4(tmp1048, tmp1052, 136);
__m512 tmp1068 = _mm512_shuffle_f32x4(tmp1048, tmp1052, 221);
in180 = _mm512_shuffle_f32x4(tmp1053, tmp1061, 136);
in188 = _mm512_shuffle_f32x4(tmp1053, tmp1061, 221);
tmp1007 = _mm512_shuffle_f32x4(tmp1055, tmp1063, 136);
tmp1011 = _mm512_shuffle_f32x4(tmp1055, tmp1063, 221);
tmp1008 = _mm512_shuffle_f32x4(tmp1057, tmp1065, 136);
tmp1012 = _mm512_shuffle_f32x4(tmp1057, tmp1065, 221);
in186 = _mm512_shuffle_f32x4(tmp1059, tmp1067, 136);
in194 = _mm512_shuffle_f32x4(tmp1059, tmp1067, 221);
tmp1006 = _mm512_shuffle_f32x4(tmp1054, tmp1062, 136);
tmp1010 = _mm512_shuffle_f32x4(tmp1054, tmp1062, 221);
in182 = _mm512_shuffle_f32x4(tmp1056, tmp1064, 136);
in190 = _mm512_shuffle_f32x4(tmp1056, tmp1064, 221);
in184 = _mm512_shuffle_f32x4(tmp1058, tmp1066, 136);
in192 = _mm512_shuffle_f32x4(tmp1058, tmp1066, 221);
in183 = _mm512_shuffle_f32x4(tmp1060, tmp1068, 136);
in191 = _mm512_shuffle_f32x4(tmp1060, tmp1068, 221);
// Second 1-D pass, applied to the registers produced by the transpose just
// above. Two register groups are processed in lockstep (statements alternate
// between them). With
//   e0..e7 = in180, tmp1007, tmp1008, in186, tmp1006, in182, in184, in183
// (second group: in188, tmp1011, tmp1012, in194, tmp1010, in190, in192, in191)
// the values left behind are:
//   in180   = (e0 - e6) + 5.25*(e4 - e2)
//   tmp1015 = (e2 + e6 - 4.25*e4) + (e1 + e5 - 4.25*e3)
//   tmp1016 = (e2 + e6 - 4.25*e4) - (e1 + e5 - 4.25*e3)
//   in184   = (0.25*e2 + e6 - 1.25*e4) + 2*(0.25*e1 + e5 - 1.25*e3)
//   tmp1014 = (0.25*e2 + e6 - 1.25*e4) - 2*(0.25*e1 + e5 - 1.25*e3)
//   tmp1008 = (4*e2 + e6 - 5*e4) + 2*(e1 + 0.25*e5 - 1.25*e3)
//   tmp1006 = (4*e2 + e6 - 5*e4) - 2*(e1 + 0.25*e5 - 1.25*e3)
//   in186   = (e7 - e1) + 5.25*(e3 - e5)
// NOTE(review): these coefficients look like the 8-point Winograd F(6,3)
// input transform - confirm against the generator.
__m512 tmp1013 = _mm512_add_ps(tmp1007, in182);
__m512 tmp1017 = _mm512_add_ps(tmp1011, in190);
__m512 tmp1014 = _mm512_sub_ps(tmp1006, tmp1008);
__m512 tmp1018 = _mm512_sub_ps(tmp1010, tmp1012);
__m512 tmp1015 = _mm512_add_ps(tmp1008, in184);
__m512 tmp1019 = _mm512_add_ps(tmp1012, in192);
in180 = _mm512_sub_ps(in180, in184);
in188 = _mm512_sub_ps(in188, in192);
tmp1013 = _mm512_fmadd_ps(in186, _mm512_set1_ps(-4.25e+00f), tmp1013);
tmp1017 = _mm512_fmadd_ps(in194, _mm512_set1_ps(-4.25e+00f), tmp1017);
tmp1015 = _mm512_fmadd_ps(tmp1006, _mm512_set1_ps(-4.25e+00f), tmp1015);
tmp1019 = _mm512_fmadd_ps(tmp1010, _mm512_set1_ps(-4.25e+00f), tmp1019);
in180 = _mm512_fmadd_ps(tmp1014, _mm512_set1_ps(5.25e+00f), in180);
in188 = _mm512_fmadd_ps(tmp1018, _mm512_set1_ps(5.25e+00f), in188);
tmp1014 = _mm512_fmadd_ps(tmp1008, _mm512_set1_ps(2.5e-01f), in184);
tmp1018 = _mm512_fmadd_ps(tmp1012, _mm512_set1_ps(2.5e-01f), in192);
tmp1008 = _mm512_fmadd_ps(tmp1008, _mm512_set1_ps(4e+00f), in184);
tmp1012 = _mm512_fmadd_ps(tmp1012, _mm512_set1_ps(4e+00f), in192);
// Sum and difference of the two -4.25 accumulators.
__m512 tmp1016 = _mm512_sub_ps(tmp1015, tmp1013);
__m512 tmp1020 = _mm512_sub_ps(tmp1019, tmp1017);
tmp1015 = _mm512_add_ps(tmp1013, tmp1015);
tmp1019 = _mm512_add_ps(tmp1017, tmp1019);
// tmp1013/tmp1017 are reused from here on as scratch for the odd-index terms.
tmp1013 = _mm512_fmadd_ps(tmp1007, _mm512_set1_ps(2.5e-01f), in182);
tmp1017 = _mm512_fmadd_ps(tmp1011, _mm512_set1_ps(2.5e-01f), in190);
tmp1014 = _mm512_fmadd_ps(tmp1006, _mm512_set1_ps(-1.25e+00f), tmp1014);
tmp1018 = _mm512_fmadd_ps(tmp1010, _mm512_set1_ps(-1.25e+00f), tmp1018);
tmp1006 = _mm512_fmadd_ps(tmp1006, _mm512_set1_ps(-5e+00f), tmp1008);
tmp1010 = _mm512_fmadd_ps(tmp1010, _mm512_set1_ps(-5e+00f), tmp1012);
tmp1013 = _mm512_fmadd_ps(in186, _mm512_set1_ps(-1.25e+00f), tmp1013);
tmp1017 = _mm512_fmadd_ps(in194, _mm512_set1_ps(-1.25e+00f), tmp1017);
// fmadd / fnmadd with the same operands give the "+2*x" and "-2*x" variants.
in184 = _mm512_fmadd_ps(tmp1013, _mm512_set1_ps(2e+00f), tmp1014);
in192 = _mm512_fmadd_ps(tmp1017, _mm512_set1_ps(2e+00f), tmp1018);
tmp1014 = _mm512_fnmadd_ps(tmp1013, _mm512_set1_ps(2e+00f), tmp1014);
tmp1018 = _mm512_fnmadd_ps(tmp1017, _mm512_set1_ps(2e+00f), tmp1018);
tmp1013 = _mm512_fmadd_ps(in182, _mm512_set1_ps(2.5e-01f), tmp1007);
tmp1017 = _mm512_fmadd_ps(in190, _mm512_set1_ps(2.5e-01f), tmp1011);
tmp1007 = _mm512_sub_ps(in183, tmp1007);
tmp1011 = _mm512_sub_ps(in191, tmp1011);
tmp1013 = _mm512_fmadd_ps(in186, _mm512_set1_ps(-1.25e+00f), tmp1013);
tmp1017 = _mm512_fmadd_ps(in194, _mm512_set1_ps(-1.25e+00f), tmp1017);
in186 = _mm512_sub_ps(in186, in182);
in194 = _mm512_sub_ps(in194, in190);
in186 = _mm512_fmadd_ps(in186, _mm512_set1_ps(5.25e+00f), tmp1007);
in194 = _mm512_fmadd_ps(in194, _mm512_set1_ps(5.25e+00f), tmp1011);
tmp1008 = _mm512_fmadd_ps(tmp1013, _mm512_set1_ps(2e+00f), tmp1006);
tmp1012 = _mm512_fmadd_ps(tmp1017, _mm512_set1_ps(2e+00f), tmp1010);
tmp1006 = _mm512_fnmadd_ps(tmp1013, _mm512_set1_ps(2e+00f), tmp1006);
tmp1010 = _mm512_fnmadd_ps(tmp1017, _mm512_set1_ps(2e+00f), tmp1010);
// Pair up consecutive results before storing. Immediate 68 concatenates the
// low 256 bits of both operands; 238 concatenates their high 256 bits.
// Pairs: (in180,tmp1015) (tmp1016,in184) (tmp1014,tmp1008) (tmp1006,in186)
// for the first register group, and the in188.. equivalents for the second.
__m512 out199 = _mm512_shuffle_f32x4(in180, tmp1015, 68);
__m512 out207 = _mm512_shuffle_f32x4(in180, tmp1015, 238);
__m512 out200 = _mm512_shuffle_f32x4(tmp1016, in184, 68);
__m512 out208 = _mm512_shuffle_f32x4(tmp1016, in184, 238);
__m512 out201 = _mm512_shuffle_f32x4(tmp1014, tmp1008, 68);
__m512 out209 = _mm512_shuffle_f32x4(tmp1014, tmp1008, 238);
__m512 out202 = _mm512_shuffle_f32x4(tmp1006, in186, 68);
__m512 out210 = _mm512_shuffle_f32x4(tmp1006, in186, 238);
__m512 out203 = _mm512_shuffle_f32x4(in188, tmp1019, 68);
__m512 out211 = _mm512_shuffle_f32x4(in188, tmp1019, 238);
__m512 out204 = _mm512_shuffle_f32x4(tmp1020, in192, 68);
__m512 out212 = _mm512_shuffle_f32x4(tmp1020, in192, 238);
__m512 out205 = _mm512_shuffle_f32x4(tmp1018, tmp1012, 68);
__m512 out213 = _mm512_shuffle_f32x4(tmp1018, tmp1012, 238);
__m512 out206 = _mm512_shuffle_f32x4(tmp1010, in194, 68);
__m512 out214 = _mm512_shuffle_f32x4(tmp1010, in194, 238);
// Each pair goes to its own region of dfPtr4; the four regions start at
// +512, +410112, +819712 and +1229312, i.e. 409600 apart. Within a region the
// four vectors sit 64 apart in the order: low halves of group 1, low halves of
// group 2, high halves of group 1, high halves of group 2.
// NOTE(review): offsets appear to be in bytes (64 = one 512-bit vector) -
// confirm dfPtr4 is a byte pointer.
_mm512_storeu_ps(dfPtr4+512+1638400*i16+24576*j11+24576*s13+768*k55, out199);
_mm512_storeu_ps(dfPtr4+640+1638400*i16+24576*j11+24576*s13+768*k55, out207);
_mm512_storeu_ps(dfPtr4+576+1638400*i16+24576*j11+24576*s13+768*k55, out203);
_mm512_storeu_ps(dfPtr4+704+1638400*i16+24576*j11+24576*s13+768*k55, out211);
_mm512_storeu_ps(dfPtr4+410112+1638400*i16+24576*j11+24576*s13+768*k55, out200);
_mm512_storeu_ps(dfPtr4+410240+1638400*i16+24576*j11+24576*s13+768*k55, out208);
_mm512_storeu_ps(dfPtr4+410176+1638400*i16+24576*j11+24576*s13+768*k55, out204);
_mm512_storeu_ps(dfPtr4+410304+1638400*i16+24576*j11+24576*s13+768*k55, out212);
_mm512_storeu_ps(dfPtr4+819712+1638400*i16+24576*j11+24576*s13+768*k55, out201);
_mm512_storeu_ps(dfPtr4+819840+1638400*i16+24576*j11+24576*s13+768*k55, out209);
_mm512_storeu_ps(dfPtr4+819776+1638400*i16+24576*j11+24576*s13+768*k55, out205);
_mm512_storeu_ps(dfPtr4+819904+1638400*i16+24576*j11+24576*s13+768*k55, out213);
_mm512_storeu_ps(dfPtr4+1229312+1638400*i16+24576*j11+24576*s13+768*k55, out202);
_mm512_storeu_ps(dfPtr4+1229440+1638400*i16+24576*j11+24576*s13+768*k55, out210);
_mm512_storeu_ps(dfPtr4+1229376+1638400*i16+24576*j11+24576*s13+768*k55, out206);
_mm512_storeu_ps(dfPtr4+1229504+1638400*i16+24576*j11+24576*s13+768*k55, out214);
}
// Stop once j11 has reached last3; otherwise move to the next j11 and set
// rel8 = 2, so the `rel8 < 3` case directly below is the next one to run.
if (j11 >= last3) return;
++j11;
rel8 = 2;
}
if (rel8 < 3) {
// Fixed starting coordinates for this case: row base8+6, column 24.
// NOTE(review): h24/w27 are scaled by 224 and 4 in the load addresses below,
// which suggests a row/column index into float data - confirm.
ptrdiff_t h24 = base8+6;
ptrdiff_t w27 = 24;
ptrdiff_t k56 = 0;
// 32 iterations (k56 = 0..31); each advances the source address by 25216 and
// the destination address by 768.
for (; k56 != 32; ++k56) {
// Load 8 consecutive rows (row offsets 0, 224, ..., 1568) at two column
// offsets (+0 and +48). Mask 16383 (0x3FFF) loads lanes 0..13 and zeroes
// lanes 14 and 15. max(0, x) clamps negative inputs to zero before use.
// pm85 then rearranges each load into two overlapping 8-element windows:
// lanes 0..7 <- elements 0..7, lanes 8..15 <- elements 6..13.
// NOTE(review): offsets look like byte offsets (48 = 12 floats, so the
// windows would advance by 6 elements) - confirm datPtr5 is a byte pointer.
__m512 dat1103 = _mm512_maskz_loadu_ps(16383, datPtr5+0+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1103 = _mm512_max_ps(_mm512_setzero_ps(), dat1103);
__m512 dat1104 = _mm512_maskz_loadu_ps(16383, datPtr5+48+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1104 = _mm512_max_ps(_mm512_setzero_ps(), dat1104);
__m512i pm85 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in196 = _mm512_permutexvar_ps(pm85, dat1103);
__m512 in204 = _mm512_permutexvar_ps(pm85, dat1104);
__m512 dat1105 = _mm512_maskz_loadu_ps(16383, datPtr5+224+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1105 = _mm512_max_ps(_mm512_setzero_ps(), dat1105);
__m512 dat1106 = _mm512_maskz_loadu_ps(16383, datPtr5+272+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1106 = _mm512_max_ps(_mm512_setzero_ps(), dat1106);
__m512 in197 = _mm512_permutexvar_ps(pm85, dat1105);
__m512 in205 = _mm512_permutexvar_ps(pm85, dat1106);
__m512 dat1107 = _mm512_maskz_loadu_ps(16383, datPtr5+448+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1107 = _mm512_max_ps(_mm512_setzero_ps(), dat1107);
__m512 dat1108 = _mm512_maskz_loadu_ps(16383, datPtr5+496+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1108 = _mm512_max_ps(_mm512_setzero_ps(), dat1108);
__m512 in198 = _mm512_permutexvar_ps(pm85, dat1107);
__m512 in206 = _mm512_permutexvar_ps(pm85, dat1108);
__m512 dat1109 = _mm512_maskz_loadu_ps(16383, datPtr5+672+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1109 = _mm512_max_ps(_mm512_setzero_ps(), dat1109);
__m512 dat1110 = _mm512_maskz_loadu_ps(16383, datPtr5+720+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1110 = _mm512_max_ps(_mm512_setzero_ps(), dat1110);
__m512 in199 = _mm512_permutexvar_ps(pm85, dat1109);
__m512 in207 = _mm512_permutexvar_ps(pm85, dat1110);
__m512 dat1111 = _mm512_maskz_loadu_ps(16383, datPtr5+896+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1111 = _mm512_max_ps(_mm512_setzero_ps(), dat1111);
__m512 dat1112 = _mm512_maskz_loadu_ps(16383, datPtr5+944+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1112 = _mm512_max_ps(_mm512_setzero_ps(), dat1112);
__m512 in200 = _mm512_permutexvar_ps(pm85, dat1111);
__m512 in208 = _mm512_permutexvar_ps(pm85, dat1112);
__m512 dat1113 = _mm512_maskz_loadu_ps(16383, datPtr5+1120+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1113 = _mm512_max_ps(_mm512_setzero_ps(), dat1113);
__m512 dat1114 = _mm512_maskz_loadu_ps(16383, datPtr5+1168+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1114 = _mm512_max_ps(_mm512_setzero_ps(), dat1114);
__m512 in201 = _mm512_permutexvar_ps(pm85, dat1113);
__m512 in209 = _mm512_permutexvar_ps(pm85, dat1114);
__m512 dat1115 = _mm512_maskz_loadu_ps(16383, datPtr5+1344+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1115 = _mm512_max_ps(_mm512_setzero_ps(), dat1115);
__m512 dat1116 = _mm512_maskz_loadu_ps(16383, datPtr5+1392+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1116 = _mm512_max_ps(_mm512_setzero_ps(), dat1116);
__m512 in202 = _mm512_permutexvar_ps(pm85, dat1115);
__m512 in210 = _mm512_permutexvar_ps(pm85, dat1116);
__m512 dat1117 = _mm512_maskz_loadu_ps(16383, datPtr5+1568+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1117 = _mm512_max_ps(_mm512_setzero_ps(), dat1117);
__m512 dat1118 = _mm512_maskz_loadu_ps(16383, datPtr5+1616+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1118 = _mm512_max_ps(_mm512_setzero_ps(), dat1118);
__m512 in203 = _mm512_permutexvar_ps(pm85, dat1117);
__m512 in211 = _mm512_permutexvar_ps(pm85, dat1118);
// First 1-D pass, combining the eight row registers lane by lane. Two groups
// run in lockstep: d0..d7 = in196..in203 and, in the alternating statements,
// in204..in211. Values left behind for the first group:
//   in196   = (d0 - d6) + 5.25*(d4 - d2)
//   tmp1071 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   tmp1072 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   in202   = (0.25*d2 + d6 - 1.25*d4) + 2*(0.25*d1 + d5 - 1.25*d3)
//   tmp1070 = (0.25*d2 + d6 - 1.25*d4) - 2*(0.25*d1 + d5 - 1.25*d3)
//   in198   = (4*d2 + d6 - 5*d4) + 2*(d1 + 0.25*d5 - 1.25*d3)
//   in200   = (4*d2 + d6 - 5*d4) - 2*(d1 + 0.25*d5 - 1.25*d3)
//   in199   = (d7 - d1) + 5.25*(d3 - d5)
// NOTE(review): these coefficients look like the 8-point Winograd F(6,3)
// input transform - confirm against the generator.
__m512 tmp1069 = _mm512_add_ps(in197, in201);
__m512 tmp1073 = _mm512_add_ps(in205, in209);
__m512 tmp1070 = _mm512_sub_ps(in200, in198);
__m512 tmp1074 = _mm512_sub_ps(in208, in206);
__m512 tmp1071 = _mm512_add_ps(in198, in202);
__m512 tmp1075 = _mm512_add_ps(in206, in210);
in196 = _mm512_sub_ps(in196, in202);
in204 = _mm512_sub_ps(in204, in210);
tmp1069 = _mm512_fmadd_ps(in199, _mm512_set1_ps(-4.25e+00f), tmp1069);
tmp1073 = _mm512_fmadd_ps(in207, _mm512_set1_ps(-4.25e+00f), tmp1073);
tmp1071 = _mm512_fmadd_ps(in200, _mm512_set1_ps(-4.25e+00f), tmp1071);
tmp1075 = _mm512_fmadd_ps(in208, _mm512_set1_ps(-4.25e+00f), tmp1075);
in196 = _mm512_fmadd_ps(tmp1070, _mm512_set1_ps(5.25e+00f), in196);
in204 = _mm512_fmadd_ps(tmp1074, _mm512_set1_ps(5.25e+00f), in204);
tmp1070 = _mm512_fmadd_ps(in198, _mm512_set1_ps(2.5e-01f), in202);
tmp1074 = _mm512_fmadd_ps(in206, _mm512_set1_ps(2.5e-01f), in210);
in198 = _mm512_fmadd_ps(in198, _mm512_set1_ps(4e+00f), in202);
in206 = _mm512_fmadd_ps(in206, _mm512_set1_ps(4e+00f), in210);
// Sum and difference of the two -4.25 accumulators.
__m512 tmp1072 = _mm512_sub_ps(tmp1071, tmp1069);
__m512 tmp1076 = _mm512_sub_ps(tmp1075, tmp1073);
tmp1071 = _mm512_add_ps(tmp1069, tmp1071);
tmp1075 = _mm512_add_ps(tmp1073, tmp1075);
// tmp1069/tmp1073 are reused from here on as scratch for the odd-index terms.
tmp1069 = _mm512_fmadd_ps(in197, _mm512_set1_ps(2.5e-01f), in201);
tmp1073 = _mm512_fmadd_ps(in205, _mm512_set1_ps(2.5e-01f), in209);
tmp1070 = _mm512_fmadd_ps(in200, _mm512_set1_ps(-1.25e+00f), tmp1070);
tmp1074 = _mm512_fmadd_ps(in208, _mm512_set1_ps(-1.25e+00f), tmp1074);
in200 = _mm512_fmadd_ps(in200, _mm512_set1_ps(-5e+00f), in198);
in208 = _mm512_fmadd_ps(in208, _mm512_set1_ps(-5e+00f), in206);
tmp1069 = _mm512_fmadd_ps(in199, _mm512_set1_ps(-1.25e+00f), tmp1069);
tmp1073 = _mm512_fmadd_ps(in207, _mm512_set1_ps(-1.25e+00f), tmp1073);
// fmadd / fnmadd with the same operands give the "+2*x" and "-2*x" variants.
in202 = _mm512_fmadd_ps(tmp1069, _mm512_set1_ps(2e+00f), tmp1070);
in210 = _mm512_fmadd_ps(tmp1073, _mm512_set1_ps(2e+00f), tmp1074);
tmp1070 = _mm512_fnmadd_ps(tmp1069, _mm512_set1_ps(2e+00f), tmp1070);
tmp1074 = _mm512_fnmadd_ps(tmp1073, _mm512_set1_ps(2e+00f), tmp1074);
tmp1069 = _mm512_fmadd_ps(in201, _mm512_set1_ps(2.5e-01f), in197);
tmp1073 = _mm512_fmadd_ps(in209, _mm512_set1_ps(2.5e-01f), in205);
in197 = _mm512_sub_ps(in203, in197);
in205 = _mm512_sub_ps(in211, in205);
tmp1069 = _mm512_fmadd_ps(in199, _mm512_set1_ps(-1.25e+00f), tmp1069);
tmp1073 = _mm512_fmadd_ps(in207, _mm512_set1_ps(-1.25e+00f), tmp1073);
in199 = _mm512_sub_ps(in199, in201);
in207 = _mm512_sub_ps(in207, in209);
in199 = _mm512_fmadd_ps(in199, _mm512_set1_ps(5.25e+00f), in197);
in207 = _mm512_fmadd_ps(in207, _mm512_set1_ps(5.25e+00f), in205);
in198 = _mm512_fmadd_ps(tmp1069, _mm512_set1_ps(2e+00f), in200);
in206 = _mm512_fmadd_ps(tmp1073, _mm512_set1_ps(2e+00f), in208);
in200 = _mm512_fnmadd_ps(tmp1069, _mm512_set1_ps(2e+00f), in200);
in208 = _mm512_fnmadd_ps(tmp1073, _mm512_set1_ps(2e+00f), in208);
// 16x16 transpose of the sixteen first-pass results, done in four stages.
// Input row order: in196, tmp1071, tmp1072, in202, tmp1070, in198, in200,
// in199 (rows 0..7), then in204, tmp1075, tmp1076, in210, tmp1074, in206,
// in208, in207 (rows 8..15).
// Stage 1: interleave 32-bit elements of adjacent row pairs.
__m512 tmp1085 = _mm512_unpacklo_ps(in196, tmp1071);
__m512 tmp1086 = _mm512_unpackhi_ps(in196, tmp1071);
__m512 tmp1087 = _mm512_unpacklo_ps(tmp1072, in202);
__m512 tmp1088 = _mm512_unpackhi_ps(tmp1072, in202);
__m512 tmp1089 = _mm512_unpacklo_ps(tmp1070, in198);
__m512 tmp1090 = _mm512_unpackhi_ps(tmp1070, in198);
__m512 tmp1091 = _mm512_unpacklo_ps(in200, in199);
__m512 tmp1092 = _mm512_unpackhi_ps(in200, in199);
__m512 tmp1093 = _mm512_unpacklo_ps(in204, tmp1075);
__m512 tmp1094 = _mm512_unpackhi_ps(in204, tmp1075);
__m512 tmp1095 = _mm512_unpacklo_ps(tmp1076, in210);
__m512 tmp1096 = _mm512_unpackhi_ps(tmp1076, in210);
__m512 tmp1097 = _mm512_unpacklo_ps(tmp1074, in206);
__m512 tmp1098 = _mm512_unpackhi_ps(tmp1074, in206);
__m512 tmp1099 = _mm512_unpacklo_ps(in208, in207);
__m512 tmp1100 = _mm512_unpackhi_ps(in208, in207);
// Stage 2: within each 128-bit lane, 68 keeps the low 64 bits of both
// operands and 238 the high 64 bits, so each lane now holds one column of
// four consecutive rows.
__m512 tmp1101 = _mm512_shuffle_ps(tmp1085, tmp1087, 68);
__m512 tmp1102 = _mm512_shuffle_ps(tmp1085, tmp1087, 238);
__m512 tmp1103 = _mm512_shuffle_ps(tmp1086, tmp1088, 68);
__m512 tmp1104 = _mm512_shuffle_ps(tmp1086, tmp1088, 238);
__m512 tmp1105 = _mm512_shuffle_ps(tmp1089, tmp1091, 68);
__m512 tmp1106 = _mm512_shuffle_ps(tmp1089, tmp1091, 238);
__m512 tmp1107 = _mm512_shuffle_ps(tmp1090, tmp1092, 68);
__m512 tmp1108 = _mm512_shuffle_ps(tmp1090, tmp1092, 238);
__m512 tmp1109 = _mm512_shuffle_ps(tmp1093, tmp1095, 68);
__m512 tmp1110 = _mm512_shuffle_ps(tmp1093, tmp1095, 238);
__m512 tmp1111 = _mm512_shuffle_ps(tmp1094, tmp1096, 68);
__m512 tmp1112 = _mm512_shuffle_ps(tmp1094, tmp1096, 238);
__m512 tmp1113 = _mm512_shuffle_ps(tmp1097, tmp1099, 68);
__m512 tmp1114 = _mm512_shuffle_ps(tmp1097, tmp1099, 238);
__m512 tmp1115 = _mm512_shuffle_ps(tmp1098, tmp1100, 68);
__m512 tmp1116 = _mm512_shuffle_ps(tmp1098, tmp1100, 238);
// Stage 3: 136 gathers 128-bit lanes 0 and 2 of each operand, 221 gathers
// lanes 1 and 3; this joins rows 0..3 with rows 4..7 (and 8..11 with 12..15).
__m512 tmp1117 = _mm512_shuffle_f32x4(tmp1101, tmp1105, 136);
__m512 tmp1118 = _mm512_shuffle_f32x4(tmp1101, tmp1105, 221);
__m512 tmp1119 = _mm512_shuffle_f32x4(tmp1102, tmp1106, 136);
__m512 tmp1120 = _mm512_shuffle_f32x4(tmp1102, tmp1106, 221);
__m512 tmp1121 = _mm512_shuffle_f32x4(tmp1103, tmp1107, 136);
__m512 tmp1122 = _mm512_shuffle_f32x4(tmp1103, tmp1107, 221);
__m512 tmp1123 = _mm512_shuffle_f32x4(tmp1104, tmp1108, 136);
__m512 tmp1124 = _mm512_shuffle_f32x4(tmp1104, tmp1108, 221);
__m512 tmp1125 = _mm512_shuffle_f32x4(tmp1109, tmp1113, 136);
__m512 tmp1126 = _mm512_shuffle_f32x4(tmp1109, tmp1113, 221);
__m512 tmp1127 = _mm512_shuffle_f32x4(tmp1110, tmp1114, 136);
__m512 tmp1128 = _mm512_shuffle_f32x4(tmp1110, tmp1114, 221);
__m512 tmp1129 = _mm512_shuffle_f32x4(tmp1111, tmp1115, 136);
__m512 tmp1130 = _mm512_shuffle_f32x4(tmp1111, tmp1115, 221);
__m512 tmp1131 = _mm512_shuffle_f32x4(tmp1112, tmp1116, 136);
__m512 tmp1132 = _mm512_shuffle_f32x4(tmp1112, tmp1116, 221);
// Stage 4: join rows 0..7 with rows 8..15. Each register now holds one
// source column across all 16 rows; the original variables are reused.
// Columns 0..7: in196, tmp1071, tmp1072, in202, tmp1070, in198, in200, in199.
// Columns 8..15: in204, tmp1075, tmp1076, in210, tmp1074, in206, in208, in207.
in196 = _mm512_shuffle_f32x4(tmp1117, tmp1125, 136);
in204 = _mm512_shuffle_f32x4(tmp1117, tmp1125, 221);
tmp1071 = _mm512_shuffle_f32x4(tmp1119, tmp1127, 136);
tmp1075 = _mm512_shuffle_f32x4(tmp1119, tmp1127, 221);
tmp1072 = _mm512_shuffle_f32x4(tmp1121, tmp1129, 136);
tmp1076 = _mm512_shuffle_f32x4(tmp1121, tmp1129, 221);
in202 = _mm512_shuffle_f32x4(tmp1123, tmp1131, 136);
in210 = _mm512_shuffle_f32x4(tmp1123, tmp1131, 221);
tmp1070 = _mm512_shuffle_f32x4(tmp1118, tmp1126, 136);
tmp1074 = _mm512_shuffle_f32x4(tmp1118, tmp1126, 221);
in198 = _mm512_shuffle_f32x4(tmp1120, tmp1128, 136);
in206 = _mm512_shuffle_f32x4(tmp1120, tmp1128, 221);
in200 = _mm512_shuffle_f32x4(tmp1122, tmp1130, 136);
in208 = _mm512_shuffle_f32x4(tmp1122, tmp1130, 221);
in199 = _mm512_shuffle_f32x4(tmp1124, tmp1132, 136);
in207 = _mm512_shuffle_f32x4(tmp1124, tmp1132, 221);
// Second 1-D pass: the same arithmetic as the first pass, now applied to the
// transposed registers. Two groups run in lockstep. With
//   e0..e7 = in196, tmp1071, tmp1072, in202, tmp1070, in198, in200, in199
// (second group: in204, tmp1075, tmp1076, in210, tmp1074, in206, in208, in207)
// the values left behind are:
//   in196   = (e0 - e6) + 5.25*(e4 - e2)
//   tmp1079 = (e2 + e6 - 4.25*e4) + (e1 + e5 - 4.25*e3)
//   tmp1080 = (e2 + e6 - 4.25*e4) - (e1 + e5 - 4.25*e3)
//   in200   = (0.25*e2 + e6 - 1.25*e4) + 2*(0.25*e1 + e5 - 1.25*e3)
//   tmp1078 = (0.25*e2 + e6 - 1.25*e4) - 2*(0.25*e1 + e5 - 1.25*e3)
//   tmp1072 = (4*e2 + e6 - 5*e4) + 2*(e1 + 0.25*e5 - 1.25*e3)
//   tmp1070 = (4*e2 + e6 - 5*e4) - 2*(e1 + 0.25*e5 - 1.25*e3)
//   in202   = (e7 - e1) + 5.25*(e3 - e5)
__m512 tmp1077 = _mm512_add_ps(tmp1071, in198);
__m512 tmp1081 = _mm512_add_ps(tmp1075, in206);
__m512 tmp1078 = _mm512_sub_ps(tmp1070, tmp1072);
__m512 tmp1082 = _mm512_sub_ps(tmp1074, tmp1076);
__m512 tmp1079 = _mm512_add_ps(tmp1072, in200);
__m512 tmp1083 = _mm512_add_ps(tmp1076, in208);
in196 = _mm512_sub_ps(in196, in200);
in204 = _mm512_sub_ps(in204, in208);
tmp1077 = _mm512_fmadd_ps(in202, _mm512_set1_ps(-4.25e+00f), tmp1077);
tmp1081 = _mm512_fmadd_ps(in210, _mm512_set1_ps(-4.25e+00f), tmp1081);
tmp1079 = _mm512_fmadd_ps(tmp1070, _mm512_set1_ps(-4.25e+00f), tmp1079);
tmp1083 = _mm512_fmadd_ps(tmp1074, _mm512_set1_ps(-4.25e+00f), tmp1083);
in196 = _mm512_fmadd_ps(tmp1078, _mm512_set1_ps(5.25e+00f), in196);
in204 = _mm512_fmadd_ps(tmp1082, _mm512_set1_ps(5.25e+00f), in204);
tmp1078 = _mm512_fmadd_ps(tmp1072, _mm512_set1_ps(2.5e-01f), in200);
tmp1082 = _mm512_fmadd_ps(tmp1076, _mm512_set1_ps(2.5e-01f), in208);
tmp1072 = _mm512_fmadd_ps(tmp1072, _mm512_set1_ps(4e+00f), in200);
tmp1076 = _mm512_fmadd_ps(tmp1076, _mm512_set1_ps(4e+00f), in208);
// Sum and difference of the two -4.25 accumulators.
__m512 tmp1080 = _mm512_sub_ps(tmp1079, tmp1077);
__m512 tmp1084 = _mm512_sub_ps(tmp1083, tmp1081);
tmp1079 = _mm512_add_ps(tmp1077, tmp1079);
tmp1083 = _mm512_add_ps(tmp1081, tmp1083);
// tmp1077/tmp1081 are reused from here on as scratch for the odd-index terms.
tmp1077 = _mm512_fmadd_ps(tmp1071, _mm512_set1_ps(2.5e-01f), in198);
tmp1081 = _mm512_fmadd_ps(tmp1075, _mm512_set1_ps(2.5e-01f), in206);
tmp1078 = _mm512_fmadd_ps(tmp1070, _mm512_set1_ps(-1.25e+00f), tmp1078);
tmp1082 = _mm512_fmadd_ps(tmp1074, _mm512_set1_ps(-1.25e+00f), tmp1082);
tmp1070 = _mm512_fmadd_ps(tmp1070, _mm512_set1_ps(-5e+00f), tmp1072);
tmp1074 = _mm512_fmadd_ps(tmp1074, _mm512_set1_ps(-5e+00f), tmp1076);
tmp1077 = _mm512_fmadd_ps(in202, _mm512_set1_ps(-1.25e+00f), tmp1077);
tmp1081 = _mm512_fmadd_ps(in210, _mm512_set1_ps(-1.25e+00f), tmp1081);
// fmadd / fnmadd with the same operands give the "+2*x" and "-2*x" variants.
in200 = _mm512_fmadd_ps(tmp1077, _mm512_set1_ps(2e+00f), tmp1078);
in208 = _mm512_fmadd_ps(tmp1081, _mm512_set1_ps(2e+00f), tmp1082);
tmp1078 = _mm512_fnmadd_ps(tmp1077, _mm512_set1_ps(2e+00f), tmp1078);
tmp1082 = _mm512_fnmadd_ps(tmp1081, _mm512_set1_ps(2e+00f), tmp1082);
tmp1077 = _mm512_fmadd_ps(in198, _mm512_set1_ps(2.5e-01f), tmp1071);
tmp1081 = _mm512_fmadd_ps(in206, _mm512_set1_ps(2.5e-01f), tmp1075);
tmp1071 = _mm512_sub_ps(in199, tmp1071);
tmp1075 = _mm512_sub_ps(in207, tmp1075);
tmp1077 = _mm512_fmadd_ps(in202, _mm512_set1_ps(-1.25e+00f), tmp1077);
tmp1081 = _mm512_fmadd_ps(in210, _mm512_set1_ps(-1.25e+00f), tmp1081);
in202 = _mm512_sub_ps(in202, in198);
in210 = _mm512_sub_ps(in210, in206);
in202 = _mm512_fmadd_ps(in202, _mm512_set1_ps(5.25e+00f), tmp1071);
in210 = _mm512_fmadd_ps(in210, _mm512_set1_ps(5.25e+00f), tmp1075);
tmp1072 = _mm512_fmadd_ps(tmp1077, _mm512_set1_ps(2e+00f), tmp1070);
tmp1076 = _mm512_fmadd_ps(tmp1081, _mm512_set1_ps(2e+00f), tmp1074);
tmp1070 = _mm512_fnmadd_ps(tmp1077, _mm512_set1_ps(2e+00f), tmp1070);
tmp1074 = _mm512_fnmadd_ps(tmp1081, _mm512_set1_ps(2e+00f), tmp1074);
// Pair up consecutive results before storing. Immediate 68 concatenates the
// low 256 bits of both operands; 238 concatenates their high 256 bits.
// Pairs: (in196,tmp1079) (tmp1080,in200) (tmp1078,tmp1072) (tmp1070,in202)
// for the first register group, and the in204.. equivalents for the second.
__m512 out215 = _mm512_shuffle_f32x4(in196, tmp1079, 68);
__m512 out223 = _mm512_shuffle_f32x4(in196, tmp1079, 238);
__m512 out216 = _mm512_shuffle_f32x4(tmp1080, in200, 68);
__m512 out224 = _mm512_shuffle_f32x4(tmp1080, in200, 238);
__m512 out217 = _mm512_shuffle_f32x4(tmp1078, tmp1072, 68);
__m512 out225 = _mm512_shuffle_f32x4(tmp1078, tmp1072, 238);
__m512 out218 = _mm512_shuffle_f32x4(tmp1070, in202, 68);
__m512 out226 = _mm512_shuffle_f32x4(tmp1070, in202, 238);
__m512 out219 = _mm512_shuffle_f32x4(in204, tmp1083, 68);
__m512 out227 = _mm512_shuffle_f32x4(in204, tmp1083, 238);
__m512 out220 = _mm512_shuffle_f32x4(tmp1084, in208, 68);
__m512 out228 = _mm512_shuffle_f32x4(tmp1084, in208, 238);
__m512 out221 = _mm512_shuffle_f32x4(tmp1082, tmp1076, 68);
__m512 out229 = _mm512_shuffle_f32x4(tmp1082, tmp1076, 238);
__m512 out222 = _mm512_shuffle_f32x4(tmp1074, in210, 68);
__m512 out230 = _mm512_shuffle_f32x4(tmp1074, in210, 238);
// Each pair goes to its own region of dfPtr4; the four regions start at
// +0, +409600, +819200 and +1228800. Within a region the four vectors sit 64
// apart in the order: low halves of group 1, low halves of group 2, high
// halves of group 1, high halves of group 2.
// NOTE(review): offsets appear to be in bytes (64 = one 512-bit vector) -
// confirm dfPtr4 is a byte pointer.
_mm512_storeu_ps(dfPtr4+0+1638400*i16+24576*j11+24576*s13+768*k56, out215);
_mm512_storeu_ps(dfPtr4+128+1638400*i16+24576*j11+24576*s13+768*k56, out223);
_mm512_storeu_ps(dfPtr4+64+1638400*i16+24576*j11+24576*s13+768*k56, out219);
_mm512_storeu_ps(dfPtr4+192+1638400*i16+24576*j11+24576*s13+768*k56, out227);
_mm512_storeu_ps(dfPtr4+409600+1638400*i16+24576*j11+24576*s13+768*k56, out216);
_mm512_storeu_ps(dfPtr4+409728+1638400*i16+24576*j11+24576*s13+768*k56, out224);
_mm512_storeu_ps(dfPtr4+409664+1638400*i16+24576*j11+24576*s13+768*k56, out220);
_mm512_storeu_ps(dfPtr4+409792+1638400*i16+24576*j11+24576*s13+768*k56, out228);
_mm512_storeu_ps(dfPtr4+819200+1638400*i16+24576*j11+24576*s13+768*k56, out217);
_mm512_storeu_ps(dfPtr4+819328+1638400*i16+24576*j11+24576*s13+768*k56, out225);
_mm512_storeu_ps(dfPtr4+819264+1638400*i16+24576*j11+24576*s13+768*k56, out221);
_mm512_storeu_ps(dfPtr4+819392+1638400*i16+24576*j11+24576*s13+768*k56, out229);
_mm512_storeu_ps(dfPtr4+1228800+1638400*i16+24576*j11+24576*s13+768*k56, out218);
_mm512_storeu_ps(dfPtr4+1228928+1638400*i16+24576*j11+24576*s13+768*k56, out226);
_mm512_storeu_ps(dfPtr4+1228864+1638400*i16+24576*j11+24576*s13+768*k56, out222);
_mm512_storeu_ps(dfPtr4+1228992+1638400*i16+24576*j11+24576*s13+768*k56, out230);
// Load the next two register groups, again 8 rows at a row stride of 224,
// clamping negatives to zero.
// Group in212..in219: column offset +96, mask 511 (0x1FF) loads only lanes
// 0..8. pm86 maps lanes 0..7 <- elements 0..7, lanes 8..10 <- elements 6..8,
// and lanes 11..15 <- element 15, which the mask has zeroed, so those lanes
// are zero padding.
// Group in220..in227: base offset +12608 (half of the 25216 per-k56 stride),
// mask 16383 (lanes 0..13). pm87 holds the same indices as pm85: two
// overlapping windows, elements 0..7 and 6..13.
// NOTE(review): +12608 presumably addresses a second input plane handled in
// the same k56 iteration - confirm against the data layout.
__m512 dat1119 = _mm512_maskz_loadu_ps(511, datPtr5+96+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1119 = _mm512_max_ps(_mm512_setzero_ps(), dat1119);
__m512 dat1120 = _mm512_maskz_loadu_ps(16383, datPtr5+12608+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1120 = _mm512_max_ps(_mm512_setzero_ps(), dat1120);
__m512i pm86 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in212 = _mm512_permutexvar_ps(pm86, dat1119);
__m512i pm87 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in220 = _mm512_permutexvar_ps(pm87, dat1120);
__m512 dat1121 = _mm512_maskz_loadu_ps(511, datPtr5+320+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1121 = _mm512_max_ps(_mm512_setzero_ps(), dat1121);
__m512 dat1122 = _mm512_maskz_loadu_ps(16383, datPtr5+12832+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1122 = _mm512_max_ps(_mm512_setzero_ps(), dat1122);
__m512 in213 = _mm512_permutexvar_ps(pm86, dat1121);
__m512 in221 = _mm512_permutexvar_ps(pm87, dat1122);
__m512 dat1123 = _mm512_maskz_loadu_ps(511, datPtr5+544+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1123 = _mm512_max_ps(_mm512_setzero_ps(), dat1123);
__m512 dat1124 = _mm512_maskz_loadu_ps(16383, datPtr5+13056+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1124 = _mm512_max_ps(_mm512_setzero_ps(), dat1124);
__m512 in214 = _mm512_permutexvar_ps(pm86, dat1123);
__m512 in222 = _mm512_permutexvar_ps(pm87, dat1124);
__m512 dat1125 = _mm512_maskz_loadu_ps(511, datPtr5+768+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1125 = _mm512_max_ps(_mm512_setzero_ps(), dat1125);
__m512 dat1126 = _mm512_maskz_loadu_ps(16383, datPtr5+13280+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1126 = _mm512_max_ps(_mm512_setzero_ps(), dat1126);
__m512 in215 = _mm512_permutexvar_ps(pm86, dat1125);
__m512 in223 = _mm512_permutexvar_ps(pm87, dat1126);
__m512 dat1127 = _mm512_maskz_loadu_ps(511, datPtr5+992+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1127 = _mm512_max_ps(_mm512_setzero_ps(), dat1127);
__m512 dat1128 = _mm512_maskz_loadu_ps(16383, datPtr5+13504+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1128 = _mm512_max_ps(_mm512_setzero_ps(), dat1128);
__m512 in216 = _mm512_permutexvar_ps(pm86, dat1127);
__m512 in224 = _mm512_permutexvar_ps(pm87, dat1128);
__m512 dat1129 = _mm512_maskz_loadu_ps(511, datPtr5+1216+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1129 = _mm512_max_ps(_mm512_setzero_ps(), dat1129);
__m512 dat1130 = _mm512_maskz_loadu_ps(16383, datPtr5+13728+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1130 = _mm512_max_ps(_mm512_setzero_ps(), dat1130);
__m512 in217 = _mm512_permutexvar_ps(pm86, dat1129);
__m512 in225 = _mm512_permutexvar_ps(pm87, dat1130);
__m512 dat1131 = _mm512_maskz_loadu_ps(511, datPtr5+1440+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1131 = _mm512_max_ps(_mm512_setzero_ps(), dat1131);
__m512 dat1132 = _mm512_maskz_loadu_ps(16383, datPtr5+13952+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1132 = _mm512_max_ps(_mm512_setzero_ps(), dat1132);
__m512 in218 = _mm512_permutexvar_ps(pm86, dat1131);
__m512 in226 = _mm512_permutexvar_ps(pm87, dat1132);
__m512 dat1133 = _mm512_maskz_loadu_ps(511, datPtr5+1664+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1133 = _mm512_max_ps(_mm512_setzero_ps(), dat1133);
__m512 dat1134 = _mm512_maskz_loadu_ps(16383, datPtr5+14176+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1134 = _mm512_max_ps(_mm512_setzero_ps(), dat1134);
__m512 in219 = _mm512_permutexvar_ps(pm86, dat1133);
__m512 in227 = _mm512_permutexvar_ps(pm87, dat1134);
// First 1-D pass, combining the eight row registers lane by lane. Two groups
// run in lockstep: d0..d7 = in212..in219 and, in the alternating statements,
// in220..in227. Values left behind for the first group:
//   in212   = (d0 - d6) + 5.25*(d4 - d2)
//   tmp1135 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   tmp1136 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   in218   = (0.25*d2 + d6 - 1.25*d4) + 2*(0.25*d1 + d5 - 1.25*d3)
//   tmp1134 = (0.25*d2 + d6 - 1.25*d4) - 2*(0.25*d1 + d5 - 1.25*d3)
//   in214   = (4*d2 + d6 - 5*d4) + 2*(d1 + 0.25*d5 - 1.25*d3)
//   in216   = (4*d2 + d6 - 5*d4) - 2*(d1 + 0.25*d5 - 1.25*d3)
//   in215   = (d7 - d1) + 5.25*(d3 - d5)
// NOTE(review): these coefficients look like the 8-point Winograd F(6,3)
// input transform - confirm against the generator.
__m512 tmp1133 = _mm512_add_ps(in213, in217);
__m512 tmp1137 = _mm512_add_ps(in221, in225);
__m512 tmp1134 = _mm512_sub_ps(in216, in214);
__m512 tmp1138 = _mm512_sub_ps(in224, in222);
__m512 tmp1135 = _mm512_add_ps(in214, in218);
__m512 tmp1139 = _mm512_add_ps(in222, in226);
in212 = _mm512_sub_ps(in212, in218);
in220 = _mm512_sub_ps(in220, in226);
tmp1133 = _mm512_fmadd_ps(in215, _mm512_set1_ps(-4.25e+00f), tmp1133);
tmp1137 = _mm512_fmadd_ps(in223, _mm512_set1_ps(-4.25e+00f), tmp1137);
tmp1135 = _mm512_fmadd_ps(in216, _mm512_set1_ps(-4.25e+00f), tmp1135);
tmp1139 = _mm512_fmadd_ps(in224, _mm512_set1_ps(-4.25e+00f), tmp1139);
in212 = _mm512_fmadd_ps(tmp1134, _mm512_set1_ps(5.25e+00f), in212);
in220 = _mm512_fmadd_ps(tmp1138, _mm512_set1_ps(5.25e+00f), in220);
tmp1134 = _mm512_fmadd_ps(in214, _mm512_set1_ps(2.5e-01f), in218);
tmp1138 = _mm512_fmadd_ps(in222, _mm512_set1_ps(2.5e-01f), in226);
in214 = _mm512_fmadd_ps(in214, _mm512_set1_ps(4e+00f), in218);
in222 = _mm512_fmadd_ps(in222, _mm512_set1_ps(4e+00f), in226);
// Sum and difference of the two -4.25 accumulators.
__m512 tmp1136 = _mm512_sub_ps(tmp1135, tmp1133);
__m512 tmp1140 = _mm512_sub_ps(tmp1139, tmp1137);
tmp1135 = _mm512_add_ps(tmp1133, tmp1135);
tmp1139 = _mm512_add_ps(tmp1137, tmp1139);
// tmp1133/tmp1137 are reused from here on as scratch for the odd-index terms.
tmp1133 = _mm512_fmadd_ps(in213, _mm512_set1_ps(2.5e-01f), in217);
tmp1137 = _mm512_fmadd_ps(in221, _mm512_set1_ps(2.5e-01f), in225);
tmp1134 = _mm512_fmadd_ps(in216, _mm512_set1_ps(-1.25e+00f), tmp1134);
tmp1138 = _mm512_fmadd_ps(in224, _mm512_set1_ps(-1.25e+00f), tmp1138);
in216 = _mm512_fmadd_ps(in216, _mm512_set1_ps(-5e+00f), in214);
in224 = _mm512_fmadd_ps(in224, _mm512_set1_ps(-5e+00f), in222);
tmp1133 = _mm512_fmadd_ps(in215, _mm512_set1_ps(-1.25e+00f), tmp1133);
tmp1137 = _mm512_fmadd_ps(in223, _mm512_set1_ps(-1.25e+00f), tmp1137);
// fmadd / fnmadd with the same operands give the "+2*x" and "-2*x" variants.
in218 = _mm512_fmadd_ps(tmp1133, _mm512_set1_ps(2e+00f), tmp1134);
in226 = _mm512_fmadd_ps(tmp1137, _mm512_set1_ps(2e+00f), tmp1138);
tmp1134 = _mm512_fnmadd_ps(tmp1133, _mm512_set1_ps(2e+00f), tmp1134);
tmp1138 = _mm512_fnmadd_ps(tmp1137, _mm512_set1_ps(2e+00f), tmp1138);
tmp1133 = _mm512_fmadd_ps(in217, _mm512_set1_ps(2.5e-01f), in213);
tmp1137 = _mm512_fmadd_ps(in225, _mm512_set1_ps(2.5e-01f), in221);
in213 = _mm512_sub_ps(in219, in213);
in221 = _mm512_sub_ps(in227, in221);
tmp1133 = _mm512_fmadd_ps(in215, _mm512_set1_ps(-1.25e+00f), tmp1133);
tmp1137 = _mm512_fmadd_ps(in223, _mm512_set1_ps(-1.25e+00f), tmp1137);
in215 = _mm512_sub_ps(in215, in217);
in223 = _mm512_sub_ps(in223, in225);
in215 = _mm512_fmadd_ps(in215, _mm512_set1_ps(5.25e+00f), in213);
in223 = _mm512_fmadd_ps(in223, _mm512_set1_ps(5.25e+00f), in221);
in214 = _mm512_fmadd_ps(tmp1133, _mm512_set1_ps(2e+00f), in216);
in222 = _mm512_fmadd_ps(tmp1137, _mm512_set1_ps(2e+00f), in224);
in216 = _mm512_fnmadd_ps(tmp1133, _mm512_set1_ps(2e+00f), in216);
in224 = _mm512_fnmadd_ps(tmp1137, _mm512_set1_ps(2e+00f), in224);
// 16x16 transpose of the sixteen first-pass results, done in four stages.
// Input row order: in212, tmp1135, tmp1136, in218, tmp1134, in214, in216,
// in215 (rows 0..7), then in220, tmp1139, tmp1140, in226, tmp1138, in222,
// in224, in223 (rows 8..15).
// Stage 1: interleave 32-bit elements of adjacent row pairs.
__m512 tmp1149 = _mm512_unpacklo_ps(in212, tmp1135);
__m512 tmp1150 = _mm512_unpackhi_ps(in212, tmp1135);
__m512 tmp1151 = _mm512_unpacklo_ps(tmp1136, in218);
__m512 tmp1152 = _mm512_unpackhi_ps(tmp1136, in218);
__m512 tmp1153 = _mm512_unpacklo_ps(tmp1134, in214);
__m512 tmp1154 = _mm512_unpackhi_ps(tmp1134, in214);
__m512 tmp1155 = _mm512_unpacklo_ps(in216, in215);
__m512 tmp1156 = _mm512_unpackhi_ps(in216, in215);
__m512 tmp1157 = _mm512_unpacklo_ps(in220, tmp1139);
__m512 tmp1158 = _mm512_unpackhi_ps(in220, tmp1139);
__m512 tmp1159 = _mm512_unpacklo_ps(tmp1140, in226);
__m512 tmp1160 = _mm512_unpackhi_ps(tmp1140, in226);
__m512 tmp1161 = _mm512_unpacklo_ps(tmp1138, in222);
__m512 tmp1162 = _mm512_unpackhi_ps(tmp1138, in222);
__m512 tmp1163 = _mm512_unpacklo_ps(in224, in223);
__m512 tmp1164 = _mm512_unpackhi_ps(in224, in223);
// Stage 2: within each 128-bit lane, 68 keeps the low 64 bits of both
// operands and 238 the high 64 bits, so each lane now holds one column of
// four consecutive rows.
__m512 tmp1165 = _mm512_shuffle_ps(tmp1149, tmp1151, 68);
__m512 tmp1166 = _mm512_shuffle_ps(tmp1149, tmp1151, 238);
__m512 tmp1167 = _mm512_shuffle_ps(tmp1150, tmp1152, 68);
__m512 tmp1168 = _mm512_shuffle_ps(tmp1150, tmp1152, 238);
__m512 tmp1169 = _mm512_shuffle_ps(tmp1153, tmp1155, 68);
__m512 tmp1170 = _mm512_shuffle_ps(tmp1153, tmp1155, 238);
__m512 tmp1171 = _mm512_shuffle_ps(tmp1154, tmp1156, 68);
__m512 tmp1172 = _mm512_shuffle_ps(tmp1154, tmp1156, 238);
__m512 tmp1173 = _mm512_shuffle_ps(tmp1157, tmp1159, 68);
__m512 tmp1174 = _mm512_shuffle_ps(tmp1157, tmp1159, 238);
__m512 tmp1175 = _mm512_shuffle_ps(tmp1158, tmp1160, 68);
__m512 tmp1176 = _mm512_shuffle_ps(tmp1158, tmp1160, 238);
__m512 tmp1177 = _mm512_shuffle_ps(tmp1161, tmp1163, 68);
__m512 tmp1178 = _mm512_shuffle_ps(tmp1161, tmp1163, 238);
__m512 tmp1179 = _mm512_shuffle_ps(tmp1162, tmp1164, 68);
__m512 tmp1180 = _mm512_shuffle_ps(tmp1162, tmp1164, 238);
// Stage 3: 136 gathers 128-bit lanes 0 and 2 of each operand, 221 gathers
// lanes 1 and 3; this joins rows 0..3 with rows 4..7 (and 8..11 with 12..15).
__m512 tmp1181 = _mm512_shuffle_f32x4(tmp1165, tmp1169, 136);
__m512 tmp1182 = _mm512_shuffle_f32x4(tmp1165, tmp1169, 221);
__m512 tmp1183 = _mm512_shuffle_f32x4(tmp1166, tmp1170, 136);
__m512 tmp1184 = _mm512_shuffle_f32x4(tmp1166, tmp1170, 221);
__m512 tmp1185 = _mm512_shuffle_f32x4(tmp1167, tmp1171, 136);
__m512 tmp1186 = _mm512_shuffle_f32x4(tmp1167, tmp1171, 221);
__m512 tmp1187 = _mm512_shuffle_f32x4(tmp1168, tmp1172, 136);
__m512 tmp1188 = _mm512_shuffle_f32x4(tmp1168, tmp1172, 221);
__m512 tmp1189 = _mm512_shuffle_f32x4(tmp1173, tmp1177, 136);
__m512 tmp1190 = _mm512_shuffle_f32x4(tmp1173, tmp1177, 221);
__m512 tmp1191 = _mm512_shuffle_f32x4(tmp1174, tmp1178, 136);
__m512 tmp1192 = _mm512_shuffle_f32x4(tmp1174, tmp1178, 221);
__m512 tmp1193 = _mm512_shuffle_f32x4(tmp1175, tmp1179, 136);
__m512 tmp1194 = _mm512_shuffle_f32x4(tmp1175, tmp1179, 221);
__m512 tmp1195 = _mm512_shuffle_f32x4(tmp1176, tmp1180, 136);
__m512 tmp1196 = _mm512_shuffle_f32x4(tmp1176, tmp1180, 221);
// Stage 4: join rows 0..7 with rows 8..15. Each register now holds one
// source column across all 16 rows; the original variables are reused.
// Columns 0..7: in212, tmp1135, tmp1136, in218, tmp1134, in214, in216, in215.
// Columns 8..15: in220, tmp1139, tmp1140, in226, tmp1138, in222, in224, in223.
in212 = _mm512_shuffle_f32x4(tmp1181, tmp1189, 136);
in220 = _mm512_shuffle_f32x4(tmp1181, tmp1189, 221);
tmp1135 = _mm512_shuffle_f32x4(tmp1183, tmp1191, 136);
tmp1139 = _mm512_shuffle_f32x4(tmp1183, tmp1191, 221);
tmp1136 = _mm512_shuffle_f32x4(tmp1185, tmp1193, 136);
tmp1140 = _mm512_shuffle_f32x4(tmp1185, tmp1193, 221);
in218 = _mm512_shuffle_f32x4(tmp1187, tmp1195, 136);
in226 = _mm512_shuffle_f32x4(tmp1187, tmp1195, 221);
tmp1134 = _mm512_shuffle_f32x4(tmp1182, tmp1190, 136);
tmp1138 = _mm512_shuffle_f32x4(tmp1182, tmp1190, 221);
in214 = _mm512_shuffle_f32x4(tmp1184, tmp1192, 136);
in222 = _mm512_shuffle_f32x4(tmp1184, tmp1192, 221);
in216 = _mm512_shuffle_f32x4(tmp1186, tmp1194, 136);
in224 = _mm512_shuffle_f32x4(tmp1186, tmp1194, 221);
in215 = _mm512_shuffle_f32x4(tmp1188, tmp1196, 136);
in223 = _mm512_shuffle_f32x4(tmp1188, tmp1196, 221);
__m512 tmp1141 = _mm512_add_ps(tmp1135, in214);
__m512 tmp1145 = _mm512_add_ps(tmp1139, in222);
__m512 tmp1142 = _mm512_sub_ps(tmp1134, tmp1136);
__m512 tmp1146 = _mm512_sub_ps(tmp1138, tmp1140);
__m512 tmp1143 = _mm512_add_ps(tmp1136, in216);
__m512 tmp1147 = _mm512_add_ps(tmp1140, in224);
in212 = _mm512_sub_ps(in212, in216);
in220 = _mm512_sub_ps(in220, in224);
tmp1141 = _mm512_fmadd_ps(in218, _mm512_set1_ps(-4.25e+00f), tmp1141);
tmp1145 = _mm512_fmadd_ps(in226, _mm512_set1_ps(-4.25e+00f), tmp1145);
tmp1143 = _mm512_fmadd_ps(tmp1134, _mm512_set1_ps(-4.25e+00f), tmp1143);
tmp1147 = _mm512_fmadd_ps(tmp1138, _mm512_set1_ps(-4.25e+00f), tmp1147);
in212 = _mm512_fmadd_ps(tmp1142, _mm512_set1_ps(5.25e+00f), in212);
in220 = _mm512_fmadd_ps(tmp1146, _mm512_set1_ps(5.25e+00f), in220);
tmp1142 = _mm512_fmadd_ps(tmp1136, _mm512_set1_ps(2.5e-01f), in216);
tmp1146 = _mm512_fmadd_ps(tmp1140, _mm512_set1_ps(2.5e-01f), in224);
tmp1136 = _mm512_fmadd_ps(tmp1136, _mm512_set1_ps(4e+00f), in216);
tmp1140 = _mm512_fmadd_ps(tmp1140, _mm512_set1_ps(4e+00f), in224);
__m512 tmp1144 = _mm512_sub_ps(tmp1143, tmp1141);
__m512 tmp1148 = _mm512_sub_ps(tmp1147, tmp1145);
tmp1143 = _mm512_add_ps(tmp1141, tmp1143);
tmp1147 = _mm512_add_ps(tmp1145, tmp1147);
tmp1141 = _mm512_fmadd_ps(tmp1135, _mm512_set1_ps(2.5e-01f), in214);
tmp1145 = _mm512_fmadd_ps(tmp1139, _mm512_set1_ps(2.5e-01f), in222);
tmp1142 = _mm512_fmadd_ps(tmp1134, _mm512_set1_ps(-1.25e+00f), tmp1142);
tmp1146 = _mm512_fmadd_ps(tmp1138, _mm512_set1_ps(-1.25e+00f), tmp1146);
tmp1134 = _mm512_fmadd_ps(tmp1134, _mm512_set1_ps(-5e+00f), tmp1136);
tmp1138 = _mm512_fmadd_ps(tmp1138, _mm512_set1_ps(-5e+00f), tmp1140);
tmp1141 = _mm512_fmadd_ps(in218, _mm512_set1_ps(-1.25e+00f), tmp1141);
tmp1145 = _mm512_fmadd_ps(in226, _mm512_set1_ps(-1.25e+00f), tmp1145);
in216 = _mm512_fmadd_ps(tmp1141, _mm512_set1_ps(2e+00f), tmp1142);
in224 = _mm512_fmadd_ps(tmp1145, _mm512_set1_ps(2e+00f), tmp1146);
tmp1142 = _mm512_fnmadd_ps(tmp1141, _mm512_set1_ps(2e+00f), tmp1142);
tmp1146 = _mm512_fnmadd_ps(tmp1145, _mm512_set1_ps(2e+00f), tmp1146);
tmp1141 = _mm512_fmadd_ps(in214, _mm512_set1_ps(2.5e-01f), tmp1135);
tmp1145 = _mm512_fmadd_ps(in222, _mm512_set1_ps(2.5e-01f), tmp1139);
tmp1135 = _mm512_sub_ps(in215, tmp1135);
tmp1139 = _mm512_sub_ps(in223, tmp1139);
tmp1141 = _mm512_fmadd_ps(in218, _mm512_set1_ps(-1.25e+00f), tmp1141);
tmp1145 = _mm512_fmadd_ps(in226, _mm512_set1_ps(-1.25e+00f), tmp1145);
in218 = _mm512_sub_ps(in218, in214);
in226 = _mm512_sub_ps(in226, in222);
in218 = _mm512_fmadd_ps(in218, _mm512_set1_ps(5.25e+00f), tmp1135);
in226 = _mm512_fmadd_ps(in226, _mm512_set1_ps(5.25e+00f), tmp1139);
tmp1136 = _mm512_fmadd_ps(tmp1141, _mm512_set1_ps(2e+00f), tmp1134);
tmp1140 = _mm512_fmadd_ps(tmp1145, _mm512_set1_ps(2e+00f), tmp1138);
tmp1134 = _mm512_fnmadd_ps(tmp1141, _mm512_set1_ps(2e+00f), tmp1134);
tmp1138 = _mm512_fnmadd_ps(tmp1145, _mm512_set1_ps(2e+00f), tmp1138);
__m512 out231 = _mm512_shuffle_f32x4(in212, tmp1143, 68);
__m512 out239 = _mm512_shuffle_f32x4(in212, tmp1143, 238);
__m512 out232 = _mm512_shuffle_f32x4(tmp1144, in216, 68);
__m512 out240 = _mm512_shuffle_f32x4(tmp1144, in216, 238);
__m512 out233 = _mm512_shuffle_f32x4(tmp1142, tmp1136, 68);
__m512 out241 = _mm512_shuffle_f32x4(tmp1142, tmp1136, 238);
__m512 out234 = _mm512_shuffle_f32x4(tmp1134, in218, 68);
__m512 out242 = _mm512_shuffle_f32x4(tmp1134, in218, 238);
__m512 out235 = _mm512_shuffle_f32x4(in220, tmp1147, 68);
__m512 out243 = _mm512_shuffle_f32x4(in220, tmp1147, 238);
__m512 out236 = _mm512_shuffle_f32x4(tmp1148, in224, 68);
__m512 out244 = _mm512_shuffle_f32x4(tmp1148, in224, 238);
__m512 out237 = _mm512_shuffle_f32x4(tmp1146, tmp1140, 68);
__m512 out245 = _mm512_shuffle_f32x4(tmp1146, tmp1140, 238);
__m512 out238 = _mm512_shuffle_f32x4(tmp1138, in226, 68);
__m512 out246 = _mm512_shuffle_f32x4(tmp1138, in226, 238);
_mm512_storeu_ps(dfPtr4+256+1638400*i16+24576*j11+24576*s13+768*k56, out231);
_mm512_storeu_ps(dfPtr4+384+1638400*i16+24576*j11+24576*s13+768*k56, out239);
_mm512_storeu_ps(dfPtr4+320+1638400*i16+24576*j11+24576*s13+768*k56, out235);
_mm512_storeu_ps(dfPtr4+448+1638400*i16+24576*j11+24576*s13+768*k56, out243);
_mm512_storeu_ps(dfPtr4+409856+1638400*i16+24576*j11+24576*s13+768*k56, out232);
_mm512_storeu_ps(dfPtr4+409984+1638400*i16+24576*j11+24576*s13+768*k56, out240);
_mm512_storeu_ps(dfPtr4+409920+1638400*i16+24576*j11+24576*s13+768*k56, out236);
_mm512_storeu_ps(dfPtr4+410048+1638400*i16+24576*j11+24576*s13+768*k56, out244);
_mm512_storeu_ps(dfPtr4+819456+1638400*i16+24576*j11+24576*s13+768*k56, out233);
_mm512_storeu_ps(dfPtr4+819584+1638400*i16+24576*j11+24576*s13+768*k56, out241);
_mm512_storeu_ps(dfPtr4+819520+1638400*i16+24576*j11+24576*s13+768*k56, out237);
_mm512_storeu_ps(dfPtr4+819648+1638400*i16+24576*j11+24576*s13+768*k56, out245);
_mm512_storeu_ps(dfPtr4+1229056+1638400*i16+24576*j11+24576*s13+768*k56, out234);
_mm512_storeu_ps(dfPtr4+1229184+1638400*i16+24576*j11+24576*s13+768*k56, out242);
_mm512_storeu_ps(dfPtr4+1229120+1638400*i16+24576*j11+24576*s13+768*k56, out238);
_mm512_storeu_ps(dfPtr4+1229248+1638400*i16+24576*j11+24576*s13+768*k56, out246);
// ---- Tile group stored at dfPtr4 offsets +512/+576/+640/+704 ----
// Stage 1: load 8 consecutive rows (row offsets 12656 + 224*r, r = 0..7).
// Each row is fetched with two zero-masking loads that are 48 apart:
// mask 16383 keeps lanes 0..13, mask 511 keeps lanes 0..8; the other lanes read as 0.
// Every loaded vector goes through max(0, x), i.e. ReLU is applied on load.
// NOTE(review): the 4*w27 term suggests datPtr5 offsets are in bytes (224 = 56 floats
// per row, 48 = 12 floats) -- confirm against the declaration of datPtr5.
__m512 dat1135 = _mm512_maskz_loadu_ps(16383, datPtr5+12656+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1135 = _mm512_max_ps(_mm512_setzero_ps(), dat1135);
__m512 dat1136 = _mm512_maskz_loadu_ps(511, datPtr5+12704+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1136 = _mm512_max_ps(_mm512_setzero_ps(), dat1136);
// pm88: lanes 0..7 <- source lanes 0..7, lanes 8..15 <- source lanes 6..13
// (two 8-wide windows that overlap by two elements).
__m512i pm88 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in228 = _mm512_permutexvar_ps(pm88, dat1135);
// pm89: lanes 0..7 <- source lanes 0..7, lanes 8..10 <- source lanes 6..8,
// lanes 11..15 <- source lane 15, which the 511 mask has zeroed (zero fill on the right).
__m512i pm89 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in236 = _mm512_permutexvar_ps(pm89, dat1136);
__m512 dat1137 = _mm512_maskz_loadu_ps(16383, datPtr5+12880+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1137 = _mm512_max_ps(_mm512_setzero_ps(), dat1137);
__m512 dat1138 = _mm512_maskz_loadu_ps(511, datPtr5+12928+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1138 = _mm512_max_ps(_mm512_setzero_ps(), dat1138);
__m512 in229 = _mm512_permutexvar_ps(pm88, dat1137);
__m512 in237 = _mm512_permutexvar_ps(pm89, dat1138);
__m512 dat1139 = _mm512_maskz_loadu_ps(16383, datPtr5+13104+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1139 = _mm512_max_ps(_mm512_setzero_ps(), dat1139);
__m512 dat1140 = _mm512_maskz_loadu_ps(511, datPtr5+13152+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1140 = _mm512_max_ps(_mm512_setzero_ps(), dat1140);
__m512 in230 = _mm512_permutexvar_ps(pm88, dat1139);
__m512 in238 = _mm512_permutexvar_ps(pm89, dat1140);
__m512 dat1141 = _mm512_maskz_loadu_ps(16383, datPtr5+13328+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1141 = _mm512_max_ps(_mm512_setzero_ps(), dat1141);
__m512 dat1142 = _mm512_maskz_loadu_ps(511, datPtr5+13376+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1142 = _mm512_max_ps(_mm512_setzero_ps(), dat1142);
__m512 in231 = _mm512_permutexvar_ps(pm88, dat1141);
__m512 in239 = _mm512_permutexvar_ps(pm89, dat1142);
__m512 dat1143 = _mm512_maskz_loadu_ps(16383, datPtr5+13552+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1143 = _mm512_max_ps(_mm512_setzero_ps(), dat1143);
__m512 dat1144 = _mm512_maskz_loadu_ps(511, datPtr5+13600+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1144 = _mm512_max_ps(_mm512_setzero_ps(), dat1144);
__m512 in232 = _mm512_permutexvar_ps(pm88, dat1143);
__m512 in240 = _mm512_permutexvar_ps(pm89, dat1144);
__m512 dat1145 = _mm512_maskz_loadu_ps(16383, datPtr5+13776+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1145 = _mm512_max_ps(_mm512_setzero_ps(), dat1145);
__m512 dat1146 = _mm512_maskz_loadu_ps(511, datPtr5+13824+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1146 = _mm512_max_ps(_mm512_setzero_ps(), dat1146);
__m512 in233 = _mm512_permutexvar_ps(pm88, dat1145);
__m512 in241 = _mm512_permutexvar_ps(pm89, dat1146);
__m512 dat1147 = _mm512_maskz_loadu_ps(16383, datPtr5+14000+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1147 = _mm512_max_ps(_mm512_setzero_ps(), dat1147);
__m512 dat1148 = _mm512_maskz_loadu_ps(511, datPtr5+14048+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1148 = _mm512_max_ps(_mm512_setzero_ps(), dat1148);
__m512 in234 = _mm512_permutexvar_ps(pm88, dat1147);
__m512 in242 = _mm512_permutexvar_ps(pm89, dat1148);
__m512 dat1149 = _mm512_maskz_loadu_ps(16383, datPtr5+14224+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1149 = _mm512_max_ps(_mm512_setzero_ps(), dat1149);
__m512 dat1150 = _mm512_maskz_loadu_ps(511, datPtr5+14272+806912*i16+224*h24+4*w27+806912*s13+25216*k56);
dat1150 = _mm512_max_ps(_mm512_setzero_ps(), dat1150);
__m512 in235 = _mm512_permutexvar_ps(pm88, dat1149);
__m512 in243 = _mm512_permutexvar_ps(pm89, dat1150);
// Stage 2: first 8-point transform, applied across the eight rows, lane by lane.
// Statements come in pairs: the first handles in228..in235 (d0..d7), the second
// handles in236..in243 with the same formula. Net result (a = first family / b = second):
//   y0 = d0 - d6 + 5.25*(d4 - d2)                                   -> in228   / in236
//   y1 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)                  -> tmp1199 / tmp1203
//   y2 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)                  -> tmp1200 / tmp1204
//   y3 = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)      -> in234   / in242
//   y4 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)      -> tmp1198 / tmp1202
//   y5 = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)            -> in230   / in238
//   y6 = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)            -> in232   / in240
//   y7 = d7 - d1 + 5.25*(d3 - d5)                                   -> in231   / in239
// These coefficients match the B^T matrix commonly used for the Winograd
// F(6x6, 3x3) input transform.
__m512 tmp1197 = _mm512_add_ps(in229, in233);
__m512 tmp1201 = _mm512_add_ps(in237, in241);
__m512 tmp1198 = _mm512_sub_ps(in232, in230);
__m512 tmp1202 = _mm512_sub_ps(in240, in238);
__m512 tmp1199 = _mm512_add_ps(in230, in234);
__m512 tmp1203 = _mm512_add_ps(in238, in242);
in228 = _mm512_sub_ps(in228, in234);
in236 = _mm512_sub_ps(in236, in242);
tmp1197 = _mm512_fmadd_ps(in231, _mm512_set1_ps(-4.25e+00f), tmp1197);
tmp1201 = _mm512_fmadd_ps(in239, _mm512_set1_ps(-4.25e+00f), tmp1201);
tmp1199 = _mm512_fmadd_ps(in232, _mm512_set1_ps(-4.25e+00f), tmp1199);
tmp1203 = _mm512_fmadd_ps(in240, _mm512_set1_ps(-4.25e+00f), tmp1203);
in228 = _mm512_fmadd_ps(tmp1198, _mm512_set1_ps(5.25e+00f), in228);
in236 = _mm512_fmadd_ps(tmp1202, _mm512_set1_ps(5.25e+00f), in236);
tmp1198 = _mm512_fmadd_ps(in230, _mm512_set1_ps(2.5e-01f), in234);
tmp1202 = _mm512_fmadd_ps(in238, _mm512_set1_ps(2.5e-01f), in242);
in230 = _mm512_fmadd_ps(in230, _mm512_set1_ps(4e+00f), in234);
in238 = _mm512_fmadd_ps(in238, _mm512_set1_ps(4e+00f), in242);
__m512 tmp1200 = _mm512_sub_ps(tmp1199, tmp1197);
__m512 tmp1204 = _mm512_sub_ps(tmp1203, tmp1201);
tmp1199 = _mm512_add_ps(tmp1197, tmp1199);
tmp1203 = _mm512_add_ps(tmp1201, tmp1203);
tmp1197 = _mm512_fmadd_ps(in229, _mm512_set1_ps(2.5e-01f), in233);
tmp1201 = _mm512_fmadd_ps(in237, _mm512_set1_ps(2.5e-01f), in241);
tmp1198 = _mm512_fmadd_ps(in232, _mm512_set1_ps(-1.25e+00f), tmp1198);
tmp1202 = _mm512_fmadd_ps(in240, _mm512_set1_ps(-1.25e+00f), tmp1202);
in232 = _mm512_fmadd_ps(in232, _mm512_set1_ps(-5e+00f), in230);
in240 = _mm512_fmadd_ps(in240, _mm512_set1_ps(-5e+00f), in238);
tmp1197 = _mm512_fmadd_ps(in231, _mm512_set1_ps(-1.25e+00f), tmp1197);
tmp1201 = _mm512_fmadd_ps(in239, _mm512_set1_ps(-1.25e+00f), tmp1201);
in234 = _mm512_fmadd_ps(tmp1197, _mm512_set1_ps(2e+00f), tmp1198);
in242 = _mm512_fmadd_ps(tmp1201, _mm512_set1_ps(2e+00f), tmp1202);
tmp1198 = _mm512_fnmadd_ps(tmp1197, _mm512_set1_ps(2e+00f), tmp1198);
tmp1202 = _mm512_fnmadd_ps(tmp1201, _mm512_set1_ps(2e+00f), tmp1202);
tmp1197 = _mm512_fmadd_ps(in233, _mm512_set1_ps(2.5e-01f), in229);
tmp1201 = _mm512_fmadd_ps(in241, _mm512_set1_ps(2.5e-01f), in237);
in229 = _mm512_sub_ps(in235, in229);
in237 = _mm512_sub_ps(in243, in237);
tmp1197 = _mm512_fmadd_ps(in231, _mm512_set1_ps(-1.25e+00f), tmp1197);
tmp1201 = _mm512_fmadd_ps(in239, _mm512_set1_ps(-1.25e+00f), tmp1201);
in231 = _mm512_sub_ps(in231, in233);
in239 = _mm512_sub_ps(in239, in241);
in231 = _mm512_fmadd_ps(in231, _mm512_set1_ps(5.25e+00f), in229);
in239 = _mm512_fmadd_ps(in239, _mm512_set1_ps(5.25e+00f), in237);
in230 = _mm512_fmadd_ps(tmp1197, _mm512_set1_ps(2e+00f), in232);
in238 = _mm512_fmadd_ps(tmp1201, _mm512_set1_ps(2e+00f), in240);
in232 = _mm512_fnmadd_ps(tmp1197, _mm512_set1_ps(2e+00f), in232);
in240 = _mm512_fnmadd_ps(tmp1201, _mm512_set1_ps(2e+00f), in240);
// Stage 3: transpose the four 8x8 blocks (8 transform outputs x 8 lanes each).
// Step 3a: interleave output pairs (y0,y1), (y2,y3), (y4,y5), (y6,y7) within 128-bit lanes.
__m512 tmp1213 = _mm512_unpacklo_ps(in228, tmp1199);
__m512 tmp1214 = _mm512_unpackhi_ps(in228, tmp1199);
__m512 tmp1215 = _mm512_unpacklo_ps(tmp1200, in234);
__m512 tmp1216 = _mm512_unpackhi_ps(tmp1200, in234);
__m512 tmp1217 = _mm512_unpacklo_ps(tmp1198, in230);
__m512 tmp1218 = _mm512_unpackhi_ps(tmp1198, in230);
__m512 tmp1219 = _mm512_unpacklo_ps(in232, in231);
__m512 tmp1220 = _mm512_unpackhi_ps(in232, in231);
__m512 tmp1221 = _mm512_unpacklo_ps(in236, tmp1203);
__m512 tmp1222 = _mm512_unpackhi_ps(in236, tmp1203);
__m512 tmp1223 = _mm512_unpacklo_ps(tmp1204, in242);
__m512 tmp1224 = _mm512_unpackhi_ps(tmp1204, in242);
__m512 tmp1225 = _mm512_unpacklo_ps(tmp1202, in238);
__m512 tmp1226 = _mm512_unpackhi_ps(tmp1202, in238);
__m512 tmp1227 = _mm512_unpacklo_ps(in240, in239);
__m512 tmp1228 = _mm512_unpackhi_ps(in240, in239);
// Step 3b: shuffle_ps 68 / 238 pick the low / high 64-bit halves of both operands,
// so each 128-bit lane now holds one source lane across four outputs (y0..y3 or y4..y7).
__m512 tmp1229 = _mm512_shuffle_ps(tmp1213, tmp1215, 68);
__m512 tmp1230 = _mm512_shuffle_ps(tmp1213, tmp1215, 238);
__m512 tmp1231 = _mm512_shuffle_ps(tmp1214, tmp1216, 68);
__m512 tmp1232 = _mm512_shuffle_ps(tmp1214, tmp1216, 238);
__m512 tmp1233 = _mm512_shuffle_ps(tmp1217, tmp1219, 68);
__m512 tmp1234 = _mm512_shuffle_ps(tmp1217, tmp1219, 238);
__m512 tmp1235 = _mm512_shuffle_ps(tmp1218, tmp1220, 68);
__m512 tmp1236 = _mm512_shuffle_ps(tmp1218, tmp1220, 238);
__m512 tmp1237 = _mm512_shuffle_ps(tmp1221, tmp1223, 68);
__m512 tmp1238 = _mm512_shuffle_ps(tmp1221, tmp1223, 238);
__m512 tmp1239 = _mm512_shuffle_ps(tmp1222, tmp1224, 68);
__m512 tmp1240 = _mm512_shuffle_ps(tmp1222, tmp1224, 238);
__m512 tmp1241 = _mm512_shuffle_ps(tmp1225, tmp1227, 68);
__m512 tmp1242 = _mm512_shuffle_ps(tmp1225, tmp1227, 238);
__m512 tmp1243 = _mm512_shuffle_ps(tmp1226, tmp1228, 68);
__m512 tmp1244 = _mm512_shuffle_ps(tmp1226, tmp1228, 238);
// Step 3c: shuffle_f32x4 136 picks 128-bit lanes 0 and 2 of each operand, 221 picks lanes 1 and 3;
// this joins the y0..y3 and y4..y7 quarters belonging to the same source lane.
__m512 tmp1245 = _mm512_shuffle_f32x4(tmp1229, tmp1233, 136);
__m512 tmp1246 = _mm512_shuffle_f32x4(tmp1229, tmp1233, 221);
__m512 tmp1247 = _mm512_shuffle_f32x4(tmp1230, tmp1234, 136);
__m512 tmp1248 = _mm512_shuffle_f32x4(tmp1230, tmp1234, 221);
__m512 tmp1249 = _mm512_shuffle_f32x4(tmp1231, tmp1235, 136);
__m512 tmp1250 = _mm512_shuffle_f32x4(tmp1231, tmp1235, 221);
__m512 tmp1251 = _mm512_shuffle_f32x4(tmp1232, tmp1236, 136);
__m512 tmp1252 = _mm512_shuffle_f32x4(tmp1232, tmp1236, 221);
__m512 tmp1253 = _mm512_shuffle_f32x4(tmp1237, tmp1241, 136);
__m512 tmp1254 = _mm512_shuffle_f32x4(tmp1237, tmp1241, 221);
__m512 tmp1255 = _mm512_shuffle_f32x4(tmp1238, tmp1242, 136);
__m512 tmp1256 = _mm512_shuffle_f32x4(tmp1238, tmp1242, 221);
__m512 tmp1257 = _mm512_shuffle_f32x4(tmp1239, tmp1243, 136);
__m512 tmp1258 = _mm512_shuffle_f32x4(tmp1239, tmp1243, 221);
__m512 tmp1259 = _mm512_shuffle_f32x4(tmp1240, tmp1244, 136);
__m512 tmp1260 = _mm512_shuffle_f32x4(tmp1240, tmp1244, 221);
// Step 3d: final lane merge. Afterwards the registers, in the order
//   in228, tmp1199, tmp1200, in234, tmp1198, in230, in232, in231
// hold source lanes 0..7 of the first window; each register carries y0..y7 of the
// first-load window in lanes 0..7 and y0..y7 of the second-load window in lanes 8..15.
// The partner registers (in236, tmp1203, tmp1204, in242, tmp1202, in238, in240, in239)
// hold the same for the windows that lived in lanes 8..15 before the transpose.
in228 = _mm512_shuffle_f32x4(tmp1245, tmp1253, 136);
in236 = _mm512_shuffle_f32x4(tmp1245, tmp1253, 221);
tmp1199 = _mm512_shuffle_f32x4(tmp1247, tmp1255, 136);
tmp1203 = _mm512_shuffle_f32x4(tmp1247, tmp1255, 221);
tmp1200 = _mm512_shuffle_f32x4(tmp1249, tmp1257, 136);
tmp1204 = _mm512_shuffle_f32x4(tmp1249, tmp1257, 221);
in234 = _mm512_shuffle_f32x4(tmp1251, tmp1259, 136);
in242 = _mm512_shuffle_f32x4(tmp1251, tmp1259, 221);
tmp1198 = _mm512_shuffle_f32x4(tmp1246, tmp1254, 136);
tmp1202 = _mm512_shuffle_f32x4(tmp1246, tmp1254, 221);
in230 = _mm512_shuffle_f32x4(tmp1248, tmp1256, 136);
in238 = _mm512_shuffle_f32x4(tmp1248, tmp1256, 221);
in232 = _mm512_shuffle_f32x4(tmp1250, tmp1258, 136);
in240 = _mm512_shuffle_f32x4(tmp1250, tmp1258, 221);
in231 = _mm512_shuffle_f32x4(tmp1252, tmp1260, 136);
in239 = _mm512_shuffle_f32x4(tmp1252, tmp1260, 221);
// Stage 4: second 8-point transform, same formulas as stage 2, now with
// d0..d7 = in228, tmp1199, tmp1200, in234, tmp1198, in230, in232, in231
// (and the partner registers in the second statement of each pair). Results:
//   z0 -> in228 / in236      z1 -> tmp1207 / tmp1211
//   z2 -> tmp1208 / tmp1212  z3 -> in232 / in240
//   z4 -> tmp1206 / tmp1210  z5 -> tmp1200 / tmp1204
//   z6 -> tmp1198 / tmp1202  z7 -> in234 / in242
__m512 tmp1205 = _mm512_add_ps(tmp1199, in230);
__m512 tmp1209 = _mm512_add_ps(tmp1203, in238);
__m512 tmp1206 = _mm512_sub_ps(tmp1198, tmp1200);
__m512 tmp1210 = _mm512_sub_ps(tmp1202, tmp1204);
__m512 tmp1207 = _mm512_add_ps(tmp1200, in232);
__m512 tmp1211 = _mm512_add_ps(tmp1204, in240);
in228 = _mm512_sub_ps(in228, in232);
in236 = _mm512_sub_ps(in236, in240);
tmp1205 = _mm512_fmadd_ps(in234, _mm512_set1_ps(-4.25e+00f), tmp1205);
tmp1209 = _mm512_fmadd_ps(in242, _mm512_set1_ps(-4.25e+00f), tmp1209);
tmp1207 = _mm512_fmadd_ps(tmp1198, _mm512_set1_ps(-4.25e+00f), tmp1207);
tmp1211 = _mm512_fmadd_ps(tmp1202, _mm512_set1_ps(-4.25e+00f), tmp1211);
in228 = _mm512_fmadd_ps(tmp1206, _mm512_set1_ps(5.25e+00f), in228);
in236 = _mm512_fmadd_ps(tmp1210, _mm512_set1_ps(5.25e+00f), in236);
tmp1206 = _mm512_fmadd_ps(tmp1200, _mm512_set1_ps(2.5e-01f), in232);
tmp1210 = _mm512_fmadd_ps(tmp1204, _mm512_set1_ps(2.5e-01f), in240);
tmp1200 = _mm512_fmadd_ps(tmp1200, _mm512_set1_ps(4e+00f), in232);
tmp1204 = _mm512_fmadd_ps(tmp1204, _mm512_set1_ps(4e+00f), in240);
__m512 tmp1208 = _mm512_sub_ps(tmp1207, tmp1205);
__m512 tmp1212 = _mm512_sub_ps(tmp1211, tmp1209);
tmp1207 = _mm512_add_ps(tmp1205, tmp1207);
tmp1211 = _mm512_add_ps(tmp1209, tmp1211);
tmp1205 = _mm512_fmadd_ps(tmp1199, _mm512_set1_ps(2.5e-01f), in230);
tmp1209 = _mm512_fmadd_ps(tmp1203, _mm512_set1_ps(2.5e-01f), in238);
tmp1206 = _mm512_fmadd_ps(tmp1198, _mm512_set1_ps(-1.25e+00f), tmp1206);
tmp1210 = _mm512_fmadd_ps(tmp1202, _mm512_set1_ps(-1.25e+00f), tmp1210);
tmp1198 = _mm512_fmadd_ps(tmp1198, _mm512_set1_ps(-5e+00f), tmp1200);
tmp1202 = _mm512_fmadd_ps(tmp1202, _mm512_set1_ps(-5e+00f), tmp1204);
tmp1205 = _mm512_fmadd_ps(in234, _mm512_set1_ps(-1.25e+00f), tmp1205);
tmp1209 = _mm512_fmadd_ps(in242, _mm512_set1_ps(-1.25e+00f), tmp1209);
in232 = _mm512_fmadd_ps(tmp1205, _mm512_set1_ps(2e+00f), tmp1206);
in240 = _mm512_fmadd_ps(tmp1209, _mm512_set1_ps(2e+00f), tmp1210);
tmp1206 = _mm512_fnmadd_ps(tmp1205, _mm512_set1_ps(2e+00f), tmp1206);
tmp1210 = _mm512_fnmadd_ps(tmp1209, _mm512_set1_ps(2e+00f), tmp1210);
tmp1205 = _mm512_fmadd_ps(in230, _mm512_set1_ps(2.5e-01f), tmp1199);
tmp1209 = _mm512_fmadd_ps(in238, _mm512_set1_ps(2.5e-01f), tmp1203);
tmp1199 = _mm512_sub_ps(in231, tmp1199);
tmp1203 = _mm512_sub_ps(in239, tmp1203);
tmp1205 = _mm512_fmadd_ps(in234, _mm512_set1_ps(-1.25e+00f), tmp1205);
tmp1209 = _mm512_fmadd_ps(in242, _mm512_set1_ps(-1.25e+00f), tmp1209);
in234 = _mm512_sub_ps(in234, in230);
in242 = _mm512_sub_ps(in242, in238);
in234 = _mm512_fmadd_ps(in234, _mm512_set1_ps(5.25e+00f), tmp1199);
in242 = _mm512_fmadd_ps(in242, _mm512_set1_ps(5.25e+00f), tmp1203);
tmp1200 = _mm512_fmadd_ps(tmp1205, _mm512_set1_ps(2e+00f), tmp1198);
tmp1204 = _mm512_fmadd_ps(tmp1209, _mm512_set1_ps(2e+00f), tmp1202);
tmp1198 = _mm512_fnmadd_ps(tmp1205, _mm512_set1_ps(2e+00f), tmp1198);
tmp1202 = _mm512_fnmadd_ps(tmp1209, _mm512_set1_ps(2e+00f), tmp1202);
// Stage 5: regroup for storing. shuffle_f32x4 68 joins the low 256 bits of both
// operands, 238 the high 256 bits, so every out* vector is [z(2k) | z(2k+1)] of a
// single window: out247..out250 / out255..out258 come from the in228 family,
// out251..out254 / out259..out262 from the in236 family.
__m512 out247 = _mm512_shuffle_f32x4(in228, tmp1207, 68);
__m512 out255 = _mm512_shuffle_f32x4(in228, tmp1207, 238);
__m512 out248 = _mm512_shuffle_f32x4(tmp1208, in232, 68);
__m512 out256 = _mm512_shuffle_f32x4(tmp1208, in232, 238);
__m512 out249 = _mm512_shuffle_f32x4(tmp1206, tmp1200, 68);
__m512 out257 = _mm512_shuffle_f32x4(tmp1206, tmp1200, 238);
__m512 out250 = _mm512_shuffle_f32x4(tmp1198, in234, 68);
__m512 out258 = _mm512_shuffle_f32x4(tmp1198, in234, 238);
__m512 out251 = _mm512_shuffle_f32x4(in236, tmp1211, 68);
__m512 out259 = _mm512_shuffle_f32x4(in236, tmp1211, 238);
__m512 out252 = _mm512_shuffle_f32x4(tmp1212, in240, 68);
__m512 out260 = _mm512_shuffle_f32x4(tmp1212, in240, 238);
__m512 out253 = _mm512_shuffle_f32x4(tmp1210, tmp1204, 68);
__m512 out261 = _mm512_shuffle_f32x4(tmp1210, tmp1204, 238);
__m512 out254 = _mm512_shuffle_f32x4(tmp1202, in242, 68);
__m512 out262 = _mm512_shuffle_f32x4(tmp1202, in242, 238);
// Stage 6: write the 16 result vectors to dfPtr4. The (z0,z1), (z2,z3), (z4,z5), (z6,z7)
// pairs land 409600 apart (bases 512, 410112, 819712, 1229312); within each base the four
// windows occupy offsets +0, +64, +128, +192 (the statements are issued in +0, +128, +64, +192 order).
// NOTE(review): a spacing of 64 equals one 16-float vector if dfPtr4 offsets are bytes -- confirm.
_mm512_storeu_ps(dfPtr4+512+1638400*i16+24576*j11+24576*s13+768*k56, out247);
_mm512_storeu_ps(dfPtr4+640+1638400*i16+24576*j11+24576*s13+768*k56, out255);
_mm512_storeu_ps(dfPtr4+576+1638400*i16+24576*j11+24576*s13+768*k56, out251);
_mm512_storeu_ps(dfPtr4+704+1638400*i16+24576*j11+24576*s13+768*k56, out259);
_mm512_storeu_ps(dfPtr4+410112+1638400*i16+24576*j11+24576*s13+768*k56, out248);
_mm512_storeu_ps(dfPtr4+410240+1638400*i16+24576*j11+24576*s13+768*k56, out256);
_mm512_storeu_ps(dfPtr4+410176+1638400*i16+24576*j11+24576*s13+768*k56, out252);
_mm512_storeu_ps(dfPtr4+410304+1638400*i16+24576*j11+24576*s13+768*k56, out260);
_mm512_storeu_ps(dfPtr4+819712+1638400*i16+24576*j11+24576*s13+768*k56, out249);
_mm512_storeu_ps(dfPtr4+819840+1638400*i16+24576*j11+24576*s13+768*k56, out257);
_mm512_storeu_ps(dfPtr4+819776+1638400*i16+24576*j11+24576*s13+768*k56, out253);
_mm512_storeu_ps(dfPtr4+819904+1638400*i16+24576*j11+24576*s13+768*k56, out261);
_mm512_storeu_ps(dfPtr4+1229312+1638400*i16+24576*j11+24576*s13+768*k56, out250);
_mm512_storeu_ps(dfPtr4+1229440+1638400*i16+24576*j11+24576*s13+768*k56, out258);
_mm512_storeu_ps(dfPtr4+1229376+1638400*i16+24576*j11+24576*s13+768*k56, out254);
_mm512_storeu_ps(dfPtr4+1229504+1638400*i16+24576*j11+24576*s13+768*k56, out262);
}
// The function returns as soon as j11 has reached last3.
if (j11 >= last3) return;
// Otherwise advance j11 and leave the enclosing construct once it reaches 15.
++j11;
if (j11 >= 15) break;
// NOTE(review): rel8 is set to 3 for whatever runs next; the enclosing construct's
// header and the meaning of rel8 are outside this excerpt -- confirm there.
rel8 = 3;
}
// Branch taken only when rel8 < 4. It uses h25 = base8+12 for the 224-scaled term
// and w28 = 0 for the 4-scaled term, and iterates k57 over 0..31.
if (rel8 < 4) {
ptrdiff_t h25 = base8+12;
ptrdiff_t w28 = 0;
ptrdiff_t k57 = 0;
for (; k57 != 32; ++k57) {
// Load 8 consecutive rows (row offsets 4 + 224*r and 48 + 224*r, r = 0..7) with two
// zero-masking loads per row: mask 8191 keeps lanes 0..12, mask 16383 keeps lanes 0..13.
// Every loaded vector goes through max(0, x), i.e. ReLU is applied on load.
// NOTE(review): the 4*w28 term suggests datPtr5 offsets are in bytes -- confirm.
__m512 dat1151 = _mm512_maskz_loadu_ps(8191, datPtr5+4+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1151 = _mm512_max_ps(_mm512_setzero_ps(), dat1151);
__m512 dat1152 = _mm512_maskz_loadu_ps(16383, datPtr5+48+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1152 = _mm512_max_ps(_mm512_setzero_ps(), dat1152);
// pm90: lane 0 <- source lane 15 (zeroed by the 8191 mask, so a zero is placed in front
// of the first loaded element), lanes 1..7 <- source lanes 0..6, lanes 8..15 <- source lanes 5..12.
__m512i pm90 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in244 = _mm512_permutexvar_ps(pm90, dat1151);
// pm91: lanes 0..7 <- source lanes 0..7, lanes 8..15 <- source lanes 6..13.
__m512i pm91 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in252 = _mm512_permutexvar_ps(pm91, dat1152);
__m512 dat1153 = _mm512_maskz_loadu_ps(8191, datPtr5+228+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1153 = _mm512_max_ps(_mm512_setzero_ps(), dat1153);
__m512 dat1154 = _mm512_maskz_loadu_ps(16383, datPtr5+272+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1154 = _mm512_max_ps(_mm512_setzero_ps(), dat1154);
__m512 in245 = _mm512_permutexvar_ps(pm90, dat1153);
__m512 in253 = _mm512_permutexvar_ps(pm91, dat1154);
__m512 dat1155 = _mm512_maskz_loadu_ps(8191, datPtr5+452+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1155 = _mm512_max_ps(_mm512_setzero_ps(), dat1155);
__m512 dat1156 = _mm512_maskz_loadu_ps(16383, datPtr5+496+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1156 = _mm512_max_ps(_mm512_setzero_ps(), dat1156);
__m512 in246 = _mm512_permutexvar_ps(pm90, dat1155);
__m512 in254 = _mm512_permutexvar_ps(pm91, dat1156);
__m512 dat1157 = _mm512_maskz_loadu_ps(8191, datPtr5+676+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1157 = _mm512_max_ps(_mm512_setzero_ps(), dat1157);
__m512 dat1158 = _mm512_maskz_loadu_ps(16383, datPtr5+720+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1158 = _mm512_max_ps(_mm512_setzero_ps(), dat1158);
__m512 in247 = _mm512_permutexvar_ps(pm90, dat1157);
__m512 in255 = _mm512_permutexvar_ps(pm91, dat1158);
__m512 dat1159 = _mm512_maskz_loadu_ps(8191, datPtr5+900+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1159 = _mm512_max_ps(_mm512_setzero_ps(), dat1159);
__m512 dat1160 = _mm512_maskz_loadu_ps(16383, datPtr5+944+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1160 = _mm512_max_ps(_mm512_setzero_ps(), dat1160);
__m512 in248 = _mm512_permutexvar_ps(pm90, dat1159);
__m512 in256 = _mm512_permutexvar_ps(pm91, dat1160);
__m512 dat1161 = _mm512_maskz_loadu_ps(8191, datPtr5+1124+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1161 = _mm512_max_ps(_mm512_setzero_ps(), dat1161);
__m512 dat1162 = _mm512_maskz_loadu_ps(16383, datPtr5+1168+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1162 = _mm512_max_ps(_mm512_setzero_ps(), dat1162);
__m512 in249 = _mm512_permutexvar_ps(pm90, dat1161);
__m512 in257 = _mm512_permutexvar_ps(pm91, dat1162);
__m512 dat1163 = _mm512_maskz_loadu_ps(8191, datPtr5+1348+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1163 = _mm512_max_ps(_mm512_setzero_ps(), dat1163);
__m512 dat1164 = _mm512_maskz_loadu_ps(16383, datPtr5+1392+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1164 = _mm512_max_ps(_mm512_setzero_ps(), dat1164);
__m512 in250 = _mm512_permutexvar_ps(pm90, dat1163);
__m512 in258 = _mm512_permutexvar_ps(pm91, dat1164);
__m512 dat1165 = _mm512_maskz_loadu_ps(8191, datPtr5+1572+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1165 = _mm512_max_ps(_mm512_setzero_ps(), dat1165);
__m512 dat1166 = _mm512_maskz_loadu_ps(16383, datPtr5+1616+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1166 = _mm512_max_ps(_mm512_setzero_ps(), dat1166);
__m512 in251 = _mm512_permutexvar_ps(pm90, dat1165);
__m512 in259 = _mm512_permutexvar_ps(pm91, dat1166);
__m512 tmp1261 = _mm512_add_ps(in245, in249);
__m512 tmp1265 = _mm512_add_ps(in253, in257);
__m512 tmp1262 = _mm512_sub_ps(in248, in246);
__m512 tmp1266 = _mm512_sub_ps(in256, in254);
__m512 tmp1263 = _mm512_add_ps(in246, in250);
__m512 tmp1267 = _mm512_add_ps(in254, in258);
in244 = _mm512_sub_ps(in244, in250);
in252 = _mm512_sub_ps(in252, in258);
tmp1261 = _mm512_fmadd_ps(in247, _mm512_set1_ps(-4.25e+00f), tmp1261);
tmp1265 = _mm512_fmadd_ps(in255, _mm512_set1_ps(-4.25e+00f), tmp1265);
tmp1263 = _mm512_fmadd_ps(in248, _mm512_set1_ps(-4.25e+00f), tmp1263);
tmp1267 = _mm512_fmadd_ps(in256, _mm512_set1_ps(-4.25e+00f), tmp1267);
in244 = _mm512_fmadd_ps(tmp1262, _mm512_set1_ps(5.25e+00f), in244);
in252 = _mm512_fmadd_ps(tmp1266, _mm512_set1_ps(5.25e+00f), in252);
tmp1262 = _mm512_fmadd_ps(in246, _mm512_set1_ps(2.5e-01f), in250);
tmp1266 = _mm512_fmadd_ps(in254, _mm512_set1_ps(2.5e-01f), in258);
in246 = _mm512_fmadd_ps(in246, _mm512_set1_ps(4e+00f), in250);
in254 = _mm512_fmadd_ps(in254, _mm512_set1_ps(4e+00f), in258);
__m512 tmp1264 = _mm512_sub_ps(tmp1263, tmp1261);
__m512 tmp1268 = _mm512_sub_ps(tmp1267, tmp1265);
tmp1263 = _mm512_add_ps(tmp1261, tmp1263);
tmp1267 = _mm512_add_ps(tmp1265, tmp1267);
tmp1261 = _mm512_fmadd_ps(in245, _mm512_set1_ps(2.5e-01f), in249);
tmp1265 = _mm512_fmadd_ps(in253, _mm512_set1_ps(2.5e-01f), in257);
tmp1262 = _mm512_fmadd_ps(in248, _mm512_set1_ps(-1.25e+00f), tmp1262);
tmp1266 = _mm512_fmadd_ps(in256, _mm512_set1_ps(-1.25e+00f), tmp1266);
in248 = _mm512_fmadd_ps(in248, _mm512_set1_ps(-5e+00f), in246);
in256 = _mm512_fmadd_ps(in256, _mm512_set1_ps(-5e+00f), in254);
tmp1261 = _mm512_fmadd_ps(in247, _mm512_set1_ps(-1.25e+00f), tmp1261);
tmp1265 = _mm512_fmadd_ps(in255, _mm512_set1_ps(-1.25e+00f), tmp1265);
in250 = _mm512_fmadd_ps(tmp1261, _mm512_set1_ps(2e+00f), tmp1262);
in258 = _mm512_fmadd_ps(tmp1265, _mm512_set1_ps(2e+00f), tmp1266);
tmp1262 = _mm512_fnmadd_ps(tmp1261, _mm512_set1_ps(2e+00f), tmp1262);
tmp1266 = _mm512_fnmadd_ps(tmp1265, _mm512_set1_ps(2e+00f), tmp1266);
tmp1261 = _mm512_fmadd_ps(in249, _mm512_set1_ps(2.5e-01f), in245);
tmp1265 = _mm512_fmadd_ps(in257, _mm512_set1_ps(2.5e-01f), in253);
in245 = _mm512_sub_ps(in251, in245);
in253 = _mm512_sub_ps(in259, in253);
tmp1261 = _mm512_fmadd_ps(in247, _mm512_set1_ps(-1.25e+00f), tmp1261);
tmp1265 = _mm512_fmadd_ps(in255, _mm512_set1_ps(-1.25e+00f), tmp1265);
in247 = _mm512_sub_ps(in247, in249);
in255 = _mm512_sub_ps(in255, in257);
in247 = _mm512_fmadd_ps(in247, _mm512_set1_ps(5.25e+00f), in245);
in255 = _mm512_fmadd_ps(in255, _mm512_set1_ps(5.25e+00f), in253);
in246 = _mm512_fmadd_ps(tmp1261, _mm512_set1_ps(2e+00f), in248);
in254 = _mm512_fmadd_ps(tmp1265, _mm512_set1_ps(2e+00f), in256);
in248 = _mm512_fnmadd_ps(tmp1261, _mm512_set1_ps(2e+00f), in248);
in256 = _mm512_fnmadd_ps(tmp1265, _mm512_set1_ps(2e+00f), in256);
__m512 tmp1277 = _mm512_unpacklo_ps(in244, tmp1263);
__m512 tmp1278 = _mm512_unpackhi_ps(in244, tmp1263);
__m512 tmp1279 = _mm512_unpacklo_ps(tmp1264, in250);
__m512 tmp1280 = _mm512_unpackhi_ps(tmp1264, in250);
__m512 tmp1281 = _mm512_unpacklo_ps(tmp1262, in246);
__m512 tmp1282 = _mm512_unpackhi_ps(tmp1262, in246);
__m512 tmp1283 = _mm512_unpacklo_ps(in248, in247);
__m512 tmp1284 = _mm512_unpackhi_ps(in248, in247);
__m512 tmp1285 = _mm512_unpacklo_ps(in252, tmp1267);
__m512 tmp1286 = _mm512_unpackhi_ps(in252, tmp1267);
__m512 tmp1287 = _mm512_unpacklo_ps(tmp1268, in258);
__m512 tmp1288 = _mm512_unpackhi_ps(tmp1268, in258);
__m512 tmp1289 = _mm512_unpacklo_ps(tmp1266, in254);
__m512 tmp1290 = _mm512_unpackhi_ps(tmp1266, in254);
__m512 tmp1291 = _mm512_unpacklo_ps(in256, in255);
__m512 tmp1292 = _mm512_unpackhi_ps(in256, in255);
__m512 tmp1293 = _mm512_shuffle_ps(tmp1277, tmp1279, 68);
__m512 tmp1294 = _mm512_shuffle_ps(tmp1277, tmp1279, 238);
__m512 tmp1295 = _mm512_shuffle_ps(tmp1278, tmp1280, 68);
__m512 tmp1296 = _mm512_shuffle_ps(tmp1278, tmp1280, 238);
__m512 tmp1297 = _mm512_shuffle_ps(tmp1281, tmp1283, 68);
__m512 tmp1298 = _mm512_shuffle_ps(tmp1281, tmp1283, 238);
__m512 tmp1299 = _mm512_shuffle_ps(tmp1282, tmp1284, 68);
__m512 tmp1300 = _mm512_shuffle_ps(tmp1282, tmp1284, 238);
__m512 tmp1301 = _mm512_shuffle_ps(tmp1285, tmp1287, 68);
__m512 tmp1302 = _mm512_shuffle_ps(tmp1285, tmp1287, 238);
__m512 tmp1303 = _mm512_shuffle_ps(tmp1286, tmp1288, 68);
__m512 tmp1304 = _mm512_shuffle_ps(tmp1286, tmp1288, 238);
__m512 tmp1305 = _mm512_shuffle_ps(tmp1289, tmp1291, 68);
__m512 tmp1306 = _mm512_shuffle_ps(tmp1289, tmp1291, 238);
__m512 tmp1307 = _mm512_shuffle_ps(tmp1290, tmp1292, 68);
__m512 tmp1308 = _mm512_shuffle_ps(tmp1290, tmp1292, 238);
__m512 tmp1309 = _mm512_shuffle_f32x4(tmp1293, tmp1297, 136);
__m512 tmp1310 = _mm512_shuffle_f32x4(tmp1293, tmp1297, 221);
__m512 tmp1311 = _mm512_shuffle_f32x4(tmp1294, tmp1298, 136);
__m512 tmp1312 = _mm512_shuffle_f32x4(tmp1294, tmp1298, 221);
__m512 tmp1313 = _mm512_shuffle_f32x4(tmp1295, tmp1299, 136);
__m512 tmp1314 = _mm512_shuffle_f32x4(tmp1295, tmp1299, 221);
__m512 tmp1315 = _mm512_shuffle_f32x4(tmp1296, tmp1300, 136);
__m512 tmp1316 = _mm512_shuffle_f32x4(tmp1296, tmp1300, 221);
__m512 tmp1317 = _mm512_shuffle_f32x4(tmp1301, tmp1305, 136);
__m512 tmp1318 = _mm512_shuffle_f32x4(tmp1301, tmp1305, 221);
__m512 tmp1319 = _mm512_shuffle_f32x4(tmp1302, tmp1306, 136);
__m512 tmp1320 = _mm512_shuffle_f32x4(tmp1302, tmp1306, 221);
__m512 tmp1321 = _mm512_shuffle_f32x4(tmp1303, tmp1307, 136);
__m512 tmp1322 = _mm512_shuffle_f32x4(tmp1303, tmp1307, 221);
__m512 tmp1323 = _mm512_shuffle_f32x4(tmp1304, tmp1308, 136);
__m512 tmp1324 = _mm512_shuffle_f32x4(tmp1304, tmp1308, 221);
in244 = _mm512_shuffle_f32x4(tmp1309, tmp1317, 136);
in252 = _mm512_shuffle_f32x4(tmp1309, tmp1317, 221);
tmp1263 = _mm512_shuffle_f32x4(tmp1311, tmp1319, 136);
tmp1267 = _mm512_shuffle_f32x4(tmp1311, tmp1319, 221);
tmp1264 = _mm512_shuffle_f32x4(tmp1313, tmp1321, 136);
tmp1268 = _mm512_shuffle_f32x4(tmp1313, tmp1321, 221);
in250 = _mm512_shuffle_f32x4(tmp1315, tmp1323, 136);
in258 = _mm512_shuffle_f32x4(tmp1315, tmp1323, 221);
tmp1262 = _mm512_shuffle_f32x4(tmp1310, tmp1318, 136);
tmp1266 = _mm512_shuffle_f32x4(tmp1310, tmp1318, 221);
in246 = _mm512_shuffle_f32x4(tmp1312, tmp1320, 136);
in254 = _mm512_shuffle_f32x4(tmp1312, tmp1320, 221);
in248 = _mm512_shuffle_f32x4(tmp1314, tmp1322, 136);
in256 = _mm512_shuffle_f32x4(tmp1314, tmp1322, 221);
in247 = _mm512_shuffle_f32x4(tmp1316, tmp1324, 136);
in255 = _mm512_shuffle_f32x4(tmp1316, tmp1324, 221);
__m512 tmp1269 = _mm512_add_ps(tmp1263, in246);
__m512 tmp1273 = _mm512_add_ps(tmp1267, in254);
__m512 tmp1270 = _mm512_sub_ps(tmp1262, tmp1264);
__m512 tmp1274 = _mm512_sub_ps(tmp1266, tmp1268);
__m512 tmp1271 = _mm512_add_ps(tmp1264, in248);
__m512 tmp1275 = _mm512_add_ps(tmp1268, in256);
in244 = _mm512_sub_ps(in244, in248);
in252 = _mm512_sub_ps(in252, in256);
tmp1269 = _mm512_fmadd_ps(in250, _mm512_set1_ps(-4.25e+00f), tmp1269);
tmp1273 = _mm512_fmadd_ps(in258, _mm512_set1_ps(-4.25e+00f), tmp1273);
tmp1271 = _mm512_fmadd_ps(tmp1262, _mm512_set1_ps(-4.25e+00f), tmp1271);
tmp1275 = _mm512_fmadd_ps(tmp1266, _mm512_set1_ps(-4.25e+00f), tmp1275);
in244 = _mm512_fmadd_ps(tmp1270, _mm512_set1_ps(5.25e+00f), in244);
in252 = _mm512_fmadd_ps(tmp1274, _mm512_set1_ps(5.25e+00f), in252);
tmp1270 = _mm512_fmadd_ps(tmp1264, _mm512_set1_ps(2.5e-01f), in248);
tmp1274 = _mm512_fmadd_ps(tmp1268, _mm512_set1_ps(2.5e-01f), in256);
tmp1264 = _mm512_fmadd_ps(tmp1264, _mm512_set1_ps(4e+00f), in248);
tmp1268 = _mm512_fmadd_ps(tmp1268, _mm512_set1_ps(4e+00f), in256);
__m512 tmp1272 = _mm512_sub_ps(tmp1271, tmp1269);
__m512 tmp1276 = _mm512_sub_ps(tmp1275, tmp1273);
tmp1271 = _mm512_add_ps(tmp1269, tmp1271);
tmp1275 = _mm512_add_ps(tmp1273, tmp1275);
tmp1269 = _mm512_fmadd_ps(tmp1263, _mm512_set1_ps(2.5e-01f), in246);
tmp1273 = _mm512_fmadd_ps(tmp1267, _mm512_set1_ps(2.5e-01f), in254);
tmp1270 = _mm512_fmadd_ps(tmp1262, _mm512_set1_ps(-1.25e+00f), tmp1270);
tmp1274 = _mm512_fmadd_ps(tmp1266, _mm512_set1_ps(-1.25e+00f), tmp1274);
tmp1262 = _mm512_fmadd_ps(tmp1262, _mm512_set1_ps(-5e+00f), tmp1264);
tmp1266 = _mm512_fmadd_ps(tmp1266, _mm512_set1_ps(-5e+00f), tmp1268);
tmp1269 = _mm512_fmadd_ps(in250, _mm512_set1_ps(-1.25e+00f), tmp1269);
tmp1273 = _mm512_fmadd_ps(in258, _mm512_set1_ps(-1.25e+00f), tmp1273);
in248 = _mm512_fmadd_ps(tmp1269, _mm512_set1_ps(2e+00f), tmp1270);
in256 = _mm512_fmadd_ps(tmp1273, _mm512_set1_ps(2e+00f), tmp1274);
tmp1270 = _mm512_fnmadd_ps(tmp1269, _mm512_set1_ps(2e+00f), tmp1270);
tmp1274 = _mm512_fnmadd_ps(tmp1273, _mm512_set1_ps(2e+00f), tmp1274);
tmp1269 = _mm512_fmadd_ps(in246, _mm512_set1_ps(2.5e-01f), tmp1263);
tmp1273 = _mm512_fmadd_ps(in254, _mm512_set1_ps(2.5e-01f), tmp1267);
tmp1263 = _mm512_sub_ps(in247, tmp1263);
tmp1267 = _mm512_sub_ps(in255, tmp1267);
tmp1269 = _mm512_fmadd_ps(in250, _mm512_set1_ps(-1.25e+00f), tmp1269);
tmp1273 = _mm512_fmadd_ps(in258, _mm512_set1_ps(-1.25e+00f), tmp1273);
in250 = _mm512_sub_ps(in250, in246);
in258 = _mm512_sub_ps(in258, in254);
in250 = _mm512_fmadd_ps(in250, _mm512_set1_ps(5.25e+00f), tmp1263);
in258 = _mm512_fmadd_ps(in258, _mm512_set1_ps(5.25e+00f), tmp1267);
tmp1264 = _mm512_fmadd_ps(tmp1269, _mm512_set1_ps(2e+00f), tmp1262);
tmp1268 = _mm512_fmadd_ps(tmp1273, _mm512_set1_ps(2e+00f), tmp1266);
tmp1262 = _mm512_fnmadd_ps(tmp1269, _mm512_set1_ps(2e+00f), tmp1262);
tmp1266 = _mm512_fnmadd_ps(tmp1273, _mm512_set1_ps(2e+00f), tmp1266);
__m512 out263 = _mm512_shuffle_f32x4(in244, tmp1271, 68);
__m512 out271 = _mm512_shuffle_f32x4(in244, tmp1271, 238);
__m512 out264 = _mm512_shuffle_f32x4(tmp1272, in248, 68);
__m512 out272 = _mm512_shuffle_f32x4(tmp1272, in248, 238);
__m512 out265 = _mm512_shuffle_f32x4(tmp1270, tmp1264, 68);
__m512 out273 = _mm512_shuffle_f32x4(tmp1270, tmp1264, 238);
__m512 out266 = _mm512_shuffle_f32x4(tmp1262, in250, 68);
__m512 out274 = _mm512_shuffle_f32x4(tmp1262, in250, 238);
__m512 out267 = _mm512_shuffle_f32x4(in252, tmp1275, 68);
__m512 out275 = _mm512_shuffle_f32x4(in252, tmp1275, 238);
__m512 out268 = _mm512_shuffle_f32x4(tmp1276, in256, 68);
__m512 out276 = _mm512_shuffle_f32x4(tmp1276, in256, 238);
__m512 out269 = _mm512_shuffle_f32x4(tmp1274, tmp1268, 68);
__m512 out277 = _mm512_shuffle_f32x4(tmp1274, tmp1268, 238);
__m512 out270 = _mm512_shuffle_f32x4(tmp1266, in258, 68);
__m512 out278 = _mm512_shuffle_f32x4(tmp1266, in258, 238);
_mm512_storeu_ps(dfPtr4+0+1638400*i16+24576*j11+24576*s13+768*k57, out263);
_mm512_storeu_ps(dfPtr4+128+1638400*i16+24576*j11+24576*s13+768*k57, out271);
_mm512_storeu_ps(dfPtr4+64+1638400*i16+24576*j11+24576*s13+768*k57, out267);
_mm512_storeu_ps(dfPtr4+192+1638400*i16+24576*j11+24576*s13+768*k57, out275);
_mm512_storeu_ps(dfPtr4+409600+1638400*i16+24576*j11+24576*s13+768*k57, out264);
_mm512_storeu_ps(dfPtr4+409728+1638400*i16+24576*j11+24576*s13+768*k57, out272);
_mm512_storeu_ps(dfPtr4+409664+1638400*i16+24576*j11+24576*s13+768*k57, out268);
_mm512_storeu_ps(dfPtr4+409792+1638400*i16+24576*j11+24576*s13+768*k57, out276);
_mm512_storeu_ps(dfPtr4+819200+1638400*i16+24576*j11+24576*s13+768*k57, out265);
_mm512_storeu_ps(dfPtr4+819328+1638400*i16+24576*j11+24576*s13+768*k57, out273);
_mm512_storeu_ps(dfPtr4+819264+1638400*i16+24576*j11+24576*s13+768*k57, out269);
_mm512_storeu_ps(dfPtr4+819392+1638400*i16+24576*j11+24576*s13+768*k57, out277);
_mm512_storeu_ps(dfPtr4+1228800+1638400*i16+24576*j11+24576*s13+768*k57, out266);
_mm512_storeu_ps(dfPtr4+1228928+1638400*i16+24576*j11+24576*s13+768*k57, out274);
_mm512_storeu_ps(dfPtr4+1228864+1638400*i16+24576*j11+24576*s13+768*k57, out270);
_mm512_storeu_ps(dfPtr4+1228992+1638400*i16+24576*j11+24576*s13+768*k57, out278);
__m512 dat1167 = _mm512_maskz_loadu_ps(16383, datPtr5+96+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1167 = _mm512_max_ps(_mm512_setzero_ps(), dat1167);
__m512 dat1168 = _mm512_maskz_loadu_ps(8191, datPtr5+12612+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1168 = _mm512_max_ps(_mm512_setzero_ps(), dat1168);
__m512i pm92 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in260 = _mm512_permutexvar_ps(pm92, dat1167);
__m512i pm93 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in268 = _mm512_permutexvar_ps(pm93, dat1168);
__m512 dat1169 = _mm512_maskz_loadu_ps(16383, datPtr5+320+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1169 = _mm512_max_ps(_mm512_setzero_ps(), dat1169);
__m512 dat1170 = _mm512_maskz_loadu_ps(8191, datPtr5+12836+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1170 = _mm512_max_ps(_mm512_setzero_ps(), dat1170);
__m512 in261 = _mm512_permutexvar_ps(pm92, dat1169);
__m512 in269 = _mm512_permutexvar_ps(pm93, dat1170);
__m512 dat1171 = _mm512_maskz_loadu_ps(16383, datPtr5+544+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1171 = _mm512_max_ps(_mm512_setzero_ps(), dat1171);
__m512 dat1172 = _mm512_maskz_loadu_ps(8191, datPtr5+13060+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1172 = _mm512_max_ps(_mm512_setzero_ps(), dat1172);
__m512 in262 = _mm512_permutexvar_ps(pm92, dat1171);
__m512 in270 = _mm512_permutexvar_ps(pm93, dat1172);
__m512 dat1173 = _mm512_maskz_loadu_ps(16383, datPtr5+768+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1173 = _mm512_max_ps(_mm512_setzero_ps(), dat1173);
__m512 dat1174 = _mm512_maskz_loadu_ps(8191, datPtr5+13284+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1174 = _mm512_max_ps(_mm512_setzero_ps(), dat1174);
__m512 in263 = _mm512_permutexvar_ps(pm92, dat1173);
__m512 in271 = _mm512_permutexvar_ps(pm93, dat1174);
__m512 dat1175 = _mm512_maskz_loadu_ps(16383, datPtr5+992+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1175 = _mm512_max_ps(_mm512_setzero_ps(), dat1175);
__m512 dat1176 = _mm512_maskz_loadu_ps(8191, datPtr5+13508+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1176 = _mm512_max_ps(_mm512_setzero_ps(), dat1176);
__m512 in264 = _mm512_permutexvar_ps(pm92, dat1175);
__m512 in272 = _mm512_permutexvar_ps(pm93, dat1176);
__m512 dat1177 = _mm512_maskz_loadu_ps(16383, datPtr5+1216+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1177 = _mm512_max_ps(_mm512_setzero_ps(), dat1177);
__m512 dat1178 = _mm512_maskz_loadu_ps(8191, datPtr5+13732+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1178 = _mm512_max_ps(_mm512_setzero_ps(), dat1178);
__m512 in265 = _mm512_permutexvar_ps(pm92, dat1177);
__m512 in273 = _mm512_permutexvar_ps(pm93, dat1178);
__m512 dat1179 = _mm512_maskz_loadu_ps(16383, datPtr5+1440+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1179 = _mm512_max_ps(_mm512_setzero_ps(), dat1179);
__m512 dat1180 = _mm512_maskz_loadu_ps(8191, datPtr5+13956+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1180 = _mm512_max_ps(_mm512_setzero_ps(), dat1180);
__m512 in266 = _mm512_permutexvar_ps(pm92, dat1179);
__m512 in274 = _mm512_permutexvar_ps(pm93, dat1180);
__m512 dat1181 = _mm512_maskz_loadu_ps(16383, datPtr5+1664+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1181 = _mm512_max_ps(_mm512_setzero_ps(), dat1181);
__m512 dat1182 = _mm512_maskz_loadu_ps(8191, datPtr5+14180+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1182 = _mm512_max_ps(_mm512_setzero_ps(), dat1182);
__m512 in267 = _mm512_permutexvar_ps(pm92, dat1181);
__m512 in275 = _mm512_permutexvar_ps(pm93, dat1182);
__m512 tmp1325 = _mm512_add_ps(in261, in265);
__m512 tmp1329 = _mm512_add_ps(in269, in273);
__m512 tmp1326 = _mm512_sub_ps(in264, in262);
__m512 tmp1330 = _mm512_sub_ps(in272, in270);
__m512 tmp1327 = _mm512_add_ps(in262, in266);
__m512 tmp1331 = _mm512_add_ps(in270, in274);
in260 = _mm512_sub_ps(in260, in266);
in268 = _mm512_sub_ps(in268, in274);
tmp1325 = _mm512_fmadd_ps(in263, _mm512_set1_ps(-4.25e+00f), tmp1325);
tmp1329 = _mm512_fmadd_ps(in271, _mm512_set1_ps(-4.25e+00f), tmp1329);
tmp1327 = _mm512_fmadd_ps(in264, _mm512_set1_ps(-4.25e+00f), tmp1327);
tmp1331 = _mm512_fmadd_ps(in272, _mm512_set1_ps(-4.25e+00f), tmp1331);
in260 = _mm512_fmadd_ps(tmp1326, _mm512_set1_ps(5.25e+00f), in260);
in268 = _mm512_fmadd_ps(tmp1330, _mm512_set1_ps(5.25e+00f), in268);
tmp1326 = _mm512_fmadd_ps(in262, _mm512_set1_ps(2.5e-01f), in266);
tmp1330 = _mm512_fmadd_ps(in270, _mm512_set1_ps(2.5e-01f), in274);
in262 = _mm512_fmadd_ps(in262, _mm512_set1_ps(4e+00f), in266);
in270 = _mm512_fmadd_ps(in270, _mm512_set1_ps(4e+00f), in274);
__m512 tmp1328 = _mm512_sub_ps(tmp1327, tmp1325);
__m512 tmp1332 = _mm512_sub_ps(tmp1331, tmp1329);
tmp1327 = _mm512_add_ps(tmp1325, tmp1327);
tmp1331 = _mm512_add_ps(tmp1329, tmp1331);
tmp1325 = _mm512_fmadd_ps(in261, _mm512_set1_ps(2.5e-01f), in265);
tmp1329 = _mm512_fmadd_ps(in269, _mm512_set1_ps(2.5e-01f), in273);
tmp1326 = _mm512_fmadd_ps(in264, _mm512_set1_ps(-1.25e+00f), tmp1326);
tmp1330 = _mm512_fmadd_ps(in272, _mm512_set1_ps(-1.25e+00f), tmp1330);
in264 = _mm512_fmadd_ps(in264, _mm512_set1_ps(-5e+00f), in262);
in272 = _mm512_fmadd_ps(in272, _mm512_set1_ps(-5e+00f), in270);
tmp1325 = _mm512_fmadd_ps(in263, _mm512_set1_ps(-1.25e+00f), tmp1325);
tmp1329 = _mm512_fmadd_ps(in271, _mm512_set1_ps(-1.25e+00f), tmp1329);
in266 = _mm512_fmadd_ps(tmp1325, _mm512_set1_ps(2e+00f), tmp1326);
in274 = _mm512_fmadd_ps(tmp1329, _mm512_set1_ps(2e+00f), tmp1330);
tmp1326 = _mm512_fnmadd_ps(tmp1325, _mm512_set1_ps(2e+00f), tmp1326);
tmp1330 = _mm512_fnmadd_ps(tmp1329, _mm512_set1_ps(2e+00f), tmp1330);
tmp1325 = _mm512_fmadd_ps(in265, _mm512_set1_ps(2.5e-01f), in261);
tmp1329 = _mm512_fmadd_ps(in273, _mm512_set1_ps(2.5e-01f), in269);
in261 = _mm512_sub_ps(in267, in261);
in269 = _mm512_sub_ps(in275, in269);
tmp1325 = _mm512_fmadd_ps(in263, _mm512_set1_ps(-1.25e+00f), tmp1325);
tmp1329 = _mm512_fmadd_ps(in271, _mm512_set1_ps(-1.25e+00f), tmp1329);
in263 = _mm512_sub_ps(in263, in265);
in271 = _mm512_sub_ps(in271, in273);
in263 = _mm512_fmadd_ps(in263, _mm512_set1_ps(5.25e+00f), in261);
in271 = _mm512_fmadd_ps(in271, _mm512_set1_ps(5.25e+00f), in269);
in262 = _mm512_fmadd_ps(tmp1325, _mm512_set1_ps(2e+00f), in264);
in270 = _mm512_fmadd_ps(tmp1329, _mm512_set1_ps(2e+00f), in272);
in264 = _mm512_fnmadd_ps(tmp1325, _mm512_set1_ps(2e+00f), in264);
in272 = _mm512_fnmadd_ps(tmp1329, _mm512_set1_ps(2e+00f), in272);
__m512 tmp1341 = _mm512_unpacklo_ps(in260, tmp1327);
__m512 tmp1342 = _mm512_unpackhi_ps(in260, tmp1327);
__m512 tmp1343 = _mm512_unpacklo_ps(tmp1328, in266);
__m512 tmp1344 = _mm512_unpackhi_ps(tmp1328, in266);
__m512 tmp1345 = _mm512_unpacklo_ps(tmp1326, in262);
__m512 tmp1346 = _mm512_unpackhi_ps(tmp1326, in262);
__m512 tmp1347 = _mm512_unpacklo_ps(in264, in263);
__m512 tmp1348 = _mm512_unpackhi_ps(in264, in263);
__m512 tmp1349 = _mm512_unpacklo_ps(in268, tmp1331);
__m512 tmp1350 = _mm512_unpackhi_ps(in268, tmp1331);
__m512 tmp1351 = _mm512_unpacklo_ps(tmp1332, in274);
__m512 tmp1352 = _mm512_unpackhi_ps(tmp1332, in274);
__m512 tmp1353 = _mm512_unpacklo_ps(tmp1330, in270);
__m512 tmp1354 = _mm512_unpackhi_ps(tmp1330, in270);
__m512 tmp1355 = _mm512_unpacklo_ps(in272, in271);
__m512 tmp1356 = _mm512_unpackhi_ps(in272, in271);
__m512 tmp1357 = _mm512_shuffle_ps(tmp1341, tmp1343, 68);
__m512 tmp1358 = _mm512_shuffle_ps(tmp1341, tmp1343, 238);
__m512 tmp1359 = _mm512_shuffle_ps(tmp1342, tmp1344, 68);
__m512 tmp1360 = _mm512_shuffle_ps(tmp1342, tmp1344, 238);
__m512 tmp1361 = _mm512_shuffle_ps(tmp1345, tmp1347, 68);
__m512 tmp1362 = _mm512_shuffle_ps(tmp1345, tmp1347, 238);
__m512 tmp1363 = _mm512_shuffle_ps(tmp1346, tmp1348, 68);
__m512 tmp1364 = _mm512_shuffle_ps(tmp1346, tmp1348, 238);
__m512 tmp1365 = _mm512_shuffle_ps(tmp1349, tmp1351, 68);
__m512 tmp1366 = _mm512_shuffle_ps(tmp1349, tmp1351, 238);
__m512 tmp1367 = _mm512_shuffle_ps(tmp1350, tmp1352, 68);
__m512 tmp1368 = _mm512_shuffle_ps(tmp1350, tmp1352, 238);
__m512 tmp1369 = _mm512_shuffle_ps(tmp1353, tmp1355, 68);
__m512 tmp1370 = _mm512_shuffle_ps(tmp1353, tmp1355, 238);
__m512 tmp1371 = _mm512_shuffle_ps(tmp1354, tmp1356, 68);
__m512 tmp1372 = _mm512_shuffle_ps(tmp1354, tmp1356, 238);
__m512 tmp1373 = _mm512_shuffle_f32x4(tmp1357, tmp1361, 136);
__m512 tmp1374 = _mm512_shuffle_f32x4(tmp1357, tmp1361, 221);
__m512 tmp1375 = _mm512_shuffle_f32x4(tmp1358, tmp1362, 136);
__m512 tmp1376 = _mm512_shuffle_f32x4(tmp1358, tmp1362, 221);
__m512 tmp1377 = _mm512_shuffle_f32x4(tmp1359, tmp1363, 136);
__m512 tmp1378 = _mm512_shuffle_f32x4(tmp1359, tmp1363, 221);
__m512 tmp1379 = _mm512_shuffle_f32x4(tmp1360, tmp1364, 136);
__m512 tmp1380 = _mm512_shuffle_f32x4(tmp1360, tmp1364, 221);
__m512 tmp1381 = _mm512_shuffle_f32x4(tmp1365, tmp1369, 136);
__m512 tmp1382 = _mm512_shuffle_f32x4(tmp1365, tmp1369, 221);
__m512 tmp1383 = _mm512_shuffle_f32x4(tmp1366, tmp1370, 136);
__m512 tmp1384 = _mm512_shuffle_f32x4(tmp1366, tmp1370, 221);
__m512 tmp1385 = _mm512_shuffle_f32x4(tmp1367, tmp1371, 136);
__m512 tmp1386 = _mm512_shuffle_f32x4(tmp1367, tmp1371, 221);
__m512 tmp1387 = _mm512_shuffle_f32x4(tmp1368, tmp1372, 136);
__m512 tmp1388 = _mm512_shuffle_f32x4(tmp1368, tmp1372, 221);
in260 = _mm512_shuffle_f32x4(tmp1373, tmp1381, 136);
in268 = _mm512_shuffle_f32x4(tmp1373, tmp1381, 221);
tmp1327 = _mm512_shuffle_f32x4(tmp1375, tmp1383, 136);
tmp1331 = _mm512_shuffle_f32x4(tmp1375, tmp1383, 221);
tmp1328 = _mm512_shuffle_f32x4(tmp1377, tmp1385, 136);
tmp1332 = _mm512_shuffle_f32x4(tmp1377, tmp1385, 221);
in266 = _mm512_shuffle_f32x4(tmp1379, tmp1387, 136);
in274 = _mm512_shuffle_f32x4(tmp1379, tmp1387, 221);
tmp1326 = _mm512_shuffle_f32x4(tmp1374, tmp1382, 136);
tmp1330 = _mm512_shuffle_f32x4(tmp1374, tmp1382, 221);
in262 = _mm512_shuffle_f32x4(tmp1376, tmp1384, 136);
in270 = _mm512_shuffle_f32x4(tmp1376, tmp1384, 221);
in264 = _mm512_shuffle_f32x4(tmp1378, tmp1386, 136);
in272 = _mm512_shuffle_f32x4(tmp1378, tmp1386, 221);
in263 = _mm512_shuffle_f32x4(tmp1380, tmp1388, 136);
in271 = _mm512_shuffle_f32x4(tmp1380, tmp1388, 221);
__m512 tmp1333 = _mm512_add_ps(tmp1327, in262);
__m512 tmp1337 = _mm512_add_ps(tmp1331, in270);
__m512 tmp1334 = _mm512_sub_ps(tmp1326, tmp1328);
__m512 tmp1338 = _mm512_sub_ps(tmp1330, tmp1332);
__m512 tmp1335 = _mm512_add_ps(tmp1328, in264);
__m512 tmp1339 = _mm512_add_ps(tmp1332, in272);
in260 = _mm512_sub_ps(in260, in264);
in268 = _mm512_sub_ps(in268, in272);
tmp1333 = _mm512_fmadd_ps(in266, _mm512_set1_ps(-4.25e+00f), tmp1333);
tmp1337 = _mm512_fmadd_ps(in274, _mm512_set1_ps(-4.25e+00f), tmp1337);
tmp1335 = _mm512_fmadd_ps(tmp1326, _mm512_set1_ps(-4.25e+00f), tmp1335);
tmp1339 = _mm512_fmadd_ps(tmp1330, _mm512_set1_ps(-4.25e+00f), tmp1339);
in260 = _mm512_fmadd_ps(tmp1334, _mm512_set1_ps(5.25e+00f), in260);
in268 = _mm512_fmadd_ps(tmp1338, _mm512_set1_ps(5.25e+00f), in268);
tmp1334 = _mm512_fmadd_ps(tmp1328, _mm512_set1_ps(2.5e-01f), in264);
tmp1338 = _mm512_fmadd_ps(tmp1332, _mm512_set1_ps(2.5e-01f), in272);
tmp1328 = _mm512_fmadd_ps(tmp1328, _mm512_set1_ps(4e+00f), in264);
tmp1332 = _mm512_fmadd_ps(tmp1332, _mm512_set1_ps(4e+00f), in272);
__m512 tmp1336 = _mm512_sub_ps(tmp1335, tmp1333);
__m512 tmp1340 = _mm512_sub_ps(tmp1339, tmp1337);
tmp1335 = _mm512_add_ps(tmp1333, tmp1335);
tmp1339 = _mm512_add_ps(tmp1337, tmp1339);
tmp1333 = _mm512_fmadd_ps(tmp1327, _mm512_set1_ps(2.5e-01f), in262);
tmp1337 = _mm512_fmadd_ps(tmp1331, _mm512_set1_ps(2.5e-01f), in270);
tmp1334 = _mm512_fmadd_ps(tmp1326, _mm512_set1_ps(-1.25e+00f), tmp1334);
tmp1338 = _mm512_fmadd_ps(tmp1330, _mm512_set1_ps(-1.25e+00f), tmp1338);
tmp1326 = _mm512_fmadd_ps(tmp1326, _mm512_set1_ps(-5e+00f), tmp1328);
tmp1330 = _mm512_fmadd_ps(tmp1330, _mm512_set1_ps(-5e+00f), tmp1332);
tmp1333 = _mm512_fmadd_ps(in266, _mm512_set1_ps(-1.25e+00f), tmp1333);
tmp1337 = _mm512_fmadd_ps(in274, _mm512_set1_ps(-1.25e+00f), tmp1337);
in264 = _mm512_fmadd_ps(tmp1333, _mm512_set1_ps(2e+00f), tmp1334);
in272 = _mm512_fmadd_ps(tmp1337, _mm512_set1_ps(2e+00f), tmp1338);
tmp1334 = _mm512_fnmadd_ps(tmp1333, _mm512_set1_ps(2e+00f), tmp1334);
tmp1338 = _mm512_fnmadd_ps(tmp1337, _mm512_set1_ps(2e+00f), tmp1338);
tmp1333 = _mm512_fmadd_ps(in262, _mm512_set1_ps(2.5e-01f), tmp1327);
tmp1337 = _mm512_fmadd_ps(in270, _mm512_set1_ps(2.5e-01f), tmp1331);
tmp1327 = _mm512_sub_ps(in263, tmp1327);
tmp1331 = _mm512_sub_ps(in271, tmp1331);
tmp1333 = _mm512_fmadd_ps(in266, _mm512_set1_ps(-1.25e+00f), tmp1333);
tmp1337 = _mm512_fmadd_ps(in274, _mm512_set1_ps(-1.25e+00f), tmp1337);
in266 = _mm512_sub_ps(in266, in262);
in274 = _mm512_sub_ps(in274, in270);
in266 = _mm512_fmadd_ps(in266, _mm512_set1_ps(5.25e+00f), tmp1327);
in274 = _mm512_fmadd_ps(in274, _mm512_set1_ps(5.25e+00f), tmp1331);
tmp1328 = _mm512_fmadd_ps(tmp1333, _mm512_set1_ps(2e+00f), tmp1326);
tmp1332 = _mm512_fmadd_ps(tmp1337, _mm512_set1_ps(2e+00f), tmp1330);
tmp1326 = _mm512_fnmadd_ps(tmp1333, _mm512_set1_ps(2e+00f), tmp1326);
tmp1330 = _mm512_fnmadd_ps(tmp1337, _mm512_set1_ps(2e+00f), tmp1330);
__m512 out279 = _mm512_shuffle_f32x4(in260, tmp1335, 68);
__m512 out287 = _mm512_shuffle_f32x4(in260, tmp1335, 238);
__m512 out280 = _mm512_shuffle_f32x4(tmp1336, in264, 68);
__m512 out288 = _mm512_shuffle_f32x4(tmp1336, in264, 238);
__m512 out281 = _mm512_shuffle_f32x4(tmp1334, tmp1328, 68);
__m512 out289 = _mm512_shuffle_f32x4(tmp1334, tmp1328, 238);
__m512 out282 = _mm512_shuffle_f32x4(tmp1326, in266, 68);
__m512 out290 = _mm512_shuffle_f32x4(tmp1326, in266, 238);
__m512 out283 = _mm512_shuffle_f32x4(in268, tmp1339, 68);
__m512 out291 = _mm512_shuffle_f32x4(in268, tmp1339, 238);
__m512 out284 = _mm512_shuffle_f32x4(tmp1340, in272, 68);
__m512 out292 = _mm512_shuffle_f32x4(tmp1340, in272, 238);
__m512 out285 = _mm512_shuffle_f32x4(tmp1338, tmp1332, 68);
__m512 out293 = _mm512_shuffle_f32x4(tmp1338, tmp1332, 238);
__m512 out286 = _mm512_shuffle_f32x4(tmp1330, in274, 68);
__m512 out294 = _mm512_shuffle_f32x4(tmp1330, in274, 238);
_mm512_storeu_ps(dfPtr4+256+1638400*i16+24576*j11+24576*s13+768*k57, out279);
_mm512_storeu_ps(dfPtr4+384+1638400*i16+24576*j11+24576*s13+768*k57, out287);
_mm512_storeu_ps(dfPtr4+320+1638400*i16+24576*j11+24576*s13+768*k57, out283);
_mm512_storeu_ps(dfPtr4+448+1638400*i16+24576*j11+24576*s13+768*k57, out291);
_mm512_storeu_ps(dfPtr4+409856+1638400*i16+24576*j11+24576*s13+768*k57, out280);
_mm512_storeu_ps(dfPtr4+409984+1638400*i16+24576*j11+24576*s13+768*k57, out288);
_mm512_storeu_ps(dfPtr4+409920+1638400*i16+24576*j11+24576*s13+768*k57, out284);
_mm512_storeu_ps(dfPtr4+410048+1638400*i16+24576*j11+24576*s13+768*k57, out292);
_mm512_storeu_ps(dfPtr4+819456+1638400*i16+24576*j11+24576*s13+768*k57, out281);
_mm512_storeu_ps(dfPtr4+819584+1638400*i16+24576*j11+24576*s13+768*k57, out289);
_mm512_storeu_ps(dfPtr4+819520+1638400*i16+24576*j11+24576*s13+768*k57, out285);
_mm512_storeu_ps(dfPtr4+819648+1638400*i16+24576*j11+24576*s13+768*k57, out293);
_mm512_storeu_ps(dfPtr4+1229056+1638400*i16+24576*j11+24576*s13+768*k57, out282);
_mm512_storeu_ps(dfPtr4+1229184+1638400*i16+24576*j11+24576*s13+768*k57, out290);
_mm512_storeu_ps(dfPtr4+1229120+1638400*i16+24576*j11+24576*s13+768*k57, out286);
_mm512_storeu_ps(dfPtr4+1229248+1638400*i16+24576*j11+24576*s13+768*k57, out294);
__m512 dat1183 = _mm512_maskz_loadu_ps(16383, datPtr5+12656+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1183 = _mm512_max_ps(_mm512_setzero_ps(), dat1183);
__m512 dat1184 = _mm512_maskz_loadu_ps(16383, datPtr5+12704+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1184 = _mm512_max_ps(_mm512_setzero_ps(), dat1184);
__m512i pm94 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in276 = _mm512_permutexvar_ps(pm94, dat1183);
__m512 in284 = _mm512_permutexvar_ps(pm94, dat1184);
__m512 dat1185 = _mm512_maskz_loadu_ps(16383, datPtr5+12880+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1185 = _mm512_max_ps(_mm512_setzero_ps(), dat1185);
__m512 dat1186 = _mm512_maskz_loadu_ps(16383, datPtr5+12928+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1186 = _mm512_max_ps(_mm512_setzero_ps(), dat1186);
__m512 in277 = _mm512_permutexvar_ps(pm94, dat1185);
__m512 in285 = _mm512_permutexvar_ps(pm94, dat1186);
__m512 dat1187 = _mm512_maskz_loadu_ps(16383, datPtr5+13104+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1187 = _mm512_max_ps(_mm512_setzero_ps(), dat1187);
__m512 dat1188 = _mm512_maskz_loadu_ps(16383, datPtr5+13152+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1188 = _mm512_max_ps(_mm512_setzero_ps(), dat1188);
__m512 in278 = _mm512_permutexvar_ps(pm94, dat1187);
__m512 in286 = _mm512_permutexvar_ps(pm94, dat1188);
__m512 dat1189 = _mm512_maskz_loadu_ps(16383, datPtr5+13328+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1189 = _mm512_max_ps(_mm512_setzero_ps(), dat1189);
__m512 dat1190 = _mm512_maskz_loadu_ps(16383, datPtr5+13376+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1190 = _mm512_max_ps(_mm512_setzero_ps(), dat1190);
__m512 in279 = _mm512_permutexvar_ps(pm94, dat1189);
__m512 in287 = _mm512_permutexvar_ps(pm94, dat1190);
__m512 dat1191 = _mm512_maskz_loadu_ps(16383, datPtr5+13552+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1191 = _mm512_max_ps(_mm512_setzero_ps(), dat1191);
__m512 dat1192 = _mm512_maskz_loadu_ps(16383, datPtr5+13600+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1192 = _mm512_max_ps(_mm512_setzero_ps(), dat1192);
__m512 in280 = _mm512_permutexvar_ps(pm94, dat1191);
__m512 in288 = _mm512_permutexvar_ps(pm94, dat1192);
__m512 dat1193 = _mm512_maskz_loadu_ps(16383, datPtr5+13776+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1193 = _mm512_max_ps(_mm512_setzero_ps(), dat1193);
__m512 dat1194 = _mm512_maskz_loadu_ps(16383, datPtr5+13824+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1194 = _mm512_max_ps(_mm512_setzero_ps(), dat1194);
__m512 in281 = _mm512_permutexvar_ps(pm94, dat1193);
__m512 in289 = _mm512_permutexvar_ps(pm94, dat1194);
__m512 dat1195 = _mm512_maskz_loadu_ps(16383, datPtr5+14000+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1195 = _mm512_max_ps(_mm512_setzero_ps(), dat1195);
__m512 dat1196 = _mm512_maskz_loadu_ps(16383, datPtr5+14048+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1196 = _mm512_max_ps(_mm512_setzero_ps(), dat1196);
__m512 in282 = _mm512_permutexvar_ps(pm94, dat1195);
__m512 in290 = _mm512_permutexvar_ps(pm94, dat1196);
__m512 dat1197 = _mm512_maskz_loadu_ps(16383, datPtr5+14224+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1197 = _mm512_max_ps(_mm512_setzero_ps(), dat1197);
__m512 dat1198 = _mm512_maskz_loadu_ps(16383, datPtr5+14272+806912*i16+224*h25+4*w28+806912*s13+25216*k57);
dat1198 = _mm512_max_ps(_mm512_setzero_ps(), dat1198);
__m512 in283 = _mm512_permutexvar_ps(pm94, dat1197);
__m512 in291 = _mm512_permutexvar_ps(pm94, dat1198);
__m512 tmp1389 = _mm512_add_ps(in277, in281);
__m512 tmp1393 = _mm512_add_ps(in285, in289);
__m512 tmp1390 = _mm512_sub_ps(in280, in278);
__m512 tmp1394 = _mm512_sub_ps(in288, in286);
__m512 tmp1391 = _mm512_add_ps(in278, in282);
__m512 tmp1395 = _mm512_add_ps(in286, in290);
in276 = _mm512_sub_ps(in276, in282);
in284 = _mm512_sub_ps(in284, in290);
tmp1389 = _mm512_fmadd_ps(in279, _mm512_set1_ps(-4.25e+00f), tmp1389);
tmp1393 = _mm512_fmadd_ps(in287, _mm512_set1_ps(-4.25e+00f), tmp1393);
tmp1391 = _mm512_fmadd_ps(in280, _mm512_set1_ps(-4.25e+00f), tmp1391);
tmp1395 = _mm512_fmadd_ps(in288, _mm512_set1_ps(-4.25e+00f), tmp1395);
in276 = _mm512_fmadd_ps(tmp1390, _mm512_set1_ps(5.25e+00f), in276);
in284 = _mm512_fmadd_ps(tmp1394, _mm512_set1_ps(5.25e+00f), in284);
tmp1390 = _mm512_fmadd_ps(in278, _mm512_set1_ps(2.5e-01f), in282);
tmp1394 = _mm512_fmadd_ps(in286, _mm512_set1_ps(2.5e-01f), in290);
in278 = _mm512_fmadd_ps(in278, _mm512_set1_ps(4e+00f), in282);
in286 = _mm512_fmadd_ps(in286, _mm512_set1_ps(4e+00f), in290);
__m512 tmp1392 = _mm512_sub_ps(tmp1391, tmp1389);
__m512 tmp1396 = _mm512_sub_ps(tmp1395, tmp1393);
tmp1391 = _mm512_add_ps(tmp1389, tmp1391);
tmp1395 = _mm512_add_ps(tmp1393, tmp1395);
tmp1389 = _mm512_fmadd_ps(in277, _mm512_set1_ps(2.5e-01f), in281);
tmp1393 = _mm512_fmadd_ps(in285, _mm512_set1_ps(2.5e-01f), in289);
tmp1390 = _mm512_fmadd_ps(in280, _mm512_set1_ps(-1.25e+00f), tmp1390);
tmp1394 = _mm512_fmadd_ps(in288, _mm512_set1_ps(-1.25e+00f), tmp1394);
in280 = _mm512_fmadd_ps(in280, _mm512_set1_ps(-5e+00f), in278);
in288 = _mm512_fmadd_ps(in288, _mm512_set1_ps(-5e+00f), in286);
tmp1389 = _mm512_fmadd_ps(in279, _mm512_set1_ps(-1.25e+00f), tmp1389);
tmp1393 = _mm512_fmadd_ps(in287, _mm512_set1_ps(-1.25e+00f), tmp1393);
in282 = _mm512_fmadd_ps(tmp1389, _mm512_set1_ps(2e+00f), tmp1390);
in290 = _mm512_fmadd_ps(tmp1393, _mm512_set1_ps(2e+00f), tmp1394);
tmp1390 = _mm512_fnmadd_ps(tmp1389, _mm512_set1_ps(2e+00f), tmp1390);
tmp1394 = _mm512_fnmadd_ps(tmp1393, _mm512_set1_ps(2e+00f), tmp1394);
tmp1389 = _mm512_fmadd_ps(in281, _mm512_set1_ps(2.5e-01f), in277);
tmp1393 = _mm512_fmadd_ps(in289, _mm512_set1_ps(2.5e-01f), in285);
in277 = _mm512_sub_ps(in283, in277);
in285 = _mm512_sub_ps(in291, in285);
tmp1389 = _mm512_fmadd_ps(in279, _mm512_set1_ps(-1.25e+00f), tmp1389);
tmp1393 = _mm512_fmadd_ps(in287, _mm512_set1_ps(-1.25e+00f), tmp1393);
in279 = _mm512_sub_ps(in279, in281);
in287 = _mm512_sub_ps(in287, in289);
in279 = _mm512_fmadd_ps(in279, _mm512_set1_ps(5.25e+00f), in277);
in287 = _mm512_fmadd_ps(in287, _mm512_set1_ps(5.25e+00f), in285);
in278 = _mm512_fmadd_ps(tmp1389, _mm512_set1_ps(2e+00f), in280);
in286 = _mm512_fmadd_ps(tmp1393, _mm512_set1_ps(2e+00f), in288);
in280 = _mm512_fnmadd_ps(tmp1389, _mm512_set1_ps(2e+00f), in280);
in288 = _mm512_fnmadd_ps(tmp1393, _mm512_set1_ps(2e+00f), in288);
__m512 tmp1405 = _mm512_unpacklo_ps(in276, tmp1391);
__m512 tmp1406 = _mm512_unpackhi_ps(in276, tmp1391);
__m512 tmp1407 = _mm512_unpacklo_ps(tmp1392, in282);
__m512 tmp1408 = _mm512_unpackhi_ps(tmp1392, in282);
__m512 tmp1409 = _mm512_unpacklo_ps(tmp1390, in278);
__m512 tmp1410 = _mm512_unpackhi_ps(tmp1390, in278);
__m512 tmp1411 = _mm512_unpacklo_ps(in280, in279);
__m512 tmp1412 = _mm512_unpackhi_ps(in280, in279);
__m512 tmp1413 = _mm512_unpacklo_ps(in284, tmp1395);
__m512 tmp1414 = _mm512_unpackhi_ps(in284, tmp1395);
__m512 tmp1415 = _mm512_unpacklo_ps(tmp1396, in290);
__m512 tmp1416 = _mm512_unpackhi_ps(tmp1396, in290);
__m512 tmp1417 = _mm512_unpacklo_ps(tmp1394, in286);
__m512 tmp1418 = _mm512_unpackhi_ps(tmp1394, in286);
__m512 tmp1419 = _mm512_unpacklo_ps(in288, in287);
__m512 tmp1420 = _mm512_unpackhi_ps(in288, in287);
__m512 tmp1421 = _mm512_shuffle_ps(tmp1405, tmp1407, 68);
__m512 tmp1422 = _mm512_shuffle_ps(tmp1405, tmp1407, 238);
__m512 tmp1423 = _mm512_shuffle_ps(tmp1406, tmp1408, 68);
__m512 tmp1424 = _mm512_shuffle_ps(tmp1406, tmp1408, 238);
__m512 tmp1425 = _mm512_shuffle_ps(tmp1409, tmp1411, 68);
__m512 tmp1426 = _mm512_shuffle_ps(tmp1409, tmp1411, 238);
__m512 tmp1427 = _mm512_shuffle_ps(tmp1410, tmp1412, 68);
__m512 tmp1428 = _mm512_shuffle_ps(tmp1410, tmp1412, 238);
__m512 tmp1429 = _mm512_shuffle_ps(tmp1413, tmp1415, 68);
__m512 tmp1430 = _mm512_shuffle_ps(tmp1413, tmp1415, 238);
__m512 tmp1431 = _mm512_shuffle_ps(tmp1414, tmp1416, 68);
__m512 tmp1432 = _mm512_shuffle_ps(tmp1414, tmp1416, 238);
__m512 tmp1433 = _mm512_shuffle_ps(tmp1417, tmp1419, 68);
__m512 tmp1434 = _mm512_shuffle_ps(tmp1417, tmp1419, 238);
__m512 tmp1435 = _mm512_shuffle_ps(tmp1418, tmp1420, 68);
__m512 tmp1436 = _mm512_shuffle_ps(tmp1418, tmp1420, 238);
__m512 tmp1437 = _mm512_shuffle_f32x4(tmp1421, tmp1425, 136);
__m512 tmp1438 = _mm512_shuffle_f32x4(tmp1421, tmp1425, 221);
__m512 tmp1439 = _mm512_shuffle_f32x4(tmp1422, tmp1426, 136);
__m512 tmp1440 = _mm512_shuffle_f32x4(tmp1422, tmp1426, 221);
__m512 tmp1441 = _mm512_shuffle_f32x4(tmp1423, tmp1427, 136);
__m512 tmp1442 = _mm512_shuffle_f32x4(tmp1423, tmp1427, 221);
__m512 tmp1443 = _mm512_shuffle_f32x4(tmp1424, tmp1428, 136);
__m512 tmp1444 = _mm512_shuffle_f32x4(tmp1424, tmp1428, 221);
__m512 tmp1445 = _mm512_shuffle_f32x4(tmp1429, tmp1433, 136);
__m512 tmp1446 = _mm512_shuffle_f32x4(tmp1429, tmp1433, 221);
__m512 tmp1447 = _mm512_shuffle_f32x4(tmp1430, tmp1434, 136);
__m512 tmp1448 = _mm512_shuffle_f32x4(tmp1430, tmp1434, 221);
__m512 tmp1449 = _mm512_shuffle_f32x4(tmp1431, tmp1435, 136);
__m512 tmp1450 = _mm512_shuffle_f32x4(tmp1431, tmp1435, 221);
__m512 tmp1451 = _mm512_shuffle_f32x4(tmp1432, tmp1436, 136);
__m512 tmp1452 = _mm512_shuffle_f32x4(tmp1432, tmp1436, 221);
in276 = _mm512_shuffle_f32x4(tmp1437, tmp1445, 136);
in284 = _mm512_shuffle_f32x4(tmp1437, tmp1445, 221);
tmp1391 = _mm512_shuffle_f32x4(tmp1439, tmp1447, 136);
tmp1395 = _mm512_shuffle_f32x4(tmp1439, tmp1447, 221);
tmp1392 = _mm512_shuffle_f32x4(tmp1441, tmp1449, 136);
tmp1396 = _mm512_shuffle_f32x4(tmp1441, tmp1449, 221);
in282 = _mm512_shuffle_f32x4(tmp1443, tmp1451, 136);
in290 = _mm512_shuffle_f32x4(tmp1443, tmp1451, 221);
tmp1390 = _mm512_shuffle_f32x4(tmp1438, tmp1446, 136);
tmp1394 = _mm512_shuffle_f32x4(tmp1438, tmp1446, 221);
in278 = _mm512_shuffle_f32x4(tmp1440, tmp1448, 136);
in286 = _mm512_shuffle_f32x4(tmp1440, tmp1448, 221);
in280 = _mm512_shuffle_f32x4(tmp1442, tmp1450, 136);
in288 = _mm512_shuffle_f32x4(tmp1442, tmp1450, 221);
in279 = _mm512_shuffle_f32x4(tmp1444, tmp1452, 136);
in287 = _mm512_shuffle_f32x4(tmp1444, tmp1452, 221);
__m512 tmp1397 = _mm512_add_ps(tmp1391, in278);
__m512 tmp1401 = _mm512_add_ps(tmp1395, in286);
__m512 tmp1398 = _mm512_sub_ps(tmp1390, tmp1392);
__m512 tmp1402 = _mm512_sub_ps(tmp1394, tmp1396);
__m512 tmp1399 = _mm512_add_ps(tmp1392, in280);
__m512 tmp1403 = _mm512_add_ps(tmp1396, in288);
in276 = _mm512_sub_ps(in276, in280);
in284 = _mm512_sub_ps(in284, in288);
tmp1397 = _mm512_fmadd_ps(in282, _mm512_set1_ps(-4.25e+00f), tmp1397);
tmp1401 = _mm512_fmadd_ps(in290, _mm512_set1_ps(-4.25e+00f), tmp1401);
tmp1399 = _mm512_fmadd_ps(tmp1390, _mm512_set1_ps(-4.25e+00f), tmp1399);
tmp1403 = _mm512_fmadd_ps(tmp1394, _mm512_set1_ps(-4.25e+00f), tmp1403);
in276 = _mm512_fmadd_ps(tmp1398, _mm512_set1_ps(5.25e+00f), in276);
in284 = _mm512_fmadd_ps(tmp1402, _mm512_set1_ps(5.25e+00f), in284);
tmp1398 = _mm512_fmadd_ps(tmp1392, _mm512_set1_ps(2.5e-01f), in280);
tmp1402 = _mm512_fmadd_ps(tmp1396, _mm512_set1_ps(2.5e-01f), in288);
tmp1392 = _mm512_fmadd_ps(tmp1392, _mm512_set1_ps(4e+00f), in280);
tmp1396 = _mm512_fmadd_ps(tmp1396, _mm512_set1_ps(4e+00f), in288);
__m512 tmp1400 = _mm512_sub_ps(tmp1399, tmp1397);
__m512 tmp1404 = _mm512_sub_ps(tmp1403, tmp1401);
tmp1399 = _mm512_add_ps(tmp1397, tmp1399);
tmp1403 = _mm512_add_ps(tmp1401, tmp1403);
tmp1397 = _mm512_fmadd_ps(tmp1391, _mm512_set1_ps(2.5e-01f), in278);
tmp1401 = _mm512_fmadd_ps(tmp1395, _mm512_set1_ps(2.5e-01f), in286);
tmp1398 = _mm512_fmadd_ps(tmp1390, _mm512_set1_ps(-1.25e+00f), tmp1398);
tmp1402 = _mm512_fmadd_ps(tmp1394, _mm512_set1_ps(-1.25e+00f), tmp1402);
tmp1390 = _mm512_fmadd_ps(tmp1390, _mm512_set1_ps(-5e+00f), tmp1392);
tmp1394 = _mm512_fmadd_ps(tmp1394, _mm512_set1_ps(-5e+00f), tmp1396);
tmp1397 = _mm512_fmadd_ps(in282, _mm512_set1_ps(-1.25e+00f), tmp1397);
tmp1401 = _mm512_fmadd_ps(in290, _mm512_set1_ps(-1.25e+00f), tmp1401);
in280 = _mm512_fmadd_ps(tmp1397, _mm512_set1_ps(2e+00f), tmp1398);
in288 = _mm512_fmadd_ps(tmp1401, _mm512_set1_ps(2e+00f), tmp1402);
tmp1398 = _mm512_fnmadd_ps(tmp1397, _mm512_set1_ps(2e+00f), tmp1398);
tmp1402 = _mm512_fnmadd_ps(tmp1401, _mm512_set1_ps(2e+00f), tmp1402);
tmp1397 = _mm512_fmadd_ps(in278, _mm512_set1_ps(2.5e-01f), tmp1391);
tmp1401 = _mm512_fmadd_ps(in286, _mm512_set1_ps(2.5e-01f), tmp1395);
tmp1391 = _mm512_sub_ps(in279, tmp1391);
tmp1395 = _mm512_sub_ps(in287, tmp1395);
tmp1397 = _mm512_fmadd_ps(in282, _mm512_set1_ps(-1.25e+00f), tmp1397);
tmp1401 = _mm512_fmadd_ps(in290, _mm512_set1_ps(-1.25e+00f), tmp1401);
in282 = _mm512_sub_ps(in282, in278);
in290 = _mm512_sub_ps(in290, in286);
in282 = _mm512_fmadd_ps(in282, _mm512_set1_ps(5.25e+00f), tmp1391);
in290 = _mm512_fmadd_ps(in290, _mm512_set1_ps(5.25e+00f), tmp1395);
tmp1392 = _mm512_fmadd_ps(tmp1397, _mm512_set1_ps(2e+00f), tmp1390);
tmp1396 = _mm512_fmadd_ps(tmp1401, _mm512_set1_ps(2e+00f), tmp1394);
tmp1390 = _mm512_fnmadd_ps(tmp1397, _mm512_set1_ps(2e+00f), tmp1390);
tmp1394 = _mm512_fnmadd_ps(tmp1401, _mm512_set1_ps(2e+00f), tmp1394);
// Pack register pairs for storing. With _mm512_shuffle_f32x4, imm 68 yields
// {a.lanes0-7, b.lanes0-7} and imm 238 yields {a.lanes8-15, b.lanes8-15}.
__m512 out295 = _mm512_shuffle_f32x4(in276, tmp1399, 68);
__m512 out303 = _mm512_shuffle_f32x4(in276, tmp1399, 238);
__m512 out296 = _mm512_shuffle_f32x4(tmp1400, in280, 68);
__m512 out304 = _mm512_shuffle_f32x4(tmp1400, in280, 238);
__m512 out297 = _mm512_shuffle_f32x4(tmp1398, tmp1392, 68);
__m512 out305 = _mm512_shuffle_f32x4(tmp1398, tmp1392, 238);
__m512 out298 = _mm512_shuffle_f32x4(tmp1390, in282, 68);
__m512 out306 = _mm512_shuffle_f32x4(tmp1390, in282, 238);
__m512 out299 = _mm512_shuffle_f32x4(in284, tmp1403, 68);
__m512 out307 = _mm512_shuffle_f32x4(in284, tmp1403, 238);
__m512 out300 = _mm512_shuffle_f32x4(tmp1404, in288, 68);
__m512 out308 = _mm512_shuffle_f32x4(tmp1404, in288, 238);
__m512 out301 = _mm512_shuffle_f32x4(tmp1402, tmp1396, 68);
__m512 out309 = _mm512_shuffle_f32x4(tmp1402, tmp1396, 238);
__m512 out302 = _mm512_shuffle_f32x4(tmp1394, in290, 68);
__m512 out310 = _mm512_shuffle_f32x4(tmp1394, in290, 238);
// Sixteen unaligned 512-bit stores into dfPtr4. The constant offsets form
// four groups 409600 apart (512, 410112, 819712, 1229312), one per packed
// register pair, each with four slots at +0, +64, +128, +192.
// NOTE(review): offsets look like byte counts (64 = one 512-bit vector);
// confirm against dfPtr4's declared type.
_mm512_storeu_ps(dfPtr4+512+1638400*i16+24576*j11+24576*s13+768*k57, out295);
_mm512_storeu_ps(dfPtr4+640+1638400*i16+24576*j11+24576*s13+768*k57, out303);
_mm512_storeu_ps(dfPtr4+576+1638400*i16+24576*j11+24576*s13+768*k57, out299);
_mm512_storeu_ps(dfPtr4+704+1638400*i16+24576*j11+24576*s13+768*k57, out307);
_mm512_storeu_ps(dfPtr4+410112+1638400*i16+24576*j11+24576*s13+768*k57, out296);
_mm512_storeu_ps(dfPtr4+410240+1638400*i16+24576*j11+24576*s13+768*k57, out304);
_mm512_storeu_ps(dfPtr4+410176+1638400*i16+24576*j11+24576*s13+768*k57, out300);
_mm512_storeu_ps(dfPtr4+410304+1638400*i16+24576*j11+24576*s13+768*k57, out308);
_mm512_storeu_ps(dfPtr4+819712+1638400*i16+24576*j11+24576*s13+768*k57, out297);
_mm512_storeu_ps(dfPtr4+819840+1638400*i16+24576*j11+24576*s13+768*k57, out305);
_mm512_storeu_ps(dfPtr4+819776+1638400*i16+24576*j11+24576*s13+768*k57, out301);
_mm512_storeu_ps(dfPtr4+819904+1638400*i16+24576*j11+24576*s13+768*k57, out309);
_mm512_storeu_ps(dfPtr4+1229312+1638400*i16+24576*j11+24576*s13+768*k57, out298);
_mm512_storeu_ps(dfPtr4+1229440+1638400*i16+24576*j11+24576*s13+768*k57, out306);
_mm512_storeu_ps(dfPtr4+1229376+1638400*i16+24576*j11+24576*s13+768*k57, out302);
_mm512_storeu_ps(dfPtr4+1229504+1638400*i16+24576*j11+24576*s13+768*k57, out310);
// Closes a scope opened above this excerpt (the stores above index by k57,
// so presumably the k57 loop -- confirm).
}
// Return once j11 has reached last3; otherwise advance j11 and set rel8 to 4.
if (j11 >= last3) return;
++j11;
rel8 = 4;
// Closes another scope opened above this excerpt.
}
// Start coordinates for the loads below: h26 multiplies the 224 term and w29
// the 4 term in every datPtr5 offset.
ptrdiff_t h26 = base8+12;
ptrdiff_t w29 = 36;
// k58 runs over 0..31; each step moves the datPtr5 offset by 25216 and the
// dfPtr4 offset by 768.
ptrdiff_t k58 = 0;
for (; k58 != 32; ++k58) {
// Load eight rows (constant offsets 0, 224, ..., 1568) as two vectors each.
// Mask 16383 reads the low 14 floats and mask 511 the low 9 floats (from an
// offset 48 higher); all other lanes are zeroed. Every vector is clamped to
// >= 0 with max(0, x), i.e. ReLU, before being permuted.
// NOTE(review): offsets look like byte counts (48 bytes = 12 floats, which
// continues the 6-element window step below); confirm datPtr5's type.
__m512 dat1199 = _mm512_maskz_loadu_ps(16383, datPtr5+0+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1199 = _mm512_max_ps(_mm512_setzero_ps(), dat1199);
__m512 dat1200 = _mm512_maskz_loadu_ps(511, datPtr5+48+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1200 = _mm512_max_ps(_mm512_setzero_ps(), dat1200);
// pm95: lanes 0-7 <- elements 0-7, lanes 8-15 <- elements 6-13, i.e. two
// 8-wide windows that overlap by two elements.
__m512i pm95 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in292 = _mm512_permutexvar_ps(pm95, dat1199);
// pm96: lanes 0-7 <- elements 0-7, lanes 8-10 <- elements 6-8, lanes 11-15 <-
// element 15, which the 9-float mask left at zero (zero fill).
__m512i pm96 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in300 = _mm512_permutexvar_ps(pm96, dat1200);
__m512 dat1201 = _mm512_maskz_loadu_ps(16383, datPtr5+224+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1201 = _mm512_max_ps(_mm512_setzero_ps(), dat1201);
__m512 dat1202 = _mm512_maskz_loadu_ps(511, datPtr5+272+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1202 = _mm512_max_ps(_mm512_setzero_ps(), dat1202);
__m512 in293 = _mm512_permutexvar_ps(pm95, dat1201);
__m512 in301 = _mm512_permutexvar_ps(pm96, dat1202);
__m512 dat1203 = _mm512_maskz_loadu_ps(16383, datPtr5+448+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1203 = _mm512_max_ps(_mm512_setzero_ps(), dat1203);
__m512 dat1204 = _mm512_maskz_loadu_ps(511, datPtr5+496+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1204 = _mm512_max_ps(_mm512_setzero_ps(), dat1204);
__m512 in294 = _mm512_permutexvar_ps(pm95, dat1203);
__m512 in302 = _mm512_permutexvar_ps(pm96, dat1204);
__m512 dat1205 = _mm512_maskz_loadu_ps(16383, datPtr5+672+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1205 = _mm512_max_ps(_mm512_setzero_ps(), dat1205);
__m512 dat1206 = _mm512_maskz_loadu_ps(511, datPtr5+720+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1206 = _mm512_max_ps(_mm512_setzero_ps(), dat1206);
__m512 in295 = _mm512_permutexvar_ps(pm95, dat1205);
__m512 in303 = _mm512_permutexvar_ps(pm96, dat1206);
__m512 dat1207 = _mm512_maskz_loadu_ps(16383, datPtr5+896+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1207 = _mm512_max_ps(_mm512_setzero_ps(), dat1207);
__m512 dat1208 = _mm512_maskz_loadu_ps(511, datPtr5+944+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1208 = _mm512_max_ps(_mm512_setzero_ps(), dat1208);
__m512 in296 = _mm512_permutexvar_ps(pm95, dat1207);
__m512 in304 = _mm512_permutexvar_ps(pm96, dat1208);
__m512 dat1209 = _mm512_maskz_loadu_ps(16383, datPtr5+1120+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1209 = _mm512_max_ps(_mm512_setzero_ps(), dat1209);
__m512 dat1210 = _mm512_maskz_loadu_ps(511, datPtr5+1168+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1210 = _mm512_max_ps(_mm512_setzero_ps(), dat1210);
__m512 in297 = _mm512_permutexvar_ps(pm95, dat1209);
__m512 in305 = _mm512_permutexvar_ps(pm96, dat1210);
__m512 dat1211 = _mm512_maskz_loadu_ps(16383, datPtr5+1344+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1211 = _mm512_max_ps(_mm512_setzero_ps(), dat1211);
__m512 dat1212 = _mm512_maskz_loadu_ps(511, datPtr5+1392+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1212 = _mm512_max_ps(_mm512_setzero_ps(), dat1212);
__m512 in298 = _mm512_permutexvar_ps(pm95, dat1211);
__m512 in306 = _mm512_permutexvar_ps(pm96, dat1212);
__m512 dat1213 = _mm512_maskz_loadu_ps(16383, datPtr5+1568+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1213 = _mm512_max_ps(_mm512_setzero_ps(), dat1213);
__m512 dat1214 = _mm512_maskz_loadu_ps(511, datPtr5+1616+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1214 = _mm512_max_ps(_mm512_setzero_ps(), dat1214);
__m512 in299 = _mm512_permutexvar_ps(pm95, dat1213);
__m512 in307 = _mm512_permutexvar_ps(pm96, dat1214);
// First 1-D pass. Two register sets are processed in lock-step with identical
// coefficients: x0..x7 = in292..in299 and, interleaved, in300..in307.
// Algebraically (ignoring FMA rounding) the eight lane-wise results are:
//   y0 = x0 - x6 + 5.25*(x4 - x2)
//   y1 = (x2 + x6 - 4.25*x4) + (x1 + x5 - 4.25*x3)
//   y2 = (x2 + x6 - 4.25*x4) - (x1 + x5 - 4.25*x3)
//   y3 = (0.25*x2 + x6 - 1.25*x4) + 2*(0.25*x1 + x5 - 1.25*x3)
//   y4 = (0.25*x2 + x6 - 1.25*x4) - 2*(0.25*x1 + x5 - 1.25*x3)
//   y5 = (4*x2 + x6 - 5*x4) + 2*(x1 + 0.25*x5 - 1.25*x3)
//   y6 = (4*x2 + x6 - 5*x4) - 2*(x1 + 0.25*x5 - 1.25*x3)
//   y7 = x7 - x1 + 5.25*(x3 - x5)
// They end up in (y0..y7):
//   in292, tmp1455, tmp1456, in298, tmp1454, in294, in296, in295
//   in300, tmp1459, tmp1460, in306, tmp1458, in302, in304, in303
// Input registers are overwritten as soon as they are dead, so statement
// order matters.
// NOTE(review): these coefficients match the 8-point Winograd F(6,3) input
// transform; inferred from the constants only.
__m512 tmp1453 = _mm512_add_ps(in293, in297);
__m512 tmp1457 = _mm512_add_ps(in301, in305);
__m512 tmp1454 = _mm512_sub_ps(in296, in294);
__m512 tmp1458 = _mm512_sub_ps(in304, in302);
__m512 tmp1455 = _mm512_add_ps(in294, in298);
__m512 tmp1459 = _mm512_add_ps(in302, in306);
in292 = _mm512_sub_ps(in292, in298);
in300 = _mm512_sub_ps(in300, in306);
tmp1453 = _mm512_fmadd_ps(in295, _mm512_set1_ps(-4.25e+00f), tmp1453);
tmp1457 = _mm512_fmadd_ps(in303, _mm512_set1_ps(-4.25e+00f), tmp1457);
tmp1455 = _mm512_fmadd_ps(in296, _mm512_set1_ps(-4.25e+00f), tmp1455);
tmp1459 = _mm512_fmadd_ps(in304, _mm512_set1_ps(-4.25e+00f), tmp1459);
in292 = _mm512_fmadd_ps(tmp1454, _mm512_set1_ps(5.25e+00f), in292);
in300 = _mm512_fmadd_ps(tmp1458, _mm512_set1_ps(5.25e+00f), in300);
tmp1454 = _mm512_fmadd_ps(in294, _mm512_set1_ps(2.5e-01f), in298);
tmp1458 = _mm512_fmadd_ps(in302, _mm512_set1_ps(2.5e-01f), in306);
in294 = _mm512_fmadd_ps(in294, _mm512_set1_ps(4e+00f), in298);
in302 = _mm512_fmadd_ps(in302, _mm512_set1_ps(4e+00f), in306);
__m512 tmp1456 = _mm512_sub_ps(tmp1455, tmp1453);
__m512 tmp1460 = _mm512_sub_ps(tmp1459, tmp1457);
tmp1455 = _mm512_add_ps(tmp1453, tmp1455);
tmp1459 = _mm512_add_ps(tmp1457, tmp1459);
tmp1453 = _mm512_fmadd_ps(in293, _mm512_set1_ps(2.5e-01f), in297);
tmp1457 = _mm512_fmadd_ps(in301, _mm512_set1_ps(2.5e-01f), in305);
tmp1454 = _mm512_fmadd_ps(in296, _mm512_set1_ps(-1.25e+00f), tmp1454);
tmp1458 = _mm512_fmadd_ps(in304, _mm512_set1_ps(-1.25e+00f), tmp1458);
in296 = _mm512_fmadd_ps(in296, _mm512_set1_ps(-5e+00f), in294);
in304 = _mm512_fmadd_ps(in304, _mm512_set1_ps(-5e+00f), in302);
tmp1453 = _mm512_fmadd_ps(in295, _mm512_set1_ps(-1.25e+00f), tmp1453);
tmp1457 = _mm512_fmadd_ps(in303, _mm512_set1_ps(-1.25e+00f), tmp1457);
in298 = _mm512_fmadd_ps(tmp1453, _mm512_set1_ps(2e+00f), tmp1454);
in306 = _mm512_fmadd_ps(tmp1457, _mm512_set1_ps(2e+00f), tmp1458);
tmp1454 = _mm512_fnmadd_ps(tmp1453, _mm512_set1_ps(2e+00f), tmp1454);
tmp1458 = _mm512_fnmadd_ps(tmp1457, _mm512_set1_ps(2e+00f), tmp1458);
tmp1453 = _mm512_fmadd_ps(in297, _mm512_set1_ps(2.5e-01f), in293);
tmp1457 = _mm512_fmadd_ps(in305, _mm512_set1_ps(2.5e-01f), in301);
in293 = _mm512_sub_ps(in299, in293);
in301 = _mm512_sub_ps(in307, in301);
tmp1453 = _mm512_fmadd_ps(in295, _mm512_set1_ps(-1.25e+00f), tmp1453);
tmp1457 = _mm512_fmadd_ps(in303, _mm512_set1_ps(-1.25e+00f), tmp1457);
in295 = _mm512_sub_ps(in295, in297);
in303 = _mm512_sub_ps(in303, in305);
in295 = _mm512_fmadd_ps(in295, _mm512_set1_ps(5.25e+00f), in293);
in303 = _mm512_fmadd_ps(in303, _mm512_set1_ps(5.25e+00f), in301);
in294 = _mm512_fmadd_ps(tmp1453, _mm512_set1_ps(2e+00f), in296);
in302 = _mm512_fmadd_ps(tmp1457, _mm512_set1_ps(2e+00f), in304);
in296 = _mm512_fnmadd_ps(tmp1453, _mm512_set1_ps(2e+00f), in296);
in304 = _mm512_fnmadd_ps(tmp1457, _mm512_set1_ps(2e+00f), in304);
// Transpose stage. Inputs are two sets of eight registers, in this order:
//   A: in292, tmp1455, tmp1456, in298, tmp1454, in294, in296, in295
//   B: in300, tmp1459, tmp1460, in306, tmp1458, in302, in304, in303
// Step 1: unpacklo/unpackhi interleave adjacent register pairs within each
// 128-bit lane.
__m512 tmp1469 = _mm512_unpacklo_ps(in292, tmp1455);
__m512 tmp1470 = _mm512_unpackhi_ps(in292, tmp1455);
__m512 tmp1471 = _mm512_unpacklo_ps(tmp1456, in298);
__m512 tmp1472 = _mm512_unpackhi_ps(tmp1456, in298);
__m512 tmp1473 = _mm512_unpacklo_ps(tmp1454, in294);
__m512 tmp1474 = _mm512_unpackhi_ps(tmp1454, in294);
__m512 tmp1475 = _mm512_unpacklo_ps(in296, in295);
__m512 tmp1476 = _mm512_unpackhi_ps(in296, in295);
__m512 tmp1477 = _mm512_unpacklo_ps(in300, tmp1459);
__m512 tmp1478 = _mm512_unpackhi_ps(in300, tmp1459);
__m512 tmp1479 = _mm512_unpacklo_ps(tmp1460, in306);
__m512 tmp1480 = _mm512_unpackhi_ps(tmp1460, in306);
__m512 tmp1481 = _mm512_unpacklo_ps(tmp1458, in302);
__m512 tmp1482 = _mm512_unpackhi_ps(tmp1458, in302);
__m512 tmp1483 = _mm512_unpacklo_ps(in304, in303);
__m512 tmp1484 = _mm512_unpackhi_ps(in304, in303);
// Step 2: shuffle_ps with imm 68 / 238 takes the low / high 64-bit halves of
// each 128-bit lane from both operands, completing a 4x4 transpose inside
// every 128-bit lane (four registers' values for one source lane).
__m512 tmp1485 = _mm512_shuffle_ps(tmp1469, tmp1471, 68);
__m512 tmp1486 = _mm512_shuffle_ps(tmp1469, tmp1471, 238);
__m512 tmp1487 = _mm512_shuffle_ps(tmp1470, tmp1472, 68);
__m512 tmp1488 = _mm512_shuffle_ps(tmp1470, tmp1472, 238);
__m512 tmp1489 = _mm512_shuffle_ps(tmp1473, tmp1475, 68);
__m512 tmp1490 = _mm512_shuffle_ps(tmp1473, tmp1475, 238);
__m512 tmp1491 = _mm512_shuffle_ps(tmp1474, tmp1476, 68);
__m512 tmp1492 = _mm512_shuffle_ps(tmp1474, tmp1476, 238);
__m512 tmp1493 = _mm512_shuffle_ps(tmp1477, tmp1479, 68);
__m512 tmp1494 = _mm512_shuffle_ps(tmp1477, tmp1479, 238);
__m512 tmp1495 = _mm512_shuffle_ps(tmp1478, tmp1480, 68);
__m512 tmp1496 = _mm512_shuffle_ps(tmp1478, tmp1480, 238);
__m512 tmp1497 = _mm512_shuffle_ps(tmp1481, tmp1483, 68);
__m512 tmp1498 = _mm512_shuffle_ps(tmp1481, tmp1483, 238);
__m512 tmp1499 = _mm512_shuffle_ps(tmp1482, tmp1484, 68);
__m512 tmp1500 = _mm512_shuffle_ps(tmp1482, tmp1484, 238);
// Step 3: shuffle_f32x4 with imm 136 picks 128-bit lanes {a0, a2, b0, b2};
// imm 221 picks {a1, a3, b1, b3}. This gathers registers 0-3 and 4-7 of a
// set for the same source lane.
__m512 tmp1501 = _mm512_shuffle_f32x4(tmp1485, tmp1489, 136);
__m512 tmp1502 = _mm512_shuffle_f32x4(tmp1485, tmp1489, 221);
__m512 tmp1503 = _mm512_shuffle_f32x4(tmp1486, tmp1490, 136);
__m512 tmp1504 = _mm512_shuffle_f32x4(tmp1486, tmp1490, 221);
__m512 tmp1505 = _mm512_shuffle_f32x4(tmp1487, tmp1491, 136);
__m512 tmp1506 = _mm512_shuffle_f32x4(tmp1487, tmp1491, 221);
__m512 tmp1507 = _mm512_shuffle_f32x4(tmp1488, tmp1492, 136);
__m512 tmp1508 = _mm512_shuffle_f32x4(tmp1488, tmp1492, 221);
__m512 tmp1509 = _mm512_shuffle_f32x4(tmp1493, tmp1497, 136);
__m512 tmp1510 = _mm512_shuffle_f32x4(tmp1493, tmp1497, 221);
__m512 tmp1511 = _mm512_shuffle_f32x4(tmp1494, tmp1498, 136);
__m512 tmp1512 = _mm512_shuffle_f32x4(tmp1494, tmp1498, 221);
__m512 tmp1513 = _mm512_shuffle_f32x4(tmp1495, tmp1499, 136);
__m512 tmp1514 = _mm512_shuffle_f32x4(tmp1495, tmp1499, 221);
__m512 tmp1515 = _mm512_shuffle_f32x4(tmp1496, tmp1500, 136);
__m512 tmp1516 = _mm512_shuffle_f32x4(tmp1496, tmp1500, 221);
// Step 4: combine sets A and B. Afterwards, for k = 0..7, the k-th register
// of the first list below holds source lane k of A's eight registers in lanes
// 0-7 and source lane k of B's eight registers in lanes 8-15; the k-th
// register of the second list does the same for source lane 8+k.
//   in292, tmp1455, tmp1456, in298, tmp1454, in294, in296, in295
//   in300, tmp1459, tmp1460, in306, tmp1458, in302, in304, in303
in292 = _mm512_shuffle_f32x4(tmp1501, tmp1509, 136);
in300 = _mm512_shuffle_f32x4(tmp1501, tmp1509, 221);
tmp1455 = _mm512_shuffle_f32x4(tmp1503, tmp1511, 136);
tmp1459 = _mm512_shuffle_f32x4(tmp1503, tmp1511, 221);
tmp1456 = _mm512_shuffle_f32x4(tmp1505, tmp1513, 136);
tmp1460 = _mm512_shuffle_f32x4(tmp1505, tmp1513, 221);
in298 = _mm512_shuffle_f32x4(tmp1507, tmp1515, 136);
in306 = _mm512_shuffle_f32x4(tmp1507, tmp1515, 221);
tmp1454 = _mm512_shuffle_f32x4(tmp1502, tmp1510, 136);
tmp1458 = _mm512_shuffle_f32x4(tmp1502, tmp1510, 221);
in294 = _mm512_shuffle_f32x4(tmp1504, tmp1512, 136);
in302 = _mm512_shuffle_f32x4(tmp1504, tmp1512, 221);
in296 = _mm512_shuffle_f32x4(tmp1506, tmp1514, 136);
in304 = _mm512_shuffle_f32x4(tmp1506, tmp1514, 221);
in295 = _mm512_shuffle_f32x4(tmp1508, tmp1516, 136);
in303 = _mm512_shuffle_f32x4(tmp1508, tmp1516, 221);
// Second 1-D pass, applied to the transposed registers. Inputs x0..x7 are
//   in292, tmp1455, tmp1456, in298, tmp1454, in294, in296, in295
// (and, interleaved, in300, tmp1459, tmp1460, in306, tmp1458, in302, in304,
// in303). Algebraically (ignoring FMA rounding):
//   y0 = x0 - x6 + 5.25*(x4 - x2)
//   y1 = (x2 + x6 - 4.25*x4) + (x1 + x5 - 4.25*x3)
//   y2 = (x2 + x6 - 4.25*x4) - (x1 + x5 - 4.25*x3)
//   y3 = (0.25*x2 + x6 - 1.25*x4) + 2*(0.25*x1 + x5 - 1.25*x3)
//   y4 = (0.25*x2 + x6 - 1.25*x4) - 2*(0.25*x1 + x5 - 1.25*x3)
//   y5 = (4*x2 + x6 - 5*x4) + 2*(x1 + 0.25*x5 - 1.25*x3)
//   y6 = (4*x2 + x6 - 5*x4) - 2*(x1 + 0.25*x5 - 1.25*x3)
//   y7 = x7 - x1 + 5.25*(x3 - x5)
// Results y0..y7 end up in:
//   in292, tmp1463, tmp1464, in296, tmp1462, tmp1456, tmp1454, in298
//   in300, tmp1467, tmp1468, in304, tmp1466, tmp1460, tmp1458, in306
// Registers are reused as soon as their old value is dead, so statement
// order matters.
__m512 tmp1461 = _mm512_add_ps(tmp1455, in294);
__m512 tmp1465 = _mm512_add_ps(tmp1459, in302);
__m512 tmp1462 = _mm512_sub_ps(tmp1454, tmp1456);
__m512 tmp1466 = _mm512_sub_ps(tmp1458, tmp1460);
__m512 tmp1463 = _mm512_add_ps(tmp1456, in296);
__m512 tmp1467 = _mm512_add_ps(tmp1460, in304);
in292 = _mm512_sub_ps(in292, in296);
in300 = _mm512_sub_ps(in300, in304);
tmp1461 = _mm512_fmadd_ps(in298, _mm512_set1_ps(-4.25e+00f), tmp1461);
tmp1465 = _mm512_fmadd_ps(in306, _mm512_set1_ps(-4.25e+00f), tmp1465);
tmp1463 = _mm512_fmadd_ps(tmp1454, _mm512_set1_ps(-4.25e+00f), tmp1463);
tmp1467 = _mm512_fmadd_ps(tmp1458, _mm512_set1_ps(-4.25e+00f), tmp1467);
in292 = _mm512_fmadd_ps(tmp1462, _mm512_set1_ps(5.25e+00f), in292);
in300 = _mm512_fmadd_ps(tmp1466, _mm512_set1_ps(5.25e+00f), in300);
tmp1462 = _mm512_fmadd_ps(tmp1456, _mm512_set1_ps(2.5e-01f), in296);
tmp1466 = _mm512_fmadd_ps(tmp1460, _mm512_set1_ps(2.5e-01f), in304);
tmp1456 = _mm512_fmadd_ps(tmp1456, _mm512_set1_ps(4e+00f), in296);
tmp1460 = _mm512_fmadd_ps(tmp1460, _mm512_set1_ps(4e+00f), in304);
__m512 tmp1464 = _mm512_sub_ps(tmp1463, tmp1461);
__m512 tmp1468 = _mm512_sub_ps(tmp1467, tmp1465);
tmp1463 = _mm512_add_ps(tmp1461, tmp1463);
tmp1467 = _mm512_add_ps(tmp1465, tmp1467);
tmp1461 = _mm512_fmadd_ps(tmp1455, _mm512_set1_ps(2.5e-01f), in294);
tmp1465 = _mm512_fmadd_ps(tmp1459, _mm512_set1_ps(2.5e-01f), in302);
tmp1462 = _mm512_fmadd_ps(tmp1454, _mm512_set1_ps(-1.25e+00f), tmp1462);
tmp1466 = _mm512_fmadd_ps(tmp1458, _mm512_set1_ps(-1.25e+00f), tmp1466);
tmp1454 = _mm512_fmadd_ps(tmp1454, _mm512_set1_ps(-5e+00f), tmp1456);
tmp1458 = _mm512_fmadd_ps(tmp1458, _mm512_set1_ps(-5e+00f), tmp1460);
tmp1461 = _mm512_fmadd_ps(in298, _mm512_set1_ps(-1.25e+00f), tmp1461);
tmp1465 = _mm512_fmadd_ps(in306, _mm512_set1_ps(-1.25e+00f), tmp1465);
in296 = _mm512_fmadd_ps(tmp1461, _mm512_set1_ps(2e+00f), tmp1462);
in304 = _mm512_fmadd_ps(tmp1465, _mm512_set1_ps(2e+00f), tmp1466);
tmp1462 = _mm512_fnmadd_ps(tmp1461, _mm512_set1_ps(2e+00f), tmp1462);
tmp1466 = _mm512_fnmadd_ps(tmp1465, _mm512_set1_ps(2e+00f), tmp1466);
tmp1461 = _mm512_fmadd_ps(in294, _mm512_set1_ps(2.5e-01f), tmp1455);
tmp1465 = _mm512_fmadd_ps(in302, _mm512_set1_ps(2.5e-01f), tmp1459);
tmp1455 = _mm512_sub_ps(in295, tmp1455);
tmp1459 = _mm512_sub_ps(in303, tmp1459);
tmp1461 = _mm512_fmadd_ps(in298, _mm512_set1_ps(-1.25e+00f), tmp1461);
tmp1465 = _mm512_fmadd_ps(in306, _mm512_set1_ps(-1.25e+00f), tmp1465);
in298 = _mm512_sub_ps(in298, in294);
in306 = _mm512_sub_ps(in306, in302);
in298 = _mm512_fmadd_ps(in298, _mm512_set1_ps(5.25e+00f), tmp1455);
in306 = _mm512_fmadd_ps(in306, _mm512_set1_ps(5.25e+00f), tmp1459);
tmp1456 = _mm512_fmadd_ps(tmp1461, _mm512_set1_ps(2e+00f), tmp1454);
tmp1460 = _mm512_fmadd_ps(tmp1465, _mm512_set1_ps(2e+00f), tmp1458);
tmp1454 = _mm512_fnmadd_ps(tmp1461, _mm512_set1_ps(2e+00f), tmp1454);
tmp1458 = _mm512_fnmadd_ps(tmp1465, _mm512_set1_ps(2e+00f), tmp1458);
// Pack consecutive result pairs for storing. With _mm512_shuffle_f32x4,
// imm 68 yields {a.lanes0-7, b.lanes0-7} and imm 238 yields
// {a.lanes8-15, b.lanes8-15}. Pairs are (in292,tmp1463), (tmp1464,in296),
// (tmp1462,tmp1456), (tmp1454,in298), then the same for the in300 family.
__m512 out311 = _mm512_shuffle_f32x4(in292, tmp1463, 68);
__m512 out319 = _mm512_shuffle_f32x4(in292, tmp1463, 238);
__m512 out312 = _mm512_shuffle_f32x4(tmp1464, in296, 68);
__m512 out320 = _mm512_shuffle_f32x4(tmp1464, in296, 238);
__m512 out313 = _mm512_shuffle_f32x4(tmp1462, tmp1456, 68);
__m512 out321 = _mm512_shuffle_f32x4(tmp1462, tmp1456, 238);
__m512 out314 = _mm512_shuffle_f32x4(tmp1454, in298, 68);
__m512 out322 = _mm512_shuffle_f32x4(tmp1454, in298, 238);
__m512 out315 = _mm512_shuffle_f32x4(in300, tmp1467, 68);
__m512 out323 = _mm512_shuffle_f32x4(in300, tmp1467, 238);
__m512 out316 = _mm512_shuffle_f32x4(tmp1468, in304, 68);
__m512 out324 = _mm512_shuffle_f32x4(tmp1468, in304, 238);
__m512 out317 = _mm512_shuffle_f32x4(tmp1466, tmp1460, 68);
__m512 out325 = _mm512_shuffle_f32x4(tmp1466, tmp1460, 238);
__m512 out318 = _mm512_shuffle_f32x4(tmp1458, in306, 68);
__m512 out326 = _mm512_shuffle_f32x4(tmp1458, in306, 238);
// Sixteen unaligned 512-bit stores into dfPtr4. The constant offsets form
// four groups 409600 apart (0, 409600, 819200, 1228800), one per packed
// register pair, each with four slots at +0, +64, +128, +192. Within a group
// the slot order is: in292-family low half, in300-family low half,
// in292-family high half, in300-family high half.
// NOTE(review): offsets look like byte counts (64 = one 512-bit vector);
// confirm against dfPtr4's declared type.
_mm512_storeu_ps(dfPtr4+0+1638400*i16+24576*j11+24576*s13+768*k58, out311);
_mm512_storeu_ps(dfPtr4+128+1638400*i16+24576*j11+24576*s13+768*k58, out319);
_mm512_storeu_ps(dfPtr4+64+1638400*i16+24576*j11+24576*s13+768*k58, out315);
_mm512_storeu_ps(dfPtr4+192+1638400*i16+24576*j11+24576*s13+768*k58, out323);
_mm512_storeu_ps(dfPtr4+409600+1638400*i16+24576*j11+24576*s13+768*k58, out312);
_mm512_storeu_ps(dfPtr4+409728+1638400*i16+24576*j11+24576*s13+768*k58, out320);
_mm512_storeu_ps(dfPtr4+409664+1638400*i16+24576*j11+24576*s13+768*k58, out316);
_mm512_storeu_ps(dfPtr4+409792+1638400*i16+24576*j11+24576*s13+768*k58, out324);
_mm512_storeu_ps(dfPtr4+819200+1638400*i16+24576*j11+24576*s13+768*k58, out313);
_mm512_storeu_ps(dfPtr4+819328+1638400*i16+24576*j11+24576*s13+768*k58, out321);
_mm512_storeu_ps(dfPtr4+819264+1638400*i16+24576*j11+24576*s13+768*k58, out317);
_mm512_storeu_ps(dfPtr4+819392+1638400*i16+24576*j11+24576*s13+768*k58, out325);
_mm512_storeu_ps(dfPtr4+1228800+1638400*i16+24576*j11+24576*s13+768*k58, out314);
_mm512_storeu_ps(dfPtr4+1228928+1638400*i16+24576*j11+24576*s13+768*k58, out322);
_mm512_storeu_ps(dfPtr4+1228864+1638400*i16+24576*j11+24576*s13+768*k58, out318);
_mm512_storeu_ps(dfPtr4+1228992+1638400*i16+24576*j11+24576*s13+768*k58, out326);
// Second load group of this k58 iteration: eight rows, two vectors each.
// The first vector of a row comes from constant offsets 1204, 1428, ..., 2772
// (step 224) with mask 8191 (low 13 floats); the second from 12608, 12832,
// ..., 14176 (step 224) with mask 16383 (low 14 floats). 12608 is exactly
// half of the 25216 per-k58 stride. Unread lanes are zero, and every vector
// is clamped to >= 0 with max(0, x), i.e. ReLU, before being permuted.
// NOTE(review): the second vector presumably belongs to a second plane
// covered by the same k58 step -- confirm against the data layout.
__m512 dat1215 = _mm512_maskz_loadu_ps(8191, datPtr5+1204+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1215 = _mm512_max_ps(_mm512_setzero_ps(), dat1215);
__m512 dat1216 = _mm512_maskz_loadu_ps(16383, datPtr5+12608+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1216 = _mm512_max_ps(_mm512_setzero_ps(), dat1216);
// pm97: lane 0 <- element 15 (masked off by the 13-float load, hence zero),
// lanes 1-7 <- elements 0-6, lanes 8-15 <- elements 5-12. The first window
// therefore starts with one zero-filled position.
__m512i pm97 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in308 = _mm512_permutexvar_ps(pm97, dat1215);
// pm98: lanes 0-7 <- elements 0-7, lanes 8-15 <- elements 6-13, i.e. two
// 8-wide windows that overlap by two elements.
__m512i pm98 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in316 = _mm512_permutexvar_ps(pm98, dat1216);
__m512 dat1217 = _mm512_maskz_loadu_ps(8191, datPtr5+1428+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1217 = _mm512_max_ps(_mm512_setzero_ps(), dat1217);
__m512 dat1218 = _mm512_maskz_loadu_ps(16383, datPtr5+12832+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1218 = _mm512_max_ps(_mm512_setzero_ps(), dat1218);
__m512 in309 = _mm512_permutexvar_ps(pm97, dat1217);
__m512 in317 = _mm512_permutexvar_ps(pm98, dat1218);
__m512 dat1219 = _mm512_maskz_loadu_ps(8191, datPtr5+1652+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1219 = _mm512_max_ps(_mm512_setzero_ps(), dat1219);
__m512 dat1220 = _mm512_maskz_loadu_ps(16383, datPtr5+13056+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1220 = _mm512_max_ps(_mm512_setzero_ps(), dat1220);
__m512 in310 = _mm512_permutexvar_ps(pm97, dat1219);
__m512 in318 = _mm512_permutexvar_ps(pm98, dat1220);
__m512 dat1221 = _mm512_maskz_loadu_ps(8191, datPtr5+1876+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1221 = _mm512_max_ps(_mm512_setzero_ps(), dat1221);
__m512 dat1222 = _mm512_maskz_loadu_ps(16383, datPtr5+13280+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1222 = _mm512_max_ps(_mm512_setzero_ps(), dat1222);
__m512 in311 = _mm512_permutexvar_ps(pm97, dat1221);
__m512 in319 = _mm512_permutexvar_ps(pm98, dat1222);
__m512 dat1223 = _mm512_maskz_loadu_ps(8191, datPtr5+2100+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1223 = _mm512_max_ps(_mm512_setzero_ps(), dat1223);
__m512 dat1224 = _mm512_maskz_loadu_ps(16383, datPtr5+13504+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1224 = _mm512_max_ps(_mm512_setzero_ps(), dat1224);
__m512 in312 = _mm512_permutexvar_ps(pm97, dat1223);
__m512 in320 = _mm512_permutexvar_ps(pm98, dat1224);
__m512 dat1225 = _mm512_maskz_loadu_ps(8191, datPtr5+2324+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1225 = _mm512_max_ps(_mm512_setzero_ps(), dat1225);
__m512 dat1226 = _mm512_maskz_loadu_ps(16383, datPtr5+13728+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1226 = _mm512_max_ps(_mm512_setzero_ps(), dat1226);
__m512 in313 = _mm512_permutexvar_ps(pm97, dat1225);
__m512 in321 = _mm512_permutexvar_ps(pm98, dat1226);
__m512 dat1227 = _mm512_maskz_loadu_ps(8191, datPtr5+2548+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1227 = _mm512_max_ps(_mm512_setzero_ps(), dat1227);
__m512 dat1228 = _mm512_maskz_loadu_ps(16383, datPtr5+13952+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1228 = _mm512_max_ps(_mm512_setzero_ps(), dat1228);
__m512 in314 = _mm512_permutexvar_ps(pm97, dat1227);
__m512 in322 = _mm512_permutexvar_ps(pm98, dat1228);
__m512 dat1229 = _mm512_maskz_loadu_ps(8191, datPtr5+2772+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1229 = _mm512_max_ps(_mm512_setzero_ps(), dat1229);
__m512 dat1230 = _mm512_maskz_loadu_ps(16383, datPtr5+14176+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1230 = _mm512_max_ps(_mm512_setzero_ps(), dat1230);
__m512 in315 = _mm512_permutexvar_ps(pm97, dat1229);
__m512 in323 = _mm512_permutexvar_ps(pm98, dat1230);
// First 1-D pass for the second load group. Two register sets are processed
// in lock-step with identical coefficients: x0..x7 = in308..in315 and,
// interleaved, in316..in323. Algebraically (ignoring FMA rounding):
//   y0 = x0 - x6 + 5.25*(x4 - x2)
//   y1 = (x2 + x6 - 4.25*x4) + (x1 + x5 - 4.25*x3)
//   y2 = (x2 + x6 - 4.25*x4) - (x1 + x5 - 4.25*x3)
//   y3 = (0.25*x2 + x6 - 1.25*x4) + 2*(0.25*x1 + x5 - 1.25*x3)
//   y4 = (0.25*x2 + x6 - 1.25*x4) - 2*(0.25*x1 + x5 - 1.25*x3)
//   y5 = (4*x2 + x6 - 5*x4) + 2*(x1 + 0.25*x5 - 1.25*x3)
//   y6 = (4*x2 + x6 - 5*x4) - 2*(x1 + 0.25*x5 - 1.25*x3)
//   y7 = x7 - x1 + 5.25*(x3 - x5)
// Results y0..y7 end up in:
//   in308, tmp1519, tmp1520, in314, tmp1518, in310, in312, in311
//   in316, tmp1523, tmp1524, in322, tmp1522, in318, in320, in319
// Input registers are overwritten as soon as they are dead, so statement
// order matters.
// NOTE(review): these coefficients match the 8-point Winograd F(6,3) input
// transform; inferred from the constants only.
__m512 tmp1517 = _mm512_add_ps(in309, in313);
__m512 tmp1521 = _mm512_add_ps(in317, in321);
__m512 tmp1518 = _mm512_sub_ps(in312, in310);
__m512 tmp1522 = _mm512_sub_ps(in320, in318);
__m512 tmp1519 = _mm512_add_ps(in310, in314);
__m512 tmp1523 = _mm512_add_ps(in318, in322);
in308 = _mm512_sub_ps(in308, in314);
in316 = _mm512_sub_ps(in316, in322);
tmp1517 = _mm512_fmadd_ps(in311, _mm512_set1_ps(-4.25e+00f), tmp1517);
tmp1521 = _mm512_fmadd_ps(in319, _mm512_set1_ps(-4.25e+00f), tmp1521);
tmp1519 = _mm512_fmadd_ps(in312, _mm512_set1_ps(-4.25e+00f), tmp1519);
tmp1523 = _mm512_fmadd_ps(in320, _mm512_set1_ps(-4.25e+00f), tmp1523);
in308 = _mm512_fmadd_ps(tmp1518, _mm512_set1_ps(5.25e+00f), in308);
in316 = _mm512_fmadd_ps(tmp1522, _mm512_set1_ps(5.25e+00f), in316);
tmp1518 = _mm512_fmadd_ps(in310, _mm512_set1_ps(2.5e-01f), in314);
tmp1522 = _mm512_fmadd_ps(in318, _mm512_set1_ps(2.5e-01f), in322);
in310 = _mm512_fmadd_ps(in310, _mm512_set1_ps(4e+00f), in314);
in318 = _mm512_fmadd_ps(in318, _mm512_set1_ps(4e+00f), in322);
__m512 tmp1520 = _mm512_sub_ps(tmp1519, tmp1517);
__m512 tmp1524 = _mm512_sub_ps(tmp1523, tmp1521);
tmp1519 = _mm512_add_ps(tmp1517, tmp1519);
tmp1523 = _mm512_add_ps(tmp1521, tmp1523);
tmp1517 = _mm512_fmadd_ps(in309, _mm512_set1_ps(2.5e-01f), in313);
tmp1521 = _mm512_fmadd_ps(in317, _mm512_set1_ps(2.5e-01f), in321);
tmp1518 = _mm512_fmadd_ps(in312, _mm512_set1_ps(-1.25e+00f), tmp1518);
tmp1522 = _mm512_fmadd_ps(in320, _mm512_set1_ps(-1.25e+00f), tmp1522);
in312 = _mm512_fmadd_ps(in312, _mm512_set1_ps(-5e+00f), in310);
in320 = _mm512_fmadd_ps(in320, _mm512_set1_ps(-5e+00f), in318);
tmp1517 = _mm512_fmadd_ps(in311, _mm512_set1_ps(-1.25e+00f), tmp1517);
tmp1521 = _mm512_fmadd_ps(in319, _mm512_set1_ps(-1.25e+00f), tmp1521);
in314 = _mm512_fmadd_ps(tmp1517, _mm512_set1_ps(2e+00f), tmp1518);
in322 = _mm512_fmadd_ps(tmp1521, _mm512_set1_ps(2e+00f), tmp1522);
tmp1518 = _mm512_fnmadd_ps(tmp1517, _mm512_set1_ps(2e+00f), tmp1518);
tmp1522 = _mm512_fnmadd_ps(tmp1521, _mm512_set1_ps(2e+00f), tmp1522);
tmp1517 = _mm512_fmadd_ps(in313, _mm512_set1_ps(2.5e-01f), in309);
tmp1521 = _mm512_fmadd_ps(in321, _mm512_set1_ps(2.5e-01f), in317);
in309 = _mm512_sub_ps(in315, in309);
in317 = _mm512_sub_ps(in323, in317);
tmp1517 = _mm512_fmadd_ps(in311, _mm512_set1_ps(-1.25e+00f), tmp1517);
tmp1521 = _mm512_fmadd_ps(in319, _mm512_set1_ps(-1.25e+00f), tmp1521);
in311 = _mm512_sub_ps(in311, in313);
in319 = _mm512_sub_ps(in319, in321);
in311 = _mm512_fmadd_ps(in311, _mm512_set1_ps(5.25e+00f), in309);
in319 = _mm512_fmadd_ps(in319, _mm512_set1_ps(5.25e+00f), in317);
in310 = _mm512_fmadd_ps(tmp1517, _mm512_set1_ps(2e+00f), in312);
in318 = _mm512_fmadd_ps(tmp1521, _mm512_set1_ps(2e+00f), in320);
in312 = _mm512_fnmadd_ps(tmp1517, _mm512_set1_ps(2e+00f), in312);
in320 = _mm512_fnmadd_ps(tmp1521, _mm512_set1_ps(2e+00f), in320);
// Transpose stage. Inputs are two sets of eight registers, in this order:
//   A: in308, tmp1519, tmp1520, in314, tmp1518, in310, in312, in311
//   B: in316, tmp1523, tmp1524, in322, tmp1522, in318, in320, in319
// Step 1: unpacklo/unpackhi interleave adjacent register pairs within each
// 128-bit lane.
__m512 tmp1533 = _mm512_unpacklo_ps(in308, tmp1519);
__m512 tmp1534 = _mm512_unpackhi_ps(in308, tmp1519);
__m512 tmp1535 = _mm512_unpacklo_ps(tmp1520, in314);
__m512 tmp1536 = _mm512_unpackhi_ps(tmp1520, in314);
__m512 tmp1537 = _mm512_unpacklo_ps(tmp1518, in310);
__m512 tmp1538 = _mm512_unpackhi_ps(tmp1518, in310);
__m512 tmp1539 = _mm512_unpacklo_ps(in312, in311);
__m512 tmp1540 = _mm512_unpackhi_ps(in312, in311);
__m512 tmp1541 = _mm512_unpacklo_ps(in316, tmp1523);
__m512 tmp1542 = _mm512_unpackhi_ps(in316, tmp1523);
__m512 tmp1543 = _mm512_unpacklo_ps(tmp1524, in322);
__m512 tmp1544 = _mm512_unpackhi_ps(tmp1524, in322);
__m512 tmp1545 = _mm512_unpacklo_ps(tmp1522, in318);
__m512 tmp1546 = _mm512_unpackhi_ps(tmp1522, in318);
__m512 tmp1547 = _mm512_unpacklo_ps(in320, in319);
__m512 tmp1548 = _mm512_unpackhi_ps(in320, in319);
// Step 2: shuffle_ps with imm 68 / 238 takes the low / high 64-bit halves of
// each 128-bit lane from both operands, completing a 4x4 transpose inside
// every 128-bit lane (four registers' values for one source lane).
__m512 tmp1549 = _mm512_shuffle_ps(tmp1533, tmp1535, 68);
__m512 tmp1550 = _mm512_shuffle_ps(tmp1533, tmp1535, 238);
__m512 tmp1551 = _mm512_shuffle_ps(tmp1534, tmp1536, 68);
__m512 tmp1552 = _mm512_shuffle_ps(tmp1534, tmp1536, 238);
__m512 tmp1553 = _mm512_shuffle_ps(tmp1537, tmp1539, 68);
__m512 tmp1554 = _mm512_shuffle_ps(tmp1537, tmp1539, 238);
__m512 tmp1555 = _mm512_shuffle_ps(tmp1538, tmp1540, 68);
__m512 tmp1556 = _mm512_shuffle_ps(tmp1538, tmp1540, 238);
__m512 tmp1557 = _mm512_shuffle_ps(tmp1541, tmp1543, 68);
__m512 tmp1558 = _mm512_shuffle_ps(tmp1541, tmp1543, 238);
__m512 tmp1559 = _mm512_shuffle_ps(tmp1542, tmp1544, 68);
__m512 tmp1560 = _mm512_shuffle_ps(tmp1542, tmp1544, 238);
__m512 tmp1561 = _mm512_shuffle_ps(tmp1545, tmp1547, 68);
__m512 tmp1562 = _mm512_shuffle_ps(tmp1545, tmp1547, 238);
__m512 tmp1563 = _mm512_shuffle_ps(tmp1546, tmp1548, 68);
__m512 tmp1564 = _mm512_shuffle_ps(tmp1546, tmp1548, 238);
// Step 3: shuffle_f32x4 with imm 136 picks 128-bit lanes {a0, a2, b0, b2};
// imm 221 picks {a1, a3, b1, b3}. This gathers registers 0-3 and 4-7 of a
// set for the same source lane.
__m512 tmp1565 = _mm512_shuffle_f32x4(tmp1549, tmp1553, 136);
__m512 tmp1566 = _mm512_shuffle_f32x4(tmp1549, tmp1553, 221);
__m512 tmp1567 = _mm512_shuffle_f32x4(tmp1550, tmp1554, 136);
__m512 tmp1568 = _mm512_shuffle_f32x4(tmp1550, tmp1554, 221);
__m512 tmp1569 = _mm512_shuffle_f32x4(tmp1551, tmp1555, 136);
__m512 tmp1570 = _mm512_shuffle_f32x4(tmp1551, tmp1555, 221);
__m512 tmp1571 = _mm512_shuffle_f32x4(tmp1552, tmp1556, 136);
__m512 tmp1572 = _mm512_shuffle_f32x4(tmp1552, tmp1556, 221);
__m512 tmp1573 = _mm512_shuffle_f32x4(tmp1557, tmp1561, 136);
__m512 tmp1574 = _mm512_shuffle_f32x4(tmp1557, tmp1561, 221);
__m512 tmp1575 = _mm512_shuffle_f32x4(tmp1558, tmp1562, 136);
__m512 tmp1576 = _mm512_shuffle_f32x4(tmp1558, tmp1562, 221);
__m512 tmp1577 = _mm512_shuffle_f32x4(tmp1559, tmp1563, 136);
__m512 tmp1578 = _mm512_shuffle_f32x4(tmp1559, tmp1563, 221);
__m512 tmp1579 = _mm512_shuffle_f32x4(tmp1560, tmp1564, 136);
__m512 tmp1580 = _mm512_shuffle_f32x4(tmp1560, tmp1564, 221);
// Step 4: combine sets A and B. Afterwards, for k = 0..7, the k-th register
// of the first list below holds source lane k of A's eight registers in lanes
// 0-7 and source lane k of B's eight registers in lanes 8-15; the k-th
// register of the second list does the same for source lane 8+k.
//   in308, tmp1519, tmp1520, in314, tmp1518, in310, in312, in311
//   in316, tmp1523, tmp1524, in322, tmp1522, in318, in320, in319
in308 = _mm512_shuffle_f32x4(tmp1565, tmp1573, 136);
in316 = _mm512_shuffle_f32x4(tmp1565, tmp1573, 221);
tmp1519 = _mm512_shuffle_f32x4(tmp1567, tmp1575, 136);
tmp1523 = _mm512_shuffle_f32x4(tmp1567, tmp1575, 221);
tmp1520 = _mm512_shuffle_f32x4(tmp1569, tmp1577, 136);
tmp1524 = _mm512_shuffle_f32x4(tmp1569, tmp1577, 221);
in314 = _mm512_shuffle_f32x4(tmp1571, tmp1579, 136);
in322 = _mm512_shuffle_f32x4(tmp1571, tmp1579, 221);
tmp1518 = _mm512_shuffle_f32x4(tmp1566, tmp1574, 136);
tmp1522 = _mm512_shuffle_f32x4(tmp1566, tmp1574, 221);
in310 = _mm512_shuffle_f32x4(tmp1568, tmp1576, 136);
in318 = _mm512_shuffle_f32x4(tmp1568, tmp1576, 221);
in312 = _mm512_shuffle_f32x4(tmp1570, tmp1578, 136);
in320 = _mm512_shuffle_f32x4(tmp1570, tmp1578, 221);
in311 = _mm512_shuffle_f32x4(tmp1572, tmp1580, 136);
in319 = _mm512_shuffle_f32x4(tmp1572, tmp1580, 221);
// Second 1-D pass, applied to the transposed registers. Inputs x0..x7 are
//   in308, tmp1519, tmp1520, in314, tmp1518, in310, in312, in311
// (and, interleaved, in316, tmp1523, tmp1524, in322, tmp1522, in318, in320,
// in319). Algebraically (ignoring FMA rounding):
//   y0 = x0 - x6 + 5.25*(x4 - x2)
//   y1 = (x2 + x6 - 4.25*x4) + (x1 + x5 - 4.25*x3)
//   y2 = (x2 + x6 - 4.25*x4) - (x1 + x5 - 4.25*x3)
//   y3 = (0.25*x2 + x6 - 1.25*x4) + 2*(0.25*x1 + x5 - 1.25*x3)
//   y4 = (0.25*x2 + x6 - 1.25*x4) - 2*(0.25*x1 + x5 - 1.25*x3)
//   y5 = (4*x2 + x6 - 5*x4) + 2*(x1 + 0.25*x5 - 1.25*x3)
//   y6 = (4*x2 + x6 - 5*x4) - 2*(x1 + 0.25*x5 - 1.25*x3)
//   y7 = x7 - x1 + 5.25*(x3 - x5)
// Results y0..y7 end up in:
//   in308, tmp1527, tmp1528, in312, tmp1526, tmp1520, tmp1518, in314
//   in316, tmp1531, tmp1532, in320, tmp1530, tmp1524, tmp1522, in322
// Registers are reused as soon as their old value is dead, so statement
// order matters.
__m512 tmp1525 = _mm512_add_ps(tmp1519, in310);
__m512 tmp1529 = _mm512_add_ps(tmp1523, in318);
__m512 tmp1526 = _mm512_sub_ps(tmp1518, tmp1520);
__m512 tmp1530 = _mm512_sub_ps(tmp1522, tmp1524);
__m512 tmp1527 = _mm512_add_ps(tmp1520, in312);
__m512 tmp1531 = _mm512_add_ps(tmp1524, in320);
in308 = _mm512_sub_ps(in308, in312);
in316 = _mm512_sub_ps(in316, in320);
tmp1525 = _mm512_fmadd_ps(in314, _mm512_set1_ps(-4.25e+00f), tmp1525);
tmp1529 = _mm512_fmadd_ps(in322, _mm512_set1_ps(-4.25e+00f), tmp1529);
tmp1527 = _mm512_fmadd_ps(tmp1518, _mm512_set1_ps(-4.25e+00f), tmp1527);
tmp1531 = _mm512_fmadd_ps(tmp1522, _mm512_set1_ps(-4.25e+00f), tmp1531);
in308 = _mm512_fmadd_ps(tmp1526, _mm512_set1_ps(5.25e+00f), in308);
in316 = _mm512_fmadd_ps(tmp1530, _mm512_set1_ps(5.25e+00f), in316);
tmp1526 = _mm512_fmadd_ps(tmp1520, _mm512_set1_ps(2.5e-01f), in312);
tmp1530 = _mm512_fmadd_ps(tmp1524, _mm512_set1_ps(2.5e-01f), in320);
tmp1520 = _mm512_fmadd_ps(tmp1520, _mm512_set1_ps(4e+00f), in312);
tmp1524 = _mm512_fmadd_ps(tmp1524, _mm512_set1_ps(4e+00f), in320);
__m512 tmp1528 = _mm512_sub_ps(tmp1527, tmp1525);
__m512 tmp1532 = _mm512_sub_ps(tmp1531, tmp1529);
tmp1527 = _mm512_add_ps(tmp1525, tmp1527);
tmp1531 = _mm512_add_ps(tmp1529, tmp1531);
tmp1525 = _mm512_fmadd_ps(tmp1519, _mm512_set1_ps(2.5e-01f), in310);
tmp1529 = _mm512_fmadd_ps(tmp1523, _mm512_set1_ps(2.5e-01f), in318);
tmp1526 = _mm512_fmadd_ps(tmp1518, _mm512_set1_ps(-1.25e+00f), tmp1526);
tmp1530 = _mm512_fmadd_ps(tmp1522, _mm512_set1_ps(-1.25e+00f), tmp1530);
tmp1518 = _mm512_fmadd_ps(tmp1518, _mm512_set1_ps(-5e+00f), tmp1520);
tmp1522 = _mm512_fmadd_ps(tmp1522, _mm512_set1_ps(-5e+00f), tmp1524);
tmp1525 = _mm512_fmadd_ps(in314, _mm512_set1_ps(-1.25e+00f), tmp1525);
tmp1529 = _mm512_fmadd_ps(in322, _mm512_set1_ps(-1.25e+00f), tmp1529);
in312 = _mm512_fmadd_ps(tmp1525, _mm512_set1_ps(2e+00f), tmp1526);
in320 = _mm512_fmadd_ps(tmp1529, _mm512_set1_ps(2e+00f), tmp1530);
tmp1526 = _mm512_fnmadd_ps(tmp1525, _mm512_set1_ps(2e+00f), tmp1526);
tmp1530 = _mm512_fnmadd_ps(tmp1529, _mm512_set1_ps(2e+00f), tmp1530);
tmp1525 = _mm512_fmadd_ps(in310, _mm512_set1_ps(2.5e-01f), tmp1519);
tmp1529 = _mm512_fmadd_ps(in318, _mm512_set1_ps(2.5e-01f), tmp1523);
tmp1519 = _mm512_sub_ps(in311, tmp1519);
tmp1523 = _mm512_sub_ps(in319, tmp1523);
tmp1525 = _mm512_fmadd_ps(in314, _mm512_set1_ps(-1.25e+00f), tmp1525);
tmp1529 = _mm512_fmadd_ps(in322, _mm512_set1_ps(-1.25e+00f), tmp1529);
in314 = _mm512_sub_ps(in314, in310);
in322 = _mm512_sub_ps(in322, in318);
in314 = _mm512_fmadd_ps(in314, _mm512_set1_ps(5.25e+00f), tmp1519);
in322 = _mm512_fmadd_ps(in322, _mm512_set1_ps(5.25e+00f), tmp1523);
tmp1520 = _mm512_fmadd_ps(tmp1525, _mm512_set1_ps(2e+00f), tmp1518);
tmp1524 = _mm512_fmadd_ps(tmp1529, _mm512_set1_ps(2e+00f), tmp1522);
tmp1518 = _mm512_fnmadd_ps(tmp1525, _mm512_set1_ps(2e+00f), tmp1518);
tmp1522 = _mm512_fnmadd_ps(tmp1529, _mm512_set1_ps(2e+00f), tmp1522);
// Pack consecutive result pairs for storing. With _mm512_shuffle_f32x4,
// imm 68 yields {a.lanes0-7, b.lanes0-7} and imm 238 yields
// {a.lanes8-15, b.lanes8-15}. Pairs are (in308,tmp1527), (tmp1528,in312),
// (tmp1526,tmp1520), (tmp1518,in314), then the same for the in316 family.
__m512 out327 = _mm512_shuffle_f32x4(in308, tmp1527, 68);
__m512 out335 = _mm512_shuffle_f32x4(in308, tmp1527, 238);
__m512 out328 = _mm512_shuffle_f32x4(tmp1528, in312, 68);
__m512 out336 = _mm512_shuffle_f32x4(tmp1528, in312, 238);
__m512 out329 = _mm512_shuffle_f32x4(tmp1526, tmp1520, 68);
__m512 out337 = _mm512_shuffle_f32x4(tmp1526, tmp1520, 238);
__m512 out330 = _mm512_shuffle_f32x4(tmp1518, in314, 68);
__m512 out338 = _mm512_shuffle_f32x4(tmp1518, in314, 238);
__m512 out331 = _mm512_shuffle_f32x4(in316, tmp1531, 68);
__m512 out339 = _mm512_shuffle_f32x4(in316, tmp1531, 238);
__m512 out332 = _mm512_shuffle_f32x4(tmp1532, in320, 68);
__m512 out340 = _mm512_shuffle_f32x4(tmp1532, in320, 238);
__m512 out333 = _mm512_shuffle_f32x4(tmp1530, tmp1524, 68);
__m512 out341 = _mm512_shuffle_f32x4(tmp1530, tmp1524, 238);
__m512 out334 = _mm512_shuffle_f32x4(tmp1522, in322, 68);
__m512 out342 = _mm512_shuffle_f32x4(tmp1522, in322, 238);
// Sixteen unaligned 512-bit stores into dfPtr4. The constant offsets form
// four groups 409600 apart (256, 409856, 819456, 1229056), one per packed
// register pair, each with four slots at +0, +64, +128, +192. Within a group
// the slot order is: in308-family low half, in316-family low half,
// in308-family high half, in316-family high half.
// NOTE(review): offsets look like byte counts (64 = one 512-bit vector);
// confirm against dfPtr4's declared type.
_mm512_storeu_ps(dfPtr4+256+1638400*i16+24576*j11+24576*s13+768*k58, out327);
_mm512_storeu_ps(dfPtr4+384+1638400*i16+24576*j11+24576*s13+768*k58, out335);
_mm512_storeu_ps(dfPtr4+320+1638400*i16+24576*j11+24576*s13+768*k58, out331);
_mm512_storeu_ps(dfPtr4+448+1638400*i16+24576*j11+24576*s13+768*k58, out339);
_mm512_storeu_ps(dfPtr4+409856+1638400*i16+24576*j11+24576*s13+768*k58, out328);
_mm512_storeu_ps(dfPtr4+409984+1638400*i16+24576*j11+24576*s13+768*k58, out336);
_mm512_storeu_ps(dfPtr4+409920+1638400*i16+24576*j11+24576*s13+768*k58, out332);
_mm512_storeu_ps(dfPtr4+410048+1638400*i16+24576*j11+24576*s13+768*k58, out340);
_mm512_storeu_ps(dfPtr4+819456+1638400*i16+24576*j11+24576*s13+768*k58, out329);
_mm512_storeu_ps(dfPtr4+819584+1638400*i16+24576*j11+24576*s13+768*k58, out337);
_mm512_storeu_ps(dfPtr4+819520+1638400*i16+24576*j11+24576*s13+768*k58, out333);
_mm512_storeu_ps(dfPtr4+819648+1638400*i16+24576*j11+24576*s13+768*k58, out341);
_mm512_storeu_ps(dfPtr4+1229056+1638400*i16+24576*j11+24576*s13+768*k58, out330);
_mm512_storeu_ps(dfPtr4+1229184+1638400*i16+24576*j11+24576*s13+768*k58, out338);
_mm512_storeu_ps(dfPtr4+1229120+1638400*i16+24576*j11+24576*s13+768*k58, out334);
_mm512_storeu_ps(dfPtr4+1229248+1638400*i16+24576*j11+24576*s13+768*k58, out342);
// Load stage: reads eight consecutive rows (constant offset grows by 224 per row)
// for two register sets, in324..in331 and in332..in339.
// Each load is followed by max(0, x), which replaces negative values with zero,
// and then by a lane permutation that places two 8-lane windows in one register.
// NOTE(review): 224 per row and 4*w29 look like byte offsets — confirm datPtr5 is a byte pointer.
// Mask 511 loads the low 9 lanes; lanes 9..15 are zeroed.
__m512 dat1231 = _mm512_maskz_loadu_ps(511, datPtr5+12656+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1231 = _mm512_max_ps(_mm512_setzero_ps(), dat1231);
// Mask 8191 loads the low 13 lanes; lanes 13..15 are zeroed.
__m512 dat1232 = _mm512_maskz_loadu_ps(8191, datPtr5+13812+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1232 = _mm512_max_ps(_mm512_setzero_ps(), dat1232);
// _mm512_set_epi32 lists lane 15 first. pm99: lanes 0..7 <- elements 0..7,
// lanes 8..10 <- elements 6..8, lanes 11..15 <- element 15 (zero after the masked load).
__m512i pm99 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in324 = _mm512_permutexvar_ps(pm99, dat1231);
// pm100: lane 0 <- element 15 (zero after the masked load), lanes 1..7 <- elements 0..6,
// lanes 8..15 <- elements 5..12. The two windows share two source elements.
__m512i pm100 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in332 = _mm512_permutexvar_ps(pm100, dat1232);
// Rows 1..7 repeat the same load / max / permute pattern, reusing pm99 and pm100.
__m512 dat1233 = _mm512_maskz_loadu_ps(511, datPtr5+12880+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1233 = _mm512_max_ps(_mm512_setzero_ps(), dat1233);
__m512 dat1234 = _mm512_maskz_loadu_ps(8191, datPtr5+14036+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1234 = _mm512_max_ps(_mm512_setzero_ps(), dat1234);
__m512 in325 = _mm512_permutexvar_ps(pm99, dat1233);
__m512 in333 = _mm512_permutexvar_ps(pm100, dat1234);
__m512 dat1235 = _mm512_maskz_loadu_ps(511, datPtr5+13104+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1235 = _mm512_max_ps(_mm512_setzero_ps(), dat1235);
__m512 dat1236 = _mm512_maskz_loadu_ps(8191, datPtr5+14260+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1236 = _mm512_max_ps(_mm512_setzero_ps(), dat1236);
__m512 in326 = _mm512_permutexvar_ps(pm99, dat1235);
__m512 in334 = _mm512_permutexvar_ps(pm100, dat1236);
__m512 dat1237 = _mm512_maskz_loadu_ps(511, datPtr5+13328+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1237 = _mm512_max_ps(_mm512_setzero_ps(), dat1237);
__m512 dat1238 = _mm512_maskz_loadu_ps(8191, datPtr5+14484+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1238 = _mm512_max_ps(_mm512_setzero_ps(), dat1238);
__m512 in327 = _mm512_permutexvar_ps(pm99, dat1237);
__m512 in335 = _mm512_permutexvar_ps(pm100, dat1238);
__m512 dat1239 = _mm512_maskz_loadu_ps(511, datPtr5+13552+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1239 = _mm512_max_ps(_mm512_setzero_ps(), dat1239);
__m512 dat1240 = _mm512_maskz_loadu_ps(8191, datPtr5+14708+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1240 = _mm512_max_ps(_mm512_setzero_ps(), dat1240);
__m512 in328 = _mm512_permutexvar_ps(pm99, dat1239);
__m512 in336 = _mm512_permutexvar_ps(pm100, dat1240);
__m512 dat1241 = _mm512_maskz_loadu_ps(511, datPtr5+13776+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1241 = _mm512_max_ps(_mm512_setzero_ps(), dat1241);
__m512 dat1242 = _mm512_maskz_loadu_ps(8191, datPtr5+14932+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1242 = _mm512_max_ps(_mm512_setzero_ps(), dat1242);
__m512 in329 = _mm512_permutexvar_ps(pm99, dat1241);
__m512 in337 = _mm512_permutexvar_ps(pm100, dat1242);
__m512 dat1243 = _mm512_maskz_loadu_ps(511, datPtr5+14000+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1243 = _mm512_max_ps(_mm512_setzero_ps(), dat1243);
__m512 dat1244 = _mm512_maskz_loadu_ps(8191, datPtr5+15156+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1244 = _mm512_max_ps(_mm512_setzero_ps(), dat1244);
__m512 in330 = _mm512_permutexvar_ps(pm99, dat1243);
__m512 in338 = _mm512_permutexvar_ps(pm100, dat1244);
__m512 dat1245 = _mm512_maskz_loadu_ps(511, datPtr5+14224+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1245 = _mm512_max_ps(_mm512_setzero_ps(), dat1245);
__m512 dat1246 = _mm512_maskz_loadu_ps(8191, datPtr5+15380+806912*i16+224*h26+4*w29+806912*s13+25216*k58);
dat1246 = _mm512_max_ps(_mm512_setzero_ps(), dat1246);
__m512 in331 = _mm512_permutexvar_ps(pm99, dat1245);
__m512 in339 = _mm512_permutexvar_ps(pm100, dat1246);
// First pass of an 8-point linear transform taken across the eight row registers.
// Writing d0..d7 for in324..in331 on entry, the values left behind are
// (up to fused multiply-add rounding):
//   in324   = d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp1583 = (d2 - 4.25*d4 + d6) + (d1 - 4.25*d3 + d5)
//   tmp1584 = (d2 - 4.25*d4 + d6) - (d1 - 4.25*d3 + d5)
//   in330   = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   tmp1582 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   in326   = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   in328   = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   in327   = (d7 - d1) + 5.25*(d3 - d5)
// Statements come in pairs: the second line of each pair applies the same step
// to the second register set (in332..in339, tmp1585..tmp1588).
// Input registers are overwritten as scratch, so statement order matters.
// NOTE(review): the coefficients match a Winograd F(6,3)-style input transform — confirm against the generator.
__m512 tmp1581 = _mm512_add_ps(in325, in329);
__m512 tmp1585 = _mm512_add_ps(in333, in337);
__m512 tmp1582 = _mm512_sub_ps(in328, in326);
__m512 tmp1586 = _mm512_sub_ps(in336, in334);
__m512 tmp1583 = _mm512_add_ps(in326, in330);
__m512 tmp1587 = _mm512_add_ps(in334, in338);
in324 = _mm512_sub_ps(in324, in330);
in332 = _mm512_sub_ps(in332, in338);
// Odd-row sum d1 - 4.25*d3 + d5 and even-row sum d2 - 4.25*d4 + d6.
tmp1581 = _mm512_fmadd_ps(in327, _mm512_set1_ps(-4.25e+00f), tmp1581);
tmp1585 = _mm512_fmadd_ps(in335, _mm512_set1_ps(-4.25e+00f), tmp1585);
tmp1583 = _mm512_fmadd_ps(in328, _mm512_set1_ps(-4.25e+00f), tmp1583);
tmp1587 = _mm512_fmadd_ps(in336, _mm512_set1_ps(-4.25e+00f), tmp1587);
// in324 is final after this step.
in324 = _mm512_fmadd_ps(tmp1582, _mm512_set1_ps(5.25e+00f), in324);
in332 = _mm512_fmadd_ps(tmp1586, _mm512_set1_ps(5.25e+00f), in332);
tmp1582 = _mm512_fmadd_ps(in326, _mm512_set1_ps(2.5e-01f), in330);
tmp1586 = _mm512_fmadd_ps(in334, _mm512_set1_ps(2.5e-01f), in338);
in326 = _mm512_fmadd_ps(in326, _mm512_set1_ps(4e+00f), in330);
in334 = _mm512_fmadd_ps(in334, _mm512_set1_ps(4e+00f), in338);
// Difference and sum of the even and odd sums: tmp1584 and tmp1583 are final here.
__m512 tmp1584 = _mm512_sub_ps(tmp1583, tmp1581);
__m512 tmp1588 = _mm512_sub_ps(tmp1587, tmp1585);
tmp1583 = _mm512_add_ps(tmp1581, tmp1583);
tmp1587 = _mm512_add_ps(tmp1585, tmp1587);
tmp1581 = _mm512_fmadd_ps(in325, _mm512_set1_ps(2.5e-01f), in329);
tmp1585 = _mm512_fmadd_ps(in333, _mm512_set1_ps(2.5e-01f), in337);
tmp1582 = _mm512_fmadd_ps(in328, _mm512_set1_ps(-1.25e+00f), tmp1582);
tmp1586 = _mm512_fmadd_ps(in336, _mm512_set1_ps(-1.25e+00f), tmp1586);
in328 = _mm512_fmadd_ps(in328, _mm512_set1_ps(-5e+00f), in326);
in336 = _mm512_fmadd_ps(in336, _mm512_set1_ps(-5e+00f), in334);
tmp1581 = _mm512_fmadd_ps(in327, _mm512_set1_ps(-1.25e+00f), tmp1581);
tmp1585 = _mm512_fmadd_ps(in335, _mm512_set1_ps(-1.25e+00f), tmp1585);
// fmadd / fnmadd pair: even part plus and minus twice the odd part (in330, tmp1582 final).
in330 = _mm512_fmadd_ps(tmp1581, _mm512_set1_ps(2e+00f), tmp1582);
in338 = _mm512_fmadd_ps(tmp1585, _mm512_set1_ps(2e+00f), tmp1586);
tmp1582 = _mm512_fnmadd_ps(tmp1581, _mm512_set1_ps(2e+00f), tmp1582);
tmp1586 = _mm512_fnmadd_ps(tmp1585, _mm512_set1_ps(2e+00f), tmp1586);
tmp1581 = _mm512_fmadd_ps(in329, _mm512_set1_ps(2.5e-01f), in325);
tmp1585 = _mm512_fmadd_ps(in337, _mm512_set1_ps(2.5e-01f), in333);
// in325 becomes d7 - d1; the original d1 is no longer needed after the line above.
in325 = _mm512_sub_ps(in331, in325);
in333 = _mm512_sub_ps(in339, in333);
tmp1581 = _mm512_fmadd_ps(in327, _mm512_set1_ps(-1.25e+00f), tmp1581);
tmp1585 = _mm512_fmadd_ps(in335, _mm512_set1_ps(-1.25e+00f), tmp1585);
in327 = _mm512_sub_ps(in327, in329);
in335 = _mm512_sub_ps(in335, in337);
// in327 is final after this step.
in327 = _mm512_fmadd_ps(in327, _mm512_set1_ps(5.25e+00f), in325);
in335 = _mm512_fmadd_ps(in335, _mm512_set1_ps(5.25e+00f), in333);
// Second fmadd / fnmadd pair (in326, in328 final).
in326 = _mm512_fmadd_ps(tmp1581, _mm512_set1_ps(2e+00f), in328);
in334 = _mm512_fmadd_ps(tmp1585, _mm512_set1_ps(2e+00f), in336);
in328 = _mm512_fnmadd_ps(tmp1581, _mm512_set1_ps(2e+00f), in328);
in336 = _mm512_fnmadd_ps(tmp1585, _mm512_set1_ps(2e+00f), in336);
// Transposition stage. The eight registers of each set are consumed in the order
// in324, tmp1583, tmp1584, in330, tmp1582, in326, in328, in327
// (second set: in332, tmp1587, tmp1588, in338, tmp1586, in334, in336, in335).
// Afterwards the k-th register of the first list holds, in lanes 0..7, lane k of the
// eight first-set inputs and, in lanes 8..15, lane k of the eight second-set inputs.
// The k-th register of the second list holds the same for input lane 8+k.
// Step 1: interleave register pairs within each 128-bit quarter.
__m512 tmp1597 = _mm512_unpacklo_ps(in324, tmp1583);
__m512 tmp1598 = _mm512_unpackhi_ps(in324, tmp1583);
__m512 tmp1599 = _mm512_unpacklo_ps(tmp1584, in330);
__m512 tmp1600 = _mm512_unpackhi_ps(tmp1584, in330);
__m512 tmp1601 = _mm512_unpacklo_ps(tmp1582, in326);
__m512 tmp1602 = _mm512_unpackhi_ps(tmp1582, in326);
__m512 tmp1603 = _mm512_unpacklo_ps(in328, in327);
__m512 tmp1604 = _mm512_unpackhi_ps(in328, in327);
__m512 tmp1605 = _mm512_unpacklo_ps(in332, tmp1587);
__m512 tmp1606 = _mm512_unpackhi_ps(in332, tmp1587);
__m512 tmp1607 = _mm512_unpacklo_ps(tmp1588, in338);
__m512 tmp1608 = _mm512_unpackhi_ps(tmp1588, in338);
__m512 tmp1609 = _mm512_unpacklo_ps(tmp1586, in334);
__m512 tmp1610 = _mm512_unpackhi_ps(tmp1586, in334);
__m512 tmp1611 = _mm512_unpacklo_ps(in336, in335);
__m512 tmp1612 = _mm512_unpackhi_ps(in336, in335);
// Step 2: immediate 68 keeps elements 0,1 of each operand's quarter and 238 keeps
// elements 2,3, which completes a 4x4 transpose inside every 128-bit quarter.
__m512 tmp1613 = _mm512_shuffle_ps(tmp1597, tmp1599, 68);
__m512 tmp1614 = _mm512_shuffle_ps(tmp1597, tmp1599, 238);
__m512 tmp1615 = _mm512_shuffle_ps(tmp1598, tmp1600, 68);
__m512 tmp1616 = _mm512_shuffle_ps(tmp1598, tmp1600, 238);
__m512 tmp1617 = _mm512_shuffle_ps(tmp1601, tmp1603, 68);
__m512 tmp1618 = _mm512_shuffle_ps(tmp1601, tmp1603, 238);
__m512 tmp1619 = _mm512_shuffle_ps(tmp1602, tmp1604, 68);
__m512 tmp1620 = _mm512_shuffle_ps(tmp1602, tmp1604, 238);
__m512 tmp1621 = _mm512_shuffle_ps(tmp1605, tmp1607, 68);
__m512 tmp1622 = _mm512_shuffle_ps(tmp1605, tmp1607, 238);
__m512 tmp1623 = _mm512_shuffle_ps(tmp1606, tmp1608, 68);
__m512 tmp1624 = _mm512_shuffle_ps(tmp1606, tmp1608, 238);
__m512 tmp1625 = _mm512_shuffle_ps(tmp1609, tmp1611, 68);
__m512 tmp1626 = _mm512_shuffle_ps(tmp1609, tmp1611, 238);
__m512 tmp1627 = _mm512_shuffle_ps(tmp1610, tmp1612, 68);
__m512 tmp1628 = _mm512_shuffle_ps(tmp1610, tmp1612, 238);
// Step 3: immediate 136 gathers quarters 0 and 2 of each operand, 221 gathers
// quarters 1 and 3; this joins the upper four and lower four registers of a set.
__m512 tmp1629 = _mm512_shuffle_f32x4(tmp1613, tmp1617, 136);
__m512 tmp1630 = _mm512_shuffle_f32x4(tmp1613, tmp1617, 221);
__m512 tmp1631 = _mm512_shuffle_f32x4(tmp1614, tmp1618, 136);
__m512 tmp1632 = _mm512_shuffle_f32x4(tmp1614, tmp1618, 221);
__m512 tmp1633 = _mm512_shuffle_f32x4(tmp1615, tmp1619, 136);
__m512 tmp1634 = _mm512_shuffle_f32x4(tmp1615, tmp1619, 221);
__m512 tmp1635 = _mm512_shuffle_f32x4(tmp1616, tmp1620, 136);
__m512 tmp1636 = _mm512_shuffle_f32x4(tmp1616, tmp1620, 221);
__m512 tmp1637 = _mm512_shuffle_f32x4(tmp1621, tmp1625, 136);
__m512 tmp1638 = _mm512_shuffle_f32x4(tmp1621, tmp1625, 221);
__m512 tmp1639 = _mm512_shuffle_f32x4(tmp1622, tmp1626, 136);
__m512 tmp1640 = _mm512_shuffle_f32x4(tmp1622, tmp1626, 221);
__m512 tmp1641 = _mm512_shuffle_f32x4(tmp1623, tmp1627, 136);
__m512 tmp1642 = _mm512_shuffle_f32x4(tmp1623, tmp1627, 221);
__m512 tmp1643 = _mm512_shuffle_f32x4(tmp1624, tmp1628, 136);
__m512 tmp1644 = _mm512_shuffle_f32x4(tmp1624, tmp1628, 221);
// Step 4: the same two immediates combine the first-set and second-set halves and
// write the result back into the original register names.
in324 = _mm512_shuffle_f32x4(tmp1629, tmp1637, 136);
in332 = _mm512_shuffle_f32x4(tmp1629, tmp1637, 221);
tmp1583 = _mm512_shuffle_f32x4(tmp1631, tmp1639, 136);
tmp1587 = _mm512_shuffle_f32x4(tmp1631, tmp1639, 221);
tmp1584 = _mm512_shuffle_f32x4(tmp1633, tmp1641, 136);
tmp1588 = _mm512_shuffle_f32x4(tmp1633, tmp1641, 221);
in330 = _mm512_shuffle_f32x4(tmp1635, tmp1643, 136);
in338 = _mm512_shuffle_f32x4(tmp1635, tmp1643, 221);
tmp1582 = _mm512_shuffle_f32x4(tmp1630, tmp1638, 136);
tmp1586 = _mm512_shuffle_f32x4(tmp1630, tmp1638, 221);
in326 = _mm512_shuffle_f32x4(tmp1632, tmp1640, 136);
in334 = _mm512_shuffle_f32x4(tmp1632, tmp1640, 221);
in328 = _mm512_shuffle_f32x4(tmp1634, tmp1642, 136);
in336 = _mm512_shuffle_f32x4(tmp1634, tmp1642, 221);
in327 = _mm512_shuffle_f32x4(tmp1636, tmp1644, 136);
in335 = _mm512_shuffle_f32x4(tmp1636, tmp1644, 221);
// Second pass of the 8-point linear transform, followed by packing and stores.
// Writing c0..c7 for in324, tmp1583, tmp1584, in330, tmp1582, in326, in328, in327
// on entry, the values left behind are (up to fused multiply-add rounding):
//   in324   = c0 - 5.25*c2 + 5.25*c4 - c6
//   tmp1591 = (c2 - 4.25*c4 + c6) + (c1 - 4.25*c3 + c5)
//   tmp1592 = (c2 - 4.25*c4 + c6) - (c1 - 4.25*c3 + c5)
//   in328   = (0.25*c2 - 1.25*c4 + c6) + 2*(0.25*c1 - 1.25*c3 + c5)
//   tmp1590 = (0.25*c2 - 1.25*c4 + c6) - 2*(0.25*c1 - 1.25*c3 + c5)
//   tmp1584 = (4*c2 - 5*c4 + c6) + 2*(c1 - 1.25*c3 + 0.25*c5)
//   tmp1582 = (4*c2 - 5*c4 + c6) - 2*(c1 - 1.25*c3 + 0.25*c5)
//   in330   = (c7 - c1) + 5.25*(c3 - c5)
// The second line of every pair does the same for in332, tmp1587, tmp1588, in338,
// tmp1586, in334, in336, in335. Registers are reused as scratch, so order matters.
__m512 tmp1589 = _mm512_add_ps(tmp1583, in326);
__m512 tmp1593 = _mm512_add_ps(tmp1587, in334);
__m512 tmp1590 = _mm512_sub_ps(tmp1582, tmp1584);
__m512 tmp1594 = _mm512_sub_ps(tmp1586, tmp1588);
__m512 tmp1591 = _mm512_add_ps(tmp1584, in328);
__m512 tmp1595 = _mm512_add_ps(tmp1588, in336);
in324 = _mm512_sub_ps(in324, in328);
in332 = _mm512_sub_ps(in332, in336);
tmp1589 = _mm512_fmadd_ps(in330, _mm512_set1_ps(-4.25e+00f), tmp1589);
tmp1593 = _mm512_fmadd_ps(in338, _mm512_set1_ps(-4.25e+00f), tmp1593);
tmp1591 = _mm512_fmadd_ps(tmp1582, _mm512_set1_ps(-4.25e+00f), tmp1591);
tmp1595 = _mm512_fmadd_ps(tmp1586, _mm512_set1_ps(-4.25e+00f), tmp1595);
in324 = _mm512_fmadd_ps(tmp1590, _mm512_set1_ps(5.25e+00f), in324);
in332 = _mm512_fmadd_ps(tmp1594, _mm512_set1_ps(5.25e+00f), in332);
tmp1590 = _mm512_fmadd_ps(tmp1584, _mm512_set1_ps(2.5e-01f), in328);
tmp1594 = _mm512_fmadd_ps(tmp1588, _mm512_set1_ps(2.5e-01f), in336);
tmp1584 = _mm512_fmadd_ps(tmp1584, _mm512_set1_ps(4e+00f), in328);
tmp1588 = _mm512_fmadd_ps(tmp1588, _mm512_set1_ps(4e+00f), in336);
__m512 tmp1592 = _mm512_sub_ps(tmp1591, tmp1589);
__m512 tmp1596 = _mm512_sub_ps(tmp1595, tmp1593);
tmp1591 = _mm512_add_ps(tmp1589, tmp1591);
tmp1595 = _mm512_add_ps(tmp1593, tmp1595);
tmp1589 = _mm512_fmadd_ps(tmp1583, _mm512_set1_ps(2.5e-01f), in326);
tmp1593 = _mm512_fmadd_ps(tmp1587, _mm512_set1_ps(2.5e-01f), in334);
tmp1590 = _mm512_fmadd_ps(tmp1582, _mm512_set1_ps(-1.25e+00f), tmp1590);
tmp1594 = _mm512_fmadd_ps(tmp1586, _mm512_set1_ps(-1.25e+00f), tmp1594);
tmp1582 = _mm512_fmadd_ps(tmp1582, _mm512_set1_ps(-5e+00f), tmp1584);
tmp1586 = _mm512_fmadd_ps(tmp1586, _mm512_set1_ps(-5e+00f), tmp1588);
tmp1589 = _mm512_fmadd_ps(in330, _mm512_set1_ps(-1.25e+00f), tmp1589);
tmp1593 = _mm512_fmadd_ps(in338, _mm512_set1_ps(-1.25e+00f), tmp1593);
in328 = _mm512_fmadd_ps(tmp1589, _mm512_set1_ps(2e+00f), tmp1590);
in336 = _mm512_fmadd_ps(tmp1593, _mm512_set1_ps(2e+00f), tmp1594);
tmp1590 = _mm512_fnmadd_ps(tmp1589, _mm512_set1_ps(2e+00f), tmp1590);
tmp1594 = _mm512_fnmadd_ps(tmp1593, _mm512_set1_ps(2e+00f), tmp1594);
tmp1589 = _mm512_fmadd_ps(in326, _mm512_set1_ps(2.5e-01f), tmp1583);
tmp1593 = _mm512_fmadd_ps(in334, _mm512_set1_ps(2.5e-01f), tmp1587);
tmp1583 = _mm512_sub_ps(in327, tmp1583);
tmp1587 = _mm512_sub_ps(in335, tmp1587);
tmp1589 = _mm512_fmadd_ps(in330, _mm512_set1_ps(-1.25e+00f), tmp1589);
tmp1593 = _mm512_fmadd_ps(in338, _mm512_set1_ps(-1.25e+00f), tmp1593);
in330 = _mm512_sub_ps(in330, in326);
in338 = _mm512_sub_ps(in338, in334);
in330 = _mm512_fmadd_ps(in330, _mm512_set1_ps(5.25e+00f), tmp1583);
in338 = _mm512_fmadd_ps(in338, _mm512_set1_ps(5.25e+00f), tmp1587);
tmp1584 = _mm512_fmadd_ps(tmp1589, _mm512_set1_ps(2e+00f), tmp1582);
tmp1588 = _mm512_fmadd_ps(tmp1593, _mm512_set1_ps(2e+00f), tmp1586);
tmp1582 = _mm512_fnmadd_ps(tmp1589, _mm512_set1_ps(2e+00f), tmp1582);
tmp1586 = _mm512_fnmadd_ps(tmp1593, _mm512_set1_ps(2e+00f), tmp1586);
// Packing: consecutive results are paired. Immediate 68 joins the low 256 bits of
// both operands; 238 joins their high 256 bits.
__m512 out343 = _mm512_shuffle_f32x4(in324, tmp1591, 68);
__m512 out351 = _mm512_shuffle_f32x4(in324, tmp1591, 238);
__m512 out344 = _mm512_shuffle_f32x4(tmp1592, in328, 68);
__m512 out352 = _mm512_shuffle_f32x4(tmp1592, in328, 238);
__m512 out345 = _mm512_shuffle_f32x4(tmp1590, tmp1584, 68);
__m512 out353 = _mm512_shuffle_f32x4(tmp1590, tmp1584, 238);
__m512 out346 = _mm512_shuffle_f32x4(tmp1582, in330, 68);
__m512 out354 = _mm512_shuffle_f32x4(tmp1582, in330, 238);
__m512 out347 = _mm512_shuffle_f32x4(in332, tmp1595, 68);
__m512 out355 = _mm512_shuffle_f32x4(in332, tmp1595, 238);
__m512 out348 = _mm512_shuffle_f32x4(tmp1596, in336, 68);
__m512 out356 = _mm512_shuffle_f32x4(tmp1596, in336, 238);
__m512 out349 = _mm512_shuffle_f32x4(tmp1594, tmp1588, 68);
__m512 out357 = _mm512_shuffle_f32x4(tmp1594, tmp1588, 238);
__m512 out350 = _mm512_shuffle_f32x4(tmp1586, in338, 68);
__m512 out358 = _mm512_shuffle_f32x4(tmp1586, in338, 238);
// Sixteen unaligned stores: four regions whose constant offsets are 409600 apart,
// each receiving four vectors at +512, +576, +640 and +704.
// NOTE(review): the 64-unit spacing equals one 16-float vector if dfPtr4 is a byte pointer — confirm.
_mm512_storeu_ps(dfPtr4+512+1638400*i16+24576*j11+24576*s13+768*k58, out343);
_mm512_storeu_ps(dfPtr4+640+1638400*i16+24576*j11+24576*s13+768*k58, out351);
_mm512_storeu_ps(dfPtr4+576+1638400*i16+24576*j11+24576*s13+768*k58, out347);
_mm512_storeu_ps(dfPtr4+704+1638400*i16+24576*j11+24576*s13+768*k58, out355);
_mm512_storeu_ps(dfPtr4+410112+1638400*i16+24576*j11+24576*s13+768*k58, out344);
_mm512_storeu_ps(dfPtr4+410240+1638400*i16+24576*j11+24576*s13+768*k58, out352);
_mm512_storeu_ps(dfPtr4+410176+1638400*i16+24576*j11+24576*s13+768*k58, out348);
_mm512_storeu_ps(dfPtr4+410304+1638400*i16+24576*j11+24576*s13+768*k58, out356);
_mm512_storeu_ps(dfPtr4+819712+1638400*i16+24576*j11+24576*s13+768*k58, out345);
_mm512_storeu_ps(dfPtr4+819840+1638400*i16+24576*j11+24576*s13+768*k58, out353);
_mm512_storeu_ps(dfPtr4+819776+1638400*i16+24576*j11+24576*s13+768*k58, out349);
_mm512_storeu_ps(dfPtr4+819904+1638400*i16+24576*j11+24576*s13+768*k58, out357);
_mm512_storeu_ps(dfPtr4+1229312+1638400*i16+24576*j11+24576*s13+768*k58, out346);
_mm512_storeu_ps(dfPtr4+1229440+1638400*i16+24576*j11+24576*s13+768*k58, out354);
_mm512_storeu_ps(dfPtr4+1229376+1638400*i16+24576*j11+24576*s13+768*k58, out350);
_mm512_storeu_ps(dfPtr4+1229504+1638400*i16+24576*j11+24576*s13+768*k58, out358);
}
if (j11 >= last3) return;
++j11;
}
j11 = 15;
}
// rel9 is j11 measured from 15; base9 is the constant 54 used to form h27 below.
ptrdiff_t rel9 = j11-15;
ptrdiff_t base9 = 54;
// Taken when j11 <= 15.
if (rel9 < 1) {
// h27 evaluates to 54 and w30 to 0; both feed the load address arithmetic in the loop.
ptrdiff_t h27 = base9+0;
ptrdiff_t w30 = 0;
// 32 iterations; k59 scales the load offset by 25216 and the store offset by 768.
ptrdiff_t k59 = 0;
for (; k59 != 32; ++k59) {
// Load stage for a group in which only three rows are read (constant offset grows
// by 224 per row), for two register sets: in340..in342 and in343..in345.
// Each load is followed by max(0, x), which replaces negative values with zero,
// and then by a lane permutation that places two 8-lane windows in one register.
// NOTE(review): 224 per row and 4*w30 look like byte offsets — confirm datPtr5 is a byte pointer.
// Mask 8191 loads the low 13 lanes; lanes 13..15 are zeroed.
__m512 dat1247 = _mm512_maskz_loadu_ps(8191, datPtr5+4+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1247 = _mm512_max_ps(_mm512_setzero_ps(), dat1247);
// Mask 16383 loads the low 14 lanes; lanes 14..15 are zeroed.
__m512 dat1248 = _mm512_maskz_loadu_ps(16383, datPtr5+48+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1248 = _mm512_max_ps(_mm512_setzero_ps(), dat1248);
// _mm512_set_epi32 lists lane 15 first. pm101: lane 0 <- element 15 (zero after the
// masked load), lanes 1..7 <- elements 0..6, lanes 8..15 <- elements 5..12.
__m512i pm101 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in340 = _mm512_permutexvar_ps(pm101, dat1247);
// pm102: lanes 0..7 <- elements 0..7, lanes 8..15 <- elements 6..13.
__m512i pm102 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in343 = _mm512_permutexvar_ps(pm102, dat1248);
// Rows 1 and 2 repeat the pattern with the same permutations.
__m512 dat1249 = _mm512_maskz_loadu_ps(8191, datPtr5+228+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1249 = _mm512_max_ps(_mm512_setzero_ps(), dat1249);
__m512 dat1250 = _mm512_maskz_loadu_ps(16383, datPtr5+272+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1250 = _mm512_max_ps(_mm512_setzero_ps(), dat1250);
__m512 in341 = _mm512_permutexvar_ps(pm101, dat1249);
__m512 in344 = _mm512_permutexvar_ps(pm102, dat1250);
__m512 dat1251 = _mm512_maskz_loadu_ps(8191, datPtr5+452+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1251 = _mm512_max_ps(_mm512_setzero_ps(), dat1251);
__m512 dat1252 = _mm512_maskz_loadu_ps(16383, datPtr5+496+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1252 = _mm512_max_ps(_mm512_setzero_ps(), dat1252);
__m512 in342 = _mm512_permutexvar_ps(pm101, dat1251);
__m512 in345 = _mm512_permutexvar_ps(pm102, dat1252);
// First pass of the 8-output linear transform when only three input rows exist.
// Writing d0, d1, d2 for in340, in341, in342 on entry, the eight results are
// (up to fused multiply-add rounding):
//   in340   = d0 - 5.25*d2
//   tmp1647 = d2 + d1
//   tmp1648 = d2 - d1
//   tmp1650 = 0.25*d2 + 2*(0.25*d1)
//   tmp1646 = 0.25*d2 - 2*(0.25*d1)
//   in342   = 4*d2 + 2*d1
//   tmp1649 = 4*d2 - 2*d1
//   tmp1651 = -d1
// The second line of every pair does the same for in343..in345 (tmp1652..tmp1658).
// Self-assignments such as "in340 = in340;" have no effect; presumably the generator
// emits them where a term was dropped — TODO confirm.
__m512 tmp1645 = in341;
__m512 tmp1652 = in344;
// 0 - d2, used as the multiplicand for the 5.25 term below.
__m512 tmp1646 = _mm512_sub_ps(_mm512_setzero_ps(), in342);
__m512 tmp1653 = _mm512_sub_ps(_mm512_setzero_ps(), in345);
__m512 tmp1647 = in342;
__m512 tmp1654 = in345;
in340 = in340;
in343 = in343;
tmp1645 = tmp1645;
tmp1652 = tmp1652;
tmp1647 = tmp1647;
tmp1654 = tmp1654;
in340 = _mm512_fmadd_ps(tmp1646, _mm512_set1_ps(5.25e+00f), in340);
in343 = _mm512_fmadd_ps(tmp1653, _mm512_set1_ps(5.25e+00f), in343);
tmp1646 = _mm512_mul_ps(in342, _mm512_set1_ps(2.5e-01f));
tmp1653 = _mm512_mul_ps(in345, _mm512_set1_ps(2.5e-01f));
in342 = _mm512_mul_ps(in342, _mm512_set1_ps(4e+00f));
in345 = _mm512_mul_ps(in345, _mm512_set1_ps(4e+00f));
// Difference and sum of d2 and d1.
__m512 tmp1648 = _mm512_sub_ps(tmp1647, tmp1645);
__m512 tmp1655 = _mm512_sub_ps(tmp1654, tmp1652);
tmp1647 = _mm512_add_ps(tmp1645, tmp1647);
tmp1654 = _mm512_add_ps(tmp1652, tmp1654);
tmp1645 = _mm512_mul_ps(in341, _mm512_set1_ps(2.5e-01f));
tmp1652 = _mm512_mul_ps(in344, _mm512_set1_ps(2.5e-01f));
tmp1646 = tmp1646;
tmp1653 = tmp1653;
// Snapshot of 4*d2 before in342 is overwritten further down.
__m512 tmp1649 = in342;
__m512 tmp1656 = in345;
tmp1645 = tmp1645;
tmp1652 = tmp1652;
__m512 tmp1650 = _mm512_fmadd_ps(tmp1645, _mm512_set1_ps(2e+00f), tmp1646);
__m512 tmp1657 = _mm512_fmadd_ps(tmp1652, _mm512_set1_ps(2e+00f), tmp1653);
tmp1646 = _mm512_fnmadd_ps(tmp1645, _mm512_set1_ps(2e+00f), tmp1646);
tmp1653 = _mm512_fnmadd_ps(tmp1652, _mm512_set1_ps(2e+00f), tmp1653);
// Keep d1 in tmp1645 before in341 is negated.
tmp1645 = in341;
tmp1652 = in344;
in341 = _mm512_sub_ps(_mm512_setzero_ps(), in341);
in344 = _mm512_sub_ps(_mm512_setzero_ps(), in344);
tmp1645 = tmp1645;
tmp1652 = tmp1652;
__m512 tmp1651 = in341;
__m512 tmp1658 = in344;
in342 = _mm512_fmadd_ps(tmp1645, _mm512_set1_ps(2e+00f), tmp1649);
in345 = _mm512_fmadd_ps(tmp1652, _mm512_set1_ps(2e+00f), tmp1656);
tmp1649 = _mm512_fnmadd_ps(tmp1645, _mm512_set1_ps(2e+00f), tmp1649);
tmp1656 = _mm512_fnmadd_ps(tmp1652, _mm512_set1_ps(2e+00f), tmp1656);
// Transposition stage. The eight registers of each set are consumed in the order
// in340, tmp1647, tmp1648, tmp1650, tmp1646, in342, tmp1649, tmp1651
// (second set: in343, tmp1654, tmp1655, tmp1657, tmp1653, in345, tmp1656, tmp1658).
// Afterwards the k-th register of the first list holds, in lanes 0..7, lane k of the
// eight first-set inputs and, in lanes 8..15, lane k of the eight second-set inputs.
// The k-th register of the second list holds the same for input lane 8+k.
// Step 1: interleave register pairs within each 128-bit quarter.
__m512 tmp1667 = _mm512_unpacklo_ps(in340, tmp1647);
__m512 tmp1668 = _mm512_unpackhi_ps(in340, tmp1647);
__m512 tmp1669 = _mm512_unpacklo_ps(tmp1648, tmp1650);
__m512 tmp1670 = _mm512_unpackhi_ps(tmp1648, tmp1650);
__m512 tmp1671 = _mm512_unpacklo_ps(tmp1646, in342);
__m512 tmp1672 = _mm512_unpackhi_ps(tmp1646, in342);
__m512 tmp1673 = _mm512_unpacklo_ps(tmp1649, tmp1651);
__m512 tmp1674 = _mm512_unpackhi_ps(tmp1649, tmp1651);
__m512 tmp1675 = _mm512_unpacklo_ps(in343, tmp1654);
__m512 tmp1676 = _mm512_unpackhi_ps(in343, tmp1654);
__m512 tmp1677 = _mm512_unpacklo_ps(tmp1655, tmp1657);
__m512 tmp1678 = _mm512_unpackhi_ps(tmp1655, tmp1657);
__m512 tmp1679 = _mm512_unpacklo_ps(tmp1653, in345);
__m512 tmp1680 = _mm512_unpackhi_ps(tmp1653, in345);
__m512 tmp1681 = _mm512_unpacklo_ps(tmp1656, tmp1658);
__m512 tmp1682 = _mm512_unpackhi_ps(tmp1656, tmp1658);
// Step 2: immediate 68 keeps elements 0,1 of each operand's quarter and 238 keeps
// elements 2,3, which completes a 4x4 transpose inside every 128-bit quarter.
__m512 tmp1683 = _mm512_shuffle_ps(tmp1667, tmp1669, 68);
__m512 tmp1684 = _mm512_shuffle_ps(tmp1667, tmp1669, 238);
__m512 tmp1685 = _mm512_shuffle_ps(tmp1668, tmp1670, 68);
__m512 tmp1686 = _mm512_shuffle_ps(tmp1668, tmp1670, 238);
__m512 tmp1687 = _mm512_shuffle_ps(tmp1671, tmp1673, 68);
__m512 tmp1688 = _mm512_shuffle_ps(tmp1671, tmp1673, 238);
__m512 tmp1689 = _mm512_shuffle_ps(tmp1672, tmp1674, 68);
__m512 tmp1690 = _mm512_shuffle_ps(tmp1672, tmp1674, 238);
__m512 tmp1691 = _mm512_shuffle_ps(tmp1675, tmp1677, 68);
__m512 tmp1692 = _mm512_shuffle_ps(tmp1675, tmp1677, 238);
__m512 tmp1693 = _mm512_shuffle_ps(tmp1676, tmp1678, 68);
__m512 tmp1694 = _mm512_shuffle_ps(tmp1676, tmp1678, 238);
__m512 tmp1695 = _mm512_shuffle_ps(tmp1679, tmp1681, 68);
__m512 tmp1696 = _mm512_shuffle_ps(tmp1679, tmp1681, 238);
__m512 tmp1697 = _mm512_shuffle_ps(tmp1680, tmp1682, 68);
__m512 tmp1698 = _mm512_shuffle_ps(tmp1680, tmp1682, 238);
// Step 3: immediate 136 gathers quarters 0 and 2 of each operand, 221 gathers
// quarters 1 and 3; this joins the upper four and lower four registers of a set.
__m512 tmp1699 = _mm512_shuffle_f32x4(tmp1683, tmp1687, 136);
__m512 tmp1700 = _mm512_shuffle_f32x4(tmp1683, tmp1687, 221);
__m512 tmp1701 = _mm512_shuffle_f32x4(tmp1684, tmp1688, 136);
__m512 tmp1702 = _mm512_shuffle_f32x4(tmp1684, tmp1688, 221);
__m512 tmp1703 = _mm512_shuffle_f32x4(tmp1685, tmp1689, 136);
__m512 tmp1704 = _mm512_shuffle_f32x4(tmp1685, tmp1689, 221);
__m512 tmp1705 = _mm512_shuffle_f32x4(tmp1686, tmp1690, 136);
__m512 tmp1706 = _mm512_shuffle_f32x4(tmp1686, tmp1690, 221);
__m512 tmp1707 = _mm512_shuffle_f32x4(tmp1691, tmp1695, 136);
__m512 tmp1708 = _mm512_shuffle_f32x4(tmp1691, tmp1695, 221);
__m512 tmp1709 = _mm512_shuffle_f32x4(tmp1692, tmp1696, 136);
__m512 tmp1710 = _mm512_shuffle_f32x4(tmp1692, tmp1696, 221);
__m512 tmp1711 = _mm512_shuffle_f32x4(tmp1693, tmp1697, 136);
__m512 tmp1712 = _mm512_shuffle_f32x4(tmp1693, tmp1697, 221);
__m512 tmp1713 = _mm512_shuffle_f32x4(tmp1694, tmp1698, 136);
__m512 tmp1714 = _mm512_shuffle_f32x4(tmp1694, tmp1698, 221);
// Step 4: the same two immediates combine the first-set and second-set halves and
// write the result back into the original register names.
in340 = _mm512_shuffle_f32x4(tmp1699, tmp1707, 136);
in343 = _mm512_shuffle_f32x4(tmp1699, tmp1707, 221);
tmp1647 = _mm512_shuffle_f32x4(tmp1701, tmp1709, 136);
tmp1654 = _mm512_shuffle_f32x4(tmp1701, tmp1709, 221);
tmp1648 = _mm512_shuffle_f32x4(tmp1703, tmp1711, 136);
tmp1655 = _mm512_shuffle_f32x4(tmp1703, tmp1711, 221);
tmp1650 = _mm512_shuffle_f32x4(tmp1705, tmp1713, 136);
tmp1657 = _mm512_shuffle_f32x4(tmp1705, tmp1713, 221);
tmp1646 = _mm512_shuffle_f32x4(tmp1700, tmp1708, 136);
tmp1653 = _mm512_shuffle_f32x4(tmp1700, tmp1708, 221);
in342 = _mm512_shuffle_f32x4(tmp1702, tmp1710, 136);
in345 = _mm512_shuffle_f32x4(tmp1702, tmp1710, 221);
tmp1649 = _mm512_shuffle_f32x4(tmp1704, tmp1712, 136);
tmp1656 = _mm512_shuffle_f32x4(tmp1704, tmp1712, 221);
tmp1651 = _mm512_shuffle_f32x4(tmp1706, tmp1714, 136);
tmp1658 = _mm512_shuffle_f32x4(tmp1706, tmp1714, 221);
// Second pass of the 8-point linear transform, followed by packing and stores.
// Writing c0..c7 for in340, tmp1647, tmp1648, tmp1650, tmp1646, in342, tmp1649,
// tmp1651 on entry, the values left behind are (up to fused multiply-add rounding):
//   in340   = c0 - 5.25*c2 + 5.25*c4 - c6
//   tmp1661 = (c2 - 4.25*c4 + c6) + (c1 - 4.25*c3 + c5)
//   tmp1662 = (c2 - 4.25*c4 + c6) - (c1 - 4.25*c3 + c5)
//   tmp1649 = (0.25*c2 - 1.25*c4 + c6) + 2*(0.25*c1 - 1.25*c3 + c5)
//   tmp1660 = (0.25*c2 - 1.25*c4 + c6) - 2*(0.25*c1 - 1.25*c3 + c5)
//   tmp1648 = (4*c2 - 5*c4 + c6) + 2*(c1 - 1.25*c3 + 0.25*c5)
//   tmp1646 = (4*c2 - 5*c4 + c6) - 2*(c1 - 1.25*c3 + 0.25*c5)
//   tmp1650 = (c7 - c1) + 5.25*(c3 - c5)
// The second line of every pair does the same for in343, tmp1654, tmp1655, tmp1657,
// tmp1653, in345, tmp1656, tmp1658. Registers are reused as scratch, so order matters.
__m512 tmp1659 = _mm512_add_ps(tmp1647, in342);
__m512 tmp1663 = _mm512_add_ps(tmp1654, in345);
__m512 tmp1660 = _mm512_sub_ps(tmp1646, tmp1648);
__m512 tmp1664 = _mm512_sub_ps(tmp1653, tmp1655);
__m512 tmp1661 = _mm512_add_ps(tmp1648, tmp1649);
__m512 tmp1665 = _mm512_add_ps(tmp1655, tmp1656);
in340 = _mm512_sub_ps(in340, tmp1649);
in343 = _mm512_sub_ps(in343, tmp1656);
tmp1659 = _mm512_fmadd_ps(tmp1650, _mm512_set1_ps(-4.25e+00f), tmp1659);
tmp1663 = _mm512_fmadd_ps(tmp1657, _mm512_set1_ps(-4.25e+00f), tmp1663);
tmp1661 = _mm512_fmadd_ps(tmp1646, _mm512_set1_ps(-4.25e+00f), tmp1661);
tmp1665 = _mm512_fmadd_ps(tmp1653, _mm512_set1_ps(-4.25e+00f), tmp1665);
in340 = _mm512_fmadd_ps(tmp1660, _mm512_set1_ps(5.25e+00f), in340);
in343 = _mm512_fmadd_ps(tmp1664, _mm512_set1_ps(5.25e+00f), in343);
tmp1660 = _mm512_fmadd_ps(tmp1648, _mm512_set1_ps(2.5e-01f), tmp1649);
tmp1664 = _mm512_fmadd_ps(tmp1655, _mm512_set1_ps(2.5e-01f), tmp1656);
tmp1648 = _mm512_fmadd_ps(tmp1648, _mm512_set1_ps(4e+00f), tmp1649);
tmp1655 = _mm512_fmadd_ps(tmp1655, _mm512_set1_ps(4e+00f), tmp1656);
__m512 tmp1662 = _mm512_sub_ps(tmp1661, tmp1659);
__m512 tmp1666 = _mm512_sub_ps(tmp1665, tmp1663);
tmp1661 = _mm512_add_ps(tmp1659, tmp1661);
tmp1665 = _mm512_add_ps(tmp1663, tmp1665);
tmp1659 = _mm512_fmadd_ps(tmp1647, _mm512_set1_ps(2.5e-01f), in342);
tmp1663 = _mm512_fmadd_ps(tmp1654, _mm512_set1_ps(2.5e-01f), in345);
tmp1660 = _mm512_fmadd_ps(tmp1646, _mm512_set1_ps(-1.25e+00f), tmp1660);
tmp1664 = _mm512_fmadd_ps(tmp1653, _mm512_set1_ps(-1.25e+00f), tmp1664);
tmp1646 = _mm512_fmadd_ps(tmp1646, _mm512_set1_ps(-5e+00f), tmp1648);
tmp1653 = _mm512_fmadd_ps(tmp1653, _mm512_set1_ps(-5e+00f), tmp1655);
tmp1659 = _mm512_fmadd_ps(tmp1650, _mm512_set1_ps(-1.25e+00f), tmp1659);
tmp1663 = _mm512_fmadd_ps(tmp1657, _mm512_set1_ps(-1.25e+00f), tmp1663);
tmp1649 = _mm512_fmadd_ps(tmp1659, _mm512_set1_ps(2e+00f), tmp1660);
tmp1656 = _mm512_fmadd_ps(tmp1663, _mm512_set1_ps(2e+00f), tmp1664);
tmp1660 = _mm512_fnmadd_ps(tmp1659, _mm512_set1_ps(2e+00f), tmp1660);
tmp1664 = _mm512_fnmadd_ps(tmp1663, _mm512_set1_ps(2e+00f), tmp1664);
tmp1659 = _mm512_fmadd_ps(in342, _mm512_set1_ps(2.5e-01f), tmp1647);
tmp1663 = _mm512_fmadd_ps(in345, _mm512_set1_ps(2.5e-01f), tmp1654);
tmp1647 = _mm512_sub_ps(tmp1651, tmp1647);
tmp1654 = _mm512_sub_ps(tmp1658, tmp1654);
tmp1659 = _mm512_fmadd_ps(tmp1650, _mm512_set1_ps(-1.25e+00f), tmp1659);
tmp1663 = _mm512_fmadd_ps(tmp1657, _mm512_set1_ps(-1.25e+00f), tmp1663);
tmp1650 = _mm512_sub_ps(tmp1650, in342);
tmp1657 = _mm512_sub_ps(tmp1657, in345);
tmp1650 = _mm512_fmadd_ps(tmp1650, _mm512_set1_ps(5.25e+00f), tmp1647);
tmp1657 = _mm512_fmadd_ps(tmp1657, _mm512_set1_ps(5.25e+00f), tmp1654);
tmp1648 = _mm512_fmadd_ps(tmp1659, _mm512_set1_ps(2e+00f), tmp1646);
tmp1655 = _mm512_fmadd_ps(tmp1663, _mm512_set1_ps(2e+00f), tmp1653);
tmp1646 = _mm512_fnmadd_ps(tmp1659, _mm512_set1_ps(2e+00f), tmp1646);
tmp1653 = _mm512_fnmadd_ps(tmp1663, _mm512_set1_ps(2e+00f), tmp1653);
// Packing: consecutive results are paired. Immediate 68 joins the low 256 bits of
// both operands; 238 joins their high 256 bits.
__m512 out359 = _mm512_shuffle_f32x4(in340, tmp1661, 68);
__m512 out367 = _mm512_shuffle_f32x4(in340, tmp1661, 238);
__m512 out360 = _mm512_shuffle_f32x4(tmp1662, tmp1649, 68);
__m512 out368 = _mm512_shuffle_f32x4(tmp1662, tmp1649, 238);
__m512 out361 = _mm512_shuffle_f32x4(tmp1660, tmp1648, 68);
__m512 out369 = _mm512_shuffle_f32x4(tmp1660, tmp1648, 238);
__m512 out362 = _mm512_shuffle_f32x4(tmp1646, tmp1650, 68);
__m512 out370 = _mm512_shuffle_f32x4(tmp1646, tmp1650, 238);
__m512 out363 = _mm512_shuffle_f32x4(in343, tmp1665, 68);
__m512 out371 = _mm512_shuffle_f32x4(in343, tmp1665, 238);
__m512 out364 = _mm512_shuffle_f32x4(tmp1666, tmp1656, 68);
__m512 out372 = _mm512_shuffle_f32x4(tmp1666, tmp1656, 238);
__m512 out365 = _mm512_shuffle_f32x4(tmp1664, tmp1655, 68);
__m512 out373 = _mm512_shuffle_f32x4(tmp1664, tmp1655, 238);
__m512 out366 = _mm512_shuffle_f32x4(tmp1653, tmp1657, 68);
__m512 out374 = _mm512_shuffle_f32x4(tmp1653, tmp1657, 238);
// Sixteen unaligned stores: four regions whose constant offsets are 409600 apart,
// each receiving four vectors at +0, +64, +128 and +192.
// NOTE(review): the 64-unit spacing equals one 16-float vector if dfPtr4 is a byte pointer — confirm.
_mm512_storeu_ps(dfPtr4+0+1638400*i16+24576*j11+24576*s13+768*k59, out359);
_mm512_storeu_ps(dfPtr4+128+1638400*i16+24576*j11+24576*s13+768*k59, out367);
_mm512_storeu_ps(dfPtr4+64+1638400*i16+24576*j11+24576*s13+768*k59, out363);
_mm512_storeu_ps(dfPtr4+192+1638400*i16+24576*j11+24576*s13+768*k59, out371);
_mm512_storeu_ps(dfPtr4+409600+1638400*i16+24576*j11+24576*s13+768*k59, out360);
_mm512_storeu_ps(dfPtr4+409728+1638400*i16+24576*j11+24576*s13+768*k59, out368);
_mm512_storeu_ps(dfPtr4+409664+1638400*i16+24576*j11+24576*s13+768*k59, out364);
_mm512_storeu_ps(dfPtr4+409792+1638400*i16+24576*j11+24576*s13+768*k59, out372);
_mm512_storeu_ps(dfPtr4+819200+1638400*i16+24576*j11+24576*s13+768*k59, out361);
_mm512_storeu_ps(dfPtr4+819328+1638400*i16+24576*j11+24576*s13+768*k59, out369);
_mm512_storeu_ps(dfPtr4+819264+1638400*i16+24576*j11+24576*s13+768*k59, out365);
_mm512_storeu_ps(dfPtr4+819392+1638400*i16+24576*j11+24576*s13+768*k59, out373);
_mm512_storeu_ps(dfPtr4+1228800+1638400*i16+24576*j11+24576*s13+768*k59, out362);
_mm512_storeu_ps(dfPtr4+1228928+1638400*i16+24576*j11+24576*s13+768*k59, out370);
_mm512_storeu_ps(dfPtr4+1228864+1638400*i16+24576*j11+24576*s13+768*k59, out366);
_mm512_storeu_ps(dfPtr4+1228992+1638400*i16+24576*j11+24576*s13+768*k59, out374);
__m512 dat1253 = _mm512_maskz_loadu_ps(16383, datPtr5+96+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1253 = _mm512_max_ps(_mm512_setzero_ps(), dat1253);
__m512 dat1254 = _mm512_maskz_loadu_ps(8191, datPtr5+12612+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1254 = _mm512_max_ps(_mm512_setzero_ps(), dat1254);
__m512i pm103 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in346 = _mm512_permutexvar_ps(pm103, dat1253);
__m512i pm104 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in349 = _mm512_permutexvar_ps(pm104, dat1254);
__m512 dat1255 = _mm512_maskz_loadu_ps(16383, datPtr5+320+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1255 = _mm512_max_ps(_mm512_setzero_ps(), dat1255);
__m512 dat1256 = _mm512_maskz_loadu_ps(8191, datPtr5+12836+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1256 = _mm512_max_ps(_mm512_setzero_ps(), dat1256);
__m512 in347 = _mm512_permutexvar_ps(pm103, dat1255);
__m512 in350 = _mm512_permutexvar_ps(pm104, dat1256);
__m512 dat1257 = _mm512_maskz_loadu_ps(16383, datPtr5+544+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1257 = _mm512_max_ps(_mm512_setzero_ps(), dat1257);
__m512 dat1258 = _mm512_maskz_loadu_ps(8191, datPtr5+13060+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1258 = _mm512_max_ps(_mm512_setzero_ps(), dat1258);
__m512 in348 = _mm512_permutexvar_ps(pm103, dat1257);
__m512 in351 = _mm512_permutexvar_ps(pm104, dat1258);
__m512 tmp1715 = in347;
__m512 tmp1722 = in350;
__m512 tmp1716 = _mm512_sub_ps(_mm512_setzero_ps(), in348);
__m512 tmp1723 = _mm512_sub_ps(_mm512_setzero_ps(), in351);
__m512 tmp1717 = in348;
__m512 tmp1724 = in351;
in346 = in346;
in349 = in349;
tmp1715 = tmp1715;
tmp1722 = tmp1722;
tmp1717 = tmp1717;
tmp1724 = tmp1724;
in346 = _mm512_fmadd_ps(tmp1716, _mm512_set1_ps(5.25e+00f), in346);
in349 = _mm512_fmadd_ps(tmp1723, _mm512_set1_ps(5.25e+00f), in349);
tmp1716 = _mm512_mul_ps(in348, _mm512_set1_ps(2.5e-01f));
tmp1723 = _mm512_mul_ps(in351, _mm512_set1_ps(2.5e-01f));
in348 = _mm512_mul_ps(in348, _mm512_set1_ps(4e+00f));
in351 = _mm512_mul_ps(in351, _mm512_set1_ps(4e+00f));
__m512 tmp1718 = _mm512_sub_ps(tmp1717, tmp1715);
__m512 tmp1725 = _mm512_sub_ps(tmp1724, tmp1722);
tmp1717 = _mm512_add_ps(tmp1715, tmp1717);
tmp1724 = _mm512_add_ps(tmp1722, tmp1724);
tmp1715 = _mm512_mul_ps(in347, _mm512_set1_ps(2.5e-01f));
tmp1722 = _mm512_mul_ps(in350, _mm512_set1_ps(2.5e-01f));
tmp1716 = tmp1716;
tmp1723 = tmp1723;
__m512 tmp1719 = in348;
__m512 tmp1726 = in351;
tmp1715 = tmp1715;
tmp1722 = tmp1722;
__m512 tmp1720 = _mm512_fmadd_ps(tmp1715, _mm512_set1_ps(2e+00f), tmp1716);
__m512 tmp1727 = _mm512_fmadd_ps(tmp1722, _mm512_set1_ps(2e+00f), tmp1723);
tmp1716 = _mm512_fnmadd_ps(tmp1715, _mm512_set1_ps(2e+00f), tmp1716);
tmp1723 = _mm512_fnmadd_ps(tmp1722, _mm512_set1_ps(2e+00f), tmp1723);
tmp1715 = in347;
tmp1722 = in350;
in347 = _mm512_sub_ps(_mm512_setzero_ps(), in347);
in350 = _mm512_sub_ps(_mm512_setzero_ps(), in350);
tmp1715 = tmp1715;
tmp1722 = tmp1722;
__m512 tmp1721 = in347;
__m512 tmp1728 = in350;
in348 = _mm512_fmadd_ps(tmp1715, _mm512_set1_ps(2e+00f), tmp1719);
in351 = _mm512_fmadd_ps(tmp1722, _mm512_set1_ps(2e+00f), tmp1726);
tmp1719 = _mm512_fnmadd_ps(tmp1715, _mm512_set1_ps(2e+00f), tmp1719);
tmp1726 = _mm512_fnmadd_ps(tmp1722, _mm512_set1_ps(2e+00f), tmp1726);
__m512 tmp1737 = _mm512_unpacklo_ps(in346, tmp1717);
__m512 tmp1738 = _mm512_unpackhi_ps(in346, tmp1717);
__m512 tmp1739 = _mm512_unpacklo_ps(tmp1718, tmp1720);
__m512 tmp1740 = _mm512_unpackhi_ps(tmp1718, tmp1720);
__m512 tmp1741 = _mm512_unpacklo_ps(tmp1716, in348);
__m512 tmp1742 = _mm512_unpackhi_ps(tmp1716, in348);
__m512 tmp1743 = _mm512_unpacklo_ps(tmp1719, tmp1721);
__m512 tmp1744 = _mm512_unpackhi_ps(tmp1719, tmp1721);
__m512 tmp1745 = _mm512_unpacklo_ps(in349, tmp1724);
__m512 tmp1746 = _mm512_unpackhi_ps(in349, tmp1724);
__m512 tmp1747 = _mm512_unpacklo_ps(tmp1725, tmp1727);
__m512 tmp1748 = _mm512_unpackhi_ps(tmp1725, tmp1727);
__m512 tmp1749 = _mm512_unpacklo_ps(tmp1723, in351);
__m512 tmp1750 = _mm512_unpackhi_ps(tmp1723, in351);
__m512 tmp1751 = _mm512_unpacklo_ps(tmp1726, tmp1728);
__m512 tmp1752 = _mm512_unpackhi_ps(tmp1726, tmp1728);
__m512 tmp1753 = _mm512_shuffle_ps(tmp1737, tmp1739, 68);
__m512 tmp1754 = _mm512_shuffle_ps(tmp1737, tmp1739, 238);
__m512 tmp1755 = _mm512_shuffle_ps(tmp1738, tmp1740, 68);
__m512 tmp1756 = _mm512_shuffle_ps(tmp1738, tmp1740, 238);
__m512 tmp1757 = _mm512_shuffle_ps(tmp1741, tmp1743, 68);
__m512 tmp1758 = _mm512_shuffle_ps(tmp1741, tmp1743, 238);
__m512 tmp1759 = _mm512_shuffle_ps(tmp1742, tmp1744, 68);
__m512 tmp1760 = _mm512_shuffle_ps(tmp1742, tmp1744, 238);
__m512 tmp1761 = _mm512_shuffle_ps(tmp1745, tmp1747, 68);
__m512 tmp1762 = _mm512_shuffle_ps(tmp1745, tmp1747, 238);
__m512 tmp1763 = _mm512_shuffle_ps(tmp1746, tmp1748, 68);
__m512 tmp1764 = _mm512_shuffle_ps(tmp1746, tmp1748, 238);
__m512 tmp1765 = _mm512_shuffle_ps(tmp1749, tmp1751, 68);
__m512 tmp1766 = _mm512_shuffle_ps(tmp1749, tmp1751, 238);
__m512 tmp1767 = _mm512_shuffle_ps(tmp1750, tmp1752, 68);
__m512 tmp1768 = _mm512_shuffle_ps(tmp1750, tmp1752, 238);
__m512 tmp1769 = _mm512_shuffle_f32x4(tmp1753, tmp1757, 136);
__m512 tmp1770 = _mm512_shuffle_f32x4(tmp1753, tmp1757, 221);
__m512 tmp1771 = _mm512_shuffle_f32x4(tmp1754, tmp1758, 136);
__m512 tmp1772 = _mm512_shuffle_f32x4(tmp1754, tmp1758, 221);
__m512 tmp1773 = _mm512_shuffle_f32x4(tmp1755, tmp1759, 136);
__m512 tmp1774 = _mm512_shuffle_f32x4(tmp1755, tmp1759, 221);
__m512 tmp1775 = _mm512_shuffle_f32x4(tmp1756, tmp1760, 136);
__m512 tmp1776 = _mm512_shuffle_f32x4(tmp1756, tmp1760, 221);
__m512 tmp1777 = _mm512_shuffle_f32x4(tmp1761, tmp1765, 136);
__m512 tmp1778 = _mm512_shuffle_f32x4(tmp1761, tmp1765, 221);
__m512 tmp1779 = _mm512_shuffle_f32x4(tmp1762, tmp1766, 136);
__m512 tmp1780 = _mm512_shuffle_f32x4(tmp1762, tmp1766, 221);
__m512 tmp1781 = _mm512_shuffle_f32x4(tmp1763, tmp1767, 136);
__m512 tmp1782 = _mm512_shuffle_f32x4(tmp1763, tmp1767, 221);
__m512 tmp1783 = _mm512_shuffle_f32x4(tmp1764, tmp1768, 136);
__m512 tmp1784 = _mm512_shuffle_f32x4(tmp1764, tmp1768, 221);
in346 = _mm512_shuffle_f32x4(tmp1769, tmp1777, 136);
in349 = _mm512_shuffle_f32x4(tmp1769, tmp1777, 221);
tmp1717 = _mm512_shuffle_f32x4(tmp1771, tmp1779, 136);
tmp1724 = _mm512_shuffle_f32x4(tmp1771, tmp1779, 221);
tmp1718 = _mm512_shuffle_f32x4(tmp1773, tmp1781, 136);
tmp1725 = _mm512_shuffle_f32x4(tmp1773, tmp1781, 221);
tmp1720 = _mm512_shuffle_f32x4(tmp1775, tmp1783, 136);
tmp1727 = _mm512_shuffle_f32x4(tmp1775, tmp1783, 221);
tmp1716 = _mm512_shuffle_f32x4(tmp1770, tmp1778, 136);
tmp1723 = _mm512_shuffle_f32x4(tmp1770, tmp1778, 221);
in348 = _mm512_shuffle_f32x4(tmp1772, tmp1780, 136);
in351 = _mm512_shuffle_f32x4(tmp1772, tmp1780, 221);
tmp1719 = _mm512_shuffle_f32x4(tmp1774, tmp1782, 136);
tmp1726 = _mm512_shuffle_f32x4(tmp1774, tmp1782, 221);
tmp1721 = _mm512_shuffle_f32x4(tmp1776, tmp1784, 136);
tmp1728 = _mm512_shuffle_f32x4(tmp1776, tmp1784, 221);
__m512 tmp1729 = _mm512_add_ps(tmp1717, in348);
__m512 tmp1733 = _mm512_add_ps(tmp1724, in351);
__m512 tmp1730 = _mm512_sub_ps(tmp1716, tmp1718);
__m512 tmp1734 = _mm512_sub_ps(tmp1723, tmp1725);
__m512 tmp1731 = _mm512_add_ps(tmp1718, tmp1719);
__m512 tmp1735 = _mm512_add_ps(tmp1725, tmp1726);
in346 = _mm512_sub_ps(in346, tmp1719);
in349 = _mm512_sub_ps(in349, tmp1726);
tmp1729 = _mm512_fmadd_ps(tmp1720, _mm512_set1_ps(-4.25e+00f), tmp1729);
tmp1733 = _mm512_fmadd_ps(tmp1727, _mm512_set1_ps(-4.25e+00f), tmp1733);
tmp1731 = _mm512_fmadd_ps(tmp1716, _mm512_set1_ps(-4.25e+00f), tmp1731);
tmp1735 = _mm512_fmadd_ps(tmp1723, _mm512_set1_ps(-4.25e+00f), tmp1735);
in346 = _mm512_fmadd_ps(tmp1730, _mm512_set1_ps(5.25e+00f), in346);
in349 = _mm512_fmadd_ps(tmp1734, _mm512_set1_ps(5.25e+00f), in349);
tmp1730 = _mm512_fmadd_ps(tmp1718, _mm512_set1_ps(2.5e-01f), tmp1719);
tmp1734 = _mm512_fmadd_ps(tmp1725, _mm512_set1_ps(2.5e-01f), tmp1726);
tmp1718 = _mm512_fmadd_ps(tmp1718, _mm512_set1_ps(4e+00f), tmp1719);
tmp1725 = _mm512_fmadd_ps(tmp1725, _mm512_set1_ps(4e+00f), tmp1726);
__m512 tmp1732 = _mm512_sub_ps(tmp1731, tmp1729);
__m512 tmp1736 = _mm512_sub_ps(tmp1735, tmp1733);
tmp1731 = _mm512_add_ps(tmp1729, tmp1731);
tmp1735 = _mm512_add_ps(tmp1733, tmp1735);
tmp1729 = _mm512_fmadd_ps(tmp1717, _mm512_set1_ps(2.5e-01f), in348);
tmp1733 = _mm512_fmadd_ps(tmp1724, _mm512_set1_ps(2.5e-01f), in351);
tmp1730 = _mm512_fmadd_ps(tmp1716, _mm512_set1_ps(-1.25e+00f), tmp1730);
tmp1734 = _mm512_fmadd_ps(tmp1723, _mm512_set1_ps(-1.25e+00f), tmp1734);
tmp1716 = _mm512_fmadd_ps(tmp1716, _mm512_set1_ps(-5e+00f), tmp1718);
tmp1723 = _mm512_fmadd_ps(tmp1723, _mm512_set1_ps(-5e+00f), tmp1725);
tmp1729 = _mm512_fmadd_ps(tmp1720, _mm512_set1_ps(-1.25e+00f), tmp1729);
tmp1733 = _mm512_fmadd_ps(tmp1727, _mm512_set1_ps(-1.25e+00f), tmp1733);
tmp1719 = _mm512_fmadd_ps(tmp1729, _mm512_set1_ps(2e+00f), tmp1730);
tmp1726 = _mm512_fmadd_ps(tmp1733, _mm512_set1_ps(2e+00f), tmp1734);
tmp1730 = _mm512_fnmadd_ps(tmp1729, _mm512_set1_ps(2e+00f), tmp1730);
tmp1734 = _mm512_fnmadd_ps(tmp1733, _mm512_set1_ps(2e+00f), tmp1734);
tmp1729 = _mm512_fmadd_ps(in348, _mm512_set1_ps(2.5e-01f), tmp1717);
tmp1733 = _mm512_fmadd_ps(in351, _mm512_set1_ps(2.5e-01f), tmp1724);
tmp1717 = _mm512_sub_ps(tmp1721, tmp1717);
tmp1724 = _mm512_sub_ps(tmp1728, tmp1724);
tmp1729 = _mm512_fmadd_ps(tmp1720, _mm512_set1_ps(-1.25e+00f), tmp1729);
tmp1733 = _mm512_fmadd_ps(tmp1727, _mm512_set1_ps(-1.25e+00f), tmp1733);
tmp1720 = _mm512_sub_ps(tmp1720, in348);
tmp1727 = _mm512_sub_ps(tmp1727, in351);
tmp1720 = _mm512_fmadd_ps(tmp1720, _mm512_set1_ps(5.25e+00f), tmp1717);
tmp1727 = _mm512_fmadd_ps(tmp1727, _mm512_set1_ps(5.25e+00f), tmp1724);
tmp1718 = _mm512_fmadd_ps(tmp1729, _mm512_set1_ps(2e+00f), tmp1716);
tmp1725 = _mm512_fmadd_ps(tmp1733, _mm512_set1_ps(2e+00f), tmp1723);
tmp1716 = _mm512_fnmadd_ps(tmp1729, _mm512_set1_ps(2e+00f), tmp1716);
tmp1723 = _mm512_fnmadd_ps(tmp1733, _mm512_set1_ps(2e+00f), tmp1723);
__m512 out375 = _mm512_shuffle_f32x4(in346, tmp1731, 68);
__m512 out383 = _mm512_shuffle_f32x4(in346, tmp1731, 238);
__m512 out376 = _mm512_shuffle_f32x4(tmp1732, tmp1719, 68);
__m512 out384 = _mm512_shuffle_f32x4(tmp1732, tmp1719, 238);
__m512 out377 = _mm512_shuffle_f32x4(tmp1730, tmp1718, 68);
__m512 out385 = _mm512_shuffle_f32x4(tmp1730, tmp1718, 238);
__m512 out378 = _mm512_shuffle_f32x4(tmp1716, tmp1720, 68);
__m512 out386 = _mm512_shuffle_f32x4(tmp1716, tmp1720, 238);
__m512 out379 = _mm512_shuffle_f32x4(in349, tmp1735, 68);
__m512 out387 = _mm512_shuffle_f32x4(in349, tmp1735, 238);
__m512 out380 = _mm512_shuffle_f32x4(tmp1736, tmp1726, 68);
__m512 out388 = _mm512_shuffle_f32x4(tmp1736, tmp1726, 238);
__m512 out381 = _mm512_shuffle_f32x4(tmp1734, tmp1725, 68);
__m512 out389 = _mm512_shuffle_f32x4(tmp1734, tmp1725, 238);
__m512 out382 = _mm512_shuffle_f32x4(tmp1723, tmp1727, 68);
__m512 out390 = _mm512_shuffle_f32x4(tmp1723, tmp1727, 238);
_mm512_storeu_ps(dfPtr4+256+1638400*i16+24576*j11+24576*s13+768*k59, out375);
_mm512_storeu_ps(dfPtr4+384+1638400*i16+24576*j11+24576*s13+768*k59, out383);
_mm512_storeu_ps(dfPtr4+320+1638400*i16+24576*j11+24576*s13+768*k59, out379);
_mm512_storeu_ps(dfPtr4+448+1638400*i16+24576*j11+24576*s13+768*k59, out387);
_mm512_storeu_ps(dfPtr4+409856+1638400*i16+24576*j11+24576*s13+768*k59, out376);
_mm512_storeu_ps(dfPtr4+409984+1638400*i16+24576*j11+24576*s13+768*k59, out384);
_mm512_storeu_ps(dfPtr4+409920+1638400*i16+24576*j11+24576*s13+768*k59, out380);
_mm512_storeu_ps(dfPtr4+410048+1638400*i16+24576*j11+24576*s13+768*k59, out388);
_mm512_storeu_ps(dfPtr4+819456+1638400*i16+24576*j11+24576*s13+768*k59, out377);
_mm512_storeu_ps(dfPtr4+819584+1638400*i16+24576*j11+24576*s13+768*k59, out385);
_mm512_storeu_ps(dfPtr4+819520+1638400*i16+24576*j11+24576*s13+768*k59, out381);
_mm512_storeu_ps(dfPtr4+819648+1638400*i16+24576*j11+24576*s13+768*k59, out389);
_mm512_storeu_ps(dfPtr4+1229056+1638400*i16+24576*j11+24576*s13+768*k59, out378);
_mm512_storeu_ps(dfPtr4+1229184+1638400*i16+24576*j11+24576*s13+768*k59, out386);
_mm512_storeu_ps(dfPtr4+1229120+1638400*i16+24576*j11+24576*s13+768*k59, out382);
_mm512_storeu_ps(dfPtr4+1229248+1638400*i16+24576*j11+24576*s13+768*k59, out390);
__m512 dat1259 = _mm512_maskz_loadu_ps(16383, datPtr5+12656+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1259 = _mm512_max_ps(_mm512_setzero_ps(), dat1259);
__m512 dat1260 = _mm512_maskz_loadu_ps(16383, datPtr5+12704+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1260 = _mm512_max_ps(_mm512_setzero_ps(), dat1260);
__m512i pm105 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in352 = _mm512_permutexvar_ps(pm105, dat1259);
__m512 in355 = _mm512_permutexvar_ps(pm105, dat1260);
__m512 dat1261 = _mm512_maskz_loadu_ps(16383, datPtr5+12880+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1261 = _mm512_max_ps(_mm512_setzero_ps(), dat1261);
__m512 dat1262 = _mm512_maskz_loadu_ps(16383, datPtr5+12928+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1262 = _mm512_max_ps(_mm512_setzero_ps(), dat1262);
__m512 in353 = _mm512_permutexvar_ps(pm105, dat1261);
__m512 in356 = _mm512_permutexvar_ps(pm105, dat1262);
__m512 dat1263 = _mm512_maskz_loadu_ps(16383, datPtr5+13104+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1263 = _mm512_max_ps(_mm512_setzero_ps(), dat1263);
__m512 dat1264 = _mm512_maskz_loadu_ps(16383, datPtr5+13152+806912*i16+224*h27+4*w30+806912*s13+25216*k59);
dat1264 = _mm512_max_ps(_mm512_setzero_ps(), dat1264);
__m512 in354 = _mm512_permutexvar_ps(pm105, dat1263);
__m512 in357 = _mm512_permutexvar_ps(pm105, dat1264);
__m512 tmp1785 = in353;
__m512 tmp1792 = in356;
__m512 tmp1786 = _mm512_sub_ps(_mm512_setzero_ps(), in354);
__m512 tmp1793 = _mm512_sub_ps(_mm512_setzero_ps(), in357);
__m512 tmp1787 = in354;
__m512 tmp1794 = in357;
in352 = in352;
in355 = in355;
tmp1785 = tmp1785;
tmp1792 = tmp1792;
tmp1787 = tmp1787;
tmp1794 = tmp1794;
in352 = _mm512_fmadd_ps(tmp1786, _mm512_set1_ps(5.25e+00f), in352);
in355 = _mm512_fmadd_ps(tmp1793, _mm512_set1_ps(5.25e+00f), in355);
tmp1786 = _mm512_mul_ps(in354, _mm512_set1_ps(2.5e-01f));
tmp1793 = _mm512_mul_ps(in357, _mm512_set1_ps(2.5e-01f));
in354 = _mm512_mul_ps(in354, _mm512_set1_ps(4e+00f));
in357 = _mm512_mul_ps(in357, _mm512_set1_ps(4e+00f));
__m512 tmp1788 = _mm512_sub_ps(tmp1787, tmp1785);
__m512 tmp1795 = _mm512_sub_ps(tmp1794, tmp1792);
tmp1787 = _mm512_add_ps(tmp1785, tmp1787);
tmp1794 = _mm512_add_ps(tmp1792, tmp1794);
tmp1785 = _mm512_mul_ps(in353, _mm512_set1_ps(2.5e-01f));
tmp1792 = _mm512_mul_ps(in356, _mm512_set1_ps(2.5e-01f));
tmp1786 = tmp1786;
tmp1793 = tmp1793;
__m512 tmp1789 = in354;
__m512 tmp1796 = in357;
tmp1785 = tmp1785;
tmp1792 = tmp1792;
__m512 tmp1790 = _mm512_fmadd_ps(tmp1785, _mm512_set1_ps(2e+00f), tmp1786);
__m512 tmp1797 = _mm512_fmadd_ps(tmp1792, _mm512_set1_ps(2e+00f), tmp1793);
tmp1786 = _mm512_fnmadd_ps(tmp1785, _mm512_set1_ps(2e+00f), tmp1786);
tmp1793 = _mm512_fnmadd_ps(tmp1792, _mm512_set1_ps(2e+00f), tmp1793);
tmp1785 = in353;
tmp1792 = in356;
in353 = _mm512_sub_ps(_mm512_setzero_ps(), in353);
in356 = _mm512_sub_ps(_mm512_setzero_ps(), in356);
tmp1785 = tmp1785;
tmp1792 = tmp1792;
__m512 tmp1791 = in353;
__m512 tmp1798 = in356;
in354 = _mm512_fmadd_ps(tmp1785, _mm512_set1_ps(2e+00f), tmp1789);
in357 = _mm512_fmadd_ps(tmp1792, _mm512_set1_ps(2e+00f), tmp1796);
tmp1789 = _mm512_fnmadd_ps(tmp1785, _mm512_set1_ps(2e+00f), tmp1789);
tmp1796 = _mm512_fnmadd_ps(tmp1792, _mm512_set1_ps(2e+00f), tmp1796);
__m512 tmp1807 = _mm512_unpacklo_ps(in352, tmp1787);
__m512 tmp1808 = _mm512_unpackhi_ps(in352, tmp1787);
__m512 tmp1809 = _mm512_unpacklo_ps(tmp1788, tmp1790);
__m512 tmp1810 = _mm512_unpackhi_ps(tmp1788, tmp1790);
__m512 tmp1811 = _mm512_unpacklo_ps(tmp1786, in354);
__m512 tmp1812 = _mm512_unpackhi_ps(tmp1786, in354);
__m512 tmp1813 = _mm512_unpacklo_ps(tmp1789, tmp1791);
__m512 tmp1814 = _mm512_unpackhi_ps(tmp1789, tmp1791);
__m512 tmp1815 = _mm512_unpacklo_ps(in355, tmp1794);
__m512 tmp1816 = _mm512_unpackhi_ps(in355, tmp1794);
__m512 tmp1817 = _mm512_unpacklo_ps(tmp1795, tmp1797);
__m512 tmp1818 = _mm512_unpackhi_ps(tmp1795, tmp1797);
__m512 tmp1819 = _mm512_unpacklo_ps(tmp1793, in357);
__m512 tmp1820 = _mm512_unpackhi_ps(tmp1793, in357);
__m512 tmp1821 = _mm512_unpacklo_ps(tmp1796, tmp1798);
__m512 tmp1822 = _mm512_unpackhi_ps(tmp1796, tmp1798);
__m512 tmp1823 = _mm512_shuffle_ps(tmp1807, tmp1809, 68);
__m512 tmp1824 = _mm512_shuffle_ps(tmp1807, tmp1809, 238);
__m512 tmp1825 = _mm512_shuffle_ps(tmp1808, tmp1810, 68);
__m512 tmp1826 = _mm512_shuffle_ps(tmp1808, tmp1810, 238);
__m512 tmp1827 = _mm512_shuffle_ps(tmp1811, tmp1813, 68);
__m512 tmp1828 = _mm512_shuffle_ps(tmp1811, tmp1813, 238);
__m512 tmp1829 = _mm512_shuffle_ps(tmp1812, tmp1814, 68);
__m512 tmp1830 = _mm512_shuffle_ps(tmp1812, tmp1814, 238);
__m512 tmp1831 = _mm512_shuffle_ps(tmp1815, tmp1817, 68);
__m512 tmp1832 = _mm512_shuffle_ps(tmp1815, tmp1817, 238);
__m512 tmp1833 = _mm512_shuffle_ps(tmp1816, tmp1818, 68);
__m512 tmp1834 = _mm512_shuffle_ps(tmp1816, tmp1818, 238);
__m512 tmp1835 = _mm512_shuffle_ps(tmp1819, tmp1821, 68);
__m512 tmp1836 = _mm512_shuffle_ps(tmp1819, tmp1821, 238);
__m512 tmp1837 = _mm512_shuffle_ps(tmp1820, tmp1822, 68);
__m512 tmp1838 = _mm512_shuffle_ps(tmp1820, tmp1822, 238);
__m512 tmp1839 = _mm512_shuffle_f32x4(tmp1823, tmp1827, 136);
__m512 tmp1840 = _mm512_shuffle_f32x4(tmp1823, tmp1827, 221);
__m512 tmp1841 = _mm512_shuffle_f32x4(tmp1824, tmp1828, 136);
__m512 tmp1842 = _mm512_shuffle_f32x4(tmp1824, tmp1828, 221);
__m512 tmp1843 = _mm512_shuffle_f32x4(tmp1825, tmp1829, 136);
__m512 tmp1844 = _mm512_shuffle_f32x4(tmp1825, tmp1829, 221);
__m512 tmp1845 = _mm512_shuffle_f32x4(tmp1826, tmp1830, 136);
__m512 tmp1846 = _mm512_shuffle_f32x4(tmp1826, tmp1830, 221);
__m512 tmp1847 = _mm512_shuffle_f32x4(tmp1831, tmp1835, 136);
__m512 tmp1848 = _mm512_shuffle_f32x4(tmp1831, tmp1835, 221);
__m512 tmp1849 = _mm512_shuffle_f32x4(tmp1832, tmp1836, 136);
__m512 tmp1850 = _mm512_shuffle_f32x4(tmp1832, tmp1836, 221);
__m512 tmp1851 = _mm512_shuffle_f32x4(tmp1833, tmp1837, 136);
__m512 tmp1852 = _mm512_shuffle_f32x4(tmp1833, tmp1837, 221);
__m512 tmp1853 = _mm512_shuffle_f32x4(tmp1834, tmp1838, 136);
__m512 tmp1854 = _mm512_shuffle_f32x4(tmp1834, tmp1838, 221);
in352 = _mm512_shuffle_f32x4(tmp1839, tmp1847, 136);
in355 = _mm512_shuffle_f32x4(tmp1839, tmp1847, 221);
tmp1787 = _mm512_shuffle_f32x4(tmp1841, tmp1849, 136);
tmp1794 = _mm512_shuffle_f32x4(tmp1841, tmp1849, 221);
tmp1788 = _mm512_shuffle_f32x4(tmp1843, tmp1851, 136);
tmp1795 = _mm512_shuffle_f32x4(tmp1843, tmp1851, 221);
tmp1790 = _mm512_shuffle_f32x4(tmp1845, tmp1853, 136);
tmp1797 = _mm512_shuffle_f32x4(tmp1845, tmp1853, 221);
tmp1786 = _mm512_shuffle_f32x4(tmp1840, tmp1848, 136);
tmp1793 = _mm512_shuffle_f32x4(tmp1840, tmp1848, 221);
in354 = _mm512_shuffle_f32x4(tmp1842, tmp1850, 136);
in357 = _mm512_shuffle_f32x4(tmp1842, tmp1850, 221);
tmp1789 = _mm512_shuffle_f32x4(tmp1844, tmp1852, 136);
tmp1796 = _mm512_shuffle_f32x4(tmp1844, tmp1852, 221);
tmp1791 = _mm512_shuffle_f32x4(tmp1846, tmp1854, 136);
tmp1798 = _mm512_shuffle_f32x4(tmp1846, tmp1854, 221);
__m512 tmp1799 = _mm512_add_ps(tmp1787, in354);
__m512 tmp1803 = _mm512_add_ps(tmp1794, in357);
__m512 tmp1800 = _mm512_sub_ps(tmp1786, tmp1788);
__m512 tmp1804 = _mm512_sub_ps(tmp1793, tmp1795);
__m512 tmp1801 = _mm512_add_ps(tmp1788, tmp1789);
__m512 tmp1805 = _mm512_add_ps(tmp1795, tmp1796);
in352 = _mm512_sub_ps(in352, tmp1789);
in355 = _mm512_sub_ps(in355, tmp1796);
tmp1799 = _mm512_fmadd_ps(tmp1790, _mm512_set1_ps(-4.25e+00f), tmp1799);
tmp1803 = _mm512_fmadd_ps(tmp1797, _mm512_set1_ps(-4.25e+00f), tmp1803);
tmp1801 = _mm512_fmadd_ps(tmp1786, _mm512_set1_ps(-4.25e+00f), tmp1801);
tmp1805 = _mm512_fmadd_ps(tmp1793, _mm512_set1_ps(-4.25e+00f), tmp1805);
in352 = _mm512_fmadd_ps(tmp1800, _mm512_set1_ps(5.25e+00f), in352);
in355 = _mm512_fmadd_ps(tmp1804, _mm512_set1_ps(5.25e+00f), in355);
tmp1800 = _mm512_fmadd_ps(tmp1788, _mm512_set1_ps(2.5e-01f), tmp1789);
tmp1804 = _mm512_fmadd_ps(tmp1795, _mm512_set1_ps(2.5e-01f), tmp1796);
tmp1788 = _mm512_fmadd_ps(tmp1788, _mm512_set1_ps(4e+00f), tmp1789);
tmp1795 = _mm512_fmadd_ps(tmp1795, _mm512_set1_ps(4e+00f), tmp1796);
__m512 tmp1802 = _mm512_sub_ps(tmp1801, tmp1799);
__m512 tmp1806 = _mm512_sub_ps(tmp1805, tmp1803);
tmp1801 = _mm512_add_ps(tmp1799, tmp1801);
tmp1805 = _mm512_add_ps(tmp1803, tmp1805);
tmp1799 = _mm512_fmadd_ps(tmp1787, _mm512_set1_ps(2.5e-01f), in354);
tmp1803 = _mm512_fmadd_ps(tmp1794, _mm512_set1_ps(2.5e-01f), in357);
tmp1800 = _mm512_fmadd_ps(tmp1786, _mm512_set1_ps(-1.25e+00f), tmp1800);
tmp1804 = _mm512_fmadd_ps(tmp1793, _mm512_set1_ps(-1.25e+00f), tmp1804);
tmp1786 = _mm512_fmadd_ps(tmp1786, _mm512_set1_ps(-5e+00f), tmp1788);
tmp1793 = _mm512_fmadd_ps(tmp1793, _mm512_set1_ps(-5e+00f), tmp1795);
tmp1799 = _mm512_fmadd_ps(tmp1790, _mm512_set1_ps(-1.25e+00f), tmp1799);
tmp1803 = _mm512_fmadd_ps(tmp1797, _mm512_set1_ps(-1.25e+00f), tmp1803);
tmp1789 = _mm512_fmadd_ps(tmp1799, _mm512_set1_ps(2e+00f), tmp1800);
tmp1796 = _mm512_fmadd_ps(tmp1803, _mm512_set1_ps(2e+00f), tmp1804);
tmp1800 = _mm512_fnmadd_ps(tmp1799, _mm512_set1_ps(2e+00f), tmp1800);
tmp1804 = _mm512_fnmadd_ps(tmp1803, _mm512_set1_ps(2e+00f), tmp1804);
tmp1799 = _mm512_fmadd_ps(in354, _mm512_set1_ps(2.5e-01f), tmp1787);
tmp1803 = _mm512_fmadd_ps(in357, _mm512_set1_ps(2.5e-01f), tmp1794);
tmp1787 = _mm512_sub_ps(tmp1791, tmp1787);
tmp1794 = _mm512_sub_ps(tmp1798, tmp1794);
tmp1799 = _mm512_fmadd_ps(tmp1790, _mm512_set1_ps(-1.25e+00f), tmp1799);
tmp1803 = _mm512_fmadd_ps(tmp1797, _mm512_set1_ps(-1.25e+00f), tmp1803);
tmp1790 = _mm512_sub_ps(tmp1790, in354);
tmp1797 = _mm512_sub_ps(tmp1797, in357);
tmp1790 = _mm512_fmadd_ps(tmp1790, _mm512_set1_ps(5.25e+00f), tmp1787);
tmp1797 = _mm512_fmadd_ps(tmp1797, _mm512_set1_ps(5.25e+00f), tmp1794);
tmp1788 = _mm512_fmadd_ps(tmp1799, _mm512_set1_ps(2e+00f), tmp1786);
tmp1795 = _mm512_fmadd_ps(tmp1803, _mm512_set1_ps(2e+00f), tmp1793);
tmp1786 = _mm512_fnmadd_ps(tmp1799, _mm512_set1_ps(2e+00f), tmp1786);
tmp1793 = _mm512_fnmadd_ps(tmp1803, _mm512_set1_ps(2e+00f), tmp1793);
__m512 out391 = _mm512_shuffle_f32x4(in352, tmp1801, 68);
__m512 out399 = _mm512_shuffle_f32x4(in352, tmp1801, 238);
__m512 out392 = _mm512_shuffle_f32x4(tmp1802, tmp1789, 68);
__m512 out400 = _mm512_shuffle_f32x4(tmp1802, tmp1789, 238);
__m512 out393 = _mm512_shuffle_f32x4(tmp1800, tmp1788, 68);
__m512 out401 = _mm512_shuffle_f32x4(tmp1800, tmp1788, 238);
__m512 out394 = _mm512_shuffle_f32x4(tmp1786, tmp1790, 68);
__m512 out402 = _mm512_shuffle_f32x4(tmp1786, tmp1790, 238);
__m512 out395 = _mm512_shuffle_f32x4(in355, tmp1805, 68);
__m512 out403 = _mm512_shuffle_f32x4(in355, tmp1805, 238);
__m512 out396 = _mm512_shuffle_f32x4(tmp1806, tmp1796, 68);
__m512 out404 = _mm512_shuffle_f32x4(tmp1806, tmp1796, 238);
__m512 out397 = _mm512_shuffle_f32x4(tmp1804, tmp1795, 68);
__m512 out405 = _mm512_shuffle_f32x4(tmp1804, tmp1795, 238);
__m512 out398 = _mm512_shuffle_f32x4(tmp1793, tmp1797, 68);
__m512 out406 = _mm512_shuffle_f32x4(tmp1793, tmp1797, 238);
_mm512_storeu_ps(dfPtr4+512+1638400*i16+24576*j11+24576*s13+768*k59, out391);
_mm512_storeu_ps(dfPtr4+640+1638400*i16+24576*j11+24576*s13+768*k59, out399);
_mm512_storeu_ps(dfPtr4+576+1638400*i16+24576*j11+24576*s13+768*k59, out395);
_mm512_storeu_ps(dfPtr4+704+1638400*i16+24576*j11+24576*s13+768*k59, out403);
_mm512_storeu_ps(dfPtr4+410112+1638400*i16+24576*j11+24576*s13+768*k59, out392);
_mm512_storeu_ps(dfPtr4+410240+1638400*i16+24576*j11+24576*s13+768*k59, out400);
_mm512_storeu_ps(dfPtr4+410176+1638400*i16+24576*j11+24576*s13+768*k59, out396);
_mm512_storeu_ps(dfPtr4+410304+1638400*i16+24576*j11+24576*s13+768*k59, out404);
_mm512_storeu_ps(dfPtr4+819712+1638400*i16+24576*j11+24576*s13+768*k59, out393);
_mm512_storeu_ps(dfPtr4+819840+1638400*i16+24576*j11+24576*s13+768*k59, out401);
_mm512_storeu_ps(dfPtr4+819776+1638400*i16+24576*j11+24576*s13+768*k59, out397);
_mm512_storeu_ps(dfPtr4+819904+1638400*i16+24576*j11+24576*s13+768*k59, out405);
_mm512_storeu_ps(dfPtr4+1229312+1638400*i16+24576*j11+24576*s13+768*k59, out394);
_mm512_storeu_ps(dfPtr4+1229440+1638400*i16+24576*j11+24576*s13+768*k59, out402);
_mm512_storeu_ps(dfPtr4+1229376+1638400*i16+24576*j11+24576*s13+768*k59, out398);
_mm512_storeu_ps(dfPtr4+1229504+1638400*i16+24576*j11+24576*s13+768*k59, out406);
}
if (j11 >= last3) return;
++j11;
rel9 = 1;
}
ptrdiff_t h28 = base9+0;
ptrdiff_t w31 = 36;
ptrdiff_t k60 = 0;
for (; k60 != 64; ++k60) {
__m512 dat1265 = _mm512_maskz_loadu_ps(16383, datPtr5+0+806912*i16+224*h28+4*w31+806912*s13+12608*k60);
dat1265 = _mm512_max_ps(_mm512_setzero_ps(), dat1265);
__m512 dat1266 = _mm512_maskz_loadu_ps(511, datPtr5+48+806912*i16+224*h28+4*w31+806912*s13+12608*k60);
dat1266 = _mm512_max_ps(_mm512_setzero_ps(), dat1266);
__m512i pm106 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in358 = _mm512_permutexvar_ps(pm106, dat1265);
__m512i pm107 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in361 = _mm512_permutexvar_ps(pm107, dat1266);
__m512 dat1267 = _mm512_maskz_loadu_ps(16383, datPtr5+224+806912*i16+224*h28+4*w31+806912*s13+12608*k60);
dat1267 = _mm512_max_ps(_mm512_setzero_ps(), dat1267);
__m512 dat1268 = _mm512_maskz_loadu_ps(511, datPtr5+272+806912*i16+224*h28+4*w31+806912*s13+12608*k60);
dat1268 = _mm512_max_ps(_mm512_setzero_ps(), dat1268);
__m512 in359 = _mm512_permutexvar_ps(pm106, dat1267);
__m512 in362 = _mm512_permutexvar_ps(pm107, dat1268);
__m512 dat1269 = _mm512_maskz_loadu_ps(16383, datPtr5+448+806912*i16+224*h28+4*w31+806912*s13+12608*k60);
dat1269 = _mm512_max_ps(_mm512_setzero_ps(), dat1269);
__m512 dat1270 = _mm512_maskz_loadu_ps(511, datPtr5+496+806912*i16+224*h28+4*w31+806912*s13+12608*k60);
dat1270 = _mm512_max_ps(_mm512_setzero_ps(), dat1270);
__m512 in360 = _mm512_permutexvar_ps(pm106, dat1269);
__m512 in363 = _mm512_permutexvar_ps(pm107, dat1270);
__m512 tmp1855 = in359;
__m512 tmp1862 = in362;
__m512 tmp1856 = _mm512_sub_ps(_mm512_setzero_ps(), in360);
__m512 tmp1863 = _mm512_sub_ps(_mm512_setzero_ps(), in363);
__m512 tmp1857 = in360;
__m512 tmp1864 = in363;
in358 = in358;
in361 = in361;
tmp1855 = tmp1855;
tmp1862 = tmp1862;
tmp1857 = tmp1857;
tmp1864 = tmp1864;
in358 = _mm512_fmadd_ps(tmp1856, _mm512_set1_ps(5.25e+00f), in358);
in361 = _mm512_fmadd_ps(tmp1863, _mm512_set1_ps(5.25e+00f), in361);
tmp1856 = _mm512_mul_ps(in360, _mm512_set1_ps(2.5e-01f));
tmp1863 = _mm512_mul_ps(in363, _mm512_set1_ps(2.5e-01f));
in360 = _mm512_mul_ps(in360, _mm512_set1_ps(4e+00f));
in363 = _mm512_mul_ps(in363, _mm512_set1_ps(4e+00f));
__m512 tmp1858 = _mm512_sub_ps(tmp1857, tmp1855);
__m512 tmp1865 = _mm512_sub_ps(tmp1864, tmp1862);
tmp1857 = _mm512_add_ps(tmp1855, tmp1857);
tmp1864 = _mm512_add_ps(tmp1862, tmp1864);
tmp1855 = _mm512_mul_ps(in359, _mm512_set1_ps(2.5e-01f));
tmp1862 = _mm512_mul_ps(in362, _mm512_set1_ps(2.5e-01f));
tmp1856 = tmp1856;
tmp1863 = tmp1863;
__m512 tmp1859 = in360;
__m512 tmp1866 = in363;
tmp1855 = tmp1855;
tmp1862 = tmp1862;
__m512 tmp1860 = _mm512_fmadd_ps(tmp1855, _mm512_set1_ps(2e+00f), tmp1856);
__m512 tmp1867 = _mm512_fmadd_ps(tmp1862, _mm512_set1_ps(2e+00f), tmp1863);
tmp1856 = _mm512_fnmadd_ps(tmp1855, _mm512_set1_ps(2e+00f), tmp1856);
tmp1863 = _mm512_fnmadd_ps(tmp1862, _mm512_set1_ps(2e+00f), tmp1863);
tmp1855 = in359;
tmp1862 = in362;
in359 = _mm512_sub_ps(_mm512_setzero_ps(), in359);
in362 = _mm512_sub_ps(_mm512_setzero_ps(), in362);
tmp1855 = tmp1855;
tmp1862 = tmp1862;
__m512 tmp1861 = in359;
__m512 tmp1868 = in362;
in360 = _mm512_fmadd_ps(tmp1855, _mm512_set1_ps(2e+00f), tmp1859);
in363 = _mm512_fmadd_ps(tmp1862, _mm512_set1_ps(2e+00f), tmp1866);
tmp1859 = _mm512_fnmadd_ps(tmp1855, _mm512_set1_ps(2e+00f), tmp1859);
tmp1866 = _mm512_fnmadd_ps(tmp1862, _mm512_set1_ps(2e+00f), tmp1866);
__m512 tmp1877 = _mm512_unpacklo_ps(in358, tmp1857);
__m512 tmp1878 = _mm512_unpackhi_ps(in358, tmp1857);
__m512 tmp1879 = _mm512_unpacklo_ps(tmp1858, tmp1860);
__m512 tmp1880 = _mm512_unpackhi_ps(tmp1858, tmp1860);
__m512 tmp1881 = _mm512_unpacklo_ps(tmp1856, in360);
__m512 tmp1882 = _mm512_unpackhi_ps(tmp1856, in360);
__m512 tmp1883 = _mm512_unpacklo_ps(tmp1859, tmp1861);
__m512 tmp1884 = _mm512_unpackhi_ps(tmp1859, tmp1861);
__m512 tmp1885 = _mm512_unpacklo_ps(in361, tmp1864);
__m512 tmp1886 = _mm512_unpackhi_ps(in361, tmp1864);
__m512 tmp1887 = _mm512_unpacklo_ps(tmp1865, tmp1867);
__m512 tmp1888 = _mm512_unpackhi_ps(tmp1865, tmp1867);
__m512 tmp1889 = _mm512_unpacklo_ps(tmp1863, in363);
__m512 tmp1890 = _mm512_unpackhi_ps(tmp1863, in363);
__m512 tmp1891 = _mm512_unpacklo_ps(tmp1866, tmp1868);
__m512 tmp1892 = _mm512_unpackhi_ps(tmp1866, tmp1868);
__m512 tmp1893 = _mm512_shuffle_ps(tmp1877, tmp1879, 68);
__m512 tmp1894 = _mm512_shuffle_ps(tmp1877, tmp1879, 238);
__m512 tmp1895 = _mm512_shuffle_ps(tmp1878, tmp1880, 68);
__m512 tmp1896 = _mm512_shuffle_ps(tmp1878, tmp1880, 238);
__m512 tmp1897 = _mm512_shuffle_ps(tmp1881, tmp1883, 68);
__m512 tmp1898 = _mm512_shuffle_ps(tmp1881, tmp1883, 238);
__m512 tmp1899 = _mm512_shuffle_ps(tmp1882, tmp1884, 68);
__m512 tmp1900 = _mm512_shuffle_ps(tmp1882, tmp1884, 238);
__m512 tmp1901 = _mm512_shuffle_ps(tmp1885, tmp1887, 68);
__m512 tmp1902 = _mm512_shuffle_ps(tmp1885, tmp1887, 238);
__m512 tmp1903 = _mm512_shuffle_ps(tmp1886, tmp1888, 68);
__m512 tmp1904 = _mm512_shuffle_ps(tmp1886, tmp1888, 238);
__m512 tmp1905 = _mm512_shuffle_ps(tmp1889, tmp1891, 68);
__m512 tmp1906 = _mm512_shuffle_ps(tmp1889, tmp1891, 238);
__m512 tmp1907 = _mm512_shuffle_ps(tmp1890, tmp1892, 68);
__m512 tmp1908 = _mm512_shuffle_ps(tmp1890, tmp1892, 238);
__m512 tmp1909 = _mm512_shuffle_f32x4(tmp1893, tmp1897, 136);
__m512 tmp1910 = _mm512_shuffle_f32x4(tmp1893, tmp1897, 221);
__m512 tmp1911 = _mm512_shuffle_f32x4(tmp1894, tmp1898, 136);
__m512 tmp1912 = _mm512_shuffle_f32x4(tmp1894, tmp1898, 221);
__m512 tmp1913 = _mm512_shuffle_f32x4(tmp1895, tmp1899, 136);
__m512 tmp1914 = _mm512_shuffle_f32x4(tmp1895, tmp1899, 221);
__m512 tmp1915 = _mm512_shuffle_f32x4(tmp1896, tmp1900, 136);
__m512 tmp1916 = _mm512_shuffle_f32x4(tmp1896, tmp1900, 221);
__m512 tmp1917 = _mm512_shuffle_f32x4(tmp1901, tmp1905, 136);
__m512 tmp1918 = _mm512_shuffle_f32x4(tmp1901, tmp1905, 221);
__m512 tmp1919 = _mm512_shuffle_f32x4(tmp1902, tmp1906, 136);
__m512 tmp1920 = _mm512_shuffle_f32x4(tmp1902, tmp1906, 221);
__m512 tmp1921 = _mm512_shuffle_f32x4(tmp1903, tmp1907, 136);
__m512 tmp1922 = _mm512_shuffle_f32x4(tmp1903, tmp1907, 221);
__m512 tmp1923 = _mm512_shuffle_f32x4(tmp1904, tmp1908, 136);
__m512 tmp1924 = _mm512_shuffle_f32x4(tmp1904, tmp1908, 221);
in358 = _mm512_shuffle_f32x4(tmp1909, tmp1917, 136);
in361 = _mm512_shuffle_f32x4(tmp1909, tmp1917, 221);
tmp1857 = _mm512_shuffle_f32x4(tmp1911, tmp1919, 136);
tmp1864 = _mm512_shuffle_f32x4(tmp1911, tmp1919, 221);
tmp1858 = _mm512_shuffle_f32x4(tmp1913, tmp1921, 136);
tmp1865 = _mm512_shuffle_f32x4(tmp1913, tmp1921, 221);
tmp1860 = _mm512_shuffle_f32x4(tmp1915, tmp1923, 136);
tmp1867 = _mm512_shuffle_f32x4(tmp1915, tmp1923, 221);
tmp1856 = _mm512_shuffle_f32x4(tmp1910, tmp1918, 136);
tmp1863 = _mm512_shuffle_f32x4(tmp1910, tmp1918, 221);
in360 = _mm512_shuffle_f32x4(tmp1912, tmp1920, 136);
in363 = _mm512_shuffle_f32x4(tmp1912, tmp1920, 221);
tmp1859 = _mm512_shuffle_f32x4(tmp1914, tmp1922, 136);
tmp1866 = _mm512_shuffle_f32x4(tmp1914, tmp1922, 221);
tmp1861 = _mm512_shuffle_f32x4(tmp1916, tmp1924, 136);
tmp1868 = _mm512_shuffle_f32x4(tmp1916, tmp1924, 221);
__m512 tmp1869 = _mm512_add_ps(tmp1857, in360);
__m512 tmp1873 = _mm512_add_ps(tmp1864, in363);
__m512 tmp1870 = _mm512_sub_ps(tmp1856, tmp1858);
__m512 tmp1874 = _mm512_sub_ps(tmp1863, tmp1865);
__m512 tmp1871 = _mm512_add_ps(tmp1858, tmp1859);
__m512 tmp1875 = _mm512_add_ps(tmp1865, tmp1866);
in358 = _mm512_sub_ps(in358, tmp1859);
in361 = _mm512_sub_ps(in361, tmp1866);
tmp1869 = _mm512_fmadd_ps(tmp1860, _mm512_set1_ps(-4.25e+00f), tmp1869);
tmp1873 = _mm512_fmadd_ps(tmp1867, _mm512_set1_ps(-4.25e+00f), tmp1873);
tmp1871 = _mm512_fmadd_ps(tmp1856, _mm512_set1_ps(-4.25e+00f), tmp1871);
tmp1875 = _mm512_fmadd_ps(tmp1863, _mm512_set1_ps(-4.25e+00f), tmp1875);
in358 = _mm512_fmadd_ps(tmp1870, _mm512_set1_ps(5.25e+00f), in358);
in361 = _mm512_fmadd_ps(tmp1874, _mm512_set1_ps(5.25e+00f), in361);
tmp1870 = _mm512_fmadd_ps(tmp1858, _mm512_set1_ps(2.5e-01f), tmp1859);
tmp1874 = _mm512_fmadd_ps(tmp1865, _mm512_set1_ps(2.5e-01f), tmp1866);
tmp1858 = _mm512_fmadd_ps(tmp1858, _mm512_set1_ps(4e+00f), tmp1859);
tmp1865 = _mm512_fmadd_ps(tmp1865, _mm512_set1_ps(4e+00f), tmp1866);
__m512 tmp1872 = _mm512_sub_ps(tmp1871, tmp1869);
__m512 tmp1876 = _mm512_sub_ps(tmp1875, tmp1873);
tmp1871 = _mm512_add_ps(tmp1869, tmp1871);
tmp1875 = _mm512_add_ps(tmp1873, tmp1875);
tmp1869 = _mm512_fmadd_ps(tmp1857, _mm512_set1_ps(2.5e-01f), in360);
tmp1873 = _mm512_fmadd_ps(tmp1864, _mm512_set1_ps(2.5e-01f), in363);
tmp1870 = _mm512_fmadd_ps(tmp1856, _mm512_set1_ps(-1.25e+00f), tmp1870);
tmp1874 = _mm512_fmadd_ps(tmp1863, _mm512_set1_ps(-1.25e+00f), tmp1874);
tmp1856 = _mm512_fmadd_ps(tmp1856, _mm512_set1_ps(-5e+00f), tmp1858);
tmp1863 = _mm512_fmadd_ps(tmp1863, _mm512_set1_ps(-5e+00f), tmp1865);
tmp1869 = _mm512_fmadd_ps(tmp1860, _mm512_set1_ps(-1.25e+00f), tmp1869);
tmp1873 = _mm512_fmadd_ps(tmp1867, _mm512_set1_ps(-1.25e+00f), tmp1873);
tmp1859 = _mm512_fmadd_ps(tmp1869, _mm512_set1_ps(2e+00f), tmp1870);
tmp1866 = _mm512_fmadd_ps(tmp1873, _mm512_set1_ps(2e+00f), tmp1874);
tmp1870 = _mm512_fnmadd_ps(tmp1869, _mm512_set1_ps(2e+00f), tmp1870);
tmp1874 = _mm512_fnmadd_ps(tmp1873, _mm512_set1_ps(2e+00f), tmp1874);
tmp1869 = _mm512_fmadd_ps(in360, _mm512_set1_ps(2.5e-01f), tmp1857);
tmp1873 = _mm512_fmadd_ps(in363, _mm512_set1_ps(2.5e-01f), tmp1864);
tmp1857 = _mm512_sub_ps(tmp1861, tmp1857);
tmp1864 = _mm512_sub_ps(tmp1868, tmp1864);
tmp1869 = _mm512_fmadd_ps(tmp1860, _mm512_set1_ps(-1.25e+00f), tmp1869);
tmp1873 = _mm512_fmadd_ps(tmp1867, _mm512_set1_ps(-1.25e+00f), tmp1873);
tmp1860 = _mm512_sub_ps(tmp1860, in360);
tmp1867 = _mm512_sub_ps(tmp1867, in363);
tmp1860 = _mm512_fmadd_ps(tmp1860, _mm512_set1_ps(5.25e+00f), tmp1857);
tmp1867 = _mm512_fmadd_ps(tmp1867, _mm512_set1_ps(5.25e+00f), tmp1864);
tmp1858 = _mm512_fmadd_ps(tmp1869, _mm512_set1_ps(2e+00f), tmp1856);
tmp1865 = _mm512_fmadd_ps(tmp1873, _mm512_set1_ps(2e+00f), tmp1863);
tmp1856 = _mm512_fnmadd_ps(tmp1869, _mm512_set1_ps(2e+00f), tmp1856);
tmp1863 = _mm512_fnmadd_ps(tmp1873, _mm512_set1_ps(2e+00f), tmp1863);
__m512 out407 = _mm512_shuffle_f32x4(in358, tmp1871, 68);
__m512 out415 = _mm512_shuffle_f32x4(in358, tmp1871, 238);
__m512 out408 = _mm512_shuffle_f32x4(tmp1872, tmp1859, 68);
__m512 out416 = _mm512_shuffle_f32x4(tmp1872, tmp1859, 238);
__m512 out409 = _mm512_shuffle_f32x4(tmp1870, tmp1858, 68);
__m512 out417 = _mm512_shuffle_f32x4(tmp1870, tmp1858, 238);
__m512 out410 = _mm512_shuffle_f32x4(tmp1856, tmp1860, 68);
__m512 out418 = _mm512_shuffle_f32x4(tmp1856, tmp1860, 238);
__m512 out411 = _mm512_shuffle_f32x4(in361, tmp1875, 68);
__m512 out419 = _mm512_shuffle_f32x4(in361, tmp1875, 238);
__m512 out412 = _mm512_shuffle_f32x4(tmp1876, tmp1866, 68);
__m512 out420 = _mm512_shuffle_f32x4(tmp1876, tmp1866, 238);
__m512 out413 = _mm512_shuffle_f32x4(tmp1874, tmp1865, 68);
__m512 out421 = _mm512_shuffle_f32x4(tmp1874, tmp1865, 238);
__m512 out414 = _mm512_shuffle_f32x4(tmp1863, tmp1867, 68);
__m512 out422 = _mm512_shuffle_f32x4(tmp1863, tmp1867, 238);
_mm512_storeu_ps(dfPtr4+0+1638400*i16+24576*j11+16384*s13+256*k60, out407);
_mm512_storeu_ps(dfPtr4+128+1638400*i16+24576*j11+16384*s13+256*k60, out415);
_mm512_storeu_ps(dfPtr4+64+1638400*i16+24576*j11+16384*s13+256*k60, out411);
_mm512_storeu_ps(dfPtr4+192+1638400*i16+24576*j11+16384*s13+256*k60, out419);
_mm512_storeu_ps(dfPtr4+409600+1638400*i16+24576*j11+16384*s13+256*k60, out408);
_mm512_storeu_ps(dfPtr4+409728+1638400*i16+24576*j11+16384*s13+256*k60, out416);
_mm512_storeu_ps(dfPtr4+409664+1638400*i16+24576*j11+16384*s13+256*k60, out412);
_mm512_storeu_ps(dfPtr4+409792+1638400*i16+24576*j11+16384*s13+256*k60, out420);
_mm512_storeu_ps(dfPtr4+819200+1638400*i16+24576*j11+16384*s13+256*k60, out409);
_mm512_storeu_ps(dfPtr4+819328+1638400*i16+24576*j11+16384*s13+256*k60, out417);
_mm512_storeu_ps(dfPtr4+819264+1638400*i16+24576*j11+16384*s13+256*k60, out413);
_mm512_storeu_ps(dfPtr4+819392+1638400*i16+24576*j11+16384*s13+256*k60, out421);
_mm512_storeu_ps(dfPtr4+1228800+1638400*i16+24576*j11+16384*s13+256*k60, out410);
_mm512_storeu_ps(dfPtr4+1228928+1638400*i16+24576*j11+16384*s13+256*k60, out418);
_mm512_storeu_ps(dfPtr4+1228864+1638400*i16+24576*j11+16384*s13+256*k60, out414);
_mm512_storeu_ps(dfPtr4+1228992+1638400*i16+24576*j11+16384*s13+256*k60, out422);
}
if (j11 >= last3) return;
++j11;
}

/*
 * Packages ResNet50ThreeArrangeDats1Callee1 together with the tensor
 * pointer table into a 4-dimensional task whose hull is {1, 8, 1, 1},
 * then submits it to the threader team.
 * NOTE(review): presumably ResNet50ThreaderDo1 invokes the callee once per
 * point inside the hull -- confirm against the threader implementation.
 */
static void ResNet50ThreeArrangeDats1(ResNet50ThreaderTeam1* team23, char** tensors19) {
    static const int extents[4] = {1, 8, 1, 1};
    ResNet50ThreaderTask1 job;
    int axis;
    job.callee1 = ResNet50ThreeArrangeDats1Callee1;
    job.any1 = tensors19;
    job.nd1 = 4;
    for (axis = 0; axis != 4; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    ResNet50ThreaderDo1(team23, &job);
}

/*
 * Worker for one point of the ProduceSums task.
 *
 * task24->any1 is an array of pointers whose element 0 is the tensor
 * pointer table:
 *   table[0]       : start of the "bf" floats; "wf" data begins 256 bytes in
 *   table[1]       : "df" input vectors
 *   table[2]       : "sf" output vectors
 * pt17[0] selects a run of eight consecutive l indices (8*pt17[0] .. +7),
 * pt17[1] selects the k tile (value 16 takes the narrower 4-vector path),
 * pt17[2] selects the j plane (plane 0 seeds the accumulators from bf).
 *
 * For every l the routine keeps a 4 x N grid of accumulators (N = 6 for
 * k != 16, N = 4 for k == 16).  Each of 64 steps loads 64 half-precision
 * wf values (widened to four float vectors) and N df vectors, and does
 * acc[r][c] += wf[r] * df[c].  The grid is then stored to sf row-major.
 */
static void ResNet50ThreeProduceSums1Callee1(ResNet50ThreaderTask1* task24, int64_t* pt17) {
    void** bundle = task24->any1;
    char** table = bundle[0];
    const ptrdiff_t half = pt17[0];
    const ptrdiff_t tile = pt17[1];
    const ptrdiff_t plane = pt17[2];
    char*restrict bf = table[0];
    char*restrict wf = table[0]+256;
    char*restrict df = table[1];
    char*restrict sf = table[2];
    /* Byte offset shared by every df load and sf store of this task. */
    const ptrdiff_t tileOff = 409600*plane+24576*tile;
    const ptrdiff_t lFirst = 8*half;
    const ptrdiff_t lLast = lFirst+7;
    ptrdiff_t l;

    if (tile != 16) {
        /* Six df vectors per step (384 bytes), 24 result vectors per l. */
        for (l = lFirst; l != 16; ++l) {
            char* wRow = wf+131072*plane+8192*l;
            char* out = sf+tileOff+1536*l;
            const __m512 zero = _mm512_setzero_ps();
            __m512 s00 = zero, s10 = zero, s20 = zero, s30 = zero;
            ptrdiff_t b;
            if (__builtin_expect(!plane, 0)) {
                /* Mask 512 keeps only lane 9 of the broadcast value. */
                const float* seed = (const float*)(bf+16*l);
                s00 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(seed[0]));
                s10 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(seed[1]));
                s20 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(seed[2]));
                s30 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(seed[3]));
            }
            __m512 s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
            __m512 s11 = s10, s12 = s10, s13 = s10, s14 = s10, s15 = s10;
            __m512 s21 = s20, s22 = s20, s23 = s20, s24 = s20, s25 = s20;
            __m512 s31 = s30, s32 = s30, s33 = s30, s34 = s30, s35 = s30;
            for (b = 0; b != 64; ++b) {
                const char* src = df+tileOff+384*b;
                const __m512i packA = _mm512_maskz_loadu_epi32(65535, wRow+128*b);
                const __m512i packB = _mm512_maskz_loadu_epi32(65535, wRow+64+128*b);
                /* Each 512-bit load carries two 256-bit groups of halves. */
                const __m512 w0 = _mm512_cvtph_ps(_mm512_castsi512_si256(packA));
                const __m512 w1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packA, 1));
                const __m512 w2 = _mm512_cvtph_ps(_mm512_castsi512_si256(packB));
                const __m512 w3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packB, 1));
                const __m512 d0 = _mm512_loadu_ps(src);
                const __m512 d1 = _mm512_loadu_ps(src+64);
                const __m512 d2 = _mm512_loadu_ps(src+128);
                const __m512 d3 = _mm512_loadu_ps(src+192);
                const __m512 d4 = _mm512_loadu_ps(src+256);
                const __m512 d5 = _mm512_loadu_ps(src+320);
                s00 = _mm512_fmadd_ps(w0, d0, s00);
                s01 = _mm512_fmadd_ps(w0, d1, s01);
                s02 = _mm512_fmadd_ps(w0, d2, s02);
                s03 = _mm512_fmadd_ps(w0, d3, s03);
                s04 = _mm512_fmadd_ps(w0, d4, s04);
                s05 = _mm512_fmadd_ps(w0, d5, s05);
                s10 = _mm512_fmadd_ps(w1, d0, s10);
                s11 = _mm512_fmadd_ps(w1, d1, s11);
                s12 = _mm512_fmadd_ps(w1, d2, s12);
                s13 = _mm512_fmadd_ps(w1, d3, s13);
                s14 = _mm512_fmadd_ps(w1, d4, s14);
                s15 = _mm512_fmadd_ps(w1, d5, s15);
                s20 = _mm512_fmadd_ps(w2, d0, s20);
                s21 = _mm512_fmadd_ps(w2, d1, s21);
                s22 = _mm512_fmadd_ps(w2, d2, s22);
                s23 = _mm512_fmadd_ps(w2, d3, s23);
                s24 = _mm512_fmadd_ps(w2, d4, s24);
                s25 = _mm512_fmadd_ps(w2, d5, s25);
                s30 = _mm512_fmadd_ps(w3, d0, s30);
                s31 = _mm512_fmadd_ps(w3, d1, s31);
                s32 = _mm512_fmadd_ps(w3, d2, s32);
                s33 = _mm512_fmadd_ps(w3, d3, s33);
                s34 = _mm512_fmadd_ps(w3, d4, s34);
                s35 = _mm512_fmadd_ps(w3, d5, s35);
            }
            _mm512_storeu_ps(out+0, s00);
            _mm512_storeu_ps(out+64, s01);
            _mm512_storeu_ps(out+128, s02);
            _mm512_storeu_ps(out+192, s03);
            _mm512_storeu_ps(out+256, s04);
            _mm512_storeu_ps(out+320, s05);
            _mm512_storeu_ps(out+384, s10);
            _mm512_storeu_ps(out+448, s11);
            _mm512_storeu_ps(out+512, s12);
            _mm512_storeu_ps(out+576, s13);
            _mm512_storeu_ps(out+640, s14);
            _mm512_storeu_ps(out+704, s15);
            _mm512_storeu_ps(out+768, s20);
            _mm512_storeu_ps(out+832, s21);
            _mm512_storeu_ps(out+896, s22);
            _mm512_storeu_ps(out+960, s23);
            _mm512_storeu_ps(out+1024, s24);
            _mm512_storeu_ps(out+1088, s25);
            _mm512_storeu_ps(out+1152, s30);
            _mm512_storeu_ps(out+1216, s31);
            _mm512_storeu_ps(out+1280, s32);
            _mm512_storeu_ps(out+1344, s33);
            _mm512_storeu_ps(out+1408, s34);
            _mm512_storeu_ps(out+1472, s35);
            if (l >= lLast) return;
        }
        /* Only one k tile is handled per call. */
        return;
    }

    /* tile == 16: four df vectors per step (256 bytes), 16 results per l. */
    for (l = lFirst; l != 16; ++l) {
        char* wRow = wf+131072*plane+8192*l;
        char* out = sf+tileOff+1024*l;
        const __m512 zero = _mm512_setzero_ps();
        __m512 s00 = zero, s10 = zero, s20 = zero, s30 = zero;
        ptrdiff_t b;
        if (__builtin_expect(!plane, 0)) {
            /* Mask 512 keeps only lane 9 of the broadcast value. */
            const float* seed = (const float*)(bf+16*l);
            s00 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(seed[0]));
            s10 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(seed[1]));
            s20 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(seed[2]));
            s30 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(seed[3]));
        }
        __m512 s01 = s00, s02 = s00, s03 = s00;
        __m512 s11 = s10, s12 = s10, s13 = s10;
        __m512 s21 = s20, s22 = s20, s23 = s20;
        __m512 s31 = s30, s32 = s30, s33 = s30;
        for (b = 0; b != 64; ++b) {
            const char* src = df+tileOff+256*b;
            const __m512i packA = _mm512_maskz_loadu_epi32(65535, wRow+128*b);
            const __m512i packB = _mm512_maskz_loadu_epi32(65535, wRow+64+128*b);
            const __m512 w0 = _mm512_cvtph_ps(_mm512_castsi512_si256(packA));
            const __m512 w1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packA, 1));
            const __m512 w2 = _mm512_cvtph_ps(_mm512_castsi512_si256(packB));
            const __m512 w3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packB, 1));
            const __m512 d0 = _mm512_loadu_ps(src);
            const __m512 d1 = _mm512_loadu_ps(src+64);
            const __m512 d2 = _mm512_loadu_ps(src+128);
            const __m512 d3 = _mm512_loadu_ps(src+192);
            s00 = _mm512_fmadd_ps(w0, d0, s00);
            s01 = _mm512_fmadd_ps(w0, d1, s01);
            s02 = _mm512_fmadd_ps(w0, d2, s02);
            s03 = _mm512_fmadd_ps(w0, d3, s03);
            s10 = _mm512_fmadd_ps(w1, d0, s10);
            s11 = _mm512_fmadd_ps(w1, d1, s11);
            s12 = _mm512_fmadd_ps(w1, d2, s12);
            s13 = _mm512_fmadd_ps(w1, d3, s13);
            s20 = _mm512_fmadd_ps(w2, d0, s20);
            s21 = _mm512_fmadd_ps(w2, d1, s21);
            s22 = _mm512_fmadd_ps(w2, d2, s22);
            s23 = _mm512_fmadd_ps(w2, d3, s23);
            s30 = _mm512_fmadd_ps(w3, d0, s30);
            s31 = _mm512_fmadd_ps(w3, d1, s31);
            s32 = _mm512_fmadd_ps(w3, d2, s32);
            s33 = _mm512_fmadd_ps(w3, d3, s33);
        }
        _mm512_storeu_ps(out+0, s00);
        _mm512_storeu_ps(out+64, s01);
        _mm512_storeu_ps(out+128, s02);
        _mm512_storeu_ps(out+192, s03);
        _mm512_storeu_ps(out+256, s10);
        _mm512_storeu_ps(out+320, s11);
        _mm512_storeu_ps(out+384, s12);
        _mm512_storeu_ps(out+448, s13);
        _mm512_storeu_ps(out+512, s20);
        _mm512_storeu_ps(out+576, s21);
        _mm512_storeu_ps(out+640, s22);
        _mm512_storeu_ps(out+704, s23);
        _mm512_storeu_ps(out+768, s30);
        _mm512_storeu_ps(out+832, s31);
        _mm512_storeu_ps(out+896, s32);
        _mm512_storeu_ps(out+960, s33);
        if (l >= lLast) return;
    }
}

/*
 * Wraps the tensor pointer table in a two-slot pointer array (second slot
 * null), builds a 4-dimensional task for ResNet50ThreeProduceSums1Callee1
 * with hull {2, 17, 4, 1}, and submits it to the threader team.
 * NOTE(review): presumably ResNet50ThreaderDo1 invokes the callee once per
 * point inside the hull -- confirm against the threader implementation.
 */
static void ResNet50ThreeProduceSums1(ResNet50ThreaderTeam1* team24, char** tensors21) {
    static const int extents[4] = {2, 17, 4, 1};
    void* bundle[] = {tensors21, 0};
    ResNet50ThreaderTask1 job;
    int axis;
    job.callee1 = ResNet50ThreeProduceSums1Callee1;
    job.any1 = bundle;
    job.nd1 = 4;
    for (axis = 0; axis != 4; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    ResNet50ThreaderDo1(team24, &job);
}

static void ResNet50ThreeConsumeSums1Callee1(ResNet50ThreaderTask1* task26, int64_t* pt18) {
char** tensors24 = task26->any1;
ptrdiff_t w33 = 0;
ptrdiff_t d5 = pt18[1];
ptrdiff_t g10 = 0;
char*restrict sfPtr5 = tensors24[0];
char*restrict datPtr6 = tensors24[1];
ptrdiff_t i18 = 1*g10;
ptrdiff_t j13 = 2*d5;
ptrdiff_t last4 = j13+(d5 < 7 ? 1 : 2);
if (j13 < 2) {
ptrdiff_t rel10 = j13-0;
ptrdiff_t base10 = 0;
if (rel10 < 1) {
ptrdiff_t toH20 = base10+0;
ptrdiff_t toW20 = 0;
ptrdiff_t k62 = 16*w33;
for (; k62 != 16; ++k62) {
ptrdiff_t l14 = 0;
for (; l14 != 2; ++l14) {
__m512 sf1 = _mm512_loadu_ps(sfPtr5+0+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf2 = _mm512_loadu_ps(sfPtr5+128+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in364 = _mm512_shuffle_f32x4(sf1, sf2, 68);
__m512 in365 = _mm512_shuffle_f32x4(sf1, sf2, 238);
__m512 sf3 = _mm512_loadu_ps(sfPtr5+64+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf4 = _mm512_loadu_ps(sfPtr5+192+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in372 = _mm512_shuffle_f32x4(sf3, sf4, 68);
__m512 in373 = _mm512_shuffle_f32x4(sf3, sf4, 238);
__m512 sf5 = _mm512_loadu_ps(sfPtr5+409600+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf6 = _mm512_loadu_ps(sfPtr5+409728+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in366 = _mm512_shuffle_f32x4(sf5, sf6, 68);
__m512 in367 = _mm512_shuffle_f32x4(sf5, sf6, 238);
__m512 sf7 = _mm512_loadu_ps(sfPtr5+409664+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf8 = _mm512_loadu_ps(sfPtr5+409792+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in374 = _mm512_shuffle_f32x4(sf7, sf8, 68);
__m512 in375 = _mm512_shuffle_f32x4(sf7, sf8, 238);
__m512 sf9 = _mm512_loadu_ps(sfPtr5+819200+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf10 = _mm512_loadu_ps(sfPtr5+819328+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in368 = _mm512_shuffle_f32x4(sf9, sf10, 68);
__m512 in369 = _mm512_shuffle_f32x4(sf9, sf10, 238);
__m512 sf11 = _mm512_loadu_ps(sfPtr5+819264+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf12 = _mm512_loadu_ps(sfPtr5+819392+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in376 = _mm512_shuffle_f32x4(sf11, sf12, 68);
__m512 in377 = _mm512_shuffle_f32x4(sf11, sf12, 238);
__m512 sf13 = _mm512_loadu_ps(sfPtr5+1228800+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf14 = _mm512_loadu_ps(sfPtr5+1228928+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in370 = _mm512_shuffle_f32x4(sf13, sf14, 68);
__m512 in371 = _mm512_shuffle_f32x4(sf13, sf14, 238);
__m512 sf15 = _mm512_loadu_ps(sfPtr5+1228864+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf16 = _mm512_loadu_ps(sfPtr5+1228992+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in378 = _mm512_shuffle_f32x4(sf15, sf16, 68);
__m512 in379 = _mm512_shuffle_f32x4(sf15, sf16, 238);
__m512 tmp1941 = _mm512_add_ps(in365, in366);
__m512 tmp1961 = _mm512_add_ps(in373, in374);
__m512 tmp1940 = _mm512_add_ps(in367, in368);
__m512 tmp1960 = _mm512_add_ps(in375, in376);
__m512 tmp1946 = _mm512_sub_ps(in367, in368);
__m512 tmp1966 = _mm512_sub_ps(in375, in376);
__m512 tmp1945 = _mm512_sub_ps(in365, in366);
__m512 tmp1965 = _mm512_sub_ps(in373, in374);
__m512 tmp1942 = _mm512_add_ps(in369, in370);
__m512 tmp1962 = _mm512_add_ps(in377, in378);
__m512 tmp1947 = _mm512_sub_ps(in369, in370);
__m512 tmp1967 = _mm512_sub_ps(in377, in378);
__m512 tmp1944 = _mm512_fmadd_ps(tmp1946, _mm512_set1_ps(2e+00f), tmp1945);
__m512 tmp1964 = _mm512_fmadd_ps(tmp1966, _mm512_set1_ps(2e+00f), tmp1965);
__m512 tmp1951 = _mm512_fmadd_ps(tmp1946, _mm512_set1_ps(8e+00f), tmp1945);
__m512 tmp1971 = _mm512_fmadd_ps(tmp1966, _mm512_set1_ps(8e+00f), tmp1965);
__m512 tmp1939 = _mm512_add_ps(tmp1940, tmp1941);
__m512 tmp1959 = _mm512_add_ps(tmp1960, tmp1961);
__m512 tmp1943 = _mm512_fmadd_ps(tmp1947, _mm512_set1_ps(1.6e+01f), tmp1944);
__m512 tmp1963 = _mm512_fmadd_ps(tmp1967, _mm512_set1_ps(1.6e+01f), tmp1964);
__m512 tmp1950 = _mm512_fmadd_ps(tmp1947, _mm512_set1_ps(4e+00f), tmp1951);
__m512 tmp1970 = _mm512_fmadd_ps(tmp1967, _mm512_set1_ps(4e+00f), tmp1971);
__m512 tmp1956 = _mm512_add_ps(tmp1947, tmp1945);
__m512 tmp1976 = _mm512_add_ps(tmp1967, tmp1965);
__m512 tmp1949 = _mm512_fmadd_ps(tmp1940, _mm512_set1_ps(4e+00f), tmp1941);
__m512 tmp1969 = _mm512_fmadd_ps(tmp1960, _mm512_set1_ps(4e+00f), tmp1961);
__m512 tmp1953 = _mm512_fmadd_ps(tmp1940, _mm512_set1_ps(1.6e+01f), tmp1941);
__m512 tmp1973 = _mm512_fmadd_ps(tmp1960, _mm512_set1_ps(1.6e+01f), tmp1961);
__m512 tmp1938 = _mm512_add_ps(tmp1939, in364);
__m512 tmp1958 = _mm512_add_ps(tmp1959, in372);
__m512 tmp1955 = _mm512_add_ps(tmp1956, in371);
__m512 tmp1975 = _mm512_add_ps(tmp1976, in379);
__m512 tmp1937 = _mm512_fmadd_ps(tmp1942, _mm512_set1_ps(3.2e+01f), tmp1938);
__m512 tmp1957 = _mm512_fmadd_ps(tmp1962, _mm512_set1_ps(3.2e+01f), tmp1958);
__m512 tmp1948 = _mm512_fmadd_ps(tmp1942, _mm512_set1_ps(8e+00f), tmp1949);
__m512 tmp1968 = _mm512_fmadd_ps(tmp1962, _mm512_set1_ps(8e+00f), tmp1969);
__m512 tmp1954 = _mm512_fmadd_ps(tmp1946, _mm512_set1_ps(3.2e+01f), tmp1955);
__m512 tmp1974 = _mm512_fmadd_ps(tmp1966, _mm512_set1_ps(3.2e+01f), tmp1975);
__m512 tmp1952 = _mm512_fmadd_ps(tmp1942, _mm512_set1_ps(2e+00f), tmp1953);
__m512 tmp1972 = _mm512_fmadd_ps(tmp1962, _mm512_set1_ps(2e+00f), tmp1973);
__m512 tmp1925 = tmp1937;
__m512 tmp1931 = tmp1957;
__m512 tmp1926 = tmp1943;
__m512 tmp1932 = tmp1963;
__m512 tmp1927 = tmp1948;
__m512 tmp1933 = tmp1968;
__m512 tmp1928 = tmp1950;
__m512 tmp1934 = tmp1970;
__m512 tmp1929 = tmp1952;
__m512 tmp1935 = tmp1972;
__m512 tmp1930 = tmp1954;
__m512 tmp1936 = tmp1974;
__m512 tmp2021 = _mm512_unpacklo_ps(tmp1925, tmp1926);
__m512 tmp2022 = _mm512_unpackhi_ps(tmp1925, tmp1926);
__m512 tmp2023 = _mm512_unpacklo_ps(tmp1927, tmp1928);
__m512 tmp2024 = _mm512_unpackhi_ps(tmp1927, tmp1928);
__m512 tmp2025 = _mm512_unpacklo_ps(tmp1929, tmp1930);
__m512 tmp2026 = _mm512_unpackhi_ps(tmp1929, tmp1930);
__m512 tmp2027 = _mm512_unpacklo_ps(tmp1931, tmp1932);
__m512 tmp2028 = _mm512_unpackhi_ps(tmp1931, tmp1932);
__m512 tmp2029 = _mm512_unpacklo_ps(tmp1933, tmp1934);
__m512 tmp2030 = _mm512_unpackhi_ps(tmp1933, tmp1934);
__m512 tmp2031 = _mm512_unpacklo_ps(tmp1935, tmp1936);
__m512 tmp2032 = _mm512_unpackhi_ps(tmp1935, tmp1936);
__m512 tmp2033 = _mm512_shuffle_ps(tmp2021, tmp2023, 68);
__m512 tmp2034 = _mm512_shuffle_ps(tmp2021, tmp2023, 238);
__m512 tmp2035 = _mm512_shuffle_ps(tmp2022, tmp2024, 68);
__m512 tmp2036 = _mm512_shuffle_ps(tmp2022, tmp2024, 238);
__m512 tmp2037 = _mm512_shuffle_ps(tmp2025, tmp2027, 68);
__m512 tmp2038 = _mm512_shuffle_ps(tmp2025, tmp2027, 238);
__m512 tmp2039 = _mm512_shuffle_ps(tmp2026, tmp2028, 68);
__m512 tmp2040 = _mm512_shuffle_ps(tmp2026, tmp2028, 238);
__m512 tmp2041 = _mm512_shuffle_ps(tmp2029, tmp2031, 68);
__m512 tmp2042 = _mm512_shuffle_ps(tmp2029, tmp2031, 238);
__m512 tmp2043 = _mm512_shuffle_ps(tmp2030, tmp2032, 68);
__m512 tmp2044 = _mm512_shuffle_ps(tmp2030, tmp2032, 238);
__m512 tmp2045 = _mm512_shuffle_f32x4(tmp2033, tmp2037, 136);
__m512 tmp2046 = _mm512_shuffle_f32x4(tmp2033, tmp2037, 221);
__m512 tmp2047 = _mm512_shuffle_f32x4(tmp2034, tmp2038, 136);
__m512 tmp2048 = _mm512_shuffle_f32x4(tmp2034, tmp2038, 221);
__m512 tmp2049 = _mm512_shuffle_f32x4(tmp2035, tmp2039, 136);
__m512 tmp2050 = _mm512_shuffle_f32x4(tmp2035, tmp2039, 221);
__m512 tmp2051 = _mm512_shuffle_f32x4(tmp2036, tmp2040, 136);
__m512 tmp2052 = _mm512_shuffle_f32x4(tmp2036, tmp2040, 221);
__m512 tmp2053 = _mm512_shuffle_f32x4(tmp2041, tmp2041, 136);
__m512 tmp2054 = _mm512_shuffle_f32x4(tmp2041, tmp2041, 221);
__m512 tmp2055 = _mm512_shuffle_f32x4(tmp2042, tmp2042, 136);
__m512 tmp2056 = _mm512_shuffle_f32x4(tmp2042, tmp2042, 221);
__m512 tmp2057 = _mm512_shuffle_f32x4(tmp2043, tmp2043, 136);
__m512 tmp2058 = _mm512_shuffle_f32x4(tmp2043, tmp2043, 221);
__m512 tmp2059 = _mm512_shuffle_f32x4(tmp2044, tmp2044, 136);
__m512 tmp2060 = _mm512_shuffle_f32x4(tmp2044, tmp2044, 221);
tmp1925 = _mm512_shuffle_f32x4(tmp2045, tmp2053, 136);
tmp1933 = _mm512_shuffle_f32x4(tmp2045, tmp2053, 221);
tmp1926 = _mm512_shuffle_f32x4(tmp2047, tmp2055, 136);
tmp1934 = _mm512_shuffle_f32x4(tmp2047, tmp2055, 221);
tmp1927 = _mm512_shuffle_f32x4(tmp2049, tmp2057, 136);
tmp1935 = _mm512_shuffle_f32x4(tmp2049, tmp2057, 221);
tmp1928 = _mm512_shuffle_f32x4(tmp2051, tmp2059, 136);
tmp1936 = _mm512_shuffle_f32x4(tmp2051, tmp2059, 221);
tmp1929 = _mm512_shuffle_f32x4(tmp2046, tmp2054, 136);
__m512 tmp1977 = _mm512_shuffle_f32x4(tmp2046, tmp2054, 221);
tmp1930 = _mm512_shuffle_f32x4(tmp2048, tmp2056, 136);
__m512 tmp1978 = _mm512_shuffle_f32x4(tmp2048, tmp2056, 221);
tmp1931 = _mm512_shuffle_f32x4(tmp2050, tmp2058, 136);
__m512 tmp1979 = _mm512_shuffle_f32x4(tmp2050, tmp2058, 221);
tmp1932 = _mm512_shuffle_f32x4(tmp2052, tmp2060, 136);
__m512 tmp1980 = _mm512_shuffle_f32x4(tmp2052, tmp2060, 221);
__m512 tmp1985 = _mm512_add_ps(tmp1926, tmp1927);
__m512 tmp2005 = _mm512_add_ps(tmp1934, tmp1935);
__m512 tmp1984 = _mm512_add_ps(tmp1928, tmp1929);
__m512 tmp2004 = _mm512_add_ps(tmp1936, tmp1977);
__m512 tmp1990 = _mm512_sub_ps(tmp1928, tmp1929);
__m512 tmp2010 = _mm512_sub_ps(tmp1936, tmp1977);
__m512 tmp1989 = _mm512_sub_ps(tmp1926, tmp1927);
__m512 tmp2009 = _mm512_sub_ps(tmp1934, tmp1935);
__m512 tmp1986 = _mm512_add_ps(tmp1930, tmp1931);
__m512 tmp2006 = _mm512_add_ps(tmp1978, tmp1979);
__m512 tmp1991 = _mm512_sub_ps(tmp1930, tmp1931);
__m512 tmp2011 = _mm512_sub_ps(tmp1978, tmp1979);
__m512 tmp1988 = _mm512_fmadd_ps(tmp1990, _mm512_set1_ps(2e+00f), tmp1989);
__m512 tmp2008 = _mm512_fmadd_ps(tmp2010, _mm512_set1_ps(2e+00f), tmp2009);
__m512 tmp1995 = _mm512_fmadd_ps(tmp1990, _mm512_set1_ps(8e+00f), tmp1989);
__m512 tmp2015 = _mm512_fmadd_ps(tmp2010, _mm512_set1_ps(8e+00f), tmp2009);
__m512 tmp1983 = _mm512_add_ps(tmp1984, tmp1985);
__m512 tmp2003 = _mm512_add_ps(tmp2004, tmp2005);
__m512 tmp1987 = _mm512_fmadd_ps(tmp1991, _mm512_set1_ps(1.6e+01f), tmp1988);
__m512 tmp2007 = _mm512_fmadd_ps(tmp2011, _mm512_set1_ps(1.6e+01f), tmp2008);
__m512 tmp1994 = _mm512_fmadd_ps(tmp1991, _mm512_set1_ps(4e+00f), tmp1995);
__m512 tmp2014 = _mm512_fmadd_ps(tmp2011, _mm512_set1_ps(4e+00f), tmp2015);
__m512 tmp2000 = _mm512_add_ps(tmp1991, tmp1989);
__m512 tmp2020 = _mm512_add_ps(tmp2011, tmp2009);
__m512 tmp1993 = _mm512_fmadd_ps(tmp1984, _mm512_set1_ps(4e+00f), tmp1985);
__m512 tmp2013 = _mm512_fmadd_ps(tmp2004, _mm512_set1_ps(4e+00f), tmp2005);
__m512 tmp1997 = _mm512_fmadd_ps(tmp1984, _mm512_set1_ps(1.6e+01f), tmp1985);
__m512 tmp2017 = _mm512_fmadd_ps(tmp2004, _mm512_set1_ps(1.6e+01f), tmp2005);
__m512 tmp1982 = _mm512_add_ps(tmp1983, tmp1925);
__m512 tmp2002 = _mm512_add_ps(tmp2003, tmp1933);
__m512 tmp1999 = _mm512_add_ps(tmp2000, tmp1932);
__m512 tmp2019 = _mm512_add_ps(tmp2020, tmp1980);
__m512 tmp1981 = _mm512_fmadd_ps(tmp1986, _mm512_set1_ps(3.2e+01f), tmp1982);
__m512 tmp2001 = _mm512_fmadd_ps(tmp2006, _mm512_set1_ps(3.2e+01f), tmp2002);
__m512 tmp1992 = _mm512_fmadd_ps(tmp1986, _mm512_set1_ps(8e+00f), tmp1993);
__m512 tmp2012 = _mm512_fmadd_ps(tmp2006, _mm512_set1_ps(8e+00f), tmp2013);
__m512 tmp1998 = _mm512_fmadd_ps(tmp1990, _mm512_set1_ps(3.2e+01f), tmp1999);
__m512 tmp2018 = _mm512_fmadd_ps(tmp2010, _mm512_set1_ps(3.2e+01f), tmp2019);
__m512 tmp1996 = _mm512_fmadd_ps(tmp1986, _mm512_set1_ps(2e+00f), tmp1997);
__m512 tmp2016 = _mm512_fmadd_ps(tmp2006, _mm512_set1_ps(2e+00f), tmp2017);
__m512 out423 = tmp1981;
__m512 out429 = tmp2001;
__m512 out424 = tmp1987;
__m512 out430 = tmp2007;
__m512 out425 = tmp1992;
__m512 out431 = tmp2012;
__m512 out426 = tmp1994;
__m512 out432 = tmp2014;
__m512 out427 = tmp1996;
__m512 out433 = tmp2016;
__m512 out428 = tmp1998;
__m512 out434 = tmp2018;
out423 = _mm512_max_ps(_mm512_setzero_ps(), out423);
out429 = _mm512_max_ps(_mm512_setzero_ps(), out429);
out424 = _mm512_max_ps(_mm512_setzero_ps(), out424);
out430 = _mm512_max_ps(_mm512_setzero_ps(), out430);
out425 = _mm512_max_ps(_mm512_setzero_ps(), out425);
out431 = _mm512_max_ps(_mm512_setzero_ps(), out431);
out426 = _mm512_max_ps(_mm512_setzero_ps(), out426);
out432 = _mm512_max_ps(_mm512_setzero_ps(), out432);
out427 = _mm512_max_ps(_mm512_setzero_ps(), out427);
out433 = _mm512_max_ps(_mm512_setzero_ps(), out433);
out428 = _mm512_max_ps(_mm512_setzero_ps(), out428);
out434 = _mm512_max_ps(_mm512_setzero_ps(), out434);
_mm512_mask_storeu_ps(datPtr6+0+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out423);
_mm512_mask_storeu_ps(datPtr6+48+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out429);
_mm512_mask_storeu_ps(datPtr6+224+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out424);
_mm512_mask_storeu_ps(datPtr6+272+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out430);
_mm512_mask_storeu_ps(datPtr6+448+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out425);
_mm512_mask_storeu_ps(datPtr6+496+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out431);
_mm512_mask_storeu_ps(datPtr6+672+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out426);
_mm512_mask_storeu_ps(datPtr6+720+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out432);
_mm512_mask_storeu_ps(datPtr6+896+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out427);
_mm512_mask_storeu_ps(datPtr6+944+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out433);
_mm512_mask_storeu_ps(datPtr6+1120+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out428);
_mm512_mask_storeu_ps(datPtr6+1168+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out434);
__m512 sf17 = _mm512_loadu_ps(sfPtr5+256+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf18 = _mm512_loadu_ps(sfPtr5+384+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in380 = _mm512_shuffle_f32x4(sf17, sf18, 68);
__m512 in381 = _mm512_shuffle_f32x4(sf17, sf18, 238);
__m512 sf19 = _mm512_loadu_ps(sfPtr5+320+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf20 = _mm512_loadu_ps(sfPtr5+448+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in388 = _mm512_shuffle_f32x4(sf19, sf20, 68);
__m512 in389 = _mm512_shuffle_f32x4(sf19, sf20, 238);
__m512 sf21 = _mm512_loadu_ps(sfPtr5+409856+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf22 = _mm512_loadu_ps(sfPtr5+409984+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in382 = _mm512_shuffle_f32x4(sf21, sf22, 68);
__m512 in383 = _mm512_shuffle_f32x4(sf21, sf22, 238);
__m512 sf23 = _mm512_loadu_ps(sfPtr5+409920+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf24 = _mm512_loadu_ps(sfPtr5+410048+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in390 = _mm512_shuffle_f32x4(sf23, sf24, 68);
__m512 in391 = _mm512_shuffle_f32x4(sf23, sf24, 238);
__m512 sf25 = _mm512_loadu_ps(sfPtr5+819456+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf26 = _mm512_loadu_ps(sfPtr5+819584+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in384 = _mm512_shuffle_f32x4(sf25, sf26, 68);
__m512 in385 = _mm512_shuffle_f32x4(sf25, sf26, 238);
__m512 sf27 = _mm512_loadu_ps(sfPtr5+819520+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf28 = _mm512_loadu_ps(sfPtr5+819648+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in392 = _mm512_shuffle_f32x4(sf27, sf28, 68);
__m512 in393 = _mm512_shuffle_f32x4(sf27, sf28, 238);
__m512 sf29 = _mm512_loadu_ps(sfPtr5+1229056+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf30 = _mm512_loadu_ps(sfPtr5+1229184+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in386 = _mm512_shuffle_f32x4(sf29, sf30, 68);
__m512 in387 = _mm512_shuffle_f32x4(sf29, sf30, 238);
__m512 sf31 = _mm512_loadu_ps(sfPtr5+1229120+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf32 = _mm512_loadu_ps(sfPtr5+1229248+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in394 = _mm512_shuffle_f32x4(sf31, sf32, 68);
__m512 in395 = _mm512_shuffle_f32x4(sf31, sf32, 238);
__m512 tmp2077 = _mm512_add_ps(in381, in382);
__m512 tmp2097 = _mm512_add_ps(in389, in390);
__m512 tmp2076 = _mm512_add_ps(in383, in384);
__m512 tmp2096 = _mm512_add_ps(in391, in392);
__m512 tmp2082 = _mm512_sub_ps(in383, in384);
__m512 tmp2102 = _mm512_sub_ps(in391, in392);
__m512 tmp2081 = _mm512_sub_ps(in381, in382);
__m512 tmp2101 = _mm512_sub_ps(in389, in390);
__m512 tmp2078 = _mm512_add_ps(in385, in386);
__m512 tmp2098 = _mm512_add_ps(in393, in394);
__m512 tmp2083 = _mm512_sub_ps(in385, in386);
__m512 tmp2103 = _mm512_sub_ps(in393, in394);
__m512 tmp2080 = _mm512_fmadd_ps(tmp2082, _mm512_set1_ps(2e+00f), tmp2081);
__m512 tmp2100 = _mm512_fmadd_ps(tmp2102, _mm512_set1_ps(2e+00f), tmp2101);
__m512 tmp2087 = _mm512_fmadd_ps(tmp2082, _mm512_set1_ps(8e+00f), tmp2081);
__m512 tmp2107 = _mm512_fmadd_ps(tmp2102, _mm512_set1_ps(8e+00f), tmp2101);
__m512 tmp2075 = _mm512_add_ps(tmp2076, tmp2077);
__m512 tmp2095 = _mm512_add_ps(tmp2096, tmp2097);
__m512 tmp2079 = _mm512_fmadd_ps(tmp2083, _mm512_set1_ps(1.6e+01f), tmp2080);
__m512 tmp2099 = _mm512_fmadd_ps(tmp2103, _mm512_set1_ps(1.6e+01f), tmp2100);
__m512 tmp2086 = _mm512_fmadd_ps(tmp2083, _mm512_set1_ps(4e+00f), tmp2087);
__m512 tmp2106 = _mm512_fmadd_ps(tmp2103, _mm512_set1_ps(4e+00f), tmp2107);
__m512 tmp2092 = _mm512_add_ps(tmp2083, tmp2081);
__m512 tmp2112 = _mm512_add_ps(tmp2103, tmp2101);
__m512 tmp2085 = _mm512_fmadd_ps(tmp2076, _mm512_set1_ps(4e+00f), tmp2077);
__m512 tmp2105 = _mm512_fmadd_ps(tmp2096, _mm512_set1_ps(4e+00f), tmp2097);
__m512 tmp2089 = _mm512_fmadd_ps(tmp2076, _mm512_set1_ps(1.6e+01f), tmp2077);
__m512 tmp2109 = _mm512_fmadd_ps(tmp2096, _mm512_set1_ps(1.6e+01f), tmp2097);
__m512 tmp2074 = _mm512_add_ps(tmp2075, in380);
__m512 tmp2094 = _mm512_add_ps(tmp2095, in388);
__m512 tmp2091 = _mm512_add_ps(tmp2092, in387);
__m512 tmp2111 = _mm512_add_ps(tmp2112, in395);
__m512 tmp2073 = _mm512_fmadd_ps(tmp2078, _mm512_set1_ps(3.2e+01f), tmp2074);
__m512 tmp2093 = _mm512_fmadd_ps(tmp2098, _mm512_set1_ps(3.2e+01f), tmp2094);
__m512 tmp2084 = _mm512_fmadd_ps(tmp2078, _mm512_set1_ps(8e+00f), tmp2085);
__m512 tmp2104 = _mm512_fmadd_ps(tmp2098, _mm512_set1_ps(8e+00f), tmp2105);
__m512 tmp2090 = _mm512_fmadd_ps(tmp2082, _mm512_set1_ps(3.2e+01f), tmp2091);
__m512 tmp2110 = _mm512_fmadd_ps(tmp2102, _mm512_set1_ps(3.2e+01f), tmp2111);
__m512 tmp2088 = _mm512_fmadd_ps(tmp2078, _mm512_set1_ps(2e+00f), tmp2089);
__m512 tmp2108 = _mm512_fmadd_ps(tmp2098, _mm512_set1_ps(2e+00f), tmp2109);
__m512 tmp2061 = tmp2073;
__m512 tmp2067 = tmp2093;
__m512 tmp2062 = tmp2079;
__m512 tmp2068 = tmp2099;
__m512 tmp2063 = tmp2084;
__m512 tmp2069 = tmp2104;
__m512 tmp2064 = tmp2086;
__m512 tmp2070 = tmp2106;
__m512 tmp2065 = tmp2088;
__m512 tmp2071 = tmp2108;
__m512 tmp2066 = tmp2090;
__m512 tmp2072 = tmp2110;
__m512 tmp2157 = _mm512_unpacklo_ps(tmp2061, tmp2062);
__m512 tmp2158 = _mm512_unpackhi_ps(tmp2061, tmp2062);
__m512 tmp2159 = _mm512_unpacklo_ps(tmp2063, tmp2064);
__m512 tmp2160 = _mm512_unpackhi_ps(tmp2063, tmp2064);
__m512 tmp2161 = _mm512_unpacklo_ps(tmp2065, tmp2066);
__m512 tmp2162 = _mm512_unpackhi_ps(tmp2065, tmp2066);
__m512 tmp2163 = _mm512_unpacklo_ps(tmp2067, tmp2068);
__m512 tmp2164 = _mm512_unpackhi_ps(tmp2067, tmp2068);
__m512 tmp2165 = _mm512_unpacklo_ps(tmp2069, tmp2070);
__m512 tmp2166 = _mm512_unpackhi_ps(tmp2069, tmp2070);
__m512 tmp2167 = _mm512_unpacklo_ps(tmp2071, tmp2072);
__m512 tmp2168 = _mm512_unpackhi_ps(tmp2071, tmp2072);
__m512 tmp2169 = _mm512_shuffle_ps(tmp2157, tmp2159, 68);
__m512 tmp2170 = _mm512_shuffle_ps(tmp2157, tmp2159, 238);
__m512 tmp2171 = _mm512_shuffle_ps(tmp2158, tmp2160, 68);
__m512 tmp2172 = _mm512_shuffle_ps(tmp2158, tmp2160, 238);
__m512 tmp2173 = _mm512_shuffle_ps(tmp2161, tmp2163, 68);
__m512 tmp2174 = _mm512_shuffle_ps(tmp2161, tmp2163, 238);
__m512 tmp2175 = _mm512_shuffle_ps(tmp2162, tmp2164, 68);
__m512 tmp2176 = _mm512_shuffle_ps(tmp2162, tmp2164, 238);
__m512 tmp2177 = _mm512_shuffle_ps(tmp2165, tmp2167, 68);
__m512 tmp2178 = _mm512_shuffle_ps(tmp2165, tmp2167, 238);
__m512 tmp2179 = _mm512_shuffle_ps(tmp2166, tmp2168, 68);
__m512 tmp2180 = _mm512_shuffle_ps(tmp2166, tmp2168, 238);
__m512 tmp2181 = _mm512_shuffle_f32x4(tmp2169, tmp2173, 136);
__m512 tmp2182 = _mm512_shuffle_f32x4(tmp2169, tmp2173, 221);
__m512 tmp2183 = _mm512_shuffle_f32x4(tmp2170, tmp2174, 136);
__m512 tmp2184 = _mm512_shuffle_f32x4(tmp2170, tmp2174, 221);
__m512 tmp2185 = _mm512_shuffle_f32x4(tmp2171, tmp2175, 136);
__m512 tmp2186 = _mm512_shuffle_f32x4(tmp2171, tmp2175, 221);
__m512 tmp2187 = _mm512_shuffle_f32x4(tmp2172, tmp2176, 136);
__m512 tmp2188 = _mm512_shuffle_f32x4(tmp2172, tmp2176, 221);
__m512 tmp2189 = _mm512_shuffle_f32x4(tmp2177, tmp2177, 136);
__m512 tmp2190 = _mm512_shuffle_f32x4(tmp2177, tmp2177, 221);
__m512 tmp2191 = _mm512_shuffle_f32x4(tmp2178, tmp2178, 136);
__m512 tmp2192 = _mm512_shuffle_f32x4(tmp2178, tmp2178, 221);
__m512 tmp2193 = _mm512_shuffle_f32x4(tmp2179, tmp2179, 136);
__m512 tmp2194 = _mm512_shuffle_f32x4(tmp2179, tmp2179, 221);
__m512 tmp2195 = _mm512_shuffle_f32x4(tmp2180, tmp2180, 136);
__m512 tmp2196 = _mm512_shuffle_f32x4(tmp2180, tmp2180, 221);
tmp2061 = _mm512_shuffle_f32x4(tmp2181, tmp2189, 136);
tmp2069 = _mm512_shuffle_f32x4(tmp2181, tmp2189, 221);
tmp2062 = _mm512_shuffle_f32x4(tmp2183, tmp2191, 136);
tmp2070 = _mm512_shuffle_f32x4(tmp2183, tmp2191, 221);
tmp2063 = _mm512_shuffle_f32x4(tmp2185, tmp2193, 136);
tmp2071 = _mm512_shuffle_f32x4(tmp2185, tmp2193, 221);
tmp2064 = _mm512_shuffle_f32x4(tmp2187, tmp2195, 136);
tmp2072 = _mm512_shuffle_f32x4(tmp2187, tmp2195, 221);
tmp2065 = _mm512_shuffle_f32x4(tmp2182, tmp2190, 136);
__m512 tmp2113 = _mm512_shuffle_f32x4(tmp2182, tmp2190, 221);
tmp2066 = _mm512_shuffle_f32x4(tmp2184, tmp2192, 136);
__m512 tmp2114 = _mm512_shuffle_f32x4(tmp2184, tmp2192, 221);
tmp2067 = _mm512_shuffle_f32x4(tmp2186, tmp2194, 136);
__m512 tmp2115 = _mm512_shuffle_f32x4(tmp2186, tmp2194, 221);
tmp2068 = _mm512_shuffle_f32x4(tmp2188, tmp2196, 136);
__m512 tmp2116 = _mm512_shuffle_f32x4(tmp2188, tmp2196, 221);
__m512 tmp2121 = _mm512_add_ps(tmp2062, tmp2063);
__m512 tmp2141 = _mm512_add_ps(tmp2070, tmp2071);
__m512 tmp2120 = _mm512_add_ps(tmp2064, tmp2065);
__m512 tmp2140 = _mm512_add_ps(tmp2072, tmp2113);
__m512 tmp2126 = _mm512_sub_ps(tmp2064, tmp2065);
__m512 tmp2146 = _mm512_sub_ps(tmp2072, tmp2113);
__m512 tmp2125 = _mm512_sub_ps(tmp2062, tmp2063);
__m512 tmp2145 = _mm512_sub_ps(tmp2070, tmp2071);
__m512 tmp2122 = _mm512_add_ps(tmp2066, tmp2067);
__m512 tmp2142 = _mm512_add_ps(tmp2114, tmp2115);
__m512 tmp2127 = _mm512_sub_ps(tmp2066, tmp2067);
__m512 tmp2147 = _mm512_sub_ps(tmp2114, tmp2115);
__m512 tmp2124 = _mm512_fmadd_ps(tmp2126, _mm512_set1_ps(2e+00f), tmp2125);
__m512 tmp2144 = _mm512_fmadd_ps(tmp2146, _mm512_set1_ps(2e+00f), tmp2145);
__m512 tmp2131 = _mm512_fmadd_ps(tmp2126, _mm512_set1_ps(8e+00f), tmp2125);
__m512 tmp2151 = _mm512_fmadd_ps(tmp2146, _mm512_set1_ps(8e+00f), tmp2145);
__m512 tmp2119 = _mm512_add_ps(tmp2120, tmp2121);
__m512 tmp2139 = _mm512_add_ps(tmp2140, tmp2141);
__m512 tmp2123 = _mm512_fmadd_ps(tmp2127, _mm512_set1_ps(1.6e+01f), tmp2124);
__m512 tmp2143 = _mm512_fmadd_ps(tmp2147, _mm512_set1_ps(1.6e+01f), tmp2144);
__m512 tmp2130 = _mm512_fmadd_ps(tmp2127, _mm512_set1_ps(4e+00f), tmp2131);
__m512 tmp2150 = _mm512_fmadd_ps(tmp2147, _mm512_set1_ps(4e+00f), tmp2151);
__m512 tmp2136 = _mm512_add_ps(tmp2127, tmp2125);
__m512 tmp2156 = _mm512_add_ps(tmp2147, tmp2145);
__m512 tmp2129 = _mm512_fmadd_ps(tmp2120, _mm512_set1_ps(4e+00f), tmp2121);
__m512 tmp2149 = _mm512_fmadd_ps(tmp2140, _mm512_set1_ps(4e+00f), tmp2141);
__m512 tmp2133 = _mm512_fmadd_ps(tmp2120, _mm512_set1_ps(1.6e+01f), tmp2121);
__m512 tmp2153 = _mm512_fmadd_ps(tmp2140, _mm512_set1_ps(1.6e+01f), tmp2141);
__m512 tmp2118 = _mm512_add_ps(tmp2119, tmp2061);
__m512 tmp2138 = _mm512_add_ps(tmp2139, tmp2069);
__m512 tmp2135 = _mm512_add_ps(tmp2136, tmp2068);
__m512 tmp2155 = _mm512_add_ps(tmp2156, tmp2116);
__m512 tmp2117 = _mm512_fmadd_ps(tmp2122, _mm512_set1_ps(3.2e+01f), tmp2118);
__m512 tmp2137 = _mm512_fmadd_ps(tmp2142, _mm512_set1_ps(3.2e+01f), tmp2138);
__m512 tmp2128 = _mm512_fmadd_ps(tmp2122, _mm512_set1_ps(8e+00f), tmp2129);
__m512 tmp2148 = _mm512_fmadd_ps(tmp2142, _mm512_set1_ps(8e+00f), tmp2149);
__m512 tmp2134 = _mm512_fmadd_ps(tmp2126, _mm512_set1_ps(3.2e+01f), tmp2135);
__m512 tmp2154 = _mm512_fmadd_ps(tmp2146, _mm512_set1_ps(3.2e+01f), tmp2155);
__m512 tmp2132 = _mm512_fmadd_ps(tmp2122, _mm512_set1_ps(2e+00f), tmp2133);
__m512 tmp2152 = _mm512_fmadd_ps(tmp2142, _mm512_set1_ps(2e+00f), tmp2153);
__m512 out435 = tmp2117;
__m512 out441 = tmp2137;
__m512 out436 = tmp2123;
__m512 out442 = tmp2143;
__m512 out437 = tmp2128;
__m512 out443 = tmp2148;
__m512 out438 = tmp2130;
__m512 out444 = tmp2150;
__m512 out439 = tmp2132;
__m512 out445 = tmp2152;
__m512 out440 = tmp2134;
__m512 out446 = tmp2154;
out435 = _mm512_max_ps(_mm512_setzero_ps(), out435);
out441 = _mm512_max_ps(_mm512_setzero_ps(), out441);
out436 = _mm512_max_ps(_mm512_setzero_ps(), out436);
out442 = _mm512_max_ps(_mm512_setzero_ps(), out442);
out437 = _mm512_max_ps(_mm512_setzero_ps(), out437);
out443 = _mm512_max_ps(_mm512_setzero_ps(), out443);
out438 = _mm512_max_ps(_mm512_setzero_ps(), out438);
out444 = _mm512_max_ps(_mm512_setzero_ps(), out444);
out439 = _mm512_max_ps(_mm512_setzero_ps(), out439);
out445 = _mm512_max_ps(_mm512_setzero_ps(), out445);
out440 = _mm512_max_ps(_mm512_setzero_ps(), out440);
out446 = _mm512_max_ps(_mm512_setzero_ps(), out446);
_mm512_mask_storeu_ps(datPtr6+96+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out435);
_mm512_mask_storeu_ps(datPtr6+12608+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out441);
_mm512_mask_storeu_ps(datPtr6+320+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out436);
_mm512_mask_storeu_ps(datPtr6+12832+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out442);
_mm512_mask_storeu_ps(datPtr6+544+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out437);
_mm512_mask_storeu_ps(datPtr6+13056+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out443);
_mm512_mask_storeu_ps(datPtr6+768+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out438);
_mm512_mask_storeu_ps(datPtr6+13280+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out444);
_mm512_mask_storeu_ps(datPtr6+992+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out439);
_mm512_mask_storeu_ps(datPtr6+13504+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out445);
_mm512_mask_storeu_ps(datPtr6+1216+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out440);
_mm512_mask_storeu_ps(datPtr6+13728+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out446);
// Load sixteen 512-bit vectors from sfPtr5 and regroup their 128-bit lanes.
// Four groups are read, at base offsets 512, 410112, 819712 and 1229312
// (409600 apart); inside a group the vectors sit at +0, +64, +128 and +192.
// _mm512_shuffle_f32x4(a, b, 68) yields lanes {a0, a1, b0, b1} and immediate
// 238 yields {a2, a3, b2, b3}, so each pair of loads taken 128 apart is split
// into a "lower halves" vector and an "upper halves" vector.
// NOTE(review): the offsets look like bytes (64 = one 512-bit vector), which
// assumes sfPtr5 is a byte pointer -- confirm at its declaration.
// Group at base 512 -> in396/in397 and in404/in405.
__m512 sf33 = _mm512_loadu_ps(sfPtr5+512+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf34 = _mm512_loadu_ps(sfPtr5+640+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in396 = _mm512_shuffle_f32x4(sf33, sf34, 68);
__m512 in397 = _mm512_shuffle_f32x4(sf33, sf34, 238);
__m512 sf35 = _mm512_loadu_ps(sfPtr5+576+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf36 = _mm512_loadu_ps(sfPtr5+704+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in404 = _mm512_shuffle_f32x4(sf35, sf36, 68);
__m512 in405 = _mm512_shuffle_f32x4(sf35, sf36, 238);
// Group at base 410112 -> in398/in399 and in406/in407.
__m512 sf37 = _mm512_loadu_ps(sfPtr5+410112+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf38 = _mm512_loadu_ps(sfPtr5+410240+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in398 = _mm512_shuffle_f32x4(sf37, sf38, 68);
__m512 in399 = _mm512_shuffle_f32x4(sf37, sf38, 238);
__m512 sf39 = _mm512_loadu_ps(sfPtr5+410176+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf40 = _mm512_loadu_ps(sfPtr5+410304+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in406 = _mm512_shuffle_f32x4(sf39, sf40, 68);
__m512 in407 = _mm512_shuffle_f32x4(sf39, sf40, 238);
// Group at base 819712 -> in400/in401 and in408/in409.
__m512 sf41 = _mm512_loadu_ps(sfPtr5+819712+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf42 = _mm512_loadu_ps(sfPtr5+819840+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in400 = _mm512_shuffle_f32x4(sf41, sf42, 68);
__m512 in401 = _mm512_shuffle_f32x4(sf41, sf42, 238);
__m512 sf43 = _mm512_loadu_ps(sfPtr5+819776+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf44 = _mm512_loadu_ps(sfPtr5+819904+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in408 = _mm512_shuffle_f32x4(sf43, sf44, 68);
__m512 in409 = _mm512_shuffle_f32x4(sf43, sf44, 238);
// Group at base 1229312 -> in402/in403 and in410/in411.
__m512 sf45 = _mm512_loadu_ps(sfPtr5+1229312+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf46 = _mm512_loadu_ps(sfPtr5+1229440+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in402 = _mm512_shuffle_f32x4(sf45, sf46, 68);
__m512 in403 = _mm512_shuffle_f32x4(sf45, sf46, 238);
__m512 sf47 = _mm512_loadu_ps(sfPtr5+1229376+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 sf48 = _mm512_loadu_ps(sfPtr5+1229504+1638400*i18+24576*j13+1536*k62+768*l14);
__m512 in410 = _mm512_shuffle_f32x4(sf47, sf48, 68);
__m512 in411 = _mm512_shuffle_f32x4(sf47, sf48, 238);
// First arithmetic pass: reduce eight input vectors to six, done twice in
// interleaved fashion (in396..in403 -> tmp2197..tmp2202 and
// in404..in411 -> tmp2203..tmp2208).
// With a..h = in396..in403 the results are, mathematically:
//   tmp2197 = a + (b+c) +    (d+e) + 32*(f+g)
//   tmp2198 =     (b-c) +  2*(d-e) + 16*(f-g)
//   tmp2199 =     (b+c) +  4*(d+e) +  8*(f+g)
//   tmp2200 =     (b-c) +  8*(d-e) +  4*(f-g)
//   tmp2201 =     (b+c) + 16*(d+e) +  2*(f+g)
//   tmp2202 =     (b-c) + 32*(d-e) +    (f-g) + h
// The code builds these from shared pair sums/differences with fused
// multiply-adds, so the floating-point evaluation order is the one below.
// NOTE(review): the weights resemble the output transform of a Winograd
// F(6,3)-style convolution -- confirm against the generator.
// Pairwise sums and differences.
__m512 tmp2213 = _mm512_add_ps(in397, in398);
__m512 tmp2233 = _mm512_add_ps(in405, in406);
__m512 tmp2212 = _mm512_add_ps(in399, in400);
__m512 tmp2232 = _mm512_add_ps(in407, in408);
__m512 tmp2218 = _mm512_sub_ps(in399, in400);
__m512 tmp2238 = _mm512_sub_ps(in407, in408);
__m512 tmp2217 = _mm512_sub_ps(in397, in398);
__m512 tmp2237 = _mm512_sub_ps(in405, in406);
__m512 tmp2214 = _mm512_add_ps(in401, in402);
__m512 tmp2234 = _mm512_add_ps(in409, in410);
__m512 tmp2219 = _mm512_sub_ps(in401, in402);
__m512 tmp2239 = _mm512_sub_ps(in409, in410);
// Weighted combinations; _mm512_fmadd_ps(x, w, y) computes x*w + y.
__m512 tmp2216 = _mm512_fmadd_ps(tmp2218, _mm512_set1_ps(2e+00f), tmp2217);
__m512 tmp2236 = _mm512_fmadd_ps(tmp2238, _mm512_set1_ps(2e+00f), tmp2237);
__m512 tmp2223 = _mm512_fmadd_ps(tmp2218, _mm512_set1_ps(8e+00f), tmp2217);
__m512 tmp2243 = _mm512_fmadd_ps(tmp2238, _mm512_set1_ps(8e+00f), tmp2237);
__m512 tmp2211 = _mm512_add_ps(tmp2212, tmp2213);
__m512 tmp2231 = _mm512_add_ps(tmp2232, tmp2233);
__m512 tmp2215 = _mm512_fmadd_ps(tmp2219, _mm512_set1_ps(1.6e+01f), tmp2216);
__m512 tmp2235 = _mm512_fmadd_ps(tmp2239, _mm512_set1_ps(1.6e+01f), tmp2236);
__m512 tmp2222 = _mm512_fmadd_ps(tmp2219, _mm512_set1_ps(4e+00f), tmp2223);
__m512 tmp2242 = _mm512_fmadd_ps(tmp2239, _mm512_set1_ps(4e+00f), tmp2243);
__m512 tmp2228 = _mm512_add_ps(tmp2219, tmp2217);
__m512 tmp2248 = _mm512_add_ps(tmp2239, tmp2237);
__m512 tmp2221 = _mm512_fmadd_ps(tmp2212, _mm512_set1_ps(4e+00f), tmp2213);
__m512 tmp2241 = _mm512_fmadd_ps(tmp2232, _mm512_set1_ps(4e+00f), tmp2233);
__m512 tmp2225 = _mm512_fmadd_ps(tmp2212, _mm512_set1_ps(1.6e+01f), tmp2213);
__m512 tmp2245 = _mm512_fmadd_ps(tmp2232, _mm512_set1_ps(1.6e+01f), tmp2233);
// The first and last inputs of each set enter unweighted.
__m512 tmp2210 = _mm512_add_ps(tmp2211, in396);
__m512 tmp2230 = _mm512_add_ps(tmp2231, in404);
__m512 tmp2227 = _mm512_add_ps(tmp2228, in403);
__m512 tmp2247 = _mm512_add_ps(tmp2248, in411);
__m512 tmp2209 = _mm512_fmadd_ps(tmp2214, _mm512_set1_ps(3.2e+01f), tmp2210);
__m512 tmp2229 = _mm512_fmadd_ps(tmp2234, _mm512_set1_ps(3.2e+01f), tmp2230);
__m512 tmp2220 = _mm512_fmadd_ps(tmp2214, _mm512_set1_ps(8e+00f), tmp2221);
__m512 tmp2240 = _mm512_fmadd_ps(tmp2234, _mm512_set1_ps(8e+00f), tmp2241);
__m512 tmp2226 = _mm512_fmadd_ps(tmp2218, _mm512_set1_ps(3.2e+01f), tmp2227);
__m512 tmp2246 = _mm512_fmadd_ps(tmp2238, _mm512_set1_ps(3.2e+01f), tmp2247);
__m512 tmp2224 = _mm512_fmadd_ps(tmp2214, _mm512_set1_ps(2e+00f), tmp2225);
__m512 tmp2244 = _mm512_fmadd_ps(tmp2234, _mm512_set1_ps(2e+00f), tmp2245);
// Collect the twelve results under consecutive names for the next stage.
__m512 tmp2197 = tmp2209;
__m512 tmp2203 = tmp2229;
__m512 tmp2198 = tmp2215;
__m512 tmp2204 = tmp2235;
__m512 tmp2199 = tmp2220;
__m512 tmp2205 = tmp2240;
__m512 tmp2200 = tmp2222;
__m512 tmp2206 = tmp2242;
__m512 tmp2201 = tmp2224;
__m512 tmp2207 = tmp2244;
__m512 tmp2202 = tmp2226;
__m512 tmp2208 = tmp2246;
// Rearrange the twelve vectors tmp2197..tmp2208 into sixteen vectors
// (tmp2197..tmp2208 are overwritten; tmp2249..tmp2252 are newly declared).
// NOTE(review): the overall effect looks like a 12x16 -> 16x12 transpose whose
// top four floats per vector are padding -- confirm against the later stores.
// Step 1: interleave 32-bit elements of neighbouring vector pairs.
__m512 tmp2293 = _mm512_unpacklo_ps(tmp2197, tmp2198);
__m512 tmp2294 = _mm512_unpackhi_ps(tmp2197, tmp2198);
__m512 tmp2295 = _mm512_unpacklo_ps(tmp2199, tmp2200);
__m512 tmp2296 = _mm512_unpackhi_ps(tmp2199, tmp2200);
__m512 tmp2297 = _mm512_unpacklo_ps(tmp2201, tmp2202);
__m512 tmp2298 = _mm512_unpackhi_ps(tmp2201, tmp2202);
__m512 tmp2299 = _mm512_unpacklo_ps(tmp2203, tmp2204);
__m512 tmp2300 = _mm512_unpackhi_ps(tmp2203, tmp2204);
__m512 tmp2301 = _mm512_unpacklo_ps(tmp2205, tmp2206);
__m512 tmp2302 = _mm512_unpackhi_ps(tmp2205, tmp2206);
__m512 tmp2303 = _mm512_unpacklo_ps(tmp2207, tmp2208);
__m512 tmp2304 = _mm512_unpackhi_ps(tmp2207, tmp2208);
// Step 2: within every 128-bit lane, immediate 68 joins the low 64-bit halves
// of both operands and immediate 238 joins their high 64-bit halves.
__m512 tmp2305 = _mm512_shuffle_ps(tmp2293, tmp2295, 68);
__m512 tmp2306 = _mm512_shuffle_ps(tmp2293, tmp2295, 238);
__m512 tmp2307 = _mm512_shuffle_ps(tmp2294, tmp2296, 68);
__m512 tmp2308 = _mm512_shuffle_ps(tmp2294, tmp2296, 238);
__m512 tmp2309 = _mm512_shuffle_ps(tmp2297, tmp2299, 68);
__m512 tmp2310 = _mm512_shuffle_ps(tmp2297, tmp2299, 238);
__m512 tmp2311 = _mm512_shuffle_ps(tmp2298, tmp2300, 68);
__m512 tmp2312 = _mm512_shuffle_ps(tmp2298, tmp2300, 238);
__m512 tmp2313 = _mm512_shuffle_ps(tmp2301, tmp2303, 68);
__m512 tmp2314 = _mm512_shuffle_ps(tmp2301, tmp2303, 238);
__m512 tmp2315 = _mm512_shuffle_ps(tmp2302, tmp2304, 68);
__m512 tmp2316 = _mm512_shuffle_ps(tmp2302, tmp2304, 238);
// Step 3: move whole 128-bit lanes. Immediate 136 picks {a0, a2, b0, b2} and
// immediate 221 picks {a1, a3, b1, b3}.
__m512 tmp2317 = _mm512_shuffle_f32x4(tmp2305, tmp2309, 136);
__m512 tmp2318 = _mm512_shuffle_f32x4(tmp2305, tmp2309, 221);
__m512 tmp2319 = _mm512_shuffle_f32x4(tmp2306, tmp2310, 136);
__m512 tmp2320 = _mm512_shuffle_f32x4(tmp2306, tmp2310, 221);
__m512 tmp2321 = _mm512_shuffle_f32x4(tmp2307, tmp2311, 136);
__m512 tmp2322 = _mm512_shuffle_f32x4(tmp2307, tmp2311, 221);
__m512 tmp2323 = _mm512_shuffle_f32x4(tmp2308, tmp2312, 136);
__m512 tmp2324 = _mm512_shuffle_f32x4(tmp2308, tmp2312, 221);
// tmp2313..tmp2316 are shuffled against themselves, so their lanes are
// duplicated; as a result the top 128-bit lane of every final vector below
// repeats its third lane instead of carrying new data.
__m512 tmp2325 = _mm512_shuffle_f32x4(tmp2313, tmp2313, 136);
__m512 tmp2326 = _mm512_shuffle_f32x4(tmp2313, tmp2313, 221);
__m512 tmp2327 = _mm512_shuffle_f32x4(tmp2314, tmp2314, 136);
__m512 tmp2328 = _mm512_shuffle_f32x4(tmp2314, tmp2314, 221);
__m512 tmp2329 = _mm512_shuffle_f32x4(tmp2315, tmp2315, 136);
__m512 tmp2330 = _mm512_shuffle_f32x4(tmp2315, tmp2315, 221);
__m512 tmp2331 = _mm512_shuffle_f32x4(tmp2316, tmp2316, 136);
__m512 tmp2332 = _mm512_shuffle_f32x4(tmp2316, tmp2316, 221);
// Step 4: final lane selection producing the sixteen output vectors.
tmp2197 = _mm512_shuffle_f32x4(tmp2317, tmp2325, 136);
tmp2205 = _mm512_shuffle_f32x4(tmp2317, tmp2325, 221);
tmp2198 = _mm512_shuffle_f32x4(tmp2319, tmp2327, 136);
tmp2206 = _mm512_shuffle_f32x4(tmp2319, tmp2327, 221);
tmp2199 = _mm512_shuffle_f32x4(tmp2321, tmp2329, 136);
tmp2207 = _mm512_shuffle_f32x4(tmp2321, tmp2329, 221);
tmp2200 = _mm512_shuffle_f32x4(tmp2323, tmp2331, 136);
tmp2208 = _mm512_shuffle_f32x4(tmp2323, tmp2331, 221);
tmp2201 = _mm512_shuffle_f32x4(tmp2318, tmp2326, 136);
__m512 tmp2249 = _mm512_shuffle_f32x4(tmp2318, tmp2326, 221);
tmp2202 = _mm512_shuffle_f32x4(tmp2320, tmp2328, 136);
__m512 tmp2250 = _mm512_shuffle_f32x4(tmp2320, tmp2328, 221);
tmp2203 = _mm512_shuffle_f32x4(tmp2322, tmp2330, 136);
__m512 tmp2251 = _mm512_shuffle_f32x4(tmp2322, tmp2330, 221);
tmp2204 = _mm512_shuffle_f32x4(tmp2324, tmp2332, 136);
__m512 tmp2252 = _mm512_shuffle_f32x4(tmp2324, tmp2332, 221);
// Second arithmetic pass: the same eight-to-six weighted reduction, now
// applied to the rearranged vectors. Two sets are processed interleaved:
//   a..h = tmp2197, tmp2198, tmp2199, tmp2200, tmp2201, tmp2202, tmp2203, tmp2204
//          -> out447..out452
//   a..h = tmp2205, tmp2206, tmp2207, tmp2208, tmp2249, tmp2250, tmp2251, tmp2252
//          -> out453..out458
// Mathematically, for the first set:
//   out447 = a + (b+c) +    (d+e) + 32*(f+g)
//   out448 =     (b-c) +  2*(d-e) + 16*(f-g)
//   out449 =     (b+c) +  4*(d+e) +  8*(f+g)
//   out450 =     (b-c) +  8*(d-e) +  4*(f-g)
//   out451 =     (b+c) + 16*(d+e) +  2*(f+g)
//   out452 =     (b-c) + 32*(d-e) +    (f-g) + h
// Pairwise sums and differences.
__m512 tmp2257 = _mm512_add_ps(tmp2198, tmp2199);
__m512 tmp2277 = _mm512_add_ps(tmp2206, tmp2207);
__m512 tmp2256 = _mm512_add_ps(tmp2200, tmp2201);
__m512 tmp2276 = _mm512_add_ps(tmp2208, tmp2249);
__m512 tmp2262 = _mm512_sub_ps(tmp2200, tmp2201);
__m512 tmp2282 = _mm512_sub_ps(tmp2208, tmp2249);
__m512 tmp2261 = _mm512_sub_ps(tmp2198, tmp2199);
__m512 tmp2281 = _mm512_sub_ps(tmp2206, tmp2207);
__m512 tmp2258 = _mm512_add_ps(tmp2202, tmp2203);
__m512 tmp2278 = _mm512_add_ps(tmp2250, tmp2251);
__m512 tmp2263 = _mm512_sub_ps(tmp2202, tmp2203);
__m512 tmp2283 = _mm512_sub_ps(tmp2250, tmp2251);
// Weighted combinations; _mm512_fmadd_ps(x, w, y) computes x*w + y.
__m512 tmp2260 = _mm512_fmadd_ps(tmp2262, _mm512_set1_ps(2e+00f), tmp2261);
__m512 tmp2280 = _mm512_fmadd_ps(tmp2282, _mm512_set1_ps(2e+00f), tmp2281);
__m512 tmp2267 = _mm512_fmadd_ps(tmp2262, _mm512_set1_ps(8e+00f), tmp2261);
__m512 tmp2287 = _mm512_fmadd_ps(tmp2282, _mm512_set1_ps(8e+00f), tmp2281);
__m512 tmp2255 = _mm512_add_ps(tmp2256, tmp2257);
__m512 tmp2275 = _mm512_add_ps(tmp2276, tmp2277);
__m512 tmp2259 = _mm512_fmadd_ps(tmp2263, _mm512_set1_ps(1.6e+01f), tmp2260);
__m512 tmp2279 = _mm512_fmadd_ps(tmp2283, _mm512_set1_ps(1.6e+01f), tmp2280);
__m512 tmp2266 = _mm512_fmadd_ps(tmp2263, _mm512_set1_ps(4e+00f), tmp2267);
__m512 tmp2286 = _mm512_fmadd_ps(tmp2283, _mm512_set1_ps(4e+00f), tmp2287);
__m512 tmp2272 = _mm512_add_ps(tmp2263, tmp2261);
__m512 tmp2292 = _mm512_add_ps(tmp2283, tmp2281);
__m512 tmp2265 = _mm512_fmadd_ps(tmp2256, _mm512_set1_ps(4e+00f), tmp2257);
__m512 tmp2285 = _mm512_fmadd_ps(tmp2276, _mm512_set1_ps(4e+00f), tmp2277);
__m512 tmp2269 = _mm512_fmadd_ps(tmp2256, _mm512_set1_ps(1.6e+01f), tmp2257);
__m512 tmp2289 = _mm512_fmadd_ps(tmp2276, _mm512_set1_ps(1.6e+01f), tmp2277);
// The first and last inputs of each set enter unweighted.
__m512 tmp2254 = _mm512_add_ps(tmp2255, tmp2197);
__m512 tmp2274 = _mm512_add_ps(tmp2275, tmp2205);
__m512 tmp2271 = _mm512_add_ps(tmp2272, tmp2204);
__m512 tmp2291 = _mm512_add_ps(tmp2292, tmp2252);
__m512 tmp2253 = _mm512_fmadd_ps(tmp2258, _mm512_set1_ps(3.2e+01f), tmp2254);
__m512 tmp2273 = _mm512_fmadd_ps(tmp2278, _mm512_set1_ps(3.2e+01f), tmp2274);
__m512 tmp2264 = _mm512_fmadd_ps(tmp2258, _mm512_set1_ps(8e+00f), tmp2265);
__m512 tmp2284 = _mm512_fmadd_ps(tmp2278, _mm512_set1_ps(8e+00f), tmp2285);
__m512 tmp2270 = _mm512_fmadd_ps(tmp2262, _mm512_set1_ps(3.2e+01f), tmp2271);
__m512 tmp2290 = _mm512_fmadd_ps(tmp2282, _mm512_set1_ps(3.2e+01f), tmp2291);
__m512 tmp2268 = _mm512_fmadd_ps(tmp2258, _mm512_set1_ps(2e+00f), tmp2269);
__m512 tmp2288 = _mm512_fmadd_ps(tmp2278, _mm512_set1_ps(2e+00f), tmp2289);
// Name the twelve results out447..out458 for the store stage.
__m512 out447 = tmp2253;
__m512 out453 = tmp2273;
__m512 out448 = tmp2259;
__m512 out454 = tmp2279;
__m512 out449 = tmp2264;
__m512 out455 = tmp2284;
__m512 out450 = tmp2266;
__m512 out456 = tmp2286;
__m512 out451 = tmp2268;
__m512 out457 = tmp2288;
__m512 out452 = tmp2270;
__m512 out458 = tmp2290;
// Clamp every result at zero by taking the maximum with an all-zero vector
// (a ReLU-style clamp).
out447 = _mm512_max_ps(_mm512_setzero_ps(), out447);
out453 = _mm512_max_ps(_mm512_setzero_ps(), out453);
out448 = _mm512_max_ps(_mm512_setzero_ps(), out448);
out454 = _mm512_max_ps(_mm512_setzero_ps(), out454);
out449 = _mm512_max_ps(_mm512_setzero_ps(), out449);
out455 = _mm512_max_ps(_mm512_setzero_ps(), out455);
out450 = _mm512_max_ps(_mm512_setzero_ps(), out450);
out456 = _mm512_max_ps(_mm512_setzero_ps(), out456);
out451 = _mm512_max_ps(_mm512_setzero_ps(), out451);
out457 = _mm512_max_ps(_mm512_setzero_ps(), out457);
out452 = _mm512_max_ps(_mm512_setzero_ps(), out452);
out458 = _mm512_max_ps(_mm512_setzero_ps(), out458);
// Write the results with masked stores. Mask 4095 (0x0FFF) enables lanes
// 0..11, so each store writes 12 floats and leaves the rest of memory alone.
// Constant offsets advance by 224 from out447 to out452 (12656 ... 13776);
// out453..out458 use the same sequence shifted by 48 (12704 ... 13824).
// NOTE(review): 48 = 12 floats * 4 bytes if datPtr6 is a byte pointer, which
// would make the two stores of a pair adjacent 12-float runs -- confirm.
_mm512_mask_storeu_ps(datPtr6+12656+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out447);
_mm512_mask_storeu_ps(datPtr6+12704+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out453);
_mm512_mask_storeu_ps(datPtr6+12880+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out448);
_mm512_mask_storeu_ps(datPtr6+12928+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out454);
_mm512_mask_storeu_ps(datPtr6+13104+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out449);
_mm512_mask_storeu_ps(datPtr6+13152+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out455);
_mm512_mask_storeu_ps(datPtr6+13328+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out450);
_mm512_mask_storeu_ps(datPtr6+13376+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out456);
_mm512_mask_storeu_ps(datPtr6+13552+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out451);
_mm512_mask_storeu_ps(datPtr6+13600+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out457);
_mm512_mask_storeu_ps(datPtr6+13776+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out452);
_mm512_mask_storeu_ps(datPtr6+13824+806912*i18+224*toH20+4*toW20+50432*k62+25216*l14, 4095, out458);
}
}
// Return from the function once j13 has reached last4; otherwise advance j13
// and set rel10 to 1 before the enclosing block closes.
if (j13 >= last4) return;
++j13;
rel10 = 1;
}
// Coordinates used by the stores in the loop that follows: they enter the
// store addresses as 224*toH21 and 4*toW21.
// NOTE(review): 36 is presumably a fixed starting column within the output
// row -- confirm against the generator.
ptrdiff_t toH21 = base10+0;
ptrdiff_t toW21 = 36;
// k63 starts at 16*w33; the loop below runs until it reaches 16.
ptrdiff_t k63 = 16*w33;
for (; k63 != 16; ++k63) {
ptrdiff_t l15 = 0;
for (; l15 != 2; ++l15) {
// Load sixteen 512-bit vectors from sfPtr5 and regroup their 128-bit lanes.
// Four groups are read, at base offsets 0, 409600, 819200 and 1228800
// (409600 apart); inside a group the vectors sit at +0, +64, +128 and +192.
// _mm512_shuffle_f32x4(a, b, 68) yields lanes {a0, a1, b0, b1} and immediate
// 238 yields {a2, a3, b2, b3}, so each pair of loads taken 128 apart is split
// into a "lower halves" vector and an "upper halves" vector.
// NOTE(review): the offsets look like bytes (64 = one 512-bit vector), which
// assumes sfPtr5 is a byte pointer -- confirm at its declaration.
// Group at base 0 -> in412/in413 and in420/in421.
__m512 sf49 = _mm512_loadu_ps(sfPtr5+0+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf50 = _mm512_loadu_ps(sfPtr5+128+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in412 = _mm512_shuffle_f32x4(sf49, sf50, 68);
__m512 in413 = _mm512_shuffle_f32x4(sf49, sf50, 238);
__m512 sf51 = _mm512_loadu_ps(sfPtr5+64+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf52 = _mm512_loadu_ps(sfPtr5+192+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in420 = _mm512_shuffle_f32x4(sf51, sf52, 68);
__m512 in421 = _mm512_shuffle_f32x4(sf51, sf52, 238);
// Group at base 409600 -> in414/in415 and in422/in423.
__m512 sf53 = _mm512_loadu_ps(sfPtr5+409600+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf54 = _mm512_loadu_ps(sfPtr5+409728+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in414 = _mm512_shuffle_f32x4(sf53, sf54, 68);
__m512 in415 = _mm512_shuffle_f32x4(sf53, sf54, 238);
__m512 sf55 = _mm512_loadu_ps(sfPtr5+409664+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf56 = _mm512_loadu_ps(sfPtr5+409792+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in422 = _mm512_shuffle_f32x4(sf55, sf56, 68);
__m512 in423 = _mm512_shuffle_f32x4(sf55, sf56, 238);
// Group at base 819200 -> in416/in417 and in424/in425.
__m512 sf57 = _mm512_loadu_ps(sfPtr5+819200+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf58 = _mm512_loadu_ps(sfPtr5+819328+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in416 = _mm512_shuffle_f32x4(sf57, sf58, 68);
__m512 in417 = _mm512_shuffle_f32x4(sf57, sf58, 238);
__m512 sf59 = _mm512_loadu_ps(sfPtr5+819264+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf60 = _mm512_loadu_ps(sfPtr5+819392+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in424 = _mm512_shuffle_f32x4(sf59, sf60, 68);
__m512 in425 = _mm512_shuffle_f32x4(sf59, sf60, 238);
// Group at base 1228800 -> in418/in419 and in426/in427.
__m512 sf61 = _mm512_loadu_ps(sfPtr5+1228800+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf62 = _mm512_loadu_ps(sfPtr5+1228928+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in418 = _mm512_shuffle_f32x4(sf61, sf62, 68);
__m512 in419 = _mm512_shuffle_f32x4(sf61, sf62, 238);
__m512 sf63 = _mm512_loadu_ps(sfPtr5+1228864+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf64 = _mm512_loadu_ps(sfPtr5+1228992+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in426 = _mm512_shuffle_f32x4(sf63, sf64, 68);
__m512 in427 = _mm512_shuffle_f32x4(sf63, sf64, 238);
// First arithmetic pass: reduce eight input vectors to six, done twice in
// interleaved fashion (in412..in419 -> tmp2333..tmp2338 and
// in420..in427 -> tmp2339..tmp2344).
// With a..h = in412..in419 the results are, mathematically:
//   tmp2333 = a + (b+c) +    (d+e) + 32*(f+g)
//   tmp2334 =     (b-c) +  2*(d-e) + 16*(f-g)
//   tmp2335 =     (b+c) +  4*(d+e) +  8*(f+g)
//   tmp2336 =     (b-c) +  8*(d-e) +  4*(f-g)
//   tmp2337 =     (b+c) + 16*(d+e) +  2*(f+g)
//   tmp2338 =     (b-c) + 32*(d-e) +    (f-g) + h
// The code builds these from shared pair sums/differences with fused
// multiply-adds, so the floating-point evaluation order is the one below.
// NOTE(review): the weights resemble the output transform of a Winograd
// F(6,3)-style convolution -- confirm against the generator.
// Pairwise sums and differences.
__m512 tmp2349 = _mm512_add_ps(in413, in414);
__m512 tmp2369 = _mm512_add_ps(in421, in422);
__m512 tmp2348 = _mm512_add_ps(in415, in416);
__m512 tmp2368 = _mm512_add_ps(in423, in424);
__m512 tmp2354 = _mm512_sub_ps(in415, in416);
__m512 tmp2374 = _mm512_sub_ps(in423, in424);
__m512 tmp2353 = _mm512_sub_ps(in413, in414);
__m512 tmp2373 = _mm512_sub_ps(in421, in422);
__m512 tmp2350 = _mm512_add_ps(in417, in418);
__m512 tmp2370 = _mm512_add_ps(in425, in426);
__m512 tmp2355 = _mm512_sub_ps(in417, in418);
__m512 tmp2375 = _mm512_sub_ps(in425, in426);
// Weighted combinations; _mm512_fmadd_ps(x, w, y) computes x*w + y.
__m512 tmp2352 = _mm512_fmadd_ps(tmp2354, _mm512_set1_ps(2e+00f), tmp2353);
__m512 tmp2372 = _mm512_fmadd_ps(tmp2374, _mm512_set1_ps(2e+00f), tmp2373);
__m512 tmp2359 = _mm512_fmadd_ps(tmp2354, _mm512_set1_ps(8e+00f), tmp2353);
__m512 tmp2379 = _mm512_fmadd_ps(tmp2374, _mm512_set1_ps(8e+00f), tmp2373);
__m512 tmp2347 = _mm512_add_ps(tmp2348, tmp2349);
__m512 tmp2367 = _mm512_add_ps(tmp2368, tmp2369);
__m512 tmp2351 = _mm512_fmadd_ps(tmp2355, _mm512_set1_ps(1.6e+01f), tmp2352);
__m512 tmp2371 = _mm512_fmadd_ps(tmp2375, _mm512_set1_ps(1.6e+01f), tmp2372);
__m512 tmp2358 = _mm512_fmadd_ps(tmp2355, _mm512_set1_ps(4e+00f), tmp2359);
__m512 tmp2378 = _mm512_fmadd_ps(tmp2375, _mm512_set1_ps(4e+00f), tmp2379);
__m512 tmp2364 = _mm512_add_ps(tmp2355, tmp2353);
__m512 tmp2384 = _mm512_add_ps(tmp2375, tmp2373);
__m512 tmp2357 = _mm512_fmadd_ps(tmp2348, _mm512_set1_ps(4e+00f), tmp2349);
__m512 tmp2377 = _mm512_fmadd_ps(tmp2368, _mm512_set1_ps(4e+00f), tmp2369);
__m512 tmp2361 = _mm512_fmadd_ps(tmp2348, _mm512_set1_ps(1.6e+01f), tmp2349);
__m512 tmp2381 = _mm512_fmadd_ps(tmp2368, _mm512_set1_ps(1.6e+01f), tmp2369);
// The first and last inputs of each set enter unweighted.
__m512 tmp2346 = _mm512_add_ps(tmp2347, in412);
__m512 tmp2366 = _mm512_add_ps(tmp2367, in420);
__m512 tmp2363 = _mm512_add_ps(tmp2364, in419);
__m512 tmp2383 = _mm512_add_ps(tmp2384, in427);
__m512 tmp2345 = _mm512_fmadd_ps(tmp2350, _mm512_set1_ps(3.2e+01f), tmp2346);
__m512 tmp2365 = _mm512_fmadd_ps(tmp2370, _mm512_set1_ps(3.2e+01f), tmp2366);
__m512 tmp2356 = _mm512_fmadd_ps(tmp2350, _mm512_set1_ps(8e+00f), tmp2357);
__m512 tmp2376 = _mm512_fmadd_ps(tmp2370, _mm512_set1_ps(8e+00f), tmp2377);
__m512 tmp2362 = _mm512_fmadd_ps(tmp2354, _mm512_set1_ps(3.2e+01f), tmp2363);
__m512 tmp2382 = _mm512_fmadd_ps(tmp2374, _mm512_set1_ps(3.2e+01f), tmp2383);
__m512 tmp2360 = _mm512_fmadd_ps(tmp2350, _mm512_set1_ps(2e+00f), tmp2361);
__m512 tmp2380 = _mm512_fmadd_ps(tmp2370, _mm512_set1_ps(2e+00f), tmp2381);
// Collect the twelve results under consecutive names for the next stage.
__m512 tmp2333 = tmp2345;
__m512 tmp2339 = tmp2365;
__m512 tmp2334 = tmp2351;
__m512 tmp2340 = tmp2371;
__m512 tmp2335 = tmp2356;
__m512 tmp2341 = tmp2376;
__m512 tmp2336 = tmp2358;
__m512 tmp2342 = tmp2378;
__m512 tmp2337 = tmp2360;
__m512 tmp2343 = tmp2380;
__m512 tmp2338 = tmp2362;
__m512 tmp2344 = tmp2382;
// Rearrange the twelve vectors tmp2333..tmp2344 into sixteen vectors
// (tmp2333..tmp2344 are overwritten; tmp2385..tmp2388 are newly declared).
// NOTE(review): the overall effect looks like a 12x16 -> 16x12 transpose whose
// top four floats per vector are padding -- confirm against the later stores.
// Step 1: interleave 32-bit elements of neighbouring vector pairs.
__m512 tmp2429 = _mm512_unpacklo_ps(tmp2333, tmp2334);
__m512 tmp2430 = _mm512_unpackhi_ps(tmp2333, tmp2334);
__m512 tmp2431 = _mm512_unpacklo_ps(tmp2335, tmp2336);
__m512 tmp2432 = _mm512_unpackhi_ps(tmp2335, tmp2336);
__m512 tmp2433 = _mm512_unpacklo_ps(tmp2337, tmp2338);
__m512 tmp2434 = _mm512_unpackhi_ps(tmp2337, tmp2338);
__m512 tmp2435 = _mm512_unpacklo_ps(tmp2339, tmp2340);
__m512 tmp2436 = _mm512_unpackhi_ps(tmp2339, tmp2340);
__m512 tmp2437 = _mm512_unpacklo_ps(tmp2341, tmp2342);
__m512 tmp2438 = _mm512_unpackhi_ps(tmp2341, tmp2342);
__m512 tmp2439 = _mm512_unpacklo_ps(tmp2343, tmp2344);
__m512 tmp2440 = _mm512_unpackhi_ps(tmp2343, tmp2344);
// Step 2: within every 128-bit lane, immediate 68 joins the low 64-bit halves
// of both operands and immediate 238 joins their high 64-bit halves.
__m512 tmp2441 = _mm512_shuffle_ps(tmp2429, tmp2431, 68);
__m512 tmp2442 = _mm512_shuffle_ps(tmp2429, tmp2431, 238);
__m512 tmp2443 = _mm512_shuffle_ps(tmp2430, tmp2432, 68);
__m512 tmp2444 = _mm512_shuffle_ps(tmp2430, tmp2432, 238);
__m512 tmp2445 = _mm512_shuffle_ps(tmp2433, tmp2435, 68);
__m512 tmp2446 = _mm512_shuffle_ps(tmp2433, tmp2435, 238);
__m512 tmp2447 = _mm512_shuffle_ps(tmp2434, tmp2436, 68);
__m512 tmp2448 = _mm512_shuffle_ps(tmp2434, tmp2436, 238);
__m512 tmp2449 = _mm512_shuffle_ps(tmp2437, tmp2439, 68);
__m512 tmp2450 = _mm512_shuffle_ps(tmp2437, tmp2439, 238);
__m512 tmp2451 = _mm512_shuffle_ps(tmp2438, tmp2440, 68);
__m512 tmp2452 = _mm512_shuffle_ps(tmp2438, tmp2440, 238);
// Step 3: move whole 128-bit lanes. Immediate 136 picks {a0, a2, b0, b2} and
// immediate 221 picks {a1, a3, b1, b3}.
__m512 tmp2453 = _mm512_shuffle_f32x4(tmp2441, tmp2445, 136);
__m512 tmp2454 = _mm512_shuffle_f32x4(tmp2441, tmp2445, 221);
__m512 tmp2455 = _mm512_shuffle_f32x4(tmp2442, tmp2446, 136);
__m512 tmp2456 = _mm512_shuffle_f32x4(tmp2442, tmp2446, 221);
__m512 tmp2457 = _mm512_shuffle_f32x4(tmp2443, tmp2447, 136);
__m512 tmp2458 = _mm512_shuffle_f32x4(tmp2443, tmp2447, 221);
__m512 tmp2459 = _mm512_shuffle_f32x4(tmp2444, tmp2448, 136);
__m512 tmp2460 = _mm512_shuffle_f32x4(tmp2444, tmp2448, 221);
// tmp2449..tmp2452 are shuffled against themselves, so their lanes are
// duplicated; as a result the top 128-bit lane of every final vector below
// repeats its third lane instead of carrying new data.
__m512 tmp2461 = _mm512_shuffle_f32x4(tmp2449, tmp2449, 136);
__m512 tmp2462 = _mm512_shuffle_f32x4(tmp2449, tmp2449, 221);
__m512 tmp2463 = _mm512_shuffle_f32x4(tmp2450, tmp2450, 136);
__m512 tmp2464 = _mm512_shuffle_f32x4(tmp2450, tmp2450, 221);
__m512 tmp2465 = _mm512_shuffle_f32x4(tmp2451, tmp2451, 136);
__m512 tmp2466 = _mm512_shuffle_f32x4(tmp2451, tmp2451, 221);
__m512 tmp2467 = _mm512_shuffle_f32x4(tmp2452, tmp2452, 136);
__m512 tmp2468 = _mm512_shuffle_f32x4(tmp2452, tmp2452, 221);
// Step 4: final lane selection producing the sixteen output vectors.
tmp2333 = _mm512_shuffle_f32x4(tmp2453, tmp2461, 136);
tmp2341 = _mm512_shuffle_f32x4(tmp2453, tmp2461, 221);
tmp2334 = _mm512_shuffle_f32x4(tmp2455, tmp2463, 136);
tmp2342 = _mm512_shuffle_f32x4(tmp2455, tmp2463, 221);
tmp2335 = _mm512_shuffle_f32x4(tmp2457, tmp2465, 136);
tmp2343 = _mm512_shuffle_f32x4(tmp2457, tmp2465, 221);
tmp2336 = _mm512_shuffle_f32x4(tmp2459, tmp2467, 136);
tmp2344 = _mm512_shuffle_f32x4(tmp2459, tmp2467, 221);
tmp2337 = _mm512_shuffle_f32x4(tmp2454, tmp2462, 136);
__m512 tmp2385 = _mm512_shuffle_f32x4(tmp2454, tmp2462, 221);
tmp2338 = _mm512_shuffle_f32x4(tmp2456, tmp2464, 136);
__m512 tmp2386 = _mm512_shuffle_f32x4(tmp2456, tmp2464, 221);
tmp2339 = _mm512_shuffle_f32x4(tmp2458, tmp2466, 136);
__m512 tmp2387 = _mm512_shuffle_f32x4(tmp2458, tmp2466, 221);
tmp2340 = _mm512_shuffle_f32x4(tmp2460, tmp2468, 136);
__m512 tmp2388 = _mm512_shuffle_f32x4(tmp2460, tmp2468, 221);
// Second arithmetic pass: the same eight-to-six weighted reduction, now
// applied to the rearranged vectors. Two sets are processed interleaved:
//   a..h = tmp2333, tmp2334, tmp2335, tmp2336, tmp2337, tmp2338, tmp2339, tmp2340
//          -> out459..out464
//   a..h = tmp2341, tmp2342, tmp2343, tmp2344, tmp2385, tmp2386, tmp2387, tmp2388
//          -> out465..out470
// Mathematically, for the first set:
//   out459 = a + (b+c) +    (d+e) + 32*(f+g)
//   out460 =     (b-c) +  2*(d-e) + 16*(f-g)
//   out461 =     (b+c) +  4*(d+e) +  8*(f+g)
//   out462 =     (b-c) +  8*(d-e) +  4*(f-g)
//   out463 =     (b+c) + 16*(d+e) +  2*(f+g)
//   out464 =     (b-c) + 32*(d-e) +    (f-g) + h
// Pairwise sums and differences.
__m512 tmp2393 = _mm512_add_ps(tmp2334, tmp2335);
__m512 tmp2413 = _mm512_add_ps(tmp2342, tmp2343);
__m512 tmp2392 = _mm512_add_ps(tmp2336, tmp2337);
__m512 tmp2412 = _mm512_add_ps(tmp2344, tmp2385);
__m512 tmp2398 = _mm512_sub_ps(tmp2336, tmp2337);
__m512 tmp2418 = _mm512_sub_ps(tmp2344, tmp2385);
__m512 tmp2397 = _mm512_sub_ps(tmp2334, tmp2335);
__m512 tmp2417 = _mm512_sub_ps(tmp2342, tmp2343);
__m512 tmp2394 = _mm512_add_ps(tmp2338, tmp2339);
__m512 tmp2414 = _mm512_add_ps(tmp2386, tmp2387);
__m512 tmp2399 = _mm512_sub_ps(tmp2338, tmp2339);
__m512 tmp2419 = _mm512_sub_ps(tmp2386, tmp2387);
// Weighted combinations; _mm512_fmadd_ps(x, w, y) computes x*w + y.
__m512 tmp2396 = _mm512_fmadd_ps(tmp2398, _mm512_set1_ps(2e+00f), tmp2397);
__m512 tmp2416 = _mm512_fmadd_ps(tmp2418, _mm512_set1_ps(2e+00f), tmp2417);
__m512 tmp2403 = _mm512_fmadd_ps(tmp2398, _mm512_set1_ps(8e+00f), tmp2397);
__m512 tmp2423 = _mm512_fmadd_ps(tmp2418, _mm512_set1_ps(8e+00f), tmp2417);
__m512 tmp2391 = _mm512_add_ps(tmp2392, tmp2393);
__m512 tmp2411 = _mm512_add_ps(tmp2412, tmp2413);
__m512 tmp2395 = _mm512_fmadd_ps(tmp2399, _mm512_set1_ps(1.6e+01f), tmp2396);
__m512 tmp2415 = _mm512_fmadd_ps(tmp2419, _mm512_set1_ps(1.6e+01f), tmp2416);
__m512 tmp2402 = _mm512_fmadd_ps(tmp2399, _mm512_set1_ps(4e+00f), tmp2403);
__m512 tmp2422 = _mm512_fmadd_ps(tmp2419, _mm512_set1_ps(4e+00f), tmp2423);
__m512 tmp2408 = _mm512_add_ps(tmp2399, tmp2397);
__m512 tmp2428 = _mm512_add_ps(tmp2419, tmp2417);
__m512 tmp2401 = _mm512_fmadd_ps(tmp2392, _mm512_set1_ps(4e+00f), tmp2393);
__m512 tmp2421 = _mm512_fmadd_ps(tmp2412, _mm512_set1_ps(4e+00f), tmp2413);
__m512 tmp2405 = _mm512_fmadd_ps(tmp2392, _mm512_set1_ps(1.6e+01f), tmp2393);
__m512 tmp2425 = _mm512_fmadd_ps(tmp2412, _mm512_set1_ps(1.6e+01f), tmp2413);
// The first and last inputs of each set enter unweighted.
__m512 tmp2390 = _mm512_add_ps(tmp2391, tmp2333);
__m512 tmp2410 = _mm512_add_ps(tmp2411, tmp2341);
__m512 tmp2407 = _mm512_add_ps(tmp2408, tmp2340);
__m512 tmp2427 = _mm512_add_ps(tmp2428, tmp2388);
__m512 tmp2389 = _mm512_fmadd_ps(tmp2394, _mm512_set1_ps(3.2e+01f), tmp2390);
__m512 tmp2409 = _mm512_fmadd_ps(tmp2414, _mm512_set1_ps(3.2e+01f), tmp2410);
__m512 tmp2400 = _mm512_fmadd_ps(tmp2394, _mm512_set1_ps(8e+00f), tmp2401);
__m512 tmp2420 = _mm512_fmadd_ps(tmp2414, _mm512_set1_ps(8e+00f), tmp2421);
__m512 tmp2406 = _mm512_fmadd_ps(tmp2398, _mm512_set1_ps(3.2e+01f), tmp2407);
__m512 tmp2426 = _mm512_fmadd_ps(tmp2418, _mm512_set1_ps(3.2e+01f), tmp2427);
__m512 tmp2404 = _mm512_fmadd_ps(tmp2394, _mm512_set1_ps(2e+00f), tmp2405);
__m512 tmp2424 = _mm512_fmadd_ps(tmp2414, _mm512_set1_ps(2e+00f), tmp2425);
// Name the twelve results out459..out470 for the store stage.
__m512 out459 = tmp2389;
__m512 out465 = tmp2409;
__m512 out460 = tmp2395;
__m512 out466 = tmp2415;
__m512 out461 = tmp2400;
__m512 out467 = tmp2420;
__m512 out462 = tmp2402;
__m512 out468 = tmp2422;
__m512 out463 = tmp2404;
__m512 out469 = tmp2424;
__m512 out464 = tmp2406;
__m512 out470 = tmp2426;
// Clamp every result at zero by taking the maximum with an all-zero vector
// (a ReLU-style clamp).
out459 = _mm512_max_ps(_mm512_setzero_ps(), out459);
out465 = _mm512_max_ps(_mm512_setzero_ps(), out465);
out460 = _mm512_max_ps(_mm512_setzero_ps(), out460);
out466 = _mm512_max_ps(_mm512_setzero_ps(), out466);
out461 = _mm512_max_ps(_mm512_setzero_ps(), out461);
out467 = _mm512_max_ps(_mm512_setzero_ps(), out467);
out462 = _mm512_max_ps(_mm512_setzero_ps(), out462);
out468 = _mm512_max_ps(_mm512_setzero_ps(), out468);
out463 = _mm512_max_ps(_mm512_setzero_ps(), out463);
out469 = _mm512_max_ps(_mm512_setzero_ps(), out469);
out464 = _mm512_max_ps(_mm512_setzero_ps(), out464);
out470 = _mm512_max_ps(_mm512_setzero_ps(), out470);
// Write the results with masked stores. out459..out464 use mask 4095
// (0x0FFF, lanes 0..11 -> 12 floats); out465..out470 use mask 255
// (0xFF, lanes 0..7 -> 8 floats) and sit 48 past their partner.
// Constant offsets advance by 224 from one result to the next within a set.
// NOTE(review): if datPtr6 is a byte pointer, 224 bytes = 56 floats per row,
// and toW21 = 36 plus 12 + 8 stored floats = 56, so this pair of stores would
// finish exactly at the end of a row -- confirm against the tensor layout.
_mm512_mask_storeu_ps(datPtr6+0+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out459);
_mm512_mask_storeu_ps(datPtr6+48+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 255, out465);
_mm512_mask_storeu_ps(datPtr6+224+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out460);
_mm512_mask_storeu_ps(datPtr6+272+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 255, out466);
_mm512_mask_storeu_ps(datPtr6+448+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out461);
_mm512_mask_storeu_ps(datPtr6+496+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 255, out467);
_mm512_mask_storeu_ps(datPtr6+672+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out462);
_mm512_mask_storeu_ps(datPtr6+720+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 255, out468);
_mm512_mask_storeu_ps(datPtr6+896+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out463);
_mm512_mask_storeu_ps(datPtr6+944+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 255, out469);
_mm512_mask_storeu_ps(datPtr6+1120+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out464);
_mm512_mask_storeu_ps(datPtr6+1168+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 255, out470);
__m512 sf65 = _mm512_loadu_ps(sfPtr5+256+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf66 = _mm512_loadu_ps(sfPtr5+384+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in428 = _mm512_shuffle_f32x4(sf65, sf66, 68);
__m512 in429 = _mm512_shuffle_f32x4(sf65, sf66, 238);
__m512 sf67 = _mm512_loadu_ps(sfPtr5+320+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf68 = _mm512_loadu_ps(sfPtr5+448+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in436 = _mm512_shuffle_f32x4(sf67, sf68, 68);
__m512 in437 = _mm512_shuffle_f32x4(sf67, sf68, 238);
__m512 sf69 = _mm512_loadu_ps(sfPtr5+409856+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf70 = _mm512_loadu_ps(sfPtr5+409984+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in430 = _mm512_shuffle_f32x4(sf69, sf70, 68);
__m512 in431 = _mm512_shuffle_f32x4(sf69, sf70, 238);
__m512 sf71 = _mm512_loadu_ps(sfPtr5+409920+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf72 = _mm512_loadu_ps(sfPtr5+410048+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in438 = _mm512_shuffle_f32x4(sf71, sf72, 68);
__m512 in439 = _mm512_shuffle_f32x4(sf71, sf72, 238);
__m512 sf73 = _mm512_loadu_ps(sfPtr5+819456+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf74 = _mm512_loadu_ps(sfPtr5+819584+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in432 = _mm512_shuffle_f32x4(sf73, sf74, 68);
__m512 in433 = _mm512_shuffle_f32x4(sf73, sf74, 238);
__m512 sf75 = _mm512_loadu_ps(sfPtr5+819520+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf76 = _mm512_loadu_ps(sfPtr5+819648+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in440 = _mm512_shuffle_f32x4(sf75, sf76, 68);
__m512 in441 = _mm512_shuffle_f32x4(sf75, sf76, 238);
__m512 sf77 = _mm512_loadu_ps(sfPtr5+1229056+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf78 = _mm512_loadu_ps(sfPtr5+1229184+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in434 = _mm512_shuffle_f32x4(sf77, sf78, 68);
__m512 in435 = _mm512_shuffle_f32x4(sf77, sf78, 238);
__m512 sf79 = _mm512_loadu_ps(sfPtr5+1229120+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf80 = _mm512_loadu_ps(sfPtr5+1229248+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in442 = _mm512_shuffle_f32x4(sf79, sf80, 68);
__m512 in443 = _mm512_shuffle_f32x4(sf79, sf80, 238);
__m512 tmp2485 = _mm512_add_ps(in429, in430);
__m512 tmp2505 = _mm512_add_ps(in437, in438);
__m512 tmp2484 = _mm512_add_ps(in431, in432);
__m512 tmp2504 = _mm512_add_ps(in439, in440);
__m512 tmp2490 = _mm512_sub_ps(in431, in432);
__m512 tmp2510 = _mm512_sub_ps(in439, in440);
__m512 tmp2489 = _mm512_sub_ps(in429, in430);
__m512 tmp2509 = _mm512_sub_ps(in437, in438);
__m512 tmp2486 = _mm512_add_ps(in433, in434);
__m512 tmp2506 = _mm512_add_ps(in441, in442);
__m512 tmp2491 = _mm512_sub_ps(in433, in434);
__m512 tmp2511 = _mm512_sub_ps(in441, in442);
__m512 tmp2488 = _mm512_fmadd_ps(tmp2490, _mm512_set1_ps(2e+00f), tmp2489);
__m512 tmp2508 = _mm512_fmadd_ps(tmp2510, _mm512_set1_ps(2e+00f), tmp2509);
__m512 tmp2495 = _mm512_fmadd_ps(tmp2490, _mm512_set1_ps(8e+00f), tmp2489);
__m512 tmp2515 = _mm512_fmadd_ps(tmp2510, _mm512_set1_ps(8e+00f), tmp2509);
__m512 tmp2483 = _mm512_add_ps(tmp2484, tmp2485);
__m512 tmp2503 = _mm512_add_ps(tmp2504, tmp2505);
__m512 tmp2487 = _mm512_fmadd_ps(tmp2491, _mm512_set1_ps(1.6e+01f), tmp2488);
__m512 tmp2507 = _mm512_fmadd_ps(tmp2511, _mm512_set1_ps(1.6e+01f), tmp2508);
__m512 tmp2494 = _mm512_fmadd_ps(tmp2491, _mm512_set1_ps(4e+00f), tmp2495);
__m512 tmp2514 = _mm512_fmadd_ps(tmp2511, _mm512_set1_ps(4e+00f), tmp2515);
__m512 tmp2500 = _mm512_add_ps(tmp2491, tmp2489);
__m512 tmp2520 = _mm512_add_ps(tmp2511, tmp2509);
__m512 tmp2493 = _mm512_fmadd_ps(tmp2484, _mm512_set1_ps(4e+00f), tmp2485);
__m512 tmp2513 = _mm512_fmadd_ps(tmp2504, _mm512_set1_ps(4e+00f), tmp2505);
__m512 tmp2497 = _mm512_fmadd_ps(tmp2484, _mm512_set1_ps(1.6e+01f), tmp2485);
__m512 tmp2517 = _mm512_fmadd_ps(tmp2504, _mm512_set1_ps(1.6e+01f), tmp2505);
__m512 tmp2482 = _mm512_add_ps(tmp2483, in428);
__m512 tmp2502 = _mm512_add_ps(tmp2503, in436);
__m512 tmp2499 = _mm512_add_ps(tmp2500, in435);
__m512 tmp2519 = _mm512_add_ps(tmp2520, in443);
__m512 tmp2481 = _mm512_fmadd_ps(tmp2486, _mm512_set1_ps(3.2e+01f), tmp2482);
__m512 tmp2501 = _mm512_fmadd_ps(tmp2506, _mm512_set1_ps(3.2e+01f), tmp2502);
__m512 tmp2492 = _mm512_fmadd_ps(tmp2486, _mm512_set1_ps(8e+00f), tmp2493);
__m512 tmp2512 = _mm512_fmadd_ps(tmp2506, _mm512_set1_ps(8e+00f), tmp2513);
__m512 tmp2498 = _mm512_fmadd_ps(tmp2490, _mm512_set1_ps(3.2e+01f), tmp2499);
__m512 tmp2518 = _mm512_fmadd_ps(tmp2510, _mm512_set1_ps(3.2e+01f), tmp2519);
__m512 tmp2496 = _mm512_fmadd_ps(tmp2486, _mm512_set1_ps(2e+00f), tmp2497);
__m512 tmp2516 = _mm512_fmadd_ps(tmp2506, _mm512_set1_ps(2e+00f), tmp2517);
__m512 tmp2469 = tmp2481;
__m512 tmp2475 = tmp2501;
__m512 tmp2470 = tmp2487;
__m512 tmp2476 = tmp2507;
__m512 tmp2471 = tmp2492;
__m512 tmp2477 = tmp2512;
__m512 tmp2472 = tmp2494;
__m512 tmp2478 = tmp2514;
__m512 tmp2473 = tmp2496;
__m512 tmp2479 = tmp2516;
__m512 tmp2474 = tmp2498;
__m512 tmp2480 = tmp2518;
__m512 tmp2565 = _mm512_unpacklo_ps(tmp2469, tmp2470);
__m512 tmp2566 = _mm512_unpackhi_ps(tmp2469, tmp2470);
__m512 tmp2567 = _mm512_unpacklo_ps(tmp2471, tmp2472);
__m512 tmp2568 = _mm512_unpackhi_ps(tmp2471, tmp2472);
__m512 tmp2569 = _mm512_unpacklo_ps(tmp2473, tmp2474);
__m512 tmp2570 = _mm512_unpackhi_ps(tmp2473, tmp2474);
__m512 tmp2571 = _mm512_unpacklo_ps(tmp2475, tmp2476);
__m512 tmp2572 = _mm512_unpackhi_ps(tmp2475, tmp2476);
__m512 tmp2573 = _mm512_unpacklo_ps(tmp2477, tmp2478);
__m512 tmp2574 = _mm512_unpackhi_ps(tmp2477, tmp2478);
__m512 tmp2575 = _mm512_unpacklo_ps(tmp2479, tmp2480);
__m512 tmp2576 = _mm512_unpackhi_ps(tmp2479, tmp2480);
__m512 tmp2577 = _mm512_shuffle_ps(tmp2565, tmp2567, 68);
__m512 tmp2578 = _mm512_shuffle_ps(tmp2565, tmp2567, 238);
__m512 tmp2579 = _mm512_shuffle_ps(tmp2566, tmp2568, 68);
__m512 tmp2580 = _mm512_shuffle_ps(tmp2566, tmp2568, 238);
__m512 tmp2581 = _mm512_shuffle_ps(tmp2569, tmp2571, 68);
__m512 tmp2582 = _mm512_shuffle_ps(tmp2569, tmp2571, 238);
__m512 tmp2583 = _mm512_shuffle_ps(tmp2570, tmp2572, 68);
__m512 tmp2584 = _mm512_shuffle_ps(tmp2570, tmp2572, 238);
__m512 tmp2585 = _mm512_shuffle_ps(tmp2573, tmp2575, 68);
__m512 tmp2586 = _mm512_shuffle_ps(tmp2573, tmp2575, 238);
__m512 tmp2587 = _mm512_shuffle_ps(tmp2574, tmp2576, 68);
__m512 tmp2588 = _mm512_shuffle_ps(tmp2574, tmp2576, 238);
__m512 tmp2589 = _mm512_shuffle_f32x4(tmp2577, tmp2581, 136);
__m512 tmp2590 = _mm512_shuffle_f32x4(tmp2577, tmp2581, 221);
__m512 tmp2591 = _mm512_shuffle_f32x4(tmp2578, tmp2582, 136);
__m512 tmp2592 = _mm512_shuffle_f32x4(tmp2578, tmp2582, 221);
__m512 tmp2593 = _mm512_shuffle_f32x4(tmp2579, tmp2583, 136);
__m512 tmp2594 = _mm512_shuffle_f32x4(tmp2579, tmp2583, 221);
__m512 tmp2595 = _mm512_shuffle_f32x4(tmp2580, tmp2584, 136);
__m512 tmp2596 = _mm512_shuffle_f32x4(tmp2580, tmp2584, 221);
__m512 tmp2597 = _mm512_shuffle_f32x4(tmp2585, tmp2585, 136);
__m512 tmp2598 = _mm512_shuffle_f32x4(tmp2585, tmp2585, 221);
__m512 tmp2599 = _mm512_shuffle_f32x4(tmp2586, tmp2586, 136);
__m512 tmp2600 = _mm512_shuffle_f32x4(tmp2586, tmp2586, 221);
__m512 tmp2601 = _mm512_shuffle_f32x4(tmp2587, tmp2587, 136);
__m512 tmp2602 = _mm512_shuffle_f32x4(tmp2587, tmp2587, 221);
__m512 tmp2603 = _mm512_shuffle_f32x4(tmp2588, tmp2588, 136);
__m512 tmp2604 = _mm512_shuffle_f32x4(tmp2588, tmp2588, 221);
tmp2469 = _mm512_shuffle_f32x4(tmp2589, tmp2597, 136);
tmp2477 = _mm512_shuffle_f32x4(tmp2589, tmp2597, 221);
tmp2470 = _mm512_shuffle_f32x4(tmp2591, tmp2599, 136);
tmp2478 = _mm512_shuffle_f32x4(tmp2591, tmp2599, 221);
tmp2471 = _mm512_shuffle_f32x4(tmp2593, tmp2601, 136);
tmp2479 = _mm512_shuffle_f32x4(tmp2593, tmp2601, 221);
tmp2472 = _mm512_shuffle_f32x4(tmp2595, tmp2603, 136);
tmp2480 = _mm512_shuffle_f32x4(tmp2595, tmp2603, 221);
tmp2473 = _mm512_shuffle_f32x4(tmp2590, tmp2598, 136);
__m512 tmp2521 = _mm512_shuffle_f32x4(tmp2590, tmp2598, 221);
tmp2474 = _mm512_shuffle_f32x4(tmp2592, tmp2600, 136);
__m512 tmp2522 = _mm512_shuffle_f32x4(tmp2592, tmp2600, 221);
tmp2475 = _mm512_shuffle_f32x4(tmp2594, tmp2602, 136);
__m512 tmp2523 = _mm512_shuffle_f32x4(tmp2594, tmp2602, 221);
tmp2476 = _mm512_shuffle_f32x4(tmp2596, tmp2604, 136);
__m512 tmp2524 = _mm512_shuffle_f32x4(tmp2596, tmp2604, 221);
__m512 tmp2529 = _mm512_add_ps(tmp2470, tmp2471);
__m512 tmp2549 = _mm512_add_ps(tmp2478, tmp2479);
__m512 tmp2528 = _mm512_add_ps(tmp2472, tmp2473);
__m512 tmp2548 = _mm512_add_ps(tmp2480, tmp2521);
__m512 tmp2534 = _mm512_sub_ps(tmp2472, tmp2473);
__m512 tmp2554 = _mm512_sub_ps(tmp2480, tmp2521);
__m512 tmp2533 = _mm512_sub_ps(tmp2470, tmp2471);
__m512 tmp2553 = _mm512_sub_ps(tmp2478, tmp2479);
__m512 tmp2530 = _mm512_add_ps(tmp2474, tmp2475);
__m512 tmp2550 = _mm512_add_ps(tmp2522, tmp2523);
__m512 tmp2535 = _mm512_sub_ps(tmp2474, tmp2475);
__m512 tmp2555 = _mm512_sub_ps(tmp2522, tmp2523);
__m512 tmp2532 = _mm512_fmadd_ps(tmp2534, _mm512_set1_ps(2e+00f), tmp2533);
__m512 tmp2552 = _mm512_fmadd_ps(tmp2554, _mm512_set1_ps(2e+00f), tmp2553);
__m512 tmp2539 = _mm512_fmadd_ps(tmp2534, _mm512_set1_ps(8e+00f), tmp2533);
__m512 tmp2559 = _mm512_fmadd_ps(tmp2554, _mm512_set1_ps(8e+00f), tmp2553);
__m512 tmp2527 = _mm512_add_ps(tmp2528, tmp2529);
__m512 tmp2547 = _mm512_add_ps(tmp2548, tmp2549);
__m512 tmp2531 = _mm512_fmadd_ps(tmp2535, _mm512_set1_ps(1.6e+01f), tmp2532);
__m512 tmp2551 = _mm512_fmadd_ps(tmp2555, _mm512_set1_ps(1.6e+01f), tmp2552);
__m512 tmp2538 = _mm512_fmadd_ps(tmp2535, _mm512_set1_ps(4e+00f), tmp2539);
__m512 tmp2558 = _mm512_fmadd_ps(tmp2555, _mm512_set1_ps(4e+00f), tmp2559);
__m512 tmp2544 = _mm512_add_ps(tmp2535, tmp2533);
__m512 tmp2564 = _mm512_add_ps(tmp2555, tmp2553);
__m512 tmp2537 = _mm512_fmadd_ps(tmp2528, _mm512_set1_ps(4e+00f), tmp2529);
__m512 tmp2557 = _mm512_fmadd_ps(tmp2548, _mm512_set1_ps(4e+00f), tmp2549);
__m512 tmp2541 = _mm512_fmadd_ps(tmp2528, _mm512_set1_ps(1.6e+01f), tmp2529);
__m512 tmp2561 = _mm512_fmadd_ps(tmp2548, _mm512_set1_ps(1.6e+01f), tmp2549);
__m512 tmp2526 = _mm512_add_ps(tmp2527, tmp2469);
__m512 tmp2546 = _mm512_add_ps(tmp2547, tmp2477);
__m512 tmp2543 = _mm512_add_ps(tmp2544, tmp2476);
__m512 tmp2563 = _mm512_add_ps(tmp2564, tmp2524);
__m512 tmp2525 = _mm512_fmadd_ps(tmp2530, _mm512_set1_ps(3.2e+01f), tmp2526);
__m512 tmp2545 = _mm512_fmadd_ps(tmp2550, _mm512_set1_ps(3.2e+01f), tmp2546);
__m512 tmp2536 = _mm512_fmadd_ps(tmp2530, _mm512_set1_ps(8e+00f), tmp2537);
__m512 tmp2556 = _mm512_fmadd_ps(tmp2550, _mm512_set1_ps(8e+00f), tmp2557);
__m512 tmp2542 = _mm512_fmadd_ps(tmp2534, _mm512_set1_ps(3.2e+01f), tmp2543);
__m512 tmp2562 = _mm512_fmadd_ps(tmp2554, _mm512_set1_ps(3.2e+01f), tmp2563);
__m512 tmp2540 = _mm512_fmadd_ps(tmp2530, _mm512_set1_ps(2e+00f), tmp2541);
__m512 tmp2560 = _mm512_fmadd_ps(tmp2550, _mm512_set1_ps(2e+00f), tmp2561);
__m512 out471 = tmp2525;
__m512 out477 = tmp2545;
__m512 out472 = tmp2531;
__m512 out478 = tmp2551;
__m512 out473 = tmp2536;
__m512 out479 = tmp2556;
__m512 out474 = tmp2538;
__m512 out480 = tmp2558;
__m512 out475 = tmp2540;
__m512 out481 = tmp2560;
__m512 out476 = tmp2542;
__m512 out482 = tmp2562;
out471 = _mm512_max_ps(_mm512_setzero_ps(), out471);
out477 = _mm512_max_ps(_mm512_setzero_ps(), out477);
out472 = _mm512_max_ps(_mm512_setzero_ps(), out472);
out478 = _mm512_max_ps(_mm512_setzero_ps(), out478);
out473 = _mm512_max_ps(_mm512_setzero_ps(), out473);
out479 = _mm512_max_ps(_mm512_setzero_ps(), out479);
out474 = _mm512_max_ps(_mm512_setzero_ps(), out474);
out480 = _mm512_max_ps(_mm512_setzero_ps(), out480);
out475 = _mm512_max_ps(_mm512_setzero_ps(), out475);
out481 = _mm512_max_ps(_mm512_setzero_ps(), out481);
out476 = _mm512_max_ps(_mm512_setzero_ps(), out476);
out482 = _mm512_max_ps(_mm512_setzero_ps(), out482);
_mm512_mask_storeu_ps(datPtr6+1200+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out471);
_mm512_mask_storeu_ps(datPtr6+12608+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out477);
_mm512_mask_storeu_ps(datPtr6+1424+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out472);
_mm512_mask_storeu_ps(datPtr6+12832+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out478);
_mm512_mask_storeu_ps(datPtr6+1648+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out473);
_mm512_mask_storeu_ps(datPtr6+13056+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out479);
_mm512_mask_storeu_ps(datPtr6+1872+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out474);
_mm512_mask_storeu_ps(datPtr6+13280+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out480);
_mm512_mask_storeu_ps(datPtr6+2096+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out475);
_mm512_mask_storeu_ps(datPtr6+13504+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out481);
_mm512_mask_storeu_ps(datPtr6+2320+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out476);
_mm512_mask_storeu_ps(datPtr6+13728+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out482);
__m512 sf81 = _mm512_loadu_ps(sfPtr5+512+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf82 = _mm512_loadu_ps(sfPtr5+640+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in444 = _mm512_shuffle_f32x4(sf81, sf82, 68);
__m512 in445 = _mm512_shuffle_f32x4(sf81, sf82, 238);
__m512 sf83 = _mm512_loadu_ps(sfPtr5+576+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf84 = _mm512_loadu_ps(sfPtr5+704+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in452 = _mm512_shuffle_f32x4(sf83, sf84, 68);
__m512 in453 = _mm512_shuffle_f32x4(sf83, sf84, 238);
__m512 sf85 = _mm512_loadu_ps(sfPtr5+410112+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf86 = _mm512_loadu_ps(sfPtr5+410240+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in446 = _mm512_shuffle_f32x4(sf85, sf86, 68);
__m512 in447 = _mm512_shuffle_f32x4(sf85, sf86, 238);
__m512 sf87 = _mm512_loadu_ps(sfPtr5+410176+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf88 = _mm512_loadu_ps(sfPtr5+410304+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in454 = _mm512_shuffle_f32x4(sf87, sf88, 68);
__m512 in455 = _mm512_shuffle_f32x4(sf87, sf88, 238);
__m512 sf89 = _mm512_loadu_ps(sfPtr5+819712+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf90 = _mm512_loadu_ps(sfPtr5+819840+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in448 = _mm512_shuffle_f32x4(sf89, sf90, 68);
__m512 in449 = _mm512_shuffle_f32x4(sf89, sf90, 238);
__m512 sf91 = _mm512_loadu_ps(sfPtr5+819776+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf92 = _mm512_loadu_ps(sfPtr5+819904+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in456 = _mm512_shuffle_f32x4(sf91, sf92, 68);
__m512 in457 = _mm512_shuffle_f32x4(sf91, sf92, 238);
__m512 sf93 = _mm512_loadu_ps(sfPtr5+1229312+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf94 = _mm512_loadu_ps(sfPtr5+1229440+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in450 = _mm512_shuffle_f32x4(sf93, sf94, 68);
__m512 in451 = _mm512_shuffle_f32x4(sf93, sf94, 238);
__m512 sf95 = _mm512_loadu_ps(sfPtr5+1229376+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 sf96 = _mm512_loadu_ps(sfPtr5+1229504+1638400*i18+24576*j13+1536*k63+768*l15);
__m512 in458 = _mm512_shuffle_f32x4(sf95, sf96, 68);
__m512 in459 = _mm512_shuffle_f32x4(sf95, sf96, 238);
__m512 tmp2621 = _mm512_add_ps(in445, in446);
__m512 tmp2641 = _mm512_add_ps(in453, in454);
__m512 tmp2620 = _mm512_add_ps(in447, in448);
__m512 tmp2640 = _mm512_add_ps(in455, in456);
__m512 tmp2626 = _mm512_sub_ps(in447, in448);
__m512 tmp2646 = _mm512_sub_ps(in455, in456);
__m512 tmp2625 = _mm512_sub_ps(in445, in446);
__m512 tmp2645 = _mm512_sub_ps(in453, in454);
__m512 tmp2622 = _mm512_add_ps(in449, in450);
__m512 tmp2642 = _mm512_add_ps(in457, in458);
__m512 tmp2627 = _mm512_sub_ps(in449, in450);
__m512 tmp2647 = _mm512_sub_ps(in457, in458);
__m512 tmp2624 = _mm512_fmadd_ps(tmp2626, _mm512_set1_ps(2e+00f), tmp2625);
__m512 tmp2644 = _mm512_fmadd_ps(tmp2646, _mm512_set1_ps(2e+00f), tmp2645);
__m512 tmp2631 = _mm512_fmadd_ps(tmp2626, _mm512_set1_ps(8e+00f), tmp2625);
__m512 tmp2651 = _mm512_fmadd_ps(tmp2646, _mm512_set1_ps(8e+00f), tmp2645);
__m512 tmp2619 = _mm512_add_ps(tmp2620, tmp2621);
__m512 tmp2639 = _mm512_add_ps(tmp2640, tmp2641);
__m512 tmp2623 = _mm512_fmadd_ps(tmp2627, _mm512_set1_ps(1.6e+01f), tmp2624);
__m512 tmp2643 = _mm512_fmadd_ps(tmp2647, _mm512_set1_ps(1.6e+01f), tmp2644);
__m512 tmp2630 = _mm512_fmadd_ps(tmp2627, _mm512_set1_ps(4e+00f), tmp2631);
__m512 tmp2650 = _mm512_fmadd_ps(tmp2647, _mm512_set1_ps(4e+00f), tmp2651);
__m512 tmp2636 = _mm512_add_ps(tmp2627, tmp2625);
__m512 tmp2656 = _mm512_add_ps(tmp2647, tmp2645);
__m512 tmp2629 = _mm512_fmadd_ps(tmp2620, _mm512_set1_ps(4e+00f), tmp2621);
__m512 tmp2649 = _mm512_fmadd_ps(tmp2640, _mm512_set1_ps(4e+00f), tmp2641);
__m512 tmp2633 = _mm512_fmadd_ps(tmp2620, _mm512_set1_ps(1.6e+01f), tmp2621);
__m512 tmp2653 = _mm512_fmadd_ps(tmp2640, _mm512_set1_ps(1.6e+01f), tmp2641);
__m512 tmp2618 = _mm512_add_ps(tmp2619, in444);
__m512 tmp2638 = _mm512_add_ps(tmp2639, in452);
__m512 tmp2635 = _mm512_add_ps(tmp2636, in451);
__m512 tmp2655 = _mm512_add_ps(tmp2656, in459);
__m512 tmp2617 = _mm512_fmadd_ps(tmp2622, _mm512_set1_ps(3.2e+01f), tmp2618);
__m512 tmp2637 = _mm512_fmadd_ps(tmp2642, _mm512_set1_ps(3.2e+01f), tmp2638);
__m512 tmp2628 = _mm512_fmadd_ps(tmp2622, _mm512_set1_ps(8e+00f), tmp2629);
__m512 tmp2648 = _mm512_fmadd_ps(tmp2642, _mm512_set1_ps(8e+00f), tmp2649);
__m512 tmp2634 = _mm512_fmadd_ps(tmp2626, _mm512_set1_ps(3.2e+01f), tmp2635);
__m512 tmp2654 = _mm512_fmadd_ps(tmp2646, _mm512_set1_ps(3.2e+01f), tmp2655);
__m512 tmp2632 = _mm512_fmadd_ps(tmp2622, _mm512_set1_ps(2e+00f), tmp2633);
__m512 tmp2652 = _mm512_fmadd_ps(tmp2642, _mm512_set1_ps(2e+00f), tmp2653);
__m512 tmp2605 = tmp2617;
__m512 tmp2611 = tmp2637;
__m512 tmp2606 = tmp2623;
__m512 tmp2612 = tmp2643;
__m512 tmp2607 = tmp2628;
__m512 tmp2613 = tmp2648;
__m512 tmp2608 = tmp2630;
__m512 tmp2614 = tmp2650;
__m512 tmp2609 = tmp2632;
__m512 tmp2615 = tmp2652;
__m512 tmp2610 = tmp2634;
__m512 tmp2616 = tmp2654;
__m512 tmp2701 = _mm512_unpacklo_ps(tmp2605, tmp2606);
__m512 tmp2702 = _mm512_unpackhi_ps(tmp2605, tmp2606);
__m512 tmp2703 = _mm512_unpacklo_ps(tmp2607, tmp2608);
__m512 tmp2704 = _mm512_unpackhi_ps(tmp2607, tmp2608);
__m512 tmp2705 = _mm512_unpacklo_ps(tmp2609, tmp2610);
__m512 tmp2706 = _mm512_unpackhi_ps(tmp2609, tmp2610);
__m512 tmp2707 = _mm512_unpacklo_ps(tmp2611, tmp2612);
__m512 tmp2708 = _mm512_unpackhi_ps(tmp2611, tmp2612);
__m512 tmp2709 = _mm512_unpacklo_ps(tmp2613, tmp2614);
__m512 tmp2710 = _mm512_unpackhi_ps(tmp2613, tmp2614);
__m512 tmp2711 = _mm512_unpacklo_ps(tmp2615, tmp2616);
__m512 tmp2712 = _mm512_unpackhi_ps(tmp2615, tmp2616);
__m512 tmp2713 = _mm512_shuffle_ps(tmp2701, tmp2703, 68);
__m512 tmp2714 = _mm512_shuffle_ps(tmp2701, tmp2703, 238);
__m512 tmp2715 = _mm512_shuffle_ps(tmp2702, tmp2704, 68);
__m512 tmp2716 = _mm512_shuffle_ps(tmp2702, tmp2704, 238);
__m512 tmp2717 = _mm512_shuffle_ps(tmp2705, tmp2707, 68);
__m512 tmp2718 = _mm512_shuffle_ps(tmp2705, tmp2707, 238);
__m512 tmp2719 = _mm512_shuffle_ps(tmp2706, tmp2708, 68);
__m512 tmp2720 = _mm512_shuffle_ps(tmp2706, tmp2708, 238);
__m512 tmp2721 = _mm512_shuffle_ps(tmp2709, tmp2711, 68);
__m512 tmp2722 = _mm512_shuffle_ps(tmp2709, tmp2711, 238);
__m512 tmp2723 = _mm512_shuffle_ps(tmp2710, tmp2712, 68);
__m512 tmp2724 = _mm512_shuffle_ps(tmp2710, tmp2712, 238);
__m512 tmp2725 = _mm512_shuffle_f32x4(tmp2713, tmp2717, 136);
__m512 tmp2726 = _mm512_shuffle_f32x4(tmp2713, tmp2717, 221);
__m512 tmp2727 = _mm512_shuffle_f32x4(tmp2714, tmp2718, 136);
__m512 tmp2728 = _mm512_shuffle_f32x4(tmp2714, tmp2718, 221);
__m512 tmp2729 = _mm512_shuffle_f32x4(tmp2715, tmp2719, 136);
__m512 tmp2730 = _mm512_shuffle_f32x4(tmp2715, tmp2719, 221);
__m512 tmp2731 = _mm512_shuffle_f32x4(tmp2716, tmp2720, 136);
__m512 tmp2732 = _mm512_shuffle_f32x4(tmp2716, tmp2720, 221);
__m512 tmp2733 = _mm512_shuffle_f32x4(tmp2721, tmp2721, 136);
__m512 tmp2734 = _mm512_shuffle_f32x4(tmp2721, tmp2721, 221);
__m512 tmp2735 = _mm512_shuffle_f32x4(tmp2722, tmp2722, 136);
__m512 tmp2736 = _mm512_shuffle_f32x4(tmp2722, tmp2722, 221);
__m512 tmp2737 = _mm512_shuffle_f32x4(tmp2723, tmp2723, 136);
__m512 tmp2738 = _mm512_shuffle_f32x4(tmp2723, tmp2723, 221);
__m512 tmp2739 = _mm512_shuffle_f32x4(tmp2724, tmp2724, 136);
__m512 tmp2740 = _mm512_shuffle_f32x4(tmp2724, tmp2724, 221);
tmp2605 = _mm512_shuffle_f32x4(tmp2725, tmp2733, 136);
tmp2613 = _mm512_shuffle_f32x4(tmp2725, tmp2733, 221);
tmp2606 = _mm512_shuffle_f32x4(tmp2727, tmp2735, 136);
tmp2614 = _mm512_shuffle_f32x4(tmp2727, tmp2735, 221);
tmp2607 = _mm512_shuffle_f32x4(tmp2729, tmp2737, 136);
tmp2615 = _mm512_shuffle_f32x4(tmp2729, tmp2737, 221);
tmp2608 = _mm512_shuffle_f32x4(tmp2731, tmp2739, 136);
tmp2616 = _mm512_shuffle_f32x4(tmp2731, tmp2739, 221);
tmp2609 = _mm512_shuffle_f32x4(tmp2726, tmp2734, 136);
__m512 tmp2657 = _mm512_shuffle_f32x4(tmp2726, tmp2734, 221);
tmp2610 = _mm512_shuffle_f32x4(tmp2728, tmp2736, 136);
__m512 tmp2658 = _mm512_shuffle_f32x4(tmp2728, tmp2736, 221);
tmp2611 = _mm512_shuffle_f32x4(tmp2730, tmp2738, 136);
__m512 tmp2659 = _mm512_shuffle_f32x4(tmp2730, tmp2738, 221);
tmp2612 = _mm512_shuffle_f32x4(tmp2732, tmp2740, 136);
__m512 tmp2660 = _mm512_shuffle_f32x4(tmp2732, tmp2740, 221);
__m512 tmp2665 = _mm512_add_ps(tmp2606, tmp2607);
__m512 tmp2685 = _mm512_add_ps(tmp2614, tmp2615);
__m512 tmp2664 = _mm512_add_ps(tmp2608, tmp2609);
__m512 tmp2684 = _mm512_add_ps(tmp2616, tmp2657);
__m512 tmp2670 = _mm512_sub_ps(tmp2608, tmp2609);
__m512 tmp2690 = _mm512_sub_ps(tmp2616, tmp2657);
__m512 tmp2669 = _mm512_sub_ps(tmp2606, tmp2607);
__m512 tmp2689 = _mm512_sub_ps(tmp2614, tmp2615);
__m512 tmp2666 = _mm512_add_ps(tmp2610, tmp2611);
__m512 tmp2686 = _mm512_add_ps(tmp2658, tmp2659);
__m512 tmp2671 = _mm512_sub_ps(tmp2610, tmp2611);
__m512 tmp2691 = _mm512_sub_ps(tmp2658, tmp2659);
__m512 tmp2668 = _mm512_fmadd_ps(tmp2670, _mm512_set1_ps(2e+00f), tmp2669);
__m512 tmp2688 = _mm512_fmadd_ps(tmp2690, _mm512_set1_ps(2e+00f), tmp2689);
__m512 tmp2675 = _mm512_fmadd_ps(tmp2670, _mm512_set1_ps(8e+00f), tmp2669);
__m512 tmp2695 = _mm512_fmadd_ps(tmp2690, _mm512_set1_ps(8e+00f), tmp2689);
__m512 tmp2663 = _mm512_add_ps(tmp2664, tmp2665);
__m512 tmp2683 = _mm512_add_ps(tmp2684, tmp2685);
__m512 tmp2667 = _mm512_fmadd_ps(tmp2671, _mm512_set1_ps(1.6e+01f), tmp2668);
__m512 tmp2687 = _mm512_fmadd_ps(tmp2691, _mm512_set1_ps(1.6e+01f), tmp2688);
__m512 tmp2674 = _mm512_fmadd_ps(tmp2671, _mm512_set1_ps(4e+00f), tmp2675);
__m512 tmp2694 = _mm512_fmadd_ps(tmp2691, _mm512_set1_ps(4e+00f), tmp2695);
__m512 tmp2680 = _mm512_add_ps(tmp2671, tmp2669);
__m512 tmp2700 = _mm512_add_ps(tmp2691, tmp2689);
__m512 tmp2673 = _mm512_fmadd_ps(tmp2664, _mm512_set1_ps(4e+00f), tmp2665);
__m512 tmp2693 = _mm512_fmadd_ps(tmp2684, _mm512_set1_ps(4e+00f), tmp2685);
__m512 tmp2677 = _mm512_fmadd_ps(tmp2664, _mm512_set1_ps(1.6e+01f), tmp2665);
__m512 tmp2697 = _mm512_fmadd_ps(tmp2684, _mm512_set1_ps(1.6e+01f), tmp2685);
__m512 tmp2662 = _mm512_add_ps(tmp2663, tmp2605);
__m512 tmp2682 = _mm512_add_ps(tmp2683, tmp2613);
__m512 tmp2679 = _mm512_add_ps(tmp2680, tmp2612);
__m512 tmp2699 = _mm512_add_ps(tmp2700, tmp2660);
__m512 tmp2661 = _mm512_fmadd_ps(tmp2666, _mm512_set1_ps(3.2e+01f), tmp2662);
__m512 tmp2681 = _mm512_fmadd_ps(tmp2686, _mm512_set1_ps(3.2e+01f), tmp2682);
__m512 tmp2672 = _mm512_fmadd_ps(tmp2666, _mm512_set1_ps(8e+00f), tmp2673);
__m512 tmp2692 = _mm512_fmadd_ps(tmp2686, _mm512_set1_ps(8e+00f), tmp2693);
__m512 tmp2678 = _mm512_fmadd_ps(tmp2670, _mm512_set1_ps(3.2e+01f), tmp2679);
__m512 tmp2698 = _mm512_fmadd_ps(tmp2690, _mm512_set1_ps(3.2e+01f), tmp2699);
__m512 tmp2676 = _mm512_fmadd_ps(tmp2666, _mm512_set1_ps(2e+00f), tmp2677);
__m512 tmp2696 = _mm512_fmadd_ps(tmp2686, _mm512_set1_ps(2e+00f), tmp2697);
__m512 out483 = tmp2661;
__m512 out489 = tmp2681;
__m512 out484 = tmp2667;
__m512 out490 = tmp2687;
__m512 out485 = tmp2672;
__m512 out491 = tmp2692;
__m512 out486 = tmp2674;
__m512 out492 = tmp2694;
__m512 out487 = tmp2676;
__m512 out493 = tmp2696;
__m512 out488 = tmp2678;
__m512 out494 = tmp2698;
out483 = _mm512_max_ps(_mm512_setzero_ps(), out483);
out489 = _mm512_max_ps(_mm512_setzero_ps(), out489);
out484 = _mm512_max_ps(_mm512_setzero_ps(), out484);
out490 = _mm512_max_ps(_mm512_setzero_ps(), out490);
out485 = _mm512_max_ps(_mm512_setzero_ps(), out485);
out491 = _mm512_max_ps(_mm512_setzero_ps(), out491);
out486 = _mm512_max_ps(_mm512_setzero_ps(), out486);
out492 = _mm512_max_ps(_mm512_setzero_ps(), out492);
out487 = _mm512_max_ps(_mm512_setzero_ps(), out487);
out493 = _mm512_max_ps(_mm512_setzero_ps(), out493);
out488 = _mm512_max_ps(_mm512_setzero_ps(), out488);
out494 = _mm512_max_ps(_mm512_setzero_ps(), out494);
_mm512_mask_storeu_ps(datPtr6+12656+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 255, out483);
_mm512_mask_storeu_ps(datPtr6+13808+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out489);
_mm512_mask_storeu_ps(datPtr6+12880+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 255, out484);
_mm512_mask_storeu_ps(datPtr6+14032+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out490);
_mm512_mask_storeu_ps(datPtr6+13104+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 255, out485);
_mm512_mask_storeu_ps(datPtr6+14256+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out491);
_mm512_mask_storeu_ps(datPtr6+13328+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 255, out486);
_mm512_mask_storeu_ps(datPtr6+14480+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out492);
_mm512_mask_storeu_ps(datPtr6+13552+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 255, out487);
_mm512_mask_storeu_ps(datPtr6+14704+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out493);
_mm512_mask_storeu_ps(datPtr6+13776+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 255, out488);
_mm512_mask_storeu_ps(datPtr6+14928+806912*i18+224*toH21+4*toW21+50432*k63+25216*l15, 4095, out494);
}
}
if (j13 >= last4) return;
++j13;
j13 = 2;
}
if (j13 < 15) {
ptrdiff_t rel11 = (size_t)(j13-2)%5;
ptrdiff_t base11 = 6+(size_t)(j13-2)/5*18;
for (; ; rel11 = 0, base11 += 18) {
if (rel11 < 2) {
if (rel11 < 1) {
ptrdiff_t toH22 = base11+0;
ptrdiff_t toW22 = 12;
ptrdiff_t k64 = 16*w33;
for (; k64 != 16; ++k64) {
ptrdiff_t l16 = 0;
for (; l16 != 2; ++l16) {
__m512 sf97 = _mm512_loadu_ps(sfPtr5+0+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf98 = _mm512_loadu_ps(sfPtr5+128+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in460 = _mm512_shuffle_f32x4(sf97, sf98, 68);
__m512 in461 = _mm512_shuffle_f32x4(sf97, sf98, 238);
__m512 sf99 = _mm512_loadu_ps(sfPtr5+64+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf100 = _mm512_loadu_ps(sfPtr5+192+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in468 = _mm512_shuffle_f32x4(sf99, sf100, 68);
__m512 in469 = _mm512_shuffle_f32x4(sf99, sf100, 238);
__m512 sf101 = _mm512_loadu_ps(sfPtr5+409600+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf102 = _mm512_loadu_ps(sfPtr5+409728+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in462 = _mm512_shuffle_f32x4(sf101, sf102, 68);
__m512 in463 = _mm512_shuffle_f32x4(sf101, sf102, 238);
__m512 sf103 = _mm512_loadu_ps(sfPtr5+409664+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf104 = _mm512_loadu_ps(sfPtr5+409792+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in470 = _mm512_shuffle_f32x4(sf103, sf104, 68);
__m512 in471 = _mm512_shuffle_f32x4(sf103, sf104, 238);
__m512 sf105 = _mm512_loadu_ps(sfPtr5+819200+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf106 = _mm512_loadu_ps(sfPtr5+819328+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in464 = _mm512_shuffle_f32x4(sf105, sf106, 68);
__m512 in465 = _mm512_shuffle_f32x4(sf105, sf106, 238);
__m512 sf107 = _mm512_loadu_ps(sfPtr5+819264+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf108 = _mm512_loadu_ps(sfPtr5+819392+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in472 = _mm512_shuffle_f32x4(sf107, sf108, 68);
__m512 in473 = _mm512_shuffle_f32x4(sf107, sf108, 238);
__m512 sf109 = _mm512_loadu_ps(sfPtr5+1228800+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf110 = _mm512_loadu_ps(sfPtr5+1228928+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in466 = _mm512_shuffle_f32x4(sf109, sf110, 68);
__m512 in467 = _mm512_shuffle_f32x4(sf109, sf110, 238);
__m512 sf111 = _mm512_loadu_ps(sfPtr5+1228864+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf112 = _mm512_loadu_ps(sfPtr5+1228992+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in474 = _mm512_shuffle_f32x4(sf111, sf112, 68);
__m512 in475 = _mm512_shuffle_f32x4(sf111, sf112, 238);
__m512 tmp2757 = _mm512_add_ps(in461, in462);
__m512 tmp2777 = _mm512_add_ps(in469, in470);
__m512 tmp2756 = _mm512_add_ps(in463, in464);
__m512 tmp2776 = _mm512_add_ps(in471, in472);
__m512 tmp2762 = _mm512_sub_ps(in463, in464);
__m512 tmp2782 = _mm512_sub_ps(in471, in472);
__m512 tmp2761 = _mm512_sub_ps(in461, in462);
__m512 tmp2781 = _mm512_sub_ps(in469, in470);
__m512 tmp2758 = _mm512_add_ps(in465, in466);
__m512 tmp2778 = _mm512_add_ps(in473, in474);
__m512 tmp2763 = _mm512_sub_ps(in465, in466);
__m512 tmp2783 = _mm512_sub_ps(in473, in474);
__m512 tmp2760 = _mm512_fmadd_ps(tmp2762, _mm512_set1_ps(2e+00f), tmp2761);
__m512 tmp2780 = _mm512_fmadd_ps(tmp2782, _mm512_set1_ps(2e+00f), tmp2781);
__m512 tmp2767 = _mm512_fmadd_ps(tmp2762, _mm512_set1_ps(8e+00f), tmp2761);
__m512 tmp2787 = _mm512_fmadd_ps(tmp2782, _mm512_set1_ps(8e+00f), tmp2781);
__m512 tmp2755 = _mm512_add_ps(tmp2756, tmp2757);
__m512 tmp2775 = _mm512_add_ps(tmp2776, tmp2777);
__m512 tmp2759 = _mm512_fmadd_ps(tmp2763, _mm512_set1_ps(1.6e+01f), tmp2760);
__m512 tmp2779 = _mm512_fmadd_ps(tmp2783, _mm512_set1_ps(1.6e+01f), tmp2780);
__m512 tmp2766 = _mm512_fmadd_ps(tmp2763, _mm512_set1_ps(4e+00f), tmp2767);
__m512 tmp2786 = _mm512_fmadd_ps(tmp2783, _mm512_set1_ps(4e+00f), tmp2787);
__m512 tmp2772 = _mm512_add_ps(tmp2763, tmp2761);
__m512 tmp2792 = _mm512_add_ps(tmp2783, tmp2781);
__m512 tmp2765 = _mm512_fmadd_ps(tmp2756, _mm512_set1_ps(4e+00f), tmp2757);
__m512 tmp2785 = _mm512_fmadd_ps(tmp2776, _mm512_set1_ps(4e+00f), tmp2777);
__m512 tmp2769 = _mm512_fmadd_ps(tmp2756, _mm512_set1_ps(1.6e+01f), tmp2757);
__m512 tmp2789 = _mm512_fmadd_ps(tmp2776, _mm512_set1_ps(1.6e+01f), tmp2777);
__m512 tmp2754 = _mm512_add_ps(tmp2755, in460);
__m512 tmp2774 = _mm512_add_ps(tmp2775, in468);
__m512 tmp2771 = _mm512_add_ps(tmp2772, in467);
__m512 tmp2791 = _mm512_add_ps(tmp2792, in475);
__m512 tmp2753 = _mm512_fmadd_ps(tmp2758, _mm512_set1_ps(3.2e+01f), tmp2754);
__m512 tmp2773 = _mm512_fmadd_ps(tmp2778, _mm512_set1_ps(3.2e+01f), tmp2774);
__m512 tmp2764 = _mm512_fmadd_ps(tmp2758, _mm512_set1_ps(8e+00f), tmp2765);
__m512 tmp2784 = _mm512_fmadd_ps(tmp2778, _mm512_set1_ps(8e+00f), tmp2785);
__m512 tmp2770 = _mm512_fmadd_ps(tmp2762, _mm512_set1_ps(3.2e+01f), tmp2771);
__m512 tmp2790 = _mm512_fmadd_ps(tmp2782, _mm512_set1_ps(3.2e+01f), tmp2791);
__m512 tmp2768 = _mm512_fmadd_ps(tmp2758, _mm512_set1_ps(2e+00f), tmp2769);
__m512 tmp2788 = _mm512_fmadd_ps(tmp2778, _mm512_set1_ps(2e+00f), tmp2789);
__m512 tmp2741 = tmp2753;
__m512 tmp2747 = tmp2773;
__m512 tmp2742 = tmp2759;
__m512 tmp2748 = tmp2779;
__m512 tmp2743 = tmp2764;
__m512 tmp2749 = tmp2784;
__m512 tmp2744 = tmp2766;
__m512 tmp2750 = tmp2786;
__m512 tmp2745 = tmp2768;
__m512 tmp2751 = tmp2788;
__m512 tmp2746 = tmp2770;
__m512 tmp2752 = tmp2790;
// Second half of one tile group; tmp2741..tmp2752 are produced before this span.
// The 12 vectors are treated as 12 rows of 16 floats and transposed so that each
// resulting vector holds one column (rows 0..11 in lanes 0..11).
// Transpose stage 1: interleave adjacent row pairs inside every 128-bit lane.
__m512 tmp2837 = _mm512_unpacklo_ps(tmp2741, tmp2742);
__m512 tmp2838 = _mm512_unpackhi_ps(tmp2741, tmp2742);
__m512 tmp2839 = _mm512_unpacklo_ps(tmp2743, tmp2744);
__m512 tmp2840 = _mm512_unpackhi_ps(tmp2743, tmp2744);
__m512 tmp2841 = _mm512_unpacklo_ps(tmp2745, tmp2746);
__m512 tmp2842 = _mm512_unpackhi_ps(tmp2745, tmp2746);
__m512 tmp2843 = _mm512_unpacklo_ps(tmp2747, tmp2748);
__m512 tmp2844 = _mm512_unpackhi_ps(tmp2747, tmp2748);
__m512 tmp2845 = _mm512_unpacklo_ps(tmp2749, tmp2750);
__m512 tmp2846 = _mm512_unpackhi_ps(tmp2749, tmp2750);
__m512 tmp2847 = _mm512_unpacklo_ps(tmp2751, tmp2752);
__m512 tmp2848 = _mm512_unpackhi_ps(tmp2751, tmp2752);
// Transpose stage 2: imm 68 keeps elements 0,1 of each operand, imm 238 keeps
// elements 2,3, so each 128-bit lane now holds one column of four consecutive rows.
__m512 tmp2849 = _mm512_shuffle_ps(tmp2837, tmp2839, 68);
__m512 tmp2850 = _mm512_shuffle_ps(tmp2837, tmp2839, 238);
__m512 tmp2851 = _mm512_shuffle_ps(tmp2838, tmp2840, 68);
__m512 tmp2852 = _mm512_shuffle_ps(tmp2838, tmp2840, 238);
__m512 tmp2853 = _mm512_shuffle_ps(tmp2841, tmp2843, 68);
__m512 tmp2854 = _mm512_shuffle_ps(tmp2841, tmp2843, 238);
__m512 tmp2855 = _mm512_shuffle_ps(tmp2842, tmp2844, 68);
__m512 tmp2856 = _mm512_shuffle_ps(tmp2842, tmp2844, 238);
__m512 tmp2857 = _mm512_shuffle_ps(tmp2845, tmp2847, 68);
__m512 tmp2858 = _mm512_shuffle_ps(tmp2845, tmp2847, 238);
__m512 tmp2859 = _mm512_shuffle_ps(tmp2846, tmp2848, 68);
__m512 tmp2860 = _mm512_shuffle_ps(tmp2846, tmp2848, 238);
// Transpose stage 3: imm 136 gathers 128-bit lanes 0 and 2 of each operand,
// imm 221 gathers lanes 1 and 3. Rows 0..3 are combined with rows 4..7 here.
__m512 tmp2861 = _mm512_shuffle_f32x4(tmp2849, tmp2853, 136);
__m512 tmp2862 = _mm512_shuffle_f32x4(tmp2849, tmp2853, 221);
__m512 tmp2863 = _mm512_shuffle_f32x4(tmp2850, tmp2854, 136);
__m512 tmp2864 = _mm512_shuffle_f32x4(tmp2850, tmp2854, 221);
__m512 tmp2865 = _mm512_shuffle_f32x4(tmp2851, tmp2855, 136);
__m512 tmp2866 = _mm512_shuffle_f32x4(tmp2851, tmp2855, 221);
__m512 tmp2867 = _mm512_shuffle_f32x4(tmp2852, tmp2856, 136);
__m512 tmp2868 = _mm512_shuffle_f32x4(tmp2852, tmp2856, 221);
// There are only 12 rows, so the rows 8..11 group is paired with itself; the
// duplicate ends up in lanes 12..15, which the masked stores below never write.
__m512 tmp2869 = _mm512_shuffle_f32x4(tmp2857, tmp2857, 136);
__m512 tmp2870 = _mm512_shuffle_f32x4(tmp2857, tmp2857, 221);
__m512 tmp2871 = _mm512_shuffle_f32x4(tmp2858, tmp2858, 136);
__m512 tmp2872 = _mm512_shuffle_f32x4(tmp2858, tmp2858, 221);
__m512 tmp2873 = _mm512_shuffle_f32x4(tmp2859, tmp2859, 136);
__m512 tmp2874 = _mm512_shuffle_f32x4(tmp2859, tmp2859, 221);
__m512 tmp2875 = _mm512_shuffle_f32x4(tmp2860, tmp2860, 136);
__m512 tmp2876 = _mm512_shuffle_f32x4(tmp2860, tmp2860, 221);
// Transpose stage 4: final columns. tmp2741..tmp2748 become columns 0..7;
// tmp2749..tmp2752 and the new tmp2793..tmp2796 become columns 8..15.
tmp2741 = _mm512_shuffle_f32x4(tmp2861, tmp2869, 136);
tmp2749 = _mm512_shuffle_f32x4(tmp2861, tmp2869, 221);
tmp2742 = _mm512_shuffle_f32x4(tmp2863, tmp2871, 136);
tmp2750 = _mm512_shuffle_f32x4(tmp2863, tmp2871, 221);
tmp2743 = _mm512_shuffle_f32x4(tmp2865, tmp2873, 136);
tmp2751 = _mm512_shuffle_f32x4(tmp2865, tmp2873, 221);
tmp2744 = _mm512_shuffle_f32x4(tmp2867, tmp2875, 136);
tmp2752 = _mm512_shuffle_f32x4(tmp2867, tmp2875, 221);
tmp2745 = _mm512_shuffle_f32x4(tmp2862, tmp2870, 136);
__m512 tmp2793 = _mm512_shuffle_f32x4(tmp2862, tmp2870, 221);
tmp2746 = _mm512_shuffle_f32x4(tmp2864, tmp2872, 136);
__m512 tmp2794 = _mm512_shuffle_f32x4(tmp2864, tmp2872, 221);
tmp2747 = _mm512_shuffle_f32x4(tmp2866, tmp2874, 136);
__m512 tmp2795 = _mm512_shuffle_f32x4(tmp2866, tmp2874, 221);
tmp2748 = _mm512_shuffle_f32x4(tmp2868, tmp2876, 136);
__m512 tmp2796 = _mm512_shuffle_f32x4(tmp2868, tmp2876, 221);
// Second pass: two interleaved 8-to-6 reductions, one over columns 0..7 and one
// over columns 8..15. With inputs c0..c7, s1=c1+c2, s2=c3+c4, s3=c5+c6,
// d1=c1-c2, d2=c3-c4, d3=c5-c6, the six results are:
//   c0+s1+s2+32*s3, d1+2*d2+16*d3, s1+4*s2+8*s3,
//   d1+8*d2+4*d3,   s1+16*s2+2*s3, d1+32*d2+d3+c7.
// NOTE(review): this coefficient pattern looks like a Winograd-style output
// transform for the 3x3 convolution -- confirm against the generator.
__m512 tmp2801 = _mm512_add_ps(tmp2742, tmp2743);
__m512 tmp2821 = _mm512_add_ps(tmp2750, tmp2751);
__m512 tmp2800 = _mm512_add_ps(tmp2744, tmp2745);
__m512 tmp2820 = _mm512_add_ps(tmp2752, tmp2793);
__m512 tmp2806 = _mm512_sub_ps(tmp2744, tmp2745);
__m512 tmp2826 = _mm512_sub_ps(tmp2752, tmp2793);
__m512 tmp2805 = _mm512_sub_ps(tmp2742, tmp2743);
__m512 tmp2825 = _mm512_sub_ps(tmp2750, tmp2751);
__m512 tmp2802 = _mm512_add_ps(tmp2746, tmp2747);
__m512 tmp2822 = _mm512_add_ps(tmp2794, tmp2795);
__m512 tmp2807 = _mm512_sub_ps(tmp2746, tmp2747);
__m512 tmp2827 = _mm512_sub_ps(tmp2794, tmp2795);
__m512 tmp2804 = _mm512_fmadd_ps(tmp2806, _mm512_set1_ps(2e+00f), tmp2805);
__m512 tmp2824 = _mm512_fmadd_ps(tmp2826, _mm512_set1_ps(2e+00f), tmp2825);
__m512 tmp2811 = _mm512_fmadd_ps(tmp2806, _mm512_set1_ps(8e+00f), tmp2805);
__m512 tmp2831 = _mm512_fmadd_ps(tmp2826, _mm512_set1_ps(8e+00f), tmp2825);
__m512 tmp2799 = _mm512_add_ps(tmp2800, tmp2801);
__m512 tmp2819 = _mm512_add_ps(tmp2820, tmp2821);
__m512 tmp2803 = _mm512_fmadd_ps(tmp2807, _mm512_set1_ps(1.6e+01f), tmp2804);
__m512 tmp2823 = _mm512_fmadd_ps(tmp2827, _mm512_set1_ps(1.6e+01f), tmp2824);
__m512 tmp2810 = _mm512_fmadd_ps(tmp2807, _mm512_set1_ps(4e+00f), tmp2811);
__m512 tmp2830 = _mm512_fmadd_ps(tmp2827, _mm512_set1_ps(4e+00f), tmp2831);
__m512 tmp2816 = _mm512_add_ps(tmp2807, tmp2805);
__m512 tmp2836 = _mm512_add_ps(tmp2827, tmp2825);
__m512 tmp2809 = _mm512_fmadd_ps(tmp2800, _mm512_set1_ps(4e+00f), tmp2801);
__m512 tmp2829 = _mm512_fmadd_ps(tmp2820, _mm512_set1_ps(4e+00f), tmp2821);
__m512 tmp2813 = _mm512_fmadd_ps(tmp2800, _mm512_set1_ps(1.6e+01f), tmp2801);
__m512 tmp2833 = _mm512_fmadd_ps(tmp2820, _mm512_set1_ps(1.6e+01f), tmp2821);
__m512 tmp2798 = _mm512_add_ps(tmp2799, tmp2741);
__m512 tmp2818 = _mm512_add_ps(tmp2819, tmp2749);
__m512 tmp2815 = _mm512_add_ps(tmp2816, tmp2748);
__m512 tmp2835 = _mm512_add_ps(tmp2836, tmp2796);
__m512 tmp2797 = _mm512_fmadd_ps(tmp2802, _mm512_set1_ps(3.2e+01f), tmp2798);
__m512 tmp2817 = _mm512_fmadd_ps(tmp2822, _mm512_set1_ps(3.2e+01f), tmp2818);
__m512 tmp2808 = _mm512_fmadd_ps(tmp2802, _mm512_set1_ps(8e+00f), tmp2809);
__m512 tmp2828 = _mm512_fmadd_ps(tmp2822, _mm512_set1_ps(8e+00f), tmp2829);
__m512 tmp2814 = _mm512_fmadd_ps(tmp2806, _mm512_set1_ps(3.2e+01f), tmp2815);
__m512 tmp2834 = _mm512_fmadd_ps(tmp2826, _mm512_set1_ps(3.2e+01f), tmp2835);
__m512 tmp2812 = _mm512_fmadd_ps(tmp2802, _mm512_set1_ps(2e+00f), tmp2813);
__m512 tmp2832 = _mm512_fmadd_ps(tmp2822, _mm512_set1_ps(2e+00f), tmp2833);
// out495..out500 are the six results for columns 0..7, out501..out506 for columns 8..15.
__m512 out495 = tmp2797;
__m512 out501 = tmp2817;
__m512 out496 = tmp2803;
__m512 out502 = tmp2823;
__m512 out497 = tmp2808;
__m512 out503 = tmp2828;
__m512 out498 = tmp2810;
__m512 out504 = tmp2830;
__m512 out499 = tmp2812;
__m512 out505 = tmp2832;
__m512 out500 = tmp2814;
__m512 out506 = tmp2834;
// Element-wise max against zero (the ReLU activation applied to this output).
out495 = _mm512_max_ps(_mm512_setzero_ps(), out495);
out501 = _mm512_max_ps(_mm512_setzero_ps(), out501);
out496 = _mm512_max_ps(_mm512_setzero_ps(), out496);
out502 = _mm512_max_ps(_mm512_setzero_ps(), out502);
out497 = _mm512_max_ps(_mm512_setzero_ps(), out497);
out503 = _mm512_max_ps(_mm512_setzero_ps(), out503);
out498 = _mm512_max_ps(_mm512_setzero_ps(), out498);
out504 = _mm512_max_ps(_mm512_setzero_ps(), out504);
out499 = _mm512_max_ps(_mm512_setzero_ps(), out499);
out505 = _mm512_max_ps(_mm512_setzero_ps(), out505);
out500 = _mm512_max_ps(_mm512_setzero_ps(), out500);
out506 = _mm512_max_ps(_mm512_setzero_ps(), out506);
// Masked stores: 4095 == 0x0FFF writes lanes 0..11 only. The first set starts at
// +0 and the second at +48; successive results of a set are 224 apart.
// NOTE(review): offsets are presumably bytes (12 floats == 48) with datPtr6 a
// byte pointer -- its declaration is not visible here, confirm.
_mm512_mask_storeu_ps(datPtr6+0+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out495);
_mm512_mask_storeu_ps(datPtr6+48+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out501);
_mm512_mask_storeu_ps(datPtr6+224+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out496);
_mm512_mask_storeu_ps(datPtr6+272+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out502);
_mm512_mask_storeu_ps(datPtr6+448+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out497);
_mm512_mask_storeu_ps(datPtr6+496+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out503);
_mm512_mask_storeu_ps(datPtr6+672+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out498);
_mm512_mask_storeu_ps(datPtr6+720+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out504);
_mm512_mask_storeu_ps(datPtr6+896+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out499);
_mm512_mask_storeu_ps(datPtr6+944+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out505);
_mm512_mask_storeu_ps(datPtr6+1120+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out500);
_mm512_mask_storeu_ps(datPtr6+1168+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out506);
// One complete tile group: load 16 vectors from sfPtr5, run a first 8-to-6 pass,
// transpose, run the same pass again, clamp at zero and store out507..out518.
// Loads come in pairs 128 apart at base offsets 256 and 320, repeated at
// +409600, +819200 and +1228800. Each pair is split into halves: imm 68 joins the
// low 256 bits of both vectors, imm 238 joins the high 256 bits of both.
__m512 sf113 = _mm512_loadu_ps(sfPtr5+256+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf114 = _mm512_loadu_ps(sfPtr5+384+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in476 = _mm512_shuffle_f32x4(sf113, sf114, 68);
__m512 in477 = _mm512_shuffle_f32x4(sf113, sf114, 238);
__m512 sf115 = _mm512_loadu_ps(sfPtr5+320+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf116 = _mm512_loadu_ps(sfPtr5+448+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in484 = _mm512_shuffle_f32x4(sf115, sf116, 68);
__m512 in485 = _mm512_shuffle_f32x4(sf115, sf116, 238);
__m512 sf117 = _mm512_loadu_ps(sfPtr5+409856+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf118 = _mm512_loadu_ps(sfPtr5+409984+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in478 = _mm512_shuffle_f32x4(sf117, sf118, 68);
__m512 in479 = _mm512_shuffle_f32x4(sf117, sf118, 238);
__m512 sf119 = _mm512_loadu_ps(sfPtr5+409920+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf120 = _mm512_loadu_ps(sfPtr5+410048+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in486 = _mm512_shuffle_f32x4(sf119, sf120, 68);
__m512 in487 = _mm512_shuffle_f32x4(sf119, sf120, 238);
__m512 sf121 = _mm512_loadu_ps(sfPtr5+819456+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf122 = _mm512_loadu_ps(sfPtr5+819584+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in480 = _mm512_shuffle_f32x4(sf121, sf122, 68);
__m512 in481 = _mm512_shuffle_f32x4(sf121, sf122, 238);
__m512 sf123 = _mm512_loadu_ps(sfPtr5+819520+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf124 = _mm512_loadu_ps(sfPtr5+819648+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in488 = _mm512_shuffle_f32x4(sf123, sf124, 68);
__m512 in489 = _mm512_shuffle_f32x4(sf123, sf124, 238);
__m512 sf125 = _mm512_loadu_ps(sfPtr5+1229056+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf126 = _mm512_loadu_ps(sfPtr5+1229184+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in482 = _mm512_shuffle_f32x4(sf125, sf126, 68);
__m512 in483 = _mm512_shuffle_f32x4(sf125, sf126, 238);
__m512 sf127 = _mm512_loadu_ps(sfPtr5+1229120+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf128 = _mm512_loadu_ps(sfPtr5+1229248+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in490 = _mm512_shuffle_f32x4(sf127, sf128, 68);
__m512 in491 = _mm512_shuffle_f32x4(sf127, sf128, 238);
// First pass: two interleaved 8-to-6 reductions, over in476..in483 and over
// in484..in491. With inputs c0..c7, s1=c1+c2, s2=c3+c4, s3=c5+c6,
// d1=c1-c2, d2=c3-c4, d3=c5-c6, the six results are:
//   c0+s1+s2+32*s3, d1+2*d2+16*d3, s1+4*s2+8*s3,
//   d1+8*d2+4*d3,   s1+16*s2+2*s3, d1+32*d2+d3+c7.
// NOTE(review): this coefficient pattern looks like a Winograd-style output
// transform for the 3x3 convolution -- confirm against the generator.
__m512 tmp2893 = _mm512_add_ps(in477, in478);
__m512 tmp2913 = _mm512_add_ps(in485, in486);
__m512 tmp2892 = _mm512_add_ps(in479, in480);
__m512 tmp2912 = _mm512_add_ps(in487, in488);
__m512 tmp2898 = _mm512_sub_ps(in479, in480);
__m512 tmp2918 = _mm512_sub_ps(in487, in488);
__m512 tmp2897 = _mm512_sub_ps(in477, in478);
__m512 tmp2917 = _mm512_sub_ps(in485, in486);
__m512 tmp2894 = _mm512_add_ps(in481, in482);
__m512 tmp2914 = _mm512_add_ps(in489, in490);
__m512 tmp2899 = _mm512_sub_ps(in481, in482);
__m512 tmp2919 = _mm512_sub_ps(in489, in490);
__m512 tmp2896 = _mm512_fmadd_ps(tmp2898, _mm512_set1_ps(2e+00f), tmp2897);
__m512 tmp2916 = _mm512_fmadd_ps(tmp2918, _mm512_set1_ps(2e+00f), tmp2917);
__m512 tmp2903 = _mm512_fmadd_ps(tmp2898, _mm512_set1_ps(8e+00f), tmp2897);
__m512 tmp2923 = _mm512_fmadd_ps(tmp2918, _mm512_set1_ps(8e+00f), tmp2917);
__m512 tmp2891 = _mm512_add_ps(tmp2892, tmp2893);
__m512 tmp2911 = _mm512_add_ps(tmp2912, tmp2913);
__m512 tmp2895 = _mm512_fmadd_ps(tmp2899, _mm512_set1_ps(1.6e+01f), tmp2896);
__m512 tmp2915 = _mm512_fmadd_ps(tmp2919, _mm512_set1_ps(1.6e+01f), tmp2916);
__m512 tmp2902 = _mm512_fmadd_ps(tmp2899, _mm512_set1_ps(4e+00f), tmp2903);
__m512 tmp2922 = _mm512_fmadd_ps(tmp2919, _mm512_set1_ps(4e+00f), tmp2923);
__m512 tmp2908 = _mm512_add_ps(tmp2899, tmp2897);
__m512 tmp2928 = _mm512_add_ps(tmp2919, tmp2917);
__m512 tmp2901 = _mm512_fmadd_ps(tmp2892, _mm512_set1_ps(4e+00f), tmp2893);
__m512 tmp2921 = _mm512_fmadd_ps(tmp2912, _mm512_set1_ps(4e+00f), tmp2913);
__m512 tmp2905 = _mm512_fmadd_ps(tmp2892, _mm512_set1_ps(1.6e+01f), tmp2893);
__m512 tmp2925 = _mm512_fmadd_ps(tmp2912, _mm512_set1_ps(1.6e+01f), tmp2913);
__m512 tmp2890 = _mm512_add_ps(tmp2891, in476);
__m512 tmp2910 = _mm512_add_ps(tmp2911, in484);
__m512 tmp2907 = _mm512_add_ps(tmp2908, in483);
__m512 tmp2927 = _mm512_add_ps(tmp2928, in491);
__m512 tmp2889 = _mm512_fmadd_ps(tmp2894, _mm512_set1_ps(3.2e+01f), tmp2890);
__m512 tmp2909 = _mm512_fmadd_ps(tmp2914, _mm512_set1_ps(3.2e+01f), tmp2910);
__m512 tmp2900 = _mm512_fmadd_ps(tmp2894, _mm512_set1_ps(8e+00f), tmp2901);
__m512 tmp2920 = _mm512_fmadd_ps(tmp2914, _mm512_set1_ps(8e+00f), tmp2921);
__m512 tmp2906 = _mm512_fmadd_ps(tmp2898, _mm512_set1_ps(3.2e+01f), tmp2907);
__m512 tmp2926 = _mm512_fmadd_ps(tmp2918, _mm512_set1_ps(3.2e+01f), tmp2927);
__m512 tmp2904 = _mm512_fmadd_ps(tmp2894, _mm512_set1_ps(2e+00f), tmp2905);
__m512 tmp2924 = _mm512_fmadd_ps(tmp2914, _mm512_set1_ps(2e+00f), tmp2925);
// Gather the 12 first-pass results as rows: tmp2877..tmp2882 from the first
// input set, tmp2883..tmp2888 from the second.
__m512 tmp2877 = tmp2889;
__m512 tmp2883 = tmp2909;
__m512 tmp2878 = tmp2895;
__m512 tmp2884 = tmp2915;
__m512 tmp2879 = tmp2900;
__m512 tmp2885 = tmp2920;
__m512 tmp2880 = tmp2902;
__m512 tmp2886 = tmp2922;
__m512 tmp2881 = tmp2904;
__m512 tmp2887 = tmp2924;
__m512 tmp2882 = tmp2906;
__m512 tmp2888 = tmp2926;
// Transpose the 12 rows of 16 floats so each vector holds one column
// (rows 0..11 in lanes 0..11).
// Stage 1: interleave adjacent row pairs inside every 128-bit lane.
__m512 tmp2973 = _mm512_unpacklo_ps(tmp2877, tmp2878);
__m512 tmp2974 = _mm512_unpackhi_ps(tmp2877, tmp2878);
__m512 tmp2975 = _mm512_unpacklo_ps(tmp2879, tmp2880);
__m512 tmp2976 = _mm512_unpackhi_ps(tmp2879, tmp2880);
__m512 tmp2977 = _mm512_unpacklo_ps(tmp2881, tmp2882);
__m512 tmp2978 = _mm512_unpackhi_ps(tmp2881, tmp2882);
__m512 tmp2979 = _mm512_unpacklo_ps(tmp2883, tmp2884);
__m512 tmp2980 = _mm512_unpackhi_ps(tmp2883, tmp2884);
__m512 tmp2981 = _mm512_unpacklo_ps(tmp2885, tmp2886);
__m512 tmp2982 = _mm512_unpackhi_ps(tmp2885, tmp2886);
__m512 tmp2983 = _mm512_unpacklo_ps(tmp2887, tmp2888);
__m512 tmp2984 = _mm512_unpackhi_ps(tmp2887, tmp2888);
// Stage 2: imm 68 keeps elements 0,1 of each operand, imm 238 keeps elements 2,3,
// so each 128-bit lane now holds one column of four consecutive rows.
__m512 tmp2985 = _mm512_shuffle_ps(tmp2973, tmp2975, 68);
__m512 tmp2986 = _mm512_shuffle_ps(tmp2973, tmp2975, 238);
__m512 tmp2987 = _mm512_shuffle_ps(tmp2974, tmp2976, 68);
__m512 tmp2988 = _mm512_shuffle_ps(tmp2974, tmp2976, 238);
__m512 tmp2989 = _mm512_shuffle_ps(tmp2977, tmp2979, 68);
__m512 tmp2990 = _mm512_shuffle_ps(tmp2977, tmp2979, 238);
__m512 tmp2991 = _mm512_shuffle_ps(tmp2978, tmp2980, 68);
__m512 tmp2992 = _mm512_shuffle_ps(tmp2978, tmp2980, 238);
__m512 tmp2993 = _mm512_shuffle_ps(tmp2981, tmp2983, 68);
__m512 tmp2994 = _mm512_shuffle_ps(tmp2981, tmp2983, 238);
__m512 tmp2995 = _mm512_shuffle_ps(tmp2982, tmp2984, 68);
__m512 tmp2996 = _mm512_shuffle_ps(tmp2982, tmp2984, 238);
// Stage 3: imm 136 gathers 128-bit lanes 0 and 2 of each operand, imm 221
// gathers lanes 1 and 3. Rows 0..3 are combined with rows 4..7 here.
__m512 tmp2997 = _mm512_shuffle_f32x4(tmp2985, tmp2989, 136);
__m512 tmp2998 = _mm512_shuffle_f32x4(tmp2985, tmp2989, 221);
__m512 tmp2999 = _mm512_shuffle_f32x4(tmp2986, tmp2990, 136);
__m512 tmp3000 = _mm512_shuffle_f32x4(tmp2986, tmp2990, 221);
__m512 tmp3001 = _mm512_shuffle_f32x4(tmp2987, tmp2991, 136);
__m512 tmp3002 = _mm512_shuffle_f32x4(tmp2987, tmp2991, 221);
__m512 tmp3003 = _mm512_shuffle_f32x4(tmp2988, tmp2992, 136);
__m512 tmp3004 = _mm512_shuffle_f32x4(tmp2988, tmp2992, 221);
// There are only 12 rows, so the rows 8..11 group is paired with itself; the
// duplicate ends up in lanes 12..15, which the masked stores below never write.
__m512 tmp3005 = _mm512_shuffle_f32x4(tmp2993, tmp2993, 136);
__m512 tmp3006 = _mm512_shuffle_f32x4(tmp2993, tmp2993, 221);
__m512 tmp3007 = _mm512_shuffle_f32x4(tmp2994, tmp2994, 136);
__m512 tmp3008 = _mm512_shuffle_f32x4(tmp2994, tmp2994, 221);
__m512 tmp3009 = _mm512_shuffle_f32x4(tmp2995, tmp2995, 136);
__m512 tmp3010 = _mm512_shuffle_f32x4(tmp2995, tmp2995, 221);
__m512 tmp3011 = _mm512_shuffle_f32x4(tmp2996, tmp2996, 136);
__m512 tmp3012 = _mm512_shuffle_f32x4(tmp2996, tmp2996, 221);
// Stage 4: final columns. tmp2877..tmp2884 become columns 0..7;
// tmp2885..tmp2888 and the new tmp2929..tmp2932 become columns 8..15.
tmp2877 = _mm512_shuffle_f32x4(tmp2997, tmp3005, 136);
tmp2885 = _mm512_shuffle_f32x4(tmp2997, tmp3005, 221);
tmp2878 = _mm512_shuffle_f32x4(tmp2999, tmp3007, 136);
tmp2886 = _mm512_shuffle_f32x4(tmp2999, tmp3007, 221);
tmp2879 = _mm512_shuffle_f32x4(tmp3001, tmp3009, 136);
tmp2887 = _mm512_shuffle_f32x4(tmp3001, tmp3009, 221);
tmp2880 = _mm512_shuffle_f32x4(tmp3003, tmp3011, 136);
tmp2888 = _mm512_shuffle_f32x4(tmp3003, tmp3011, 221);
tmp2881 = _mm512_shuffle_f32x4(tmp2998, tmp3006, 136);
__m512 tmp2929 = _mm512_shuffle_f32x4(tmp2998, tmp3006, 221);
tmp2882 = _mm512_shuffle_f32x4(tmp3000, tmp3008, 136);
__m512 tmp2930 = _mm512_shuffle_f32x4(tmp3000, tmp3008, 221);
tmp2883 = _mm512_shuffle_f32x4(tmp3002, tmp3010, 136);
__m512 tmp2931 = _mm512_shuffle_f32x4(tmp3002, tmp3010, 221);
tmp2884 = _mm512_shuffle_f32x4(tmp3004, tmp3012, 136);
__m512 tmp2932 = _mm512_shuffle_f32x4(tmp3004, tmp3012, 221);
// Second pass: the same 8-to-6 reduction as the first pass, now applied to
// columns 0..7 and, interleaved, to columns 8..15.
__m512 tmp2937 = _mm512_add_ps(tmp2878, tmp2879);
__m512 tmp2957 = _mm512_add_ps(tmp2886, tmp2887);
__m512 tmp2936 = _mm512_add_ps(tmp2880, tmp2881);
__m512 tmp2956 = _mm512_add_ps(tmp2888, tmp2929);
__m512 tmp2942 = _mm512_sub_ps(tmp2880, tmp2881);
__m512 tmp2962 = _mm512_sub_ps(tmp2888, tmp2929);
__m512 tmp2941 = _mm512_sub_ps(tmp2878, tmp2879);
__m512 tmp2961 = _mm512_sub_ps(tmp2886, tmp2887);
__m512 tmp2938 = _mm512_add_ps(tmp2882, tmp2883);
__m512 tmp2958 = _mm512_add_ps(tmp2930, tmp2931);
__m512 tmp2943 = _mm512_sub_ps(tmp2882, tmp2883);
__m512 tmp2963 = _mm512_sub_ps(tmp2930, tmp2931);
__m512 tmp2940 = _mm512_fmadd_ps(tmp2942, _mm512_set1_ps(2e+00f), tmp2941);
__m512 tmp2960 = _mm512_fmadd_ps(tmp2962, _mm512_set1_ps(2e+00f), tmp2961);
__m512 tmp2947 = _mm512_fmadd_ps(tmp2942, _mm512_set1_ps(8e+00f), tmp2941);
__m512 tmp2967 = _mm512_fmadd_ps(tmp2962, _mm512_set1_ps(8e+00f), tmp2961);
__m512 tmp2935 = _mm512_add_ps(tmp2936, tmp2937);
__m512 tmp2955 = _mm512_add_ps(tmp2956, tmp2957);
__m512 tmp2939 = _mm512_fmadd_ps(tmp2943, _mm512_set1_ps(1.6e+01f), tmp2940);
__m512 tmp2959 = _mm512_fmadd_ps(tmp2963, _mm512_set1_ps(1.6e+01f), tmp2960);
__m512 tmp2946 = _mm512_fmadd_ps(tmp2943, _mm512_set1_ps(4e+00f), tmp2947);
__m512 tmp2966 = _mm512_fmadd_ps(tmp2963, _mm512_set1_ps(4e+00f), tmp2967);
__m512 tmp2952 = _mm512_add_ps(tmp2943, tmp2941);
__m512 tmp2972 = _mm512_add_ps(tmp2963, tmp2961);
__m512 tmp2945 = _mm512_fmadd_ps(tmp2936, _mm512_set1_ps(4e+00f), tmp2937);
__m512 tmp2965 = _mm512_fmadd_ps(tmp2956, _mm512_set1_ps(4e+00f), tmp2957);
__m512 tmp2949 = _mm512_fmadd_ps(tmp2936, _mm512_set1_ps(1.6e+01f), tmp2937);
__m512 tmp2969 = _mm512_fmadd_ps(tmp2956, _mm512_set1_ps(1.6e+01f), tmp2957);
__m512 tmp2934 = _mm512_add_ps(tmp2935, tmp2877);
__m512 tmp2954 = _mm512_add_ps(tmp2955, tmp2885);
__m512 tmp2951 = _mm512_add_ps(tmp2952, tmp2884);
__m512 tmp2971 = _mm512_add_ps(tmp2972, tmp2932);
__m512 tmp2933 = _mm512_fmadd_ps(tmp2938, _mm512_set1_ps(3.2e+01f), tmp2934);
__m512 tmp2953 = _mm512_fmadd_ps(tmp2958, _mm512_set1_ps(3.2e+01f), tmp2954);
__m512 tmp2944 = _mm512_fmadd_ps(tmp2938, _mm512_set1_ps(8e+00f), tmp2945);
__m512 tmp2964 = _mm512_fmadd_ps(tmp2958, _mm512_set1_ps(8e+00f), tmp2965);
__m512 tmp2950 = _mm512_fmadd_ps(tmp2942, _mm512_set1_ps(3.2e+01f), tmp2951);
__m512 tmp2970 = _mm512_fmadd_ps(tmp2962, _mm512_set1_ps(3.2e+01f), tmp2971);
__m512 tmp2948 = _mm512_fmadd_ps(tmp2938, _mm512_set1_ps(2e+00f), tmp2949);
__m512 tmp2968 = _mm512_fmadd_ps(tmp2958, _mm512_set1_ps(2e+00f), tmp2969);
// out507..out512 are the six results for columns 0..7, out513..out518 for columns 8..15.
__m512 out507 = tmp2933;
__m512 out513 = tmp2953;
__m512 out508 = tmp2939;
__m512 out514 = tmp2959;
__m512 out509 = tmp2944;
__m512 out515 = tmp2964;
__m512 out510 = tmp2946;
__m512 out516 = tmp2966;
__m512 out511 = tmp2948;
__m512 out517 = tmp2968;
__m512 out512 = tmp2950;
__m512 out518 = tmp2970;
// Element-wise max against zero (the ReLU activation applied to this output).
out507 = _mm512_max_ps(_mm512_setzero_ps(), out507);
out513 = _mm512_max_ps(_mm512_setzero_ps(), out513);
out508 = _mm512_max_ps(_mm512_setzero_ps(), out508);
out514 = _mm512_max_ps(_mm512_setzero_ps(), out514);
out509 = _mm512_max_ps(_mm512_setzero_ps(), out509);
out515 = _mm512_max_ps(_mm512_setzero_ps(), out515);
out510 = _mm512_max_ps(_mm512_setzero_ps(), out510);
out516 = _mm512_max_ps(_mm512_setzero_ps(), out516);
out511 = _mm512_max_ps(_mm512_setzero_ps(), out511);
out517 = _mm512_max_ps(_mm512_setzero_ps(), out517);
out512 = _mm512_max_ps(_mm512_setzero_ps(), out512);
out518 = _mm512_max_ps(_mm512_setzero_ps(), out518);
// Masked stores: 4095 == 0x0FFF writes lanes 0..11 only. The first set starts at
// +96 and the second at +12608; successive results of a set are 224 apart.
// NOTE(review): +12608 is half of the 25216*l16 stride, so the second set
// presumably lands in the next output plane -- confirm against the tensor layout.
_mm512_mask_storeu_ps(datPtr6+96+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out507);
_mm512_mask_storeu_ps(datPtr6+12608+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out513);
_mm512_mask_storeu_ps(datPtr6+320+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out508);
_mm512_mask_storeu_ps(datPtr6+12832+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out514);
_mm512_mask_storeu_ps(datPtr6+544+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out509);
_mm512_mask_storeu_ps(datPtr6+13056+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out515);
_mm512_mask_storeu_ps(datPtr6+768+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out510);
_mm512_mask_storeu_ps(datPtr6+13280+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out516);
_mm512_mask_storeu_ps(datPtr6+992+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out511);
_mm512_mask_storeu_ps(datPtr6+13504+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out517);
_mm512_mask_storeu_ps(datPtr6+1216+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out512);
_mm512_mask_storeu_ps(datPtr6+13728+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out518);
// One complete tile group: load 16 vectors from sfPtr5, run a first 8-to-6 pass,
// transpose, run the same pass again, clamp at zero and store out519..out530.
// Loads come in pairs 128 apart at base offsets 512 and 576, repeated at
// +409600, +819200 and +1228800. Each pair is split into halves: imm 68 joins the
// low 256 bits of both vectors, imm 238 joins the high 256 bits of both.
__m512 sf129 = _mm512_loadu_ps(sfPtr5+512+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf130 = _mm512_loadu_ps(sfPtr5+640+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in492 = _mm512_shuffle_f32x4(sf129, sf130, 68);
__m512 in493 = _mm512_shuffle_f32x4(sf129, sf130, 238);
__m512 sf131 = _mm512_loadu_ps(sfPtr5+576+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf132 = _mm512_loadu_ps(sfPtr5+704+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in500 = _mm512_shuffle_f32x4(sf131, sf132, 68);
__m512 in501 = _mm512_shuffle_f32x4(sf131, sf132, 238);
__m512 sf133 = _mm512_loadu_ps(sfPtr5+410112+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf134 = _mm512_loadu_ps(sfPtr5+410240+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in494 = _mm512_shuffle_f32x4(sf133, sf134, 68);
__m512 in495 = _mm512_shuffle_f32x4(sf133, sf134, 238);
__m512 sf135 = _mm512_loadu_ps(sfPtr5+410176+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf136 = _mm512_loadu_ps(sfPtr5+410304+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in502 = _mm512_shuffle_f32x4(sf135, sf136, 68);
__m512 in503 = _mm512_shuffle_f32x4(sf135, sf136, 238);
__m512 sf137 = _mm512_loadu_ps(sfPtr5+819712+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf138 = _mm512_loadu_ps(sfPtr5+819840+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in496 = _mm512_shuffle_f32x4(sf137, sf138, 68);
__m512 in497 = _mm512_shuffle_f32x4(sf137, sf138, 238);
__m512 sf139 = _mm512_loadu_ps(sfPtr5+819776+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf140 = _mm512_loadu_ps(sfPtr5+819904+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in504 = _mm512_shuffle_f32x4(sf139, sf140, 68);
__m512 in505 = _mm512_shuffle_f32x4(sf139, sf140, 238);
__m512 sf141 = _mm512_loadu_ps(sfPtr5+1229312+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf142 = _mm512_loadu_ps(sfPtr5+1229440+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in498 = _mm512_shuffle_f32x4(sf141, sf142, 68);
__m512 in499 = _mm512_shuffle_f32x4(sf141, sf142, 238);
__m512 sf143 = _mm512_loadu_ps(sfPtr5+1229376+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 sf144 = _mm512_loadu_ps(sfPtr5+1229504+1638400*i18+24576*j13+1536*k64+768*l16);
__m512 in506 = _mm512_shuffle_f32x4(sf143, sf144, 68);
__m512 in507 = _mm512_shuffle_f32x4(sf143, sf144, 238);
// First pass: two interleaved 8-to-6 reductions, over in492..in499 and over
// in500..in507. With inputs c0..c7, s1=c1+c2, s2=c3+c4, s3=c5+c6,
// d1=c1-c2, d2=c3-c4, d3=c5-c6, the six results are:
//   c0+s1+s2+32*s3, d1+2*d2+16*d3, s1+4*s2+8*s3,
//   d1+8*d2+4*d3,   s1+16*s2+2*s3, d1+32*d2+d3+c7.
// NOTE(review): this coefficient pattern looks like a Winograd-style output
// transform for the 3x3 convolution -- confirm against the generator.
__m512 tmp3029 = _mm512_add_ps(in493, in494);
__m512 tmp3049 = _mm512_add_ps(in501, in502);
__m512 tmp3028 = _mm512_add_ps(in495, in496);
__m512 tmp3048 = _mm512_add_ps(in503, in504);
__m512 tmp3034 = _mm512_sub_ps(in495, in496);
__m512 tmp3054 = _mm512_sub_ps(in503, in504);
__m512 tmp3033 = _mm512_sub_ps(in493, in494);
__m512 tmp3053 = _mm512_sub_ps(in501, in502);
__m512 tmp3030 = _mm512_add_ps(in497, in498);
__m512 tmp3050 = _mm512_add_ps(in505, in506);
__m512 tmp3035 = _mm512_sub_ps(in497, in498);
__m512 tmp3055 = _mm512_sub_ps(in505, in506);
__m512 tmp3032 = _mm512_fmadd_ps(tmp3034, _mm512_set1_ps(2e+00f), tmp3033);
__m512 tmp3052 = _mm512_fmadd_ps(tmp3054, _mm512_set1_ps(2e+00f), tmp3053);
__m512 tmp3039 = _mm512_fmadd_ps(tmp3034, _mm512_set1_ps(8e+00f), tmp3033);
__m512 tmp3059 = _mm512_fmadd_ps(tmp3054, _mm512_set1_ps(8e+00f), tmp3053);
__m512 tmp3027 = _mm512_add_ps(tmp3028, tmp3029);
__m512 tmp3047 = _mm512_add_ps(tmp3048, tmp3049);
__m512 tmp3031 = _mm512_fmadd_ps(tmp3035, _mm512_set1_ps(1.6e+01f), tmp3032);
__m512 tmp3051 = _mm512_fmadd_ps(tmp3055, _mm512_set1_ps(1.6e+01f), tmp3052);
__m512 tmp3038 = _mm512_fmadd_ps(tmp3035, _mm512_set1_ps(4e+00f), tmp3039);
__m512 tmp3058 = _mm512_fmadd_ps(tmp3055, _mm512_set1_ps(4e+00f), tmp3059);
__m512 tmp3044 = _mm512_add_ps(tmp3035, tmp3033);
__m512 tmp3064 = _mm512_add_ps(tmp3055, tmp3053);
__m512 tmp3037 = _mm512_fmadd_ps(tmp3028, _mm512_set1_ps(4e+00f), tmp3029);
__m512 tmp3057 = _mm512_fmadd_ps(tmp3048, _mm512_set1_ps(4e+00f), tmp3049);
__m512 tmp3041 = _mm512_fmadd_ps(tmp3028, _mm512_set1_ps(1.6e+01f), tmp3029);
__m512 tmp3061 = _mm512_fmadd_ps(tmp3048, _mm512_set1_ps(1.6e+01f), tmp3049);
__m512 tmp3026 = _mm512_add_ps(tmp3027, in492);
__m512 tmp3046 = _mm512_add_ps(tmp3047, in500);
__m512 tmp3043 = _mm512_add_ps(tmp3044, in499);
__m512 tmp3063 = _mm512_add_ps(tmp3064, in507);
__m512 tmp3025 = _mm512_fmadd_ps(tmp3030, _mm512_set1_ps(3.2e+01f), tmp3026);
__m512 tmp3045 = _mm512_fmadd_ps(tmp3050, _mm512_set1_ps(3.2e+01f), tmp3046);
__m512 tmp3036 = _mm512_fmadd_ps(tmp3030, _mm512_set1_ps(8e+00f), tmp3037);
__m512 tmp3056 = _mm512_fmadd_ps(tmp3050, _mm512_set1_ps(8e+00f), tmp3057);
__m512 tmp3042 = _mm512_fmadd_ps(tmp3034, _mm512_set1_ps(3.2e+01f), tmp3043);
__m512 tmp3062 = _mm512_fmadd_ps(tmp3054, _mm512_set1_ps(3.2e+01f), tmp3063);
__m512 tmp3040 = _mm512_fmadd_ps(tmp3030, _mm512_set1_ps(2e+00f), tmp3041);
__m512 tmp3060 = _mm512_fmadd_ps(tmp3050, _mm512_set1_ps(2e+00f), tmp3061);
// Gather the 12 first-pass results as rows: tmp3013..tmp3018 from the first
// input set, tmp3019..tmp3024 from the second.
__m512 tmp3013 = tmp3025;
__m512 tmp3019 = tmp3045;
__m512 tmp3014 = tmp3031;
__m512 tmp3020 = tmp3051;
__m512 tmp3015 = tmp3036;
__m512 tmp3021 = tmp3056;
__m512 tmp3016 = tmp3038;
__m512 tmp3022 = tmp3058;
__m512 tmp3017 = tmp3040;
__m512 tmp3023 = tmp3060;
__m512 tmp3018 = tmp3042;
__m512 tmp3024 = tmp3062;
// Transpose the 12 rows of 16 floats so each vector holds one column
// (rows 0..11 in lanes 0..11).
// Stage 1: interleave adjacent row pairs inside every 128-bit lane.
__m512 tmp3109 = _mm512_unpacklo_ps(tmp3013, tmp3014);
__m512 tmp3110 = _mm512_unpackhi_ps(tmp3013, tmp3014);
__m512 tmp3111 = _mm512_unpacklo_ps(tmp3015, tmp3016);
__m512 tmp3112 = _mm512_unpackhi_ps(tmp3015, tmp3016);
__m512 tmp3113 = _mm512_unpacklo_ps(tmp3017, tmp3018);
__m512 tmp3114 = _mm512_unpackhi_ps(tmp3017, tmp3018);
__m512 tmp3115 = _mm512_unpacklo_ps(tmp3019, tmp3020);
__m512 tmp3116 = _mm512_unpackhi_ps(tmp3019, tmp3020);
__m512 tmp3117 = _mm512_unpacklo_ps(tmp3021, tmp3022);
__m512 tmp3118 = _mm512_unpackhi_ps(tmp3021, tmp3022);
__m512 tmp3119 = _mm512_unpacklo_ps(tmp3023, tmp3024);
__m512 tmp3120 = _mm512_unpackhi_ps(tmp3023, tmp3024);
// Stage 2: imm 68 keeps elements 0,1 of each operand, imm 238 keeps elements 2,3,
// so each 128-bit lane now holds one column of four consecutive rows.
__m512 tmp3121 = _mm512_shuffle_ps(tmp3109, tmp3111, 68);
__m512 tmp3122 = _mm512_shuffle_ps(tmp3109, tmp3111, 238);
__m512 tmp3123 = _mm512_shuffle_ps(tmp3110, tmp3112, 68);
__m512 tmp3124 = _mm512_shuffle_ps(tmp3110, tmp3112, 238);
__m512 tmp3125 = _mm512_shuffle_ps(tmp3113, tmp3115, 68);
__m512 tmp3126 = _mm512_shuffle_ps(tmp3113, tmp3115, 238);
__m512 tmp3127 = _mm512_shuffle_ps(tmp3114, tmp3116, 68);
__m512 tmp3128 = _mm512_shuffle_ps(tmp3114, tmp3116, 238);
__m512 tmp3129 = _mm512_shuffle_ps(tmp3117, tmp3119, 68);
__m512 tmp3130 = _mm512_shuffle_ps(tmp3117, tmp3119, 238);
__m512 tmp3131 = _mm512_shuffle_ps(tmp3118, tmp3120, 68);
__m512 tmp3132 = _mm512_shuffle_ps(tmp3118, tmp3120, 238);
// Stage 3: imm 136 gathers 128-bit lanes 0 and 2 of each operand, imm 221
// gathers lanes 1 and 3. Rows 0..3 are combined with rows 4..7 here.
__m512 tmp3133 = _mm512_shuffle_f32x4(tmp3121, tmp3125, 136);
__m512 tmp3134 = _mm512_shuffle_f32x4(tmp3121, tmp3125, 221);
__m512 tmp3135 = _mm512_shuffle_f32x4(tmp3122, tmp3126, 136);
__m512 tmp3136 = _mm512_shuffle_f32x4(tmp3122, tmp3126, 221);
__m512 tmp3137 = _mm512_shuffle_f32x4(tmp3123, tmp3127, 136);
__m512 tmp3138 = _mm512_shuffle_f32x4(tmp3123, tmp3127, 221);
__m512 tmp3139 = _mm512_shuffle_f32x4(tmp3124, tmp3128, 136);
__m512 tmp3140 = _mm512_shuffle_f32x4(tmp3124, tmp3128, 221);
// There are only 12 rows, so the rows 8..11 group is paired with itself; the
// duplicate ends up in lanes 12..15, which the masked stores below never write.
__m512 tmp3141 = _mm512_shuffle_f32x4(tmp3129, tmp3129, 136);
__m512 tmp3142 = _mm512_shuffle_f32x4(tmp3129, tmp3129, 221);
__m512 tmp3143 = _mm512_shuffle_f32x4(tmp3130, tmp3130, 136);
__m512 tmp3144 = _mm512_shuffle_f32x4(tmp3130, tmp3130, 221);
__m512 tmp3145 = _mm512_shuffle_f32x4(tmp3131, tmp3131, 136);
__m512 tmp3146 = _mm512_shuffle_f32x4(tmp3131, tmp3131, 221);
__m512 tmp3147 = _mm512_shuffle_f32x4(tmp3132, tmp3132, 136);
__m512 tmp3148 = _mm512_shuffle_f32x4(tmp3132, tmp3132, 221);
// Stage 4: final columns. tmp3013..tmp3020 become columns 0..7;
// tmp3021..tmp3024 and the new tmp3065..tmp3068 become columns 8..15.
tmp3013 = _mm512_shuffle_f32x4(tmp3133, tmp3141, 136);
tmp3021 = _mm512_shuffle_f32x4(tmp3133, tmp3141, 221);
tmp3014 = _mm512_shuffle_f32x4(tmp3135, tmp3143, 136);
tmp3022 = _mm512_shuffle_f32x4(tmp3135, tmp3143, 221);
tmp3015 = _mm512_shuffle_f32x4(tmp3137, tmp3145, 136);
tmp3023 = _mm512_shuffle_f32x4(tmp3137, tmp3145, 221);
tmp3016 = _mm512_shuffle_f32x4(tmp3139, tmp3147, 136);
tmp3024 = _mm512_shuffle_f32x4(tmp3139, tmp3147, 221);
tmp3017 = _mm512_shuffle_f32x4(tmp3134, tmp3142, 136);
__m512 tmp3065 = _mm512_shuffle_f32x4(tmp3134, tmp3142, 221);
tmp3018 = _mm512_shuffle_f32x4(tmp3136, tmp3144, 136);
__m512 tmp3066 = _mm512_shuffle_f32x4(tmp3136, tmp3144, 221);
tmp3019 = _mm512_shuffle_f32x4(tmp3138, tmp3146, 136);
__m512 tmp3067 = _mm512_shuffle_f32x4(tmp3138, tmp3146, 221);
tmp3020 = _mm512_shuffle_f32x4(tmp3140, tmp3148, 136);
__m512 tmp3068 = _mm512_shuffle_f32x4(tmp3140, tmp3148, 221);
// Second pass: the same 8-to-6 reduction as the first pass, now applied to
// columns 0..7 and, interleaved, to columns 8..15.
__m512 tmp3073 = _mm512_add_ps(tmp3014, tmp3015);
__m512 tmp3093 = _mm512_add_ps(tmp3022, tmp3023);
__m512 tmp3072 = _mm512_add_ps(tmp3016, tmp3017);
__m512 tmp3092 = _mm512_add_ps(tmp3024, tmp3065);
__m512 tmp3078 = _mm512_sub_ps(tmp3016, tmp3017);
__m512 tmp3098 = _mm512_sub_ps(tmp3024, tmp3065);
__m512 tmp3077 = _mm512_sub_ps(tmp3014, tmp3015);
__m512 tmp3097 = _mm512_sub_ps(tmp3022, tmp3023);
__m512 tmp3074 = _mm512_add_ps(tmp3018, tmp3019);
__m512 tmp3094 = _mm512_add_ps(tmp3066, tmp3067);
__m512 tmp3079 = _mm512_sub_ps(tmp3018, tmp3019);
__m512 tmp3099 = _mm512_sub_ps(tmp3066, tmp3067);
__m512 tmp3076 = _mm512_fmadd_ps(tmp3078, _mm512_set1_ps(2e+00f), tmp3077);
__m512 tmp3096 = _mm512_fmadd_ps(tmp3098, _mm512_set1_ps(2e+00f), tmp3097);
__m512 tmp3083 = _mm512_fmadd_ps(tmp3078, _mm512_set1_ps(8e+00f), tmp3077);
__m512 tmp3103 = _mm512_fmadd_ps(tmp3098, _mm512_set1_ps(8e+00f), tmp3097);
__m512 tmp3071 = _mm512_add_ps(tmp3072, tmp3073);
__m512 tmp3091 = _mm512_add_ps(tmp3092, tmp3093);
__m512 tmp3075 = _mm512_fmadd_ps(tmp3079, _mm512_set1_ps(1.6e+01f), tmp3076);
__m512 tmp3095 = _mm512_fmadd_ps(tmp3099, _mm512_set1_ps(1.6e+01f), tmp3096);
__m512 tmp3082 = _mm512_fmadd_ps(tmp3079, _mm512_set1_ps(4e+00f), tmp3083);
__m512 tmp3102 = _mm512_fmadd_ps(tmp3099, _mm512_set1_ps(4e+00f), tmp3103);
__m512 tmp3088 = _mm512_add_ps(tmp3079, tmp3077);
__m512 tmp3108 = _mm512_add_ps(tmp3099, tmp3097);
__m512 tmp3081 = _mm512_fmadd_ps(tmp3072, _mm512_set1_ps(4e+00f), tmp3073);
__m512 tmp3101 = _mm512_fmadd_ps(tmp3092, _mm512_set1_ps(4e+00f), tmp3093);
__m512 tmp3085 = _mm512_fmadd_ps(tmp3072, _mm512_set1_ps(1.6e+01f), tmp3073);
__m512 tmp3105 = _mm512_fmadd_ps(tmp3092, _mm512_set1_ps(1.6e+01f), tmp3093);
__m512 tmp3070 = _mm512_add_ps(tmp3071, tmp3013);
__m512 tmp3090 = _mm512_add_ps(tmp3091, tmp3021);
__m512 tmp3087 = _mm512_add_ps(tmp3088, tmp3020);
__m512 tmp3107 = _mm512_add_ps(tmp3108, tmp3068);
__m512 tmp3069 = _mm512_fmadd_ps(tmp3074, _mm512_set1_ps(3.2e+01f), tmp3070);
__m512 tmp3089 = _mm512_fmadd_ps(tmp3094, _mm512_set1_ps(3.2e+01f), tmp3090);
__m512 tmp3080 = _mm512_fmadd_ps(tmp3074, _mm512_set1_ps(8e+00f), tmp3081);
__m512 tmp3100 = _mm512_fmadd_ps(tmp3094, _mm512_set1_ps(8e+00f), tmp3101);
__m512 tmp3086 = _mm512_fmadd_ps(tmp3078, _mm512_set1_ps(3.2e+01f), tmp3087);
__m512 tmp3106 = _mm512_fmadd_ps(tmp3098, _mm512_set1_ps(3.2e+01f), tmp3107);
__m512 tmp3084 = _mm512_fmadd_ps(tmp3074, _mm512_set1_ps(2e+00f), tmp3085);
__m512 tmp3104 = _mm512_fmadd_ps(tmp3094, _mm512_set1_ps(2e+00f), tmp3105);
// out519..out524 are the six results for columns 0..7, out525..out530 for columns 8..15.
__m512 out519 = tmp3069;
__m512 out525 = tmp3089;
__m512 out520 = tmp3075;
__m512 out526 = tmp3095;
__m512 out521 = tmp3080;
__m512 out527 = tmp3100;
__m512 out522 = tmp3082;
__m512 out528 = tmp3102;
__m512 out523 = tmp3084;
__m512 out529 = tmp3104;
__m512 out524 = tmp3086;
__m512 out530 = tmp3106;
// Element-wise max against zero (the ReLU activation applied to this output).
out519 = _mm512_max_ps(_mm512_setzero_ps(), out519);
out525 = _mm512_max_ps(_mm512_setzero_ps(), out525);
out520 = _mm512_max_ps(_mm512_setzero_ps(), out520);
out526 = _mm512_max_ps(_mm512_setzero_ps(), out526);
out521 = _mm512_max_ps(_mm512_setzero_ps(), out521);
out527 = _mm512_max_ps(_mm512_setzero_ps(), out527);
out522 = _mm512_max_ps(_mm512_setzero_ps(), out522);
out528 = _mm512_max_ps(_mm512_setzero_ps(), out528);
out523 = _mm512_max_ps(_mm512_setzero_ps(), out523);
out529 = _mm512_max_ps(_mm512_setzero_ps(), out529);
out524 = _mm512_max_ps(_mm512_setzero_ps(), out524);
out530 = _mm512_max_ps(_mm512_setzero_ps(), out530);
// Masked stores: 4095 == 0x0FFF writes lanes 0..11 only. The first set starts at
// +12656 and the second at +12704 (48 apart); successive results of a set are 224 apart.
// NOTE(review): offsets are presumably bytes (12 floats == 48) with datPtr6 a
// byte pointer -- its declaration is not visible here, confirm.
_mm512_mask_storeu_ps(datPtr6+12656+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out519);
_mm512_mask_storeu_ps(datPtr6+12704+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out525);
_mm512_mask_storeu_ps(datPtr6+12880+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out520);
_mm512_mask_storeu_ps(datPtr6+12928+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out526);
_mm512_mask_storeu_ps(datPtr6+13104+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out521);
_mm512_mask_storeu_ps(datPtr6+13152+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out527);
_mm512_mask_storeu_ps(datPtr6+13328+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out522);
_mm512_mask_storeu_ps(datPtr6+13376+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out528);
_mm512_mask_storeu_ps(datPtr6+13552+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out523);
_mm512_mask_storeu_ps(datPtr6+13600+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out529);
_mm512_mask_storeu_ps(datPtr6+13776+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out524);
_mm512_mask_storeu_ps(datPtr6+13824+806912*i18+224*toH22+4*toW22+50432*k64+25216*l16, 4095, out530);
}
}
// Leave the function once j13 has reached last4; otherwise step to the next j13
// and set rel11 to 1.
// NOTE(review): the meaning of rel11 is not visible in this span -- presumably a
// position relative to base11 used by the code that follows; confirm.
if (j13 >= last4) return;
++j13;
rel11 = 1;
}
// Set-up for the loop nest that follows: toH23 = base11, toW23 = 48, and k65
// starts at 16*w33 (the loop below runs k65 up to 16).
// NOTE(review): toH23/toW23 presumably play the same role as toH22/toW22 in the
// store addresses earlier in this function (row and column position) -- their
// uses are not visible in this span, confirm.
ptrdiff_t toH23 = base11+0;
ptrdiff_t toW23 = 48;
ptrdiff_t k65 = 16*w33;
for (; k65 != 16; ++k65) {
ptrdiff_t l17 = 0;
for (; l17 != 2; ++l17) {
// Load sixteen 512-bit vectors from sfPtr5 and regroup their 256-bit halves.
// Four groups of loads are used, at constant offsets 0, 409600, 819200 and
// 1228800; inside each group the loads sit at +0, +64, +128 and +192.
// Regrouping rule (same for every pair a, b):
//   _mm512_shuffle_f32x4(a, b, 68)  -> [low 256 bits of a | low 256 bits of b]
//   _mm512_shuffle_f32x4(a, b, 238) -> [high 256 bits of a | high 256 bits of b]
// The +0/+128 pair yields in508..in515; the +64/+192 pair yields in516..in523.
// NOTE(review): offsets look like byte offsets (64 = sizeof(__m512)) - sfPtr5's
// type is declared outside this view, confirm.
__m512 sf145 = _mm512_loadu_ps(sfPtr5+0+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf146 = _mm512_loadu_ps(sfPtr5+128+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in508 = _mm512_shuffle_f32x4(sf145, sf146, 68);
__m512 in509 = _mm512_shuffle_f32x4(sf145, sf146, 238);
__m512 sf147 = _mm512_loadu_ps(sfPtr5+64+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf148 = _mm512_loadu_ps(sfPtr5+192+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in516 = _mm512_shuffle_f32x4(sf147, sf148, 68);
__m512 in517 = _mm512_shuffle_f32x4(sf147, sf148, 238);
// Group at offset 409600.
__m512 sf149 = _mm512_loadu_ps(sfPtr5+409600+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf150 = _mm512_loadu_ps(sfPtr5+409728+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in510 = _mm512_shuffle_f32x4(sf149, sf150, 68);
__m512 in511 = _mm512_shuffle_f32x4(sf149, sf150, 238);
__m512 sf151 = _mm512_loadu_ps(sfPtr5+409664+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf152 = _mm512_loadu_ps(sfPtr5+409792+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in518 = _mm512_shuffle_f32x4(sf151, sf152, 68);
__m512 in519 = _mm512_shuffle_f32x4(sf151, sf152, 238);
// Group at offset 819200.
__m512 sf153 = _mm512_loadu_ps(sfPtr5+819200+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf154 = _mm512_loadu_ps(sfPtr5+819328+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in512 = _mm512_shuffle_f32x4(sf153, sf154, 68);
__m512 in513 = _mm512_shuffle_f32x4(sf153, sf154, 238);
__m512 sf155 = _mm512_loadu_ps(sfPtr5+819264+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf156 = _mm512_loadu_ps(sfPtr5+819392+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in520 = _mm512_shuffle_f32x4(sf155, sf156, 68);
__m512 in521 = _mm512_shuffle_f32x4(sf155, sf156, 238);
// Group at offset 1228800.
__m512 sf157 = _mm512_loadu_ps(sfPtr5+1228800+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf158 = _mm512_loadu_ps(sfPtr5+1228928+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in514 = _mm512_shuffle_f32x4(sf157, sf158, 68);
__m512 in515 = _mm512_shuffle_f32x4(sf157, sf158, 238);
__m512 sf159 = _mm512_loadu_ps(sfPtr5+1228864+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf160 = _mm512_loadu_ps(sfPtr5+1228992+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in522 = _mm512_shuffle_f32x4(sf159, sf160, 68);
__m512 in523 = _mm512_shuffle_f32x4(sf159, sf160, 238);
// First 8 -> 6 linear-combination pass, done on two independent input sets whose
// statements are interleaved: x0..x7 = in508..in515 and x0..x7 = in516..in523.
// With s1=x1+x2, d1=x1-x2, s2=x3+x4, d2=x3-x4, s3=x5+x6, d3=x5-x6 the six results are
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 =      d1 + 32*d2 +    d3 + x7
// (evaluated as chains of fused multiply-adds, so rounding follows the chain order).
// NOTE(review): these coefficients look like a Winograd F(6,3)-style output
// transform - confirm against the generator.
// Pairwise sums and differences (s1, s2, d2, d1, s3, d3) for both sets.
__m512 tmp3165 = _mm512_add_ps(in509, in510);
__m512 tmp3185 = _mm512_add_ps(in517, in518);
__m512 tmp3164 = _mm512_add_ps(in511, in512);
__m512 tmp3184 = _mm512_add_ps(in519, in520);
__m512 tmp3170 = _mm512_sub_ps(in511, in512);
__m512 tmp3190 = _mm512_sub_ps(in519, in520);
__m512 tmp3169 = _mm512_sub_ps(in509, in510);
__m512 tmp3189 = _mm512_sub_ps(in517, in518);
__m512 tmp3166 = _mm512_add_ps(in513, in514);
__m512 tmp3186 = _mm512_add_ps(in521, in522);
__m512 tmp3171 = _mm512_sub_ps(in513, in514);
__m512 tmp3191 = _mm512_sub_ps(in521, in522);
// Partial sums for the six outputs.
__m512 tmp3168 = _mm512_fmadd_ps(tmp3170, _mm512_set1_ps(2e+00f), tmp3169);
__m512 tmp3188 = _mm512_fmadd_ps(tmp3190, _mm512_set1_ps(2e+00f), tmp3189);
__m512 tmp3175 = _mm512_fmadd_ps(tmp3170, _mm512_set1_ps(8e+00f), tmp3169);
__m512 tmp3195 = _mm512_fmadd_ps(tmp3190, _mm512_set1_ps(8e+00f), tmp3189);
__m512 tmp3163 = _mm512_add_ps(tmp3164, tmp3165);
__m512 tmp3183 = _mm512_add_ps(tmp3184, tmp3185);
__m512 tmp3167 = _mm512_fmadd_ps(tmp3171, _mm512_set1_ps(1.6e+01f), tmp3168);
__m512 tmp3187 = _mm512_fmadd_ps(tmp3191, _mm512_set1_ps(1.6e+01f), tmp3188);
__m512 tmp3174 = _mm512_fmadd_ps(tmp3171, _mm512_set1_ps(4e+00f), tmp3175);
__m512 tmp3194 = _mm512_fmadd_ps(tmp3191, _mm512_set1_ps(4e+00f), tmp3195);
__m512 tmp3180 = _mm512_add_ps(tmp3171, tmp3169);
__m512 tmp3200 = _mm512_add_ps(tmp3191, tmp3189);
__m512 tmp3173 = _mm512_fmadd_ps(tmp3164, _mm512_set1_ps(4e+00f), tmp3165);
__m512 tmp3193 = _mm512_fmadd_ps(tmp3184, _mm512_set1_ps(4e+00f), tmp3185);
__m512 tmp3177 = _mm512_fmadd_ps(tmp3164, _mm512_set1_ps(1.6e+01f), tmp3165);
__m512 tmp3197 = _mm512_fmadd_ps(tmp3184, _mm512_set1_ps(1.6e+01f), tmp3185);
// x0 enters only y0; x7 enters only y5.
__m512 tmp3162 = _mm512_add_ps(tmp3163, in508);
__m512 tmp3182 = _mm512_add_ps(tmp3183, in516);
__m512 tmp3179 = _mm512_add_ps(tmp3180, in515);
__m512 tmp3199 = _mm512_add_ps(tmp3200, in523);
__m512 tmp3161 = _mm512_fmadd_ps(tmp3166, _mm512_set1_ps(3.2e+01f), tmp3162);
__m512 tmp3181 = _mm512_fmadd_ps(tmp3186, _mm512_set1_ps(3.2e+01f), tmp3182);
__m512 tmp3172 = _mm512_fmadd_ps(tmp3166, _mm512_set1_ps(8e+00f), tmp3173);
__m512 tmp3192 = _mm512_fmadd_ps(tmp3186, _mm512_set1_ps(8e+00f), tmp3193);
__m512 tmp3178 = _mm512_fmadd_ps(tmp3170, _mm512_set1_ps(3.2e+01f), tmp3179);
__m512 tmp3198 = _mm512_fmadd_ps(tmp3190, _mm512_set1_ps(3.2e+01f), tmp3199);
__m512 tmp3176 = _mm512_fmadd_ps(tmp3166, _mm512_set1_ps(2e+00f), tmp3177);
__m512 tmp3196 = _mm512_fmadd_ps(tmp3186, _mm512_set1_ps(2e+00f), tmp3197);
// Collect results in order: tmp3149..tmp3154 = y0..y5 of the first set,
// tmp3155..tmp3160 = y0..y5 of the second set.
__m512 tmp3149 = tmp3161;
__m512 tmp3155 = tmp3181;
__m512 tmp3150 = tmp3167;
__m512 tmp3156 = tmp3187;
__m512 tmp3151 = tmp3172;
__m512 tmp3157 = tmp3192;
__m512 tmp3152 = tmp3174;
__m512 tmp3158 = tmp3194;
__m512 tmp3153 = tmp3176;
__m512 tmp3159 = tmp3196;
__m512 tmp3154 = tmp3178;
__m512 tmp3160 = tmp3198;
// Transpose: treat tmp3149..tmp3160 as the 12 rows of a 12x16 float matrix and
// produce its 16 columns. On exit, column c occupies lanes 0..11 of
//   tmp3149..tmp3156 for c = 0..7,
//   tmp3157..tmp3160 for c = 8..11,
//   tmp3201..tmp3204 for c = 12..15,
// while lanes 12..15 of every result simply repeat lanes 8..11.
// Stage 1: interleave adjacent row pairs inside each 128-bit lane.
__m512 tmp3245 = _mm512_unpacklo_ps(tmp3149, tmp3150);
__m512 tmp3246 = _mm512_unpackhi_ps(tmp3149, tmp3150);
__m512 tmp3247 = _mm512_unpacklo_ps(tmp3151, tmp3152);
__m512 tmp3248 = _mm512_unpackhi_ps(tmp3151, tmp3152);
__m512 tmp3249 = _mm512_unpacklo_ps(tmp3153, tmp3154);
__m512 tmp3250 = _mm512_unpackhi_ps(tmp3153, tmp3154);
__m512 tmp3251 = _mm512_unpacklo_ps(tmp3155, tmp3156);
__m512 tmp3252 = _mm512_unpackhi_ps(tmp3155, tmp3156);
__m512 tmp3253 = _mm512_unpacklo_ps(tmp3157, tmp3158);
__m512 tmp3254 = _mm512_unpackhi_ps(tmp3157, tmp3158);
__m512 tmp3255 = _mm512_unpacklo_ps(tmp3159, tmp3160);
__m512 tmp3256 = _mm512_unpackhi_ps(tmp3159, tmp3160);
// Stage 2: within each 128-bit lane, gather one element from four consecutive rows
// (immediate 68 takes the low 64 bits of both operands, 238 the high 64 bits).
__m512 tmp3257 = _mm512_shuffle_ps(tmp3245, tmp3247, 68);
__m512 tmp3258 = _mm512_shuffle_ps(tmp3245, tmp3247, 238);
__m512 tmp3259 = _mm512_shuffle_ps(tmp3246, tmp3248, 68);
__m512 tmp3260 = _mm512_shuffle_ps(tmp3246, tmp3248, 238);
__m512 tmp3261 = _mm512_shuffle_ps(tmp3249, tmp3251, 68);
__m512 tmp3262 = _mm512_shuffle_ps(tmp3249, tmp3251, 238);
__m512 tmp3263 = _mm512_shuffle_ps(tmp3250, tmp3252, 68);
__m512 tmp3264 = _mm512_shuffle_ps(tmp3250, tmp3252, 238);
__m512 tmp3265 = _mm512_shuffle_ps(tmp3253, tmp3255, 68);
__m512 tmp3266 = _mm512_shuffle_ps(tmp3253, tmp3255, 238);
__m512 tmp3267 = _mm512_shuffle_ps(tmp3254, tmp3256, 68);
__m512 tmp3268 = _mm512_shuffle_ps(tmp3254, tmp3256, 238);
// Stage 3: pick even (136) or odd (221) 128-bit lanes. Rows 8..11 have no partner
// group, so they are shuffled against themselves; that is what duplicates lanes.
__m512 tmp3269 = _mm512_shuffle_f32x4(tmp3257, tmp3261, 136);
__m512 tmp3270 = _mm512_shuffle_f32x4(tmp3257, tmp3261, 221);
__m512 tmp3271 = _mm512_shuffle_f32x4(tmp3258, tmp3262, 136);
__m512 tmp3272 = _mm512_shuffle_f32x4(tmp3258, tmp3262, 221);
__m512 tmp3273 = _mm512_shuffle_f32x4(tmp3259, tmp3263, 136);
__m512 tmp3274 = _mm512_shuffle_f32x4(tmp3259, tmp3263, 221);
__m512 tmp3275 = _mm512_shuffle_f32x4(tmp3260, tmp3264, 136);
__m512 tmp3276 = _mm512_shuffle_f32x4(tmp3260, tmp3264, 221);
__m512 tmp3277 = _mm512_shuffle_f32x4(tmp3265, tmp3265, 136);
__m512 tmp3278 = _mm512_shuffle_f32x4(tmp3265, tmp3265, 221);
__m512 tmp3279 = _mm512_shuffle_f32x4(tmp3266, tmp3266, 136);
__m512 tmp3280 = _mm512_shuffle_f32x4(tmp3266, tmp3266, 221);
__m512 tmp3281 = _mm512_shuffle_f32x4(tmp3267, tmp3267, 136);
__m512 tmp3282 = _mm512_shuffle_f32x4(tmp3267, tmp3267, 221);
__m512 tmp3283 = _mm512_shuffle_f32x4(tmp3268, tmp3268, 136);
__m512 tmp3284 = _mm512_shuffle_f32x4(tmp3268, tmp3268, 221);
// Stage 4: final 128-bit lane selection; the row variables are overwritten with columns.
tmp3149 = _mm512_shuffle_f32x4(tmp3269, tmp3277, 136);
tmp3157 = _mm512_shuffle_f32x4(tmp3269, tmp3277, 221);
tmp3150 = _mm512_shuffle_f32x4(tmp3271, tmp3279, 136);
tmp3158 = _mm512_shuffle_f32x4(tmp3271, tmp3279, 221);
tmp3151 = _mm512_shuffle_f32x4(tmp3273, tmp3281, 136);
tmp3159 = _mm512_shuffle_f32x4(tmp3273, tmp3281, 221);
tmp3152 = _mm512_shuffle_f32x4(tmp3275, tmp3283, 136);
tmp3160 = _mm512_shuffle_f32x4(tmp3275, tmp3283, 221);
tmp3153 = _mm512_shuffle_f32x4(tmp3270, tmp3278, 136);
__m512 tmp3201 = _mm512_shuffle_f32x4(tmp3270, tmp3278, 221);
tmp3154 = _mm512_shuffle_f32x4(tmp3272, tmp3280, 136);
__m512 tmp3202 = _mm512_shuffle_f32x4(tmp3272, tmp3280, 221);
tmp3155 = _mm512_shuffle_f32x4(tmp3274, tmp3282, 136);
__m512 tmp3203 = _mm512_shuffle_f32x4(tmp3274, tmp3282, 221);
tmp3156 = _mm512_shuffle_f32x4(tmp3276, tmp3284, 136);
__m512 tmp3204 = _mm512_shuffle_f32x4(tmp3276, tmp3284, 221);
// Second 8 -> 6 linear-combination pass, again on two interleaved input sets:
//   x0..x7 = tmp3149..tmp3156                       -> out531..out536
//   x0..x7 = tmp3157..tmp3160, tmp3201..tmp3204     -> out537..out542
// With s1=x1+x2, d1=x1-x2, s2=x3+x4, d2=x3-x4, s3=x5+x6, d3=x5-x6:
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 =      d1 + 32*d2 +    d3 + x7
// Pairwise sums and differences (s1, s2, d2, d1, s3, d3) for both sets.
__m512 tmp3209 = _mm512_add_ps(tmp3150, tmp3151);
__m512 tmp3229 = _mm512_add_ps(tmp3158, tmp3159);
__m512 tmp3208 = _mm512_add_ps(tmp3152, tmp3153);
__m512 tmp3228 = _mm512_add_ps(tmp3160, tmp3201);
__m512 tmp3214 = _mm512_sub_ps(tmp3152, tmp3153);
__m512 tmp3234 = _mm512_sub_ps(tmp3160, tmp3201);
__m512 tmp3213 = _mm512_sub_ps(tmp3150, tmp3151);
__m512 tmp3233 = _mm512_sub_ps(tmp3158, tmp3159);
__m512 tmp3210 = _mm512_add_ps(tmp3154, tmp3155);
__m512 tmp3230 = _mm512_add_ps(tmp3202, tmp3203);
__m512 tmp3215 = _mm512_sub_ps(tmp3154, tmp3155);
__m512 tmp3235 = _mm512_sub_ps(tmp3202, tmp3203);
// Partial sums for the six outputs.
__m512 tmp3212 = _mm512_fmadd_ps(tmp3214, _mm512_set1_ps(2e+00f), tmp3213);
__m512 tmp3232 = _mm512_fmadd_ps(tmp3234, _mm512_set1_ps(2e+00f), tmp3233);
__m512 tmp3219 = _mm512_fmadd_ps(tmp3214, _mm512_set1_ps(8e+00f), tmp3213);
__m512 tmp3239 = _mm512_fmadd_ps(tmp3234, _mm512_set1_ps(8e+00f), tmp3233);
__m512 tmp3207 = _mm512_add_ps(tmp3208, tmp3209);
__m512 tmp3227 = _mm512_add_ps(tmp3228, tmp3229);
__m512 tmp3211 = _mm512_fmadd_ps(tmp3215, _mm512_set1_ps(1.6e+01f), tmp3212);
__m512 tmp3231 = _mm512_fmadd_ps(tmp3235, _mm512_set1_ps(1.6e+01f), tmp3232);
__m512 tmp3218 = _mm512_fmadd_ps(tmp3215, _mm512_set1_ps(4e+00f), tmp3219);
__m512 tmp3238 = _mm512_fmadd_ps(tmp3235, _mm512_set1_ps(4e+00f), tmp3239);
__m512 tmp3224 = _mm512_add_ps(tmp3215, tmp3213);
__m512 tmp3244 = _mm512_add_ps(tmp3235, tmp3233);
__m512 tmp3217 = _mm512_fmadd_ps(tmp3208, _mm512_set1_ps(4e+00f), tmp3209);
__m512 tmp3237 = _mm512_fmadd_ps(tmp3228, _mm512_set1_ps(4e+00f), tmp3229);
__m512 tmp3221 = _mm512_fmadd_ps(tmp3208, _mm512_set1_ps(1.6e+01f), tmp3209);
__m512 tmp3241 = _mm512_fmadd_ps(tmp3228, _mm512_set1_ps(1.6e+01f), tmp3229);
// x0 enters only y0; x7 enters only y5.
__m512 tmp3206 = _mm512_add_ps(tmp3207, tmp3149);
__m512 tmp3226 = _mm512_add_ps(tmp3227, tmp3157);
__m512 tmp3223 = _mm512_add_ps(tmp3224, tmp3156);
__m512 tmp3243 = _mm512_add_ps(tmp3244, tmp3204);
__m512 tmp3205 = _mm512_fmadd_ps(tmp3210, _mm512_set1_ps(3.2e+01f), tmp3206);
__m512 tmp3225 = _mm512_fmadd_ps(tmp3230, _mm512_set1_ps(3.2e+01f), tmp3226);
__m512 tmp3216 = _mm512_fmadd_ps(tmp3210, _mm512_set1_ps(8e+00f), tmp3217);
__m512 tmp3236 = _mm512_fmadd_ps(tmp3230, _mm512_set1_ps(8e+00f), tmp3237);
__m512 tmp3222 = _mm512_fmadd_ps(tmp3214, _mm512_set1_ps(3.2e+01f), tmp3223);
__m512 tmp3242 = _mm512_fmadd_ps(tmp3234, _mm512_set1_ps(3.2e+01f), tmp3243);
__m512 tmp3220 = _mm512_fmadd_ps(tmp3210, _mm512_set1_ps(2e+00f), tmp3221);
__m512 tmp3240 = _mm512_fmadd_ps(tmp3230, _mm512_set1_ps(2e+00f), tmp3241);
// Name the results: out531..out536 = y0..y5 of the first set,
// out537..out542 = y0..y5 of the second set.
__m512 out531 = tmp3205;
__m512 out537 = tmp3225;
__m512 out532 = tmp3211;
__m512 out538 = tmp3231;
__m512 out533 = tmp3216;
__m512 out539 = tmp3236;
__m512 out534 = tmp3218;
__m512 out540 = tmp3238;
__m512 out535 = tmp3220;
__m512 out541 = tmp3240;
__m512 out536 = tmp3222;
__m512 out542 = tmp3242;
// Clamp each of the twelve result vectors at zero, lane by lane: x = max(0, x).
out531 = _mm512_max_ps(_mm512_setzero_ps(), out531);
out537 = _mm512_max_ps(_mm512_setzero_ps(), out537);
out532 = _mm512_max_ps(_mm512_setzero_ps(), out532);
out538 = _mm512_max_ps(_mm512_setzero_ps(), out538);
out533 = _mm512_max_ps(_mm512_setzero_ps(), out533);
out539 = _mm512_max_ps(_mm512_setzero_ps(), out539);
out534 = _mm512_max_ps(_mm512_setzero_ps(), out534);
out540 = _mm512_max_ps(_mm512_setzero_ps(), out540);
out535 = _mm512_max_ps(_mm512_setzero_ps(), out535);
out541 = _mm512_max_ps(_mm512_setzero_ps(), out541);
out536 = _mm512_max_ps(_mm512_setzero_ps(), out536);
out542 = _mm512_max_ps(_mm512_setzero_ps(), out542);
// Masked stores into datPtr6.
//   out531..out536: constant offsets 0, 224, ..., 1120 (step 224), mask 255 (0xFF),
//                   so only lanes 0..7 are written.
//   out537..out542: constant offsets 1152, 1376, ..., 2272 (step 224), mask 4095 (0xFFF),
//                   so lanes 0..11 are written.
// NOTE(review): offsets look like byte offsets with a 224-byte row pitch; with
// toW23 == 48 (its value where this loop is entered) 1152 + 4*toW23 == 6*224, i.e.
// the second group would start a fresh row six rows further down - confirm against
// datPtr6's declaration, which is outside this view.
_mm512_mask_storeu_ps(datPtr6+0+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 255, out531);
_mm512_mask_storeu_ps(datPtr6+1152+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out537);
_mm512_mask_storeu_ps(datPtr6+224+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 255, out532);
_mm512_mask_storeu_ps(datPtr6+1376+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out538);
_mm512_mask_storeu_ps(datPtr6+448+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 255, out533);
_mm512_mask_storeu_ps(datPtr6+1600+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out539);
_mm512_mask_storeu_ps(datPtr6+672+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 255, out534);
_mm512_mask_storeu_ps(datPtr6+1824+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out540);
_mm512_mask_storeu_ps(datPtr6+896+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 255, out535);
_mm512_mask_storeu_ps(datPtr6+2048+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out541);
_mm512_mask_storeu_ps(datPtr6+1120+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 255, out536);
_mm512_mask_storeu_ps(datPtr6+2272+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out542);
// Load sixteen 512-bit vectors from sfPtr5 and regroup their 256-bit halves.
// Four groups of loads are used, at constant offsets 256, 409856, 819456 and
// 1229056 (409600 apart); inside each group the loads sit at +0, +64, +128, +192.
// Regrouping rule (same for every pair a, b):
//   _mm512_shuffle_f32x4(a, b, 68)  -> [low 256 bits of a | low 256 bits of b]
//   _mm512_shuffle_f32x4(a, b, 238) -> [high 256 bits of a | high 256 bits of b]
// The +0/+128 pair yields in524..in531; the +64/+192 pair yields in532..in539.
// NOTE(review): offsets look like byte offsets (64 = sizeof(__m512)) - sfPtr5's
// type is declared outside this view, confirm.
__m512 sf161 = _mm512_loadu_ps(sfPtr5+256+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf162 = _mm512_loadu_ps(sfPtr5+384+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in524 = _mm512_shuffle_f32x4(sf161, sf162, 68);
__m512 in525 = _mm512_shuffle_f32x4(sf161, sf162, 238);
__m512 sf163 = _mm512_loadu_ps(sfPtr5+320+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf164 = _mm512_loadu_ps(sfPtr5+448+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in532 = _mm512_shuffle_f32x4(sf163, sf164, 68);
__m512 in533 = _mm512_shuffle_f32x4(sf163, sf164, 238);
// Group at offset 409856.
__m512 sf165 = _mm512_loadu_ps(sfPtr5+409856+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf166 = _mm512_loadu_ps(sfPtr5+409984+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in526 = _mm512_shuffle_f32x4(sf165, sf166, 68);
__m512 in527 = _mm512_shuffle_f32x4(sf165, sf166, 238);
__m512 sf167 = _mm512_loadu_ps(sfPtr5+409920+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf168 = _mm512_loadu_ps(sfPtr5+410048+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in534 = _mm512_shuffle_f32x4(sf167, sf168, 68);
__m512 in535 = _mm512_shuffle_f32x4(sf167, sf168, 238);
// Group at offset 819456.
__m512 sf169 = _mm512_loadu_ps(sfPtr5+819456+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf170 = _mm512_loadu_ps(sfPtr5+819584+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in528 = _mm512_shuffle_f32x4(sf169, sf170, 68);
__m512 in529 = _mm512_shuffle_f32x4(sf169, sf170, 238);
__m512 sf171 = _mm512_loadu_ps(sfPtr5+819520+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf172 = _mm512_loadu_ps(sfPtr5+819648+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in536 = _mm512_shuffle_f32x4(sf171, sf172, 68);
__m512 in537 = _mm512_shuffle_f32x4(sf171, sf172, 238);
// Group at offset 1229056.
__m512 sf173 = _mm512_loadu_ps(sfPtr5+1229056+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf174 = _mm512_loadu_ps(sfPtr5+1229184+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in530 = _mm512_shuffle_f32x4(sf173, sf174, 68);
__m512 in531 = _mm512_shuffle_f32x4(sf173, sf174, 238);
__m512 sf175 = _mm512_loadu_ps(sfPtr5+1229120+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf176 = _mm512_loadu_ps(sfPtr5+1229248+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in538 = _mm512_shuffle_f32x4(sf175, sf176, 68);
__m512 in539 = _mm512_shuffle_f32x4(sf175, sf176, 238);
// First 8 -> 6 linear-combination pass, done on two independent input sets whose
// statements are interleaved: x0..x7 = in524..in531 and x0..x7 = in532..in539.
// With s1=x1+x2, d1=x1-x2, s2=x3+x4, d2=x3-x4, s3=x5+x6, d3=x5-x6 the six results are
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 =      d1 + 32*d2 +    d3 + x7
// (evaluated as chains of fused multiply-adds, so rounding follows the chain order).
// NOTE(review): these coefficients look like a Winograd F(6,3)-style output
// transform - confirm against the generator.
// Pairwise sums and differences (s1, s2, d2, d1, s3, d3) for both sets.
__m512 tmp3301 = _mm512_add_ps(in525, in526);
__m512 tmp3321 = _mm512_add_ps(in533, in534);
__m512 tmp3300 = _mm512_add_ps(in527, in528);
__m512 tmp3320 = _mm512_add_ps(in535, in536);
__m512 tmp3306 = _mm512_sub_ps(in527, in528);
__m512 tmp3326 = _mm512_sub_ps(in535, in536);
__m512 tmp3305 = _mm512_sub_ps(in525, in526);
__m512 tmp3325 = _mm512_sub_ps(in533, in534);
__m512 tmp3302 = _mm512_add_ps(in529, in530);
__m512 tmp3322 = _mm512_add_ps(in537, in538);
__m512 tmp3307 = _mm512_sub_ps(in529, in530);
__m512 tmp3327 = _mm512_sub_ps(in537, in538);
// Partial sums for the six outputs.
__m512 tmp3304 = _mm512_fmadd_ps(tmp3306, _mm512_set1_ps(2e+00f), tmp3305);
__m512 tmp3324 = _mm512_fmadd_ps(tmp3326, _mm512_set1_ps(2e+00f), tmp3325);
__m512 tmp3311 = _mm512_fmadd_ps(tmp3306, _mm512_set1_ps(8e+00f), tmp3305);
__m512 tmp3331 = _mm512_fmadd_ps(tmp3326, _mm512_set1_ps(8e+00f), tmp3325);
__m512 tmp3299 = _mm512_add_ps(tmp3300, tmp3301);
__m512 tmp3319 = _mm512_add_ps(tmp3320, tmp3321);
__m512 tmp3303 = _mm512_fmadd_ps(tmp3307, _mm512_set1_ps(1.6e+01f), tmp3304);
__m512 tmp3323 = _mm512_fmadd_ps(tmp3327, _mm512_set1_ps(1.6e+01f), tmp3324);
__m512 tmp3310 = _mm512_fmadd_ps(tmp3307, _mm512_set1_ps(4e+00f), tmp3311);
__m512 tmp3330 = _mm512_fmadd_ps(tmp3327, _mm512_set1_ps(4e+00f), tmp3331);
__m512 tmp3316 = _mm512_add_ps(tmp3307, tmp3305);
__m512 tmp3336 = _mm512_add_ps(tmp3327, tmp3325);
__m512 tmp3309 = _mm512_fmadd_ps(tmp3300, _mm512_set1_ps(4e+00f), tmp3301);
__m512 tmp3329 = _mm512_fmadd_ps(tmp3320, _mm512_set1_ps(4e+00f), tmp3321);
__m512 tmp3313 = _mm512_fmadd_ps(tmp3300, _mm512_set1_ps(1.6e+01f), tmp3301);
__m512 tmp3333 = _mm512_fmadd_ps(tmp3320, _mm512_set1_ps(1.6e+01f), tmp3321);
// x0 enters only y0; x7 enters only y5.
__m512 tmp3298 = _mm512_add_ps(tmp3299, in524);
__m512 tmp3318 = _mm512_add_ps(tmp3319, in532);
__m512 tmp3315 = _mm512_add_ps(tmp3316, in531);
__m512 tmp3335 = _mm512_add_ps(tmp3336, in539);
__m512 tmp3297 = _mm512_fmadd_ps(tmp3302, _mm512_set1_ps(3.2e+01f), tmp3298);
__m512 tmp3317 = _mm512_fmadd_ps(tmp3322, _mm512_set1_ps(3.2e+01f), tmp3318);
__m512 tmp3308 = _mm512_fmadd_ps(tmp3302, _mm512_set1_ps(8e+00f), tmp3309);
__m512 tmp3328 = _mm512_fmadd_ps(tmp3322, _mm512_set1_ps(8e+00f), tmp3329);
__m512 tmp3314 = _mm512_fmadd_ps(tmp3306, _mm512_set1_ps(3.2e+01f), tmp3315);
__m512 tmp3334 = _mm512_fmadd_ps(tmp3326, _mm512_set1_ps(3.2e+01f), tmp3335);
__m512 tmp3312 = _mm512_fmadd_ps(tmp3302, _mm512_set1_ps(2e+00f), tmp3313);
__m512 tmp3332 = _mm512_fmadd_ps(tmp3322, _mm512_set1_ps(2e+00f), tmp3333);
// Collect results in order: tmp3285..tmp3290 = y0..y5 of the first set,
// tmp3291..tmp3296 = y0..y5 of the second set.
__m512 tmp3285 = tmp3297;
__m512 tmp3291 = tmp3317;
__m512 tmp3286 = tmp3303;
__m512 tmp3292 = tmp3323;
__m512 tmp3287 = tmp3308;
__m512 tmp3293 = tmp3328;
__m512 tmp3288 = tmp3310;
__m512 tmp3294 = tmp3330;
__m512 tmp3289 = tmp3312;
__m512 tmp3295 = tmp3332;
__m512 tmp3290 = tmp3314;
__m512 tmp3296 = tmp3334;
// Transpose: treat tmp3285..tmp3296 as the 12 rows of a 12x16 float matrix and
// produce its 16 columns. On exit, column c occupies lanes 0..11 of
//   tmp3285..tmp3292 for c = 0..7,
//   tmp3293..tmp3296 for c = 8..11,
//   tmp3337..tmp3340 for c = 12..15,
// while lanes 12..15 of every result simply repeat lanes 8..11.
// Stage 1: interleave adjacent row pairs inside each 128-bit lane.
__m512 tmp3381 = _mm512_unpacklo_ps(tmp3285, tmp3286);
__m512 tmp3382 = _mm512_unpackhi_ps(tmp3285, tmp3286);
__m512 tmp3383 = _mm512_unpacklo_ps(tmp3287, tmp3288);
__m512 tmp3384 = _mm512_unpackhi_ps(tmp3287, tmp3288);
__m512 tmp3385 = _mm512_unpacklo_ps(tmp3289, tmp3290);
__m512 tmp3386 = _mm512_unpackhi_ps(tmp3289, tmp3290);
__m512 tmp3387 = _mm512_unpacklo_ps(tmp3291, tmp3292);
__m512 tmp3388 = _mm512_unpackhi_ps(tmp3291, tmp3292);
__m512 tmp3389 = _mm512_unpacklo_ps(tmp3293, tmp3294);
__m512 tmp3390 = _mm512_unpackhi_ps(tmp3293, tmp3294);
__m512 tmp3391 = _mm512_unpacklo_ps(tmp3295, tmp3296);
__m512 tmp3392 = _mm512_unpackhi_ps(tmp3295, tmp3296);
// Stage 2: within each 128-bit lane, gather one element from four consecutive rows
// (immediate 68 takes the low 64 bits of both operands, 238 the high 64 bits).
__m512 tmp3393 = _mm512_shuffle_ps(tmp3381, tmp3383, 68);
__m512 tmp3394 = _mm512_shuffle_ps(tmp3381, tmp3383, 238);
__m512 tmp3395 = _mm512_shuffle_ps(tmp3382, tmp3384, 68);
__m512 tmp3396 = _mm512_shuffle_ps(tmp3382, tmp3384, 238);
__m512 tmp3397 = _mm512_shuffle_ps(tmp3385, tmp3387, 68);
__m512 tmp3398 = _mm512_shuffle_ps(tmp3385, tmp3387, 238);
__m512 tmp3399 = _mm512_shuffle_ps(tmp3386, tmp3388, 68);
__m512 tmp3400 = _mm512_shuffle_ps(tmp3386, tmp3388, 238);
__m512 tmp3401 = _mm512_shuffle_ps(tmp3389, tmp3391, 68);
__m512 tmp3402 = _mm512_shuffle_ps(tmp3389, tmp3391, 238);
__m512 tmp3403 = _mm512_shuffle_ps(tmp3390, tmp3392, 68);
__m512 tmp3404 = _mm512_shuffle_ps(tmp3390, tmp3392, 238);
// Stage 3: pick even (136) or odd (221) 128-bit lanes. Rows 8..11 have no partner
// group, so they are shuffled against themselves; that is what duplicates lanes.
__m512 tmp3405 = _mm512_shuffle_f32x4(tmp3393, tmp3397, 136);
__m512 tmp3406 = _mm512_shuffle_f32x4(tmp3393, tmp3397, 221);
__m512 tmp3407 = _mm512_shuffle_f32x4(tmp3394, tmp3398, 136);
__m512 tmp3408 = _mm512_shuffle_f32x4(tmp3394, tmp3398, 221);
__m512 tmp3409 = _mm512_shuffle_f32x4(tmp3395, tmp3399, 136);
__m512 tmp3410 = _mm512_shuffle_f32x4(tmp3395, tmp3399, 221);
__m512 tmp3411 = _mm512_shuffle_f32x4(tmp3396, tmp3400, 136);
__m512 tmp3412 = _mm512_shuffle_f32x4(tmp3396, tmp3400, 221);
__m512 tmp3413 = _mm512_shuffle_f32x4(tmp3401, tmp3401, 136);
__m512 tmp3414 = _mm512_shuffle_f32x4(tmp3401, tmp3401, 221);
__m512 tmp3415 = _mm512_shuffle_f32x4(tmp3402, tmp3402, 136);
__m512 tmp3416 = _mm512_shuffle_f32x4(tmp3402, tmp3402, 221);
__m512 tmp3417 = _mm512_shuffle_f32x4(tmp3403, tmp3403, 136);
__m512 tmp3418 = _mm512_shuffle_f32x4(tmp3403, tmp3403, 221);
__m512 tmp3419 = _mm512_shuffle_f32x4(tmp3404, tmp3404, 136);
__m512 tmp3420 = _mm512_shuffle_f32x4(tmp3404, tmp3404, 221);
// Stage 4: final 128-bit lane selection; the row variables are overwritten with columns.
tmp3285 = _mm512_shuffle_f32x4(tmp3405, tmp3413, 136);
tmp3293 = _mm512_shuffle_f32x4(tmp3405, tmp3413, 221);
tmp3286 = _mm512_shuffle_f32x4(tmp3407, tmp3415, 136);
tmp3294 = _mm512_shuffle_f32x4(tmp3407, tmp3415, 221);
tmp3287 = _mm512_shuffle_f32x4(tmp3409, tmp3417, 136);
tmp3295 = _mm512_shuffle_f32x4(tmp3409, tmp3417, 221);
tmp3288 = _mm512_shuffle_f32x4(tmp3411, tmp3419, 136);
tmp3296 = _mm512_shuffle_f32x4(tmp3411, tmp3419, 221);
tmp3289 = _mm512_shuffle_f32x4(tmp3406, tmp3414, 136);
__m512 tmp3337 = _mm512_shuffle_f32x4(tmp3406, tmp3414, 221);
tmp3290 = _mm512_shuffle_f32x4(tmp3408, tmp3416, 136);
__m512 tmp3338 = _mm512_shuffle_f32x4(tmp3408, tmp3416, 221);
tmp3291 = _mm512_shuffle_f32x4(tmp3410, tmp3418, 136);
__m512 tmp3339 = _mm512_shuffle_f32x4(tmp3410, tmp3418, 221);
tmp3292 = _mm512_shuffle_f32x4(tmp3412, tmp3420, 136);
__m512 tmp3340 = _mm512_shuffle_f32x4(tmp3412, tmp3420, 221);
// Second 8 -> 6 linear-combination pass, again on two interleaved input sets:
//   x0..x7 = tmp3285..tmp3292                       -> out543..out548
//   x0..x7 = tmp3293..tmp3296, tmp3337..tmp3340     -> out549..out554
// With s1=x1+x2, d1=x1-x2, s2=x3+x4, d2=x3-x4, s3=x5+x6, d3=x5-x6:
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 =      d1 + 32*d2 +    d3 + x7
// Pairwise sums and differences (s1, s2, d2, d1, s3, d3) for both sets.
__m512 tmp3345 = _mm512_add_ps(tmp3286, tmp3287);
__m512 tmp3365 = _mm512_add_ps(tmp3294, tmp3295);
__m512 tmp3344 = _mm512_add_ps(tmp3288, tmp3289);
__m512 tmp3364 = _mm512_add_ps(tmp3296, tmp3337);
__m512 tmp3350 = _mm512_sub_ps(tmp3288, tmp3289);
__m512 tmp3370 = _mm512_sub_ps(tmp3296, tmp3337);
__m512 tmp3349 = _mm512_sub_ps(tmp3286, tmp3287);
__m512 tmp3369 = _mm512_sub_ps(tmp3294, tmp3295);
__m512 tmp3346 = _mm512_add_ps(tmp3290, tmp3291);
__m512 tmp3366 = _mm512_add_ps(tmp3338, tmp3339);
__m512 tmp3351 = _mm512_sub_ps(tmp3290, tmp3291);
__m512 tmp3371 = _mm512_sub_ps(tmp3338, tmp3339);
// Partial sums for the six outputs.
__m512 tmp3348 = _mm512_fmadd_ps(tmp3350, _mm512_set1_ps(2e+00f), tmp3349);
__m512 tmp3368 = _mm512_fmadd_ps(tmp3370, _mm512_set1_ps(2e+00f), tmp3369);
__m512 tmp3355 = _mm512_fmadd_ps(tmp3350, _mm512_set1_ps(8e+00f), tmp3349);
__m512 tmp3375 = _mm512_fmadd_ps(tmp3370, _mm512_set1_ps(8e+00f), tmp3369);
__m512 tmp3343 = _mm512_add_ps(tmp3344, tmp3345);
__m512 tmp3363 = _mm512_add_ps(tmp3364, tmp3365);
__m512 tmp3347 = _mm512_fmadd_ps(tmp3351, _mm512_set1_ps(1.6e+01f), tmp3348);
__m512 tmp3367 = _mm512_fmadd_ps(tmp3371, _mm512_set1_ps(1.6e+01f), tmp3368);
__m512 tmp3354 = _mm512_fmadd_ps(tmp3351, _mm512_set1_ps(4e+00f), tmp3355);
__m512 tmp3374 = _mm512_fmadd_ps(tmp3371, _mm512_set1_ps(4e+00f), tmp3375);
__m512 tmp3360 = _mm512_add_ps(tmp3351, tmp3349);
__m512 tmp3380 = _mm512_add_ps(tmp3371, tmp3369);
__m512 tmp3353 = _mm512_fmadd_ps(tmp3344, _mm512_set1_ps(4e+00f), tmp3345);
__m512 tmp3373 = _mm512_fmadd_ps(tmp3364, _mm512_set1_ps(4e+00f), tmp3365);
__m512 tmp3357 = _mm512_fmadd_ps(tmp3344, _mm512_set1_ps(1.6e+01f), tmp3345);
__m512 tmp3377 = _mm512_fmadd_ps(tmp3364, _mm512_set1_ps(1.6e+01f), tmp3365);
// x0 enters only y0; x7 enters only y5.
__m512 tmp3342 = _mm512_add_ps(tmp3343, tmp3285);
__m512 tmp3362 = _mm512_add_ps(tmp3363, tmp3293);
__m512 tmp3359 = _mm512_add_ps(tmp3360, tmp3292);
__m512 tmp3379 = _mm512_add_ps(tmp3380, tmp3340);
__m512 tmp3341 = _mm512_fmadd_ps(tmp3346, _mm512_set1_ps(3.2e+01f), tmp3342);
__m512 tmp3361 = _mm512_fmadd_ps(tmp3366, _mm512_set1_ps(3.2e+01f), tmp3362);
__m512 tmp3352 = _mm512_fmadd_ps(tmp3346, _mm512_set1_ps(8e+00f), tmp3353);
__m512 tmp3372 = _mm512_fmadd_ps(tmp3366, _mm512_set1_ps(8e+00f), tmp3373);
__m512 tmp3358 = _mm512_fmadd_ps(tmp3350, _mm512_set1_ps(3.2e+01f), tmp3359);
__m512 tmp3378 = _mm512_fmadd_ps(tmp3370, _mm512_set1_ps(3.2e+01f), tmp3379);
__m512 tmp3356 = _mm512_fmadd_ps(tmp3346, _mm512_set1_ps(2e+00f), tmp3357);
__m512 tmp3376 = _mm512_fmadd_ps(tmp3366, _mm512_set1_ps(2e+00f), tmp3377);
// Name the results: out543..out548 = y0..y5 of the first set,
// out549..out554 = y0..y5 of the second set.
__m512 out543 = tmp3341;
__m512 out549 = tmp3361;
__m512 out544 = tmp3347;
__m512 out550 = tmp3367;
__m512 out545 = tmp3352;
__m512 out551 = tmp3372;
__m512 out546 = tmp3354;
__m512 out552 = tmp3374;
__m512 out547 = tmp3356;
__m512 out553 = tmp3376;
__m512 out548 = tmp3358;
__m512 out554 = tmp3378;
// Clamp each of the twelve result vectors at zero, lane by lane: x = max(0, x).
out543 = _mm512_max_ps(_mm512_setzero_ps(), out543);
out549 = _mm512_max_ps(_mm512_setzero_ps(), out549);
out544 = _mm512_max_ps(_mm512_setzero_ps(), out544);
out550 = _mm512_max_ps(_mm512_setzero_ps(), out550);
out545 = _mm512_max_ps(_mm512_setzero_ps(), out545);
out551 = _mm512_max_ps(_mm512_setzero_ps(), out551);
out546 = _mm512_max_ps(_mm512_setzero_ps(), out546);
out552 = _mm512_max_ps(_mm512_setzero_ps(), out552);
out547 = _mm512_max_ps(_mm512_setzero_ps(), out547);
out553 = _mm512_max_ps(_mm512_setzero_ps(), out553);
out548 = _mm512_max_ps(_mm512_setzero_ps(), out548);
out554 = _mm512_max_ps(_mm512_setzero_ps(), out554);
// Masked stores into datPtr6.
//   out543..out548: constant offsets 1200, 1424, ..., 2320 (step 224), mask 4095 (0xFFF),
//                   so lanes 0..11 are written.
//   out549..out554: constant offsets 12608, 12832, ..., 13728 (step 224), mask 255 (0xFF),
//                   so only lanes 0..7 are written.
// NOTE(review): offsets look like byte offsets with a 224-byte row pitch; 12608 is
// half of the 25216 multiplier applied to l17, so the second group presumably
// targets the next output plane - confirm against datPtr6's layout, which is
// defined outside this view.
_mm512_mask_storeu_ps(datPtr6+1200+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out543);
_mm512_mask_storeu_ps(datPtr6+12608+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 255, out549);
_mm512_mask_storeu_ps(datPtr6+1424+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out544);
_mm512_mask_storeu_ps(datPtr6+12832+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 255, out550);
_mm512_mask_storeu_ps(datPtr6+1648+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out545);
_mm512_mask_storeu_ps(datPtr6+13056+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 255, out551);
_mm512_mask_storeu_ps(datPtr6+1872+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out546);
_mm512_mask_storeu_ps(datPtr6+13280+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 255, out552);
_mm512_mask_storeu_ps(datPtr6+2096+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out547);
_mm512_mask_storeu_ps(datPtr6+13504+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 255, out553);
_mm512_mask_storeu_ps(datPtr6+2320+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out548);
_mm512_mask_storeu_ps(datPtr6+13728+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 255, out554);
// Load sixteen 512-bit vectors from sfPtr5 and regroup their 256-bit halves.
// Four groups of loads are used, at constant offsets 512, 410112, 819712 and
// 1229312 (409600 apart); inside each group the loads sit at +0, +64, +128, +192.
// Regrouping rule (same for every pair a, b):
//   _mm512_shuffle_f32x4(a, b, 68)  -> [low 256 bits of a | low 256 bits of b]
//   _mm512_shuffle_f32x4(a, b, 238) -> [high 256 bits of a | high 256 bits of b]
// The +0/+128 pair yields in540..in547; the +64/+192 pair yields in548..in555.
// NOTE(review): offsets look like byte offsets (64 = sizeof(__m512)) - sfPtr5's
// type is declared outside this view, confirm.
__m512 sf177 = _mm512_loadu_ps(sfPtr5+512+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf178 = _mm512_loadu_ps(sfPtr5+640+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in540 = _mm512_shuffle_f32x4(sf177, sf178, 68);
__m512 in541 = _mm512_shuffle_f32x4(sf177, sf178, 238);
__m512 sf179 = _mm512_loadu_ps(sfPtr5+576+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf180 = _mm512_loadu_ps(sfPtr5+704+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in548 = _mm512_shuffle_f32x4(sf179, sf180, 68);
__m512 in549 = _mm512_shuffle_f32x4(sf179, sf180, 238);
// Group at offset 410112.
__m512 sf181 = _mm512_loadu_ps(sfPtr5+410112+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf182 = _mm512_loadu_ps(sfPtr5+410240+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in542 = _mm512_shuffle_f32x4(sf181, sf182, 68);
__m512 in543 = _mm512_shuffle_f32x4(sf181, sf182, 238);
__m512 sf183 = _mm512_loadu_ps(sfPtr5+410176+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf184 = _mm512_loadu_ps(sfPtr5+410304+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in550 = _mm512_shuffle_f32x4(sf183, sf184, 68);
__m512 in551 = _mm512_shuffle_f32x4(sf183, sf184, 238);
// Group at offset 819712.
__m512 sf185 = _mm512_loadu_ps(sfPtr5+819712+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf186 = _mm512_loadu_ps(sfPtr5+819840+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in544 = _mm512_shuffle_f32x4(sf185, sf186, 68);
__m512 in545 = _mm512_shuffle_f32x4(sf185, sf186, 238);
__m512 sf187 = _mm512_loadu_ps(sfPtr5+819776+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf188 = _mm512_loadu_ps(sfPtr5+819904+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in552 = _mm512_shuffle_f32x4(sf187, sf188, 68);
__m512 in553 = _mm512_shuffle_f32x4(sf187, sf188, 238);
// Group at offset 1229312.
__m512 sf189 = _mm512_loadu_ps(sfPtr5+1229312+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf190 = _mm512_loadu_ps(sfPtr5+1229440+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in546 = _mm512_shuffle_f32x4(sf189, sf190, 68);
__m512 in547 = _mm512_shuffle_f32x4(sf189, sf190, 238);
__m512 sf191 = _mm512_loadu_ps(sfPtr5+1229376+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 sf192 = _mm512_loadu_ps(sfPtr5+1229504+1638400*i18+24576*j13+1536*k65+768*l17);
__m512 in554 = _mm512_shuffle_f32x4(sf191, sf192, 68);
__m512 in555 = _mm512_shuffle_f32x4(sf191, sf192, 238);
// First 8 -> 6 linear-combination pass, done on two independent input sets whose
// statements are interleaved: x0..x7 = in540..in547 and x0..x7 = in548..in555.
// With s1=x1+x2, d1=x1-x2, s2=x3+x4, d2=x3-x4, s3=x5+x6, d3=x5-x6 the six results are
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 =      d1 + 32*d2 +    d3 + x7
// (evaluated as chains of fused multiply-adds, so rounding follows the chain order).
// NOTE(review): these coefficients look like a Winograd F(6,3)-style output
// transform - confirm against the generator.
// Pairwise sums and differences (s1, s2, d2, d1, s3, d3) for both sets.
__m512 tmp3437 = _mm512_add_ps(in541, in542);
__m512 tmp3457 = _mm512_add_ps(in549, in550);
__m512 tmp3436 = _mm512_add_ps(in543, in544);
__m512 tmp3456 = _mm512_add_ps(in551, in552);
__m512 tmp3442 = _mm512_sub_ps(in543, in544);
__m512 tmp3462 = _mm512_sub_ps(in551, in552);
__m512 tmp3441 = _mm512_sub_ps(in541, in542);
__m512 tmp3461 = _mm512_sub_ps(in549, in550);
__m512 tmp3438 = _mm512_add_ps(in545, in546);
__m512 tmp3458 = _mm512_add_ps(in553, in554);
__m512 tmp3443 = _mm512_sub_ps(in545, in546);
__m512 tmp3463 = _mm512_sub_ps(in553, in554);
// Partial sums for the six outputs.
__m512 tmp3440 = _mm512_fmadd_ps(tmp3442, _mm512_set1_ps(2e+00f), tmp3441);
__m512 tmp3460 = _mm512_fmadd_ps(tmp3462, _mm512_set1_ps(2e+00f), tmp3461);
__m512 tmp3447 = _mm512_fmadd_ps(tmp3442, _mm512_set1_ps(8e+00f), tmp3441);
__m512 tmp3467 = _mm512_fmadd_ps(tmp3462, _mm512_set1_ps(8e+00f), tmp3461);
__m512 tmp3435 = _mm512_add_ps(tmp3436, tmp3437);
__m512 tmp3455 = _mm512_add_ps(tmp3456, tmp3457);
__m512 tmp3439 = _mm512_fmadd_ps(tmp3443, _mm512_set1_ps(1.6e+01f), tmp3440);
__m512 tmp3459 = _mm512_fmadd_ps(tmp3463, _mm512_set1_ps(1.6e+01f), tmp3460);
__m512 tmp3446 = _mm512_fmadd_ps(tmp3443, _mm512_set1_ps(4e+00f), tmp3447);
__m512 tmp3466 = _mm512_fmadd_ps(tmp3463, _mm512_set1_ps(4e+00f), tmp3467);
__m512 tmp3452 = _mm512_add_ps(tmp3443, tmp3441);
__m512 tmp3472 = _mm512_add_ps(tmp3463, tmp3461);
__m512 tmp3445 = _mm512_fmadd_ps(tmp3436, _mm512_set1_ps(4e+00f), tmp3437);
__m512 tmp3465 = _mm512_fmadd_ps(tmp3456, _mm512_set1_ps(4e+00f), tmp3457);
__m512 tmp3449 = _mm512_fmadd_ps(tmp3436, _mm512_set1_ps(1.6e+01f), tmp3437);
__m512 tmp3469 = _mm512_fmadd_ps(tmp3456, _mm512_set1_ps(1.6e+01f), tmp3457);
// x0 enters only y0; x7 enters only y5.
__m512 tmp3434 = _mm512_add_ps(tmp3435, in540);
__m512 tmp3454 = _mm512_add_ps(tmp3455, in548);
__m512 tmp3451 = _mm512_add_ps(tmp3452, in547);
__m512 tmp3471 = _mm512_add_ps(tmp3472, in555);
__m512 tmp3433 = _mm512_fmadd_ps(tmp3438, _mm512_set1_ps(3.2e+01f), tmp3434);
__m512 tmp3453 = _mm512_fmadd_ps(tmp3458, _mm512_set1_ps(3.2e+01f), tmp3454);
__m512 tmp3444 = _mm512_fmadd_ps(tmp3438, _mm512_set1_ps(8e+00f), tmp3445);
__m512 tmp3464 = _mm512_fmadd_ps(tmp3458, _mm512_set1_ps(8e+00f), tmp3465);
__m512 tmp3450 = _mm512_fmadd_ps(tmp3442, _mm512_set1_ps(3.2e+01f), tmp3451);
__m512 tmp3470 = _mm512_fmadd_ps(tmp3462, _mm512_set1_ps(3.2e+01f), tmp3471);
__m512 tmp3448 = _mm512_fmadd_ps(tmp3438, _mm512_set1_ps(2e+00f), tmp3449);
__m512 tmp3468 = _mm512_fmadd_ps(tmp3458, _mm512_set1_ps(2e+00f), tmp3469);
// Collect results in order: tmp3421..tmp3426 = y0..y5 of the first set,
// tmp3427..tmp3432 = y0..y5 of the second set.
__m512 tmp3421 = tmp3433;
__m512 tmp3427 = tmp3453;
__m512 tmp3422 = tmp3439;
__m512 tmp3428 = tmp3459;
__m512 tmp3423 = tmp3444;
__m512 tmp3429 = tmp3464;
__m512 tmp3424 = tmp3446;
__m512 tmp3430 = tmp3466;
__m512 tmp3425 = tmp3448;
__m512 tmp3431 = tmp3468;
__m512 tmp3426 = tmp3450;
__m512 tmp3432 = tmp3470;
__m512 tmp3517 = _mm512_unpacklo_ps(tmp3421, tmp3422);
__m512 tmp3518 = _mm512_unpackhi_ps(tmp3421, tmp3422);
__m512 tmp3519 = _mm512_unpacklo_ps(tmp3423, tmp3424);
__m512 tmp3520 = _mm512_unpackhi_ps(tmp3423, tmp3424);
__m512 tmp3521 = _mm512_unpacklo_ps(tmp3425, tmp3426);
__m512 tmp3522 = _mm512_unpackhi_ps(tmp3425, tmp3426);
__m512 tmp3523 = _mm512_unpacklo_ps(tmp3427, tmp3428);
__m512 tmp3524 = _mm512_unpackhi_ps(tmp3427, tmp3428);
__m512 tmp3525 = _mm512_unpacklo_ps(tmp3429, tmp3430);
__m512 tmp3526 = _mm512_unpackhi_ps(tmp3429, tmp3430);
__m512 tmp3527 = _mm512_unpacklo_ps(tmp3431, tmp3432);
__m512 tmp3528 = _mm512_unpackhi_ps(tmp3431, tmp3432);
__m512 tmp3529 = _mm512_shuffle_ps(tmp3517, tmp3519, 68);
__m512 tmp3530 = _mm512_shuffle_ps(tmp3517, tmp3519, 238);
__m512 tmp3531 = _mm512_shuffle_ps(tmp3518, tmp3520, 68);
__m512 tmp3532 = _mm512_shuffle_ps(tmp3518, tmp3520, 238);
__m512 tmp3533 = _mm512_shuffle_ps(tmp3521, tmp3523, 68);
__m512 tmp3534 = _mm512_shuffle_ps(tmp3521, tmp3523, 238);
__m512 tmp3535 = _mm512_shuffle_ps(tmp3522, tmp3524, 68);
__m512 tmp3536 = _mm512_shuffle_ps(tmp3522, tmp3524, 238);
__m512 tmp3537 = _mm512_shuffle_ps(tmp3525, tmp3527, 68);
__m512 tmp3538 = _mm512_shuffle_ps(tmp3525, tmp3527, 238);
__m512 tmp3539 = _mm512_shuffle_ps(tmp3526, tmp3528, 68);
__m512 tmp3540 = _mm512_shuffle_ps(tmp3526, tmp3528, 238);
__m512 tmp3541 = _mm512_shuffle_f32x4(tmp3529, tmp3533, 136);
__m512 tmp3542 = _mm512_shuffle_f32x4(tmp3529, tmp3533, 221);
__m512 tmp3543 = _mm512_shuffle_f32x4(tmp3530, tmp3534, 136);
__m512 tmp3544 = _mm512_shuffle_f32x4(tmp3530, tmp3534, 221);
__m512 tmp3545 = _mm512_shuffle_f32x4(tmp3531, tmp3535, 136);
__m512 tmp3546 = _mm512_shuffle_f32x4(tmp3531, tmp3535, 221);
__m512 tmp3547 = _mm512_shuffle_f32x4(tmp3532, tmp3536, 136);
__m512 tmp3548 = _mm512_shuffle_f32x4(tmp3532, tmp3536, 221);
__m512 tmp3549 = _mm512_shuffle_f32x4(tmp3537, tmp3537, 136);
__m512 tmp3550 = _mm512_shuffle_f32x4(tmp3537, tmp3537, 221);
__m512 tmp3551 = _mm512_shuffle_f32x4(tmp3538, tmp3538, 136);
__m512 tmp3552 = _mm512_shuffle_f32x4(tmp3538, tmp3538, 221);
__m512 tmp3553 = _mm512_shuffle_f32x4(tmp3539, tmp3539, 136);
__m512 tmp3554 = _mm512_shuffle_f32x4(tmp3539, tmp3539, 221);
__m512 tmp3555 = _mm512_shuffle_f32x4(tmp3540, tmp3540, 136);
__m512 tmp3556 = _mm512_shuffle_f32x4(tmp3540, tmp3540, 221);
tmp3421 = _mm512_shuffle_f32x4(tmp3541, tmp3549, 136);
tmp3429 = _mm512_shuffle_f32x4(tmp3541, tmp3549, 221);
tmp3422 = _mm512_shuffle_f32x4(tmp3543, tmp3551, 136);
tmp3430 = _mm512_shuffle_f32x4(tmp3543, tmp3551, 221);
tmp3423 = _mm512_shuffle_f32x4(tmp3545, tmp3553, 136);
tmp3431 = _mm512_shuffle_f32x4(tmp3545, tmp3553, 221);
tmp3424 = _mm512_shuffle_f32x4(tmp3547, tmp3555, 136);
tmp3432 = _mm512_shuffle_f32x4(tmp3547, tmp3555, 221);
tmp3425 = _mm512_shuffle_f32x4(tmp3542, tmp3550, 136);
__m512 tmp3473 = _mm512_shuffle_f32x4(tmp3542, tmp3550, 221);
tmp3426 = _mm512_shuffle_f32x4(tmp3544, tmp3552, 136);
__m512 tmp3474 = _mm512_shuffle_f32x4(tmp3544, tmp3552, 221);
tmp3427 = _mm512_shuffle_f32x4(tmp3546, tmp3554, 136);
__m512 tmp3475 = _mm512_shuffle_f32x4(tmp3546, tmp3554, 221);
tmp3428 = _mm512_shuffle_f32x4(tmp3548, tmp3556, 136);
__m512 tmp3476 = _mm512_shuffle_f32x4(tmp3548, tmp3556, 221);
__m512 tmp3481 = _mm512_add_ps(tmp3422, tmp3423);
__m512 tmp3501 = _mm512_add_ps(tmp3430, tmp3431);
__m512 tmp3480 = _mm512_add_ps(tmp3424, tmp3425);
__m512 tmp3500 = _mm512_add_ps(tmp3432, tmp3473);
__m512 tmp3486 = _mm512_sub_ps(tmp3424, tmp3425);
__m512 tmp3506 = _mm512_sub_ps(tmp3432, tmp3473);
__m512 tmp3485 = _mm512_sub_ps(tmp3422, tmp3423);
__m512 tmp3505 = _mm512_sub_ps(tmp3430, tmp3431);
__m512 tmp3482 = _mm512_add_ps(tmp3426, tmp3427);
__m512 tmp3502 = _mm512_add_ps(tmp3474, tmp3475);
__m512 tmp3487 = _mm512_sub_ps(tmp3426, tmp3427);
__m512 tmp3507 = _mm512_sub_ps(tmp3474, tmp3475);
__m512 tmp3484 = _mm512_fmadd_ps(tmp3486, _mm512_set1_ps(2e+00f), tmp3485);
__m512 tmp3504 = _mm512_fmadd_ps(tmp3506, _mm512_set1_ps(2e+00f), tmp3505);
__m512 tmp3491 = _mm512_fmadd_ps(tmp3486, _mm512_set1_ps(8e+00f), tmp3485);
__m512 tmp3511 = _mm512_fmadd_ps(tmp3506, _mm512_set1_ps(8e+00f), tmp3505);
__m512 tmp3479 = _mm512_add_ps(tmp3480, tmp3481);
__m512 tmp3499 = _mm512_add_ps(tmp3500, tmp3501);
__m512 tmp3483 = _mm512_fmadd_ps(tmp3487, _mm512_set1_ps(1.6e+01f), tmp3484);
__m512 tmp3503 = _mm512_fmadd_ps(tmp3507, _mm512_set1_ps(1.6e+01f), tmp3504);
__m512 tmp3490 = _mm512_fmadd_ps(tmp3487, _mm512_set1_ps(4e+00f), tmp3491);
__m512 tmp3510 = _mm512_fmadd_ps(tmp3507, _mm512_set1_ps(4e+00f), tmp3511);
__m512 tmp3496 = _mm512_add_ps(tmp3487, tmp3485);
__m512 tmp3516 = _mm512_add_ps(tmp3507, tmp3505);
__m512 tmp3489 = _mm512_fmadd_ps(tmp3480, _mm512_set1_ps(4e+00f), tmp3481);
__m512 tmp3509 = _mm512_fmadd_ps(tmp3500, _mm512_set1_ps(4e+00f), tmp3501);
__m512 tmp3493 = _mm512_fmadd_ps(tmp3480, _mm512_set1_ps(1.6e+01f), tmp3481);
__m512 tmp3513 = _mm512_fmadd_ps(tmp3500, _mm512_set1_ps(1.6e+01f), tmp3501);
__m512 tmp3478 = _mm512_add_ps(tmp3479, tmp3421);
__m512 tmp3498 = _mm512_add_ps(tmp3499, tmp3429);
__m512 tmp3495 = _mm512_add_ps(tmp3496, tmp3428);
__m512 tmp3515 = _mm512_add_ps(tmp3516, tmp3476);
__m512 tmp3477 = _mm512_fmadd_ps(tmp3482, _mm512_set1_ps(3.2e+01f), tmp3478);
__m512 tmp3497 = _mm512_fmadd_ps(tmp3502, _mm512_set1_ps(3.2e+01f), tmp3498);
__m512 tmp3488 = _mm512_fmadd_ps(tmp3482, _mm512_set1_ps(8e+00f), tmp3489);
__m512 tmp3508 = _mm512_fmadd_ps(tmp3502, _mm512_set1_ps(8e+00f), tmp3509);
__m512 tmp3494 = _mm512_fmadd_ps(tmp3486, _mm512_set1_ps(3.2e+01f), tmp3495);
__m512 tmp3514 = _mm512_fmadd_ps(tmp3506, _mm512_set1_ps(3.2e+01f), tmp3515);
__m512 tmp3492 = _mm512_fmadd_ps(tmp3482, _mm512_set1_ps(2e+00f), tmp3493);
__m512 tmp3512 = _mm512_fmadd_ps(tmp3502, _mm512_set1_ps(2e+00f), tmp3513);
__m512 out555 = tmp3477;
__m512 out561 = tmp3497;
__m512 out556 = tmp3483;
__m512 out562 = tmp3503;
__m512 out557 = tmp3488;
__m512 out563 = tmp3508;
__m512 out558 = tmp3490;
__m512 out564 = tmp3510;
__m512 out559 = tmp3492;
__m512 out565 = tmp3512;
__m512 out560 = tmp3494;
__m512 out566 = tmp3514;
out555 = _mm512_max_ps(_mm512_setzero_ps(), out555);
out561 = _mm512_max_ps(_mm512_setzero_ps(), out561);
out556 = _mm512_max_ps(_mm512_setzero_ps(), out556);
out562 = _mm512_max_ps(_mm512_setzero_ps(), out562);
out557 = _mm512_max_ps(_mm512_setzero_ps(), out557);
out563 = _mm512_max_ps(_mm512_setzero_ps(), out563);
out558 = _mm512_max_ps(_mm512_setzero_ps(), out558);
out564 = _mm512_max_ps(_mm512_setzero_ps(), out564);
out559 = _mm512_max_ps(_mm512_setzero_ps(), out559);
out565 = _mm512_max_ps(_mm512_setzero_ps(), out565);
out560 = _mm512_max_ps(_mm512_setzero_ps(), out560);
out566 = _mm512_max_ps(_mm512_setzero_ps(), out566);
_mm512_mask_storeu_ps(datPtr6+13760+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out555);
_mm512_mask_storeu_ps(datPtr6+13808+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out561);
_mm512_mask_storeu_ps(datPtr6+13984+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out556);
_mm512_mask_storeu_ps(datPtr6+14032+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out562);
_mm512_mask_storeu_ps(datPtr6+14208+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out557);
_mm512_mask_storeu_ps(datPtr6+14256+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out563);
_mm512_mask_storeu_ps(datPtr6+14432+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out558);
_mm512_mask_storeu_ps(datPtr6+14480+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out564);
_mm512_mask_storeu_ps(datPtr6+14656+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out559);
_mm512_mask_storeu_ps(datPtr6+14704+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out565);
_mm512_mask_storeu_ps(datPtr6+14880+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out560);
_mm512_mask_storeu_ps(datPtr6+14928+806912*i18+224*toH23+4*toW23+50432*k65+25216*l17, 4095, out566);
}
}
if (j13 >= last4) return;
++j13;
rel11 = 2;
}
if (rel11 < 3) {
ptrdiff_t toH24 = base11+6;
ptrdiff_t toW24 = 24;
ptrdiff_t k66 = 16*w33;
for (; k66 != 16; ++k66) {
ptrdiff_t l18 = 0;
for (; l18 != 2; ++l18) {
// Tile step (loads sf193..sf208, stores out567..out578).
// Reads 16 vectors through sfPtr5, applies an 8-input -> 6-output linear
// combination first across registers and then (after a transpose) across
// lanes, clamps the results at zero, and writes twelve 12-lane rows through
// datPtr6.
// NOTE(review): the 1/2/4/8/16/32 coefficient pattern looks like a
// Winograd-style output transform -- confirm against the generator.
//
// Load phase. Every pair of loads is recombined with _mm512_shuffle_f32x4:
// immediate 68 keeps the low 256 bits of both operands, immediate 238 keeps
// the high 256 bits of both. This produces eight inputs for stream A
// (in556..in563) and eight inputs for stream B (in564..in571).
__m512 sf193 = _mm512_loadu_ps(sfPtr5+0+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf194 = _mm512_loadu_ps(sfPtr5+128+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in556 = _mm512_shuffle_f32x4(sf193, sf194, 68);
__m512 in557 = _mm512_shuffle_f32x4(sf193, sf194, 238);
__m512 sf195 = _mm512_loadu_ps(sfPtr5+64+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf196 = _mm512_loadu_ps(sfPtr5+192+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in564 = _mm512_shuffle_f32x4(sf195, sf196, 68);
__m512 in565 = _mm512_shuffle_f32x4(sf195, sf196, 238);
__m512 sf197 = _mm512_loadu_ps(sfPtr5+409600+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf198 = _mm512_loadu_ps(sfPtr5+409728+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in558 = _mm512_shuffle_f32x4(sf197, sf198, 68);
__m512 in559 = _mm512_shuffle_f32x4(sf197, sf198, 238);
__m512 sf199 = _mm512_loadu_ps(sfPtr5+409664+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf200 = _mm512_loadu_ps(sfPtr5+409792+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in566 = _mm512_shuffle_f32x4(sf199, sf200, 68);
__m512 in567 = _mm512_shuffle_f32x4(sf199, sf200, 238);
__m512 sf201 = _mm512_loadu_ps(sfPtr5+819200+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf202 = _mm512_loadu_ps(sfPtr5+819328+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in560 = _mm512_shuffle_f32x4(sf201, sf202, 68);
__m512 in561 = _mm512_shuffle_f32x4(sf201, sf202, 238);
__m512 sf203 = _mm512_loadu_ps(sfPtr5+819264+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf204 = _mm512_loadu_ps(sfPtr5+819392+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in568 = _mm512_shuffle_f32x4(sf203, sf204, 68);
__m512 in569 = _mm512_shuffle_f32x4(sf203, sf204, 238);
__m512 sf205 = _mm512_loadu_ps(sfPtr5+1228800+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf206 = _mm512_loadu_ps(sfPtr5+1228928+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in562 = _mm512_shuffle_f32x4(sf205, sf206, 68);
__m512 in563 = _mm512_shuffle_f32x4(sf205, sf206, 238);
__m512 sf207 = _mm512_loadu_ps(sfPtr5+1228864+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf208 = _mm512_loadu_ps(sfPtr5+1228992+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in570 = _mm512_shuffle_f32x4(sf207, sf208, 68);
__m512 in571 = _mm512_shuffle_f32x4(sf207, sf208, 238);
// First pass. The same combination is evaluated for stream A and stream B,
// with the two streams' statements interleaved line by line. Writing
// x0..x7 for one stream's inputs:
//   s1 = x1+x2   s2 = x3+x4   s3 = x5+x6
//   d1 = x1-x2   d2 = x3-x4   d3 = x5-x6
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 = x7 + d1 + 32*d2 +    d3
// Each y is built as a chain of adds/FMAs, so the evaluation order below
// determines the floating-point rounding.
__m512 tmp3573 = _mm512_add_ps(in557, in558);
__m512 tmp3593 = _mm512_add_ps(in565, in566);
__m512 tmp3572 = _mm512_add_ps(in559, in560);
__m512 tmp3592 = _mm512_add_ps(in567, in568);
__m512 tmp3578 = _mm512_sub_ps(in559, in560);
__m512 tmp3598 = _mm512_sub_ps(in567, in568);
__m512 tmp3577 = _mm512_sub_ps(in557, in558);
__m512 tmp3597 = _mm512_sub_ps(in565, in566);
__m512 tmp3574 = _mm512_add_ps(in561, in562);
__m512 tmp3594 = _mm512_add_ps(in569, in570);
__m512 tmp3579 = _mm512_sub_ps(in561, in562);
__m512 tmp3599 = _mm512_sub_ps(in569, in570);
__m512 tmp3576 = _mm512_fmadd_ps(tmp3578, _mm512_set1_ps(2e+00f), tmp3577);
__m512 tmp3596 = _mm512_fmadd_ps(tmp3598, _mm512_set1_ps(2e+00f), tmp3597);
__m512 tmp3583 = _mm512_fmadd_ps(tmp3578, _mm512_set1_ps(8e+00f), tmp3577);
__m512 tmp3603 = _mm512_fmadd_ps(tmp3598, _mm512_set1_ps(8e+00f), tmp3597);
__m512 tmp3571 = _mm512_add_ps(tmp3572, tmp3573);
__m512 tmp3591 = _mm512_add_ps(tmp3592, tmp3593);
__m512 tmp3575 = _mm512_fmadd_ps(tmp3579, _mm512_set1_ps(1.6e+01f), tmp3576);
__m512 tmp3595 = _mm512_fmadd_ps(tmp3599, _mm512_set1_ps(1.6e+01f), tmp3596);
__m512 tmp3582 = _mm512_fmadd_ps(tmp3579, _mm512_set1_ps(4e+00f), tmp3583);
__m512 tmp3602 = _mm512_fmadd_ps(tmp3599, _mm512_set1_ps(4e+00f), tmp3603);
__m512 tmp3588 = _mm512_add_ps(tmp3579, tmp3577);
__m512 tmp3608 = _mm512_add_ps(tmp3599, tmp3597);
__m512 tmp3581 = _mm512_fmadd_ps(tmp3572, _mm512_set1_ps(4e+00f), tmp3573);
__m512 tmp3601 = _mm512_fmadd_ps(tmp3592, _mm512_set1_ps(4e+00f), tmp3593);
__m512 tmp3585 = _mm512_fmadd_ps(tmp3572, _mm512_set1_ps(1.6e+01f), tmp3573);
__m512 tmp3605 = _mm512_fmadd_ps(tmp3592, _mm512_set1_ps(1.6e+01f), tmp3593);
__m512 tmp3570 = _mm512_add_ps(tmp3571, in556);
__m512 tmp3590 = _mm512_add_ps(tmp3591, in564);
__m512 tmp3587 = _mm512_add_ps(tmp3588, in563);
__m512 tmp3607 = _mm512_add_ps(tmp3608, in571);
__m512 tmp3569 = _mm512_fmadd_ps(tmp3574, _mm512_set1_ps(3.2e+01f), tmp3570);
__m512 tmp3589 = _mm512_fmadd_ps(tmp3594, _mm512_set1_ps(3.2e+01f), tmp3590);
__m512 tmp3580 = _mm512_fmadd_ps(tmp3574, _mm512_set1_ps(8e+00f), tmp3581);
__m512 tmp3600 = _mm512_fmadd_ps(tmp3594, _mm512_set1_ps(8e+00f), tmp3601);
__m512 tmp3586 = _mm512_fmadd_ps(tmp3578, _mm512_set1_ps(3.2e+01f), tmp3587);
__m512 tmp3606 = _mm512_fmadd_ps(tmp3598, _mm512_set1_ps(3.2e+01f), tmp3607);
__m512 tmp3584 = _mm512_fmadd_ps(tmp3574, _mm512_set1_ps(2e+00f), tmp3585);
__m512 tmp3604 = _mm512_fmadd_ps(tmp3594, _mm512_set1_ps(2e+00f), tmp3605);
// Gather the twelve first-pass rows under consecutive names:
// tmp3557..tmp3562 = stream A y0..y5, tmp3563..tmp3568 = stream B y0..y5.
__m512 tmp3557 = tmp3569;
__m512 tmp3563 = tmp3589;
__m512 tmp3558 = tmp3575;
__m512 tmp3564 = tmp3595;
__m512 tmp3559 = tmp3580;
__m512 tmp3565 = tmp3600;
__m512 tmp3560 = tmp3582;
__m512 tmp3566 = tmp3602;
__m512 tmp3561 = tmp3584;
__m512 tmp3567 = tmp3604;
__m512 tmp3562 = tmp3586;
__m512 tmp3568 = tmp3606;
// Transpose the 12 rows x 16 lanes. Stage 1 (unpacklo/unpackhi) interleaves
// neighbouring rows within each 128-bit lane.
__m512 tmp3653 = _mm512_unpacklo_ps(tmp3557, tmp3558);
__m512 tmp3654 = _mm512_unpackhi_ps(tmp3557, tmp3558);
__m512 tmp3655 = _mm512_unpacklo_ps(tmp3559, tmp3560);
__m512 tmp3656 = _mm512_unpackhi_ps(tmp3559, tmp3560);
__m512 tmp3657 = _mm512_unpacklo_ps(tmp3561, tmp3562);
__m512 tmp3658 = _mm512_unpackhi_ps(tmp3561, tmp3562);
__m512 tmp3659 = _mm512_unpacklo_ps(tmp3563, tmp3564);
__m512 tmp3660 = _mm512_unpackhi_ps(tmp3563, tmp3564);
__m512 tmp3661 = _mm512_unpacklo_ps(tmp3565, tmp3566);
__m512 tmp3662 = _mm512_unpackhi_ps(tmp3565, tmp3566);
__m512 tmp3663 = _mm512_unpacklo_ps(tmp3567, tmp3568);
__m512 tmp3664 = _mm512_unpackhi_ps(tmp3567, tmp3568);
// Stage 2 (_mm512_shuffle_ps, immediates 68 / 238) groups four rows' values
// for one column residue (mod 4) into each 128-bit lane.
__m512 tmp3665 = _mm512_shuffle_ps(tmp3653, tmp3655, 68);
__m512 tmp3666 = _mm512_shuffle_ps(tmp3653, tmp3655, 238);
__m512 tmp3667 = _mm512_shuffle_ps(tmp3654, tmp3656, 68);
__m512 tmp3668 = _mm512_shuffle_ps(tmp3654, tmp3656, 238);
__m512 tmp3669 = _mm512_shuffle_ps(tmp3657, tmp3659, 68);
__m512 tmp3670 = _mm512_shuffle_ps(tmp3657, tmp3659, 238);
__m512 tmp3671 = _mm512_shuffle_ps(tmp3658, tmp3660, 68);
__m512 tmp3672 = _mm512_shuffle_ps(tmp3658, tmp3660, 238);
__m512 tmp3673 = _mm512_shuffle_ps(tmp3661, tmp3663, 68);
__m512 tmp3674 = _mm512_shuffle_ps(tmp3661, tmp3663, 238);
__m512 tmp3675 = _mm512_shuffle_ps(tmp3662, tmp3664, 68);
__m512 tmp3676 = _mm512_shuffle_ps(tmp3662, tmp3664, 238);
// Stage 3 (_mm512_shuffle_f32x4, immediate 136 = even 128-bit lanes,
// 221 = odd 128-bit lanes). The last row group (rows 8..11) has no partner,
// so it is shuffled with itself.
__m512 tmp3677 = _mm512_shuffle_f32x4(tmp3665, tmp3669, 136);
__m512 tmp3678 = _mm512_shuffle_f32x4(tmp3665, tmp3669, 221);
__m512 tmp3679 = _mm512_shuffle_f32x4(tmp3666, tmp3670, 136);
__m512 tmp3680 = _mm512_shuffle_f32x4(tmp3666, tmp3670, 221);
__m512 tmp3681 = _mm512_shuffle_f32x4(tmp3667, tmp3671, 136);
__m512 tmp3682 = _mm512_shuffle_f32x4(tmp3667, tmp3671, 221);
__m512 tmp3683 = _mm512_shuffle_f32x4(tmp3668, tmp3672, 136);
__m512 tmp3684 = _mm512_shuffle_f32x4(tmp3668, tmp3672, 221);
__m512 tmp3685 = _mm512_shuffle_f32x4(tmp3673, tmp3673, 136);
__m512 tmp3686 = _mm512_shuffle_f32x4(tmp3673, tmp3673, 221);
__m512 tmp3687 = _mm512_shuffle_f32x4(tmp3674, tmp3674, 136);
__m512 tmp3688 = _mm512_shuffle_f32x4(tmp3674, tmp3674, 221);
__m512 tmp3689 = _mm512_shuffle_f32x4(tmp3675, tmp3675, 136);
__m512 tmp3690 = _mm512_shuffle_f32x4(tmp3675, tmp3675, 221);
__m512 tmp3691 = _mm512_shuffle_f32x4(tmp3676, tmp3676, 136);
__m512 tmp3692 = _mm512_shuffle_f32x4(tmp3676, tmp3676, 221);
// Stage 4 finishes the transpose. Each result vector now holds one original
// lane index ("column") of all twelve rows in its lanes 0..11; lanes 12..15
// repeat lanes 8..11 and are never stored.
//   tmp3557..tmp3564 = columns 0..7
//   tmp3565..tmp3568 = columns 8..11
//   tmp3609..tmp3612 = columns 12..15
__m512 tmp3557 = _mm512_shuffle_f32x4(tmp3677, tmp3685, 136);
tmp3565 = _mm512_shuffle_f32x4(tmp3677, tmp3685, 221);
tmp3558 = _mm512_shuffle_f32x4(tmp3679, tmp3687, 136);
tmp3566 = _mm512_shuffle_f32x4(tmp3679, tmp3687, 221);
tmp3559 = _mm512_shuffle_f32x4(tmp3681, tmp3689, 136);
tmp3567 = _mm512_shuffle_f32x4(tmp3681, tmp3689, 221);
tmp3560 = _mm512_shuffle_f32x4(tmp3683, tmp3691, 136);
tmp3568 = _mm512_shuffle_f32x4(tmp3683, tmp3691, 221);
tmp3561 = _mm512_shuffle_f32x4(tmp3678, tmp3686, 136);
__m512 tmp3609 = _mm512_shuffle_f32x4(tmp3678, tmp3686, 221);
tmp3562 = _mm512_shuffle_f32x4(tmp3680, tmp3688, 136);
__m512 tmp3610 = _mm512_shuffle_f32x4(tmp3680, tmp3688, 221);
tmp3563 = _mm512_shuffle_f32x4(tmp3682, tmp3690, 136);
__m512 tmp3611 = _mm512_shuffle_f32x4(tmp3682, tmp3690, 221);
tmp3564 = _mm512_shuffle_f32x4(tmp3684, tmp3692, 136);
__m512 tmp3612 = _mm512_shuffle_f32x4(tmp3684, tmp3692, 221);
// Second pass: the same y0..y5 formulas as in the first pass, now taking
// columns 0..7 as x0..x7 (first statement of each pair) and columns 8..15 as
// x0..x7 (second statement of each pair).
__m512 tmp3617 = _mm512_add_ps(tmp3558, tmp3559);
__m512 tmp3637 = _mm512_add_ps(tmp3566, tmp3567);
__m512 tmp3616 = _mm512_add_ps(tmp3560, tmp3561);
__m512 tmp3636 = _mm512_add_ps(tmp3568, tmp3609);
__m512 tmp3622 = _mm512_sub_ps(tmp3560, tmp3561);
__m512 tmp3642 = _mm512_sub_ps(tmp3568, tmp3609);
__m512 tmp3621 = _mm512_sub_ps(tmp3558, tmp3559);
__m512 tmp3641 = _mm512_sub_ps(tmp3566, tmp3567);
__m512 tmp3618 = _mm512_add_ps(tmp3562, tmp3563);
__m512 tmp3638 = _mm512_add_ps(tmp3610, tmp3611);
__m512 tmp3623 = _mm512_sub_ps(tmp3562, tmp3563);
__m512 tmp3643 = _mm512_sub_ps(tmp3610, tmp3611);
__m512 tmp3620 = _mm512_fmadd_ps(tmp3622, _mm512_set1_ps(2e+00f), tmp3621);
__m512 tmp3640 = _mm512_fmadd_ps(tmp3642, _mm512_set1_ps(2e+00f), tmp3641);
__m512 tmp3627 = _mm512_fmadd_ps(tmp3622, _mm512_set1_ps(8e+00f), tmp3621);
__m512 tmp3647 = _mm512_fmadd_ps(tmp3642, _mm512_set1_ps(8e+00f), tmp3641);
__m512 tmp3615 = _mm512_add_ps(tmp3616, tmp3617);
__m512 tmp3635 = _mm512_add_ps(tmp3636, tmp3637);
__m512 tmp3619 = _mm512_fmadd_ps(tmp3623, _mm512_set1_ps(1.6e+01f), tmp3620);
__m512 tmp3639 = _mm512_fmadd_ps(tmp3643, _mm512_set1_ps(1.6e+01f), tmp3640);
__m512 tmp3626 = _mm512_fmadd_ps(tmp3623, _mm512_set1_ps(4e+00f), tmp3627);
__m512 tmp3646 = _mm512_fmadd_ps(tmp3643, _mm512_set1_ps(4e+00f), tmp3647);
__m512 tmp3632 = _mm512_add_ps(tmp3623, tmp3621);
__m512 tmp3652 = _mm512_add_ps(tmp3643, tmp3641);
__m512 tmp3625 = _mm512_fmadd_ps(tmp3616, _mm512_set1_ps(4e+00f), tmp3617);
__m512 tmp3645 = _mm512_fmadd_ps(tmp3636, _mm512_set1_ps(4e+00f), tmp3637);
__m512 tmp3629 = _mm512_fmadd_ps(tmp3616, _mm512_set1_ps(1.6e+01f), tmp3617);
__m512 tmp3649 = _mm512_fmadd_ps(tmp3636, _mm512_set1_ps(1.6e+01f), tmp3637);
__m512 tmp3614 = _mm512_add_ps(tmp3615, tmp3557);
__m512 tmp3634 = _mm512_add_ps(tmp3635, tmp3565);
__m512 tmp3631 = _mm512_add_ps(tmp3632, tmp3564);
__m512 tmp3651 = _mm512_add_ps(tmp3652, tmp3612);
__m512 tmp3613 = _mm512_fmadd_ps(tmp3618, _mm512_set1_ps(3.2e+01f), tmp3614);
__m512 tmp3633 = _mm512_fmadd_ps(tmp3638, _mm512_set1_ps(3.2e+01f), tmp3634);
__m512 tmp3624 = _mm512_fmadd_ps(tmp3618, _mm512_set1_ps(8e+00f), tmp3625);
__m512 tmp3644 = _mm512_fmadd_ps(tmp3638, _mm512_set1_ps(8e+00f), tmp3645);
__m512 tmp3630 = _mm512_fmadd_ps(tmp3622, _mm512_set1_ps(3.2e+01f), tmp3631);
__m512 tmp3650 = _mm512_fmadd_ps(tmp3642, _mm512_set1_ps(3.2e+01f), tmp3651);
__m512 tmp3628 = _mm512_fmadd_ps(tmp3618, _mm512_set1_ps(2e+00f), tmp3629);
__m512 tmp3648 = _mm512_fmadd_ps(tmp3638, _mm512_set1_ps(2e+00f), tmp3649);
// Name the results: out567..out572 = y0..y5 from columns 0..7,
// out573..out578 = y0..y5 from columns 8..15.
__m512 out567 = tmp3613;
__m512 out573 = tmp3633;
__m512 out568 = tmp3619;
__m512 out574 = tmp3639;
__m512 out569 = tmp3624;
__m512 out575 = tmp3644;
__m512 out570 = tmp3626;
__m512 out576 = tmp3646;
__m512 out571 = tmp3628;
__m512 out577 = tmp3648;
__m512 out572 = tmp3630;
__m512 out578 = tmp3650;
// ReLU: lane-wise max against zero.
out567 = _mm512_max_ps(_mm512_setzero_ps(), out567);
out573 = _mm512_max_ps(_mm512_setzero_ps(), out573);
out568 = _mm512_max_ps(_mm512_setzero_ps(), out568);
out574 = _mm512_max_ps(_mm512_setzero_ps(), out574);
out569 = _mm512_max_ps(_mm512_setzero_ps(), out569);
out575 = _mm512_max_ps(_mm512_setzero_ps(), out575);
out570 = _mm512_max_ps(_mm512_setzero_ps(), out570);
out576 = _mm512_max_ps(_mm512_setzero_ps(), out576);
out571 = _mm512_max_ps(_mm512_setzero_ps(), out571);
out577 = _mm512_max_ps(_mm512_setzero_ps(), out577);
out572 = _mm512_max_ps(_mm512_setzero_ps(), out572);
out578 = _mm512_max_ps(_mm512_setzero_ps(), out578);
// Store phase. Mask 4095 (0x0FFF) writes lanes 0..11 only, so the duplicated
// lanes 12..15 are dropped. The six results of each group go to offsets
// 0, 224, 448, 672, 896, 1120; the second group is written 48 further on.
// NOTE(review): 224 / 48 read as a 56-float row stride and a 12-float step
// if datPtr6 is a byte pointer -- its declaration is outside this excerpt.
_mm512_mask_storeu_ps(datPtr6+0+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out567);
_mm512_mask_storeu_ps(datPtr6+48+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out573);
_mm512_mask_storeu_ps(datPtr6+224+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out568);
_mm512_mask_storeu_ps(datPtr6+272+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out574);
_mm512_mask_storeu_ps(datPtr6+448+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out569);
_mm512_mask_storeu_ps(datPtr6+496+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out575);
_mm512_mask_storeu_ps(datPtr6+672+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out570);
_mm512_mask_storeu_ps(datPtr6+720+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out576);
_mm512_mask_storeu_ps(datPtr6+896+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out571);
_mm512_mask_storeu_ps(datPtr6+944+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out577);
_mm512_mask_storeu_ps(datPtr6+1120+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out572);
_mm512_mask_storeu_ps(datPtr6+1168+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out578);
// Tile step (loads sf209..sf224, stores out579..out590).
// Reads 16 vectors through sfPtr5 (offsets 256.. within each of the four
// source groups), applies an 8-input -> 6-output linear combination first
// across registers and then (after a transpose) across lanes, clamps the
// results at zero, and writes twelve rows through datPtr6.
// NOTE(review): the 1/2/4/8/16/32 coefficient pattern looks like a
// Winograd-style output transform -- confirm against the generator.
//
// Load phase. Every pair of loads is recombined with _mm512_shuffle_f32x4:
// immediate 68 keeps the low 256 bits of both operands, immediate 238 keeps
// the high 256 bits of both. This produces eight inputs for stream A
// (in572..in579) and eight inputs for stream B (in580..in587).
__m512 sf209 = _mm512_loadu_ps(sfPtr5+256+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf210 = _mm512_loadu_ps(sfPtr5+384+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in572 = _mm512_shuffle_f32x4(sf209, sf210, 68);
__m512 in573 = _mm512_shuffle_f32x4(sf209, sf210, 238);
__m512 sf211 = _mm512_loadu_ps(sfPtr5+320+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf212 = _mm512_loadu_ps(sfPtr5+448+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in580 = _mm512_shuffle_f32x4(sf211, sf212, 68);
__m512 in581 = _mm512_shuffle_f32x4(sf211, sf212, 238);
__m512 sf213 = _mm512_loadu_ps(sfPtr5+409856+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf214 = _mm512_loadu_ps(sfPtr5+409984+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in574 = _mm512_shuffle_f32x4(sf213, sf214, 68);
__m512 in575 = _mm512_shuffle_f32x4(sf213, sf214, 238);
__m512 sf215 = _mm512_loadu_ps(sfPtr5+409920+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf216 = _mm512_loadu_ps(sfPtr5+410048+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in582 = _mm512_shuffle_f32x4(sf215, sf216, 68);
__m512 in583 = _mm512_shuffle_f32x4(sf215, sf216, 238);
__m512 sf217 = _mm512_loadu_ps(sfPtr5+819456+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf218 = _mm512_loadu_ps(sfPtr5+819584+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in576 = _mm512_shuffle_f32x4(sf217, sf218, 68);
__m512 in577 = _mm512_shuffle_f32x4(sf217, sf218, 238);
__m512 sf219 = _mm512_loadu_ps(sfPtr5+819520+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf220 = _mm512_loadu_ps(sfPtr5+819648+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in584 = _mm512_shuffle_f32x4(sf219, sf220, 68);
__m512 in585 = _mm512_shuffle_f32x4(sf219, sf220, 238);
__m512 sf221 = _mm512_loadu_ps(sfPtr5+1229056+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf222 = _mm512_loadu_ps(sfPtr5+1229184+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in578 = _mm512_shuffle_f32x4(sf221, sf222, 68);
__m512 in579 = _mm512_shuffle_f32x4(sf221, sf222, 238);
__m512 sf223 = _mm512_loadu_ps(sfPtr5+1229120+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf224 = _mm512_loadu_ps(sfPtr5+1229248+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in586 = _mm512_shuffle_f32x4(sf223, sf224, 68);
__m512 in587 = _mm512_shuffle_f32x4(sf223, sf224, 238);
// First pass. The same combination is evaluated for stream A and stream B,
// with the two streams' statements interleaved line by line. Writing
// x0..x7 for one stream's inputs:
//   s1 = x1+x2   s2 = x3+x4   s3 = x5+x6
//   d1 = x1-x2   d2 = x3-x4   d3 = x5-x6
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 = x7 + d1 + 32*d2 +    d3
// Each y is built as a chain of adds/FMAs, so the evaluation order below
// determines the floating-point rounding.
__m512 tmp3709 = _mm512_add_ps(in573, in574);
__m512 tmp3729 = _mm512_add_ps(in581, in582);
__m512 tmp3708 = _mm512_add_ps(in575, in576);
__m512 tmp3728 = _mm512_add_ps(in583, in584);
__m512 tmp3714 = _mm512_sub_ps(in575, in576);
__m512 tmp3734 = _mm512_sub_ps(in583, in584);
__m512 tmp3713 = _mm512_sub_ps(in573, in574);
__m512 tmp3733 = _mm512_sub_ps(in581, in582);
__m512 tmp3710 = _mm512_add_ps(in577, in578);
__m512 tmp3730 = _mm512_add_ps(in585, in586);
__m512 tmp3715 = _mm512_sub_ps(in577, in578);
__m512 tmp3735 = _mm512_sub_ps(in585, in586);
__m512 tmp3712 = _mm512_fmadd_ps(tmp3714, _mm512_set1_ps(2e+00f), tmp3713);
__m512 tmp3732 = _mm512_fmadd_ps(tmp3734, _mm512_set1_ps(2e+00f), tmp3733);
__m512 tmp3719 = _mm512_fmadd_ps(tmp3714, _mm512_set1_ps(8e+00f), tmp3713);
__m512 tmp3739 = _mm512_fmadd_ps(tmp3734, _mm512_set1_ps(8e+00f), tmp3733);
__m512 tmp3707 = _mm512_add_ps(tmp3708, tmp3709);
__m512 tmp3727 = _mm512_add_ps(tmp3728, tmp3729);
__m512 tmp3711 = _mm512_fmadd_ps(tmp3715, _mm512_set1_ps(1.6e+01f), tmp3712);
__m512 tmp3731 = _mm512_fmadd_ps(tmp3735, _mm512_set1_ps(1.6e+01f), tmp3732);
__m512 tmp3718 = _mm512_fmadd_ps(tmp3715, _mm512_set1_ps(4e+00f), tmp3719);
__m512 tmp3738 = _mm512_fmadd_ps(tmp3735, _mm512_set1_ps(4e+00f), tmp3739);
__m512 tmp3724 = _mm512_add_ps(tmp3715, tmp3713);
__m512 tmp3744 = _mm512_add_ps(tmp3735, tmp3733);
__m512 tmp3717 = _mm512_fmadd_ps(tmp3708, _mm512_set1_ps(4e+00f), tmp3709);
__m512 tmp3737 = _mm512_fmadd_ps(tmp3728, _mm512_set1_ps(4e+00f), tmp3729);
__m512 tmp3721 = _mm512_fmadd_ps(tmp3708, _mm512_set1_ps(1.6e+01f), tmp3709);
__m512 tmp3741 = _mm512_fmadd_ps(tmp3728, _mm512_set1_ps(1.6e+01f), tmp3729);
__m512 tmp3706 = _mm512_add_ps(tmp3707, in572);
__m512 tmp3726 = _mm512_add_ps(tmp3727, in580);
__m512 tmp3723 = _mm512_add_ps(tmp3724, in579);
__m512 tmp3743 = _mm512_add_ps(tmp3744, in587);
__m512 tmp3705 = _mm512_fmadd_ps(tmp3710, _mm512_set1_ps(3.2e+01f), tmp3706);
__m512 tmp3725 = _mm512_fmadd_ps(tmp3730, _mm512_set1_ps(3.2e+01f), tmp3726);
__m512 tmp3716 = _mm512_fmadd_ps(tmp3710, _mm512_set1_ps(8e+00f), tmp3717);
__m512 tmp3736 = _mm512_fmadd_ps(tmp3730, _mm512_set1_ps(8e+00f), tmp3737);
__m512 tmp3722 = _mm512_fmadd_ps(tmp3714, _mm512_set1_ps(3.2e+01f), tmp3723);
__m512 tmp3742 = _mm512_fmadd_ps(tmp3734, _mm512_set1_ps(3.2e+01f), tmp3743);
__m512 tmp3720 = _mm512_fmadd_ps(tmp3710, _mm512_set1_ps(2e+00f), tmp3721);
__m512 tmp3740 = _mm512_fmadd_ps(tmp3730, _mm512_set1_ps(2e+00f), tmp3741);
// Gather the twelve first-pass rows under consecutive names:
// tmp3693..tmp3698 = stream A y0..y5, tmp3699..tmp3704 = stream B y0..y5.
__m512 tmp3693 = tmp3705;
__m512 tmp3699 = tmp3725;
__m512 tmp3694 = tmp3711;
__m512 tmp3700 = tmp3731;
__m512 tmp3695 = tmp3716;
__m512 tmp3701 = tmp3736;
__m512 tmp3696 = tmp3718;
__m512 tmp3702 = tmp3738;
__m512 tmp3697 = tmp3720;
__m512 tmp3703 = tmp3740;
__m512 tmp3698 = tmp3722;
__m512 tmp3704 = tmp3742;
// Transpose the 12 rows x 16 lanes. Stage 1 (unpacklo/unpackhi) interleaves
// neighbouring rows within each 128-bit lane.
__m512 tmp3789 = _mm512_unpacklo_ps(tmp3693, tmp3694);
__m512 tmp3790 = _mm512_unpackhi_ps(tmp3693, tmp3694);
__m512 tmp3791 = _mm512_unpacklo_ps(tmp3695, tmp3696);
__m512 tmp3792 = _mm512_unpackhi_ps(tmp3695, tmp3696);
__m512 tmp3793 = _mm512_unpacklo_ps(tmp3697, tmp3698);
__m512 tmp3794 = _mm512_unpackhi_ps(tmp3697, tmp3698);
__m512 tmp3795 = _mm512_unpacklo_ps(tmp3699, tmp3700);
__m512 tmp3796 = _mm512_unpackhi_ps(tmp3699, tmp3700);
__m512 tmp3797 = _mm512_unpacklo_ps(tmp3701, tmp3702);
__m512 tmp3798 = _mm512_unpackhi_ps(tmp3701, tmp3702);
__m512 tmp3799 = _mm512_unpacklo_ps(tmp3703, tmp3704);
__m512 tmp3800 = _mm512_unpackhi_ps(tmp3703, tmp3704);
// Stage 2 (_mm512_shuffle_ps, immediates 68 / 238) groups four rows' values
// for one column residue (mod 4) into each 128-bit lane.
__m512 tmp3801 = _mm512_shuffle_ps(tmp3789, tmp3791, 68);
__m512 tmp3802 = _mm512_shuffle_ps(tmp3789, tmp3791, 238);
__m512 tmp3803 = _mm512_shuffle_ps(tmp3790, tmp3792, 68);
__m512 tmp3804 = _mm512_shuffle_ps(tmp3790, tmp3792, 238);
__m512 tmp3805 = _mm512_shuffle_ps(tmp3793, tmp3795, 68);
__m512 tmp3806 = _mm512_shuffle_ps(tmp3793, tmp3795, 238);
__m512 tmp3807 = _mm512_shuffle_ps(tmp3794, tmp3796, 68);
__m512 tmp3808 = _mm512_shuffle_ps(tmp3794, tmp3796, 238);
__m512 tmp3809 = _mm512_shuffle_ps(tmp3797, tmp3799, 68);
__m512 tmp3810 = _mm512_shuffle_ps(tmp3797, tmp3799, 238);
__m512 tmp3811 = _mm512_shuffle_ps(tmp3798, tmp3800, 68);
__m512 tmp3812 = _mm512_shuffle_ps(tmp3798, tmp3800, 238);
// Stage 3 (_mm512_shuffle_f32x4, immediate 136 = even 128-bit lanes,
// 221 = odd 128-bit lanes). The last row group (rows 8..11) has no partner,
// so it is shuffled with itself.
__m512 tmp3813 = _mm512_shuffle_f32x4(tmp3801, tmp3805, 136);
__m512 tmp3814 = _mm512_shuffle_f32x4(tmp3801, tmp3805, 221);
__m512 tmp3815 = _mm512_shuffle_f32x4(tmp3802, tmp3806, 136);
__m512 tmp3816 = _mm512_shuffle_f32x4(tmp3802, tmp3806, 221);
__m512 tmp3817 = _mm512_shuffle_f32x4(tmp3803, tmp3807, 136);
__m512 tmp3818 = _mm512_shuffle_f32x4(tmp3803, tmp3807, 221);
__m512 tmp3819 = _mm512_shuffle_f32x4(tmp3804, tmp3808, 136);
__m512 tmp3820 = _mm512_shuffle_f32x4(tmp3804, tmp3808, 221);
__m512 tmp3821 = _mm512_shuffle_f32x4(tmp3809, tmp3809, 136);
__m512 tmp3822 = _mm512_shuffle_f32x4(tmp3809, tmp3809, 221);
__m512 tmp3823 = _mm512_shuffle_f32x4(tmp3810, tmp3810, 136);
__m512 tmp3824 = _mm512_shuffle_f32x4(tmp3810, tmp3810, 221);
__m512 tmp3825 = _mm512_shuffle_f32x4(tmp3811, tmp3811, 136);
__m512 tmp3826 = _mm512_shuffle_f32x4(tmp3811, tmp3811, 221);
__m512 tmp3827 = _mm512_shuffle_f32x4(tmp3812, tmp3812, 136);
__m512 tmp3828 = _mm512_shuffle_f32x4(tmp3812, tmp3812, 221);
// Stage 4 finishes the transpose. Each result vector now holds one original
// lane index ("column") of all twelve rows in its lanes 0..11; lanes 12..15
// repeat lanes 8..11 and are never stored.
//   tmp3693..tmp3700 = columns 0..7
//   tmp3701..tmp3704 = columns 8..11
//   tmp3745..tmp3748 = columns 12..15
tmp3693 = _mm512_shuffle_f32x4(tmp3813, tmp3821, 136);
tmp3701 = _mm512_shuffle_f32x4(tmp3813, tmp3821, 221);
tmp3694 = _mm512_shuffle_f32x4(tmp3815, tmp3823, 136);
tmp3702 = _mm512_shuffle_f32x4(tmp3815, tmp3823, 221);
tmp3695 = _mm512_shuffle_f32x4(tmp3817, tmp3825, 136);
tmp3703 = _mm512_shuffle_f32x4(tmp3817, tmp3825, 221);
tmp3696 = _mm512_shuffle_f32x4(tmp3819, tmp3827, 136);
tmp3704 = _mm512_shuffle_f32x4(tmp3819, tmp3827, 221);
tmp3697 = _mm512_shuffle_f32x4(tmp3814, tmp3822, 136);
__m512 tmp3745 = _mm512_shuffle_f32x4(tmp3814, tmp3822, 221);
tmp3698 = _mm512_shuffle_f32x4(tmp3816, tmp3824, 136);
__m512 tmp3746 = _mm512_shuffle_f32x4(tmp3816, tmp3824, 221);
tmp3699 = _mm512_shuffle_f32x4(tmp3818, tmp3826, 136);
__m512 tmp3747 = _mm512_shuffle_f32x4(tmp3818, tmp3826, 221);
tmp3700 = _mm512_shuffle_f32x4(tmp3820, tmp3828, 136);
__m512 tmp3748 = _mm512_shuffle_f32x4(tmp3820, tmp3828, 221);
// Second pass: the same y0..y5 formulas as in the first pass, now taking
// columns 0..7 as x0..x7 (first statement of each pair) and columns 8..15 as
// x0..x7 (second statement of each pair).
__m512 tmp3753 = _mm512_add_ps(tmp3694, tmp3695);
__m512 tmp3773 = _mm512_add_ps(tmp3702, tmp3703);
__m512 tmp3752 = _mm512_add_ps(tmp3696, tmp3697);
__m512 tmp3772 = _mm512_add_ps(tmp3704, tmp3745);
__m512 tmp3758 = _mm512_sub_ps(tmp3696, tmp3697);
__m512 tmp3778 = _mm512_sub_ps(tmp3704, tmp3745);
__m512 tmp3757 = _mm512_sub_ps(tmp3694, tmp3695);
__m512 tmp3777 = _mm512_sub_ps(tmp3702, tmp3703);
__m512 tmp3754 = _mm512_add_ps(tmp3698, tmp3699);
__m512 tmp3774 = _mm512_add_ps(tmp3746, tmp3747);
__m512 tmp3759 = _mm512_sub_ps(tmp3698, tmp3699);
__m512 tmp3779 = _mm512_sub_ps(tmp3746, tmp3747);
__m512 tmp3756 = _mm512_fmadd_ps(tmp3758, _mm512_set1_ps(2e+00f), tmp3757);
__m512 tmp3776 = _mm512_fmadd_ps(tmp3778, _mm512_set1_ps(2e+00f), tmp3777);
__m512 tmp3763 = _mm512_fmadd_ps(tmp3758, _mm512_set1_ps(8e+00f), tmp3757);
__m512 tmp3783 = _mm512_fmadd_ps(tmp3778, _mm512_set1_ps(8e+00f), tmp3777);
__m512 tmp3751 = _mm512_add_ps(tmp3752, tmp3753);
__m512 tmp3771 = _mm512_add_ps(tmp3772, tmp3773);
__m512 tmp3755 = _mm512_fmadd_ps(tmp3759, _mm512_set1_ps(1.6e+01f), tmp3756);
__m512 tmp3775 = _mm512_fmadd_ps(tmp3779, _mm512_set1_ps(1.6e+01f), tmp3776);
__m512 tmp3762 = _mm512_fmadd_ps(tmp3759, _mm512_set1_ps(4e+00f), tmp3763);
__m512 tmp3782 = _mm512_fmadd_ps(tmp3779, _mm512_set1_ps(4e+00f), tmp3783);
__m512 tmp3768 = _mm512_add_ps(tmp3759, tmp3757);
__m512 tmp3788 = _mm512_add_ps(tmp3779, tmp3777);
__m512 tmp3761 = _mm512_fmadd_ps(tmp3752, _mm512_set1_ps(4e+00f), tmp3753);
__m512 tmp3781 = _mm512_fmadd_ps(tmp3772, _mm512_set1_ps(4e+00f), tmp3773);
__m512 tmp3765 = _mm512_fmadd_ps(tmp3752, _mm512_set1_ps(1.6e+01f), tmp3753);
__m512 tmp3785 = _mm512_fmadd_ps(tmp3772, _mm512_set1_ps(1.6e+01f), tmp3773);
__m512 tmp3750 = _mm512_add_ps(tmp3751, tmp3693);
__m512 tmp3770 = _mm512_add_ps(tmp3771, tmp3701);
__m512 tmp3767 = _mm512_add_ps(tmp3768, tmp3700);
__m512 tmp3787 = _mm512_add_ps(tmp3788, tmp3748);
__m512 tmp3749 = _mm512_fmadd_ps(tmp3754, _mm512_set1_ps(3.2e+01f), tmp3750);
__m512 tmp3769 = _mm512_fmadd_ps(tmp3774, _mm512_set1_ps(3.2e+01f), tmp3770);
__m512 tmp3760 = _mm512_fmadd_ps(tmp3754, _mm512_set1_ps(8e+00f), tmp3761);
__m512 tmp3780 = _mm512_fmadd_ps(tmp3774, _mm512_set1_ps(8e+00f), tmp3781);
__m512 tmp3766 = _mm512_fmadd_ps(tmp3758, _mm512_set1_ps(3.2e+01f), tmp3767);
__m512 tmp3786 = _mm512_fmadd_ps(tmp3778, _mm512_set1_ps(3.2e+01f), tmp3787);
__m512 tmp3764 = _mm512_fmadd_ps(tmp3754, _mm512_set1_ps(2e+00f), tmp3765);
__m512 tmp3784 = _mm512_fmadd_ps(tmp3774, _mm512_set1_ps(2e+00f), tmp3785);
// Name the results: out579..out584 = y0..y5 from columns 0..7,
// out585..out590 = y0..y5 from columns 8..15.
__m512 out579 = tmp3749;
__m512 out585 = tmp3769;
__m512 out580 = tmp3755;
__m512 out586 = tmp3775;
__m512 out581 = tmp3760;
__m512 out587 = tmp3780;
__m512 out582 = tmp3762;
__m512 out588 = tmp3782;
__m512 out583 = tmp3764;
__m512 out589 = tmp3784;
__m512 out584 = tmp3766;
__m512 out590 = tmp3786;
// ReLU: lane-wise max against zero.
out579 = _mm512_max_ps(_mm512_setzero_ps(), out579);
out585 = _mm512_max_ps(_mm512_setzero_ps(), out585);
out580 = _mm512_max_ps(_mm512_setzero_ps(), out580);
out586 = _mm512_max_ps(_mm512_setzero_ps(), out586);
out581 = _mm512_max_ps(_mm512_setzero_ps(), out581);
out587 = _mm512_max_ps(_mm512_setzero_ps(), out587);
out582 = _mm512_max_ps(_mm512_setzero_ps(), out582);
out588 = _mm512_max_ps(_mm512_setzero_ps(), out588);
out583 = _mm512_max_ps(_mm512_setzero_ps(), out583);
out589 = _mm512_max_ps(_mm512_setzero_ps(), out589);
out584 = _mm512_max_ps(_mm512_setzero_ps(), out584);
out590 = _mm512_max_ps(_mm512_setzero_ps(), out590);
// Store phase. The two result groups use different masks and bases here:
//   out579..out584: base offset 96, mask 255 (0x00FF) -> lanes 0..7 only.
//   out585..out590: base offset 12608, mask 4095 (0x0FFF) -> lanes 0..11.
// Within each group the six results are 224 apart.
// NOTE(review): the 8-lane mask presumably trims a partial tile at the right
// edge of a row, and 12608 presumably steps to the next output plane
// (806912 = 64*12608, 25216 = 2*12608) -- confirm against datPtr6's layout,
// which is declared outside this excerpt.
_mm512_mask_storeu_ps(datPtr6+96+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 255, out579);
_mm512_mask_storeu_ps(datPtr6+12608+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out585);
_mm512_mask_storeu_ps(datPtr6+320+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 255, out580);
_mm512_mask_storeu_ps(datPtr6+12832+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out586);
_mm512_mask_storeu_ps(datPtr6+544+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 255, out581);
_mm512_mask_storeu_ps(datPtr6+13056+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out587);
_mm512_mask_storeu_ps(datPtr6+768+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 255, out582);
_mm512_mask_storeu_ps(datPtr6+13280+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out588);
_mm512_mask_storeu_ps(datPtr6+992+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 255, out583);
_mm512_mask_storeu_ps(datPtr6+13504+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out589);
_mm512_mask_storeu_ps(datPtr6+1216+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 255, out584);
_mm512_mask_storeu_ps(datPtr6+13728+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out590);
__m512 sf225 = _mm512_loadu_ps(sfPtr5+512+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf226 = _mm512_loadu_ps(sfPtr5+640+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in588 = _mm512_shuffle_f32x4(sf225, sf226, 68);
__m512 in589 = _mm512_shuffle_f32x4(sf225, sf226, 238);
__m512 sf227 = _mm512_loadu_ps(sfPtr5+576+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf228 = _mm512_loadu_ps(sfPtr5+704+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in596 = _mm512_shuffle_f32x4(sf227, sf228, 68);
__m512 in597 = _mm512_shuffle_f32x4(sf227, sf228, 238);
__m512 sf229 = _mm512_loadu_ps(sfPtr5+410112+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf230 = _mm512_loadu_ps(sfPtr5+410240+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in590 = _mm512_shuffle_f32x4(sf229, sf230, 68);
__m512 in591 = _mm512_shuffle_f32x4(sf229, sf230, 238);
__m512 sf231 = _mm512_loadu_ps(sfPtr5+410176+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf232 = _mm512_loadu_ps(sfPtr5+410304+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in598 = _mm512_shuffle_f32x4(sf231, sf232, 68);
__m512 in599 = _mm512_shuffle_f32x4(sf231, sf232, 238);
__m512 sf233 = _mm512_loadu_ps(sfPtr5+819712+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf234 = _mm512_loadu_ps(sfPtr5+819840+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in592 = _mm512_shuffle_f32x4(sf233, sf234, 68);
__m512 in593 = _mm512_shuffle_f32x4(sf233, sf234, 238);
__m512 sf235 = _mm512_loadu_ps(sfPtr5+819776+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf236 = _mm512_loadu_ps(sfPtr5+819904+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in600 = _mm512_shuffle_f32x4(sf235, sf236, 68);
__m512 in601 = _mm512_shuffle_f32x4(sf235, sf236, 238);
__m512 sf237 = _mm512_loadu_ps(sfPtr5+1229312+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf238 = _mm512_loadu_ps(sfPtr5+1229440+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in594 = _mm512_shuffle_f32x4(sf237, sf238, 68);
__m512 in595 = _mm512_shuffle_f32x4(sf237, sf238, 238);
__m512 sf239 = _mm512_loadu_ps(sfPtr5+1229376+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 sf240 = _mm512_loadu_ps(sfPtr5+1229504+1638400*i18+24576*j13+1536*k66+768*l18);
__m512 in602 = _mm512_shuffle_f32x4(sf239, sf240, 68);
__m512 in603 = _mm512_shuffle_f32x4(sf239, sf240, 238);
__m512 tmp3845 = _mm512_add_ps(in589, in590);
__m512 tmp3865 = _mm512_add_ps(in597, in598);
__m512 tmp3844 = _mm512_add_ps(in591, in592);
__m512 tmp3864 = _mm512_add_ps(in599, in600);
__m512 tmp3850 = _mm512_sub_ps(in591, in592);
__m512 tmp3870 = _mm512_sub_ps(in599, in600);
__m512 tmp3849 = _mm512_sub_ps(in589, in590);
__m512 tmp3869 = _mm512_sub_ps(in597, in598);
__m512 tmp3846 = _mm512_add_ps(in593, in594);
__m512 tmp3866 = _mm512_add_ps(in601, in602);
__m512 tmp3851 = _mm512_sub_ps(in593, in594);
__m512 tmp3871 = _mm512_sub_ps(in601, in602);
__m512 tmp3848 = _mm512_fmadd_ps(tmp3850, _mm512_set1_ps(2e+00f), tmp3849);
__m512 tmp3868 = _mm512_fmadd_ps(tmp3870, _mm512_set1_ps(2e+00f), tmp3869);
__m512 tmp3855 = _mm512_fmadd_ps(tmp3850, _mm512_set1_ps(8e+00f), tmp3849);
__m512 tmp3875 = _mm512_fmadd_ps(tmp3870, _mm512_set1_ps(8e+00f), tmp3869);
__m512 tmp3843 = _mm512_add_ps(tmp3844, tmp3845);
__m512 tmp3863 = _mm512_add_ps(tmp3864, tmp3865);
__m512 tmp3847 = _mm512_fmadd_ps(tmp3851, _mm512_set1_ps(1.6e+01f), tmp3848);
__m512 tmp3867 = _mm512_fmadd_ps(tmp3871, _mm512_set1_ps(1.6e+01f), tmp3868);
__m512 tmp3854 = _mm512_fmadd_ps(tmp3851, _mm512_set1_ps(4e+00f), tmp3855);
__m512 tmp3874 = _mm512_fmadd_ps(tmp3871, _mm512_set1_ps(4e+00f), tmp3875);
__m512 tmp3860 = _mm512_add_ps(tmp3851, tmp3849);
__m512 tmp3880 = _mm512_add_ps(tmp3871, tmp3869);
__m512 tmp3853 = _mm512_fmadd_ps(tmp3844, _mm512_set1_ps(4e+00f), tmp3845);
__m512 tmp3873 = _mm512_fmadd_ps(tmp3864, _mm512_set1_ps(4e+00f), tmp3865);
__m512 tmp3857 = _mm512_fmadd_ps(tmp3844, _mm512_set1_ps(1.6e+01f), tmp3845);
__m512 tmp3877 = _mm512_fmadd_ps(tmp3864, _mm512_set1_ps(1.6e+01f), tmp3865);
__m512 tmp3842 = _mm512_add_ps(tmp3843, in588);
__m512 tmp3862 = _mm512_add_ps(tmp3863, in596);
__m512 tmp3859 = _mm512_add_ps(tmp3860, in595);
__m512 tmp3879 = _mm512_add_ps(tmp3880, in603);
__m512 tmp3841 = _mm512_fmadd_ps(tmp3846, _mm512_set1_ps(3.2e+01f), tmp3842);
__m512 tmp3861 = _mm512_fmadd_ps(tmp3866, _mm512_set1_ps(3.2e+01f), tmp3862);
__m512 tmp3852 = _mm512_fmadd_ps(tmp3846, _mm512_set1_ps(8e+00f), tmp3853);
__m512 tmp3872 = _mm512_fmadd_ps(tmp3866, _mm512_set1_ps(8e+00f), tmp3873);
__m512 tmp3858 = _mm512_fmadd_ps(tmp3850, _mm512_set1_ps(3.2e+01f), tmp3859);
__m512 tmp3878 = _mm512_fmadd_ps(tmp3870, _mm512_set1_ps(3.2e+01f), tmp3879);
__m512 tmp3856 = _mm512_fmadd_ps(tmp3846, _mm512_set1_ps(2e+00f), tmp3857);
__m512 tmp3876 = _mm512_fmadd_ps(tmp3866, _mm512_set1_ps(2e+00f), tmp3877);
__m512 tmp3829 = tmp3841;
__m512 tmp3835 = tmp3861;
__m512 tmp3830 = tmp3847;
__m512 tmp3836 = tmp3867;
__m512 tmp3831 = tmp3852;
__m512 tmp3837 = tmp3872;
__m512 tmp3832 = tmp3854;
__m512 tmp3838 = tmp3874;
__m512 tmp3833 = tmp3856;
__m512 tmp3839 = tmp3876;
__m512 tmp3834 = tmp3858;
__m512 tmp3840 = tmp3878;
__m512 tmp3925 = _mm512_unpacklo_ps(tmp3829, tmp3830);
__m512 tmp3926 = _mm512_unpackhi_ps(tmp3829, tmp3830);
__m512 tmp3927 = _mm512_unpacklo_ps(tmp3831, tmp3832);
__m512 tmp3928 = _mm512_unpackhi_ps(tmp3831, tmp3832);
__m512 tmp3929 = _mm512_unpacklo_ps(tmp3833, tmp3834);
__m512 tmp3930 = _mm512_unpackhi_ps(tmp3833, tmp3834);
__m512 tmp3931 = _mm512_unpacklo_ps(tmp3835, tmp3836);
__m512 tmp3932 = _mm512_unpackhi_ps(tmp3835, tmp3836);
__m512 tmp3933 = _mm512_unpacklo_ps(tmp3837, tmp3838);
__m512 tmp3934 = _mm512_unpackhi_ps(tmp3837, tmp3838);
__m512 tmp3935 = _mm512_unpacklo_ps(tmp3839, tmp3840);
__m512 tmp3936 = _mm512_unpackhi_ps(tmp3839, tmp3840);
__m512 tmp3937 = _mm512_shuffle_ps(tmp3925, tmp3927, 68);
__m512 tmp3938 = _mm512_shuffle_ps(tmp3925, tmp3927, 238);
__m512 tmp3939 = _mm512_shuffle_ps(tmp3926, tmp3928, 68);
__m512 tmp3940 = _mm512_shuffle_ps(tmp3926, tmp3928, 238);
__m512 tmp3941 = _mm512_shuffle_ps(tmp3929, tmp3931, 68);
__m512 tmp3942 = _mm512_shuffle_ps(tmp3929, tmp3931, 238);
__m512 tmp3943 = _mm512_shuffle_ps(tmp3930, tmp3932, 68);
__m512 tmp3944 = _mm512_shuffle_ps(tmp3930, tmp3932, 238);
__m512 tmp3945 = _mm512_shuffle_ps(tmp3933, tmp3935, 68);
__m512 tmp3946 = _mm512_shuffle_ps(tmp3933, tmp3935, 238);
__m512 tmp3947 = _mm512_shuffle_ps(tmp3934, tmp3936, 68);
__m512 tmp3948 = _mm512_shuffle_ps(tmp3934, tmp3936, 238);
__m512 tmp3949 = _mm512_shuffle_f32x4(tmp3937, tmp3941, 136);
__m512 tmp3950 = _mm512_shuffle_f32x4(tmp3937, tmp3941, 221);
__m512 tmp3951 = _mm512_shuffle_f32x4(tmp3938, tmp3942, 136);
__m512 tmp3952 = _mm512_shuffle_f32x4(tmp3938, tmp3942, 221);
__m512 tmp3953 = _mm512_shuffle_f32x4(tmp3939, tmp3943, 136);
__m512 tmp3954 = _mm512_shuffle_f32x4(tmp3939, tmp3943, 221);
__m512 tmp3955 = _mm512_shuffle_f32x4(tmp3940, tmp3944, 136);
__m512 tmp3956 = _mm512_shuffle_f32x4(tmp3940, tmp3944, 221);
__m512 tmp3957 = _mm512_shuffle_f32x4(tmp3945, tmp3945, 136);
__m512 tmp3958 = _mm512_shuffle_f32x4(tmp3945, tmp3945, 221);
__m512 tmp3959 = _mm512_shuffle_f32x4(tmp3946, tmp3946, 136);
__m512 tmp3960 = _mm512_shuffle_f32x4(tmp3946, tmp3946, 221);
__m512 tmp3961 = _mm512_shuffle_f32x4(tmp3947, tmp3947, 136);
__m512 tmp3962 = _mm512_shuffle_f32x4(tmp3947, tmp3947, 221);
__m512 tmp3963 = _mm512_shuffle_f32x4(tmp3948, tmp3948, 136);
__m512 tmp3964 = _mm512_shuffle_f32x4(tmp3948, tmp3948, 221);
tmp3829 = _mm512_shuffle_f32x4(tmp3949, tmp3957, 136);
tmp3837 = _mm512_shuffle_f32x4(tmp3949, tmp3957, 221);
tmp3830 = _mm512_shuffle_f32x4(tmp3951, tmp3959, 136);
tmp3838 = _mm512_shuffle_f32x4(tmp3951, tmp3959, 221);
tmp3831 = _mm512_shuffle_f32x4(tmp3953, tmp3961, 136);
tmp3839 = _mm512_shuffle_f32x4(tmp3953, tmp3961, 221);
tmp3832 = _mm512_shuffle_f32x4(tmp3955, tmp3963, 136);
tmp3840 = _mm512_shuffle_f32x4(tmp3955, tmp3963, 221);
tmp3833 = _mm512_shuffle_f32x4(tmp3950, tmp3958, 136);
__m512 tmp3881 = _mm512_shuffle_f32x4(tmp3950, tmp3958, 221);
tmp3834 = _mm512_shuffle_f32x4(tmp3952, tmp3960, 136);
__m512 tmp3882 = _mm512_shuffle_f32x4(tmp3952, tmp3960, 221);
tmp3835 = _mm512_shuffle_f32x4(tmp3954, tmp3962, 136);
__m512 tmp3883 = _mm512_shuffle_f32x4(tmp3954, tmp3962, 221);
tmp3836 = _mm512_shuffle_f32x4(tmp3956, tmp3964, 136);
__m512 tmp3884 = _mm512_shuffle_f32x4(tmp3956, tmp3964, 221);
__m512 tmp3889 = _mm512_add_ps(tmp3830, tmp3831);
__m512 tmp3909 = _mm512_add_ps(tmp3838, tmp3839);
__m512 tmp3888 = _mm512_add_ps(tmp3832, tmp3833);
__m512 tmp3908 = _mm512_add_ps(tmp3840, tmp3881);
__m512 tmp3894 = _mm512_sub_ps(tmp3832, tmp3833);
__m512 tmp3914 = _mm512_sub_ps(tmp3840, tmp3881);
__m512 tmp3893 = _mm512_sub_ps(tmp3830, tmp3831);
__m512 tmp3913 = _mm512_sub_ps(tmp3838, tmp3839);
__m512 tmp3890 = _mm512_add_ps(tmp3834, tmp3835);
__m512 tmp3910 = _mm512_add_ps(tmp3882, tmp3883);
__m512 tmp3895 = _mm512_sub_ps(tmp3834, tmp3835);
__m512 tmp3915 = _mm512_sub_ps(tmp3882, tmp3883);
__m512 tmp3892 = _mm512_fmadd_ps(tmp3894, _mm512_set1_ps(2e+00f), tmp3893);
__m512 tmp3912 = _mm512_fmadd_ps(tmp3914, _mm512_set1_ps(2e+00f), tmp3913);
__m512 tmp3899 = _mm512_fmadd_ps(tmp3894, _mm512_set1_ps(8e+00f), tmp3893);
__m512 tmp3919 = _mm512_fmadd_ps(tmp3914, _mm512_set1_ps(8e+00f), tmp3913);
__m512 tmp3887 = _mm512_add_ps(tmp3888, tmp3889);
__m512 tmp3907 = _mm512_add_ps(tmp3908, tmp3909);
__m512 tmp3891 = _mm512_fmadd_ps(tmp3895, _mm512_set1_ps(1.6e+01f), tmp3892);
__m512 tmp3911 = _mm512_fmadd_ps(tmp3915, _mm512_set1_ps(1.6e+01f), tmp3912);
__m512 tmp3898 = _mm512_fmadd_ps(tmp3895, _mm512_set1_ps(4e+00f), tmp3899);
__m512 tmp3918 = _mm512_fmadd_ps(tmp3915, _mm512_set1_ps(4e+00f), tmp3919);
__m512 tmp3904 = _mm512_add_ps(tmp3895, tmp3893);
__m512 tmp3924 = _mm512_add_ps(tmp3915, tmp3913);
__m512 tmp3897 = _mm512_fmadd_ps(tmp3888, _mm512_set1_ps(4e+00f), tmp3889);
__m512 tmp3917 = _mm512_fmadd_ps(tmp3908, _mm512_set1_ps(4e+00f), tmp3909);
__m512 tmp3901 = _mm512_fmadd_ps(tmp3888, _mm512_set1_ps(1.6e+01f), tmp3889);
__m512 tmp3921 = _mm512_fmadd_ps(tmp3908, _mm512_set1_ps(1.6e+01f), tmp3909);
__m512 tmp3886 = _mm512_add_ps(tmp3887, tmp3829);
__m512 tmp3906 = _mm512_add_ps(tmp3907, tmp3837);
__m512 tmp3903 = _mm512_add_ps(tmp3904, tmp3836);
__m512 tmp3923 = _mm512_add_ps(tmp3924, tmp3884);
__m512 tmp3885 = _mm512_fmadd_ps(tmp3890, _mm512_set1_ps(3.2e+01f), tmp3886);
__m512 tmp3905 = _mm512_fmadd_ps(tmp3910, _mm512_set1_ps(3.2e+01f), tmp3906);
__m512 tmp3896 = _mm512_fmadd_ps(tmp3890, _mm512_set1_ps(8e+00f), tmp3897);
__m512 tmp3916 = _mm512_fmadd_ps(tmp3910, _mm512_set1_ps(8e+00f), tmp3917);
__m512 tmp3902 = _mm512_fmadd_ps(tmp3894, _mm512_set1_ps(3.2e+01f), tmp3903);
__m512 tmp3922 = _mm512_fmadd_ps(tmp3914, _mm512_set1_ps(3.2e+01f), tmp3923);
__m512 tmp3900 = _mm512_fmadd_ps(tmp3890, _mm512_set1_ps(2e+00f), tmp3901);
__m512 tmp3920 = _mm512_fmadd_ps(tmp3910, _mm512_set1_ps(2e+00f), tmp3921);
__m512 out591 = tmp3885;
__m512 out597 = tmp3905;
__m512 out592 = tmp3891;
__m512 out598 = tmp3911;
__m512 out593 = tmp3896;
__m512 out599 = tmp3916;
__m512 out594 = tmp3898;
__m512 out600 = tmp3918;
__m512 out595 = tmp3900;
__m512 out601 = tmp3920;
__m512 out596 = tmp3902;
__m512 out602 = tmp3922;
out591 = _mm512_max_ps(_mm512_setzero_ps(), out591);
out597 = _mm512_max_ps(_mm512_setzero_ps(), out597);
out592 = _mm512_max_ps(_mm512_setzero_ps(), out592);
out598 = _mm512_max_ps(_mm512_setzero_ps(), out598);
out593 = _mm512_max_ps(_mm512_setzero_ps(), out593);
out599 = _mm512_max_ps(_mm512_setzero_ps(), out599);
out594 = _mm512_max_ps(_mm512_setzero_ps(), out594);
out600 = _mm512_max_ps(_mm512_setzero_ps(), out600);
out595 = _mm512_max_ps(_mm512_setzero_ps(), out595);
out601 = _mm512_max_ps(_mm512_setzero_ps(), out601);
out596 = _mm512_max_ps(_mm512_setzero_ps(), out596);
out602 = _mm512_max_ps(_mm512_setzero_ps(), out602);
_mm512_mask_storeu_ps(datPtr6+12656+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out591);
_mm512_mask_storeu_ps(datPtr6+12704+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 255, out597);
_mm512_mask_storeu_ps(datPtr6+12880+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out592);
_mm512_mask_storeu_ps(datPtr6+12928+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 255, out598);
_mm512_mask_storeu_ps(datPtr6+13104+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out593);
_mm512_mask_storeu_ps(datPtr6+13152+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 255, out599);
_mm512_mask_storeu_ps(datPtr6+13328+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out594);
_mm512_mask_storeu_ps(datPtr6+13376+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 255, out600);
_mm512_mask_storeu_ps(datPtr6+13552+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out595);
_mm512_mask_storeu_ps(datPtr6+13600+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 255, out601);
_mm512_mask_storeu_ps(datPtr6+13776+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 4095, out596);
_mm512_mask_storeu_ps(datPtr6+13824+806912*i18+224*toH24+4*toW24+50432*k66+25216*l18, 255, out602);
}
}
if (j13 >= last4) return;
++j13;
if (j13 >= 15) break;
rel11 = 3;
}
if (rel11 < 4) {
ptrdiff_t toH25 = base11+12;
ptrdiff_t toW25 = 0;
ptrdiff_t k67 = 16*w33;
for (; k67 != 16; ++k67) {
ptrdiff_t l19 = 0;
for (; l19 != 2; ++l19) {
__m512 sf241 = _mm512_loadu_ps(sfPtr5+0+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf242 = _mm512_loadu_ps(sfPtr5+128+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in604 = _mm512_shuffle_f32x4(sf241, sf242, 68);
__m512 in605 = _mm512_shuffle_f32x4(sf241, sf242, 238);
__m512 sf243 = _mm512_loadu_ps(sfPtr5+64+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf244 = _mm512_loadu_ps(sfPtr5+192+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in612 = _mm512_shuffle_f32x4(sf243, sf244, 68);
__m512 in613 = _mm512_shuffle_f32x4(sf243, sf244, 238);
__m512 sf245 = _mm512_loadu_ps(sfPtr5+409600+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf246 = _mm512_loadu_ps(sfPtr5+409728+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in606 = _mm512_shuffle_f32x4(sf245, sf246, 68);
__m512 in607 = _mm512_shuffle_f32x4(sf245, sf246, 238);
__m512 sf247 = _mm512_loadu_ps(sfPtr5+409664+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf248 = _mm512_loadu_ps(sfPtr5+409792+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in614 = _mm512_shuffle_f32x4(sf247, sf248, 68);
__m512 in615 = _mm512_shuffle_f32x4(sf247, sf248, 238);
__m512 sf249 = _mm512_loadu_ps(sfPtr5+819200+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf250 = _mm512_loadu_ps(sfPtr5+819328+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in608 = _mm512_shuffle_f32x4(sf249, sf250, 68);
__m512 in609 = _mm512_shuffle_f32x4(sf249, sf250, 238);
__m512 sf251 = _mm512_loadu_ps(sfPtr5+819264+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf252 = _mm512_loadu_ps(sfPtr5+819392+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in616 = _mm512_shuffle_f32x4(sf251, sf252, 68);
__m512 in617 = _mm512_shuffle_f32x4(sf251, sf252, 238);
__m512 sf253 = _mm512_loadu_ps(sfPtr5+1228800+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf254 = _mm512_loadu_ps(sfPtr5+1228928+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in610 = _mm512_shuffle_f32x4(sf253, sf254, 68);
__m512 in611 = _mm512_shuffle_f32x4(sf253, sf254, 238);
__m512 sf255 = _mm512_loadu_ps(sfPtr5+1228864+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf256 = _mm512_loadu_ps(sfPtr5+1228992+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in618 = _mm512_shuffle_f32x4(sf255, sf256, 68);
__m512 in619 = _mm512_shuffle_f32x4(sf255, sf256, 238);
__m512 tmp3981 = _mm512_add_ps(in605, in606);
__m512 tmp4001 = _mm512_add_ps(in613, in614);
__m512 tmp3980 = _mm512_add_ps(in607, in608);
__m512 tmp4000 = _mm512_add_ps(in615, in616);
__m512 tmp3986 = _mm512_sub_ps(in607, in608);
__m512 tmp4006 = _mm512_sub_ps(in615, in616);
__m512 tmp3985 = _mm512_sub_ps(in605, in606);
__m512 tmp4005 = _mm512_sub_ps(in613, in614);
__m512 tmp3982 = _mm512_add_ps(in609, in610);
__m512 tmp4002 = _mm512_add_ps(in617, in618);
__m512 tmp3987 = _mm512_sub_ps(in609, in610);
__m512 tmp4007 = _mm512_sub_ps(in617, in618);
__m512 tmp3984 = _mm512_fmadd_ps(tmp3986, _mm512_set1_ps(2e+00f), tmp3985);
__m512 tmp4004 = _mm512_fmadd_ps(tmp4006, _mm512_set1_ps(2e+00f), tmp4005);
__m512 tmp3991 = _mm512_fmadd_ps(tmp3986, _mm512_set1_ps(8e+00f), tmp3985);
__m512 tmp4011 = _mm512_fmadd_ps(tmp4006, _mm512_set1_ps(8e+00f), tmp4005);
__m512 tmp3979 = _mm512_add_ps(tmp3980, tmp3981);
__m512 tmp3999 = _mm512_add_ps(tmp4000, tmp4001);
__m512 tmp3983 = _mm512_fmadd_ps(tmp3987, _mm512_set1_ps(1.6e+01f), tmp3984);
__m512 tmp4003 = _mm512_fmadd_ps(tmp4007, _mm512_set1_ps(1.6e+01f), tmp4004);
__m512 tmp3990 = _mm512_fmadd_ps(tmp3987, _mm512_set1_ps(4e+00f), tmp3991);
__m512 tmp4010 = _mm512_fmadd_ps(tmp4007, _mm512_set1_ps(4e+00f), tmp4011);
__m512 tmp3996 = _mm512_add_ps(tmp3987, tmp3985);
__m512 tmp4016 = _mm512_add_ps(tmp4007, tmp4005);
__m512 tmp3989 = _mm512_fmadd_ps(tmp3980, _mm512_set1_ps(4e+00f), tmp3981);
__m512 tmp4009 = _mm512_fmadd_ps(tmp4000, _mm512_set1_ps(4e+00f), tmp4001);
__m512 tmp3993 = _mm512_fmadd_ps(tmp3980, _mm512_set1_ps(1.6e+01f), tmp3981);
__m512 tmp4013 = _mm512_fmadd_ps(tmp4000, _mm512_set1_ps(1.6e+01f), tmp4001);
__m512 tmp3978 = _mm512_add_ps(tmp3979, in604);
__m512 tmp3998 = _mm512_add_ps(tmp3999, in612);
__m512 tmp3995 = _mm512_add_ps(tmp3996, in611);
__m512 tmp4015 = _mm512_add_ps(tmp4016, in619);
__m512 tmp3977 = _mm512_fmadd_ps(tmp3982, _mm512_set1_ps(3.2e+01f), tmp3978);
__m512 tmp3997 = _mm512_fmadd_ps(tmp4002, _mm512_set1_ps(3.2e+01f), tmp3998);
__m512 tmp3988 = _mm512_fmadd_ps(tmp3982, _mm512_set1_ps(8e+00f), tmp3989);
__m512 tmp4008 = _mm512_fmadd_ps(tmp4002, _mm512_set1_ps(8e+00f), tmp4009);
__m512 tmp3994 = _mm512_fmadd_ps(tmp3986, _mm512_set1_ps(3.2e+01f), tmp3995);
__m512 tmp4014 = _mm512_fmadd_ps(tmp4006, _mm512_set1_ps(3.2e+01f), tmp4015);
__m512 tmp3992 = _mm512_fmadd_ps(tmp3982, _mm512_set1_ps(2e+00f), tmp3993);
__m512 tmp4012 = _mm512_fmadd_ps(tmp4002, _mm512_set1_ps(2e+00f), tmp4013);
__m512 tmp3965 = tmp3977;
__m512 tmp3971 = tmp3997;
__m512 tmp3966 = tmp3983;
__m512 tmp3972 = tmp4003;
__m512 tmp3967 = tmp3988;
__m512 tmp3973 = tmp4008;
__m512 tmp3968 = tmp3990;
__m512 tmp3974 = tmp4010;
__m512 tmp3969 = tmp3992;
__m512 tmp3975 = tmp4012;
__m512 tmp3970 = tmp3994;
__m512 tmp3976 = tmp4014;
__m512 tmp4061 = _mm512_unpacklo_ps(tmp3965, tmp3966);
__m512 tmp4062 = _mm512_unpackhi_ps(tmp3965, tmp3966);
__m512 tmp4063 = _mm512_unpacklo_ps(tmp3967, tmp3968);
__m512 tmp4064 = _mm512_unpackhi_ps(tmp3967, tmp3968);
__m512 tmp4065 = _mm512_unpacklo_ps(tmp3969, tmp3970);
__m512 tmp4066 = _mm512_unpackhi_ps(tmp3969, tmp3970);
__m512 tmp4067 = _mm512_unpacklo_ps(tmp3971, tmp3972);
__m512 tmp4068 = _mm512_unpackhi_ps(tmp3971, tmp3972);
__m512 tmp4069 = _mm512_unpacklo_ps(tmp3973, tmp3974);
__m512 tmp4070 = _mm512_unpackhi_ps(tmp3973, tmp3974);
__m512 tmp4071 = _mm512_unpacklo_ps(tmp3975, tmp3976);
__m512 tmp4072 = _mm512_unpackhi_ps(tmp3975, tmp3976);
__m512 tmp4073 = _mm512_shuffle_ps(tmp4061, tmp4063, 68);
__m512 tmp4074 = _mm512_shuffle_ps(tmp4061, tmp4063, 238);
__m512 tmp4075 = _mm512_shuffle_ps(tmp4062, tmp4064, 68);
__m512 tmp4076 = _mm512_shuffle_ps(tmp4062, tmp4064, 238);
__m512 tmp4077 = _mm512_shuffle_ps(tmp4065, tmp4067, 68);
__m512 tmp4078 = _mm512_shuffle_ps(tmp4065, tmp4067, 238);
__m512 tmp4079 = _mm512_shuffle_ps(tmp4066, tmp4068, 68);
__m512 tmp4080 = _mm512_shuffle_ps(tmp4066, tmp4068, 238);
__m512 tmp4081 = _mm512_shuffle_ps(tmp4069, tmp4071, 68);
__m512 tmp4082 = _mm512_shuffle_ps(tmp4069, tmp4071, 238);
__m512 tmp4083 = _mm512_shuffle_ps(tmp4070, tmp4072, 68);
__m512 tmp4084 = _mm512_shuffle_ps(tmp4070, tmp4072, 238);
__m512 tmp4085 = _mm512_shuffle_f32x4(tmp4073, tmp4077, 136);
__m512 tmp4086 = _mm512_shuffle_f32x4(tmp4073, tmp4077, 221);
__m512 tmp4087 = _mm512_shuffle_f32x4(tmp4074, tmp4078, 136);
__m512 tmp4088 = _mm512_shuffle_f32x4(tmp4074, tmp4078, 221);
__m512 tmp4089 = _mm512_shuffle_f32x4(tmp4075, tmp4079, 136);
__m512 tmp4090 = _mm512_shuffle_f32x4(tmp4075, tmp4079, 221);
__m512 tmp4091 = _mm512_shuffle_f32x4(tmp4076, tmp4080, 136);
__m512 tmp4092 = _mm512_shuffle_f32x4(tmp4076, tmp4080, 221);
__m512 tmp4093 = _mm512_shuffle_f32x4(tmp4081, tmp4081, 136);
__m512 tmp4094 = _mm512_shuffle_f32x4(tmp4081, tmp4081, 221);
__m512 tmp4095 = _mm512_shuffle_f32x4(tmp4082, tmp4082, 136);
__m512 tmp4096 = _mm512_shuffle_f32x4(tmp4082, tmp4082, 221);
__m512 tmp4097 = _mm512_shuffle_f32x4(tmp4083, tmp4083, 136);
__m512 tmp4098 = _mm512_shuffle_f32x4(tmp4083, tmp4083, 221);
__m512 tmp4099 = _mm512_shuffle_f32x4(tmp4084, tmp4084, 136);
__m512 tmp4100 = _mm512_shuffle_f32x4(tmp4084, tmp4084, 221);
tmp3965 = _mm512_shuffle_f32x4(tmp4085, tmp4093, 136);
tmp3973 = _mm512_shuffle_f32x4(tmp4085, tmp4093, 221);
tmp3966 = _mm512_shuffle_f32x4(tmp4087, tmp4095, 136);
tmp3974 = _mm512_shuffle_f32x4(tmp4087, tmp4095, 221);
tmp3967 = _mm512_shuffle_f32x4(tmp4089, tmp4097, 136);
tmp3975 = _mm512_shuffle_f32x4(tmp4089, tmp4097, 221);
tmp3968 = _mm512_shuffle_f32x4(tmp4091, tmp4099, 136);
tmp3976 = _mm512_shuffle_f32x4(tmp4091, tmp4099, 221);
tmp3969 = _mm512_shuffle_f32x4(tmp4086, tmp4094, 136);
__m512 tmp4017 = _mm512_shuffle_f32x4(tmp4086, tmp4094, 221);
tmp3970 = _mm512_shuffle_f32x4(tmp4088, tmp4096, 136);
__m512 tmp4018 = _mm512_shuffle_f32x4(tmp4088, tmp4096, 221);
tmp3971 = _mm512_shuffle_f32x4(tmp4090, tmp4098, 136);
__m512 tmp4019 = _mm512_shuffle_f32x4(tmp4090, tmp4098, 221);
tmp3972 = _mm512_shuffle_f32x4(tmp4092, tmp4100, 136);
__m512 tmp4020 = _mm512_shuffle_f32x4(tmp4092, tmp4100, 221);
__m512 tmp4025 = _mm512_add_ps(tmp3966, tmp3967);
__m512 tmp4045 = _mm512_add_ps(tmp3974, tmp3975);
__m512 tmp4024 = _mm512_add_ps(tmp3968, tmp3969);
__m512 tmp4044 = _mm512_add_ps(tmp3976, tmp4017);
__m512 tmp4030 = _mm512_sub_ps(tmp3968, tmp3969);
__m512 tmp4050 = _mm512_sub_ps(tmp3976, tmp4017);
__m512 tmp4029 = _mm512_sub_ps(tmp3966, tmp3967);
__m512 tmp4049 = _mm512_sub_ps(tmp3974, tmp3975);
__m512 tmp4026 = _mm512_add_ps(tmp3970, tmp3971);
__m512 tmp4046 = _mm512_add_ps(tmp4018, tmp4019);
__m512 tmp4031 = _mm512_sub_ps(tmp3970, tmp3971);
__m512 tmp4051 = _mm512_sub_ps(tmp4018, tmp4019);
__m512 tmp4028 = _mm512_fmadd_ps(tmp4030, _mm512_set1_ps(2e+00f), tmp4029);
__m512 tmp4048 = _mm512_fmadd_ps(tmp4050, _mm512_set1_ps(2e+00f), tmp4049);
__m512 tmp4035 = _mm512_fmadd_ps(tmp4030, _mm512_set1_ps(8e+00f), tmp4029);
__m512 tmp4055 = _mm512_fmadd_ps(tmp4050, _mm512_set1_ps(8e+00f), tmp4049);
__m512 tmp4023 = _mm512_add_ps(tmp4024, tmp4025);
__m512 tmp4043 = _mm512_add_ps(tmp4044, tmp4045);
__m512 tmp4027 = _mm512_fmadd_ps(tmp4031, _mm512_set1_ps(1.6e+01f), tmp4028);
__m512 tmp4047 = _mm512_fmadd_ps(tmp4051, _mm512_set1_ps(1.6e+01f), tmp4048);
__m512 tmp4034 = _mm512_fmadd_ps(tmp4031, _mm512_set1_ps(4e+00f), tmp4035);
__m512 tmp4054 = _mm512_fmadd_ps(tmp4051, _mm512_set1_ps(4e+00f), tmp4055);
__m512 tmp4040 = _mm512_add_ps(tmp4031, tmp4029);
__m512 tmp4060 = _mm512_add_ps(tmp4051, tmp4049);
__m512 tmp4033 = _mm512_fmadd_ps(tmp4024, _mm512_set1_ps(4e+00f), tmp4025);
__m512 tmp4053 = _mm512_fmadd_ps(tmp4044, _mm512_set1_ps(4e+00f), tmp4045);
__m512 tmp4037 = _mm512_fmadd_ps(tmp4024, _mm512_set1_ps(1.6e+01f), tmp4025);
__m512 tmp4057 = _mm512_fmadd_ps(tmp4044, _mm512_set1_ps(1.6e+01f), tmp4045);
__m512 tmp4022 = _mm512_add_ps(tmp4023, tmp3965);
__m512 tmp4042 = _mm512_add_ps(tmp4043, tmp3973);
__m512 tmp4039 = _mm512_add_ps(tmp4040, tmp3972);
__m512 tmp4059 = _mm512_add_ps(tmp4060, tmp4020);
__m512 tmp4021 = _mm512_fmadd_ps(tmp4026, _mm512_set1_ps(3.2e+01f), tmp4022);
__m512 tmp4041 = _mm512_fmadd_ps(tmp4046, _mm512_set1_ps(3.2e+01f), tmp4042);
__m512 tmp4032 = _mm512_fmadd_ps(tmp4026, _mm512_set1_ps(8e+00f), tmp4033);
__m512 tmp4052 = _mm512_fmadd_ps(tmp4046, _mm512_set1_ps(8e+00f), tmp4053);
__m512 tmp4038 = _mm512_fmadd_ps(tmp4030, _mm512_set1_ps(3.2e+01f), tmp4039);
__m512 tmp4058 = _mm512_fmadd_ps(tmp4050, _mm512_set1_ps(3.2e+01f), tmp4059);
__m512 tmp4036 = _mm512_fmadd_ps(tmp4026, _mm512_set1_ps(2e+00f), tmp4037);
__m512 tmp4056 = _mm512_fmadd_ps(tmp4046, _mm512_set1_ps(2e+00f), tmp4057);
__m512 out603 = tmp4021;
__m512 out609 = tmp4041;
__m512 out604 = tmp4027;
__m512 out610 = tmp4047;
__m512 out605 = tmp4032;
__m512 out611 = tmp4052;
__m512 out606 = tmp4034;
__m512 out612 = tmp4054;
__m512 out607 = tmp4036;
__m512 out613 = tmp4056;
__m512 out608 = tmp4038;
__m512 out614 = tmp4058;
out603 = _mm512_max_ps(_mm512_setzero_ps(), out603);
out609 = _mm512_max_ps(_mm512_setzero_ps(), out609);
out604 = _mm512_max_ps(_mm512_setzero_ps(), out604);
out610 = _mm512_max_ps(_mm512_setzero_ps(), out610);
out605 = _mm512_max_ps(_mm512_setzero_ps(), out605);
out611 = _mm512_max_ps(_mm512_setzero_ps(), out611);
out606 = _mm512_max_ps(_mm512_setzero_ps(), out606);
out612 = _mm512_max_ps(_mm512_setzero_ps(), out612);
out607 = _mm512_max_ps(_mm512_setzero_ps(), out607);
out613 = _mm512_max_ps(_mm512_setzero_ps(), out613);
out608 = _mm512_max_ps(_mm512_setzero_ps(), out608);
out614 = _mm512_max_ps(_mm512_setzero_ps(), out614);
_mm512_mask_storeu_ps(datPtr6+0+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out603);
_mm512_mask_storeu_ps(datPtr6+48+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out609);
_mm512_mask_storeu_ps(datPtr6+224+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out604);
_mm512_mask_storeu_ps(datPtr6+272+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out610);
_mm512_mask_storeu_ps(datPtr6+448+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out605);
_mm512_mask_storeu_ps(datPtr6+496+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out611);
_mm512_mask_storeu_ps(datPtr6+672+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out606);
_mm512_mask_storeu_ps(datPtr6+720+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out612);
_mm512_mask_storeu_ps(datPtr6+896+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out607);
_mm512_mask_storeu_ps(datPtr6+944+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out613);
_mm512_mask_storeu_ps(datPtr6+1120+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out608);
_mm512_mask_storeu_ps(datPtr6+1168+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out614);
__m512 sf257 = _mm512_loadu_ps(sfPtr5+256+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf258 = _mm512_loadu_ps(sfPtr5+384+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in620 = _mm512_shuffle_f32x4(sf257, sf258, 68);
__m512 in621 = _mm512_shuffle_f32x4(sf257, sf258, 238);
__m512 sf259 = _mm512_loadu_ps(sfPtr5+320+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf260 = _mm512_loadu_ps(sfPtr5+448+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in628 = _mm512_shuffle_f32x4(sf259, sf260, 68);
__m512 in629 = _mm512_shuffle_f32x4(sf259, sf260, 238);
__m512 sf261 = _mm512_loadu_ps(sfPtr5+409856+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf262 = _mm512_loadu_ps(sfPtr5+409984+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in622 = _mm512_shuffle_f32x4(sf261, sf262, 68);
__m512 in623 = _mm512_shuffle_f32x4(sf261, sf262, 238);
__m512 sf263 = _mm512_loadu_ps(sfPtr5+409920+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf264 = _mm512_loadu_ps(sfPtr5+410048+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in630 = _mm512_shuffle_f32x4(sf263, sf264, 68);
__m512 in631 = _mm512_shuffle_f32x4(sf263, sf264, 238);
__m512 sf265 = _mm512_loadu_ps(sfPtr5+819456+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf266 = _mm512_loadu_ps(sfPtr5+819584+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in624 = _mm512_shuffle_f32x4(sf265, sf266, 68);
__m512 in625 = _mm512_shuffle_f32x4(sf265, sf266, 238);
__m512 sf267 = _mm512_loadu_ps(sfPtr5+819520+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf268 = _mm512_loadu_ps(sfPtr5+819648+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in632 = _mm512_shuffle_f32x4(sf267, sf268, 68);
__m512 in633 = _mm512_shuffle_f32x4(sf267, sf268, 238);
__m512 sf269 = _mm512_loadu_ps(sfPtr5+1229056+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf270 = _mm512_loadu_ps(sfPtr5+1229184+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in626 = _mm512_shuffle_f32x4(sf269, sf270, 68);
__m512 in627 = _mm512_shuffle_f32x4(sf269, sf270, 238);
__m512 sf271 = _mm512_loadu_ps(sfPtr5+1229120+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf272 = _mm512_loadu_ps(sfPtr5+1229248+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in634 = _mm512_shuffle_f32x4(sf271, sf272, 68);
__m512 in635 = _mm512_shuffle_f32x4(sf271, sf272, 238);
__m512 tmp4117 = _mm512_add_ps(in621, in622);
__m512 tmp4137 = _mm512_add_ps(in629, in630);
__m512 tmp4116 = _mm512_add_ps(in623, in624);
__m512 tmp4136 = _mm512_add_ps(in631, in632);
__m512 tmp4122 = _mm512_sub_ps(in623, in624);
__m512 tmp4142 = _mm512_sub_ps(in631, in632);
__m512 tmp4121 = _mm512_sub_ps(in621, in622);
__m512 tmp4141 = _mm512_sub_ps(in629, in630);
__m512 tmp4118 = _mm512_add_ps(in625, in626);
__m512 tmp4138 = _mm512_add_ps(in633, in634);
__m512 tmp4123 = _mm512_sub_ps(in625, in626);
__m512 tmp4143 = _mm512_sub_ps(in633, in634);
__m512 tmp4120 = _mm512_fmadd_ps(tmp4122, _mm512_set1_ps(2e+00f), tmp4121);
__m512 tmp4140 = _mm512_fmadd_ps(tmp4142, _mm512_set1_ps(2e+00f), tmp4141);
__m512 tmp4127 = _mm512_fmadd_ps(tmp4122, _mm512_set1_ps(8e+00f), tmp4121);
__m512 tmp4147 = _mm512_fmadd_ps(tmp4142, _mm512_set1_ps(8e+00f), tmp4141);
__m512 tmp4115 = _mm512_add_ps(tmp4116, tmp4117);
__m512 tmp4135 = _mm512_add_ps(tmp4136, tmp4137);
__m512 tmp4119 = _mm512_fmadd_ps(tmp4123, _mm512_set1_ps(1.6e+01f), tmp4120);
__m512 tmp4139 = _mm512_fmadd_ps(tmp4143, _mm512_set1_ps(1.6e+01f), tmp4140);
__m512 tmp4126 = _mm512_fmadd_ps(tmp4123, _mm512_set1_ps(4e+00f), tmp4127);
__m512 tmp4146 = _mm512_fmadd_ps(tmp4143, _mm512_set1_ps(4e+00f), tmp4147);
__m512 tmp4132 = _mm512_add_ps(tmp4123, tmp4121);
__m512 tmp4152 = _mm512_add_ps(tmp4143, tmp4141);
__m512 tmp4125 = _mm512_fmadd_ps(tmp4116, _mm512_set1_ps(4e+00f), tmp4117);
__m512 tmp4145 = _mm512_fmadd_ps(tmp4136, _mm512_set1_ps(4e+00f), tmp4137);
__m512 tmp4129 = _mm512_fmadd_ps(tmp4116, _mm512_set1_ps(1.6e+01f), tmp4117);
__m512 tmp4149 = _mm512_fmadd_ps(tmp4136, _mm512_set1_ps(1.6e+01f), tmp4137);
__m512 tmp4114 = _mm512_add_ps(tmp4115, in620);
__m512 tmp4134 = _mm512_add_ps(tmp4135, in628);
__m512 tmp4131 = _mm512_add_ps(tmp4132, in627);
__m512 tmp4151 = _mm512_add_ps(tmp4152, in635);
__m512 tmp4113 = _mm512_fmadd_ps(tmp4118, _mm512_set1_ps(3.2e+01f), tmp4114);
__m512 tmp4133 = _mm512_fmadd_ps(tmp4138, _mm512_set1_ps(3.2e+01f), tmp4134);
__m512 tmp4124 = _mm512_fmadd_ps(tmp4118, _mm512_set1_ps(8e+00f), tmp4125);
__m512 tmp4144 = _mm512_fmadd_ps(tmp4138, _mm512_set1_ps(8e+00f), tmp4145);
__m512 tmp4130 = _mm512_fmadd_ps(tmp4122, _mm512_set1_ps(3.2e+01f), tmp4131);
__m512 tmp4150 = _mm512_fmadd_ps(tmp4142, _mm512_set1_ps(3.2e+01f), tmp4151);
__m512 tmp4128 = _mm512_fmadd_ps(tmp4118, _mm512_set1_ps(2e+00f), tmp4129);
__m512 tmp4148 = _mm512_fmadd_ps(tmp4138, _mm512_set1_ps(2e+00f), tmp4149);
__m512 tmp4101 = tmp4113;
__m512 tmp4107 = tmp4133;
__m512 tmp4102 = tmp4119;
__m512 tmp4108 = tmp4139;
__m512 tmp4103 = tmp4124;
__m512 tmp4109 = tmp4144;
__m512 tmp4104 = tmp4126;
__m512 tmp4110 = tmp4146;
__m512 tmp4105 = tmp4128;
__m512 tmp4111 = tmp4148;
__m512 tmp4106 = tmp4130;
__m512 tmp4112 = tmp4150;
// Rearranges the twelve vectors tmp4101..tmp4112, applies an 8-input ->
// 6-output linear combination to each of the two resulting groups of eight,
// clamps the twelve results at zero and stores the low 12 floats of each.
//
// Step 1: interleave the 32-bit elements of adjacent vector pairs.
__m512 tmp4197 = _mm512_unpacklo_ps(tmp4101, tmp4102);
__m512 tmp4198 = _mm512_unpackhi_ps(tmp4101, tmp4102);
__m512 tmp4199 = _mm512_unpacklo_ps(tmp4103, tmp4104);
__m512 tmp4200 = _mm512_unpackhi_ps(tmp4103, tmp4104);
__m512 tmp4201 = _mm512_unpacklo_ps(tmp4105, tmp4106);
__m512 tmp4202 = _mm512_unpackhi_ps(tmp4105, tmp4106);
__m512 tmp4203 = _mm512_unpacklo_ps(tmp4107, tmp4108);
__m512 tmp4204 = _mm512_unpackhi_ps(tmp4107, tmp4108);
__m512 tmp4205 = _mm512_unpacklo_ps(tmp4109, tmp4110);
__m512 tmp4206 = _mm512_unpackhi_ps(tmp4109, tmp4110);
__m512 tmp4207 = _mm512_unpacklo_ps(tmp4111, tmp4112);
__m512 tmp4208 = _mm512_unpackhi_ps(tmp4111, tmp4112);
// Step 2: within every 128-bit lane, immediate 68 joins the low two elements
// of both operands and immediate 238 joins the high two elements.
__m512 tmp4209 = _mm512_shuffle_ps(tmp4197, tmp4199, 68);
__m512 tmp4210 = _mm512_shuffle_ps(tmp4197, tmp4199, 238);
__m512 tmp4211 = _mm512_shuffle_ps(tmp4198, tmp4200, 68);
__m512 tmp4212 = _mm512_shuffle_ps(tmp4198, tmp4200, 238);
__m512 tmp4213 = _mm512_shuffle_ps(tmp4201, tmp4203, 68);
__m512 tmp4214 = _mm512_shuffle_ps(tmp4201, tmp4203, 238);
__m512 tmp4215 = _mm512_shuffle_ps(tmp4202, tmp4204, 68);
__m512 tmp4216 = _mm512_shuffle_ps(tmp4202, tmp4204, 238);
__m512 tmp4217 = _mm512_shuffle_ps(tmp4205, tmp4207, 68);
__m512 tmp4218 = _mm512_shuffle_ps(tmp4205, tmp4207, 238);
__m512 tmp4219 = _mm512_shuffle_ps(tmp4206, tmp4208, 68);
__m512 tmp4220 = _mm512_shuffle_ps(tmp4206, tmp4208, 238);
// Step 3: regroup whole 128-bit lanes. Immediate 136 takes lanes 0 and 2 of
// each operand, immediate 221 takes lanes 1 and 3.
__m512 tmp4221 = _mm512_shuffle_f32x4(tmp4209, tmp4213, 136);
__m512 tmp4222 = _mm512_shuffle_f32x4(tmp4209, tmp4213, 221);
__m512 tmp4223 = _mm512_shuffle_f32x4(tmp4210, tmp4214, 136);
__m512 tmp4224 = _mm512_shuffle_f32x4(tmp4210, tmp4214, 221);
__m512 tmp4225 = _mm512_shuffle_f32x4(tmp4211, tmp4215, 136);
__m512 tmp4226 = _mm512_shuffle_f32x4(tmp4211, tmp4215, 221);
__m512 tmp4227 = _mm512_shuffle_f32x4(tmp4212, tmp4216, 136);
__m512 tmp4228 = _mm512_shuffle_f32x4(tmp4212, tmp4216, 221);
// tmp4217..tmp4220 are paired with themselves (presumably because only
// twelve, not sixteen, vectors feed this network -- TODO confirm).
__m512 tmp4229 = _mm512_shuffle_f32x4(tmp4217, tmp4217, 136);
__m512 tmp4230 = _mm512_shuffle_f32x4(tmp4217, tmp4217, 221);
__m512 tmp4231 = _mm512_shuffle_f32x4(tmp4218, tmp4218, 136);
__m512 tmp4232 = _mm512_shuffle_f32x4(tmp4218, tmp4218, 221);
__m512 tmp4233 = _mm512_shuffle_f32x4(tmp4219, tmp4219, 136);
__m512 tmp4234 = _mm512_shuffle_f32x4(tmp4219, tmp4219, 221);
__m512 tmp4235 = _mm512_shuffle_f32x4(tmp4220, tmp4220, 136);
__m512 tmp4236 = _mm512_shuffle_f32x4(tmp4220, tmp4220, 221);
// Step 4: final lane regrouping. The results overwrite tmp4101..tmp4112 and
// add four new vectors tmp4153..tmp4156, giving sixteen vectors in total.
// NOTE(review): steps 1-4 look like a 16x16 float transpose with four rows
// absent -- confirm against the generator.
tmp4101 = _mm512_shuffle_f32x4(tmp4221, tmp4229, 136);
tmp4109 = _mm512_shuffle_f32x4(tmp4221, tmp4229, 221);
tmp4102 = _mm512_shuffle_f32x4(tmp4223, tmp4231, 136);
tmp4110 = _mm512_shuffle_f32x4(tmp4223, tmp4231, 221);
tmp4103 = _mm512_shuffle_f32x4(tmp4225, tmp4233, 136);
tmp4111 = _mm512_shuffle_f32x4(tmp4225, tmp4233, 221);
tmp4104 = _mm512_shuffle_f32x4(tmp4227, tmp4235, 136);
tmp4112 = _mm512_shuffle_f32x4(tmp4227, tmp4235, 221);
tmp4105 = _mm512_shuffle_f32x4(tmp4222, tmp4230, 136);
__m512 tmp4153 = _mm512_shuffle_f32x4(tmp4222, tmp4230, 221);
tmp4106 = _mm512_shuffle_f32x4(tmp4224, tmp4232, 136);
__m512 tmp4154 = _mm512_shuffle_f32x4(tmp4224, tmp4232, 221);
tmp4107 = _mm512_shuffle_f32x4(tmp4226, tmp4234, 136);
__m512 tmp4155 = _mm512_shuffle_f32x4(tmp4226, tmp4234, 221);
tmp4108 = _mm512_shuffle_f32x4(tmp4228, tmp4236, 136);
__m512 tmp4156 = _mm512_shuffle_f32x4(tmp4228, tmp4236, 221);
// Linear combination, done for two groups with interleaved statements.
// Group 1 inputs x0..x7 = tmp4101..tmp4108; group 2 inputs x0..x7 =
// tmp4109..tmp4112, tmp4153..tmp4156. Each group produces
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): presumably a Winograd-style output transform -- confirm.
// Pairwise sums and differences first.
__m512 tmp4161 = _mm512_add_ps(tmp4102, tmp4103);
__m512 tmp4181 = _mm512_add_ps(tmp4110, tmp4111);
__m512 tmp4160 = _mm512_add_ps(tmp4104, tmp4105);
__m512 tmp4180 = _mm512_add_ps(tmp4112, tmp4153);
__m512 tmp4166 = _mm512_sub_ps(tmp4104, tmp4105);
__m512 tmp4186 = _mm512_sub_ps(tmp4112, tmp4153);
__m512 tmp4165 = _mm512_sub_ps(tmp4102, tmp4103);
__m512 tmp4185 = _mm512_sub_ps(tmp4110, tmp4111);
__m512 tmp4162 = _mm512_add_ps(tmp4106, tmp4107);
__m512 tmp4182 = _mm512_add_ps(tmp4154, tmp4155);
__m512 tmp4167 = _mm512_sub_ps(tmp4106, tmp4107);
__m512 tmp4187 = _mm512_sub_ps(tmp4154, tmp4155);
// Weighted accumulation with fused multiply-adds (a*b + c).
__m512 tmp4164 = _mm512_fmadd_ps(tmp4166, _mm512_set1_ps(2e+00f), tmp4165);
__m512 tmp4184 = _mm512_fmadd_ps(tmp4186, _mm512_set1_ps(2e+00f), tmp4185);
__m512 tmp4171 = _mm512_fmadd_ps(tmp4166, _mm512_set1_ps(8e+00f), tmp4165);
__m512 tmp4191 = _mm512_fmadd_ps(tmp4186, _mm512_set1_ps(8e+00f), tmp4185);
__m512 tmp4159 = _mm512_add_ps(tmp4160, tmp4161);
__m512 tmp4179 = _mm512_add_ps(tmp4180, tmp4181);
__m512 tmp4163 = _mm512_fmadd_ps(tmp4167, _mm512_set1_ps(1.6e+01f), tmp4164);
__m512 tmp4183 = _mm512_fmadd_ps(tmp4187, _mm512_set1_ps(1.6e+01f), tmp4184);
__m512 tmp4170 = _mm512_fmadd_ps(tmp4167, _mm512_set1_ps(4e+00f), tmp4171);
__m512 tmp4190 = _mm512_fmadd_ps(tmp4187, _mm512_set1_ps(4e+00f), tmp4191);
__m512 tmp4176 = _mm512_add_ps(tmp4167, tmp4165);
__m512 tmp4196 = _mm512_add_ps(tmp4187, tmp4185);
__m512 tmp4169 = _mm512_fmadd_ps(tmp4160, _mm512_set1_ps(4e+00f), tmp4161);
__m512 tmp4189 = _mm512_fmadd_ps(tmp4180, _mm512_set1_ps(4e+00f), tmp4181);
__m512 tmp4173 = _mm512_fmadd_ps(tmp4160, _mm512_set1_ps(1.6e+01f), tmp4161);
__m512 tmp4193 = _mm512_fmadd_ps(tmp4180, _mm512_set1_ps(1.6e+01f), tmp4181);
// x0 enters only y0 and x7 enters only y5.
__m512 tmp4158 = _mm512_add_ps(tmp4159, tmp4101);
__m512 tmp4178 = _mm512_add_ps(tmp4179, tmp4109);
__m512 tmp4175 = _mm512_add_ps(tmp4176, tmp4108);
__m512 tmp4195 = _mm512_add_ps(tmp4196, tmp4156);
__m512 tmp4157 = _mm512_fmadd_ps(tmp4162, _mm512_set1_ps(3.2e+01f), tmp4158);
__m512 tmp4177 = _mm512_fmadd_ps(tmp4182, _mm512_set1_ps(3.2e+01f), tmp4178);
__m512 tmp4168 = _mm512_fmadd_ps(tmp4162, _mm512_set1_ps(8e+00f), tmp4169);
__m512 tmp4188 = _mm512_fmadd_ps(tmp4182, _mm512_set1_ps(8e+00f), tmp4189);
__m512 tmp4174 = _mm512_fmadd_ps(tmp4166, _mm512_set1_ps(3.2e+01f), tmp4175);
__m512 tmp4194 = _mm512_fmadd_ps(tmp4186, _mm512_set1_ps(3.2e+01f), tmp4195);
__m512 tmp4172 = _mm512_fmadd_ps(tmp4162, _mm512_set1_ps(2e+00f), tmp4173);
__m512 tmp4192 = _mm512_fmadd_ps(tmp4182, _mm512_set1_ps(2e+00f), tmp4193);
// Name the results: out615..out620 are y0..y5 of group 1 and out621..out626
// are y0..y5 of group 2.
__m512 out615 = tmp4157;
__m512 out621 = tmp4177;
__m512 out616 = tmp4163;
__m512 out622 = tmp4183;
__m512 out617 = tmp4168;
__m512 out623 = tmp4188;
__m512 out618 = tmp4170;
__m512 out624 = tmp4190;
__m512 out619 = tmp4172;
__m512 out625 = tmp4192;
__m512 out620 = tmp4174;
__m512 out626 = tmp4194;
// Clamp at zero: max(0, x) (ReLU).
out615 = _mm512_max_ps(_mm512_setzero_ps(), out615);
out621 = _mm512_max_ps(_mm512_setzero_ps(), out621);
out616 = _mm512_max_ps(_mm512_setzero_ps(), out616);
out622 = _mm512_max_ps(_mm512_setzero_ps(), out622);
out617 = _mm512_max_ps(_mm512_setzero_ps(), out617);
out623 = _mm512_max_ps(_mm512_setzero_ps(), out623);
out618 = _mm512_max_ps(_mm512_setzero_ps(), out618);
out624 = _mm512_max_ps(_mm512_setzero_ps(), out624);
out619 = _mm512_max_ps(_mm512_setzero_ps(), out619);
out625 = _mm512_max_ps(_mm512_setzero_ps(), out625);
out620 = _mm512_max_ps(_mm512_setzero_ps(), out620);
out626 = _mm512_max_ps(_mm512_setzero_ps(), out626);
// Masked stores: mask 4095 (0x0FFF) writes only the low 12 floats of each
// vector. Within a group the constant offset grows by 224 per result, which
// is the same factor that multiplies toH25; group 1 starts at +96 and group 2
// at +12608.
_mm512_mask_storeu_ps(datPtr6+96+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out615);
_mm512_mask_storeu_ps(datPtr6+12608+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out621);
_mm512_mask_storeu_ps(datPtr6+320+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out616);
_mm512_mask_storeu_ps(datPtr6+12832+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out622);
_mm512_mask_storeu_ps(datPtr6+544+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out617);
_mm512_mask_storeu_ps(datPtr6+13056+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out623);
_mm512_mask_storeu_ps(datPtr6+768+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out618);
_mm512_mask_storeu_ps(datPtr6+13280+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out624);
_mm512_mask_storeu_ps(datPtr6+992+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out619);
_mm512_mask_storeu_ps(datPtr6+13504+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out625);
_mm512_mask_storeu_ps(datPtr6+1216+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out620);
_mm512_mask_storeu_ps(datPtr6+13728+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out626);
// Loads sixteen vectors from sfPtr5, applies an 8-input -> 6-output linear
// combination, rearranges the twelve results with a shuffle network, applies
// the same combination again, clamps at zero and stores the low 12 floats of
// each of the twelve final vectors to datPtr6.
//
// Loads: four groups whose constant offsets start at 512, 410112, 819712 and
// 1229312 (409600 apart), each read as four vectors at +0, +64, +128, +192.
// Immediate 68 joins 128-bit lanes 0-1 of both loads and immediate 238 joins
// lanes 2-3, so every pair of loads yields two inputs (in636/in637, ...).
__m512 sf273 = _mm512_loadu_ps(sfPtr5+512+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf274 = _mm512_loadu_ps(sfPtr5+640+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in636 = _mm512_shuffle_f32x4(sf273, sf274, 68);
__m512 in637 = _mm512_shuffle_f32x4(sf273, sf274, 238);
__m512 sf275 = _mm512_loadu_ps(sfPtr5+576+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf276 = _mm512_loadu_ps(sfPtr5+704+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in644 = _mm512_shuffle_f32x4(sf275, sf276, 68);
__m512 in645 = _mm512_shuffle_f32x4(sf275, sf276, 238);
__m512 sf277 = _mm512_loadu_ps(sfPtr5+410112+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf278 = _mm512_loadu_ps(sfPtr5+410240+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in638 = _mm512_shuffle_f32x4(sf277, sf278, 68);
__m512 in639 = _mm512_shuffle_f32x4(sf277, sf278, 238);
__m512 sf279 = _mm512_loadu_ps(sfPtr5+410176+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf280 = _mm512_loadu_ps(sfPtr5+410304+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in646 = _mm512_shuffle_f32x4(sf279, sf280, 68);
__m512 in647 = _mm512_shuffle_f32x4(sf279, sf280, 238);
__m512 sf281 = _mm512_loadu_ps(sfPtr5+819712+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf282 = _mm512_loadu_ps(sfPtr5+819840+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in640 = _mm512_shuffle_f32x4(sf281, sf282, 68);
__m512 in641 = _mm512_shuffle_f32x4(sf281, sf282, 238);
__m512 sf283 = _mm512_loadu_ps(sfPtr5+819776+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf284 = _mm512_loadu_ps(sfPtr5+819904+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in648 = _mm512_shuffle_f32x4(sf283, sf284, 68);
__m512 in649 = _mm512_shuffle_f32x4(sf283, sf284, 238);
__m512 sf285 = _mm512_loadu_ps(sfPtr5+1229312+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf286 = _mm512_loadu_ps(sfPtr5+1229440+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in642 = _mm512_shuffle_f32x4(sf285, sf286, 68);
__m512 in643 = _mm512_shuffle_f32x4(sf285, sf286, 238);
__m512 sf287 = _mm512_loadu_ps(sfPtr5+1229376+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 sf288 = _mm512_loadu_ps(sfPtr5+1229504+1638400*i18+24576*j13+1536*k67+768*l19);
__m512 in650 = _mm512_shuffle_f32x4(sf287, sf288, 68);
__m512 in651 = _mm512_shuffle_f32x4(sf287, sf288, 238);
// First linear combination, done for two groups with interleaved statements.
// Group 1 inputs x0..x7 = in636..in643; group 2 inputs x0..x7 =
// in644..in651. Each group produces
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): presumably a Winograd-style output transform -- confirm.
__m512 tmp4253 = _mm512_add_ps(in637, in638);
__m512 tmp4273 = _mm512_add_ps(in645, in646);
__m512 tmp4252 = _mm512_add_ps(in639, in640);
__m512 tmp4272 = _mm512_add_ps(in647, in648);
__m512 tmp4258 = _mm512_sub_ps(in639, in640);
__m512 tmp4278 = _mm512_sub_ps(in647, in648);
__m512 tmp4257 = _mm512_sub_ps(in637, in638);
__m512 tmp4277 = _mm512_sub_ps(in645, in646);
__m512 tmp4254 = _mm512_add_ps(in641, in642);
__m512 tmp4274 = _mm512_add_ps(in649, in650);
__m512 tmp4259 = _mm512_sub_ps(in641, in642);
__m512 tmp4279 = _mm512_sub_ps(in649, in650);
// Weighted accumulation with fused multiply-adds (a*b + c).
__m512 tmp4256 = _mm512_fmadd_ps(tmp4258, _mm512_set1_ps(2e+00f), tmp4257);
__m512 tmp4276 = _mm512_fmadd_ps(tmp4278, _mm512_set1_ps(2e+00f), tmp4277);
__m512 tmp4263 = _mm512_fmadd_ps(tmp4258, _mm512_set1_ps(8e+00f), tmp4257);
__m512 tmp4283 = _mm512_fmadd_ps(tmp4278, _mm512_set1_ps(8e+00f), tmp4277);
__m512 tmp4251 = _mm512_add_ps(tmp4252, tmp4253);
__m512 tmp4271 = _mm512_add_ps(tmp4272, tmp4273);
__m512 tmp4255 = _mm512_fmadd_ps(tmp4259, _mm512_set1_ps(1.6e+01f), tmp4256);
__m512 tmp4275 = _mm512_fmadd_ps(tmp4279, _mm512_set1_ps(1.6e+01f), tmp4276);
__m512 tmp4262 = _mm512_fmadd_ps(tmp4259, _mm512_set1_ps(4e+00f), tmp4263);
__m512 tmp4282 = _mm512_fmadd_ps(tmp4279, _mm512_set1_ps(4e+00f), tmp4283);
__m512 tmp4268 = _mm512_add_ps(tmp4259, tmp4257);
__m512 tmp4288 = _mm512_add_ps(tmp4279, tmp4277);
__m512 tmp4261 = _mm512_fmadd_ps(tmp4252, _mm512_set1_ps(4e+00f), tmp4253);
__m512 tmp4281 = _mm512_fmadd_ps(tmp4272, _mm512_set1_ps(4e+00f), tmp4273);
__m512 tmp4265 = _mm512_fmadd_ps(tmp4252, _mm512_set1_ps(1.6e+01f), tmp4253);
__m512 tmp4285 = _mm512_fmadd_ps(tmp4272, _mm512_set1_ps(1.6e+01f), tmp4273);
// x0 enters only y0 and x7 enters only y5.
__m512 tmp4250 = _mm512_add_ps(tmp4251, in636);
__m512 tmp4270 = _mm512_add_ps(tmp4271, in644);
__m512 tmp4267 = _mm512_add_ps(tmp4268, in643);
__m512 tmp4287 = _mm512_add_ps(tmp4288, in651);
__m512 tmp4249 = _mm512_fmadd_ps(tmp4254, _mm512_set1_ps(3.2e+01f), tmp4250);
__m512 tmp4269 = _mm512_fmadd_ps(tmp4274, _mm512_set1_ps(3.2e+01f), tmp4270);
__m512 tmp4260 = _mm512_fmadd_ps(tmp4254, _mm512_set1_ps(8e+00f), tmp4261);
__m512 tmp4280 = _mm512_fmadd_ps(tmp4274, _mm512_set1_ps(8e+00f), tmp4281);
__m512 tmp4266 = _mm512_fmadd_ps(tmp4258, _mm512_set1_ps(3.2e+01f), tmp4267);
__m512 tmp4286 = _mm512_fmadd_ps(tmp4278, _mm512_set1_ps(3.2e+01f), tmp4287);
__m512 tmp4264 = _mm512_fmadd_ps(tmp4254, _mm512_set1_ps(2e+00f), tmp4265);
__m512 tmp4284 = _mm512_fmadd_ps(tmp4274, _mm512_set1_ps(2e+00f), tmp4285);
// Collect the twelve first-pass results: tmp4237..tmp4242 hold y0..y5 of
// group 1 and tmp4243..tmp4248 hold y0..y5 of group 2.
__m512 tmp4237 = tmp4249;
__m512 tmp4243 = tmp4269;
__m512 tmp4238 = tmp4255;
__m512 tmp4244 = tmp4275;
__m512 tmp4239 = tmp4260;
__m512 tmp4245 = tmp4280;
__m512 tmp4240 = tmp4262;
__m512 tmp4246 = tmp4282;
__m512 tmp4241 = tmp4264;
__m512 tmp4247 = tmp4284;
__m512 tmp4242 = tmp4266;
__m512 tmp4248 = tmp4286;
// Shuffle network, step 1: interleave the 32-bit elements of adjacent pairs.
__m512 tmp4333 = _mm512_unpacklo_ps(tmp4237, tmp4238);
__m512 tmp4334 = _mm512_unpackhi_ps(tmp4237, tmp4238);
__m512 tmp4335 = _mm512_unpacklo_ps(tmp4239, tmp4240);
__m512 tmp4336 = _mm512_unpackhi_ps(tmp4239, tmp4240);
__m512 tmp4337 = _mm512_unpacklo_ps(tmp4241, tmp4242);
__m512 tmp4338 = _mm512_unpackhi_ps(tmp4241, tmp4242);
__m512 tmp4339 = _mm512_unpacklo_ps(tmp4243, tmp4244);
__m512 tmp4340 = _mm512_unpackhi_ps(tmp4243, tmp4244);
__m512 tmp4341 = _mm512_unpacklo_ps(tmp4245, tmp4246);
__m512 tmp4342 = _mm512_unpackhi_ps(tmp4245, tmp4246);
__m512 tmp4343 = _mm512_unpacklo_ps(tmp4247, tmp4248);
__m512 tmp4344 = _mm512_unpackhi_ps(tmp4247, tmp4248);
// Step 2: within every 128-bit lane, immediate 68 joins the low two elements
// of both operands and immediate 238 joins the high two elements.
__m512 tmp4345 = _mm512_shuffle_ps(tmp4333, tmp4335, 68);
__m512 tmp4346 = _mm512_shuffle_ps(tmp4333, tmp4335, 238);
__m512 tmp4347 = _mm512_shuffle_ps(tmp4334, tmp4336, 68);
__m512 tmp4348 = _mm512_shuffle_ps(tmp4334, tmp4336, 238);
__m512 tmp4349 = _mm512_shuffle_ps(tmp4337, tmp4339, 68);
__m512 tmp4350 = _mm512_shuffle_ps(tmp4337, tmp4339, 238);
__m512 tmp4351 = _mm512_shuffle_ps(tmp4338, tmp4340, 68);
__m512 tmp4352 = _mm512_shuffle_ps(tmp4338, tmp4340, 238);
__m512 tmp4353 = _mm512_shuffle_ps(tmp4341, tmp4343, 68);
__m512 tmp4354 = _mm512_shuffle_ps(tmp4341, tmp4343, 238);
__m512 tmp4355 = _mm512_shuffle_ps(tmp4342, tmp4344, 68);
__m512 tmp4356 = _mm512_shuffle_ps(tmp4342, tmp4344, 238);
// Step 3: regroup whole 128-bit lanes. Immediate 136 takes lanes 0 and 2 of
// each operand, immediate 221 takes lanes 1 and 3.
__m512 tmp4357 = _mm512_shuffle_f32x4(tmp4345, tmp4349, 136);
__m512 tmp4358 = _mm512_shuffle_f32x4(tmp4345, tmp4349, 221);
__m512 tmp4359 = _mm512_shuffle_f32x4(tmp4346, tmp4350, 136);
__m512 tmp4360 = _mm512_shuffle_f32x4(tmp4346, tmp4350, 221);
__m512 tmp4361 = _mm512_shuffle_f32x4(tmp4347, tmp4351, 136);
__m512 tmp4362 = _mm512_shuffle_f32x4(tmp4347, tmp4351, 221);
__m512 tmp4363 = _mm512_shuffle_f32x4(tmp4348, tmp4352, 136);
__m512 tmp4364 = _mm512_shuffle_f32x4(tmp4348, tmp4352, 221);
// tmp4353..tmp4356 are paired with themselves (presumably because only
// twelve, not sixteen, vectors feed this network -- TODO confirm).
__m512 tmp4365 = _mm512_shuffle_f32x4(tmp4353, tmp4353, 136);
__m512 tmp4366 = _mm512_shuffle_f32x4(tmp4353, tmp4353, 221);
__m512 tmp4367 = _mm512_shuffle_f32x4(tmp4354, tmp4354, 136);
__m512 tmp4368 = _mm512_shuffle_f32x4(tmp4354, tmp4354, 221);
__m512 tmp4369 = _mm512_shuffle_f32x4(tmp4355, tmp4355, 136);
__m512 tmp4370 = _mm512_shuffle_f32x4(tmp4355, tmp4355, 221);
__m512 tmp4371 = _mm512_shuffle_f32x4(tmp4356, tmp4356, 136);
__m512 tmp4372 = _mm512_shuffle_f32x4(tmp4356, tmp4356, 221);
// Step 4: final lane regrouping. The results overwrite tmp4237..tmp4248 and
// add four new vectors tmp4289..tmp4292, giving sixteen vectors in total.
// NOTE(review): steps 1-4 look like a 16x16 float transpose with four rows
// absent -- confirm against the generator.
tmp4237 = _mm512_shuffle_f32x4(tmp4357, tmp4365, 136);
tmp4245 = _mm512_shuffle_f32x4(tmp4357, tmp4365, 221);
tmp4238 = _mm512_shuffle_f32x4(tmp4359, tmp4367, 136);
tmp4246 = _mm512_shuffle_f32x4(tmp4359, tmp4367, 221);
tmp4239 = _mm512_shuffle_f32x4(tmp4361, tmp4369, 136);
tmp4247 = _mm512_shuffle_f32x4(tmp4361, tmp4369, 221);
tmp4240 = _mm512_shuffle_f32x4(tmp4363, tmp4371, 136);
tmp4248 = _mm512_shuffle_f32x4(tmp4363, tmp4371, 221);
tmp4241 = _mm512_shuffle_f32x4(tmp4358, tmp4366, 136);
__m512 tmp4289 = _mm512_shuffle_f32x4(tmp4358, tmp4366, 221);
tmp4242 = _mm512_shuffle_f32x4(tmp4360, tmp4368, 136);
__m512 tmp4290 = _mm512_shuffle_f32x4(tmp4360, tmp4368, 221);
tmp4243 = _mm512_shuffle_f32x4(tmp4362, tmp4370, 136);
__m512 tmp4291 = _mm512_shuffle_f32x4(tmp4362, tmp4370, 221);
tmp4244 = _mm512_shuffle_f32x4(tmp4364, tmp4372, 136);
__m512 tmp4292 = _mm512_shuffle_f32x4(tmp4364, tmp4372, 221);
// Second linear combination, same y0..y5 formulas as the first. Group 1
// inputs x0..x7 = tmp4237..tmp4244; group 2 inputs x0..x7 =
// tmp4245..tmp4248, tmp4289..tmp4292.
__m512 tmp4297 = _mm512_add_ps(tmp4238, tmp4239);
__m512 tmp4317 = _mm512_add_ps(tmp4246, tmp4247);
__m512 tmp4296 = _mm512_add_ps(tmp4240, tmp4241);
__m512 tmp4316 = _mm512_add_ps(tmp4248, tmp4289);
__m512 tmp4302 = _mm512_sub_ps(tmp4240, tmp4241);
__m512 tmp4322 = _mm512_sub_ps(tmp4248, tmp4289);
__m512 tmp4301 = _mm512_sub_ps(tmp4238, tmp4239);
__m512 tmp4321 = _mm512_sub_ps(tmp4246, tmp4247);
__m512 tmp4298 = _mm512_add_ps(tmp4242, tmp4243);
__m512 tmp4318 = _mm512_add_ps(tmp4290, tmp4291);
__m512 tmp4303 = _mm512_sub_ps(tmp4242, tmp4243);
__m512 tmp4323 = _mm512_sub_ps(tmp4290, tmp4291);
__m512 tmp4300 = _mm512_fmadd_ps(tmp4302, _mm512_set1_ps(2e+00f), tmp4301);
__m512 tmp4320 = _mm512_fmadd_ps(tmp4322, _mm512_set1_ps(2e+00f), tmp4321);
__m512 tmp4307 = _mm512_fmadd_ps(tmp4302, _mm512_set1_ps(8e+00f), tmp4301);
__m512 tmp4327 = _mm512_fmadd_ps(tmp4322, _mm512_set1_ps(8e+00f), tmp4321);
__m512 tmp4295 = _mm512_add_ps(tmp4296, tmp4297);
__m512 tmp4315 = _mm512_add_ps(tmp4316, tmp4317);
__m512 tmp4299 = _mm512_fmadd_ps(tmp4303, _mm512_set1_ps(1.6e+01f), tmp4300);
__m512 tmp4319 = _mm512_fmadd_ps(tmp4323, _mm512_set1_ps(1.6e+01f), tmp4320);
__m512 tmp4306 = _mm512_fmadd_ps(tmp4303, _mm512_set1_ps(4e+00f), tmp4307);
__m512 tmp4326 = _mm512_fmadd_ps(tmp4323, _mm512_set1_ps(4e+00f), tmp4327);
__m512 tmp4312 = _mm512_add_ps(tmp4303, tmp4301);
__m512 tmp4332 = _mm512_add_ps(tmp4323, tmp4321);
__m512 tmp4305 = _mm512_fmadd_ps(tmp4296, _mm512_set1_ps(4e+00f), tmp4297);
__m512 tmp4325 = _mm512_fmadd_ps(tmp4316, _mm512_set1_ps(4e+00f), tmp4317);
__m512 tmp4309 = _mm512_fmadd_ps(tmp4296, _mm512_set1_ps(1.6e+01f), tmp4297);
__m512 tmp4329 = _mm512_fmadd_ps(tmp4316, _mm512_set1_ps(1.6e+01f), tmp4317);
__m512 tmp4294 = _mm512_add_ps(tmp4295, tmp4237);
__m512 tmp4314 = _mm512_add_ps(tmp4315, tmp4245);
__m512 tmp4311 = _mm512_add_ps(tmp4312, tmp4244);
__m512 tmp4331 = _mm512_add_ps(tmp4332, tmp4292);
__m512 tmp4293 = _mm512_fmadd_ps(tmp4298, _mm512_set1_ps(3.2e+01f), tmp4294);
__m512 tmp4313 = _mm512_fmadd_ps(tmp4318, _mm512_set1_ps(3.2e+01f), tmp4314);
__m512 tmp4304 = _mm512_fmadd_ps(tmp4298, _mm512_set1_ps(8e+00f), tmp4305);
__m512 tmp4324 = _mm512_fmadd_ps(tmp4318, _mm512_set1_ps(8e+00f), tmp4325);
__m512 tmp4310 = _mm512_fmadd_ps(tmp4302, _mm512_set1_ps(3.2e+01f), tmp4311);
__m512 tmp4330 = _mm512_fmadd_ps(tmp4322, _mm512_set1_ps(3.2e+01f), tmp4331);
__m512 tmp4308 = _mm512_fmadd_ps(tmp4298, _mm512_set1_ps(2e+00f), tmp4309);
__m512 tmp4328 = _mm512_fmadd_ps(tmp4318, _mm512_set1_ps(2e+00f), tmp4329);
// Name the results: out627..out632 are y0..y5 of group 1 and out633..out638
// are y0..y5 of group 2.
__m512 out627 = tmp4293;
__m512 out633 = tmp4313;
__m512 out628 = tmp4299;
__m512 out634 = tmp4319;
__m512 out629 = tmp4304;
__m512 out635 = tmp4324;
__m512 out630 = tmp4306;
__m512 out636 = tmp4326;
__m512 out631 = tmp4308;
__m512 out637 = tmp4328;
__m512 out632 = tmp4310;
__m512 out638 = tmp4330;
// Clamp at zero: max(0, x) (ReLU).
out627 = _mm512_max_ps(_mm512_setzero_ps(), out627);
out633 = _mm512_max_ps(_mm512_setzero_ps(), out633);
out628 = _mm512_max_ps(_mm512_setzero_ps(), out628);
out634 = _mm512_max_ps(_mm512_setzero_ps(), out634);
out629 = _mm512_max_ps(_mm512_setzero_ps(), out629);
out635 = _mm512_max_ps(_mm512_setzero_ps(), out635);
out630 = _mm512_max_ps(_mm512_setzero_ps(), out630);
out636 = _mm512_max_ps(_mm512_setzero_ps(), out636);
out631 = _mm512_max_ps(_mm512_setzero_ps(), out631);
out637 = _mm512_max_ps(_mm512_setzero_ps(), out637);
out632 = _mm512_max_ps(_mm512_setzero_ps(), out632);
out638 = _mm512_max_ps(_mm512_setzero_ps(), out638);
// Masked stores: mask 4095 (0x0FFF) writes only the low 12 floats of each
// vector. Within a group the constant offset grows by 224 per result, which
// is the same factor that multiplies toH25; group 1 starts at +12656 and
// group 2 sits 48 further on at +12704.
_mm512_mask_storeu_ps(datPtr6+12656+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out627);
_mm512_mask_storeu_ps(datPtr6+12704+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out633);
_mm512_mask_storeu_ps(datPtr6+12880+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out628);
_mm512_mask_storeu_ps(datPtr6+12928+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out634);
_mm512_mask_storeu_ps(datPtr6+13104+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out629);
_mm512_mask_storeu_ps(datPtr6+13152+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out635);
_mm512_mask_storeu_ps(datPtr6+13328+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out630);
_mm512_mask_storeu_ps(datPtr6+13376+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out636);
_mm512_mask_storeu_ps(datPtr6+13552+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out631);
_mm512_mask_storeu_ps(datPtr6+13600+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out637);
_mm512_mask_storeu_ps(datPtr6+13776+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out632);
_mm512_mask_storeu_ps(datPtr6+13824+806912*i18+224*toH25+4*toW25+50432*k67+25216*l19, 4095, out638);
}
}
// Return from the enclosing function once j13 has reached last4; otherwise
// advance j13 (which scales by 24576 in the sfPtr5 load offsets) and set
// rel11 to 4.
// NOTE(review): the role of rel11 is not visible in this part of the file.
if (j13 >= last4) return;
++j13;
rel11 = 4;
}
// Coordinates for the stores that follow: toH26 is multiplied by 224 and
// toW26 by 4 in the datPtr6 offsets. toH26 is base11 plus 12 and toW26 is
// fixed at 36.
ptrdiff_t toH26 = base11+12;
ptrdiff_t toW26 = 36;
// Start value of the k68 loop counter; the loop that follows runs until
// k68 equals 16.
ptrdiff_t k68 = 16*w33;
for (; k68 != 16; ++k68) {
ptrdiff_t l20 = 0;
for (; l20 != 2; ++l20) {
// Loop body for one (k68, l20) pair: loads sixteen vectors from sfPtr5,
// applies an 8-input -> 6-output linear combination, rearranges the twelve
// results with a shuffle network, applies the same combination again and
// clamps the twelve final vectors out639..out650 at zero.
//
// Loads: four groups whose constant offsets start at 0, 409600, 819200 and
// 1228800, each read as four vectors at +0, +64, +128, +192. Immediate 68
// joins 128-bit lanes 0-1 of both loads and immediate 238 joins lanes 2-3,
// so every pair of loads yields two inputs (in652/in653, ...).
__m512 sf289 = _mm512_loadu_ps(sfPtr5+0+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf290 = _mm512_loadu_ps(sfPtr5+128+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in652 = _mm512_shuffle_f32x4(sf289, sf290, 68);
__m512 in653 = _mm512_shuffle_f32x4(sf289, sf290, 238);
__m512 sf291 = _mm512_loadu_ps(sfPtr5+64+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf292 = _mm512_loadu_ps(sfPtr5+192+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in660 = _mm512_shuffle_f32x4(sf291, sf292, 68);
__m512 in661 = _mm512_shuffle_f32x4(sf291, sf292, 238);
__m512 sf293 = _mm512_loadu_ps(sfPtr5+409600+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf294 = _mm512_loadu_ps(sfPtr5+409728+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in654 = _mm512_shuffle_f32x4(sf293, sf294, 68);
__m512 in655 = _mm512_shuffle_f32x4(sf293, sf294, 238);
__m512 sf295 = _mm512_loadu_ps(sfPtr5+409664+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf296 = _mm512_loadu_ps(sfPtr5+409792+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in662 = _mm512_shuffle_f32x4(sf295, sf296, 68);
__m512 in663 = _mm512_shuffle_f32x4(sf295, sf296, 238);
__m512 sf297 = _mm512_loadu_ps(sfPtr5+819200+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf298 = _mm512_loadu_ps(sfPtr5+819328+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in656 = _mm512_shuffle_f32x4(sf297, sf298, 68);
__m512 in657 = _mm512_shuffle_f32x4(sf297, sf298, 238);
__m512 sf299 = _mm512_loadu_ps(sfPtr5+819264+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf300 = _mm512_loadu_ps(sfPtr5+819392+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in664 = _mm512_shuffle_f32x4(sf299, sf300, 68);
__m512 in665 = _mm512_shuffle_f32x4(sf299, sf300, 238);
__m512 sf301 = _mm512_loadu_ps(sfPtr5+1228800+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf302 = _mm512_loadu_ps(sfPtr5+1228928+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in658 = _mm512_shuffle_f32x4(sf301, sf302, 68);
__m512 in659 = _mm512_shuffle_f32x4(sf301, sf302, 238);
__m512 sf303 = _mm512_loadu_ps(sfPtr5+1228864+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf304 = _mm512_loadu_ps(sfPtr5+1228992+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in666 = _mm512_shuffle_f32x4(sf303, sf304, 68);
__m512 in667 = _mm512_shuffle_f32x4(sf303, sf304, 238);
// First linear combination, done for two groups with interleaved statements.
// Group 1 inputs x0..x7 = in652..in659; group 2 inputs x0..x7 =
// in660..in667. Each group produces
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): presumably a Winograd-style output transform -- confirm.
__m512 tmp4389 = _mm512_add_ps(in653, in654);
__m512 tmp4409 = _mm512_add_ps(in661, in662);
__m512 tmp4388 = _mm512_add_ps(in655, in656);
__m512 tmp4408 = _mm512_add_ps(in663, in664);
__m512 tmp4394 = _mm512_sub_ps(in655, in656);
__m512 tmp4414 = _mm512_sub_ps(in663, in664);
__m512 tmp4393 = _mm512_sub_ps(in653, in654);
__m512 tmp4413 = _mm512_sub_ps(in661, in662);
__m512 tmp4390 = _mm512_add_ps(in657, in658);
__m512 tmp4410 = _mm512_add_ps(in665, in666);
__m512 tmp4395 = _mm512_sub_ps(in657, in658);
__m512 tmp4415 = _mm512_sub_ps(in665, in666);
// Weighted accumulation with fused multiply-adds (a*b + c).
__m512 tmp4392 = _mm512_fmadd_ps(tmp4394, _mm512_set1_ps(2e+00f), tmp4393);
__m512 tmp4412 = _mm512_fmadd_ps(tmp4414, _mm512_set1_ps(2e+00f), tmp4413);
__m512 tmp4399 = _mm512_fmadd_ps(tmp4394, _mm512_set1_ps(8e+00f), tmp4393);
__m512 tmp4419 = _mm512_fmadd_ps(tmp4414, _mm512_set1_ps(8e+00f), tmp4413);
__m512 tmp4387 = _mm512_add_ps(tmp4388, tmp4389);
__m512 tmp4407 = _mm512_add_ps(tmp4408, tmp4409);
__m512 tmp4391 = _mm512_fmadd_ps(tmp4395, _mm512_set1_ps(1.6e+01f), tmp4392);
__m512 tmp4411 = _mm512_fmadd_ps(tmp4415, _mm512_set1_ps(1.6e+01f), tmp4412);
__m512 tmp4398 = _mm512_fmadd_ps(tmp4395, _mm512_set1_ps(4e+00f), tmp4399);
__m512 tmp4418 = _mm512_fmadd_ps(tmp4415, _mm512_set1_ps(4e+00f), tmp4419);
__m512 tmp4404 = _mm512_add_ps(tmp4395, tmp4393);
__m512 tmp4424 = _mm512_add_ps(tmp4415, tmp4413);
__m512 tmp4397 = _mm512_fmadd_ps(tmp4388, _mm512_set1_ps(4e+00f), tmp4389);
__m512 tmp4417 = _mm512_fmadd_ps(tmp4408, _mm512_set1_ps(4e+00f), tmp4409);
__m512 tmp4401 = _mm512_fmadd_ps(tmp4388, _mm512_set1_ps(1.6e+01f), tmp4389);
__m512 tmp4421 = _mm512_fmadd_ps(tmp4408, _mm512_set1_ps(1.6e+01f), tmp4409);
// x0 enters only y0 and x7 enters only y5.
__m512 tmp4386 = _mm512_add_ps(tmp4387, in652);
__m512 tmp4406 = _mm512_add_ps(tmp4407, in660);
__m512 tmp4403 = _mm512_add_ps(tmp4404, in659);
__m512 tmp4423 = _mm512_add_ps(tmp4424, in667);
__m512 tmp4385 = _mm512_fmadd_ps(tmp4390, _mm512_set1_ps(3.2e+01f), tmp4386);
__m512 tmp4405 = _mm512_fmadd_ps(tmp4410, _mm512_set1_ps(3.2e+01f), tmp4406);
__m512 tmp4396 = _mm512_fmadd_ps(tmp4390, _mm512_set1_ps(8e+00f), tmp4397);
__m512 tmp4416 = _mm512_fmadd_ps(tmp4410, _mm512_set1_ps(8e+00f), tmp4417);
__m512 tmp4402 = _mm512_fmadd_ps(tmp4394, _mm512_set1_ps(3.2e+01f), tmp4403);
__m512 tmp4422 = _mm512_fmadd_ps(tmp4414, _mm512_set1_ps(3.2e+01f), tmp4423);
__m512 tmp4400 = _mm512_fmadd_ps(tmp4390, _mm512_set1_ps(2e+00f), tmp4401);
__m512 tmp4420 = _mm512_fmadd_ps(tmp4410, _mm512_set1_ps(2e+00f), tmp4421);
// Collect the twelve first-pass results: tmp4373..tmp4378 hold y0..y5 of
// group 1 and tmp4379..tmp4384 hold y0..y5 of group 2.
__m512 tmp4373 = tmp4385;
__m512 tmp4379 = tmp4405;
__m512 tmp4374 = tmp4391;
__m512 tmp4380 = tmp4411;
__m512 tmp4375 = tmp4396;
__m512 tmp4381 = tmp4416;
__m512 tmp4376 = tmp4398;
__m512 tmp4382 = tmp4418;
__m512 tmp4377 = tmp4400;
__m512 tmp4383 = tmp4420;
__m512 tmp4378 = tmp4402;
__m512 tmp4384 = tmp4422;
// Shuffle network, step 1: interleave the 32-bit elements of adjacent pairs.
__m512 tmp4469 = _mm512_unpacklo_ps(tmp4373, tmp4374);
__m512 tmp4470 = _mm512_unpackhi_ps(tmp4373, tmp4374);
__m512 tmp4471 = _mm512_unpacklo_ps(tmp4375, tmp4376);
__m512 tmp4472 = _mm512_unpackhi_ps(tmp4375, tmp4376);
__m512 tmp4473 = _mm512_unpacklo_ps(tmp4377, tmp4378);
__m512 tmp4474 = _mm512_unpackhi_ps(tmp4377, tmp4378);
__m512 tmp4475 = _mm512_unpacklo_ps(tmp4379, tmp4380);
__m512 tmp4476 = _mm512_unpackhi_ps(tmp4379, tmp4380);
__m512 tmp4477 = _mm512_unpacklo_ps(tmp4381, tmp4382);
__m512 tmp4478 = _mm512_unpackhi_ps(tmp4381, tmp4382);
__m512 tmp4479 = _mm512_unpacklo_ps(tmp4383, tmp4384);
__m512 tmp4480 = _mm512_unpackhi_ps(tmp4383, tmp4384);
// Step 2: within every 128-bit lane, immediate 68 joins the low two elements
// of both operands and immediate 238 joins the high two elements.
__m512 tmp4481 = _mm512_shuffle_ps(tmp4469, tmp4471, 68);
__m512 tmp4482 = _mm512_shuffle_ps(tmp4469, tmp4471, 238);
__m512 tmp4483 = _mm512_shuffle_ps(tmp4470, tmp4472, 68);
__m512 tmp4484 = _mm512_shuffle_ps(tmp4470, tmp4472, 238);
__m512 tmp4485 = _mm512_shuffle_ps(tmp4473, tmp4475, 68);
__m512 tmp4486 = _mm512_shuffle_ps(tmp4473, tmp4475, 238);
__m512 tmp4487 = _mm512_shuffle_ps(tmp4474, tmp4476, 68);
__m512 tmp4488 = _mm512_shuffle_ps(tmp4474, tmp4476, 238);
__m512 tmp4489 = _mm512_shuffle_ps(tmp4477, tmp4479, 68);
__m512 tmp4490 = _mm512_shuffle_ps(tmp4477, tmp4479, 238);
__m512 tmp4491 = _mm512_shuffle_ps(tmp4478, tmp4480, 68);
__m512 tmp4492 = _mm512_shuffle_ps(tmp4478, tmp4480, 238);
// Step 3: regroup whole 128-bit lanes. Immediate 136 takes lanes 0 and 2 of
// each operand, immediate 221 takes lanes 1 and 3.
__m512 tmp4493 = _mm512_shuffle_f32x4(tmp4481, tmp4485, 136);
__m512 tmp4494 = _mm512_shuffle_f32x4(tmp4481, tmp4485, 221);
__m512 tmp4495 = _mm512_shuffle_f32x4(tmp4482, tmp4486, 136);
__m512 tmp4496 = _mm512_shuffle_f32x4(tmp4482, tmp4486, 221);
__m512 tmp4497 = _mm512_shuffle_f32x4(tmp4483, tmp4487, 136);
__m512 tmp4498 = _mm512_shuffle_f32x4(tmp4483, tmp4487, 221);
__m512 tmp4499 = _mm512_shuffle_f32x4(tmp4484, tmp4488, 136);
__m512 tmp4500 = _mm512_shuffle_f32x4(tmp4484, tmp4488, 221);
// tmp4489..tmp4492 are paired with themselves (presumably because only
// twelve, not sixteen, vectors feed this network -- TODO confirm).
__m512 tmp4501 = _mm512_shuffle_f32x4(tmp4489, tmp4489, 136);
__m512 tmp4502 = _mm512_shuffle_f32x4(tmp4489, tmp4489, 221);
__m512 tmp4503 = _mm512_shuffle_f32x4(tmp4490, tmp4490, 136);
__m512 tmp4504 = _mm512_shuffle_f32x4(tmp4490, tmp4490, 221);
__m512 tmp4505 = _mm512_shuffle_f32x4(tmp4491, tmp4491, 136);
__m512 tmp4506 = _mm512_shuffle_f32x4(tmp4491, tmp4491, 221);
__m512 tmp4507 = _mm512_shuffle_f32x4(tmp4492, tmp4492, 136);
__m512 tmp4508 = _mm512_shuffle_f32x4(tmp4492, tmp4492, 221);
// Step 4: final lane regrouping. The results overwrite tmp4373..tmp4384 and
// add four new vectors tmp4425..tmp4428, giving sixteen vectors in total.
// NOTE(review): steps 1-4 look like a 16x16 float transpose with four rows
// absent -- confirm against the generator.
tmp4373 = _mm512_shuffle_f32x4(tmp4493, tmp4501, 136);
tmp4381 = _mm512_shuffle_f32x4(tmp4493, tmp4501, 221);
tmp4374 = _mm512_shuffle_f32x4(tmp4495, tmp4503, 136);
tmp4382 = _mm512_shuffle_f32x4(tmp4495, tmp4503, 221);
tmp4375 = _mm512_shuffle_f32x4(tmp4497, tmp4505, 136);
tmp4383 = _mm512_shuffle_f32x4(tmp4497, tmp4505, 221);
tmp4376 = _mm512_shuffle_f32x4(tmp4499, tmp4507, 136);
tmp4384 = _mm512_shuffle_f32x4(tmp4499, tmp4507, 221);
tmp4377 = _mm512_shuffle_f32x4(tmp4494, tmp4502, 136);
__m512 tmp4425 = _mm512_shuffle_f32x4(tmp4494, tmp4502, 221);
tmp4378 = _mm512_shuffle_f32x4(tmp4496, tmp4504, 136);
__m512 tmp4426 = _mm512_shuffle_f32x4(tmp4496, tmp4504, 221);
tmp4379 = _mm512_shuffle_f32x4(tmp4498, tmp4506, 136);
__m512 tmp4427 = _mm512_shuffle_f32x4(tmp4498, tmp4506, 221);
tmp4380 = _mm512_shuffle_f32x4(tmp4500, tmp4508, 136);
__m512 tmp4428 = _mm512_shuffle_f32x4(tmp4500, tmp4508, 221);
// Second linear combination, same y0..y5 formulas as the first. Group 1
// inputs x0..x7 = tmp4373..tmp4380; group 2 inputs x0..x7 =
// tmp4381..tmp4384, tmp4425..tmp4428.
__m512 tmp4433 = _mm512_add_ps(tmp4374, tmp4375);
__m512 tmp4453 = _mm512_add_ps(tmp4382, tmp4383);
__m512 tmp4432 = _mm512_add_ps(tmp4376, tmp4377);
__m512 tmp4452 = _mm512_add_ps(tmp4384, tmp4425);
__m512 tmp4438 = _mm512_sub_ps(tmp4376, tmp4377);
__m512 tmp4458 = _mm512_sub_ps(tmp4384, tmp4425);
__m512 tmp4437 = _mm512_sub_ps(tmp4374, tmp4375);
__m512 tmp4457 = _mm512_sub_ps(tmp4382, tmp4383);
__m512 tmp4434 = _mm512_add_ps(tmp4378, tmp4379);
__m512 tmp4454 = _mm512_add_ps(tmp4426, tmp4427);
__m512 tmp4439 = _mm512_sub_ps(tmp4378, tmp4379);
__m512 tmp4459 = _mm512_sub_ps(tmp4426, tmp4427);
__m512 tmp4436 = _mm512_fmadd_ps(tmp4438, _mm512_set1_ps(2e+00f), tmp4437);
__m512 tmp4456 = _mm512_fmadd_ps(tmp4458, _mm512_set1_ps(2e+00f), tmp4457);
__m512 tmp4443 = _mm512_fmadd_ps(tmp4438, _mm512_set1_ps(8e+00f), tmp4437);
__m512 tmp4463 = _mm512_fmadd_ps(tmp4458, _mm512_set1_ps(8e+00f), tmp4457);
__m512 tmp4431 = _mm512_add_ps(tmp4432, tmp4433);
__m512 tmp4451 = _mm512_add_ps(tmp4452, tmp4453);
__m512 tmp4435 = _mm512_fmadd_ps(tmp4439, _mm512_set1_ps(1.6e+01f), tmp4436);
__m512 tmp4455 = _mm512_fmadd_ps(tmp4459, _mm512_set1_ps(1.6e+01f), tmp4456);
__m512 tmp4442 = _mm512_fmadd_ps(tmp4439, _mm512_set1_ps(4e+00f), tmp4443);
__m512 tmp4462 = _mm512_fmadd_ps(tmp4459, _mm512_set1_ps(4e+00f), tmp4463);
__m512 tmp4448 = _mm512_add_ps(tmp4439, tmp4437);
__m512 tmp4468 = _mm512_add_ps(tmp4459, tmp4457);
__m512 tmp4441 = _mm512_fmadd_ps(tmp4432, _mm512_set1_ps(4e+00f), tmp4433);
__m512 tmp4461 = _mm512_fmadd_ps(tmp4452, _mm512_set1_ps(4e+00f), tmp4453);
__m512 tmp4445 = _mm512_fmadd_ps(tmp4432, _mm512_set1_ps(1.6e+01f), tmp4433);
__m512 tmp4465 = _mm512_fmadd_ps(tmp4452, _mm512_set1_ps(1.6e+01f), tmp4453);
__m512 tmp4430 = _mm512_add_ps(tmp4431, tmp4373);
__m512 tmp4450 = _mm512_add_ps(tmp4451, tmp4381);
__m512 tmp4447 = _mm512_add_ps(tmp4448, tmp4380);
__m512 tmp4467 = _mm512_add_ps(tmp4468, tmp4428);
__m512 tmp4429 = _mm512_fmadd_ps(tmp4434, _mm512_set1_ps(3.2e+01f), tmp4430);
__m512 tmp4449 = _mm512_fmadd_ps(tmp4454, _mm512_set1_ps(3.2e+01f), tmp4450);
__m512 tmp4440 = _mm512_fmadd_ps(tmp4434, _mm512_set1_ps(8e+00f), tmp4441);
__m512 tmp4460 = _mm512_fmadd_ps(tmp4454, _mm512_set1_ps(8e+00f), tmp4461);
__m512 tmp4446 = _mm512_fmadd_ps(tmp4438, _mm512_set1_ps(3.2e+01f), tmp4447);
__m512 tmp4466 = _mm512_fmadd_ps(tmp4458, _mm512_set1_ps(3.2e+01f), tmp4467);
__m512 tmp4444 = _mm512_fmadd_ps(tmp4434, _mm512_set1_ps(2e+00f), tmp4445);
__m512 tmp4464 = _mm512_fmadd_ps(tmp4454, _mm512_set1_ps(2e+00f), tmp4465);
// Name the results: out639..out644 are y0..y5 of group 1 and out645..out650
// are y0..y5 of group 2.
__m512 out639 = tmp4429;
__m512 out645 = tmp4449;
__m512 out640 = tmp4435;
__m512 out646 = tmp4455;
__m512 out641 = tmp4440;
__m512 out647 = tmp4460;
__m512 out642 = tmp4442;
__m512 out648 = tmp4462;
__m512 out643 = tmp4444;
__m512 out649 = tmp4464;
__m512 out644 = tmp4446;
__m512 out650 = tmp4466;
// Clamp at zero: max(0, x) (ReLU). The clamped vectors are written out by
// the masked stores that follow this span.
out639 = _mm512_max_ps(_mm512_setzero_ps(), out639);
out645 = _mm512_max_ps(_mm512_setzero_ps(), out645);
out640 = _mm512_max_ps(_mm512_setzero_ps(), out640);
out646 = _mm512_max_ps(_mm512_setzero_ps(), out646);
out641 = _mm512_max_ps(_mm512_setzero_ps(), out641);
out647 = _mm512_max_ps(_mm512_setzero_ps(), out647);
out642 = _mm512_max_ps(_mm512_setzero_ps(), out642);
out648 = _mm512_max_ps(_mm512_setzero_ps(), out648);
out643 = _mm512_max_ps(_mm512_setzero_ps(), out643);
out649 = _mm512_max_ps(_mm512_setzero_ps(), out649);
out644 = _mm512_max_ps(_mm512_setzero_ps(), out644);
out650 = _mm512_max_ps(_mm512_setzero_ps(), out650);
_mm512_mask_storeu_ps(datPtr6+0+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out639);
_mm512_mask_storeu_ps(datPtr6+48+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 255, out645);
_mm512_mask_storeu_ps(datPtr6+224+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out640);
_mm512_mask_storeu_ps(datPtr6+272+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 255, out646);
_mm512_mask_storeu_ps(datPtr6+448+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out641);
_mm512_mask_storeu_ps(datPtr6+496+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 255, out647);
_mm512_mask_storeu_ps(datPtr6+672+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out642);
_mm512_mask_storeu_ps(datPtr6+720+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 255, out648);
_mm512_mask_storeu_ps(datPtr6+896+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out643);
_mm512_mask_storeu_ps(datPtr6+944+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 255, out649);
_mm512_mask_storeu_ps(datPtr6+1120+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out644);
_mm512_mask_storeu_ps(datPtr6+1168+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 255, out650);
// ---- Slice starting at sfPtr5 constant offset +256:
// ---- load -> 8-to-6 combine -> permute -> 8-to-6 combine -> max(0, x) -> masked store.
//
// Load sixteen 512-bit vectors from sfPtr5. The constant parts of the offsets form four
// groups 409600 apart (256.., 409856.., 819456.., 1229056..), four loads per group, 64 apart.
// Loads that are 128 apart are recombined with _mm512_shuffle_f32x4:
//   imm 68  -> low  256 bits of the first load, then low  256 bits of the second;
//   imm 238 -> high 256 bits of the first load, then high 256 bits of the second.
// NOTE(review): the unit of these offsets depends on sfPtr5's pointee type, which is
// declared outside this view.
__m512 sf305 = _mm512_loadu_ps(sfPtr5+256+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf306 = _mm512_loadu_ps(sfPtr5+384+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in668 = _mm512_shuffle_f32x4(sf305, sf306, 68);
__m512 in669 = _mm512_shuffle_f32x4(sf305, sf306, 238);
__m512 sf307 = _mm512_loadu_ps(sfPtr5+320+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf308 = _mm512_loadu_ps(sfPtr5+448+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in676 = _mm512_shuffle_f32x4(sf307, sf308, 68);
__m512 in677 = _mm512_shuffle_f32x4(sf307, sf308, 238);
__m512 sf309 = _mm512_loadu_ps(sfPtr5+409856+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf310 = _mm512_loadu_ps(sfPtr5+409984+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in670 = _mm512_shuffle_f32x4(sf309, sf310, 68);
__m512 in671 = _mm512_shuffle_f32x4(sf309, sf310, 238);
__m512 sf311 = _mm512_loadu_ps(sfPtr5+409920+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf312 = _mm512_loadu_ps(sfPtr5+410048+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in678 = _mm512_shuffle_f32x4(sf311, sf312, 68);
__m512 in679 = _mm512_shuffle_f32x4(sf311, sf312, 238);
__m512 sf313 = _mm512_loadu_ps(sfPtr5+819456+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf314 = _mm512_loadu_ps(sfPtr5+819584+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in672 = _mm512_shuffle_f32x4(sf313, sf314, 68);
__m512 in673 = _mm512_shuffle_f32x4(sf313, sf314, 238);
__m512 sf315 = _mm512_loadu_ps(sfPtr5+819520+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf316 = _mm512_loadu_ps(sfPtr5+819648+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in680 = _mm512_shuffle_f32x4(sf315, sf316, 68);
__m512 in681 = _mm512_shuffle_f32x4(sf315, sf316, 238);
__m512 sf317 = _mm512_loadu_ps(sfPtr5+1229056+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf318 = _mm512_loadu_ps(sfPtr5+1229184+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in674 = _mm512_shuffle_f32x4(sf317, sf318, 68);
__m512 in675 = _mm512_shuffle_f32x4(sf317, sf318, 238);
__m512 sf319 = _mm512_loadu_ps(sfPtr5+1229120+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf320 = _mm512_loadu_ps(sfPtr5+1229248+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in682 = _mm512_shuffle_f32x4(sf319, sf320, 68);
__m512 in683 = _mm512_shuffle_f32x4(sf319, sf320, 238);
// First pass: two independent sets of eight inputs (in668..in675 and in676..in683) are each
// reduced to six values; statements for the two sets are interleaved. With r0..r7 the eight
// inputs of one set (ignoring FMA rounding):
//   y0 = r0 + (r1+r2) +    (r3+r4) + 32*(r5+r6)
//   y1 =      (r1-r2) +  2*(r3-r4) + 16*(r5-r6)
//   y2 =      (r1+r2) +  4*(r3+r4) +  8*(r5+r6)
//   y3 =      (r1-r2) +  8*(r3-r4) +  4*(r5-r6)
//   y4 =      (r1+r2) + 16*(r3+r4) +  2*(r5+r6)
//   y5 =      (r1-r2) + 32*(r3-r4) +    (r5-r6) + r7
// NOTE(review): these coefficients look like the output transform of Winograd F(6x6, 3x3)
// — confirm against the generator.
__m512 tmp4525 = _mm512_add_ps(in669, in670);
__m512 tmp4545 = _mm512_add_ps(in677, in678);
__m512 tmp4524 = _mm512_add_ps(in671, in672);
__m512 tmp4544 = _mm512_add_ps(in679, in680);
__m512 tmp4530 = _mm512_sub_ps(in671, in672);
__m512 tmp4550 = _mm512_sub_ps(in679, in680);
__m512 tmp4529 = _mm512_sub_ps(in669, in670);
__m512 tmp4549 = _mm512_sub_ps(in677, in678);
__m512 tmp4526 = _mm512_add_ps(in673, in674);
__m512 tmp4546 = _mm512_add_ps(in681, in682);
__m512 tmp4531 = _mm512_sub_ps(in673, in674);
__m512 tmp4551 = _mm512_sub_ps(in681, in682);
__m512 tmp4528 = _mm512_fmadd_ps(tmp4530, _mm512_set1_ps(2e+00f), tmp4529);
__m512 tmp4548 = _mm512_fmadd_ps(tmp4550, _mm512_set1_ps(2e+00f), tmp4549);
__m512 tmp4535 = _mm512_fmadd_ps(tmp4530, _mm512_set1_ps(8e+00f), tmp4529);
__m512 tmp4555 = _mm512_fmadd_ps(tmp4550, _mm512_set1_ps(8e+00f), tmp4549);
__m512 tmp4523 = _mm512_add_ps(tmp4524, tmp4525);
__m512 tmp4543 = _mm512_add_ps(tmp4544, tmp4545);
__m512 tmp4527 = _mm512_fmadd_ps(tmp4531, _mm512_set1_ps(1.6e+01f), tmp4528);
__m512 tmp4547 = _mm512_fmadd_ps(tmp4551, _mm512_set1_ps(1.6e+01f), tmp4548);
__m512 tmp4534 = _mm512_fmadd_ps(tmp4531, _mm512_set1_ps(4e+00f), tmp4535);
__m512 tmp4554 = _mm512_fmadd_ps(tmp4551, _mm512_set1_ps(4e+00f), tmp4555);
__m512 tmp4540 = _mm512_add_ps(tmp4531, tmp4529);
__m512 tmp4560 = _mm512_add_ps(tmp4551, tmp4549);
__m512 tmp4533 = _mm512_fmadd_ps(tmp4524, _mm512_set1_ps(4e+00f), tmp4525);
__m512 tmp4553 = _mm512_fmadd_ps(tmp4544, _mm512_set1_ps(4e+00f), tmp4545);
__m512 tmp4537 = _mm512_fmadd_ps(tmp4524, _mm512_set1_ps(1.6e+01f), tmp4525);
__m512 tmp4557 = _mm512_fmadd_ps(tmp4544, _mm512_set1_ps(1.6e+01f), tmp4545);
__m512 tmp4522 = _mm512_add_ps(tmp4523, in668);
__m512 tmp4542 = _mm512_add_ps(tmp4543, in676);
__m512 tmp4539 = _mm512_add_ps(tmp4540, in675);
__m512 tmp4559 = _mm512_add_ps(tmp4560, in683);
__m512 tmp4521 = _mm512_fmadd_ps(tmp4526, _mm512_set1_ps(3.2e+01f), tmp4522);
__m512 tmp4541 = _mm512_fmadd_ps(tmp4546, _mm512_set1_ps(3.2e+01f), tmp4542);
__m512 tmp4532 = _mm512_fmadd_ps(tmp4526, _mm512_set1_ps(8e+00f), tmp4533);
__m512 tmp4552 = _mm512_fmadd_ps(tmp4546, _mm512_set1_ps(8e+00f), tmp4553);
__m512 tmp4538 = _mm512_fmadd_ps(tmp4530, _mm512_set1_ps(3.2e+01f), tmp4539);
__m512 tmp4558 = _mm512_fmadd_ps(tmp4550, _mm512_set1_ps(3.2e+01f), tmp4559);
__m512 tmp4536 = _mm512_fmadd_ps(tmp4526, _mm512_set1_ps(2e+00f), tmp4537);
__m512 tmp4556 = _mm512_fmadd_ps(tmp4546, _mm512_set1_ps(2e+00f), tmp4557);
// Collect the twelve first-pass results: tmp4509..tmp4514 are y0..y5 of the first set,
// tmp4515..tmp4520 are y0..y5 of the second set.
__m512 tmp4509 = tmp4521;
__m512 tmp4515 = tmp4541;
__m512 tmp4510 = tmp4527;
__m512 tmp4516 = tmp4547;
__m512 tmp4511 = tmp4532;
__m512 tmp4517 = tmp4552;
__m512 tmp4512 = tmp4534;
__m512 tmp4518 = tmp4554;
__m512 tmp4513 = tmp4536;
__m512 tmp4519 = tmp4556;
__m512 tmp4514 = tmp4538;
__m512 tmp4520 = tmp4558;
// Permute the twelve vectors into sixteen (tmp4509..tmp4520 are overwritten, tmp4561..tmp4564
// are new) through an unpacklo/unpackhi -> shuffle_ps -> shuffle_f32x4 network.
// NOTE(review): presumably a transpose so that the second pass runs along the other tile
// axis — confirm.
__m512 tmp4605 = _mm512_unpacklo_ps(tmp4509, tmp4510);
__m512 tmp4606 = _mm512_unpackhi_ps(tmp4509, tmp4510);
__m512 tmp4607 = _mm512_unpacklo_ps(tmp4511, tmp4512);
__m512 tmp4608 = _mm512_unpackhi_ps(tmp4511, tmp4512);
__m512 tmp4609 = _mm512_unpacklo_ps(tmp4513, tmp4514);
__m512 tmp4610 = _mm512_unpackhi_ps(tmp4513, tmp4514);
__m512 tmp4611 = _mm512_unpacklo_ps(tmp4515, tmp4516);
__m512 tmp4612 = _mm512_unpackhi_ps(tmp4515, tmp4516);
__m512 tmp4613 = _mm512_unpacklo_ps(tmp4517, tmp4518);
__m512 tmp4614 = _mm512_unpackhi_ps(tmp4517, tmp4518);
__m512 tmp4615 = _mm512_unpacklo_ps(tmp4519, tmp4520);
__m512 tmp4616 = _mm512_unpackhi_ps(tmp4519, tmp4520);
__m512 tmp4617 = _mm512_shuffle_ps(tmp4605, tmp4607, 68);
__m512 tmp4618 = _mm512_shuffle_ps(tmp4605, tmp4607, 238);
__m512 tmp4619 = _mm512_shuffle_ps(tmp4606, tmp4608, 68);
__m512 tmp4620 = _mm512_shuffle_ps(tmp4606, tmp4608, 238);
__m512 tmp4621 = _mm512_shuffle_ps(tmp4609, tmp4611, 68);
__m512 tmp4622 = _mm512_shuffle_ps(tmp4609, tmp4611, 238);
__m512 tmp4623 = _mm512_shuffle_ps(tmp4610, tmp4612, 68);
__m512 tmp4624 = _mm512_shuffle_ps(tmp4610, tmp4612, 238);
__m512 tmp4625 = _mm512_shuffle_ps(tmp4613, tmp4615, 68);
__m512 tmp4626 = _mm512_shuffle_ps(tmp4613, tmp4615, 238);
__m512 tmp4627 = _mm512_shuffle_ps(tmp4614, tmp4616, 68);
__m512 tmp4628 = _mm512_shuffle_ps(tmp4614, tmp4616, 238);
__m512 tmp4629 = _mm512_shuffle_f32x4(tmp4617, tmp4621, 136);
__m512 tmp4630 = _mm512_shuffle_f32x4(tmp4617, tmp4621, 221);
__m512 tmp4631 = _mm512_shuffle_f32x4(tmp4618, tmp4622, 136);
__m512 tmp4632 = _mm512_shuffle_f32x4(tmp4618, tmp4622, 221);
__m512 tmp4633 = _mm512_shuffle_f32x4(tmp4619, tmp4623, 136);
__m512 tmp4634 = _mm512_shuffle_f32x4(tmp4619, tmp4623, 221);
__m512 tmp4635 = _mm512_shuffle_f32x4(tmp4620, tmp4624, 136);
__m512 tmp4636 = _mm512_shuffle_f32x4(tmp4620, tmp4624, 221);
// The last four shuffle_ps results have no partner vector, so each is shuffled with itself.
__m512 tmp4637 = _mm512_shuffle_f32x4(tmp4625, tmp4625, 136);
__m512 tmp4638 = _mm512_shuffle_f32x4(tmp4625, tmp4625, 221);
__m512 tmp4639 = _mm512_shuffle_f32x4(tmp4626, tmp4626, 136);
__m512 tmp4640 = _mm512_shuffle_f32x4(tmp4626, tmp4626, 221);
__m512 tmp4641 = _mm512_shuffle_f32x4(tmp4627, tmp4627, 136);
__m512 tmp4642 = _mm512_shuffle_f32x4(tmp4627, tmp4627, 221);
__m512 tmp4643 = _mm512_shuffle_f32x4(tmp4628, tmp4628, 136);
__m512 tmp4644 = _mm512_shuffle_f32x4(tmp4628, tmp4628, 221);
tmp4509 = _mm512_shuffle_f32x4(tmp4629, tmp4637, 136);
tmp4517 = _mm512_shuffle_f32x4(tmp4629, tmp4637, 221);
tmp4510 = _mm512_shuffle_f32x4(tmp4631, tmp4639, 136);
tmp4518 = _mm512_shuffle_f32x4(tmp4631, tmp4639, 221);
tmp4511 = _mm512_shuffle_f32x4(tmp4633, tmp4641, 136);
tmp4519 = _mm512_shuffle_f32x4(tmp4633, tmp4641, 221);
tmp4512 = _mm512_shuffle_f32x4(tmp4635, tmp4643, 136);
tmp4520 = _mm512_shuffle_f32x4(tmp4635, tmp4643, 221);
tmp4513 = _mm512_shuffle_f32x4(tmp4630, tmp4638, 136);
__m512 tmp4561 = _mm512_shuffle_f32x4(tmp4630, tmp4638, 221);
tmp4514 = _mm512_shuffle_f32x4(tmp4632, tmp4640, 136);
__m512 tmp4562 = _mm512_shuffle_f32x4(tmp4632, tmp4640, 221);
tmp4515 = _mm512_shuffle_f32x4(tmp4634, tmp4642, 136);
__m512 tmp4563 = _mm512_shuffle_f32x4(tmp4634, tmp4642, 221);
tmp4516 = _mm512_shuffle_f32x4(tmp4636, tmp4644, 136);
__m512 tmp4564 = _mm512_shuffle_f32x4(tmp4636, tmp4644, 221);
// Second pass: the same 8-to-6 combination as the first pass, applied to the permuted
// vectors. Set A uses r0..r7 = tmp4509..tmp4516; set B uses r0..r7 = tmp4517..tmp4520,
// tmp4561..tmp4564.
__m512 tmp4569 = _mm512_add_ps(tmp4510, tmp4511);
__m512 tmp4589 = _mm512_add_ps(tmp4518, tmp4519);
__m512 tmp4568 = _mm512_add_ps(tmp4512, tmp4513);
__m512 tmp4588 = _mm512_add_ps(tmp4520, tmp4561);
__m512 tmp4574 = _mm512_sub_ps(tmp4512, tmp4513);
__m512 tmp4594 = _mm512_sub_ps(tmp4520, tmp4561);
__m512 tmp4573 = _mm512_sub_ps(tmp4510, tmp4511);
__m512 tmp4593 = _mm512_sub_ps(tmp4518, tmp4519);
__m512 tmp4570 = _mm512_add_ps(tmp4514, tmp4515);
__m512 tmp4590 = _mm512_add_ps(tmp4562, tmp4563);
__m512 tmp4575 = _mm512_sub_ps(tmp4514, tmp4515);
__m512 tmp4595 = _mm512_sub_ps(tmp4562, tmp4563);
__m512 tmp4572 = _mm512_fmadd_ps(tmp4574, _mm512_set1_ps(2e+00f), tmp4573);
__m512 tmp4592 = _mm512_fmadd_ps(tmp4594, _mm512_set1_ps(2e+00f), tmp4593);
__m512 tmp4579 = _mm512_fmadd_ps(tmp4574, _mm512_set1_ps(8e+00f), tmp4573);
__m512 tmp4599 = _mm512_fmadd_ps(tmp4594, _mm512_set1_ps(8e+00f), tmp4593);
__m512 tmp4567 = _mm512_add_ps(tmp4568, tmp4569);
__m512 tmp4587 = _mm512_add_ps(tmp4588, tmp4589);
__m512 tmp4571 = _mm512_fmadd_ps(tmp4575, _mm512_set1_ps(1.6e+01f), tmp4572);
__m512 tmp4591 = _mm512_fmadd_ps(tmp4595, _mm512_set1_ps(1.6e+01f), tmp4592);
__m512 tmp4578 = _mm512_fmadd_ps(tmp4575, _mm512_set1_ps(4e+00f), tmp4579);
__m512 tmp4598 = _mm512_fmadd_ps(tmp4595, _mm512_set1_ps(4e+00f), tmp4599);
__m512 tmp4584 = _mm512_add_ps(tmp4575, tmp4573);
__m512 tmp4604 = _mm512_add_ps(tmp4595, tmp4593);
__m512 tmp4577 = _mm512_fmadd_ps(tmp4568, _mm512_set1_ps(4e+00f), tmp4569);
__m512 tmp4597 = _mm512_fmadd_ps(tmp4588, _mm512_set1_ps(4e+00f), tmp4589);
__m512 tmp4581 = _mm512_fmadd_ps(tmp4568, _mm512_set1_ps(1.6e+01f), tmp4569);
__m512 tmp4601 = _mm512_fmadd_ps(tmp4588, _mm512_set1_ps(1.6e+01f), tmp4589);
__m512 tmp4566 = _mm512_add_ps(tmp4567, tmp4509);
__m512 tmp4586 = _mm512_add_ps(tmp4587, tmp4517);
__m512 tmp4583 = _mm512_add_ps(tmp4584, tmp4516);
__m512 tmp4603 = _mm512_add_ps(tmp4604, tmp4564);
__m512 tmp4565 = _mm512_fmadd_ps(tmp4570, _mm512_set1_ps(3.2e+01f), tmp4566);
__m512 tmp4585 = _mm512_fmadd_ps(tmp4590, _mm512_set1_ps(3.2e+01f), tmp4586);
__m512 tmp4576 = _mm512_fmadd_ps(tmp4570, _mm512_set1_ps(8e+00f), tmp4577);
__m512 tmp4596 = _mm512_fmadd_ps(tmp4590, _mm512_set1_ps(8e+00f), tmp4597);
__m512 tmp4582 = _mm512_fmadd_ps(tmp4574, _mm512_set1_ps(3.2e+01f), tmp4583);
__m512 tmp4602 = _mm512_fmadd_ps(tmp4594, _mm512_set1_ps(3.2e+01f), tmp4603);
__m512 tmp4580 = _mm512_fmadd_ps(tmp4570, _mm512_set1_ps(2e+00f), tmp4581);
__m512 tmp4600 = _mm512_fmadd_ps(tmp4590, _mm512_set1_ps(2e+00f), tmp4601);
// Name the twelve second-pass results: out651..out656 are y0..y5 of set A,
// out657..out662 are y0..y5 of set B.
__m512 out651 = tmp4565;
__m512 out657 = tmp4585;
__m512 out652 = tmp4571;
__m512 out658 = tmp4591;
__m512 out653 = tmp4576;
__m512 out659 = tmp4596;
__m512 out654 = tmp4578;
__m512 out660 = tmp4598;
__m512 out655 = tmp4580;
__m512 out661 = tmp4600;
__m512 out656 = tmp4582;
__m512 out662 = tmp4602;
// Clamp at zero: each output becomes max(0, output).
out651 = _mm512_max_ps(_mm512_setzero_ps(), out651);
out657 = _mm512_max_ps(_mm512_setzero_ps(), out657);
out652 = _mm512_max_ps(_mm512_setzero_ps(), out652);
out658 = _mm512_max_ps(_mm512_setzero_ps(), out658);
out653 = _mm512_max_ps(_mm512_setzero_ps(), out653);
out659 = _mm512_max_ps(_mm512_setzero_ps(), out659);
out654 = _mm512_max_ps(_mm512_setzero_ps(), out654);
out660 = _mm512_max_ps(_mm512_setzero_ps(), out660);
out655 = _mm512_max_ps(_mm512_setzero_ps(), out655);
out661 = _mm512_max_ps(_mm512_setzero_ps(), out661);
out656 = _mm512_max_ps(_mm512_setzero_ps(), out656);
out662 = _mm512_max_ps(_mm512_setzero_ps(), out662);
// Masked unaligned stores to datPtr6. Mask 4095 (0x0FFF) writes only the low 12 lanes.
// out651..out656 go to constant offsets 1200 + 224*n, out657..out662 to 12608 + 224*n
// (n = 0..5); 12608 is half of the 25216 stride applied to l20.
// NOTE(review): the unit of these offsets depends on datPtr6's pointee type, which is
// declared outside this view.
_mm512_mask_storeu_ps(datPtr6+1200+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out651);
_mm512_mask_storeu_ps(datPtr6+12608+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out657);
_mm512_mask_storeu_ps(datPtr6+1424+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out652);
_mm512_mask_storeu_ps(datPtr6+12832+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out658);
_mm512_mask_storeu_ps(datPtr6+1648+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out653);
_mm512_mask_storeu_ps(datPtr6+13056+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out659);
_mm512_mask_storeu_ps(datPtr6+1872+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out654);
_mm512_mask_storeu_ps(datPtr6+13280+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out660);
_mm512_mask_storeu_ps(datPtr6+2096+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out655);
_mm512_mask_storeu_ps(datPtr6+13504+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out661);
_mm512_mask_storeu_ps(datPtr6+2320+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out656);
_mm512_mask_storeu_ps(datPtr6+13728+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out662);
// ---- Slice starting at sfPtr5 constant offset +512:
// ---- load -> 8-to-6 combine -> permute -> 8-to-6 combine -> max(0, x) -> masked store.
//
// Load sixteen 512-bit vectors from sfPtr5. The constant parts of the offsets form four
// groups 409600 apart (512.., 410112.., 819712.., 1229312..), four loads per group, 64 apart.
// Loads that are 128 apart are recombined with _mm512_shuffle_f32x4:
//   imm 68  -> low  256 bits of the first load, then low  256 bits of the second;
//   imm 238 -> high 256 bits of the first load, then high 256 bits of the second.
// NOTE(review): the unit of these offsets depends on sfPtr5's pointee type, which is
// declared outside this view.
__m512 sf321 = _mm512_loadu_ps(sfPtr5+512+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf322 = _mm512_loadu_ps(sfPtr5+640+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in684 = _mm512_shuffle_f32x4(sf321, sf322, 68);
__m512 in685 = _mm512_shuffle_f32x4(sf321, sf322, 238);
__m512 sf323 = _mm512_loadu_ps(sfPtr5+576+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf324 = _mm512_loadu_ps(sfPtr5+704+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in692 = _mm512_shuffle_f32x4(sf323, sf324, 68);
__m512 in693 = _mm512_shuffle_f32x4(sf323, sf324, 238);
__m512 sf325 = _mm512_loadu_ps(sfPtr5+410112+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf326 = _mm512_loadu_ps(sfPtr5+410240+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in686 = _mm512_shuffle_f32x4(sf325, sf326, 68);
__m512 in687 = _mm512_shuffle_f32x4(sf325, sf326, 238);
__m512 sf327 = _mm512_loadu_ps(sfPtr5+410176+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf328 = _mm512_loadu_ps(sfPtr5+410304+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in694 = _mm512_shuffle_f32x4(sf327, sf328, 68);
__m512 in695 = _mm512_shuffle_f32x4(sf327, sf328, 238);
__m512 sf329 = _mm512_loadu_ps(sfPtr5+819712+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf330 = _mm512_loadu_ps(sfPtr5+819840+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in688 = _mm512_shuffle_f32x4(sf329, sf330, 68);
__m512 in689 = _mm512_shuffle_f32x4(sf329, sf330, 238);
__m512 sf331 = _mm512_loadu_ps(sfPtr5+819776+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf332 = _mm512_loadu_ps(sfPtr5+819904+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in696 = _mm512_shuffle_f32x4(sf331, sf332, 68);
__m512 in697 = _mm512_shuffle_f32x4(sf331, sf332, 238);
__m512 sf333 = _mm512_loadu_ps(sfPtr5+1229312+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf334 = _mm512_loadu_ps(sfPtr5+1229440+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in690 = _mm512_shuffle_f32x4(sf333, sf334, 68);
__m512 in691 = _mm512_shuffle_f32x4(sf333, sf334, 238);
__m512 sf335 = _mm512_loadu_ps(sfPtr5+1229376+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 sf336 = _mm512_loadu_ps(sfPtr5+1229504+1638400*i18+24576*j13+1536*k68+768*l20);
__m512 in698 = _mm512_shuffle_f32x4(sf335, sf336, 68);
__m512 in699 = _mm512_shuffle_f32x4(sf335, sf336, 238);
// First pass: two independent sets of eight inputs (in684..in691 and in692..in699) are each
// reduced to six values; statements for the two sets are interleaved. With r0..r7 the eight
// inputs of one set (ignoring FMA rounding):
//   y0 = r0 + (r1+r2) +    (r3+r4) + 32*(r5+r6)
//   y1 =      (r1-r2) +  2*(r3-r4) + 16*(r5-r6)
//   y2 =      (r1+r2) +  4*(r3+r4) +  8*(r5+r6)
//   y3 =      (r1-r2) +  8*(r3-r4) +  4*(r5-r6)
//   y4 =      (r1+r2) + 16*(r3+r4) +  2*(r5+r6)
//   y5 =      (r1-r2) + 32*(r3-r4) +    (r5-r6) + r7
// NOTE(review): these coefficients look like the output transform of Winograd F(6x6, 3x3)
// — confirm against the generator.
__m512 tmp4661 = _mm512_add_ps(in685, in686);
__m512 tmp4681 = _mm512_add_ps(in693, in694);
__m512 tmp4660 = _mm512_add_ps(in687, in688);
__m512 tmp4680 = _mm512_add_ps(in695, in696);
__m512 tmp4666 = _mm512_sub_ps(in687, in688);
__m512 tmp4686 = _mm512_sub_ps(in695, in696);
__m512 tmp4665 = _mm512_sub_ps(in685, in686);
__m512 tmp4685 = _mm512_sub_ps(in693, in694);
__m512 tmp4662 = _mm512_add_ps(in689, in690);
__m512 tmp4682 = _mm512_add_ps(in697, in698);
__m512 tmp4667 = _mm512_sub_ps(in689, in690);
__m512 tmp4687 = _mm512_sub_ps(in697, in698);
__m512 tmp4664 = _mm512_fmadd_ps(tmp4666, _mm512_set1_ps(2e+00f), tmp4665);
__m512 tmp4684 = _mm512_fmadd_ps(tmp4686, _mm512_set1_ps(2e+00f), tmp4685);
__m512 tmp4671 = _mm512_fmadd_ps(tmp4666, _mm512_set1_ps(8e+00f), tmp4665);
__m512 tmp4691 = _mm512_fmadd_ps(tmp4686, _mm512_set1_ps(8e+00f), tmp4685);
__m512 tmp4659 = _mm512_add_ps(tmp4660, tmp4661);
__m512 tmp4679 = _mm512_add_ps(tmp4680, tmp4681);
__m512 tmp4663 = _mm512_fmadd_ps(tmp4667, _mm512_set1_ps(1.6e+01f), tmp4664);
__m512 tmp4683 = _mm512_fmadd_ps(tmp4687, _mm512_set1_ps(1.6e+01f), tmp4684);
__m512 tmp4670 = _mm512_fmadd_ps(tmp4667, _mm512_set1_ps(4e+00f), tmp4671);
__m512 tmp4690 = _mm512_fmadd_ps(tmp4687, _mm512_set1_ps(4e+00f), tmp4691);
__m512 tmp4676 = _mm512_add_ps(tmp4667, tmp4665);
__m512 tmp4696 = _mm512_add_ps(tmp4687, tmp4685);
__m512 tmp4669 = _mm512_fmadd_ps(tmp4660, _mm512_set1_ps(4e+00f), tmp4661);
__m512 tmp4689 = _mm512_fmadd_ps(tmp4680, _mm512_set1_ps(4e+00f), tmp4681);
__m512 tmp4673 = _mm512_fmadd_ps(tmp4660, _mm512_set1_ps(1.6e+01f), tmp4661);
__m512 tmp4693 = _mm512_fmadd_ps(tmp4680, _mm512_set1_ps(1.6e+01f), tmp4681);
__m512 tmp4658 = _mm512_add_ps(tmp4659, in684);
__m512 tmp4678 = _mm512_add_ps(tmp4679, in692);
__m512 tmp4675 = _mm512_add_ps(tmp4676, in691);
__m512 tmp4695 = _mm512_add_ps(tmp4696, in699);
__m512 tmp4657 = _mm512_fmadd_ps(tmp4662, _mm512_set1_ps(3.2e+01f), tmp4658);
__m512 tmp4677 = _mm512_fmadd_ps(tmp4682, _mm512_set1_ps(3.2e+01f), tmp4678);
__m512 tmp4668 = _mm512_fmadd_ps(tmp4662, _mm512_set1_ps(8e+00f), tmp4669);
__m512 tmp4688 = _mm512_fmadd_ps(tmp4682, _mm512_set1_ps(8e+00f), tmp4689);
__m512 tmp4674 = _mm512_fmadd_ps(tmp4666, _mm512_set1_ps(3.2e+01f), tmp4675);
__m512 tmp4694 = _mm512_fmadd_ps(tmp4686, _mm512_set1_ps(3.2e+01f), tmp4695);
__m512 tmp4672 = _mm512_fmadd_ps(tmp4662, _mm512_set1_ps(2e+00f), tmp4673);
__m512 tmp4692 = _mm512_fmadd_ps(tmp4682, _mm512_set1_ps(2e+00f), tmp4693);
// Collect the twelve first-pass results: tmp4645..tmp4650 are y0..y5 of the first set,
// tmp4651..tmp4656 are y0..y5 of the second set.
__m512 tmp4645 = tmp4657;
__m512 tmp4651 = tmp4677;
__m512 tmp4646 = tmp4663;
__m512 tmp4652 = tmp4683;
__m512 tmp4647 = tmp4668;
__m512 tmp4653 = tmp4688;
__m512 tmp4648 = tmp4670;
__m512 tmp4654 = tmp4690;
__m512 tmp4649 = tmp4672;
__m512 tmp4655 = tmp4692;
__m512 tmp4650 = tmp4674;
__m512 tmp4656 = tmp4694;
// Permute the twelve vectors into sixteen (tmp4645..tmp4656 are overwritten, tmp4697..tmp4700
// are new) through an unpacklo/unpackhi -> shuffle_ps -> shuffle_f32x4 network.
// NOTE(review): presumably a transpose so that the second pass runs along the other tile
// axis — confirm.
__m512 tmp4741 = _mm512_unpacklo_ps(tmp4645, tmp4646);
__m512 tmp4742 = _mm512_unpackhi_ps(tmp4645, tmp4646);
__m512 tmp4743 = _mm512_unpacklo_ps(tmp4647, tmp4648);
__m512 tmp4744 = _mm512_unpackhi_ps(tmp4647, tmp4648);
__m512 tmp4745 = _mm512_unpacklo_ps(tmp4649, tmp4650);
__m512 tmp4746 = _mm512_unpackhi_ps(tmp4649, tmp4650);
__m512 tmp4747 = _mm512_unpacklo_ps(tmp4651, tmp4652);
__m512 tmp4748 = _mm512_unpackhi_ps(tmp4651, tmp4652);
__m512 tmp4749 = _mm512_unpacklo_ps(tmp4653, tmp4654);
__m512 tmp4750 = _mm512_unpackhi_ps(tmp4653, tmp4654);
__m512 tmp4751 = _mm512_unpacklo_ps(tmp4655, tmp4656);
__m512 tmp4752 = _mm512_unpackhi_ps(tmp4655, tmp4656);
__m512 tmp4753 = _mm512_shuffle_ps(tmp4741, tmp4743, 68);
__m512 tmp4754 = _mm512_shuffle_ps(tmp4741, tmp4743, 238);
__m512 tmp4755 = _mm512_shuffle_ps(tmp4742, tmp4744, 68);
__m512 tmp4756 = _mm512_shuffle_ps(tmp4742, tmp4744, 238);
__m512 tmp4757 = _mm512_shuffle_ps(tmp4745, tmp4747, 68);
__m512 tmp4758 = _mm512_shuffle_ps(tmp4745, tmp4747, 238);
__m512 tmp4759 = _mm512_shuffle_ps(tmp4746, tmp4748, 68);
__m512 tmp4760 = _mm512_shuffle_ps(tmp4746, tmp4748, 238);
__m512 tmp4761 = _mm512_shuffle_ps(tmp4749, tmp4751, 68);
__m512 tmp4762 = _mm512_shuffle_ps(tmp4749, tmp4751, 238);
__m512 tmp4763 = _mm512_shuffle_ps(tmp4750, tmp4752, 68);
__m512 tmp4764 = _mm512_shuffle_ps(tmp4750, tmp4752, 238);
__m512 tmp4765 = _mm512_shuffle_f32x4(tmp4753, tmp4757, 136);
__m512 tmp4766 = _mm512_shuffle_f32x4(tmp4753, tmp4757, 221);
__m512 tmp4767 = _mm512_shuffle_f32x4(tmp4754, tmp4758, 136);
__m512 tmp4768 = _mm512_shuffle_f32x4(tmp4754, tmp4758, 221);
__m512 tmp4769 = _mm512_shuffle_f32x4(tmp4755, tmp4759, 136);
__m512 tmp4770 = _mm512_shuffle_f32x4(tmp4755, tmp4759, 221);
__m512 tmp4771 = _mm512_shuffle_f32x4(tmp4756, tmp4760, 136);
__m512 tmp4772 = _mm512_shuffle_f32x4(tmp4756, tmp4760, 221);
// The last four shuffle_ps results have no partner vector, so each is shuffled with itself.
__m512 tmp4773 = _mm512_shuffle_f32x4(tmp4761, tmp4761, 136);
__m512 tmp4774 = _mm512_shuffle_f32x4(tmp4761, tmp4761, 221);
__m512 tmp4775 = _mm512_shuffle_f32x4(tmp4762, tmp4762, 136);
__m512 tmp4776 = _mm512_shuffle_f32x4(tmp4762, tmp4762, 221);
__m512 tmp4777 = _mm512_shuffle_f32x4(tmp4763, tmp4763, 136);
__m512 tmp4778 = _mm512_shuffle_f32x4(tmp4763, tmp4763, 221);
__m512 tmp4779 = _mm512_shuffle_f32x4(tmp4764, tmp4764, 136);
__m512 tmp4780 = _mm512_shuffle_f32x4(tmp4764, tmp4764, 221);
tmp4645 = _mm512_shuffle_f32x4(tmp4765, tmp4773, 136);
tmp4653 = _mm512_shuffle_f32x4(tmp4765, tmp4773, 221);
tmp4646 = _mm512_shuffle_f32x4(tmp4767, tmp4775, 136);
tmp4654 = _mm512_shuffle_f32x4(tmp4767, tmp4775, 221);
tmp4647 = _mm512_shuffle_f32x4(tmp4769, tmp4777, 136);
tmp4655 = _mm512_shuffle_f32x4(tmp4769, tmp4777, 221);
tmp4648 = _mm512_shuffle_f32x4(tmp4771, tmp4779, 136);
tmp4656 = _mm512_shuffle_f32x4(tmp4771, tmp4779, 221);
tmp4649 = _mm512_shuffle_f32x4(tmp4766, tmp4774, 136);
__m512 tmp4697 = _mm512_shuffle_f32x4(tmp4766, tmp4774, 221);
tmp4650 = _mm512_shuffle_f32x4(tmp4768, tmp4776, 136);
__m512 tmp4698 = _mm512_shuffle_f32x4(tmp4768, tmp4776, 221);
tmp4651 = _mm512_shuffle_f32x4(tmp4770, tmp4778, 136);
__m512 tmp4699 = _mm512_shuffle_f32x4(tmp4770, tmp4778, 221);
tmp4652 = _mm512_shuffle_f32x4(tmp4772, tmp4780, 136);
__m512 tmp4700 = _mm512_shuffle_f32x4(tmp4772, tmp4780, 221);
// Second pass: the same 8-to-6 combination as the first pass, applied to the permuted
// vectors. Set A uses r0..r7 = tmp4645..tmp4652; set B uses r0..r7 = tmp4653..tmp4656,
// tmp4697..tmp4700.
__m512 tmp4705 = _mm512_add_ps(tmp4646, tmp4647);
__m512 tmp4725 = _mm512_add_ps(tmp4654, tmp4655);
__m512 tmp4704 = _mm512_add_ps(tmp4648, tmp4649);
__m512 tmp4724 = _mm512_add_ps(tmp4656, tmp4697);
__m512 tmp4710 = _mm512_sub_ps(tmp4648, tmp4649);
__m512 tmp4730 = _mm512_sub_ps(tmp4656, tmp4697);
__m512 tmp4709 = _mm512_sub_ps(tmp4646, tmp4647);
__m512 tmp4729 = _mm512_sub_ps(tmp4654, tmp4655);
__m512 tmp4706 = _mm512_add_ps(tmp4650, tmp4651);
__m512 tmp4726 = _mm512_add_ps(tmp4698, tmp4699);
__m512 tmp4711 = _mm512_sub_ps(tmp4650, tmp4651);
__m512 tmp4731 = _mm512_sub_ps(tmp4698, tmp4699);
__m512 tmp4708 = _mm512_fmadd_ps(tmp4710, _mm512_set1_ps(2e+00f), tmp4709);
__m512 tmp4728 = _mm512_fmadd_ps(tmp4730, _mm512_set1_ps(2e+00f), tmp4729);
__m512 tmp4715 = _mm512_fmadd_ps(tmp4710, _mm512_set1_ps(8e+00f), tmp4709);
__m512 tmp4735 = _mm512_fmadd_ps(tmp4730, _mm512_set1_ps(8e+00f), tmp4729);
__m512 tmp4703 = _mm512_add_ps(tmp4704, tmp4705);
__m512 tmp4723 = _mm512_add_ps(tmp4724, tmp4725);
__m512 tmp4707 = _mm512_fmadd_ps(tmp4711, _mm512_set1_ps(1.6e+01f), tmp4708);
__m512 tmp4727 = _mm512_fmadd_ps(tmp4731, _mm512_set1_ps(1.6e+01f), tmp4728);
__m512 tmp4714 = _mm512_fmadd_ps(tmp4711, _mm512_set1_ps(4e+00f), tmp4715);
__m512 tmp4734 = _mm512_fmadd_ps(tmp4731, _mm512_set1_ps(4e+00f), tmp4735);
__m512 tmp4720 = _mm512_add_ps(tmp4711, tmp4709);
__m512 tmp4740 = _mm512_add_ps(tmp4731, tmp4729);
__m512 tmp4713 = _mm512_fmadd_ps(tmp4704, _mm512_set1_ps(4e+00f), tmp4705);
__m512 tmp4733 = _mm512_fmadd_ps(tmp4724, _mm512_set1_ps(4e+00f), tmp4725);
__m512 tmp4717 = _mm512_fmadd_ps(tmp4704, _mm512_set1_ps(1.6e+01f), tmp4705);
__m512 tmp4737 = _mm512_fmadd_ps(tmp4724, _mm512_set1_ps(1.6e+01f), tmp4725);
__m512 tmp4702 = _mm512_add_ps(tmp4703, tmp4645);
__m512 tmp4722 = _mm512_add_ps(tmp4723, tmp4653);
__m512 tmp4719 = _mm512_add_ps(tmp4720, tmp4652);
__m512 tmp4739 = _mm512_add_ps(tmp4740, tmp4700);
__m512 tmp4701 = _mm512_fmadd_ps(tmp4706, _mm512_set1_ps(3.2e+01f), tmp4702);
__m512 tmp4721 = _mm512_fmadd_ps(tmp4726, _mm512_set1_ps(3.2e+01f), tmp4722);
__m512 tmp4712 = _mm512_fmadd_ps(tmp4706, _mm512_set1_ps(8e+00f), tmp4713);
__m512 tmp4732 = _mm512_fmadd_ps(tmp4726, _mm512_set1_ps(8e+00f), tmp4733);
__m512 tmp4718 = _mm512_fmadd_ps(tmp4710, _mm512_set1_ps(3.2e+01f), tmp4719);
__m512 tmp4738 = _mm512_fmadd_ps(tmp4730, _mm512_set1_ps(3.2e+01f), tmp4739);
__m512 tmp4716 = _mm512_fmadd_ps(tmp4706, _mm512_set1_ps(2e+00f), tmp4717);
__m512 tmp4736 = _mm512_fmadd_ps(tmp4726, _mm512_set1_ps(2e+00f), tmp4737);
// Name the twelve second-pass results: out663..out668 are y0..y5 of set A,
// out669..out674 are y0..y5 of set B.
__m512 out663 = tmp4701;
__m512 out669 = tmp4721;
__m512 out664 = tmp4707;
__m512 out670 = tmp4727;
__m512 out665 = tmp4712;
__m512 out671 = tmp4732;
__m512 out666 = tmp4714;
__m512 out672 = tmp4734;
__m512 out667 = tmp4716;
__m512 out673 = tmp4736;
__m512 out668 = tmp4718;
__m512 out674 = tmp4738;
// Clamp at zero: each output becomes max(0, output).
out663 = _mm512_max_ps(_mm512_setzero_ps(), out663);
out669 = _mm512_max_ps(_mm512_setzero_ps(), out669);
out664 = _mm512_max_ps(_mm512_setzero_ps(), out664);
out670 = _mm512_max_ps(_mm512_setzero_ps(), out670);
out665 = _mm512_max_ps(_mm512_setzero_ps(), out665);
out671 = _mm512_max_ps(_mm512_setzero_ps(), out671);
out666 = _mm512_max_ps(_mm512_setzero_ps(), out666);
out672 = _mm512_max_ps(_mm512_setzero_ps(), out672);
out667 = _mm512_max_ps(_mm512_setzero_ps(), out667);
out673 = _mm512_max_ps(_mm512_setzero_ps(), out673);
out668 = _mm512_max_ps(_mm512_setzero_ps(), out668);
out674 = _mm512_max_ps(_mm512_setzero_ps(), out674);
// Masked unaligned stores to datPtr6. Mask 255 (0x00FF) writes only the low 8 lanes and
// mask 4095 (0x0FFF) only the low 12 lanes.
// out663..out668 go to constant offsets 12656 + 224*n with mask 255, out669..out674 to
// 13808 + 224*n with mask 4095 (n = 0..5).
// NOTE(review): the unit of these offsets depends on datPtr6's pointee type, which is
// declared outside this view.
_mm512_mask_storeu_ps(datPtr6+12656+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 255, out663);
_mm512_mask_storeu_ps(datPtr6+13808+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out669);
_mm512_mask_storeu_ps(datPtr6+12880+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 255, out664);
_mm512_mask_storeu_ps(datPtr6+14032+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out670);
_mm512_mask_storeu_ps(datPtr6+13104+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 255, out665);
_mm512_mask_storeu_ps(datPtr6+14256+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out671);
_mm512_mask_storeu_ps(datPtr6+13328+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 255, out666);
_mm512_mask_storeu_ps(datPtr6+14480+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out672);
_mm512_mask_storeu_ps(datPtr6+13552+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 255, out667);
_mm512_mask_storeu_ps(datPtr6+14704+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out673);
_mm512_mask_storeu_ps(datPtr6+13776+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 255, out668);
_mm512_mask_storeu_ps(datPtr6+14928+806912*i18+224*toH26+4*toW26+50432*k68+25216*l20, 4095, out674);
}
}
if (j13 >= last4) return;
++j13;
}
j13 = 15;
}
ptrdiff_t rel12 = j13-15;
ptrdiff_t base12 = 54;
if (rel12 < 1) {
ptrdiff_t toH27 = base12+0;
ptrdiff_t toW27 = 0;
ptrdiff_t k69 = 16*w33;
for (; k69 != 16; ++k69) {
ptrdiff_t l21 = 0;
for (; l21 != 2; ++l21) {
__m512 sf337 = _mm512_loadu_ps(sfPtr5+0+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf338 = _mm512_loadu_ps(sfPtr5+128+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in700 = _mm512_shuffle_f32x4(sf337, sf338, 68);
__m512 in701 = _mm512_shuffle_f32x4(sf337, sf338, 238);
__m512 sf339 = _mm512_loadu_ps(sfPtr5+64+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf340 = _mm512_loadu_ps(sfPtr5+192+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in708 = _mm512_shuffle_f32x4(sf339, sf340, 68);
__m512 in709 = _mm512_shuffle_f32x4(sf339, sf340, 238);
__m512 sf341 = _mm512_loadu_ps(sfPtr5+409600+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf342 = _mm512_loadu_ps(sfPtr5+409728+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in702 = _mm512_shuffle_f32x4(sf341, sf342, 68);
__m512 in703 = _mm512_shuffle_f32x4(sf341, sf342, 238);
__m512 sf343 = _mm512_loadu_ps(sfPtr5+409664+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf344 = _mm512_loadu_ps(sfPtr5+409792+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in710 = _mm512_shuffle_f32x4(sf343, sf344, 68);
__m512 in711 = _mm512_shuffle_f32x4(sf343, sf344, 238);
__m512 sf345 = _mm512_loadu_ps(sfPtr5+819200+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf346 = _mm512_loadu_ps(sfPtr5+819328+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in704 = _mm512_shuffle_f32x4(sf345, sf346, 68);
__m512 in705 = _mm512_shuffle_f32x4(sf345, sf346, 238);
__m512 sf347 = _mm512_loadu_ps(sfPtr5+819264+1638400*i18+24576*j13+1536*k69+768*l21);
// Tail of one tile group. sf347, in700-in705 and in708-in711 are produced before this span.
// Each pair of 512-bit loads is split with _mm512_shuffle_f32x4: selector 68 keeps the two
// low 128-bit lanes of both sources, selector 238 keeps the two high 128-bit lanes.
__m512 sf348 = _mm512_loadu_ps(sfPtr5+819392+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in712 = _mm512_shuffle_f32x4(sf347, sf348, 68);
__m512 in713 = _mm512_shuffle_f32x4(sf347, sf348, 238);
__m512 sf349 = _mm512_loadu_ps(sfPtr5+1228800+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf350 = _mm512_loadu_ps(sfPtr5+1228928+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in706 = _mm512_shuffle_f32x4(sf349, sf350, 68);
__m512 in707 = _mm512_shuffle_f32x4(sf349, sf350, 238);
__m512 sf351 = _mm512_loadu_ps(sfPtr5+1228864+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf352 = _mm512_loadu_ps(sfPtr5+1228992+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in714 = _mm512_shuffle_f32x4(sf351, sf352, 68);
__m512 in715 = _mm512_shuffle_f32x4(sf351, sf352, 238);
// First pass. Two independent chains (x0..x7 = in700..in707 and x0..x7 = in708..in715) are
// interleaved line by line. Each chain combines its eight inputs into six results:
//   r0 = x0 + (x1+x2) + (x3+x4) + 32*(x5+x6)
//   r1 = (x1-x2) + 2*(x3-x4) + 16*(x5-x6)
//   r2 = (x1+x2) + 4*(x3+x4) + 8*(x5+x6)
//   r3 = (x1-x2) + 8*(x3-x4) + 4*(x5-x6)
//   r4 = (x1+x2) + 16*(x3+x4) + 2*(x5+x6)
//   r5 = (x1-x2) + 32*(x3-x4) + (x5-x6) + x7
// NOTE(review): these weights look like a Winograd-style output transform - confirm against the generator.
__m512 tmp4797 = _mm512_add_ps(in701, in702);
__m512 tmp4817 = _mm512_add_ps(in709, in710);
__m512 tmp4796 = _mm512_add_ps(in703, in704);
__m512 tmp4816 = _mm512_add_ps(in711, in712);
__m512 tmp4802 = _mm512_sub_ps(in703, in704);
__m512 tmp4822 = _mm512_sub_ps(in711, in712);
__m512 tmp4801 = _mm512_sub_ps(in701, in702);
__m512 tmp4821 = _mm512_sub_ps(in709, in710);
__m512 tmp4798 = _mm512_add_ps(in705, in706);
__m512 tmp4818 = _mm512_add_ps(in713, in714);
__m512 tmp4803 = _mm512_sub_ps(in705, in706);
__m512 tmp4823 = _mm512_sub_ps(in713, in714);
__m512 tmp4800 = _mm512_fmadd_ps(tmp4802, _mm512_set1_ps(2e+00f), tmp4801);
__m512 tmp4820 = _mm512_fmadd_ps(tmp4822, _mm512_set1_ps(2e+00f), tmp4821);
__m512 tmp4807 = _mm512_fmadd_ps(tmp4802, _mm512_set1_ps(8e+00f), tmp4801);
__m512 tmp4827 = _mm512_fmadd_ps(tmp4822, _mm512_set1_ps(8e+00f), tmp4821);
__m512 tmp4795 = _mm512_add_ps(tmp4796, tmp4797);
__m512 tmp4815 = _mm512_add_ps(tmp4816, tmp4817);
__m512 tmp4799 = _mm512_fmadd_ps(tmp4803, _mm512_set1_ps(1.6e+01f), tmp4800);
__m512 tmp4819 = _mm512_fmadd_ps(tmp4823, _mm512_set1_ps(1.6e+01f), tmp4820);
__m512 tmp4806 = _mm512_fmadd_ps(tmp4803, _mm512_set1_ps(4e+00f), tmp4807);
__m512 tmp4826 = _mm512_fmadd_ps(tmp4823, _mm512_set1_ps(4e+00f), tmp4827);
__m512 tmp4812 = _mm512_add_ps(tmp4803, tmp4801);
__m512 tmp4832 = _mm512_add_ps(tmp4823, tmp4821);
__m512 tmp4805 = _mm512_fmadd_ps(tmp4796, _mm512_set1_ps(4e+00f), tmp4797);
__m512 tmp4825 = _mm512_fmadd_ps(tmp4816, _mm512_set1_ps(4e+00f), tmp4817);
__m512 tmp4809 = _mm512_fmadd_ps(tmp4796, _mm512_set1_ps(1.6e+01f), tmp4797);
__m512 tmp4829 = _mm512_fmadd_ps(tmp4816, _mm512_set1_ps(1.6e+01f), tmp4817);
__m512 tmp4794 = _mm512_add_ps(tmp4795, in700);
__m512 tmp4814 = _mm512_add_ps(tmp4815, in708);
__m512 tmp4811 = _mm512_add_ps(tmp4812, in707);
__m512 tmp4831 = _mm512_add_ps(tmp4832, in715);
__m512 tmp4793 = _mm512_fmadd_ps(tmp4798, _mm512_set1_ps(3.2e+01f), tmp4794);
__m512 tmp4813 = _mm512_fmadd_ps(tmp4818, _mm512_set1_ps(3.2e+01f), tmp4814);
__m512 tmp4804 = _mm512_fmadd_ps(tmp4798, _mm512_set1_ps(8e+00f), tmp4805);
__m512 tmp4824 = _mm512_fmadd_ps(tmp4818, _mm512_set1_ps(8e+00f), tmp4825);
__m512 tmp4810 = _mm512_fmadd_ps(tmp4802, _mm512_set1_ps(3.2e+01f), tmp4811);
__m512 tmp4830 = _mm512_fmadd_ps(tmp4822, _mm512_set1_ps(3.2e+01f), tmp4831);
__m512 tmp4808 = _mm512_fmadd_ps(tmp4798, _mm512_set1_ps(2e+00f), tmp4809);
__m512 tmp4828 = _mm512_fmadd_ps(tmp4818, _mm512_set1_ps(2e+00f), tmp4829);
// Collect the twelve results under consecutive names: tmp4781..tmp4786 are r0..r5 of the
// first chain, tmp4787..tmp4792 are r0..r5 of the second chain.
__m512 tmp4781 = tmp4793;
__m512 tmp4787 = tmp4813;
__m512 tmp4782 = tmp4799;
__m512 tmp4788 = tmp4819;
__m512 tmp4783 = tmp4804;
__m512 tmp4789 = tmp4824;
__m512 tmp4784 = tmp4806;
__m512 tmp4790 = tmp4826;
__m512 tmp4785 = tmp4808;
__m512 tmp4791 = tmp4828;
__m512 tmp4786 = tmp4810;
__m512 tmp4792 = tmp4830;
// Rearrange the 12 vectors into 16 using unpack / shuffle_ps / shuffle_f32x4.
// NOTE(review): this looks like a transpose (4x4 element groups, then 128-bit lane regrouping) - confirm.
__m512 tmp4859 = _mm512_unpacklo_ps(tmp4781, tmp4782);
__m512 tmp4860 = _mm512_unpackhi_ps(tmp4781, tmp4782);
__m512 tmp4861 = _mm512_unpacklo_ps(tmp4783, tmp4784);
__m512 tmp4862 = _mm512_unpackhi_ps(tmp4783, tmp4784);
__m512 tmp4863 = _mm512_unpacklo_ps(tmp4785, tmp4786);
__m512 tmp4864 = _mm512_unpackhi_ps(tmp4785, tmp4786);
__m512 tmp4865 = _mm512_unpacklo_ps(tmp4787, tmp4788);
__m512 tmp4866 = _mm512_unpackhi_ps(tmp4787, tmp4788);
__m512 tmp4867 = _mm512_unpacklo_ps(tmp4789, tmp4790);
__m512 tmp4868 = _mm512_unpackhi_ps(tmp4789, tmp4790);
__m512 tmp4869 = _mm512_unpacklo_ps(tmp4791, tmp4792);
__m512 tmp4870 = _mm512_unpackhi_ps(tmp4791, tmp4792);
__m512 tmp4871 = _mm512_shuffle_ps(tmp4859, tmp4861, 68);
__m512 tmp4872 = _mm512_shuffle_ps(tmp4859, tmp4861, 238);
__m512 tmp4873 = _mm512_shuffle_ps(tmp4860, tmp4862, 68);
__m512 tmp4874 = _mm512_shuffle_ps(tmp4860, tmp4862, 238);
__m512 tmp4875 = _mm512_shuffle_ps(tmp4863, tmp4865, 68);
__m512 tmp4876 = _mm512_shuffle_ps(tmp4863, tmp4865, 238);
__m512 tmp4877 = _mm512_shuffle_ps(tmp4864, tmp4866, 68);
__m512 tmp4878 = _mm512_shuffle_ps(tmp4864, tmp4866, 238);
__m512 tmp4879 = _mm512_shuffle_ps(tmp4867, tmp4869, 68);
__m512 tmp4880 = _mm512_shuffle_ps(tmp4867, tmp4869, 238);
__m512 tmp4881 = _mm512_shuffle_ps(tmp4868, tmp4870, 68);
__m512 tmp4882 = _mm512_shuffle_ps(tmp4868, tmp4870, 238);
// Selector 136 picks 128-bit lanes 0 and 2 of each source; selector 221 picks lanes 1 and 3.
__m512 tmp4883 = _mm512_shuffle_f32x4(tmp4871, tmp4875, 136);
__m512 tmp4884 = _mm512_shuffle_f32x4(tmp4871, tmp4875, 221);
__m512 tmp4885 = _mm512_shuffle_f32x4(tmp4872, tmp4876, 136);
__m512 tmp4886 = _mm512_shuffle_f32x4(tmp4872, tmp4876, 221);
__m512 tmp4887 = _mm512_shuffle_f32x4(tmp4873, tmp4877, 136);
__m512 tmp4888 = _mm512_shuffle_f32x4(tmp4873, tmp4877, 221);
__m512 tmp4889 = _mm512_shuffle_f32x4(tmp4874, tmp4878, 136);
__m512 tmp4890 = _mm512_shuffle_f32x4(tmp4874, tmp4878, 221);
// The third group (tmp4879..tmp4882) is shuffled with itself, so in the final vectors below
// the top 128-bit lane repeats the third lane.
__m512 tmp4891 = _mm512_shuffle_f32x4(tmp4879, tmp4879, 136);
__m512 tmp4892 = _mm512_shuffle_f32x4(tmp4879, tmp4879, 221);
__m512 tmp4893 = _mm512_shuffle_f32x4(tmp4880, tmp4880, 136);
__m512 tmp4894 = _mm512_shuffle_f32x4(tmp4880, tmp4880, 221);
__m512 tmp4895 = _mm512_shuffle_f32x4(tmp4881, tmp4881, 136);
__m512 tmp4896 = _mm512_shuffle_f32x4(tmp4881, tmp4881, 221);
__m512 tmp4897 = _mm512_shuffle_f32x4(tmp4882, tmp4882, 136);
__m512 tmp4898 = _mm512_shuffle_f32x4(tmp4882, tmp4882, 221);
// Final regrouping: the twelve tmp4781..tmp4792 names are reused and four new vectors
// (tmp4833..tmp4836) are added, giving 16 rearranged vectors.
tmp4781 = _mm512_shuffle_f32x4(tmp4883, tmp4891, 136);
tmp4789 = _mm512_shuffle_f32x4(tmp4883, tmp4891, 221);
tmp4782 = _mm512_shuffle_f32x4(tmp4885, tmp4893, 136);
tmp4790 = _mm512_shuffle_f32x4(tmp4885, tmp4893, 221);
tmp4783 = _mm512_shuffle_f32x4(tmp4887, tmp4895, 136);
tmp4791 = _mm512_shuffle_f32x4(tmp4887, tmp4895, 221);
tmp4784 = _mm512_shuffle_f32x4(tmp4889, tmp4897, 136);
tmp4792 = _mm512_shuffle_f32x4(tmp4889, tmp4897, 221);
tmp4785 = _mm512_shuffle_f32x4(tmp4884, tmp4892, 136);
__m512 tmp4833 = _mm512_shuffle_f32x4(tmp4884, tmp4892, 221);
tmp4786 = _mm512_shuffle_f32x4(tmp4886, tmp4894, 136);
__m512 tmp4834 = _mm512_shuffle_f32x4(tmp4886, tmp4894, 221);
tmp4787 = _mm512_shuffle_f32x4(tmp4888, tmp4896, 136);
__m512 tmp4835 = _mm512_shuffle_f32x4(tmp4888, tmp4896, 221);
tmp4788 = _mm512_shuffle_f32x4(tmp4890, tmp4898, 136);
__m512 tmp4836 = _mm512_shuffle_f32x4(tmp4890, tmp4898, 221);
// tmp4788 and tmp4836 are not consumed by the second pass; the casts mark them as deliberately unused.
(void)tmp4788;
(void)tmp4836;
// Second pass: only the r0 and r1 forms from the first pass are evaluated, once with
// x0..x6 = tmp4781..tmp4787 and once with x0..x6 = tmp4789..tmp4792, tmp4833..tmp4835.
__m512 tmp4841 = _mm512_add_ps(tmp4782, tmp4783);
__m512 tmp4852 = _mm512_add_ps(tmp4790, tmp4791);
__m512 tmp4840 = _mm512_add_ps(tmp4784, tmp4785);
__m512 tmp4851 = _mm512_add_ps(tmp4792, tmp4833);
__m512 tmp4846 = _mm512_sub_ps(tmp4784, tmp4785);
__m512 tmp4857 = _mm512_sub_ps(tmp4792, tmp4833);
__m512 tmp4845 = _mm512_sub_ps(tmp4782, tmp4783);
__m512 tmp4856 = _mm512_sub_ps(tmp4790, tmp4791);
__m512 tmp4842 = _mm512_add_ps(tmp4786, tmp4787);
__m512 tmp4853 = _mm512_add_ps(tmp4834, tmp4835);
__m512 tmp4847 = _mm512_sub_ps(tmp4786, tmp4787);
__m512 tmp4858 = _mm512_sub_ps(tmp4834, tmp4835);
__m512 tmp4844 = _mm512_fmadd_ps(tmp4846, _mm512_set1_ps(2e+00f), tmp4845);
__m512 tmp4855 = _mm512_fmadd_ps(tmp4857, _mm512_set1_ps(2e+00f), tmp4856);
__m512 tmp4839 = _mm512_add_ps(tmp4840, tmp4841);
__m512 tmp4850 = _mm512_add_ps(tmp4851, tmp4852);
__m512 tmp4843 = _mm512_fmadd_ps(tmp4847, _mm512_set1_ps(1.6e+01f), tmp4844);
__m512 tmp4854 = _mm512_fmadd_ps(tmp4858, _mm512_set1_ps(1.6e+01f), tmp4855);
__m512 tmp4838 = _mm512_add_ps(tmp4839, tmp4781);
__m512 tmp4849 = _mm512_add_ps(tmp4850, tmp4789);
__m512 tmp4837 = _mm512_fmadd_ps(tmp4842, _mm512_set1_ps(3.2e+01f), tmp4838);
__m512 tmp4848 = _mm512_fmadd_ps(tmp4853, _mm512_set1_ps(3.2e+01f), tmp4849);
// out675/out677 hold the r0 results, out676/out678 the r1 results.
__m512 out675 = tmp4837;
__m512 out677 = tmp4848;
__m512 out676 = tmp4843;
__m512 out678 = tmp4854;
// Clamp negative values to zero (ReLU).
out675 = _mm512_max_ps(_mm512_setzero_ps(), out675);
out677 = _mm512_max_ps(_mm512_setzero_ps(), out677);
out676 = _mm512_max_ps(_mm512_setzero_ps(), out676);
out678 = _mm512_max_ps(_mm512_setzero_ps(), out678);
// Mask 4095 (0xFFF) stores only the low 12 floats of each vector. Each r1 vector is written
// at an offset 224 higher than its r0 counterpart.
// NOTE(review): offsets look like byte offsets (12-float stores are 48 apart) - confirm datPtr6's type.
_mm512_mask_storeu_ps(datPtr6+0+806912*i18+224*toH27+4*toW27+50432*k69+25216*l21, 4095, out675);
_mm512_mask_storeu_ps(datPtr6+48+806912*i18+224*toH27+4*toW27+50432*k69+25216*l21, 4095, out677);
_mm512_mask_storeu_ps(datPtr6+224+806912*i18+224*toH27+4*toW27+50432*k69+25216*l21, 4095, out676);
_mm512_mask_storeu_ps(datPtr6+272+806912*i18+224*toH27+4*toW27+50432*k69+25216*l21, 4095, out678);
// Tile group producing out679..out682: load 16 vectors from sfPtr5, run a two-pass
// add/subtract/fmadd combination with a rearrangement in between, clamp at zero, and store
// 12 floats per result to datPtr6.
// Each pair of 512-bit loads is split with _mm512_shuffle_f32x4: selector 68 keeps the two
// low 128-bit lanes of both sources, selector 238 keeps the two high 128-bit lanes.
// The four load groups sit at base offsets 256, 409856, 819456 and 1229056 (409600 apart).
__m512 sf353 = _mm512_loadu_ps(sfPtr5+256+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf354 = _mm512_loadu_ps(sfPtr5+384+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in716 = _mm512_shuffle_f32x4(sf353, sf354, 68);
__m512 in717 = _mm512_shuffle_f32x4(sf353, sf354, 238);
__m512 sf355 = _mm512_loadu_ps(sfPtr5+320+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf356 = _mm512_loadu_ps(sfPtr5+448+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in724 = _mm512_shuffle_f32x4(sf355, sf356, 68);
__m512 in725 = _mm512_shuffle_f32x4(sf355, sf356, 238);
__m512 sf357 = _mm512_loadu_ps(sfPtr5+409856+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf358 = _mm512_loadu_ps(sfPtr5+409984+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in718 = _mm512_shuffle_f32x4(sf357, sf358, 68);
__m512 in719 = _mm512_shuffle_f32x4(sf357, sf358, 238);
__m512 sf359 = _mm512_loadu_ps(sfPtr5+409920+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf360 = _mm512_loadu_ps(sfPtr5+410048+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in726 = _mm512_shuffle_f32x4(sf359, sf360, 68);
__m512 in727 = _mm512_shuffle_f32x4(sf359, sf360, 238);
__m512 sf361 = _mm512_loadu_ps(sfPtr5+819456+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf362 = _mm512_loadu_ps(sfPtr5+819584+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in720 = _mm512_shuffle_f32x4(sf361, sf362, 68);
__m512 in721 = _mm512_shuffle_f32x4(sf361, sf362, 238);
__m512 sf363 = _mm512_loadu_ps(sfPtr5+819520+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf364 = _mm512_loadu_ps(sfPtr5+819648+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in728 = _mm512_shuffle_f32x4(sf363, sf364, 68);
__m512 in729 = _mm512_shuffle_f32x4(sf363, sf364, 238);
__m512 sf365 = _mm512_loadu_ps(sfPtr5+1229056+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf366 = _mm512_loadu_ps(sfPtr5+1229184+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in722 = _mm512_shuffle_f32x4(sf365, sf366, 68);
__m512 in723 = _mm512_shuffle_f32x4(sf365, sf366, 238);
__m512 sf367 = _mm512_loadu_ps(sfPtr5+1229120+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf368 = _mm512_loadu_ps(sfPtr5+1229248+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in730 = _mm512_shuffle_f32x4(sf367, sf368, 68);
__m512 in731 = _mm512_shuffle_f32x4(sf367, sf368, 238);
// First pass. Two independent chains (x0..x7 = in716..in723 and x0..x7 = in724..in731) are
// interleaved line by line. Each chain combines its eight inputs into six results:
//   r0 = x0 + (x1+x2) + (x3+x4) + 32*(x5+x6)
//   r1 = (x1-x2) + 2*(x3-x4) + 16*(x5-x6)
//   r2 = (x1+x2) + 4*(x3+x4) + 8*(x5+x6)
//   r3 = (x1-x2) + 8*(x3-x4) + 4*(x5-x6)
//   r4 = (x1+x2) + 16*(x3+x4) + 2*(x5+x6)
//   r5 = (x1-x2) + 32*(x3-x4) + (x5-x6) + x7
// NOTE(review): these weights look like a Winograd-style output transform - confirm against the generator.
__m512 tmp4915 = _mm512_add_ps(in717, in718);
__m512 tmp4935 = _mm512_add_ps(in725, in726);
__m512 tmp4914 = _mm512_add_ps(in719, in720);
__m512 tmp4934 = _mm512_add_ps(in727, in728);
__m512 tmp4920 = _mm512_sub_ps(in719, in720);
__m512 tmp4940 = _mm512_sub_ps(in727, in728);
__m512 tmp4919 = _mm512_sub_ps(in717, in718);
__m512 tmp4939 = _mm512_sub_ps(in725, in726);
__m512 tmp4916 = _mm512_add_ps(in721, in722);
__m512 tmp4936 = _mm512_add_ps(in729, in730);
__m512 tmp4921 = _mm512_sub_ps(in721, in722);
__m512 tmp4941 = _mm512_sub_ps(in729, in730);
__m512 tmp4918 = _mm512_fmadd_ps(tmp4920, _mm512_set1_ps(2e+00f), tmp4919);
__m512 tmp4938 = _mm512_fmadd_ps(tmp4940, _mm512_set1_ps(2e+00f), tmp4939);
__m512 tmp4925 = _mm512_fmadd_ps(tmp4920, _mm512_set1_ps(8e+00f), tmp4919);
__m512 tmp4945 = _mm512_fmadd_ps(tmp4940, _mm512_set1_ps(8e+00f), tmp4939);
__m512 tmp4913 = _mm512_add_ps(tmp4914, tmp4915);
__m512 tmp4933 = _mm512_add_ps(tmp4934, tmp4935);
__m512 tmp4917 = _mm512_fmadd_ps(tmp4921, _mm512_set1_ps(1.6e+01f), tmp4918);
__m512 tmp4937 = _mm512_fmadd_ps(tmp4941, _mm512_set1_ps(1.6e+01f), tmp4938);
__m512 tmp4924 = _mm512_fmadd_ps(tmp4921, _mm512_set1_ps(4e+00f), tmp4925);
__m512 tmp4944 = _mm512_fmadd_ps(tmp4941, _mm512_set1_ps(4e+00f), tmp4945);
__m512 tmp4930 = _mm512_add_ps(tmp4921, tmp4919);
__m512 tmp4950 = _mm512_add_ps(tmp4941, tmp4939);
__m512 tmp4923 = _mm512_fmadd_ps(tmp4914, _mm512_set1_ps(4e+00f), tmp4915);
__m512 tmp4943 = _mm512_fmadd_ps(tmp4934, _mm512_set1_ps(4e+00f), tmp4935);
__m512 tmp4927 = _mm512_fmadd_ps(tmp4914, _mm512_set1_ps(1.6e+01f), tmp4915);
__m512 tmp4947 = _mm512_fmadd_ps(tmp4934, _mm512_set1_ps(1.6e+01f), tmp4935);
__m512 tmp4912 = _mm512_add_ps(tmp4913, in716);
__m512 tmp4932 = _mm512_add_ps(tmp4933, in724);
__m512 tmp4929 = _mm512_add_ps(tmp4930, in723);
__m512 tmp4949 = _mm512_add_ps(tmp4950, in731);
__m512 tmp4911 = _mm512_fmadd_ps(tmp4916, _mm512_set1_ps(3.2e+01f), tmp4912);
__m512 tmp4931 = _mm512_fmadd_ps(tmp4936, _mm512_set1_ps(3.2e+01f), tmp4932);
__m512 tmp4922 = _mm512_fmadd_ps(tmp4916, _mm512_set1_ps(8e+00f), tmp4923);
__m512 tmp4942 = _mm512_fmadd_ps(tmp4936, _mm512_set1_ps(8e+00f), tmp4943);
__m512 tmp4928 = _mm512_fmadd_ps(tmp4920, _mm512_set1_ps(3.2e+01f), tmp4929);
__m512 tmp4948 = _mm512_fmadd_ps(tmp4940, _mm512_set1_ps(3.2e+01f), tmp4949);
__m512 tmp4926 = _mm512_fmadd_ps(tmp4916, _mm512_set1_ps(2e+00f), tmp4927);
__m512 tmp4946 = _mm512_fmadd_ps(tmp4936, _mm512_set1_ps(2e+00f), tmp4947);
// Collect the twelve results under consecutive names: tmp4899..tmp4904 are r0..r5 of the
// first chain, tmp4905..tmp4910 are r0..r5 of the second chain.
__m512 tmp4899 = tmp4911;
__m512 tmp4905 = tmp4931;
__m512 tmp4900 = tmp4917;
__m512 tmp4906 = tmp4937;
__m512 tmp4901 = tmp4922;
__m512 tmp4907 = tmp4942;
__m512 tmp4902 = tmp4924;
__m512 tmp4908 = tmp4944;
__m512 tmp4903 = tmp4926;
__m512 tmp4909 = tmp4946;
__m512 tmp4904 = tmp4928;
__m512 tmp4910 = tmp4948;
// Rearrange the 12 vectors into 16 using unpack / shuffle_ps / shuffle_f32x4.
// NOTE(review): this looks like a transpose (4x4 element groups, then 128-bit lane regrouping) - confirm.
__m512 tmp4977 = _mm512_unpacklo_ps(tmp4899, tmp4900);
__m512 tmp4978 = _mm512_unpackhi_ps(tmp4899, tmp4900);
__m512 tmp4979 = _mm512_unpacklo_ps(tmp4901, tmp4902);
__m512 tmp4980 = _mm512_unpackhi_ps(tmp4901, tmp4902);
__m512 tmp4981 = _mm512_unpacklo_ps(tmp4903, tmp4904);
__m512 tmp4982 = _mm512_unpackhi_ps(tmp4903, tmp4904);
__m512 tmp4983 = _mm512_unpacklo_ps(tmp4905, tmp4906);
__m512 tmp4984 = _mm512_unpackhi_ps(tmp4905, tmp4906);
__m512 tmp4985 = _mm512_unpacklo_ps(tmp4907, tmp4908);
__m512 tmp4986 = _mm512_unpackhi_ps(tmp4907, tmp4908);
__m512 tmp4987 = _mm512_unpacklo_ps(tmp4909, tmp4910);
__m512 tmp4988 = _mm512_unpackhi_ps(tmp4909, tmp4910);
__m512 tmp4989 = _mm512_shuffle_ps(tmp4977, tmp4979, 68);
__m512 tmp4990 = _mm512_shuffle_ps(tmp4977, tmp4979, 238);
__m512 tmp4991 = _mm512_shuffle_ps(tmp4978, tmp4980, 68);
__m512 tmp4992 = _mm512_shuffle_ps(tmp4978, tmp4980, 238);
__m512 tmp4993 = _mm512_shuffle_ps(tmp4981, tmp4983, 68);
__m512 tmp4994 = _mm512_shuffle_ps(tmp4981, tmp4983, 238);
__m512 tmp4995 = _mm512_shuffle_ps(tmp4982, tmp4984, 68);
__m512 tmp4996 = _mm512_shuffle_ps(tmp4982, tmp4984, 238);
__m512 tmp4997 = _mm512_shuffle_ps(tmp4985, tmp4987, 68);
__m512 tmp4998 = _mm512_shuffle_ps(tmp4985, tmp4987, 238);
__m512 tmp4999 = _mm512_shuffle_ps(tmp4986, tmp4988, 68);
__m512 tmp5000 = _mm512_shuffle_ps(tmp4986, tmp4988, 238);
// Selector 136 picks 128-bit lanes 0 and 2 of each source; selector 221 picks lanes 1 and 3.
__m512 tmp5001 = _mm512_shuffle_f32x4(tmp4989, tmp4993, 136);
__m512 tmp5002 = _mm512_shuffle_f32x4(tmp4989, tmp4993, 221);
__m512 tmp5003 = _mm512_shuffle_f32x4(tmp4990, tmp4994, 136);
__m512 tmp5004 = _mm512_shuffle_f32x4(tmp4990, tmp4994, 221);
__m512 tmp5005 = _mm512_shuffle_f32x4(tmp4991, tmp4995, 136);
__m512 tmp5006 = _mm512_shuffle_f32x4(tmp4991, tmp4995, 221);
__m512 tmp5007 = _mm512_shuffle_f32x4(tmp4992, tmp4996, 136);
__m512 tmp5008 = _mm512_shuffle_f32x4(tmp4992, tmp4996, 221);
// The third group (tmp4997..tmp5000) is shuffled with itself, so in the final vectors below
// the top 128-bit lane repeats the third lane.
__m512 tmp5009 = _mm512_shuffle_f32x4(tmp4997, tmp4997, 136);
__m512 tmp5010 = _mm512_shuffle_f32x4(tmp4997, tmp4997, 221);
__m512 tmp5011 = _mm512_shuffle_f32x4(tmp4998, tmp4998, 136);
__m512 tmp5012 = _mm512_shuffle_f32x4(tmp4998, tmp4998, 221);
__m512 tmp5013 = _mm512_shuffle_f32x4(tmp4999, tmp4999, 136);
__m512 tmp5014 = _mm512_shuffle_f32x4(tmp4999, tmp4999, 221);
__m512 tmp5015 = _mm512_shuffle_f32x4(tmp5000, tmp5000, 136);
__m512 tmp5016 = _mm512_shuffle_f32x4(tmp5000, tmp5000, 221);
// Final regrouping: the twelve tmp4899..tmp4910 names are reused and four new vectors
// (tmp4951..tmp4954) are added, giving 16 rearranged vectors.
tmp4899 = _mm512_shuffle_f32x4(tmp5001, tmp5009, 136);
tmp4907 = _mm512_shuffle_f32x4(tmp5001, tmp5009, 221);
tmp4900 = _mm512_shuffle_f32x4(tmp5003, tmp5011, 136);
tmp4908 = _mm512_shuffle_f32x4(tmp5003, tmp5011, 221);
tmp4901 = _mm512_shuffle_f32x4(tmp5005, tmp5013, 136);
tmp4909 = _mm512_shuffle_f32x4(tmp5005, tmp5013, 221);
tmp4902 = _mm512_shuffle_f32x4(tmp5007, tmp5015, 136);
tmp4910 = _mm512_shuffle_f32x4(tmp5007, tmp5015, 221);
tmp4903 = _mm512_shuffle_f32x4(tmp5002, tmp5010, 136);
__m512 tmp4951 = _mm512_shuffle_f32x4(tmp5002, tmp5010, 221);
tmp4904 = _mm512_shuffle_f32x4(tmp5004, tmp5012, 136);
__m512 tmp4952 = _mm512_shuffle_f32x4(tmp5004, tmp5012, 221);
tmp4905 = _mm512_shuffle_f32x4(tmp5006, tmp5014, 136);
__m512 tmp4953 = _mm512_shuffle_f32x4(tmp5006, tmp5014, 221);
tmp4906 = _mm512_shuffle_f32x4(tmp5008, tmp5016, 136);
__m512 tmp4954 = _mm512_shuffle_f32x4(tmp5008, tmp5016, 221);
// tmp4906 and tmp4954 are not consumed by the second pass; the casts mark them as deliberately unused.
(void)tmp4906;
(void)tmp4954;
// Second pass: only the r0 and r1 forms from the first pass are evaluated, once with
// x0..x6 = tmp4899..tmp4905 and once with x0..x6 = tmp4907..tmp4910, tmp4951..tmp4953.
__m512 tmp4959 = _mm512_add_ps(tmp4900, tmp4901);
__m512 tmp4970 = _mm512_add_ps(tmp4908, tmp4909);
__m512 tmp4958 = _mm512_add_ps(tmp4902, tmp4903);
__m512 tmp4969 = _mm512_add_ps(tmp4910, tmp4951);
__m512 tmp4964 = _mm512_sub_ps(tmp4902, tmp4903);
__m512 tmp4975 = _mm512_sub_ps(tmp4910, tmp4951);
__m512 tmp4963 = _mm512_sub_ps(tmp4900, tmp4901);
__m512 tmp4974 = _mm512_sub_ps(tmp4908, tmp4909);
__m512 tmp4960 = _mm512_add_ps(tmp4904, tmp4905);
__m512 tmp4971 = _mm512_add_ps(tmp4952, tmp4953);
__m512 tmp4965 = _mm512_sub_ps(tmp4904, tmp4905);
__m512 tmp4976 = _mm512_sub_ps(tmp4952, tmp4953);
__m512 tmp4962 = _mm512_fmadd_ps(tmp4964, _mm512_set1_ps(2e+00f), tmp4963);
__m512 tmp4973 = _mm512_fmadd_ps(tmp4975, _mm512_set1_ps(2e+00f), tmp4974);
__m512 tmp4957 = _mm512_add_ps(tmp4958, tmp4959);
__m512 tmp4968 = _mm512_add_ps(tmp4969, tmp4970);
__m512 tmp4961 = _mm512_fmadd_ps(tmp4965, _mm512_set1_ps(1.6e+01f), tmp4962);
__m512 tmp4972 = _mm512_fmadd_ps(tmp4976, _mm512_set1_ps(1.6e+01f), tmp4973);
__m512 tmp4956 = _mm512_add_ps(tmp4957, tmp4899);
__m512 tmp4967 = _mm512_add_ps(tmp4968, tmp4907);
__m512 tmp4955 = _mm512_fmadd_ps(tmp4960, _mm512_set1_ps(3.2e+01f), tmp4956);
__m512 tmp4966 = _mm512_fmadd_ps(tmp4971, _mm512_set1_ps(3.2e+01f), tmp4967);
// out679/out681 hold the r0 results, out680/out682 the r1 results.
__m512 out679 = tmp4955;
__m512 out681 = tmp4966;
__m512 out680 = tmp4961;
__m512 out682 = tmp4972;
// Clamp negative values to zero (ReLU).
out679 = _mm512_max_ps(_mm512_setzero_ps(), out679);
out681 = _mm512_max_ps(_mm512_setzero_ps(), out681);
out680 = _mm512_max_ps(_mm512_setzero_ps(), out680);
out682 = _mm512_max_ps(_mm512_setzero_ps(), out682);
// Mask 4095 (0xFFF) stores only the low 12 floats of each vector. Each r1 vector is written
// at an offset 224 higher than its r0 counterpart (96 -> 320, 12608 -> 12832).
// NOTE(review): offsets look like byte offsets - confirm datPtr6's type.
_mm512_mask_storeu_ps(datPtr6+96+806912*i18+224*toH27+4*toW27+50432*k69+25216*l21, 4095, out679);
_mm512_mask_storeu_ps(datPtr6+12608+806912*i18+224*toH27+4*toW27+50432*k69+25216*l21, 4095, out681);
_mm512_mask_storeu_ps(datPtr6+320+806912*i18+224*toH27+4*toW27+50432*k69+25216*l21, 4095, out680);
_mm512_mask_storeu_ps(datPtr6+12832+806912*i18+224*toH27+4*toW27+50432*k69+25216*l21, 4095, out682);
// Tile group producing out683..out686: load 16 vectors from sfPtr5, run a two-pass
// add/subtract/fmadd combination with a rearrangement in between, clamp at zero, and store
// 12 floats per result to datPtr6.
// Each pair of 512-bit loads is split with _mm512_shuffle_f32x4: selector 68 keeps the two
// low 128-bit lanes of both sources, selector 238 keeps the two high 128-bit lanes.
// The four load groups sit at base offsets 512, 410112, 819712 and 1229312 (409600 apart).
__m512 sf369 = _mm512_loadu_ps(sfPtr5+512+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf370 = _mm512_loadu_ps(sfPtr5+640+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in732 = _mm512_shuffle_f32x4(sf369, sf370, 68);
__m512 in733 = _mm512_shuffle_f32x4(sf369, sf370, 238);
__m512 sf371 = _mm512_loadu_ps(sfPtr5+576+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf372 = _mm512_loadu_ps(sfPtr5+704+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in740 = _mm512_shuffle_f32x4(sf371, sf372, 68);
__m512 in741 = _mm512_shuffle_f32x4(sf371, sf372, 238);
__m512 sf373 = _mm512_loadu_ps(sfPtr5+410112+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf374 = _mm512_loadu_ps(sfPtr5+410240+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in734 = _mm512_shuffle_f32x4(sf373, sf374, 68);
__m512 in735 = _mm512_shuffle_f32x4(sf373, sf374, 238);
__m512 sf375 = _mm512_loadu_ps(sfPtr5+410176+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf376 = _mm512_loadu_ps(sfPtr5+410304+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in742 = _mm512_shuffle_f32x4(sf375, sf376, 68);
__m512 in743 = _mm512_shuffle_f32x4(sf375, sf376, 238);
__m512 sf377 = _mm512_loadu_ps(sfPtr5+819712+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf378 = _mm512_loadu_ps(sfPtr5+819840+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in736 = _mm512_shuffle_f32x4(sf377, sf378, 68);
__m512 in737 = _mm512_shuffle_f32x4(sf377, sf378, 238);
__m512 sf379 = _mm512_loadu_ps(sfPtr5+819776+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf380 = _mm512_loadu_ps(sfPtr5+819904+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in744 = _mm512_shuffle_f32x4(sf379, sf380, 68);
__m512 in745 = _mm512_shuffle_f32x4(sf379, sf380, 238);
__m512 sf381 = _mm512_loadu_ps(sfPtr5+1229312+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf382 = _mm512_loadu_ps(sfPtr5+1229440+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in738 = _mm512_shuffle_f32x4(sf381, sf382, 68);
__m512 in739 = _mm512_shuffle_f32x4(sf381, sf382, 238);
__m512 sf383 = _mm512_loadu_ps(sfPtr5+1229376+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 sf384 = _mm512_loadu_ps(sfPtr5+1229504+1638400*i18+24576*j13+1536*k69+768*l21);
__m512 in746 = _mm512_shuffle_f32x4(sf383, sf384, 68);
__m512 in747 = _mm512_shuffle_f32x4(sf383, sf384, 238);
// First pass. Two independent chains (x0..x7 = in732..in739 and x0..x7 = in740..in747) are
// interleaved line by line. Each chain combines its eight inputs into six results:
//   r0 = x0 + (x1+x2) + (x3+x4) + 32*(x5+x6)
//   r1 = (x1-x2) + 2*(x3-x4) + 16*(x5-x6)
//   r2 = (x1+x2) + 4*(x3+x4) + 8*(x5+x6)
//   r3 = (x1-x2) + 8*(x3-x4) + 4*(x5-x6)
//   r4 = (x1+x2) + 16*(x3+x4) + 2*(x5+x6)
//   r5 = (x1-x2) + 32*(x3-x4) + (x5-x6) + x7
// NOTE(review): these weights look like a Winograd-style output transform - confirm against the generator.
__m512 tmp5033 = _mm512_add_ps(in733, in734);
__m512 tmp5053 = _mm512_add_ps(in741, in742);
__m512 tmp5032 = _mm512_add_ps(in735, in736);
__m512 tmp5052 = _mm512_add_ps(in743, in744);
__m512 tmp5038 = _mm512_sub_ps(in735, in736);
__m512 tmp5058 = _mm512_sub_ps(in743, in744);
__m512 tmp5037 = _mm512_sub_ps(in733, in734);
__m512 tmp5057 = _mm512_sub_ps(in741, in742);
__m512 tmp5034 = _mm512_add_ps(in737, in738);
__m512 tmp5054 = _mm512_add_ps(in745, in746);
__m512 tmp5039 = _mm512_sub_ps(in737, in738);
__m512 tmp5059 = _mm512_sub_ps(in745, in746);
__m512 tmp5036 = _mm512_fmadd_ps(tmp5038, _mm512_set1_ps(2e+00f), tmp5037);
__m512 tmp5056 = _mm512_fmadd_ps(tmp5058, _mm512_set1_ps(2e+00f), tmp5057);
__m512 tmp5043 = _mm512_fmadd_ps(tmp5038, _mm512_set1_ps(8e+00f), tmp5037);
__m512 tmp5063 = _mm512_fmadd_ps(tmp5058, _mm512_set1_ps(8e+00f), tmp5057);
__m512 tmp5031 = _mm512_add_ps(tmp5032, tmp5033);
__m512 tmp5051 = _mm512_add_ps(tmp5052, tmp5053);
__m512 tmp5035 = _mm512_fmadd_ps(tmp5039, _mm512_set1_ps(1.6e+01f), tmp5036);
__m512 tmp5055 = _mm512_fmadd_ps(tmp5059, _mm512_set1_ps(1.6e+01f), tmp5056);
__m512 tmp5042 = _mm512_fmadd_ps(tmp5039, _mm512_set1_ps(4e+00f), tmp5043);
__m512 tmp5062 = _mm512_fmadd_ps(tmp5059, _mm512_set1_ps(4e+00f), tmp5063);
__m512 tmp5048 = _mm512_add_ps(tmp5039, tmp5037);
__m512 tmp5068 = _mm512_add_ps(tmp5059, tmp5057);
__m512 tmp5041 = _mm512_fmadd_ps(tmp5032, _mm512_set1_ps(4e+00f), tmp5033);
__m512 tmp5061 = _mm512_fmadd_ps(tmp5052, _mm512_set1_ps(4e+00f), tmp5053);
__m512 tmp5045 = _mm512_fmadd_ps(tmp5032, _mm512_set1_ps(1.6e+01f), tmp5033);
__m512 tmp5065 = _mm512_fmadd_ps(tmp5052, _mm512_set1_ps(1.6e+01f), tmp5053);
__m512 tmp5030 = _mm512_add_ps(tmp5031, in732);
__m512 tmp5050 = _mm512_add_ps(tmp5051, in740);
__m512 tmp5047 = _mm512_add_ps(tmp5048, in739);
__m512 tmp5067 = _mm512_add_ps(tmp5068, in747);
__m512 tmp5029 = _mm512_fmadd_ps(tmp5034, _mm512_set1_ps(3.2e+01f), tmp5030);
__m512 tmp5049 = _mm512_fmadd_ps(tmp5054, _mm512_set1_ps(3.2e+01f), tmp5050);
__m512 tmp5040 = _mm512_fmadd_ps(tmp5034, _mm512_set1_ps(8e+00f), tmp5041);
__m512 tmp5060 = _mm512_fmadd_ps(tmp5054, _mm512_set1_ps(8e+00f), tmp5061);
__m512 tmp5046 = _mm512_fmadd_ps(tmp5038, _mm512_set1_ps(3.2e+01f), tmp5047);
__m512 tmp5066 = _mm512_fmadd_ps(tmp5058, _mm512_set1_ps(3.2e+01f), tmp5067);
__m512 tmp5044 = _mm512_fmadd_ps(tmp5034, _mm512_set1_ps(2e+00f), tmp5045);
__m512 tmp5064 = _mm512_fmadd_ps(tmp5054, _mm512_set1_ps(2e+00f), tmp5065);
// Collect the twelve results under consecutive names: tmp5017..tmp5022 are r0..r5 of the
// first chain, tmp5023..tmp5028 are r0..r5 of the second chain.
__m512 tmp5017 = tmp5029;
__m512 tmp5023 = tmp5049;
__m512 tmp5018 = tmp5035;
__m512 tmp5024 = tmp5055;
__m512 tmp5019 = tmp5040;
__m512 tmp5025 = tmp5060;
__m512 tmp5020 = tmp5042;
__m512 tmp5026 = tmp5062;
__m512 tmp5021 = tmp5044;
__m512 tmp5027 = tmp5064;
__m512 tmp5022 = tmp5046;
__m512 tmp5028 = tmp5066;
// Rearrange the 12 vectors into 16 using unpack / shuffle_ps / shuffle_f32x4.
// NOTE(review): this looks like a transpose (4x4 element groups, then 128-bit lane regrouping) - confirm.
__m512 tmp5095 = _mm512_unpacklo_ps(tmp5017, tmp5018);
__m512 tmp5096 = _mm512_unpackhi_ps(tmp5017, tmp5018);
__m512 tmp5097 = _mm512_unpacklo_ps(tmp5019, tmp5020);
__m512 tmp5098 = _mm512_unpackhi_ps(tmp5019, tmp5020);
__m512 tmp5099 = _mm512_unpacklo_ps(tmp5021, tmp5022);
__m512 tmp5100 = _mm512_unpackhi_ps(tmp5021, tmp5022);
__m512 tmp5101 = _mm512_unpacklo_ps(tmp5023, tmp5024);
__m512 tmp5102 = _mm512_unpackhi_ps(tmp5023, tmp5024);
__m512 tmp5103 = _mm512_unpacklo_ps(tmp5025, tmp5026);
__m512 tmp5104 = _mm512_unpackhi_ps(tmp5025, tmp5026);
__m512 tmp5105 = _mm512_unpacklo_ps(tmp5027, tmp5028);
__m512 tmp5106 = _mm512_unpackhi_ps(tmp5027, tmp5028);
__m512 tmp5107 = _mm512_shuffle_ps(tmp5095, tmp5097, 68);
__m512 tmp5108 = _mm512_shuffle_ps(tmp5095, tmp5097, 238);
__m512 tmp5109 = _mm512_shuffle_ps(tmp5096, tmp5098, 68);
__m512 tmp5110 = _mm512_shuffle_ps(tmp5096, tmp5098, 238);
__m512 tmp5111 = _mm512_shuffle_ps(tmp5099, tmp5101, 68);
__m512 tmp5112 = _mm512_shuffle_ps(tmp5099, tmp5101, 238);
__m512 tmp5113 = _mm512_shuffle_ps(tmp5100, tmp5102, 68);
__m512 tmp5114 = _mm512_shuffle_ps(tmp5100, tmp5102, 238);
__m512 tmp5115 = _mm512_shuffle_ps(tmp5103, tmp5105, 68);
__m512 tmp5116 = _mm512_shuffle_ps(tmp5103, tmp5105, 238);
__m512 tmp5117 = _mm512_shuffle_ps(tmp5104, tmp5106, 68);
__m512 tmp5118 = _mm512_shuffle_ps(tmp5104, tmp5106, 238);
// Selector 136 picks 128-bit lanes 0 and 2 of each source; selector 221 picks lanes 1 and 3.
__m512 tmp5119 = _mm512_shuffle_f32x4(tmp5107, tmp5111, 136);
__m512 tmp5120 = _mm512_shuffle_f32x4(tmp5107, tmp5111, 221);
__m512 tmp5121 = _mm512_shuffle_f32x4(tmp5108, tmp5112, 136);
__m512 tmp5122 = _mm512_shuffle_f32x4(tmp5108, tmp5112, 221);
__m512 tmp5123 = _mm512_shuffle_f32x4(tmp5109, tmp5113, 136);
__m512 tmp5124 = _mm512_shuffle_f32x4(tmp5109, tmp5113, 221);
__m512 tmp5125 = _mm512_shuffle_f32x4(tmp5110, tmp5114, 136);
__m512 tmp5126 = _mm512_shuffle_f32x4(tmp5110, tmp5114, 221);
// The third group (tmp5115..tmp5118) is shuffled with itself, so in the final vectors below
// the top 128-bit lane repeats the third lane.
__m512 tmp5127 = _mm512_shuffle_f32x4(tmp5115, tmp5115, 136);
__m512 tmp5128 = _mm512_shuffle_f32x4(tmp5115, tmp5115, 221);
__m512 tmp5129 = _mm512_shuffle_f32x4(tmp5116, tmp5116, 136);
__m512 tmp5130 = _mm512_shuffle_f32x4(tmp5116, tmp5116, 221);
__m512 tmp5131 = _mm512_shuffle_f32x4(tmp5117, tmp5117, 136);
__m512 tmp5132 = _mm512_shuffle_f32x4(tmp5117, tmp5117, 221);
__m512 tmp5133 = _mm512_shuffle_f32x4(tmp5118, tmp5118, 136);
__m512 tmp5134 = _mm512_shuffle_f32x4(tmp5118, tmp5118, 221);
// Final regrouping: the twelve tmp5017..tmp5028 names are reused and four new vectors
// (tmp5069..tmp5072) are added, giving 16 rearranged vectors.
tmp5017 = _mm512_shuffle_f32x4(tmp5119, tmp5127, 136);
tmp5025 = _mm512_shuffle_f32x4(tmp5119, tmp5127, 221);
tmp5018 = _mm512_shuffle_f32x4(tmp5121, tmp5129, 136);
tmp5026 = _mm512_shuffle_f32x4(tmp5121, tmp5129, 221);
tmp5019 = _mm512_shuffle_f32x4(tmp5123, tmp5131, 136);
tmp5027 = _mm512_shuffle_f32x4(tmp5123, tmp5131, 221);
tmp5020 = _mm512_shuffle_f32x4(tmp5125, tmp5133, 136);
tmp5028 = _mm512_shuffle_f32x4(tmp5125, tmp5133, 221);
tmp5021 = _mm512_shuffle_f32x4(tmp5120, tmp5128, 136);
__m512 tmp5069 = _mm512_shuffle_f32x4(tmp5120, tmp5128, 221);
tmp5022 = _mm512_shuffle_f32x4(tmp5122, tmp5130, 136);
__m512 tmp5070 = _mm512_shuffle_f32x4(tmp5122, tmp5130, 221);
tmp5023 = _mm512_shuffle_f32x4(tmp5124, tmp5132, 136);
__m512 tmp5071 = _mm512_shuffle_f32x4(tmp5124, tmp5132, 221);
tmp5024 = _mm512_shuffle_f32x4(tmp5126, tmp5134, 136);
__m512 tmp5072 = _mm512_shuffle_f32x4(tmp5126, tmp5134, 221);
// tmp5024 and tmp5072 are not consumed by the second pass; the casts mark them as deliberately unused.
(void)tmp5024;
(void)tmp5072;
// Second pass: only the r0 and r1 forms from the first pass are evaluated, once with
// x0..x6 = tmp5017..tmp5023 and once with x0..x6 = tmp5025..tmp5028, tmp5069..tmp5071.
__m512 tmp5077 = _mm512_add_ps(tmp5018, tmp5019);
__m512 tmp5088 = _mm512_add_ps(tmp5026, tmp5027);
__m512 tmp5076 = _mm512_add_ps(tmp5020, tmp5021);
__m512 tmp5087 = _mm512_add_ps(tmp5028, tmp5069);
__m512 tmp5082 = _mm512_sub_ps(tmp5020, tmp5021);
__m512 tmp5093 = _mm512_sub_ps(tmp5028, tmp5069);
__m512 tmp5081 = _mm512_sub_ps(tmp5018, tmp5019);
__m512 tmp5092 = _mm512_sub_ps(tmp5026, tmp5027);
__m512 tmp5078 = _mm512_add_ps(tmp5022, tmp5023);
__m512 tmp5089 = _mm512_add_ps(tmp5070, tmp5071);
__m512 tmp5083 = _mm512_sub_ps(tmp5022, tmp5023);
__m512 tmp5094 = _mm512_sub_ps(tmp5070, tmp5071);
__m512 tmp5080 = _mm512_fmadd_ps(tmp5082, _mm512_set1_ps(2e+00f), tmp5081);
__m512 tmp5091 = _mm512_fmadd_ps(tmp5093, _mm512_set1_ps(2e+00f), tmp5092);
__m512 tmp5075 = _mm512_add_ps(tmp5076, tmp5077);
__m512 tmp5086 = _mm512_add_ps(tmp5087, tmp5088);
__m512 tmp5079 = _mm512_fmadd_ps(tmp5083, _mm512_set1_ps(1.6e+01f), tmp5080);
__m512 tmp5090 = _mm512_fmadd_ps(tmp5094, _mm512_set1_ps(1.6e+01f), tmp5091);
__m512 tmp5074 = _mm512_add_ps(tmp5075, tmp5017);
__m512 tmp5085 = _mm512_add_ps(tmp5086, tmp5025);
__m512 tmp5073 = _mm512_fmadd_ps(tmp5078, _mm512_set1_ps(3.2e+01f), tmp5074);
__m512 tmp5084 = _mm512_fmadd_ps(tmp5089, _mm512_set1_ps(3.2e+01f), tmp5085);
// out683/out685 hold the r0 results, out684/out686 the r1 results.
__m512 out683 = tmp5073;
__m512 out685 = tmp5084;
__m512 out684 = tmp5079;
__m512 out686 = tmp5090;
// Clamp negative values to zero (ReLU).
out683 = _mm512_max_ps(_mm512_setzero_ps(), out683);
out685 = _mm512_max_ps(_mm512_setzero_ps(), out685);
out684 = _mm512_max_ps(_mm512_setzero_ps(), out684);
out686 = _mm512_max_ps(_mm512_setzero_ps(), out686);
// Mask 4095 (0xFFF) stores only the low 12 floats of each vector. Each r1 vector is written
// at an offset 224 higher than its r0 counterpart (12656 -> 12880, 12704 -> 12928).
// NOTE(review): offsets look like byte offsets - confirm datPtr6's type.
_mm512_mask_storeu_ps(datPtr6+12656+806912*i18+224*toH27+4*toW27+50432*k69+25216*l21, 4095, out683);
_mm512_mask_storeu_ps(datPtr6+12704+806912*i18+224*toH27+4*toW27+50432*k69+25216*l21, 4095, out685);
_mm512_mask_storeu_ps(datPtr6+12880+806912*i18+224*toH27+4*toW27+50432*k69+25216*l21, 4095, out684);
_mm512_mask_storeu_ps(datPtr6+12928+806912*i18+224*toH27+4*toW27+50432*k69+25216*l21, 4095, out686);
}
}
// Return from the function once j13 has reached last4; otherwise move to the next j13
// and set rel12 to 1 before the enclosing scope closes.
if (j13 >= last4) return;
++j13;
rel12 = 1;
}
// Starting indices for the code that follows: toH28 comes from base12, toW28 is fixed at 36,
// and k70 starts at 16*w33.
ptrdiff_t toH28 = base12+0;
ptrdiff_t toW28 = 36;
ptrdiff_t k70 = 16*w33;
// Loop nest over k70 (from its current value until it equals 16) and l22 (0..3).
// The body continues past the end of this span; only the loads, the first pass and the
// start of the rearrangement are visible here.
for (; k70 != 16; ++k70) {
ptrdiff_t l22 = 0;
for (; l22 != 4; ++l22) {
// Each pair of 512-bit loads is split with _mm512_shuffle_f32x4: selector 68 keeps the two
// low 128-bit lanes of both sources, selector 238 keeps the two high 128-bit lanes.
// The four load groups sit at base offsets 0, 409600, 819200 and 1228800.
__m512 sf385 = _mm512_loadu_ps(sfPtr5+0+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 sf386 = _mm512_loadu_ps(sfPtr5+128+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 in748 = _mm512_shuffle_f32x4(sf385, sf386, 68);
__m512 in749 = _mm512_shuffle_f32x4(sf385, sf386, 238);
__m512 sf387 = _mm512_loadu_ps(sfPtr5+64+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 sf388 = _mm512_loadu_ps(sfPtr5+192+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 in756 = _mm512_shuffle_f32x4(sf387, sf388, 68);
__m512 in757 = _mm512_shuffle_f32x4(sf387, sf388, 238);
__m512 sf389 = _mm512_loadu_ps(sfPtr5+409600+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 sf390 = _mm512_loadu_ps(sfPtr5+409728+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 in750 = _mm512_shuffle_f32x4(sf389, sf390, 68);
__m512 in751 = _mm512_shuffle_f32x4(sf389, sf390, 238);
__m512 sf391 = _mm512_loadu_ps(sfPtr5+409664+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 sf392 = _mm512_loadu_ps(sfPtr5+409792+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 in758 = _mm512_shuffle_f32x4(sf391, sf392, 68);
__m512 in759 = _mm512_shuffle_f32x4(sf391, sf392, 238);
__m512 sf393 = _mm512_loadu_ps(sfPtr5+819200+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 sf394 = _mm512_loadu_ps(sfPtr5+819328+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 in752 = _mm512_shuffle_f32x4(sf393, sf394, 68);
__m512 in753 = _mm512_shuffle_f32x4(sf393, sf394, 238);
__m512 sf395 = _mm512_loadu_ps(sfPtr5+819264+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 sf396 = _mm512_loadu_ps(sfPtr5+819392+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 in760 = _mm512_shuffle_f32x4(sf395, sf396, 68);
__m512 in761 = _mm512_shuffle_f32x4(sf395, sf396, 238);
__m512 sf397 = _mm512_loadu_ps(sfPtr5+1228800+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 sf398 = _mm512_loadu_ps(sfPtr5+1228928+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 in754 = _mm512_shuffle_f32x4(sf397, sf398, 68);
__m512 in755 = _mm512_shuffle_f32x4(sf397, sf398, 238);
__m512 sf399 = _mm512_loadu_ps(sfPtr5+1228864+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 sf400 = _mm512_loadu_ps(sfPtr5+1228992+1638400*i18+24576*j13+1024*k70+256*l22);
__m512 in762 = _mm512_shuffle_f32x4(sf399, sf400, 68);
__m512 in763 = _mm512_shuffle_f32x4(sf399, sf400, 238);
// First pass. Two independent chains (x0..x7 = in748..in755 and x0..x7 = in756..in763) are
// interleaved line by line. Each chain combines its eight inputs into six results:
//   r0 = x0 + (x1+x2) + (x3+x4) + 32*(x5+x6)
//   r1 = (x1-x2) + 2*(x3-x4) + 16*(x5-x6)
//   r2 = (x1+x2) + 4*(x3+x4) + 8*(x5+x6)
//   r3 = (x1-x2) + 8*(x3-x4) + 4*(x5-x6)
//   r4 = (x1+x2) + 16*(x3+x4) + 2*(x5+x6)
//   r5 = (x1-x2) + 32*(x3-x4) + (x5-x6) + x7
// NOTE(review): these weights look like a Winograd-style output transform - confirm against the generator.
__m512 tmp5151 = _mm512_add_ps(in749, in750);
__m512 tmp5171 = _mm512_add_ps(in757, in758);
__m512 tmp5150 = _mm512_add_ps(in751, in752);
__m512 tmp5170 = _mm512_add_ps(in759, in760);
__m512 tmp5156 = _mm512_sub_ps(in751, in752);
__m512 tmp5176 = _mm512_sub_ps(in759, in760);
__m512 tmp5155 = _mm512_sub_ps(in749, in750);
__m512 tmp5175 = _mm512_sub_ps(in757, in758);
__m512 tmp5152 = _mm512_add_ps(in753, in754);
__m512 tmp5172 = _mm512_add_ps(in761, in762);
__m512 tmp5157 = _mm512_sub_ps(in753, in754);
__m512 tmp5177 = _mm512_sub_ps(in761, in762);
__m512 tmp5154 = _mm512_fmadd_ps(tmp5156, _mm512_set1_ps(2e+00f), tmp5155);
__m512 tmp5174 = _mm512_fmadd_ps(tmp5176, _mm512_set1_ps(2e+00f), tmp5175);
__m512 tmp5161 = _mm512_fmadd_ps(tmp5156, _mm512_set1_ps(8e+00f), tmp5155);
__m512 tmp5181 = _mm512_fmadd_ps(tmp5176, _mm512_set1_ps(8e+00f), tmp5175);
__m512 tmp5149 = _mm512_add_ps(tmp5150, tmp5151);
__m512 tmp5169 = _mm512_add_ps(tmp5170, tmp5171);
__m512 tmp5153 = _mm512_fmadd_ps(tmp5157, _mm512_set1_ps(1.6e+01f), tmp5154);
__m512 tmp5173 = _mm512_fmadd_ps(tmp5177, _mm512_set1_ps(1.6e+01f), tmp5174);
__m512 tmp5160 = _mm512_fmadd_ps(tmp5157, _mm512_set1_ps(4e+00f), tmp5161);
__m512 tmp5180 = _mm512_fmadd_ps(tmp5177, _mm512_set1_ps(4e+00f), tmp5181);
__m512 tmp5166 = _mm512_add_ps(tmp5157, tmp5155);
__m512 tmp5186 = _mm512_add_ps(tmp5177, tmp5175);
__m512 tmp5159 = _mm512_fmadd_ps(tmp5150, _mm512_set1_ps(4e+00f), tmp5151);
__m512 tmp5179 = _mm512_fmadd_ps(tmp5170, _mm512_set1_ps(4e+00f), tmp5171);
__m512 tmp5163 = _mm512_fmadd_ps(tmp5150, _mm512_set1_ps(1.6e+01f), tmp5151);
__m512 tmp5183 = _mm512_fmadd_ps(tmp5170, _mm512_set1_ps(1.6e+01f), tmp5171);
__m512 tmp5148 = _mm512_add_ps(tmp5149, in748);
__m512 tmp5168 = _mm512_add_ps(tmp5169, in756);
__m512 tmp5165 = _mm512_add_ps(tmp5166, in755);
__m512 tmp5185 = _mm512_add_ps(tmp5186, in763);
__m512 tmp5147 = _mm512_fmadd_ps(tmp5152, _mm512_set1_ps(3.2e+01f), tmp5148);
__m512 tmp5167 = _mm512_fmadd_ps(tmp5172, _mm512_set1_ps(3.2e+01f), tmp5168);
__m512 tmp5158 = _mm512_fmadd_ps(tmp5152, _mm512_set1_ps(8e+00f), tmp5159);
__m512 tmp5178 = _mm512_fmadd_ps(tmp5172, _mm512_set1_ps(8e+00f), tmp5179);
__m512 tmp5164 = _mm512_fmadd_ps(tmp5156, _mm512_set1_ps(3.2e+01f), tmp5165);
__m512 tmp5184 = _mm512_fmadd_ps(tmp5176, _mm512_set1_ps(3.2e+01f), tmp5185);
__m512 tmp5162 = _mm512_fmadd_ps(tmp5152, _mm512_set1_ps(2e+00f), tmp5163);
__m512 tmp5182 = _mm512_fmadd_ps(tmp5172, _mm512_set1_ps(2e+00f), tmp5183);
// Collect the twelve results under consecutive names: tmp5135..tmp5140 are r0..r5 of the
// first chain, tmp5141..tmp5146 are r0..r5 of the second chain.
__m512 tmp5135 = tmp5147;
__m512 tmp5141 = tmp5167;
__m512 tmp5136 = tmp5153;
__m512 tmp5142 = tmp5173;
__m512 tmp5137 = tmp5158;
__m512 tmp5143 = tmp5178;
__m512 tmp5138 = tmp5160;
__m512 tmp5144 = tmp5180;
__m512 tmp5139 = tmp5162;
__m512 tmp5145 = tmp5182;
__m512 tmp5140 = tmp5164;
__m512 tmp5146 = tmp5184;
// Start of the unpack / shuffle_ps / shuffle_f32x4 rearrangement of the twelve vectors
// (the remainder lies beyond this span).
__m512 tmp5213 = _mm512_unpacklo_ps(tmp5135, tmp5136);
__m512 tmp5214 = _mm512_unpackhi_ps(tmp5135, tmp5136);
__m512 tmp5215 = _mm512_unpacklo_ps(tmp5137, tmp5138);
__m512 tmp5216 = _mm512_unpackhi_ps(tmp5137, tmp5138);
__m512 tmp5217 = _mm512_unpacklo_ps(tmp5139, tmp5140);
__m512 tmp5218 = _mm512_unpackhi_ps(tmp5139, tmp5140);
__m512 tmp5219 = _mm512_unpacklo_ps(tmp5141, tmp5142);
__m512 tmp5220 = _mm512_unpackhi_ps(tmp5141, tmp5142);
__m512 tmp5221 = _mm512_unpacklo_ps(tmp5143, tmp5144);
__m512 tmp5222 = _mm512_unpackhi_ps(tmp5143, tmp5144);
__m512 tmp5223 = _mm512_unpacklo_ps(tmp5145, tmp5146);
__m512 tmp5224 = _mm512_unpackhi_ps(tmp5145, tmp5146);
__m512 tmp5225 = _mm512_shuffle_ps(tmp5213, tmp5215, 68);
__m512 tmp5226 = _mm512_shuffle_ps(tmp5213, tmp5215, 238);
__m512 tmp5227 = _mm512_shuffle_ps(tmp5214, tmp5216, 68);
__m512 tmp5228 = _mm512_shuffle_ps(tmp5214, tmp5216, 238);
__m512 tmp5229 = _mm512_shuffle_ps(tmp5217, tmp5219, 68);
__m512 tmp5230 = _mm512_shuffle_ps(tmp5217, tmp5219, 238);
__m512 tmp5231 = _mm512_shuffle_ps(tmp5218, tmp5220, 68);
__m512 tmp5232 = _mm512_shuffle_ps(tmp5218, tmp5220, 238);
__m512 tmp5233 = _mm512_shuffle_ps(tmp5221, tmp5223, 68);
__m512 tmp5234 = _mm512_shuffle_ps(tmp5221, tmp5223, 238);
__m512 tmp5235 = _mm512_shuffle_ps(tmp5222, tmp5224, 68);
__m512 tmp5236 = _mm512_shuffle_ps(tmp5222, tmp5224, 238);
__m512 tmp5237 = _mm512_shuffle_f32x4(tmp5225, tmp5229, 136);
__m512 tmp5238 = _mm512_shuffle_f32x4(tmp5225, tmp5229, 221);
__m512 tmp5239 = _mm512_shuffle_f32x4(tmp5226, tmp5230, 136);
__m512 tmp5240 = _mm512_shuffle_f32x4(tmp5226, tmp5230, 221);
__m512 tmp5241 = _mm512_shuffle_f32x4(tmp5227, tmp5231, 136);
__m512 tmp5242 = _mm512_shuffle_f32x4(tmp5227, tmp5231, 221);
__m512 tmp5243 = _mm512_shuffle_f32x4(tmp5228, tmp5232, 136);
__m512 tmp5244 = _mm512_shuffle_f32x4(tmp5228, tmp5232, 221);
__m512 tmp5245 = _mm512_shuffle_f32x4(tmp5233, tmp5233, 136);
__m512 tmp5246 = _mm512_shuffle_f32x4(tmp5233, tmp5233, 221);
__m512 tmp5247 = _mm512_shuffle_f32x4(tmp5234, tmp5234, 136);
__m512 tmp5248 = _mm512_shuffle_f32x4(tmp5234, tmp5234, 221);
__m512 tmp5249 = _mm512_shuffle_f32x4(tmp5235, tmp5235, 136);
__m512 tmp5250 = _mm512_shuffle_f32x4(tmp5235, tmp5235, 221);
__m512 tmp5251 = _mm512_shuffle_f32x4(tmp5236, tmp5236, 136);
__m512 tmp5252 = _mm512_shuffle_f32x4(tmp5236, tmp5236, 221);
tmp5135 = _mm512_shuffle_f32x4(tmp5237, tmp5245, 136);
tmp5143 = _mm512_shuffle_f32x4(tmp5237, tmp5245, 221);
tmp5136 = _mm512_shuffle_f32x4(tmp5239, tmp5247, 136);
tmp5144 = _mm512_shuffle_f32x4(tmp5239, tmp5247, 221);
tmp5137 = _mm512_shuffle_f32x4(tmp5241, tmp5249, 136);
tmp5145 = _mm512_shuffle_f32x4(tmp5241, tmp5249, 221);
tmp5138 = _mm512_shuffle_f32x4(tmp5243, tmp5251, 136);
tmp5146 = _mm512_shuffle_f32x4(tmp5243, tmp5251, 221);
tmp5139 = _mm512_shuffle_f32x4(tmp5238, tmp5246, 136);
__m512 tmp5187 = _mm512_shuffle_f32x4(tmp5238, tmp5246, 221);
tmp5140 = _mm512_shuffle_f32x4(tmp5240, tmp5248, 136);
__m512 tmp5188 = _mm512_shuffle_f32x4(tmp5240, tmp5248, 221);
tmp5141 = _mm512_shuffle_f32x4(tmp5242, tmp5250, 136);
__m512 tmp5189 = _mm512_shuffle_f32x4(tmp5242, tmp5250, 221);
tmp5142 = _mm512_shuffle_f32x4(tmp5244, tmp5252, 136);
__m512 tmp5190 = _mm512_shuffle_f32x4(tmp5244, tmp5252, 221);
(void)tmp5142;
(void)tmp5190;
__m512 tmp5195 = _mm512_add_ps(tmp5136, tmp5137);
__m512 tmp5206 = _mm512_add_ps(tmp5144, tmp5145);
__m512 tmp5194 = _mm512_add_ps(tmp5138, tmp5139);
__m512 tmp5205 = _mm512_add_ps(tmp5146, tmp5187);
__m512 tmp5200 = _mm512_sub_ps(tmp5138, tmp5139);
__m512 tmp5211 = _mm512_sub_ps(tmp5146, tmp5187);
__m512 tmp5199 = _mm512_sub_ps(tmp5136, tmp5137);
__m512 tmp5210 = _mm512_sub_ps(tmp5144, tmp5145);
__m512 tmp5196 = _mm512_add_ps(tmp5140, tmp5141);
__m512 tmp5207 = _mm512_add_ps(tmp5188, tmp5189);
__m512 tmp5201 = _mm512_sub_ps(tmp5140, tmp5141);
__m512 tmp5212 = _mm512_sub_ps(tmp5188, tmp5189);
__m512 tmp5198 = _mm512_fmadd_ps(tmp5200, _mm512_set1_ps(2e+00f), tmp5199);
__m512 tmp5209 = _mm512_fmadd_ps(tmp5211, _mm512_set1_ps(2e+00f), tmp5210);
__m512 tmp5193 = _mm512_add_ps(tmp5194, tmp5195);
__m512 tmp5204 = _mm512_add_ps(tmp5205, tmp5206);
__m512 tmp5197 = _mm512_fmadd_ps(tmp5201, _mm512_set1_ps(1.6e+01f), tmp5198);
__m512 tmp5208 = _mm512_fmadd_ps(tmp5212, _mm512_set1_ps(1.6e+01f), tmp5209);
__m512 tmp5192 = _mm512_add_ps(tmp5193, tmp5135);
__m512 tmp5203 = _mm512_add_ps(tmp5204, tmp5143);
__m512 tmp5191 = _mm512_fmadd_ps(tmp5196, _mm512_set1_ps(3.2e+01f), tmp5192);
__m512 tmp5202 = _mm512_fmadd_ps(tmp5207, _mm512_set1_ps(3.2e+01f), tmp5203);
__m512 out687 = tmp5191;
__m512 out689 = tmp5202;
__m512 out688 = tmp5197;
__m512 out690 = tmp5208;
out687 = _mm512_max_ps(_mm512_setzero_ps(), out687);
out689 = _mm512_max_ps(_mm512_setzero_ps(), out689);
out688 = _mm512_max_ps(_mm512_setzero_ps(), out688);
out690 = _mm512_max_ps(_mm512_setzero_ps(), out690);
_mm512_mask_storeu_ps(datPtr6+0+806912*i18+224*toH28+4*toW28+50432*k70+12608*l22, 4095, out687);
_mm512_mask_storeu_ps(datPtr6+48+806912*i18+224*toH28+4*toW28+50432*k70+12608*l22, 255, out689);
_mm512_mask_storeu_ps(datPtr6+224+806912*i18+224*toH28+4*toW28+50432*k70+12608*l22, 4095, out688);
_mm512_mask_storeu_ps(datPtr6+272+806912*i18+224*toH28+4*toW28+50432*k70+12608*l22, 255, out690);
}
}
if (j13 >= last4) return;
++j13;
}

// Builds a task descriptor for ResNet50ThreeConsumeSums1Callee1 and hands it
// to the thread team. The descriptor declares three dimensions with extents
// 1, 8 and 1, and carries the tensor pointer table as its opaque payload.
// NOTE(review): how ResNet50ThreaderDo1 splits the hull across threads is not
// visible from this function.
static void ResNet50ThreeConsumeSums1(ResNet50ThreaderTeam1* team25, char** tensors23) {
    ResNet50ThreaderTask1 job;

    // Iteration space handed to the threader.
    job.nd1 = 3;
    job.hull1[0] = 1;
    job.hull1[1] = 8;
    job.hull1[2] = 1;

    // Payload and entry point invoked for each point of the hull.
    job.any1 = tensors23;
    job.callee1 = ResNet50ThreeConsumeSums1Callee1;

    ResNet50ThreaderDo1(team25, &job);
}

// Task callee: transforms and repacks filter weights, and folds the bias with
// the batch-norm parameters, for two consecutive j20 iterations per call.
//
// task40->any1 is read as a char** table of base pointers:
//   [0] filter weights (float32; 9 floats are loaded per filter per s19 step),
//   [1] bias (float32),
//   [2] batch-norm parameters, read as interleaved (multiplier, addend) float pairs,
//   [3] output buffer: 256 bytes of folded bias, then the packed half-precision filters.
// pt25[0] picks the iteration pair: j20 = 2*pt25[0] and j20+1 (j20 must be < 16).
// Each j20 iteration handles four filters (byte offsets 0, 2304, 4608, 6912 inside
// a 9216-byte group) -- presumably four output channels; confirm against the caller.
static void ResNet50ThreeArrangeFilts2Callee1(ResNet50ThreaderTask1* task40, int64_t* pt25) {
char** tensors38 = task40->any1;
ptrdiff_t b50 = pt25[0];
// g13 and e12 are fixed at zero in this instance, so every term scaled by
// e12 or i25 below contributes nothing to the computed addresses.
ptrdiff_t g13 = 0;
ptrdiff_t e12 = 0;
// bfPtr6 and wfPtr6 are both views into tensors38[3]: the folded bias occupies
// the first 256 bytes (16 iterations x 16 bytes) and the filters start right after.
char*restrict bfPtr6 = tensors38[3]+256*e12;
char*restrict wfPtr6 = tensors38[3]+256+3244032*e12;
char*restrict wtPtr7 = tensors38[0]+14256*e12;
char*restrict biasPtr7 = tensors38[1];
char*restrict bnPtr7 = tensors38[2];
ptrdiff_t i25 = 1*g13;
// This call covers iterations j20 and jj29 == j20+1.
ptrdiff_t j20 = 2*b50;
ptrdiff_t jj29 = j20+1;
if (j20 < 16) {
for (; j20 != 16; ++j20) {
ptrdiff_t k79 = 0+1*j20;
ptrdiff_t cut9 = 0;
// Broadcast the even-indexed float (the multiplier of each (mul, add) pair)
// for each of the four filters handled in this iteration.
__m512 postMul20 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(0+64*i25+4*j20))[0]);
__m512 postMul21 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(1+64*i25+4*j20))[0]);
__m512 postMul22 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(2+64*i25+4*j20))[0]);
__m512 postMul23 = _mm512_set1_ps(((float*)bnPtr7+(ptrdiff_t)2*(3+64*i25+4*j20))[0]);
// 64 steps of 36 bytes (9 floats) per filter -- presumably one step per
// input channel of a 3x3 filter; confirm against the graph definition.
ptrdiff_t s19 = 0;
for (; s19 != 64; ++s19) {
// Mask 511 loads only the low 9 lanes; the remaining lanes are zeroed.
__m512 wt223 = _mm512_maskz_loadu_ps(511, wtPtr7+0+147456*i25+9216*j20+36*s19);
__m512 wt224 = _mm512_maskz_loadu_ps(511, wtPtr7+2304+147456*i25+9216*j20+36*s19);
__m512 wt225 = _mm512_maskz_loadu_ps(511, wtPtr7+4608+147456*i25+9216*j20+36*s19);
__m512 wt226 = _mm512_maskz_loadu_ps(511, wtPtr7+6912+147456*i25+9216*j20+36*s19);
// Fold the per-filter multiplier into the weights.
wt223 = _mm512_mul_ps(wt223, postMul20);
wt224 = _mm512_mul_ps(wt224, postMul21);
wt225 = _mm512_mul_ps(wt225, postMul22);
wt226 = _mm512_mul_ps(wt226, postMul23);
// Two-stage lane interleave of the four filters into three vectors.
// NOTE(review): the exact lane layout is defined by pm108/pm109 and was
// not re-derived here.
__m512i pm108 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm109 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp5541 = _mm512_permutex2var_ps(wt223, pm108, wt225);
__m512 tmp5542 = _mm512_permutex2var_ps(wt224, pm108, wt226);
__m512 tmp5543 = _mm512_permutex2var_ps(wt223, pm109, wt225);
__m512 tmp5544 = _mm512_permutex2var_ps(wt224, pm109, wt226);
__m512 in764 = _mm512_permutex2var_ps(tmp5541, pm108, tmp5542);
__m512 in765 = _mm512_permutex2var_ps(tmp5541, pm109, tmp5542);
__m512 in766 = _mm512_permutex2var_ps(tmp5543, pm108, tmp5544);
// First pass: linear combinations of the three inputs with coefficients
// 1, 2 and 4, producing eight vectors (in764, tmp5548, tmp5546, tmp5549,
// tmp5547, tmp5545, tmp5550, in766).
// NOTE(review): looks like one axis of a Winograd-style filter transform --
// confirm against the generator.
__m512 tmp5545 = _mm512_fmadd_ps(in764, _mm512_set1_ps(4e+00f), in766);
__m512 tmp5546 = _mm512_add_ps(in764, in766);
__m512 tmp5547 = _mm512_fmadd_ps(in766, _mm512_set1_ps(4e+00f), in764);
__m512 tmp5548 = _mm512_add_ps(in765, tmp5546);
__m512 tmp5549 = _mm512_fmadd_ps(in765, _mm512_set1_ps(2e+00f), tmp5547);
tmp5547 = _mm512_fnmadd_ps(in765, _mm512_set1_ps(2e+00f), tmp5547);
__m512 tmp5550 = _mm512_fnmadd_ps(in765, _mm512_set1_ps(2e+00f), tmp5545);
tmp5545 = _mm512_fmadd_ps(in765, _mm512_set1_ps(2e+00f), tmp5545);
tmp5546 = _mm512_sub_ps(tmp5546, in765);
// Shuffle network over the eight vectors (unpack -> shuffle_ps -> shuffle_f32x4).
// NOTE(review): presumably a transpose so the same combinations can be
// applied along the other axis -- confirm.
__m512 tmp5567 = _mm512_unpacklo_ps(in764, tmp5548);
__m512 tmp5568 = _mm512_unpackhi_ps(in764, tmp5548);
__m512 tmp5569 = _mm512_unpacklo_ps(tmp5546, tmp5549);
__m512 tmp5570 = _mm512_unpackhi_ps(tmp5546, tmp5549);
__m512 tmp5571 = _mm512_unpacklo_ps(tmp5547, tmp5545);
__m512 tmp5572 = _mm512_unpackhi_ps(tmp5547, tmp5545);
__m512 tmp5573 = _mm512_unpacklo_ps(tmp5550, in766);
__m512 tmp5574 = _mm512_unpackhi_ps(tmp5550, in766);
__m512 tmp5575 = _mm512_shuffle_ps(tmp5567, tmp5569, 68);
__m512 tmp5576 = _mm512_shuffle_ps(tmp5567, tmp5569, 238);
__m512 tmp5577 = _mm512_shuffle_ps(tmp5568, tmp5570, 68);
__m512 tmp5578 = _mm512_shuffle_ps(tmp5568, tmp5570, 238);
__m512 tmp5579 = _mm512_shuffle_ps(tmp5571, tmp5573, 68);
__m512 tmp5580 = _mm512_shuffle_ps(tmp5571, tmp5573, 238);
__m512 tmp5581 = _mm512_shuffle_ps(tmp5572, tmp5574, 68);
__m512 tmp5582 = _mm512_shuffle_ps(tmp5572, tmp5574, 238);
__m512 tmp5583 = _mm512_shuffle_f32x4(tmp5575, tmp5579, 136);
__m512 tmp5584 = _mm512_shuffle_f32x4(tmp5575, tmp5579, 221);
__m512 tmp5585 = _mm512_shuffle_f32x4(tmp5576, tmp5580, 136);
__m512 tmp5586 = _mm512_shuffle_f32x4(tmp5576, tmp5580, 221);
__m512 tmp5587 = _mm512_shuffle_f32x4(tmp5577, tmp5581, 136);
__m512 tmp5588 = _mm512_shuffle_f32x4(tmp5577, tmp5581, 221);
__m512 tmp5589 = _mm512_shuffle_f32x4(tmp5578, tmp5582, 136);
__m512 tmp5590 = _mm512_shuffle_f32x4(tmp5578, tmp5582, 221);
in764 = _mm512_shuffle_f32x4(tmp5583, tmp5583, 136);
__m512 tmp5551 = _mm512_shuffle_f32x4(tmp5583, tmp5583, 221);
tmp5548 = _mm512_shuffle_f32x4(tmp5585, tmp5585, 136);
__m512 tmp5552 = _mm512_shuffle_f32x4(tmp5585, tmp5585, 221);
tmp5546 = _mm512_shuffle_f32x4(tmp5587, tmp5587, 136);
__m512 tmp5553 = _mm512_shuffle_f32x4(tmp5587, tmp5587, 221);
tmp5549 = _mm512_shuffle_f32x4(tmp5589, tmp5589, 136);
__m512 tmp5554 = _mm512_shuffle_f32x4(tmp5589, tmp5589, 221);
tmp5547 = _mm512_shuffle_f32x4(tmp5584, tmp5584, 136);
tmp5545 = _mm512_shuffle_f32x4(tmp5586, tmp5586, 136);
tmp5550 = _mm512_shuffle_f32x4(tmp5588, tmp5588, 136);
in766 = _mm512_shuffle_f32x4(tmp5590, tmp5590, 136);
// Pair the twelve partial vectors into six full-width registers: the low
// 256 bits of the first operand are kept and joined with the low 256 bits
// of the second (immediate 68).
in764 = _mm512_shuffle_f32x4(in764, tmp5549, 68);
tmp5548 = _mm512_shuffle_f32x4(tmp5548, tmp5547, 68);
tmp5546 = _mm512_shuffle_f32x4(tmp5546, tmp5545, 68);
tmp5550 = _mm512_shuffle_f32x4(tmp5550, tmp5552, 68);
in766 = _mm512_shuffle_f32x4(in766, tmp5553, 68);
tmp5551 = _mm512_shuffle_f32x4(tmp5551, tmp5554, 68);
// Second pass: the same 1/2/4 combinations as the first pass, applied to
// two independent triples (in764, tmp5548, tmp5546) and
// (tmp5550, in766, tmp5551), with the statements interleaved.
__m512 tmp5555 = _mm512_fmadd_ps(in764, _mm512_set1_ps(4e+00f), tmp5546);
__m512 tmp5561 = _mm512_fmadd_ps(tmp5550, _mm512_set1_ps(4e+00f), tmp5551);
__m512 tmp5556 = _mm512_add_ps(in764, tmp5546);
__m512 tmp5562 = _mm512_add_ps(tmp5550, tmp5551);
__m512 tmp5557 = _mm512_fmadd_ps(tmp5546, _mm512_set1_ps(4e+00f), in764);
__m512 tmp5563 = _mm512_fmadd_ps(tmp5551, _mm512_set1_ps(4e+00f), tmp5550);
__m512 tmp5558 = _mm512_add_ps(tmp5548, tmp5556);
__m512 tmp5564 = _mm512_add_ps(in766, tmp5562);
__m512 tmp5559 = _mm512_fmadd_ps(tmp5548, _mm512_set1_ps(2e+00f), tmp5557);
__m512 tmp5565 = _mm512_fmadd_ps(in766, _mm512_set1_ps(2e+00f), tmp5563);
tmp5557 = _mm512_fnmadd_ps(tmp5548, _mm512_set1_ps(2e+00f), tmp5557);
tmp5563 = _mm512_fnmadd_ps(in766, _mm512_set1_ps(2e+00f), tmp5563);
__m512 tmp5560 = _mm512_fnmadd_ps(tmp5548, _mm512_set1_ps(2e+00f), tmp5555);
__m512 tmp5566 = _mm512_fnmadd_ps(in766, _mm512_set1_ps(2e+00f), tmp5561);
tmp5555 = _mm512_fmadd_ps(tmp5548, _mm512_set1_ps(2e+00f), tmp5555);
tmp5561 = _mm512_fmadd_ps(in766, _mm512_set1_ps(2e+00f), tmp5561);
tmp5556 = _mm512_sub_ps(tmp5556, tmp5548);
tmp5562 = _mm512_sub_ps(tmp5562, in766);
// Per-lane scaling of the sixteen results. Four distinct constant vectors
// are used, each repeating the same 8-lane pattern in both 256-bit halves.
// NOTE(review): presumably the normalisation factors of the transform
// (products of 1, -2/9, 1/90 and 1/180) -- confirm.
in764 = _mm512_mul_ps(in764, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp5558 = _mm512_mul_ps(tmp5558, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp5556 = _mm512_mul_ps(tmp5556, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp5559 = _mm512_mul_ps(tmp5559, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp5557 = _mm512_mul_ps(tmp5557, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp5555 = _mm512_mul_ps(tmp5555, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp5560 = _mm512_mul_ps(tmp5560, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp5546 = _mm512_mul_ps(tmp5546, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp5550 = _mm512_mul_ps(tmp5550, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp5564 = _mm512_mul_ps(tmp5564, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp5562 = _mm512_mul_ps(tmp5562, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp5565 = _mm512_mul_ps(tmp5565, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp5563 = _mm512_mul_ps(tmp5563, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp5561 = _mm512_mul_ps(tmp5561, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp5566 = _mm512_mul_ps(tmp5566, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp5551 = _mm512_mul_ps(tmp5551, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
// Regroup by 256-bit half: immediate 68 joins the low halves of both
// operands, immediate 238 joins their high halves.
__m512 out691 = _mm512_shuffle_f32x4(in764, tmp5558, 68);
__m512 out695 = _mm512_shuffle_f32x4(in764, tmp5558, 238);
__m512 out692 = _mm512_shuffle_f32x4(tmp5556, tmp5559, 68);
__m512 out696 = _mm512_shuffle_f32x4(tmp5556, tmp5559, 238);
__m512 out693 = _mm512_shuffle_f32x4(tmp5557, tmp5555, 68);
__m512 out697 = _mm512_shuffle_f32x4(tmp5557, tmp5555, 238);
__m512 out694 = _mm512_shuffle_f32x4(tmp5560, tmp5546, 68);
__m512 out698 = _mm512_shuffle_f32x4(tmp5560, tmp5546, 238);
__m512 out699 = _mm512_shuffle_f32x4(tmp5550, tmp5564, 68);
__m512 out703 = _mm512_shuffle_f32x4(tmp5550, tmp5564, 238);
__m512 out700 = _mm512_shuffle_f32x4(tmp5562, tmp5565, 68);
__m512 out704 = _mm512_shuffle_f32x4(tmp5562, tmp5565, 238);
__m512 out701 = _mm512_shuffle_f32x4(tmp5563, tmp5561, 68);
__m512 out705 = _mm512_shuffle_f32x4(tmp5563, tmp5561, 238);
__m512 out702 = _mm512_shuffle_f32x4(tmp5566, tmp5551, 68);
__m512 out706 = _mm512_shuffle_f32x4(tmp5566, tmp5551, 238);
// Destination byte offsets of the four 32-byte slots written per group;
// with cut9 == 0 they evaluate to 0, 32, 64 and 96.
ptrdiff_t off5 = 32*cut9;
ptrdiff_t off6 = (size_t)(cut9+1)/4*8192+(size_t)(cut9+1)%4*32;
ptrdiff_t off7 = (size_t)(cut9+2)/4*8192+(size_t)(cut9+2)%4*32;
ptrdiff_t off8 = (size_t)(cut9+3)/4*8192+(size_t)(cut9+3)%4*32;
// Narrow each result to 16 half-precision values (round to nearest,
// exceptions suppressed). The cast only widens the register type; just the
// low 256 bits carry data.
__m512i wf57 = _mm512_castsi256_si512(_mm512_cvtps_ph(out691, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf58 = _mm512_castsi256_si512(_mm512_cvtps_ph(out695, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf59 = _mm512_castsi256_si512(_mm512_cvtps_ph(out699, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf60 = _mm512_castsi256_si512(_mm512_cvtps_ph(out703, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf61 = _mm512_castsi256_si512(_mm512_cvtps_ph(out692, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf62 = _mm512_castsi256_si512(_mm512_cvtps_ph(out696, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf63 = _mm512_castsi256_si512(_mm512_cvtps_ph(out700, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf64 = _mm512_castsi256_si512(_mm512_cvtps_ph(out704, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf65 = _mm512_castsi256_si512(_mm512_cvtps_ph(out693, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf66 = _mm512_castsi256_si512(_mm512_cvtps_ph(out697, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf67 = _mm512_castsi256_si512(_mm512_cvtps_ph(out701, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf68 = _mm512_castsi256_si512(_mm512_cvtps_ph(out705, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf69 = _mm512_castsi256_si512(_mm512_cvtps_ph(out694, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf70 = _mm512_castsi256_si512(_mm512_cvtps_ph(out698, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf71 = _mm512_castsi256_si512(_mm512_cvtps_ph(out702, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf72 = _mm512_castsi256_si512(_mm512_cvtps_ph(out706, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
// Mask 255 over 32-bit elements writes exactly 32 bytes, i.e. the 16 halves.
// Four regions 131072 bytes apart each receive four slots; within a region
// the address advances by 8192 bytes per k79 and 128 bytes per s19.
_mm512_mask_storeu_epi32(wfPtr6+0+524288*i25+8192*k79+off5+128*s19, 255, wf57);
_mm512_mask_storeu_epi32(wfPtr6+0+524288*i25+8192*k79+off6+128*s19, 255, wf58);
_mm512_mask_storeu_epi32(wfPtr6+0+524288*i25+8192*k79+off7+128*s19, 255, wf59);
_mm512_mask_storeu_epi32(wfPtr6+0+524288*i25+8192*k79+off8+128*s19, 255, wf60);
_mm512_mask_storeu_epi32(wfPtr6+131072+524288*i25+8192*k79+off5+128*s19, 255, wf61);
_mm512_mask_storeu_epi32(wfPtr6+131072+524288*i25+8192*k79+off6+128*s19, 255, wf62);
_mm512_mask_storeu_epi32(wfPtr6+131072+524288*i25+8192*k79+off7+128*s19, 255, wf63);
_mm512_mask_storeu_epi32(wfPtr6+131072+524288*i25+8192*k79+off8+128*s19, 255, wf64);
_mm512_mask_storeu_epi32(wfPtr6+262144+524288*i25+8192*k79+off5+128*s19, 255, wf65);
_mm512_mask_storeu_epi32(wfPtr6+262144+524288*i25+8192*k79+off6+128*s19, 255, wf66);
_mm512_mask_storeu_epi32(wfPtr6+262144+524288*i25+8192*k79+off7+128*s19, 255, wf67);
_mm512_mask_storeu_epi32(wfPtr6+262144+524288*i25+8192*k79+off8+128*s19, 255, wf68);
_mm512_mask_storeu_epi32(wfPtr6+393216+524288*i25+8192*k79+off5+128*s19, 255, wf69);
_mm512_mask_storeu_epi32(wfPtr6+393216+524288*i25+8192*k79+off6+128*s19, 255, wf70);
_mm512_mask_storeu_epi32(wfPtr6+393216+524288*i25+8192*k79+off7+128*s19, 255, wf71);
_mm512_mask_storeu_epi32(wfPtr6+393216+524288*i25+8192*k79+off8+128*s19, 255, wf72);
}
// Folded bias for the four filters of this iteration: bias * mul + add.
// e12 is always zero here, so the branch is always taken.
__m512 bias3 = _mm512_setzero_ps();
if (!e12) {
// Mask 15 loads four floats, one per filter.
bias3 = _mm512_maskz_loadu_ps(15, biasPtr7-0+256*i25+16*j20);
// Even lanes of the pair array hold multipliers, odd lanes hold addends.
__m512i pmMul14 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd14 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Mask 255 loads eight floats: four (mul, add) pairs, 8 bytes per filter.
__m512 mas5 = _mm512_maskz_loadu_ps(255, bnPtr7+(ptrdiff_t)8*(0+64*i25+4*j20));
__m512 postMul24 = _mm512_permutexvar_ps(pmMul14, mas5);
__m512 postAdd14 = _mm512_permutexvar_ps(pmAdd14, mas5);
bias3 = _mm512_fmadd_ps(bias3, postMul24, postAdd14);
}
// Only the low four lanes (16 bytes) are written.
_mm512_mask_storeu_ps(bfPtr6-0+256*i25+16*j20, 15, bias3);
// Stop once the second iteration of this task's pair is done.
if (j20 >= jj29) return;
}
}
}

// Builds a task descriptor for ResNet50ThreeArrangeFilts2Callee1 and hands it
// to the thread team. The descriptor declares three dimensions with extents
// 8, 1 and 1, and carries the tensor pointer table as its opaque payload.
// NOTE(review): how ResNet50ThreaderDo1 splits the hull across threads is not
// visible from this function.
static void ResNet50ThreeArrangeFilts2(ResNet50ThreaderTeam1* team32, char** tensors37) {
    ResNet50ThreaderTask1 job;

    // Iteration space handed to the threader.
    job.nd1 = 3;
    job.hull1[0] = 8;
    job.hull1[1] = 1;
    job.hull1[2] = 1;

    // Payload and entry point invoked for each point of the hull.
    job.any1 = tensors37;
    job.callee1 = ResNet50ThreeArrangeFilts2Callee1;

    ResNet50ThreaderDo1(team32, &job);
}

static void ResNet50ThreeArrangeDats2Callee1(ResNet50ThreaderTask1* task42, int64_t* pt26) {
char** tensors40 = task42->any1;
ptrdiff_t s20 = 0;
ptrdiff_t c20 = pt26[1];
ptrdiff_t g14 = 0;
ptrdiff_t e13 = 0;
char*restrict datPtr12 = tensors40[0]-228+4992768*e13;
char*restrict dfPtr6 = tensors40[1]+10137600*e13;
ptrdiff_t i26 = 1*g14;
ptrdiff_t j21 = 2*c20;
ptrdiff_t last5 = j21+(c20 < 7 ? 1 : 2);
if (j21 < 2) {
ptrdiff_t rel13 = j21-0;
ptrdiff_t base13 = 0;
if (rel13 < 1) {
ptrdiff_t h29 = base13+0;
ptrdiff_t w36 = 0;
ptrdiff_t k80 = 0;
for (; k80 != 32; ++k80) {
__m512 dat1295 = _mm512_maskz_loadu_ps(8191, datPtr12+228+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1296 = _mm512_maskz_loadu_ps(16383, datPtr12+272+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512i pm110 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in767 = _mm512_permutexvar_ps(pm110, dat1295);
__m512i pm111 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in774 = _mm512_permutexvar_ps(pm111, dat1296);
__m512 dat1297 = _mm512_maskz_loadu_ps(8191, datPtr12+452+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1298 = _mm512_maskz_loadu_ps(16383, datPtr12+496+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in768 = _mm512_permutexvar_ps(pm110, dat1297);
__m512 in775 = _mm512_permutexvar_ps(pm111, dat1298);
__m512 dat1299 = _mm512_maskz_loadu_ps(8191, datPtr12+676+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1300 = _mm512_maskz_loadu_ps(16383, datPtr12+720+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in769 = _mm512_permutexvar_ps(pm110, dat1299);
__m512 in776 = _mm512_permutexvar_ps(pm111, dat1300);
__m512 dat1301 = _mm512_maskz_loadu_ps(8191, datPtr12+900+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1302 = _mm512_maskz_loadu_ps(16383, datPtr12+944+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in770 = _mm512_permutexvar_ps(pm110, dat1301);
__m512 in777 = _mm512_permutexvar_ps(pm111, dat1302);
__m512 dat1303 = _mm512_maskz_loadu_ps(8191, datPtr12+1124+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1304 = _mm512_maskz_loadu_ps(16383, datPtr12+1168+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in771 = _mm512_permutexvar_ps(pm110, dat1303);
__m512 in778 = _mm512_permutexvar_ps(pm111, dat1304);
__m512 dat1305 = _mm512_maskz_loadu_ps(8191, datPtr12+1348+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1306 = _mm512_maskz_loadu_ps(16383, datPtr12+1392+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in772 = _mm512_permutexvar_ps(pm110, dat1305);
__m512 in779 = _mm512_permutexvar_ps(pm111, dat1306);
__m512 dat1307 = _mm512_maskz_loadu_ps(8191, datPtr12+1572+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1308 = _mm512_maskz_loadu_ps(16383, datPtr12+1616+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in773 = _mm512_permutexvar_ps(pm110, dat1307);
__m512 in780 = _mm512_permutexvar_ps(pm111, dat1308);
__m512 tmp5591 = _mm512_add_ps(in767, in771);
__m512 tmp5596 = _mm512_add_ps(in774, in778);
__m512 tmp5592 = _mm512_sub_ps(in770, in768);
__m512 tmp5597 = _mm512_sub_ps(in777, in775);
__m512 tmp5593 = _mm512_add_ps(in768, in772);
__m512 tmp5598 = _mm512_add_ps(in775, in779);
__m512 tmp5594 = _mm512_sub_ps(_mm512_setzero_ps(), in772);
__m512 tmp5599 = _mm512_sub_ps(_mm512_setzero_ps(), in779);
tmp5591 = _mm512_fmadd_ps(in769, _mm512_set1_ps(-4.25e+00f), tmp5591);
tmp5596 = _mm512_fmadd_ps(in776, _mm512_set1_ps(-4.25e+00f), tmp5596);
tmp5593 = _mm512_fmadd_ps(in770, _mm512_set1_ps(-4.25e+00f), tmp5593);
tmp5598 = _mm512_fmadd_ps(in777, _mm512_set1_ps(-4.25e+00f), tmp5598);
tmp5594 = _mm512_fmadd_ps(tmp5592, _mm512_set1_ps(5.25e+00f), tmp5594);
tmp5599 = _mm512_fmadd_ps(tmp5597, _mm512_set1_ps(5.25e+00f), tmp5599);
tmp5592 = _mm512_fmadd_ps(in768, _mm512_set1_ps(2.5e-01f), in772);
tmp5597 = _mm512_fmadd_ps(in775, _mm512_set1_ps(2.5e-01f), in779);
in768 = _mm512_fmadd_ps(in768, _mm512_set1_ps(4e+00f), in772);
in775 = _mm512_fmadd_ps(in775, _mm512_set1_ps(4e+00f), in779);
__m512 tmp5595 = _mm512_sub_ps(tmp5593, tmp5591);
__m512 tmp5600 = _mm512_sub_ps(tmp5598, tmp5596);
tmp5593 = _mm512_add_ps(tmp5591, tmp5593);
tmp5598 = _mm512_add_ps(tmp5596, tmp5598);
tmp5591 = _mm512_fmadd_ps(in767, _mm512_set1_ps(2.5e-01f), in771);
tmp5596 = _mm512_fmadd_ps(in774, _mm512_set1_ps(2.5e-01f), in778);
tmp5592 = _mm512_fmadd_ps(in770, _mm512_set1_ps(-1.25e+00f), tmp5592);
tmp5597 = _mm512_fmadd_ps(in777, _mm512_set1_ps(-1.25e+00f), tmp5597);
in770 = _mm512_fmadd_ps(in770, _mm512_set1_ps(-5e+00f), in768);
in777 = _mm512_fmadd_ps(in777, _mm512_set1_ps(-5e+00f), in775);
tmp5591 = _mm512_fmadd_ps(in769, _mm512_set1_ps(-1.25e+00f), tmp5591);
tmp5596 = _mm512_fmadd_ps(in776, _mm512_set1_ps(-1.25e+00f), tmp5596);
in772 = _mm512_fmadd_ps(tmp5591, _mm512_set1_ps(2e+00f), tmp5592);
in779 = _mm512_fmadd_ps(tmp5596, _mm512_set1_ps(2e+00f), tmp5597);
tmp5592 = _mm512_fnmadd_ps(tmp5591, _mm512_set1_ps(2e+00f), tmp5592);
tmp5597 = _mm512_fnmadd_ps(tmp5596, _mm512_set1_ps(2e+00f), tmp5597);
tmp5591 = _mm512_fmadd_ps(in771, _mm512_set1_ps(2.5e-01f), in767);
tmp5596 = _mm512_fmadd_ps(in778, _mm512_set1_ps(2.5e-01f), in774);
in767 = _mm512_sub_ps(in773, in767);
in774 = _mm512_sub_ps(in780, in774);
tmp5591 = _mm512_fmadd_ps(in769, _mm512_set1_ps(-1.25e+00f), tmp5591);
tmp5596 = _mm512_fmadd_ps(in776, _mm512_set1_ps(-1.25e+00f), tmp5596);
in769 = _mm512_sub_ps(in769, in771);
in776 = _mm512_sub_ps(in776, in778);
in769 = _mm512_fmadd_ps(in769, _mm512_set1_ps(5.25e+00f), in767);
in776 = _mm512_fmadd_ps(in776, _mm512_set1_ps(5.25e+00f), in774);
in768 = _mm512_fmadd_ps(tmp5591, _mm512_set1_ps(2e+00f), in770);
in775 = _mm512_fmadd_ps(tmp5596, _mm512_set1_ps(2e+00f), in777);
in770 = _mm512_fnmadd_ps(tmp5591, _mm512_set1_ps(2e+00f), in770);
in777 = _mm512_fnmadd_ps(tmp5596, _mm512_set1_ps(2e+00f), in777);
__m512 tmp5609 = _mm512_unpacklo_ps(tmp5594, tmp5593);
__m512 tmp5610 = _mm512_unpackhi_ps(tmp5594, tmp5593);
__m512 tmp5611 = _mm512_unpacklo_ps(tmp5595, in772);
__m512 tmp5612 = _mm512_unpackhi_ps(tmp5595, in772);
__m512 tmp5613 = _mm512_unpacklo_ps(tmp5592, in768);
__m512 tmp5614 = _mm512_unpackhi_ps(tmp5592, in768);
__m512 tmp5615 = _mm512_unpacklo_ps(in770, in769);
__m512 tmp5616 = _mm512_unpackhi_ps(in770, in769);
__m512 tmp5617 = _mm512_unpacklo_ps(tmp5599, tmp5598);
__m512 tmp5618 = _mm512_unpackhi_ps(tmp5599, tmp5598);
__m512 tmp5619 = _mm512_unpacklo_ps(tmp5600, in779);
__m512 tmp5620 = _mm512_unpackhi_ps(tmp5600, in779);
__m512 tmp5621 = _mm512_unpacklo_ps(tmp5597, in775);
__m512 tmp5622 = _mm512_unpackhi_ps(tmp5597, in775);
__m512 tmp5623 = _mm512_unpacklo_ps(in777, in776);
__m512 tmp5624 = _mm512_unpackhi_ps(in777, in776);
__m512 tmp5625 = _mm512_shuffle_ps(tmp5609, tmp5611, 68);
__m512 tmp5626 = _mm512_shuffle_ps(tmp5609, tmp5611, 238);
__m512 tmp5627 = _mm512_shuffle_ps(tmp5610, tmp5612, 68);
__m512 tmp5628 = _mm512_shuffle_ps(tmp5610, tmp5612, 238);
__m512 tmp5629 = _mm512_shuffle_ps(tmp5613, tmp5615, 68);
__m512 tmp5630 = _mm512_shuffle_ps(tmp5613, tmp5615, 238);
__m512 tmp5631 = _mm512_shuffle_ps(tmp5614, tmp5616, 68);
__m512 tmp5632 = _mm512_shuffle_ps(tmp5614, tmp5616, 238);
__m512 tmp5633 = _mm512_shuffle_ps(tmp5617, tmp5619, 68);
__m512 tmp5634 = _mm512_shuffle_ps(tmp5617, tmp5619, 238);
__m512 tmp5635 = _mm512_shuffle_ps(tmp5618, tmp5620, 68);
__m512 tmp5636 = _mm512_shuffle_ps(tmp5618, tmp5620, 238);
__m512 tmp5637 = _mm512_shuffle_ps(tmp5621, tmp5623, 68);
__m512 tmp5638 = _mm512_shuffle_ps(tmp5621, tmp5623, 238);
__m512 tmp5639 = _mm512_shuffle_ps(tmp5622, tmp5624, 68);
__m512 tmp5640 = _mm512_shuffle_ps(tmp5622, tmp5624, 238);
__m512 tmp5641 = _mm512_shuffle_f32x4(tmp5625, tmp5629, 136);
__m512 tmp5642 = _mm512_shuffle_f32x4(tmp5625, tmp5629, 221);
__m512 tmp5643 = _mm512_shuffle_f32x4(tmp5626, tmp5630, 136);
__m512 tmp5644 = _mm512_shuffle_f32x4(tmp5626, tmp5630, 221);
__m512 tmp5645 = _mm512_shuffle_f32x4(tmp5627, tmp5631, 136);
__m512 tmp5646 = _mm512_shuffle_f32x4(tmp5627, tmp5631, 221);
__m512 tmp5647 = _mm512_shuffle_f32x4(tmp5628, tmp5632, 136);
__m512 tmp5648 = _mm512_shuffle_f32x4(tmp5628, tmp5632, 221);
__m512 tmp5649 = _mm512_shuffle_f32x4(tmp5633, tmp5637, 136);
__m512 tmp5650 = _mm512_shuffle_f32x4(tmp5633, tmp5637, 221);
__m512 tmp5651 = _mm512_shuffle_f32x4(tmp5634, tmp5638, 136);
__m512 tmp5652 = _mm512_shuffle_f32x4(tmp5634, tmp5638, 221);
__m512 tmp5653 = _mm512_shuffle_f32x4(tmp5635, tmp5639, 136);
__m512 tmp5654 = _mm512_shuffle_f32x4(tmp5635, tmp5639, 221);
__m512 tmp5655 = _mm512_shuffle_f32x4(tmp5636, tmp5640, 136);
__m512 tmp5656 = _mm512_shuffle_f32x4(tmp5636, tmp5640, 221);
tmp5594 = _mm512_shuffle_f32x4(tmp5641, tmp5649, 136);
tmp5599 = _mm512_shuffle_f32x4(tmp5641, tmp5649, 221);
tmp5593 = _mm512_shuffle_f32x4(tmp5643, tmp5651, 136);
tmp5598 = _mm512_shuffle_f32x4(tmp5643, tmp5651, 221);
tmp5595 = _mm512_shuffle_f32x4(tmp5645, tmp5653, 136);
tmp5600 = _mm512_shuffle_f32x4(tmp5645, tmp5653, 221);
in772 = _mm512_shuffle_f32x4(tmp5647, tmp5655, 136);
in779 = _mm512_shuffle_f32x4(tmp5647, tmp5655, 221);
tmp5592 = _mm512_shuffle_f32x4(tmp5642, tmp5650, 136);
tmp5597 = _mm512_shuffle_f32x4(tmp5642, tmp5650, 221);
in768 = _mm512_shuffle_f32x4(tmp5644, tmp5652, 136);
in775 = _mm512_shuffle_f32x4(tmp5644, tmp5652, 221);
in770 = _mm512_shuffle_f32x4(tmp5646, tmp5654, 136);
in777 = _mm512_shuffle_f32x4(tmp5646, tmp5654, 221);
in769 = _mm512_shuffle_f32x4(tmp5648, tmp5656, 136);
in776 = _mm512_shuffle_f32x4(tmp5648, tmp5656, 221);
// Second 8-point pass for this tile, applied to the transposed registers.
// Two independent chains are interleaved line by line. Naming the inputs of
// the first chain x0..x7 = tmp5594, tmp5593, tmp5595, in772, tmp5592, in768,
// in770, in769, the second chain uses tmp5599, tmp5598, tmp5600, in779,
// tmp5597, in775, in777, in776 in the same roles.
// _mm512_fmadd_ps(a, b, c) is a*b+c; _mm512_fnmadd_ps(a, b, c) is c-a*b.
// Registers are overwritten as soon as their previous value is dead, so the
// statement order matters.
// NOTE(review): the coefficients (5.25, 4.25, 0.25, 4, 1.25, 5, 2) look like a
// Winograd-style 8-point input transform -- confirm against the generator.
__m512 tmp5601 = _mm512_add_ps(tmp5593, in768);
__m512 tmp5605 = _mm512_add_ps(tmp5598, in775);
__m512 tmp5602 = _mm512_sub_ps(tmp5592, tmp5595);
__m512 tmp5606 = _mm512_sub_ps(tmp5597, tmp5600);
__m512 tmp5603 = _mm512_add_ps(tmp5595, in770);
__m512 tmp5607 = _mm512_add_ps(tmp5600, in777);
tmp5594 = _mm512_sub_ps(tmp5594, in770);
tmp5599 = _mm512_sub_ps(tmp5599, in777);
// tmp5601 = x1 + x5 - 4.25*x3, tmp5603 = x2 + x6 - 4.25*x4.
tmp5601 = _mm512_fmadd_ps(in772, _mm512_set1_ps(-4.25e+00f), tmp5601);
tmp5605 = _mm512_fmadd_ps(in779, _mm512_set1_ps(-4.25e+00f), tmp5605);
tmp5603 = _mm512_fmadd_ps(tmp5592, _mm512_set1_ps(-4.25e+00f), tmp5603);
tmp5607 = _mm512_fmadd_ps(tmp5597, _mm512_set1_ps(-4.25e+00f), tmp5607);
// Output 0: x0 - x6 + 5.25*(x4 - x2).
tmp5594 = _mm512_fmadd_ps(tmp5602, _mm512_set1_ps(5.25e+00f), tmp5594);
tmp5599 = _mm512_fmadd_ps(tmp5606, _mm512_set1_ps(5.25e+00f), tmp5599);
tmp5602 = _mm512_fmadd_ps(tmp5595, _mm512_set1_ps(2.5e-01f), in770);
tmp5606 = _mm512_fmadd_ps(tmp5600, _mm512_set1_ps(2.5e-01f), in777);
tmp5595 = _mm512_fmadd_ps(tmp5595, _mm512_set1_ps(4e+00f), in770);
tmp5600 = _mm512_fmadd_ps(tmp5600, _mm512_set1_ps(4e+00f), in777);
// Outputs 2 and 1: difference and sum of the two partial sums built above.
__m512 tmp5604 = _mm512_sub_ps(tmp5603, tmp5601);
__m512 tmp5608 = _mm512_sub_ps(tmp5607, tmp5605);
tmp5603 = _mm512_add_ps(tmp5601, tmp5603);
tmp5607 = _mm512_add_ps(tmp5605, tmp5607);
tmp5601 = _mm512_fmadd_ps(tmp5593, _mm512_set1_ps(2.5e-01f), in768);
tmp5605 = _mm512_fmadd_ps(tmp5598, _mm512_set1_ps(2.5e-01f), in775);
tmp5602 = _mm512_fmadd_ps(tmp5592, _mm512_set1_ps(-1.25e+00f), tmp5602);
tmp5606 = _mm512_fmadd_ps(tmp5597, _mm512_set1_ps(-1.25e+00f), tmp5606);
tmp5592 = _mm512_fmadd_ps(tmp5592, _mm512_set1_ps(-5e+00f), tmp5595);
tmp5597 = _mm512_fmadd_ps(tmp5597, _mm512_set1_ps(-5e+00f), tmp5600);
tmp5601 = _mm512_fmadd_ps(in772, _mm512_set1_ps(-1.25e+00f), tmp5601);
tmp5605 = _mm512_fmadd_ps(in779, _mm512_set1_ps(-1.25e+00f), tmp5605);
// Outputs 3 (in770/in777) and 4 (tmp5602/tmp5606): even part +/- 2 * odd part.
in770 = _mm512_fmadd_ps(tmp5601, _mm512_set1_ps(2e+00f), tmp5602);
in777 = _mm512_fmadd_ps(tmp5605, _mm512_set1_ps(2e+00f), tmp5606);
tmp5602 = _mm512_fnmadd_ps(tmp5601, _mm512_set1_ps(2e+00f), tmp5602);
tmp5606 = _mm512_fnmadd_ps(tmp5605, _mm512_set1_ps(2e+00f), tmp5606);
tmp5601 = _mm512_fmadd_ps(in768, _mm512_set1_ps(2.5e-01f), tmp5593);
tmp5605 = _mm512_fmadd_ps(in775, _mm512_set1_ps(2.5e-01f), tmp5598);
tmp5593 = _mm512_sub_ps(in769, tmp5593);
tmp5598 = _mm512_sub_ps(in776, tmp5598);
tmp5601 = _mm512_fmadd_ps(in772, _mm512_set1_ps(-1.25e+00f), tmp5601);
tmp5605 = _mm512_fmadd_ps(in779, _mm512_set1_ps(-1.25e+00f), tmp5605);
// Output 7: x7 - x1 + 5.25*(x3 - x5), left in in772/in779.
in772 = _mm512_sub_ps(in772, in768);
in779 = _mm512_sub_ps(in779, in775);
in772 = _mm512_fmadd_ps(in772, _mm512_set1_ps(5.25e+00f), tmp5593);
in779 = _mm512_fmadd_ps(in779, _mm512_set1_ps(5.25e+00f), tmp5598);
// Outputs 5 (tmp5595/tmp5600) and 6 (tmp5592/tmp5597).
tmp5595 = _mm512_fmadd_ps(tmp5601, _mm512_set1_ps(2e+00f), tmp5592);
tmp5600 = _mm512_fmadd_ps(tmp5605, _mm512_set1_ps(2e+00f), tmp5597);
tmp5592 = _mm512_fnmadd_ps(tmp5601, _mm512_set1_ps(2e+00f), tmp5592);
tmp5597 = _mm512_fnmadd_ps(tmp5605, _mm512_set1_ps(2e+00f), tmp5597);
// Pack phase: pair up consecutive outputs of the second pass. With
// _mm512_shuffle_f32x4, immediate 68 (0x44) concatenates the low 256 bits of
// both operands and 238 (0xEE) concatenates their high 256 bits.
__m512 out707 = _mm512_shuffle_f32x4(tmp5594, tmp5603, 68);
__m512 out715 = _mm512_shuffle_f32x4(tmp5594, tmp5603, 238);
__m512 out708 = _mm512_shuffle_f32x4(tmp5604, in770, 68);
__m512 out716 = _mm512_shuffle_f32x4(tmp5604, in770, 238);
__m512 out709 = _mm512_shuffle_f32x4(tmp5602, tmp5595, 68);
__m512 out717 = _mm512_shuffle_f32x4(tmp5602, tmp5595, 238);
__m512 out710 = _mm512_shuffle_f32x4(tmp5592, in772, 68);
__m512 out718 = _mm512_shuffle_f32x4(tmp5592, in772, 238);
__m512 out711 = _mm512_shuffle_f32x4(tmp5599, tmp5607, 68);
__m512 out719 = _mm512_shuffle_f32x4(tmp5599, tmp5607, 238);
__m512 out712 = _mm512_shuffle_f32x4(tmp5608, in777, 68);
__m512 out720 = _mm512_shuffle_f32x4(tmp5608, in777, 238);
__m512 out713 = _mm512_shuffle_f32x4(tmp5606, tmp5600, 68);
__m512 out721 = _mm512_shuffle_f32x4(tmp5606, tmp5600, 238);
__m512 out714 = _mm512_shuffle_f32x4(tmp5597, in779, 68);
__m512 out722 = _mm512_shuffle_f32x4(tmp5597, in779, 238);
// Store phase: sixteen unaligned stores into dfPtr6, in four groups whose base
// offsets are 0, 409600, 819200 and 1228800; inside a group the vectors land at
// +0, +128, +64 and +192 (in statement order).
// NOTE(review): dfPtr6 is declared outside this view; the offsets are
// presumably in bytes -- confirm.
_mm512_storeu_ps(dfPtr6+0+1638400*i26+24576*j21+24576*s20+768*k80, out707);
_mm512_storeu_ps(dfPtr6+128+1638400*i26+24576*j21+24576*s20+768*k80, out715);
_mm512_storeu_ps(dfPtr6+64+1638400*i26+24576*j21+24576*s20+768*k80, out711);
_mm512_storeu_ps(dfPtr6+192+1638400*i26+24576*j21+24576*s20+768*k80, out719);
_mm512_storeu_ps(dfPtr6+409600+1638400*i26+24576*j21+24576*s20+768*k80, out708);
_mm512_storeu_ps(dfPtr6+409728+1638400*i26+24576*j21+24576*s20+768*k80, out716);
_mm512_storeu_ps(dfPtr6+409664+1638400*i26+24576*j21+24576*s20+768*k80, out712);
_mm512_storeu_ps(dfPtr6+409792+1638400*i26+24576*j21+24576*s20+768*k80, out720);
_mm512_storeu_ps(dfPtr6+819200+1638400*i26+24576*j21+24576*s20+768*k80, out709);
_mm512_storeu_ps(dfPtr6+819328+1638400*i26+24576*j21+24576*s20+768*k80, out717);
_mm512_storeu_ps(dfPtr6+819264+1638400*i26+24576*j21+24576*s20+768*k80, out713);
_mm512_storeu_ps(dfPtr6+819392+1638400*i26+24576*j21+24576*s20+768*k80, out721);
_mm512_storeu_ps(dfPtr6+1228800+1638400*i26+24576*j21+24576*s20+768*k80, out710);
_mm512_storeu_ps(dfPtr6+1228928+1638400*i26+24576*j21+24576*s20+768*k80, out718);
_mm512_storeu_ps(dfPtr6+1228864+1638400*i26+24576*j21+24576*s20+768*k80, out714);
_mm512_storeu_ps(dfPtr6+1228992+1638400*i26+24576*j21+24576*s20+768*k80, out722);
// Load phase for the next tile: seven pairs of masked loads, each pair giving
// one register for the first chain (in781..in787) and one for the second chain
// (in788..in794). Successive pairs step the datPtr12 offset by 224
// (320, 544, ..., 1664 and 12836, 13060, ..., 14180).
// NOTE(review): datPtr12 is declared outside this view; the 224*h29 and 4*w36
// terms suggest byte offsets into rows of floats -- confirm.
// Mask 16383 (0x3FFF) loads lanes 0..13 and zeroes lanes 14..15;
// mask 8191 (0x1FFF) loads lanes 0..12 and zeroes lanes 13..15.
__m512 dat1309 = _mm512_maskz_loadu_ps(16383, datPtr12+320+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1310 = _mm512_maskz_loadu_ps(8191, datPtr12+12836+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
// pm112: lanes 0..7 <- elements 0..7, lanes 8..15 <- elements 6..13, i.e. two
// 8-wide windows that overlap by two elements.
__m512i pm112 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in781 = _mm512_permutexvar_ps(pm112, dat1309);
// pm113: lane 0 <- element 15 (zeroed by the 8191 mask), lanes 1..7 <-
// elements 0..6, lanes 8..15 <- elements 5..12: the same two windows shifted
// by one element, with a zero in front.
__m512i pm113 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in788 = _mm512_permutexvar_ps(pm113, dat1310);
__m512 dat1311 = _mm512_maskz_loadu_ps(16383, datPtr12+544+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1312 = _mm512_maskz_loadu_ps(8191, datPtr12+13060+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in782 = _mm512_permutexvar_ps(pm112, dat1311);
__m512 in789 = _mm512_permutexvar_ps(pm113, dat1312);
__m512 dat1313 = _mm512_maskz_loadu_ps(16383, datPtr12+768+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1314 = _mm512_maskz_loadu_ps(8191, datPtr12+13284+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in783 = _mm512_permutexvar_ps(pm112, dat1313);
__m512 in790 = _mm512_permutexvar_ps(pm113, dat1314);
__m512 dat1315 = _mm512_maskz_loadu_ps(16383, datPtr12+992+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1316 = _mm512_maskz_loadu_ps(8191, datPtr12+13508+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in784 = _mm512_permutexvar_ps(pm112, dat1315);
__m512 in791 = _mm512_permutexvar_ps(pm113, dat1316);
__m512 dat1317 = _mm512_maskz_loadu_ps(16383, datPtr12+1216+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1318 = _mm512_maskz_loadu_ps(8191, datPtr12+13732+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in785 = _mm512_permutexvar_ps(pm112, dat1317);
__m512 in792 = _mm512_permutexvar_ps(pm113, dat1318);
__m512 dat1319 = _mm512_maskz_loadu_ps(16383, datPtr12+1440+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1320 = _mm512_maskz_loadu_ps(8191, datPtr12+13956+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in786 = _mm512_permutexvar_ps(pm112, dat1319);
__m512 in793 = _mm512_permutexvar_ps(pm113, dat1320);
__m512 dat1321 = _mm512_maskz_loadu_ps(16383, datPtr12+1664+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1322 = _mm512_maskz_loadu_ps(8191, datPtr12+14180+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in787 = _mm512_permutexvar_ps(pm112, dat1321);
__m512 in794 = _mm512_permutexvar_ps(pm113, dat1322);
// First 8-point pass over the loaded registers. Only seven inputs exist per
// chain (x1..x7 = in781..in787 for the first chain, in788..in794 for the
// second); the x0 input is an implicit zero, which is why the "x0 - x6" term
// is formed from _mm512_setzero_ps().
// The two chains are interleaved line by line and registers are overwritten
// as soon as their old value is dead, so the statement order matters.
// _mm512_fmadd_ps(a, b, c) is a*b+c; _mm512_fnmadd_ps(a, b, c) is c-a*b.
// Results, in output order 0..7: tmp5660, tmp5659, tmp5661, in786, tmp5658,
// in782, in784, in783 (second chain: tmp5665, tmp5664, tmp5666, in793,
// tmp5663, in789, in791, in790).
// NOTE(review): the coefficients look like a Winograd-style 8-point input
// transform -- confirm against the generator.
__m512 tmp5657 = _mm512_add_ps(in781, in785);
__m512 tmp5662 = _mm512_add_ps(in788, in792);
__m512 tmp5658 = _mm512_sub_ps(in784, in782);
__m512 tmp5663 = _mm512_sub_ps(in791, in789);
__m512 tmp5659 = _mm512_add_ps(in782, in786);
__m512 tmp5664 = _mm512_add_ps(in789, in793);
// x0 - x6 with x0 == 0.
__m512 tmp5660 = _mm512_sub_ps(_mm512_setzero_ps(), in786);
__m512 tmp5665 = _mm512_sub_ps(_mm512_setzero_ps(), in793);
tmp5657 = _mm512_fmadd_ps(in783, _mm512_set1_ps(-4.25e+00f), tmp5657);
tmp5662 = _mm512_fmadd_ps(in790, _mm512_set1_ps(-4.25e+00f), tmp5662);
tmp5659 = _mm512_fmadd_ps(in784, _mm512_set1_ps(-4.25e+00f), tmp5659);
tmp5664 = _mm512_fmadd_ps(in791, _mm512_set1_ps(-4.25e+00f), tmp5664);
// Output 0: -x6 + 5.25*(x4 - x2).
tmp5660 = _mm512_fmadd_ps(tmp5658, _mm512_set1_ps(5.25e+00f), tmp5660);
tmp5665 = _mm512_fmadd_ps(tmp5663, _mm512_set1_ps(5.25e+00f), tmp5665);
tmp5658 = _mm512_fmadd_ps(in782, _mm512_set1_ps(2.5e-01f), in786);
tmp5663 = _mm512_fmadd_ps(in789, _mm512_set1_ps(2.5e-01f), in793);
// From here on in782/in789 no longer hold raw input: they become 4*x2 + x6.
in782 = _mm512_fmadd_ps(in782, _mm512_set1_ps(4e+00f), in786);
in789 = _mm512_fmadd_ps(in789, _mm512_set1_ps(4e+00f), in793);
// Outputs 2 and 1: difference and sum of the two partial sums built above.
__m512 tmp5661 = _mm512_sub_ps(tmp5659, tmp5657);
__m512 tmp5666 = _mm512_sub_ps(tmp5664, tmp5662);
tmp5659 = _mm512_add_ps(tmp5657, tmp5659);
tmp5664 = _mm512_add_ps(tmp5662, tmp5664);
tmp5657 = _mm512_fmadd_ps(in781, _mm512_set1_ps(2.5e-01f), in785);
tmp5662 = _mm512_fmadd_ps(in788, _mm512_set1_ps(2.5e-01f), in792);
tmp5658 = _mm512_fmadd_ps(in784, _mm512_set1_ps(-1.25e+00f), tmp5658);
tmp5663 = _mm512_fmadd_ps(in791, _mm512_set1_ps(-1.25e+00f), tmp5663);
in784 = _mm512_fmadd_ps(in784, _mm512_set1_ps(-5e+00f), in782);
in791 = _mm512_fmadd_ps(in791, _mm512_set1_ps(-5e+00f), in789);
tmp5657 = _mm512_fmadd_ps(in783, _mm512_set1_ps(-1.25e+00f), tmp5657);
tmp5662 = _mm512_fmadd_ps(in790, _mm512_set1_ps(-1.25e+00f), tmp5662);
// Outputs 3 (in786/in793) and 4 (tmp5658/tmp5663).
in786 = _mm512_fmadd_ps(tmp5657, _mm512_set1_ps(2e+00f), tmp5658);
in793 = _mm512_fmadd_ps(tmp5662, _mm512_set1_ps(2e+00f), tmp5663);
tmp5658 = _mm512_fnmadd_ps(tmp5657, _mm512_set1_ps(2e+00f), tmp5658);
tmp5663 = _mm512_fnmadd_ps(tmp5662, _mm512_set1_ps(2e+00f), tmp5663);
tmp5657 = _mm512_fmadd_ps(in785, _mm512_set1_ps(2.5e-01f), in781);
tmp5662 = _mm512_fmadd_ps(in792, _mm512_set1_ps(2.5e-01f), in788);
in781 = _mm512_sub_ps(in787, in781);
in788 = _mm512_sub_ps(in794, in788);
tmp5657 = _mm512_fmadd_ps(in783, _mm512_set1_ps(-1.25e+00f), tmp5657);
tmp5662 = _mm512_fmadd_ps(in790, _mm512_set1_ps(-1.25e+00f), tmp5662);
// Output 7: x7 - x1 + 5.25*(x3 - x5), left in in783/in790.
in783 = _mm512_sub_ps(in783, in785);
in790 = _mm512_sub_ps(in790, in792);
in783 = _mm512_fmadd_ps(in783, _mm512_set1_ps(5.25e+00f), in781);
in790 = _mm512_fmadd_ps(in790, _mm512_set1_ps(5.25e+00f), in788);
// Outputs 5 (in782/in789) and 6 (in784/in791).
in782 = _mm512_fmadd_ps(tmp5657, _mm512_set1_ps(2e+00f), in784);
in789 = _mm512_fmadd_ps(tmp5662, _mm512_set1_ps(2e+00f), in791);
in784 = _mm512_fnmadd_ps(tmp5657, _mm512_set1_ps(2e+00f), in784);
in791 = _mm512_fnmadd_ps(tmp5662, _mm512_set1_ps(2e+00f), in791);
// Transpose phase over the sixteen first-pass results (eight per chain).
// Stage 1 (unpacklo/unpackhi) and stage 2 (shuffle_ps with 68 = 0x44 and
// 238 = 0xEE) transpose 4x4 groups inside every 128-bit lane. Stages 3 and 4
// (shuffle_f32x4) regroup whole 128-bit lanes: immediate 136 (0x88) picks
// lanes 0 and 2 of each operand, 221 (0xDD) picks lanes 1 and 3.
// Stage 1: interleave pairs of first-pass outputs.
__m512 tmp5675 = _mm512_unpacklo_ps(tmp5660, tmp5659);
__m512 tmp5676 = _mm512_unpackhi_ps(tmp5660, tmp5659);
__m512 tmp5677 = _mm512_unpacklo_ps(tmp5661, in786);
__m512 tmp5678 = _mm512_unpackhi_ps(tmp5661, in786);
__m512 tmp5679 = _mm512_unpacklo_ps(tmp5658, in782);
__m512 tmp5680 = _mm512_unpackhi_ps(tmp5658, in782);
__m512 tmp5681 = _mm512_unpacklo_ps(in784, in783);
__m512 tmp5682 = _mm512_unpackhi_ps(in784, in783);
__m512 tmp5683 = _mm512_unpacklo_ps(tmp5665, tmp5664);
__m512 tmp5684 = _mm512_unpackhi_ps(tmp5665, tmp5664);
__m512 tmp5685 = _mm512_unpacklo_ps(tmp5666, in793);
__m512 tmp5686 = _mm512_unpackhi_ps(tmp5666, in793);
__m512 tmp5687 = _mm512_unpacklo_ps(tmp5663, in789);
__m512 tmp5688 = _mm512_unpackhi_ps(tmp5663, in789);
__m512 tmp5689 = _mm512_unpacklo_ps(in791, in790);
__m512 tmp5690 = _mm512_unpackhi_ps(in791, in790);
// Stage 2: combine 64-bit halves to finish the per-lane 4x4 transposes.
__m512 tmp5691 = _mm512_shuffle_ps(tmp5675, tmp5677, 68);
__m512 tmp5692 = _mm512_shuffle_ps(tmp5675, tmp5677, 238);
__m512 tmp5693 = _mm512_shuffle_ps(tmp5676, tmp5678, 68);
__m512 tmp5694 = _mm512_shuffle_ps(tmp5676, tmp5678, 238);
__m512 tmp5695 = _mm512_shuffle_ps(tmp5679, tmp5681, 68);
__m512 tmp5696 = _mm512_shuffle_ps(tmp5679, tmp5681, 238);
__m512 tmp5697 = _mm512_shuffle_ps(tmp5680, tmp5682, 68);
__m512 tmp5698 = _mm512_shuffle_ps(tmp5680, tmp5682, 238);
__m512 tmp5699 = _mm512_shuffle_ps(tmp5683, tmp5685, 68);
__m512 tmp5700 = _mm512_shuffle_ps(tmp5683, tmp5685, 238);
__m512 tmp5701 = _mm512_shuffle_ps(tmp5684, tmp5686, 68);
__m512 tmp5702 = _mm512_shuffle_ps(tmp5684, tmp5686, 238);
__m512 tmp5703 = _mm512_shuffle_ps(tmp5687, tmp5689, 68);
__m512 tmp5704 = _mm512_shuffle_ps(tmp5687, tmp5689, 238);
__m512 tmp5705 = _mm512_shuffle_ps(tmp5688, tmp5690, 68);
__m512 tmp5706 = _mm512_shuffle_ps(tmp5688, tmp5690, 238);
// Stage 3: merge outputs 0..3 with outputs 4..7 within each chain.
__m512 tmp5707 = _mm512_shuffle_f32x4(tmp5691, tmp5695, 136);
__m512 tmp5708 = _mm512_shuffle_f32x4(tmp5691, tmp5695, 221);
__m512 tmp5709 = _mm512_shuffle_f32x4(tmp5692, tmp5696, 136);
__m512 tmp5710 = _mm512_shuffle_f32x4(tmp5692, tmp5696, 221);
__m512 tmp5711 = _mm512_shuffle_f32x4(tmp5693, tmp5697, 136);
__m512 tmp5712 = _mm512_shuffle_f32x4(tmp5693, tmp5697, 221);
__m512 tmp5713 = _mm512_shuffle_f32x4(tmp5694, tmp5698, 136);
__m512 tmp5714 = _mm512_shuffle_f32x4(tmp5694, tmp5698, 221);
__m512 tmp5715 = _mm512_shuffle_f32x4(tmp5699, tmp5703, 136);
__m512 tmp5716 = _mm512_shuffle_f32x4(tmp5699, tmp5703, 221);
__m512 tmp5717 = _mm512_shuffle_f32x4(tmp5700, tmp5704, 136);
__m512 tmp5718 = _mm512_shuffle_f32x4(tmp5700, tmp5704, 221);
__m512 tmp5719 = _mm512_shuffle_f32x4(tmp5701, tmp5705, 136);
__m512 tmp5720 = _mm512_shuffle_f32x4(tmp5701, tmp5705, 221);
__m512 tmp5721 = _mm512_shuffle_f32x4(tmp5702, tmp5706, 136);
__m512 tmp5722 = _mm512_shuffle_f32x4(tmp5702, tmp5706, 221);
// Stage 4: merge the two chains and write back into the first-pass register
// names. Each result holds one column position, first chain in lanes 0..7 and
// second chain in lanes 8..15; the "136" results carry the first 8-wide
// window and the "221" results the second.
tmp5660 = _mm512_shuffle_f32x4(tmp5707, tmp5715, 136);
tmp5665 = _mm512_shuffle_f32x4(tmp5707, tmp5715, 221);
tmp5659 = _mm512_shuffle_f32x4(tmp5709, tmp5717, 136);
tmp5664 = _mm512_shuffle_f32x4(tmp5709, tmp5717, 221);
tmp5661 = _mm512_shuffle_f32x4(tmp5711, tmp5719, 136);
tmp5666 = _mm512_shuffle_f32x4(tmp5711, tmp5719, 221);
in786 = _mm512_shuffle_f32x4(tmp5713, tmp5721, 136);
in793 = _mm512_shuffle_f32x4(tmp5713, tmp5721, 221);
tmp5658 = _mm512_shuffle_f32x4(tmp5708, tmp5716, 136);
tmp5663 = _mm512_shuffle_f32x4(tmp5708, tmp5716, 221);
in782 = _mm512_shuffle_f32x4(tmp5710, tmp5718, 136);
in789 = _mm512_shuffle_f32x4(tmp5710, tmp5718, 221);
in784 = _mm512_shuffle_f32x4(tmp5712, tmp5720, 136);
in791 = _mm512_shuffle_f32x4(tmp5712, tmp5720, 221);
in783 = _mm512_shuffle_f32x4(tmp5714, tmp5722, 136);
in790 = _mm512_shuffle_f32x4(tmp5714, tmp5722, 221);
// Second 8-point pass, applied to the transposed registers. Unlike the first
// pass all eight inputs are present: x0..x7 = tmp5660, tmp5659, tmp5661,
// in786, tmp5658, in782, in784, in783 (second chain: tmp5665, tmp5664,
// tmp5666, in793, tmp5663, in789, in791, in790).
// The two chains are interleaved line by line and registers are reused in
// place, so the statement order matters.
// Results, in output order 0..7: tmp5660, tmp5669, tmp5670, in784, tmp5668,
// tmp5661, tmp5658, in786 (second chain: tmp5665, tmp5673, tmp5674, in791,
// tmp5672, tmp5666, tmp5663, in793).
__m512 tmp5667 = _mm512_add_ps(tmp5659, in782);
__m512 tmp5671 = _mm512_add_ps(tmp5664, in789);
__m512 tmp5668 = _mm512_sub_ps(tmp5658, tmp5661);
__m512 tmp5672 = _mm512_sub_ps(tmp5663, tmp5666);
__m512 tmp5669 = _mm512_add_ps(tmp5661, in784);
__m512 tmp5673 = _mm512_add_ps(tmp5666, in791);
tmp5660 = _mm512_sub_ps(tmp5660, in784);
tmp5665 = _mm512_sub_ps(tmp5665, in791);
tmp5667 = _mm512_fmadd_ps(in786, _mm512_set1_ps(-4.25e+00f), tmp5667);
tmp5671 = _mm512_fmadd_ps(in793, _mm512_set1_ps(-4.25e+00f), tmp5671);
tmp5669 = _mm512_fmadd_ps(tmp5658, _mm512_set1_ps(-4.25e+00f), tmp5669);
tmp5673 = _mm512_fmadd_ps(tmp5663, _mm512_set1_ps(-4.25e+00f), tmp5673);
// Output 0: x0 - x6 + 5.25*(x4 - x2).
tmp5660 = _mm512_fmadd_ps(tmp5668, _mm512_set1_ps(5.25e+00f), tmp5660);
tmp5665 = _mm512_fmadd_ps(tmp5672, _mm512_set1_ps(5.25e+00f), tmp5665);
tmp5668 = _mm512_fmadd_ps(tmp5661, _mm512_set1_ps(2.5e-01f), in784);
tmp5672 = _mm512_fmadd_ps(tmp5666, _mm512_set1_ps(2.5e-01f), in791);
tmp5661 = _mm512_fmadd_ps(tmp5661, _mm512_set1_ps(4e+00f), in784);
tmp5666 = _mm512_fmadd_ps(tmp5666, _mm512_set1_ps(4e+00f), in791);
// Outputs 2 and 1: difference and sum of the two partial sums built above.
__m512 tmp5670 = _mm512_sub_ps(tmp5669, tmp5667);
__m512 tmp5674 = _mm512_sub_ps(tmp5673, tmp5671);
tmp5669 = _mm512_add_ps(tmp5667, tmp5669);
tmp5673 = _mm512_add_ps(tmp5671, tmp5673);
tmp5667 = _mm512_fmadd_ps(tmp5659, _mm512_set1_ps(2.5e-01f), in782);
tmp5671 = _mm512_fmadd_ps(tmp5664, _mm512_set1_ps(2.5e-01f), in789);
tmp5668 = _mm512_fmadd_ps(tmp5658, _mm512_set1_ps(-1.25e+00f), tmp5668);
tmp5672 = _mm512_fmadd_ps(tmp5663, _mm512_set1_ps(-1.25e+00f), tmp5672);
tmp5658 = _mm512_fmadd_ps(tmp5658, _mm512_set1_ps(-5e+00f), tmp5661);
tmp5663 = _mm512_fmadd_ps(tmp5663, _mm512_set1_ps(-5e+00f), tmp5666);
tmp5667 = _mm512_fmadd_ps(in786, _mm512_set1_ps(-1.25e+00f), tmp5667);
tmp5671 = _mm512_fmadd_ps(in793, _mm512_set1_ps(-1.25e+00f), tmp5671);
// Outputs 3 (in784/in791) and 4 (tmp5668/tmp5672).
in784 = _mm512_fmadd_ps(tmp5667, _mm512_set1_ps(2e+00f), tmp5668);
in791 = _mm512_fmadd_ps(tmp5671, _mm512_set1_ps(2e+00f), tmp5672);
tmp5668 = _mm512_fnmadd_ps(tmp5667, _mm512_set1_ps(2e+00f), tmp5668);
tmp5672 = _mm512_fnmadd_ps(tmp5671, _mm512_set1_ps(2e+00f), tmp5672);
tmp5667 = _mm512_fmadd_ps(in782, _mm512_set1_ps(2.5e-01f), tmp5659);
tmp5671 = _mm512_fmadd_ps(in789, _mm512_set1_ps(2.5e-01f), tmp5664);
tmp5659 = _mm512_sub_ps(in783, tmp5659);
tmp5664 = _mm512_sub_ps(in790, tmp5664);
tmp5667 = _mm512_fmadd_ps(in786, _mm512_set1_ps(-1.25e+00f), tmp5667);
tmp5671 = _mm512_fmadd_ps(in793, _mm512_set1_ps(-1.25e+00f), tmp5671);
// Output 7: x7 - x1 + 5.25*(x3 - x5), left in in786/in793.
in786 = _mm512_sub_ps(in786, in782);
in793 = _mm512_sub_ps(in793, in789);
in786 = _mm512_fmadd_ps(in786, _mm512_set1_ps(5.25e+00f), tmp5659);
in793 = _mm512_fmadd_ps(in793, _mm512_set1_ps(5.25e+00f), tmp5664);
// Outputs 5 (tmp5661/tmp5666) and 6 (tmp5658/tmp5663).
tmp5661 = _mm512_fmadd_ps(tmp5667, _mm512_set1_ps(2e+00f), tmp5658);
tmp5666 = _mm512_fmadd_ps(tmp5671, _mm512_set1_ps(2e+00f), tmp5663);
tmp5658 = _mm512_fnmadd_ps(tmp5667, _mm512_set1_ps(2e+00f), tmp5658);
tmp5663 = _mm512_fnmadd_ps(tmp5671, _mm512_set1_ps(2e+00f), tmp5663);
// Pack phase: pair up consecutive outputs of the second pass. With
// _mm512_shuffle_f32x4, immediate 68 (0x44) concatenates the low 256 bits of
// both operands and 238 (0xEE) concatenates their high 256 bits.
__m512 out723 = _mm512_shuffle_f32x4(tmp5660, tmp5669, 68);
__m512 out731 = _mm512_shuffle_f32x4(tmp5660, tmp5669, 238);
__m512 out724 = _mm512_shuffle_f32x4(tmp5670, in784, 68);
__m512 out732 = _mm512_shuffle_f32x4(tmp5670, in784, 238);
__m512 out725 = _mm512_shuffle_f32x4(tmp5668, tmp5661, 68);
__m512 out733 = _mm512_shuffle_f32x4(tmp5668, tmp5661, 238);
__m512 out726 = _mm512_shuffle_f32x4(tmp5658, in786, 68);
__m512 out734 = _mm512_shuffle_f32x4(tmp5658, in786, 238);
__m512 out727 = _mm512_shuffle_f32x4(tmp5665, tmp5673, 68);
__m512 out735 = _mm512_shuffle_f32x4(tmp5665, tmp5673, 238);
__m512 out728 = _mm512_shuffle_f32x4(tmp5674, in791, 68);
__m512 out736 = _mm512_shuffle_f32x4(tmp5674, in791, 238);
__m512 out729 = _mm512_shuffle_f32x4(tmp5672, tmp5666, 68);
__m512 out737 = _mm512_shuffle_f32x4(tmp5672, tmp5666, 238);
__m512 out730 = _mm512_shuffle_f32x4(tmp5663, in793, 68);
__m512 out738 = _mm512_shuffle_f32x4(tmp5663, in793, 238);
// Store phase: sixteen unaligned stores into dfPtr6, in four groups whose base
// offsets are 256, 409856, 819456 and 1229056 (409600 apart); inside a group
// the vectors land at +0, +128, +64 and +192 (in statement order).
// NOTE(review): dfPtr6 is declared outside this view; the offsets are
// presumably in bytes -- confirm.
_mm512_storeu_ps(dfPtr6+256+1638400*i26+24576*j21+24576*s20+768*k80, out723);
_mm512_storeu_ps(dfPtr6+384+1638400*i26+24576*j21+24576*s20+768*k80, out731);
_mm512_storeu_ps(dfPtr6+320+1638400*i26+24576*j21+24576*s20+768*k80, out727);
_mm512_storeu_ps(dfPtr6+448+1638400*i26+24576*j21+24576*s20+768*k80, out735);
_mm512_storeu_ps(dfPtr6+409856+1638400*i26+24576*j21+24576*s20+768*k80, out724);
_mm512_storeu_ps(dfPtr6+409984+1638400*i26+24576*j21+24576*s20+768*k80, out732);
_mm512_storeu_ps(dfPtr6+409920+1638400*i26+24576*j21+24576*s20+768*k80, out728);
_mm512_storeu_ps(dfPtr6+410048+1638400*i26+24576*j21+24576*s20+768*k80, out736);
_mm512_storeu_ps(dfPtr6+819456+1638400*i26+24576*j21+24576*s20+768*k80, out725);
_mm512_storeu_ps(dfPtr6+819584+1638400*i26+24576*j21+24576*s20+768*k80, out733);
_mm512_storeu_ps(dfPtr6+819520+1638400*i26+24576*j21+24576*s20+768*k80, out729);
_mm512_storeu_ps(dfPtr6+819648+1638400*i26+24576*j21+24576*s20+768*k80, out737);
_mm512_storeu_ps(dfPtr6+1229056+1638400*i26+24576*j21+24576*s20+768*k80, out726);
_mm512_storeu_ps(dfPtr6+1229184+1638400*i26+24576*j21+24576*s20+768*k80, out734);
_mm512_storeu_ps(dfPtr6+1229120+1638400*i26+24576*j21+24576*s20+768*k80, out730);
_mm512_storeu_ps(dfPtr6+1229248+1638400*i26+24576*j21+24576*s20+768*k80, out738);
// Load phase for the next tile: seven pairs of masked loads, each pair giving
// one register for the first chain (in795..in801) and one for the second chain
// (in802..in808). Successive pairs step the datPtr12 offset by 224
// (12880, 13104, ..., 14224 and 12928, 13152, ..., 14272); the two loads of a
// pair are 48 apart.
// NOTE(review): datPtr12 is declared outside this view; the 224*h29 and 4*w36
// terms suggest byte offsets into rows of floats -- confirm.
// Mask 16383 (0x3FFF) loads lanes 0..13 and zeroes lanes 14..15.
__m512 dat1323 = _mm512_maskz_loadu_ps(16383, datPtr12+12880+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1324 = _mm512_maskz_loadu_ps(16383, datPtr12+12928+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
// pm114: lanes 0..7 <- elements 0..7, lanes 8..15 <- elements 6..13, i.e. two
// 8-wide windows that overlap by two elements. Both chains use this same
// permutation here.
__m512i pm114 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in795 = _mm512_permutexvar_ps(pm114, dat1323);
__m512 in802 = _mm512_permutexvar_ps(pm114, dat1324);
__m512 dat1325 = _mm512_maskz_loadu_ps(16383, datPtr12+13104+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1326 = _mm512_maskz_loadu_ps(16383, datPtr12+13152+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in796 = _mm512_permutexvar_ps(pm114, dat1325);
__m512 in803 = _mm512_permutexvar_ps(pm114, dat1326);
__m512 dat1327 = _mm512_maskz_loadu_ps(16383, datPtr12+13328+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1328 = _mm512_maskz_loadu_ps(16383, datPtr12+13376+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in797 = _mm512_permutexvar_ps(pm114, dat1327);
__m512 in804 = _mm512_permutexvar_ps(pm114, dat1328);
__m512 dat1329 = _mm512_maskz_loadu_ps(16383, datPtr12+13552+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1330 = _mm512_maskz_loadu_ps(16383, datPtr12+13600+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in798 = _mm512_permutexvar_ps(pm114, dat1329);
__m512 in805 = _mm512_permutexvar_ps(pm114, dat1330);
__m512 dat1331 = _mm512_maskz_loadu_ps(16383, datPtr12+13776+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1332 = _mm512_maskz_loadu_ps(16383, datPtr12+13824+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in799 = _mm512_permutexvar_ps(pm114, dat1331);
__m512 in806 = _mm512_permutexvar_ps(pm114, dat1332);
__m512 dat1333 = _mm512_maskz_loadu_ps(16383, datPtr12+14000+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1334 = _mm512_maskz_loadu_ps(16383, datPtr12+14048+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in800 = _mm512_permutexvar_ps(pm114, dat1333);
__m512 in807 = _mm512_permutexvar_ps(pm114, dat1334);
__m512 dat1335 = _mm512_maskz_loadu_ps(16383, datPtr12+14224+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 dat1336 = _mm512_maskz_loadu_ps(16383, datPtr12+14272+806912*i26+224*h29+4*w36+806912*s20+25216*k80);
__m512 in801 = _mm512_permutexvar_ps(pm114, dat1335);
__m512 in808 = _mm512_permutexvar_ps(pm114, dat1336);
// First 8-point pass over the loaded registers. Only seven inputs exist per
// chain (x1..x7 = in795..in801 for the first chain, in802..in808 for the
// second); the x0 input is an implicit zero, which is why the "x0 - x6" term
// is formed from _mm512_setzero_ps().
// The two chains are interleaved line by line and registers are overwritten
// as soon as their old value is dead, so the statement order matters.
// _mm512_fmadd_ps(a, b, c) is a*b+c; _mm512_fnmadd_ps(a, b, c) is c-a*b.
// Results, in output order 0..7: tmp5726, tmp5725, tmp5727, in800, tmp5724,
// in796, in798, in797 (second chain: tmp5731, tmp5730, tmp5732, in807,
// tmp5729, in803, in805, in804).
// NOTE(review): the coefficients look like a Winograd-style 8-point input
// transform -- confirm against the generator.
__m512 tmp5723 = _mm512_add_ps(in795, in799);
__m512 tmp5728 = _mm512_add_ps(in802, in806);
__m512 tmp5724 = _mm512_sub_ps(in798, in796);
__m512 tmp5729 = _mm512_sub_ps(in805, in803);
__m512 tmp5725 = _mm512_add_ps(in796, in800);
__m512 tmp5730 = _mm512_add_ps(in803, in807);
// x0 - x6 with x0 == 0.
__m512 tmp5726 = _mm512_sub_ps(_mm512_setzero_ps(), in800);
__m512 tmp5731 = _mm512_sub_ps(_mm512_setzero_ps(), in807);
tmp5723 = _mm512_fmadd_ps(in797, _mm512_set1_ps(-4.25e+00f), tmp5723);
tmp5728 = _mm512_fmadd_ps(in804, _mm512_set1_ps(-4.25e+00f), tmp5728);
tmp5725 = _mm512_fmadd_ps(in798, _mm512_set1_ps(-4.25e+00f), tmp5725);
tmp5730 = _mm512_fmadd_ps(in805, _mm512_set1_ps(-4.25e+00f), tmp5730);
// Output 0: -x6 + 5.25*(x4 - x2).
tmp5726 = _mm512_fmadd_ps(tmp5724, _mm512_set1_ps(5.25e+00f), tmp5726);
tmp5731 = _mm512_fmadd_ps(tmp5729, _mm512_set1_ps(5.25e+00f), tmp5731);
tmp5724 = _mm512_fmadd_ps(in796, _mm512_set1_ps(2.5e-01f), in800);
tmp5729 = _mm512_fmadd_ps(in803, _mm512_set1_ps(2.5e-01f), in807);
// From here on in796/in803 no longer hold raw input: they become 4*x2 + x6.
in796 = _mm512_fmadd_ps(in796, _mm512_set1_ps(4e+00f), in800);
in803 = _mm512_fmadd_ps(in803, _mm512_set1_ps(4e+00f), in807);
// Outputs 2 and 1: difference and sum of the two partial sums built above.
__m512 tmp5727 = _mm512_sub_ps(tmp5725, tmp5723);
__m512 tmp5732 = _mm512_sub_ps(tmp5730, tmp5728);
tmp5725 = _mm512_add_ps(tmp5723, tmp5725);
tmp5730 = _mm512_add_ps(tmp5728, tmp5730);
tmp5723 = _mm512_fmadd_ps(in795, _mm512_set1_ps(2.5e-01f), in799);
tmp5728 = _mm512_fmadd_ps(in802, _mm512_set1_ps(2.5e-01f), in806);
tmp5724 = _mm512_fmadd_ps(in798, _mm512_set1_ps(-1.25e+00f), tmp5724);
tmp5729 = _mm512_fmadd_ps(in805, _mm512_set1_ps(-1.25e+00f), tmp5729);
in798 = _mm512_fmadd_ps(in798, _mm512_set1_ps(-5e+00f), in796);
in805 = _mm512_fmadd_ps(in805, _mm512_set1_ps(-5e+00f), in803);
tmp5723 = _mm512_fmadd_ps(in797, _mm512_set1_ps(-1.25e+00f), tmp5723);
tmp5728 = _mm512_fmadd_ps(in804, _mm512_set1_ps(-1.25e+00f), tmp5728);
// Outputs 3 (in800/in807) and 4 (tmp5724/tmp5729).
in800 = _mm512_fmadd_ps(tmp5723, _mm512_set1_ps(2e+00f), tmp5724);
in807 = _mm512_fmadd_ps(tmp5728, _mm512_set1_ps(2e+00f), tmp5729);
tmp5724 = _mm512_fnmadd_ps(tmp5723, _mm512_set1_ps(2e+00f), tmp5724);
tmp5729 = _mm512_fnmadd_ps(tmp5728, _mm512_set1_ps(2e+00f), tmp5729);
tmp5723 = _mm512_fmadd_ps(in799, _mm512_set1_ps(2.5e-01f), in795);
tmp5728 = _mm512_fmadd_ps(in806, _mm512_set1_ps(2.5e-01f), in802);
in795 = _mm512_sub_ps(in801, in795);
in802 = _mm512_sub_ps(in808, in802);
tmp5723 = _mm512_fmadd_ps(in797, _mm512_set1_ps(-1.25e+00f), tmp5723);
tmp5728 = _mm512_fmadd_ps(in804, _mm512_set1_ps(-1.25e+00f), tmp5728);
// Output 7: x7 - x1 + 5.25*(x3 - x5), left in in797/in804.
in797 = _mm512_sub_ps(in797, in799);
in804 = _mm512_sub_ps(in804, in806);
in797 = _mm512_fmadd_ps(in797, _mm512_set1_ps(5.25e+00f), in795);
in804 = _mm512_fmadd_ps(in804, _mm512_set1_ps(5.25e+00f), in802);
// Outputs 5 (in796/in803) and 6 (in798/in805).
in796 = _mm512_fmadd_ps(tmp5723, _mm512_set1_ps(2e+00f), in798);
in803 = _mm512_fmadd_ps(tmp5728, _mm512_set1_ps(2e+00f), in805);
in798 = _mm512_fnmadd_ps(tmp5723, _mm512_set1_ps(2e+00f), in798);
in805 = _mm512_fnmadd_ps(tmp5728, _mm512_set1_ps(2e+00f), in805);
// Transpose phase over the sixteen first-pass results (eight per chain).
// Stage 1 (unpacklo/unpackhi) and stage 2 (shuffle_ps with 68 = 0x44 and
// 238 = 0xEE) transpose 4x4 groups inside every 128-bit lane. Stages 3 and 4
// (shuffle_f32x4) regroup whole 128-bit lanes: immediate 136 (0x88) picks
// lanes 0 and 2 of each operand, 221 (0xDD) picks lanes 1 and 3.
// Stage 1: interleave pairs of first-pass outputs.
__m512 tmp5741 = _mm512_unpacklo_ps(tmp5726, tmp5725);
__m512 tmp5742 = _mm512_unpackhi_ps(tmp5726, tmp5725);
__m512 tmp5743 = _mm512_unpacklo_ps(tmp5727, in800);
__m512 tmp5744 = _mm512_unpackhi_ps(tmp5727, in800);
__m512 tmp5745 = _mm512_unpacklo_ps(tmp5724, in796);
__m512 tmp5746 = _mm512_unpackhi_ps(tmp5724, in796);
__m512 tmp5747 = _mm512_unpacklo_ps(in798, in797);
__m512 tmp5748 = _mm512_unpackhi_ps(in798, in797);
__m512 tmp5749 = _mm512_unpacklo_ps(tmp5731, tmp5730);
__m512 tmp5750 = _mm512_unpackhi_ps(tmp5731, tmp5730);
__m512 tmp5751 = _mm512_unpacklo_ps(tmp5732, in807);
__m512 tmp5752 = _mm512_unpackhi_ps(tmp5732, in807);
__m512 tmp5753 = _mm512_unpacklo_ps(tmp5729, in803);
__m512 tmp5754 = _mm512_unpackhi_ps(tmp5729, in803);
__m512 tmp5755 = _mm512_unpacklo_ps(in805, in804);
__m512 tmp5756 = _mm512_unpackhi_ps(in805, in804);
// Stage 2: combine 64-bit halves to finish the per-lane 4x4 transposes.
__m512 tmp5757 = _mm512_shuffle_ps(tmp5741, tmp5743, 68);
__m512 tmp5758 = _mm512_shuffle_ps(tmp5741, tmp5743, 238);
__m512 tmp5759 = _mm512_shuffle_ps(tmp5742, tmp5744, 68);
__m512 tmp5760 = _mm512_shuffle_ps(tmp5742, tmp5744, 238);
__m512 tmp5761 = _mm512_shuffle_ps(tmp5745, tmp5747, 68);
__m512 tmp5762 = _mm512_shuffle_ps(tmp5745, tmp5747, 238);
__m512 tmp5763 = _mm512_shuffle_ps(tmp5746, tmp5748, 68);
__m512 tmp5764 = _mm512_shuffle_ps(tmp5746, tmp5748, 238);
__m512 tmp5765 = _mm512_shuffle_ps(tmp5749, tmp5751, 68);
__m512 tmp5766 = _mm512_shuffle_ps(tmp5749, tmp5751, 238);
__m512 tmp5767 = _mm512_shuffle_ps(tmp5750, tmp5752, 68);
__m512 tmp5768 = _mm512_shuffle_ps(tmp5750, tmp5752, 238);
__m512 tmp5769 = _mm512_shuffle_ps(tmp5753, tmp5755, 68);
__m512 tmp5770 = _mm512_shuffle_ps(tmp5753, tmp5755, 238);
__m512 tmp5771 = _mm512_shuffle_ps(tmp5754, tmp5756, 68);
__m512 tmp5772 = _mm512_shuffle_ps(tmp5754, tmp5756, 238);
// Stage 3: merge outputs 0..3 with outputs 4..7 within each chain.
__m512 tmp5773 = _mm512_shuffle_f32x4(tmp5757, tmp5761, 136);
__m512 tmp5774 = _mm512_shuffle_f32x4(tmp5757, tmp5761, 221);
__m512 tmp5775 = _mm512_shuffle_f32x4(tmp5758, tmp5762, 136);
__m512 tmp5776 = _mm512_shuffle_f32x4(tmp5758, tmp5762, 221);
__m512 tmp5777 = _mm512_shuffle_f32x4(tmp5759, tmp5763, 136);
__m512 tmp5778 = _mm512_shuffle_f32x4(tmp5759, tmp5763, 221);
__m512 tmp5779 = _mm512_shuffle_f32x4(tmp5760, tmp5764, 136);
__m512 tmp5780 = _mm512_shuffle_f32x4(tmp5760, tmp5764, 221);
__m512 tmp5781 = _mm512_shuffle_f32x4(tmp5765, tmp5769, 136);
__m512 tmp5782 = _mm512_shuffle_f32x4(tmp5765, tmp5769, 221);
__m512 tmp5783 = _mm512_shuffle_f32x4(tmp5766, tmp5770, 136);
__m512 tmp5784 = _mm512_shuffle_f32x4(tmp5766, tmp5770, 221);
__m512 tmp5785 = _mm512_shuffle_f32x4(tmp5767, tmp5771, 136);
__m512 tmp5786 = _mm512_shuffle_f32x4(tmp5767, tmp5771, 221);
__m512 tmp5787 = _mm512_shuffle_f32x4(tmp5768, tmp5772, 136);
__m512 tmp5788 = _mm512_shuffle_f32x4(tmp5768, tmp5772, 221);
// Stage 4: merge the two chains and write back into the first-pass register
// names. Each result holds one column position, first chain in lanes 0..7 and
// second chain in lanes 8..15; the "136" results carry the first 8-wide
// window and the "221" results the second.
tmp5726 = _mm512_shuffle_f32x4(tmp5773, tmp5781, 136);
tmp5731 = _mm512_shuffle_f32x4(tmp5773, tmp5781, 221);
tmp5725 = _mm512_shuffle_f32x4(tmp5775, tmp5783, 136);
tmp5730 = _mm512_shuffle_f32x4(tmp5775, tmp5783, 221);
tmp5727 = _mm512_shuffle_f32x4(tmp5777, tmp5785, 136);
tmp5732 = _mm512_shuffle_f32x4(tmp5777, tmp5785, 221);
in800 = _mm512_shuffle_f32x4(tmp5779, tmp5787, 136);
in807 = _mm512_shuffle_f32x4(tmp5779, tmp5787, 221);
tmp5724 = _mm512_shuffle_f32x4(tmp5774, tmp5782, 136);
tmp5729 = _mm512_shuffle_f32x4(tmp5774, tmp5782, 221);
in796 = _mm512_shuffle_f32x4(tmp5776, tmp5784, 136);
in803 = _mm512_shuffle_f32x4(tmp5776, tmp5784, 221);
in798 = _mm512_shuffle_f32x4(tmp5778, tmp5786, 136);
in805 = _mm512_shuffle_f32x4(tmp5778, tmp5786, 221);
in797 = _mm512_shuffle_f32x4(tmp5780, tmp5788, 136);
in804 = _mm512_shuffle_f32x4(tmp5780, tmp5788, 221);
// Second 8-point pass, applied to the transposed registers. Unlike the first
// pass all eight inputs are present: x0..x7 = tmp5726, tmp5725, tmp5727,
// in800, tmp5724, in796, in798, in797 (second chain: tmp5731, tmp5730,
// tmp5732, in807, tmp5729, in803, in805, in804).
// The two chains are interleaved line by line and registers are reused in
// place, so the statement order matters.
// Results, in output order 0..7: tmp5726, tmp5735, tmp5736, in798, tmp5734,
// tmp5727, tmp5724, in800 (second chain: tmp5731, tmp5739, tmp5740, in805,
// tmp5738, tmp5732, tmp5729, in807).
__m512 tmp5733 = _mm512_add_ps(tmp5725, in796);
__m512 tmp5737 = _mm512_add_ps(tmp5730, in803);
__m512 tmp5734 = _mm512_sub_ps(tmp5724, tmp5727);
__m512 tmp5738 = _mm512_sub_ps(tmp5729, tmp5732);
__m512 tmp5735 = _mm512_add_ps(tmp5727, in798);
__m512 tmp5739 = _mm512_add_ps(tmp5732, in805);
tmp5726 = _mm512_sub_ps(tmp5726, in798);
tmp5731 = _mm512_sub_ps(tmp5731, in805);
tmp5733 = _mm512_fmadd_ps(in800, _mm512_set1_ps(-4.25e+00f), tmp5733);
tmp5737 = _mm512_fmadd_ps(in807, _mm512_set1_ps(-4.25e+00f), tmp5737);
tmp5735 = _mm512_fmadd_ps(tmp5724, _mm512_set1_ps(-4.25e+00f), tmp5735);
tmp5739 = _mm512_fmadd_ps(tmp5729, _mm512_set1_ps(-4.25e+00f), tmp5739);
// Output 0: x0 - x6 + 5.25*(x4 - x2).
tmp5726 = _mm512_fmadd_ps(tmp5734, _mm512_set1_ps(5.25e+00f), tmp5726);
tmp5731 = _mm512_fmadd_ps(tmp5738, _mm512_set1_ps(5.25e+00f), tmp5731);
tmp5734 = _mm512_fmadd_ps(tmp5727, _mm512_set1_ps(2.5e-01f), in798);
tmp5738 = _mm512_fmadd_ps(tmp5732, _mm512_set1_ps(2.5e-01f), in805);
tmp5727 = _mm512_fmadd_ps(tmp5727, _mm512_set1_ps(4e+00f), in798);
tmp5732 = _mm512_fmadd_ps(tmp5732, _mm512_set1_ps(4e+00f), in805);
// Outputs 2 and 1: difference and sum of the two partial sums built above.
__m512 tmp5736 = _mm512_sub_ps(tmp5735, tmp5733);
__m512 tmp5740 = _mm512_sub_ps(tmp5739, tmp5737);
tmp5735 = _mm512_add_ps(tmp5733, tmp5735);
tmp5739 = _mm512_add_ps(tmp5737, tmp5739);
tmp5733 = _mm512_fmadd_ps(tmp5725, _mm512_set1_ps(2.5e-01f), in796);
tmp5737 = _mm512_fmadd_ps(tmp5730, _mm512_set1_ps(2.5e-01f), in803);
tmp5734 = _mm512_fmadd_ps(tmp5724, _mm512_set1_ps(-1.25e+00f), tmp5734);
tmp5738 = _mm512_fmadd_ps(tmp5729, _mm512_set1_ps(-1.25e+00f), tmp5738);
tmp5724 = _mm512_fmadd_ps(tmp5724, _mm512_set1_ps(-5e+00f), tmp5727);
tmp5729 = _mm512_fmadd_ps(tmp5729, _mm512_set1_ps(-5e+00f), tmp5732);
tmp5733 = _mm512_fmadd_ps(in800, _mm512_set1_ps(-1.25e+00f), tmp5733);
tmp5737 = _mm512_fmadd_ps(in807, _mm512_set1_ps(-1.25e+00f), tmp5737);
// Outputs 3 (in798/in805) and 4 (tmp5734/tmp5738).
in798 = _mm512_fmadd_ps(tmp5733, _mm512_set1_ps(2e+00f), tmp5734);
in805 = _mm512_fmadd_ps(tmp5737, _mm512_set1_ps(2e+00f), tmp5738);
tmp5734 = _mm512_fnmadd_ps(tmp5733, _mm512_set1_ps(2e+00f), tmp5734);
tmp5738 = _mm512_fnmadd_ps(tmp5737, _mm512_set1_ps(2e+00f), tmp5738);
tmp5733 = _mm512_fmadd_ps(in796, _mm512_set1_ps(2.5e-01f), tmp5725);
tmp5737 = _mm512_fmadd_ps(in803, _mm512_set1_ps(2.5e-01f), tmp5730);
tmp5725 = _mm512_sub_ps(in797, tmp5725);
tmp5730 = _mm512_sub_ps(in804, tmp5730);
tmp5733 = _mm512_fmadd_ps(in800, _mm512_set1_ps(-1.25e+00f), tmp5733);
tmp5737 = _mm512_fmadd_ps(in807, _mm512_set1_ps(-1.25e+00f), tmp5737);
// Output 7: x7 - x1 + 5.25*(x3 - x5), left in in800/in807.
in800 = _mm512_sub_ps(in800, in796);
in807 = _mm512_sub_ps(in807, in803);
in800 = _mm512_fmadd_ps(in800, _mm512_set1_ps(5.25e+00f), tmp5725);
in807 = _mm512_fmadd_ps(in807, _mm512_set1_ps(5.25e+00f), tmp5730);
// Outputs 5 (tmp5727/tmp5732) and 6 (tmp5724/tmp5729).
tmp5727 = _mm512_fmadd_ps(tmp5733, _mm512_set1_ps(2e+00f), tmp5724);
tmp5732 = _mm512_fmadd_ps(tmp5737, _mm512_set1_ps(2e+00f), tmp5729);
tmp5724 = _mm512_fnmadd_ps(tmp5733, _mm512_set1_ps(2e+00f), tmp5724);
tmp5729 = _mm512_fnmadd_ps(tmp5737, _mm512_set1_ps(2e+00f), tmp5729);
// Pack phase: pair up consecutive outputs of the second pass. With
// _mm512_shuffle_f32x4, immediate 68 (0x44) concatenates the low 256 bits of
// both operands and 238 (0xEE) concatenates their high 256 bits.
__m512 out739 = _mm512_shuffle_f32x4(tmp5726, tmp5735, 68);
__m512 out747 = _mm512_shuffle_f32x4(tmp5726, tmp5735, 238);
__m512 out740 = _mm512_shuffle_f32x4(tmp5736, in798, 68);
__m512 out748 = _mm512_shuffle_f32x4(tmp5736, in798, 238);
__m512 out741 = _mm512_shuffle_f32x4(tmp5734, tmp5727, 68);
__m512 out749 = _mm512_shuffle_f32x4(tmp5734, tmp5727, 238);
__m512 out742 = _mm512_shuffle_f32x4(tmp5724, in800, 68);
__m512 out750 = _mm512_shuffle_f32x4(tmp5724, in800, 238);
__m512 out743 = _mm512_shuffle_f32x4(tmp5731, tmp5739, 68);
__m512 out751 = _mm512_shuffle_f32x4(tmp5731, tmp5739, 238);
__m512 out744 = _mm512_shuffle_f32x4(tmp5740, in805, 68);
__m512 out752 = _mm512_shuffle_f32x4(tmp5740, in805, 238);
__m512 out745 = _mm512_shuffle_f32x4(tmp5738, tmp5732, 68);
__m512 out753 = _mm512_shuffle_f32x4(tmp5738, tmp5732, 238);
__m512 out746 = _mm512_shuffle_f32x4(tmp5729, in807, 68);
__m512 out754 = _mm512_shuffle_f32x4(tmp5729, in807, 238);
// Store phase: sixteen unaligned stores into dfPtr6, in four groups whose base
// offsets are 512, 410112, 819712 and 1229312 (409600 apart); inside a group
// the vectors land at +0, +128, +64 and +192 (in statement order).
// NOTE(review): dfPtr6 is declared outside this view; the offsets are
// presumably in bytes -- confirm.
_mm512_storeu_ps(dfPtr6+512+1638400*i26+24576*j21+24576*s20+768*k80, out739);
_mm512_storeu_ps(dfPtr6+640+1638400*i26+24576*j21+24576*s20+768*k80, out747);
_mm512_storeu_ps(dfPtr6+576+1638400*i26+24576*j21+24576*s20+768*k80, out743);
_mm512_storeu_ps(dfPtr6+704+1638400*i26+24576*j21+24576*s20+768*k80, out751);
_mm512_storeu_ps(dfPtr6+410112+1638400*i26+24576*j21+24576*s20+768*k80, out740);
_mm512_storeu_ps(dfPtr6+410240+1638400*i26+24576*j21+24576*s20+768*k80, out748);
_mm512_storeu_ps(dfPtr6+410176+1638400*i26+24576*j21+24576*s20+768*k80, out744);
_mm512_storeu_ps(dfPtr6+410304+1638400*i26+24576*j21+24576*s20+768*k80, out752);
_mm512_storeu_ps(dfPtr6+819712+1638400*i26+24576*j21+24576*s20+768*k80, out741);
_mm512_storeu_ps(dfPtr6+819840+1638400*i26+24576*j21+24576*s20+768*k80, out749);
_mm512_storeu_ps(dfPtr6+819776+1638400*i26+24576*j21+24576*s20+768*k80, out745);
_mm512_storeu_ps(dfPtr6+819904+1638400*i26+24576*j21+24576*s20+768*k80, out753);
_mm512_storeu_ps(dfPtr6+1229312+1638400*i26+24576*j21+24576*s20+768*k80, out742);
_mm512_storeu_ps(dfPtr6+1229440+1638400*i26+24576*j21+24576*s20+768*k80, out750);
_mm512_storeu_ps(dfPtr6+1229376+1638400*i26+24576*j21+24576*s20+768*k80, out746);
_mm512_storeu_ps(dfPtr6+1229504+1638400*i26+24576*j21+24576*s20+768*k80, out754);
}
// Return from the function once j21 has reached last5; otherwise step j21 and set rel13 to 1.
if (j21 >= last5) return;
++j21;
rel13 = 1;
}
// Coordinates shared by every load address in the loop below:
// h30 is scaled by 224 and w37 (fixed at 36 here) by 4 in each datPtr12 offset.
ptrdiff_t h30 = base13+0;
ptrdiff_t w37 = 36;
// k81 takes the values 0..31; it scales the load offsets by 25216 and the store offsets by 768.
ptrdiff_t k81 = 0;
for (; k81 != 32; ++k81) {
// First pass of this k81 iteration: builds two series of seven vectors (in809..in815 and in816..in822),
// applies an 8-term linear combination, rearranges the results, applies the combination again,
// and stores 16 vectors to dfPtr6.
// NOTE(review): the coefficients (5.25, -4.25, 0.25, -1.25, 4, -5, 2) look like a Winograd-style
// 8-point input transform — confirm against the generator.
//
// Loads: seven positions at offsets 224, 448, ..., 1568 (step 224). Each position is read twice:
// 14 floats (mask 16383 = 0x3FFF) for the first series and, at offset +48, 9 floats (mask 511 = 0x1FF)
// for the second series. Lanes outside the mask are zero (maskz).
__m512 dat1337 = _mm512_maskz_loadu_ps(16383, datPtr12+224+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1338 = _mm512_maskz_loadu_ps(511, datPtr12+272+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
// pm115: lanes 0-7 <- elements 0-7, lanes 8-15 <- elements 6-13 (two 8-wide windows sharing two elements).
__m512i pm115 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in809 = _mm512_permutexvar_ps(pm115, dat1337);
// pm116: lanes 0-7 <- elements 0-7, lanes 8-10 <- elements 6-8, lanes 11-15 <- element 15,
// which mask 511 has already zeroed, so those lanes become 0.
__m512i pm116 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in816 = _mm512_permutexvar_ps(pm116, dat1338);
__m512 dat1339 = _mm512_maskz_loadu_ps(16383, datPtr12+448+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1340 = _mm512_maskz_loadu_ps(511, datPtr12+496+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in810 = _mm512_permutexvar_ps(pm115, dat1339);
__m512 in817 = _mm512_permutexvar_ps(pm116, dat1340);
__m512 dat1341 = _mm512_maskz_loadu_ps(16383, datPtr12+672+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1342 = _mm512_maskz_loadu_ps(511, datPtr12+720+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in811 = _mm512_permutexvar_ps(pm115, dat1341);
__m512 in818 = _mm512_permutexvar_ps(pm116, dat1342);
__m512 dat1343 = _mm512_maskz_loadu_ps(16383, datPtr12+896+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1344 = _mm512_maskz_loadu_ps(511, datPtr12+944+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in812 = _mm512_permutexvar_ps(pm115, dat1343);
__m512 in819 = _mm512_permutexvar_ps(pm116, dat1344);
__m512 dat1345 = _mm512_maskz_loadu_ps(16383, datPtr12+1120+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1346 = _mm512_maskz_loadu_ps(511, datPtr12+1168+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in813 = _mm512_permutexvar_ps(pm115, dat1345);
__m512 in820 = _mm512_permutexvar_ps(pm116, dat1346);
__m512 dat1347 = _mm512_maskz_loadu_ps(16383, datPtr12+1344+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1348 = _mm512_maskz_loadu_ps(511, datPtr12+1392+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in814 = _mm512_permutexvar_ps(pm115, dat1347);
__m512 in821 = _mm512_permutexvar_ps(pm116, dat1348);
__m512 dat1349 = _mm512_maskz_loadu_ps(16383, datPtr12+1568+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1350 = _mm512_maskz_loadu_ps(511, datPtr12+1616+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in815 = _mm512_permutexvar_ps(pm115, dat1349);
__m512 in822 = _mm512_permutexvar_ps(pm116, dat1350);
// Round 1 of the linear combination, elementwise across the seven vectors of each series
// (statements alternate: first series, then the same step for the second series).
// Both series have only seven inputs, so the leading term starts from zero (0 - in814, 0 - in821);
// round 2 below starts the same term from a real vector instead.
// Several inputs are overwritten in place, so statement order matters.
__m512 tmp5789 = _mm512_add_ps(in809, in813);
__m512 tmp5794 = _mm512_add_ps(in816, in820);
__m512 tmp5790 = _mm512_sub_ps(in812, in810);
__m512 tmp5795 = _mm512_sub_ps(in819, in817);
__m512 tmp5791 = _mm512_add_ps(in810, in814);
__m512 tmp5796 = _mm512_add_ps(in817, in821);
__m512 tmp5792 = _mm512_sub_ps(_mm512_setzero_ps(), in814);
__m512 tmp5797 = _mm512_sub_ps(_mm512_setzero_ps(), in821);
tmp5789 = _mm512_fmadd_ps(in811, _mm512_set1_ps(-4.25e+00f), tmp5789);
tmp5794 = _mm512_fmadd_ps(in818, _mm512_set1_ps(-4.25e+00f), tmp5794);
tmp5791 = _mm512_fmadd_ps(in812, _mm512_set1_ps(-4.25e+00f), tmp5791);
tmp5796 = _mm512_fmadd_ps(in819, _mm512_set1_ps(-4.25e+00f), tmp5796);
tmp5792 = _mm512_fmadd_ps(tmp5790, _mm512_set1_ps(5.25e+00f), tmp5792);
tmp5797 = _mm512_fmadd_ps(tmp5795, _mm512_set1_ps(5.25e+00f), tmp5797);
tmp5790 = _mm512_fmadd_ps(in810, _mm512_set1_ps(2.5e-01f), in814);
tmp5795 = _mm512_fmadd_ps(in817, _mm512_set1_ps(2.5e-01f), in821);
in810 = _mm512_fmadd_ps(in810, _mm512_set1_ps(4e+00f), in814);
in817 = _mm512_fmadd_ps(in817, _mm512_set1_ps(4e+00f), in821);
__m512 tmp5793 = _mm512_sub_ps(tmp5791, tmp5789);
__m512 tmp5798 = _mm512_sub_ps(tmp5796, tmp5794);
tmp5791 = _mm512_add_ps(tmp5789, tmp5791);
tmp5796 = _mm512_add_ps(tmp5794, tmp5796);
tmp5789 = _mm512_fmadd_ps(in809, _mm512_set1_ps(2.5e-01f), in813);
tmp5794 = _mm512_fmadd_ps(in816, _mm512_set1_ps(2.5e-01f), in820);
tmp5790 = _mm512_fmadd_ps(in812, _mm512_set1_ps(-1.25e+00f), tmp5790);
tmp5795 = _mm512_fmadd_ps(in819, _mm512_set1_ps(-1.25e+00f), tmp5795);
in812 = _mm512_fmadd_ps(in812, _mm512_set1_ps(-5e+00f), in810);
in819 = _mm512_fmadd_ps(in819, _mm512_set1_ps(-5e+00f), in817);
tmp5789 = _mm512_fmadd_ps(in811, _mm512_set1_ps(-1.25e+00f), tmp5789);
tmp5794 = _mm512_fmadd_ps(in818, _mm512_set1_ps(-1.25e+00f), tmp5794);
in814 = _mm512_fmadd_ps(tmp5789, _mm512_set1_ps(2e+00f), tmp5790);
in821 = _mm512_fmadd_ps(tmp5794, _mm512_set1_ps(2e+00f), tmp5795);
tmp5790 = _mm512_fnmadd_ps(tmp5789, _mm512_set1_ps(2e+00f), tmp5790);
tmp5795 = _mm512_fnmadd_ps(tmp5794, _mm512_set1_ps(2e+00f), tmp5795);
tmp5789 = _mm512_fmadd_ps(in813, _mm512_set1_ps(2.5e-01f), in809);
tmp5794 = _mm512_fmadd_ps(in820, _mm512_set1_ps(2.5e-01f), in816);
in809 = _mm512_sub_ps(in815, in809);
in816 = _mm512_sub_ps(in822, in816);
tmp5789 = _mm512_fmadd_ps(in811, _mm512_set1_ps(-1.25e+00f), tmp5789);
tmp5794 = _mm512_fmadd_ps(in818, _mm512_set1_ps(-1.25e+00f), tmp5794);
in811 = _mm512_sub_ps(in811, in813);
in818 = _mm512_sub_ps(in818, in820);
in811 = _mm512_fmadd_ps(in811, _mm512_set1_ps(5.25e+00f), in809);
in818 = _mm512_fmadd_ps(in818, _mm512_set1_ps(5.25e+00f), in816);
in810 = _mm512_fmadd_ps(tmp5789, _mm512_set1_ps(2e+00f), in812);
in817 = _mm512_fmadd_ps(tmp5794, _mm512_set1_ps(2e+00f), in819);
in812 = _mm512_fnmadd_ps(tmp5789, _mm512_set1_ps(2e+00f), in812);
in819 = _mm512_fnmadd_ps(tmp5794, _mm512_set1_ps(2e+00f), in819);
// Rearrangement, stage 1: unpacklo/unpackhi interleave pairs of round-1 results within each 128-bit lane.
// The eight round-1 results per series are, in order:
// tmp5792, tmp5791, tmp5793, in814, tmp5790, in810, in812, in811 (first series) and
// tmp5797, tmp5796, tmp5798, in821, tmp5795, in817, in819, in818 (second series).
__m512 tmp5807 = _mm512_unpacklo_ps(tmp5792, tmp5791);
__m512 tmp5808 = _mm512_unpackhi_ps(tmp5792, tmp5791);
__m512 tmp5809 = _mm512_unpacklo_ps(tmp5793, in814);
__m512 tmp5810 = _mm512_unpackhi_ps(tmp5793, in814);
__m512 tmp5811 = _mm512_unpacklo_ps(tmp5790, in810);
__m512 tmp5812 = _mm512_unpackhi_ps(tmp5790, in810);
__m512 tmp5813 = _mm512_unpacklo_ps(in812, in811);
__m512 tmp5814 = _mm512_unpackhi_ps(in812, in811);
__m512 tmp5815 = _mm512_unpacklo_ps(tmp5797, tmp5796);
__m512 tmp5816 = _mm512_unpackhi_ps(tmp5797, tmp5796);
__m512 tmp5817 = _mm512_unpacklo_ps(tmp5798, in821);
__m512 tmp5818 = _mm512_unpackhi_ps(tmp5798, in821);
__m512 tmp5819 = _mm512_unpacklo_ps(tmp5795, in817);
__m512 tmp5820 = _mm512_unpackhi_ps(tmp5795, in817);
__m512 tmp5821 = _mm512_unpacklo_ps(in819, in818);
__m512 tmp5822 = _mm512_unpackhi_ps(in819, in818);
// Stage 2: shuffle_ps with 68 / 238 takes the low / high 64 bits of each operand per 128-bit lane,
// which completes a 4x4 transpose inside every 128-bit lane.
__m512 tmp5823 = _mm512_shuffle_ps(tmp5807, tmp5809, 68);
__m512 tmp5824 = _mm512_shuffle_ps(tmp5807, tmp5809, 238);
__m512 tmp5825 = _mm512_shuffle_ps(tmp5808, tmp5810, 68);
__m512 tmp5826 = _mm512_shuffle_ps(tmp5808, tmp5810, 238);
__m512 tmp5827 = _mm512_shuffle_ps(tmp5811, tmp5813, 68);
__m512 tmp5828 = _mm512_shuffle_ps(tmp5811, tmp5813, 238);
__m512 tmp5829 = _mm512_shuffle_ps(tmp5812, tmp5814, 68);
__m512 tmp5830 = _mm512_shuffle_ps(tmp5812, tmp5814, 238);
__m512 tmp5831 = _mm512_shuffle_ps(tmp5815, tmp5817, 68);
__m512 tmp5832 = _mm512_shuffle_ps(tmp5815, tmp5817, 238);
__m512 tmp5833 = _mm512_shuffle_ps(tmp5816, tmp5818, 68);
__m512 tmp5834 = _mm512_shuffle_ps(tmp5816, tmp5818, 238);
__m512 tmp5835 = _mm512_shuffle_ps(tmp5819, tmp5821, 68);
__m512 tmp5836 = _mm512_shuffle_ps(tmp5819, tmp5821, 238);
__m512 tmp5837 = _mm512_shuffle_ps(tmp5820, tmp5822, 68);
__m512 tmp5838 = _mm512_shuffle_ps(tmp5820, tmp5822, 238);
// Stage 3: shuffle_f32x4 with 136 picks 128-bit lanes 0 and 2 of each operand, 221 picks lanes 1 and 3.
__m512 tmp5839 = _mm512_shuffle_f32x4(tmp5823, tmp5827, 136);
__m512 tmp5840 = _mm512_shuffle_f32x4(tmp5823, tmp5827, 221);
__m512 tmp5841 = _mm512_shuffle_f32x4(tmp5824, tmp5828, 136);
__m512 tmp5842 = _mm512_shuffle_f32x4(tmp5824, tmp5828, 221);
__m512 tmp5843 = _mm512_shuffle_f32x4(tmp5825, tmp5829, 136);
__m512 tmp5844 = _mm512_shuffle_f32x4(tmp5825, tmp5829, 221);
__m512 tmp5845 = _mm512_shuffle_f32x4(tmp5826, tmp5830, 136);
__m512 tmp5846 = _mm512_shuffle_f32x4(tmp5826, tmp5830, 221);
__m512 tmp5847 = _mm512_shuffle_f32x4(tmp5831, tmp5835, 136);
__m512 tmp5848 = _mm512_shuffle_f32x4(tmp5831, tmp5835, 221);
__m512 tmp5849 = _mm512_shuffle_f32x4(tmp5832, tmp5836, 136);
__m512 tmp5850 = _mm512_shuffle_f32x4(tmp5832, tmp5836, 221);
__m512 tmp5851 = _mm512_shuffle_f32x4(tmp5833, tmp5837, 136);
__m512 tmp5852 = _mm512_shuffle_f32x4(tmp5833, tmp5837, 221);
__m512 tmp5853 = _mm512_shuffle_f32x4(tmp5834, tmp5838, 136);
__m512 tmp5854 = _mm512_shuffle_f32x4(tmp5834, tmp5838, 221);
// Stage 4: combine the two series. Each destination now holds one element position of all eight
// round-1 results: lanes 0-7 come from the first series, lanes 8-15 from the second series.
// The imm-136 destinations take element positions 0-7 (first window), the imm-221 ones positions 8-15.
tmp5792 = _mm512_shuffle_f32x4(tmp5839, tmp5847, 136);
tmp5797 = _mm512_shuffle_f32x4(tmp5839, tmp5847, 221);
tmp5791 = _mm512_shuffle_f32x4(tmp5841, tmp5849, 136);
tmp5796 = _mm512_shuffle_f32x4(tmp5841, tmp5849, 221);
tmp5793 = _mm512_shuffle_f32x4(tmp5843, tmp5851, 136);
tmp5798 = _mm512_shuffle_f32x4(tmp5843, tmp5851, 221);
in814 = _mm512_shuffle_f32x4(tmp5845, tmp5853, 136);
in821 = _mm512_shuffle_f32x4(tmp5845, tmp5853, 221);
tmp5790 = _mm512_shuffle_f32x4(tmp5840, tmp5848, 136);
tmp5795 = _mm512_shuffle_f32x4(tmp5840, tmp5848, 221);
in810 = _mm512_shuffle_f32x4(tmp5842, tmp5850, 136);
in817 = _mm512_shuffle_f32x4(tmp5842, tmp5850, 221);
in812 = _mm512_shuffle_f32x4(tmp5844, tmp5852, 136);
in819 = _mm512_shuffle_f32x4(tmp5844, tmp5852, 221);
in811 = _mm512_shuffle_f32x4(tmp5846, tmp5854, 136);
in818 = _mm512_shuffle_f32x4(tmp5846, tmp5854, 221);
// Round 2: the same linear combination, now across the eight rearranged vectors of each group.
// All eight inputs exist here, so the leading term starts from a real vector (tmp5792 - in812).
__m512 tmp5799 = _mm512_add_ps(tmp5791, in810);
__m512 tmp5803 = _mm512_add_ps(tmp5796, in817);
__m512 tmp5800 = _mm512_sub_ps(tmp5790, tmp5793);
__m512 tmp5804 = _mm512_sub_ps(tmp5795, tmp5798);
__m512 tmp5801 = _mm512_add_ps(tmp5793, in812);
__m512 tmp5805 = _mm512_add_ps(tmp5798, in819);
tmp5792 = _mm512_sub_ps(tmp5792, in812);
tmp5797 = _mm512_sub_ps(tmp5797, in819);
tmp5799 = _mm512_fmadd_ps(in814, _mm512_set1_ps(-4.25e+00f), tmp5799);
tmp5803 = _mm512_fmadd_ps(in821, _mm512_set1_ps(-4.25e+00f), tmp5803);
tmp5801 = _mm512_fmadd_ps(tmp5790, _mm512_set1_ps(-4.25e+00f), tmp5801);
tmp5805 = _mm512_fmadd_ps(tmp5795, _mm512_set1_ps(-4.25e+00f), tmp5805);
tmp5792 = _mm512_fmadd_ps(tmp5800, _mm512_set1_ps(5.25e+00f), tmp5792);
tmp5797 = _mm512_fmadd_ps(tmp5804, _mm512_set1_ps(5.25e+00f), tmp5797);
tmp5800 = _mm512_fmadd_ps(tmp5793, _mm512_set1_ps(2.5e-01f), in812);
tmp5804 = _mm512_fmadd_ps(tmp5798, _mm512_set1_ps(2.5e-01f), in819);
tmp5793 = _mm512_fmadd_ps(tmp5793, _mm512_set1_ps(4e+00f), in812);
tmp5798 = _mm512_fmadd_ps(tmp5798, _mm512_set1_ps(4e+00f), in819);
__m512 tmp5802 = _mm512_sub_ps(tmp5801, tmp5799);
__m512 tmp5806 = _mm512_sub_ps(tmp5805, tmp5803);
tmp5801 = _mm512_add_ps(tmp5799, tmp5801);
tmp5805 = _mm512_add_ps(tmp5803, tmp5805);
tmp5799 = _mm512_fmadd_ps(tmp5791, _mm512_set1_ps(2.5e-01f), in810);
tmp5803 = _mm512_fmadd_ps(tmp5796, _mm512_set1_ps(2.5e-01f), in817);
tmp5800 = _mm512_fmadd_ps(tmp5790, _mm512_set1_ps(-1.25e+00f), tmp5800);
tmp5804 = _mm512_fmadd_ps(tmp5795, _mm512_set1_ps(-1.25e+00f), tmp5804);
tmp5790 = _mm512_fmadd_ps(tmp5790, _mm512_set1_ps(-5e+00f), tmp5793);
tmp5795 = _mm512_fmadd_ps(tmp5795, _mm512_set1_ps(-5e+00f), tmp5798);
tmp5799 = _mm512_fmadd_ps(in814, _mm512_set1_ps(-1.25e+00f), tmp5799);
tmp5803 = _mm512_fmadd_ps(in821, _mm512_set1_ps(-1.25e+00f), tmp5803);
in812 = _mm512_fmadd_ps(tmp5799, _mm512_set1_ps(2e+00f), tmp5800);
in819 = _mm512_fmadd_ps(tmp5803, _mm512_set1_ps(2e+00f), tmp5804);
tmp5800 = _mm512_fnmadd_ps(tmp5799, _mm512_set1_ps(2e+00f), tmp5800);
tmp5804 = _mm512_fnmadd_ps(tmp5803, _mm512_set1_ps(2e+00f), tmp5804);
tmp5799 = _mm512_fmadd_ps(in810, _mm512_set1_ps(2.5e-01f), tmp5791);
tmp5803 = _mm512_fmadd_ps(in817, _mm512_set1_ps(2.5e-01f), tmp5796);
tmp5791 = _mm512_sub_ps(in811, tmp5791);
tmp5796 = _mm512_sub_ps(in818, tmp5796);
tmp5799 = _mm512_fmadd_ps(in814, _mm512_set1_ps(-1.25e+00f), tmp5799);
tmp5803 = _mm512_fmadd_ps(in821, _mm512_set1_ps(-1.25e+00f), tmp5803);
in814 = _mm512_sub_ps(in814, in810);
in821 = _mm512_sub_ps(in821, in817);
in814 = _mm512_fmadd_ps(in814, _mm512_set1_ps(5.25e+00f), tmp5791);
in821 = _mm512_fmadd_ps(in821, _mm512_set1_ps(5.25e+00f), tmp5796);
tmp5793 = _mm512_fmadd_ps(tmp5799, _mm512_set1_ps(2e+00f), tmp5790);
tmp5798 = _mm512_fmadd_ps(tmp5803, _mm512_set1_ps(2e+00f), tmp5795);
tmp5790 = _mm512_fnmadd_ps(tmp5799, _mm512_set1_ps(2e+00f), tmp5790);
tmp5795 = _mm512_fnmadd_ps(tmp5803, _mm512_set1_ps(2e+00f), tmp5795);
// Pack consecutive round-2 results pairwise: imm 68 joins the low 256 bits of both operands
// (the first-series half), imm 238 joins the high 256 bits (the second-series half).
__m512 out755 = _mm512_shuffle_f32x4(tmp5792, tmp5801, 68);
__m512 out763 = _mm512_shuffle_f32x4(tmp5792, tmp5801, 238);
__m512 out756 = _mm512_shuffle_f32x4(tmp5802, in812, 68);
__m512 out764 = _mm512_shuffle_f32x4(tmp5802, in812, 238);
__m512 out757 = _mm512_shuffle_f32x4(tmp5800, tmp5793, 68);
__m512 out765 = _mm512_shuffle_f32x4(tmp5800, tmp5793, 238);
__m512 out758 = _mm512_shuffle_f32x4(tmp5790, in814, 68);
__m512 out766 = _mm512_shuffle_f32x4(tmp5790, in814, 238);
__m512 out759 = _mm512_shuffle_f32x4(tmp5797, tmp5805, 68);
__m512 out767 = _mm512_shuffle_f32x4(tmp5797, tmp5805, 238);
__m512 out760 = _mm512_shuffle_f32x4(tmp5806, in819, 68);
__m512 out768 = _mm512_shuffle_f32x4(tmp5806, in819, 238);
__m512 out761 = _mm512_shuffle_f32x4(tmp5804, tmp5798, 68);
__m512 out769 = _mm512_shuffle_f32x4(tmp5804, tmp5798, 238);
__m512 out762 = _mm512_shuffle_f32x4(tmp5795, in821, 68);
__m512 out770 = _mm512_shuffle_f32x4(tmp5795, in821, 238);
// Store the 16 vectors as four groups at base offsets 0, 409600, 819200 and 1228800.
// Each group fills offsets +0, +64, +128, +192 (issued in the order +0, +128, +64, +192).
// NOTE(review): the +64 spacing equals one 512-bit vector if dfPtr6 is a byte pointer — declaration not visible here.
_mm512_storeu_ps(dfPtr6+0+1638400*i26+24576*j21+24576*s20+768*k81, out755);
_mm512_storeu_ps(dfPtr6+128+1638400*i26+24576*j21+24576*s20+768*k81, out763);
_mm512_storeu_ps(dfPtr6+64+1638400*i26+24576*j21+24576*s20+768*k81, out759);
_mm512_storeu_ps(dfPtr6+192+1638400*i26+24576*j21+24576*s20+768*k81, out767);
_mm512_storeu_ps(dfPtr6+409600+1638400*i26+24576*j21+24576*s20+768*k81, out756);
_mm512_storeu_ps(dfPtr6+409728+1638400*i26+24576*j21+24576*s20+768*k81, out764);
_mm512_storeu_ps(dfPtr6+409664+1638400*i26+24576*j21+24576*s20+768*k81, out760);
_mm512_storeu_ps(dfPtr6+409792+1638400*i26+24576*j21+24576*s20+768*k81, out768);
_mm512_storeu_ps(dfPtr6+819200+1638400*i26+24576*j21+24576*s20+768*k81, out757);
_mm512_storeu_ps(dfPtr6+819328+1638400*i26+24576*j21+24576*s20+768*k81, out765);
_mm512_storeu_ps(dfPtr6+819264+1638400*i26+24576*j21+24576*s20+768*k81, out761);
_mm512_storeu_ps(dfPtr6+819392+1638400*i26+24576*j21+24576*s20+768*k81, out769);
_mm512_storeu_ps(dfPtr6+1228800+1638400*i26+24576*j21+24576*s20+768*k81, out758);
_mm512_storeu_ps(dfPtr6+1228928+1638400*i26+24576*j21+24576*s20+768*k81, out766);
_mm512_storeu_ps(dfPtr6+1228864+1638400*i26+24576*j21+24576*s20+768*k81, out762);
_mm512_storeu_ps(dfPtr6+1228992+1638400*i26+24576*j21+24576*s20+768*k81, out770);
// Second pass of this k81 iteration: same load / combine / rearrange / combine / store pipeline,
// on two differently shaped series:
//   first series  in823..in830: eight loads of 13 floats (mask 8191 = 0x1FFF) at offsets 1204..2772, step 224;
//   second series in831..in837: seven loads of 14 floats (mask 16383 = 0x3FFF) at offsets 12832..14176, step 224.
// Lanes outside the mask are zero (maskz).
// NOTE(review): the coefficients below look like a Winograd-style 8-point input transform — confirm against the generator.
__m512 dat1351 = _mm512_maskz_loadu_ps(8191, datPtr12+1204+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
// pm117: lane 0 <- element 15 (zeroed by mask 8191, so lane 0 becomes 0), lanes 1-7 <- elements 0-6,
// lanes 8-15 <- elements 5-12.
__m512i pm117 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in823 = _mm512_permutexvar_ps(pm117, dat1351);
__m512 dat1352 = _mm512_maskz_loadu_ps(8191, datPtr12+1428+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1353 = _mm512_maskz_loadu_ps(16383, datPtr12+12832+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in824 = _mm512_permutexvar_ps(pm117, dat1352);
// pm118: lanes 0-7 <- elements 0-7, lanes 8-15 <- elements 6-13.
__m512i pm118 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in831 = _mm512_permutexvar_ps(pm118, dat1353);
__m512 dat1354 = _mm512_maskz_loadu_ps(8191, datPtr12+1652+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1355 = _mm512_maskz_loadu_ps(16383, datPtr12+13056+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in825 = _mm512_permutexvar_ps(pm117, dat1354);
__m512 in832 = _mm512_permutexvar_ps(pm118, dat1355);
__m512 dat1356 = _mm512_maskz_loadu_ps(8191, datPtr12+1876+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1357 = _mm512_maskz_loadu_ps(16383, datPtr12+13280+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in826 = _mm512_permutexvar_ps(pm117, dat1356);
__m512 in833 = _mm512_permutexvar_ps(pm118, dat1357);
__m512 dat1358 = _mm512_maskz_loadu_ps(8191, datPtr12+2100+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1359 = _mm512_maskz_loadu_ps(16383, datPtr12+13504+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in827 = _mm512_permutexvar_ps(pm117, dat1358);
__m512 in834 = _mm512_permutexvar_ps(pm118, dat1359);
__m512 dat1360 = _mm512_maskz_loadu_ps(8191, datPtr12+2324+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1361 = _mm512_maskz_loadu_ps(16383, datPtr12+13728+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in828 = _mm512_permutexvar_ps(pm117, dat1360);
__m512 in835 = _mm512_permutexvar_ps(pm118, dat1361);
__m512 dat1362 = _mm512_maskz_loadu_ps(8191, datPtr12+2548+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1363 = _mm512_maskz_loadu_ps(16383, datPtr12+13952+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in829 = _mm512_permutexvar_ps(pm117, dat1362);
__m512 in836 = _mm512_permutexvar_ps(pm118, dat1363);
__m512 dat1364 = _mm512_maskz_loadu_ps(8191, datPtr12+2772+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1365 = _mm512_maskz_loadu_ps(16383, datPtr12+14176+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in830 = _mm512_permutexvar_ps(pm117, dat1364);
__m512 in837 = _mm512_permutexvar_ps(pm118, dat1365);
// Round 1 of the linear combination (statements alternate: first series, then second series).
// The first series has all eight inputs, so its leading term is in823 - in829.
// The second series has seven, so its leading term starts from zero (0 - in836).
// Several inputs are overwritten in place, so statement order matters.
__m512 tmp5855 = _mm512_add_ps(in824, in828);
__m512 tmp5859 = _mm512_add_ps(in831, in835);
__m512 tmp5856 = _mm512_sub_ps(in827, in825);
__m512 tmp5860 = _mm512_sub_ps(in834, in832);
__m512 tmp5857 = _mm512_add_ps(in825, in829);
__m512 tmp5861 = _mm512_add_ps(in832, in836);
in823 = _mm512_sub_ps(in823, in829);
__m512 tmp5862 = _mm512_sub_ps(_mm512_setzero_ps(), in836);
tmp5855 = _mm512_fmadd_ps(in826, _mm512_set1_ps(-4.25e+00f), tmp5855);
tmp5859 = _mm512_fmadd_ps(in833, _mm512_set1_ps(-4.25e+00f), tmp5859);
tmp5857 = _mm512_fmadd_ps(in827, _mm512_set1_ps(-4.25e+00f), tmp5857);
tmp5861 = _mm512_fmadd_ps(in834, _mm512_set1_ps(-4.25e+00f), tmp5861);
in823 = _mm512_fmadd_ps(tmp5856, _mm512_set1_ps(5.25e+00f), in823);
tmp5862 = _mm512_fmadd_ps(tmp5860, _mm512_set1_ps(5.25e+00f), tmp5862);
tmp5856 = _mm512_fmadd_ps(in825, _mm512_set1_ps(2.5e-01f), in829);
tmp5860 = _mm512_fmadd_ps(in832, _mm512_set1_ps(2.5e-01f), in836);
in825 = _mm512_fmadd_ps(in825, _mm512_set1_ps(4e+00f), in829);
in832 = _mm512_fmadd_ps(in832, _mm512_set1_ps(4e+00f), in836);
__m512 tmp5858 = _mm512_sub_ps(tmp5857, tmp5855);
__m512 tmp5863 = _mm512_sub_ps(tmp5861, tmp5859);
tmp5857 = _mm512_add_ps(tmp5855, tmp5857);
tmp5861 = _mm512_add_ps(tmp5859, tmp5861);
tmp5855 = _mm512_fmadd_ps(in824, _mm512_set1_ps(2.5e-01f), in828);
tmp5859 = _mm512_fmadd_ps(in831, _mm512_set1_ps(2.5e-01f), in835);
tmp5856 = _mm512_fmadd_ps(in827, _mm512_set1_ps(-1.25e+00f), tmp5856);
tmp5860 = _mm512_fmadd_ps(in834, _mm512_set1_ps(-1.25e+00f), tmp5860);
in827 = _mm512_fmadd_ps(in827, _mm512_set1_ps(-5e+00f), in825);
in834 = _mm512_fmadd_ps(in834, _mm512_set1_ps(-5e+00f), in832);
tmp5855 = _mm512_fmadd_ps(in826, _mm512_set1_ps(-1.25e+00f), tmp5855);
tmp5859 = _mm512_fmadd_ps(in833, _mm512_set1_ps(-1.25e+00f), tmp5859);
in829 = _mm512_fmadd_ps(tmp5855, _mm512_set1_ps(2e+00f), tmp5856);
in836 = _mm512_fmadd_ps(tmp5859, _mm512_set1_ps(2e+00f), tmp5860);
tmp5856 = _mm512_fnmadd_ps(tmp5855, _mm512_set1_ps(2e+00f), tmp5856);
tmp5860 = _mm512_fnmadd_ps(tmp5859, _mm512_set1_ps(2e+00f), tmp5860);
tmp5855 = _mm512_fmadd_ps(in828, _mm512_set1_ps(2.5e-01f), in824);
tmp5859 = _mm512_fmadd_ps(in835, _mm512_set1_ps(2.5e-01f), in831);
in824 = _mm512_sub_ps(in830, in824);
in831 = _mm512_sub_ps(in837, in831);
tmp5855 = _mm512_fmadd_ps(in826, _mm512_set1_ps(-1.25e+00f), tmp5855);
tmp5859 = _mm512_fmadd_ps(in833, _mm512_set1_ps(-1.25e+00f), tmp5859);
in826 = _mm512_sub_ps(in826, in828);
in833 = _mm512_sub_ps(in833, in835);
in826 = _mm512_fmadd_ps(in826, _mm512_set1_ps(5.25e+00f), in824);
in833 = _mm512_fmadd_ps(in833, _mm512_set1_ps(5.25e+00f), in831);
in825 = _mm512_fmadd_ps(tmp5855, _mm512_set1_ps(2e+00f), in827);
in832 = _mm512_fmadd_ps(tmp5859, _mm512_set1_ps(2e+00f), in834);
in827 = _mm512_fnmadd_ps(tmp5855, _mm512_set1_ps(2e+00f), in827);
in834 = _mm512_fnmadd_ps(tmp5859, _mm512_set1_ps(2e+00f), in834);
// Rearrangement, stage 1: unpacklo/unpackhi interleave pairs of round-1 results within each 128-bit lane.
// The eight round-1 results per series are, in order:
// in823, tmp5857, tmp5858, in829, tmp5856, in825, in827, in826 (first series) and
// tmp5862, tmp5861, tmp5863, in836, tmp5860, in832, in834, in833 (second series).
__m512 tmp5872 = _mm512_unpacklo_ps(in823, tmp5857);
__m512 tmp5873 = _mm512_unpackhi_ps(in823, tmp5857);
__m512 tmp5874 = _mm512_unpacklo_ps(tmp5858, in829);
__m512 tmp5875 = _mm512_unpackhi_ps(tmp5858, in829);
__m512 tmp5876 = _mm512_unpacklo_ps(tmp5856, in825);
__m512 tmp5877 = _mm512_unpackhi_ps(tmp5856, in825);
__m512 tmp5878 = _mm512_unpacklo_ps(in827, in826);
__m512 tmp5879 = _mm512_unpackhi_ps(in827, in826);
__m512 tmp5880 = _mm512_unpacklo_ps(tmp5862, tmp5861);
__m512 tmp5881 = _mm512_unpackhi_ps(tmp5862, tmp5861);
__m512 tmp5882 = _mm512_unpacklo_ps(tmp5863, in836);
__m512 tmp5883 = _mm512_unpackhi_ps(tmp5863, in836);
__m512 tmp5884 = _mm512_unpacklo_ps(tmp5860, in832);
__m512 tmp5885 = _mm512_unpackhi_ps(tmp5860, in832);
__m512 tmp5886 = _mm512_unpacklo_ps(in834, in833);
__m512 tmp5887 = _mm512_unpackhi_ps(in834, in833);
// Stage 2: shuffle_ps with 68 / 238 takes the low / high 64 bits of each operand per 128-bit lane,
// which completes a 4x4 transpose inside every 128-bit lane.
__m512 tmp5888 = _mm512_shuffle_ps(tmp5872, tmp5874, 68);
__m512 tmp5889 = _mm512_shuffle_ps(tmp5872, tmp5874, 238);
__m512 tmp5890 = _mm512_shuffle_ps(tmp5873, tmp5875, 68);
__m512 tmp5891 = _mm512_shuffle_ps(tmp5873, tmp5875, 238);
__m512 tmp5892 = _mm512_shuffle_ps(tmp5876, tmp5878, 68);
__m512 tmp5893 = _mm512_shuffle_ps(tmp5876, tmp5878, 238);
__m512 tmp5894 = _mm512_shuffle_ps(tmp5877, tmp5879, 68);
__m512 tmp5895 = _mm512_shuffle_ps(tmp5877, tmp5879, 238);
__m512 tmp5896 = _mm512_shuffle_ps(tmp5880, tmp5882, 68);
__m512 tmp5897 = _mm512_shuffle_ps(tmp5880, tmp5882, 238);
__m512 tmp5898 = _mm512_shuffle_ps(tmp5881, tmp5883, 68);
__m512 tmp5899 = _mm512_shuffle_ps(tmp5881, tmp5883, 238);
__m512 tmp5900 = _mm512_shuffle_ps(tmp5884, tmp5886, 68);
__m512 tmp5901 = _mm512_shuffle_ps(tmp5884, tmp5886, 238);
__m512 tmp5902 = _mm512_shuffle_ps(tmp5885, tmp5887, 68);
__m512 tmp5903 = _mm512_shuffle_ps(tmp5885, tmp5887, 238);
// Stage 3: shuffle_f32x4 with 136 picks 128-bit lanes 0 and 2 of each operand, 221 picks lanes 1 and 3.
__m512 tmp5904 = _mm512_shuffle_f32x4(tmp5888, tmp5892, 136);
__m512 tmp5905 = _mm512_shuffle_f32x4(tmp5888, tmp5892, 221);
__m512 tmp5906 = _mm512_shuffle_f32x4(tmp5889, tmp5893, 136);
__m512 tmp5907 = _mm512_shuffle_f32x4(tmp5889, tmp5893, 221);
__m512 tmp5908 = _mm512_shuffle_f32x4(tmp5890, tmp5894, 136);
__m512 tmp5909 = _mm512_shuffle_f32x4(tmp5890, tmp5894, 221);
__m512 tmp5910 = _mm512_shuffle_f32x4(tmp5891, tmp5895, 136);
__m512 tmp5911 = _mm512_shuffle_f32x4(tmp5891, tmp5895, 221);
__m512 tmp5912 = _mm512_shuffle_f32x4(tmp5896, tmp5900, 136);
__m512 tmp5913 = _mm512_shuffle_f32x4(tmp5896, tmp5900, 221);
__m512 tmp5914 = _mm512_shuffle_f32x4(tmp5897, tmp5901, 136);
__m512 tmp5915 = _mm512_shuffle_f32x4(tmp5897, tmp5901, 221);
__m512 tmp5916 = _mm512_shuffle_f32x4(tmp5898, tmp5902, 136);
__m512 tmp5917 = _mm512_shuffle_f32x4(tmp5898, tmp5902, 221);
__m512 tmp5918 = _mm512_shuffle_f32x4(tmp5899, tmp5903, 136);
__m512 tmp5919 = _mm512_shuffle_f32x4(tmp5899, tmp5903, 221);
// Stage 4: combine the two series. Each destination now holds one element position of all eight
// round-1 results: lanes 0-7 come from the first series, lanes 8-15 from the second series.
// The imm-136 destinations take element positions 0-7 (first window), the imm-221 ones positions 8-15.
in823 = _mm512_shuffle_f32x4(tmp5904, tmp5912, 136);
tmp5862 = _mm512_shuffle_f32x4(tmp5904, tmp5912, 221);
tmp5857 = _mm512_shuffle_f32x4(tmp5906, tmp5914, 136);
tmp5861 = _mm512_shuffle_f32x4(tmp5906, tmp5914, 221);
tmp5858 = _mm512_shuffle_f32x4(tmp5908, tmp5916, 136);
tmp5863 = _mm512_shuffle_f32x4(tmp5908, tmp5916, 221);
in829 = _mm512_shuffle_f32x4(tmp5910, tmp5918, 136);
in836 = _mm512_shuffle_f32x4(tmp5910, tmp5918, 221);
tmp5856 = _mm512_shuffle_f32x4(tmp5905, tmp5913, 136);
tmp5860 = _mm512_shuffle_f32x4(tmp5905, tmp5913, 221);
in825 = _mm512_shuffle_f32x4(tmp5907, tmp5915, 136);
in832 = _mm512_shuffle_f32x4(tmp5907, tmp5915, 221);
in827 = _mm512_shuffle_f32x4(tmp5909, tmp5917, 136);
in834 = _mm512_shuffle_f32x4(tmp5909, tmp5917, 221);
in826 = _mm512_shuffle_f32x4(tmp5911, tmp5919, 136);
in833 = _mm512_shuffle_f32x4(tmp5911, tmp5919, 221);
// Round 2: the same linear combination across the eight rearranged vectors of each group.
// Both groups have all eight inputs here (leading terms in823 - in827 and tmp5862 - in834).
__m512 tmp5864 = _mm512_add_ps(tmp5857, in825);
__m512 tmp5868 = _mm512_add_ps(tmp5861, in832);
__m512 tmp5865 = _mm512_sub_ps(tmp5856, tmp5858);
__m512 tmp5869 = _mm512_sub_ps(tmp5860, tmp5863);
__m512 tmp5866 = _mm512_add_ps(tmp5858, in827);
__m512 tmp5870 = _mm512_add_ps(tmp5863, in834);
in823 = _mm512_sub_ps(in823, in827);
tmp5862 = _mm512_sub_ps(tmp5862, in834);
tmp5864 = _mm512_fmadd_ps(in829, _mm512_set1_ps(-4.25e+00f), tmp5864);
tmp5868 = _mm512_fmadd_ps(in836, _mm512_set1_ps(-4.25e+00f), tmp5868);
tmp5866 = _mm512_fmadd_ps(tmp5856, _mm512_set1_ps(-4.25e+00f), tmp5866);
tmp5870 = _mm512_fmadd_ps(tmp5860, _mm512_set1_ps(-4.25e+00f), tmp5870);
in823 = _mm512_fmadd_ps(tmp5865, _mm512_set1_ps(5.25e+00f), in823);
tmp5862 = _mm512_fmadd_ps(tmp5869, _mm512_set1_ps(5.25e+00f), tmp5862);
tmp5865 = _mm512_fmadd_ps(tmp5858, _mm512_set1_ps(2.5e-01f), in827);
tmp5869 = _mm512_fmadd_ps(tmp5863, _mm512_set1_ps(2.5e-01f), in834);
tmp5858 = _mm512_fmadd_ps(tmp5858, _mm512_set1_ps(4e+00f), in827);
tmp5863 = _mm512_fmadd_ps(tmp5863, _mm512_set1_ps(4e+00f), in834);
__m512 tmp5867 = _mm512_sub_ps(tmp5866, tmp5864);
__m512 tmp5871 = _mm512_sub_ps(tmp5870, tmp5868);
tmp5866 = _mm512_add_ps(tmp5864, tmp5866);
tmp5870 = _mm512_add_ps(tmp5868, tmp5870);
tmp5864 = _mm512_fmadd_ps(tmp5857, _mm512_set1_ps(2.5e-01f), in825);
tmp5868 = _mm512_fmadd_ps(tmp5861, _mm512_set1_ps(2.5e-01f), in832);
tmp5865 = _mm512_fmadd_ps(tmp5856, _mm512_set1_ps(-1.25e+00f), tmp5865);
tmp5869 = _mm512_fmadd_ps(tmp5860, _mm512_set1_ps(-1.25e+00f), tmp5869);
tmp5856 = _mm512_fmadd_ps(tmp5856, _mm512_set1_ps(-5e+00f), tmp5858);
tmp5860 = _mm512_fmadd_ps(tmp5860, _mm512_set1_ps(-5e+00f), tmp5863);
tmp5864 = _mm512_fmadd_ps(in829, _mm512_set1_ps(-1.25e+00f), tmp5864);
tmp5868 = _mm512_fmadd_ps(in836, _mm512_set1_ps(-1.25e+00f), tmp5868);
in827 = _mm512_fmadd_ps(tmp5864, _mm512_set1_ps(2e+00f), tmp5865);
in834 = _mm512_fmadd_ps(tmp5868, _mm512_set1_ps(2e+00f), tmp5869);
tmp5865 = _mm512_fnmadd_ps(tmp5864, _mm512_set1_ps(2e+00f), tmp5865);
tmp5869 = _mm512_fnmadd_ps(tmp5868, _mm512_set1_ps(2e+00f), tmp5869);
tmp5864 = _mm512_fmadd_ps(in825, _mm512_set1_ps(2.5e-01f), tmp5857);
tmp5868 = _mm512_fmadd_ps(in832, _mm512_set1_ps(2.5e-01f), tmp5861);
tmp5857 = _mm512_sub_ps(in826, tmp5857);
tmp5861 = _mm512_sub_ps(in833, tmp5861);
tmp5864 = _mm512_fmadd_ps(in829, _mm512_set1_ps(-1.25e+00f), tmp5864);
tmp5868 = _mm512_fmadd_ps(in836, _mm512_set1_ps(-1.25e+00f), tmp5868);
in829 = _mm512_sub_ps(in829, in825);
in836 = _mm512_sub_ps(in836, in832);
in829 = _mm512_fmadd_ps(in829, _mm512_set1_ps(5.25e+00f), tmp5857);
in836 = _mm512_fmadd_ps(in836, _mm512_set1_ps(5.25e+00f), tmp5861);
tmp5858 = _mm512_fmadd_ps(tmp5864, _mm512_set1_ps(2e+00f), tmp5856);
tmp5863 = _mm512_fmadd_ps(tmp5868, _mm512_set1_ps(2e+00f), tmp5860);
tmp5856 = _mm512_fnmadd_ps(tmp5864, _mm512_set1_ps(2e+00f), tmp5856);
tmp5860 = _mm512_fnmadd_ps(tmp5868, _mm512_set1_ps(2e+00f), tmp5860);
// Pack consecutive round-2 results pairwise: imm 68 joins the low 256 bits of both operands
// (the first-series half), imm 238 joins the high 256 bits (the second-series half).
__m512 out771 = _mm512_shuffle_f32x4(in823, tmp5866, 68);
__m512 out779 = _mm512_shuffle_f32x4(in823, tmp5866, 238);
__m512 out772 = _mm512_shuffle_f32x4(tmp5867, in827, 68);
__m512 out780 = _mm512_shuffle_f32x4(tmp5867, in827, 238);
__m512 out773 = _mm512_shuffle_f32x4(tmp5865, tmp5858, 68);
__m512 out781 = _mm512_shuffle_f32x4(tmp5865, tmp5858, 238);
__m512 out774 = _mm512_shuffle_f32x4(tmp5856, in829, 68);
__m512 out782 = _mm512_shuffle_f32x4(tmp5856, in829, 238);
__m512 out775 = _mm512_shuffle_f32x4(tmp5862, tmp5870, 68);
__m512 out783 = _mm512_shuffle_f32x4(tmp5862, tmp5870, 238);
__m512 out776 = _mm512_shuffle_f32x4(tmp5871, in834, 68);
__m512 out784 = _mm512_shuffle_f32x4(tmp5871, in834, 238);
__m512 out777 = _mm512_shuffle_f32x4(tmp5869, tmp5863, 68);
__m512 out785 = _mm512_shuffle_f32x4(tmp5869, tmp5863, 238);
__m512 out778 = _mm512_shuffle_f32x4(tmp5860, in836, 68);
__m512 out786 = _mm512_shuffle_f32x4(tmp5860, in836, 238);
// Store the 16 vectors as four groups at base offsets 256, 409856, 819456 and 1229056.
// Each group fills offsets +0, +64, +128, +192 (issued in the order +0, +128, +64, +192).
// NOTE(review): the +64 spacing equals one 512-bit vector if dfPtr6 is a byte pointer — declaration not visible here.
_mm512_storeu_ps(dfPtr6+256+1638400*i26+24576*j21+24576*s20+768*k81, out771);
_mm512_storeu_ps(dfPtr6+384+1638400*i26+24576*j21+24576*s20+768*k81, out779);
_mm512_storeu_ps(dfPtr6+320+1638400*i26+24576*j21+24576*s20+768*k81, out775);
_mm512_storeu_ps(dfPtr6+448+1638400*i26+24576*j21+24576*s20+768*k81, out783);
_mm512_storeu_ps(dfPtr6+409856+1638400*i26+24576*j21+24576*s20+768*k81, out772);
_mm512_storeu_ps(dfPtr6+409984+1638400*i26+24576*j21+24576*s20+768*k81, out780);
_mm512_storeu_ps(dfPtr6+409920+1638400*i26+24576*j21+24576*s20+768*k81, out776);
_mm512_storeu_ps(dfPtr6+410048+1638400*i26+24576*j21+24576*s20+768*k81, out784);
_mm512_storeu_ps(dfPtr6+819456+1638400*i26+24576*j21+24576*s20+768*k81, out773);
_mm512_storeu_ps(dfPtr6+819584+1638400*i26+24576*j21+24576*s20+768*k81, out781);
_mm512_storeu_ps(dfPtr6+819520+1638400*i26+24576*j21+24576*s20+768*k81, out777);
_mm512_storeu_ps(dfPtr6+819648+1638400*i26+24576*j21+24576*s20+768*k81, out785);
_mm512_storeu_ps(dfPtr6+1229056+1638400*i26+24576*j21+24576*s20+768*k81, out774);
_mm512_storeu_ps(dfPtr6+1229184+1638400*i26+24576*j21+24576*s20+768*k81, out782);
_mm512_storeu_ps(dfPtr6+1229120+1638400*i26+24576*j21+24576*s20+768*k81, out778);
_mm512_storeu_ps(dfPtr6+1229248+1638400*i26+24576*j21+24576*s20+768*k81, out786);
__m512 dat1366 = _mm512_maskz_loadu_ps(8191, datPtr12+13812+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512i pm119 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in845 = _mm512_permutexvar_ps(pm119, dat1366);
__m512 dat1367 = _mm512_maskz_loadu_ps(511, datPtr12+12880+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1368 = _mm512_maskz_loadu_ps(8191, datPtr12+14036+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512i pm120 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in838 = _mm512_permutexvar_ps(pm120, dat1367);
__m512 in846 = _mm512_permutexvar_ps(pm119, dat1368);
__m512 dat1369 = _mm512_maskz_loadu_ps(511, datPtr12+13104+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1370 = _mm512_maskz_loadu_ps(8191, datPtr12+14260+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in839 = _mm512_permutexvar_ps(pm120, dat1369);
__m512 in847 = _mm512_permutexvar_ps(pm119, dat1370);
__m512 dat1371 = _mm512_maskz_loadu_ps(511, datPtr12+13328+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1372 = _mm512_maskz_loadu_ps(8191, datPtr12+14484+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in840 = _mm512_permutexvar_ps(pm120, dat1371);
__m512 in848 = _mm512_permutexvar_ps(pm119, dat1372);
__m512 dat1373 = _mm512_maskz_loadu_ps(511, datPtr12+13552+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1374 = _mm512_maskz_loadu_ps(8191, datPtr12+14708+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in841 = _mm512_permutexvar_ps(pm120, dat1373);
__m512 in849 = _mm512_permutexvar_ps(pm119, dat1374);
__m512 dat1375 = _mm512_maskz_loadu_ps(511, datPtr12+13776+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1376 = _mm512_maskz_loadu_ps(8191, datPtr12+14932+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in842 = _mm512_permutexvar_ps(pm120, dat1375);
__m512 in850 = _mm512_permutexvar_ps(pm119, dat1376);
__m512 dat1377 = _mm512_maskz_loadu_ps(511, datPtr12+14000+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1378 = _mm512_maskz_loadu_ps(8191, datPtr12+15156+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in843 = _mm512_permutexvar_ps(pm120, dat1377);
__m512 in851 = _mm512_permutexvar_ps(pm119, dat1378);
__m512 dat1379 = _mm512_maskz_loadu_ps(511, datPtr12+14224+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 dat1380 = _mm512_maskz_loadu_ps(8191, datPtr12+15380+806912*i26+224*h30+4*w37+806912*s20+25216*k81);
__m512 in844 = _mm512_permutexvar_ps(pm120, dat1379);
__m512 in852 = _mm512_permutexvar_ps(pm119, dat1380);
__m512 tmp5920 = _mm512_add_ps(in838, in842);
__m512 tmp5925 = _mm512_add_ps(in846, in850);
__m512 tmp5921 = _mm512_sub_ps(in841, in839);
__m512 tmp5926 = _mm512_sub_ps(in849, in847);
__m512 tmp5922 = _mm512_add_ps(in839, in843);
__m512 tmp5927 = _mm512_add_ps(in847, in851);
__m512 tmp5923 = _mm512_sub_ps(_mm512_setzero_ps(), in843);
in845 = _mm512_sub_ps(in845, in851);
tmp5920 = _mm512_fmadd_ps(in840, _mm512_set1_ps(-4.25e+00f), tmp5920);
tmp5925 = _mm512_fmadd_ps(in848, _mm512_set1_ps(-4.25e+00f), tmp5925);
tmp5922 = _mm512_fmadd_ps(in841, _mm512_set1_ps(-4.25e+00f), tmp5922);
tmp5927 = _mm512_fmadd_ps(in849, _mm512_set1_ps(-4.25e+00f), tmp5927);
tmp5923 = _mm512_fmadd_ps(tmp5921, _mm512_set1_ps(5.25e+00f), tmp5923);
in845 = _mm512_fmadd_ps(tmp5926, _mm512_set1_ps(5.25e+00f), in845);
tmp5921 = _mm512_fmadd_ps(in839, _mm512_set1_ps(2.5e-01f), in843);
tmp5926 = _mm512_fmadd_ps(in847, _mm512_set1_ps(2.5e-01f), in851);
in839 = _mm512_fmadd_ps(in839, _mm512_set1_ps(4e+00f), in843);
in847 = _mm512_fmadd_ps(in847, _mm512_set1_ps(4e+00f), in851);
__m512 tmp5924 = _mm512_sub_ps(tmp5922, tmp5920);
__m512 tmp5928 = _mm512_sub_ps(tmp5927, tmp5925);
tmp5922 = _mm512_add_ps(tmp5920, tmp5922);
tmp5927 = _mm512_add_ps(tmp5925, tmp5927);
tmp5920 = _mm512_fmadd_ps(in838, _mm512_set1_ps(2.5e-01f), in842);
tmp5925 = _mm512_fmadd_ps(in846, _mm512_set1_ps(2.5e-01f), in850);
tmp5921 = _mm512_fmadd_ps(in841, _mm512_set1_ps(-1.25e+00f), tmp5921);
tmp5926 = _mm512_fmadd_ps(in849, _mm512_set1_ps(-1.25e+00f), tmp5926);
in841 = _mm512_fmadd_ps(in841, _mm512_set1_ps(-5e+00f), in839);
in849 = _mm512_fmadd_ps(in849, _mm512_set1_ps(-5e+00f), in847);
tmp5920 = _mm512_fmadd_ps(in840, _mm512_set1_ps(-1.25e+00f), tmp5920);
tmp5925 = _mm512_fmadd_ps(in848, _mm512_set1_ps(-1.25e+00f), tmp5925);
in843 = _mm512_fmadd_ps(tmp5920, _mm512_set1_ps(2e+00f), tmp5921);
in851 = _mm512_fmadd_ps(tmp5925, _mm512_set1_ps(2e+00f), tmp5926);
tmp5921 = _mm512_fnmadd_ps(tmp5920, _mm512_set1_ps(2e+00f), tmp5921);
tmp5926 = _mm512_fnmadd_ps(tmp5925, _mm512_set1_ps(2e+00f), tmp5926);
tmp5920 = _mm512_fmadd_ps(in842, _mm512_set1_ps(2.5e-01f), in838);
tmp5925 = _mm512_fmadd_ps(in850, _mm512_set1_ps(2.5e-01f), in846);
in838 = _mm512_sub_ps(in844, in838);
in846 = _mm512_sub_ps(in852, in846);
tmp5920 = _mm512_fmadd_ps(in840, _mm512_set1_ps(-1.25e+00f), tmp5920);
tmp5925 = _mm512_fmadd_ps(in848, _mm512_set1_ps(-1.25e+00f), tmp5925);
in840 = _mm512_sub_ps(in840, in842);
in848 = _mm512_sub_ps(in848, in850);
in840 = _mm512_fmadd_ps(in840, _mm512_set1_ps(5.25e+00f), in838);
in848 = _mm512_fmadd_ps(in848, _mm512_set1_ps(5.25e+00f), in846);
in839 = _mm512_fmadd_ps(tmp5920, _mm512_set1_ps(2e+00f), in841);
in847 = _mm512_fmadd_ps(tmp5925, _mm512_set1_ps(2e+00f), in849);
in841 = _mm512_fnmadd_ps(tmp5920, _mm512_set1_ps(2e+00f), in841);
in849 = _mm512_fnmadd_ps(tmp5925, _mm512_set1_ps(2e+00f), in849);
__m512 tmp5937 = _mm512_unpacklo_ps(tmp5923, tmp5922);
__m512 tmp5938 = _mm512_unpackhi_ps(tmp5923, tmp5922);
__m512 tmp5939 = _mm512_unpacklo_ps(tmp5924, in843);
__m512 tmp5940 = _mm512_unpackhi_ps(tmp5924, in843);
__m512 tmp5941 = _mm512_unpacklo_ps(tmp5921, in839);
__m512 tmp5942 = _mm512_unpackhi_ps(tmp5921, in839);
__m512 tmp5943 = _mm512_unpacklo_ps(in841, in840);
__m512 tmp5944 = _mm512_unpackhi_ps(in841, in840);
__m512 tmp5945 = _mm512_unpacklo_ps(in845, tmp5927);
__m512 tmp5946 = _mm512_unpackhi_ps(in845, tmp5927);
__m512 tmp5947 = _mm512_unpacklo_ps(tmp5928, in851);
__m512 tmp5948 = _mm512_unpackhi_ps(tmp5928, in851);
__m512 tmp5949 = _mm512_unpacklo_ps(tmp5926, in847);
__m512 tmp5950 = _mm512_unpackhi_ps(tmp5926, in847);
__m512 tmp5951 = _mm512_unpacklo_ps(in849, in848);
__m512 tmp5952 = _mm512_unpackhi_ps(in849, in848);
__m512 tmp5953 = _mm512_shuffle_ps(tmp5937, tmp5939, 68);
__m512 tmp5954 = _mm512_shuffle_ps(tmp5937, tmp5939, 238);
__m512 tmp5955 = _mm512_shuffle_ps(tmp5938, tmp5940, 68);
__m512 tmp5956 = _mm512_shuffle_ps(tmp5938, tmp5940, 238);
__m512 tmp5957 = _mm512_shuffle_ps(tmp5941, tmp5943, 68);
__m512 tmp5958 = _mm512_shuffle_ps(tmp5941, tmp5943, 238);
__m512 tmp5959 = _mm512_shuffle_ps(tmp5942, tmp5944, 68);
__m512 tmp5960 = _mm512_shuffle_ps(tmp5942, tmp5944, 238);
__m512 tmp5961 = _mm512_shuffle_ps(tmp5945, tmp5947, 68);
__m512 tmp5962 = _mm512_shuffle_ps(tmp5945, tmp5947, 238);
__m512 tmp5963 = _mm512_shuffle_ps(tmp5946, tmp5948, 68);
__m512 tmp5964 = _mm512_shuffle_ps(tmp5946, tmp5948, 238);
__m512 tmp5965 = _mm512_shuffle_ps(tmp5949, tmp5951, 68);
__m512 tmp5966 = _mm512_shuffle_ps(tmp5949, tmp5951, 238);
__m512 tmp5967 = _mm512_shuffle_ps(tmp5950, tmp5952, 68);
__m512 tmp5968 = _mm512_shuffle_ps(tmp5950, tmp5952, 238);
__m512 tmp5969 = _mm512_shuffle_f32x4(tmp5953, tmp5957, 136);
__m512 tmp5970 = _mm512_shuffle_f32x4(tmp5953, tmp5957, 221);
__m512 tmp5971 = _mm512_shuffle_f32x4(tmp5954, tmp5958, 136);
__m512 tmp5972 = _mm512_shuffle_f32x4(tmp5954, tmp5958, 221);
__m512 tmp5973 = _mm512_shuffle_f32x4(tmp5955, tmp5959, 136);
__m512 tmp5974 = _mm512_shuffle_f32x4(tmp5955, tmp5959, 221);
__m512 tmp5975 = _mm512_shuffle_f32x4(tmp5956, tmp5960, 136);
__m512 tmp5976 = _mm512_shuffle_f32x4(tmp5956, tmp5960, 221);
__m512 tmp5977 = _mm512_shuffle_f32x4(tmp5961, tmp5965, 136);
__m512 tmp5978 = _mm512_shuffle_f32x4(tmp5961, tmp5965, 221);
__m512 tmp5979 = _mm512_shuffle_f32x4(tmp5962, tmp5966, 136);
__m512 tmp5980 = _mm512_shuffle_f32x4(tmp5962, tmp5966, 221);
__m512 tmp5981 = _mm512_shuffle_f32x4(tmp5963, tmp5967, 136);
__m512 tmp5982 = _mm512_shuffle_f32x4(tmp5963, tmp5967, 221);
__m512 tmp5983 = _mm512_shuffle_f32x4(tmp5964, tmp5968, 136);
__m512 tmp5984 = _mm512_shuffle_f32x4(tmp5964, tmp5968, 221);
tmp5923 = _mm512_shuffle_f32x4(tmp5969, tmp5977, 136);
in845 = _mm512_shuffle_f32x4(tmp5969, tmp5977, 221);
tmp5922 = _mm512_shuffle_f32x4(tmp5971, tmp5979, 136);
tmp5927 = _mm512_shuffle_f32x4(tmp5971, tmp5979, 221);
tmp5924 = _mm512_shuffle_f32x4(tmp5973, tmp5981, 136);
tmp5928 = _mm512_shuffle_f32x4(tmp5973, tmp5981, 221);
in843 = _mm512_shuffle_f32x4(tmp5975, tmp5983, 136);
in851 = _mm512_shuffle_f32x4(tmp5975, tmp5983, 221);
tmp5921 = _mm512_shuffle_f32x4(tmp5970, tmp5978, 136);
tmp5926 = _mm512_shuffle_f32x4(tmp5970, tmp5978, 221);
in839 = _mm512_shuffle_f32x4(tmp5972, tmp5980, 136);
in847 = _mm512_shuffle_f32x4(tmp5972, tmp5980, 221);
in841 = _mm512_shuffle_f32x4(tmp5974, tmp5982, 136);
in849 = _mm512_shuffle_f32x4(tmp5974, tmp5982, 221);
in840 = _mm512_shuffle_f32x4(tmp5976, tmp5984, 136);
in848 = _mm512_shuffle_f32x4(tmp5976, tmp5984, 221);
__m512 tmp5929 = _mm512_add_ps(tmp5922, in839);
__m512 tmp5933 = _mm512_add_ps(tmp5927, in847);
__m512 tmp5930 = _mm512_sub_ps(tmp5921, tmp5924);
__m512 tmp5934 = _mm512_sub_ps(tmp5926, tmp5928);
__m512 tmp5931 = _mm512_add_ps(tmp5924, in841);
__m512 tmp5935 = _mm512_add_ps(tmp5928, in849);
tmp5923 = _mm512_sub_ps(tmp5923, in841);
in845 = _mm512_sub_ps(in845, in849);
tmp5929 = _mm512_fmadd_ps(in843, _mm512_set1_ps(-4.25e+00f), tmp5929);
tmp5933 = _mm512_fmadd_ps(in851, _mm512_set1_ps(-4.25e+00f), tmp5933);
tmp5931 = _mm512_fmadd_ps(tmp5921, _mm512_set1_ps(-4.25e+00f), tmp5931);
tmp5935 = _mm512_fmadd_ps(tmp5926, _mm512_set1_ps(-4.25e+00f), tmp5935);
tmp5923 = _mm512_fmadd_ps(tmp5930, _mm512_set1_ps(5.25e+00f), tmp5923);
in845 = _mm512_fmadd_ps(tmp5934, _mm512_set1_ps(5.25e+00f), in845);
tmp5930 = _mm512_fmadd_ps(tmp5924, _mm512_set1_ps(2.5e-01f), in841);
tmp5934 = _mm512_fmadd_ps(tmp5928, _mm512_set1_ps(2.5e-01f), in849);
tmp5924 = _mm512_fmadd_ps(tmp5924, _mm512_set1_ps(4e+00f), in841);
tmp5928 = _mm512_fmadd_ps(tmp5928, _mm512_set1_ps(4e+00f), in849);
__m512 tmp5932 = _mm512_sub_ps(tmp5931, tmp5929);
__m512 tmp5936 = _mm512_sub_ps(tmp5935, tmp5933);
tmp5931 = _mm512_add_ps(tmp5929, tmp5931);
tmp5935 = _mm512_add_ps(tmp5933, tmp5935);
tmp5929 = _mm512_fmadd_ps(tmp5922, _mm512_set1_ps(2.5e-01f), in839);
tmp5933 = _mm512_fmadd_ps(tmp5927, _mm512_set1_ps(2.5e-01f), in847);
tmp5930 = _mm512_fmadd_ps(tmp5921, _mm512_set1_ps(-1.25e+00f), tmp5930);
tmp5934 = _mm512_fmadd_ps(tmp5926, _mm512_set1_ps(-1.25e+00f), tmp5934);
tmp5921 = _mm512_fmadd_ps(tmp5921, _mm512_set1_ps(-5e+00f), tmp5924);
tmp5926 = _mm512_fmadd_ps(tmp5926, _mm512_set1_ps(-5e+00f), tmp5928);
tmp5929 = _mm512_fmadd_ps(in843, _mm512_set1_ps(-1.25e+00f), tmp5929);
tmp5933 = _mm512_fmadd_ps(in851, _mm512_set1_ps(-1.25e+00f), tmp5933);
in841 = _mm512_fmadd_ps(tmp5929, _mm512_set1_ps(2e+00f), tmp5930);
in849 = _mm512_fmadd_ps(tmp5933, _mm512_set1_ps(2e+00f), tmp5934);
tmp5930 = _mm512_fnmadd_ps(tmp5929, _mm512_set1_ps(2e+00f), tmp5930);
tmp5934 = _mm512_fnmadd_ps(tmp5933, _mm512_set1_ps(2e+00f), tmp5934);
tmp5929 = _mm512_fmadd_ps(in839, _mm512_set1_ps(2.5e-01f), tmp5922);
tmp5933 = _mm512_fmadd_ps(in847, _mm512_set1_ps(2.5e-01f), tmp5927);
tmp5922 = _mm512_sub_ps(in840, tmp5922);
tmp5927 = _mm512_sub_ps(in848, tmp5927);
tmp5929 = _mm512_fmadd_ps(in843, _mm512_set1_ps(-1.25e+00f), tmp5929);
tmp5933 = _mm512_fmadd_ps(in851, _mm512_set1_ps(-1.25e+00f), tmp5933);
in843 = _mm512_sub_ps(in843, in839);
in851 = _mm512_sub_ps(in851, in847);
in843 = _mm512_fmadd_ps(in843, _mm512_set1_ps(5.25e+00f), tmp5922);
in851 = _mm512_fmadd_ps(in851, _mm512_set1_ps(5.25e+00f), tmp5927);
tmp5924 = _mm512_fmadd_ps(tmp5929, _mm512_set1_ps(2e+00f), tmp5921);
tmp5928 = _mm512_fmadd_ps(tmp5933, _mm512_set1_ps(2e+00f), tmp5926);
tmp5921 = _mm512_fnmadd_ps(tmp5929, _mm512_set1_ps(2e+00f), tmp5921);
tmp5926 = _mm512_fnmadd_ps(tmp5933, _mm512_set1_ps(2e+00f), tmp5926);
__m512 out787 = _mm512_shuffle_f32x4(tmp5923, tmp5931, 68);
__m512 out795 = _mm512_shuffle_f32x4(tmp5923, tmp5931, 238);
__m512 out788 = _mm512_shuffle_f32x4(tmp5932, in841, 68);
__m512 out796 = _mm512_shuffle_f32x4(tmp5932, in841, 238);
__m512 out789 = _mm512_shuffle_f32x4(tmp5930, tmp5924, 68);
__m512 out797 = _mm512_shuffle_f32x4(tmp5930, tmp5924, 238);
__m512 out790 = _mm512_shuffle_f32x4(tmp5921, in843, 68);
__m512 out798 = _mm512_shuffle_f32x4(tmp5921, in843, 238);
__m512 out791 = _mm512_shuffle_f32x4(in845, tmp5935, 68);
__m512 out799 = _mm512_shuffle_f32x4(in845, tmp5935, 238);
__m512 out792 = _mm512_shuffle_f32x4(tmp5936, in849, 68);
__m512 out800 = _mm512_shuffle_f32x4(tmp5936, in849, 238);
__m512 out793 = _mm512_shuffle_f32x4(tmp5934, tmp5928, 68);
__m512 out801 = _mm512_shuffle_f32x4(tmp5934, tmp5928, 238);
__m512 out794 = _mm512_shuffle_f32x4(tmp5926, in851, 68);
__m512 out802 = _mm512_shuffle_f32x4(tmp5926, in851, 238);
_mm512_storeu_ps(dfPtr6+512+1638400*i26+24576*j21+24576*s20+768*k81, out787);
_mm512_storeu_ps(dfPtr6+640+1638400*i26+24576*j21+24576*s20+768*k81, out795);
_mm512_storeu_ps(dfPtr6+576+1638400*i26+24576*j21+24576*s20+768*k81, out791);
_mm512_storeu_ps(dfPtr6+704+1638400*i26+24576*j21+24576*s20+768*k81, out799);
_mm512_storeu_ps(dfPtr6+410112+1638400*i26+24576*j21+24576*s20+768*k81, out788);
_mm512_storeu_ps(dfPtr6+410240+1638400*i26+24576*j21+24576*s20+768*k81, out796);
_mm512_storeu_ps(dfPtr6+410176+1638400*i26+24576*j21+24576*s20+768*k81, out792);
_mm512_storeu_ps(dfPtr6+410304+1638400*i26+24576*j21+24576*s20+768*k81, out800);
_mm512_storeu_ps(dfPtr6+819712+1638400*i26+24576*j21+24576*s20+768*k81, out789);
_mm512_storeu_ps(dfPtr6+819840+1638400*i26+24576*j21+24576*s20+768*k81, out797);
_mm512_storeu_ps(dfPtr6+819776+1638400*i26+24576*j21+24576*s20+768*k81, out793);
_mm512_storeu_ps(dfPtr6+819904+1638400*i26+24576*j21+24576*s20+768*k81, out801);
_mm512_storeu_ps(dfPtr6+1229312+1638400*i26+24576*j21+24576*s20+768*k81, out790);
_mm512_storeu_ps(dfPtr6+1229440+1638400*i26+24576*j21+24576*s20+768*k81, out798);
_mm512_storeu_ps(dfPtr6+1229376+1638400*i26+24576*j21+24576*s20+768*k81, out794);
_mm512_storeu_ps(dfPtr6+1229504+1638400*i26+24576*j21+24576*s20+768*k81, out802);
}
if (j21 >= last5) return;
++j21;
j21 = 2;
}
if (j21 < 15) {
ptrdiff_t rel14 = (size_t)(j21-2)%5;
ptrdiff_t base14 = 6+(size_t)(j21-2)/5*18;
for (; ; rel14 = 0, base14 += 18) {
if (rel14 < 2) {
if (rel14 < 1) {
ptrdiff_t h31 = base14+0;
ptrdiff_t w38 = 12;
ptrdiff_t k82 = 0;
for (; k82 != 32; ++k82) {
__m512 dat1381 = _mm512_maskz_loadu_ps(16383, datPtr12+0+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1382 = _mm512_maskz_loadu_ps(16383, datPtr12+48+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512i pm121 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in853 = _mm512_permutexvar_ps(pm121, dat1381);
__m512 in861 = _mm512_permutexvar_ps(pm121, dat1382);
__m512 dat1383 = _mm512_maskz_loadu_ps(16383, datPtr12+224+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1384 = _mm512_maskz_loadu_ps(16383, datPtr12+272+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in854 = _mm512_permutexvar_ps(pm121, dat1383);
__m512 in862 = _mm512_permutexvar_ps(pm121, dat1384);
__m512 dat1385 = _mm512_maskz_loadu_ps(16383, datPtr12+448+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1386 = _mm512_maskz_loadu_ps(16383, datPtr12+496+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in855 = _mm512_permutexvar_ps(pm121, dat1385);
__m512 in863 = _mm512_permutexvar_ps(pm121, dat1386);
__m512 dat1387 = _mm512_maskz_loadu_ps(16383, datPtr12+672+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1388 = _mm512_maskz_loadu_ps(16383, datPtr12+720+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in856 = _mm512_permutexvar_ps(pm121, dat1387);
__m512 in864 = _mm512_permutexvar_ps(pm121, dat1388);
__m512 dat1389 = _mm512_maskz_loadu_ps(16383, datPtr12+896+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1390 = _mm512_maskz_loadu_ps(16383, datPtr12+944+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in857 = _mm512_permutexvar_ps(pm121, dat1389);
__m512 in865 = _mm512_permutexvar_ps(pm121, dat1390);
__m512 dat1391 = _mm512_maskz_loadu_ps(16383, datPtr12+1120+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1392 = _mm512_maskz_loadu_ps(16383, datPtr12+1168+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in858 = _mm512_permutexvar_ps(pm121, dat1391);
__m512 in866 = _mm512_permutexvar_ps(pm121, dat1392);
__m512 dat1393 = _mm512_maskz_loadu_ps(16383, datPtr12+1344+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1394 = _mm512_maskz_loadu_ps(16383, datPtr12+1392+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in859 = _mm512_permutexvar_ps(pm121, dat1393);
__m512 in867 = _mm512_permutexvar_ps(pm121, dat1394);
__m512 dat1395 = _mm512_maskz_loadu_ps(16383, datPtr12+1568+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1396 = _mm512_maskz_loadu_ps(16383, datPtr12+1616+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in860 = _mm512_permutexvar_ps(pm121, dat1395);
__m512 in868 = _mm512_permutexvar_ps(pm121, dat1396);
__m512 tmp5985 = _mm512_add_ps(in854, in858);
__m512 tmp5989 = _mm512_add_ps(in862, in866);
__m512 tmp5986 = _mm512_sub_ps(in857, in855);
__m512 tmp5990 = _mm512_sub_ps(in865, in863);
__m512 tmp5987 = _mm512_add_ps(in855, in859);
__m512 tmp5991 = _mm512_add_ps(in863, in867);
in853 = _mm512_sub_ps(in853, in859);
in861 = _mm512_sub_ps(in861, in867);
tmp5985 = _mm512_fmadd_ps(in856, _mm512_set1_ps(-4.25e+00f), tmp5985);
tmp5989 = _mm512_fmadd_ps(in864, _mm512_set1_ps(-4.25e+00f), tmp5989);
tmp5987 = _mm512_fmadd_ps(in857, _mm512_set1_ps(-4.25e+00f), tmp5987);
tmp5991 = _mm512_fmadd_ps(in865, _mm512_set1_ps(-4.25e+00f), tmp5991);
in853 = _mm512_fmadd_ps(tmp5986, _mm512_set1_ps(5.25e+00f), in853);
in861 = _mm512_fmadd_ps(tmp5990, _mm512_set1_ps(5.25e+00f), in861);
tmp5986 = _mm512_fmadd_ps(in855, _mm512_set1_ps(2.5e-01f), in859);
tmp5990 = _mm512_fmadd_ps(in863, _mm512_set1_ps(2.5e-01f), in867);
in855 = _mm512_fmadd_ps(in855, _mm512_set1_ps(4e+00f), in859);
in863 = _mm512_fmadd_ps(in863, _mm512_set1_ps(4e+00f), in867);
__m512 tmp5988 = _mm512_sub_ps(tmp5987, tmp5985);
__m512 tmp5992 = _mm512_sub_ps(tmp5991, tmp5989);
tmp5987 = _mm512_add_ps(tmp5985, tmp5987);
tmp5991 = _mm512_add_ps(tmp5989, tmp5991);
tmp5985 = _mm512_fmadd_ps(in854, _mm512_set1_ps(2.5e-01f), in858);
tmp5989 = _mm512_fmadd_ps(in862, _mm512_set1_ps(2.5e-01f), in866);
tmp5986 = _mm512_fmadd_ps(in857, _mm512_set1_ps(-1.25e+00f), tmp5986);
tmp5990 = _mm512_fmadd_ps(in865, _mm512_set1_ps(-1.25e+00f), tmp5990);
in857 = _mm512_fmadd_ps(in857, _mm512_set1_ps(-5e+00f), in855);
in865 = _mm512_fmadd_ps(in865, _mm512_set1_ps(-5e+00f), in863);
tmp5985 = _mm512_fmadd_ps(in856, _mm512_set1_ps(-1.25e+00f), tmp5985);
tmp5989 = _mm512_fmadd_ps(in864, _mm512_set1_ps(-1.25e+00f), tmp5989);
in859 = _mm512_fmadd_ps(tmp5985, _mm512_set1_ps(2e+00f), tmp5986);
in867 = _mm512_fmadd_ps(tmp5989, _mm512_set1_ps(2e+00f), tmp5990);
tmp5986 = _mm512_fnmadd_ps(tmp5985, _mm512_set1_ps(2e+00f), tmp5986);
tmp5990 = _mm512_fnmadd_ps(tmp5989, _mm512_set1_ps(2e+00f), tmp5990);
tmp5985 = _mm512_fmadd_ps(in858, _mm512_set1_ps(2.5e-01f), in854);
tmp5989 = _mm512_fmadd_ps(in866, _mm512_set1_ps(2.5e-01f), in862);
in854 = _mm512_sub_ps(in860, in854);
in862 = _mm512_sub_ps(in868, in862);
tmp5985 = _mm512_fmadd_ps(in856, _mm512_set1_ps(-1.25e+00f), tmp5985);
tmp5989 = _mm512_fmadd_ps(in864, _mm512_set1_ps(-1.25e+00f), tmp5989);
in856 = _mm512_sub_ps(in856, in858);
in864 = _mm512_sub_ps(in864, in866);
in856 = _mm512_fmadd_ps(in856, _mm512_set1_ps(5.25e+00f), in854);
in864 = _mm512_fmadd_ps(in864, _mm512_set1_ps(5.25e+00f), in862);
in855 = _mm512_fmadd_ps(tmp5985, _mm512_set1_ps(2e+00f), in857);
in863 = _mm512_fmadd_ps(tmp5989, _mm512_set1_ps(2e+00f), in865);
in857 = _mm512_fnmadd_ps(tmp5985, _mm512_set1_ps(2e+00f), in857);
in865 = _mm512_fnmadd_ps(tmp5989, _mm512_set1_ps(2e+00f), in865);
__m512 tmp6001 = _mm512_unpacklo_ps(in853, tmp5987);
__m512 tmp6002 = _mm512_unpackhi_ps(in853, tmp5987);
__m512 tmp6003 = _mm512_unpacklo_ps(tmp5988, in859);
__m512 tmp6004 = _mm512_unpackhi_ps(tmp5988, in859);
__m512 tmp6005 = _mm512_unpacklo_ps(tmp5986, in855);
__m512 tmp6006 = _mm512_unpackhi_ps(tmp5986, in855);
__m512 tmp6007 = _mm512_unpacklo_ps(in857, in856);
__m512 tmp6008 = _mm512_unpackhi_ps(in857, in856);
__m512 tmp6009 = _mm512_unpacklo_ps(in861, tmp5991);
__m512 tmp6010 = _mm512_unpackhi_ps(in861, tmp5991);
__m512 tmp6011 = _mm512_unpacklo_ps(tmp5992, in867);
__m512 tmp6012 = _mm512_unpackhi_ps(tmp5992, in867);
__m512 tmp6013 = _mm512_unpacklo_ps(tmp5990, in863);
__m512 tmp6014 = _mm512_unpackhi_ps(tmp5990, in863);
__m512 tmp6015 = _mm512_unpacklo_ps(in865, in864);
__m512 tmp6016 = _mm512_unpackhi_ps(in865, in864);
__m512 tmp6017 = _mm512_shuffle_ps(tmp6001, tmp6003, 68);
__m512 tmp6018 = _mm512_shuffle_ps(tmp6001, tmp6003, 238);
__m512 tmp6019 = _mm512_shuffle_ps(tmp6002, tmp6004, 68);
__m512 tmp6020 = _mm512_shuffle_ps(tmp6002, tmp6004, 238);
__m512 tmp6021 = _mm512_shuffle_ps(tmp6005, tmp6007, 68);
__m512 tmp6022 = _mm512_shuffle_ps(tmp6005, tmp6007, 238);
__m512 tmp6023 = _mm512_shuffle_ps(tmp6006, tmp6008, 68);
__m512 tmp6024 = _mm512_shuffle_ps(tmp6006, tmp6008, 238);
__m512 tmp6025 = _mm512_shuffle_ps(tmp6009, tmp6011, 68);
__m512 tmp6026 = _mm512_shuffle_ps(tmp6009, tmp6011, 238);
__m512 tmp6027 = _mm512_shuffle_ps(tmp6010, tmp6012, 68);
__m512 tmp6028 = _mm512_shuffle_ps(tmp6010, tmp6012, 238);
__m512 tmp6029 = _mm512_shuffle_ps(tmp6013, tmp6015, 68);
__m512 tmp6030 = _mm512_shuffle_ps(tmp6013, tmp6015, 238);
__m512 tmp6031 = _mm512_shuffle_ps(tmp6014, tmp6016, 68);
__m512 tmp6032 = _mm512_shuffle_ps(tmp6014, tmp6016, 238);
__m512 tmp6033 = _mm512_shuffle_f32x4(tmp6017, tmp6021, 136);
__m512 tmp6034 = _mm512_shuffle_f32x4(tmp6017, tmp6021, 221);
__m512 tmp6035 = _mm512_shuffle_f32x4(tmp6018, tmp6022, 136);
__m512 tmp6036 = _mm512_shuffle_f32x4(tmp6018, tmp6022, 221);
__m512 tmp6037 = _mm512_shuffle_f32x4(tmp6019, tmp6023, 136);
__m512 tmp6038 = _mm512_shuffle_f32x4(tmp6019, tmp6023, 221);
__m512 tmp6039 = _mm512_shuffle_f32x4(tmp6020, tmp6024, 136);
__m512 tmp6040 = _mm512_shuffle_f32x4(tmp6020, tmp6024, 221);
__m512 tmp6041 = _mm512_shuffle_f32x4(tmp6025, tmp6029, 136);
__m512 tmp6042 = _mm512_shuffle_f32x4(tmp6025, tmp6029, 221);
__m512 tmp6043 = _mm512_shuffle_f32x4(tmp6026, tmp6030, 136);
__m512 tmp6044 = _mm512_shuffle_f32x4(tmp6026, tmp6030, 221);
__m512 tmp6045 = _mm512_shuffle_f32x4(tmp6027, tmp6031, 136);
__m512 tmp6046 = _mm512_shuffle_f32x4(tmp6027, tmp6031, 221);
__m512 tmp6047 = _mm512_shuffle_f32x4(tmp6028, tmp6032, 136);
__m512 tmp6048 = _mm512_shuffle_f32x4(tmp6028, tmp6032, 221);
in853 = _mm512_shuffle_f32x4(tmp6033, tmp6041, 136);
in861 = _mm512_shuffle_f32x4(tmp6033, tmp6041, 221);
tmp5987 = _mm512_shuffle_f32x4(tmp6035, tmp6043, 136);
tmp5991 = _mm512_shuffle_f32x4(tmp6035, tmp6043, 221);
tmp5988 = _mm512_shuffle_f32x4(tmp6037, tmp6045, 136);
tmp5992 = _mm512_shuffle_f32x4(tmp6037, tmp6045, 221);
in859 = _mm512_shuffle_f32x4(tmp6039, tmp6047, 136);
in867 = _mm512_shuffle_f32x4(tmp6039, tmp6047, 221);
tmp5986 = _mm512_shuffle_f32x4(tmp6034, tmp6042, 136);
tmp5990 = _mm512_shuffle_f32x4(tmp6034, tmp6042, 221);
in855 = _mm512_shuffle_f32x4(tmp6036, tmp6044, 136);
in863 = _mm512_shuffle_f32x4(tmp6036, tmp6044, 221);
in857 = _mm512_shuffle_f32x4(tmp6038, tmp6046, 136);
in865 = _mm512_shuffle_f32x4(tmp6038, tmp6046, 221);
in856 = _mm512_shuffle_f32x4(tmp6040, tmp6048, 136);
in864 = _mm512_shuffle_f32x4(tmp6040, tmp6048, 221);
__m512 tmp5993 = _mm512_add_ps(tmp5987, in855);
__m512 tmp5997 = _mm512_add_ps(tmp5991, in863);
__m512 tmp5994 = _mm512_sub_ps(tmp5986, tmp5988);
__m512 tmp5998 = _mm512_sub_ps(tmp5990, tmp5992);
__m512 tmp5995 = _mm512_add_ps(tmp5988, in857);
__m512 tmp5999 = _mm512_add_ps(tmp5992, in865);
in853 = _mm512_sub_ps(in853, in857);
in861 = _mm512_sub_ps(in861, in865);
tmp5993 = _mm512_fmadd_ps(in859, _mm512_set1_ps(-4.25e+00f), tmp5993);
tmp5997 = _mm512_fmadd_ps(in867, _mm512_set1_ps(-4.25e+00f), tmp5997);
tmp5995 = _mm512_fmadd_ps(tmp5986, _mm512_set1_ps(-4.25e+00f), tmp5995);
tmp5999 = _mm512_fmadd_ps(tmp5990, _mm512_set1_ps(-4.25e+00f), tmp5999);
in853 = _mm512_fmadd_ps(tmp5994, _mm512_set1_ps(5.25e+00f), in853);
in861 = _mm512_fmadd_ps(tmp5998, _mm512_set1_ps(5.25e+00f), in861);
tmp5994 = _mm512_fmadd_ps(tmp5988, _mm512_set1_ps(2.5e-01f), in857);
tmp5998 = _mm512_fmadd_ps(tmp5992, _mm512_set1_ps(2.5e-01f), in865);
tmp5988 = _mm512_fmadd_ps(tmp5988, _mm512_set1_ps(4e+00f), in857);
tmp5992 = _mm512_fmadd_ps(tmp5992, _mm512_set1_ps(4e+00f), in865);
__m512 tmp5996 = _mm512_sub_ps(tmp5995, tmp5993);
__m512 tmp6000 = _mm512_sub_ps(tmp5999, tmp5997);
tmp5995 = _mm512_add_ps(tmp5993, tmp5995);
tmp5999 = _mm512_add_ps(tmp5997, tmp5999);
tmp5993 = _mm512_fmadd_ps(tmp5987, _mm512_set1_ps(2.5e-01f), in855);
tmp5997 = _mm512_fmadd_ps(tmp5991, _mm512_set1_ps(2.5e-01f), in863);
tmp5994 = _mm512_fmadd_ps(tmp5986, _mm512_set1_ps(-1.25e+00f), tmp5994);
tmp5998 = _mm512_fmadd_ps(tmp5990, _mm512_set1_ps(-1.25e+00f), tmp5998);
tmp5986 = _mm512_fmadd_ps(tmp5986, _mm512_set1_ps(-5e+00f), tmp5988);
tmp5990 = _mm512_fmadd_ps(tmp5990, _mm512_set1_ps(-5e+00f), tmp5992);
tmp5993 = _mm512_fmadd_ps(in859, _mm512_set1_ps(-1.25e+00f), tmp5993);
tmp5997 = _mm512_fmadd_ps(in867, _mm512_set1_ps(-1.25e+00f), tmp5997);
in857 = _mm512_fmadd_ps(tmp5993, _mm512_set1_ps(2e+00f), tmp5994);
in865 = _mm512_fmadd_ps(tmp5997, _mm512_set1_ps(2e+00f), tmp5998);
tmp5994 = _mm512_fnmadd_ps(tmp5993, _mm512_set1_ps(2e+00f), tmp5994);
tmp5998 = _mm512_fnmadd_ps(tmp5997, _mm512_set1_ps(2e+00f), tmp5998);
tmp5993 = _mm512_fmadd_ps(in855, _mm512_set1_ps(2.5e-01f), tmp5987);
tmp5997 = _mm512_fmadd_ps(in863, _mm512_set1_ps(2.5e-01f), tmp5991);
tmp5987 = _mm512_sub_ps(in856, tmp5987);
tmp5991 = _mm512_sub_ps(in864, tmp5991);
tmp5993 = _mm512_fmadd_ps(in859, _mm512_set1_ps(-1.25e+00f), tmp5993);
tmp5997 = _mm512_fmadd_ps(in867, _mm512_set1_ps(-1.25e+00f), tmp5997);
in859 = _mm512_sub_ps(in859, in855);
in867 = _mm512_sub_ps(in867, in863);
in859 = _mm512_fmadd_ps(in859, _mm512_set1_ps(5.25e+00f), tmp5987);
in867 = _mm512_fmadd_ps(in867, _mm512_set1_ps(5.25e+00f), tmp5991);
tmp5988 = _mm512_fmadd_ps(tmp5993, _mm512_set1_ps(2e+00f), tmp5986);
tmp5992 = _mm512_fmadd_ps(tmp5997, _mm512_set1_ps(2e+00f), tmp5990);
tmp5986 = _mm512_fnmadd_ps(tmp5993, _mm512_set1_ps(2e+00f), tmp5986);
tmp5990 = _mm512_fnmadd_ps(tmp5997, _mm512_set1_ps(2e+00f), tmp5990);
__m512 out803 = _mm512_shuffle_f32x4(in853, tmp5995, 68);
__m512 out811 = _mm512_shuffle_f32x4(in853, tmp5995, 238);
__m512 out804 = _mm512_shuffle_f32x4(tmp5996, in857, 68);
__m512 out812 = _mm512_shuffle_f32x4(tmp5996, in857, 238);
__m512 out805 = _mm512_shuffle_f32x4(tmp5994, tmp5988, 68);
__m512 out813 = _mm512_shuffle_f32x4(tmp5994, tmp5988, 238);
__m512 out806 = _mm512_shuffle_f32x4(tmp5986, in859, 68);
__m512 out814 = _mm512_shuffle_f32x4(tmp5986, in859, 238);
__m512 out807 = _mm512_shuffle_f32x4(in861, tmp5999, 68);
__m512 out815 = _mm512_shuffle_f32x4(in861, tmp5999, 238);
__m512 out808 = _mm512_shuffle_f32x4(tmp6000, in865, 68);
__m512 out816 = _mm512_shuffle_f32x4(tmp6000, in865, 238);
__m512 out809 = _mm512_shuffle_f32x4(tmp5998, tmp5992, 68);
__m512 out817 = _mm512_shuffle_f32x4(tmp5998, tmp5992, 238);
__m512 out810 = _mm512_shuffle_f32x4(tmp5990, in867, 68);
__m512 out818 = _mm512_shuffle_f32x4(tmp5990, in867, 238);
_mm512_storeu_ps(dfPtr6+0+1638400*i26+24576*j21+24576*s20+768*k82, out803);
_mm512_storeu_ps(dfPtr6+128+1638400*i26+24576*j21+24576*s20+768*k82, out811);
_mm512_storeu_ps(dfPtr6+64+1638400*i26+24576*j21+24576*s20+768*k82, out807);
_mm512_storeu_ps(dfPtr6+192+1638400*i26+24576*j21+24576*s20+768*k82, out815);
_mm512_storeu_ps(dfPtr6+409600+1638400*i26+24576*j21+24576*s20+768*k82, out804);
_mm512_storeu_ps(dfPtr6+409728+1638400*i26+24576*j21+24576*s20+768*k82, out812);
_mm512_storeu_ps(dfPtr6+409664+1638400*i26+24576*j21+24576*s20+768*k82, out808);
_mm512_storeu_ps(dfPtr6+409792+1638400*i26+24576*j21+24576*s20+768*k82, out816);
_mm512_storeu_ps(dfPtr6+819200+1638400*i26+24576*j21+24576*s20+768*k82, out805);
_mm512_storeu_ps(dfPtr6+819328+1638400*i26+24576*j21+24576*s20+768*k82, out813);
_mm512_storeu_ps(dfPtr6+819264+1638400*i26+24576*j21+24576*s20+768*k82, out809);
_mm512_storeu_ps(dfPtr6+819392+1638400*i26+24576*j21+24576*s20+768*k82, out817);
_mm512_storeu_ps(dfPtr6+1228800+1638400*i26+24576*j21+24576*s20+768*k82, out806);
_mm512_storeu_ps(dfPtr6+1228928+1638400*i26+24576*j21+24576*s20+768*k82, out814);
_mm512_storeu_ps(dfPtr6+1228864+1638400*i26+24576*j21+24576*s20+768*k82, out810);
_mm512_storeu_ps(dfPtr6+1228992+1638400*i26+24576*j21+24576*s20+768*k82, out818);
__m512 dat1397 = _mm512_maskz_loadu_ps(16383, datPtr12+96+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1398 = _mm512_maskz_loadu_ps(16383, datPtr12+12608+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512i pm122 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in869 = _mm512_permutexvar_ps(pm122, dat1397);
__m512 in877 = _mm512_permutexvar_ps(pm122, dat1398);
__m512 dat1399 = _mm512_maskz_loadu_ps(16383, datPtr12+320+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1400 = _mm512_maskz_loadu_ps(16383, datPtr12+12832+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in870 = _mm512_permutexvar_ps(pm122, dat1399);
__m512 in878 = _mm512_permutexvar_ps(pm122, dat1400);
__m512 dat1401 = _mm512_maskz_loadu_ps(16383, datPtr12+544+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1402 = _mm512_maskz_loadu_ps(16383, datPtr12+13056+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in871 = _mm512_permutexvar_ps(pm122, dat1401);
__m512 in879 = _mm512_permutexvar_ps(pm122, dat1402);
__m512 dat1403 = _mm512_maskz_loadu_ps(16383, datPtr12+768+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1404 = _mm512_maskz_loadu_ps(16383, datPtr12+13280+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in872 = _mm512_permutexvar_ps(pm122, dat1403);
__m512 in880 = _mm512_permutexvar_ps(pm122, dat1404);
__m512 dat1405 = _mm512_maskz_loadu_ps(16383, datPtr12+992+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1406 = _mm512_maskz_loadu_ps(16383, datPtr12+13504+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in873 = _mm512_permutexvar_ps(pm122, dat1405);
__m512 in881 = _mm512_permutexvar_ps(pm122, dat1406);
__m512 dat1407 = _mm512_maskz_loadu_ps(16383, datPtr12+1216+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1408 = _mm512_maskz_loadu_ps(16383, datPtr12+13728+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in874 = _mm512_permutexvar_ps(pm122, dat1407);
__m512 in882 = _mm512_permutexvar_ps(pm122, dat1408);
__m512 dat1409 = _mm512_maskz_loadu_ps(16383, datPtr12+1440+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1410 = _mm512_maskz_loadu_ps(16383, datPtr12+13952+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in875 = _mm512_permutexvar_ps(pm122, dat1409);
__m512 in883 = _mm512_permutexvar_ps(pm122, dat1410);
__m512 dat1411 = _mm512_maskz_loadu_ps(16383, datPtr12+1664+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1412 = _mm512_maskz_loadu_ps(16383, datPtr12+14176+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in876 = _mm512_permutexvar_ps(pm122, dat1411);
__m512 in884 = _mm512_permutexvar_ps(pm122, dat1412);
__m512 tmp6049 = _mm512_add_ps(in870, in874);
__m512 tmp6053 = _mm512_add_ps(in878, in882);
__m512 tmp6050 = _mm512_sub_ps(in873, in871);
__m512 tmp6054 = _mm512_sub_ps(in881, in879);
__m512 tmp6051 = _mm512_add_ps(in871, in875);
__m512 tmp6055 = _mm512_add_ps(in879, in883);
in869 = _mm512_sub_ps(in869, in875);
in877 = _mm512_sub_ps(in877, in883);
tmp6049 = _mm512_fmadd_ps(in872, _mm512_set1_ps(-4.25e+00f), tmp6049);
tmp6053 = _mm512_fmadd_ps(in880, _mm512_set1_ps(-4.25e+00f), tmp6053);
tmp6051 = _mm512_fmadd_ps(in873, _mm512_set1_ps(-4.25e+00f), tmp6051);
tmp6055 = _mm512_fmadd_ps(in881, _mm512_set1_ps(-4.25e+00f), tmp6055);
in869 = _mm512_fmadd_ps(tmp6050, _mm512_set1_ps(5.25e+00f), in869);
in877 = _mm512_fmadd_ps(tmp6054, _mm512_set1_ps(5.25e+00f), in877);
tmp6050 = _mm512_fmadd_ps(in871, _mm512_set1_ps(2.5e-01f), in875);
tmp6054 = _mm512_fmadd_ps(in879, _mm512_set1_ps(2.5e-01f), in883);
in871 = _mm512_fmadd_ps(in871, _mm512_set1_ps(4e+00f), in875);
in879 = _mm512_fmadd_ps(in879, _mm512_set1_ps(4e+00f), in883);
__m512 tmp6052 = _mm512_sub_ps(tmp6051, tmp6049);
__m512 tmp6056 = _mm512_sub_ps(tmp6055, tmp6053);
tmp6051 = _mm512_add_ps(tmp6049, tmp6051);
tmp6055 = _mm512_add_ps(tmp6053, tmp6055);
tmp6049 = _mm512_fmadd_ps(in870, _mm512_set1_ps(2.5e-01f), in874);
tmp6053 = _mm512_fmadd_ps(in878, _mm512_set1_ps(2.5e-01f), in882);
tmp6050 = _mm512_fmadd_ps(in873, _mm512_set1_ps(-1.25e+00f), tmp6050);
tmp6054 = _mm512_fmadd_ps(in881, _mm512_set1_ps(-1.25e+00f), tmp6054);
in873 = _mm512_fmadd_ps(in873, _mm512_set1_ps(-5e+00f), in871);
in881 = _mm512_fmadd_ps(in881, _mm512_set1_ps(-5e+00f), in879);
tmp6049 = _mm512_fmadd_ps(in872, _mm512_set1_ps(-1.25e+00f), tmp6049);
tmp6053 = _mm512_fmadd_ps(in880, _mm512_set1_ps(-1.25e+00f), tmp6053);
in875 = _mm512_fmadd_ps(tmp6049, _mm512_set1_ps(2e+00f), tmp6050);
in883 = _mm512_fmadd_ps(tmp6053, _mm512_set1_ps(2e+00f), tmp6054);
tmp6050 = _mm512_fnmadd_ps(tmp6049, _mm512_set1_ps(2e+00f), tmp6050);
tmp6054 = _mm512_fnmadd_ps(tmp6053, _mm512_set1_ps(2e+00f), tmp6054);
tmp6049 = _mm512_fmadd_ps(in874, _mm512_set1_ps(2.5e-01f), in870);
tmp6053 = _mm512_fmadd_ps(in882, _mm512_set1_ps(2.5e-01f), in878);
in870 = _mm512_sub_ps(in876, in870);
in878 = _mm512_sub_ps(in884, in878);
tmp6049 = _mm512_fmadd_ps(in872, _mm512_set1_ps(-1.25e+00f), tmp6049);
tmp6053 = _mm512_fmadd_ps(in880, _mm512_set1_ps(-1.25e+00f), tmp6053);
in872 = _mm512_sub_ps(in872, in874);
in880 = _mm512_sub_ps(in880, in882);
in872 = _mm512_fmadd_ps(in872, _mm512_set1_ps(5.25e+00f), in870);
in880 = _mm512_fmadd_ps(in880, _mm512_set1_ps(5.25e+00f), in878);
in871 = _mm512_fmadd_ps(tmp6049, _mm512_set1_ps(2e+00f), in873);
in879 = _mm512_fmadd_ps(tmp6053, _mm512_set1_ps(2e+00f), in881);
in873 = _mm512_fnmadd_ps(tmp6049, _mm512_set1_ps(2e+00f), in873);
in881 = _mm512_fnmadd_ps(tmp6053, _mm512_set1_ps(2e+00f), in881);
__m512 tmp6065 = _mm512_unpacklo_ps(in869, tmp6051);
__m512 tmp6066 = _mm512_unpackhi_ps(in869, tmp6051);
__m512 tmp6067 = _mm512_unpacklo_ps(tmp6052, in875);
__m512 tmp6068 = _mm512_unpackhi_ps(tmp6052, in875);
__m512 tmp6069 = _mm512_unpacklo_ps(tmp6050, in871);
__m512 tmp6070 = _mm512_unpackhi_ps(tmp6050, in871);
__m512 tmp6071 = _mm512_unpacklo_ps(in873, in872);
__m512 tmp6072 = _mm512_unpackhi_ps(in873, in872);
__m512 tmp6073 = _mm512_unpacklo_ps(in877, tmp6055);
__m512 tmp6074 = _mm512_unpackhi_ps(in877, tmp6055);
__m512 tmp6075 = _mm512_unpacklo_ps(tmp6056, in883);
__m512 tmp6076 = _mm512_unpackhi_ps(tmp6056, in883);
__m512 tmp6077 = _mm512_unpacklo_ps(tmp6054, in879);
__m512 tmp6078 = _mm512_unpackhi_ps(tmp6054, in879);
__m512 tmp6079 = _mm512_unpacklo_ps(in881, in880);
__m512 tmp6080 = _mm512_unpackhi_ps(in881, in880);
__m512 tmp6081 = _mm512_shuffle_ps(tmp6065, tmp6067, 68);
__m512 tmp6082 = _mm512_shuffle_ps(tmp6065, tmp6067, 238);
__m512 tmp6083 = _mm512_shuffle_ps(tmp6066, tmp6068, 68);
__m512 tmp6084 = _mm512_shuffle_ps(tmp6066, tmp6068, 238);
__m512 tmp6085 = _mm512_shuffle_ps(tmp6069, tmp6071, 68);
__m512 tmp6086 = _mm512_shuffle_ps(tmp6069, tmp6071, 238);
__m512 tmp6087 = _mm512_shuffle_ps(tmp6070, tmp6072, 68);
__m512 tmp6088 = _mm512_shuffle_ps(tmp6070, tmp6072, 238);
__m512 tmp6089 = _mm512_shuffle_ps(tmp6073, tmp6075, 68);
__m512 tmp6090 = _mm512_shuffle_ps(tmp6073, tmp6075, 238);
__m512 tmp6091 = _mm512_shuffle_ps(tmp6074, tmp6076, 68);
__m512 tmp6092 = _mm512_shuffle_ps(tmp6074, tmp6076, 238);
__m512 tmp6093 = _mm512_shuffle_ps(tmp6077, tmp6079, 68);
__m512 tmp6094 = _mm512_shuffle_ps(tmp6077, tmp6079, 238);
__m512 tmp6095 = _mm512_shuffle_ps(tmp6078, tmp6080, 68);
__m512 tmp6096 = _mm512_shuffle_ps(tmp6078, tmp6080, 238);
__m512 tmp6097 = _mm512_shuffle_f32x4(tmp6081, tmp6085, 136);
__m512 tmp6098 = _mm512_shuffle_f32x4(tmp6081, tmp6085, 221);
__m512 tmp6099 = _mm512_shuffle_f32x4(tmp6082, tmp6086, 136);
__m512 tmp6100 = _mm512_shuffle_f32x4(tmp6082, tmp6086, 221);
__m512 tmp6101 = _mm512_shuffle_f32x4(tmp6083, tmp6087, 136);
__m512 tmp6102 = _mm512_shuffle_f32x4(tmp6083, tmp6087, 221);
__m512 tmp6103 = _mm512_shuffle_f32x4(tmp6084, tmp6088, 136);
__m512 tmp6104 = _mm512_shuffle_f32x4(tmp6084, tmp6088, 221);
__m512 tmp6105 = _mm512_shuffle_f32x4(tmp6089, tmp6093, 136);
__m512 tmp6106 = _mm512_shuffle_f32x4(tmp6089, tmp6093, 221);
__m512 tmp6107 = _mm512_shuffle_f32x4(tmp6090, tmp6094, 136);
__m512 tmp6108 = _mm512_shuffle_f32x4(tmp6090, tmp6094, 221);
__m512 tmp6109 = _mm512_shuffle_f32x4(tmp6091, tmp6095, 136);
__m512 tmp6110 = _mm512_shuffle_f32x4(tmp6091, tmp6095, 221);
__m512 tmp6111 = _mm512_shuffle_f32x4(tmp6092, tmp6096, 136);
__m512 tmp6112 = _mm512_shuffle_f32x4(tmp6092, tmp6096, 221);
in869 = _mm512_shuffle_f32x4(tmp6097, tmp6105, 136);
in877 = _mm512_shuffle_f32x4(tmp6097, tmp6105, 221);
tmp6051 = _mm512_shuffle_f32x4(tmp6099, tmp6107, 136);
tmp6055 = _mm512_shuffle_f32x4(tmp6099, tmp6107, 221);
tmp6052 = _mm512_shuffle_f32x4(tmp6101, tmp6109, 136);
tmp6056 = _mm512_shuffle_f32x4(tmp6101, tmp6109, 221);
in875 = _mm512_shuffle_f32x4(tmp6103, tmp6111, 136);
in883 = _mm512_shuffle_f32x4(tmp6103, tmp6111, 221);
tmp6050 = _mm512_shuffle_f32x4(tmp6098, tmp6106, 136);
tmp6054 = _mm512_shuffle_f32x4(tmp6098, tmp6106, 221);
in871 = _mm512_shuffle_f32x4(tmp6100, tmp6108, 136);
in879 = _mm512_shuffle_f32x4(tmp6100, tmp6108, 221);
in873 = _mm512_shuffle_f32x4(tmp6102, tmp6110, 136);
in881 = _mm512_shuffle_f32x4(tmp6102, tmp6110, 221);
in872 = _mm512_shuffle_f32x4(tmp6104, tmp6112, 136);
in880 = _mm512_shuffle_f32x4(tmp6104, tmp6112, 221);
__m512 tmp6057 = _mm512_add_ps(tmp6051, in871);
__m512 tmp6061 = _mm512_add_ps(tmp6055, in879);
__m512 tmp6058 = _mm512_sub_ps(tmp6050, tmp6052);
__m512 tmp6062 = _mm512_sub_ps(tmp6054, tmp6056);
__m512 tmp6059 = _mm512_add_ps(tmp6052, in873);
__m512 tmp6063 = _mm512_add_ps(tmp6056, in881);
in869 = _mm512_sub_ps(in869, in873);
in877 = _mm512_sub_ps(in877, in881);
tmp6057 = _mm512_fmadd_ps(in875, _mm512_set1_ps(-4.25e+00f), tmp6057);
tmp6061 = _mm512_fmadd_ps(in883, _mm512_set1_ps(-4.25e+00f), tmp6061);
tmp6059 = _mm512_fmadd_ps(tmp6050, _mm512_set1_ps(-4.25e+00f), tmp6059);
tmp6063 = _mm512_fmadd_ps(tmp6054, _mm512_set1_ps(-4.25e+00f), tmp6063);
in869 = _mm512_fmadd_ps(tmp6058, _mm512_set1_ps(5.25e+00f), in869);
in877 = _mm512_fmadd_ps(tmp6062, _mm512_set1_ps(5.25e+00f), in877);
tmp6058 = _mm512_fmadd_ps(tmp6052, _mm512_set1_ps(2.5e-01f), in873);
tmp6062 = _mm512_fmadd_ps(tmp6056, _mm512_set1_ps(2.5e-01f), in881);
tmp6052 = _mm512_fmadd_ps(tmp6052, _mm512_set1_ps(4e+00f), in873);
tmp6056 = _mm512_fmadd_ps(tmp6056, _mm512_set1_ps(4e+00f), in881);
__m512 tmp6060 = _mm512_sub_ps(tmp6059, tmp6057);
__m512 tmp6064 = _mm512_sub_ps(tmp6063, tmp6061);
tmp6059 = _mm512_add_ps(tmp6057, tmp6059);
tmp6063 = _mm512_add_ps(tmp6061, tmp6063);
tmp6057 = _mm512_fmadd_ps(tmp6051, _mm512_set1_ps(2.5e-01f), in871);
tmp6061 = _mm512_fmadd_ps(tmp6055, _mm512_set1_ps(2.5e-01f), in879);
tmp6058 = _mm512_fmadd_ps(tmp6050, _mm512_set1_ps(-1.25e+00f), tmp6058);
tmp6062 = _mm512_fmadd_ps(tmp6054, _mm512_set1_ps(-1.25e+00f), tmp6062);
tmp6050 = _mm512_fmadd_ps(tmp6050, _mm512_set1_ps(-5e+00f), tmp6052);
tmp6054 = _mm512_fmadd_ps(tmp6054, _mm512_set1_ps(-5e+00f), tmp6056);
tmp6057 = _mm512_fmadd_ps(in875, _mm512_set1_ps(-1.25e+00f), tmp6057);
tmp6061 = _mm512_fmadd_ps(in883, _mm512_set1_ps(-1.25e+00f), tmp6061);
in873 = _mm512_fmadd_ps(tmp6057, _mm512_set1_ps(2e+00f), tmp6058);
in881 = _mm512_fmadd_ps(tmp6061, _mm512_set1_ps(2e+00f), tmp6062);
tmp6058 = _mm512_fnmadd_ps(tmp6057, _mm512_set1_ps(2e+00f), tmp6058);
tmp6062 = _mm512_fnmadd_ps(tmp6061, _mm512_set1_ps(2e+00f), tmp6062);
tmp6057 = _mm512_fmadd_ps(in871, _mm512_set1_ps(2.5e-01f), tmp6051);
tmp6061 = _mm512_fmadd_ps(in879, _mm512_set1_ps(2.5e-01f), tmp6055);
tmp6051 = _mm512_sub_ps(in872, tmp6051);
tmp6055 = _mm512_sub_ps(in880, tmp6055);
tmp6057 = _mm512_fmadd_ps(in875, _mm512_set1_ps(-1.25e+00f), tmp6057);
tmp6061 = _mm512_fmadd_ps(in883, _mm512_set1_ps(-1.25e+00f), tmp6061);
in875 = _mm512_sub_ps(in875, in871);
in883 = _mm512_sub_ps(in883, in879);
in875 = _mm512_fmadd_ps(in875, _mm512_set1_ps(5.25e+00f), tmp6051);
in883 = _mm512_fmadd_ps(in883, _mm512_set1_ps(5.25e+00f), tmp6055);
tmp6052 = _mm512_fmadd_ps(tmp6057, _mm512_set1_ps(2e+00f), tmp6050);
tmp6056 = _mm512_fmadd_ps(tmp6061, _mm512_set1_ps(2e+00f), tmp6054);
tmp6050 = _mm512_fnmadd_ps(tmp6057, _mm512_set1_ps(2e+00f), tmp6050);
tmp6054 = _mm512_fnmadd_ps(tmp6061, _mm512_set1_ps(2e+00f), tmp6054);
__m512 out819 = _mm512_shuffle_f32x4(in869, tmp6059, 68);
__m512 out827 = _mm512_shuffle_f32x4(in869, tmp6059, 238);
__m512 out820 = _mm512_shuffle_f32x4(tmp6060, in873, 68);
__m512 out828 = _mm512_shuffle_f32x4(tmp6060, in873, 238);
__m512 out821 = _mm512_shuffle_f32x4(tmp6058, tmp6052, 68);
__m512 out829 = _mm512_shuffle_f32x4(tmp6058, tmp6052, 238);
__m512 out822 = _mm512_shuffle_f32x4(tmp6050, in875, 68);
__m512 out830 = _mm512_shuffle_f32x4(tmp6050, in875, 238);
__m512 out823 = _mm512_shuffle_f32x4(in877, tmp6063, 68);
__m512 out831 = _mm512_shuffle_f32x4(in877, tmp6063, 238);
__m512 out824 = _mm512_shuffle_f32x4(tmp6064, in881, 68);
__m512 out832 = _mm512_shuffle_f32x4(tmp6064, in881, 238);
__m512 out825 = _mm512_shuffle_f32x4(tmp6062, tmp6056, 68);
__m512 out833 = _mm512_shuffle_f32x4(tmp6062, tmp6056, 238);
__m512 out826 = _mm512_shuffle_f32x4(tmp6054, in883, 68);
__m512 out834 = _mm512_shuffle_f32x4(tmp6054, in883, 238);
_mm512_storeu_ps(dfPtr6+256+1638400*i26+24576*j21+24576*s20+768*k82, out819);
_mm512_storeu_ps(dfPtr6+384+1638400*i26+24576*j21+24576*s20+768*k82, out827);
_mm512_storeu_ps(dfPtr6+320+1638400*i26+24576*j21+24576*s20+768*k82, out823);
_mm512_storeu_ps(dfPtr6+448+1638400*i26+24576*j21+24576*s20+768*k82, out831);
_mm512_storeu_ps(dfPtr6+409856+1638400*i26+24576*j21+24576*s20+768*k82, out820);
_mm512_storeu_ps(dfPtr6+409984+1638400*i26+24576*j21+24576*s20+768*k82, out828);
_mm512_storeu_ps(dfPtr6+409920+1638400*i26+24576*j21+24576*s20+768*k82, out824);
_mm512_storeu_ps(dfPtr6+410048+1638400*i26+24576*j21+24576*s20+768*k82, out832);
_mm512_storeu_ps(dfPtr6+819456+1638400*i26+24576*j21+24576*s20+768*k82, out821);
_mm512_storeu_ps(dfPtr6+819584+1638400*i26+24576*j21+24576*s20+768*k82, out829);
_mm512_storeu_ps(dfPtr6+819520+1638400*i26+24576*j21+24576*s20+768*k82, out825);
_mm512_storeu_ps(dfPtr6+819648+1638400*i26+24576*j21+24576*s20+768*k82, out833);
_mm512_storeu_ps(dfPtr6+1229056+1638400*i26+24576*j21+24576*s20+768*k82, out822);
_mm512_storeu_ps(dfPtr6+1229184+1638400*i26+24576*j21+24576*s20+768*k82, out830);
_mm512_storeu_ps(dfPtr6+1229120+1638400*i26+24576*j21+24576*s20+768*k82, out826);
_mm512_storeu_ps(dfPtr6+1229248+1638400*i26+24576*j21+24576*s20+768*k82, out834);
__m512 dat1413 = _mm512_maskz_loadu_ps(16383, datPtr12+12656+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1414 = _mm512_maskz_loadu_ps(16383, datPtr12+12704+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512i pm123 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in885 = _mm512_permutexvar_ps(pm123, dat1413);
__m512 in893 = _mm512_permutexvar_ps(pm123, dat1414);
__m512 dat1415 = _mm512_maskz_loadu_ps(16383, datPtr12+12880+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1416 = _mm512_maskz_loadu_ps(16383, datPtr12+12928+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in886 = _mm512_permutexvar_ps(pm123, dat1415);
__m512 in894 = _mm512_permutexvar_ps(pm123, dat1416);
__m512 dat1417 = _mm512_maskz_loadu_ps(16383, datPtr12+13104+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1418 = _mm512_maskz_loadu_ps(16383, datPtr12+13152+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in887 = _mm512_permutexvar_ps(pm123, dat1417);
__m512 in895 = _mm512_permutexvar_ps(pm123, dat1418);
__m512 dat1419 = _mm512_maskz_loadu_ps(16383, datPtr12+13328+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1420 = _mm512_maskz_loadu_ps(16383, datPtr12+13376+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in888 = _mm512_permutexvar_ps(pm123, dat1419);
__m512 in896 = _mm512_permutexvar_ps(pm123, dat1420);
__m512 dat1421 = _mm512_maskz_loadu_ps(16383, datPtr12+13552+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1422 = _mm512_maskz_loadu_ps(16383, datPtr12+13600+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in889 = _mm512_permutexvar_ps(pm123, dat1421);
__m512 in897 = _mm512_permutexvar_ps(pm123, dat1422);
__m512 dat1423 = _mm512_maskz_loadu_ps(16383, datPtr12+13776+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1424 = _mm512_maskz_loadu_ps(16383, datPtr12+13824+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in890 = _mm512_permutexvar_ps(pm123, dat1423);
__m512 in898 = _mm512_permutexvar_ps(pm123, dat1424);
__m512 dat1425 = _mm512_maskz_loadu_ps(16383, datPtr12+14000+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1426 = _mm512_maskz_loadu_ps(16383, datPtr12+14048+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in891 = _mm512_permutexvar_ps(pm123, dat1425);
__m512 in899 = _mm512_permutexvar_ps(pm123, dat1426);
__m512 dat1427 = _mm512_maskz_loadu_ps(16383, datPtr12+14224+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 dat1428 = _mm512_maskz_loadu_ps(16383, datPtr12+14272+806912*i26+224*h31+4*w38+806912*s20+25216*k82);
__m512 in892 = _mm512_permutexvar_ps(pm123, dat1427);
__m512 in900 = _mm512_permutexvar_ps(pm123, dat1428);
__m512 tmp6113 = _mm512_add_ps(in886, in890);
__m512 tmp6117 = _mm512_add_ps(in894, in898);
__m512 tmp6114 = _mm512_sub_ps(in889, in887);
__m512 tmp6118 = _mm512_sub_ps(in897, in895);
__m512 tmp6115 = _mm512_add_ps(in887, in891);
__m512 tmp6119 = _mm512_add_ps(in895, in899);
in885 = _mm512_sub_ps(in885, in891);
in893 = _mm512_sub_ps(in893, in899);
tmp6113 = _mm512_fmadd_ps(in888, _mm512_set1_ps(-4.25e+00f), tmp6113);
tmp6117 = _mm512_fmadd_ps(in896, _mm512_set1_ps(-4.25e+00f), tmp6117);
tmp6115 = _mm512_fmadd_ps(in889, _mm512_set1_ps(-4.25e+00f), tmp6115);
tmp6119 = _mm512_fmadd_ps(in897, _mm512_set1_ps(-4.25e+00f), tmp6119);
in885 = _mm512_fmadd_ps(tmp6114, _mm512_set1_ps(5.25e+00f), in885);
in893 = _mm512_fmadd_ps(tmp6118, _mm512_set1_ps(5.25e+00f), in893);
tmp6114 = _mm512_fmadd_ps(in887, _mm512_set1_ps(2.5e-01f), in891);
tmp6118 = _mm512_fmadd_ps(in895, _mm512_set1_ps(2.5e-01f), in899);
in887 = _mm512_fmadd_ps(in887, _mm512_set1_ps(4e+00f), in891);
in895 = _mm512_fmadd_ps(in895, _mm512_set1_ps(4e+00f), in899);
__m512 tmp6116 = _mm512_sub_ps(tmp6115, tmp6113);
__m512 tmp6120 = _mm512_sub_ps(tmp6119, tmp6117);
tmp6115 = _mm512_add_ps(tmp6113, tmp6115);
tmp6119 = _mm512_add_ps(tmp6117, tmp6119);
tmp6113 = _mm512_fmadd_ps(in886, _mm512_set1_ps(2.5e-01f), in890);
tmp6117 = _mm512_fmadd_ps(in894, _mm512_set1_ps(2.5e-01f), in898);
tmp6114 = _mm512_fmadd_ps(in889, _mm512_set1_ps(-1.25e+00f), tmp6114);
tmp6118 = _mm512_fmadd_ps(in897, _mm512_set1_ps(-1.25e+00f), tmp6118);
in889 = _mm512_fmadd_ps(in889, _mm512_set1_ps(-5e+00f), in887);
in897 = _mm512_fmadd_ps(in897, _mm512_set1_ps(-5e+00f), in895);
tmp6113 = _mm512_fmadd_ps(in888, _mm512_set1_ps(-1.25e+00f), tmp6113);
tmp6117 = _mm512_fmadd_ps(in896, _mm512_set1_ps(-1.25e+00f), tmp6117);
in891 = _mm512_fmadd_ps(tmp6113, _mm512_set1_ps(2e+00f), tmp6114);
in899 = _mm512_fmadd_ps(tmp6117, _mm512_set1_ps(2e+00f), tmp6118);
tmp6114 = _mm512_fnmadd_ps(tmp6113, _mm512_set1_ps(2e+00f), tmp6114);
tmp6118 = _mm512_fnmadd_ps(tmp6117, _mm512_set1_ps(2e+00f), tmp6118);
tmp6113 = _mm512_fmadd_ps(in890, _mm512_set1_ps(2.5e-01f), in886);
tmp6117 = _mm512_fmadd_ps(in898, _mm512_set1_ps(2.5e-01f), in894);
in886 = _mm512_sub_ps(in892, in886);
in894 = _mm512_sub_ps(in900, in894);
tmp6113 = _mm512_fmadd_ps(in888, _mm512_set1_ps(-1.25e+00f), tmp6113);
tmp6117 = _mm512_fmadd_ps(in896, _mm512_set1_ps(-1.25e+00f), tmp6117);
in888 = _mm512_sub_ps(in888, in890);
in896 = _mm512_sub_ps(in896, in898);
in888 = _mm512_fmadd_ps(in888, _mm512_set1_ps(5.25e+00f), in886);
in896 = _mm512_fmadd_ps(in896, _mm512_set1_ps(5.25e+00f), in894);
in887 = _mm512_fmadd_ps(tmp6113, _mm512_set1_ps(2e+00f), in889);
in895 = _mm512_fmadd_ps(tmp6117, _mm512_set1_ps(2e+00f), in897);
in889 = _mm512_fnmadd_ps(tmp6113, _mm512_set1_ps(2e+00f), in889);
in897 = _mm512_fnmadd_ps(tmp6117, _mm512_set1_ps(2e+00f), in897);
__m512 tmp6129 = _mm512_unpacklo_ps(in885, tmp6115);
__m512 tmp6130 = _mm512_unpackhi_ps(in885, tmp6115);
__m512 tmp6131 = _mm512_unpacklo_ps(tmp6116, in891);
__m512 tmp6132 = _mm512_unpackhi_ps(tmp6116, in891);
__m512 tmp6133 = _mm512_unpacklo_ps(tmp6114, in887);
__m512 tmp6134 = _mm512_unpackhi_ps(tmp6114, in887);
__m512 tmp6135 = _mm512_unpacklo_ps(in889, in888);
__m512 tmp6136 = _mm512_unpackhi_ps(in889, in888);
__m512 tmp6137 = _mm512_unpacklo_ps(in893, tmp6119);
__m512 tmp6138 = _mm512_unpackhi_ps(in893, tmp6119);
__m512 tmp6139 = _mm512_unpacklo_ps(tmp6120, in899);
__m512 tmp6140 = _mm512_unpackhi_ps(tmp6120, in899);
__m512 tmp6141 = _mm512_unpacklo_ps(tmp6118, in895);
__m512 tmp6142 = _mm512_unpackhi_ps(tmp6118, in895);
__m512 tmp6143 = _mm512_unpacklo_ps(in897, in896);
__m512 tmp6144 = _mm512_unpackhi_ps(in897, in896);
__m512 tmp6145 = _mm512_shuffle_ps(tmp6129, tmp6131, 68);
__m512 tmp6146 = _mm512_shuffle_ps(tmp6129, tmp6131, 238);
__m512 tmp6147 = _mm512_shuffle_ps(tmp6130, tmp6132, 68);
__m512 tmp6148 = _mm512_shuffle_ps(tmp6130, tmp6132, 238);
__m512 tmp6149 = _mm512_shuffle_ps(tmp6133, tmp6135, 68);
__m512 tmp6150 = _mm512_shuffle_ps(tmp6133, tmp6135, 238);
__m512 tmp6151 = _mm512_shuffle_ps(tmp6134, tmp6136, 68);
__m512 tmp6152 = _mm512_shuffle_ps(tmp6134, tmp6136, 238);
__m512 tmp6153 = _mm512_shuffle_ps(tmp6137, tmp6139, 68);
__m512 tmp6154 = _mm512_shuffle_ps(tmp6137, tmp6139, 238);
__m512 tmp6155 = _mm512_shuffle_ps(tmp6138, tmp6140, 68);
__m512 tmp6156 = _mm512_shuffle_ps(tmp6138, tmp6140, 238);
__m512 tmp6157 = _mm512_shuffle_ps(tmp6141, tmp6143, 68);
__m512 tmp6158 = _mm512_shuffle_ps(tmp6141, tmp6143, 238);
__m512 tmp6159 = _mm512_shuffle_ps(tmp6142, tmp6144, 68);
__m512 tmp6160 = _mm512_shuffle_ps(tmp6142, tmp6144, 238);
__m512 tmp6161 = _mm512_shuffle_f32x4(tmp6145, tmp6149, 136);
__m512 tmp6162 = _mm512_shuffle_f32x4(tmp6145, tmp6149, 221);
__m512 tmp6163 = _mm512_shuffle_f32x4(tmp6146, tmp6150, 136);
__m512 tmp6164 = _mm512_shuffle_f32x4(tmp6146, tmp6150, 221);
__m512 tmp6165 = _mm512_shuffle_f32x4(tmp6147, tmp6151, 136);
__m512 tmp6166 = _mm512_shuffle_f32x4(tmp6147, tmp6151, 221);
__m512 tmp6167 = _mm512_shuffle_f32x4(tmp6148, tmp6152, 136);
__m512 tmp6168 = _mm512_shuffle_f32x4(tmp6148, tmp6152, 221);
__m512 tmp6169 = _mm512_shuffle_f32x4(tmp6153, tmp6157, 136);
__m512 tmp6170 = _mm512_shuffle_f32x4(tmp6153, tmp6157, 221);
__m512 tmp6171 = _mm512_shuffle_f32x4(tmp6154, tmp6158, 136);
__m512 tmp6172 = _mm512_shuffle_f32x4(tmp6154, tmp6158, 221);
__m512 tmp6173 = _mm512_shuffle_f32x4(tmp6155, tmp6159, 136);
__m512 tmp6174 = _mm512_shuffle_f32x4(tmp6155, tmp6159, 221);
__m512 tmp6175 = _mm512_shuffle_f32x4(tmp6156, tmp6160, 136);
__m512 tmp6176 = _mm512_shuffle_f32x4(tmp6156, tmp6160, 221);
in885 = _mm512_shuffle_f32x4(tmp6161, tmp6169, 136);
in893 = _mm512_shuffle_f32x4(tmp6161, tmp6169, 221);
tmp6115 = _mm512_shuffle_f32x4(tmp6163, tmp6171, 136);
tmp6119 = _mm512_shuffle_f32x4(tmp6163, tmp6171, 221);
tmp6116 = _mm512_shuffle_f32x4(tmp6165, tmp6173, 136);
tmp6120 = _mm512_shuffle_f32x4(tmp6165, tmp6173, 221);
in891 = _mm512_shuffle_f32x4(tmp6167, tmp6175, 136);
in899 = _mm512_shuffle_f32x4(tmp6167, tmp6175, 221);
tmp6114 = _mm512_shuffle_f32x4(tmp6162, tmp6170, 136);
tmp6118 = _mm512_shuffle_f32x4(tmp6162, tmp6170, 221);
in887 = _mm512_shuffle_f32x4(tmp6164, tmp6172, 136);
in895 = _mm512_shuffle_f32x4(tmp6164, tmp6172, 221);
in889 = _mm512_shuffle_f32x4(tmp6166, tmp6174, 136);
in897 = _mm512_shuffle_f32x4(tmp6166, tmp6174, 221);
in888 = _mm512_shuffle_f32x4(tmp6168, tmp6176, 136);
in896 = _mm512_shuffle_f32x4(tmp6168, tmp6176, 221);
__m512 tmp6121 = _mm512_add_ps(tmp6115, in887);
__m512 tmp6125 = _mm512_add_ps(tmp6119, in895);
__m512 tmp6122 = _mm512_sub_ps(tmp6114, tmp6116);
__m512 tmp6126 = _mm512_sub_ps(tmp6118, tmp6120);
__m512 tmp6123 = _mm512_add_ps(tmp6116, in889);
__m512 tmp6127 = _mm512_add_ps(tmp6120, in897);
in885 = _mm512_sub_ps(in885, in889);
in893 = _mm512_sub_ps(in893, in897);
tmp6121 = _mm512_fmadd_ps(in891, _mm512_set1_ps(-4.25e+00f), tmp6121);
tmp6125 = _mm512_fmadd_ps(in899, _mm512_set1_ps(-4.25e+00f), tmp6125);
tmp6123 = _mm512_fmadd_ps(tmp6114, _mm512_set1_ps(-4.25e+00f), tmp6123);
tmp6127 = _mm512_fmadd_ps(tmp6118, _mm512_set1_ps(-4.25e+00f), tmp6127);
in885 = _mm512_fmadd_ps(tmp6122, _mm512_set1_ps(5.25e+00f), in885);
in893 = _mm512_fmadd_ps(tmp6126, _mm512_set1_ps(5.25e+00f), in893);
tmp6122 = _mm512_fmadd_ps(tmp6116, _mm512_set1_ps(2.5e-01f), in889);
tmp6126 = _mm512_fmadd_ps(tmp6120, _mm512_set1_ps(2.5e-01f), in897);
tmp6116 = _mm512_fmadd_ps(tmp6116, _mm512_set1_ps(4e+00f), in889);
tmp6120 = _mm512_fmadd_ps(tmp6120, _mm512_set1_ps(4e+00f), in897);
__m512 tmp6124 = _mm512_sub_ps(tmp6123, tmp6121);
__m512 tmp6128 = _mm512_sub_ps(tmp6127, tmp6125);
tmp6123 = _mm512_add_ps(tmp6121, tmp6123);
tmp6127 = _mm512_add_ps(tmp6125, tmp6127);
tmp6121 = _mm512_fmadd_ps(tmp6115, _mm512_set1_ps(2.5e-01f), in887);
tmp6125 = _mm512_fmadd_ps(tmp6119, _mm512_set1_ps(2.5e-01f), in895);
tmp6122 = _mm512_fmadd_ps(tmp6114, _mm512_set1_ps(-1.25e+00f), tmp6122);
tmp6126 = _mm512_fmadd_ps(tmp6118, _mm512_set1_ps(-1.25e+00f), tmp6126);
tmp6114 = _mm512_fmadd_ps(tmp6114, _mm512_set1_ps(-5e+00f), tmp6116);
tmp6118 = _mm512_fmadd_ps(tmp6118, _mm512_set1_ps(-5e+00f), tmp6120);
tmp6121 = _mm512_fmadd_ps(in891, _mm512_set1_ps(-1.25e+00f), tmp6121);
tmp6125 = _mm512_fmadd_ps(in899, _mm512_set1_ps(-1.25e+00f), tmp6125);
in889 = _mm512_fmadd_ps(tmp6121, _mm512_set1_ps(2e+00f), tmp6122);
in897 = _mm512_fmadd_ps(tmp6125, _mm512_set1_ps(2e+00f), tmp6126);
tmp6122 = _mm512_fnmadd_ps(tmp6121, _mm512_set1_ps(2e+00f), tmp6122);
tmp6126 = _mm512_fnmadd_ps(tmp6125, _mm512_set1_ps(2e+00f), tmp6126);
tmp6121 = _mm512_fmadd_ps(in887, _mm512_set1_ps(2.5e-01f), tmp6115);
tmp6125 = _mm512_fmadd_ps(in895, _mm512_set1_ps(2.5e-01f), tmp6119);
tmp6115 = _mm512_sub_ps(in888, tmp6115);
tmp6119 = _mm512_sub_ps(in896, tmp6119);
tmp6121 = _mm512_fmadd_ps(in891, _mm512_set1_ps(-1.25e+00f), tmp6121);
tmp6125 = _mm512_fmadd_ps(in899, _mm512_set1_ps(-1.25e+00f), tmp6125);
in891 = _mm512_sub_ps(in891, in887);
in899 = _mm512_sub_ps(in899, in895);
in891 = _mm512_fmadd_ps(in891, _mm512_set1_ps(5.25e+00f), tmp6115);
in899 = _mm512_fmadd_ps(in899, _mm512_set1_ps(5.25e+00f), tmp6119);
tmp6116 = _mm512_fmadd_ps(tmp6121, _mm512_set1_ps(2e+00f), tmp6114);
tmp6120 = _mm512_fmadd_ps(tmp6125, _mm512_set1_ps(2e+00f), tmp6118);
tmp6114 = _mm512_fnmadd_ps(tmp6121, _mm512_set1_ps(2e+00f), tmp6114);
tmp6118 = _mm512_fnmadd_ps(tmp6125, _mm512_set1_ps(2e+00f), tmp6118);
__m512 out835 = _mm512_shuffle_f32x4(in885, tmp6123, 68);
__m512 out843 = _mm512_shuffle_f32x4(in885, tmp6123, 238);
__m512 out836 = _mm512_shuffle_f32x4(tmp6124, in889, 68);
__m512 out844 = _mm512_shuffle_f32x4(tmp6124, in889, 238);
__m512 out837 = _mm512_shuffle_f32x4(tmp6122, tmp6116, 68);
__m512 out845 = _mm512_shuffle_f32x4(tmp6122, tmp6116, 238);
__m512 out838 = _mm512_shuffle_f32x4(tmp6114, in891, 68);
__m512 out846 = _mm512_shuffle_f32x4(tmp6114, in891, 238);
__m512 out839 = _mm512_shuffle_f32x4(in893, tmp6127, 68);
__m512 out847 = _mm512_shuffle_f32x4(in893, tmp6127, 238);
__m512 out840 = _mm512_shuffle_f32x4(tmp6128, in897, 68);
__m512 out848 = _mm512_shuffle_f32x4(tmp6128, in897, 238);
__m512 out841 = _mm512_shuffle_f32x4(tmp6126, tmp6120, 68);
__m512 out849 = _mm512_shuffle_f32x4(tmp6126, tmp6120, 238);
__m512 out842 = _mm512_shuffle_f32x4(tmp6118, in899, 68);
__m512 out850 = _mm512_shuffle_f32x4(tmp6118, in899, 238);
_mm512_storeu_ps(dfPtr6+512+1638400*i26+24576*j21+24576*s20+768*k82, out835);
_mm512_storeu_ps(dfPtr6+640+1638400*i26+24576*j21+24576*s20+768*k82, out843);
_mm512_storeu_ps(dfPtr6+576+1638400*i26+24576*j21+24576*s20+768*k82, out839);
_mm512_storeu_ps(dfPtr6+704+1638400*i26+24576*j21+24576*s20+768*k82, out847);
_mm512_storeu_ps(dfPtr6+410112+1638400*i26+24576*j21+24576*s20+768*k82, out836);
_mm512_storeu_ps(dfPtr6+410240+1638400*i26+24576*j21+24576*s20+768*k82, out844);
_mm512_storeu_ps(dfPtr6+410176+1638400*i26+24576*j21+24576*s20+768*k82, out840);
_mm512_storeu_ps(dfPtr6+410304+1638400*i26+24576*j21+24576*s20+768*k82, out848);
_mm512_storeu_ps(dfPtr6+819712+1638400*i26+24576*j21+24576*s20+768*k82, out837);
_mm512_storeu_ps(dfPtr6+819840+1638400*i26+24576*j21+24576*s20+768*k82, out845);
_mm512_storeu_ps(dfPtr6+819776+1638400*i26+24576*j21+24576*s20+768*k82, out841);
_mm512_storeu_ps(dfPtr6+819904+1638400*i26+24576*j21+24576*s20+768*k82, out849);
_mm512_storeu_ps(dfPtr6+1229312+1638400*i26+24576*j21+24576*s20+768*k82, out838);
_mm512_storeu_ps(dfPtr6+1229440+1638400*i26+24576*j21+24576*s20+768*k82, out846);
_mm512_storeu_ps(dfPtr6+1229376+1638400*i26+24576*j21+24576*s20+768*k82, out842);
_mm512_storeu_ps(dfPtr6+1229504+1638400*i26+24576*j21+24576*s20+768*k82, out850);
}
if (j21 >= last5) return;
++j21;
rel14 = 1;
}
ptrdiff_t h32 = base14+0;
ptrdiff_t w39 = 48;
ptrdiff_t k83 = 0;
for (; k83 != 32; ++k83) {
__m512 dat1429 = _mm512_maskz_loadu_ps(511, datPtr12+0+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1430 = _mm512_maskz_loadu_ps(8191, datPtr12+1156+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512i pm124 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in901 = _mm512_permutexvar_ps(pm124, dat1429);
__m512i pm125 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in909 = _mm512_permutexvar_ps(pm125, dat1430);
__m512 dat1431 = _mm512_maskz_loadu_ps(511, datPtr12+224+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1432 = _mm512_maskz_loadu_ps(8191, datPtr12+1380+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in902 = _mm512_permutexvar_ps(pm124, dat1431);
__m512 in910 = _mm512_permutexvar_ps(pm125, dat1432);
__m512 dat1433 = _mm512_maskz_loadu_ps(511, datPtr12+448+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1434 = _mm512_maskz_loadu_ps(8191, datPtr12+1604+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in903 = _mm512_permutexvar_ps(pm124, dat1433);
__m512 in911 = _mm512_permutexvar_ps(pm125, dat1434);
__m512 dat1435 = _mm512_maskz_loadu_ps(511, datPtr12+672+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1436 = _mm512_maskz_loadu_ps(8191, datPtr12+1828+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in904 = _mm512_permutexvar_ps(pm124, dat1435);
__m512 in912 = _mm512_permutexvar_ps(pm125, dat1436);
__m512 dat1437 = _mm512_maskz_loadu_ps(511, datPtr12+896+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1438 = _mm512_maskz_loadu_ps(8191, datPtr12+2052+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in905 = _mm512_permutexvar_ps(pm124, dat1437);
__m512 in913 = _mm512_permutexvar_ps(pm125, dat1438);
__m512 dat1439 = _mm512_maskz_loadu_ps(511, datPtr12+1120+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1440 = _mm512_maskz_loadu_ps(8191, datPtr12+2276+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in906 = _mm512_permutexvar_ps(pm124, dat1439);
__m512 in914 = _mm512_permutexvar_ps(pm125, dat1440);
__m512 dat1441 = _mm512_maskz_loadu_ps(511, datPtr12+1344+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1442 = _mm512_maskz_loadu_ps(8191, datPtr12+2500+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in907 = _mm512_permutexvar_ps(pm124, dat1441);
__m512 in915 = _mm512_permutexvar_ps(pm125, dat1442);
__m512 dat1443 = _mm512_maskz_loadu_ps(511, datPtr12+1568+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1444 = _mm512_maskz_loadu_ps(8191, datPtr12+2724+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in908 = _mm512_permutexvar_ps(pm124, dat1443);
__m512 in916 = _mm512_permutexvar_ps(pm125, dat1444);
__m512 tmp6177 = _mm512_add_ps(in902, in906);
__m512 tmp6181 = _mm512_add_ps(in910, in914);
__m512 tmp6178 = _mm512_sub_ps(in905, in903);
__m512 tmp6182 = _mm512_sub_ps(in913, in911);
__m512 tmp6179 = _mm512_add_ps(in903, in907);
__m512 tmp6183 = _mm512_add_ps(in911, in915);
in901 = _mm512_sub_ps(in901, in907);
in909 = _mm512_sub_ps(in909, in915);
tmp6177 = _mm512_fmadd_ps(in904, _mm512_set1_ps(-4.25e+00f), tmp6177);
tmp6181 = _mm512_fmadd_ps(in912, _mm512_set1_ps(-4.25e+00f), tmp6181);
tmp6179 = _mm512_fmadd_ps(in905, _mm512_set1_ps(-4.25e+00f), tmp6179);
tmp6183 = _mm512_fmadd_ps(in913, _mm512_set1_ps(-4.25e+00f), tmp6183);
in901 = _mm512_fmadd_ps(tmp6178, _mm512_set1_ps(5.25e+00f), in901);
in909 = _mm512_fmadd_ps(tmp6182, _mm512_set1_ps(5.25e+00f), in909);
tmp6178 = _mm512_fmadd_ps(in903, _mm512_set1_ps(2.5e-01f), in907);
tmp6182 = _mm512_fmadd_ps(in911, _mm512_set1_ps(2.5e-01f), in915);
in903 = _mm512_fmadd_ps(in903, _mm512_set1_ps(4e+00f), in907);
in911 = _mm512_fmadd_ps(in911, _mm512_set1_ps(4e+00f), in915);
__m512 tmp6180 = _mm512_sub_ps(tmp6179, tmp6177);
__m512 tmp6184 = _mm512_sub_ps(tmp6183, tmp6181);
tmp6179 = _mm512_add_ps(tmp6177, tmp6179);
tmp6183 = _mm512_add_ps(tmp6181, tmp6183);
tmp6177 = _mm512_fmadd_ps(in902, _mm512_set1_ps(2.5e-01f), in906);
tmp6181 = _mm512_fmadd_ps(in910, _mm512_set1_ps(2.5e-01f), in914);
tmp6178 = _mm512_fmadd_ps(in905, _mm512_set1_ps(-1.25e+00f), tmp6178);
tmp6182 = _mm512_fmadd_ps(in913, _mm512_set1_ps(-1.25e+00f), tmp6182);
in905 = _mm512_fmadd_ps(in905, _mm512_set1_ps(-5e+00f), in903);
in913 = _mm512_fmadd_ps(in913, _mm512_set1_ps(-5e+00f), in911);
tmp6177 = _mm512_fmadd_ps(in904, _mm512_set1_ps(-1.25e+00f), tmp6177);
tmp6181 = _mm512_fmadd_ps(in912, _mm512_set1_ps(-1.25e+00f), tmp6181);
in907 = _mm512_fmadd_ps(tmp6177, _mm512_set1_ps(2e+00f), tmp6178);
in915 = _mm512_fmadd_ps(tmp6181, _mm512_set1_ps(2e+00f), tmp6182);
tmp6178 = _mm512_fnmadd_ps(tmp6177, _mm512_set1_ps(2e+00f), tmp6178);
tmp6182 = _mm512_fnmadd_ps(tmp6181, _mm512_set1_ps(2e+00f), tmp6182);
tmp6177 = _mm512_fmadd_ps(in906, _mm512_set1_ps(2.5e-01f), in902);
tmp6181 = _mm512_fmadd_ps(in914, _mm512_set1_ps(2.5e-01f), in910);
in902 = _mm512_sub_ps(in908, in902);
in910 = _mm512_sub_ps(in916, in910);
tmp6177 = _mm512_fmadd_ps(in904, _mm512_set1_ps(-1.25e+00f), tmp6177);
tmp6181 = _mm512_fmadd_ps(in912, _mm512_set1_ps(-1.25e+00f), tmp6181);
in904 = _mm512_sub_ps(in904, in906);
in912 = _mm512_sub_ps(in912, in914);
in904 = _mm512_fmadd_ps(in904, _mm512_set1_ps(5.25e+00f), in902);
in912 = _mm512_fmadd_ps(in912, _mm512_set1_ps(5.25e+00f), in910);
in903 = _mm512_fmadd_ps(tmp6177, _mm512_set1_ps(2e+00f), in905);
in911 = _mm512_fmadd_ps(tmp6181, _mm512_set1_ps(2e+00f), in913);
in905 = _mm512_fnmadd_ps(tmp6177, _mm512_set1_ps(2e+00f), in905);
in913 = _mm512_fnmadd_ps(tmp6181, _mm512_set1_ps(2e+00f), in913);
__m512 tmp6193 = _mm512_unpacklo_ps(in901, tmp6179);
__m512 tmp6194 = _mm512_unpackhi_ps(in901, tmp6179);
__m512 tmp6195 = _mm512_unpacklo_ps(tmp6180, in907);
__m512 tmp6196 = _mm512_unpackhi_ps(tmp6180, in907);
__m512 tmp6197 = _mm512_unpacklo_ps(tmp6178, in903);
__m512 tmp6198 = _mm512_unpackhi_ps(tmp6178, in903);
__m512 tmp6199 = _mm512_unpacklo_ps(in905, in904);
__m512 tmp6200 = _mm512_unpackhi_ps(in905, in904);
__m512 tmp6201 = _mm512_unpacklo_ps(in909, tmp6183);
__m512 tmp6202 = _mm512_unpackhi_ps(in909, tmp6183);
__m512 tmp6203 = _mm512_unpacklo_ps(tmp6184, in915);
__m512 tmp6204 = _mm512_unpackhi_ps(tmp6184, in915);
__m512 tmp6205 = _mm512_unpacklo_ps(tmp6182, in911);
__m512 tmp6206 = _mm512_unpackhi_ps(tmp6182, in911);
__m512 tmp6207 = _mm512_unpacklo_ps(in913, in912);
// Tail of one tile's processing. tmp6193..tmp6207 and in901..in915 are defined
// above this point (outside this excerpt). What follows here: the last unpack,
// the rest of the shuffle network, a second 8-point transform pass, packing of
// the results, and 16 stores into dfPtr6.
// NOTE(review): together with the unpack stage this shuffle network has the shape
// of a 16x16 float transpose - the earlier unpack lines are not visible here, confirm.
__m512 tmp6208 = _mm512_unpackhi_ps(in913, in912);
// shuffle_ps, per 128-bit block: imm 68 -> a[0],a[1],b[0],b[1]; imm 238 -> a[2],a[3],b[2],b[3].
__m512 tmp6209 = _mm512_shuffle_ps(tmp6193, tmp6195, 68);
__m512 tmp6210 = _mm512_shuffle_ps(tmp6193, tmp6195, 238);
__m512 tmp6211 = _mm512_shuffle_ps(tmp6194, tmp6196, 68);
__m512 tmp6212 = _mm512_shuffle_ps(tmp6194, tmp6196, 238);
__m512 tmp6213 = _mm512_shuffle_ps(tmp6197, tmp6199, 68);
__m512 tmp6214 = _mm512_shuffle_ps(tmp6197, tmp6199, 238);
__m512 tmp6215 = _mm512_shuffle_ps(tmp6198, tmp6200, 68);
__m512 tmp6216 = _mm512_shuffle_ps(tmp6198, tmp6200, 238);
__m512 tmp6217 = _mm512_shuffle_ps(tmp6201, tmp6203, 68);
__m512 tmp6218 = _mm512_shuffle_ps(tmp6201, tmp6203, 238);
__m512 tmp6219 = _mm512_shuffle_ps(tmp6202, tmp6204, 68);
__m512 tmp6220 = _mm512_shuffle_ps(tmp6202, tmp6204, 238);
__m512 tmp6221 = _mm512_shuffle_ps(tmp6205, tmp6207, 68);
__m512 tmp6222 = _mm512_shuffle_ps(tmp6205, tmp6207, 238);
__m512 tmp6223 = _mm512_shuffle_ps(tmp6206, tmp6208, 68);
__m512 tmp6224 = _mm512_shuffle_ps(tmp6206, tmp6208, 238);
// shuffle_f32x4: imm 136 -> 128-bit blocks a0,a2,b0,b2; imm 221 -> blocks a1,a3,b1,b3.
__m512 tmp6225 = _mm512_shuffle_f32x4(tmp6209, tmp6213, 136);
__m512 tmp6226 = _mm512_shuffle_f32x4(tmp6209, tmp6213, 221);
__m512 tmp6227 = _mm512_shuffle_f32x4(tmp6210, tmp6214, 136);
__m512 tmp6228 = _mm512_shuffle_f32x4(tmp6210, tmp6214, 221);
__m512 tmp6229 = _mm512_shuffle_f32x4(tmp6211, tmp6215, 136);
__m512 tmp6230 = _mm512_shuffle_f32x4(tmp6211, tmp6215, 221);
__m512 tmp6231 = _mm512_shuffle_f32x4(tmp6212, tmp6216, 136);
__m512 tmp6232 = _mm512_shuffle_f32x4(tmp6212, tmp6216, 221);
__m512 tmp6233 = _mm512_shuffle_f32x4(tmp6217, tmp6221, 136);
__m512 tmp6234 = _mm512_shuffle_f32x4(tmp6217, tmp6221, 221);
__m512 tmp6235 = _mm512_shuffle_f32x4(tmp6218, tmp6222, 136);
__m512 tmp6236 = _mm512_shuffle_f32x4(tmp6218, tmp6222, 221);
__m512 tmp6237 = _mm512_shuffle_f32x4(tmp6219, tmp6223, 136);
__m512 tmp6238 = _mm512_shuffle_f32x4(tmp6219, tmp6223, 221);
__m512 tmp6239 = _mm512_shuffle_f32x4(tmp6220, tmp6224, 136);
__m512 tmp6240 = _mm512_shuffle_f32x4(tmp6220, tmp6224, 221);
// Final shuffle round; the results overwrite the earlier in9xx / tmp61xx variables.
// Each (136, 221) pair yields one input of the first and one of the second pass-2 set.
in901 = _mm512_shuffle_f32x4(tmp6225, tmp6233, 136);
in909 = _mm512_shuffle_f32x4(tmp6225, tmp6233, 221);
tmp6179 = _mm512_shuffle_f32x4(tmp6227, tmp6235, 136);
tmp6183 = _mm512_shuffle_f32x4(tmp6227, tmp6235, 221);
tmp6180 = _mm512_shuffle_f32x4(tmp6229, tmp6237, 136);
tmp6184 = _mm512_shuffle_f32x4(tmp6229, tmp6237, 221);
in907 = _mm512_shuffle_f32x4(tmp6231, tmp6239, 136);
in915 = _mm512_shuffle_f32x4(tmp6231, tmp6239, 221);
tmp6178 = _mm512_shuffle_f32x4(tmp6226, tmp6234, 136);
tmp6182 = _mm512_shuffle_f32x4(tmp6226, tmp6234, 221);
in903 = _mm512_shuffle_f32x4(tmp6228, tmp6236, 136);
in911 = _mm512_shuffle_f32x4(tmp6228, tmp6236, 221);
in905 = _mm512_shuffle_f32x4(tmp6230, tmp6238, 136);
in913 = _mm512_shuffle_f32x4(tmp6230, tmp6238, 221);
in904 = _mm512_shuffle_f32x4(tmp6232, tmp6240, 136);
in912 = _mm512_shuffle_f32x4(tmp6232, tmp6240, 221);
// Second transform pass, lane-wise over eight vectors. Statements alternate between
// two independent sets:
//   set A: c0..c7 = in901, tmp6179, tmp6180, in907, tmp6178, in903, in905, in904
//   set B: c0..c7 = in909, tmp6183, tmp6184, in915, tmp6182, in911, in913, in912
// Results (up to fused-multiply-add rounding):
//   u0 = c0 - c6 + 5.25*(c4 - c2)
//   u1 = (c2 + c6 - 4.25*c4) + (c1 + c5 - 4.25*c3)
//   u2 = (c2 + c6 - 4.25*c4) - (c1 + c5 - 4.25*c3)
//   u3 = (0.25*c2 - 1.25*c4 + c6) + 2*(0.25*c1 - 1.25*c3 + c5)
//   u4 = (0.25*c2 - 1.25*c4 + c6) - 2*(0.25*c1 - 1.25*c3 + c5)
//   u5 = (4*c2 - 5*c4 + c6) + 2*(c1 - 1.25*c3 + 0.25*c5)
//   u6 = (4*c2 - 5*c4 + c6) - 2*(c1 - 1.25*c3 + 0.25*c5)
//   u7 = c7 - c1 + 5.25*(c3 - c5)
// NOTE(review): these coefficients match the Winograd F(6,3) input transform - confirm.
__m512 tmp6185 = _mm512_add_ps(tmp6179, in903);
__m512 tmp6189 = _mm512_add_ps(tmp6183, in911);
__m512 tmp6186 = _mm512_sub_ps(tmp6178, tmp6180);
__m512 tmp6190 = _mm512_sub_ps(tmp6182, tmp6184);
__m512 tmp6187 = _mm512_add_ps(tmp6180, in905);
__m512 tmp6191 = _mm512_add_ps(tmp6184, in913);
in901 = _mm512_sub_ps(in901, in905);
in909 = _mm512_sub_ps(in909, in913);
tmp6185 = _mm512_fmadd_ps(in907, _mm512_set1_ps(-4.25e+00f), tmp6185);
tmp6189 = _mm512_fmadd_ps(in915, _mm512_set1_ps(-4.25e+00f), tmp6189);
tmp6187 = _mm512_fmadd_ps(tmp6178, _mm512_set1_ps(-4.25e+00f), tmp6187);
tmp6191 = _mm512_fmadd_ps(tmp6182, _mm512_set1_ps(-4.25e+00f), tmp6191);
// in901 / in909 now hold u0.
in901 = _mm512_fmadd_ps(tmp6186, _mm512_set1_ps(5.25e+00f), in901);
in909 = _mm512_fmadd_ps(tmp6190, _mm512_set1_ps(5.25e+00f), in909);
tmp6186 = _mm512_fmadd_ps(tmp6180, _mm512_set1_ps(2.5e-01f), in905);
tmp6190 = _mm512_fmadd_ps(tmp6184, _mm512_set1_ps(2.5e-01f), in913);
tmp6180 = _mm512_fmadd_ps(tmp6180, _mm512_set1_ps(4e+00f), in905);
tmp6184 = _mm512_fmadd_ps(tmp6184, _mm512_set1_ps(4e+00f), in913);
// tmp6188 / tmp6192 = u2, then tmp6187 / tmp6191 become u1.
__m512 tmp6188 = _mm512_sub_ps(tmp6187, tmp6185);
__m512 tmp6192 = _mm512_sub_ps(tmp6191, tmp6189);
tmp6187 = _mm512_add_ps(tmp6185, tmp6187);
tmp6191 = _mm512_add_ps(tmp6189, tmp6191);
tmp6185 = _mm512_fmadd_ps(tmp6179, _mm512_set1_ps(2.5e-01f), in903);
tmp6189 = _mm512_fmadd_ps(tmp6183, _mm512_set1_ps(2.5e-01f), in911);
tmp6186 = _mm512_fmadd_ps(tmp6178, _mm512_set1_ps(-1.25e+00f), tmp6186);
tmp6190 = _mm512_fmadd_ps(tmp6182, _mm512_set1_ps(-1.25e+00f), tmp6190);
tmp6178 = _mm512_fmadd_ps(tmp6178, _mm512_set1_ps(-5e+00f), tmp6180);
tmp6182 = _mm512_fmadd_ps(tmp6182, _mm512_set1_ps(-5e+00f), tmp6184);
tmp6185 = _mm512_fmadd_ps(in907, _mm512_set1_ps(-1.25e+00f), tmp6185);
tmp6189 = _mm512_fmadd_ps(in915, _mm512_set1_ps(-1.25e+00f), tmp6189);
// in905 / in913 = u3; tmp6186 / tmp6190 = u4 (fnmadd computes c - a*b).
in905 = _mm512_fmadd_ps(tmp6185, _mm512_set1_ps(2e+00f), tmp6186);
in913 = _mm512_fmadd_ps(tmp6189, _mm512_set1_ps(2e+00f), tmp6190);
tmp6186 = _mm512_fnmadd_ps(tmp6185, _mm512_set1_ps(2e+00f), tmp6186);
tmp6190 = _mm512_fnmadd_ps(tmp6189, _mm512_set1_ps(2e+00f), tmp6190);
tmp6185 = _mm512_fmadd_ps(in903, _mm512_set1_ps(2.5e-01f), tmp6179);
tmp6189 = _mm512_fmadd_ps(in911, _mm512_set1_ps(2.5e-01f), tmp6183);
tmp6179 = _mm512_sub_ps(in904, tmp6179);
tmp6183 = _mm512_sub_ps(in912, tmp6183);
tmp6185 = _mm512_fmadd_ps(in907, _mm512_set1_ps(-1.25e+00f), tmp6185);
tmp6189 = _mm512_fmadd_ps(in915, _mm512_set1_ps(-1.25e+00f), tmp6189);
// in907 / in915 = u7.
in907 = _mm512_sub_ps(in907, in903);
in915 = _mm512_sub_ps(in915, in911);
in907 = _mm512_fmadd_ps(in907, _mm512_set1_ps(5.25e+00f), tmp6179);
in915 = _mm512_fmadd_ps(in915, _mm512_set1_ps(5.25e+00f), tmp6183);
// tmp6180 / tmp6184 = u5; tmp6178 / tmp6182 = u6.
tmp6180 = _mm512_fmadd_ps(tmp6185, _mm512_set1_ps(2e+00f), tmp6178);
tmp6184 = _mm512_fmadd_ps(tmp6189, _mm512_set1_ps(2e+00f), tmp6182);
tmp6178 = _mm512_fnmadd_ps(tmp6185, _mm512_set1_ps(2e+00f), tmp6178);
tmp6182 = _mm512_fnmadd_ps(tmp6189, _mm512_set1_ps(2e+00f), tmp6182);
// Pack result pairs (u0,u1), (u2,u3), (u4,u5), (u6,u7): imm 68 concatenates the low
// 256 bits of both operands, imm 238 the high 256 bits.
__m512 out851 = _mm512_shuffle_f32x4(in901, tmp6187, 68);
__m512 out859 = _mm512_shuffle_f32x4(in901, tmp6187, 238);
__m512 out852 = _mm512_shuffle_f32x4(tmp6188, in905, 68);
__m512 out860 = _mm512_shuffle_f32x4(tmp6188, in905, 238);
__m512 out853 = _mm512_shuffle_f32x4(tmp6186, tmp6180, 68);
__m512 out861 = _mm512_shuffle_f32x4(tmp6186, tmp6180, 238);
__m512 out854 = _mm512_shuffle_f32x4(tmp6178, in907, 68);
__m512 out862 = _mm512_shuffle_f32x4(tmp6178, in907, 238);
__m512 out855 = _mm512_shuffle_f32x4(in909, tmp6191, 68);
__m512 out863 = _mm512_shuffle_f32x4(in909, tmp6191, 238);
__m512 out856 = _mm512_shuffle_f32x4(tmp6192, in913, 68);
__m512 out864 = _mm512_shuffle_f32x4(tmp6192, in913, 238);
__m512 out857 = _mm512_shuffle_f32x4(tmp6190, tmp6184, 68);
__m512 out865 = _mm512_shuffle_f32x4(tmp6190, tmp6184, 238);
__m512 out858 = _mm512_shuffle_f32x4(tmp6182, in915, 68);
__m512 out866 = _mm512_shuffle_f32x4(tmp6182, in915, 238);
// Unaligned stores into four regions of dfPtr6 spaced 409600 apart (one region per
// packed pair); within each region this tile writes sub-offsets 0, 128, 64, 192.
// NOTE(review): offsets look like byte offsets (64 per __m512) - dfPtr6's type is
// declared outside this excerpt, confirm.
_mm512_storeu_ps(dfPtr6+0+1638400*i26+24576*j21+24576*s20+768*k83, out851);
_mm512_storeu_ps(dfPtr6+128+1638400*i26+24576*j21+24576*s20+768*k83, out859);
_mm512_storeu_ps(dfPtr6+64+1638400*i26+24576*j21+24576*s20+768*k83, out855);
_mm512_storeu_ps(dfPtr6+192+1638400*i26+24576*j21+24576*s20+768*k83, out863);
_mm512_storeu_ps(dfPtr6+409600+1638400*i26+24576*j21+24576*s20+768*k83, out852);
_mm512_storeu_ps(dfPtr6+409728+1638400*i26+24576*j21+24576*s20+768*k83, out860);
_mm512_storeu_ps(dfPtr6+409664+1638400*i26+24576*j21+24576*s20+768*k83, out856);
_mm512_storeu_ps(dfPtr6+409792+1638400*i26+24576*j21+24576*s20+768*k83, out864);
_mm512_storeu_ps(dfPtr6+819200+1638400*i26+24576*j21+24576*s20+768*k83, out853);
_mm512_storeu_ps(dfPtr6+819328+1638400*i26+24576*j21+24576*s20+768*k83, out861);
_mm512_storeu_ps(dfPtr6+819264+1638400*i26+24576*j21+24576*s20+768*k83, out857);
_mm512_storeu_ps(dfPtr6+819392+1638400*i26+24576*j21+24576*s20+768*k83, out865);
_mm512_storeu_ps(dfPtr6+1228800+1638400*i26+24576*j21+24576*s20+768*k83, out854);
_mm512_storeu_ps(dfPtr6+1228928+1638400*i26+24576*j21+24576*s20+768*k83, out862);
_mm512_storeu_ps(dfPtr6+1228864+1638400*i26+24576*j21+24576*s20+768*k83, out858);
_mm512_storeu_ps(dfPtr6+1228992+1638400*i26+24576*j21+24576*s20+768*k83, out866);
// One complete tile: load 8 rows from each of two source positions in datPtr12,
// split every row into two 8-lane windows, run an 8-point transform down the rows,
// transpose, run the same transform again, then pack and store 16 vectors to dfPtr6
// (sub-offsets 256..448 of four regions).
// NOTE(review): the coefficients used (5.25, 4.25, 0.25, 4, 1.25, 5, 2) match the
// Winograd F(6,3) input transform applied in both dimensions - confirm with the generator.
//
// Loads: consecutive rows are 224 apart (the same stride as the 224*h32 term).
// Mask 16383 keeps lanes 0..13 and mask 511 keeps lanes 0..8; maskz zeroes the rest.
__m512 dat1445 = _mm512_maskz_loadu_ps(16383, datPtr12+1200+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1446 = _mm512_maskz_loadu_ps(511, datPtr12+12608+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
// _mm512_set_epi32 lists lanes 15 down to 0: lanes 0..7 pick elements 0..7 and
// lanes 8..15 pick elements 6..13, i.e. two 8-wide windows overlapping by two.
__m512i pm126 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in917 = _mm512_permutexvar_ps(pm126, dat1445);
// Lanes 0..7 pick elements 0..7, lanes 8..10 pick 6..8, lanes 11..15 pick element 15,
// which the 511 mask left at zero - so the upper window is zero-filled past element 8.
__m512i pm127 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in925 = _mm512_permutexvar_ps(pm127, dat1446);
__m512 dat1447 = _mm512_maskz_loadu_ps(16383, datPtr12+1424+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1448 = _mm512_maskz_loadu_ps(511, datPtr12+12832+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in918 = _mm512_permutexvar_ps(pm126, dat1447);
__m512 in926 = _mm512_permutexvar_ps(pm127, dat1448);
__m512 dat1449 = _mm512_maskz_loadu_ps(16383, datPtr12+1648+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1450 = _mm512_maskz_loadu_ps(511, datPtr12+13056+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in919 = _mm512_permutexvar_ps(pm126, dat1449);
__m512 in927 = _mm512_permutexvar_ps(pm127, dat1450);
__m512 dat1451 = _mm512_maskz_loadu_ps(16383, datPtr12+1872+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1452 = _mm512_maskz_loadu_ps(511, datPtr12+13280+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in920 = _mm512_permutexvar_ps(pm126, dat1451);
__m512 in928 = _mm512_permutexvar_ps(pm127, dat1452);
__m512 dat1453 = _mm512_maskz_loadu_ps(16383, datPtr12+2096+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1454 = _mm512_maskz_loadu_ps(511, datPtr12+13504+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in921 = _mm512_permutexvar_ps(pm126, dat1453);
__m512 in929 = _mm512_permutexvar_ps(pm127, dat1454);
__m512 dat1455 = _mm512_maskz_loadu_ps(16383, datPtr12+2320+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1456 = _mm512_maskz_loadu_ps(511, datPtr12+13728+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in922 = _mm512_permutexvar_ps(pm126, dat1455);
__m512 in930 = _mm512_permutexvar_ps(pm127, dat1456);
__m512 dat1457 = _mm512_maskz_loadu_ps(16383, datPtr12+2544+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1458 = _mm512_maskz_loadu_ps(511, datPtr12+13952+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in923 = _mm512_permutexvar_ps(pm126, dat1457);
__m512 in931 = _mm512_permutexvar_ps(pm127, dat1458);
__m512 dat1459 = _mm512_maskz_loadu_ps(16383, datPtr12+2768+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1460 = _mm512_maskz_loadu_ps(511, datPtr12+14176+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in924 = _mm512_permutexvar_ps(pm126, dat1459);
__m512 in932 = _mm512_permutexvar_ps(pm127, dat1460);
// Pass 1: 8-point transform across the eight row vectors, lane-wise, so all 16
// window columns are processed at once. Statements alternate between the first
// source (d0..d7 = in917..in924) and the second (d0..d7 = in925..in932).
// Results (up to fused-multiply-add rounding):
//   t0 = d0 - d6 + 5.25*(d4 - d2)
//   t1 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   t2 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   t3 = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   t4 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   t5 = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   t6 = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   t7 = d7 - d1 + 5.25*(d3 - d5)
// They end up in: in917, tmp6243, tmp6244, in923, tmp6242, in919, in921, in920
// (second source: in925, tmp6247, tmp6248, in931, tmp6246, in927, in929, in928).
__m512 tmp6241 = _mm512_add_ps(in918, in922);
__m512 tmp6245 = _mm512_add_ps(in926, in930);
__m512 tmp6242 = _mm512_sub_ps(in921, in919);
__m512 tmp6246 = _mm512_sub_ps(in929, in927);
__m512 tmp6243 = _mm512_add_ps(in919, in923);
__m512 tmp6247 = _mm512_add_ps(in927, in931);
in917 = _mm512_sub_ps(in917, in923);
in925 = _mm512_sub_ps(in925, in931);
tmp6241 = _mm512_fmadd_ps(in920, _mm512_set1_ps(-4.25e+00f), tmp6241);
tmp6245 = _mm512_fmadd_ps(in928, _mm512_set1_ps(-4.25e+00f), tmp6245);
tmp6243 = _mm512_fmadd_ps(in921, _mm512_set1_ps(-4.25e+00f), tmp6243);
tmp6247 = _mm512_fmadd_ps(in929, _mm512_set1_ps(-4.25e+00f), tmp6247);
// in917 / in925 now hold t0.
in917 = _mm512_fmadd_ps(tmp6242, _mm512_set1_ps(5.25e+00f), in917);
in925 = _mm512_fmadd_ps(tmp6246, _mm512_set1_ps(5.25e+00f), in925);
tmp6242 = _mm512_fmadd_ps(in919, _mm512_set1_ps(2.5e-01f), in923);
tmp6246 = _mm512_fmadd_ps(in927, _mm512_set1_ps(2.5e-01f), in931);
in919 = _mm512_fmadd_ps(in919, _mm512_set1_ps(4e+00f), in923);
in927 = _mm512_fmadd_ps(in927, _mm512_set1_ps(4e+00f), in931);
// tmp6244 / tmp6248 = t2, then tmp6243 / tmp6247 become t1.
__m512 tmp6244 = _mm512_sub_ps(tmp6243, tmp6241);
__m512 tmp6248 = _mm512_sub_ps(tmp6247, tmp6245);
tmp6243 = _mm512_add_ps(tmp6241, tmp6243);
tmp6247 = _mm512_add_ps(tmp6245, tmp6247);
tmp6241 = _mm512_fmadd_ps(in918, _mm512_set1_ps(2.5e-01f), in922);
tmp6245 = _mm512_fmadd_ps(in926, _mm512_set1_ps(2.5e-01f), in930);
tmp6242 = _mm512_fmadd_ps(in921, _mm512_set1_ps(-1.25e+00f), tmp6242);
tmp6246 = _mm512_fmadd_ps(in929, _mm512_set1_ps(-1.25e+00f), tmp6246);
in921 = _mm512_fmadd_ps(in921, _mm512_set1_ps(-5e+00f), in919);
in929 = _mm512_fmadd_ps(in929, _mm512_set1_ps(-5e+00f), in927);
tmp6241 = _mm512_fmadd_ps(in920, _mm512_set1_ps(-1.25e+00f), tmp6241);
tmp6245 = _mm512_fmadd_ps(in928, _mm512_set1_ps(-1.25e+00f), tmp6245);
// in923 / in931 = t3; tmp6242 / tmp6246 = t4 (fnmadd computes c - a*b).
in923 = _mm512_fmadd_ps(tmp6241, _mm512_set1_ps(2e+00f), tmp6242);
in931 = _mm512_fmadd_ps(tmp6245, _mm512_set1_ps(2e+00f), tmp6246);
tmp6242 = _mm512_fnmadd_ps(tmp6241, _mm512_set1_ps(2e+00f), tmp6242);
tmp6246 = _mm512_fnmadd_ps(tmp6245, _mm512_set1_ps(2e+00f), tmp6246);
tmp6241 = _mm512_fmadd_ps(in922, _mm512_set1_ps(2.5e-01f), in918);
tmp6245 = _mm512_fmadd_ps(in930, _mm512_set1_ps(2.5e-01f), in926);
in918 = _mm512_sub_ps(in924, in918);
in926 = _mm512_sub_ps(in932, in926);
tmp6241 = _mm512_fmadd_ps(in920, _mm512_set1_ps(-1.25e+00f), tmp6241);
tmp6245 = _mm512_fmadd_ps(in928, _mm512_set1_ps(-1.25e+00f), tmp6245);
// in920 / in928 = t7.
in920 = _mm512_sub_ps(in920, in922);
in928 = _mm512_sub_ps(in928, in930);
in920 = _mm512_fmadd_ps(in920, _mm512_set1_ps(5.25e+00f), in918);
in928 = _mm512_fmadd_ps(in928, _mm512_set1_ps(5.25e+00f), in926);
// in919 / in927 = t5; in921 / in929 = t6.
in919 = _mm512_fmadd_ps(tmp6241, _mm512_set1_ps(2e+00f), in921);
in927 = _mm512_fmadd_ps(tmp6245, _mm512_set1_ps(2e+00f), in929);
in921 = _mm512_fnmadd_ps(tmp6241, _mm512_set1_ps(2e+00f), in921);
in929 = _mm512_fnmadd_ps(tmp6245, _mm512_set1_ps(2e+00f), in929);
// 16x16 float transpose of the rows (t0..t7 of source 1, then t0..t7 of source 2).
// Stage 1: unpacklo/unpackhi interleave adjacent row pairs.
__m512 tmp6257 = _mm512_unpacklo_ps(in917, tmp6243);
__m512 tmp6258 = _mm512_unpackhi_ps(in917, tmp6243);
__m512 tmp6259 = _mm512_unpacklo_ps(tmp6244, in923);
__m512 tmp6260 = _mm512_unpackhi_ps(tmp6244, in923);
__m512 tmp6261 = _mm512_unpacklo_ps(tmp6242, in919);
__m512 tmp6262 = _mm512_unpackhi_ps(tmp6242, in919);
__m512 tmp6263 = _mm512_unpacklo_ps(in921, in920);
__m512 tmp6264 = _mm512_unpackhi_ps(in921, in920);
__m512 tmp6265 = _mm512_unpacklo_ps(in925, tmp6247);
__m512 tmp6266 = _mm512_unpackhi_ps(in925, tmp6247);
__m512 tmp6267 = _mm512_unpacklo_ps(tmp6248, in931);
__m512 tmp6268 = _mm512_unpackhi_ps(tmp6248, in931);
__m512 tmp6269 = _mm512_unpacklo_ps(tmp6246, in927);
__m512 tmp6270 = _mm512_unpackhi_ps(tmp6246, in927);
__m512 tmp6271 = _mm512_unpacklo_ps(in929, in928);
__m512 tmp6272 = _mm512_unpackhi_ps(in929, in928);
// Stage 2: shuffle_ps, per 128-bit block: imm 68 -> a[0],a[1],b[0],b[1];
// imm 238 -> a[2],a[3],b[2],b[3]. Each block now holds one column of four rows.
__m512 tmp6273 = _mm512_shuffle_ps(tmp6257, tmp6259, 68);
__m512 tmp6274 = _mm512_shuffle_ps(tmp6257, tmp6259, 238);
__m512 tmp6275 = _mm512_shuffle_ps(tmp6258, tmp6260, 68);
__m512 tmp6276 = _mm512_shuffle_ps(tmp6258, tmp6260, 238);
__m512 tmp6277 = _mm512_shuffle_ps(tmp6261, tmp6263, 68);
__m512 tmp6278 = _mm512_shuffle_ps(tmp6261, tmp6263, 238);
__m512 tmp6279 = _mm512_shuffle_ps(tmp6262, tmp6264, 68);
__m512 tmp6280 = _mm512_shuffle_ps(tmp6262, tmp6264, 238);
__m512 tmp6281 = _mm512_shuffle_ps(tmp6265, tmp6267, 68);
__m512 tmp6282 = _mm512_shuffle_ps(tmp6265, tmp6267, 238);
__m512 tmp6283 = _mm512_shuffle_ps(tmp6266, tmp6268, 68);
__m512 tmp6284 = _mm512_shuffle_ps(tmp6266, tmp6268, 238);
__m512 tmp6285 = _mm512_shuffle_ps(tmp6269, tmp6271, 68);
__m512 tmp6286 = _mm512_shuffle_ps(tmp6269, tmp6271, 238);
__m512 tmp6287 = _mm512_shuffle_ps(tmp6270, tmp6272, 68);
__m512 tmp6288 = _mm512_shuffle_ps(tmp6270, tmp6272, 238);
// Stage 3: shuffle_f32x4: imm 136 -> 128-bit blocks a0,a2,b0,b2; imm 221 -> a1,a3,b1,b3.
__m512 tmp6289 = _mm512_shuffle_f32x4(tmp6273, tmp6277, 136);
__m512 tmp6290 = _mm512_shuffle_f32x4(tmp6273, tmp6277, 221);
__m512 tmp6291 = _mm512_shuffle_f32x4(tmp6274, tmp6278, 136);
__m512 tmp6292 = _mm512_shuffle_f32x4(tmp6274, tmp6278, 221);
__m512 tmp6293 = _mm512_shuffle_f32x4(tmp6275, tmp6279, 136);
__m512 tmp6294 = _mm512_shuffle_f32x4(tmp6275, tmp6279, 221);
__m512 tmp6295 = _mm512_shuffle_f32x4(tmp6276, tmp6280, 136);
__m512 tmp6296 = _mm512_shuffle_f32x4(tmp6276, tmp6280, 221);
__m512 tmp6297 = _mm512_shuffle_f32x4(tmp6281, tmp6285, 136);
__m512 tmp6298 = _mm512_shuffle_f32x4(tmp6281, tmp6285, 221);
__m512 tmp6299 = _mm512_shuffle_f32x4(tmp6282, tmp6286, 136);
__m512 tmp6300 = _mm512_shuffle_f32x4(tmp6282, tmp6286, 221);
__m512 tmp6301 = _mm512_shuffle_f32x4(tmp6283, tmp6287, 136);
__m512 tmp6302 = _mm512_shuffle_f32x4(tmp6283, tmp6287, 221);
__m512 tmp6303 = _mm512_shuffle_f32x4(tmp6284, tmp6288, 136);
__m512 tmp6304 = _mm512_shuffle_f32x4(tmp6284, tmp6288, 221);
// Stage 4: each variable is overwritten with one full column. The 136 results are
// columns 0..7 (lower window), the 221 results columns 8..15 (upper window):
//   columns 0..7  = in917, tmp6243, tmp6244, in923, tmp6242, in919, in921, in920
//   columns 8..15 = in925, tmp6247, tmp6248, in931, tmp6246, in927, in929, in928
// Lanes 0..7 carry source 1's t0..t7, lanes 8..15 source 2's.
in917 = _mm512_shuffle_f32x4(tmp6289, tmp6297, 136);
in925 = _mm512_shuffle_f32x4(tmp6289, tmp6297, 221);
tmp6243 = _mm512_shuffle_f32x4(tmp6291, tmp6299, 136);
tmp6247 = _mm512_shuffle_f32x4(tmp6291, tmp6299, 221);
tmp6244 = _mm512_shuffle_f32x4(tmp6293, tmp6301, 136);
tmp6248 = _mm512_shuffle_f32x4(tmp6293, tmp6301, 221);
in923 = _mm512_shuffle_f32x4(tmp6295, tmp6303, 136);
in931 = _mm512_shuffle_f32x4(tmp6295, tmp6303, 221);
tmp6242 = _mm512_shuffle_f32x4(tmp6290, tmp6298, 136);
tmp6246 = _mm512_shuffle_f32x4(tmp6290, tmp6298, 221);
in919 = _mm512_shuffle_f32x4(tmp6292, tmp6300, 136);
in927 = _mm512_shuffle_f32x4(tmp6292, tmp6300, 221);
in921 = _mm512_shuffle_f32x4(tmp6294, tmp6302, 136);
in929 = _mm512_shuffle_f32x4(tmp6294, tmp6302, 221);
in920 = _mm512_shuffle_f32x4(tmp6296, tmp6304, 136);
in928 = _mm512_shuffle_f32x4(tmp6296, tmp6304, 221);
// Pass 2: the same 8-point transform as pass 1, now across the eight column vectors
// of each window (c0..c7 in the column order listed above). Results u0..u7 land in
// in917, tmp6251, tmp6252, in921, tmp6250, tmp6244, tmp6242, in923
// (upper window: in925, tmp6255, tmp6256, in929, tmp6254, tmp6248, tmp6246, in931).
__m512 tmp6249 = _mm512_add_ps(tmp6243, in919);
__m512 tmp6253 = _mm512_add_ps(tmp6247, in927);
__m512 tmp6250 = _mm512_sub_ps(tmp6242, tmp6244);
__m512 tmp6254 = _mm512_sub_ps(tmp6246, tmp6248);
__m512 tmp6251 = _mm512_add_ps(tmp6244, in921);
__m512 tmp6255 = _mm512_add_ps(tmp6248, in929);
in917 = _mm512_sub_ps(in917, in921);
in925 = _mm512_sub_ps(in925, in929);
tmp6249 = _mm512_fmadd_ps(in923, _mm512_set1_ps(-4.25e+00f), tmp6249);
tmp6253 = _mm512_fmadd_ps(in931, _mm512_set1_ps(-4.25e+00f), tmp6253);
tmp6251 = _mm512_fmadd_ps(tmp6242, _mm512_set1_ps(-4.25e+00f), tmp6251);
tmp6255 = _mm512_fmadd_ps(tmp6246, _mm512_set1_ps(-4.25e+00f), tmp6255);
// in917 / in925 now hold u0.
in917 = _mm512_fmadd_ps(tmp6250, _mm512_set1_ps(5.25e+00f), in917);
in925 = _mm512_fmadd_ps(tmp6254, _mm512_set1_ps(5.25e+00f), in925);
tmp6250 = _mm512_fmadd_ps(tmp6244, _mm512_set1_ps(2.5e-01f), in921);
tmp6254 = _mm512_fmadd_ps(tmp6248, _mm512_set1_ps(2.5e-01f), in929);
tmp6244 = _mm512_fmadd_ps(tmp6244, _mm512_set1_ps(4e+00f), in921);
tmp6248 = _mm512_fmadd_ps(tmp6248, _mm512_set1_ps(4e+00f), in929);
// tmp6252 / tmp6256 = u2, then tmp6251 / tmp6255 become u1.
__m512 tmp6252 = _mm512_sub_ps(tmp6251, tmp6249);
__m512 tmp6256 = _mm512_sub_ps(tmp6255, tmp6253);
tmp6251 = _mm512_add_ps(tmp6249, tmp6251);
tmp6255 = _mm512_add_ps(tmp6253, tmp6255);
tmp6249 = _mm512_fmadd_ps(tmp6243, _mm512_set1_ps(2.5e-01f), in919);
tmp6253 = _mm512_fmadd_ps(tmp6247, _mm512_set1_ps(2.5e-01f), in927);
tmp6250 = _mm512_fmadd_ps(tmp6242, _mm512_set1_ps(-1.25e+00f), tmp6250);
tmp6254 = _mm512_fmadd_ps(tmp6246, _mm512_set1_ps(-1.25e+00f), tmp6254);
tmp6242 = _mm512_fmadd_ps(tmp6242, _mm512_set1_ps(-5e+00f), tmp6244);
tmp6246 = _mm512_fmadd_ps(tmp6246, _mm512_set1_ps(-5e+00f), tmp6248);
tmp6249 = _mm512_fmadd_ps(in923, _mm512_set1_ps(-1.25e+00f), tmp6249);
tmp6253 = _mm512_fmadd_ps(in931, _mm512_set1_ps(-1.25e+00f), tmp6253);
// in921 / in929 = u3; tmp6250 / tmp6254 = u4.
in921 = _mm512_fmadd_ps(tmp6249, _mm512_set1_ps(2e+00f), tmp6250);
in929 = _mm512_fmadd_ps(tmp6253, _mm512_set1_ps(2e+00f), tmp6254);
tmp6250 = _mm512_fnmadd_ps(tmp6249, _mm512_set1_ps(2e+00f), tmp6250);
tmp6254 = _mm512_fnmadd_ps(tmp6253, _mm512_set1_ps(2e+00f), tmp6254);
tmp6249 = _mm512_fmadd_ps(in919, _mm512_set1_ps(2.5e-01f), tmp6243);
tmp6253 = _mm512_fmadd_ps(in927, _mm512_set1_ps(2.5e-01f), tmp6247);
tmp6243 = _mm512_sub_ps(in920, tmp6243);
tmp6247 = _mm512_sub_ps(in928, tmp6247);
tmp6249 = _mm512_fmadd_ps(in923, _mm512_set1_ps(-1.25e+00f), tmp6249);
tmp6253 = _mm512_fmadd_ps(in931, _mm512_set1_ps(-1.25e+00f), tmp6253);
// in923 / in931 = u7.
in923 = _mm512_sub_ps(in923, in919);
in931 = _mm512_sub_ps(in931, in927);
in923 = _mm512_fmadd_ps(in923, _mm512_set1_ps(5.25e+00f), tmp6243);
in931 = _mm512_fmadd_ps(in931, _mm512_set1_ps(5.25e+00f), tmp6247);
// tmp6244 / tmp6248 = u5; tmp6242 / tmp6246 = u6.
tmp6244 = _mm512_fmadd_ps(tmp6249, _mm512_set1_ps(2e+00f), tmp6242);
tmp6248 = _mm512_fmadd_ps(tmp6253, _mm512_set1_ps(2e+00f), tmp6246);
tmp6242 = _mm512_fnmadd_ps(tmp6249, _mm512_set1_ps(2e+00f), tmp6242);
tmp6246 = _mm512_fnmadd_ps(tmp6253, _mm512_set1_ps(2e+00f), tmp6246);
// Pack result pairs (u0,u1), (u2,u3), (u4,u5), (u6,u7): imm 68 concatenates the low
// 256 bits of both operands (lanes 0..7, i.e. source 1), imm 238 the high 256 bits
// (lanes 8..15, i.e. source 2).
__m512 out867 = _mm512_shuffle_f32x4(in917, tmp6251, 68);
__m512 out875 = _mm512_shuffle_f32x4(in917, tmp6251, 238);
__m512 out868 = _mm512_shuffle_f32x4(tmp6252, in921, 68);
__m512 out876 = _mm512_shuffle_f32x4(tmp6252, in921, 238);
__m512 out869 = _mm512_shuffle_f32x4(tmp6250, tmp6244, 68);
__m512 out877 = _mm512_shuffle_f32x4(tmp6250, tmp6244, 238);
__m512 out870 = _mm512_shuffle_f32x4(tmp6242, in923, 68);
__m512 out878 = _mm512_shuffle_f32x4(tmp6242, in923, 238);
__m512 out871 = _mm512_shuffle_f32x4(in925, tmp6255, 68);
__m512 out879 = _mm512_shuffle_f32x4(in925, tmp6255, 238);
__m512 out872 = _mm512_shuffle_f32x4(tmp6256, in929, 68);
__m512 out880 = _mm512_shuffle_f32x4(tmp6256, in929, 238);
__m512 out873 = _mm512_shuffle_f32x4(tmp6254, tmp6248, 68);
__m512 out881 = _mm512_shuffle_f32x4(tmp6254, tmp6248, 238);
__m512 out874 = _mm512_shuffle_f32x4(tmp6246, in931, 68);
__m512 out882 = _mm512_shuffle_f32x4(tmp6246, in931, 238);
// Unaligned stores into four regions of dfPtr6 spaced 409600 apart (one region per
// packed pair); within each region this tile writes sub-offsets 256, 384, 320, 448.
// NOTE(review): offsets look like byte offsets (64 per __m512) - dfPtr6's type is
// declared outside this excerpt, confirm.
_mm512_storeu_ps(dfPtr6+256+1638400*i26+24576*j21+24576*s20+768*k83, out867);
_mm512_storeu_ps(dfPtr6+384+1638400*i26+24576*j21+24576*s20+768*k83, out875);
_mm512_storeu_ps(dfPtr6+320+1638400*i26+24576*j21+24576*s20+768*k83, out871);
_mm512_storeu_ps(dfPtr6+448+1638400*i26+24576*j21+24576*s20+768*k83, out879);
_mm512_storeu_ps(dfPtr6+409856+1638400*i26+24576*j21+24576*s20+768*k83, out868);
_mm512_storeu_ps(dfPtr6+409984+1638400*i26+24576*j21+24576*s20+768*k83, out876);
_mm512_storeu_ps(dfPtr6+409920+1638400*i26+24576*j21+24576*s20+768*k83, out872);
_mm512_storeu_ps(dfPtr6+410048+1638400*i26+24576*j21+24576*s20+768*k83, out880);
_mm512_storeu_ps(dfPtr6+819456+1638400*i26+24576*j21+24576*s20+768*k83, out869);
_mm512_storeu_ps(dfPtr6+819584+1638400*i26+24576*j21+24576*s20+768*k83, out877);
_mm512_storeu_ps(dfPtr6+819520+1638400*i26+24576*j21+24576*s20+768*k83, out873);
_mm512_storeu_ps(dfPtr6+819648+1638400*i26+24576*j21+24576*s20+768*k83, out881);
_mm512_storeu_ps(dfPtr6+1229056+1638400*i26+24576*j21+24576*s20+768*k83, out870);
_mm512_storeu_ps(dfPtr6+1229184+1638400*i26+24576*j21+24576*s20+768*k83, out878);
_mm512_storeu_ps(dfPtr6+1229120+1638400*i26+24576*j21+24576*s20+768*k83, out874);
_mm512_storeu_ps(dfPtr6+1229248+1638400*i26+24576*j21+24576*s20+768*k83, out882);
__m512 dat1461 = _mm512_maskz_loadu_ps(8191, datPtr12+13764+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1462 = _mm512_maskz_loadu_ps(16383, datPtr12+13808+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512i pm128 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in933 = _mm512_permutexvar_ps(pm128, dat1461);
__m512i pm129 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in941 = _mm512_permutexvar_ps(pm129, dat1462);
__m512 dat1463 = _mm512_maskz_loadu_ps(8191, datPtr12+13988+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1464 = _mm512_maskz_loadu_ps(16383, datPtr12+14032+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in934 = _mm512_permutexvar_ps(pm128, dat1463);
__m512 in942 = _mm512_permutexvar_ps(pm129, dat1464);
__m512 dat1465 = _mm512_maskz_loadu_ps(8191, datPtr12+14212+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1466 = _mm512_maskz_loadu_ps(16383, datPtr12+14256+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in935 = _mm512_permutexvar_ps(pm128, dat1465);
__m512 in943 = _mm512_permutexvar_ps(pm129, dat1466);
__m512 dat1467 = _mm512_maskz_loadu_ps(8191, datPtr12+14436+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1468 = _mm512_maskz_loadu_ps(16383, datPtr12+14480+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in936 = _mm512_permutexvar_ps(pm128, dat1467);
__m512 in944 = _mm512_permutexvar_ps(pm129, dat1468);
__m512 dat1469 = _mm512_maskz_loadu_ps(8191, datPtr12+14660+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1470 = _mm512_maskz_loadu_ps(16383, datPtr12+14704+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in937 = _mm512_permutexvar_ps(pm128, dat1469);
__m512 in945 = _mm512_permutexvar_ps(pm129, dat1470);
__m512 dat1471 = _mm512_maskz_loadu_ps(8191, datPtr12+14884+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1472 = _mm512_maskz_loadu_ps(16383, datPtr12+14928+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in938 = _mm512_permutexvar_ps(pm128, dat1471);
__m512 in946 = _mm512_permutexvar_ps(pm129, dat1472);
__m512 dat1473 = _mm512_maskz_loadu_ps(8191, datPtr12+15108+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1474 = _mm512_maskz_loadu_ps(16383, datPtr12+15152+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in939 = _mm512_permutexvar_ps(pm128, dat1473);
__m512 in947 = _mm512_permutexvar_ps(pm129, dat1474);
__m512 dat1475 = _mm512_maskz_loadu_ps(8191, datPtr12+15332+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 dat1476 = _mm512_maskz_loadu_ps(16383, datPtr12+15376+806912*i26+224*h32+4*w39+806912*s20+25216*k83);
__m512 in940 = _mm512_permutexvar_ps(pm128, dat1475);
__m512 in948 = _mm512_permutexvar_ps(pm129, dat1476);
__m512 tmp6305 = _mm512_add_ps(in934, in938);
__m512 tmp6309 = _mm512_add_ps(in942, in946);
__m512 tmp6306 = _mm512_sub_ps(in937, in935);
__m512 tmp6310 = _mm512_sub_ps(in945, in943);
__m512 tmp6307 = _mm512_add_ps(in935, in939);
__m512 tmp6311 = _mm512_add_ps(in943, in947);
in933 = _mm512_sub_ps(in933, in939);
in941 = _mm512_sub_ps(in941, in947);
tmp6305 = _mm512_fmadd_ps(in936, _mm512_set1_ps(-4.25e+00f), tmp6305);
tmp6309 = _mm512_fmadd_ps(in944, _mm512_set1_ps(-4.25e+00f), tmp6309);
tmp6307 = _mm512_fmadd_ps(in937, _mm512_set1_ps(-4.25e+00f), tmp6307);
tmp6311 = _mm512_fmadd_ps(in945, _mm512_set1_ps(-4.25e+00f), tmp6311);
in933 = _mm512_fmadd_ps(tmp6306, _mm512_set1_ps(5.25e+00f), in933);
in941 = _mm512_fmadd_ps(tmp6310, _mm512_set1_ps(5.25e+00f), in941);
tmp6306 = _mm512_fmadd_ps(in935, _mm512_set1_ps(2.5e-01f), in939);
tmp6310 = _mm512_fmadd_ps(in943, _mm512_set1_ps(2.5e-01f), in947);
in935 = _mm512_fmadd_ps(in935, _mm512_set1_ps(4e+00f), in939);
in943 = _mm512_fmadd_ps(in943, _mm512_set1_ps(4e+00f), in947);
__m512 tmp6308 = _mm512_sub_ps(tmp6307, tmp6305);
__m512 tmp6312 = _mm512_sub_ps(tmp6311, tmp6309);
tmp6307 = _mm512_add_ps(tmp6305, tmp6307);
tmp6311 = _mm512_add_ps(tmp6309, tmp6311);
tmp6305 = _mm512_fmadd_ps(in934, _mm512_set1_ps(2.5e-01f), in938);
tmp6309 = _mm512_fmadd_ps(in942, _mm512_set1_ps(2.5e-01f), in946);
tmp6306 = _mm512_fmadd_ps(in937, _mm512_set1_ps(-1.25e+00f), tmp6306);
tmp6310 = _mm512_fmadd_ps(in945, _mm512_set1_ps(-1.25e+00f), tmp6310);
in937 = _mm512_fmadd_ps(in937, _mm512_set1_ps(-5e+00f), in935);
in945 = _mm512_fmadd_ps(in945, _mm512_set1_ps(-5e+00f), in943);
tmp6305 = _mm512_fmadd_ps(in936, _mm512_set1_ps(-1.25e+00f), tmp6305);
tmp6309 = _mm512_fmadd_ps(in944, _mm512_set1_ps(-1.25e+00f), tmp6309);
in939 = _mm512_fmadd_ps(tmp6305, _mm512_set1_ps(2e+00f), tmp6306);
in947 = _mm512_fmadd_ps(tmp6309, _mm512_set1_ps(2e+00f), tmp6310);
tmp6306 = _mm512_fnmadd_ps(tmp6305, _mm512_set1_ps(2e+00f), tmp6306);
tmp6310 = _mm512_fnmadd_ps(tmp6309, _mm512_set1_ps(2e+00f), tmp6310);
tmp6305 = _mm512_fmadd_ps(in938, _mm512_set1_ps(2.5e-01f), in934);
tmp6309 = _mm512_fmadd_ps(in946, _mm512_set1_ps(2.5e-01f), in942);
in934 = _mm512_sub_ps(in940, in934);
in942 = _mm512_sub_ps(in948, in942);
tmp6305 = _mm512_fmadd_ps(in936, _mm512_set1_ps(-1.25e+00f), tmp6305);
tmp6309 = _mm512_fmadd_ps(in944, _mm512_set1_ps(-1.25e+00f), tmp6309);
in936 = _mm512_sub_ps(in936, in938);
in944 = _mm512_sub_ps(in944, in946);
in936 = _mm512_fmadd_ps(in936, _mm512_set1_ps(5.25e+00f), in934);
in944 = _mm512_fmadd_ps(in944, _mm512_set1_ps(5.25e+00f), in942);
in935 = _mm512_fmadd_ps(tmp6305, _mm512_set1_ps(2e+00f), in937);
in943 = _mm512_fmadd_ps(tmp6309, _mm512_set1_ps(2e+00f), in945);
in937 = _mm512_fnmadd_ps(tmp6305, _mm512_set1_ps(2e+00f), in937);
in945 = _mm512_fnmadd_ps(tmp6309, _mm512_set1_ps(2e+00f), in945);
__m512 tmp6321 = _mm512_unpacklo_ps(in933, tmp6307);
__m512 tmp6322 = _mm512_unpackhi_ps(in933, tmp6307);
__m512 tmp6323 = _mm512_unpacklo_ps(tmp6308, in939);
__m512 tmp6324 = _mm512_unpackhi_ps(tmp6308, in939);
__m512 tmp6325 = _mm512_unpacklo_ps(tmp6306, in935);
__m512 tmp6326 = _mm512_unpackhi_ps(tmp6306, in935);
__m512 tmp6327 = _mm512_unpacklo_ps(in937, in936);
__m512 tmp6328 = _mm512_unpackhi_ps(in937, in936);
__m512 tmp6329 = _mm512_unpacklo_ps(in941, tmp6311);
__m512 tmp6330 = _mm512_unpackhi_ps(in941, tmp6311);
__m512 tmp6331 = _mm512_unpacklo_ps(tmp6312, in947);
__m512 tmp6332 = _mm512_unpackhi_ps(tmp6312, in947);
__m512 tmp6333 = _mm512_unpacklo_ps(tmp6310, in943);
__m512 tmp6334 = _mm512_unpackhi_ps(tmp6310, in943);
__m512 tmp6335 = _mm512_unpacklo_ps(in945, in944);
__m512 tmp6336 = _mm512_unpackhi_ps(in945, in944);
__m512 tmp6337 = _mm512_shuffle_ps(tmp6321, tmp6323, 68);
__m512 tmp6338 = _mm512_shuffle_ps(tmp6321, tmp6323, 238);
__m512 tmp6339 = _mm512_shuffle_ps(tmp6322, tmp6324, 68);
__m512 tmp6340 = _mm512_shuffle_ps(tmp6322, tmp6324, 238);
__m512 tmp6341 = _mm512_shuffle_ps(tmp6325, tmp6327, 68);
__m512 tmp6342 = _mm512_shuffle_ps(tmp6325, tmp6327, 238);
__m512 tmp6343 = _mm512_shuffle_ps(tmp6326, tmp6328, 68);
__m512 tmp6344 = _mm512_shuffle_ps(tmp6326, tmp6328, 238);
__m512 tmp6345 = _mm512_shuffle_ps(tmp6329, tmp6331, 68);
__m512 tmp6346 = _mm512_shuffle_ps(tmp6329, tmp6331, 238);
__m512 tmp6347 = _mm512_shuffle_ps(tmp6330, tmp6332, 68);
__m512 tmp6348 = _mm512_shuffle_ps(tmp6330, tmp6332, 238);
__m512 tmp6349 = _mm512_shuffle_ps(tmp6333, tmp6335, 68);
__m512 tmp6350 = _mm512_shuffle_ps(tmp6333, tmp6335, 238);
__m512 tmp6351 = _mm512_shuffle_ps(tmp6334, tmp6336, 68);
__m512 tmp6352 = _mm512_shuffle_ps(tmp6334, tmp6336, 238);
__m512 tmp6353 = _mm512_shuffle_f32x4(tmp6337, tmp6341, 136);
__m512 tmp6354 = _mm512_shuffle_f32x4(tmp6337, tmp6341, 221);
__m512 tmp6355 = _mm512_shuffle_f32x4(tmp6338, tmp6342, 136);
__m512 tmp6356 = _mm512_shuffle_f32x4(tmp6338, tmp6342, 221);
__m512 tmp6357 = _mm512_shuffle_f32x4(tmp6339, tmp6343, 136);
__m512 tmp6358 = _mm512_shuffle_f32x4(tmp6339, tmp6343, 221);
__m512 tmp6359 = _mm512_shuffle_f32x4(tmp6340, tmp6344, 136);
__m512 tmp6360 = _mm512_shuffle_f32x4(tmp6340, tmp6344, 221);
__m512 tmp6361 = _mm512_shuffle_f32x4(tmp6345, tmp6349, 136);
__m512 tmp6362 = _mm512_shuffle_f32x4(tmp6345, tmp6349, 221);
__m512 tmp6363 = _mm512_shuffle_f32x4(tmp6346, tmp6350, 136);
__m512 tmp6364 = _mm512_shuffle_f32x4(tmp6346, tmp6350, 221);
__m512 tmp6365 = _mm512_shuffle_f32x4(tmp6347, tmp6351, 136);
__m512 tmp6366 = _mm512_shuffle_f32x4(tmp6347, tmp6351, 221);
__m512 tmp6367 = _mm512_shuffle_f32x4(tmp6348, tmp6352, 136);
__m512 tmp6368 = _mm512_shuffle_f32x4(tmp6348, tmp6352, 221);
in933 = _mm512_shuffle_f32x4(tmp6353, tmp6361, 136);
in941 = _mm512_shuffle_f32x4(tmp6353, tmp6361, 221);
tmp6307 = _mm512_shuffle_f32x4(tmp6355, tmp6363, 136);
tmp6311 = _mm512_shuffle_f32x4(tmp6355, tmp6363, 221);
tmp6308 = _mm512_shuffle_f32x4(tmp6357, tmp6365, 136);
tmp6312 = _mm512_shuffle_f32x4(tmp6357, tmp6365, 221);
in939 = _mm512_shuffle_f32x4(tmp6359, tmp6367, 136);
in947 = _mm512_shuffle_f32x4(tmp6359, tmp6367, 221);
tmp6306 = _mm512_shuffle_f32x4(tmp6354, tmp6362, 136);
tmp6310 = _mm512_shuffle_f32x4(tmp6354, tmp6362, 221);
in935 = _mm512_shuffle_f32x4(tmp6356, tmp6364, 136);
in943 = _mm512_shuffle_f32x4(tmp6356, tmp6364, 221);
in937 = _mm512_shuffle_f32x4(tmp6358, tmp6366, 136);
in945 = _mm512_shuffle_f32x4(tmp6358, tmp6366, 221);
in936 = _mm512_shuffle_f32x4(tmp6360, tmp6368, 136);
in944 = _mm512_shuffle_f32x4(tmp6360, tmp6368, 221);
__m512 tmp6313 = _mm512_add_ps(tmp6307, in935);
__m512 tmp6317 = _mm512_add_ps(tmp6311, in943);
__m512 tmp6314 = _mm512_sub_ps(tmp6306, tmp6308);
__m512 tmp6318 = _mm512_sub_ps(tmp6310, tmp6312);
__m512 tmp6315 = _mm512_add_ps(tmp6308, in937);
__m512 tmp6319 = _mm512_add_ps(tmp6312, in945);
in933 = _mm512_sub_ps(in933, in937);
in941 = _mm512_sub_ps(in941, in945);
// Tail of a transform pass whose first statements lie above this point.
// Statements come in pairs: each pair applies the same operation to two
// independent register sets (tmp6313/tmp6317, tmp6314/tmp6318,
// tmp6315/tmp6319, tmp6306/tmp6310, ...).
// The constants (-4.25, 5.25, 0.25, 4, -1.25, -5, 2) and the statement pattern
// are the same as in the complete passes that follow in this file.
tmp6313 = _mm512_fmadd_ps(in939, _mm512_set1_ps(-4.25e+00f), tmp6313);
tmp6317 = _mm512_fmadd_ps(in947, _mm512_set1_ps(-4.25e+00f), tmp6317);
tmp6315 = _mm512_fmadd_ps(tmp6306, _mm512_set1_ps(-4.25e+00f), tmp6315);
tmp6319 = _mm512_fmadd_ps(tmp6310, _mm512_set1_ps(-4.25e+00f), tmp6319);
in933 = _mm512_fmadd_ps(tmp6314, _mm512_set1_ps(5.25e+00f), in933);
in941 = _mm512_fmadd_ps(tmp6318, _mm512_set1_ps(5.25e+00f), in941);
tmp6314 = _mm512_fmadd_ps(tmp6308, _mm512_set1_ps(2.5e-01f), in937);
tmp6318 = _mm512_fmadd_ps(tmp6312, _mm512_set1_ps(2.5e-01f), in945);
tmp6308 = _mm512_fmadd_ps(tmp6308, _mm512_set1_ps(4e+00f), in937);
tmp6312 = _mm512_fmadd_ps(tmp6312, _mm512_set1_ps(4e+00f), in945);
// Difference and sum of the two partial sums accumulated so far.
__m512 tmp6316 = _mm512_sub_ps(tmp6315, tmp6313);
__m512 tmp6320 = _mm512_sub_ps(tmp6319, tmp6317);
tmp6315 = _mm512_add_ps(tmp6313, tmp6315);
tmp6319 = _mm512_add_ps(tmp6317, tmp6319);
// tmp6313/tmp6317 are recycled as scratch for the next pair of outputs.
tmp6313 = _mm512_fmadd_ps(tmp6307, _mm512_set1_ps(2.5e-01f), in935);
tmp6317 = _mm512_fmadd_ps(tmp6311, _mm512_set1_ps(2.5e-01f), in943);
tmp6314 = _mm512_fmadd_ps(tmp6306, _mm512_set1_ps(-1.25e+00f), tmp6314);
tmp6318 = _mm512_fmadd_ps(tmp6310, _mm512_set1_ps(-1.25e+00f), tmp6318);
tmp6306 = _mm512_fmadd_ps(tmp6306, _mm512_set1_ps(-5e+00f), tmp6308);
tmp6310 = _mm512_fmadd_ps(tmp6310, _mm512_set1_ps(-5e+00f), tmp6312);
tmp6313 = _mm512_fmadd_ps(in939, _mm512_set1_ps(-1.25e+00f), tmp6313);
tmp6317 = _mm512_fmadd_ps(in947, _mm512_set1_ps(-1.25e+00f), tmp6317);
// fmadd gives (even part + 2*odd part), fnmadd gives (even part - 2*odd part).
in937 = _mm512_fmadd_ps(tmp6313, _mm512_set1_ps(2e+00f), tmp6314);
in945 = _mm512_fmadd_ps(tmp6317, _mm512_set1_ps(2e+00f), tmp6318);
tmp6314 = _mm512_fnmadd_ps(tmp6313, _mm512_set1_ps(2e+00f), tmp6314);
tmp6318 = _mm512_fnmadd_ps(tmp6317, _mm512_set1_ps(2e+00f), tmp6318);
tmp6313 = _mm512_fmadd_ps(in935, _mm512_set1_ps(2.5e-01f), tmp6307);
tmp6317 = _mm512_fmadd_ps(in943, _mm512_set1_ps(2.5e-01f), tmp6311);
tmp6307 = _mm512_sub_ps(in936, tmp6307);
tmp6311 = _mm512_sub_ps(in944, tmp6311);
tmp6313 = _mm512_fmadd_ps(in939, _mm512_set1_ps(-1.25e+00f), tmp6313);
tmp6317 = _mm512_fmadd_ps(in947, _mm512_set1_ps(-1.25e+00f), tmp6317);
in939 = _mm512_sub_ps(in939, in935);
in947 = _mm512_sub_ps(in947, in943);
in939 = _mm512_fmadd_ps(in939, _mm512_set1_ps(5.25e+00f), tmp6307);
in947 = _mm512_fmadd_ps(in947, _mm512_set1_ps(5.25e+00f), tmp6311);
tmp6308 = _mm512_fmadd_ps(tmp6313, _mm512_set1_ps(2e+00f), tmp6306);
tmp6312 = _mm512_fmadd_ps(tmp6317, _mm512_set1_ps(2e+00f), tmp6310);
tmp6306 = _mm512_fnmadd_ps(tmp6313, _mm512_set1_ps(2e+00f), tmp6306);
tmp6310 = _mm512_fnmadd_ps(tmp6317, _mm512_set1_ps(2e+00f), tmp6310);
// Repack the sixteen result registers into sixteen store vectors.
// _mm512_shuffle_f32x4 with imm 68 concatenates the low 256 bits of both
// operands; imm 238 concatenates their high 256 bits. Each out register thus
// pairs the matching half of two consecutive results.
__m512 out883 = _mm512_shuffle_f32x4(in933, tmp6315, 68);
__m512 out891 = _mm512_shuffle_f32x4(in933, tmp6315, 238);
__m512 out884 = _mm512_shuffle_f32x4(tmp6316, in937, 68);
__m512 out892 = _mm512_shuffle_f32x4(tmp6316, in937, 238);
__m512 out885 = _mm512_shuffle_f32x4(tmp6314, tmp6308, 68);
__m512 out893 = _mm512_shuffle_f32x4(tmp6314, tmp6308, 238);
__m512 out886 = _mm512_shuffle_f32x4(tmp6306, in939, 68);
__m512 out894 = _mm512_shuffle_f32x4(tmp6306, in939, 238);
__m512 out887 = _mm512_shuffle_f32x4(in941, tmp6319, 68);
__m512 out895 = _mm512_shuffle_f32x4(in941, tmp6319, 238);
__m512 out888 = _mm512_shuffle_f32x4(tmp6320, in945, 68);
__m512 out896 = _mm512_shuffle_f32x4(tmp6320, in945, 238);
__m512 out889 = _mm512_shuffle_f32x4(tmp6318, tmp6312, 68);
__m512 out897 = _mm512_shuffle_f32x4(tmp6318, tmp6312, 238);
__m512 out890 = _mm512_shuffle_f32x4(tmp6310, in947, 68);
__m512 out898 = _mm512_shuffle_f32x4(tmp6310, in947, 238);
// Unaligned stores into dfPtr6. Four vectors go to offsets 512/576/640/704,
// and the same pattern repeats at +409600, +819200 and +1228800 (one region
// per pair of results). k83 advances the destination by 768 per iteration.
// NOTE(review): the 64-wide offset steps equal one 512-bit vector only if
// dfPtr6 is a byte pointer - confirm against its declaration.
_mm512_storeu_ps(dfPtr6+512+1638400*i26+24576*j21+24576*s20+768*k83, out883);
_mm512_storeu_ps(dfPtr6+640+1638400*i26+24576*j21+24576*s20+768*k83, out891);
_mm512_storeu_ps(dfPtr6+576+1638400*i26+24576*j21+24576*s20+768*k83, out887);
_mm512_storeu_ps(dfPtr6+704+1638400*i26+24576*j21+24576*s20+768*k83, out895);
_mm512_storeu_ps(dfPtr6+410112+1638400*i26+24576*j21+24576*s20+768*k83, out884);
_mm512_storeu_ps(dfPtr6+410240+1638400*i26+24576*j21+24576*s20+768*k83, out892);
_mm512_storeu_ps(dfPtr6+410176+1638400*i26+24576*j21+24576*s20+768*k83, out888);
_mm512_storeu_ps(dfPtr6+410304+1638400*i26+24576*j21+24576*s20+768*k83, out896);
_mm512_storeu_ps(dfPtr6+819712+1638400*i26+24576*j21+24576*s20+768*k83, out885);
_mm512_storeu_ps(dfPtr6+819840+1638400*i26+24576*j21+24576*s20+768*k83, out893);
_mm512_storeu_ps(dfPtr6+819776+1638400*i26+24576*j21+24576*s20+768*k83, out889);
_mm512_storeu_ps(dfPtr6+819904+1638400*i26+24576*j21+24576*s20+768*k83, out897);
_mm512_storeu_ps(dfPtr6+1229312+1638400*i26+24576*j21+24576*s20+768*k83, out886);
_mm512_storeu_ps(dfPtr6+1229440+1638400*i26+24576*j21+24576*s20+768*k83, out894);
_mm512_storeu_ps(dfPtr6+1229376+1638400*i26+24576*j21+24576*s20+768*k83, out890);
_mm512_storeu_ps(dfPtr6+1229504+1638400*i26+24576*j21+24576*s20+768*k83, out898);
}
// Leave the function once j21 has reached last5.
// NOTE(review): presumably last5 is the final j21 index assigned to this call - confirm.
if (j21 >= last5) return;
// Otherwise advance j21 and set rel14 to 2, which satisfies the
// `rel14 < 3` test that follows, so execution falls through into that branch.
++j21;
rel14 = 2;
}
// Branch taken whenever rel14 < 3 (this includes the fall-through path just
// above, which sets rel14 = 2).
if (rel14 < 3) {
// Fixed source position for this branch: h33 = base14+6, w40 = 24.
ptrdiff_t h33 = base14+6;
ptrdiff_t w40 = 24;
// 32 iterations; each one moves the source address by 25216 and the
// destination address by 768 (see the 25216*k84 and 768*k84 terms).
ptrdiff_t k84 = 0;
for (; k84 != 32; ++k84) {
// Load group 0: eight pairs of masked loads.
// - Mask 16383 (0x3FFF) reads elements 0-13 and zeroes lanes 14-15.
// - The pairs step by 224, the coefficient of h33 in the address, so
//   they address h33, h33+1, ..., h33+7.
// - The second load of each pair is +48 = 4*12, i.e. the same formula
//   with w40+12.
// - pm130 sends elements 0-7 to lanes 0-7 and elements 6-13 to lanes 8-15,
//   so each register carries two 8-element windows overlapping by two.
// NOTE(review): offsets look like byte offsets into float data (4*w40) -
// confirm against the declaration of datPtr12.
__m512 dat1477 = _mm512_maskz_loadu_ps(16383, datPtr12+0+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1478 = _mm512_maskz_loadu_ps(16383, datPtr12+48+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512i pm130 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in949 = _mm512_permutexvar_ps(pm130, dat1477);
__m512 in957 = _mm512_permutexvar_ps(pm130, dat1478);
__m512 dat1479 = _mm512_maskz_loadu_ps(16383, datPtr12+224+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1480 = _mm512_maskz_loadu_ps(16383, datPtr12+272+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in950 = _mm512_permutexvar_ps(pm130, dat1479);
__m512 in958 = _mm512_permutexvar_ps(pm130, dat1480);
__m512 dat1481 = _mm512_maskz_loadu_ps(16383, datPtr12+448+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1482 = _mm512_maskz_loadu_ps(16383, datPtr12+496+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in951 = _mm512_permutexvar_ps(pm130, dat1481);
__m512 in959 = _mm512_permutexvar_ps(pm130, dat1482);
__m512 dat1483 = _mm512_maskz_loadu_ps(16383, datPtr12+672+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1484 = _mm512_maskz_loadu_ps(16383, datPtr12+720+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in952 = _mm512_permutexvar_ps(pm130, dat1483);
__m512 in960 = _mm512_permutexvar_ps(pm130, dat1484);
__m512 dat1485 = _mm512_maskz_loadu_ps(16383, datPtr12+896+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1486 = _mm512_maskz_loadu_ps(16383, datPtr12+944+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in953 = _mm512_permutexvar_ps(pm130, dat1485);
__m512 in961 = _mm512_permutexvar_ps(pm130, dat1486);
__m512 dat1487 = _mm512_maskz_loadu_ps(16383, datPtr12+1120+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1488 = _mm512_maskz_loadu_ps(16383, datPtr12+1168+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in954 = _mm512_permutexvar_ps(pm130, dat1487);
__m512 in962 = _mm512_permutexvar_ps(pm130, dat1488);
__m512 dat1489 = _mm512_maskz_loadu_ps(16383, datPtr12+1344+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1490 = _mm512_maskz_loadu_ps(16383, datPtr12+1392+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in955 = _mm512_permutexvar_ps(pm130, dat1489);
__m512 in963 = _mm512_permutexvar_ps(pm130, dat1490);
__m512 dat1491 = _mm512_maskz_loadu_ps(16383, datPtr12+1568+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1492 = _mm512_maskz_loadu_ps(16383, datPtr12+1616+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in956 = _mm512_permutexvar_ps(pm130, dat1491);
__m512 in964 = _mm512_permutexvar_ps(pm130, dat1492);
// Pass 1: lane-wise linear combination of the eight loaded rows.
// With d0..d7 = in949..in956 on entry (the in957..in964 set gets the
// identical treatment on the interleaved lines), the registers finally hold
// (algebraically; evaluated with fused multiply-adds):
//   in949   = d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp6371 = (d2 - 4.25*d4 + d6) + (d1 - 4.25*d3 + d5)
//   tmp6372 = (d2 - 4.25*d4 + d6) - (d1 - 4.25*d3 + d5)
//   in955   = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   tmp6370 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   in951   = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   in953   = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   in952   = -d1 + 5.25*d3 - 5.25*d5 + d7
// These coefficients match the 8-point Winograd F(6,3) input transform.
// NOTE(review): presumably that is the intent - confirm against the generator.
// Input registers are overwritten in place, so statement order matters.
__m512 tmp6369 = _mm512_add_ps(in950, in954);
__m512 tmp6373 = _mm512_add_ps(in958, in962);
__m512 tmp6370 = _mm512_sub_ps(in953, in951);
__m512 tmp6374 = _mm512_sub_ps(in961, in959);
__m512 tmp6371 = _mm512_add_ps(in951, in955);
__m512 tmp6375 = _mm512_add_ps(in959, in963);
in949 = _mm512_sub_ps(in949, in955);
in957 = _mm512_sub_ps(in957, in963);
tmp6369 = _mm512_fmadd_ps(in952, _mm512_set1_ps(-4.25e+00f), tmp6369);
tmp6373 = _mm512_fmadd_ps(in960, _mm512_set1_ps(-4.25e+00f), tmp6373);
tmp6371 = _mm512_fmadd_ps(in953, _mm512_set1_ps(-4.25e+00f), tmp6371);
tmp6375 = _mm512_fmadd_ps(in961, _mm512_set1_ps(-4.25e+00f), tmp6375);
in949 = _mm512_fmadd_ps(tmp6370, _mm512_set1_ps(5.25e+00f), in949);
in957 = _mm512_fmadd_ps(tmp6374, _mm512_set1_ps(5.25e+00f), in957);
// in955/in963 still hold the original d6 here; they are overwritten further down.
tmp6370 = _mm512_fmadd_ps(in951, _mm512_set1_ps(2.5e-01f), in955);
tmp6374 = _mm512_fmadd_ps(in959, _mm512_set1_ps(2.5e-01f), in963);
in951 = _mm512_fmadd_ps(in951, _mm512_set1_ps(4e+00f), in955);
in959 = _mm512_fmadd_ps(in959, _mm512_set1_ps(4e+00f), in963);
__m512 tmp6372 = _mm512_sub_ps(tmp6371, tmp6369);
__m512 tmp6376 = _mm512_sub_ps(tmp6375, tmp6373);
tmp6371 = _mm512_add_ps(tmp6369, tmp6371);
tmp6375 = _mm512_add_ps(tmp6373, tmp6375);
// tmp6369/tmp6373 are recycled as scratch for the odd-index terms.
tmp6369 = _mm512_fmadd_ps(in950, _mm512_set1_ps(2.5e-01f), in954);
tmp6373 = _mm512_fmadd_ps(in958, _mm512_set1_ps(2.5e-01f), in962);
tmp6370 = _mm512_fmadd_ps(in953, _mm512_set1_ps(-1.25e+00f), tmp6370);
tmp6374 = _mm512_fmadd_ps(in961, _mm512_set1_ps(-1.25e+00f), tmp6374);
in953 = _mm512_fmadd_ps(in953, _mm512_set1_ps(-5e+00f), in951);
in961 = _mm512_fmadd_ps(in961, _mm512_set1_ps(-5e+00f), in959);
tmp6369 = _mm512_fmadd_ps(in952, _mm512_set1_ps(-1.25e+00f), tmp6369);
tmp6373 = _mm512_fmadd_ps(in960, _mm512_set1_ps(-1.25e+00f), tmp6373);
in955 = _mm512_fmadd_ps(tmp6369, _mm512_set1_ps(2e+00f), tmp6370);
in963 = _mm512_fmadd_ps(tmp6373, _mm512_set1_ps(2e+00f), tmp6374);
tmp6370 = _mm512_fnmadd_ps(tmp6369, _mm512_set1_ps(2e+00f), tmp6370);
tmp6374 = _mm512_fnmadd_ps(tmp6373, _mm512_set1_ps(2e+00f), tmp6374);
tmp6369 = _mm512_fmadd_ps(in954, _mm512_set1_ps(2.5e-01f), in950);
tmp6373 = _mm512_fmadd_ps(in962, _mm512_set1_ps(2.5e-01f), in958);
in950 = _mm512_sub_ps(in956, in950);
in958 = _mm512_sub_ps(in964, in958);
tmp6369 = _mm512_fmadd_ps(in952, _mm512_set1_ps(-1.25e+00f), tmp6369);
tmp6373 = _mm512_fmadd_ps(in960, _mm512_set1_ps(-1.25e+00f), tmp6373);
in952 = _mm512_sub_ps(in952, in954);
in960 = _mm512_sub_ps(in960, in962);
in952 = _mm512_fmadd_ps(in952, _mm512_set1_ps(5.25e+00f), in950);
in960 = _mm512_fmadd_ps(in960, _mm512_set1_ps(5.25e+00f), in958);
in951 = _mm512_fmadd_ps(tmp6369, _mm512_set1_ps(2e+00f), in953);
in959 = _mm512_fmadd_ps(tmp6373, _mm512_set1_ps(2e+00f), in961);
in953 = _mm512_fnmadd_ps(tmp6369, _mm512_set1_ps(2e+00f), in953);
in961 = _mm512_fnmadd_ps(tmp6373, _mm512_set1_ps(2e+00f), in961);
// Transpose step between the two passes.
// Stage 1 (unpacklo/unpackhi): interleave pass-1 results pairwise within each
// 128-bit lane, in the order 0..7 = in949, tmp6371, tmp6372, in955, tmp6370,
// in951, in953, in952 (and the matching in957... set).
__m512 tmp6385 = _mm512_unpacklo_ps(in949, tmp6371);
__m512 tmp6386 = _mm512_unpackhi_ps(in949, tmp6371);
__m512 tmp6387 = _mm512_unpacklo_ps(tmp6372, in955);
__m512 tmp6388 = _mm512_unpackhi_ps(tmp6372, in955);
__m512 tmp6389 = _mm512_unpacklo_ps(tmp6370, in951);
__m512 tmp6390 = _mm512_unpackhi_ps(tmp6370, in951);
__m512 tmp6391 = _mm512_unpacklo_ps(in953, in952);
__m512 tmp6392 = _mm512_unpackhi_ps(in953, in952);
__m512 tmp6393 = _mm512_unpacklo_ps(in957, tmp6375);
__m512 tmp6394 = _mm512_unpackhi_ps(in957, tmp6375);
__m512 tmp6395 = _mm512_unpacklo_ps(tmp6376, in963);
__m512 tmp6396 = _mm512_unpackhi_ps(tmp6376, in963);
__m512 tmp6397 = _mm512_unpacklo_ps(tmp6374, in959);
__m512 tmp6398 = _mm512_unpackhi_ps(tmp6374, in959);
__m512 tmp6399 = _mm512_unpacklo_ps(in961, in960);
__m512 tmp6400 = _mm512_unpackhi_ps(in961, in960);
// Stage 2 (shuffle_ps, imm 68 = low 64 bits of each operand, imm 238 = high
// 64 bits): completes a 4x4 transpose inside every 128-bit lane, so each lane
// now holds one element position taken from four consecutive results.
__m512 tmp6401 = _mm512_shuffle_ps(tmp6385, tmp6387, 68);
__m512 tmp6402 = _mm512_shuffle_ps(tmp6385, tmp6387, 238);
__m512 tmp6403 = _mm512_shuffle_ps(tmp6386, tmp6388, 68);
__m512 tmp6404 = _mm512_shuffle_ps(tmp6386, tmp6388, 238);
__m512 tmp6405 = _mm512_shuffle_ps(tmp6389, tmp6391, 68);
__m512 tmp6406 = _mm512_shuffle_ps(tmp6389, tmp6391, 238);
__m512 tmp6407 = _mm512_shuffle_ps(tmp6390, tmp6392, 68);
__m512 tmp6408 = _mm512_shuffle_ps(tmp6390, tmp6392, 238);
__m512 tmp6409 = _mm512_shuffle_ps(tmp6393, tmp6395, 68);
__m512 tmp6410 = _mm512_shuffle_ps(tmp6393, tmp6395, 238);
__m512 tmp6411 = _mm512_shuffle_ps(tmp6394, tmp6396, 68);
__m512 tmp6412 = _mm512_shuffle_ps(tmp6394, tmp6396, 238);
__m512 tmp6413 = _mm512_shuffle_ps(tmp6397, tmp6399, 68);
__m512 tmp6414 = _mm512_shuffle_ps(tmp6397, tmp6399, 238);
__m512 tmp6415 = _mm512_shuffle_ps(tmp6398, tmp6400, 68);
__m512 tmp6416 = _mm512_shuffle_ps(tmp6398, tmp6400, 238);
// Stage 3 (shuffle_f32x4, imm 136 = 128-bit lanes 0 and 2 of each operand,
// imm 221 = lanes 1 and 3): merge results 0-3 with results 4-7.
__m512 tmp6417 = _mm512_shuffle_f32x4(tmp6401, tmp6405, 136);
__m512 tmp6418 = _mm512_shuffle_f32x4(tmp6401, tmp6405, 221);
__m512 tmp6419 = _mm512_shuffle_f32x4(tmp6402, tmp6406, 136);
__m512 tmp6420 = _mm512_shuffle_f32x4(tmp6402, tmp6406, 221);
__m512 tmp6421 = _mm512_shuffle_f32x4(tmp6403, tmp6407, 136);
__m512 tmp6422 = _mm512_shuffle_f32x4(tmp6403, tmp6407, 221);
__m512 tmp6423 = _mm512_shuffle_f32x4(tmp6404, tmp6408, 136);
__m512 tmp6424 = _mm512_shuffle_f32x4(tmp6404, tmp6408, 221);
__m512 tmp6425 = _mm512_shuffle_f32x4(tmp6409, tmp6413, 136);
__m512 tmp6426 = _mm512_shuffle_f32x4(tmp6409, tmp6413, 221);
__m512 tmp6427 = _mm512_shuffle_f32x4(tmp6410, tmp6414, 136);
__m512 tmp6428 = _mm512_shuffle_f32x4(tmp6410, tmp6414, 221);
__m512 tmp6429 = _mm512_shuffle_f32x4(tmp6411, tmp6415, 136);
__m512 tmp6430 = _mm512_shuffle_f32x4(tmp6411, tmp6415, 221);
__m512 tmp6431 = _mm512_shuffle_f32x4(tmp6412, tmp6416, 136);
__m512 tmp6432 = _mm512_shuffle_f32x4(tmp6412, tmp6416, 221);
// Stage 4: combine the two register sets. Each destination holds one element
// position (0..7 within an 8-wide window) across all eight pass-1 results:
// low 256 bits from the first set, high 256 bits from the second set.
// The imm-136 destinations (in949, tmp6371, ...) take the first window of
// each source register; the imm-221 ones (in957, tmp6375, ...) the second.
in949 = _mm512_shuffle_f32x4(tmp6417, tmp6425, 136);
in957 = _mm512_shuffle_f32x4(tmp6417, tmp6425, 221);
tmp6371 = _mm512_shuffle_f32x4(tmp6419, tmp6427, 136);
tmp6375 = _mm512_shuffle_f32x4(tmp6419, tmp6427, 221);
tmp6372 = _mm512_shuffle_f32x4(tmp6421, tmp6429, 136);
tmp6376 = _mm512_shuffle_f32x4(tmp6421, tmp6429, 221);
in955 = _mm512_shuffle_f32x4(tmp6423, tmp6431, 136);
in963 = _mm512_shuffle_f32x4(tmp6423, tmp6431, 221);
tmp6370 = _mm512_shuffle_f32x4(tmp6418, tmp6426, 136);
tmp6374 = _mm512_shuffle_f32x4(tmp6418, tmp6426, 221);
in951 = _mm512_shuffle_f32x4(tmp6420, tmp6428, 136);
in959 = _mm512_shuffle_f32x4(tmp6420, tmp6428, 221);
in953 = _mm512_shuffle_f32x4(tmp6422, tmp6430, 136);
in961 = _mm512_shuffle_f32x4(tmp6422, tmp6430, 221);
in952 = _mm512_shuffle_f32x4(tmp6424, tmp6432, 136);
in960 = _mm512_shuffle_f32x4(tmp6424, tmp6432, 221);
// Pass 2: the same 8-point linear combination as pass 1, now applied along
// the other axis of the transposed data.
// Inputs c0..c7 = in949, tmp6371, tmp6372, in955, tmp6370, in951, in953, in952
// (second set: in957, tmp6375, tmp6376, in963, tmp6374, in959, in961, in960).
// Results 0..7 end up in in949, tmp6379, tmp6380, in953, tmp6378, tmp6372,
// tmp6370, in955 (second set: in957, tmp6383, tmp6384, in961, tmp6382,
// tmp6376, tmp6374, in963).
// Registers are reused in place, so statement order matters.
__m512 tmp6377 = _mm512_add_ps(tmp6371, in951);
__m512 tmp6381 = _mm512_add_ps(tmp6375, in959);
__m512 tmp6378 = _mm512_sub_ps(tmp6370, tmp6372);
__m512 tmp6382 = _mm512_sub_ps(tmp6374, tmp6376);
__m512 tmp6379 = _mm512_add_ps(tmp6372, in953);
__m512 tmp6383 = _mm512_add_ps(tmp6376, in961);
in949 = _mm512_sub_ps(in949, in953);
in957 = _mm512_sub_ps(in957, in961);
tmp6377 = _mm512_fmadd_ps(in955, _mm512_set1_ps(-4.25e+00f), tmp6377);
tmp6381 = _mm512_fmadd_ps(in963, _mm512_set1_ps(-4.25e+00f), tmp6381);
tmp6379 = _mm512_fmadd_ps(tmp6370, _mm512_set1_ps(-4.25e+00f), tmp6379);
tmp6383 = _mm512_fmadd_ps(tmp6374, _mm512_set1_ps(-4.25e+00f), tmp6383);
in949 = _mm512_fmadd_ps(tmp6378, _mm512_set1_ps(5.25e+00f), in949);
in957 = _mm512_fmadd_ps(tmp6382, _mm512_set1_ps(5.25e+00f), in957);
tmp6378 = _mm512_fmadd_ps(tmp6372, _mm512_set1_ps(2.5e-01f), in953);
tmp6382 = _mm512_fmadd_ps(tmp6376, _mm512_set1_ps(2.5e-01f), in961);
tmp6372 = _mm512_fmadd_ps(tmp6372, _mm512_set1_ps(4e+00f), in953);
tmp6376 = _mm512_fmadd_ps(tmp6376, _mm512_set1_ps(4e+00f), in961);
__m512 tmp6380 = _mm512_sub_ps(tmp6379, tmp6377);
__m512 tmp6384 = _mm512_sub_ps(tmp6383, tmp6381);
tmp6379 = _mm512_add_ps(tmp6377, tmp6379);
tmp6383 = _mm512_add_ps(tmp6381, tmp6383);
// tmp6377/tmp6381 are recycled as scratch for the odd-index terms.
tmp6377 = _mm512_fmadd_ps(tmp6371, _mm512_set1_ps(2.5e-01f), in951);
tmp6381 = _mm512_fmadd_ps(tmp6375, _mm512_set1_ps(2.5e-01f), in959);
tmp6378 = _mm512_fmadd_ps(tmp6370, _mm512_set1_ps(-1.25e+00f), tmp6378);
tmp6382 = _mm512_fmadd_ps(tmp6374, _mm512_set1_ps(-1.25e+00f), tmp6382);
tmp6370 = _mm512_fmadd_ps(tmp6370, _mm512_set1_ps(-5e+00f), tmp6372);
tmp6374 = _mm512_fmadd_ps(tmp6374, _mm512_set1_ps(-5e+00f), tmp6376);
tmp6377 = _mm512_fmadd_ps(in955, _mm512_set1_ps(-1.25e+00f), tmp6377);
tmp6381 = _mm512_fmadd_ps(in963, _mm512_set1_ps(-1.25e+00f), tmp6381);
in953 = _mm512_fmadd_ps(tmp6377, _mm512_set1_ps(2e+00f), tmp6378);
in961 = _mm512_fmadd_ps(tmp6381, _mm512_set1_ps(2e+00f), tmp6382);
tmp6378 = _mm512_fnmadd_ps(tmp6377, _mm512_set1_ps(2e+00f), tmp6378);
tmp6382 = _mm512_fnmadd_ps(tmp6381, _mm512_set1_ps(2e+00f), tmp6382);
tmp6377 = _mm512_fmadd_ps(in951, _mm512_set1_ps(2.5e-01f), tmp6371);
tmp6381 = _mm512_fmadd_ps(in959, _mm512_set1_ps(2.5e-01f), tmp6375);
tmp6371 = _mm512_sub_ps(in952, tmp6371);
tmp6375 = _mm512_sub_ps(in960, tmp6375);
tmp6377 = _mm512_fmadd_ps(in955, _mm512_set1_ps(-1.25e+00f), tmp6377);
tmp6381 = _mm512_fmadd_ps(in963, _mm512_set1_ps(-1.25e+00f), tmp6381);
in955 = _mm512_sub_ps(in955, in951);
in963 = _mm512_sub_ps(in963, in959);
in955 = _mm512_fmadd_ps(in955, _mm512_set1_ps(5.25e+00f), tmp6371);
in963 = _mm512_fmadd_ps(in963, _mm512_set1_ps(5.25e+00f), tmp6375);
tmp6372 = _mm512_fmadd_ps(tmp6377, _mm512_set1_ps(2e+00f), tmp6370);
tmp6376 = _mm512_fmadd_ps(tmp6381, _mm512_set1_ps(2e+00f), tmp6374);
tmp6370 = _mm512_fnmadd_ps(tmp6377, _mm512_set1_ps(2e+00f), tmp6370);
tmp6374 = _mm512_fnmadd_ps(tmp6381, _mm512_set1_ps(2e+00f), tmp6374);
// Repack pass-2 results for storing. imm 68 concatenates the low 256 bits of
// both operands and imm 238 their high 256 bits, so every out register holds
// two consecutive results (0|1, 2|3, 4|5, 6|7) for one 8-wide window.
__m512 out899 = _mm512_shuffle_f32x4(in949, tmp6379, 68);
__m512 out907 = _mm512_shuffle_f32x4(in949, tmp6379, 238);
__m512 out900 = _mm512_shuffle_f32x4(tmp6380, in953, 68);
__m512 out908 = _mm512_shuffle_f32x4(tmp6380, in953, 238);
__m512 out901 = _mm512_shuffle_f32x4(tmp6378, tmp6372, 68);
__m512 out909 = _mm512_shuffle_f32x4(tmp6378, tmp6372, 238);
__m512 out902 = _mm512_shuffle_f32x4(tmp6370, in955, 68);
__m512 out910 = _mm512_shuffle_f32x4(tmp6370, in955, 238);
__m512 out903 = _mm512_shuffle_f32x4(in957, tmp6383, 68);
__m512 out911 = _mm512_shuffle_f32x4(in957, tmp6383, 238);
__m512 out904 = _mm512_shuffle_f32x4(tmp6384, in961, 68);
__m512 out912 = _mm512_shuffle_f32x4(tmp6384, in961, 238);
__m512 out905 = _mm512_shuffle_f32x4(tmp6382, tmp6376, 68);
__m512 out913 = _mm512_shuffle_f32x4(tmp6382, tmp6376, 238);
__m512 out906 = _mm512_shuffle_f32x4(tmp6374, in963, 68);
__m512 out914 = _mm512_shuffle_f32x4(tmp6374, in963, 238);
// Stores for group 0 use base offsets 0/64/128/192. The statements are not in
// ascending address order: 0, 128, 64, 192. Result pairs 2|3, 4|5 and 6|7 go
// to the same slots shifted by 409600, 819200 and 1228800.
// NOTE(review): the 64-wide slot spacing equals one 512-bit vector only if
// dfPtr6 is a byte pointer - confirm against its declaration.
_mm512_storeu_ps(dfPtr6+0+1638400*i26+24576*j21+24576*s20+768*k84, out899);
_mm512_storeu_ps(dfPtr6+128+1638400*i26+24576*j21+24576*s20+768*k84, out907);
_mm512_storeu_ps(dfPtr6+64+1638400*i26+24576*j21+24576*s20+768*k84, out903);
_mm512_storeu_ps(dfPtr6+192+1638400*i26+24576*j21+24576*s20+768*k84, out911);
_mm512_storeu_ps(dfPtr6+409600+1638400*i26+24576*j21+24576*s20+768*k84, out900);
_mm512_storeu_ps(dfPtr6+409728+1638400*i26+24576*j21+24576*s20+768*k84, out908);
_mm512_storeu_ps(dfPtr6+409664+1638400*i26+24576*j21+24576*s20+768*k84, out904);
_mm512_storeu_ps(dfPtr6+409792+1638400*i26+24576*j21+24576*s20+768*k84, out912);
_mm512_storeu_ps(dfPtr6+819200+1638400*i26+24576*j21+24576*s20+768*k84, out901);
_mm512_storeu_ps(dfPtr6+819328+1638400*i26+24576*j21+24576*s20+768*k84, out909);
_mm512_storeu_ps(dfPtr6+819264+1638400*i26+24576*j21+24576*s20+768*k84, out905);
_mm512_storeu_ps(dfPtr6+819392+1638400*i26+24576*j21+24576*s20+768*k84, out913);
_mm512_storeu_ps(dfPtr6+1228800+1638400*i26+24576*j21+24576*s20+768*k84, out902);
_mm512_storeu_ps(dfPtr6+1228928+1638400*i26+24576*j21+24576*s20+768*k84, out910);
_mm512_storeu_ps(dfPtr6+1228864+1638400*i26+24576*j21+24576*s20+768*k84, out906);
_mm512_storeu_ps(dfPtr6+1228992+1638400*i26+24576*j21+24576*s20+768*k84, out914);
// Load group 1: eight pairs of masked loads, stepping by 224 per pair (the
// coefficient of h33), i.e. h33 .. h33+7.
// - First load of each pair: offset +96 = 4*24 (same formula with w40+24).
//   Mask 511 (0x1FF) reads only elements 0-8. pm131 maps lanes 0-7 to
//   elements 0-7, lanes 8-10 to elements 6-8, and lanes 11-15 to element 15,
//   which the mask has zeroed - so those lanes are 0.
// - Second load of each pair: offset +12608, half of the 25216 per-k84
//   stride. Mask 16383 reads 14 elements, and pm132 (same indices as the
//   permutation used for the full-width loads) yields windows 0-7 and 6-13.
// NOTE(review): presumably +12608 selects the second of two channels handled
// per k84 iteration - confirm.
__m512 dat1493 = _mm512_maskz_loadu_ps(511, datPtr12+96+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1494 = _mm512_maskz_loadu_ps(16383, datPtr12+12608+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512i pm131 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in965 = _mm512_permutexvar_ps(pm131, dat1493);
__m512i pm132 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in973 = _mm512_permutexvar_ps(pm132, dat1494);
__m512 dat1495 = _mm512_maskz_loadu_ps(511, datPtr12+320+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1496 = _mm512_maskz_loadu_ps(16383, datPtr12+12832+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in966 = _mm512_permutexvar_ps(pm131, dat1495);
__m512 in974 = _mm512_permutexvar_ps(pm132, dat1496);
__m512 dat1497 = _mm512_maskz_loadu_ps(511, datPtr12+544+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1498 = _mm512_maskz_loadu_ps(16383, datPtr12+13056+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in967 = _mm512_permutexvar_ps(pm131, dat1497);
__m512 in975 = _mm512_permutexvar_ps(pm132, dat1498);
__m512 dat1499 = _mm512_maskz_loadu_ps(511, datPtr12+768+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1500 = _mm512_maskz_loadu_ps(16383, datPtr12+13280+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in968 = _mm512_permutexvar_ps(pm131, dat1499);
__m512 in976 = _mm512_permutexvar_ps(pm132, dat1500);
__m512 dat1501 = _mm512_maskz_loadu_ps(511, datPtr12+992+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1502 = _mm512_maskz_loadu_ps(16383, datPtr12+13504+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in969 = _mm512_permutexvar_ps(pm131, dat1501);
__m512 in977 = _mm512_permutexvar_ps(pm132, dat1502);
__m512 dat1503 = _mm512_maskz_loadu_ps(511, datPtr12+1216+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1504 = _mm512_maskz_loadu_ps(16383, datPtr12+13728+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in970 = _mm512_permutexvar_ps(pm131, dat1503);
__m512 in978 = _mm512_permutexvar_ps(pm132, dat1504);
__m512 dat1505 = _mm512_maskz_loadu_ps(511, datPtr12+1440+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1506 = _mm512_maskz_loadu_ps(16383, datPtr12+13952+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in971 = _mm512_permutexvar_ps(pm131, dat1505);
__m512 in979 = _mm512_permutexvar_ps(pm132, dat1506);
__m512 dat1507 = _mm512_maskz_loadu_ps(511, datPtr12+1664+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1508 = _mm512_maskz_loadu_ps(16383, datPtr12+14176+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in972 = _mm512_permutexvar_ps(pm131, dat1507);
__m512 in980 = _mm512_permutexvar_ps(pm132, dat1508);
// Pass 1 for group 1: lane-wise 8-point linear combination of the loaded rows.
// With d0..d7 = in965..in972 on entry (the in973..in980 set is processed on
// the interleaved lines), results 0..7 are left in
//   in965   = d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp6435 = (d2 - 4.25*d4 + d6) + (d1 - 4.25*d3 + d5)
//   tmp6436 = (d2 - 4.25*d4 + d6) - (d1 - 4.25*d3 + d5)
//   in971   = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   tmp6434 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   in967   = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   in969   = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   in968   = -d1 + 5.25*d3 - 5.25*d5 + d7
// Registers are overwritten in place, so statement order matters.
__m512 tmp6433 = _mm512_add_ps(in966, in970);
__m512 tmp6437 = _mm512_add_ps(in974, in978);
__m512 tmp6434 = _mm512_sub_ps(in969, in967);
__m512 tmp6438 = _mm512_sub_ps(in977, in975);
__m512 tmp6435 = _mm512_add_ps(in967, in971);
__m512 tmp6439 = _mm512_add_ps(in975, in979);
in965 = _mm512_sub_ps(in965, in971);
in973 = _mm512_sub_ps(in973, in979);
tmp6433 = _mm512_fmadd_ps(in968, _mm512_set1_ps(-4.25e+00f), tmp6433);
tmp6437 = _mm512_fmadd_ps(in976, _mm512_set1_ps(-4.25e+00f), tmp6437);
tmp6435 = _mm512_fmadd_ps(in969, _mm512_set1_ps(-4.25e+00f), tmp6435);
tmp6439 = _mm512_fmadd_ps(in977, _mm512_set1_ps(-4.25e+00f), tmp6439);
in965 = _mm512_fmadd_ps(tmp6434, _mm512_set1_ps(5.25e+00f), in965);
in973 = _mm512_fmadd_ps(tmp6438, _mm512_set1_ps(5.25e+00f), in973);
// in971/in979 still hold the original d6 here; they are overwritten further down.
tmp6434 = _mm512_fmadd_ps(in967, _mm512_set1_ps(2.5e-01f), in971);
tmp6438 = _mm512_fmadd_ps(in975, _mm512_set1_ps(2.5e-01f), in979);
in967 = _mm512_fmadd_ps(in967, _mm512_set1_ps(4e+00f), in971);
in975 = _mm512_fmadd_ps(in975, _mm512_set1_ps(4e+00f), in979);
__m512 tmp6436 = _mm512_sub_ps(tmp6435, tmp6433);
__m512 tmp6440 = _mm512_sub_ps(tmp6439, tmp6437);
tmp6435 = _mm512_add_ps(tmp6433, tmp6435);
tmp6439 = _mm512_add_ps(tmp6437, tmp6439);
// tmp6433/tmp6437 are recycled as scratch for the odd-index terms.
tmp6433 = _mm512_fmadd_ps(in966, _mm512_set1_ps(2.5e-01f), in970);
tmp6437 = _mm512_fmadd_ps(in974, _mm512_set1_ps(2.5e-01f), in978);
tmp6434 = _mm512_fmadd_ps(in969, _mm512_set1_ps(-1.25e+00f), tmp6434);
tmp6438 = _mm512_fmadd_ps(in977, _mm512_set1_ps(-1.25e+00f), tmp6438);
in969 = _mm512_fmadd_ps(in969, _mm512_set1_ps(-5e+00f), in967);
in977 = _mm512_fmadd_ps(in977, _mm512_set1_ps(-5e+00f), in975);
tmp6433 = _mm512_fmadd_ps(in968, _mm512_set1_ps(-1.25e+00f), tmp6433);
tmp6437 = _mm512_fmadd_ps(in976, _mm512_set1_ps(-1.25e+00f), tmp6437);
in971 = _mm512_fmadd_ps(tmp6433, _mm512_set1_ps(2e+00f), tmp6434);
in979 = _mm512_fmadd_ps(tmp6437, _mm512_set1_ps(2e+00f), tmp6438);
tmp6434 = _mm512_fnmadd_ps(tmp6433, _mm512_set1_ps(2e+00f), tmp6434);
tmp6438 = _mm512_fnmadd_ps(tmp6437, _mm512_set1_ps(2e+00f), tmp6438);
tmp6433 = _mm512_fmadd_ps(in970, _mm512_set1_ps(2.5e-01f), in966);
tmp6437 = _mm512_fmadd_ps(in978, _mm512_set1_ps(2.5e-01f), in974);
in966 = _mm512_sub_ps(in972, in966);
in974 = _mm512_sub_ps(in980, in974);
tmp6433 = _mm512_fmadd_ps(in968, _mm512_set1_ps(-1.25e+00f), tmp6433);
tmp6437 = _mm512_fmadd_ps(in976, _mm512_set1_ps(-1.25e+00f), tmp6437);
in968 = _mm512_sub_ps(in968, in970);
in976 = _mm512_sub_ps(in976, in978);
in968 = _mm512_fmadd_ps(in968, _mm512_set1_ps(5.25e+00f), in966);
in976 = _mm512_fmadd_ps(in976, _mm512_set1_ps(5.25e+00f), in974);
in967 = _mm512_fmadd_ps(tmp6433, _mm512_set1_ps(2e+00f), in969);
in975 = _mm512_fmadd_ps(tmp6437, _mm512_set1_ps(2e+00f), in977);
in969 = _mm512_fnmadd_ps(tmp6433, _mm512_set1_ps(2e+00f), in969);
in977 = _mm512_fnmadd_ps(tmp6437, _mm512_set1_ps(2e+00f), in977);
// Transpose step for group 1, between pass 1 and pass 2.
// Stage 1 (unpacklo/unpackhi): interleave pass-1 results pairwise within each
// 128-bit lane, in the order 0..7 = in965, tmp6435, tmp6436, in971, tmp6434,
// in967, in969, in968 (and the matching in973... set).
__m512 tmp6449 = _mm512_unpacklo_ps(in965, tmp6435);
__m512 tmp6450 = _mm512_unpackhi_ps(in965, tmp6435);
__m512 tmp6451 = _mm512_unpacklo_ps(tmp6436, in971);
__m512 tmp6452 = _mm512_unpackhi_ps(tmp6436, in971);
__m512 tmp6453 = _mm512_unpacklo_ps(tmp6434, in967);
__m512 tmp6454 = _mm512_unpackhi_ps(tmp6434, in967);
__m512 tmp6455 = _mm512_unpacklo_ps(in969, in968);
__m512 tmp6456 = _mm512_unpackhi_ps(in969, in968);
__m512 tmp6457 = _mm512_unpacklo_ps(in973, tmp6439);
__m512 tmp6458 = _mm512_unpackhi_ps(in973, tmp6439);
__m512 tmp6459 = _mm512_unpacklo_ps(tmp6440, in979);
__m512 tmp6460 = _mm512_unpackhi_ps(tmp6440, in979);
__m512 tmp6461 = _mm512_unpacklo_ps(tmp6438, in975);
__m512 tmp6462 = _mm512_unpackhi_ps(tmp6438, in975);
__m512 tmp6463 = _mm512_unpacklo_ps(in977, in976);
__m512 tmp6464 = _mm512_unpackhi_ps(in977, in976);
// Stage 2 (shuffle_ps, imm 68 = low 64 bits of each operand, imm 238 = high
// 64 bits): completes a 4x4 transpose inside every 128-bit lane.
__m512 tmp6465 = _mm512_shuffle_ps(tmp6449, tmp6451, 68);
__m512 tmp6466 = _mm512_shuffle_ps(tmp6449, tmp6451, 238);
__m512 tmp6467 = _mm512_shuffle_ps(tmp6450, tmp6452, 68);
__m512 tmp6468 = _mm512_shuffle_ps(tmp6450, tmp6452, 238);
__m512 tmp6469 = _mm512_shuffle_ps(tmp6453, tmp6455, 68);
__m512 tmp6470 = _mm512_shuffle_ps(tmp6453, tmp6455, 238);
__m512 tmp6471 = _mm512_shuffle_ps(tmp6454, tmp6456, 68);
__m512 tmp6472 = _mm512_shuffle_ps(tmp6454, tmp6456, 238);
__m512 tmp6473 = _mm512_shuffle_ps(tmp6457, tmp6459, 68);
__m512 tmp6474 = _mm512_shuffle_ps(tmp6457, tmp6459, 238);
__m512 tmp6475 = _mm512_shuffle_ps(tmp6458, tmp6460, 68);
__m512 tmp6476 = _mm512_shuffle_ps(tmp6458, tmp6460, 238);
__m512 tmp6477 = _mm512_shuffle_ps(tmp6461, tmp6463, 68);
__m512 tmp6478 = _mm512_shuffle_ps(tmp6461, tmp6463, 238);
__m512 tmp6479 = _mm512_shuffle_ps(tmp6462, tmp6464, 68);
__m512 tmp6480 = _mm512_shuffle_ps(tmp6462, tmp6464, 238);
// Stage 3 (shuffle_f32x4, imm 136 = 128-bit lanes 0 and 2 of each operand,
// imm 221 = lanes 1 and 3): merge results 0-3 with results 4-7.
__m512 tmp6481 = _mm512_shuffle_f32x4(tmp6465, tmp6469, 136);
__m512 tmp6482 = _mm512_shuffle_f32x4(tmp6465, tmp6469, 221);
__m512 tmp6483 = _mm512_shuffle_f32x4(tmp6466, tmp6470, 136);
__m512 tmp6484 = _mm512_shuffle_f32x4(tmp6466, tmp6470, 221);
__m512 tmp6485 = _mm512_shuffle_f32x4(tmp6467, tmp6471, 136);
__m512 tmp6486 = _mm512_shuffle_f32x4(tmp6467, tmp6471, 221);
__m512 tmp6487 = _mm512_shuffle_f32x4(tmp6468, tmp6472, 136);
__m512 tmp6488 = _mm512_shuffle_f32x4(tmp6468, tmp6472, 221);
__m512 tmp6489 = _mm512_shuffle_f32x4(tmp6473, tmp6477, 136);
__m512 tmp6490 = _mm512_shuffle_f32x4(tmp6473, tmp6477, 221);
__m512 tmp6491 = _mm512_shuffle_f32x4(tmp6474, tmp6478, 136);
__m512 tmp6492 = _mm512_shuffle_f32x4(tmp6474, tmp6478, 221);
__m512 tmp6493 = _mm512_shuffle_f32x4(tmp6475, tmp6479, 136);
__m512 tmp6494 = _mm512_shuffle_f32x4(tmp6475, tmp6479, 221);
__m512 tmp6495 = _mm512_shuffle_f32x4(tmp6476, tmp6480, 136);
__m512 tmp6496 = _mm512_shuffle_f32x4(tmp6476, tmp6480, 221);
// Stage 4: combine the two register sets. Each destination holds one element
// position (0..7 within an 8-wide window) across all eight pass-1 results:
// low 256 bits from the first set, high 256 bits from the second set.
// imm 136 picks the first window of each source register, imm 221 the second.
in965 = _mm512_shuffle_f32x4(tmp6481, tmp6489, 136);
in973 = _mm512_shuffle_f32x4(tmp6481, tmp6489, 221);
tmp6435 = _mm512_shuffle_f32x4(tmp6483, tmp6491, 136);
tmp6439 = _mm512_shuffle_f32x4(tmp6483, tmp6491, 221);
tmp6436 = _mm512_shuffle_f32x4(tmp6485, tmp6493, 136);
tmp6440 = _mm512_shuffle_f32x4(tmp6485, tmp6493, 221);
in971 = _mm512_shuffle_f32x4(tmp6487, tmp6495, 136);
in979 = _mm512_shuffle_f32x4(tmp6487, tmp6495, 221);
tmp6434 = _mm512_shuffle_f32x4(tmp6482, tmp6490, 136);
tmp6438 = _mm512_shuffle_f32x4(tmp6482, tmp6490, 221);
in967 = _mm512_shuffle_f32x4(tmp6484, tmp6492, 136);
in975 = _mm512_shuffle_f32x4(tmp6484, tmp6492, 221);
in969 = _mm512_shuffle_f32x4(tmp6486, tmp6494, 136);
in977 = _mm512_shuffle_f32x4(tmp6486, tmp6494, 221);
in968 = _mm512_shuffle_f32x4(tmp6488, tmp6496, 136);
in976 = _mm512_shuffle_f32x4(tmp6488, tmp6496, 221);
// Pass 2 for group 1: the same 8-point linear combination as pass 1, applied
// along the other axis of the transposed data.
// Inputs c0..c7 = in965, tmp6435, tmp6436, in971, tmp6434, in967, in969, in968
// (second set: in973, tmp6439, tmp6440, in979, tmp6438, in975, in977, in976).
// Results 0..7 end up in in965, tmp6443, tmp6444, in969, tmp6442, tmp6436,
// tmp6434, in971 (second set: in973, tmp6447, tmp6448, in977, tmp6446,
// tmp6440, tmp6438, in979).
// Registers are reused in place, so statement order matters.
__m512 tmp6441 = _mm512_add_ps(tmp6435, in967);
__m512 tmp6445 = _mm512_add_ps(tmp6439, in975);
__m512 tmp6442 = _mm512_sub_ps(tmp6434, tmp6436);
__m512 tmp6446 = _mm512_sub_ps(tmp6438, tmp6440);
__m512 tmp6443 = _mm512_add_ps(tmp6436, in969);
__m512 tmp6447 = _mm512_add_ps(tmp6440, in977);
in965 = _mm512_sub_ps(in965, in969);
in973 = _mm512_sub_ps(in973, in977);
tmp6441 = _mm512_fmadd_ps(in971, _mm512_set1_ps(-4.25e+00f), tmp6441);
tmp6445 = _mm512_fmadd_ps(in979, _mm512_set1_ps(-4.25e+00f), tmp6445);
tmp6443 = _mm512_fmadd_ps(tmp6434, _mm512_set1_ps(-4.25e+00f), tmp6443);
tmp6447 = _mm512_fmadd_ps(tmp6438, _mm512_set1_ps(-4.25e+00f), tmp6447);
in965 = _mm512_fmadd_ps(tmp6442, _mm512_set1_ps(5.25e+00f), in965);
in973 = _mm512_fmadd_ps(tmp6446, _mm512_set1_ps(5.25e+00f), in973);
tmp6442 = _mm512_fmadd_ps(tmp6436, _mm512_set1_ps(2.5e-01f), in969);
tmp6446 = _mm512_fmadd_ps(tmp6440, _mm512_set1_ps(2.5e-01f), in977);
tmp6436 = _mm512_fmadd_ps(tmp6436, _mm512_set1_ps(4e+00f), in969);
tmp6440 = _mm512_fmadd_ps(tmp6440, _mm512_set1_ps(4e+00f), in977);
__m512 tmp6444 = _mm512_sub_ps(tmp6443, tmp6441);
__m512 tmp6448 = _mm512_sub_ps(tmp6447, tmp6445);
tmp6443 = _mm512_add_ps(tmp6441, tmp6443);
tmp6447 = _mm512_add_ps(tmp6445, tmp6447);
// tmp6441/tmp6445 are recycled as scratch for the odd-index terms.
tmp6441 = _mm512_fmadd_ps(tmp6435, _mm512_set1_ps(2.5e-01f), in967);
tmp6445 = _mm512_fmadd_ps(tmp6439, _mm512_set1_ps(2.5e-01f), in975);
tmp6442 = _mm512_fmadd_ps(tmp6434, _mm512_set1_ps(-1.25e+00f), tmp6442);
tmp6446 = _mm512_fmadd_ps(tmp6438, _mm512_set1_ps(-1.25e+00f), tmp6446);
tmp6434 = _mm512_fmadd_ps(tmp6434, _mm512_set1_ps(-5e+00f), tmp6436);
tmp6438 = _mm512_fmadd_ps(tmp6438, _mm512_set1_ps(-5e+00f), tmp6440);
tmp6441 = _mm512_fmadd_ps(in971, _mm512_set1_ps(-1.25e+00f), tmp6441);
tmp6445 = _mm512_fmadd_ps(in979, _mm512_set1_ps(-1.25e+00f), tmp6445);
in969 = _mm512_fmadd_ps(tmp6441, _mm512_set1_ps(2e+00f), tmp6442);
in977 = _mm512_fmadd_ps(tmp6445, _mm512_set1_ps(2e+00f), tmp6446);
tmp6442 = _mm512_fnmadd_ps(tmp6441, _mm512_set1_ps(2e+00f), tmp6442);
tmp6446 = _mm512_fnmadd_ps(tmp6445, _mm512_set1_ps(2e+00f), tmp6446);
tmp6441 = _mm512_fmadd_ps(in967, _mm512_set1_ps(2.5e-01f), tmp6435);
tmp6445 = _mm512_fmadd_ps(in975, _mm512_set1_ps(2.5e-01f), tmp6439);
tmp6435 = _mm512_sub_ps(in968, tmp6435);
tmp6439 = _mm512_sub_ps(in976, tmp6439);
tmp6441 = _mm512_fmadd_ps(in971, _mm512_set1_ps(-1.25e+00f), tmp6441);
tmp6445 = _mm512_fmadd_ps(in979, _mm512_set1_ps(-1.25e+00f), tmp6445);
in971 = _mm512_sub_ps(in971, in967);
in979 = _mm512_sub_ps(in979, in975);
in971 = _mm512_fmadd_ps(in971, _mm512_set1_ps(5.25e+00f), tmp6435);
in979 = _mm512_fmadd_ps(in979, _mm512_set1_ps(5.25e+00f), tmp6439);
tmp6436 = _mm512_fmadd_ps(tmp6441, _mm512_set1_ps(2e+00f), tmp6434);
tmp6440 = _mm512_fmadd_ps(tmp6445, _mm512_set1_ps(2e+00f), tmp6438);
tmp6434 = _mm512_fnmadd_ps(tmp6441, _mm512_set1_ps(2e+00f), tmp6434);
tmp6438 = _mm512_fnmadd_ps(tmp6445, _mm512_set1_ps(2e+00f), tmp6438);
// Repack pass-2 results for storing. imm 68 concatenates the low 256 bits of
// both operands and imm 238 their high 256 bits, so every out register holds
// two consecutive results (0|1, 2|3, 4|5, 6|7) for one 8-wide window.
__m512 out915 = _mm512_shuffle_f32x4(in965, tmp6443, 68);
__m512 out923 = _mm512_shuffle_f32x4(in965, tmp6443, 238);
__m512 out916 = _mm512_shuffle_f32x4(tmp6444, in969, 68);
__m512 out924 = _mm512_shuffle_f32x4(tmp6444, in969, 238);
__m512 out917 = _mm512_shuffle_f32x4(tmp6442, tmp6436, 68);
__m512 out925 = _mm512_shuffle_f32x4(tmp6442, tmp6436, 238);
__m512 out918 = _mm512_shuffle_f32x4(tmp6434, in971, 68);
__m512 out926 = _mm512_shuffle_f32x4(tmp6434, in971, 238);
__m512 out919 = _mm512_shuffle_f32x4(in973, tmp6447, 68);
__m512 out927 = _mm512_shuffle_f32x4(in973, tmp6447, 238);
__m512 out920 = _mm512_shuffle_f32x4(tmp6448, in977, 68);
__m512 out928 = _mm512_shuffle_f32x4(tmp6448, in977, 238);
__m512 out921 = _mm512_shuffle_f32x4(tmp6446, tmp6440, 68);
__m512 out929 = _mm512_shuffle_f32x4(tmp6446, tmp6440, 238);
__m512 out922 = _mm512_shuffle_f32x4(tmp6438, in979, 68);
__m512 out930 = _mm512_shuffle_f32x4(tmp6438, in979, 238);
// Stores for group 1 use base offsets 256/320/384/448, directly after the
// 0-192 slots written earlier in this iteration. The statements are not in
// ascending address order: 256, 384, 320, 448. Result pairs 2|3, 4|5 and 6|7
// go to the same slots shifted by 409600, 819200 and 1228800.
_mm512_storeu_ps(dfPtr6+256+1638400*i26+24576*j21+24576*s20+768*k84, out915);
_mm512_storeu_ps(dfPtr6+384+1638400*i26+24576*j21+24576*s20+768*k84, out923);
_mm512_storeu_ps(dfPtr6+320+1638400*i26+24576*j21+24576*s20+768*k84, out919);
_mm512_storeu_ps(dfPtr6+448+1638400*i26+24576*j21+24576*s20+768*k84, out927);
_mm512_storeu_ps(dfPtr6+409856+1638400*i26+24576*j21+24576*s20+768*k84, out916);
_mm512_storeu_ps(dfPtr6+409984+1638400*i26+24576*j21+24576*s20+768*k84, out924);
_mm512_storeu_ps(dfPtr6+409920+1638400*i26+24576*j21+24576*s20+768*k84, out920);
_mm512_storeu_ps(dfPtr6+410048+1638400*i26+24576*j21+24576*s20+768*k84, out928);
_mm512_storeu_ps(dfPtr6+819456+1638400*i26+24576*j21+24576*s20+768*k84, out917);
_mm512_storeu_ps(dfPtr6+819584+1638400*i26+24576*j21+24576*s20+768*k84, out925);
_mm512_storeu_ps(dfPtr6+819520+1638400*i26+24576*j21+24576*s20+768*k84, out921);
_mm512_storeu_ps(dfPtr6+819648+1638400*i26+24576*j21+24576*s20+768*k84, out929);
_mm512_storeu_ps(dfPtr6+1229056+1638400*i26+24576*j21+24576*s20+768*k84, out918);
_mm512_storeu_ps(dfPtr6+1229184+1638400*i26+24576*j21+24576*s20+768*k84, out926);
_mm512_storeu_ps(dfPtr6+1229120+1638400*i26+24576*j21+24576*s20+768*k84, out922);
_mm512_storeu_ps(dfPtr6+1229248+1638400*i26+24576*j21+24576*s20+768*k84, out930);
// Load eight input rows for this part of the tile. Each row is read with two
// zero-masking loads: the second address is 48 past the first, and successive
// rows are 224 apart (offsets exactly as written; the pointee type of datPtr12
// is declared outside this excerpt).
// Mask 16383 (0x3FFF) keeps lanes 0..13 and mask 511 (0x1FF) keeps lanes 0..8;
// every other lane is zeroed by the maskz load, so nothing is read for them.
__m512 dat1509 = _mm512_maskz_loadu_ps(16383, datPtr12+12656+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1510 = _mm512_maskz_loadu_ps(511, datPtr12+12704+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
// pm133 (arguments are listed from lane 15 down to lane 0):
// lanes 0..7 <- source lanes 0..7, lanes 8..15 <- source lanes 6..13,
// i.e. the upper half overlaps the lower half by two source lanes.
__m512i pm133 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in981 = _mm512_permutexvar_ps(pm133, dat1509);
// pm134: lanes 0..7 <- source lanes 0..7, lanes 8..10 <- source lanes 6..8,
// lanes 11..15 <- source lane 15, which the 511 mask has already zeroed.
__m512i pm134 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in989 = _mm512_permutexvar_ps(pm134, dat1510);
// The remaining seven rows repeat the same load/permute pair, reusing pm133
// for the first load of each row and pm134 for the second.
__m512 dat1511 = _mm512_maskz_loadu_ps(16383, datPtr12+12880+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1512 = _mm512_maskz_loadu_ps(511, datPtr12+12928+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in982 = _mm512_permutexvar_ps(pm133, dat1511);
__m512 in990 = _mm512_permutexvar_ps(pm134, dat1512);
__m512 dat1513 = _mm512_maskz_loadu_ps(16383, datPtr12+13104+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1514 = _mm512_maskz_loadu_ps(511, datPtr12+13152+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in983 = _mm512_permutexvar_ps(pm133, dat1513);
__m512 in991 = _mm512_permutexvar_ps(pm134, dat1514);
__m512 dat1515 = _mm512_maskz_loadu_ps(16383, datPtr12+13328+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1516 = _mm512_maskz_loadu_ps(511, datPtr12+13376+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in984 = _mm512_permutexvar_ps(pm133, dat1515);
__m512 in992 = _mm512_permutexvar_ps(pm134, dat1516);
__m512 dat1517 = _mm512_maskz_loadu_ps(16383, datPtr12+13552+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1518 = _mm512_maskz_loadu_ps(511, datPtr12+13600+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in985 = _mm512_permutexvar_ps(pm133, dat1517);
__m512 in993 = _mm512_permutexvar_ps(pm134, dat1518);
__m512 dat1519 = _mm512_maskz_loadu_ps(16383, datPtr12+13776+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1520 = _mm512_maskz_loadu_ps(511, datPtr12+13824+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in986 = _mm512_permutexvar_ps(pm133, dat1519);
__m512 in994 = _mm512_permutexvar_ps(pm134, dat1520);
__m512 dat1521 = _mm512_maskz_loadu_ps(16383, datPtr12+14000+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1522 = _mm512_maskz_loadu_ps(511, datPtr12+14048+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in987 = _mm512_permutexvar_ps(pm133, dat1521);
__m512 in995 = _mm512_permutexvar_ps(pm134, dat1522);
__m512 dat1523 = _mm512_maskz_loadu_ps(16383, datPtr12+14224+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 dat1524 = _mm512_maskz_loadu_ps(511, datPtr12+14272+806912*i26+224*h33+4*w40+806912*s20+25216*k84);
__m512 in988 = _mm512_permutexvar_ps(pm133, dat1523);
__m512 in996 = _mm512_permutexvar_ps(pm134, dat1524);
// First arithmetic pass: combines the eight row registers lane-by-lane with
// fixed coefficients. Two independent register sets are processed in lockstep,
// one statement each: in981..in988 (with tmp6497..tmp6500) and in989..in996
// (with tmp6501..tmp6504).
// _mm512_fmadd_ps(a, b, c) yields a*b + c; _mm512_fnmadd_ps(a, b, c) yields
// -(a*b) + c.
// Registers are overwritten in place, so statement order matters.
// NOTE(review): the coefficient set (5.25, -4.25, 0.25, -1.25, 4, -5, 2)
// looks like an 8-point Winograd-style input transform -- confirm against the
// generator before relying on that reading.
__m512 tmp6497 = _mm512_add_ps(in982, in986);
__m512 tmp6501 = _mm512_add_ps(in990, in994);
__m512 tmp6498 = _mm512_sub_ps(in985, in983);
__m512 tmp6502 = _mm512_sub_ps(in993, in991);
__m512 tmp6499 = _mm512_add_ps(in983, in987);
__m512 tmp6503 = _mm512_add_ps(in991, in995);
in981 = _mm512_sub_ps(in981, in987);
in989 = _mm512_sub_ps(in989, in995);
tmp6497 = _mm512_fmadd_ps(in984, _mm512_set1_ps(-4.25e+00f), tmp6497);
tmp6501 = _mm512_fmadd_ps(in992, _mm512_set1_ps(-4.25e+00f), tmp6501);
tmp6499 = _mm512_fmadd_ps(in985, _mm512_set1_ps(-4.25e+00f), tmp6499);
tmp6503 = _mm512_fmadd_ps(in993, _mm512_set1_ps(-4.25e+00f), tmp6503);
// in981 is now (in981 - in987) + 5.25*(in985 - in983), using the original rows.
in981 = _mm512_fmadd_ps(tmp6498, _mm512_set1_ps(5.25e+00f), in981);
in989 = _mm512_fmadd_ps(tmp6502, _mm512_set1_ps(5.25e+00f), in989);
tmp6498 = _mm512_fmadd_ps(in983, _mm512_set1_ps(2.5e-01f), in987);
tmp6502 = _mm512_fmadd_ps(in991, _mm512_set1_ps(2.5e-01f), in995);
in983 = _mm512_fmadd_ps(in983, _mm512_set1_ps(4e+00f), in987);
in991 = _mm512_fmadd_ps(in991, _mm512_set1_ps(4e+00f), in995);
// Difference and sum of the two partial results computed above.
__m512 tmp6500 = _mm512_sub_ps(tmp6499, tmp6497);
__m512 tmp6504 = _mm512_sub_ps(tmp6503, tmp6501);
tmp6499 = _mm512_add_ps(tmp6497, tmp6499);
tmp6503 = _mm512_add_ps(tmp6501, tmp6503);
tmp6497 = _mm512_fmadd_ps(in982, _mm512_set1_ps(2.5e-01f), in986);
tmp6501 = _mm512_fmadd_ps(in990, _mm512_set1_ps(2.5e-01f), in994);
tmp6498 = _mm512_fmadd_ps(in985, _mm512_set1_ps(-1.25e+00f), tmp6498);
tmp6502 = _mm512_fmadd_ps(in993, _mm512_set1_ps(-1.25e+00f), tmp6502);
in985 = _mm512_fmadd_ps(in985, _mm512_set1_ps(-5e+00f), in983);
in993 = _mm512_fmadd_ps(in993, _mm512_set1_ps(-5e+00f), in991);
tmp6497 = _mm512_fmadd_ps(in984, _mm512_set1_ps(-1.25e+00f), tmp6497);
tmp6501 = _mm512_fmadd_ps(in992, _mm512_set1_ps(-1.25e+00f), tmp6501);
// Sum/difference pair built from the same two operands (+2x and -2x).
in987 = _mm512_fmadd_ps(tmp6497, _mm512_set1_ps(2e+00f), tmp6498);
in995 = _mm512_fmadd_ps(tmp6501, _mm512_set1_ps(2e+00f), tmp6502);
tmp6498 = _mm512_fnmadd_ps(tmp6497, _mm512_set1_ps(2e+00f), tmp6498);
tmp6502 = _mm512_fnmadd_ps(tmp6501, _mm512_set1_ps(2e+00f), tmp6502);
tmp6497 = _mm512_fmadd_ps(in986, _mm512_set1_ps(2.5e-01f), in982);
tmp6501 = _mm512_fmadd_ps(in994, _mm512_set1_ps(2.5e-01f), in990);
in982 = _mm512_sub_ps(in988, in982);
in990 = _mm512_sub_ps(in996, in990);
tmp6497 = _mm512_fmadd_ps(in984, _mm512_set1_ps(-1.25e+00f), tmp6497);
tmp6501 = _mm512_fmadd_ps(in992, _mm512_set1_ps(-1.25e+00f), tmp6501);
in984 = _mm512_sub_ps(in984, in986);
in992 = _mm512_sub_ps(in992, in994);
// in984 is now 5.25*(in984 - in986) + (in988 - in982), using the original rows.
in984 = _mm512_fmadd_ps(in984, _mm512_set1_ps(5.25e+00f), in982);
in992 = _mm512_fmadd_ps(in992, _mm512_set1_ps(5.25e+00f), in990);
// Second sum/difference pair (+2x and -2x) on the remaining operands.
in983 = _mm512_fmadd_ps(tmp6497, _mm512_set1_ps(2e+00f), in985);
in991 = _mm512_fmadd_ps(tmp6501, _mm512_set1_ps(2e+00f), in993);
in985 = _mm512_fnmadd_ps(tmp6497, _mm512_set1_ps(2e+00f), in985);
in993 = _mm512_fnmadd_ps(tmp6501, _mm512_set1_ps(2e+00f), in993);
// Rearrangement stage between the two arithmetic passes. Three shuffle levels
// regroup elements across the sixteen registers; this appears to act as a
// transpose so that the second pass works along the other tile axis
// (NOTE(review): confirm against the generator).
//
// Level 1: interleave 32-bit elements of register pairs within each 128-bit
// lane (unpacklo takes the low two elements of each operand, unpackhi the
// high two).
__m512 tmp6513 = _mm512_unpacklo_ps(in981, tmp6499);
__m512 tmp6514 = _mm512_unpackhi_ps(in981, tmp6499);
__m512 tmp6515 = _mm512_unpacklo_ps(tmp6500, in987);
__m512 tmp6516 = _mm512_unpackhi_ps(tmp6500, in987);
__m512 tmp6517 = _mm512_unpacklo_ps(tmp6498, in983);
__m512 tmp6518 = _mm512_unpackhi_ps(tmp6498, in983);
__m512 tmp6519 = _mm512_unpacklo_ps(in985, in984);
__m512 tmp6520 = _mm512_unpackhi_ps(in985, in984);
__m512 tmp6521 = _mm512_unpacklo_ps(in989, tmp6503);
__m512 tmp6522 = _mm512_unpackhi_ps(in989, tmp6503);
__m512 tmp6523 = _mm512_unpacklo_ps(tmp6504, in995);
__m512 tmp6524 = _mm512_unpackhi_ps(tmp6504, in995);
__m512 tmp6525 = _mm512_unpacklo_ps(tmp6502, in991);
__m512 tmp6526 = _mm512_unpackhi_ps(tmp6502, in991);
__m512 tmp6527 = _mm512_unpacklo_ps(in993, in992);
__m512 tmp6528 = _mm512_unpackhi_ps(in993, in992);
// Level 2: within each 128-bit lane, immediate 68 (0x44) takes the low 64 bits
// of the first operand followed by the low 64 bits of the second; immediate
// 238 (0xEE) takes the high 64 bits of each.
__m512 tmp6529 = _mm512_shuffle_ps(tmp6513, tmp6515, 68);
__m512 tmp6530 = _mm512_shuffle_ps(tmp6513, tmp6515, 238);
__m512 tmp6531 = _mm512_shuffle_ps(tmp6514, tmp6516, 68);
__m512 tmp6532 = _mm512_shuffle_ps(tmp6514, tmp6516, 238);
__m512 tmp6533 = _mm512_shuffle_ps(tmp6517, tmp6519, 68);
__m512 tmp6534 = _mm512_shuffle_ps(tmp6517, tmp6519, 238);
__m512 tmp6535 = _mm512_shuffle_ps(tmp6518, tmp6520, 68);
__m512 tmp6536 = _mm512_shuffle_ps(tmp6518, tmp6520, 238);
__m512 tmp6537 = _mm512_shuffle_ps(tmp6521, tmp6523, 68);
__m512 tmp6538 = _mm512_shuffle_ps(tmp6521, tmp6523, 238);
__m512 tmp6539 = _mm512_shuffle_ps(tmp6522, tmp6524, 68);
__m512 tmp6540 = _mm512_shuffle_ps(tmp6522, tmp6524, 238);
__m512 tmp6541 = _mm512_shuffle_ps(tmp6525, tmp6527, 68);
__m512 tmp6542 = _mm512_shuffle_ps(tmp6525, tmp6527, 238);
__m512 tmp6543 = _mm512_shuffle_ps(tmp6526, tmp6528, 68);
__m512 tmp6544 = _mm512_shuffle_ps(tmp6526, tmp6528, 238);
// Level 3a: move whole 128-bit blocks. Immediate 136 (0x88) picks blocks 0 and
// 2 of the first operand, then blocks 0 and 2 of the second; immediate 221
// (0xDD) picks blocks 1 and 3 of each.
__m512 tmp6545 = _mm512_shuffle_f32x4(tmp6529, tmp6533, 136);
__m512 tmp6546 = _mm512_shuffle_f32x4(tmp6529, tmp6533, 221);
__m512 tmp6547 = _mm512_shuffle_f32x4(tmp6530, tmp6534, 136);
__m512 tmp6548 = _mm512_shuffle_f32x4(tmp6530, tmp6534, 221);
__m512 tmp6549 = _mm512_shuffle_f32x4(tmp6531, tmp6535, 136);
__m512 tmp6550 = _mm512_shuffle_f32x4(tmp6531, tmp6535, 221);
__m512 tmp6551 = _mm512_shuffle_f32x4(tmp6532, tmp6536, 136);
__m512 tmp6552 = _mm512_shuffle_f32x4(tmp6532, tmp6536, 221);
__m512 tmp6553 = _mm512_shuffle_f32x4(tmp6537, tmp6541, 136);
__m512 tmp6554 = _mm512_shuffle_f32x4(tmp6537, tmp6541, 221);
__m512 tmp6555 = _mm512_shuffle_f32x4(tmp6538, tmp6542, 136);
__m512 tmp6556 = _mm512_shuffle_f32x4(tmp6538, tmp6542, 221);
__m512 tmp6557 = _mm512_shuffle_f32x4(tmp6539, tmp6543, 136);
__m512 tmp6558 = _mm512_shuffle_f32x4(tmp6539, tmp6543, 221);
__m512 tmp6559 = _mm512_shuffle_f32x4(tmp6540, tmp6544, 136);
__m512 tmp6560 = _mm512_shuffle_f32x4(tmp6540, tmp6544, 221);
// Level 3b: a second block shuffle with the same immediates merges the two
// register sets; the results are written back into the in*/tmp* names that
// the second pass reads.
in981 = _mm512_shuffle_f32x4(tmp6545, tmp6553, 136);
in989 = _mm512_shuffle_f32x4(tmp6545, tmp6553, 221);
tmp6499 = _mm512_shuffle_f32x4(tmp6547, tmp6555, 136);
tmp6503 = _mm512_shuffle_f32x4(tmp6547, tmp6555, 221);
tmp6500 = _mm512_shuffle_f32x4(tmp6549, tmp6557, 136);
tmp6504 = _mm512_shuffle_f32x4(tmp6549, tmp6557, 221);
in987 = _mm512_shuffle_f32x4(tmp6551, tmp6559, 136);
in995 = _mm512_shuffle_f32x4(tmp6551, tmp6559, 221);
tmp6498 = _mm512_shuffle_f32x4(tmp6546, tmp6554, 136);
tmp6502 = _mm512_shuffle_f32x4(tmp6546, tmp6554, 221);
in983 = _mm512_shuffle_f32x4(tmp6548, tmp6556, 136);
in991 = _mm512_shuffle_f32x4(tmp6548, tmp6556, 221);
in985 = _mm512_shuffle_f32x4(tmp6550, tmp6558, 136);
in993 = _mm512_shuffle_f32x4(tmp6550, tmp6558, 221);
in984 = _mm512_shuffle_f32x4(tmp6552, tmp6560, 136);
in992 = _mm512_shuffle_f32x4(tmp6552, tmp6560, 221);
// Second arithmetic pass: the same coefficient pattern (5.25, -4.25, 0.25,
// -1.25, 4, -5, 2) as the pass before the shuffles, now applied to the
// rearranged registers. Two register sets are again processed in lockstep,
// using tmp6505..tmp6508 for the first set and tmp6509..tmp6512 for the second.
// Registers are overwritten in place, so statement order matters.
__m512 tmp6505 = _mm512_add_ps(tmp6499, in983);
__m512 tmp6509 = _mm512_add_ps(tmp6503, in991);
__m512 tmp6506 = _mm512_sub_ps(tmp6498, tmp6500);
__m512 tmp6510 = _mm512_sub_ps(tmp6502, tmp6504);
__m512 tmp6507 = _mm512_add_ps(tmp6500, in985);
__m512 tmp6511 = _mm512_add_ps(tmp6504, in993);
in981 = _mm512_sub_ps(in981, in985);
in989 = _mm512_sub_ps(in989, in993);
tmp6505 = _mm512_fmadd_ps(in987, _mm512_set1_ps(-4.25e+00f), tmp6505);
tmp6509 = _mm512_fmadd_ps(in995, _mm512_set1_ps(-4.25e+00f), tmp6509);
tmp6507 = _mm512_fmadd_ps(tmp6498, _mm512_set1_ps(-4.25e+00f), tmp6507);
tmp6511 = _mm512_fmadd_ps(tmp6502, _mm512_set1_ps(-4.25e+00f), tmp6511);
// in981 is now (in981 - in985) + 5.25*(tmp6498 - tmp6500), using pre-pass values.
in981 = _mm512_fmadd_ps(tmp6506, _mm512_set1_ps(5.25e+00f), in981);
in989 = _mm512_fmadd_ps(tmp6510, _mm512_set1_ps(5.25e+00f), in989);
tmp6506 = _mm512_fmadd_ps(tmp6500, _mm512_set1_ps(2.5e-01f), in985);
tmp6510 = _mm512_fmadd_ps(tmp6504, _mm512_set1_ps(2.5e-01f), in993);
tmp6500 = _mm512_fmadd_ps(tmp6500, _mm512_set1_ps(4e+00f), in985);
tmp6504 = _mm512_fmadd_ps(tmp6504, _mm512_set1_ps(4e+00f), in993);
// Difference and sum of the two partial results computed above.
__m512 tmp6508 = _mm512_sub_ps(tmp6507, tmp6505);
__m512 tmp6512 = _mm512_sub_ps(tmp6511, tmp6509);
tmp6507 = _mm512_add_ps(tmp6505, tmp6507);
tmp6511 = _mm512_add_ps(tmp6509, tmp6511);
tmp6505 = _mm512_fmadd_ps(tmp6499, _mm512_set1_ps(2.5e-01f), in983);
tmp6509 = _mm512_fmadd_ps(tmp6503, _mm512_set1_ps(2.5e-01f), in991);
tmp6506 = _mm512_fmadd_ps(tmp6498, _mm512_set1_ps(-1.25e+00f), tmp6506);
tmp6510 = _mm512_fmadd_ps(tmp6502, _mm512_set1_ps(-1.25e+00f), tmp6510);
tmp6498 = _mm512_fmadd_ps(tmp6498, _mm512_set1_ps(-5e+00f), tmp6500);
tmp6502 = _mm512_fmadd_ps(tmp6502, _mm512_set1_ps(-5e+00f), tmp6504);
tmp6505 = _mm512_fmadd_ps(in987, _mm512_set1_ps(-1.25e+00f), tmp6505);
tmp6509 = _mm512_fmadd_ps(in995, _mm512_set1_ps(-1.25e+00f), tmp6509);
// Sum/difference pair built from the same two operands (+2x and -2x).
in985 = _mm512_fmadd_ps(tmp6505, _mm512_set1_ps(2e+00f), tmp6506);
in993 = _mm512_fmadd_ps(tmp6509, _mm512_set1_ps(2e+00f), tmp6510);
tmp6506 = _mm512_fnmadd_ps(tmp6505, _mm512_set1_ps(2e+00f), tmp6506);
tmp6510 = _mm512_fnmadd_ps(tmp6509, _mm512_set1_ps(2e+00f), tmp6510);
tmp6505 = _mm512_fmadd_ps(in983, _mm512_set1_ps(2.5e-01f), tmp6499);
tmp6509 = _mm512_fmadd_ps(in991, _mm512_set1_ps(2.5e-01f), tmp6503);
tmp6499 = _mm512_sub_ps(in984, tmp6499);
tmp6503 = _mm512_sub_ps(in992, tmp6503);
tmp6505 = _mm512_fmadd_ps(in987, _mm512_set1_ps(-1.25e+00f), tmp6505);
tmp6509 = _mm512_fmadd_ps(in995, _mm512_set1_ps(-1.25e+00f), tmp6509);
in987 = _mm512_sub_ps(in987, in983);
in995 = _mm512_sub_ps(in995, in991);
// in987 is now 5.25*(in987 - in983) + (in984 - tmp6499), using pre-pass values.
in987 = _mm512_fmadd_ps(in987, _mm512_set1_ps(5.25e+00f), tmp6499);
in995 = _mm512_fmadd_ps(in995, _mm512_set1_ps(5.25e+00f), tmp6503);
// Second sum/difference pair (+2x and -2x) on the remaining operands.
tmp6500 = _mm512_fmadd_ps(tmp6505, _mm512_set1_ps(2e+00f), tmp6498);
tmp6504 = _mm512_fmadd_ps(tmp6509, _mm512_set1_ps(2e+00f), tmp6502);
tmp6498 = _mm512_fnmadd_ps(tmp6505, _mm512_set1_ps(2e+00f), tmp6498);
tmp6502 = _mm512_fnmadd_ps(tmp6509, _mm512_set1_ps(2e+00f), tmp6502);
// Pair up the sixteen result registers for output. With immediate 68 (0x44)
// _mm512_shuffle_f32x4 concatenates the lower 256 bits of the first operand
// with the lower 256 bits of the second; with 238 (0xEE) it does the same for
// the upper 256 bits.
__m512 out931 = _mm512_shuffle_f32x4(in981, tmp6507, 68);
__m512 out939 = _mm512_shuffle_f32x4(in981, tmp6507, 238);
__m512 out932 = _mm512_shuffle_f32x4(tmp6508, in985, 68);
__m512 out940 = _mm512_shuffle_f32x4(tmp6508, in985, 238);
__m512 out933 = _mm512_shuffle_f32x4(tmp6506, tmp6500, 68);
__m512 out941 = _mm512_shuffle_f32x4(tmp6506, tmp6500, 238);
__m512 out934 = _mm512_shuffle_f32x4(tmp6498, in987, 68);
__m512 out942 = _mm512_shuffle_f32x4(tmp6498, in987, 238);
__m512 out935 = _mm512_shuffle_f32x4(in989, tmp6511, 68);
__m512 out943 = _mm512_shuffle_f32x4(in989, tmp6511, 238);
__m512 out936 = _mm512_shuffle_f32x4(tmp6512, in993, 68);
__m512 out944 = _mm512_shuffle_f32x4(tmp6512, in993, 238);
__m512 out937 = _mm512_shuffle_f32x4(tmp6510, tmp6504, 68);
__m512 out945 = _mm512_shuffle_f32x4(tmp6510, tmp6504, 238);
__m512 out938 = _mm512_shuffle_f32x4(tmp6502, in995, 68);
__m512 out946 = _mm512_shuffle_f32x4(tmp6502, in995, 238);
// Sixteen unaligned 512-bit stores into dfPtr6, in four groups of four.
// Group bases are 512, 410112, 819712 and 1229312, i.e. 409600 apart; inside
// a group the constant offsets are base+0, +128, +64, +192 in program order.
// The shared index term is 1638400*i26 + 24576*j21 + 24576*s20 + 768*k84.
// (Offsets are as written; the pointee type of dfPtr6 is declared outside
// this excerpt.)
_mm512_storeu_ps(dfPtr6+512+1638400*i26+24576*j21+24576*s20+768*k84, out931);
_mm512_storeu_ps(dfPtr6+640+1638400*i26+24576*j21+24576*s20+768*k84, out939);
_mm512_storeu_ps(dfPtr6+576+1638400*i26+24576*j21+24576*s20+768*k84, out935);
_mm512_storeu_ps(dfPtr6+704+1638400*i26+24576*j21+24576*s20+768*k84, out943);
_mm512_storeu_ps(dfPtr6+410112+1638400*i26+24576*j21+24576*s20+768*k84, out932);
_mm512_storeu_ps(dfPtr6+410240+1638400*i26+24576*j21+24576*s20+768*k84, out940);
_mm512_storeu_ps(dfPtr6+410176+1638400*i26+24576*j21+24576*s20+768*k84, out936);
_mm512_storeu_ps(dfPtr6+410304+1638400*i26+24576*j21+24576*s20+768*k84, out944);
_mm512_storeu_ps(dfPtr6+819712+1638400*i26+24576*j21+24576*s20+768*k84, out933);
_mm512_storeu_ps(dfPtr6+819840+1638400*i26+24576*j21+24576*s20+768*k84, out941);
_mm512_storeu_ps(dfPtr6+819776+1638400*i26+24576*j21+24576*s20+768*k84, out937);
_mm512_storeu_ps(dfPtr6+819904+1638400*i26+24576*j21+24576*s20+768*k84, out945);
_mm512_storeu_ps(dfPtr6+1229312+1638400*i26+24576*j21+24576*s20+768*k84, out934);
_mm512_storeu_ps(dfPtr6+1229440+1638400*i26+24576*j21+24576*s20+768*k84, out942);
_mm512_storeu_ps(dfPtr6+1229376+1638400*i26+24576*j21+24576*s20+768*k84, out938);
_mm512_storeu_ps(dfPtr6+1229504+1638400*i26+24576*j21+24576*s20+768*k84, out946);
}
// Leave the function once j21 has reached last5 (last5 is set outside this
// excerpt).
if (j21 >= last5) return;
// Otherwise advance j21 and leave the enclosing construct when it reaches 15.
++j21;
if (j21 >= 15) break;
// rel14 is set to 3 before the enclosing block closes; the rel14 tests that
// follow select which section runs next.
rel14 = 3;
}
// Section taken when rel14 < 4: it starts at h34 = base14+12 with w41 = 0, and
// k85 is the counter of the loop that follows.
if (rel14 < 4) {
ptrdiff_t h34 = base14+12;
ptrdiff_t w41 = 0;
ptrdiff_t k85 = 0;
for (; k85 != 32; ++k85) {
// Load eight input rows for the first part of this tile. Each row is read with
// two zero-masking loads: the second address is 44 past the first (4 vs 48),
// and successive rows are 224 apart (offsets exactly as written; the pointee
// type of datPtr12 is declared outside this excerpt).
// Mask 8191 (0x1FFF) keeps lanes 0..12 and mask 16383 (0x3FFF) keeps lanes
// 0..13; every other lane is zeroed by the maskz load.
__m512 dat1525 = _mm512_maskz_loadu_ps(8191, datPtr12+4+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1526 = _mm512_maskz_loadu_ps(16383, datPtr12+48+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
// pm135 (arguments are listed from lane 15 down to lane 0):
// lane 0 <- source lane 15, which the 8191 mask has zeroed;
// lanes 1..7 <- source lanes 0..6; lanes 8..15 <- source lanes 5..12.
// NOTE(review): the zero in lane 0 presumably acts as left-edge padding --
// confirm against the generator.
__m512i pm135 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in997 = _mm512_permutexvar_ps(pm135, dat1525);
// pm136: lanes 0..7 <- source lanes 0..7, lanes 8..15 <- source lanes 6..13.
__m512i pm136 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1005 = _mm512_permutexvar_ps(pm136, dat1526);
// The remaining seven rows repeat the same load/permute pair, reusing pm135
// for the first load of each row and pm136 for the second.
__m512 dat1527 = _mm512_maskz_loadu_ps(8191, datPtr12+228+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1528 = _mm512_maskz_loadu_ps(16383, datPtr12+272+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in998 = _mm512_permutexvar_ps(pm135, dat1527);
__m512 in1006 = _mm512_permutexvar_ps(pm136, dat1528);
__m512 dat1529 = _mm512_maskz_loadu_ps(8191, datPtr12+452+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1530 = _mm512_maskz_loadu_ps(16383, datPtr12+496+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in999 = _mm512_permutexvar_ps(pm135, dat1529);
__m512 in1007 = _mm512_permutexvar_ps(pm136, dat1530);
__m512 dat1531 = _mm512_maskz_loadu_ps(8191, datPtr12+676+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1532 = _mm512_maskz_loadu_ps(16383, datPtr12+720+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1000 = _mm512_permutexvar_ps(pm135, dat1531);
__m512 in1008 = _mm512_permutexvar_ps(pm136, dat1532);
__m512 dat1533 = _mm512_maskz_loadu_ps(8191, datPtr12+900+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1534 = _mm512_maskz_loadu_ps(16383, datPtr12+944+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1001 = _mm512_permutexvar_ps(pm135, dat1533);
__m512 in1009 = _mm512_permutexvar_ps(pm136, dat1534);
__m512 dat1535 = _mm512_maskz_loadu_ps(8191, datPtr12+1124+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1536 = _mm512_maskz_loadu_ps(16383, datPtr12+1168+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1002 = _mm512_permutexvar_ps(pm135, dat1535);
__m512 in1010 = _mm512_permutexvar_ps(pm136, dat1536);
__m512 dat1537 = _mm512_maskz_loadu_ps(8191, datPtr12+1348+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1538 = _mm512_maskz_loadu_ps(16383, datPtr12+1392+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1003 = _mm512_permutexvar_ps(pm135, dat1537);
__m512 in1011 = _mm512_permutexvar_ps(pm136, dat1538);
__m512 dat1539 = _mm512_maskz_loadu_ps(8191, datPtr12+1572+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1540 = _mm512_maskz_loadu_ps(16383, datPtr12+1616+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1004 = _mm512_permutexvar_ps(pm135, dat1539);
__m512 in1012 = _mm512_permutexvar_ps(pm136, dat1540);
__m512 tmp6561 = _mm512_add_ps(in998, in1002);
__m512 tmp6565 = _mm512_add_ps(in1006, in1010);
__m512 tmp6562 = _mm512_sub_ps(in1001, in999);
__m512 tmp6566 = _mm512_sub_ps(in1009, in1007);
__m512 tmp6563 = _mm512_add_ps(in999, in1003);
__m512 tmp6567 = _mm512_add_ps(in1007, in1011);
in997 = _mm512_sub_ps(in997, in1003);
in1005 = _mm512_sub_ps(in1005, in1011);
tmp6561 = _mm512_fmadd_ps(in1000, _mm512_set1_ps(-4.25e+00f), tmp6561);
tmp6565 = _mm512_fmadd_ps(in1008, _mm512_set1_ps(-4.25e+00f), tmp6565);
tmp6563 = _mm512_fmadd_ps(in1001, _mm512_set1_ps(-4.25e+00f), tmp6563);
tmp6567 = _mm512_fmadd_ps(in1009, _mm512_set1_ps(-4.25e+00f), tmp6567);
in997 = _mm512_fmadd_ps(tmp6562, _mm512_set1_ps(5.25e+00f), in997);
in1005 = _mm512_fmadd_ps(tmp6566, _mm512_set1_ps(5.25e+00f), in1005);
tmp6562 = _mm512_fmadd_ps(in999, _mm512_set1_ps(2.5e-01f), in1003);
tmp6566 = _mm512_fmadd_ps(in1007, _mm512_set1_ps(2.5e-01f), in1011);
in999 = _mm512_fmadd_ps(in999, _mm512_set1_ps(4e+00f), in1003);
in1007 = _mm512_fmadd_ps(in1007, _mm512_set1_ps(4e+00f), in1011);
__m512 tmp6564 = _mm512_sub_ps(tmp6563, tmp6561);
__m512 tmp6568 = _mm512_sub_ps(tmp6567, tmp6565);
tmp6563 = _mm512_add_ps(tmp6561, tmp6563);
tmp6567 = _mm512_add_ps(tmp6565, tmp6567);
tmp6561 = _mm512_fmadd_ps(in998, _mm512_set1_ps(2.5e-01f), in1002);
tmp6565 = _mm512_fmadd_ps(in1006, _mm512_set1_ps(2.5e-01f), in1010);
tmp6562 = _mm512_fmadd_ps(in1001, _mm512_set1_ps(-1.25e+00f), tmp6562);
tmp6566 = _mm512_fmadd_ps(in1009, _mm512_set1_ps(-1.25e+00f), tmp6566);
in1001 = _mm512_fmadd_ps(in1001, _mm512_set1_ps(-5e+00f), in999);
in1009 = _mm512_fmadd_ps(in1009, _mm512_set1_ps(-5e+00f), in1007);
tmp6561 = _mm512_fmadd_ps(in1000, _mm512_set1_ps(-1.25e+00f), tmp6561);
tmp6565 = _mm512_fmadd_ps(in1008, _mm512_set1_ps(-1.25e+00f), tmp6565);
in1003 = _mm512_fmadd_ps(tmp6561, _mm512_set1_ps(2e+00f), tmp6562);
in1011 = _mm512_fmadd_ps(tmp6565, _mm512_set1_ps(2e+00f), tmp6566);
tmp6562 = _mm512_fnmadd_ps(tmp6561, _mm512_set1_ps(2e+00f), tmp6562);
tmp6566 = _mm512_fnmadd_ps(tmp6565, _mm512_set1_ps(2e+00f), tmp6566);
tmp6561 = _mm512_fmadd_ps(in1002, _mm512_set1_ps(2.5e-01f), in998);
tmp6565 = _mm512_fmadd_ps(in1010, _mm512_set1_ps(2.5e-01f), in1006);
in998 = _mm512_sub_ps(in1004, in998);
in1006 = _mm512_sub_ps(in1012, in1006);
tmp6561 = _mm512_fmadd_ps(in1000, _mm512_set1_ps(-1.25e+00f), tmp6561);
tmp6565 = _mm512_fmadd_ps(in1008, _mm512_set1_ps(-1.25e+00f), tmp6565);
in1000 = _mm512_sub_ps(in1000, in1002);
in1008 = _mm512_sub_ps(in1008, in1010);
in1000 = _mm512_fmadd_ps(in1000, _mm512_set1_ps(5.25e+00f), in998);
in1008 = _mm512_fmadd_ps(in1008, _mm512_set1_ps(5.25e+00f), in1006);
in999 = _mm512_fmadd_ps(tmp6561, _mm512_set1_ps(2e+00f), in1001);
in1007 = _mm512_fmadd_ps(tmp6565, _mm512_set1_ps(2e+00f), in1009);
in1001 = _mm512_fnmadd_ps(tmp6561, _mm512_set1_ps(2e+00f), in1001);
in1009 = _mm512_fnmadd_ps(tmp6565, _mm512_set1_ps(2e+00f), in1009);
__m512 tmp6577 = _mm512_unpacklo_ps(in997, tmp6563);
__m512 tmp6578 = _mm512_unpackhi_ps(in997, tmp6563);
__m512 tmp6579 = _mm512_unpacklo_ps(tmp6564, in1003);
__m512 tmp6580 = _mm512_unpackhi_ps(tmp6564, in1003);
__m512 tmp6581 = _mm512_unpacklo_ps(tmp6562, in999);
__m512 tmp6582 = _mm512_unpackhi_ps(tmp6562, in999);
__m512 tmp6583 = _mm512_unpacklo_ps(in1001, in1000);
__m512 tmp6584 = _mm512_unpackhi_ps(in1001, in1000);
__m512 tmp6585 = _mm512_unpacklo_ps(in1005, tmp6567);
__m512 tmp6586 = _mm512_unpackhi_ps(in1005, tmp6567);
__m512 tmp6587 = _mm512_unpacklo_ps(tmp6568, in1011);
__m512 tmp6588 = _mm512_unpackhi_ps(tmp6568, in1011);
__m512 tmp6589 = _mm512_unpacklo_ps(tmp6566, in1007);
__m512 tmp6590 = _mm512_unpackhi_ps(tmp6566, in1007);
__m512 tmp6591 = _mm512_unpacklo_ps(in1009, in1008);
__m512 tmp6592 = _mm512_unpackhi_ps(in1009, in1008);
__m512 tmp6593 = _mm512_shuffle_ps(tmp6577, tmp6579, 68);
__m512 tmp6594 = _mm512_shuffle_ps(tmp6577, tmp6579, 238);
__m512 tmp6595 = _mm512_shuffle_ps(tmp6578, tmp6580, 68);
__m512 tmp6596 = _mm512_shuffle_ps(tmp6578, tmp6580, 238);
__m512 tmp6597 = _mm512_shuffle_ps(tmp6581, tmp6583, 68);
__m512 tmp6598 = _mm512_shuffle_ps(tmp6581, tmp6583, 238);
__m512 tmp6599 = _mm512_shuffle_ps(tmp6582, tmp6584, 68);
__m512 tmp6600 = _mm512_shuffle_ps(tmp6582, tmp6584, 238);
__m512 tmp6601 = _mm512_shuffle_ps(tmp6585, tmp6587, 68);
__m512 tmp6602 = _mm512_shuffle_ps(tmp6585, tmp6587, 238);
__m512 tmp6603 = _mm512_shuffle_ps(tmp6586, tmp6588, 68);
__m512 tmp6604 = _mm512_shuffle_ps(tmp6586, tmp6588, 238);
__m512 tmp6605 = _mm512_shuffle_ps(tmp6589, tmp6591, 68);
__m512 tmp6606 = _mm512_shuffle_ps(tmp6589, tmp6591, 238);
__m512 tmp6607 = _mm512_shuffle_ps(tmp6590, tmp6592, 68);
__m512 tmp6608 = _mm512_shuffle_ps(tmp6590, tmp6592, 238);
__m512 tmp6609 = _mm512_shuffle_f32x4(tmp6593, tmp6597, 136);
__m512 tmp6610 = _mm512_shuffle_f32x4(tmp6593, tmp6597, 221);
__m512 tmp6611 = _mm512_shuffle_f32x4(tmp6594, tmp6598, 136);
__m512 tmp6612 = _mm512_shuffle_f32x4(tmp6594, tmp6598, 221);
__m512 tmp6613 = _mm512_shuffle_f32x4(tmp6595, tmp6599, 136);
__m512 tmp6614 = _mm512_shuffle_f32x4(tmp6595, tmp6599, 221);
__m512 tmp6615 = _mm512_shuffle_f32x4(tmp6596, tmp6600, 136);
__m512 tmp6616 = _mm512_shuffle_f32x4(tmp6596, tmp6600, 221);
__m512 tmp6617 = _mm512_shuffle_f32x4(tmp6601, tmp6605, 136);
__m512 tmp6618 = _mm512_shuffle_f32x4(tmp6601, tmp6605, 221);
__m512 tmp6619 = _mm512_shuffle_f32x4(tmp6602, tmp6606, 136);
__m512 tmp6620 = _mm512_shuffle_f32x4(tmp6602, tmp6606, 221);
__m512 tmp6621 = _mm512_shuffle_f32x4(tmp6603, tmp6607, 136);
__m512 tmp6622 = _mm512_shuffle_f32x4(tmp6603, tmp6607, 221);
__m512 tmp6623 = _mm512_shuffle_f32x4(tmp6604, tmp6608, 136);
__m512 tmp6624 = _mm512_shuffle_f32x4(tmp6604, tmp6608, 221);
in997 = _mm512_shuffle_f32x4(tmp6609, tmp6617, 136);
in1005 = _mm512_shuffle_f32x4(tmp6609, tmp6617, 221);
tmp6563 = _mm512_shuffle_f32x4(tmp6611, tmp6619, 136);
tmp6567 = _mm512_shuffle_f32x4(tmp6611, tmp6619, 221);
tmp6564 = _mm512_shuffle_f32x4(tmp6613, tmp6621, 136);
tmp6568 = _mm512_shuffle_f32x4(tmp6613, tmp6621, 221);
in1003 = _mm512_shuffle_f32x4(tmp6615, tmp6623, 136);
in1011 = _mm512_shuffle_f32x4(tmp6615, tmp6623, 221);
tmp6562 = _mm512_shuffle_f32x4(tmp6610, tmp6618, 136);
tmp6566 = _mm512_shuffle_f32x4(tmp6610, tmp6618, 221);
in999 = _mm512_shuffle_f32x4(tmp6612, tmp6620, 136);
in1007 = _mm512_shuffle_f32x4(tmp6612, tmp6620, 221);
in1001 = _mm512_shuffle_f32x4(tmp6614, tmp6622, 136);
in1009 = _mm512_shuffle_f32x4(tmp6614, tmp6622, 221);
in1000 = _mm512_shuffle_f32x4(tmp6616, tmp6624, 136);
in1008 = _mm512_shuffle_f32x4(tmp6616, tmp6624, 221);
__m512 tmp6569 = _mm512_add_ps(tmp6563, in999);
__m512 tmp6573 = _mm512_add_ps(tmp6567, in1007);
__m512 tmp6570 = _mm512_sub_ps(tmp6562, tmp6564);
__m512 tmp6574 = _mm512_sub_ps(tmp6566, tmp6568);
__m512 tmp6571 = _mm512_add_ps(tmp6564, in1001);
__m512 tmp6575 = _mm512_add_ps(tmp6568, in1009);
in997 = _mm512_sub_ps(in997, in1001);
in1005 = _mm512_sub_ps(in1005, in1009);
tmp6569 = _mm512_fmadd_ps(in1003, _mm512_set1_ps(-4.25e+00f), tmp6569);
tmp6573 = _mm512_fmadd_ps(in1011, _mm512_set1_ps(-4.25e+00f), tmp6573);
tmp6571 = _mm512_fmadd_ps(tmp6562, _mm512_set1_ps(-4.25e+00f), tmp6571);
tmp6575 = _mm512_fmadd_ps(tmp6566, _mm512_set1_ps(-4.25e+00f), tmp6575);
in997 = _mm512_fmadd_ps(tmp6570, _mm512_set1_ps(5.25e+00f), in997);
in1005 = _mm512_fmadd_ps(tmp6574, _mm512_set1_ps(5.25e+00f), in1005);
tmp6570 = _mm512_fmadd_ps(tmp6564, _mm512_set1_ps(2.5e-01f), in1001);
tmp6574 = _mm512_fmadd_ps(tmp6568, _mm512_set1_ps(2.5e-01f), in1009);
tmp6564 = _mm512_fmadd_ps(tmp6564, _mm512_set1_ps(4e+00f), in1001);
tmp6568 = _mm512_fmadd_ps(tmp6568, _mm512_set1_ps(4e+00f), in1009);
__m512 tmp6572 = _mm512_sub_ps(tmp6571, tmp6569);
__m512 tmp6576 = _mm512_sub_ps(tmp6575, tmp6573);
tmp6571 = _mm512_add_ps(tmp6569, tmp6571);
tmp6575 = _mm512_add_ps(tmp6573, tmp6575);
tmp6569 = _mm512_fmadd_ps(tmp6563, _mm512_set1_ps(2.5e-01f), in999);
tmp6573 = _mm512_fmadd_ps(tmp6567, _mm512_set1_ps(2.5e-01f), in1007);
tmp6570 = _mm512_fmadd_ps(tmp6562, _mm512_set1_ps(-1.25e+00f), tmp6570);
tmp6574 = _mm512_fmadd_ps(tmp6566, _mm512_set1_ps(-1.25e+00f), tmp6574);
tmp6562 = _mm512_fmadd_ps(tmp6562, _mm512_set1_ps(-5e+00f), tmp6564);
tmp6566 = _mm512_fmadd_ps(tmp6566, _mm512_set1_ps(-5e+00f), tmp6568);
tmp6569 = _mm512_fmadd_ps(in1003, _mm512_set1_ps(-1.25e+00f), tmp6569);
tmp6573 = _mm512_fmadd_ps(in1011, _mm512_set1_ps(-1.25e+00f), tmp6573);
in1001 = _mm512_fmadd_ps(tmp6569, _mm512_set1_ps(2e+00f), tmp6570);
in1009 = _mm512_fmadd_ps(tmp6573, _mm512_set1_ps(2e+00f), tmp6574);
tmp6570 = _mm512_fnmadd_ps(tmp6569, _mm512_set1_ps(2e+00f), tmp6570);
tmp6574 = _mm512_fnmadd_ps(tmp6573, _mm512_set1_ps(2e+00f), tmp6574);
tmp6569 = _mm512_fmadd_ps(in999, _mm512_set1_ps(2.5e-01f), tmp6563);
tmp6573 = _mm512_fmadd_ps(in1007, _mm512_set1_ps(2.5e-01f), tmp6567);
tmp6563 = _mm512_sub_ps(in1000, tmp6563);
tmp6567 = _mm512_sub_ps(in1008, tmp6567);
tmp6569 = _mm512_fmadd_ps(in1003, _mm512_set1_ps(-1.25e+00f), tmp6569);
tmp6573 = _mm512_fmadd_ps(in1011, _mm512_set1_ps(-1.25e+00f), tmp6573);
in1003 = _mm512_sub_ps(in1003, in999);
in1011 = _mm512_sub_ps(in1011, in1007);
in1003 = _mm512_fmadd_ps(in1003, _mm512_set1_ps(5.25e+00f), tmp6563);
in1011 = _mm512_fmadd_ps(in1011, _mm512_set1_ps(5.25e+00f), tmp6567);
tmp6564 = _mm512_fmadd_ps(tmp6569, _mm512_set1_ps(2e+00f), tmp6562);
tmp6568 = _mm512_fmadd_ps(tmp6573, _mm512_set1_ps(2e+00f), tmp6566);
tmp6562 = _mm512_fnmadd_ps(tmp6569, _mm512_set1_ps(2e+00f), tmp6562);
tmp6566 = _mm512_fnmadd_ps(tmp6573, _mm512_set1_ps(2e+00f), tmp6566);
// Pair up the sixteen result registers for output. With immediate 68 (0x44)
// _mm512_shuffle_f32x4 concatenates the lower 256 bits of the first operand
// with the lower 256 bits of the second; with 238 (0xEE) it does the same for
// the upper 256 bits.
__m512 out947 = _mm512_shuffle_f32x4(in997, tmp6571, 68);
__m512 out955 = _mm512_shuffle_f32x4(in997, tmp6571, 238);
__m512 out948 = _mm512_shuffle_f32x4(tmp6572, in1001, 68);
__m512 out956 = _mm512_shuffle_f32x4(tmp6572, in1001, 238);
__m512 out949 = _mm512_shuffle_f32x4(tmp6570, tmp6564, 68);
__m512 out957 = _mm512_shuffle_f32x4(tmp6570, tmp6564, 238);
__m512 out950 = _mm512_shuffle_f32x4(tmp6562, in1003, 68);
__m512 out958 = _mm512_shuffle_f32x4(tmp6562, in1003, 238);
__m512 out951 = _mm512_shuffle_f32x4(in1005, tmp6575, 68);
__m512 out959 = _mm512_shuffle_f32x4(in1005, tmp6575, 238);
__m512 out952 = _mm512_shuffle_f32x4(tmp6576, in1009, 68);
__m512 out960 = _mm512_shuffle_f32x4(tmp6576, in1009, 238);
__m512 out953 = _mm512_shuffle_f32x4(tmp6574, tmp6568, 68);
__m512 out961 = _mm512_shuffle_f32x4(tmp6574, tmp6568, 238);
__m512 out954 = _mm512_shuffle_f32x4(tmp6566, in1011, 68);
__m512 out962 = _mm512_shuffle_f32x4(tmp6566, in1011, 238);
// Sixteen unaligned 512-bit stores into dfPtr6, in four groups of four.
// Group bases are 0, 409600, 819200 and 1228800, i.e. 409600 apart; inside a
// group the constant offsets are base+0, +128, +64, +192 in program order.
// The shared index term is 1638400*i26 + 24576*j21 + 24576*s20 + 768*k85.
// (Offsets are as written; the pointee type of dfPtr6 is declared outside
// this excerpt.)
_mm512_storeu_ps(dfPtr6+0+1638400*i26+24576*j21+24576*s20+768*k85, out947);
_mm512_storeu_ps(dfPtr6+128+1638400*i26+24576*j21+24576*s20+768*k85, out955);
_mm512_storeu_ps(dfPtr6+64+1638400*i26+24576*j21+24576*s20+768*k85, out951);
_mm512_storeu_ps(dfPtr6+192+1638400*i26+24576*j21+24576*s20+768*k85, out959);
_mm512_storeu_ps(dfPtr6+409600+1638400*i26+24576*j21+24576*s20+768*k85, out948);
_mm512_storeu_ps(dfPtr6+409728+1638400*i26+24576*j21+24576*s20+768*k85, out956);
_mm512_storeu_ps(dfPtr6+409664+1638400*i26+24576*j21+24576*s20+768*k85, out952);
_mm512_storeu_ps(dfPtr6+409792+1638400*i26+24576*j21+24576*s20+768*k85, out960);
_mm512_storeu_ps(dfPtr6+819200+1638400*i26+24576*j21+24576*s20+768*k85, out949);
_mm512_storeu_ps(dfPtr6+819328+1638400*i26+24576*j21+24576*s20+768*k85, out957);
_mm512_storeu_ps(dfPtr6+819264+1638400*i26+24576*j21+24576*s20+768*k85, out953);
_mm512_storeu_ps(dfPtr6+819392+1638400*i26+24576*j21+24576*s20+768*k85, out961);
_mm512_storeu_ps(dfPtr6+1228800+1638400*i26+24576*j21+24576*s20+768*k85, out950);
_mm512_storeu_ps(dfPtr6+1228928+1638400*i26+24576*j21+24576*s20+768*k85, out958);
_mm512_storeu_ps(dfPtr6+1228864+1638400*i26+24576*j21+24576*s20+768*k85, out954);
_mm512_storeu_ps(dfPtr6+1228992+1638400*i26+24576*j21+24576*s20+768*k85, out962);
__m512 dat1541 = _mm512_maskz_loadu_ps(16383, datPtr12+96+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1542 = _mm512_maskz_loadu_ps(8191, datPtr12+12612+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512i pm137 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1013 = _mm512_permutexvar_ps(pm137, dat1541);
__m512i pm138 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1021 = _mm512_permutexvar_ps(pm138, dat1542);
__m512 dat1543 = _mm512_maskz_loadu_ps(16383, datPtr12+320+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1544 = _mm512_maskz_loadu_ps(8191, datPtr12+12836+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1014 = _mm512_permutexvar_ps(pm137, dat1543);
__m512 in1022 = _mm512_permutexvar_ps(pm138, dat1544);
__m512 dat1545 = _mm512_maskz_loadu_ps(16383, datPtr12+544+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1546 = _mm512_maskz_loadu_ps(8191, datPtr12+13060+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1015 = _mm512_permutexvar_ps(pm137, dat1545);
__m512 in1023 = _mm512_permutexvar_ps(pm138, dat1546);
__m512 dat1547 = _mm512_maskz_loadu_ps(16383, datPtr12+768+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
// Remaining row loads for this tile group. Mask 16383 (0x3FFF) reads lanes
// 0..13 and mask 8191 (0x1FFF) reads lanes 0..12; masked-off lanes are
// zero-filled. Successive loads of each set are 224 offset units apart.
// pm137/pm138, dat1547, in1013..in1015 and in1021..in1023 are set up earlier
// in this function.
__m512 dat1548 = _mm512_maskz_loadu_ps(8191, datPtr12+13284+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1016 = _mm512_permutexvar_ps(pm137, dat1547);
__m512 in1024 = _mm512_permutexvar_ps(pm138, dat1548);
__m512 dat1549 = _mm512_maskz_loadu_ps(16383, datPtr12+992+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1550 = _mm512_maskz_loadu_ps(8191, datPtr12+13508+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1017 = _mm512_permutexvar_ps(pm137, dat1549);
__m512 in1025 = _mm512_permutexvar_ps(pm138, dat1550);
__m512 dat1551 = _mm512_maskz_loadu_ps(16383, datPtr12+1216+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1552 = _mm512_maskz_loadu_ps(8191, datPtr12+13732+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1018 = _mm512_permutexvar_ps(pm137, dat1551);
__m512 in1026 = _mm512_permutexvar_ps(pm138, dat1552);
__m512 dat1553 = _mm512_maskz_loadu_ps(16383, datPtr12+1440+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1554 = _mm512_maskz_loadu_ps(8191, datPtr12+13956+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1019 = _mm512_permutexvar_ps(pm137, dat1553);
__m512 in1027 = _mm512_permutexvar_ps(pm138, dat1554);
__m512 dat1555 = _mm512_maskz_loadu_ps(16383, datPtr12+1664+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1556 = _mm512_maskz_loadu_ps(8191, datPtr12+14180+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1020 = _mm512_permutexvar_ps(pm137, dat1555);
__m512 in1028 = _mm512_permutexvar_ps(pm138, dat1556);
// First pass: lane-wise linear combination of the eight row vectors.
// With r0..r7 = in1013..in1020 (second set: in1021..in1028) it produces
//   in1013  = r0 - r6 + 5.25*(r4 - r2)
//   tmp6627 = (r2 + r6 - 4.25*r4) + (r1 + r5 - 4.25*r3)
//   tmp6628 = (r2 + r6 - 4.25*r4) - (r1 + r5 - 4.25*r3)
//   in1019  = (0.25*r2 + r6 - 1.25*r4) + 2*(0.25*r1 + r5 - 1.25*r3)
//   tmp6626 = (0.25*r2 + r6 - 1.25*r4) - 2*(0.25*r1 + r5 - 1.25*r3)
//   in1015  = (4*r2 + r6 - 5*r4) + 2*(r1 + 0.25*r5 - 1.25*r3)
//   in1017  = (4*r2 + r6 - 5*r4) - 2*(r1 + 0.25*r5 - 1.25*r3)
//   in1016  = r7 - r1 + 5.25*(r3 - r5)
// Statements for the two sets are interleaved; inputs are overwritten in
// place, so the statement order matters.
// NOTE(review): these coefficients look like the 8-point Winograd F(6,3)
// input transform -- confirm against the generator.
__m512 tmp6625 = _mm512_add_ps(in1014, in1018);
__m512 tmp6629 = _mm512_add_ps(in1022, in1026);
__m512 tmp6626 = _mm512_sub_ps(in1017, in1015);
__m512 tmp6630 = _mm512_sub_ps(in1025, in1023);
__m512 tmp6627 = _mm512_add_ps(in1015, in1019);
__m512 tmp6631 = _mm512_add_ps(in1023, in1027);
in1013 = _mm512_sub_ps(in1013, in1019);
in1021 = _mm512_sub_ps(in1021, in1027);
tmp6625 = _mm512_fmadd_ps(in1016, _mm512_set1_ps(-4.25e+00f), tmp6625);
tmp6629 = _mm512_fmadd_ps(in1024, _mm512_set1_ps(-4.25e+00f), tmp6629);
tmp6627 = _mm512_fmadd_ps(in1017, _mm512_set1_ps(-4.25e+00f), tmp6627);
tmp6631 = _mm512_fmadd_ps(in1025, _mm512_set1_ps(-4.25e+00f), tmp6631);
in1013 = _mm512_fmadd_ps(tmp6626, _mm512_set1_ps(5.25e+00f), in1013);
in1021 = _mm512_fmadd_ps(tmp6630, _mm512_set1_ps(5.25e+00f), in1021);
tmp6626 = _mm512_fmadd_ps(in1015, _mm512_set1_ps(2.5e-01f), in1019);
tmp6630 = _mm512_fmadd_ps(in1023, _mm512_set1_ps(2.5e-01f), in1027);
in1015 = _mm512_fmadd_ps(in1015, _mm512_set1_ps(4e+00f), in1019);
in1023 = _mm512_fmadd_ps(in1023, _mm512_set1_ps(4e+00f), in1027);
__m512 tmp6628 = _mm512_sub_ps(tmp6627, tmp6625);
__m512 tmp6632 = _mm512_sub_ps(tmp6631, tmp6629);
tmp6627 = _mm512_add_ps(tmp6625, tmp6627);
tmp6631 = _mm512_add_ps(tmp6629, tmp6631);
tmp6625 = _mm512_fmadd_ps(in1014, _mm512_set1_ps(2.5e-01f), in1018);
tmp6629 = _mm512_fmadd_ps(in1022, _mm512_set1_ps(2.5e-01f), in1026);
tmp6626 = _mm512_fmadd_ps(in1017, _mm512_set1_ps(-1.25e+00f), tmp6626);
tmp6630 = _mm512_fmadd_ps(in1025, _mm512_set1_ps(-1.25e+00f), tmp6630);
in1017 = _mm512_fmadd_ps(in1017, _mm512_set1_ps(-5e+00f), in1015);
in1025 = _mm512_fmadd_ps(in1025, _mm512_set1_ps(-5e+00f), in1023);
tmp6625 = _mm512_fmadd_ps(in1016, _mm512_set1_ps(-1.25e+00f), tmp6625);
tmp6629 = _mm512_fmadd_ps(in1024, _mm512_set1_ps(-1.25e+00f), tmp6629);
in1019 = _mm512_fmadd_ps(tmp6625, _mm512_set1_ps(2e+00f), tmp6626);
in1027 = _mm512_fmadd_ps(tmp6629, _mm512_set1_ps(2e+00f), tmp6630);
tmp6626 = _mm512_fnmadd_ps(tmp6625, _mm512_set1_ps(2e+00f), tmp6626);
tmp6630 = _mm512_fnmadd_ps(tmp6629, _mm512_set1_ps(2e+00f), tmp6630);
tmp6625 = _mm512_fmadd_ps(in1018, _mm512_set1_ps(2.5e-01f), in1014);
tmp6629 = _mm512_fmadd_ps(in1026, _mm512_set1_ps(2.5e-01f), in1022);
in1014 = _mm512_sub_ps(in1020, in1014);
in1022 = _mm512_sub_ps(in1028, in1022);
tmp6625 = _mm512_fmadd_ps(in1016, _mm512_set1_ps(-1.25e+00f), tmp6625);
tmp6629 = _mm512_fmadd_ps(in1024, _mm512_set1_ps(-1.25e+00f), tmp6629);
in1016 = _mm512_sub_ps(in1016, in1018);
in1024 = _mm512_sub_ps(in1024, in1026);
in1016 = _mm512_fmadd_ps(in1016, _mm512_set1_ps(5.25e+00f), in1014);
in1024 = _mm512_fmadd_ps(in1024, _mm512_set1_ps(5.25e+00f), in1022);
in1015 = _mm512_fmadd_ps(tmp6625, _mm512_set1_ps(2e+00f), in1017);
in1023 = _mm512_fmadd_ps(tmp6629, _mm512_set1_ps(2e+00f), in1025);
in1017 = _mm512_fnmadd_ps(tmp6625, _mm512_set1_ps(2e+00f), in1017);
in1025 = _mm512_fnmadd_ps(tmp6629, _mm512_set1_ps(2e+00f), in1025);
// Transpose, step 1: interleave 32-bit elements of adjacent result rows
// (rows taken in the order in1013, tmp6627, tmp6628, in1019, tmp6626,
// in1015, in1017, in1016 and the matching second-set registers).
__m512 tmp6641 = _mm512_unpacklo_ps(in1013, tmp6627);
__m512 tmp6642 = _mm512_unpackhi_ps(in1013, tmp6627);
__m512 tmp6643 = _mm512_unpacklo_ps(tmp6628, in1019);
__m512 tmp6644 = _mm512_unpackhi_ps(tmp6628, in1019);
__m512 tmp6645 = _mm512_unpacklo_ps(tmp6626, in1015);
__m512 tmp6646 = _mm512_unpackhi_ps(tmp6626, in1015);
__m512 tmp6647 = _mm512_unpacklo_ps(in1017, in1016);
__m512 tmp6648 = _mm512_unpackhi_ps(in1017, in1016);
__m512 tmp6649 = _mm512_unpacklo_ps(in1021, tmp6631);
__m512 tmp6650 = _mm512_unpackhi_ps(in1021, tmp6631);
__m512 tmp6651 = _mm512_unpacklo_ps(tmp6632, in1027);
__m512 tmp6652 = _mm512_unpackhi_ps(tmp6632, in1027);
__m512 tmp6653 = _mm512_unpacklo_ps(tmp6630, in1023);
__m512 tmp6654 = _mm512_unpackhi_ps(tmp6630, in1023);
__m512 tmp6655 = _mm512_unpacklo_ps(in1025, in1024);
__m512 tmp6656 = _mm512_unpackhi_ps(in1025, in1024);
// Transpose, step 2: immediate 68 keeps the low 64-bit pair of each operand,
// 238 the high pair, so every 128-bit lane now holds one column element from
// four consecutive rows.
__m512 tmp6657 = _mm512_shuffle_ps(tmp6641, tmp6643, 68);
__m512 tmp6658 = _mm512_shuffle_ps(tmp6641, tmp6643, 238);
__m512 tmp6659 = _mm512_shuffle_ps(tmp6642, tmp6644, 68);
__m512 tmp6660 = _mm512_shuffle_ps(tmp6642, tmp6644, 238);
__m512 tmp6661 = _mm512_shuffle_ps(tmp6645, tmp6647, 68);
__m512 tmp6662 = _mm512_shuffle_ps(tmp6645, tmp6647, 238);
__m512 tmp6663 = _mm512_shuffle_ps(tmp6646, tmp6648, 68);
__m512 tmp6664 = _mm512_shuffle_ps(tmp6646, tmp6648, 238);
__m512 tmp6665 = _mm512_shuffle_ps(tmp6649, tmp6651, 68);
__m512 tmp6666 = _mm512_shuffle_ps(tmp6649, tmp6651, 238);
__m512 tmp6667 = _mm512_shuffle_ps(tmp6650, tmp6652, 68);
__m512 tmp6668 = _mm512_shuffle_ps(tmp6650, tmp6652, 238);
__m512 tmp6669 = _mm512_shuffle_ps(tmp6653, tmp6655, 68);
__m512 tmp6670 = _mm512_shuffle_ps(tmp6653, tmp6655, 238);
__m512 tmp6671 = _mm512_shuffle_ps(tmp6654, tmp6656, 68);
__m512 tmp6672 = _mm512_shuffle_ps(tmp6654, tmp6656, 238);
// Transpose, step 3: regroup 128-bit lanes. Immediate 136 selects lanes 0 and
// 2 of each operand, 221 selects lanes 1 and 3.
__m512 tmp6673 = _mm512_shuffle_f32x4(tmp6657, tmp6661, 136);
__m512 tmp6674 = _mm512_shuffle_f32x4(tmp6657, tmp6661, 221);
__m512 tmp6675 = _mm512_shuffle_f32x4(tmp6658, tmp6662, 136);
__m512 tmp6676 = _mm512_shuffle_f32x4(tmp6658, tmp6662, 221);
__m512 tmp6677 = _mm512_shuffle_f32x4(tmp6659, tmp6663, 136);
__m512 tmp6678 = _mm512_shuffle_f32x4(tmp6659, tmp6663, 221);
__m512 tmp6679 = _mm512_shuffle_f32x4(tmp6660, tmp6664, 136);
__m512 tmp6680 = _mm512_shuffle_f32x4(tmp6660, tmp6664, 221);
__m512 tmp6681 = _mm512_shuffle_f32x4(tmp6665, tmp6669, 136);
__m512 tmp6682 = _mm512_shuffle_f32x4(tmp6665, tmp6669, 221);
__m512 tmp6683 = _mm512_shuffle_f32x4(tmp6666, tmp6670, 136);
__m512 tmp6684 = _mm512_shuffle_f32x4(tmp6666, tmp6670, 221);
__m512 tmp6685 = _mm512_shuffle_f32x4(tmp6667, tmp6671, 136);
__m512 tmp6686 = _mm512_shuffle_f32x4(tmp6667, tmp6671, 221);
__m512 tmp6687 = _mm512_shuffle_f32x4(tmp6668, tmp6672, 136);
__m512 tmp6688 = _mm512_shuffle_f32x4(tmp6668, tmp6672, 221);
// Transpose, step 4: merge the two register sets. Each 8-lane half of a
// result now holds one column (all eight rows) of one 8x8 window. The
// registers written with 136 carry the windows that sat in lanes 0..7 of the
// two input sets; those written with 221 carry the lanes 8..15 windows.
in1013 = _mm512_shuffle_f32x4(tmp6673, tmp6681, 136);
in1021 = _mm512_shuffle_f32x4(tmp6673, tmp6681, 221);
tmp6627 = _mm512_shuffle_f32x4(tmp6675, tmp6683, 136);
tmp6631 = _mm512_shuffle_f32x4(tmp6675, tmp6683, 221);
tmp6628 = _mm512_shuffle_f32x4(tmp6677, tmp6685, 136);
tmp6632 = _mm512_shuffle_f32x4(tmp6677, tmp6685, 221);
in1019 = _mm512_shuffle_f32x4(tmp6679, tmp6687, 136);
in1027 = _mm512_shuffle_f32x4(tmp6679, tmp6687, 221);
tmp6626 = _mm512_shuffle_f32x4(tmp6674, tmp6682, 136);
tmp6630 = _mm512_shuffle_f32x4(tmp6674, tmp6682, 221);
in1015 = _mm512_shuffle_f32x4(tmp6676, tmp6684, 136);
in1023 = _mm512_shuffle_f32x4(tmp6676, tmp6684, 221);
in1017 = _mm512_shuffle_f32x4(tmp6678, tmp6686, 136);
in1025 = _mm512_shuffle_f32x4(tmp6678, tmp6686, 221);
in1016 = _mm512_shuffle_f32x4(tmp6680, tmp6688, 136);
in1024 = _mm512_shuffle_f32x4(tmp6680, tmp6688, 221);
// Second pass: the same eight-input linear combination as the first pass,
// now applied to the transposed data (column order: in1013, tmp6627,
// tmp6628, in1019, tmp6626, in1015, in1017, in1016). Results, in output
// order: in1013, tmp6635, tmp6636, in1017, tmp6634, tmp6628, tmp6626, in1019.
__m512 tmp6633 = _mm512_add_ps(tmp6627, in1015);
__m512 tmp6637 = _mm512_add_ps(tmp6631, in1023);
__m512 tmp6634 = _mm512_sub_ps(tmp6626, tmp6628);
__m512 tmp6638 = _mm512_sub_ps(tmp6630, tmp6632);
__m512 tmp6635 = _mm512_add_ps(tmp6628, in1017);
__m512 tmp6639 = _mm512_add_ps(tmp6632, in1025);
in1013 = _mm512_sub_ps(in1013, in1017);
in1021 = _mm512_sub_ps(in1021, in1025);
tmp6633 = _mm512_fmadd_ps(in1019, _mm512_set1_ps(-4.25e+00f), tmp6633);
tmp6637 = _mm512_fmadd_ps(in1027, _mm512_set1_ps(-4.25e+00f), tmp6637);
tmp6635 = _mm512_fmadd_ps(tmp6626, _mm512_set1_ps(-4.25e+00f), tmp6635);
tmp6639 = _mm512_fmadd_ps(tmp6630, _mm512_set1_ps(-4.25e+00f), tmp6639);
in1013 = _mm512_fmadd_ps(tmp6634, _mm512_set1_ps(5.25e+00f), in1013);
in1021 = _mm512_fmadd_ps(tmp6638, _mm512_set1_ps(5.25e+00f), in1021);
tmp6634 = _mm512_fmadd_ps(tmp6628, _mm512_set1_ps(2.5e-01f), in1017);
tmp6638 = _mm512_fmadd_ps(tmp6632, _mm512_set1_ps(2.5e-01f), in1025);
tmp6628 = _mm512_fmadd_ps(tmp6628, _mm512_set1_ps(4e+00f), in1017);
tmp6632 = _mm512_fmadd_ps(tmp6632, _mm512_set1_ps(4e+00f), in1025);
__m512 tmp6636 = _mm512_sub_ps(tmp6635, tmp6633);
__m512 tmp6640 = _mm512_sub_ps(tmp6639, tmp6637);
tmp6635 = _mm512_add_ps(tmp6633, tmp6635);
tmp6639 = _mm512_add_ps(tmp6637, tmp6639);
tmp6633 = _mm512_fmadd_ps(tmp6627, _mm512_set1_ps(2.5e-01f), in1015);
tmp6637 = _mm512_fmadd_ps(tmp6631, _mm512_set1_ps(2.5e-01f), in1023);
tmp6634 = _mm512_fmadd_ps(tmp6626, _mm512_set1_ps(-1.25e+00f), tmp6634);
tmp6638 = _mm512_fmadd_ps(tmp6630, _mm512_set1_ps(-1.25e+00f), tmp6638);
tmp6626 = _mm512_fmadd_ps(tmp6626, _mm512_set1_ps(-5e+00f), tmp6628);
tmp6630 = _mm512_fmadd_ps(tmp6630, _mm512_set1_ps(-5e+00f), tmp6632);
tmp6633 = _mm512_fmadd_ps(in1019, _mm512_set1_ps(-1.25e+00f), tmp6633);
tmp6637 = _mm512_fmadd_ps(in1027, _mm512_set1_ps(-1.25e+00f), tmp6637);
in1017 = _mm512_fmadd_ps(tmp6633, _mm512_set1_ps(2e+00f), tmp6634);
in1025 = _mm512_fmadd_ps(tmp6637, _mm512_set1_ps(2e+00f), tmp6638);
tmp6634 = _mm512_fnmadd_ps(tmp6633, _mm512_set1_ps(2e+00f), tmp6634);
tmp6638 = _mm512_fnmadd_ps(tmp6637, _mm512_set1_ps(2e+00f), tmp6638);
tmp6633 = _mm512_fmadd_ps(in1015, _mm512_set1_ps(2.5e-01f), tmp6627);
tmp6637 = _mm512_fmadd_ps(in1023, _mm512_set1_ps(2.5e-01f), tmp6631);
tmp6627 = _mm512_sub_ps(in1016, tmp6627);
tmp6631 = _mm512_sub_ps(in1024, tmp6631);
tmp6633 = _mm512_fmadd_ps(in1019, _mm512_set1_ps(-1.25e+00f), tmp6633);
tmp6637 = _mm512_fmadd_ps(in1027, _mm512_set1_ps(-1.25e+00f), tmp6637);
in1019 = _mm512_sub_ps(in1019, in1015);
in1027 = _mm512_sub_ps(in1027, in1023);
in1019 = _mm512_fmadd_ps(in1019, _mm512_set1_ps(5.25e+00f), tmp6627);
in1027 = _mm512_fmadd_ps(in1027, _mm512_set1_ps(5.25e+00f), tmp6631);
tmp6628 = _mm512_fmadd_ps(tmp6633, _mm512_set1_ps(2e+00f), tmp6626);
tmp6632 = _mm512_fmadd_ps(tmp6637, _mm512_set1_ps(2e+00f), tmp6630);
tmp6626 = _mm512_fnmadd_ps(tmp6633, _mm512_set1_ps(2e+00f), tmp6626);
tmp6630 = _mm512_fnmadd_ps(tmp6637, _mm512_set1_ps(2e+00f), tmp6630);
// Pack two consecutive result vectors into one register: immediate 68 joins
// the low 256 bits of both operands, 238 joins the high 256 bits.
__m512 out963 = _mm512_shuffle_f32x4(in1013, tmp6635, 68);
__m512 out971 = _mm512_shuffle_f32x4(in1013, tmp6635, 238);
__m512 out964 = _mm512_shuffle_f32x4(tmp6636, in1017, 68);
__m512 out972 = _mm512_shuffle_f32x4(tmp6636, in1017, 238);
__m512 out965 = _mm512_shuffle_f32x4(tmp6634, tmp6628, 68);
__m512 out973 = _mm512_shuffle_f32x4(tmp6634, tmp6628, 238);
__m512 out966 = _mm512_shuffle_f32x4(tmp6626, in1019, 68);
__m512 out974 = _mm512_shuffle_f32x4(tmp6626, in1019, 238);
__m512 out967 = _mm512_shuffle_f32x4(in1021, tmp6639, 68);
__m512 out975 = _mm512_shuffle_f32x4(in1021, tmp6639, 238);
__m512 out968 = _mm512_shuffle_f32x4(tmp6640, in1025, 68);
__m512 out976 = _mm512_shuffle_f32x4(tmp6640, in1025, 238);
__m512 out969 = _mm512_shuffle_f32x4(tmp6638, tmp6632, 68);
__m512 out977 = _mm512_shuffle_f32x4(tmp6638, tmp6632, 238);
__m512 out970 = _mm512_shuffle_f32x4(tmp6630, in1027, 68);
__m512 out978 = _mm512_shuffle_f32x4(tmp6630, in1027, 238);
// Store the 16 packed vectors into dfPtr6. Four vectors go at base offsets
// 256, 320, 384 and 448, and the same pattern repeats at +409600, +819200
// and +1228800.
_mm512_storeu_ps(dfPtr6+256+1638400*i26+24576*j21+24576*s20+768*k85, out963);
_mm512_storeu_ps(dfPtr6+384+1638400*i26+24576*j21+24576*s20+768*k85, out971);
_mm512_storeu_ps(dfPtr6+320+1638400*i26+24576*j21+24576*s20+768*k85, out967);
_mm512_storeu_ps(dfPtr6+448+1638400*i26+24576*j21+24576*s20+768*k85, out975);
_mm512_storeu_ps(dfPtr6+409856+1638400*i26+24576*j21+24576*s20+768*k85, out964);
_mm512_storeu_ps(dfPtr6+409984+1638400*i26+24576*j21+24576*s20+768*k85, out972);
_mm512_storeu_ps(dfPtr6+409920+1638400*i26+24576*j21+24576*s20+768*k85, out968);
_mm512_storeu_ps(dfPtr6+410048+1638400*i26+24576*j21+24576*s20+768*k85, out976);
_mm512_storeu_ps(dfPtr6+819456+1638400*i26+24576*j21+24576*s20+768*k85, out965);
_mm512_storeu_ps(dfPtr6+819584+1638400*i26+24576*j21+24576*s20+768*k85, out973);
_mm512_storeu_ps(dfPtr6+819520+1638400*i26+24576*j21+24576*s20+768*k85, out969);
_mm512_storeu_ps(dfPtr6+819648+1638400*i26+24576*j21+24576*s20+768*k85, out977);
_mm512_storeu_ps(dfPtr6+1229056+1638400*i26+24576*j21+24576*s20+768*k85, out966);
_mm512_storeu_ps(dfPtr6+1229184+1638400*i26+24576*j21+24576*s20+768*k85, out974);
_mm512_storeu_ps(dfPtr6+1229120+1638400*i26+24576*j21+24576*s20+768*k85, out970);
_mm512_storeu_ps(dfPtr6+1229248+1638400*i26+24576*j21+24576*s20+768*k85, out978);
// Tile group: load, transform in two passes with a transpose in between, and
// store.
//
// Load eight vectors per set from datPtr12, successive loads 224 offset units
// apart (the same factor that multiplies h34). The second set (dat1558, ...)
// starts 48 units after the first. Mask 16383 (0x3FFF) reads lanes 0..13;
// lanes 14..15 are zero-filled.
// NOTE(review): the 4*w41 term suggests the offsets are bytes into float
// data, i.e. datPtr12 is a byte pointer -- its declaration is not visible
// here, confirm.
__m512 dat1557 = _mm512_maskz_loadu_ps(16383, datPtr12+12656+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1558 = _mm512_maskz_loadu_ps(16383, datPtr12+12704+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
// Lane permutation (arguments are listed from lane 15 down to lane 0):
// result lanes 0..7 take source elements 0..7 and result lanes 8..15 take
// source elements 6..13, i.e. two 8-wide windows overlapping by two elements.
__m512i pm139 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1029 = _mm512_permutexvar_ps(pm139, dat1557);
__m512 in1037 = _mm512_permutexvar_ps(pm139, dat1558);
__m512 dat1559 = _mm512_maskz_loadu_ps(16383, datPtr12+12880+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1560 = _mm512_maskz_loadu_ps(16383, datPtr12+12928+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1030 = _mm512_permutexvar_ps(pm139, dat1559);
__m512 in1038 = _mm512_permutexvar_ps(pm139, dat1560);
__m512 dat1561 = _mm512_maskz_loadu_ps(16383, datPtr12+13104+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1562 = _mm512_maskz_loadu_ps(16383, datPtr12+13152+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1031 = _mm512_permutexvar_ps(pm139, dat1561);
__m512 in1039 = _mm512_permutexvar_ps(pm139, dat1562);
__m512 dat1563 = _mm512_maskz_loadu_ps(16383, datPtr12+13328+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1564 = _mm512_maskz_loadu_ps(16383, datPtr12+13376+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1032 = _mm512_permutexvar_ps(pm139, dat1563);
__m512 in1040 = _mm512_permutexvar_ps(pm139, dat1564);
__m512 dat1565 = _mm512_maskz_loadu_ps(16383, datPtr12+13552+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1566 = _mm512_maskz_loadu_ps(16383, datPtr12+13600+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1033 = _mm512_permutexvar_ps(pm139, dat1565);
__m512 in1041 = _mm512_permutexvar_ps(pm139, dat1566);
__m512 dat1567 = _mm512_maskz_loadu_ps(16383, datPtr12+13776+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1568 = _mm512_maskz_loadu_ps(16383, datPtr12+13824+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1034 = _mm512_permutexvar_ps(pm139, dat1567);
__m512 in1042 = _mm512_permutexvar_ps(pm139, dat1568);
__m512 dat1569 = _mm512_maskz_loadu_ps(16383, datPtr12+14000+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1570 = _mm512_maskz_loadu_ps(16383, datPtr12+14048+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1035 = _mm512_permutexvar_ps(pm139, dat1569);
__m512 in1043 = _mm512_permutexvar_ps(pm139, dat1570);
__m512 dat1571 = _mm512_maskz_loadu_ps(16383, datPtr12+14224+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 dat1572 = _mm512_maskz_loadu_ps(16383, datPtr12+14272+806912*i26+224*h34+4*w41+806912*s20+25216*k85);
__m512 in1036 = _mm512_permutexvar_ps(pm139, dat1571);
__m512 in1044 = _mm512_permutexvar_ps(pm139, dat1572);
// First pass: lane-wise linear combination of the eight row vectors.
// With r0..r7 = in1029..in1036 (second set: in1037..in1044) it produces
//   in1029  = r0 - r6 + 5.25*(r4 - r2)
//   tmp6691 = (r2 + r6 - 4.25*r4) + (r1 + r5 - 4.25*r3)
//   tmp6692 = (r2 + r6 - 4.25*r4) - (r1 + r5 - 4.25*r3)
//   in1035  = (0.25*r2 + r6 - 1.25*r4) + 2*(0.25*r1 + r5 - 1.25*r3)
//   tmp6690 = (0.25*r2 + r6 - 1.25*r4) - 2*(0.25*r1 + r5 - 1.25*r3)
//   in1031  = (4*r2 + r6 - 5*r4) + 2*(r1 + 0.25*r5 - 1.25*r3)
//   in1033  = (4*r2 + r6 - 5*r4) - 2*(r1 + 0.25*r5 - 1.25*r3)
//   in1032  = r7 - r1 + 5.25*(r3 - r5)
// Statements for the two sets are interleaved; inputs are overwritten in
// place, so the statement order matters.
// NOTE(review): these coefficients look like the 8-point Winograd F(6,3)
// input transform -- confirm against the generator.
__m512 tmp6689 = _mm512_add_ps(in1030, in1034);
__m512 tmp6693 = _mm512_add_ps(in1038, in1042);
__m512 tmp6690 = _mm512_sub_ps(in1033, in1031);
__m512 tmp6694 = _mm512_sub_ps(in1041, in1039);
__m512 tmp6691 = _mm512_add_ps(in1031, in1035);
__m512 tmp6695 = _mm512_add_ps(in1039, in1043);
in1029 = _mm512_sub_ps(in1029, in1035);
in1037 = _mm512_sub_ps(in1037, in1043);
tmp6689 = _mm512_fmadd_ps(in1032, _mm512_set1_ps(-4.25e+00f), tmp6689);
tmp6693 = _mm512_fmadd_ps(in1040, _mm512_set1_ps(-4.25e+00f), tmp6693);
tmp6691 = _mm512_fmadd_ps(in1033, _mm512_set1_ps(-4.25e+00f), tmp6691);
tmp6695 = _mm512_fmadd_ps(in1041, _mm512_set1_ps(-4.25e+00f), tmp6695);
in1029 = _mm512_fmadd_ps(tmp6690, _mm512_set1_ps(5.25e+00f), in1029);
in1037 = _mm512_fmadd_ps(tmp6694, _mm512_set1_ps(5.25e+00f), in1037);
tmp6690 = _mm512_fmadd_ps(in1031, _mm512_set1_ps(2.5e-01f), in1035);
tmp6694 = _mm512_fmadd_ps(in1039, _mm512_set1_ps(2.5e-01f), in1043);
in1031 = _mm512_fmadd_ps(in1031, _mm512_set1_ps(4e+00f), in1035);
in1039 = _mm512_fmadd_ps(in1039, _mm512_set1_ps(4e+00f), in1043);
__m512 tmp6692 = _mm512_sub_ps(tmp6691, tmp6689);
__m512 tmp6696 = _mm512_sub_ps(tmp6695, tmp6693);
tmp6691 = _mm512_add_ps(tmp6689, tmp6691);
tmp6695 = _mm512_add_ps(tmp6693, tmp6695);
tmp6689 = _mm512_fmadd_ps(in1030, _mm512_set1_ps(2.5e-01f), in1034);
tmp6693 = _mm512_fmadd_ps(in1038, _mm512_set1_ps(2.5e-01f), in1042);
tmp6690 = _mm512_fmadd_ps(in1033, _mm512_set1_ps(-1.25e+00f), tmp6690);
tmp6694 = _mm512_fmadd_ps(in1041, _mm512_set1_ps(-1.25e+00f), tmp6694);
in1033 = _mm512_fmadd_ps(in1033, _mm512_set1_ps(-5e+00f), in1031);
in1041 = _mm512_fmadd_ps(in1041, _mm512_set1_ps(-5e+00f), in1039);
tmp6689 = _mm512_fmadd_ps(in1032, _mm512_set1_ps(-1.25e+00f), tmp6689);
tmp6693 = _mm512_fmadd_ps(in1040, _mm512_set1_ps(-1.25e+00f), tmp6693);
in1035 = _mm512_fmadd_ps(tmp6689, _mm512_set1_ps(2e+00f), tmp6690);
in1043 = _mm512_fmadd_ps(tmp6693, _mm512_set1_ps(2e+00f), tmp6694);
tmp6690 = _mm512_fnmadd_ps(tmp6689, _mm512_set1_ps(2e+00f), tmp6690);
tmp6694 = _mm512_fnmadd_ps(tmp6693, _mm512_set1_ps(2e+00f), tmp6694);
tmp6689 = _mm512_fmadd_ps(in1034, _mm512_set1_ps(2.5e-01f), in1030);
tmp6693 = _mm512_fmadd_ps(in1042, _mm512_set1_ps(2.5e-01f), in1038);
in1030 = _mm512_sub_ps(in1036, in1030);
in1038 = _mm512_sub_ps(in1044, in1038);
tmp6689 = _mm512_fmadd_ps(in1032, _mm512_set1_ps(-1.25e+00f), tmp6689);
tmp6693 = _mm512_fmadd_ps(in1040, _mm512_set1_ps(-1.25e+00f), tmp6693);
in1032 = _mm512_sub_ps(in1032, in1034);
in1040 = _mm512_sub_ps(in1040, in1042);
in1032 = _mm512_fmadd_ps(in1032, _mm512_set1_ps(5.25e+00f), in1030);
in1040 = _mm512_fmadd_ps(in1040, _mm512_set1_ps(5.25e+00f), in1038);
in1031 = _mm512_fmadd_ps(tmp6689, _mm512_set1_ps(2e+00f), in1033);
in1039 = _mm512_fmadd_ps(tmp6693, _mm512_set1_ps(2e+00f), in1041);
in1033 = _mm512_fnmadd_ps(tmp6689, _mm512_set1_ps(2e+00f), in1033);
in1041 = _mm512_fnmadd_ps(tmp6693, _mm512_set1_ps(2e+00f), in1041);
// Transpose, step 1: interleave 32-bit elements of adjacent result rows
// (rows taken in the order in1029, tmp6691, tmp6692, in1035, tmp6690,
// in1031, in1033, in1032 and the matching second-set registers).
__m512 tmp6705 = _mm512_unpacklo_ps(in1029, tmp6691);
__m512 tmp6706 = _mm512_unpackhi_ps(in1029, tmp6691);
__m512 tmp6707 = _mm512_unpacklo_ps(tmp6692, in1035);
__m512 tmp6708 = _mm512_unpackhi_ps(tmp6692, in1035);
__m512 tmp6709 = _mm512_unpacklo_ps(tmp6690, in1031);
__m512 tmp6710 = _mm512_unpackhi_ps(tmp6690, in1031);
__m512 tmp6711 = _mm512_unpacklo_ps(in1033, in1032);
__m512 tmp6712 = _mm512_unpackhi_ps(in1033, in1032);
__m512 tmp6713 = _mm512_unpacklo_ps(in1037, tmp6695);
__m512 tmp6714 = _mm512_unpackhi_ps(in1037, tmp6695);
__m512 tmp6715 = _mm512_unpacklo_ps(tmp6696, in1043);
__m512 tmp6716 = _mm512_unpackhi_ps(tmp6696, in1043);
__m512 tmp6717 = _mm512_unpacklo_ps(tmp6694, in1039);
__m512 tmp6718 = _mm512_unpackhi_ps(tmp6694, in1039);
__m512 tmp6719 = _mm512_unpacklo_ps(in1041, in1040);
__m512 tmp6720 = _mm512_unpackhi_ps(in1041, in1040);
// Transpose, step 2: immediate 68 keeps the low 64-bit pair of each operand,
// 238 the high pair, so every 128-bit lane now holds one column element from
// four consecutive rows.
__m512 tmp6721 = _mm512_shuffle_ps(tmp6705, tmp6707, 68);
__m512 tmp6722 = _mm512_shuffle_ps(tmp6705, tmp6707, 238);
__m512 tmp6723 = _mm512_shuffle_ps(tmp6706, tmp6708, 68);
__m512 tmp6724 = _mm512_shuffle_ps(tmp6706, tmp6708, 238);
__m512 tmp6725 = _mm512_shuffle_ps(tmp6709, tmp6711, 68);
__m512 tmp6726 = _mm512_shuffle_ps(tmp6709, tmp6711, 238);
__m512 tmp6727 = _mm512_shuffle_ps(tmp6710, tmp6712, 68);
__m512 tmp6728 = _mm512_shuffle_ps(tmp6710, tmp6712, 238);
__m512 tmp6729 = _mm512_shuffle_ps(tmp6713, tmp6715, 68);
__m512 tmp6730 = _mm512_shuffle_ps(tmp6713, tmp6715, 238);
__m512 tmp6731 = _mm512_shuffle_ps(tmp6714, tmp6716, 68);
__m512 tmp6732 = _mm512_shuffle_ps(tmp6714, tmp6716, 238);
__m512 tmp6733 = _mm512_shuffle_ps(tmp6717, tmp6719, 68);
__m512 tmp6734 = _mm512_shuffle_ps(tmp6717, tmp6719, 238);
__m512 tmp6735 = _mm512_shuffle_ps(tmp6718, tmp6720, 68);
__m512 tmp6736 = _mm512_shuffle_ps(tmp6718, tmp6720, 238);
// Transpose, step 3: regroup 128-bit lanes. Immediate 136 selects lanes 0 and
// 2 of each operand, 221 selects lanes 1 and 3.
__m512 tmp6737 = _mm512_shuffle_f32x4(tmp6721, tmp6725, 136);
__m512 tmp6738 = _mm512_shuffle_f32x4(tmp6721, tmp6725, 221);
__m512 tmp6739 = _mm512_shuffle_f32x4(tmp6722, tmp6726, 136);
__m512 tmp6740 = _mm512_shuffle_f32x4(tmp6722, tmp6726, 221);
__m512 tmp6741 = _mm512_shuffle_f32x4(tmp6723, tmp6727, 136);
__m512 tmp6742 = _mm512_shuffle_f32x4(tmp6723, tmp6727, 221);
__m512 tmp6743 = _mm512_shuffle_f32x4(tmp6724, tmp6728, 136);
__m512 tmp6744 = _mm512_shuffle_f32x4(tmp6724, tmp6728, 221);
__m512 tmp6745 = _mm512_shuffle_f32x4(tmp6729, tmp6733, 136);
__m512 tmp6746 = _mm512_shuffle_f32x4(tmp6729, tmp6733, 221);
__m512 tmp6747 = _mm512_shuffle_f32x4(tmp6730, tmp6734, 136);
__m512 tmp6748 = _mm512_shuffle_f32x4(tmp6730, tmp6734, 221);
__m512 tmp6749 = _mm512_shuffle_f32x4(tmp6731, tmp6735, 136);
__m512 tmp6750 = _mm512_shuffle_f32x4(tmp6731, tmp6735, 221);
__m512 tmp6751 = _mm512_shuffle_f32x4(tmp6732, tmp6736, 136);
__m512 tmp6752 = _mm512_shuffle_f32x4(tmp6732, tmp6736, 221);
// Transpose, step 4: merge the two register sets. Each 8-lane half of a
// result now holds one column (all eight rows) of one 8x8 window. The
// registers written with 136 carry the windows that sat in lanes 0..7 of the
// two input sets; those written with 221 carry the lanes 8..15 windows.
in1029 = _mm512_shuffle_f32x4(tmp6737, tmp6745, 136);
in1037 = _mm512_shuffle_f32x4(tmp6737, tmp6745, 221);
tmp6691 = _mm512_shuffle_f32x4(tmp6739, tmp6747, 136);
tmp6695 = _mm512_shuffle_f32x4(tmp6739, tmp6747, 221);
tmp6692 = _mm512_shuffle_f32x4(tmp6741, tmp6749, 136);
tmp6696 = _mm512_shuffle_f32x4(tmp6741, tmp6749, 221);
in1035 = _mm512_shuffle_f32x4(tmp6743, tmp6751, 136);
in1043 = _mm512_shuffle_f32x4(tmp6743, tmp6751, 221);
tmp6690 = _mm512_shuffle_f32x4(tmp6738, tmp6746, 136);
tmp6694 = _mm512_shuffle_f32x4(tmp6738, tmp6746, 221);
in1031 = _mm512_shuffle_f32x4(tmp6740, tmp6748, 136);
in1039 = _mm512_shuffle_f32x4(tmp6740, tmp6748, 221);
in1033 = _mm512_shuffle_f32x4(tmp6742, tmp6750, 136);
in1041 = _mm512_shuffle_f32x4(tmp6742, tmp6750, 221);
in1032 = _mm512_shuffle_f32x4(tmp6744, tmp6752, 136);
in1040 = _mm512_shuffle_f32x4(tmp6744, tmp6752, 221);
// Second pass: the same eight-input linear combination as the first pass,
// now applied to the transposed data (column order: in1029, tmp6691,
// tmp6692, in1035, tmp6690, in1031, in1033, in1032). Results, in output
// order: in1029, tmp6699, tmp6700, in1033, tmp6698, tmp6692, tmp6690, in1035.
__m512 tmp6697 = _mm512_add_ps(tmp6691, in1031);
__m512 tmp6701 = _mm512_add_ps(tmp6695, in1039);
__m512 tmp6698 = _mm512_sub_ps(tmp6690, tmp6692);
__m512 tmp6702 = _mm512_sub_ps(tmp6694, tmp6696);
__m512 tmp6699 = _mm512_add_ps(tmp6692, in1033);
__m512 tmp6703 = _mm512_add_ps(tmp6696, in1041);
in1029 = _mm512_sub_ps(in1029, in1033);
in1037 = _mm512_sub_ps(in1037, in1041);
tmp6697 = _mm512_fmadd_ps(in1035, _mm512_set1_ps(-4.25e+00f), tmp6697);
tmp6701 = _mm512_fmadd_ps(in1043, _mm512_set1_ps(-4.25e+00f), tmp6701);
tmp6699 = _mm512_fmadd_ps(tmp6690, _mm512_set1_ps(-4.25e+00f), tmp6699);
tmp6703 = _mm512_fmadd_ps(tmp6694, _mm512_set1_ps(-4.25e+00f), tmp6703);
in1029 = _mm512_fmadd_ps(tmp6698, _mm512_set1_ps(5.25e+00f), in1029);
in1037 = _mm512_fmadd_ps(tmp6702, _mm512_set1_ps(5.25e+00f), in1037);
tmp6698 = _mm512_fmadd_ps(tmp6692, _mm512_set1_ps(2.5e-01f), in1033);
tmp6702 = _mm512_fmadd_ps(tmp6696, _mm512_set1_ps(2.5e-01f), in1041);
tmp6692 = _mm512_fmadd_ps(tmp6692, _mm512_set1_ps(4e+00f), in1033);
tmp6696 = _mm512_fmadd_ps(tmp6696, _mm512_set1_ps(4e+00f), in1041);
__m512 tmp6700 = _mm512_sub_ps(tmp6699, tmp6697);
__m512 tmp6704 = _mm512_sub_ps(tmp6703, tmp6701);
tmp6699 = _mm512_add_ps(tmp6697, tmp6699);
tmp6703 = _mm512_add_ps(tmp6701, tmp6703);
tmp6697 = _mm512_fmadd_ps(tmp6691, _mm512_set1_ps(2.5e-01f), in1031);
tmp6701 = _mm512_fmadd_ps(tmp6695, _mm512_set1_ps(2.5e-01f), in1039);
tmp6698 = _mm512_fmadd_ps(tmp6690, _mm512_set1_ps(-1.25e+00f), tmp6698);
tmp6702 = _mm512_fmadd_ps(tmp6694, _mm512_set1_ps(-1.25e+00f), tmp6702);
tmp6690 = _mm512_fmadd_ps(tmp6690, _mm512_set1_ps(-5e+00f), tmp6692);
tmp6694 = _mm512_fmadd_ps(tmp6694, _mm512_set1_ps(-5e+00f), tmp6696);
tmp6697 = _mm512_fmadd_ps(in1035, _mm512_set1_ps(-1.25e+00f), tmp6697);
tmp6701 = _mm512_fmadd_ps(in1043, _mm512_set1_ps(-1.25e+00f), tmp6701);
in1033 = _mm512_fmadd_ps(tmp6697, _mm512_set1_ps(2e+00f), tmp6698);
in1041 = _mm512_fmadd_ps(tmp6701, _mm512_set1_ps(2e+00f), tmp6702);
tmp6698 = _mm512_fnmadd_ps(tmp6697, _mm512_set1_ps(2e+00f), tmp6698);
tmp6702 = _mm512_fnmadd_ps(tmp6701, _mm512_set1_ps(2e+00f), tmp6702);
tmp6697 = _mm512_fmadd_ps(in1031, _mm512_set1_ps(2.5e-01f), tmp6691);
tmp6701 = _mm512_fmadd_ps(in1039, _mm512_set1_ps(2.5e-01f), tmp6695);
tmp6691 = _mm512_sub_ps(in1032, tmp6691);
tmp6695 = _mm512_sub_ps(in1040, tmp6695);
tmp6697 = _mm512_fmadd_ps(in1035, _mm512_set1_ps(-1.25e+00f), tmp6697);
tmp6701 = _mm512_fmadd_ps(in1043, _mm512_set1_ps(-1.25e+00f), tmp6701);
in1035 = _mm512_sub_ps(in1035, in1031);
in1043 = _mm512_sub_ps(in1043, in1039);
in1035 = _mm512_fmadd_ps(in1035, _mm512_set1_ps(5.25e+00f), tmp6691);
in1043 = _mm512_fmadd_ps(in1043, _mm512_set1_ps(5.25e+00f), tmp6695);
tmp6692 = _mm512_fmadd_ps(tmp6697, _mm512_set1_ps(2e+00f), tmp6690);
tmp6696 = _mm512_fmadd_ps(tmp6701, _mm512_set1_ps(2e+00f), tmp6694);
tmp6690 = _mm512_fnmadd_ps(tmp6697, _mm512_set1_ps(2e+00f), tmp6690);
tmp6694 = _mm512_fnmadd_ps(tmp6701, _mm512_set1_ps(2e+00f), tmp6694);
// Pack two consecutive result vectors into one register: immediate 68 joins
// the low 256 bits of both operands, 238 joins the high 256 bits.
__m512 out979 = _mm512_shuffle_f32x4(in1029, tmp6699, 68);
__m512 out987 = _mm512_shuffle_f32x4(in1029, tmp6699, 238);
__m512 out980 = _mm512_shuffle_f32x4(tmp6700, in1033, 68);
__m512 out988 = _mm512_shuffle_f32x4(tmp6700, in1033, 238);
__m512 out981 = _mm512_shuffle_f32x4(tmp6698, tmp6692, 68);
__m512 out989 = _mm512_shuffle_f32x4(tmp6698, tmp6692, 238);
__m512 out982 = _mm512_shuffle_f32x4(tmp6690, in1035, 68);
__m512 out990 = _mm512_shuffle_f32x4(tmp6690, in1035, 238);
__m512 out983 = _mm512_shuffle_f32x4(in1037, tmp6703, 68);
__m512 out991 = _mm512_shuffle_f32x4(in1037, tmp6703, 238);
__m512 out984 = _mm512_shuffle_f32x4(tmp6704, in1041, 68);
__m512 out992 = _mm512_shuffle_f32x4(tmp6704, in1041, 238);
__m512 out985 = _mm512_shuffle_f32x4(tmp6702, tmp6696, 68);
__m512 out993 = _mm512_shuffle_f32x4(tmp6702, tmp6696, 238);
__m512 out986 = _mm512_shuffle_f32x4(tmp6694, in1043, 68);
__m512 out994 = _mm512_shuffle_f32x4(tmp6694, in1043, 238);
// Store the 16 packed vectors into dfPtr6. Four vectors go at base offsets
// 512, 576, 640 and 704, and the same pattern repeats at +409600, +819200
// and +1228800.
_mm512_storeu_ps(dfPtr6+512+1638400*i26+24576*j21+24576*s20+768*k85, out979);
_mm512_storeu_ps(dfPtr6+640+1638400*i26+24576*j21+24576*s20+768*k85, out987);
_mm512_storeu_ps(dfPtr6+576+1638400*i26+24576*j21+24576*s20+768*k85, out983);
_mm512_storeu_ps(dfPtr6+704+1638400*i26+24576*j21+24576*s20+768*k85, out991);
_mm512_storeu_ps(dfPtr6+410112+1638400*i26+24576*j21+24576*s20+768*k85, out980);
_mm512_storeu_ps(dfPtr6+410240+1638400*i26+24576*j21+24576*s20+768*k85, out988);
_mm512_storeu_ps(dfPtr6+410176+1638400*i26+24576*j21+24576*s20+768*k85, out984);
_mm512_storeu_ps(dfPtr6+410304+1638400*i26+24576*j21+24576*s20+768*k85, out992);
_mm512_storeu_ps(dfPtr6+819712+1638400*i26+24576*j21+24576*s20+768*k85, out981);
_mm512_storeu_ps(dfPtr6+819840+1638400*i26+24576*j21+24576*s20+768*k85, out989);
_mm512_storeu_ps(dfPtr6+819776+1638400*i26+24576*j21+24576*s20+768*k85, out985);
_mm512_storeu_ps(dfPtr6+819904+1638400*i26+24576*j21+24576*s20+768*k85, out993);
_mm512_storeu_ps(dfPtr6+1229312+1638400*i26+24576*j21+24576*s20+768*k85, out982);
_mm512_storeu_ps(dfPtr6+1229440+1638400*i26+24576*j21+24576*s20+768*k85, out990);
_mm512_storeu_ps(dfPtr6+1229376+1638400*i26+24576*j21+24576*s20+768*k85, out986);
_mm512_storeu_ps(dfPtr6+1229504+1638400*i26+24576*j21+24576*s20+768*k85, out994);
// Closes a block opened earlier in the function.
// NOTE(review): presumably the k85 loop -- its header is not visible here.
}
// Leave the function once j21 has reached last5; otherwise advance j21 and
// set rel14 to 4.
if (j21 >= last5) return;
++j21;
rel14 = 4;
// Closes another enclosing block whose opening is not visible here.
}
// Coordinates and counter for the loop that follows: h35 is base14+12, w42 is
// fixed at 36, and k86 starts at 0.
ptrdiff_t h35 = base14+12;
ptrdiff_t w42 = 36;
ptrdiff_t k86 = 0;
// Runs until k86 reaches 32; each iteration advances the datPtr12 offset by
// 25216 via the 25216*k86 term.
for (; k86 != 32; ++k86) {
// Load eight vectors per set, successive loads 224 offset units apart. The
// first set reads lanes 0..13 (mask 16383 = 0x3FFF). The second set starts 48
// units later and reads only lanes 0..8 (mask 511 = 0x1FF). Masked-off lanes
// are zero-filled.
// NOTE(review): the narrower second mask looks like right-edge handling at
// w42 = 36 -- confirm against the tensor width.
__m512 dat1573 = _mm512_maskz_loadu_ps(16383, datPtr12+0+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1574 = _mm512_maskz_loadu_ps(511, datPtr12+48+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
// pm140 (arguments listed from lane 15 down to lane 0): result lanes 0..7
// take source elements 0..7, result lanes 8..15 take source elements 6..13.
__m512i pm140 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1045 = _mm512_permutexvar_ps(pm140, dat1573);
// pm141: result lanes 0..7 take source elements 0..7, lanes 8..10 take
// elements 6..8, and lanes 11..15 all take element 15. The mask-511 load
// zeroed element 15, so lanes 11..15 of the second set are zero.
__m512i pm141 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1053 = _mm512_permutexvar_ps(pm141, dat1574);
__m512 dat1575 = _mm512_maskz_loadu_ps(16383, datPtr12+224+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1576 = _mm512_maskz_loadu_ps(511, datPtr12+272+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1046 = _mm512_permutexvar_ps(pm140, dat1575);
__m512 in1054 = _mm512_permutexvar_ps(pm141, dat1576);
__m512 dat1577 = _mm512_maskz_loadu_ps(16383, datPtr12+448+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1578 = _mm512_maskz_loadu_ps(511, datPtr12+496+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1047 = _mm512_permutexvar_ps(pm140, dat1577);
__m512 in1055 = _mm512_permutexvar_ps(pm141, dat1578);
__m512 dat1579 = _mm512_maskz_loadu_ps(16383, datPtr12+672+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1580 = _mm512_maskz_loadu_ps(511, datPtr12+720+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1048 = _mm512_permutexvar_ps(pm140, dat1579);
__m512 in1056 = _mm512_permutexvar_ps(pm141, dat1580);
__m512 dat1581 = _mm512_maskz_loadu_ps(16383, datPtr12+896+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1582 = _mm512_maskz_loadu_ps(511, datPtr12+944+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1049 = _mm512_permutexvar_ps(pm140, dat1581);
__m512 in1057 = _mm512_permutexvar_ps(pm141, dat1582);
__m512 dat1583 = _mm512_maskz_loadu_ps(16383, datPtr12+1120+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1584 = _mm512_maskz_loadu_ps(511, datPtr12+1168+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1050 = _mm512_permutexvar_ps(pm140, dat1583);
__m512 in1058 = _mm512_permutexvar_ps(pm141, dat1584);
__m512 dat1585 = _mm512_maskz_loadu_ps(16383, datPtr12+1344+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1586 = _mm512_maskz_loadu_ps(511, datPtr12+1392+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1051 = _mm512_permutexvar_ps(pm140, dat1585);
__m512 in1059 = _mm512_permutexvar_ps(pm141, dat1586);
__m512 dat1587 = _mm512_maskz_loadu_ps(16383, datPtr12+1568+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1588 = _mm512_maskz_loadu_ps(511, datPtr12+1616+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1052 = _mm512_permutexvar_ps(pm140, dat1587);
__m512 in1060 = _mm512_permutexvar_ps(pm141, dat1588);
// First pass: lane-wise linear combination of the eight row vectors.
// With r0..r7 = in1045..in1052 (second set: in1053..in1060) it produces
//   in1045  = r0 - r6 + 5.25*(r4 - r2)
//   tmp6755 = (r2 + r6 - 4.25*r4) + (r1 + r5 - 4.25*r3)
//   tmp6756 = (r2 + r6 - 4.25*r4) - (r1 + r5 - 4.25*r3)
//   in1051  = (0.25*r2 + r6 - 1.25*r4) + 2*(0.25*r1 + r5 - 1.25*r3)
//   tmp6754 = (0.25*r2 + r6 - 1.25*r4) - 2*(0.25*r1 + r5 - 1.25*r3)
//   in1047  = (4*r2 + r6 - 5*r4) + 2*(r1 + 0.25*r5 - 1.25*r3)
//   in1049  = (4*r2 + r6 - 5*r4) - 2*(r1 + 0.25*r5 - 1.25*r3)
//   in1048  = r7 - r1 + 5.25*(r3 - r5)
// Inputs are overwritten in place, so the statement order matters.
__m512 tmp6753 = _mm512_add_ps(in1046, in1050);
__m512 tmp6757 = _mm512_add_ps(in1054, in1058);
__m512 tmp6754 = _mm512_sub_ps(in1049, in1047);
__m512 tmp6758 = _mm512_sub_ps(in1057, in1055);
__m512 tmp6755 = _mm512_add_ps(in1047, in1051);
__m512 tmp6759 = _mm512_add_ps(in1055, in1059);
in1045 = _mm512_sub_ps(in1045, in1051);
in1053 = _mm512_sub_ps(in1053, in1059);
tmp6753 = _mm512_fmadd_ps(in1048, _mm512_set1_ps(-4.25e+00f), tmp6753);
tmp6757 = _mm512_fmadd_ps(in1056, _mm512_set1_ps(-4.25e+00f), tmp6757);
tmp6755 = _mm512_fmadd_ps(in1049, _mm512_set1_ps(-4.25e+00f), tmp6755);
tmp6759 = _mm512_fmadd_ps(in1057, _mm512_set1_ps(-4.25e+00f), tmp6759);
in1045 = _mm512_fmadd_ps(tmp6754, _mm512_set1_ps(5.25e+00f), in1045);
in1053 = _mm512_fmadd_ps(tmp6758, _mm512_set1_ps(5.25e+00f), in1053);
tmp6754 = _mm512_fmadd_ps(in1047, _mm512_set1_ps(2.5e-01f), in1051);
tmp6758 = _mm512_fmadd_ps(in1055, _mm512_set1_ps(2.5e-01f), in1059);
in1047 = _mm512_fmadd_ps(in1047, _mm512_set1_ps(4e+00f), in1051);
in1055 = _mm512_fmadd_ps(in1055, _mm512_set1_ps(4e+00f), in1059);
__m512 tmp6756 = _mm512_sub_ps(tmp6755, tmp6753);
__m512 tmp6760 = _mm512_sub_ps(tmp6759, tmp6757);
tmp6755 = _mm512_add_ps(tmp6753, tmp6755);
tmp6759 = _mm512_add_ps(tmp6757, tmp6759);
tmp6753 = _mm512_fmadd_ps(in1046, _mm512_set1_ps(2.5e-01f), in1050);
tmp6757 = _mm512_fmadd_ps(in1054, _mm512_set1_ps(2.5e-01f), in1058);
tmp6754 = _mm512_fmadd_ps(in1049, _mm512_set1_ps(-1.25e+00f), tmp6754);
tmp6758 = _mm512_fmadd_ps(in1057, _mm512_set1_ps(-1.25e+00f), tmp6758);
in1049 = _mm512_fmadd_ps(in1049, _mm512_set1_ps(-5e+00f), in1047);
in1057 = _mm512_fmadd_ps(in1057, _mm512_set1_ps(-5e+00f), in1055);
tmp6753 = _mm512_fmadd_ps(in1048, _mm512_set1_ps(-1.25e+00f), tmp6753);
tmp6757 = _mm512_fmadd_ps(in1056, _mm512_set1_ps(-1.25e+00f), tmp6757);
in1051 = _mm512_fmadd_ps(tmp6753, _mm512_set1_ps(2e+00f), tmp6754);
in1059 = _mm512_fmadd_ps(tmp6757, _mm512_set1_ps(2e+00f), tmp6758);
tmp6754 = _mm512_fnmadd_ps(tmp6753, _mm512_set1_ps(2e+00f), tmp6754);
tmp6758 = _mm512_fnmadd_ps(tmp6757, _mm512_set1_ps(2e+00f), tmp6758);
tmp6753 = _mm512_fmadd_ps(in1050, _mm512_set1_ps(2.5e-01f), in1046);
tmp6757 = _mm512_fmadd_ps(in1058, _mm512_set1_ps(2.5e-01f), in1054);
in1046 = _mm512_sub_ps(in1052, in1046);
in1054 = _mm512_sub_ps(in1060, in1054);
tmp6753 = _mm512_fmadd_ps(in1048, _mm512_set1_ps(-1.25e+00f), tmp6753);
tmp6757 = _mm512_fmadd_ps(in1056, _mm512_set1_ps(-1.25e+00f), tmp6757);
in1048 = _mm512_sub_ps(in1048, in1050);
in1056 = _mm512_sub_ps(in1056, in1058);
in1048 = _mm512_fmadd_ps(in1048, _mm512_set1_ps(5.25e+00f), in1046);
in1056 = _mm512_fmadd_ps(in1056, _mm512_set1_ps(5.25e+00f), in1054);
in1047 = _mm512_fmadd_ps(tmp6753, _mm512_set1_ps(2e+00f), in1049);
in1055 = _mm512_fmadd_ps(tmp6757, _mm512_set1_ps(2e+00f), in1057);
in1049 = _mm512_fnmadd_ps(tmp6753, _mm512_set1_ps(2e+00f), in1049);
in1057 = _mm512_fnmadd_ps(tmp6757, _mm512_set1_ps(2e+00f), in1057);
// Interleave 32-bit elements of adjacent result rows (rows taken in the
// order in1045, tmp6755, tmp6756, in1051, tmp6754, in1047, in1049, in1048
// and the matching second-set registers).
__m512 tmp6769 = _mm512_unpacklo_ps(in1045, tmp6755);
__m512 tmp6770 = _mm512_unpackhi_ps(in1045, tmp6755);
__m512 tmp6771 = _mm512_unpacklo_ps(tmp6756, in1051);
__m512 tmp6772 = _mm512_unpackhi_ps(tmp6756, in1051);
__m512 tmp6773 = _mm512_unpacklo_ps(tmp6754, in1047);
__m512 tmp6774 = _mm512_unpackhi_ps(tmp6754, in1047);
__m512 tmp6775 = _mm512_unpacklo_ps(in1049, in1048);
__m512 tmp6776 = _mm512_unpackhi_ps(in1049, in1048);
__m512 tmp6777 = _mm512_unpacklo_ps(in1053, tmp6759);
__m512 tmp6778 = _mm512_unpackhi_ps(in1053, tmp6759);
__m512 tmp6779 = _mm512_unpacklo_ps(tmp6760, in1059);
__m512 tmp6780 = _mm512_unpackhi_ps(tmp6760, in1059);
__m512 tmp6781 = _mm512_unpacklo_ps(tmp6758, in1055);
__m512 tmp6782 = _mm512_unpackhi_ps(tmp6758, in1055);
__m512 tmp6783 = _mm512_unpacklo_ps(in1057, in1056);
__m512 tmp6784 = _mm512_unpackhi_ps(in1057, in1056);
// Immediate 68 keeps the low 64-bit pair of each operand, 238 the high pair,
// so every 128-bit lane now holds one column element from four consecutive
// rows.
__m512 tmp6785 = _mm512_shuffle_ps(tmp6769, tmp6771, 68);
__m512 tmp6786 = _mm512_shuffle_ps(tmp6769, tmp6771, 238);
__m512 tmp6787 = _mm512_shuffle_ps(tmp6770, tmp6772, 68);
__m512 tmp6788 = _mm512_shuffle_ps(tmp6770, tmp6772, 238);
__m512 tmp6789 = _mm512_shuffle_ps(tmp6773, tmp6775, 68);
__m512 tmp6790 = _mm512_shuffle_ps(tmp6773, tmp6775, 238);
__m512 tmp6791 = _mm512_shuffle_ps(tmp6774, tmp6776, 68);
__m512 tmp6792 = _mm512_shuffle_ps(tmp6774, tmp6776, 238);
__m512 tmp6793 = _mm512_shuffle_ps(tmp6777, tmp6779, 68);
__m512 tmp6794 = _mm512_shuffle_ps(tmp6777, tmp6779, 238);
__m512 tmp6795 = _mm512_shuffle_ps(tmp6778, tmp6780, 68);
__m512 tmp6796 = _mm512_shuffle_ps(tmp6778, tmp6780, 238);
__m512 tmp6797 = _mm512_shuffle_ps(tmp6781, tmp6783, 68);
__m512 tmp6798 = _mm512_shuffle_ps(tmp6781, tmp6783, 238);
__m512 tmp6799 = _mm512_shuffle_ps(tmp6782, tmp6784, 68);
__m512 tmp6800 = _mm512_shuffle_ps(tmp6782, tmp6784, 238);
// Regroup 128-bit lanes: immediate 136 selects lanes 0 and 2 of each
// operand, 221 selects lanes 1 and 3.
__m512 tmp6801 = _mm512_shuffle_f32x4(tmp6785, tmp6789, 136);
__m512 tmp6802 = _mm512_shuffle_f32x4(tmp6785, tmp6789, 221);
__m512 tmp6803 = _mm512_shuffle_f32x4(tmp6786, tmp6790, 136);
__m512 tmp6804 = _mm512_shuffle_f32x4(tmp6786, tmp6790, 221);
__m512 tmp6805 = _mm512_shuffle_f32x4(tmp6787, tmp6791, 136);
__m512 tmp6806 = _mm512_shuffle_f32x4(tmp6787, tmp6791, 221);
__m512 tmp6807 = _mm512_shuffle_f32x4(tmp6788, tmp6792, 136);
__m512 tmp6808 = _mm512_shuffle_f32x4(tmp6788, tmp6792, 221);
__m512 tmp6809 = _mm512_shuffle_f32x4(tmp6793, tmp6797, 136);
__m512 tmp6810 = _mm512_shuffle_f32x4(tmp6793, tmp6797, 221);
__m512 tmp6811 = _mm512_shuffle_f32x4(tmp6794, tmp6798, 136);
__m512 tmp6812 = _mm512_shuffle_f32x4(tmp6794, tmp6798, 221);
__m512 tmp6813 = _mm512_shuffle_f32x4(tmp6795, tmp6799, 136);
__m512 tmp6814 = _mm512_shuffle_f32x4(tmp6795, tmp6799, 221);
__m512 tmp6815 = _mm512_shuffle_f32x4(tmp6796, tmp6800, 136);
__m512 tmp6816 = _mm512_shuffle_f32x4(tmp6796, tmp6800, 221);
in1045 = _mm512_shuffle_f32x4(tmp6801, tmp6809, 136);
in1053 = _mm512_shuffle_f32x4(tmp6801, tmp6809, 221);
tmp6755 = _mm512_shuffle_f32x4(tmp6803, tmp6811, 136);
tmp6759 = _mm512_shuffle_f32x4(tmp6803, tmp6811, 221);
tmp6756 = _mm512_shuffle_f32x4(tmp6805, tmp6813, 136);
tmp6760 = _mm512_shuffle_f32x4(tmp6805, tmp6813, 221);
in1051 = _mm512_shuffle_f32x4(tmp6807, tmp6815, 136);
in1059 = _mm512_shuffle_f32x4(tmp6807, tmp6815, 221);
tmp6754 = _mm512_shuffle_f32x4(tmp6802, tmp6810, 136);
tmp6758 = _mm512_shuffle_f32x4(tmp6802, tmp6810, 221);
in1047 = _mm512_shuffle_f32x4(tmp6804, tmp6812, 136);
in1055 = _mm512_shuffle_f32x4(tmp6804, tmp6812, 221);
in1049 = _mm512_shuffle_f32x4(tmp6806, tmp6814, 136);
in1057 = _mm512_shuffle_f32x4(tmp6806, tmp6814, 221);
in1048 = _mm512_shuffle_f32x4(tmp6808, tmp6816, 136);
in1056 = _mm512_shuffle_f32x4(tmp6808, tmp6816, 221);
__m512 tmp6761 = _mm512_add_ps(tmp6755, in1047);
__m512 tmp6765 = _mm512_add_ps(tmp6759, in1055);
__m512 tmp6762 = _mm512_sub_ps(tmp6754, tmp6756);
__m512 tmp6766 = _mm512_sub_ps(tmp6758, tmp6760);
__m512 tmp6763 = _mm512_add_ps(tmp6756, in1049);
__m512 tmp6767 = _mm512_add_ps(tmp6760, in1057);
in1045 = _mm512_sub_ps(in1045, in1049);
in1053 = _mm512_sub_ps(in1053, in1057);
tmp6761 = _mm512_fmadd_ps(in1051, _mm512_set1_ps(-4.25e+00f), tmp6761);
tmp6765 = _mm512_fmadd_ps(in1059, _mm512_set1_ps(-4.25e+00f), tmp6765);
tmp6763 = _mm512_fmadd_ps(tmp6754, _mm512_set1_ps(-4.25e+00f), tmp6763);
tmp6767 = _mm512_fmadd_ps(tmp6758, _mm512_set1_ps(-4.25e+00f), tmp6767);
in1045 = _mm512_fmadd_ps(tmp6762, _mm512_set1_ps(5.25e+00f), in1045);
in1053 = _mm512_fmadd_ps(tmp6766, _mm512_set1_ps(5.25e+00f), in1053);
tmp6762 = _mm512_fmadd_ps(tmp6756, _mm512_set1_ps(2.5e-01f), in1049);
tmp6766 = _mm512_fmadd_ps(tmp6760, _mm512_set1_ps(2.5e-01f), in1057);
tmp6756 = _mm512_fmadd_ps(tmp6756, _mm512_set1_ps(4e+00f), in1049);
tmp6760 = _mm512_fmadd_ps(tmp6760, _mm512_set1_ps(4e+00f), in1057);
__m512 tmp6764 = _mm512_sub_ps(tmp6763, tmp6761);
__m512 tmp6768 = _mm512_sub_ps(tmp6767, tmp6765);
tmp6763 = _mm512_add_ps(tmp6761, tmp6763);
tmp6767 = _mm512_add_ps(tmp6765, tmp6767);
tmp6761 = _mm512_fmadd_ps(tmp6755, _mm512_set1_ps(2.5e-01f), in1047);
tmp6765 = _mm512_fmadd_ps(tmp6759, _mm512_set1_ps(2.5e-01f), in1055);
tmp6762 = _mm512_fmadd_ps(tmp6754, _mm512_set1_ps(-1.25e+00f), tmp6762);
tmp6766 = _mm512_fmadd_ps(tmp6758, _mm512_set1_ps(-1.25e+00f), tmp6766);
tmp6754 = _mm512_fmadd_ps(tmp6754, _mm512_set1_ps(-5e+00f), tmp6756);
tmp6758 = _mm512_fmadd_ps(tmp6758, _mm512_set1_ps(-5e+00f), tmp6760);
tmp6761 = _mm512_fmadd_ps(in1051, _mm512_set1_ps(-1.25e+00f), tmp6761);
tmp6765 = _mm512_fmadd_ps(in1059, _mm512_set1_ps(-1.25e+00f), tmp6765);
in1049 = _mm512_fmadd_ps(tmp6761, _mm512_set1_ps(2e+00f), tmp6762);
in1057 = _mm512_fmadd_ps(tmp6765, _mm512_set1_ps(2e+00f), tmp6766);
tmp6762 = _mm512_fnmadd_ps(tmp6761, _mm512_set1_ps(2e+00f), tmp6762);
tmp6766 = _mm512_fnmadd_ps(tmp6765, _mm512_set1_ps(2e+00f), tmp6766);
tmp6761 = _mm512_fmadd_ps(in1047, _mm512_set1_ps(2.5e-01f), tmp6755);
tmp6765 = _mm512_fmadd_ps(in1055, _mm512_set1_ps(2.5e-01f), tmp6759);
tmp6755 = _mm512_sub_ps(in1048, tmp6755);
tmp6759 = _mm512_sub_ps(in1056, tmp6759);
tmp6761 = _mm512_fmadd_ps(in1051, _mm512_set1_ps(-1.25e+00f), tmp6761);
tmp6765 = _mm512_fmadd_ps(in1059, _mm512_set1_ps(-1.25e+00f), tmp6765);
in1051 = _mm512_sub_ps(in1051, in1047);
in1059 = _mm512_sub_ps(in1059, in1055);
in1051 = _mm512_fmadd_ps(in1051, _mm512_set1_ps(5.25e+00f), tmp6755);
in1059 = _mm512_fmadd_ps(in1059, _mm512_set1_ps(5.25e+00f), tmp6759);
tmp6756 = _mm512_fmadd_ps(tmp6761, _mm512_set1_ps(2e+00f), tmp6754);
tmp6760 = _mm512_fmadd_ps(tmp6765, _mm512_set1_ps(2e+00f), tmp6758);
tmp6754 = _mm512_fnmadd_ps(tmp6761, _mm512_set1_ps(2e+00f), tmp6754);
tmp6758 = _mm512_fnmadd_ps(tmp6765, _mm512_set1_ps(2e+00f), tmp6758);
__m512 out995 = _mm512_shuffle_f32x4(in1045, tmp6763, 68);
__m512 out1003 = _mm512_shuffle_f32x4(in1045, tmp6763, 238);
__m512 out996 = _mm512_shuffle_f32x4(tmp6764, in1049, 68);
__m512 out1004 = _mm512_shuffle_f32x4(tmp6764, in1049, 238);
__m512 out997 = _mm512_shuffle_f32x4(tmp6762, tmp6756, 68);
__m512 out1005 = _mm512_shuffle_f32x4(tmp6762, tmp6756, 238);
__m512 out998 = _mm512_shuffle_f32x4(tmp6754, in1051, 68);
__m512 out1006 = _mm512_shuffle_f32x4(tmp6754, in1051, 238);
__m512 out999 = _mm512_shuffle_f32x4(in1053, tmp6767, 68);
__m512 out1007 = _mm512_shuffle_f32x4(in1053, tmp6767, 238);
__m512 out1000 = _mm512_shuffle_f32x4(tmp6768, in1057, 68);
__m512 out1008 = _mm512_shuffle_f32x4(tmp6768, in1057, 238);
__m512 out1001 = _mm512_shuffle_f32x4(tmp6766, tmp6760, 68);
__m512 out1009 = _mm512_shuffle_f32x4(tmp6766, tmp6760, 238);
__m512 out1002 = _mm512_shuffle_f32x4(tmp6758, in1059, 68);
__m512 out1010 = _mm512_shuffle_f32x4(tmp6758, in1059, 238);
_mm512_storeu_ps(dfPtr6+0+1638400*i26+24576*j21+24576*s20+768*k86, out995);
_mm512_storeu_ps(dfPtr6+128+1638400*i26+24576*j21+24576*s20+768*k86, out1003);
_mm512_storeu_ps(dfPtr6+64+1638400*i26+24576*j21+24576*s20+768*k86, out999);
_mm512_storeu_ps(dfPtr6+192+1638400*i26+24576*j21+24576*s20+768*k86, out1007);
_mm512_storeu_ps(dfPtr6+409600+1638400*i26+24576*j21+24576*s20+768*k86, out996);
_mm512_storeu_ps(dfPtr6+409728+1638400*i26+24576*j21+24576*s20+768*k86, out1004);
_mm512_storeu_ps(dfPtr6+409664+1638400*i26+24576*j21+24576*s20+768*k86, out1000);
_mm512_storeu_ps(dfPtr6+409792+1638400*i26+24576*j21+24576*s20+768*k86, out1008);
_mm512_storeu_ps(dfPtr6+819200+1638400*i26+24576*j21+24576*s20+768*k86, out997);
_mm512_storeu_ps(dfPtr6+819328+1638400*i26+24576*j21+24576*s20+768*k86, out1005);
_mm512_storeu_ps(dfPtr6+819264+1638400*i26+24576*j21+24576*s20+768*k86, out1001);
_mm512_storeu_ps(dfPtr6+819392+1638400*i26+24576*j21+24576*s20+768*k86, out1009);
_mm512_storeu_ps(dfPtr6+1228800+1638400*i26+24576*j21+24576*s20+768*k86, out998);
_mm512_storeu_ps(dfPtr6+1228928+1638400*i26+24576*j21+24576*s20+768*k86, out1006);
_mm512_storeu_ps(dfPtr6+1228864+1638400*i26+24576*j21+24576*s20+768*k86, out1002);
_mm512_storeu_ps(dfPtr6+1228992+1638400*i26+24576*j21+24576*s20+768*k86, out1010);
// One load -> transform -> store unit: reads two groups of eight vectors from datPtr12,
// applies the same 8-input linear transform along both axes (with a lane transpose in
// between) and writes 16 vectors to dfPtr6 at constant offsets 256.. in each output group.
// NOTE(review): the coefficients look like the Winograd F(6,3) input transform - confirm.
//
// Loads: mask 8191 (0x1FFF) reads lanes 0..12, mask 16383 (0x3FFF) reads lanes 0..13;
// the remaining lanes are zeroed. Successive loads of a group step the constant offset
// by 224, the same factor that multiplies h35.
// NOTE(review): offsets are presumably in bytes (datPtr12 assumed char*) - confirm.
__m512 dat1589 = _mm512_maskz_loadu_ps(8191, datPtr12+1204+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1590 = _mm512_maskz_loadu_ps(16383, datPtr12+12608+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
// pm142 (arguments are listed lane 15 first): lane 0 reads source lane 15, which the
// load mask zeroed; lanes 1..7 read source lanes 0..6; lanes 8..15 read source lanes 5..12.
__m512i pm142 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1061 = _mm512_permutexvar_ps(pm142, dat1589);
// pm143: lanes 0..7 read source lanes 0..7; lanes 8..15 read source lanes 6..13,
// i.e. two 8-wide windows that share two source elements.
__m512i pm143 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1069 = _mm512_permutexvar_ps(pm143, dat1590);
__m512 dat1591 = _mm512_maskz_loadu_ps(8191, datPtr12+1428+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1592 = _mm512_maskz_loadu_ps(16383, datPtr12+12832+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1062 = _mm512_permutexvar_ps(pm142, dat1591);
__m512 in1070 = _mm512_permutexvar_ps(pm143, dat1592);
__m512 dat1593 = _mm512_maskz_loadu_ps(8191, datPtr12+1652+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1594 = _mm512_maskz_loadu_ps(16383, datPtr12+13056+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1063 = _mm512_permutexvar_ps(pm142, dat1593);
__m512 in1071 = _mm512_permutexvar_ps(pm143, dat1594);
__m512 dat1595 = _mm512_maskz_loadu_ps(8191, datPtr12+1876+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1596 = _mm512_maskz_loadu_ps(16383, datPtr12+13280+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1064 = _mm512_permutexvar_ps(pm142, dat1595);
__m512 in1072 = _mm512_permutexvar_ps(pm143, dat1596);
__m512 dat1597 = _mm512_maskz_loadu_ps(8191, datPtr12+2100+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1598 = _mm512_maskz_loadu_ps(16383, datPtr12+13504+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1065 = _mm512_permutexvar_ps(pm142, dat1597);
__m512 in1073 = _mm512_permutexvar_ps(pm143, dat1598);
__m512 dat1599 = _mm512_maskz_loadu_ps(8191, datPtr12+2324+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1600 = _mm512_maskz_loadu_ps(16383, datPtr12+13728+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1066 = _mm512_permutexvar_ps(pm142, dat1599);
__m512 in1074 = _mm512_permutexvar_ps(pm143, dat1600);
__m512 dat1601 = _mm512_maskz_loadu_ps(8191, datPtr12+2548+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1602 = _mm512_maskz_loadu_ps(16383, datPtr12+13952+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1067 = _mm512_permutexvar_ps(pm142, dat1601);
__m512 in1075 = _mm512_permutexvar_ps(pm143, dat1602);
__m512 dat1603 = _mm512_maskz_loadu_ps(8191, datPtr12+2772+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1604 = _mm512_maskz_loadu_ps(16383, datPtr12+14176+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1068 = _mm512_permutexvar_ps(pm142, dat1603);
__m512 in1076 = _mm512_permutexvar_ps(pm143, dat1604);
// First pass: linear combinations across the eight vectors of each group
// (in1061..in1068 and, in lock-step, in1069..in1076). Inputs are overwritten in place,
// so statement order matters. The eight results of the first group end up in
// in1061, tmp6819, tmp6820, in1067, tmp6818, in1063, in1065, in1064 (the order the
// transpose below consumes them); the second group mirrors this with
// in1069, tmp6823, tmp6824, in1075, tmp6822, in1071, in1073, in1072.
__m512 tmp6817 = _mm512_add_ps(in1062, in1066);
__m512 tmp6821 = _mm512_add_ps(in1070, in1074);
__m512 tmp6818 = _mm512_sub_ps(in1065, in1063);
__m512 tmp6822 = _mm512_sub_ps(in1073, in1071);
__m512 tmp6819 = _mm512_add_ps(in1063, in1067);
__m512 tmp6823 = _mm512_add_ps(in1071, in1075);
in1061 = _mm512_sub_ps(in1061, in1067);
in1069 = _mm512_sub_ps(in1069, in1075);
tmp6817 = _mm512_fmadd_ps(in1064, _mm512_set1_ps(-4.25e+00f), tmp6817);
tmp6821 = _mm512_fmadd_ps(in1072, _mm512_set1_ps(-4.25e+00f), tmp6821);
tmp6819 = _mm512_fmadd_ps(in1065, _mm512_set1_ps(-4.25e+00f), tmp6819);
tmp6823 = _mm512_fmadd_ps(in1073, _mm512_set1_ps(-4.25e+00f), tmp6823);
in1061 = _mm512_fmadd_ps(tmp6818, _mm512_set1_ps(5.25e+00f), in1061);
in1069 = _mm512_fmadd_ps(tmp6822, _mm512_set1_ps(5.25e+00f), in1069);
tmp6818 = _mm512_fmadd_ps(in1063, _mm512_set1_ps(2.5e-01f), in1067);
tmp6822 = _mm512_fmadd_ps(in1071, _mm512_set1_ps(2.5e-01f), in1075);
in1063 = _mm512_fmadd_ps(in1063, _mm512_set1_ps(4e+00f), in1067);
in1071 = _mm512_fmadd_ps(in1071, _mm512_set1_ps(4e+00f), in1075);
// Difference and sum of the same two partial results give two outputs.
__m512 tmp6820 = _mm512_sub_ps(tmp6819, tmp6817);
__m512 tmp6824 = _mm512_sub_ps(tmp6823, tmp6821);
tmp6819 = _mm512_add_ps(tmp6817, tmp6819);
tmp6823 = _mm512_add_ps(tmp6821, tmp6823);
tmp6817 = _mm512_fmadd_ps(in1062, _mm512_set1_ps(2.5e-01f), in1066);
tmp6821 = _mm512_fmadd_ps(in1070, _mm512_set1_ps(2.5e-01f), in1074);
tmp6818 = _mm512_fmadd_ps(in1065, _mm512_set1_ps(-1.25e+00f), tmp6818);
tmp6822 = _mm512_fmadd_ps(in1073, _mm512_set1_ps(-1.25e+00f), tmp6822);
in1065 = _mm512_fmadd_ps(in1065, _mm512_set1_ps(-5e+00f), in1063);
in1073 = _mm512_fmadd_ps(in1073, _mm512_set1_ps(-5e+00f), in1071);
tmp6817 = _mm512_fmadd_ps(in1064, _mm512_set1_ps(-1.25e+00f), tmp6817);
tmp6821 = _mm512_fmadd_ps(in1072, _mm512_set1_ps(-1.25e+00f), tmp6821);
in1067 = _mm512_fmadd_ps(tmp6817, _mm512_set1_ps(2e+00f), tmp6818);
in1075 = _mm512_fmadd_ps(tmp6821, _mm512_set1_ps(2e+00f), tmp6822);
tmp6818 = _mm512_fnmadd_ps(tmp6817, _mm512_set1_ps(2e+00f), tmp6818);
tmp6822 = _mm512_fnmadd_ps(tmp6821, _mm512_set1_ps(2e+00f), tmp6822);
tmp6817 = _mm512_fmadd_ps(in1066, _mm512_set1_ps(2.5e-01f), in1062);
tmp6821 = _mm512_fmadd_ps(in1074, _mm512_set1_ps(2.5e-01f), in1070);
in1062 = _mm512_sub_ps(in1068, in1062);
in1070 = _mm512_sub_ps(in1076, in1070);
tmp6817 = _mm512_fmadd_ps(in1064, _mm512_set1_ps(-1.25e+00f), tmp6817);
tmp6821 = _mm512_fmadd_ps(in1072, _mm512_set1_ps(-1.25e+00f), tmp6821);
in1064 = _mm512_sub_ps(in1064, in1066);
in1072 = _mm512_sub_ps(in1072, in1074);
in1064 = _mm512_fmadd_ps(in1064, _mm512_set1_ps(5.25e+00f), in1062);
in1072 = _mm512_fmadd_ps(in1072, _mm512_set1_ps(5.25e+00f), in1070);
in1063 = _mm512_fmadd_ps(tmp6817, _mm512_set1_ps(2e+00f), in1065);
in1071 = _mm512_fmadd_ps(tmp6821, _mm512_set1_ps(2e+00f), in1073);
in1065 = _mm512_fnmadd_ps(tmp6817, _mm512_set1_ps(2e+00f), in1065);
in1073 = _mm512_fnmadd_ps(tmp6821, _mm512_set1_ps(2e+00f), in1073);
// Lane transpose, step 1: interleave neighbouring result vectors within each
// 128-bit lane (unpacklo -> elements 0,1 of both operands; unpackhi -> elements 2,3).
__m512 tmp6833 = _mm512_unpacklo_ps(in1061, tmp6819);
__m512 tmp6834 = _mm512_unpackhi_ps(in1061, tmp6819);
__m512 tmp6835 = _mm512_unpacklo_ps(tmp6820, in1067);
__m512 tmp6836 = _mm512_unpackhi_ps(tmp6820, in1067);
__m512 tmp6837 = _mm512_unpacklo_ps(tmp6818, in1063);
__m512 tmp6838 = _mm512_unpackhi_ps(tmp6818, in1063);
__m512 tmp6839 = _mm512_unpacklo_ps(in1065, in1064);
__m512 tmp6840 = _mm512_unpackhi_ps(in1065, in1064);
__m512 tmp6841 = _mm512_unpacklo_ps(in1069, tmp6823);
__m512 tmp6842 = _mm512_unpackhi_ps(in1069, tmp6823);
__m512 tmp6843 = _mm512_unpacklo_ps(tmp6824, in1075);
__m512 tmp6844 = _mm512_unpackhi_ps(tmp6824, in1075);
__m512 tmp6845 = _mm512_unpacklo_ps(tmp6822, in1071);
__m512 tmp6846 = _mm512_unpackhi_ps(tmp6822, in1071);
__m512 tmp6847 = _mm512_unpacklo_ps(in1073, in1072);
__m512 tmp6848 = _mm512_unpackhi_ps(in1073, in1072);
// Step 2: imm 68 keeps the low 64 bits of each operand's 128-bit lane, imm 238 the
// high 64 bits, so every 128-bit lane now holds one element position from four vectors.
__m512 tmp6849 = _mm512_shuffle_ps(tmp6833, tmp6835, 68);
__m512 tmp6850 = _mm512_shuffle_ps(tmp6833, tmp6835, 238);
__m512 tmp6851 = _mm512_shuffle_ps(tmp6834, tmp6836, 68);
__m512 tmp6852 = _mm512_shuffle_ps(tmp6834, tmp6836, 238);
__m512 tmp6853 = _mm512_shuffle_ps(tmp6837, tmp6839, 68);
__m512 tmp6854 = _mm512_shuffle_ps(tmp6837, tmp6839, 238);
__m512 tmp6855 = _mm512_shuffle_ps(tmp6838, tmp6840, 68);
__m512 tmp6856 = _mm512_shuffle_ps(tmp6838, tmp6840, 238);
__m512 tmp6857 = _mm512_shuffle_ps(tmp6841, tmp6843, 68);
__m512 tmp6858 = _mm512_shuffle_ps(tmp6841, tmp6843, 238);
__m512 tmp6859 = _mm512_shuffle_ps(tmp6842, tmp6844, 68);
__m512 tmp6860 = _mm512_shuffle_ps(tmp6842, tmp6844, 238);
__m512 tmp6861 = _mm512_shuffle_ps(tmp6845, tmp6847, 68);
__m512 tmp6862 = _mm512_shuffle_ps(tmp6845, tmp6847, 238);
__m512 tmp6863 = _mm512_shuffle_ps(tmp6846, tmp6848, 68);
__m512 tmp6864 = _mm512_shuffle_ps(tmp6846, tmp6848, 238);
// Step 3: regroup whole 128-bit lanes. imm 136 picks lanes 0 and 2 of each operand,
// imm 221 picks lanes 1 and 3.
__m512 tmp6865 = _mm512_shuffle_f32x4(tmp6849, tmp6853, 136);
__m512 tmp6866 = _mm512_shuffle_f32x4(tmp6849, tmp6853, 221);
__m512 tmp6867 = _mm512_shuffle_f32x4(tmp6850, tmp6854, 136);
__m512 tmp6868 = _mm512_shuffle_f32x4(tmp6850, tmp6854, 221);
__m512 tmp6869 = _mm512_shuffle_f32x4(tmp6851, tmp6855, 136);
__m512 tmp6870 = _mm512_shuffle_f32x4(tmp6851, tmp6855, 221);
__m512 tmp6871 = _mm512_shuffle_f32x4(tmp6852, tmp6856, 136);
__m512 tmp6872 = _mm512_shuffle_f32x4(tmp6852, tmp6856, 221);
__m512 tmp6873 = _mm512_shuffle_f32x4(tmp6857, tmp6861, 136);
__m512 tmp6874 = _mm512_shuffle_f32x4(tmp6857, tmp6861, 221);
__m512 tmp6875 = _mm512_shuffle_f32x4(tmp6858, tmp6862, 136);
__m512 tmp6876 = _mm512_shuffle_f32x4(tmp6858, tmp6862, 221);
__m512 tmp6877 = _mm512_shuffle_f32x4(tmp6859, tmp6863, 136);
__m512 tmp6878 = _mm512_shuffle_f32x4(tmp6859, tmp6863, 221);
__m512 tmp6879 = _mm512_shuffle_f32x4(tmp6860, tmp6864, 136);
__m512 tmp6880 = _mm512_shuffle_f32x4(tmp6860, tmp6864, 221);
// Step 4: merge the two groups. Each destination now holds one lane position gathered
// from all sixteen first-pass results (first group in lanes 0..7, second in 8..15):
// in1061/tmp6819/tmp6820/in1067/tmp6818/in1063/in1065/in1064 take source lanes 0..7,
// in1069/tmp6823/tmp6824/in1075/tmp6822/in1071/in1073/in1072 take source lanes 8..15.
in1061 = _mm512_shuffle_f32x4(tmp6865, tmp6873, 136);
in1069 = _mm512_shuffle_f32x4(tmp6865, tmp6873, 221);
tmp6819 = _mm512_shuffle_f32x4(tmp6867, tmp6875, 136);
tmp6823 = _mm512_shuffle_f32x4(tmp6867, tmp6875, 221);
tmp6820 = _mm512_shuffle_f32x4(tmp6869, tmp6877, 136);
tmp6824 = _mm512_shuffle_f32x4(tmp6869, tmp6877, 221);
in1067 = _mm512_shuffle_f32x4(tmp6871, tmp6879, 136);
in1075 = _mm512_shuffle_f32x4(tmp6871, tmp6879, 221);
tmp6818 = _mm512_shuffle_f32x4(tmp6866, tmp6874, 136);
tmp6822 = _mm512_shuffle_f32x4(tmp6866, tmp6874, 221);
in1063 = _mm512_shuffle_f32x4(tmp6868, tmp6876, 136);
in1071 = _mm512_shuffle_f32x4(tmp6868, tmp6876, 221);
in1065 = _mm512_shuffle_f32x4(tmp6870, tmp6878, 136);
in1073 = _mm512_shuffle_f32x4(tmp6870, tmp6878, 221);
in1064 = _mm512_shuffle_f32x4(tmp6872, tmp6880, 136);
in1072 = _mm512_shuffle_f32x4(tmp6872, tmp6880, 221);
// Second pass: the same linear combinations as the first pass, now applied across
// the transposed vectors (i.e. along the other axis of each 8x8 window).
__m512 tmp6825 = _mm512_add_ps(tmp6819, in1063);
__m512 tmp6829 = _mm512_add_ps(tmp6823, in1071);
__m512 tmp6826 = _mm512_sub_ps(tmp6818, tmp6820);
__m512 tmp6830 = _mm512_sub_ps(tmp6822, tmp6824);
__m512 tmp6827 = _mm512_add_ps(tmp6820, in1065);
__m512 tmp6831 = _mm512_add_ps(tmp6824, in1073);
in1061 = _mm512_sub_ps(in1061, in1065);
in1069 = _mm512_sub_ps(in1069, in1073);
tmp6825 = _mm512_fmadd_ps(in1067, _mm512_set1_ps(-4.25e+00f), tmp6825);
tmp6829 = _mm512_fmadd_ps(in1075, _mm512_set1_ps(-4.25e+00f), tmp6829);
tmp6827 = _mm512_fmadd_ps(tmp6818, _mm512_set1_ps(-4.25e+00f), tmp6827);
tmp6831 = _mm512_fmadd_ps(tmp6822, _mm512_set1_ps(-4.25e+00f), tmp6831);
in1061 = _mm512_fmadd_ps(tmp6826, _mm512_set1_ps(5.25e+00f), in1061);
in1069 = _mm512_fmadd_ps(tmp6830, _mm512_set1_ps(5.25e+00f), in1069);
tmp6826 = _mm512_fmadd_ps(tmp6820, _mm512_set1_ps(2.5e-01f), in1065);
tmp6830 = _mm512_fmadd_ps(tmp6824, _mm512_set1_ps(2.5e-01f), in1073);
tmp6820 = _mm512_fmadd_ps(tmp6820, _mm512_set1_ps(4e+00f), in1065);
tmp6824 = _mm512_fmadd_ps(tmp6824, _mm512_set1_ps(4e+00f), in1073);
__m512 tmp6828 = _mm512_sub_ps(tmp6827, tmp6825);
__m512 tmp6832 = _mm512_sub_ps(tmp6831, tmp6829);
tmp6827 = _mm512_add_ps(tmp6825, tmp6827);
tmp6831 = _mm512_add_ps(tmp6829, tmp6831);
tmp6825 = _mm512_fmadd_ps(tmp6819, _mm512_set1_ps(2.5e-01f), in1063);
tmp6829 = _mm512_fmadd_ps(tmp6823, _mm512_set1_ps(2.5e-01f), in1071);
tmp6826 = _mm512_fmadd_ps(tmp6818, _mm512_set1_ps(-1.25e+00f), tmp6826);
tmp6830 = _mm512_fmadd_ps(tmp6822, _mm512_set1_ps(-1.25e+00f), tmp6830);
tmp6818 = _mm512_fmadd_ps(tmp6818, _mm512_set1_ps(-5e+00f), tmp6820);
tmp6822 = _mm512_fmadd_ps(tmp6822, _mm512_set1_ps(-5e+00f), tmp6824);
tmp6825 = _mm512_fmadd_ps(in1067, _mm512_set1_ps(-1.25e+00f), tmp6825);
tmp6829 = _mm512_fmadd_ps(in1075, _mm512_set1_ps(-1.25e+00f), tmp6829);
in1065 = _mm512_fmadd_ps(tmp6825, _mm512_set1_ps(2e+00f), tmp6826);
in1073 = _mm512_fmadd_ps(tmp6829, _mm512_set1_ps(2e+00f), tmp6830);
tmp6826 = _mm512_fnmadd_ps(tmp6825, _mm512_set1_ps(2e+00f), tmp6826);
tmp6830 = _mm512_fnmadd_ps(tmp6829, _mm512_set1_ps(2e+00f), tmp6830);
tmp6825 = _mm512_fmadd_ps(in1063, _mm512_set1_ps(2.5e-01f), tmp6819);
tmp6829 = _mm512_fmadd_ps(in1071, _mm512_set1_ps(2.5e-01f), tmp6823);
tmp6819 = _mm512_sub_ps(in1064, tmp6819);
tmp6823 = _mm512_sub_ps(in1072, tmp6823);
tmp6825 = _mm512_fmadd_ps(in1067, _mm512_set1_ps(-1.25e+00f), tmp6825);
tmp6829 = _mm512_fmadd_ps(in1075, _mm512_set1_ps(-1.25e+00f), tmp6829);
in1067 = _mm512_sub_ps(in1067, in1063);
in1075 = _mm512_sub_ps(in1075, in1071);
in1067 = _mm512_fmadd_ps(in1067, _mm512_set1_ps(5.25e+00f), tmp6819);
in1075 = _mm512_fmadd_ps(in1075, _mm512_set1_ps(5.25e+00f), tmp6823);
tmp6820 = _mm512_fmadd_ps(tmp6825, _mm512_set1_ps(2e+00f), tmp6818);
tmp6824 = _mm512_fmadd_ps(tmp6829, _mm512_set1_ps(2e+00f), tmp6822);
tmp6818 = _mm512_fnmadd_ps(tmp6825, _mm512_set1_ps(2e+00f), tmp6818);
tmp6822 = _mm512_fnmadd_ps(tmp6829, _mm512_set1_ps(2e+00f), tmp6822);
// Pair up consecutive results: imm 68 concatenates the low 256 bits of both operands,
// imm 238 the high 256 bits.
__m512 out1011 = _mm512_shuffle_f32x4(in1061, tmp6827, 68);
__m512 out1019 = _mm512_shuffle_f32x4(in1061, tmp6827, 238);
__m512 out1012 = _mm512_shuffle_f32x4(tmp6828, in1065, 68);
__m512 out1020 = _mm512_shuffle_f32x4(tmp6828, in1065, 238);
__m512 out1013 = _mm512_shuffle_f32x4(tmp6826, tmp6820, 68);
__m512 out1021 = _mm512_shuffle_f32x4(tmp6826, tmp6820, 238);
__m512 out1014 = _mm512_shuffle_f32x4(tmp6818, in1067, 68);
__m512 out1022 = _mm512_shuffle_f32x4(tmp6818, in1067, 238);
__m512 out1015 = _mm512_shuffle_f32x4(in1069, tmp6831, 68);
__m512 out1023 = _mm512_shuffle_f32x4(in1069, tmp6831, 238);
__m512 out1016 = _mm512_shuffle_f32x4(tmp6832, in1073, 68);
__m512 out1024 = _mm512_shuffle_f32x4(tmp6832, in1073, 238);
__m512 out1017 = _mm512_shuffle_f32x4(tmp6830, tmp6824, 68);
__m512 out1025 = _mm512_shuffle_f32x4(tmp6830, tmp6824, 238);
__m512 out1018 = _mm512_shuffle_f32x4(tmp6822, in1075, 68);
__m512 out1026 = _mm512_shuffle_f32x4(tmp6822, in1075, 238);
// Unaligned stores of the 16 result vectors to dfPtr6. The constant offsets form four
// groups 409600 apart (256, 409856, 819456, 1229056); within a group they are
// +0, +128, +64, +192. All stores share the i26/j21/s20/k86 index terms.
// NOTE(review): offsets are presumably in bytes (dfPtr6 assumed char*) - confirm at its declaration.
_mm512_storeu_ps(dfPtr6+256+1638400*i26+24576*j21+24576*s20+768*k86, out1011);
_mm512_storeu_ps(dfPtr6+384+1638400*i26+24576*j21+24576*s20+768*k86, out1019);
_mm512_storeu_ps(dfPtr6+320+1638400*i26+24576*j21+24576*s20+768*k86, out1015);
_mm512_storeu_ps(dfPtr6+448+1638400*i26+24576*j21+24576*s20+768*k86, out1023);
_mm512_storeu_ps(dfPtr6+409856+1638400*i26+24576*j21+24576*s20+768*k86, out1012);
_mm512_storeu_ps(dfPtr6+409984+1638400*i26+24576*j21+24576*s20+768*k86, out1020);
_mm512_storeu_ps(dfPtr6+409920+1638400*i26+24576*j21+24576*s20+768*k86, out1016);
_mm512_storeu_ps(dfPtr6+410048+1638400*i26+24576*j21+24576*s20+768*k86, out1024);
_mm512_storeu_ps(dfPtr6+819456+1638400*i26+24576*j21+24576*s20+768*k86, out1013);
_mm512_storeu_ps(dfPtr6+819584+1638400*i26+24576*j21+24576*s20+768*k86, out1021);
_mm512_storeu_ps(dfPtr6+819520+1638400*i26+24576*j21+24576*s20+768*k86, out1017);
_mm512_storeu_ps(dfPtr6+819648+1638400*i26+24576*j21+24576*s20+768*k86, out1025);
_mm512_storeu_ps(dfPtr6+1229056+1638400*i26+24576*j21+24576*s20+768*k86, out1014);
_mm512_storeu_ps(dfPtr6+1229184+1638400*i26+24576*j21+24576*s20+768*k86, out1022);
_mm512_storeu_ps(dfPtr6+1229120+1638400*i26+24576*j21+24576*s20+768*k86, out1018);
_mm512_storeu_ps(dfPtr6+1229248+1638400*i26+24576*j21+24576*s20+768*k86, out1026);
// Start of another load -> transform unit, reading two groups of eight vectors from datPtr12.
// NOTE(review): this excerpt ends partway through the second pass; the remaining
// combinations and the stores follow below this point.
//
// Loads: mask 511 (0x1FF) reads lanes 0..8, mask 8191 (0x1FFF) reads lanes 0..12;
// the remaining lanes are zeroed. Successive loads of a group step the constant offset
// by 224, the same factor that multiplies h35.
// NOTE(review): offsets are presumably in bytes (datPtr12 assumed char*) - confirm.
__m512 dat1605 = _mm512_maskz_loadu_ps(511, datPtr12+12656+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1606 = _mm512_maskz_loadu_ps(8191, datPtr12+13812+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
// pm144 (arguments are listed lane 15 first): lanes 0..7 read source lanes 0..7,
// lanes 8..10 read source lanes 6..8, lanes 11..15 read source lane 15, which the
// load mask zeroed (so the upper window is padded with zeros).
__m512i pm144 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1077 = _mm512_permutexvar_ps(pm144, dat1605);
// pm145: lane 0 reads the zeroed source lane 15; lanes 1..7 read source lanes 0..6;
// lanes 8..15 read source lanes 5..12.
__m512i pm145 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1085 = _mm512_permutexvar_ps(pm145, dat1606);
__m512 dat1607 = _mm512_maskz_loadu_ps(511, datPtr12+12880+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1608 = _mm512_maskz_loadu_ps(8191, datPtr12+14036+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1078 = _mm512_permutexvar_ps(pm144, dat1607);
__m512 in1086 = _mm512_permutexvar_ps(pm145, dat1608);
__m512 dat1609 = _mm512_maskz_loadu_ps(511, datPtr12+13104+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1610 = _mm512_maskz_loadu_ps(8191, datPtr12+14260+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1079 = _mm512_permutexvar_ps(pm144, dat1609);
__m512 in1087 = _mm512_permutexvar_ps(pm145, dat1610);
__m512 dat1611 = _mm512_maskz_loadu_ps(511, datPtr12+13328+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1612 = _mm512_maskz_loadu_ps(8191, datPtr12+14484+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1080 = _mm512_permutexvar_ps(pm144, dat1611);
__m512 in1088 = _mm512_permutexvar_ps(pm145, dat1612);
__m512 dat1613 = _mm512_maskz_loadu_ps(511, datPtr12+13552+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1614 = _mm512_maskz_loadu_ps(8191, datPtr12+14708+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1081 = _mm512_permutexvar_ps(pm144, dat1613);
__m512 in1089 = _mm512_permutexvar_ps(pm145, dat1614);
__m512 dat1615 = _mm512_maskz_loadu_ps(511, datPtr12+13776+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1616 = _mm512_maskz_loadu_ps(8191, datPtr12+14932+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1082 = _mm512_permutexvar_ps(pm144, dat1615);
__m512 in1090 = _mm512_permutexvar_ps(pm145, dat1616);
__m512 dat1617 = _mm512_maskz_loadu_ps(511, datPtr12+14000+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1618 = _mm512_maskz_loadu_ps(8191, datPtr12+15156+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1083 = _mm512_permutexvar_ps(pm144, dat1617);
__m512 in1091 = _mm512_permutexvar_ps(pm145, dat1618);
__m512 dat1619 = _mm512_maskz_loadu_ps(511, datPtr12+14224+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 dat1620 = _mm512_maskz_loadu_ps(8191, datPtr12+15380+806912*i26+224*h35+4*w42+806912*s20+25216*k86);
__m512 in1084 = _mm512_permutexvar_ps(pm144, dat1619);
__m512 in1092 = _mm512_permutexvar_ps(pm145, dat1620);
// First pass: linear combinations across the eight vectors of each group
// (in1077..in1084 and, in lock-step, in1085..in1092), coefficients
// -4.25, 5.25, 0.25, 4, -1.25, -5, +/-2. Inputs are overwritten in place, so
// statement order matters.
// NOTE(review): these coefficients look like the Winograd F(6,3) input transform - confirm.
__m512 tmp6881 = _mm512_add_ps(in1078, in1082);
__m512 tmp6885 = _mm512_add_ps(in1086, in1090);
__m512 tmp6882 = _mm512_sub_ps(in1081, in1079);
__m512 tmp6886 = _mm512_sub_ps(in1089, in1087);
__m512 tmp6883 = _mm512_add_ps(in1079, in1083);
__m512 tmp6887 = _mm512_add_ps(in1087, in1091);
in1077 = _mm512_sub_ps(in1077, in1083);
in1085 = _mm512_sub_ps(in1085, in1091);
tmp6881 = _mm512_fmadd_ps(in1080, _mm512_set1_ps(-4.25e+00f), tmp6881);
tmp6885 = _mm512_fmadd_ps(in1088, _mm512_set1_ps(-4.25e+00f), tmp6885);
tmp6883 = _mm512_fmadd_ps(in1081, _mm512_set1_ps(-4.25e+00f), tmp6883);
tmp6887 = _mm512_fmadd_ps(in1089, _mm512_set1_ps(-4.25e+00f), tmp6887);
in1077 = _mm512_fmadd_ps(tmp6882, _mm512_set1_ps(5.25e+00f), in1077);
in1085 = _mm512_fmadd_ps(tmp6886, _mm512_set1_ps(5.25e+00f), in1085);
tmp6882 = _mm512_fmadd_ps(in1079, _mm512_set1_ps(2.5e-01f), in1083);
tmp6886 = _mm512_fmadd_ps(in1087, _mm512_set1_ps(2.5e-01f), in1091);
in1079 = _mm512_fmadd_ps(in1079, _mm512_set1_ps(4e+00f), in1083);
in1087 = _mm512_fmadd_ps(in1087, _mm512_set1_ps(4e+00f), in1091);
__m512 tmp6884 = _mm512_sub_ps(tmp6883, tmp6881);
__m512 tmp6888 = _mm512_sub_ps(tmp6887, tmp6885);
tmp6883 = _mm512_add_ps(tmp6881, tmp6883);
tmp6887 = _mm512_add_ps(tmp6885, tmp6887);
tmp6881 = _mm512_fmadd_ps(in1078, _mm512_set1_ps(2.5e-01f), in1082);
tmp6885 = _mm512_fmadd_ps(in1086, _mm512_set1_ps(2.5e-01f), in1090);
tmp6882 = _mm512_fmadd_ps(in1081, _mm512_set1_ps(-1.25e+00f), tmp6882);
tmp6886 = _mm512_fmadd_ps(in1089, _mm512_set1_ps(-1.25e+00f), tmp6886);
in1081 = _mm512_fmadd_ps(in1081, _mm512_set1_ps(-5e+00f), in1079);
in1089 = _mm512_fmadd_ps(in1089, _mm512_set1_ps(-5e+00f), in1087);
tmp6881 = _mm512_fmadd_ps(in1080, _mm512_set1_ps(-1.25e+00f), tmp6881);
tmp6885 = _mm512_fmadd_ps(in1088, _mm512_set1_ps(-1.25e+00f), tmp6885);
in1083 = _mm512_fmadd_ps(tmp6881, _mm512_set1_ps(2e+00f), tmp6882);
in1091 = _mm512_fmadd_ps(tmp6885, _mm512_set1_ps(2e+00f), tmp6886);
tmp6882 = _mm512_fnmadd_ps(tmp6881, _mm512_set1_ps(2e+00f), tmp6882);
tmp6886 = _mm512_fnmadd_ps(tmp6885, _mm512_set1_ps(2e+00f), tmp6886);
tmp6881 = _mm512_fmadd_ps(in1082, _mm512_set1_ps(2.5e-01f), in1078);
tmp6885 = _mm512_fmadd_ps(in1090, _mm512_set1_ps(2.5e-01f), in1086);
in1078 = _mm512_sub_ps(in1084, in1078);
in1086 = _mm512_sub_ps(in1092, in1086);
tmp6881 = _mm512_fmadd_ps(in1080, _mm512_set1_ps(-1.25e+00f), tmp6881);
tmp6885 = _mm512_fmadd_ps(in1088, _mm512_set1_ps(-1.25e+00f), tmp6885);
in1080 = _mm512_sub_ps(in1080, in1082);
in1088 = _mm512_sub_ps(in1088, in1090);
in1080 = _mm512_fmadd_ps(in1080, _mm512_set1_ps(5.25e+00f), in1078);
in1088 = _mm512_fmadd_ps(in1088, _mm512_set1_ps(5.25e+00f), in1086);
in1079 = _mm512_fmadd_ps(tmp6881, _mm512_set1_ps(2e+00f), in1081);
in1087 = _mm512_fmadd_ps(tmp6885, _mm512_set1_ps(2e+00f), in1089);
in1081 = _mm512_fnmadd_ps(tmp6881, _mm512_set1_ps(2e+00f), in1081);
in1089 = _mm512_fnmadd_ps(tmp6885, _mm512_set1_ps(2e+00f), in1089);
// Lane transpose, step 1: interleave neighbouring result vectors within each
// 128-bit lane (unpacklo -> elements 0,1 of both operands; unpackhi -> elements 2,3).
__m512 tmp6897 = _mm512_unpacklo_ps(in1077, tmp6883);
__m512 tmp6898 = _mm512_unpackhi_ps(in1077, tmp6883);
__m512 tmp6899 = _mm512_unpacklo_ps(tmp6884, in1083);
__m512 tmp6900 = _mm512_unpackhi_ps(tmp6884, in1083);
__m512 tmp6901 = _mm512_unpacklo_ps(tmp6882, in1079);
__m512 tmp6902 = _mm512_unpackhi_ps(tmp6882, in1079);
__m512 tmp6903 = _mm512_unpacklo_ps(in1081, in1080);
__m512 tmp6904 = _mm512_unpackhi_ps(in1081, in1080);
__m512 tmp6905 = _mm512_unpacklo_ps(in1085, tmp6887);
__m512 tmp6906 = _mm512_unpackhi_ps(in1085, tmp6887);
__m512 tmp6907 = _mm512_unpacklo_ps(tmp6888, in1091);
__m512 tmp6908 = _mm512_unpackhi_ps(tmp6888, in1091);
__m512 tmp6909 = _mm512_unpacklo_ps(tmp6886, in1087);
__m512 tmp6910 = _mm512_unpackhi_ps(tmp6886, in1087);
__m512 tmp6911 = _mm512_unpacklo_ps(in1089, in1088);
__m512 tmp6912 = _mm512_unpackhi_ps(in1089, in1088);
// Step 2: imm 68 keeps the low 64 bits of each operand's 128-bit lane, imm 238 the
// high 64 bits, so every 128-bit lane now holds one element position from four vectors.
__m512 tmp6913 = _mm512_shuffle_ps(tmp6897, tmp6899, 68);
__m512 tmp6914 = _mm512_shuffle_ps(tmp6897, tmp6899, 238);
__m512 tmp6915 = _mm512_shuffle_ps(tmp6898, tmp6900, 68);
__m512 tmp6916 = _mm512_shuffle_ps(tmp6898, tmp6900, 238);
__m512 tmp6917 = _mm512_shuffle_ps(tmp6901, tmp6903, 68);
__m512 tmp6918 = _mm512_shuffle_ps(tmp6901, tmp6903, 238);
__m512 tmp6919 = _mm512_shuffle_ps(tmp6902, tmp6904, 68);
__m512 tmp6920 = _mm512_shuffle_ps(tmp6902, tmp6904, 238);
__m512 tmp6921 = _mm512_shuffle_ps(tmp6905, tmp6907, 68);
__m512 tmp6922 = _mm512_shuffle_ps(tmp6905, tmp6907, 238);
__m512 tmp6923 = _mm512_shuffle_ps(tmp6906, tmp6908, 68);
__m512 tmp6924 = _mm512_shuffle_ps(tmp6906, tmp6908, 238);
__m512 tmp6925 = _mm512_shuffle_ps(tmp6909, tmp6911, 68);
__m512 tmp6926 = _mm512_shuffle_ps(tmp6909, tmp6911, 238);
__m512 tmp6927 = _mm512_shuffle_ps(tmp6910, tmp6912, 68);
__m512 tmp6928 = _mm512_shuffle_ps(tmp6910, tmp6912, 238);
// Step 3: regroup whole 128-bit lanes. imm 136 picks lanes 0 and 2 of each operand,
// imm 221 picks lanes 1 and 3.
__m512 tmp6929 = _mm512_shuffle_f32x4(tmp6913, tmp6917, 136);
__m512 tmp6930 = _mm512_shuffle_f32x4(tmp6913, tmp6917, 221);
__m512 tmp6931 = _mm512_shuffle_f32x4(tmp6914, tmp6918, 136);
__m512 tmp6932 = _mm512_shuffle_f32x4(tmp6914, tmp6918, 221);
__m512 tmp6933 = _mm512_shuffle_f32x4(tmp6915, tmp6919, 136);
__m512 tmp6934 = _mm512_shuffle_f32x4(tmp6915, tmp6919, 221);
__m512 tmp6935 = _mm512_shuffle_f32x4(tmp6916, tmp6920, 136);
__m512 tmp6936 = _mm512_shuffle_f32x4(tmp6916, tmp6920, 221);
__m512 tmp6937 = _mm512_shuffle_f32x4(tmp6921, tmp6925, 136);
__m512 tmp6938 = _mm512_shuffle_f32x4(tmp6921, tmp6925, 221);
__m512 tmp6939 = _mm512_shuffle_f32x4(tmp6922, tmp6926, 136);
__m512 tmp6940 = _mm512_shuffle_f32x4(tmp6922, tmp6926, 221);
__m512 tmp6941 = _mm512_shuffle_f32x4(tmp6923, tmp6927, 136);
__m512 tmp6942 = _mm512_shuffle_f32x4(tmp6923, tmp6927, 221);
__m512 tmp6943 = _mm512_shuffle_f32x4(tmp6924, tmp6928, 136);
__m512 tmp6944 = _mm512_shuffle_f32x4(tmp6924, tmp6928, 221);
// Step 4: merge the two groups. Each destination now holds one lane position gathered
// from all sixteen first-pass results: lanes 0..7 of the inputs go to the first
// variable of each pair below, lanes 8..15 to the second.
in1077 = _mm512_shuffle_f32x4(tmp6929, tmp6937, 136);
in1085 = _mm512_shuffle_f32x4(tmp6929, tmp6937, 221);
tmp6883 = _mm512_shuffle_f32x4(tmp6931, tmp6939, 136);
tmp6887 = _mm512_shuffle_f32x4(tmp6931, tmp6939, 221);
tmp6884 = _mm512_shuffle_f32x4(tmp6933, tmp6941, 136);
tmp6888 = _mm512_shuffle_f32x4(tmp6933, tmp6941, 221);
in1083 = _mm512_shuffle_f32x4(tmp6935, tmp6943, 136);
in1091 = _mm512_shuffle_f32x4(tmp6935, tmp6943, 221);
tmp6882 = _mm512_shuffle_f32x4(tmp6930, tmp6938, 136);
tmp6886 = _mm512_shuffle_f32x4(tmp6930, tmp6938, 221);
in1079 = _mm512_shuffle_f32x4(tmp6932, tmp6940, 136);
in1087 = _mm512_shuffle_f32x4(tmp6932, tmp6940, 221);
in1081 = _mm512_shuffle_f32x4(tmp6934, tmp6942, 136);
in1089 = _mm512_shuffle_f32x4(tmp6934, tmp6942, 221);
in1080 = _mm512_shuffle_f32x4(tmp6936, tmp6944, 136);
in1088 = _mm512_shuffle_f32x4(tmp6936, tmp6944, 221);
// Second pass: the same linear combinations, now applied across the transposed
// vectors. Only the first part of this pass is visible in this excerpt.
__m512 tmp6889 = _mm512_add_ps(tmp6883, in1079);
__m512 tmp6893 = _mm512_add_ps(tmp6887, in1087);
__m512 tmp6890 = _mm512_sub_ps(tmp6882, tmp6884);
__m512 tmp6894 = _mm512_sub_ps(tmp6886, tmp6888);
__m512 tmp6891 = _mm512_add_ps(tmp6884, in1081);
__m512 tmp6895 = _mm512_add_ps(tmp6888, in1089);
in1077 = _mm512_sub_ps(in1077, in1081);
in1085 = _mm512_sub_ps(in1085, in1089);
tmp6889 = _mm512_fmadd_ps(in1083, _mm512_set1_ps(-4.25e+00f), tmp6889);
tmp6893 = _mm512_fmadd_ps(in1091, _mm512_set1_ps(-4.25e+00f), tmp6893);
tmp6891 = _mm512_fmadd_ps(tmp6882, _mm512_set1_ps(-4.25e+00f), tmp6891);
tmp6895 = _mm512_fmadd_ps(tmp6886, _mm512_set1_ps(-4.25e+00f), tmp6895);
in1077 = _mm512_fmadd_ps(tmp6890, _mm512_set1_ps(5.25e+00f), in1077);
in1085 = _mm512_fmadd_ps(tmp6894, _mm512_set1_ps(5.25e+00f), in1085);
tmp6890 = _mm512_fmadd_ps(tmp6884, _mm512_set1_ps(2.5e-01f), in1081);
tmp6894 = _mm512_fmadd_ps(tmp6888, _mm512_set1_ps(2.5e-01f), in1089);
tmp6884 = _mm512_fmadd_ps(tmp6884, _mm512_set1_ps(4e+00f), in1081);
tmp6888 = _mm512_fmadd_ps(tmp6888, _mm512_set1_ps(4e+00f), in1089);
__m512 tmp6892 = _mm512_sub_ps(tmp6891, tmp6889);
__m512 tmp6896 = _mm512_sub_ps(tmp6895, tmp6893);
tmp6891 = _mm512_add_ps(tmp6889, tmp6891);
tmp6895 = _mm512_add_ps(tmp6893, tmp6895);
tmp6889 = _mm512_fmadd_ps(tmp6883, _mm512_set1_ps(2.5e-01f), in1079);
tmp6893 = _mm512_fmadd_ps(tmp6887, _mm512_set1_ps(2.5e-01f), in1087);
tmp6890 = _mm512_fmadd_ps(tmp6882, _mm512_set1_ps(-1.25e+00f), tmp6890);
tmp6894 = _mm512_fmadd_ps(tmp6886, _mm512_set1_ps(-1.25e+00f), tmp6894);
tmp6882 = _mm512_fmadd_ps(tmp6882, _mm512_set1_ps(-5e+00f), tmp6884);
tmp6886 = _mm512_fmadd_ps(tmp6886, _mm512_set1_ps(-5e+00f), tmp6888);
tmp6889 = _mm512_fmadd_ps(in1083, _mm512_set1_ps(-1.25e+00f), tmp6889);
tmp6893 = _mm512_fmadd_ps(in1091, _mm512_set1_ps(-1.25e+00f), tmp6893);
in1081 = _mm512_fmadd_ps(tmp6889, _mm512_set1_ps(2e+00f), tmp6890);
in1089 = _mm512_fmadd_ps(tmp6893, _mm512_set1_ps(2e+00f), tmp6894);
tmp6890 = _mm512_fnmadd_ps(tmp6889, _mm512_set1_ps(2e+00f), tmp6890);
tmp6894 = _mm512_fnmadd_ps(tmp6893, _mm512_set1_ps(2e+00f), tmp6894);
tmp6889 = _mm512_fmadd_ps(in1079, _mm512_set1_ps(2.5e-01f), tmp6883);
tmp6893 = _mm512_fmadd_ps(in1087, _mm512_set1_ps(2.5e-01f), tmp6887);
tmp6883 = _mm512_sub_ps(in1080, tmp6883);
tmp6887 = _mm512_sub_ps(in1088, tmp6887);
tmp6889 = _mm512_fmadd_ps(in1083, _mm512_set1_ps(-1.25e+00f), tmp6889);
tmp6893 = _mm512_fmadd_ps(in1091, _mm512_set1_ps(-1.25e+00f), tmp6893);
in1083 = _mm512_sub_ps(in1083, in1079);
in1091 = _mm512_sub_ps(in1091, in1087);
in1083 = _mm512_fmadd_ps(in1083, _mm512_set1_ps(5.25e+00f), tmp6883);
in1091 = _mm512_fmadd_ps(in1091, _mm512_set1_ps(5.25e+00f), tmp6887);
tmp6884 = _mm512_fmadd_ps(tmp6889, _mm512_set1_ps(2e+00f), tmp6882);
tmp6888 = _mm512_fmadd_ps(tmp6893, _mm512_set1_ps(2e+00f), tmp6886);
tmp6882 = _mm512_fnmadd_ps(tmp6889, _mm512_set1_ps(2e+00f), tmp6882);
tmp6886 = _mm512_fnmadd_ps(tmp6893, _mm512_set1_ps(2e+00f), tmp6886);
__m512 out1027 = _mm512_shuffle_f32x4(in1077, tmp6891, 68);
__m512 out1035 = _mm512_shuffle_f32x4(in1077, tmp6891, 238);
__m512 out1028 = _mm512_shuffle_f32x4(tmp6892, in1081, 68);
__m512 out1036 = _mm512_shuffle_f32x4(tmp6892, in1081, 238);
__m512 out1029 = _mm512_shuffle_f32x4(tmp6890, tmp6884, 68);
__m512 out1037 = _mm512_shuffle_f32x4(tmp6890, tmp6884, 238);
__m512 out1030 = _mm512_shuffle_f32x4(tmp6882, in1083, 68);
__m512 out1038 = _mm512_shuffle_f32x4(tmp6882, in1083, 238);
__m512 out1031 = _mm512_shuffle_f32x4(in1085, tmp6895, 68);
__m512 out1039 = _mm512_shuffle_f32x4(in1085, tmp6895, 238);
__m512 out1032 = _mm512_shuffle_f32x4(tmp6896, in1089, 68);
__m512 out1040 = _mm512_shuffle_f32x4(tmp6896, in1089, 238);
__m512 out1033 = _mm512_shuffle_f32x4(tmp6894, tmp6888, 68);
__m512 out1041 = _mm512_shuffle_f32x4(tmp6894, tmp6888, 238);
__m512 out1034 = _mm512_shuffle_f32x4(tmp6886, in1091, 68);
__m512 out1042 = _mm512_shuffle_f32x4(tmp6886, in1091, 238);
_mm512_storeu_ps(dfPtr6+512+1638400*i26+24576*j21+24576*s20+768*k86, out1027);
_mm512_storeu_ps(dfPtr6+640+1638400*i26+24576*j21+24576*s20+768*k86, out1035);
_mm512_storeu_ps(dfPtr6+576+1638400*i26+24576*j21+24576*s20+768*k86, out1031);
_mm512_storeu_ps(dfPtr6+704+1638400*i26+24576*j21+24576*s20+768*k86, out1039);
_mm512_storeu_ps(dfPtr6+410112+1638400*i26+24576*j21+24576*s20+768*k86, out1028);
_mm512_storeu_ps(dfPtr6+410240+1638400*i26+24576*j21+24576*s20+768*k86, out1036);
_mm512_storeu_ps(dfPtr6+410176+1638400*i26+24576*j21+24576*s20+768*k86, out1032);
_mm512_storeu_ps(dfPtr6+410304+1638400*i26+24576*j21+24576*s20+768*k86, out1040);
_mm512_storeu_ps(dfPtr6+819712+1638400*i26+24576*j21+24576*s20+768*k86, out1029);
_mm512_storeu_ps(dfPtr6+819840+1638400*i26+24576*j21+24576*s20+768*k86, out1037);
_mm512_storeu_ps(dfPtr6+819776+1638400*i26+24576*j21+24576*s20+768*k86, out1033);
_mm512_storeu_ps(dfPtr6+819904+1638400*i26+24576*j21+24576*s20+768*k86, out1041);
_mm512_storeu_ps(dfPtr6+1229312+1638400*i26+24576*j21+24576*s20+768*k86, out1030);
_mm512_storeu_ps(dfPtr6+1229440+1638400*i26+24576*j21+24576*s20+768*k86, out1038);
_mm512_storeu_ps(dfPtr6+1229376+1638400*i26+24576*j21+24576*s20+768*k86, out1034);
_mm512_storeu_ps(dfPtr6+1229504+1638400*i26+24576*j21+24576*s20+768*k86, out1042);
}
if (j21 >= last5) return;
++j21;
}
j21 = 15;
}
// Row-group dispatch: rel15 is j21 measured relative to 15; base15 (54) is the value
// from which h36 is derived inside the branch.
ptrdiff_t rel15 = j21-15;
ptrdiff_t base15 = 54;
// Taken when j21 <= 15 (rel15 < 1).
if (rel15 < 1) {
// h36 multiplies the 224 term and w43 the 4 term of the datPtr12 offsets used by the loads below.
// NOTE(review): presumably the starting row and column of this group of tiles -- confirm against the generator.
ptrdiff_t h36 = base15+0;
ptrdiff_t w43 = 0;
// 32 iterations; k87 contributes 25216*k87 to the datPtr12 offsets and 768*k87 to the dfPtr6 offsets.
ptrdiff_t k87 = 0;
for (; k87 != 32; ++k87) {
// ---- First load/transform/store unit of this k87 iteration ----
// Stage 1: masked loads of three rows (offsets 224 apart) at two horizontal offsets (+4 and +48).
// Mask 8191 (0x1FFF) keeps lanes 0..12 and zeroes lanes 13..15; mask 16383 (0x3FFF) keeps
// lanes 0..13 and zeroes lanes 14..15.
__m512 dat1621 = _mm512_maskz_loadu_ps(8191, datPtr12+4+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 dat1622 = _mm512_maskz_loadu_ps(16383, datPtr12+48+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
// _mm512_set_epi32 lists lane 15 first. pm146: lane 0 <- source lane 15 (zero after the 13-lane
// mask), lanes 1..7 <- source 0..6, lanes 8..15 <- source 5..12. The result is two 8-wide windows
// that share two source elements, the first window being preceded by a zero.
__m512i pm146 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1093 = _mm512_permutexvar_ps(pm146, dat1621);
// pm147: lanes 0..7 <- source 0..7, lanes 8..15 <- source 6..13 (again two 8-wide windows sharing
// two source elements, but with no leading zero).
__m512i pm147 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1096 = _mm512_permutexvar_ps(pm147, dat1622);
// Second row (offsets +224 relative to the first row).
__m512 dat1623 = _mm512_maskz_loadu_ps(8191, datPtr12+228+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 dat1624 = _mm512_maskz_loadu_ps(16383, datPtr12+272+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 in1094 = _mm512_permutexvar_ps(pm146, dat1623);
__m512 in1097 = _mm512_permutexvar_ps(pm147, dat1624);
// Third row (offsets +448 relative to the first row).
__m512 dat1625 = _mm512_maskz_loadu_ps(8191, datPtr12+452+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 dat1626 = _mm512_maskz_loadu_ps(16383, datPtr12+496+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 in1095 = _mm512_permutexvar_ps(pm146, dat1625);
__m512 in1098 = _mm512_permutexvar_ps(pm147, dat1626);
// Stage 2: first transform pass, applied across the three loaded rows. Every statement is issued
// twice, once for the in1093/in1094/in1095 set and once for the in1096/in1097/in1098 set.
// The operation sequence is the same as the 8-input pass after the transpose below, with inputs
// 3..7 taken as zero (x0 = in1093, x1 = in1094, x2 = in1095). The self-assignments are no-ops
// left where a term of the 8-input form vanished.
// NOTE(review): presumably rows 3..7 of the tile are zero padding -- confirm against the generator.
__m512 tmp6945 = in1094;
__m512 tmp6952 = in1097;
__m512 tmp6946 = _mm512_sub_ps(_mm512_setzero_ps(), in1095);
__m512 tmp6953 = _mm512_sub_ps(_mm512_setzero_ps(), in1098);
__m512 tmp6947 = in1095;
__m512 tmp6954 = in1098;
in1093 = in1093;
in1096 = in1096;
tmp6945 = tmp6945;
tmp6952 = tmp6952;
tmp6947 = tmp6947;
tmp6954 = tmp6954;
// y0 = x0 - 5.25*x2
in1093 = _mm512_fmadd_ps(tmp6946, _mm512_set1_ps(5.25e+00f), in1093);
in1096 = _mm512_fmadd_ps(tmp6953, _mm512_set1_ps(5.25e+00f), in1096);
tmp6946 = _mm512_mul_ps(in1095, _mm512_set1_ps(2.5e-01f));
tmp6953 = _mm512_mul_ps(in1098, _mm512_set1_ps(2.5e-01f));
in1095 = _mm512_mul_ps(in1095, _mm512_set1_ps(4e+00f));
in1098 = _mm512_mul_ps(in1098, _mm512_set1_ps(4e+00f));
// y2 = x2 - x1, y1 = x1 + x2
__m512 tmp6948 = _mm512_sub_ps(tmp6947, tmp6945);
__m512 tmp6955 = _mm512_sub_ps(tmp6954, tmp6952);
tmp6947 = _mm512_add_ps(tmp6945, tmp6947);
tmp6954 = _mm512_add_ps(tmp6952, tmp6954);
tmp6945 = _mm512_mul_ps(in1094, _mm512_set1_ps(2.5e-01f));
tmp6952 = _mm512_mul_ps(in1097, _mm512_set1_ps(2.5e-01f));
tmp6946 = tmp6946;
tmp6953 = tmp6953;
__m512 tmp6949 = in1095;
__m512 tmp6956 = in1098;
tmp6945 = tmp6945;
tmp6952 = tmp6952;
// y3 = 0.5*x1 + 0.25*x2, y4 = -0.5*x1 + 0.25*x2
__m512 tmp6950 = _mm512_fmadd_ps(tmp6945, _mm512_set1_ps(2e+00f), tmp6946);
__m512 tmp6957 = _mm512_fmadd_ps(tmp6952, _mm512_set1_ps(2e+00f), tmp6953);
tmp6946 = _mm512_fnmadd_ps(tmp6945, _mm512_set1_ps(2e+00f), tmp6946);
tmp6953 = _mm512_fnmadd_ps(tmp6952, _mm512_set1_ps(2e+00f), tmp6953);
tmp6945 = in1094;
tmp6952 = in1097;
// y7 = -x1
in1094 = _mm512_sub_ps(_mm512_setzero_ps(), in1094);
in1097 = _mm512_sub_ps(_mm512_setzero_ps(), in1097);
tmp6945 = tmp6945;
tmp6952 = tmp6952;
__m512 tmp6951 = in1094;
__m512 tmp6958 = in1097;
// y5 = 2*x1 + 4*x2, y6 = -2*x1 + 4*x2
in1095 = _mm512_fmadd_ps(tmp6945, _mm512_set1_ps(2e+00f), tmp6949);
in1098 = _mm512_fmadd_ps(tmp6952, _mm512_set1_ps(2e+00f), tmp6956);
tmp6949 = _mm512_fnmadd_ps(tmp6945, _mm512_set1_ps(2e+00f), tmp6949);
tmp6956 = _mm512_fnmadd_ps(tmp6952, _mm512_set1_ps(2e+00f), tmp6956);
// Stage 3: transpose. The 16 pass-1 outputs (y0..y7 of each set) are treated as four 8x8 blocks:
// lanes 0..7 and lanes 8..15 of each set. unpacklo/unpackhi interleave pairs of rows within
// every 128-bit group.
__m512 tmp6967 = _mm512_unpacklo_ps(in1093, tmp6947);
__m512 tmp6968 = _mm512_unpackhi_ps(in1093, tmp6947);
__m512 tmp6969 = _mm512_unpacklo_ps(tmp6948, tmp6950);
__m512 tmp6970 = _mm512_unpackhi_ps(tmp6948, tmp6950);
__m512 tmp6971 = _mm512_unpacklo_ps(tmp6946, in1095);
__m512 tmp6972 = _mm512_unpackhi_ps(tmp6946, in1095);
__m512 tmp6973 = _mm512_unpacklo_ps(tmp6949, tmp6951);
__m512 tmp6974 = _mm512_unpackhi_ps(tmp6949, tmp6951);
__m512 tmp6975 = _mm512_unpacklo_ps(in1096, tmp6954);
__m512 tmp6976 = _mm512_unpackhi_ps(in1096, tmp6954);
__m512 tmp6977 = _mm512_unpacklo_ps(tmp6955, tmp6957);
__m512 tmp6978 = _mm512_unpackhi_ps(tmp6955, tmp6957);
__m512 tmp6979 = _mm512_unpacklo_ps(tmp6953, in1098);
__m512 tmp6980 = _mm512_unpackhi_ps(tmp6953, in1098);
__m512 tmp6981 = _mm512_unpacklo_ps(tmp6956, tmp6958);
__m512 tmp6982 = _mm512_unpackhi_ps(tmp6956, tmp6958);
// shuffle_ps 68 takes elements {a0,a1,b0,b1} and 238 takes {a2,a3,b2,b3} of each 128-bit group,
// so every group now holds one column of four consecutive rows.
__m512 tmp6983 = _mm512_shuffle_ps(tmp6967, tmp6969, 68);
__m512 tmp6984 = _mm512_shuffle_ps(tmp6967, tmp6969, 238);
__m512 tmp6985 = _mm512_shuffle_ps(tmp6968, tmp6970, 68);
__m512 tmp6986 = _mm512_shuffle_ps(tmp6968, tmp6970, 238);
__m512 tmp6987 = _mm512_shuffle_ps(tmp6971, tmp6973, 68);
__m512 tmp6988 = _mm512_shuffle_ps(tmp6971, tmp6973, 238);
__m512 tmp6989 = _mm512_shuffle_ps(tmp6972, tmp6974, 68);
__m512 tmp6990 = _mm512_shuffle_ps(tmp6972, tmp6974, 238);
__m512 tmp6991 = _mm512_shuffle_ps(tmp6975, tmp6977, 68);
__m512 tmp6992 = _mm512_shuffle_ps(tmp6975, tmp6977, 238);
__m512 tmp6993 = _mm512_shuffle_ps(tmp6976, tmp6978, 68);
__m512 tmp6994 = _mm512_shuffle_ps(tmp6976, tmp6978, 238);
__m512 tmp6995 = _mm512_shuffle_ps(tmp6979, tmp6981, 68);
__m512 tmp6996 = _mm512_shuffle_ps(tmp6979, tmp6981, 238);
__m512 tmp6997 = _mm512_shuffle_ps(tmp6980, tmp6982, 68);
__m512 tmp6998 = _mm512_shuffle_ps(tmp6980, tmp6982, 238);
// shuffle_f32x4 136 gathers 128-bit groups {a0,a2,b0,b2}; 221 gathers {a1,a3,b1,b3}.
// This step joins rows 0..3 with rows 4..7 of the same column.
__m512 tmp6999 = _mm512_shuffle_f32x4(tmp6983, tmp6987, 136);
__m512 tmp7000 = _mm512_shuffle_f32x4(tmp6983, tmp6987, 221);
__m512 tmp7001 = _mm512_shuffle_f32x4(tmp6984, tmp6988, 136);
__m512 tmp7002 = _mm512_shuffle_f32x4(tmp6984, tmp6988, 221);
__m512 tmp7003 = _mm512_shuffle_f32x4(tmp6985, tmp6989, 136);
__m512 tmp7004 = _mm512_shuffle_f32x4(tmp6985, tmp6989, 221);
__m512 tmp7005 = _mm512_shuffle_f32x4(tmp6986, tmp6990, 136);
__m512 tmp7006 = _mm512_shuffle_f32x4(tmp6986, tmp6990, 221);
__m512 tmp7007 = _mm512_shuffle_f32x4(tmp6991, tmp6995, 136);
__m512 tmp7008 = _mm512_shuffle_f32x4(tmp6991, tmp6995, 221);
__m512 tmp7009 = _mm512_shuffle_f32x4(tmp6992, tmp6996, 136);
__m512 tmp7010 = _mm512_shuffle_f32x4(tmp6992, tmp6996, 221);
__m512 tmp7011 = _mm512_shuffle_f32x4(tmp6993, tmp6997, 136);
__m512 tmp7012 = _mm512_shuffle_f32x4(tmp6993, tmp6997, 221);
__m512 tmp7013 = _mm512_shuffle_f32x4(tmp6994, tmp6998, 136);
__m512 tmp7014 = _mm512_shuffle_f32x4(tmp6994, tmp6998, 221);
// Final step of the transpose. In each pair, the vector produced with 136 holds column c of the
// two lanes-0..7 blocks (first set in lanes 0..7, second set in lanes 8..15); the vector produced
// with 221 holds column c of the two lanes-8..15 blocks.
// Column order: in1093 = 0, tmp6947 = 1, tmp6948 = 2, tmp6950 = 3, tmp6946 = 4, in1095 = 5,
// tmp6949 = 6, tmp6951 = 7.
in1093 = _mm512_shuffle_f32x4(tmp6999, tmp7007, 136);
in1096 = _mm512_shuffle_f32x4(tmp6999, tmp7007, 221);
tmp6947 = _mm512_shuffle_f32x4(tmp7001, tmp7009, 136);
tmp6954 = _mm512_shuffle_f32x4(tmp7001, tmp7009, 221);
tmp6948 = _mm512_shuffle_f32x4(tmp7003, tmp7011, 136);
tmp6955 = _mm512_shuffle_f32x4(tmp7003, tmp7011, 221);
tmp6950 = _mm512_shuffle_f32x4(tmp7005, tmp7013, 136);
tmp6957 = _mm512_shuffle_f32x4(tmp7005, tmp7013, 221);
tmp6946 = _mm512_shuffle_f32x4(tmp7000, tmp7008, 136);
tmp6953 = _mm512_shuffle_f32x4(tmp7000, tmp7008, 221);
in1095 = _mm512_shuffle_f32x4(tmp7002, tmp7010, 136);
in1098 = _mm512_shuffle_f32x4(tmp7002, tmp7010, 221);
tmp6949 = _mm512_shuffle_f32x4(tmp7004, tmp7012, 136);
tmp6956 = _mm512_shuffle_f32x4(tmp7004, tmp7012, 221);
tmp6951 = _mm512_shuffle_f32x4(tmp7006, tmp7014, 136);
tmp6958 = _mm512_shuffle_f32x4(tmp7006, tmp7014, 221);
// Stage 4: second transform pass over the eight columns x0..x7 (order listed above).
// Algebraically (ignoring rounding) the results are:
//   y0 = x0 - 5.25*x2 + 5.25*x4 - x6                               (in1093 / in1096)
//   y1 = x1 + x2 - 4.25*x3 - 4.25*x4 + x5 + x6                     (tmp6961 / tmp6965)
//   y2 = -x1 + x2 + 4.25*x3 - 4.25*x4 - x5 + x6                    (tmp6962 / tmp6966)
//   y3 = 0.5*x1 + 0.25*x2 - 2.5*x3 - 1.25*x4 + 2*x5 + x6           (tmp6949 / tmp6956)
//   y4 = -0.5*x1 + 0.25*x2 + 2.5*x3 - 1.25*x4 - 2*x5 + x6          (tmp6960 / tmp6964)
//   y5 = 2*x1 + 4*x2 - 2.5*x3 - 5*x4 + 0.5*x5 + x6                 (tmp6948 / tmp6955)
//   y6 = -2*x1 + 4*x2 + 2.5*x3 - 5*x4 - 0.5*x5 + x6                (tmp6946 / tmp6953)
//   y7 = -x1 + 5.25*x3 - 5.25*x5 + x7                              (tmp6950 / tmp6957)
// NOTE(review): these coefficients look like a Winograd F(6,3) input transform -- confirm against the generator.
// Statement order matters: several inputs are overwritten in place once their last use has passed.
__m512 tmp6959 = _mm512_add_ps(tmp6947, in1095);
__m512 tmp6963 = _mm512_add_ps(tmp6954, in1098);
__m512 tmp6960 = _mm512_sub_ps(tmp6946, tmp6948);
__m512 tmp6964 = _mm512_sub_ps(tmp6953, tmp6955);
__m512 tmp6961 = _mm512_add_ps(tmp6948, tmp6949);
__m512 tmp6965 = _mm512_add_ps(tmp6955, tmp6956);
in1093 = _mm512_sub_ps(in1093, tmp6949);
in1096 = _mm512_sub_ps(in1096, tmp6956);
tmp6959 = _mm512_fmadd_ps(tmp6950, _mm512_set1_ps(-4.25e+00f), tmp6959);
tmp6963 = _mm512_fmadd_ps(tmp6957, _mm512_set1_ps(-4.25e+00f), tmp6963);
tmp6961 = _mm512_fmadd_ps(tmp6946, _mm512_set1_ps(-4.25e+00f), tmp6961);
tmp6965 = _mm512_fmadd_ps(tmp6953, _mm512_set1_ps(-4.25e+00f), tmp6965);
in1093 = _mm512_fmadd_ps(tmp6960, _mm512_set1_ps(5.25e+00f), in1093);
in1096 = _mm512_fmadd_ps(tmp6964, _mm512_set1_ps(5.25e+00f), in1096);
tmp6960 = _mm512_fmadd_ps(tmp6948, _mm512_set1_ps(2.5e-01f), tmp6949);
tmp6964 = _mm512_fmadd_ps(tmp6955, _mm512_set1_ps(2.5e-01f), tmp6956);
tmp6948 = _mm512_fmadd_ps(tmp6948, _mm512_set1_ps(4e+00f), tmp6949);
tmp6955 = _mm512_fmadd_ps(tmp6955, _mm512_set1_ps(4e+00f), tmp6956);
__m512 tmp6962 = _mm512_sub_ps(tmp6961, tmp6959);
__m512 tmp6966 = _mm512_sub_ps(tmp6965, tmp6963);
tmp6961 = _mm512_add_ps(tmp6959, tmp6961);
tmp6965 = _mm512_add_ps(tmp6963, tmp6965);
tmp6959 = _mm512_fmadd_ps(tmp6947, _mm512_set1_ps(2.5e-01f), in1095);
tmp6963 = _mm512_fmadd_ps(tmp6954, _mm512_set1_ps(2.5e-01f), in1098);
tmp6960 = _mm512_fmadd_ps(tmp6946, _mm512_set1_ps(-1.25e+00f), tmp6960);
tmp6964 = _mm512_fmadd_ps(tmp6953, _mm512_set1_ps(-1.25e+00f), tmp6964);
tmp6946 = _mm512_fmadd_ps(tmp6946, _mm512_set1_ps(-5e+00f), tmp6948);
tmp6953 = _mm512_fmadd_ps(tmp6953, _mm512_set1_ps(-5e+00f), tmp6955);
tmp6959 = _mm512_fmadd_ps(tmp6950, _mm512_set1_ps(-1.25e+00f), tmp6959);
tmp6963 = _mm512_fmadd_ps(tmp6957, _mm512_set1_ps(-1.25e+00f), tmp6963);
tmp6949 = _mm512_fmadd_ps(tmp6959, _mm512_set1_ps(2e+00f), tmp6960);
tmp6956 = _mm512_fmadd_ps(tmp6963, _mm512_set1_ps(2e+00f), tmp6964);
tmp6960 = _mm512_fnmadd_ps(tmp6959, _mm512_set1_ps(2e+00f), tmp6960);
tmp6964 = _mm512_fnmadd_ps(tmp6963, _mm512_set1_ps(2e+00f), tmp6964);
tmp6959 = _mm512_fmadd_ps(in1095, _mm512_set1_ps(2.5e-01f), tmp6947);
tmp6963 = _mm512_fmadd_ps(in1098, _mm512_set1_ps(2.5e-01f), tmp6954);
tmp6947 = _mm512_sub_ps(tmp6951, tmp6947);
tmp6954 = _mm512_sub_ps(tmp6958, tmp6954);
tmp6959 = _mm512_fmadd_ps(tmp6950, _mm512_set1_ps(-1.25e+00f), tmp6959);
tmp6963 = _mm512_fmadd_ps(tmp6957, _mm512_set1_ps(-1.25e+00f), tmp6963);
tmp6950 = _mm512_sub_ps(tmp6950, in1095);
tmp6957 = _mm512_sub_ps(tmp6957, in1098);
tmp6950 = _mm512_fmadd_ps(tmp6950, _mm512_set1_ps(5.25e+00f), tmp6947);
tmp6957 = _mm512_fmadd_ps(tmp6957, _mm512_set1_ps(5.25e+00f), tmp6954);
tmp6948 = _mm512_fmadd_ps(tmp6959, _mm512_set1_ps(2e+00f), tmp6946);
tmp6955 = _mm512_fmadd_ps(tmp6963, _mm512_set1_ps(2e+00f), tmp6953);
tmp6946 = _mm512_fnmadd_ps(tmp6959, _mm512_set1_ps(2e+00f), tmp6946);
tmp6953 = _mm512_fnmadd_ps(tmp6963, _mm512_set1_ps(2e+00f), tmp6953);
// Stage 5: pack two consecutive outputs (y0|y1, y2|y3, y4|y5, y6|y7) into one vector.
// Selector 68 concatenates the lanes-0..7 halves of both operands; 238 the lanes-8..15 halves.
__m512 out1043 = _mm512_shuffle_f32x4(in1093, tmp6961, 68);
__m512 out1051 = _mm512_shuffle_f32x4(in1093, tmp6961, 238);
__m512 out1044 = _mm512_shuffle_f32x4(tmp6962, tmp6949, 68);
__m512 out1052 = _mm512_shuffle_f32x4(tmp6962, tmp6949, 238);
__m512 out1045 = _mm512_shuffle_f32x4(tmp6960, tmp6948, 68);
__m512 out1053 = _mm512_shuffle_f32x4(tmp6960, tmp6948, 238);
__m512 out1046 = _mm512_shuffle_f32x4(tmp6946, tmp6950, 68);
__m512 out1054 = _mm512_shuffle_f32x4(tmp6946, tmp6950, 238);
__m512 out1047 = _mm512_shuffle_f32x4(in1096, tmp6965, 68);
__m512 out1055 = _mm512_shuffle_f32x4(in1096, tmp6965, 238);
__m512 out1048 = _mm512_shuffle_f32x4(tmp6966, tmp6956, 68);
__m512 out1056 = _mm512_shuffle_f32x4(tmp6966, tmp6956, 238);
__m512 out1049 = _mm512_shuffle_f32x4(tmp6964, tmp6955, 68);
__m512 out1057 = _mm512_shuffle_f32x4(tmp6964, tmp6955, 238);
__m512 out1050 = _mm512_shuffle_f32x4(tmp6953, tmp6957, 68);
__m512 out1058 = _mm512_shuffle_f32x4(tmp6953, tmp6957, 238);
// Stage 6: store to dfPtr6. The y0|y1 vectors go to offsets 0, 64, 128, 192; the y2|y3, y4|y5 and
// y6|y7 vectors go to the same four slots shifted by 409600, 819200 and 1228800 respectively.
_mm512_storeu_ps(dfPtr6+0+1638400*i26+24576*j21+24576*s20+768*k87, out1043);
_mm512_storeu_ps(dfPtr6+128+1638400*i26+24576*j21+24576*s20+768*k87, out1051);
_mm512_storeu_ps(dfPtr6+64+1638400*i26+24576*j21+24576*s20+768*k87, out1047);
_mm512_storeu_ps(dfPtr6+192+1638400*i26+24576*j21+24576*s20+768*k87, out1055);
_mm512_storeu_ps(dfPtr6+409600+1638400*i26+24576*j21+24576*s20+768*k87, out1044);
_mm512_storeu_ps(dfPtr6+409728+1638400*i26+24576*j21+24576*s20+768*k87, out1052);
_mm512_storeu_ps(dfPtr6+409664+1638400*i26+24576*j21+24576*s20+768*k87, out1048);
_mm512_storeu_ps(dfPtr6+409792+1638400*i26+24576*j21+24576*s20+768*k87, out1056);
_mm512_storeu_ps(dfPtr6+819200+1638400*i26+24576*j21+24576*s20+768*k87, out1045);
_mm512_storeu_ps(dfPtr6+819328+1638400*i26+24576*j21+24576*s20+768*k87, out1053);
_mm512_storeu_ps(dfPtr6+819264+1638400*i26+24576*j21+24576*s20+768*k87, out1049);
_mm512_storeu_ps(dfPtr6+819392+1638400*i26+24576*j21+24576*s20+768*k87, out1057);
_mm512_storeu_ps(dfPtr6+1228800+1638400*i26+24576*j21+24576*s20+768*k87, out1046);
_mm512_storeu_ps(dfPtr6+1228928+1638400*i26+24576*j21+24576*s20+768*k87, out1054);
_mm512_storeu_ps(dfPtr6+1228864+1638400*i26+24576*j21+24576*s20+768*k87, out1050);
_mm512_storeu_ps(dfPtr6+1228992+1638400*i26+24576*j21+24576*s20+768*k87, out1058);
// ---- Second load/transform/store unit of this k87 iteration ----
// Stage 1: masked loads of three rows (offsets 224 apart) at base offsets +96 and +12612.
// Mask 16383 (0x3FFF) keeps lanes 0..13 and zeroes lanes 14..15; mask 8191 (0x1FFF) keeps
// lanes 0..12 and zeroes lanes 13..15.
__m512 dat1627 = _mm512_maskz_loadu_ps(16383, datPtr12+96+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 dat1628 = _mm512_maskz_loadu_ps(8191, datPtr12+12612+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
// _mm512_set_epi32 lists lane 15 first. pm148: lanes 0..7 <- source 0..7, lanes 8..15 <- source
// 6..13, i.e. two 8-wide windows sharing two source elements.
__m512i pm148 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1099 = _mm512_permutexvar_ps(pm148, dat1627);
// pm149: lane 0 <- source lane 15 (zero after the 13-lane mask), lanes 1..7 <- source 0..6,
// lanes 8..15 <- source 5..12, i.e. the same two-window layout preceded by one zero.
__m512i pm149 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1102 = _mm512_permutexvar_ps(pm149, dat1628);
// Second row (offsets +224 relative to the first row).
__m512 dat1629 = _mm512_maskz_loadu_ps(16383, datPtr12+320+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 dat1630 = _mm512_maskz_loadu_ps(8191, datPtr12+12836+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 in1100 = _mm512_permutexvar_ps(pm148, dat1629);
__m512 in1103 = _mm512_permutexvar_ps(pm149, dat1630);
// Third row (offsets +448 relative to the first row).
__m512 dat1631 = _mm512_maskz_loadu_ps(16383, datPtr12+544+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 dat1632 = _mm512_maskz_loadu_ps(8191, datPtr12+13060+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 in1101 = _mm512_permutexvar_ps(pm148, dat1631);
__m512 in1104 = _mm512_permutexvar_ps(pm149, dat1632);
// Stage 2: first transform pass, applied across the three loaded rows. Every statement is issued
// twice, once for the in1099/in1100/in1101 set and once for the in1102/in1103/in1104 set.
// The operation sequence is the same as the 8-input pass after the transpose below, with inputs
// 3..7 taken as zero (x0 = in1099, x1 = in1100, x2 = in1101). The self-assignments are no-ops
// left where a term of the 8-input form vanished.
// NOTE(review): presumably rows 3..7 of the tile are zero padding -- confirm against the generator.
__m512 tmp7015 = in1100;
__m512 tmp7022 = in1103;
__m512 tmp7016 = _mm512_sub_ps(_mm512_setzero_ps(), in1101);
__m512 tmp7023 = _mm512_sub_ps(_mm512_setzero_ps(), in1104);
__m512 tmp7017 = in1101;
__m512 tmp7024 = in1104;
in1099 = in1099;
in1102 = in1102;
tmp7015 = tmp7015;
tmp7022 = tmp7022;
tmp7017 = tmp7017;
tmp7024 = tmp7024;
// y0 = x0 - 5.25*x2
in1099 = _mm512_fmadd_ps(tmp7016, _mm512_set1_ps(5.25e+00f), in1099);
in1102 = _mm512_fmadd_ps(tmp7023, _mm512_set1_ps(5.25e+00f), in1102);
tmp7016 = _mm512_mul_ps(in1101, _mm512_set1_ps(2.5e-01f));
tmp7023 = _mm512_mul_ps(in1104, _mm512_set1_ps(2.5e-01f));
in1101 = _mm512_mul_ps(in1101, _mm512_set1_ps(4e+00f));
in1104 = _mm512_mul_ps(in1104, _mm512_set1_ps(4e+00f));
// y2 = x2 - x1, y1 = x1 + x2
__m512 tmp7018 = _mm512_sub_ps(tmp7017, tmp7015);
__m512 tmp7025 = _mm512_sub_ps(tmp7024, tmp7022);
tmp7017 = _mm512_add_ps(tmp7015, tmp7017);
tmp7024 = _mm512_add_ps(tmp7022, tmp7024);
tmp7015 = _mm512_mul_ps(in1100, _mm512_set1_ps(2.5e-01f));
tmp7022 = _mm512_mul_ps(in1103, _mm512_set1_ps(2.5e-01f));
tmp7016 = tmp7016;
tmp7023 = tmp7023;
__m512 tmp7019 = in1101;
__m512 tmp7026 = in1104;
tmp7015 = tmp7015;
tmp7022 = tmp7022;
// y3 = 0.5*x1 + 0.25*x2, y4 = -0.5*x1 + 0.25*x2
__m512 tmp7020 = _mm512_fmadd_ps(tmp7015, _mm512_set1_ps(2e+00f), tmp7016);
__m512 tmp7027 = _mm512_fmadd_ps(tmp7022, _mm512_set1_ps(2e+00f), tmp7023);
tmp7016 = _mm512_fnmadd_ps(tmp7015, _mm512_set1_ps(2e+00f), tmp7016);
tmp7023 = _mm512_fnmadd_ps(tmp7022, _mm512_set1_ps(2e+00f), tmp7023);
tmp7015 = in1100;
tmp7022 = in1103;
// y7 = -x1
in1100 = _mm512_sub_ps(_mm512_setzero_ps(), in1100);
in1103 = _mm512_sub_ps(_mm512_setzero_ps(), in1103);
tmp7015 = tmp7015;
tmp7022 = tmp7022;
__m512 tmp7021 = in1100;
__m512 tmp7028 = in1103;
// y5 = 2*x1 + 4*x2, y6 = -2*x1 + 4*x2
in1101 = _mm512_fmadd_ps(tmp7015, _mm512_set1_ps(2e+00f), tmp7019);
in1104 = _mm512_fmadd_ps(tmp7022, _mm512_set1_ps(2e+00f), tmp7026);
tmp7019 = _mm512_fnmadd_ps(tmp7015, _mm512_set1_ps(2e+00f), tmp7019);
tmp7026 = _mm512_fnmadd_ps(tmp7022, _mm512_set1_ps(2e+00f), tmp7026);
// Stage 3: transpose. The 16 pass-1 outputs (y0..y7 of each set) are treated as four 8x8 blocks:
// lanes 0..7 and lanes 8..15 of each set. unpacklo/unpackhi interleave pairs of rows within
// every 128-bit group.
__m512 tmp7037 = _mm512_unpacklo_ps(in1099, tmp7017);
__m512 tmp7038 = _mm512_unpackhi_ps(in1099, tmp7017);
__m512 tmp7039 = _mm512_unpacklo_ps(tmp7018, tmp7020);
__m512 tmp7040 = _mm512_unpackhi_ps(tmp7018, tmp7020);
__m512 tmp7041 = _mm512_unpacklo_ps(tmp7016, in1101);
__m512 tmp7042 = _mm512_unpackhi_ps(tmp7016, in1101);
__m512 tmp7043 = _mm512_unpacklo_ps(tmp7019, tmp7021);
__m512 tmp7044 = _mm512_unpackhi_ps(tmp7019, tmp7021);
__m512 tmp7045 = _mm512_unpacklo_ps(in1102, tmp7024);
__m512 tmp7046 = _mm512_unpackhi_ps(in1102, tmp7024);
__m512 tmp7047 = _mm512_unpacklo_ps(tmp7025, tmp7027);
__m512 tmp7048 = _mm512_unpackhi_ps(tmp7025, tmp7027);
__m512 tmp7049 = _mm512_unpacklo_ps(tmp7023, in1104);
__m512 tmp7050 = _mm512_unpackhi_ps(tmp7023, in1104);
__m512 tmp7051 = _mm512_unpacklo_ps(tmp7026, tmp7028);
__m512 tmp7052 = _mm512_unpackhi_ps(tmp7026, tmp7028);
// shuffle_ps 68 takes elements {a0,a1,b0,b1} and 238 takes {a2,a3,b2,b3} of each 128-bit group,
// so every group now holds one column of four consecutive rows.
__m512 tmp7053 = _mm512_shuffle_ps(tmp7037, tmp7039, 68);
__m512 tmp7054 = _mm512_shuffle_ps(tmp7037, tmp7039, 238);
__m512 tmp7055 = _mm512_shuffle_ps(tmp7038, tmp7040, 68);
__m512 tmp7056 = _mm512_shuffle_ps(tmp7038, tmp7040, 238);
__m512 tmp7057 = _mm512_shuffle_ps(tmp7041, tmp7043, 68);
__m512 tmp7058 = _mm512_shuffle_ps(tmp7041, tmp7043, 238);
__m512 tmp7059 = _mm512_shuffle_ps(tmp7042, tmp7044, 68);
__m512 tmp7060 = _mm512_shuffle_ps(tmp7042, tmp7044, 238);
__m512 tmp7061 = _mm512_shuffle_ps(tmp7045, tmp7047, 68);
__m512 tmp7062 = _mm512_shuffle_ps(tmp7045, tmp7047, 238);
__m512 tmp7063 = _mm512_shuffle_ps(tmp7046, tmp7048, 68);
__m512 tmp7064 = _mm512_shuffle_ps(tmp7046, tmp7048, 238);
__m512 tmp7065 = _mm512_shuffle_ps(tmp7049, tmp7051, 68);
__m512 tmp7066 = _mm512_shuffle_ps(tmp7049, tmp7051, 238);
__m512 tmp7067 = _mm512_shuffle_ps(tmp7050, tmp7052, 68);
__m512 tmp7068 = _mm512_shuffle_ps(tmp7050, tmp7052, 238);
// shuffle_f32x4 136 gathers 128-bit groups {a0,a2,b0,b2}; 221 gathers {a1,a3,b1,b3}.
// This step joins rows 0..3 with rows 4..7 of the same column.
__m512 tmp7069 = _mm512_shuffle_f32x4(tmp7053, tmp7057, 136);
__m512 tmp7070 = _mm512_shuffle_f32x4(tmp7053, tmp7057, 221);
__m512 tmp7071 = _mm512_shuffle_f32x4(tmp7054, tmp7058, 136);
__m512 tmp7072 = _mm512_shuffle_f32x4(tmp7054, tmp7058, 221);
__m512 tmp7073 = _mm512_shuffle_f32x4(tmp7055, tmp7059, 136);
__m512 tmp7074 = _mm512_shuffle_f32x4(tmp7055, tmp7059, 221);
__m512 tmp7075 = _mm512_shuffle_f32x4(tmp7056, tmp7060, 136);
__m512 tmp7076 = _mm512_shuffle_f32x4(tmp7056, tmp7060, 221);
__m512 tmp7077 = _mm512_shuffle_f32x4(tmp7061, tmp7065, 136);
__m512 tmp7078 = _mm512_shuffle_f32x4(tmp7061, tmp7065, 221);
__m512 tmp7079 = _mm512_shuffle_f32x4(tmp7062, tmp7066, 136);
__m512 tmp7080 = _mm512_shuffle_f32x4(tmp7062, tmp7066, 221);
__m512 tmp7081 = _mm512_shuffle_f32x4(tmp7063, tmp7067, 136);
__m512 tmp7082 = _mm512_shuffle_f32x4(tmp7063, tmp7067, 221);
__m512 tmp7083 = _mm512_shuffle_f32x4(tmp7064, tmp7068, 136);
__m512 tmp7084 = _mm512_shuffle_f32x4(tmp7064, tmp7068, 221);
// Final step of the transpose. In each pair, the vector produced with 136 holds column c of the
// two lanes-0..7 blocks (first set in lanes 0..7, second set in lanes 8..15); the vector produced
// with 221 holds column c of the two lanes-8..15 blocks.
// Column order: in1099 = 0, tmp7017 = 1, tmp7018 = 2, tmp7020 = 3, tmp7016 = 4, in1101 = 5,
// tmp7019 = 6, tmp7021 = 7.
in1099 = _mm512_shuffle_f32x4(tmp7069, tmp7077, 136);
in1102 = _mm512_shuffle_f32x4(tmp7069, tmp7077, 221);
tmp7017 = _mm512_shuffle_f32x4(tmp7071, tmp7079, 136);
tmp7024 = _mm512_shuffle_f32x4(tmp7071, tmp7079, 221);
tmp7018 = _mm512_shuffle_f32x4(tmp7073, tmp7081, 136);
tmp7025 = _mm512_shuffle_f32x4(tmp7073, tmp7081, 221);
tmp7020 = _mm512_shuffle_f32x4(tmp7075, tmp7083, 136);
tmp7027 = _mm512_shuffle_f32x4(tmp7075, tmp7083, 221);
tmp7016 = _mm512_shuffle_f32x4(tmp7070, tmp7078, 136);
tmp7023 = _mm512_shuffle_f32x4(tmp7070, tmp7078, 221);
in1101 = _mm512_shuffle_f32x4(tmp7072, tmp7080, 136);
in1104 = _mm512_shuffle_f32x4(tmp7072, tmp7080, 221);
tmp7019 = _mm512_shuffle_f32x4(tmp7074, tmp7082, 136);
tmp7026 = _mm512_shuffle_f32x4(tmp7074, tmp7082, 221);
tmp7021 = _mm512_shuffle_f32x4(tmp7076, tmp7084, 136);
tmp7028 = _mm512_shuffle_f32x4(tmp7076, tmp7084, 221);
// Stage 4: second transform pass over the eight columns x0..x7 (order listed above).
// Algebraically (ignoring rounding) the results are:
//   y0 = x0 - 5.25*x2 + 5.25*x4 - x6                               (in1099 / in1102)
//   y1 = x1 + x2 - 4.25*x3 - 4.25*x4 + x5 + x6                     (tmp7031 / tmp7035)
//   y2 = -x1 + x2 + 4.25*x3 - 4.25*x4 - x5 + x6                    (tmp7032 / tmp7036)
//   y3 = 0.5*x1 + 0.25*x2 - 2.5*x3 - 1.25*x4 + 2*x5 + x6           (tmp7019 / tmp7026)
//   y4 = -0.5*x1 + 0.25*x2 + 2.5*x3 - 1.25*x4 - 2*x5 + x6          (tmp7030 / tmp7034)
//   y5 = 2*x1 + 4*x2 - 2.5*x3 - 5*x4 + 0.5*x5 + x6                 (tmp7018 / tmp7025)
//   y6 = -2*x1 + 4*x2 + 2.5*x3 - 5*x4 - 0.5*x5 + x6                (tmp7016 / tmp7023)
//   y7 = -x1 + 5.25*x3 - 5.25*x5 + x7                              (tmp7020 / tmp7027)
// NOTE(review): these coefficients look like a Winograd F(6,3) input transform -- confirm against the generator.
// Statement order matters: several inputs are overwritten in place once their last use has passed.
__m512 tmp7029 = _mm512_add_ps(tmp7017, in1101);
__m512 tmp7033 = _mm512_add_ps(tmp7024, in1104);
__m512 tmp7030 = _mm512_sub_ps(tmp7016, tmp7018);
__m512 tmp7034 = _mm512_sub_ps(tmp7023, tmp7025);
__m512 tmp7031 = _mm512_add_ps(tmp7018, tmp7019);
__m512 tmp7035 = _mm512_add_ps(tmp7025, tmp7026);
in1099 = _mm512_sub_ps(in1099, tmp7019);
in1102 = _mm512_sub_ps(in1102, tmp7026);
tmp7029 = _mm512_fmadd_ps(tmp7020, _mm512_set1_ps(-4.25e+00f), tmp7029);
tmp7033 = _mm512_fmadd_ps(tmp7027, _mm512_set1_ps(-4.25e+00f), tmp7033);
tmp7031 = _mm512_fmadd_ps(tmp7016, _mm512_set1_ps(-4.25e+00f), tmp7031);
tmp7035 = _mm512_fmadd_ps(tmp7023, _mm512_set1_ps(-4.25e+00f), tmp7035);
in1099 = _mm512_fmadd_ps(tmp7030, _mm512_set1_ps(5.25e+00f), in1099);
in1102 = _mm512_fmadd_ps(tmp7034, _mm512_set1_ps(5.25e+00f), in1102);
tmp7030 = _mm512_fmadd_ps(tmp7018, _mm512_set1_ps(2.5e-01f), tmp7019);
tmp7034 = _mm512_fmadd_ps(tmp7025, _mm512_set1_ps(2.5e-01f), tmp7026);
tmp7018 = _mm512_fmadd_ps(tmp7018, _mm512_set1_ps(4e+00f), tmp7019);
tmp7025 = _mm512_fmadd_ps(tmp7025, _mm512_set1_ps(4e+00f), tmp7026);
__m512 tmp7032 = _mm512_sub_ps(tmp7031, tmp7029);
__m512 tmp7036 = _mm512_sub_ps(tmp7035, tmp7033);
tmp7031 = _mm512_add_ps(tmp7029, tmp7031);
tmp7035 = _mm512_add_ps(tmp7033, tmp7035);
tmp7029 = _mm512_fmadd_ps(tmp7017, _mm512_set1_ps(2.5e-01f), in1101);
tmp7033 = _mm512_fmadd_ps(tmp7024, _mm512_set1_ps(2.5e-01f), in1104);
tmp7030 = _mm512_fmadd_ps(tmp7016, _mm512_set1_ps(-1.25e+00f), tmp7030);
tmp7034 = _mm512_fmadd_ps(tmp7023, _mm512_set1_ps(-1.25e+00f), tmp7034);
tmp7016 = _mm512_fmadd_ps(tmp7016, _mm512_set1_ps(-5e+00f), tmp7018);
tmp7023 = _mm512_fmadd_ps(tmp7023, _mm512_set1_ps(-5e+00f), tmp7025);
tmp7029 = _mm512_fmadd_ps(tmp7020, _mm512_set1_ps(-1.25e+00f), tmp7029);
tmp7033 = _mm512_fmadd_ps(tmp7027, _mm512_set1_ps(-1.25e+00f), tmp7033);
tmp7019 = _mm512_fmadd_ps(tmp7029, _mm512_set1_ps(2e+00f), tmp7030);
tmp7026 = _mm512_fmadd_ps(tmp7033, _mm512_set1_ps(2e+00f), tmp7034);
tmp7030 = _mm512_fnmadd_ps(tmp7029, _mm512_set1_ps(2e+00f), tmp7030);
tmp7034 = _mm512_fnmadd_ps(tmp7033, _mm512_set1_ps(2e+00f), tmp7034);
tmp7029 = _mm512_fmadd_ps(in1101, _mm512_set1_ps(2.5e-01f), tmp7017);
tmp7033 = _mm512_fmadd_ps(in1104, _mm512_set1_ps(2.5e-01f), tmp7024);
tmp7017 = _mm512_sub_ps(tmp7021, tmp7017);
tmp7024 = _mm512_sub_ps(tmp7028, tmp7024);
tmp7029 = _mm512_fmadd_ps(tmp7020, _mm512_set1_ps(-1.25e+00f), tmp7029);
tmp7033 = _mm512_fmadd_ps(tmp7027, _mm512_set1_ps(-1.25e+00f), tmp7033);
tmp7020 = _mm512_sub_ps(tmp7020, in1101);
tmp7027 = _mm512_sub_ps(tmp7027, in1104);
tmp7020 = _mm512_fmadd_ps(tmp7020, _mm512_set1_ps(5.25e+00f), tmp7017);
tmp7027 = _mm512_fmadd_ps(tmp7027, _mm512_set1_ps(5.25e+00f), tmp7024);
tmp7018 = _mm512_fmadd_ps(tmp7029, _mm512_set1_ps(2e+00f), tmp7016);
tmp7025 = _mm512_fmadd_ps(tmp7033, _mm512_set1_ps(2e+00f), tmp7023);
tmp7016 = _mm512_fnmadd_ps(tmp7029, _mm512_set1_ps(2e+00f), tmp7016);
tmp7023 = _mm512_fnmadd_ps(tmp7033, _mm512_set1_ps(2e+00f), tmp7023);
// Stage 5: pack two consecutive outputs (y0|y1, y2|y3, y4|y5, y6|y7) into one vector.
// Selector 68 concatenates the lanes-0..7 halves of both operands; 238 the lanes-8..15 halves.
__m512 out1059 = _mm512_shuffle_f32x4(in1099, tmp7031, 68);
__m512 out1067 = _mm512_shuffle_f32x4(in1099, tmp7031, 238);
__m512 out1060 = _mm512_shuffle_f32x4(tmp7032, tmp7019, 68);
__m512 out1068 = _mm512_shuffle_f32x4(tmp7032, tmp7019, 238);
__m512 out1061 = _mm512_shuffle_f32x4(tmp7030, tmp7018, 68);
__m512 out1069 = _mm512_shuffle_f32x4(tmp7030, tmp7018, 238);
__m512 out1062 = _mm512_shuffle_f32x4(tmp7016, tmp7020, 68);
__m512 out1070 = _mm512_shuffle_f32x4(tmp7016, tmp7020, 238);
__m512 out1063 = _mm512_shuffle_f32x4(in1102, tmp7035, 68);
__m512 out1071 = _mm512_shuffle_f32x4(in1102, tmp7035, 238);
__m512 out1064 = _mm512_shuffle_f32x4(tmp7036, tmp7026, 68);
__m512 out1072 = _mm512_shuffle_f32x4(tmp7036, tmp7026, 238);
__m512 out1065 = _mm512_shuffle_f32x4(tmp7034, tmp7025, 68);
__m512 out1073 = _mm512_shuffle_f32x4(tmp7034, tmp7025, 238);
__m512 out1066 = _mm512_shuffle_f32x4(tmp7023, tmp7027, 68);
__m512 out1074 = _mm512_shuffle_f32x4(tmp7023, tmp7027, 238);
// Stage 6: store to dfPtr6. The y0|y1 vectors go to offsets 256, 320, 384, 448; the y2|y3, y4|y5
// and y6|y7 vectors go to the same four slots shifted by 409600, 819200 and 1228800 respectively.
_mm512_storeu_ps(dfPtr6+256+1638400*i26+24576*j21+24576*s20+768*k87, out1059);
_mm512_storeu_ps(dfPtr6+384+1638400*i26+24576*j21+24576*s20+768*k87, out1067);
_mm512_storeu_ps(dfPtr6+320+1638400*i26+24576*j21+24576*s20+768*k87, out1063);
_mm512_storeu_ps(dfPtr6+448+1638400*i26+24576*j21+24576*s20+768*k87, out1071);
_mm512_storeu_ps(dfPtr6+409856+1638400*i26+24576*j21+24576*s20+768*k87, out1060);
_mm512_storeu_ps(dfPtr6+409984+1638400*i26+24576*j21+24576*s20+768*k87, out1068);
_mm512_storeu_ps(dfPtr6+409920+1638400*i26+24576*j21+24576*s20+768*k87, out1064);
_mm512_storeu_ps(dfPtr6+410048+1638400*i26+24576*j21+24576*s20+768*k87, out1072);
_mm512_storeu_ps(dfPtr6+819456+1638400*i26+24576*j21+24576*s20+768*k87, out1061);
_mm512_storeu_ps(dfPtr6+819584+1638400*i26+24576*j21+24576*s20+768*k87, out1069);
_mm512_storeu_ps(dfPtr6+819520+1638400*i26+24576*j21+24576*s20+768*k87, out1065);
_mm512_storeu_ps(dfPtr6+819648+1638400*i26+24576*j21+24576*s20+768*k87, out1073);
_mm512_storeu_ps(dfPtr6+1229056+1638400*i26+24576*j21+24576*s20+768*k87, out1062);
_mm512_storeu_ps(dfPtr6+1229184+1638400*i26+24576*j21+24576*s20+768*k87, out1070);
_mm512_storeu_ps(dfPtr6+1229120+1638400*i26+24576*j21+24576*s20+768*k87, out1066);
_mm512_storeu_ps(dfPtr6+1229248+1638400*i26+24576*j21+24576*s20+768*k87, out1074);
__m512 dat1633 = _mm512_maskz_loadu_ps(16383, datPtr12+12656+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 dat1634 = _mm512_maskz_loadu_ps(16383, datPtr12+12704+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512i pm150 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1105 = _mm512_permutexvar_ps(pm150, dat1633);
__m512 in1108 = _mm512_permutexvar_ps(pm150, dat1634);
__m512 dat1635 = _mm512_maskz_loadu_ps(16383, datPtr12+12880+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 dat1636 = _mm512_maskz_loadu_ps(16383, datPtr12+12928+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 in1106 = _mm512_permutexvar_ps(pm150, dat1635);
__m512 in1109 = _mm512_permutexvar_ps(pm150, dat1636);
__m512 dat1637 = _mm512_maskz_loadu_ps(16383, datPtr12+13104+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 dat1638 = _mm512_maskz_loadu_ps(16383, datPtr12+13152+806912*i26+224*h36+4*w43+806912*s20+25216*k87);
__m512 in1107 = _mm512_permutexvar_ps(pm150, dat1637);
__m512 in1110 = _mm512_permutexvar_ps(pm150, dat1638);
__m512 tmp7085 = in1106;
__m512 tmp7092 = in1109;
__m512 tmp7086 = _mm512_sub_ps(_mm512_setzero_ps(), in1107);
__m512 tmp7093 = _mm512_sub_ps(_mm512_setzero_ps(), in1110);
__m512 tmp7087 = in1107;
__m512 tmp7094 = in1110;
in1105 = in1105;
in1108 = in1108;
tmp7085 = tmp7085;
tmp7092 = tmp7092;
tmp7087 = tmp7087;
tmp7094 = tmp7094;
in1105 = _mm512_fmadd_ps(tmp7086, _mm512_set1_ps(5.25e+00f), in1105);
in1108 = _mm512_fmadd_ps(tmp7093, _mm512_set1_ps(5.25e+00f), in1108);
tmp7086 = _mm512_mul_ps(in1107, _mm512_set1_ps(2.5e-01f));
tmp7093 = _mm512_mul_ps(in1110, _mm512_set1_ps(2.5e-01f));
in1107 = _mm512_mul_ps(in1107, _mm512_set1_ps(4e+00f));
in1110 = _mm512_mul_ps(in1110, _mm512_set1_ps(4e+00f));
__m512 tmp7088 = _mm512_sub_ps(tmp7087, tmp7085);
__m512 tmp7095 = _mm512_sub_ps(tmp7094, tmp7092);
tmp7087 = _mm512_add_ps(tmp7085, tmp7087);
tmp7094 = _mm512_add_ps(tmp7092, tmp7094);
tmp7085 = _mm512_mul_ps(in1106, _mm512_set1_ps(2.5e-01f));
tmp7092 = _mm512_mul_ps(in1109, _mm512_set1_ps(2.5e-01f));
tmp7086 = tmp7086;
tmp7093 = tmp7093;
__m512 tmp7089 = in1107;
__m512 tmp7096 = in1110;
tmp7085 = tmp7085;
tmp7092 = tmp7092;
__m512 tmp7090 = _mm512_fmadd_ps(tmp7085, _mm512_set1_ps(2e+00f), tmp7086);
__m512 tmp7097 = _mm512_fmadd_ps(tmp7092, _mm512_set1_ps(2e+00f), tmp7093);
tmp7086 = _mm512_fnmadd_ps(tmp7085, _mm512_set1_ps(2e+00f), tmp7086);
tmp7093 = _mm512_fnmadd_ps(tmp7092, _mm512_set1_ps(2e+00f), tmp7093);
tmp7085 = in1106;
tmp7092 = in1109;
in1106 = _mm512_sub_ps(_mm512_setzero_ps(), in1106);
in1109 = _mm512_sub_ps(_mm512_setzero_ps(), in1109);
tmp7085 = tmp7085;
tmp7092 = tmp7092;
__m512 tmp7091 = in1106;
__m512 tmp7098 = in1109;
in1107 = _mm512_fmadd_ps(tmp7085, _mm512_set1_ps(2e+00f), tmp7089);
in1110 = _mm512_fmadd_ps(tmp7092, _mm512_set1_ps(2e+00f), tmp7096);
tmp7089 = _mm512_fnmadd_ps(tmp7085, _mm512_set1_ps(2e+00f), tmp7089);
tmp7096 = _mm512_fnmadd_ps(tmp7092, _mm512_set1_ps(2e+00f), tmp7096);
__m512 tmp7107 = _mm512_unpacklo_ps(in1105, tmp7087);
__m512 tmp7108 = _mm512_unpackhi_ps(in1105, tmp7087);
__m512 tmp7109 = _mm512_unpacklo_ps(tmp7088, tmp7090);
__m512 tmp7110 = _mm512_unpackhi_ps(tmp7088, tmp7090);
__m512 tmp7111 = _mm512_unpacklo_ps(tmp7086, in1107);
__m512 tmp7112 = _mm512_unpackhi_ps(tmp7086, in1107);
__m512 tmp7113 = _mm512_unpacklo_ps(tmp7089, tmp7091);
__m512 tmp7114 = _mm512_unpackhi_ps(tmp7089, tmp7091);
__m512 tmp7115 = _mm512_unpacklo_ps(in1108, tmp7094);
__m512 tmp7116 = _mm512_unpackhi_ps(in1108, tmp7094);
__m512 tmp7117 = _mm512_unpacklo_ps(tmp7095, tmp7097);
__m512 tmp7118 = _mm512_unpackhi_ps(tmp7095, tmp7097);
__m512 tmp7119 = _mm512_unpacklo_ps(tmp7093, in1110);
__m512 tmp7120 = _mm512_unpackhi_ps(tmp7093, in1110);
__m512 tmp7121 = _mm512_unpacklo_ps(tmp7096, tmp7098);
__m512 tmp7122 = _mm512_unpackhi_ps(tmp7096, tmp7098);
__m512 tmp7123 = _mm512_shuffle_ps(tmp7107, tmp7109, 68);
__m512 tmp7124 = _mm512_shuffle_ps(tmp7107, tmp7109, 238);
__m512 tmp7125 = _mm512_shuffle_ps(tmp7108, tmp7110, 68);
__m512 tmp7126 = _mm512_shuffle_ps(tmp7108, tmp7110, 238);
__m512 tmp7127 = _mm512_shuffle_ps(tmp7111, tmp7113, 68);
__m512 tmp7128 = _mm512_shuffle_ps(tmp7111, tmp7113, 238);
__m512 tmp7129 = _mm512_shuffle_ps(tmp7112, tmp7114, 68);
__m512 tmp7130 = _mm512_shuffle_ps(tmp7112, tmp7114, 238);
__m512 tmp7131 = _mm512_shuffle_ps(tmp7115, tmp7117, 68);
__m512 tmp7132 = _mm512_shuffle_ps(tmp7115, tmp7117, 238);
__m512 tmp7133 = _mm512_shuffle_ps(tmp7116, tmp7118, 68);
__m512 tmp7134 = _mm512_shuffle_ps(tmp7116, tmp7118, 238);
__m512 tmp7135 = _mm512_shuffle_ps(tmp7119, tmp7121, 68);
__m512 tmp7136 = _mm512_shuffle_ps(tmp7119, tmp7121, 238);
__m512 tmp7137 = _mm512_shuffle_ps(tmp7120, tmp7122, 68);
__m512 tmp7138 = _mm512_shuffle_ps(tmp7120, tmp7122, 238);
__m512 tmp7139 = _mm512_shuffle_f32x4(tmp7123, tmp7127, 136);
__m512 tmp7140 = _mm512_shuffle_f32x4(tmp7123, tmp7127, 221);
__m512 tmp7141 = _mm512_shuffle_f32x4(tmp7124, tmp7128, 136);
__m512 tmp7142 = _mm512_shuffle_f32x4(tmp7124, tmp7128, 221);
__m512 tmp7143 = _mm512_shuffle_f32x4(tmp7125, tmp7129, 136);
__m512 tmp7144 = _mm512_shuffle_f32x4(tmp7125, tmp7129, 221);
__m512 tmp7145 = _mm512_shuffle_f32x4(tmp7126, tmp7130, 136);
__m512 tmp7146 = _mm512_shuffle_f32x4(tmp7126, tmp7130, 221);
__m512 tmp7147 = _mm512_shuffle_f32x4(tmp7131, tmp7135, 136);
__m512 tmp7148 = _mm512_shuffle_f32x4(tmp7131, tmp7135, 221);
__m512 tmp7149 = _mm512_shuffle_f32x4(tmp7132, tmp7136, 136);
__m512 tmp7150 = _mm512_shuffle_f32x4(tmp7132, tmp7136, 221);
__m512 tmp7151 = _mm512_shuffle_f32x4(tmp7133, tmp7137, 136);
__m512 tmp7152 = _mm512_shuffle_f32x4(tmp7133, tmp7137, 221);
__m512 tmp7153 = _mm512_shuffle_f32x4(tmp7134, tmp7138, 136);
__m512 tmp7154 = _mm512_shuffle_f32x4(tmp7134, tmp7138, 221);
in1105 = _mm512_shuffle_f32x4(tmp7139, tmp7147, 136);
in1108 = _mm512_shuffle_f32x4(tmp7139, tmp7147, 221);
tmp7087 = _mm512_shuffle_f32x4(tmp7141, tmp7149, 136);
tmp7094 = _mm512_shuffle_f32x4(tmp7141, tmp7149, 221);
tmp7088 = _mm512_shuffle_f32x4(tmp7143, tmp7151, 136);
tmp7095 = _mm512_shuffle_f32x4(tmp7143, tmp7151, 221);
tmp7090 = _mm512_shuffle_f32x4(tmp7145, tmp7153, 136);
tmp7097 = _mm512_shuffle_f32x4(tmp7145, tmp7153, 221);
tmp7086 = _mm512_shuffle_f32x4(tmp7140, tmp7148, 136);
tmp7093 = _mm512_shuffle_f32x4(tmp7140, tmp7148, 221);
in1107 = _mm512_shuffle_f32x4(tmp7142, tmp7150, 136);
in1110 = _mm512_shuffle_f32x4(tmp7142, tmp7150, 221);
tmp7089 = _mm512_shuffle_f32x4(tmp7144, tmp7152, 136);
tmp7096 = _mm512_shuffle_f32x4(tmp7144, tmp7152, 221);
tmp7091 = _mm512_shuffle_f32x4(tmp7146, tmp7154, 136);
tmp7098 = _mm512_shuffle_f32x4(tmp7146, tmp7154, 221);
__m512 tmp7099 = _mm512_add_ps(tmp7087, in1107);
__m512 tmp7103 = _mm512_add_ps(tmp7094, in1110);
__m512 tmp7100 = _mm512_sub_ps(tmp7086, tmp7088);
__m512 tmp7104 = _mm512_sub_ps(tmp7093, tmp7095);
__m512 tmp7101 = _mm512_add_ps(tmp7088, tmp7089);
__m512 tmp7105 = _mm512_add_ps(tmp7095, tmp7096);
in1105 = _mm512_sub_ps(in1105, tmp7089);
in1108 = _mm512_sub_ps(in1108, tmp7096);
tmp7099 = _mm512_fmadd_ps(tmp7090, _mm512_set1_ps(-4.25e+00f), tmp7099);
tmp7103 = _mm512_fmadd_ps(tmp7097, _mm512_set1_ps(-4.25e+00f), tmp7103);
tmp7101 = _mm512_fmadd_ps(tmp7086, _mm512_set1_ps(-4.25e+00f), tmp7101);
tmp7105 = _mm512_fmadd_ps(tmp7093, _mm512_set1_ps(-4.25e+00f), tmp7105);
in1105 = _mm512_fmadd_ps(tmp7100, _mm512_set1_ps(5.25e+00f), in1105);
in1108 = _mm512_fmadd_ps(tmp7104, _mm512_set1_ps(5.25e+00f), in1108);
tmp7100 = _mm512_fmadd_ps(tmp7088, _mm512_set1_ps(2.5e-01f), tmp7089);
tmp7104 = _mm512_fmadd_ps(tmp7095, _mm512_set1_ps(2.5e-01f), tmp7096);
tmp7088 = _mm512_fmadd_ps(tmp7088, _mm512_set1_ps(4e+00f), tmp7089);
tmp7095 = _mm512_fmadd_ps(tmp7095, _mm512_set1_ps(4e+00f), tmp7096);
__m512 tmp7102 = _mm512_sub_ps(tmp7101, tmp7099);
__m512 tmp7106 = _mm512_sub_ps(tmp7105, tmp7103);
tmp7101 = _mm512_add_ps(tmp7099, tmp7101);
tmp7105 = _mm512_add_ps(tmp7103, tmp7105);
tmp7099 = _mm512_fmadd_ps(tmp7087, _mm512_set1_ps(2.5e-01f), in1107);
tmp7103 = _mm512_fmadd_ps(tmp7094, _mm512_set1_ps(2.5e-01f), in1110);
tmp7100 = _mm512_fmadd_ps(tmp7086, _mm512_set1_ps(-1.25e+00f), tmp7100);
tmp7104 = _mm512_fmadd_ps(tmp7093, _mm512_set1_ps(-1.25e+00f), tmp7104);
tmp7086 = _mm512_fmadd_ps(tmp7086, _mm512_set1_ps(-5e+00f), tmp7088);
tmp7093 = _mm512_fmadd_ps(tmp7093, _mm512_set1_ps(-5e+00f), tmp7095);
tmp7099 = _mm512_fmadd_ps(tmp7090, _mm512_set1_ps(-1.25e+00f), tmp7099);
tmp7103 = _mm512_fmadd_ps(tmp7097, _mm512_set1_ps(-1.25e+00f), tmp7103);
tmp7089 = _mm512_fmadd_ps(tmp7099, _mm512_set1_ps(2e+00f), tmp7100);
tmp7096 = _mm512_fmadd_ps(tmp7103, _mm512_set1_ps(2e+00f), tmp7104);
tmp7100 = _mm512_fnmadd_ps(tmp7099, _mm512_set1_ps(2e+00f), tmp7100);
tmp7104 = _mm512_fnmadd_ps(tmp7103, _mm512_set1_ps(2e+00f), tmp7104);
tmp7099 = _mm512_fmadd_ps(in1107, _mm512_set1_ps(2.5e-01f), tmp7087);
tmp7103 = _mm512_fmadd_ps(in1110, _mm512_set1_ps(2.5e-01f), tmp7094);
tmp7087 = _mm512_sub_ps(tmp7091, tmp7087);
tmp7094 = _mm512_sub_ps(tmp7098, tmp7094);
tmp7099 = _mm512_fmadd_ps(tmp7090, _mm512_set1_ps(-1.25e+00f), tmp7099);
tmp7103 = _mm512_fmadd_ps(tmp7097, _mm512_set1_ps(-1.25e+00f), tmp7103);
tmp7090 = _mm512_sub_ps(tmp7090, in1107);
tmp7097 = _mm512_sub_ps(tmp7097, in1110);
tmp7090 = _mm512_fmadd_ps(tmp7090, _mm512_set1_ps(5.25e+00f), tmp7087);
tmp7097 = _mm512_fmadd_ps(tmp7097, _mm512_set1_ps(5.25e+00f), tmp7094);
tmp7088 = _mm512_fmadd_ps(tmp7099, _mm512_set1_ps(2e+00f), tmp7086);
tmp7095 = _mm512_fmadd_ps(tmp7103, _mm512_set1_ps(2e+00f), tmp7093);
tmp7086 = _mm512_fnmadd_ps(tmp7099, _mm512_set1_ps(2e+00f), tmp7086);
tmp7093 = _mm512_fnmadd_ps(tmp7103, _mm512_set1_ps(2e+00f), tmp7093);
__m512 out1075 = _mm512_shuffle_f32x4(in1105, tmp7101, 68);
__m512 out1083 = _mm512_shuffle_f32x4(in1105, tmp7101, 238);
__m512 out1076 = _mm512_shuffle_f32x4(tmp7102, tmp7089, 68);
__m512 out1084 = _mm512_shuffle_f32x4(tmp7102, tmp7089, 238);
__m512 out1077 = _mm512_shuffle_f32x4(tmp7100, tmp7088, 68);
__m512 out1085 = _mm512_shuffle_f32x4(tmp7100, tmp7088, 238);
__m512 out1078 = _mm512_shuffle_f32x4(tmp7086, tmp7090, 68);
__m512 out1086 = _mm512_shuffle_f32x4(tmp7086, tmp7090, 238);
__m512 out1079 = _mm512_shuffle_f32x4(in1108, tmp7105, 68);
__m512 out1087 = _mm512_shuffle_f32x4(in1108, tmp7105, 238);
__m512 out1080 = _mm512_shuffle_f32x4(tmp7106, tmp7096, 68);
__m512 out1088 = _mm512_shuffle_f32x4(tmp7106, tmp7096, 238);
__m512 out1081 = _mm512_shuffle_f32x4(tmp7104, tmp7095, 68);
__m512 out1089 = _mm512_shuffle_f32x4(tmp7104, tmp7095, 238);
__m512 out1082 = _mm512_shuffle_f32x4(tmp7093, tmp7097, 68);
__m512 out1090 = _mm512_shuffle_f32x4(tmp7093, tmp7097, 238);
_mm512_storeu_ps(dfPtr6+512+1638400*i26+24576*j21+24576*s20+768*k87, out1075);
_mm512_storeu_ps(dfPtr6+640+1638400*i26+24576*j21+24576*s20+768*k87, out1083);
_mm512_storeu_ps(dfPtr6+576+1638400*i26+24576*j21+24576*s20+768*k87, out1079);
_mm512_storeu_ps(dfPtr6+704+1638400*i26+24576*j21+24576*s20+768*k87, out1087);
_mm512_storeu_ps(dfPtr6+410112+1638400*i26+24576*j21+24576*s20+768*k87, out1076);
_mm512_storeu_ps(dfPtr6+410240+1638400*i26+24576*j21+24576*s20+768*k87, out1084);
_mm512_storeu_ps(dfPtr6+410176+1638400*i26+24576*j21+24576*s20+768*k87, out1080);
_mm512_storeu_ps(dfPtr6+410304+1638400*i26+24576*j21+24576*s20+768*k87, out1088);
_mm512_storeu_ps(dfPtr6+819712+1638400*i26+24576*j21+24576*s20+768*k87, out1077);
_mm512_storeu_ps(dfPtr6+819840+1638400*i26+24576*j21+24576*s20+768*k87, out1085);
_mm512_storeu_ps(dfPtr6+819776+1638400*i26+24576*j21+24576*s20+768*k87, out1081);
_mm512_storeu_ps(dfPtr6+819904+1638400*i26+24576*j21+24576*s20+768*k87, out1089);
_mm512_storeu_ps(dfPtr6+1229312+1638400*i26+24576*j21+24576*s20+768*k87, out1078);
_mm512_storeu_ps(dfPtr6+1229440+1638400*i26+24576*j21+24576*s20+768*k87, out1086);
_mm512_storeu_ps(dfPtr6+1229376+1638400*i26+24576*j21+24576*s20+768*k87, out1082);
_mm512_storeu_ps(dfPtr6+1229504+1638400*i26+24576*j21+24576*s20+768*k87, out1090);
}
if (j21 >= last5) return;
++j21;
rel15 = 1;
}
ptrdiff_t h37 = base15+0;
ptrdiff_t w44 = 36;
ptrdiff_t k88 = 0;
for (; k88 != 64; ++k88) {
__m512 dat1639 = _mm512_maskz_loadu_ps(16383, datPtr12+0+806912*i26+224*h37+4*w44+806912*s20+12608*k88);
__m512 dat1640 = _mm512_maskz_loadu_ps(511, datPtr12+48+806912*i26+224*h37+4*w44+806912*s20+12608*k88);
__m512i pm151 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1111 = _mm512_permutexvar_ps(pm151, dat1639);
__m512i pm152 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1114 = _mm512_permutexvar_ps(pm152, dat1640);
__m512 dat1641 = _mm512_maskz_loadu_ps(16383, datPtr12+224+806912*i26+224*h37+4*w44+806912*s20+12608*k88);
__m512 dat1642 = _mm512_maskz_loadu_ps(511, datPtr12+272+806912*i26+224*h37+4*w44+806912*s20+12608*k88);
__m512 in1112 = _mm512_permutexvar_ps(pm151, dat1641);
__m512 in1115 = _mm512_permutexvar_ps(pm152, dat1642);
__m512 dat1643 = _mm512_maskz_loadu_ps(16383, datPtr12+448+806912*i26+224*h37+4*w44+806912*s20+12608*k88);
__m512 dat1644 = _mm512_maskz_loadu_ps(511, datPtr12+496+806912*i26+224*h37+4*w44+806912*s20+12608*k88);
__m512 in1113 = _mm512_permutexvar_ps(pm151, dat1643);
__m512 in1116 = _mm512_permutexvar_ps(pm152, dat1644);
__m512 tmp7155 = in1112;
__m512 tmp7162 = in1115;
__m512 tmp7156 = _mm512_sub_ps(_mm512_setzero_ps(), in1113);
__m512 tmp7163 = _mm512_sub_ps(_mm512_setzero_ps(), in1116);
__m512 tmp7157 = in1113;
__m512 tmp7164 = in1116;
in1111 = in1111;
in1114 = in1114;
tmp7155 = tmp7155;
tmp7162 = tmp7162;
tmp7157 = tmp7157;
tmp7164 = tmp7164;
in1111 = _mm512_fmadd_ps(tmp7156, _mm512_set1_ps(5.25e+00f), in1111);
in1114 = _mm512_fmadd_ps(tmp7163, _mm512_set1_ps(5.25e+00f), in1114);
tmp7156 = _mm512_mul_ps(in1113, _mm512_set1_ps(2.5e-01f));
tmp7163 = _mm512_mul_ps(in1116, _mm512_set1_ps(2.5e-01f));
in1113 = _mm512_mul_ps(in1113, _mm512_set1_ps(4e+00f));
in1116 = _mm512_mul_ps(in1116, _mm512_set1_ps(4e+00f));
__m512 tmp7158 = _mm512_sub_ps(tmp7157, tmp7155);
__m512 tmp7165 = _mm512_sub_ps(tmp7164, tmp7162);
tmp7157 = _mm512_add_ps(tmp7155, tmp7157);
tmp7164 = _mm512_add_ps(tmp7162, tmp7164);
tmp7155 = _mm512_mul_ps(in1112, _mm512_set1_ps(2.5e-01f));
tmp7162 = _mm512_mul_ps(in1115, _mm512_set1_ps(2.5e-01f));
tmp7156 = tmp7156;
tmp7163 = tmp7163;
__m512 tmp7159 = in1113;
__m512 tmp7166 = in1116;
tmp7155 = tmp7155;
tmp7162 = tmp7162;
__m512 tmp7160 = _mm512_fmadd_ps(tmp7155, _mm512_set1_ps(2e+00f), tmp7156);
__m512 tmp7167 = _mm512_fmadd_ps(tmp7162, _mm512_set1_ps(2e+00f), tmp7163);
tmp7156 = _mm512_fnmadd_ps(tmp7155, _mm512_set1_ps(2e+00f), tmp7156);
tmp7163 = _mm512_fnmadd_ps(tmp7162, _mm512_set1_ps(2e+00f), tmp7163);
tmp7155 = in1112;
tmp7162 = in1115;
in1112 = _mm512_sub_ps(_mm512_setzero_ps(), in1112);
in1115 = _mm512_sub_ps(_mm512_setzero_ps(), in1115);
tmp7155 = tmp7155;
tmp7162 = tmp7162;
__m512 tmp7161 = in1112;
__m512 tmp7168 = in1115;
in1113 = _mm512_fmadd_ps(tmp7155, _mm512_set1_ps(2e+00f), tmp7159);
in1116 = _mm512_fmadd_ps(tmp7162, _mm512_set1_ps(2e+00f), tmp7166);
tmp7159 = _mm512_fnmadd_ps(tmp7155, _mm512_set1_ps(2e+00f), tmp7159);
tmp7166 = _mm512_fnmadd_ps(tmp7162, _mm512_set1_ps(2e+00f), tmp7166);
__m512 tmp7177 = _mm512_unpacklo_ps(in1111, tmp7157);
__m512 tmp7178 = _mm512_unpackhi_ps(in1111, tmp7157);
__m512 tmp7179 = _mm512_unpacklo_ps(tmp7158, tmp7160);
__m512 tmp7180 = _mm512_unpackhi_ps(tmp7158, tmp7160);
__m512 tmp7181 = _mm512_unpacklo_ps(tmp7156, in1113);
__m512 tmp7182 = _mm512_unpackhi_ps(tmp7156, in1113);
__m512 tmp7183 = _mm512_unpacklo_ps(tmp7159, tmp7161);
__m512 tmp7184 = _mm512_unpackhi_ps(tmp7159, tmp7161);
__m512 tmp7185 = _mm512_unpacklo_ps(in1114, tmp7164);
__m512 tmp7186 = _mm512_unpackhi_ps(in1114, tmp7164);
__m512 tmp7187 = _mm512_unpacklo_ps(tmp7165, tmp7167);
__m512 tmp7188 = _mm512_unpackhi_ps(tmp7165, tmp7167);
__m512 tmp7189 = _mm512_unpacklo_ps(tmp7163, in1116);
__m512 tmp7190 = _mm512_unpackhi_ps(tmp7163, in1116);
__m512 tmp7191 = _mm512_unpacklo_ps(tmp7166, tmp7168);
__m512 tmp7192 = _mm512_unpackhi_ps(tmp7166, tmp7168);
__m512 tmp7193 = _mm512_shuffle_ps(tmp7177, tmp7179, 68);
__m512 tmp7194 = _mm512_shuffle_ps(tmp7177, tmp7179, 238);
__m512 tmp7195 = _mm512_shuffle_ps(tmp7178, tmp7180, 68);
__m512 tmp7196 = _mm512_shuffle_ps(tmp7178, tmp7180, 238);
__m512 tmp7197 = _mm512_shuffle_ps(tmp7181, tmp7183, 68);
__m512 tmp7198 = _mm512_shuffle_ps(tmp7181, tmp7183, 238);
__m512 tmp7199 = _mm512_shuffle_ps(tmp7182, tmp7184, 68);
__m512 tmp7200 = _mm512_shuffle_ps(tmp7182, tmp7184, 238);
__m512 tmp7201 = _mm512_shuffle_ps(tmp7185, tmp7187, 68);
__m512 tmp7202 = _mm512_shuffle_ps(tmp7185, tmp7187, 238);
__m512 tmp7203 = _mm512_shuffle_ps(tmp7186, tmp7188, 68);
__m512 tmp7204 = _mm512_shuffle_ps(tmp7186, tmp7188, 238);
__m512 tmp7205 = _mm512_shuffle_ps(tmp7189, tmp7191, 68);
__m512 tmp7206 = _mm512_shuffle_ps(tmp7189, tmp7191, 238);
__m512 tmp7207 = _mm512_shuffle_ps(tmp7190, tmp7192, 68);
__m512 tmp7208 = _mm512_shuffle_ps(tmp7190, tmp7192, 238);
__m512 tmp7209 = _mm512_shuffle_f32x4(tmp7193, tmp7197, 136);
__m512 tmp7210 = _mm512_shuffle_f32x4(tmp7193, tmp7197, 221);
__m512 tmp7211 = _mm512_shuffle_f32x4(tmp7194, tmp7198, 136);
__m512 tmp7212 = _mm512_shuffle_f32x4(tmp7194, tmp7198, 221);
__m512 tmp7213 = _mm512_shuffle_f32x4(tmp7195, tmp7199, 136);
__m512 tmp7214 = _mm512_shuffle_f32x4(tmp7195, tmp7199, 221);
__m512 tmp7215 = _mm512_shuffle_f32x4(tmp7196, tmp7200, 136);
__m512 tmp7216 = _mm512_shuffle_f32x4(tmp7196, tmp7200, 221);
__m512 tmp7217 = _mm512_shuffle_f32x4(tmp7201, tmp7205, 136);
__m512 tmp7218 = _mm512_shuffle_f32x4(tmp7201, tmp7205, 221);
__m512 tmp7219 = _mm512_shuffle_f32x4(tmp7202, tmp7206, 136);
__m512 tmp7220 = _mm512_shuffle_f32x4(tmp7202, tmp7206, 221);
__m512 tmp7221 = _mm512_shuffle_f32x4(tmp7203, tmp7207, 136);
__m512 tmp7222 = _mm512_shuffle_f32x4(tmp7203, tmp7207, 221);
__m512 tmp7223 = _mm512_shuffle_f32x4(tmp7204, tmp7208, 136);
__m512 tmp7224 = _mm512_shuffle_f32x4(tmp7204, tmp7208, 221);
in1111 = _mm512_shuffle_f32x4(tmp7209, tmp7217, 136);
in1114 = _mm512_shuffle_f32x4(tmp7209, tmp7217, 221);
tmp7157 = _mm512_shuffle_f32x4(tmp7211, tmp7219, 136);
tmp7164 = _mm512_shuffle_f32x4(tmp7211, tmp7219, 221);
tmp7158 = _mm512_shuffle_f32x4(tmp7213, tmp7221, 136);
tmp7165 = _mm512_shuffle_f32x4(tmp7213, tmp7221, 221);
tmp7160 = _mm512_shuffle_f32x4(tmp7215, tmp7223, 136);
tmp7167 = _mm512_shuffle_f32x4(tmp7215, tmp7223, 221);
tmp7156 = _mm512_shuffle_f32x4(tmp7210, tmp7218, 136);
tmp7163 = _mm512_shuffle_f32x4(tmp7210, tmp7218, 221);
in1113 = _mm512_shuffle_f32x4(tmp7212, tmp7220, 136);
in1116 = _mm512_shuffle_f32x4(tmp7212, tmp7220, 221);
tmp7159 = _mm512_shuffle_f32x4(tmp7214, tmp7222, 136);
tmp7166 = _mm512_shuffle_f32x4(tmp7214, tmp7222, 221);
tmp7161 = _mm512_shuffle_f32x4(tmp7216, tmp7224, 136);
tmp7168 = _mm512_shuffle_f32x4(tmp7216, tmp7224, 221);
__m512 tmp7169 = _mm512_add_ps(tmp7157, in1113);
__m512 tmp7173 = _mm512_add_ps(tmp7164, in1116);
__m512 tmp7170 = _mm512_sub_ps(tmp7156, tmp7158);
__m512 tmp7174 = _mm512_sub_ps(tmp7163, tmp7165);
__m512 tmp7171 = _mm512_add_ps(tmp7158, tmp7159);
__m512 tmp7175 = _mm512_add_ps(tmp7165, tmp7166);
in1111 = _mm512_sub_ps(in1111, tmp7159);
in1114 = _mm512_sub_ps(in1114, tmp7166);
tmp7169 = _mm512_fmadd_ps(tmp7160, _mm512_set1_ps(-4.25e+00f), tmp7169);
tmp7173 = _mm512_fmadd_ps(tmp7167, _mm512_set1_ps(-4.25e+00f), tmp7173);
tmp7171 = _mm512_fmadd_ps(tmp7156, _mm512_set1_ps(-4.25e+00f), tmp7171);
tmp7175 = _mm512_fmadd_ps(tmp7163, _mm512_set1_ps(-4.25e+00f), tmp7175);
in1111 = _mm512_fmadd_ps(tmp7170, _mm512_set1_ps(5.25e+00f), in1111);
in1114 = _mm512_fmadd_ps(tmp7174, _mm512_set1_ps(5.25e+00f), in1114);
tmp7170 = _mm512_fmadd_ps(tmp7158, _mm512_set1_ps(2.5e-01f), tmp7159);
tmp7174 = _mm512_fmadd_ps(tmp7165, _mm512_set1_ps(2.5e-01f), tmp7166);
tmp7158 = _mm512_fmadd_ps(tmp7158, _mm512_set1_ps(4e+00f), tmp7159);
tmp7165 = _mm512_fmadd_ps(tmp7165, _mm512_set1_ps(4e+00f), tmp7166);
__m512 tmp7172 = _mm512_sub_ps(tmp7171, tmp7169);
__m512 tmp7176 = _mm512_sub_ps(tmp7175, tmp7173);
tmp7171 = _mm512_add_ps(tmp7169, tmp7171);
tmp7175 = _mm512_add_ps(tmp7173, tmp7175);
tmp7169 = _mm512_fmadd_ps(tmp7157, _mm512_set1_ps(2.5e-01f), in1113);
tmp7173 = _mm512_fmadd_ps(tmp7164, _mm512_set1_ps(2.5e-01f), in1116);
tmp7170 = _mm512_fmadd_ps(tmp7156, _mm512_set1_ps(-1.25e+00f), tmp7170);
tmp7174 = _mm512_fmadd_ps(tmp7163, _mm512_set1_ps(-1.25e+00f), tmp7174);
tmp7156 = _mm512_fmadd_ps(tmp7156, _mm512_set1_ps(-5e+00f), tmp7158);
tmp7163 = _mm512_fmadd_ps(tmp7163, _mm512_set1_ps(-5e+00f), tmp7165);
tmp7169 = _mm512_fmadd_ps(tmp7160, _mm512_set1_ps(-1.25e+00f), tmp7169);
tmp7173 = _mm512_fmadd_ps(tmp7167, _mm512_set1_ps(-1.25e+00f), tmp7173);
tmp7159 = _mm512_fmadd_ps(tmp7169, _mm512_set1_ps(2e+00f), tmp7170);
tmp7166 = _mm512_fmadd_ps(tmp7173, _mm512_set1_ps(2e+00f), tmp7174);
tmp7170 = _mm512_fnmadd_ps(tmp7169, _mm512_set1_ps(2e+00f), tmp7170);
tmp7174 = _mm512_fnmadd_ps(tmp7173, _mm512_set1_ps(2e+00f), tmp7174);
tmp7169 = _mm512_fmadd_ps(in1113, _mm512_set1_ps(2.5e-01f), tmp7157);
tmp7173 = _mm512_fmadd_ps(in1116, _mm512_set1_ps(2.5e-01f), tmp7164);
tmp7157 = _mm512_sub_ps(tmp7161, tmp7157);
tmp7164 = _mm512_sub_ps(tmp7168, tmp7164);
tmp7169 = _mm512_fmadd_ps(tmp7160, _mm512_set1_ps(-1.25e+00f), tmp7169);
tmp7173 = _mm512_fmadd_ps(tmp7167, _mm512_set1_ps(-1.25e+00f), tmp7173);
tmp7160 = _mm512_sub_ps(tmp7160, in1113);
tmp7167 = _mm512_sub_ps(tmp7167, in1116);
tmp7160 = _mm512_fmadd_ps(tmp7160, _mm512_set1_ps(5.25e+00f), tmp7157);
tmp7167 = _mm512_fmadd_ps(tmp7167, _mm512_set1_ps(5.25e+00f), tmp7164);
tmp7158 = _mm512_fmadd_ps(tmp7169, _mm512_set1_ps(2e+00f), tmp7156);
tmp7165 = _mm512_fmadd_ps(tmp7173, _mm512_set1_ps(2e+00f), tmp7163);
tmp7156 = _mm512_fnmadd_ps(tmp7169, _mm512_set1_ps(2e+00f), tmp7156);
tmp7163 = _mm512_fnmadd_ps(tmp7173, _mm512_set1_ps(2e+00f), tmp7163);
__m512 out1091 = _mm512_shuffle_f32x4(in1111, tmp7171, 68);
__m512 out1099 = _mm512_shuffle_f32x4(in1111, tmp7171, 238);
__m512 out1092 = _mm512_shuffle_f32x4(tmp7172, tmp7159, 68);
__m512 out1100 = _mm512_shuffle_f32x4(tmp7172, tmp7159, 238);
__m512 out1093 = _mm512_shuffle_f32x4(tmp7170, tmp7158, 68);
__m512 out1101 = _mm512_shuffle_f32x4(tmp7170, tmp7158, 238);
__m512 out1094 = _mm512_shuffle_f32x4(tmp7156, tmp7160, 68);
__m512 out1102 = _mm512_shuffle_f32x4(tmp7156, tmp7160, 238);
__m512 out1095 = _mm512_shuffle_f32x4(in1114, tmp7175, 68);
__m512 out1103 = _mm512_shuffle_f32x4(in1114, tmp7175, 238);
__m512 out1096 = _mm512_shuffle_f32x4(tmp7176, tmp7166, 68);
__m512 out1104 = _mm512_shuffle_f32x4(tmp7176, tmp7166, 238);
__m512 out1097 = _mm512_shuffle_f32x4(tmp7174, tmp7165, 68);
__m512 out1105 = _mm512_shuffle_f32x4(tmp7174, tmp7165, 238);
__m512 out1098 = _mm512_shuffle_f32x4(tmp7163, tmp7167, 68);
__m512 out1106 = _mm512_shuffle_f32x4(tmp7163, tmp7167, 238);
_mm512_storeu_ps(dfPtr6+0+1638400*i26+24576*j21+16384*s20+256*k88, out1091);
_mm512_storeu_ps(dfPtr6+128+1638400*i26+24576*j21+16384*s20+256*k88, out1099);
_mm512_storeu_ps(dfPtr6+64+1638400*i26+24576*j21+16384*s20+256*k88, out1095);
_mm512_storeu_ps(dfPtr6+192+1638400*i26+24576*j21+16384*s20+256*k88, out1103);
_mm512_storeu_ps(dfPtr6+409600+1638400*i26+24576*j21+16384*s20+256*k88, out1092);
_mm512_storeu_ps(dfPtr6+409728+1638400*i26+24576*j21+16384*s20+256*k88, out1100);
_mm512_storeu_ps(dfPtr6+409664+1638400*i26+24576*j21+16384*s20+256*k88, out1096);
_mm512_storeu_ps(dfPtr6+409792+1638400*i26+24576*j21+16384*s20+256*k88, out1104);
_mm512_storeu_ps(dfPtr6+819200+1638400*i26+24576*j21+16384*s20+256*k88, out1093);
_mm512_storeu_ps(dfPtr6+819328+1638400*i26+24576*j21+16384*s20+256*k88, out1101);
_mm512_storeu_ps(dfPtr6+819264+1638400*i26+24576*j21+16384*s20+256*k88, out1097);
_mm512_storeu_ps(dfPtr6+819392+1638400*i26+24576*j21+16384*s20+256*k88, out1105);
_mm512_storeu_ps(dfPtr6+1228800+1638400*i26+24576*j21+16384*s20+256*k88, out1094);
_mm512_storeu_ps(dfPtr6+1228928+1638400*i26+24576*j21+16384*s20+256*k88, out1102);
_mm512_storeu_ps(dfPtr6+1228864+1638400*i26+24576*j21+16384*s20+256*k88, out1098);
_mm512_storeu_ps(dfPtr6+1228992+1638400*i26+24576*j21+16384*s20+256*k88, out1106);
}
if (j21 >= last5) return;
++j21;
}

// Builds a 4-dimensional task whose hull is 1 x 8 x 1 x 1, points it at
// ResNet50ThreeArrangeDats2Callee1 with the tensor table as its opaque
// argument, and hands it to ResNet50ThreaderDo1 on the given team.
static void ResNet50ThreeArrangeDats2(ResNet50ThreaderTeam1* team33, char** tensors39) {
	// Extent of each hull dimension, in index order.
	static const int extents[4] = {1, 8, 1, 1};
	ResNet50ThreaderTask1 job;
	job.nd1 = 4;
	for (int dim = 0; dim < 4; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = tensors39;
	job.callee1 = ResNet50ThreeArrangeDats2Callee1;
	ResNet50ThreaderDo1(team33, &job);
}

// Task callee that multiplies half-precision weights (widened to float32)
// against float32 data vectors and writes the float32 sums.
//
// task44->any1 is read as a pair whose first entry is a table of buffers:
//   table[0]  - 256 bytes read as floats (presumably biases -- TODO confirm),
//               followed by the packed half-precision weights
//   table[1]  - data vectors (read only)
//   table[2]  - destination for the sums (written only)
// pt27 supplies the task coordinates:
//   pt27[0]   - selects which run of eight weight rows to process (row = 8*pt27[0] ...)
//   pt27[1]   - data column index; any value other than 16 takes the
//               six-vector-wide path, 16 takes the four-vector-wide path
//   pt27[2]   - slab index applied to the weight, data and sum buffers
//
// Every (row, column) pair runs 64 accumulation steps. When pt27[2] is zero
// the accumulators start with a float from table[0] placed in lane 9 only
// (mask 512); otherwise they start at zero.
static void ResNet50ThreeProduceSums2Callee1(ResNet50ThreaderTask1* task44, int64_t* pt27) {
	void** argPair = task44->any1;
	char** table = argPair[0];
	// Both of these are fixed at zero in this instance of the kernel.
	const ptrdiff_t epoch = 0;
	const ptrdiff_t group = 0;
	const ptrdiff_t slab = pt27[2];
	const ptrdiff_t column = pt27[1];
	const ptrdiff_t half = pt27[0];
	char*restrict biasBase = table[0]+256*epoch;
	char*restrict weightBase = table[0]+256+3244032*epoch;
	char*restrict dataBase = table[1]+10137600*epoch;
	char*restrict sumBase = table[2];
	const ptrdiff_t i = 1*group;
	const ptrdiff_t j = 1*slab;
	const ptrdiff_t k = 1*column;
	// Byte offsets shared by every row handled in this call.
	const ptrdiff_t tileOff = 1638400*i+409600*j+24576*k;
	const ptrdiff_t weightSlabOff = 524288*i+131072*j;
	const ptrdiff_t biasGroupOff = 256*i;
	// Rows firstRow..lastRow (eight of them) belong to this task.
	const ptrdiff_t firstRow = 8*half;
	const ptrdiff_t lastRow = firstRow+7;

	if (k != 16) {
		// Wide path: six data vectors per step, 4 x 6 accumulators per row.
		for (ptrdiff_t row = firstRow; row != 16; ++row) {
			const ptrdiff_t weightOff = weightSlabOff+8192*row;
			__m512 seedA = _mm512_setzero_ps();
			__m512 seedB = _mm512_setzero_ps();
			__m512 seedC = _mm512_setzero_ps();
			__m512 seedD = _mm512_setzero_ps();
			if (__builtin_expect(!j, 0)) {
				// Only slab zero injects the four per-row floats, in lane 9.
				char*restrict biasAt = biasBase+biasGroupOff+16*row;
				seedA = _mm512_mask_mov_ps(seedA, 512, _mm512_set1_ps(*(float*)(biasAt+0)));
				seedB = _mm512_mask_mov_ps(seedB, 512, _mm512_set1_ps(*(float*)(biasAt+4)));
				seedC = _mm512_mask_mov_ps(seedC, 512, _mm512_set1_ps(*(float*)(biasAt+8)));
				seedD = _mm512_mask_mov_ps(seedD, 512, _mm512_set1_ps(*(float*)(biasAt+12)));
			}
			__m512 a0 = seedA, a1 = seedA, a2 = seedA, a3 = seedA, a4 = seedA, a5 = seedA;
			__m512 b0 = seedB, b1 = seedB, b2 = seedB, b3 = seedB, b4 = seedB, b5 = seedB;
			__m512 c0 = seedC, c1 = seedC, c2 = seedC, c3 = seedC, c4 = seedC, c5 = seedC;
			__m512 d0 = seedD, d1 = seedD, d2 = seedD, d3 = seedD, d4 = seedD, d5 = seedD;
			for (ptrdiff_t step = 0; step != 64; ++step) {
				// Each 64-byte weight load holds two 16-lane half-precision rows.
				const __m512i packedAB = _mm512_maskz_loadu_epi32(65535, weightBase+0+weightOff+128*step);
				const __m512i packedCD = _mm512_maskz_loadu_epi32(65535, weightBase+64+weightOff+128*step);
				const __m512 wA = _mm512_cvtph_ps(_mm512_castsi512_si256(packedAB));
				const __m512 wB = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedAB, 1));
				const __m512 wC = _mm512_cvtph_ps(_mm512_castsi512_si256(packedCD));
				const __m512 wD = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedCD, 1));
				const ptrdiff_t dataOff = tileOff+384*step;
				const __m512 x0 = _mm512_loadu_ps(dataBase+0+dataOff);
				const __m512 x1 = _mm512_loadu_ps(dataBase+64+dataOff);
				const __m512 x2 = _mm512_loadu_ps(dataBase+128+dataOff);
				const __m512 x3 = _mm512_loadu_ps(dataBase+192+dataOff);
				const __m512 x4 = _mm512_loadu_ps(dataBase+256+dataOff);
				const __m512 x5 = _mm512_loadu_ps(dataBase+320+dataOff);
				a0 = _mm512_fmadd_ps(wA, x0, a0);
				a1 = _mm512_fmadd_ps(wA, x1, a1);
				a2 = _mm512_fmadd_ps(wA, x2, a2);
				a3 = _mm512_fmadd_ps(wA, x3, a3);
				a4 = _mm512_fmadd_ps(wA, x4, a4);
				a5 = _mm512_fmadd_ps(wA, x5, a5);
				b0 = _mm512_fmadd_ps(wB, x0, b0);
				b1 = _mm512_fmadd_ps(wB, x1, b1);
				b2 = _mm512_fmadd_ps(wB, x2, b2);
				b3 = _mm512_fmadd_ps(wB, x3, b3);
				b4 = _mm512_fmadd_ps(wB, x4, b4);
				b5 = _mm512_fmadd_ps(wB, x5, b5);
				c0 = _mm512_fmadd_ps(wC, x0, c0);
				c1 = _mm512_fmadd_ps(wC, x1, c1);
				c2 = _mm512_fmadd_ps(wC, x2, c2);
				c3 = _mm512_fmadd_ps(wC, x3, c3);
				c4 = _mm512_fmadd_ps(wC, x4, c4);
				c5 = _mm512_fmadd_ps(wC, x5, c5);
				d0 = _mm512_fmadd_ps(wD, x0, d0);
				d1 = _mm512_fmadd_ps(wD, x1, d1);
				d2 = _mm512_fmadd_ps(wD, x2, d2);
				d3 = _mm512_fmadd_ps(wD, x3, d3);
				d4 = _mm512_fmadd_ps(wD, x4, d4);
				d5 = _mm512_fmadd_ps(wD, x5, d5);
			}
			// 24 vectors per row, 1536 bytes apart from the next row.
			const ptrdiff_t sumOff = tileOff+1536*row;
			_mm512_storeu_ps(sumBase+0+sumOff, a0);
			_mm512_storeu_ps(sumBase+64+sumOff, a1);
			_mm512_storeu_ps(sumBase+128+sumOff, a2);
			_mm512_storeu_ps(sumBase+192+sumOff, a3);
			_mm512_storeu_ps(sumBase+256+sumOff, a4);
			_mm512_storeu_ps(sumBase+320+sumOff, a5);
			_mm512_storeu_ps(sumBase+384+sumOff, b0);
			_mm512_storeu_ps(sumBase+448+sumOff, b1);
			_mm512_storeu_ps(sumBase+512+sumOff, b2);
			_mm512_storeu_ps(sumBase+576+sumOff, b3);
			_mm512_storeu_ps(sumBase+640+sumOff, b4);
			_mm512_storeu_ps(sumBase+704+sumOff, b5);
			_mm512_storeu_ps(sumBase+768+sumOff, c0);
			_mm512_storeu_ps(sumBase+832+sumOff, c1);
			_mm512_storeu_ps(sumBase+896+sumOff, c2);
			_mm512_storeu_ps(sumBase+960+sumOff, c3);
			_mm512_storeu_ps(sumBase+1024+sumOff, c4);
			_mm512_storeu_ps(sumBase+1088+sumOff, c5);
			_mm512_storeu_ps(sumBase+1152+sumOff, d0);
			_mm512_storeu_ps(sumBase+1216+sumOff, d1);
			_mm512_storeu_ps(sumBase+1280+sumOff, d2);
			_mm512_storeu_ps(sumBase+1344+sumOff, d3);
			_mm512_storeu_ps(sumBase+1408+sumOff, d4);
			_mm512_storeu_ps(sumBase+1472+sumOff, d5);
			if (row >= lastRow) break;
		}
		// A wide-path task handles exactly one data column.
		return;
	}

	// Narrow path (k == 16): four data vectors per step, 4 x 4 accumulators per row.
	for (ptrdiff_t row = firstRow; row != 16; ++row) {
		const ptrdiff_t weightOff = weightSlabOff+8192*row;
		__m512 seedA = _mm512_setzero_ps();
		__m512 seedB = _mm512_setzero_ps();
		__m512 seedC = _mm512_setzero_ps();
		__m512 seedD = _mm512_setzero_ps();
		if (__builtin_expect(!j, 0)) {
			char*restrict biasAt = biasBase+biasGroupOff+16*row;
			seedA = _mm512_mask_mov_ps(seedA, 512, _mm512_set1_ps(*(float*)(biasAt+0)));
			seedB = _mm512_mask_mov_ps(seedB, 512, _mm512_set1_ps(*(float*)(biasAt+4)));
			seedC = _mm512_mask_mov_ps(seedC, 512, _mm512_set1_ps(*(float*)(biasAt+8)));
			seedD = _mm512_mask_mov_ps(seedD, 512, _mm512_set1_ps(*(float*)(biasAt+12)));
		}
		__m512 a0 = seedA, a1 = seedA, a2 = seedA, a3 = seedA;
		__m512 b0 = seedB, b1 = seedB, b2 = seedB, b3 = seedB;
		__m512 c0 = seedC, c1 = seedC, c2 = seedC, c3 = seedC;
		__m512 d0 = seedD, d1 = seedD, d2 = seedD, d3 = seedD;
		for (ptrdiff_t step = 0; step != 64; ++step) {
			const __m512i packedAB = _mm512_maskz_loadu_epi32(65535, weightBase+0+weightOff+128*step);
			const __m512i packedCD = _mm512_maskz_loadu_epi32(65535, weightBase+64+weightOff+128*step);
			const __m512 wA = _mm512_cvtph_ps(_mm512_castsi512_si256(packedAB));
			const __m512 wB = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedAB, 1));
			const __m512 wC = _mm512_cvtph_ps(_mm512_castsi512_si256(packedCD));
			const __m512 wD = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedCD, 1));
			const ptrdiff_t dataOff = tileOff+256*step;
			const __m512 x0 = _mm512_loadu_ps(dataBase+0+dataOff);
			const __m512 x1 = _mm512_loadu_ps(dataBase+64+dataOff);
			const __m512 x2 = _mm512_loadu_ps(dataBase+128+dataOff);
			const __m512 x3 = _mm512_loadu_ps(dataBase+192+dataOff);
			a0 = _mm512_fmadd_ps(wA, x0, a0);
			a1 = _mm512_fmadd_ps(wA, x1, a1);
			a2 = _mm512_fmadd_ps(wA, x2, a2);
			a3 = _mm512_fmadd_ps(wA, x3, a3);
			b0 = _mm512_fmadd_ps(wB, x0, b0);
			b1 = _mm512_fmadd_ps(wB, x1, b1);
			b2 = _mm512_fmadd_ps(wB, x2, b2);
			b3 = _mm512_fmadd_ps(wB, x3, b3);
			c0 = _mm512_fmadd_ps(wC, x0, c0);
			c1 = _mm512_fmadd_ps(wC, x1, c1);
			c2 = _mm512_fmadd_ps(wC, x2, c2);
			c3 = _mm512_fmadd_ps(wC, x3, c3);
			d0 = _mm512_fmadd_ps(wD, x0, d0);
			d1 = _mm512_fmadd_ps(wD, x1, d1);
			d2 = _mm512_fmadd_ps(wD, x2, d2);
			d3 = _mm512_fmadd_ps(wD, x3, d3);
		}
		// 16 vectors per row, 1024 bytes apart from the next row.
		const ptrdiff_t sumOff = tileOff+1024*row;
		_mm512_storeu_ps(sumBase+0+sumOff, a0);
		_mm512_storeu_ps(sumBase+64+sumOff, a1);
		_mm512_storeu_ps(sumBase+128+sumOff, a2);
		_mm512_storeu_ps(sumBase+192+sumOff, a3);
		_mm512_storeu_ps(sumBase+256+sumOff, b0);
		_mm512_storeu_ps(sumBase+320+sumOff, b1);
		_mm512_storeu_ps(sumBase+384+sumOff, b2);
		_mm512_storeu_ps(sumBase+448+sumOff, b3);
		_mm512_storeu_ps(sumBase+512+sumOff, c0);
		_mm512_storeu_ps(sumBase+576+sumOff, c1);
		_mm512_storeu_ps(sumBase+640+sumOff, c2);
		_mm512_storeu_ps(sumBase+704+sumOff, c3);
		_mm512_storeu_ps(sumBase+768+sumOff, d0);
		_mm512_storeu_ps(sumBase+832+sumOff, d1);
		_mm512_storeu_ps(sumBase+896+sumOff, d2);
		_mm512_storeu_ps(sumBase+960+sumOff, d3);
		if (row >= lastRow) break;
	}
}

// Dispatches ResNet50ThreeProduceSums2Callee1 through ResNet50ThreaderDo1.
//
// team34:    threader team handle, forwarded unchanged to ResNet50ThreaderDo1.
// tensors41: tensor pointer table, handed to the callee as element 0 of a
//            two-element pointer array (element 1 is a null pointer).
//
// The task is described as 4-dimensional with hull values {2, 17, 4, 1}.
// NOTE(review): the hull values are presumably the per-dimension iteration
// extents that the threader splits across the team -- confirm against
// ResNet50ThreaderDo1.
static void ResNet50ThreeProduceSums2(ResNet50ThreaderTeam1* team34, char** tensors41) {
	const int extents[4] = {2, 17, 4, 1};
	void* payload[2];
	ResNet50ThreaderTask1 job;
	int dim;

	// Payload seen by the callee via job.any1: {tensor table, null}.
	payload[0] = tensors41;
	payload[1] = 0;

	job.callee1 = ResNet50ThreeProduceSums2Callee1;
	job.any1 = payload;
	job.nd1 = 4;
	for (dim = 0; dim != 4; ++dim) {
		job.hull1[dim] = extents[dim];
	}

	// Both job and payload are locals of this function.
	// NOTE(review): this presumably relies on ResNet50ThreaderDo1 not using
	// them after it returns -- confirm.
	ResNet50ThreaderDo1(team34, &job);
}

static void ResNet50ThreeConsumeSums2Callee1(ResNet50ThreaderTask1* task46, int64_t* pt28) {
char** tensors44 = task46->any1;
ptrdiff_t w46 = 0;
ptrdiff_t d9 = pt28[1];
ptrdiff_t g16 = 0;
char*restrict sfPtr7 = tensors44[0];
char*restrict datPtr13 = tensors44[1];
ptrdiff_t i28 = 1*g16;
ptrdiff_t j23 = 2*d9;
ptrdiff_t last6 = j23+(d9 < 7 ? 1 : 2);
if (j23 < 2) {
ptrdiff_t rel16 = j23-0;
ptrdiff_t base16 = 0;
if (rel16 < 1) {
ptrdiff_t toH29 = base16+0;
ptrdiff_t toW29 = 0;
ptrdiff_t k90 = 16*w46;
for (; k90 != 16; ++k90) {
ptrdiff_t l29 = 0;
for (; l29 != 2; ++l29) {
__m512 sf401 = _mm512_loadu_ps(sfPtr7+0+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf402 = _mm512_loadu_ps(sfPtr7+128+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1117 = _mm512_shuffle_f32x4(sf401, sf402, 68);
__m512 in1118 = _mm512_shuffle_f32x4(sf401, sf402, 238);
__m512 sf403 = _mm512_loadu_ps(sfPtr7+64+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf404 = _mm512_loadu_ps(sfPtr7+192+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1125 = _mm512_shuffle_f32x4(sf403, sf404, 68);
__m512 in1126 = _mm512_shuffle_f32x4(sf403, sf404, 238);
__m512 sf405 = _mm512_loadu_ps(sfPtr7+409600+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf406 = _mm512_loadu_ps(sfPtr7+409728+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1119 = _mm512_shuffle_f32x4(sf405, sf406, 68);
__m512 in1120 = _mm512_shuffle_f32x4(sf405, sf406, 238);
__m512 sf407 = _mm512_loadu_ps(sfPtr7+409664+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf408 = _mm512_loadu_ps(sfPtr7+409792+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1127 = _mm512_shuffle_f32x4(sf407, sf408, 68);
__m512 in1128 = _mm512_shuffle_f32x4(sf407, sf408, 238);
__m512 sf409 = _mm512_loadu_ps(sfPtr7+819200+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf410 = _mm512_loadu_ps(sfPtr7+819328+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1121 = _mm512_shuffle_f32x4(sf409, sf410, 68);
__m512 in1122 = _mm512_shuffle_f32x4(sf409, sf410, 238);
__m512 sf411 = _mm512_loadu_ps(sfPtr7+819264+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf412 = _mm512_loadu_ps(sfPtr7+819392+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1129 = _mm512_shuffle_f32x4(sf411, sf412, 68);
__m512 in1130 = _mm512_shuffle_f32x4(sf411, sf412, 238);
__m512 sf413 = _mm512_loadu_ps(sfPtr7+1228800+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf414 = _mm512_loadu_ps(sfPtr7+1228928+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1123 = _mm512_shuffle_f32x4(sf413, sf414, 68);
__m512 in1124 = _mm512_shuffle_f32x4(sf413, sf414, 238);
__m512 sf415 = _mm512_loadu_ps(sfPtr7+1228864+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf416 = _mm512_loadu_ps(sfPtr7+1228992+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1131 = _mm512_shuffle_f32x4(sf415, sf416, 68);
__m512 in1132 = _mm512_shuffle_f32x4(sf415, sf416, 238);
__m512 tmp7241 = _mm512_add_ps(in1118, in1119);
__m512 tmp7261 = _mm512_add_ps(in1126, in1127);
__m512 tmp7240 = _mm512_add_ps(in1120, in1121);
__m512 tmp7260 = _mm512_add_ps(in1128, in1129);
__m512 tmp7246 = _mm512_sub_ps(in1120, in1121);
__m512 tmp7266 = _mm512_sub_ps(in1128, in1129);
__m512 tmp7245 = _mm512_sub_ps(in1118, in1119);
__m512 tmp7265 = _mm512_sub_ps(in1126, in1127);
__m512 tmp7242 = _mm512_add_ps(in1122, in1123);
__m512 tmp7262 = _mm512_add_ps(in1130, in1131);
__m512 tmp7247 = _mm512_sub_ps(in1122, in1123);
__m512 tmp7267 = _mm512_sub_ps(in1130, in1131);
__m512 tmp7244 = _mm512_fmadd_ps(tmp7246, _mm512_set1_ps(2e+00f), tmp7245);
__m512 tmp7264 = _mm512_fmadd_ps(tmp7266, _mm512_set1_ps(2e+00f), tmp7265);
__m512 tmp7251 = _mm512_fmadd_ps(tmp7246, _mm512_set1_ps(8e+00f), tmp7245);
__m512 tmp7271 = _mm512_fmadd_ps(tmp7266, _mm512_set1_ps(8e+00f), tmp7265);
__m512 tmp7239 = _mm512_add_ps(tmp7240, tmp7241);
__m512 tmp7259 = _mm512_add_ps(tmp7260, tmp7261);
__m512 tmp7243 = _mm512_fmadd_ps(tmp7247, _mm512_set1_ps(1.6e+01f), tmp7244);
__m512 tmp7263 = _mm512_fmadd_ps(tmp7267, _mm512_set1_ps(1.6e+01f), tmp7264);
__m512 tmp7250 = _mm512_fmadd_ps(tmp7247, _mm512_set1_ps(4e+00f), tmp7251);
__m512 tmp7270 = _mm512_fmadd_ps(tmp7267, _mm512_set1_ps(4e+00f), tmp7271);
__m512 tmp7256 = _mm512_add_ps(tmp7247, tmp7245);
__m512 tmp7276 = _mm512_add_ps(tmp7267, tmp7265);
__m512 tmp7249 = _mm512_fmadd_ps(tmp7240, _mm512_set1_ps(4e+00f), tmp7241);
__m512 tmp7269 = _mm512_fmadd_ps(tmp7260, _mm512_set1_ps(4e+00f), tmp7261);
__m512 tmp7253 = _mm512_fmadd_ps(tmp7240, _mm512_set1_ps(1.6e+01f), tmp7241);
__m512 tmp7273 = _mm512_fmadd_ps(tmp7260, _mm512_set1_ps(1.6e+01f), tmp7261);
__m512 tmp7238 = _mm512_add_ps(tmp7239, in1117);
__m512 tmp7258 = _mm512_add_ps(tmp7259, in1125);
__m512 tmp7255 = _mm512_add_ps(tmp7256, in1124);
__m512 tmp7275 = _mm512_add_ps(tmp7276, in1132);
__m512 tmp7237 = _mm512_fmadd_ps(tmp7242, _mm512_set1_ps(3.2e+01f), tmp7238);
__m512 tmp7257 = _mm512_fmadd_ps(tmp7262, _mm512_set1_ps(3.2e+01f), tmp7258);
__m512 tmp7248 = _mm512_fmadd_ps(tmp7242, _mm512_set1_ps(8e+00f), tmp7249);
__m512 tmp7268 = _mm512_fmadd_ps(tmp7262, _mm512_set1_ps(8e+00f), tmp7269);
__m512 tmp7254 = _mm512_fmadd_ps(tmp7246, _mm512_set1_ps(3.2e+01f), tmp7255);
__m512 tmp7274 = _mm512_fmadd_ps(tmp7266, _mm512_set1_ps(3.2e+01f), tmp7275);
__m512 tmp7252 = _mm512_fmadd_ps(tmp7242, _mm512_set1_ps(2e+00f), tmp7253);
__m512 tmp7272 = _mm512_fmadd_ps(tmp7262, _mm512_set1_ps(2e+00f), tmp7273);
__m512 tmp7225 = tmp7237;
__m512 tmp7231 = tmp7257;
__m512 tmp7226 = tmp7243;
__m512 tmp7232 = tmp7263;
__m512 tmp7227 = tmp7248;
__m512 tmp7233 = tmp7268;
__m512 tmp7228 = tmp7250;
__m512 tmp7234 = tmp7270;
__m512 tmp7229 = tmp7252;
__m512 tmp7235 = tmp7272;
__m512 tmp7230 = tmp7254;
__m512 tmp7236 = tmp7274;
__m512 tmp7321 = _mm512_unpacklo_ps(tmp7225, tmp7226);
__m512 tmp7322 = _mm512_unpackhi_ps(tmp7225, tmp7226);
__m512 tmp7323 = _mm512_unpacklo_ps(tmp7227, tmp7228);
__m512 tmp7324 = _mm512_unpackhi_ps(tmp7227, tmp7228);
__m512 tmp7325 = _mm512_unpacklo_ps(tmp7229, tmp7230);
__m512 tmp7326 = _mm512_unpackhi_ps(tmp7229, tmp7230);
__m512 tmp7327 = _mm512_unpacklo_ps(tmp7231, tmp7232);
__m512 tmp7328 = _mm512_unpackhi_ps(tmp7231, tmp7232);
__m512 tmp7329 = _mm512_unpacklo_ps(tmp7233, tmp7234);
__m512 tmp7330 = _mm512_unpackhi_ps(tmp7233, tmp7234);
__m512 tmp7331 = _mm512_unpacklo_ps(tmp7235, tmp7236);
__m512 tmp7332 = _mm512_unpackhi_ps(tmp7235, tmp7236);
__m512 tmp7333 = _mm512_shuffle_ps(tmp7321, tmp7323, 68);
__m512 tmp7334 = _mm512_shuffle_ps(tmp7321, tmp7323, 238);
__m512 tmp7335 = _mm512_shuffle_ps(tmp7322, tmp7324, 68);
__m512 tmp7336 = _mm512_shuffle_ps(tmp7322, tmp7324, 238);
__m512 tmp7337 = _mm512_shuffle_ps(tmp7325, tmp7327, 68);
__m512 tmp7338 = _mm512_shuffle_ps(tmp7325, tmp7327, 238);
__m512 tmp7339 = _mm512_shuffle_ps(tmp7326, tmp7328, 68);
__m512 tmp7340 = _mm512_shuffle_ps(tmp7326, tmp7328, 238);
__m512 tmp7341 = _mm512_shuffle_ps(tmp7329, tmp7331, 68);
__m512 tmp7342 = _mm512_shuffle_ps(tmp7329, tmp7331, 238);
__m512 tmp7343 = _mm512_shuffle_ps(tmp7330, tmp7332, 68);
__m512 tmp7344 = _mm512_shuffle_ps(tmp7330, tmp7332, 238);
__m512 tmp7345 = _mm512_shuffle_f32x4(tmp7333, tmp7337, 136);
__m512 tmp7346 = _mm512_shuffle_f32x4(tmp7333, tmp7337, 221);
__m512 tmp7347 = _mm512_shuffle_f32x4(tmp7334, tmp7338, 136);
__m512 tmp7348 = _mm512_shuffle_f32x4(tmp7334, tmp7338, 221);
__m512 tmp7349 = _mm512_shuffle_f32x4(tmp7335, tmp7339, 136);
__m512 tmp7350 = _mm512_shuffle_f32x4(tmp7335, tmp7339, 221);
__m512 tmp7351 = _mm512_shuffle_f32x4(tmp7336, tmp7340, 136);
__m512 tmp7352 = _mm512_shuffle_f32x4(tmp7336, tmp7340, 221);
__m512 tmp7353 = _mm512_shuffle_f32x4(tmp7341, tmp7341, 136);
__m512 tmp7354 = _mm512_shuffle_f32x4(tmp7341, tmp7341, 221);
__m512 tmp7355 = _mm512_shuffle_f32x4(tmp7342, tmp7342, 136);
__m512 tmp7356 = _mm512_shuffle_f32x4(tmp7342, tmp7342, 221);
__m512 tmp7357 = _mm512_shuffle_f32x4(tmp7343, tmp7343, 136);
__m512 tmp7358 = _mm512_shuffle_f32x4(tmp7343, tmp7343, 221);
__m512 tmp7359 = _mm512_shuffle_f32x4(tmp7344, tmp7344, 136);
__m512 tmp7360 = _mm512_shuffle_f32x4(tmp7344, tmp7344, 221);
tmp7225 = _mm512_shuffle_f32x4(tmp7345, tmp7353, 136);
tmp7233 = _mm512_shuffle_f32x4(tmp7345, tmp7353, 221);
tmp7226 = _mm512_shuffle_f32x4(tmp7347, tmp7355, 136);
tmp7234 = _mm512_shuffle_f32x4(tmp7347, tmp7355, 221);
tmp7227 = _mm512_shuffle_f32x4(tmp7349, tmp7357, 136);
tmp7235 = _mm512_shuffle_f32x4(tmp7349, tmp7357, 221);
tmp7228 = _mm512_shuffle_f32x4(tmp7351, tmp7359, 136);
tmp7236 = _mm512_shuffle_f32x4(tmp7351, tmp7359, 221);
tmp7229 = _mm512_shuffle_f32x4(tmp7346, tmp7354, 136);
__m512 tmp7277 = _mm512_shuffle_f32x4(tmp7346, tmp7354, 221);
tmp7230 = _mm512_shuffle_f32x4(tmp7348, tmp7356, 136);
__m512 tmp7278 = _mm512_shuffle_f32x4(tmp7348, tmp7356, 221);
tmp7231 = _mm512_shuffle_f32x4(tmp7350, tmp7358, 136);
__m512 tmp7279 = _mm512_shuffle_f32x4(tmp7350, tmp7358, 221);
tmp7232 = _mm512_shuffle_f32x4(tmp7352, tmp7360, 136);
__m512 tmp7280 = _mm512_shuffle_f32x4(tmp7352, tmp7360, 221);
__m512 tmp7285 = _mm512_add_ps(tmp7226, tmp7227);
__m512 tmp7305 = _mm512_add_ps(tmp7234, tmp7235);
__m512 tmp7284 = _mm512_add_ps(tmp7228, tmp7229);
__m512 tmp7304 = _mm512_add_ps(tmp7236, tmp7277);
__m512 tmp7290 = _mm512_sub_ps(tmp7228, tmp7229);
__m512 tmp7310 = _mm512_sub_ps(tmp7236, tmp7277);
__m512 tmp7289 = _mm512_sub_ps(tmp7226, tmp7227);
__m512 tmp7309 = _mm512_sub_ps(tmp7234, tmp7235);
__m512 tmp7286 = _mm512_add_ps(tmp7230, tmp7231);
__m512 tmp7306 = _mm512_add_ps(tmp7278, tmp7279);
__m512 tmp7291 = _mm512_sub_ps(tmp7230, tmp7231);
__m512 tmp7311 = _mm512_sub_ps(tmp7278, tmp7279);
__m512 tmp7288 = _mm512_fmadd_ps(tmp7290, _mm512_set1_ps(2e+00f), tmp7289);
__m512 tmp7308 = _mm512_fmadd_ps(tmp7310, _mm512_set1_ps(2e+00f), tmp7309);
__m512 tmp7295 = _mm512_fmadd_ps(tmp7290, _mm512_set1_ps(8e+00f), tmp7289);
__m512 tmp7315 = _mm512_fmadd_ps(tmp7310, _mm512_set1_ps(8e+00f), tmp7309);
__m512 tmp7283 = _mm512_add_ps(tmp7284, tmp7285);
__m512 tmp7303 = _mm512_add_ps(tmp7304, tmp7305);
__m512 tmp7287 = _mm512_fmadd_ps(tmp7291, _mm512_set1_ps(1.6e+01f), tmp7288);
__m512 tmp7307 = _mm512_fmadd_ps(tmp7311, _mm512_set1_ps(1.6e+01f), tmp7308);
__m512 tmp7294 = _mm512_fmadd_ps(tmp7291, _mm512_set1_ps(4e+00f), tmp7295);
__m512 tmp7314 = _mm512_fmadd_ps(tmp7311, _mm512_set1_ps(4e+00f), tmp7315);
__m512 tmp7300 = _mm512_add_ps(tmp7291, tmp7289);
__m512 tmp7320 = _mm512_add_ps(tmp7311, tmp7309);
__m512 tmp7293 = _mm512_fmadd_ps(tmp7284, _mm512_set1_ps(4e+00f), tmp7285);
__m512 tmp7313 = _mm512_fmadd_ps(tmp7304, _mm512_set1_ps(4e+00f), tmp7305);
__m512 tmp7297 = _mm512_fmadd_ps(tmp7284, _mm512_set1_ps(1.6e+01f), tmp7285);
__m512 tmp7317 = _mm512_fmadd_ps(tmp7304, _mm512_set1_ps(1.6e+01f), tmp7305);
__m512 tmp7282 = _mm512_add_ps(tmp7283, tmp7225);
__m512 tmp7302 = _mm512_add_ps(tmp7303, tmp7233);
__m512 tmp7299 = _mm512_add_ps(tmp7300, tmp7232);
__m512 tmp7319 = _mm512_add_ps(tmp7320, tmp7280);
__m512 tmp7281 = _mm512_fmadd_ps(tmp7286, _mm512_set1_ps(3.2e+01f), tmp7282);
__m512 tmp7301 = _mm512_fmadd_ps(tmp7306, _mm512_set1_ps(3.2e+01f), tmp7302);
__m512 tmp7292 = _mm512_fmadd_ps(tmp7286, _mm512_set1_ps(8e+00f), tmp7293);
__m512 tmp7312 = _mm512_fmadd_ps(tmp7306, _mm512_set1_ps(8e+00f), tmp7313);
__m512 tmp7298 = _mm512_fmadd_ps(tmp7290, _mm512_set1_ps(3.2e+01f), tmp7299);
__m512 tmp7318 = _mm512_fmadd_ps(tmp7310, _mm512_set1_ps(3.2e+01f), tmp7319);
__m512 tmp7296 = _mm512_fmadd_ps(tmp7286, _mm512_set1_ps(2e+00f), tmp7297);
__m512 tmp7316 = _mm512_fmadd_ps(tmp7306, _mm512_set1_ps(2e+00f), tmp7317);
__m512 out1107 = tmp7281;
__m512 out1113 = tmp7301;
__m512 out1108 = tmp7287;
__m512 out1114 = tmp7307;
__m512 out1109 = tmp7292;
__m512 out1115 = tmp7312;
__m512 out1110 = tmp7294;
__m512 out1116 = tmp7314;
__m512 out1111 = tmp7296;
__m512 out1117 = tmp7316;
__m512 out1112 = tmp7298;
__m512 out1118 = tmp7318;
out1107 = _mm512_max_ps(_mm512_setzero_ps(), out1107);
out1113 = _mm512_max_ps(_mm512_setzero_ps(), out1113);
out1108 = _mm512_max_ps(_mm512_setzero_ps(), out1108);
out1114 = _mm512_max_ps(_mm512_setzero_ps(), out1114);
out1109 = _mm512_max_ps(_mm512_setzero_ps(), out1109);
out1115 = _mm512_max_ps(_mm512_setzero_ps(), out1115);
out1110 = _mm512_max_ps(_mm512_setzero_ps(), out1110);
out1116 = _mm512_max_ps(_mm512_setzero_ps(), out1116);
out1111 = _mm512_max_ps(_mm512_setzero_ps(), out1111);
out1117 = _mm512_max_ps(_mm512_setzero_ps(), out1117);
out1112 = _mm512_max_ps(_mm512_setzero_ps(), out1112);
out1118 = _mm512_max_ps(_mm512_setzero_ps(), out1118);
_mm512_mask_storeu_ps(datPtr13+0+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1107);
_mm512_mask_storeu_ps(datPtr13+48+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1113);
_mm512_mask_storeu_ps(datPtr13+224+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1108);
_mm512_mask_storeu_ps(datPtr13+272+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1114);
_mm512_mask_storeu_ps(datPtr13+448+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1109);
_mm512_mask_storeu_ps(datPtr13+496+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1115);
_mm512_mask_storeu_ps(datPtr13+672+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1110);
_mm512_mask_storeu_ps(datPtr13+720+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1116);
_mm512_mask_storeu_ps(datPtr13+896+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1111);
_mm512_mask_storeu_ps(datPtr13+944+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1117);
_mm512_mask_storeu_ps(datPtr13+1120+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1112);
_mm512_mask_storeu_ps(datPtr13+1168+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1118);
__m512 sf417 = _mm512_loadu_ps(sfPtr7+256+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf418 = _mm512_loadu_ps(sfPtr7+384+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1133 = _mm512_shuffle_f32x4(sf417, sf418, 68);
__m512 in1134 = _mm512_shuffle_f32x4(sf417, sf418, 238);
__m512 sf419 = _mm512_loadu_ps(sfPtr7+320+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf420 = _mm512_loadu_ps(sfPtr7+448+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1141 = _mm512_shuffle_f32x4(sf419, sf420, 68);
__m512 in1142 = _mm512_shuffle_f32x4(sf419, sf420, 238);
__m512 sf421 = _mm512_loadu_ps(sfPtr7+409856+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf422 = _mm512_loadu_ps(sfPtr7+409984+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1135 = _mm512_shuffle_f32x4(sf421, sf422, 68);
__m512 in1136 = _mm512_shuffle_f32x4(sf421, sf422, 238);
__m512 sf423 = _mm512_loadu_ps(sfPtr7+409920+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf424 = _mm512_loadu_ps(sfPtr7+410048+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1143 = _mm512_shuffle_f32x4(sf423, sf424, 68);
__m512 in1144 = _mm512_shuffle_f32x4(sf423, sf424, 238);
__m512 sf425 = _mm512_loadu_ps(sfPtr7+819456+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf426 = _mm512_loadu_ps(sfPtr7+819584+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1137 = _mm512_shuffle_f32x4(sf425, sf426, 68);
__m512 in1138 = _mm512_shuffle_f32x4(sf425, sf426, 238);
__m512 sf427 = _mm512_loadu_ps(sfPtr7+819520+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf428 = _mm512_loadu_ps(sfPtr7+819648+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1145 = _mm512_shuffle_f32x4(sf427, sf428, 68);
__m512 in1146 = _mm512_shuffle_f32x4(sf427, sf428, 238);
__m512 sf429 = _mm512_loadu_ps(sfPtr7+1229056+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf430 = _mm512_loadu_ps(sfPtr7+1229184+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1139 = _mm512_shuffle_f32x4(sf429, sf430, 68);
__m512 in1140 = _mm512_shuffle_f32x4(sf429, sf430, 238);
__m512 sf431 = _mm512_loadu_ps(sfPtr7+1229120+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf432 = _mm512_loadu_ps(sfPtr7+1229248+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1147 = _mm512_shuffle_f32x4(sf431, sf432, 68);
__m512 in1148 = _mm512_shuffle_f32x4(sf431, sf432, 238);
__m512 tmp7377 = _mm512_add_ps(in1134, in1135);
__m512 tmp7397 = _mm512_add_ps(in1142, in1143);
__m512 tmp7376 = _mm512_add_ps(in1136, in1137);
__m512 tmp7396 = _mm512_add_ps(in1144, in1145);
__m512 tmp7382 = _mm512_sub_ps(in1136, in1137);
__m512 tmp7402 = _mm512_sub_ps(in1144, in1145);
__m512 tmp7381 = _mm512_sub_ps(in1134, in1135);
__m512 tmp7401 = _mm512_sub_ps(in1142, in1143);
__m512 tmp7378 = _mm512_add_ps(in1138, in1139);
__m512 tmp7398 = _mm512_add_ps(in1146, in1147);
__m512 tmp7383 = _mm512_sub_ps(in1138, in1139);
__m512 tmp7403 = _mm512_sub_ps(in1146, in1147);
__m512 tmp7380 = _mm512_fmadd_ps(tmp7382, _mm512_set1_ps(2e+00f), tmp7381);
__m512 tmp7400 = _mm512_fmadd_ps(tmp7402, _mm512_set1_ps(2e+00f), tmp7401);
__m512 tmp7387 = _mm512_fmadd_ps(tmp7382, _mm512_set1_ps(8e+00f), tmp7381);
__m512 tmp7407 = _mm512_fmadd_ps(tmp7402, _mm512_set1_ps(8e+00f), tmp7401);
__m512 tmp7375 = _mm512_add_ps(tmp7376, tmp7377);
__m512 tmp7395 = _mm512_add_ps(tmp7396, tmp7397);
__m512 tmp7379 = _mm512_fmadd_ps(tmp7383, _mm512_set1_ps(1.6e+01f), tmp7380);
__m512 tmp7399 = _mm512_fmadd_ps(tmp7403, _mm512_set1_ps(1.6e+01f), tmp7400);
__m512 tmp7386 = _mm512_fmadd_ps(tmp7383, _mm512_set1_ps(4e+00f), tmp7387);
__m512 tmp7406 = _mm512_fmadd_ps(tmp7403, _mm512_set1_ps(4e+00f), tmp7407);
__m512 tmp7392 = _mm512_add_ps(tmp7383, tmp7381);
__m512 tmp7412 = _mm512_add_ps(tmp7403, tmp7401);
__m512 tmp7385 = _mm512_fmadd_ps(tmp7376, _mm512_set1_ps(4e+00f), tmp7377);
__m512 tmp7405 = _mm512_fmadd_ps(tmp7396, _mm512_set1_ps(4e+00f), tmp7397);
__m512 tmp7389 = _mm512_fmadd_ps(tmp7376, _mm512_set1_ps(1.6e+01f), tmp7377);
__m512 tmp7409 = _mm512_fmadd_ps(tmp7396, _mm512_set1_ps(1.6e+01f), tmp7397);
__m512 tmp7374 = _mm512_add_ps(tmp7375, in1133);
__m512 tmp7394 = _mm512_add_ps(tmp7395, in1141);
__m512 tmp7391 = _mm512_add_ps(tmp7392, in1140);
__m512 tmp7411 = _mm512_add_ps(tmp7412, in1148);
__m512 tmp7373 = _mm512_fmadd_ps(tmp7378, _mm512_set1_ps(3.2e+01f), tmp7374);
__m512 tmp7393 = _mm512_fmadd_ps(tmp7398, _mm512_set1_ps(3.2e+01f), tmp7394);
__m512 tmp7384 = _mm512_fmadd_ps(tmp7378, _mm512_set1_ps(8e+00f), tmp7385);
__m512 tmp7404 = _mm512_fmadd_ps(tmp7398, _mm512_set1_ps(8e+00f), tmp7405);
__m512 tmp7390 = _mm512_fmadd_ps(tmp7382, _mm512_set1_ps(3.2e+01f), tmp7391);
__m512 tmp7410 = _mm512_fmadd_ps(tmp7402, _mm512_set1_ps(3.2e+01f), tmp7411);
__m512 tmp7388 = _mm512_fmadd_ps(tmp7378, _mm512_set1_ps(2e+00f), tmp7389);
__m512 tmp7408 = _mm512_fmadd_ps(tmp7398, _mm512_set1_ps(2e+00f), tmp7409);
__m512 tmp7361 = tmp7373;
__m512 tmp7367 = tmp7393;
__m512 tmp7362 = tmp7379;
__m512 tmp7368 = tmp7399;
__m512 tmp7363 = tmp7384;
__m512 tmp7369 = tmp7404;
__m512 tmp7364 = tmp7386;
__m512 tmp7370 = tmp7406;
__m512 tmp7365 = tmp7388;
__m512 tmp7371 = tmp7408;
__m512 tmp7366 = tmp7390;
__m512 tmp7372 = tmp7410;
__m512 tmp7457 = _mm512_unpacklo_ps(tmp7361, tmp7362);
__m512 tmp7458 = _mm512_unpackhi_ps(tmp7361, tmp7362);
__m512 tmp7459 = _mm512_unpacklo_ps(tmp7363, tmp7364);
__m512 tmp7460 = _mm512_unpackhi_ps(tmp7363, tmp7364);
__m512 tmp7461 = _mm512_unpacklo_ps(tmp7365, tmp7366);
__m512 tmp7462 = _mm512_unpackhi_ps(tmp7365, tmp7366);
__m512 tmp7463 = _mm512_unpacklo_ps(tmp7367, tmp7368);
__m512 tmp7464 = _mm512_unpackhi_ps(tmp7367, tmp7368);
__m512 tmp7465 = _mm512_unpacklo_ps(tmp7369, tmp7370);
__m512 tmp7466 = _mm512_unpackhi_ps(tmp7369, tmp7370);
__m512 tmp7467 = _mm512_unpacklo_ps(tmp7371, tmp7372);
__m512 tmp7468 = _mm512_unpackhi_ps(tmp7371, tmp7372);
__m512 tmp7469 = _mm512_shuffle_ps(tmp7457, tmp7459, 68);
__m512 tmp7470 = _mm512_shuffle_ps(tmp7457, tmp7459, 238);
__m512 tmp7471 = _mm512_shuffle_ps(tmp7458, tmp7460, 68);
__m512 tmp7472 = _mm512_shuffle_ps(tmp7458, tmp7460, 238);
__m512 tmp7473 = _mm512_shuffle_ps(tmp7461, tmp7463, 68);
__m512 tmp7474 = _mm512_shuffle_ps(tmp7461, tmp7463, 238);
__m512 tmp7475 = _mm512_shuffle_ps(tmp7462, tmp7464, 68);
__m512 tmp7476 = _mm512_shuffle_ps(tmp7462, tmp7464, 238);
__m512 tmp7477 = _mm512_shuffle_ps(tmp7465, tmp7467, 68);
__m512 tmp7478 = _mm512_shuffle_ps(tmp7465, tmp7467, 238);
__m512 tmp7479 = _mm512_shuffle_ps(tmp7466, tmp7468, 68);
__m512 tmp7480 = _mm512_shuffle_ps(tmp7466, tmp7468, 238);
__m512 tmp7481 = _mm512_shuffle_f32x4(tmp7469, tmp7473, 136);
__m512 tmp7482 = _mm512_shuffle_f32x4(tmp7469, tmp7473, 221);
__m512 tmp7483 = _mm512_shuffle_f32x4(tmp7470, tmp7474, 136);
__m512 tmp7484 = _mm512_shuffle_f32x4(tmp7470, tmp7474, 221);
__m512 tmp7485 = _mm512_shuffle_f32x4(tmp7471, tmp7475, 136);
__m512 tmp7486 = _mm512_shuffle_f32x4(tmp7471, tmp7475, 221);
__m512 tmp7487 = _mm512_shuffle_f32x4(tmp7472, tmp7476, 136);
__m512 tmp7488 = _mm512_shuffle_f32x4(tmp7472, tmp7476, 221);
__m512 tmp7489 = _mm512_shuffle_f32x4(tmp7477, tmp7477, 136);
__m512 tmp7490 = _mm512_shuffle_f32x4(tmp7477, tmp7477, 221);
__m512 tmp7491 = _mm512_shuffle_f32x4(tmp7478, tmp7478, 136);
__m512 tmp7492 = _mm512_shuffle_f32x4(tmp7478, tmp7478, 221);
__m512 tmp7493 = _mm512_shuffle_f32x4(tmp7479, tmp7479, 136);
__m512 tmp7494 = _mm512_shuffle_f32x4(tmp7479, tmp7479, 221);
__m512 tmp7495 = _mm512_shuffle_f32x4(tmp7480, tmp7480, 136);
__m512 tmp7496 = _mm512_shuffle_f32x4(tmp7480, tmp7480, 221);
tmp7361 = _mm512_shuffle_f32x4(tmp7481, tmp7489, 136);
tmp7369 = _mm512_shuffle_f32x4(tmp7481, tmp7489, 221);
tmp7362 = _mm512_shuffle_f32x4(tmp7483, tmp7491, 136);
tmp7370 = _mm512_shuffle_f32x4(tmp7483, tmp7491, 221);
tmp7363 = _mm512_shuffle_f32x4(tmp7485, tmp7493, 136);
tmp7371 = _mm512_shuffle_f32x4(tmp7485, tmp7493, 221);
tmp7364 = _mm512_shuffle_f32x4(tmp7487, tmp7495, 136);
tmp7372 = _mm512_shuffle_f32x4(tmp7487, tmp7495, 221);
tmp7365 = _mm512_shuffle_f32x4(tmp7482, tmp7490, 136);
__m512 tmp7413 = _mm512_shuffle_f32x4(tmp7482, tmp7490, 221);
tmp7366 = _mm512_shuffle_f32x4(tmp7484, tmp7492, 136);
__m512 tmp7414 = _mm512_shuffle_f32x4(tmp7484, tmp7492, 221);
tmp7367 = _mm512_shuffle_f32x4(tmp7486, tmp7494, 136);
__m512 tmp7415 = _mm512_shuffle_f32x4(tmp7486, tmp7494, 221);
tmp7368 = _mm512_shuffle_f32x4(tmp7488, tmp7496, 136);
__m512 tmp7416 = _mm512_shuffle_f32x4(tmp7488, tmp7496, 221);
__m512 tmp7421 = _mm512_add_ps(tmp7362, tmp7363);
__m512 tmp7441 = _mm512_add_ps(tmp7370, tmp7371);
__m512 tmp7420 = _mm512_add_ps(tmp7364, tmp7365);
__m512 tmp7440 = _mm512_add_ps(tmp7372, tmp7413);
__m512 tmp7426 = _mm512_sub_ps(tmp7364, tmp7365);
__m512 tmp7446 = _mm512_sub_ps(tmp7372, tmp7413);
__m512 tmp7425 = _mm512_sub_ps(tmp7362, tmp7363);
__m512 tmp7445 = _mm512_sub_ps(tmp7370, tmp7371);
__m512 tmp7422 = _mm512_add_ps(tmp7366, tmp7367);
__m512 tmp7442 = _mm512_add_ps(tmp7414, tmp7415);
__m512 tmp7427 = _mm512_sub_ps(tmp7366, tmp7367);
__m512 tmp7447 = _mm512_sub_ps(tmp7414, tmp7415);
__m512 tmp7424 = _mm512_fmadd_ps(tmp7426, _mm512_set1_ps(2e+00f), tmp7425);
__m512 tmp7444 = _mm512_fmadd_ps(tmp7446, _mm512_set1_ps(2e+00f), tmp7445);
__m512 tmp7431 = _mm512_fmadd_ps(tmp7426, _mm512_set1_ps(8e+00f), tmp7425);
__m512 tmp7451 = _mm512_fmadd_ps(tmp7446, _mm512_set1_ps(8e+00f), tmp7445);
__m512 tmp7419 = _mm512_add_ps(tmp7420, tmp7421);
__m512 tmp7439 = _mm512_add_ps(tmp7440, tmp7441);
__m512 tmp7423 = _mm512_fmadd_ps(tmp7427, _mm512_set1_ps(1.6e+01f), tmp7424);
__m512 tmp7443 = _mm512_fmadd_ps(tmp7447, _mm512_set1_ps(1.6e+01f), tmp7444);
__m512 tmp7430 = _mm512_fmadd_ps(tmp7427, _mm512_set1_ps(4e+00f), tmp7431);
__m512 tmp7450 = _mm512_fmadd_ps(tmp7447, _mm512_set1_ps(4e+00f), tmp7451);
__m512 tmp7436 = _mm512_add_ps(tmp7427, tmp7425);
__m512 tmp7456 = _mm512_add_ps(tmp7447, tmp7445);
__m512 tmp7429 = _mm512_fmadd_ps(tmp7420, _mm512_set1_ps(4e+00f), tmp7421);
__m512 tmp7449 = _mm512_fmadd_ps(tmp7440, _mm512_set1_ps(4e+00f), tmp7441);
__m512 tmp7433 = _mm512_fmadd_ps(tmp7420, _mm512_set1_ps(1.6e+01f), tmp7421);
__m512 tmp7453 = _mm512_fmadd_ps(tmp7440, _mm512_set1_ps(1.6e+01f), tmp7441);
__m512 tmp7418 = _mm512_add_ps(tmp7419, tmp7361);
__m512 tmp7438 = _mm512_add_ps(tmp7439, tmp7369);
__m512 tmp7435 = _mm512_add_ps(tmp7436, tmp7368);
__m512 tmp7455 = _mm512_add_ps(tmp7456, tmp7416);
__m512 tmp7417 = _mm512_fmadd_ps(tmp7422, _mm512_set1_ps(3.2e+01f), tmp7418);
__m512 tmp7437 = _mm512_fmadd_ps(tmp7442, _mm512_set1_ps(3.2e+01f), tmp7438);
__m512 tmp7428 = _mm512_fmadd_ps(tmp7422, _mm512_set1_ps(8e+00f), tmp7429);
__m512 tmp7448 = _mm512_fmadd_ps(tmp7442, _mm512_set1_ps(8e+00f), tmp7449);
__m512 tmp7434 = _mm512_fmadd_ps(tmp7426, _mm512_set1_ps(3.2e+01f), tmp7435);
__m512 tmp7454 = _mm512_fmadd_ps(tmp7446, _mm512_set1_ps(3.2e+01f), tmp7455);
__m512 tmp7432 = _mm512_fmadd_ps(tmp7422, _mm512_set1_ps(2e+00f), tmp7433);
__m512 tmp7452 = _mm512_fmadd_ps(tmp7442, _mm512_set1_ps(2e+00f), tmp7453);
__m512 out1119 = tmp7417;
__m512 out1125 = tmp7437;
__m512 out1120 = tmp7423;
__m512 out1126 = tmp7443;
__m512 out1121 = tmp7428;
__m512 out1127 = tmp7448;
__m512 out1122 = tmp7430;
__m512 out1128 = tmp7450;
__m512 out1123 = tmp7432;
__m512 out1129 = tmp7452;
__m512 out1124 = tmp7434;
__m512 out1130 = tmp7454;
out1119 = _mm512_max_ps(_mm512_setzero_ps(), out1119);
out1125 = _mm512_max_ps(_mm512_setzero_ps(), out1125);
out1120 = _mm512_max_ps(_mm512_setzero_ps(), out1120);
out1126 = _mm512_max_ps(_mm512_setzero_ps(), out1126);
out1121 = _mm512_max_ps(_mm512_setzero_ps(), out1121);
out1127 = _mm512_max_ps(_mm512_setzero_ps(), out1127);
out1122 = _mm512_max_ps(_mm512_setzero_ps(), out1122);
out1128 = _mm512_max_ps(_mm512_setzero_ps(), out1128);
out1123 = _mm512_max_ps(_mm512_setzero_ps(), out1123);
out1129 = _mm512_max_ps(_mm512_setzero_ps(), out1129);
out1124 = _mm512_max_ps(_mm512_setzero_ps(), out1124);
out1130 = _mm512_max_ps(_mm512_setzero_ps(), out1130);
_mm512_mask_storeu_ps(datPtr13+96+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1119);
_mm512_mask_storeu_ps(datPtr13+12608+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1125);
_mm512_mask_storeu_ps(datPtr13+320+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1120);
_mm512_mask_storeu_ps(datPtr13+12832+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1126);
_mm512_mask_storeu_ps(datPtr13+544+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1121);
_mm512_mask_storeu_ps(datPtr13+13056+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1127);
_mm512_mask_storeu_ps(datPtr13+768+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1122);
_mm512_mask_storeu_ps(datPtr13+13280+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1128);
_mm512_mask_storeu_ps(datPtr13+992+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1123);
_mm512_mask_storeu_ps(datPtr13+13504+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1129);
_mm512_mask_storeu_ps(datPtr13+1216+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1124);
_mm512_mask_storeu_ps(datPtr13+13728+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1130);
__m512 sf433 = _mm512_loadu_ps(sfPtr7+512+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf434 = _mm512_loadu_ps(sfPtr7+640+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1149 = _mm512_shuffle_f32x4(sf433, sf434, 68);
__m512 in1150 = _mm512_shuffle_f32x4(sf433, sf434, 238);
__m512 sf435 = _mm512_loadu_ps(sfPtr7+576+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf436 = _mm512_loadu_ps(sfPtr7+704+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1157 = _mm512_shuffle_f32x4(sf435, sf436, 68);
__m512 in1158 = _mm512_shuffle_f32x4(sf435, sf436, 238);
__m512 sf437 = _mm512_loadu_ps(sfPtr7+410112+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf438 = _mm512_loadu_ps(sfPtr7+410240+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1151 = _mm512_shuffle_f32x4(sf437, sf438, 68);
__m512 in1152 = _mm512_shuffle_f32x4(sf437, sf438, 238);
__m512 sf439 = _mm512_loadu_ps(sfPtr7+410176+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf440 = _mm512_loadu_ps(sfPtr7+410304+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1159 = _mm512_shuffle_f32x4(sf439, sf440, 68);
__m512 in1160 = _mm512_shuffle_f32x4(sf439, sf440, 238);
__m512 sf441 = _mm512_loadu_ps(sfPtr7+819712+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf442 = _mm512_loadu_ps(sfPtr7+819840+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1153 = _mm512_shuffle_f32x4(sf441, sf442, 68);
__m512 in1154 = _mm512_shuffle_f32x4(sf441, sf442, 238);
__m512 sf443 = _mm512_loadu_ps(sfPtr7+819776+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf444 = _mm512_loadu_ps(sfPtr7+819904+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1161 = _mm512_shuffle_f32x4(sf443, sf444, 68);
__m512 in1162 = _mm512_shuffle_f32x4(sf443, sf444, 238);
__m512 sf445 = _mm512_loadu_ps(sfPtr7+1229312+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf446 = _mm512_loadu_ps(sfPtr7+1229440+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1155 = _mm512_shuffle_f32x4(sf445, sf446, 68);
__m512 in1156 = _mm512_shuffle_f32x4(sf445, sf446, 238);
__m512 sf447 = _mm512_loadu_ps(sfPtr7+1229376+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 sf448 = _mm512_loadu_ps(sfPtr7+1229504+1638400*i28+24576*j23+1536*k90+768*l29);
__m512 in1163 = _mm512_shuffle_f32x4(sf447, sf448, 68);
__m512 in1164 = _mm512_shuffle_f32x4(sf447, sf448, 238);
__m512 tmp7513 = _mm512_add_ps(in1150, in1151);
__m512 tmp7533 = _mm512_add_ps(in1158, in1159);
__m512 tmp7512 = _mm512_add_ps(in1152, in1153);
__m512 tmp7532 = _mm512_add_ps(in1160, in1161);
__m512 tmp7518 = _mm512_sub_ps(in1152, in1153);
__m512 tmp7538 = _mm512_sub_ps(in1160, in1161);
__m512 tmp7517 = _mm512_sub_ps(in1150, in1151);
__m512 tmp7537 = _mm512_sub_ps(in1158, in1159);
__m512 tmp7514 = _mm512_add_ps(in1154, in1155);
__m512 tmp7534 = _mm512_add_ps(in1162, in1163);
__m512 tmp7519 = _mm512_sub_ps(in1154, in1155);
__m512 tmp7539 = _mm512_sub_ps(in1162, in1163);
__m512 tmp7516 = _mm512_fmadd_ps(tmp7518, _mm512_set1_ps(2e+00f), tmp7517);
__m512 tmp7536 = _mm512_fmadd_ps(tmp7538, _mm512_set1_ps(2e+00f), tmp7537);
__m512 tmp7523 = _mm512_fmadd_ps(tmp7518, _mm512_set1_ps(8e+00f), tmp7517);
__m512 tmp7543 = _mm512_fmadd_ps(tmp7538, _mm512_set1_ps(8e+00f), tmp7537);
__m512 tmp7511 = _mm512_add_ps(tmp7512, tmp7513);
__m512 tmp7531 = _mm512_add_ps(tmp7532, tmp7533);
__m512 tmp7515 = _mm512_fmadd_ps(tmp7519, _mm512_set1_ps(1.6e+01f), tmp7516);
__m512 tmp7535 = _mm512_fmadd_ps(tmp7539, _mm512_set1_ps(1.6e+01f), tmp7536);
__m512 tmp7522 = _mm512_fmadd_ps(tmp7519, _mm512_set1_ps(4e+00f), tmp7523);
__m512 tmp7542 = _mm512_fmadd_ps(tmp7539, _mm512_set1_ps(4e+00f), tmp7543);
__m512 tmp7528 = _mm512_add_ps(tmp7519, tmp7517);
__m512 tmp7548 = _mm512_add_ps(tmp7539, tmp7537);
__m512 tmp7521 = _mm512_fmadd_ps(tmp7512, _mm512_set1_ps(4e+00f), tmp7513);
__m512 tmp7541 = _mm512_fmadd_ps(tmp7532, _mm512_set1_ps(4e+00f), tmp7533);
__m512 tmp7525 = _mm512_fmadd_ps(tmp7512, _mm512_set1_ps(1.6e+01f), tmp7513);
__m512 tmp7545 = _mm512_fmadd_ps(tmp7532, _mm512_set1_ps(1.6e+01f), tmp7533);
__m512 tmp7510 = _mm512_add_ps(tmp7511, in1149);
__m512 tmp7530 = _mm512_add_ps(tmp7531, in1157);
__m512 tmp7527 = _mm512_add_ps(tmp7528, in1156);
__m512 tmp7547 = _mm512_add_ps(tmp7548, in1164);
__m512 tmp7509 = _mm512_fmadd_ps(tmp7514, _mm512_set1_ps(3.2e+01f), tmp7510);
__m512 tmp7529 = _mm512_fmadd_ps(tmp7534, _mm512_set1_ps(3.2e+01f), tmp7530);
__m512 tmp7520 = _mm512_fmadd_ps(tmp7514, _mm512_set1_ps(8e+00f), tmp7521);
__m512 tmp7540 = _mm512_fmadd_ps(tmp7534, _mm512_set1_ps(8e+00f), tmp7541);
__m512 tmp7526 = _mm512_fmadd_ps(tmp7518, _mm512_set1_ps(3.2e+01f), tmp7527);
__m512 tmp7546 = _mm512_fmadd_ps(tmp7538, _mm512_set1_ps(3.2e+01f), tmp7547);
__m512 tmp7524 = _mm512_fmadd_ps(tmp7514, _mm512_set1_ps(2e+00f), tmp7525);
__m512 tmp7544 = _mm512_fmadd_ps(tmp7534, _mm512_set1_ps(2e+00f), tmp7545);
__m512 tmp7497 = tmp7509;
__m512 tmp7503 = tmp7529;
__m512 tmp7498 = tmp7515;
__m512 tmp7504 = tmp7535;
__m512 tmp7499 = tmp7520;
__m512 tmp7505 = tmp7540;
__m512 tmp7500 = tmp7522;
__m512 tmp7506 = tmp7542;
__m512 tmp7501 = tmp7524;
__m512 tmp7507 = tmp7544;
__m512 tmp7502 = tmp7526;
__m512 tmp7508 = tmp7546;
// Transpose step. Treats tmp7497..tmp7508 as 12 rows of 16 floats and produces
// 16 vectors, one per original lane index ("column").
// Stage 1: interleave adjacent row pairs inside every 128-bit lane
// (unpacklo -> elements 0,1 of the lane; unpackhi -> elements 2,3).
__m512 tmp7593 = _mm512_unpacklo_ps(tmp7497, tmp7498);
__m512 tmp7594 = _mm512_unpackhi_ps(tmp7497, tmp7498);
__m512 tmp7595 = _mm512_unpacklo_ps(tmp7499, tmp7500);
__m512 tmp7596 = _mm512_unpackhi_ps(tmp7499, tmp7500);
__m512 tmp7597 = _mm512_unpacklo_ps(tmp7501, tmp7502);
__m512 tmp7598 = _mm512_unpackhi_ps(tmp7501, tmp7502);
__m512 tmp7599 = _mm512_unpacklo_ps(tmp7503, tmp7504);
__m512 tmp7600 = _mm512_unpackhi_ps(tmp7503, tmp7504);
__m512 tmp7601 = _mm512_unpacklo_ps(tmp7505, tmp7506);
__m512 tmp7602 = _mm512_unpackhi_ps(tmp7505, tmp7506);
__m512 tmp7603 = _mm512_unpacklo_ps(tmp7507, tmp7508);
__m512 tmp7604 = _mm512_unpackhi_ps(tmp7507, tmp7508);
// Stage 2: merge two row pairs. Immediate 68 keeps the low 64 bits of each
// operand's 128-bit lanes, 238 keeps the high 64 bits, so every 128-bit lane
// now holds one column element taken from four consecutive rows.
__m512 tmp7605 = _mm512_shuffle_ps(tmp7593, tmp7595, 68);
__m512 tmp7606 = _mm512_shuffle_ps(tmp7593, tmp7595, 238);
__m512 tmp7607 = _mm512_shuffle_ps(tmp7594, tmp7596, 68);
__m512 tmp7608 = _mm512_shuffle_ps(tmp7594, tmp7596, 238);
__m512 tmp7609 = _mm512_shuffle_ps(tmp7597, tmp7599, 68);
__m512 tmp7610 = _mm512_shuffle_ps(tmp7597, tmp7599, 238);
__m512 tmp7611 = _mm512_shuffle_ps(tmp7598, tmp7600, 68);
__m512 tmp7612 = _mm512_shuffle_ps(tmp7598, tmp7600, 238);
__m512 tmp7613 = _mm512_shuffle_ps(tmp7601, tmp7603, 68);
__m512 tmp7614 = _mm512_shuffle_ps(tmp7601, tmp7603, 238);
__m512 tmp7615 = _mm512_shuffle_ps(tmp7602, tmp7604, 68);
__m512 tmp7616 = _mm512_shuffle_ps(tmp7602, tmp7604, 238);
// Stage 3: regroup whole 128-bit lanes. Immediate 136 selects lanes 0 and 2
// of each operand, 221 selects lanes 1 and 3. Rows 0..3 are paired with rows 4..7.
__m512 tmp7617 = _mm512_shuffle_f32x4(tmp7605, tmp7609, 136);
__m512 tmp7618 = _mm512_shuffle_f32x4(tmp7605, tmp7609, 221);
__m512 tmp7619 = _mm512_shuffle_f32x4(tmp7606, tmp7610, 136);
__m512 tmp7620 = _mm512_shuffle_f32x4(tmp7606, tmp7610, 221);
__m512 tmp7621 = _mm512_shuffle_f32x4(tmp7607, tmp7611, 136);
__m512 tmp7622 = _mm512_shuffle_f32x4(tmp7607, tmp7611, 221);
__m512 tmp7623 = _mm512_shuffle_f32x4(tmp7608, tmp7612, 136);
__m512 tmp7624 = _mm512_shuffle_f32x4(tmp7608, tmp7612, 221);
// Rows 8..11 have no partner rows, so the same vector is passed as both operands.
__m512 tmp7625 = _mm512_shuffle_f32x4(tmp7613, tmp7613, 136);
__m512 tmp7626 = _mm512_shuffle_f32x4(tmp7613, tmp7613, 221);
__m512 tmp7627 = _mm512_shuffle_f32x4(tmp7614, tmp7614, 136);
__m512 tmp7628 = _mm512_shuffle_f32x4(tmp7614, tmp7614, 221);
__m512 tmp7629 = _mm512_shuffle_f32x4(tmp7615, tmp7615, 136);
__m512 tmp7630 = _mm512_shuffle_f32x4(tmp7615, tmp7615, 221);
__m512 tmp7631 = _mm512_shuffle_f32x4(tmp7616, tmp7616, 136);
__m512 tmp7632 = _mm512_shuffle_f32x4(tmp7616, tmp7616, 221);
// Stage 4: final lane gather. Afterwards tmp7497..tmp7504 hold columns 0..7,
// tmp7505..tmp7508 hold columns 8..11 and tmp7549..tmp7552 hold columns 12..15.
// In each result lanes 0..11 carry rows 0..11; lanes 12..15 repeat lanes 8..11.
tmp7497 = _mm512_shuffle_f32x4(tmp7617, tmp7625, 136);
tmp7505 = _mm512_shuffle_f32x4(tmp7617, tmp7625, 221);
tmp7498 = _mm512_shuffle_f32x4(tmp7619, tmp7627, 136);
tmp7506 = _mm512_shuffle_f32x4(tmp7619, tmp7627, 221);
tmp7499 = _mm512_shuffle_f32x4(tmp7621, tmp7629, 136);
tmp7507 = _mm512_shuffle_f32x4(tmp7621, tmp7629, 221);
tmp7500 = _mm512_shuffle_f32x4(tmp7623, tmp7631, 136);
tmp7508 = _mm512_shuffle_f32x4(tmp7623, tmp7631, 221);
tmp7501 = _mm512_shuffle_f32x4(tmp7618, tmp7626, 136);
__m512 tmp7549 = _mm512_shuffle_f32x4(tmp7618, tmp7626, 221);
tmp7502 = _mm512_shuffle_f32x4(tmp7620, tmp7628, 136);
__m512 tmp7550 = _mm512_shuffle_f32x4(tmp7620, tmp7628, 221);
tmp7503 = _mm512_shuffle_f32x4(tmp7622, tmp7630, 136);
__m512 tmp7551 = _mm512_shuffle_f32x4(tmp7622, tmp7630, 221);
tmp7504 = _mm512_shuffle_f32x4(tmp7624, tmp7632, 136);
__m512 tmp7552 = _mm512_shuffle_f32x4(tmp7624, tmp7632, 221);
// Second combination pass: reduces 8 vectors x0..x7 to 6 vectors y0..y5, for two
// independent groups whose statements are interleaved line by line.
//   group A: x0..x7 = tmp7497..tmp7504                    -> out1131..out1136
//   group B: x0..x7 = tmp7505..tmp7508, tmp7549..tmp7552  -> out1137..out1142
// Formulas (each multiply-add is a fused _mm512_fmadd_ps):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): the coefficient pattern looks like a Winograd-style output
// transform (8 values -> 6) — confirm against the generator.
// Pairwise sums and differences shared by several outputs.
__m512 tmp7557 = _mm512_add_ps(tmp7498, tmp7499);
__m512 tmp7577 = _mm512_add_ps(tmp7506, tmp7507);
__m512 tmp7556 = _mm512_add_ps(tmp7500, tmp7501);
__m512 tmp7576 = _mm512_add_ps(tmp7508, tmp7549);
__m512 tmp7562 = _mm512_sub_ps(tmp7500, tmp7501);
__m512 tmp7582 = _mm512_sub_ps(tmp7508, tmp7549);
__m512 tmp7561 = _mm512_sub_ps(tmp7498, tmp7499);
__m512 tmp7581 = _mm512_sub_ps(tmp7506, tmp7507);
__m512 tmp7558 = _mm512_add_ps(tmp7502, tmp7503);
__m512 tmp7578 = _mm512_add_ps(tmp7550, tmp7551);
__m512 tmp7563 = _mm512_sub_ps(tmp7502, tmp7503);
__m512 tmp7583 = _mm512_sub_ps(tmp7550, tmp7551);
// Accumulation chains for y0..y5.
__m512 tmp7560 = _mm512_fmadd_ps(tmp7562, _mm512_set1_ps(2e+00f), tmp7561);
__m512 tmp7580 = _mm512_fmadd_ps(tmp7582, _mm512_set1_ps(2e+00f), tmp7581);
__m512 tmp7567 = _mm512_fmadd_ps(tmp7562, _mm512_set1_ps(8e+00f), tmp7561);
__m512 tmp7587 = _mm512_fmadd_ps(tmp7582, _mm512_set1_ps(8e+00f), tmp7581);
__m512 tmp7555 = _mm512_add_ps(tmp7556, tmp7557);
__m512 tmp7575 = _mm512_add_ps(tmp7576, tmp7577);
__m512 tmp7559 = _mm512_fmadd_ps(tmp7563, _mm512_set1_ps(1.6e+01f), tmp7560);
__m512 tmp7579 = _mm512_fmadd_ps(tmp7583, _mm512_set1_ps(1.6e+01f), tmp7580);
__m512 tmp7566 = _mm512_fmadd_ps(tmp7563, _mm512_set1_ps(4e+00f), tmp7567);
__m512 tmp7586 = _mm512_fmadd_ps(tmp7583, _mm512_set1_ps(4e+00f), tmp7587);
__m512 tmp7572 = _mm512_add_ps(tmp7563, tmp7561);
__m512 tmp7592 = _mm512_add_ps(tmp7583, tmp7581);
__m512 tmp7565 = _mm512_fmadd_ps(tmp7556, _mm512_set1_ps(4e+00f), tmp7557);
__m512 tmp7585 = _mm512_fmadd_ps(tmp7576, _mm512_set1_ps(4e+00f), tmp7577);
__m512 tmp7569 = _mm512_fmadd_ps(tmp7556, _mm512_set1_ps(1.6e+01f), tmp7557);
__m512 tmp7589 = _mm512_fmadd_ps(tmp7576, _mm512_set1_ps(1.6e+01f), tmp7577);
__m512 tmp7554 = _mm512_add_ps(tmp7555, tmp7497);
__m512 tmp7574 = _mm512_add_ps(tmp7575, tmp7505);
__m512 tmp7571 = _mm512_add_ps(tmp7572, tmp7504);
__m512 tmp7591 = _mm512_add_ps(tmp7592, tmp7552);
__m512 tmp7553 = _mm512_fmadd_ps(tmp7558, _mm512_set1_ps(3.2e+01f), tmp7554);
__m512 tmp7573 = _mm512_fmadd_ps(tmp7578, _mm512_set1_ps(3.2e+01f), tmp7574);
__m512 tmp7564 = _mm512_fmadd_ps(tmp7558, _mm512_set1_ps(8e+00f), tmp7565);
__m512 tmp7584 = _mm512_fmadd_ps(tmp7578, _mm512_set1_ps(8e+00f), tmp7585);
__m512 tmp7570 = _mm512_fmadd_ps(tmp7562, _mm512_set1_ps(3.2e+01f), tmp7571);
__m512 tmp7590 = _mm512_fmadd_ps(tmp7582, _mm512_set1_ps(3.2e+01f), tmp7591);
__m512 tmp7568 = _mm512_fmadd_ps(tmp7558, _mm512_set1_ps(2e+00f), tmp7569);
__m512 tmp7588 = _mm512_fmadd_ps(tmp7578, _mm512_set1_ps(2e+00f), tmp7589);
// Name the results y0..y5 of group A (out1131..out1136) and group B (out1137..out1142).
__m512 out1131 = tmp7553;
__m512 out1137 = tmp7573;
__m512 out1132 = tmp7559;
__m512 out1138 = tmp7579;
__m512 out1133 = tmp7564;
__m512 out1139 = tmp7584;
__m512 out1134 = tmp7566;
__m512 out1140 = tmp7586;
__m512 out1135 = tmp7568;
__m512 out1141 = tmp7588;
__m512 out1136 = tmp7570;
__m512 out1142 = tmp7590;
// ReLU: clamp each result vector at zero with an element-wise max against 0.
out1131 = _mm512_max_ps(_mm512_setzero_ps(), out1131);
out1137 = _mm512_max_ps(_mm512_setzero_ps(), out1137);
out1132 = _mm512_max_ps(_mm512_setzero_ps(), out1132);
out1138 = _mm512_max_ps(_mm512_setzero_ps(), out1138);
out1133 = _mm512_max_ps(_mm512_setzero_ps(), out1133);
out1139 = _mm512_max_ps(_mm512_setzero_ps(), out1139);
out1134 = _mm512_max_ps(_mm512_setzero_ps(), out1134);
out1140 = _mm512_max_ps(_mm512_setzero_ps(), out1140);
out1135 = _mm512_max_ps(_mm512_setzero_ps(), out1135);
out1141 = _mm512_max_ps(_mm512_setzero_ps(), out1141);
out1136 = _mm512_max_ps(_mm512_setzero_ps(), out1136);
out1142 = _mm512_max_ps(_mm512_setzero_ps(), out1142);
// Masked unaligned stores to datPtr13. Mask 4095 (0xFFF) writes only lanes 0..11.
// The stores come in six pairs: the two addresses of a pair are 48 apart and
// successive pairs are 224 apart.
// NOTE(review): 48 = 12 floats * 4 bytes, which is consistent with datPtr13 being
// a byte pointer and 224 being a row stride in bytes — confirm at its declaration.
_mm512_mask_storeu_ps(datPtr13+12656+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1131);
_mm512_mask_storeu_ps(datPtr13+12704+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1137);
_mm512_mask_storeu_ps(datPtr13+12880+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1132);
_mm512_mask_storeu_ps(datPtr13+12928+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1138);
_mm512_mask_storeu_ps(datPtr13+13104+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1133);
_mm512_mask_storeu_ps(datPtr13+13152+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1139);
_mm512_mask_storeu_ps(datPtr13+13328+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1134);
_mm512_mask_storeu_ps(datPtr13+13376+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1140);
_mm512_mask_storeu_ps(datPtr13+13552+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1135);
_mm512_mask_storeu_ps(datPtr13+13600+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1141);
_mm512_mask_storeu_ps(datPtr13+13776+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1136);
_mm512_mask_storeu_ps(datPtr13+13824+806912*i28+224*toH29+4*toW29+50432*k90+25216*l29, 4095, out1142);
}
}
if (j23 >= last6) return;
++j23;
rel16 = 1;
}
// Coordinates used in the store addresses of the loop that follows
// (they enter as 224*toH30 + 4*toW30).
ptrdiff_t toH30 = base16+0;
ptrdiff_t toW30 = 36;
// Start index of the following k91 loop, which runs while k91 != 16: 16 iterations
// when w46 is 0, none when w46 is 1.
// NOTE(review): presumably w46 is only ever 0 or 1; any other value would never
// satisfy the != 16 exit test — confirm where w46 is set.
ptrdiff_t k91 = 16*w46;
for (; k91 != 16; ++k91) {
ptrdiff_t l30 = 0;
for (; l30 != 2; ++l30) {
// Load 16 unaligned 512-bit vectors from sfPtr7 and regroup them by 256-bit halves.
// Four regions are read, at base offsets 0, 409600, 819200 and 1228800
// (4 * 409600 = 1638400, the i28 stride); each contributes loads at +0, +64, +128, +192.
// _mm512_shuffle_f32x4 immediate 68 joins the low 256 bits of both operands and
// 238 joins their high 256 bits, so:
//   loads at +0/+128  -> in1165..in1172 (x0..x7 of group A)
//   loads at +64/+192 -> in1173..in1180 (x0..x7 of group B)
// NOTE(review): the 64-unit spacing of 16-float loads suggests sfPtr7 is a byte
// pointer — confirm at its declaration.
__m512 sf449 = _mm512_loadu_ps(sfPtr7+0+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf450 = _mm512_loadu_ps(sfPtr7+128+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1165 = _mm512_shuffle_f32x4(sf449, sf450, 68);
__m512 in1166 = _mm512_shuffle_f32x4(sf449, sf450, 238);
__m512 sf451 = _mm512_loadu_ps(sfPtr7+64+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf452 = _mm512_loadu_ps(sfPtr7+192+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1173 = _mm512_shuffle_f32x4(sf451, sf452, 68);
__m512 in1174 = _mm512_shuffle_f32x4(sf451, sf452, 238);
__m512 sf453 = _mm512_loadu_ps(sfPtr7+409600+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf454 = _mm512_loadu_ps(sfPtr7+409728+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1167 = _mm512_shuffle_f32x4(sf453, sf454, 68);
__m512 in1168 = _mm512_shuffle_f32x4(sf453, sf454, 238);
__m512 sf455 = _mm512_loadu_ps(sfPtr7+409664+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf456 = _mm512_loadu_ps(sfPtr7+409792+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1175 = _mm512_shuffle_f32x4(sf455, sf456, 68);
__m512 in1176 = _mm512_shuffle_f32x4(sf455, sf456, 238);
__m512 sf457 = _mm512_loadu_ps(sfPtr7+819200+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf458 = _mm512_loadu_ps(sfPtr7+819328+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1169 = _mm512_shuffle_f32x4(sf457, sf458, 68);
__m512 in1170 = _mm512_shuffle_f32x4(sf457, sf458, 238);
__m512 sf459 = _mm512_loadu_ps(sfPtr7+819264+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf460 = _mm512_loadu_ps(sfPtr7+819392+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1177 = _mm512_shuffle_f32x4(sf459, sf460, 68);
__m512 in1178 = _mm512_shuffle_f32x4(sf459, sf460, 238);
__m512 sf461 = _mm512_loadu_ps(sfPtr7+1228800+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf462 = _mm512_loadu_ps(sfPtr7+1228928+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1171 = _mm512_shuffle_f32x4(sf461, sf462, 68);
__m512 in1172 = _mm512_shuffle_f32x4(sf461, sf462, 238);
__m512 sf463 = _mm512_loadu_ps(sfPtr7+1228864+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf464 = _mm512_loadu_ps(sfPtr7+1228992+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1179 = _mm512_shuffle_f32x4(sf463, sf464, 68);
__m512 in1180 = _mm512_shuffle_f32x4(sf463, sf464, 238);
// First combination pass: reduces 8 vectors x0..x7 to 6 vectors y0..y5, for two
// independent groups whose statements are interleaved line by line.
//   group A: x0..x7 = in1165..in1172 -> tmp7633..tmp7638
//   group B: x0..x7 = in1173..in1180 -> tmp7639..tmp7644
// Formulas (each multiply-add is a fused _mm512_fmadd_ps):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): the coefficient pattern looks like a Winograd-style output
// transform (8 values -> 6) — confirm against the generator.
// Pairwise sums and differences shared by several outputs.
__m512 tmp7649 = _mm512_add_ps(in1166, in1167);
__m512 tmp7669 = _mm512_add_ps(in1174, in1175);
__m512 tmp7648 = _mm512_add_ps(in1168, in1169);
__m512 tmp7668 = _mm512_add_ps(in1176, in1177);
__m512 tmp7654 = _mm512_sub_ps(in1168, in1169);
__m512 tmp7674 = _mm512_sub_ps(in1176, in1177);
__m512 tmp7653 = _mm512_sub_ps(in1166, in1167);
__m512 tmp7673 = _mm512_sub_ps(in1174, in1175);
__m512 tmp7650 = _mm512_add_ps(in1170, in1171);
__m512 tmp7670 = _mm512_add_ps(in1178, in1179);
__m512 tmp7655 = _mm512_sub_ps(in1170, in1171);
__m512 tmp7675 = _mm512_sub_ps(in1178, in1179);
// Accumulation chains for y0..y5.
__m512 tmp7652 = _mm512_fmadd_ps(tmp7654, _mm512_set1_ps(2e+00f), tmp7653);
__m512 tmp7672 = _mm512_fmadd_ps(tmp7674, _mm512_set1_ps(2e+00f), tmp7673);
__m512 tmp7659 = _mm512_fmadd_ps(tmp7654, _mm512_set1_ps(8e+00f), tmp7653);
__m512 tmp7679 = _mm512_fmadd_ps(tmp7674, _mm512_set1_ps(8e+00f), tmp7673);
__m512 tmp7647 = _mm512_add_ps(tmp7648, tmp7649);
__m512 tmp7667 = _mm512_add_ps(tmp7668, tmp7669);
__m512 tmp7651 = _mm512_fmadd_ps(tmp7655, _mm512_set1_ps(1.6e+01f), tmp7652);
__m512 tmp7671 = _mm512_fmadd_ps(tmp7675, _mm512_set1_ps(1.6e+01f), tmp7672);
__m512 tmp7658 = _mm512_fmadd_ps(tmp7655, _mm512_set1_ps(4e+00f), tmp7659);
__m512 tmp7678 = _mm512_fmadd_ps(tmp7675, _mm512_set1_ps(4e+00f), tmp7679);
__m512 tmp7664 = _mm512_add_ps(tmp7655, tmp7653);
__m512 tmp7684 = _mm512_add_ps(tmp7675, tmp7673);
__m512 tmp7657 = _mm512_fmadd_ps(tmp7648, _mm512_set1_ps(4e+00f), tmp7649);
__m512 tmp7677 = _mm512_fmadd_ps(tmp7668, _mm512_set1_ps(4e+00f), tmp7669);
__m512 tmp7661 = _mm512_fmadd_ps(tmp7648, _mm512_set1_ps(1.6e+01f), tmp7649);
__m512 tmp7681 = _mm512_fmadd_ps(tmp7668, _mm512_set1_ps(1.6e+01f), tmp7669);
__m512 tmp7646 = _mm512_add_ps(tmp7647, in1165);
__m512 tmp7666 = _mm512_add_ps(tmp7667, in1173);
__m512 tmp7663 = _mm512_add_ps(tmp7664, in1172);
__m512 tmp7683 = _mm512_add_ps(tmp7684, in1180);
__m512 tmp7645 = _mm512_fmadd_ps(tmp7650, _mm512_set1_ps(3.2e+01f), tmp7646);
__m512 tmp7665 = _mm512_fmadd_ps(tmp7670, _mm512_set1_ps(3.2e+01f), tmp7666);
__m512 tmp7656 = _mm512_fmadd_ps(tmp7650, _mm512_set1_ps(8e+00f), tmp7657);
__m512 tmp7676 = _mm512_fmadd_ps(tmp7670, _mm512_set1_ps(8e+00f), tmp7677);
__m512 tmp7662 = _mm512_fmadd_ps(tmp7654, _mm512_set1_ps(3.2e+01f), tmp7663);
__m512 tmp7682 = _mm512_fmadd_ps(tmp7674, _mm512_set1_ps(3.2e+01f), tmp7683);
__m512 tmp7660 = _mm512_fmadd_ps(tmp7650, _mm512_set1_ps(2e+00f), tmp7661);
__m512 tmp7680 = _mm512_fmadd_ps(tmp7670, _mm512_set1_ps(2e+00f), tmp7681);
// Collect y0..y5 of group A (tmp7633..tmp7638) and group B (tmp7639..tmp7644)
// as the 12 rows consumed by the transpose that follows.
__m512 tmp7633 = tmp7645;
__m512 tmp7639 = tmp7665;
__m512 tmp7634 = tmp7651;
__m512 tmp7640 = tmp7671;
__m512 tmp7635 = tmp7656;
__m512 tmp7641 = tmp7676;
__m512 tmp7636 = tmp7658;
__m512 tmp7642 = tmp7678;
__m512 tmp7637 = tmp7660;
__m512 tmp7643 = tmp7680;
__m512 tmp7638 = tmp7662;
__m512 tmp7644 = tmp7682;
// Transpose step. Treats tmp7633..tmp7644 as 12 rows of 16 floats and produces
// 16 vectors, one per original lane index ("column").
// Stage 1: interleave adjacent row pairs inside every 128-bit lane
// (unpacklo -> elements 0,1 of the lane; unpackhi -> elements 2,3).
__m512 tmp7729 = _mm512_unpacklo_ps(tmp7633, tmp7634);
__m512 tmp7730 = _mm512_unpackhi_ps(tmp7633, tmp7634);
__m512 tmp7731 = _mm512_unpacklo_ps(tmp7635, tmp7636);
__m512 tmp7732 = _mm512_unpackhi_ps(tmp7635, tmp7636);
__m512 tmp7733 = _mm512_unpacklo_ps(tmp7637, tmp7638);
__m512 tmp7734 = _mm512_unpackhi_ps(tmp7637, tmp7638);
__m512 tmp7735 = _mm512_unpacklo_ps(tmp7639, tmp7640);
__m512 tmp7736 = _mm512_unpackhi_ps(tmp7639, tmp7640);
__m512 tmp7737 = _mm512_unpacklo_ps(tmp7641, tmp7642);
__m512 tmp7738 = _mm512_unpackhi_ps(tmp7641, tmp7642);
__m512 tmp7739 = _mm512_unpacklo_ps(tmp7643, tmp7644);
__m512 tmp7740 = _mm512_unpackhi_ps(tmp7643, tmp7644);
// Stage 2: merge two row pairs. Immediate 68 keeps the low 64 bits of each
// operand's 128-bit lanes, 238 keeps the high 64 bits, so every 128-bit lane
// now holds one column element taken from four consecutive rows.
__m512 tmp7741 = _mm512_shuffle_ps(tmp7729, tmp7731, 68);
__m512 tmp7742 = _mm512_shuffle_ps(tmp7729, tmp7731, 238);
__m512 tmp7743 = _mm512_shuffle_ps(tmp7730, tmp7732, 68);
__m512 tmp7744 = _mm512_shuffle_ps(tmp7730, tmp7732, 238);
__m512 tmp7745 = _mm512_shuffle_ps(tmp7733, tmp7735, 68);
__m512 tmp7746 = _mm512_shuffle_ps(tmp7733, tmp7735, 238);
__m512 tmp7747 = _mm512_shuffle_ps(tmp7734, tmp7736, 68);
__m512 tmp7748 = _mm512_shuffle_ps(tmp7734, tmp7736, 238);
__m512 tmp7749 = _mm512_shuffle_ps(tmp7737, tmp7739, 68);
__m512 tmp7750 = _mm512_shuffle_ps(tmp7737, tmp7739, 238);
__m512 tmp7751 = _mm512_shuffle_ps(tmp7738, tmp7740, 68);
__m512 tmp7752 = _mm512_shuffle_ps(tmp7738, tmp7740, 238);
// Stage 3: regroup whole 128-bit lanes. Immediate 136 selects lanes 0 and 2
// of each operand, 221 selects lanes 1 and 3. Rows 0..3 are paired with rows 4..7.
__m512 tmp7753 = _mm512_shuffle_f32x4(tmp7741, tmp7745, 136);
__m512 tmp7754 = _mm512_shuffle_f32x4(tmp7741, tmp7745, 221);
__m512 tmp7755 = _mm512_shuffle_f32x4(tmp7742, tmp7746, 136);
__m512 tmp7756 = _mm512_shuffle_f32x4(tmp7742, tmp7746, 221);
__m512 tmp7757 = _mm512_shuffle_f32x4(tmp7743, tmp7747, 136);
__m512 tmp7758 = _mm512_shuffle_f32x4(tmp7743, tmp7747, 221);
__m512 tmp7759 = _mm512_shuffle_f32x4(tmp7744, tmp7748, 136);
__m512 tmp7760 = _mm512_shuffle_f32x4(tmp7744, tmp7748, 221);
// Rows 8..11 have no partner rows, so the same vector is passed as both operands.
__m512 tmp7761 = _mm512_shuffle_f32x4(tmp7749, tmp7749, 136);
__m512 tmp7762 = _mm512_shuffle_f32x4(tmp7749, tmp7749, 221);
__m512 tmp7763 = _mm512_shuffle_f32x4(tmp7750, tmp7750, 136);
__m512 tmp7764 = _mm512_shuffle_f32x4(tmp7750, tmp7750, 221);
__m512 tmp7765 = _mm512_shuffle_f32x4(tmp7751, tmp7751, 136);
__m512 tmp7766 = _mm512_shuffle_f32x4(tmp7751, tmp7751, 221);
__m512 tmp7767 = _mm512_shuffle_f32x4(tmp7752, tmp7752, 136);
__m512 tmp7768 = _mm512_shuffle_f32x4(tmp7752, tmp7752, 221);
// Stage 4: final lane gather. Afterwards tmp7633..tmp7640 hold columns 0..7,
// tmp7641..tmp7644 hold columns 8..11 and tmp7685..tmp7688 hold columns 12..15.
// In each result lanes 0..11 carry rows 0..11; lanes 12..15 repeat lanes 8..11.
tmp7633 = _mm512_shuffle_f32x4(tmp7753, tmp7761, 136);
tmp7641 = _mm512_shuffle_f32x4(tmp7753, tmp7761, 221);
tmp7634 = _mm512_shuffle_f32x4(tmp7755, tmp7763, 136);
tmp7642 = _mm512_shuffle_f32x4(tmp7755, tmp7763, 221);
tmp7635 = _mm512_shuffle_f32x4(tmp7757, tmp7765, 136);
tmp7643 = _mm512_shuffle_f32x4(tmp7757, tmp7765, 221);
tmp7636 = _mm512_shuffle_f32x4(tmp7759, tmp7767, 136);
tmp7644 = _mm512_shuffle_f32x4(tmp7759, tmp7767, 221);
tmp7637 = _mm512_shuffle_f32x4(tmp7754, tmp7762, 136);
__m512 tmp7685 = _mm512_shuffle_f32x4(tmp7754, tmp7762, 221);
tmp7638 = _mm512_shuffle_f32x4(tmp7756, tmp7764, 136);
__m512 tmp7686 = _mm512_shuffle_f32x4(tmp7756, tmp7764, 221);
tmp7639 = _mm512_shuffle_f32x4(tmp7758, tmp7766, 136);
__m512 tmp7687 = _mm512_shuffle_f32x4(tmp7758, tmp7766, 221);
tmp7640 = _mm512_shuffle_f32x4(tmp7760, tmp7768, 136);
__m512 tmp7688 = _mm512_shuffle_f32x4(tmp7760, tmp7768, 221);
// Second combination pass: reduces 8 vectors x0..x7 to 6 vectors y0..y5, for two
// independent groups whose statements are interleaved line by line.
//   group A: x0..x7 = tmp7633..tmp7640                    -> out1143..out1148
//   group B: x0..x7 = tmp7641..tmp7644, tmp7685..tmp7688  -> out1149..out1154
// Formulas (each multiply-add is a fused _mm512_fmadd_ps):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): the coefficient pattern looks like a Winograd-style output
// transform (8 values -> 6) — confirm against the generator.
// Pairwise sums and differences shared by several outputs.
__m512 tmp7693 = _mm512_add_ps(tmp7634, tmp7635);
__m512 tmp7713 = _mm512_add_ps(tmp7642, tmp7643);
__m512 tmp7692 = _mm512_add_ps(tmp7636, tmp7637);
__m512 tmp7712 = _mm512_add_ps(tmp7644, tmp7685);
__m512 tmp7698 = _mm512_sub_ps(tmp7636, tmp7637);
__m512 tmp7718 = _mm512_sub_ps(tmp7644, tmp7685);
__m512 tmp7697 = _mm512_sub_ps(tmp7634, tmp7635);
__m512 tmp7717 = _mm512_sub_ps(tmp7642, tmp7643);
__m512 tmp7694 = _mm512_add_ps(tmp7638, tmp7639);
__m512 tmp7714 = _mm512_add_ps(tmp7686, tmp7687);
__m512 tmp7699 = _mm512_sub_ps(tmp7638, tmp7639);
__m512 tmp7719 = _mm512_sub_ps(tmp7686, tmp7687);
// Accumulation chains for y0..y5.
__m512 tmp7696 = _mm512_fmadd_ps(tmp7698, _mm512_set1_ps(2e+00f), tmp7697);
__m512 tmp7716 = _mm512_fmadd_ps(tmp7718, _mm512_set1_ps(2e+00f), tmp7717);
__m512 tmp7703 = _mm512_fmadd_ps(tmp7698, _mm512_set1_ps(8e+00f), tmp7697);
__m512 tmp7723 = _mm512_fmadd_ps(tmp7718, _mm512_set1_ps(8e+00f), tmp7717);
__m512 tmp7691 = _mm512_add_ps(tmp7692, tmp7693);
__m512 tmp7711 = _mm512_add_ps(tmp7712, tmp7713);
__m512 tmp7695 = _mm512_fmadd_ps(tmp7699, _mm512_set1_ps(1.6e+01f), tmp7696);
__m512 tmp7715 = _mm512_fmadd_ps(tmp7719, _mm512_set1_ps(1.6e+01f), tmp7716);
__m512 tmp7702 = _mm512_fmadd_ps(tmp7699, _mm512_set1_ps(4e+00f), tmp7703);
__m512 tmp7722 = _mm512_fmadd_ps(tmp7719, _mm512_set1_ps(4e+00f), tmp7723);
__m512 tmp7708 = _mm512_add_ps(tmp7699, tmp7697);
__m512 tmp7728 = _mm512_add_ps(tmp7719, tmp7717);
__m512 tmp7701 = _mm512_fmadd_ps(tmp7692, _mm512_set1_ps(4e+00f), tmp7693);
__m512 tmp7721 = _mm512_fmadd_ps(tmp7712, _mm512_set1_ps(4e+00f), tmp7713);
__m512 tmp7705 = _mm512_fmadd_ps(tmp7692, _mm512_set1_ps(1.6e+01f), tmp7693);
__m512 tmp7725 = _mm512_fmadd_ps(tmp7712, _mm512_set1_ps(1.6e+01f), tmp7713);
__m512 tmp7690 = _mm512_add_ps(tmp7691, tmp7633);
__m512 tmp7710 = _mm512_add_ps(tmp7711, tmp7641);
__m512 tmp7707 = _mm512_add_ps(tmp7708, tmp7640);
__m512 tmp7727 = _mm512_add_ps(tmp7728, tmp7688);
__m512 tmp7689 = _mm512_fmadd_ps(tmp7694, _mm512_set1_ps(3.2e+01f), tmp7690);
__m512 tmp7709 = _mm512_fmadd_ps(tmp7714, _mm512_set1_ps(3.2e+01f), tmp7710);
__m512 tmp7700 = _mm512_fmadd_ps(tmp7694, _mm512_set1_ps(8e+00f), tmp7701);
__m512 tmp7720 = _mm512_fmadd_ps(tmp7714, _mm512_set1_ps(8e+00f), tmp7721);
__m512 tmp7706 = _mm512_fmadd_ps(tmp7698, _mm512_set1_ps(3.2e+01f), tmp7707);
__m512 tmp7726 = _mm512_fmadd_ps(tmp7718, _mm512_set1_ps(3.2e+01f), tmp7727);
__m512 tmp7704 = _mm512_fmadd_ps(tmp7694, _mm512_set1_ps(2e+00f), tmp7705);
__m512 tmp7724 = _mm512_fmadd_ps(tmp7714, _mm512_set1_ps(2e+00f), tmp7725);
// Name the results y0..y5 of group A (out1143..out1148) and group B (out1149..out1154).
__m512 out1143 = tmp7689;
__m512 out1149 = tmp7709;
__m512 out1144 = tmp7695;
__m512 out1150 = tmp7715;
__m512 out1145 = tmp7700;
__m512 out1151 = tmp7720;
__m512 out1146 = tmp7702;
__m512 out1152 = tmp7722;
__m512 out1147 = tmp7704;
__m512 out1153 = tmp7724;
__m512 out1148 = tmp7706;
__m512 out1154 = tmp7726;
// ReLU: clamp each result vector at zero with an element-wise max against 0.
out1143 = _mm512_max_ps(_mm512_setzero_ps(), out1143);
out1149 = _mm512_max_ps(_mm512_setzero_ps(), out1149);
out1144 = _mm512_max_ps(_mm512_setzero_ps(), out1144);
out1150 = _mm512_max_ps(_mm512_setzero_ps(), out1150);
out1145 = _mm512_max_ps(_mm512_setzero_ps(), out1145);
out1151 = _mm512_max_ps(_mm512_setzero_ps(), out1151);
out1146 = _mm512_max_ps(_mm512_setzero_ps(), out1146);
out1152 = _mm512_max_ps(_mm512_setzero_ps(), out1152);
out1147 = _mm512_max_ps(_mm512_setzero_ps(), out1147);
out1153 = _mm512_max_ps(_mm512_setzero_ps(), out1153);
out1148 = _mm512_max_ps(_mm512_setzero_ps(), out1148);
out1154 = _mm512_max_ps(_mm512_setzero_ps(), out1154);
// Masked unaligned stores to datPtr13, six pairs spaced 224 apart.
// First store of a pair: mask 4095 (0xFFF) writes lanes 0..11.
// Second store, 48 further on: mask 255 (0xFF) writes only lanes 0..7.
// NOTE(review): with toW30 = 36, 4*36 + 48 + 8*4 = 224, so the narrower mask looks
// like clipping at the end of a 224-byte row — confirm that offsets are in bytes.
_mm512_mask_storeu_ps(datPtr13+0+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1143);
_mm512_mask_storeu_ps(datPtr13+48+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 255, out1149);
_mm512_mask_storeu_ps(datPtr13+224+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1144);
_mm512_mask_storeu_ps(datPtr13+272+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 255, out1150);
_mm512_mask_storeu_ps(datPtr13+448+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1145);
_mm512_mask_storeu_ps(datPtr13+496+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 255, out1151);
_mm512_mask_storeu_ps(datPtr13+672+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1146);
_mm512_mask_storeu_ps(datPtr13+720+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 255, out1152);
_mm512_mask_storeu_ps(datPtr13+896+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1147);
_mm512_mask_storeu_ps(datPtr13+944+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 255, out1153);
_mm512_mask_storeu_ps(datPtr13+1120+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1148);
_mm512_mask_storeu_ps(datPtr13+1168+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 255, out1154);
// Load 16 unaligned 512-bit vectors from sfPtr7 and regroup them by 256-bit halves.
// Same four regions as the loads at the top of this loop body (base offsets 0, 409600,
// 819200, 1228800), but at sub-offsets +256, +320, +384, +448 instead of +0..+192.
// _mm512_shuffle_f32x4 immediate 68 joins the low 256 bits of both operands and
// 238 joins their high 256 bits, so:
//   loads at +256/+384 -> in1181..in1188 (x0..x7 of group A)
//   loads at +320/+448 -> in1189..in1196 (x0..x7 of group B)
// NOTE(review): the 64-unit spacing of 16-float loads suggests sfPtr7 is a byte
// pointer — confirm at its declaration.
__m512 sf465 = _mm512_loadu_ps(sfPtr7+256+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf466 = _mm512_loadu_ps(sfPtr7+384+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1181 = _mm512_shuffle_f32x4(sf465, sf466, 68);
__m512 in1182 = _mm512_shuffle_f32x4(sf465, sf466, 238);
__m512 sf467 = _mm512_loadu_ps(sfPtr7+320+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf468 = _mm512_loadu_ps(sfPtr7+448+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1189 = _mm512_shuffle_f32x4(sf467, sf468, 68);
__m512 in1190 = _mm512_shuffle_f32x4(sf467, sf468, 238);
__m512 sf469 = _mm512_loadu_ps(sfPtr7+409856+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf470 = _mm512_loadu_ps(sfPtr7+409984+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1183 = _mm512_shuffle_f32x4(sf469, sf470, 68);
__m512 in1184 = _mm512_shuffle_f32x4(sf469, sf470, 238);
__m512 sf471 = _mm512_loadu_ps(sfPtr7+409920+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf472 = _mm512_loadu_ps(sfPtr7+410048+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1191 = _mm512_shuffle_f32x4(sf471, sf472, 68);
__m512 in1192 = _mm512_shuffle_f32x4(sf471, sf472, 238);
__m512 sf473 = _mm512_loadu_ps(sfPtr7+819456+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf474 = _mm512_loadu_ps(sfPtr7+819584+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1185 = _mm512_shuffle_f32x4(sf473, sf474, 68);
__m512 in1186 = _mm512_shuffle_f32x4(sf473, sf474, 238);
__m512 sf475 = _mm512_loadu_ps(sfPtr7+819520+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf476 = _mm512_loadu_ps(sfPtr7+819648+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1193 = _mm512_shuffle_f32x4(sf475, sf476, 68);
__m512 in1194 = _mm512_shuffle_f32x4(sf475, sf476, 238);
__m512 sf477 = _mm512_loadu_ps(sfPtr7+1229056+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf478 = _mm512_loadu_ps(sfPtr7+1229184+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1187 = _mm512_shuffle_f32x4(sf477, sf478, 68);
__m512 in1188 = _mm512_shuffle_f32x4(sf477, sf478, 238);
__m512 sf479 = _mm512_loadu_ps(sfPtr7+1229120+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf480 = _mm512_loadu_ps(sfPtr7+1229248+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1195 = _mm512_shuffle_f32x4(sf479, sf480, 68);
__m512 in1196 = _mm512_shuffle_f32x4(sf479, sf480, 238);
// First combination pass: reduces 8 vectors x0..x7 to 6 vectors y0..y5, for two
// independent groups whose statements are interleaved line by line.
//   group A: x0..x7 = in1181..in1188 -> tmp7769..tmp7774
//   group B: x0..x7 = in1189..in1196 -> tmp7775..tmp7780
// Formulas (each multiply-add is a fused _mm512_fmadd_ps):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): the coefficient pattern looks like a Winograd-style output
// transform (8 values -> 6) — confirm against the generator.
// Pairwise sums and differences shared by several outputs.
__m512 tmp7785 = _mm512_add_ps(in1182, in1183);
__m512 tmp7805 = _mm512_add_ps(in1190, in1191);
__m512 tmp7784 = _mm512_add_ps(in1184, in1185);
__m512 tmp7804 = _mm512_add_ps(in1192, in1193);
__m512 tmp7790 = _mm512_sub_ps(in1184, in1185);
__m512 tmp7810 = _mm512_sub_ps(in1192, in1193);
__m512 tmp7789 = _mm512_sub_ps(in1182, in1183);
__m512 tmp7809 = _mm512_sub_ps(in1190, in1191);
__m512 tmp7786 = _mm512_add_ps(in1186, in1187);
__m512 tmp7806 = _mm512_add_ps(in1194, in1195);
__m512 tmp7791 = _mm512_sub_ps(in1186, in1187);
__m512 tmp7811 = _mm512_sub_ps(in1194, in1195);
// Accumulation chains for y0..y5.
__m512 tmp7788 = _mm512_fmadd_ps(tmp7790, _mm512_set1_ps(2e+00f), tmp7789);
__m512 tmp7808 = _mm512_fmadd_ps(tmp7810, _mm512_set1_ps(2e+00f), tmp7809);
__m512 tmp7795 = _mm512_fmadd_ps(tmp7790, _mm512_set1_ps(8e+00f), tmp7789);
__m512 tmp7815 = _mm512_fmadd_ps(tmp7810, _mm512_set1_ps(8e+00f), tmp7809);
__m512 tmp7783 = _mm512_add_ps(tmp7784, tmp7785);
__m512 tmp7803 = _mm512_add_ps(tmp7804, tmp7805);
__m512 tmp7787 = _mm512_fmadd_ps(tmp7791, _mm512_set1_ps(1.6e+01f), tmp7788);
__m512 tmp7807 = _mm512_fmadd_ps(tmp7811, _mm512_set1_ps(1.6e+01f), tmp7808);
__m512 tmp7794 = _mm512_fmadd_ps(tmp7791, _mm512_set1_ps(4e+00f), tmp7795);
__m512 tmp7814 = _mm512_fmadd_ps(tmp7811, _mm512_set1_ps(4e+00f), tmp7815);
__m512 tmp7800 = _mm512_add_ps(tmp7791, tmp7789);
__m512 tmp7820 = _mm512_add_ps(tmp7811, tmp7809);
__m512 tmp7793 = _mm512_fmadd_ps(tmp7784, _mm512_set1_ps(4e+00f), tmp7785);
__m512 tmp7813 = _mm512_fmadd_ps(tmp7804, _mm512_set1_ps(4e+00f), tmp7805);
__m512 tmp7797 = _mm512_fmadd_ps(tmp7784, _mm512_set1_ps(1.6e+01f), tmp7785);
__m512 tmp7817 = _mm512_fmadd_ps(tmp7804, _mm512_set1_ps(1.6e+01f), tmp7805);
__m512 tmp7782 = _mm512_add_ps(tmp7783, in1181);
__m512 tmp7802 = _mm512_add_ps(tmp7803, in1189);
__m512 tmp7799 = _mm512_add_ps(tmp7800, in1188);
__m512 tmp7819 = _mm512_add_ps(tmp7820, in1196);
__m512 tmp7781 = _mm512_fmadd_ps(tmp7786, _mm512_set1_ps(3.2e+01f), tmp7782);
__m512 tmp7801 = _mm512_fmadd_ps(tmp7806, _mm512_set1_ps(3.2e+01f), tmp7802);
__m512 tmp7792 = _mm512_fmadd_ps(tmp7786, _mm512_set1_ps(8e+00f), tmp7793);
__m512 tmp7812 = _mm512_fmadd_ps(tmp7806, _mm512_set1_ps(8e+00f), tmp7813);
__m512 tmp7798 = _mm512_fmadd_ps(tmp7790, _mm512_set1_ps(3.2e+01f), tmp7799);
__m512 tmp7818 = _mm512_fmadd_ps(tmp7810, _mm512_set1_ps(3.2e+01f), tmp7819);
__m512 tmp7796 = _mm512_fmadd_ps(tmp7786, _mm512_set1_ps(2e+00f), tmp7797);
__m512 tmp7816 = _mm512_fmadd_ps(tmp7806, _mm512_set1_ps(2e+00f), tmp7817);
// Collect y0..y5 of group A (tmp7769..tmp7774) and group B (tmp7775..tmp7780)
// as the 12 rows consumed by the transpose that follows.
__m512 tmp7769 = tmp7781;
__m512 tmp7775 = tmp7801;
__m512 tmp7770 = tmp7787;
__m512 tmp7776 = tmp7807;
__m512 tmp7771 = tmp7792;
__m512 tmp7777 = tmp7812;
__m512 tmp7772 = tmp7794;
__m512 tmp7778 = tmp7814;
__m512 tmp7773 = tmp7796;
__m512 tmp7779 = tmp7816;
__m512 tmp7774 = tmp7798;
__m512 tmp7780 = tmp7818;
// Transpose step. Treats tmp7769..tmp7780 as 12 rows of 16 floats and produces
// 16 vectors, one per original lane index ("column").
// Stage 1: interleave adjacent row pairs inside every 128-bit lane
// (unpacklo -> elements 0,1 of the lane; unpackhi -> elements 2,3).
__m512 tmp7865 = _mm512_unpacklo_ps(tmp7769, tmp7770);
__m512 tmp7866 = _mm512_unpackhi_ps(tmp7769, tmp7770);
__m512 tmp7867 = _mm512_unpacklo_ps(tmp7771, tmp7772);
__m512 tmp7868 = _mm512_unpackhi_ps(tmp7771, tmp7772);
__m512 tmp7869 = _mm512_unpacklo_ps(tmp7773, tmp7774);
__m512 tmp7870 = _mm512_unpackhi_ps(tmp7773, tmp7774);
__m512 tmp7871 = _mm512_unpacklo_ps(tmp7775, tmp7776);
__m512 tmp7872 = _mm512_unpackhi_ps(tmp7775, tmp7776);
__m512 tmp7873 = _mm512_unpacklo_ps(tmp7777, tmp7778);
__m512 tmp7874 = _mm512_unpackhi_ps(tmp7777, tmp7778);
__m512 tmp7875 = _mm512_unpacklo_ps(tmp7779, tmp7780);
__m512 tmp7876 = _mm512_unpackhi_ps(tmp7779, tmp7780);
// Stage 2: merge two row pairs. Immediate 68 keeps the low 64 bits of each
// operand's 128-bit lanes, 238 keeps the high 64 bits, so every 128-bit lane
// now holds one column element taken from four consecutive rows.
__m512 tmp7877 = _mm512_shuffle_ps(tmp7865, tmp7867, 68);
__m512 tmp7878 = _mm512_shuffle_ps(tmp7865, tmp7867, 238);
__m512 tmp7879 = _mm512_shuffle_ps(tmp7866, tmp7868, 68);
__m512 tmp7880 = _mm512_shuffle_ps(tmp7866, tmp7868, 238);
__m512 tmp7881 = _mm512_shuffle_ps(tmp7869, tmp7871, 68);
__m512 tmp7882 = _mm512_shuffle_ps(tmp7869, tmp7871, 238);
__m512 tmp7883 = _mm512_shuffle_ps(tmp7870, tmp7872, 68);
__m512 tmp7884 = _mm512_shuffle_ps(tmp7870, tmp7872, 238);
__m512 tmp7885 = _mm512_shuffle_ps(tmp7873, tmp7875, 68);
__m512 tmp7886 = _mm512_shuffle_ps(tmp7873, tmp7875, 238);
__m512 tmp7887 = _mm512_shuffle_ps(tmp7874, tmp7876, 68);
__m512 tmp7888 = _mm512_shuffle_ps(tmp7874, tmp7876, 238);
// Stage 3: regroup whole 128-bit lanes. Immediate 136 selects lanes 0 and 2
// of each operand, 221 selects lanes 1 and 3. Rows 0..3 are paired with rows 4..7.
__m512 tmp7889 = _mm512_shuffle_f32x4(tmp7877, tmp7881, 136);
__m512 tmp7890 = _mm512_shuffle_f32x4(tmp7877, tmp7881, 221);
__m512 tmp7891 = _mm512_shuffle_f32x4(tmp7878, tmp7882, 136);
__m512 tmp7892 = _mm512_shuffle_f32x4(tmp7878, tmp7882, 221);
__m512 tmp7893 = _mm512_shuffle_f32x4(tmp7879, tmp7883, 136);
__m512 tmp7894 = _mm512_shuffle_f32x4(tmp7879, tmp7883, 221);
__m512 tmp7895 = _mm512_shuffle_f32x4(tmp7880, tmp7884, 136);
__m512 tmp7896 = _mm512_shuffle_f32x4(tmp7880, tmp7884, 221);
// Rows 8..11 have no partner rows, so the same vector is passed as both operands.
__m512 tmp7897 = _mm512_shuffle_f32x4(tmp7885, tmp7885, 136);
__m512 tmp7898 = _mm512_shuffle_f32x4(tmp7885, tmp7885, 221);
__m512 tmp7899 = _mm512_shuffle_f32x4(tmp7886, tmp7886, 136);
__m512 tmp7900 = _mm512_shuffle_f32x4(tmp7886, tmp7886, 221);
__m512 tmp7901 = _mm512_shuffle_f32x4(tmp7887, tmp7887, 136);
__m512 tmp7902 = _mm512_shuffle_f32x4(tmp7887, tmp7887, 221);
__m512 tmp7903 = _mm512_shuffle_f32x4(tmp7888, tmp7888, 136);
__m512 tmp7904 = _mm512_shuffle_f32x4(tmp7888, tmp7888, 221);
// Stage 4: final lane gather. Afterwards tmp7769..tmp7776 hold columns 0..7,
// tmp7777..tmp7780 hold columns 8..11 and tmp7821..tmp7824 hold columns 12..15.
// In each result lanes 0..11 carry rows 0..11; lanes 12..15 repeat lanes 8..11.
tmp7769 = _mm512_shuffle_f32x4(tmp7889, tmp7897, 136);
tmp7777 = _mm512_shuffle_f32x4(tmp7889, tmp7897, 221);
tmp7770 = _mm512_shuffle_f32x4(tmp7891, tmp7899, 136);
tmp7778 = _mm512_shuffle_f32x4(tmp7891, tmp7899, 221);
tmp7771 = _mm512_shuffle_f32x4(tmp7893, tmp7901, 136);
tmp7779 = _mm512_shuffle_f32x4(tmp7893, tmp7901, 221);
tmp7772 = _mm512_shuffle_f32x4(tmp7895, tmp7903, 136);
tmp7780 = _mm512_shuffle_f32x4(tmp7895, tmp7903, 221);
tmp7773 = _mm512_shuffle_f32x4(tmp7890, tmp7898, 136);
__m512 tmp7821 = _mm512_shuffle_f32x4(tmp7890, tmp7898, 221);
tmp7774 = _mm512_shuffle_f32x4(tmp7892, tmp7900, 136);
__m512 tmp7822 = _mm512_shuffle_f32x4(tmp7892, tmp7900, 221);
tmp7775 = _mm512_shuffle_f32x4(tmp7894, tmp7902, 136);
__m512 tmp7823 = _mm512_shuffle_f32x4(tmp7894, tmp7902, 221);
tmp7776 = _mm512_shuffle_f32x4(tmp7896, tmp7904, 136);
__m512 tmp7824 = _mm512_shuffle_f32x4(tmp7896, tmp7904, 221);
// Second combination pass: reduces 8 vectors x0..x7 to 6 vectors y0..y5, for two
// independent groups whose statements are interleaved line by line.
//   group A: x0..x7 = tmp7769..tmp7776                    -> out1155..out1160
//   group B: x0..x7 = tmp7777..tmp7780, tmp7821..tmp7824  -> out1161..out1166
// Formulas (each multiply-add is a fused _mm512_fmadd_ps):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): the coefficient pattern looks like a Winograd-style output
// transform (8 values -> 6) — confirm against the generator.
// Pairwise sums and differences shared by several outputs.
__m512 tmp7829 = _mm512_add_ps(tmp7770, tmp7771);
__m512 tmp7849 = _mm512_add_ps(tmp7778, tmp7779);
__m512 tmp7828 = _mm512_add_ps(tmp7772, tmp7773);
__m512 tmp7848 = _mm512_add_ps(tmp7780, tmp7821);
__m512 tmp7834 = _mm512_sub_ps(tmp7772, tmp7773);
__m512 tmp7854 = _mm512_sub_ps(tmp7780, tmp7821);
__m512 tmp7833 = _mm512_sub_ps(tmp7770, tmp7771);
__m512 tmp7853 = _mm512_sub_ps(tmp7778, tmp7779);
__m512 tmp7830 = _mm512_add_ps(tmp7774, tmp7775);
__m512 tmp7850 = _mm512_add_ps(tmp7822, tmp7823);
__m512 tmp7835 = _mm512_sub_ps(tmp7774, tmp7775);
__m512 tmp7855 = _mm512_sub_ps(tmp7822, tmp7823);
// Accumulation chains for y0..y5.
__m512 tmp7832 = _mm512_fmadd_ps(tmp7834, _mm512_set1_ps(2e+00f), tmp7833);
__m512 tmp7852 = _mm512_fmadd_ps(tmp7854, _mm512_set1_ps(2e+00f), tmp7853);
__m512 tmp7839 = _mm512_fmadd_ps(tmp7834, _mm512_set1_ps(8e+00f), tmp7833);
__m512 tmp7859 = _mm512_fmadd_ps(tmp7854, _mm512_set1_ps(8e+00f), tmp7853);
__m512 tmp7827 = _mm512_add_ps(tmp7828, tmp7829);
__m512 tmp7847 = _mm512_add_ps(tmp7848, tmp7849);
__m512 tmp7831 = _mm512_fmadd_ps(tmp7835, _mm512_set1_ps(1.6e+01f), tmp7832);
__m512 tmp7851 = _mm512_fmadd_ps(tmp7855, _mm512_set1_ps(1.6e+01f), tmp7852);
__m512 tmp7838 = _mm512_fmadd_ps(tmp7835, _mm512_set1_ps(4e+00f), tmp7839);
__m512 tmp7858 = _mm512_fmadd_ps(tmp7855, _mm512_set1_ps(4e+00f), tmp7859);
__m512 tmp7844 = _mm512_add_ps(tmp7835, tmp7833);
__m512 tmp7864 = _mm512_add_ps(tmp7855, tmp7853);
__m512 tmp7837 = _mm512_fmadd_ps(tmp7828, _mm512_set1_ps(4e+00f), tmp7829);
__m512 tmp7857 = _mm512_fmadd_ps(tmp7848, _mm512_set1_ps(4e+00f), tmp7849);
__m512 tmp7841 = _mm512_fmadd_ps(tmp7828, _mm512_set1_ps(1.6e+01f), tmp7829);
__m512 tmp7861 = _mm512_fmadd_ps(tmp7848, _mm512_set1_ps(1.6e+01f), tmp7849);
__m512 tmp7826 = _mm512_add_ps(tmp7827, tmp7769);
__m512 tmp7846 = _mm512_add_ps(tmp7847, tmp7777);
__m512 tmp7843 = _mm512_add_ps(tmp7844, tmp7776);
__m512 tmp7863 = _mm512_add_ps(tmp7864, tmp7824);
__m512 tmp7825 = _mm512_fmadd_ps(tmp7830, _mm512_set1_ps(3.2e+01f), tmp7826);
__m512 tmp7845 = _mm512_fmadd_ps(tmp7850, _mm512_set1_ps(3.2e+01f), tmp7846);
__m512 tmp7836 = _mm512_fmadd_ps(tmp7830, _mm512_set1_ps(8e+00f), tmp7837);
__m512 tmp7856 = _mm512_fmadd_ps(tmp7850, _mm512_set1_ps(8e+00f), tmp7857);
__m512 tmp7842 = _mm512_fmadd_ps(tmp7834, _mm512_set1_ps(3.2e+01f), tmp7843);
__m512 tmp7862 = _mm512_fmadd_ps(tmp7854, _mm512_set1_ps(3.2e+01f), tmp7863);
__m512 tmp7840 = _mm512_fmadd_ps(tmp7830, _mm512_set1_ps(2e+00f), tmp7841);
__m512 tmp7860 = _mm512_fmadd_ps(tmp7850, _mm512_set1_ps(2e+00f), tmp7861);
// Name the results y0..y5 of group A (out1155..out1160) and group B (out1161..out1166).
__m512 out1155 = tmp7825;
__m512 out1161 = tmp7845;
__m512 out1156 = tmp7831;
__m512 out1162 = tmp7851;
__m512 out1157 = tmp7836;
__m512 out1163 = tmp7856;
__m512 out1158 = tmp7838;
__m512 out1164 = tmp7858;
__m512 out1159 = tmp7840;
__m512 out1165 = tmp7860;
__m512 out1160 = tmp7842;
__m512 out1166 = tmp7862;
out1155 = _mm512_max_ps(_mm512_setzero_ps(), out1155);
out1161 = _mm512_max_ps(_mm512_setzero_ps(), out1161);
out1156 = _mm512_max_ps(_mm512_setzero_ps(), out1156);
out1162 = _mm512_max_ps(_mm512_setzero_ps(), out1162);
out1157 = _mm512_max_ps(_mm512_setzero_ps(), out1157);
out1163 = _mm512_max_ps(_mm512_setzero_ps(), out1163);
out1158 = _mm512_max_ps(_mm512_setzero_ps(), out1158);
out1164 = _mm512_max_ps(_mm512_setzero_ps(), out1164);
out1159 = _mm512_max_ps(_mm512_setzero_ps(), out1159);
out1165 = _mm512_max_ps(_mm512_setzero_ps(), out1165);
out1160 = _mm512_max_ps(_mm512_setzero_ps(), out1160);
out1166 = _mm512_max_ps(_mm512_setzero_ps(), out1166);
_mm512_mask_storeu_ps(datPtr13+1200+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1155);
_mm512_mask_storeu_ps(datPtr13+12608+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1161);
_mm512_mask_storeu_ps(datPtr13+1424+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1156);
_mm512_mask_storeu_ps(datPtr13+12832+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1162);
_mm512_mask_storeu_ps(datPtr13+1648+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1157);
_mm512_mask_storeu_ps(datPtr13+13056+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1163);
_mm512_mask_storeu_ps(datPtr13+1872+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1158);
_mm512_mask_storeu_ps(datPtr13+13280+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1164);
_mm512_mask_storeu_ps(datPtr13+2096+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1159);
_mm512_mask_storeu_ps(datPtr13+13504+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1165);
_mm512_mask_storeu_ps(datPtr13+2320+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1160);
_mm512_mask_storeu_ps(datPtr13+13728+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1166);
__m512 sf481 = _mm512_loadu_ps(sfPtr7+512+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf482 = _mm512_loadu_ps(sfPtr7+640+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1197 = _mm512_shuffle_f32x4(sf481, sf482, 68);
__m512 in1198 = _mm512_shuffle_f32x4(sf481, sf482, 238);
__m512 sf483 = _mm512_loadu_ps(sfPtr7+576+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf484 = _mm512_loadu_ps(sfPtr7+704+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1205 = _mm512_shuffle_f32x4(sf483, sf484, 68);
__m512 in1206 = _mm512_shuffle_f32x4(sf483, sf484, 238);
__m512 sf485 = _mm512_loadu_ps(sfPtr7+410112+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf486 = _mm512_loadu_ps(sfPtr7+410240+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1199 = _mm512_shuffle_f32x4(sf485, sf486, 68);
__m512 in1200 = _mm512_shuffle_f32x4(sf485, sf486, 238);
__m512 sf487 = _mm512_loadu_ps(sfPtr7+410176+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf488 = _mm512_loadu_ps(sfPtr7+410304+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1207 = _mm512_shuffle_f32x4(sf487, sf488, 68);
__m512 in1208 = _mm512_shuffle_f32x4(sf487, sf488, 238);
__m512 sf489 = _mm512_loadu_ps(sfPtr7+819712+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf490 = _mm512_loadu_ps(sfPtr7+819840+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1201 = _mm512_shuffle_f32x4(sf489, sf490, 68);
__m512 in1202 = _mm512_shuffle_f32x4(sf489, sf490, 238);
__m512 sf491 = _mm512_loadu_ps(sfPtr7+819776+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf492 = _mm512_loadu_ps(sfPtr7+819904+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1209 = _mm512_shuffle_f32x4(sf491, sf492, 68);
__m512 in1210 = _mm512_shuffle_f32x4(sf491, sf492, 238);
__m512 sf493 = _mm512_loadu_ps(sfPtr7+1229312+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf494 = _mm512_loadu_ps(sfPtr7+1229440+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1203 = _mm512_shuffle_f32x4(sf493, sf494, 68);
__m512 in1204 = _mm512_shuffle_f32x4(sf493, sf494, 238);
__m512 sf495 = _mm512_loadu_ps(sfPtr7+1229376+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 sf496 = _mm512_loadu_ps(sfPtr7+1229504+1638400*i28+24576*j23+1536*k91+768*l30);
__m512 in1211 = _mm512_shuffle_f32x4(sf495, sf496, 68);
__m512 in1212 = _mm512_shuffle_f32x4(sf495, sf496, 238);
__m512 tmp7921 = _mm512_add_ps(in1198, in1199);
__m512 tmp7941 = _mm512_add_ps(in1206, in1207);
__m512 tmp7920 = _mm512_add_ps(in1200, in1201);
__m512 tmp7940 = _mm512_add_ps(in1208, in1209);
__m512 tmp7926 = _mm512_sub_ps(in1200, in1201);
__m512 tmp7946 = _mm512_sub_ps(in1208, in1209);
__m512 tmp7925 = _mm512_sub_ps(in1198, in1199);
__m512 tmp7945 = _mm512_sub_ps(in1206, in1207);
__m512 tmp7922 = _mm512_add_ps(in1202, in1203);
__m512 tmp7942 = _mm512_add_ps(in1210, in1211);
__m512 tmp7927 = _mm512_sub_ps(in1202, in1203);
__m512 tmp7947 = _mm512_sub_ps(in1210, in1211);
__m512 tmp7924 = _mm512_fmadd_ps(tmp7926, _mm512_set1_ps(2e+00f), tmp7925);
__m512 tmp7944 = _mm512_fmadd_ps(tmp7946, _mm512_set1_ps(2e+00f), tmp7945);
__m512 tmp7931 = _mm512_fmadd_ps(tmp7926, _mm512_set1_ps(8e+00f), tmp7925);
__m512 tmp7951 = _mm512_fmadd_ps(tmp7946, _mm512_set1_ps(8e+00f), tmp7945);
__m512 tmp7919 = _mm512_add_ps(tmp7920, tmp7921);
__m512 tmp7939 = _mm512_add_ps(tmp7940, tmp7941);
__m512 tmp7923 = _mm512_fmadd_ps(tmp7927, _mm512_set1_ps(1.6e+01f), tmp7924);
__m512 tmp7943 = _mm512_fmadd_ps(tmp7947, _mm512_set1_ps(1.6e+01f), tmp7944);
__m512 tmp7930 = _mm512_fmadd_ps(tmp7927, _mm512_set1_ps(4e+00f), tmp7931);
__m512 tmp7950 = _mm512_fmadd_ps(tmp7947, _mm512_set1_ps(4e+00f), tmp7951);
__m512 tmp7936 = _mm512_add_ps(tmp7927, tmp7925);
__m512 tmp7956 = _mm512_add_ps(tmp7947, tmp7945);
__m512 tmp7929 = _mm512_fmadd_ps(tmp7920, _mm512_set1_ps(4e+00f), tmp7921);
__m512 tmp7949 = _mm512_fmadd_ps(tmp7940, _mm512_set1_ps(4e+00f), tmp7941);
__m512 tmp7933 = _mm512_fmadd_ps(tmp7920, _mm512_set1_ps(1.6e+01f), tmp7921);
__m512 tmp7953 = _mm512_fmadd_ps(tmp7940, _mm512_set1_ps(1.6e+01f), tmp7941);
__m512 tmp7918 = _mm512_add_ps(tmp7919, in1197);
__m512 tmp7938 = _mm512_add_ps(tmp7939, in1205);
__m512 tmp7935 = _mm512_add_ps(tmp7936, in1204);
__m512 tmp7955 = _mm512_add_ps(tmp7956, in1212);
__m512 tmp7917 = _mm512_fmadd_ps(tmp7922, _mm512_set1_ps(3.2e+01f), tmp7918);
__m512 tmp7937 = _mm512_fmadd_ps(tmp7942, _mm512_set1_ps(3.2e+01f), tmp7938);
__m512 tmp7928 = _mm512_fmadd_ps(tmp7922, _mm512_set1_ps(8e+00f), tmp7929);
__m512 tmp7948 = _mm512_fmadd_ps(tmp7942, _mm512_set1_ps(8e+00f), tmp7949);
__m512 tmp7934 = _mm512_fmadd_ps(tmp7926, _mm512_set1_ps(3.2e+01f), tmp7935);
__m512 tmp7954 = _mm512_fmadd_ps(tmp7946, _mm512_set1_ps(3.2e+01f), tmp7955);
__m512 tmp7932 = _mm512_fmadd_ps(tmp7922, _mm512_set1_ps(2e+00f), tmp7933);
__m512 tmp7952 = _mm512_fmadd_ps(tmp7942, _mm512_set1_ps(2e+00f), tmp7953);
__m512 tmp7905 = tmp7917;
__m512 tmp7911 = tmp7937;
__m512 tmp7906 = tmp7923;
__m512 tmp7912 = tmp7943;
__m512 tmp7907 = tmp7928;
__m512 tmp7913 = tmp7948;
__m512 tmp7908 = tmp7930;
__m512 tmp7914 = tmp7950;
__m512 tmp7909 = tmp7932;
__m512 tmp7915 = tmp7952;
__m512 tmp7910 = tmp7934;
__m512 tmp7916 = tmp7954;
__m512 tmp8001 = _mm512_unpacklo_ps(tmp7905, tmp7906);
__m512 tmp8002 = _mm512_unpackhi_ps(tmp7905, tmp7906);
__m512 tmp8003 = _mm512_unpacklo_ps(tmp7907, tmp7908);
__m512 tmp8004 = _mm512_unpackhi_ps(tmp7907, tmp7908);
__m512 tmp8005 = _mm512_unpacklo_ps(tmp7909, tmp7910);
__m512 tmp8006 = _mm512_unpackhi_ps(tmp7909, tmp7910);
__m512 tmp8007 = _mm512_unpacklo_ps(tmp7911, tmp7912);
__m512 tmp8008 = _mm512_unpackhi_ps(tmp7911, tmp7912);
__m512 tmp8009 = _mm512_unpacklo_ps(tmp7913, tmp7914);
__m512 tmp8010 = _mm512_unpackhi_ps(tmp7913, tmp7914);
__m512 tmp8011 = _mm512_unpacklo_ps(tmp7915, tmp7916);
__m512 tmp8012 = _mm512_unpackhi_ps(tmp7915, tmp7916);
__m512 tmp8013 = _mm512_shuffle_ps(tmp8001, tmp8003, 68);
__m512 tmp8014 = _mm512_shuffle_ps(tmp8001, tmp8003, 238);
__m512 tmp8015 = _mm512_shuffle_ps(tmp8002, tmp8004, 68);
__m512 tmp8016 = _mm512_shuffle_ps(tmp8002, tmp8004, 238);
__m512 tmp8017 = _mm512_shuffle_ps(tmp8005, tmp8007, 68);
__m512 tmp8018 = _mm512_shuffle_ps(tmp8005, tmp8007, 238);
__m512 tmp8019 = _mm512_shuffle_ps(tmp8006, tmp8008, 68);
__m512 tmp8020 = _mm512_shuffle_ps(tmp8006, tmp8008, 238);
__m512 tmp8021 = _mm512_shuffle_ps(tmp8009, tmp8011, 68);
__m512 tmp8022 = _mm512_shuffle_ps(tmp8009, tmp8011, 238);
__m512 tmp8023 = _mm512_shuffle_ps(tmp8010, tmp8012, 68);
__m512 tmp8024 = _mm512_shuffle_ps(tmp8010, tmp8012, 238);
__m512 tmp8025 = _mm512_shuffle_f32x4(tmp8013, tmp8017, 136);
__m512 tmp8026 = _mm512_shuffle_f32x4(tmp8013, tmp8017, 221);
__m512 tmp8027 = _mm512_shuffle_f32x4(tmp8014, tmp8018, 136);
__m512 tmp8028 = _mm512_shuffle_f32x4(tmp8014, tmp8018, 221);
__m512 tmp8029 = _mm512_shuffle_f32x4(tmp8015, tmp8019, 136);
__m512 tmp8030 = _mm512_shuffle_f32x4(tmp8015, tmp8019, 221);
__m512 tmp8031 = _mm512_shuffle_f32x4(tmp8016, tmp8020, 136);
__m512 tmp8032 = _mm512_shuffle_f32x4(tmp8016, tmp8020, 221);
__m512 tmp8033 = _mm512_shuffle_f32x4(tmp8021, tmp8021, 136);
__m512 tmp8034 = _mm512_shuffle_f32x4(tmp8021, tmp8021, 221);
__m512 tmp8035 = _mm512_shuffle_f32x4(tmp8022, tmp8022, 136);
__m512 tmp8036 = _mm512_shuffle_f32x4(tmp8022, tmp8022, 221);
__m512 tmp8037 = _mm512_shuffle_f32x4(tmp8023, tmp8023, 136);
__m512 tmp8038 = _mm512_shuffle_f32x4(tmp8023, tmp8023, 221);
__m512 tmp8039 = _mm512_shuffle_f32x4(tmp8024, tmp8024, 136);
__m512 tmp8040 = _mm512_shuffle_f32x4(tmp8024, tmp8024, 221);
tmp7905 = _mm512_shuffle_f32x4(tmp8025, tmp8033, 136);
tmp7913 = _mm512_shuffle_f32x4(tmp8025, tmp8033, 221);
tmp7906 = _mm512_shuffle_f32x4(tmp8027, tmp8035, 136);
tmp7914 = _mm512_shuffle_f32x4(tmp8027, tmp8035, 221);
tmp7907 = _mm512_shuffle_f32x4(tmp8029, tmp8037, 136);
tmp7915 = _mm512_shuffle_f32x4(tmp8029, tmp8037, 221);
tmp7908 = _mm512_shuffle_f32x4(tmp8031, tmp8039, 136);
tmp7916 = _mm512_shuffle_f32x4(tmp8031, tmp8039, 221);
tmp7909 = _mm512_shuffle_f32x4(tmp8026, tmp8034, 136);
__m512 tmp7957 = _mm512_shuffle_f32x4(tmp8026, tmp8034, 221);
tmp7910 = _mm512_shuffle_f32x4(tmp8028, tmp8036, 136);
__m512 tmp7958 = _mm512_shuffle_f32x4(tmp8028, tmp8036, 221);
tmp7911 = _mm512_shuffle_f32x4(tmp8030, tmp8038, 136);
__m512 tmp7959 = _mm512_shuffle_f32x4(tmp8030, tmp8038, 221);
tmp7912 = _mm512_shuffle_f32x4(tmp8032, tmp8040, 136);
__m512 tmp7960 = _mm512_shuffle_f32x4(tmp8032, tmp8040, 221);
__m512 tmp7965 = _mm512_add_ps(tmp7906, tmp7907);
__m512 tmp7985 = _mm512_add_ps(tmp7914, tmp7915);
__m512 tmp7964 = _mm512_add_ps(tmp7908, tmp7909);
__m512 tmp7984 = _mm512_add_ps(tmp7916, tmp7957);
__m512 tmp7970 = _mm512_sub_ps(tmp7908, tmp7909);
__m512 tmp7990 = _mm512_sub_ps(tmp7916, tmp7957);
__m512 tmp7969 = _mm512_sub_ps(tmp7906, tmp7907);
__m512 tmp7989 = _mm512_sub_ps(tmp7914, tmp7915);
__m512 tmp7966 = _mm512_add_ps(tmp7910, tmp7911);
__m512 tmp7986 = _mm512_add_ps(tmp7958, tmp7959);
__m512 tmp7971 = _mm512_sub_ps(tmp7910, tmp7911);
__m512 tmp7991 = _mm512_sub_ps(tmp7958, tmp7959);
__m512 tmp7968 = _mm512_fmadd_ps(tmp7970, _mm512_set1_ps(2e+00f), tmp7969);
__m512 tmp7988 = _mm512_fmadd_ps(tmp7990, _mm512_set1_ps(2e+00f), tmp7989);
__m512 tmp7975 = _mm512_fmadd_ps(tmp7970, _mm512_set1_ps(8e+00f), tmp7969);
__m512 tmp7995 = _mm512_fmadd_ps(tmp7990, _mm512_set1_ps(8e+00f), tmp7989);
__m512 tmp7963 = _mm512_add_ps(tmp7964, tmp7965);
__m512 tmp7983 = _mm512_add_ps(tmp7984, tmp7985);
__m512 tmp7967 = _mm512_fmadd_ps(tmp7971, _mm512_set1_ps(1.6e+01f), tmp7968);
__m512 tmp7987 = _mm512_fmadd_ps(tmp7991, _mm512_set1_ps(1.6e+01f), tmp7988);
__m512 tmp7974 = _mm512_fmadd_ps(tmp7971, _mm512_set1_ps(4e+00f), tmp7975);
__m512 tmp7994 = _mm512_fmadd_ps(tmp7991, _mm512_set1_ps(4e+00f), tmp7995);
__m512 tmp7980 = _mm512_add_ps(tmp7971, tmp7969);
__m512 tmp8000 = _mm512_add_ps(tmp7991, tmp7989);
__m512 tmp7973 = _mm512_fmadd_ps(tmp7964, _mm512_set1_ps(4e+00f), tmp7965);
__m512 tmp7993 = _mm512_fmadd_ps(tmp7984, _mm512_set1_ps(4e+00f), tmp7985);
__m512 tmp7977 = _mm512_fmadd_ps(tmp7964, _mm512_set1_ps(1.6e+01f), tmp7965);
__m512 tmp7997 = _mm512_fmadd_ps(tmp7984, _mm512_set1_ps(1.6e+01f), tmp7985);
__m512 tmp7962 = _mm512_add_ps(tmp7963, tmp7905);
__m512 tmp7982 = _mm512_add_ps(tmp7983, tmp7913);
__m512 tmp7979 = _mm512_add_ps(tmp7980, tmp7912);
__m512 tmp7999 = _mm512_add_ps(tmp8000, tmp7960);
__m512 tmp7961 = _mm512_fmadd_ps(tmp7966, _mm512_set1_ps(3.2e+01f), tmp7962);
__m512 tmp7981 = _mm512_fmadd_ps(tmp7986, _mm512_set1_ps(3.2e+01f), tmp7982);
__m512 tmp7972 = _mm512_fmadd_ps(tmp7966, _mm512_set1_ps(8e+00f), tmp7973);
__m512 tmp7992 = _mm512_fmadd_ps(tmp7986, _mm512_set1_ps(8e+00f), tmp7993);
__m512 tmp7978 = _mm512_fmadd_ps(tmp7970, _mm512_set1_ps(3.2e+01f), tmp7979);
__m512 tmp7998 = _mm512_fmadd_ps(tmp7990, _mm512_set1_ps(3.2e+01f), tmp7999);
__m512 tmp7976 = _mm512_fmadd_ps(tmp7966, _mm512_set1_ps(2e+00f), tmp7977);
__m512 tmp7996 = _mm512_fmadd_ps(tmp7986, _mm512_set1_ps(2e+00f), tmp7997);
__m512 out1167 = tmp7961;
__m512 out1173 = tmp7981;
__m512 out1168 = tmp7967;
__m512 out1174 = tmp7987;
__m512 out1169 = tmp7972;
__m512 out1175 = tmp7992;
__m512 out1170 = tmp7974;
__m512 out1176 = tmp7994;
__m512 out1171 = tmp7976;
__m512 out1177 = tmp7996;
__m512 out1172 = tmp7978;
__m512 out1178 = tmp7998;
out1167 = _mm512_max_ps(_mm512_setzero_ps(), out1167);
out1173 = _mm512_max_ps(_mm512_setzero_ps(), out1173);
out1168 = _mm512_max_ps(_mm512_setzero_ps(), out1168);
out1174 = _mm512_max_ps(_mm512_setzero_ps(), out1174);
out1169 = _mm512_max_ps(_mm512_setzero_ps(), out1169);
out1175 = _mm512_max_ps(_mm512_setzero_ps(), out1175);
out1170 = _mm512_max_ps(_mm512_setzero_ps(), out1170);
out1176 = _mm512_max_ps(_mm512_setzero_ps(), out1176);
out1171 = _mm512_max_ps(_mm512_setzero_ps(), out1171);
out1177 = _mm512_max_ps(_mm512_setzero_ps(), out1177);
out1172 = _mm512_max_ps(_mm512_setzero_ps(), out1172);
out1178 = _mm512_max_ps(_mm512_setzero_ps(), out1178);
_mm512_mask_storeu_ps(datPtr13+12656+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 255, out1167);
_mm512_mask_storeu_ps(datPtr13+13808+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1173);
_mm512_mask_storeu_ps(datPtr13+12880+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 255, out1168);
_mm512_mask_storeu_ps(datPtr13+14032+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1174);
_mm512_mask_storeu_ps(datPtr13+13104+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 255, out1169);
_mm512_mask_storeu_ps(datPtr13+14256+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1175);
_mm512_mask_storeu_ps(datPtr13+13328+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 255, out1170);
_mm512_mask_storeu_ps(datPtr13+14480+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1176);
_mm512_mask_storeu_ps(datPtr13+13552+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 255, out1171);
_mm512_mask_storeu_ps(datPtr13+14704+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1177);
_mm512_mask_storeu_ps(datPtr13+13776+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 255, out1172);
_mm512_mask_storeu_ps(datPtr13+14928+806912*i28+224*toH30+4*toW30+50432*k91+25216*l30, 4095, out1178);
}
}
if (j23 >= last6) return;
++j23;
j23 = 2;
}
if (j23 < 15) {
ptrdiff_t rel17 = (size_t)(j23-2)%5;
ptrdiff_t base17 = 6+(size_t)(j23-2)/5*18;
for (; ; rel17 = 0, base17 += 18) {
if (rel17 < 2) {
if (rel17 < 1) {
ptrdiff_t toH31 = base17+0;
ptrdiff_t toW31 = 12;
ptrdiff_t k92 = 16*w46;
for (; k92 != 16; ++k92) {
ptrdiff_t l31 = 0;
for (; l31 != 2; ++l31) {
__m512 sf497 = _mm512_loadu_ps(sfPtr7+0+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf498 = _mm512_loadu_ps(sfPtr7+128+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1213 = _mm512_shuffle_f32x4(sf497, sf498, 68);
__m512 in1214 = _mm512_shuffle_f32x4(sf497, sf498, 238);
__m512 sf499 = _mm512_loadu_ps(sfPtr7+64+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf500 = _mm512_loadu_ps(sfPtr7+192+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1221 = _mm512_shuffle_f32x4(sf499, sf500, 68);
__m512 in1222 = _mm512_shuffle_f32x4(sf499, sf500, 238);
__m512 sf501 = _mm512_loadu_ps(sfPtr7+409600+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf502 = _mm512_loadu_ps(sfPtr7+409728+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1215 = _mm512_shuffle_f32x4(sf501, sf502, 68);
__m512 in1216 = _mm512_shuffle_f32x4(sf501, sf502, 238);
__m512 sf503 = _mm512_loadu_ps(sfPtr7+409664+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf504 = _mm512_loadu_ps(sfPtr7+409792+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1223 = _mm512_shuffle_f32x4(sf503, sf504, 68);
__m512 in1224 = _mm512_shuffle_f32x4(sf503, sf504, 238);
__m512 sf505 = _mm512_loadu_ps(sfPtr7+819200+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf506 = _mm512_loadu_ps(sfPtr7+819328+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1217 = _mm512_shuffle_f32x4(sf505, sf506, 68);
__m512 in1218 = _mm512_shuffle_f32x4(sf505, sf506, 238);
__m512 sf507 = _mm512_loadu_ps(sfPtr7+819264+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf508 = _mm512_loadu_ps(sfPtr7+819392+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1225 = _mm512_shuffle_f32x4(sf507, sf508, 68);
__m512 in1226 = _mm512_shuffle_f32x4(sf507, sf508, 238);
__m512 sf509 = _mm512_loadu_ps(sfPtr7+1228800+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf510 = _mm512_loadu_ps(sfPtr7+1228928+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1219 = _mm512_shuffle_f32x4(sf509, sf510, 68);
__m512 in1220 = _mm512_shuffle_f32x4(sf509, sf510, 238);
__m512 sf511 = _mm512_loadu_ps(sfPtr7+1228864+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf512 = _mm512_loadu_ps(sfPtr7+1228992+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1227 = _mm512_shuffle_f32x4(sf511, sf512, 68);
__m512 in1228 = _mm512_shuffle_f32x4(sf511, sf512, 238);
__m512 tmp8057 = _mm512_add_ps(in1214, in1215);
__m512 tmp8077 = _mm512_add_ps(in1222, in1223);
__m512 tmp8056 = _mm512_add_ps(in1216, in1217);
__m512 tmp8076 = _mm512_add_ps(in1224, in1225);
__m512 tmp8062 = _mm512_sub_ps(in1216, in1217);
__m512 tmp8082 = _mm512_sub_ps(in1224, in1225);
__m512 tmp8061 = _mm512_sub_ps(in1214, in1215);
__m512 tmp8081 = _mm512_sub_ps(in1222, in1223);
__m512 tmp8058 = _mm512_add_ps(in1218, in1219);
__m512 tmp8078 = _mm512_add_ps(in1226, in1227);
__m512 tmp8063 = _mm512_sub_ps(in1218, in1219);
__m512 tmp8083 = _mm512_sub_ps(in1226, in1227);
__m512 tmp8060 = _mm512_fmadd_ps(tmp8062, _mm512_set1_ps(2e+00f), tmp8061);
__m512 tmp8080 = _mm512_fmadd_ps(tmp8082, _mm512_set1_ps(2e+00f), tmp8081);
__m512 tmp8067 = _mm512_fmadd_ps(tmp8062, _mm512_set1_ps(8e+00f), tmp8061);
__m512 tmp8087 = _mm512_fmadd_ps(tmp8082, _mm512_set1_ps(8e+00f), tmp8081);
__m512 tmp8055 = _mm512_add_ps(tmp8056, tmp8057);
__m512 tmp8075 = _mm512_add_ps(tmp8076, tmp8077);
__m512 tmp8059 = _mm512_fmadd_ps(tmp8063, _mm512_set1_ps(1.6e+01f), tmp8060);
__m512 tmp8079 = _mm512_fmadd_ps(tmp8083, _mm512_set1_ps(1.6e+01f), tmp8080);
__m512 tmp8066 = _mm512_fmadd_ps(tmp8063, _mm512_set1_ps(4e+00f), tmp8067);
__m512 tmp8086 = _mm512_fmadd_ps(tmp8083, _mm512_set1_ps(4e+00f), tmp8087);
__m512 tmp8072 = _mm512_add_ps(tmp8063, tmp8061);
__m512 tmp8092 = _mm512_add_ps(tmp8083, tmp8081);
__m512 tmp8065 = _mm512_fmadd_ps(tmp8056, _mm512_set1_ps(4e+00f), tmp8057);
__m512 tmp8085 = _mm512_fmadd_ps(tmp8076, _mm512_set1_ps(4e+00f), tmp8077);
__m512 tmp8069 = _mm512_fmadd_ps(tmp8056, _mm512_set1_ps(1.6e+01f), tmp8057);
__m512 tmp8089 = _mm512_fmadd_ps(tmp8076, _mm512_set1_ps(1.6e+01f), tmp8077);
__m512 tmp8054 = _mm512_add_ps(tmp8055, in1213);
__m512 tmp8074 = _mm512_add_ps(tmp8075, in1221);
__m512 tmp8071 = _mm512_add_ps(tmp8072, in1220);
__m512 tmp8091 = _mm512_add_ps(tmp8092, in1228);
__m512 tmp8053 = _mm512_fmadd_ps(tmp8058, _mm512_set1_ps(3.2e+01f), tmp8054);
__m512 tmp8073 = _mm512_fmadd_ps(tmp8078, _mm512_set1_ps(3.2e+01f), tmp8074);
__m512 tmp8064 = _mm512_fmadd_ps(tmp8058, _mm512_set1_ps(8e+00f), tmp8065);
__m512 tmp8084 = _mm512_fmadd_ps(tmp8078, _mm512_set1_ps(8e+00f), tmp8085);
__m512 tmp8070 = _mm512_fmadd_ps(tmp8062, _mm512_set1_ps(3.2e+01f), tmp8071);
__m512 tmp8090 = _mm512_fmadd_ps(tmp8082, _mm512_set1_ps(3.2e+01f), tmp8091);
__m512 tmp8068 = _mm512_fmadd_ps(tmp8058, _mm512_set1_ps(2e+00f), tmp8069);
__m512 tmp8088 = _mm512_fmadd_ps(tmp8078, _mm512_set1_ps(2e+00f), tmp8089);
__m512 tmp8041 = tmp8053;
__m512 tmp8047 = tmp8073;
__m512 tmp8042 = tmp8059;
__m512 tmp8048 = tmp8079;
__m512 tmp8043 = tmp8064;
__m512 tmp8049 = tmp8084;
__m512 tmp8044 = tmp8066;
__m512 tmp8050 = tmp8086;
__m512 tmp8045 = tmp8068;
__m512 tmp8051 = tmp8088;
__m512 tmp8046 = tmp8070;
__m512 tmp8052 = tmp8090;
__m512 tmp8137 = _mm512_unpacklo_ps(tmp8041, tmp8042);
__m512 tmp8138 = _mm512_unpackhi_ps(tmp8041, tmp8042);
__m512 tmp8139 = _mm512_unpacklo_ps(tmp8043, tmp8044);
__m512 tmp8140 = _mm512_unpackhi_ps(tmp8043, tmp8044);
__m512 tmp8141 = _mm512_unpacklo_ps(tmp8045, tmp8046);
__m512 tmp8142 = _mm512_unpackhi_ps(tmp8045, tmp8046);
__m512 tmp8143 = _mm512_unpacklo_ps(tmp8047, tmp8048);
__m512 tmp8144 = _mm512_unpackhi_ps(tmp8047, tmp8048);
__m512 tmp8145 = _mm512_unpacklo_ps(tmp8049, tmp8050);
__m512 tmp8146 = _mm512_unpackhi_ps(tmp8049, tmp8050);
__m512 tmp8147 = _mm512_unpacklo_ps(tmp8051, tmp8052);
__m512 tmp8148 = _mm512_unpackhi_ps(tmp8051, tmp8052);
__m512 tmp8149 = _mm512_shuffle_ps(tmp8137, tmp8139, 68);
__m512 tmp8150 = _mm512_shuffle_ps(tmp8137, tmp8139, 238);
__m512 tmp8151 = _mm512_shuffle_ps(tmp8138, tmp8140, 68);
__m512 tmp8152 = _mm512_shuffle_ps(tmp8138, tmp8140, 238);
__m512 tmp8153 = _mm512_shuffle_ps(tmp8141, tmp8143, 68);
__m512 tmp8154 = _mm512_shuffle_ps(tmp8141, tmp8143, 238);
__m512 tmp8155 = _mm512_shuffle_ps(tmp8142, tmp8144, 68);
__m512 tmp8156 = _mm512_shuffle_ps(tmp8142, tmp8144, 238);
__m512 tmp8157 = _mm512_shuffle_ps(tmp8145, tmp8147, 68);
__m512 tmp8158 = _mm512_shuffle_ps(tmp8145, tmp8147, 238);
__m512 tmp8159 = _mm512_shuffle_ps(tmp8146, tmp8148, 68);
__m512 tmp8160 = _mm512_shuffle_ps(tmp8146, tmp8148, 238);
__m512 tmp8161 = _mm512_shuffle_f32x4(tmp8149, tmp8153, 136);
__m512 tmp8162 = _mm512_shuffle_f32x4(tmp8149, tmp8153, 221);
__m512 tmp8163 = _mm512_shuffle_f32x4(tmp8150, tmp8154, 136);
__m512 tmp8164 = _mm512_shuffle_f32x4(tmp8150, tmp8154, 221);
__m512 tmp8165 = _mm512_shuffle_f32x4(tmp8151, tmp8155, 136);
__m512 tmp8166 = _mm512_shuffle_f32x4(tmp8151, tmp8155, 221);
__m512 tmp8167 = _mm512_shuffle_f32x4(tmp8152, tmp8156, 136);
__m512 tmp8168 = _mm512_shuffle_f32x4(tmp8152, tmp8156, 221);
__m512 tmp8169 = _mm512_shuffle_f32x4(tmp8157, tmp8157, 136);
__m512 tmp8170 = _mm512_shuffle_f32x4(tmp8157, tmp8157, 221);
__m512 tmp8171 = _mm512_shuffle_f32x4(tmp8158, tmp8158, 136);
__m512 tmp8172 = _mm512_shuffle_f32x4(tmp8158, tmp8158, 221);
__m512 tmp8173 = _mm512_shuffle_f32x4(tmp8159, tmp8159, 136);
__m512 tmp8174 = _mm512_shuffle_f32x4(tmp8159, tmp8159, 221);
__m512 tmp8175 = _mm512_shuffle_f32x4(tmp8160, tmp8160, 136);
__m512 tmp8176 = _mm512_shuffle_f32x4(tmp8160, tmp8160, 221);
tmp8041 = _mm512_shuffle_f32x4(tmp8161, tmp8169, 136);
tmp8049 = _mm512_shuffle_f32x4(tmp8161, tmp8169, 221);
tmp8042 = _mm512_shuffle_f32x4(tmp8163, tmp8171, 136);
tmp8050 = _mm512_shuffle_f32x4(tmp8163, tmp8171, 221);
tmp8043 = _mm512_shuffle_f32x4(tmp8165, tmp8173, 136);
tmp8051 = _mm512_shuffle_f32x4(tmp8165, tmp8173, 221);
tmp8044 = _mm512_shuffle_f32x4(tmp8167, tmp8175, 136);
tmp8052 = _mm512_shuffle_f32x4(tmp8167, tmp8175, 221);
tmp8045 = _mm512_shuffle_f32x4(tmp8162, tmp8170, 136);
__m512 tmp8093 = _mm512_shuffle_f32x4(tmp8162, tmp8170, 221);
tmp8046 = _mm512_shuffle_f32x4(tmp8164, tmp8172, 136);
__m512 tmp8094 = _mm512_shuffle_f32x4(tmp8164, tmp8172, 221);
tmp8047 = _mm512_shuffle_f32x4(tmp8166, tmp8174, 136);
__m512 tmp8095 = _mm512_shuffle_f32x4(tmp8166, tmp8174, 221);
tmp8048 = _mm512_shuffle_f32x4(tmp8168, tmp8176, 136);
__m512 tmp8096 = _mm512_shuffle_f32x4(tmp8168, tmp8176, 221);
__m512 tmp8101 = _mm512_add_ps(tmp8042, tmp8043);
__m512 tmp8121 = _mm512_add_ps(tmp8050, tmp8051);
__m512 tmp8100 = _mm512_add_ps(tmp8044, tmp8045);
__m512 tmp8120 = _mm512_add_ps(tmp8052, tmp8093);
__m512 tmp8106 = _mm512_sub_ps(tmp8044, tmp8045);
__m512 tmp8126 = _mm512_sub_ps(tmp8052, tmp8093);
__m512 tmp8105 = _mm512_sub_ps(tmp8042, tmp8043);
__m512 tmp8125 = _mm512_sub_ps(tmp8050, tmp8051);
__m512 tmp8102 = _mm512_add_ps(tmp8046, tmp8047);
__m512 tmp8122 = _mm512_add_ps(tmp8094, tmp8095);
__m512 tmp8107 = _mm512_sub_ps(tmp8046, tmp8047);
__m512 tmp8127 = _mm512_sub_ps(tmp8094, tmp8095);
__m512 tmp8104 = _mm512_fmadd_ps(tmp8106, _mm512_set1_ps(2e+00f), tmp8105);
__m512 tmp8124 = _mm512_fmadd_ps(tmp8126, _mm512_set1_ps(2e+00f), tmp8125);
__m512 tmp8111 = _mm512_fmadd_ps(tmp8106, _mm512_set1_ps(8e+00f), tmp8105);
__m512 tmp8131 = _mm512_fmadd_ps(tmp8126, _mm512_set1_ps(8e+00f), tmp8125);
__m512 tmp8099 = _mm512_add_ps(tmp8100, tmp8101);
__m512 tmp8119 = _mm512_add_ps(tmp8120, tmp8121);
__m512 tmp8103 = _mm512_fmadd_ps(tmp8107, _mm512_set1_ps(1.6e+01f), tmp8104);
__m512 tmp8123 = _mm512_fmadd_ps(tmp8127, _mm512_set1_ps(1.6e+01f), tmp8124);
__m512 tmp8110 = _mm512_fmadd_ps(tmp8107, _mm512_set1_ps(4e+00f), tmp8111);
__m512 tmp8130 = _mm512_fmadd_ps(tmp8127, _mm512_set1_ps(4e+00f), tmp8131);
__m512 tmp8116 = _mm512_add_ps(tmp8107, tmp8105);
__m512 tmp8136 = _mm512_add_ps(tmp8127, tmp8125);
__m512 tmp8109 = _mm512_fmadd_ps(tmp8100, _mm512_set1_ps(4e+00f), tmp8101);
__m512 tmp8129 = _mm512_fmadd_ps(tmp8120, _mm512_set1_ps(4e+00f), tmp8121);
__m512 tmp8113 = _mm512_fmadd_ps(tmp8100, _mm512_set1_ps(1.6e+01f), tmp8101);
__m512 tmp8133 = _mm512_fmadd_ps(tmp8120, _mm512_set1_ps(1.6e+01f), tmp8121);
__m512 tmp8098 = _mm512_add_ps(tmp8099, tmp8041);
__m512 tmp8118 = _mm512_add_ps(tmp8119, tmp8049);
__m512 tmp8115 = _mm512_add_ps(tmp8116, tmp8048);
__m512 tmp8135 = _mm512_add_ps(tmp8136, tmp8096);
__m512 tmp8097 = _mm512_fmadd_ps(tmp8102, _mm512_set1_ps(3.2e+01f), tmp8098);
__m512 tmp8117 = _mm512_fmadd_ps(tmp8122, _mm512_set1_ps(3.2e+01f), tmp8118);
__m512 tmp8108 = _mm512_fmadd_ps(tmp8102, _mm512_set1_ps(8e+00f), tmp8109);
__m512 tmp8128 = _mm512_fmadd_ps(tmp8122, _mm512_set1_ps(8e+00f), tmp8129);
__m512 tmp8114 = _mm512_fmadd_ps(tmp8106, _mm512_set1_ps(3.2e+01f), tmp8115);
__m512 tmp8134 = _mm512_fmadd_ps(tmp8126, _mm512_set1_ps(3.2e+01f), tmp8135);
__m512 tmp8112 = _mm512_fmadd_ps(tmp8102, _mm512_set1_ps(2e+00f), tmp8113);
__m512 tmp8132 = _mm512_fmadd_ps(tmp8122, _mm512_set1_ps(2e+00f), tmp8133);
__m512 out1179 = tmp8097;
__m512 out1185 = tmp8117;
__m512 out1180 = tmp8103;
__m512 out1186 = tmp8123;
__m512 out1181 = tmp8108;
__m512 out1187 = tmp8128;
__m512 out1182 = tmp8110;
__m512 out1188 = tmp8130;
__m512 out1183 = tmp8112;
__m512 out1189 = tmp8132;
__m512 out1184 = tmp8114;
__m512 out1190 = tmp8134;
out1179 = _mm512_max_ps(_mm512_setzero_ps(), out1179);
out1185 = _mm512_max_ps(_mm512_setzero_ps(), out1185);
out1180 = _mm512_max_ps(_mm512_setzero_ps(), out1180);
out1186 = _mm512_max_ps(_mm512_setzero_ps(), out1186);
out1181 = _mm512_max_ps(_mm512_setzero_ps(), out1181);
out1187 = _mm512_max_ps(_mm512_setzero_ps(), out1187);
out1182 = _mm512_max_ps(_mm512_setzero_ps(), out1182);
out1188 = _mm512_max_ps(_mm512_setzero_ps(), out1188);
out1183 = _mm512_max_ps(_mm512_setzero_ps(), out1183);
out1189 = _mm512_max_ps(_mm512_setzero_ps(), out1189);
out1184 = _mm512_max_ps(_mm512_setzero_ps(), out1184);
out1190 = _mm512_max_ps(_mm512_setzero_ps(), out1190);
_mm512_mask_storeu_ps(datPtr13+0+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1179);
_mm512_mask_storeu_ps(datPtr13+48+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1185);
_mm512_mask_storeu_ps(datPtr13+224+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1180);
_mm512_mask_storeu_ps(datPtr13+272+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1186);
_mm512_mask_storeu_ps(datPtr13+448+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1181);
_mm512_mask_storeu_ps(datPtr13+496+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1187);
_mm512_mask_storeu_ps(datPtr13+672+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1182);
_mm512_mask_storeu_ps(datPtr13+720+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1188);
_mm512_mask_storeu_ps(datPtr13+896+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1183);
_mm512_mask_storeu_ps(datPtr13+944+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1189);
_mm512_mask_storeu_ps(datPtr13+1120+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1184);
_mm512_mask_storeu_ps(datPtr13+1168+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1190);
__m512 sf513 = _mm512_loadu_ps(sfPtr7+256+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf514 = _mm512_loadu_ps(sfPtr7+384+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1229 = _mm512_shuffle_f32x4(sf513, sf514, 68);
__m512 in1230 = _mm512_shuffle_f32x4(sf513, sf514, 238);
__m512 sf515 = _mm512_loadu_ps(sfPtr7+320+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf516 = _mm512_loadu_ps(sfPtr7+448+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1237 = _mm512_shuffle_f32x4(sf515, sf516, 68);
__m512 in1238 = _mm512_shuffle_f32x4(sf515, sf516, 238);
__m512 sf517 = _mm512_loadu_ps(sfPtr7+409856+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf518 = _mm512_loadu_ps(sfPtr7+409984+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1231 = _mm512_shuffle_f32x4(sf517, sf518, 68);
__m512 in1232 = _mm512_shuffle_f32x4(sf517, sf518, 238);
__m512 sf519 = _mm512_loadu_ps(sfPtr7+409920+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf520 = _mm512_loadu_ps(sfPtr7+410048+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1239 = _mm512_shuffle_f32x4(sf519, sf520, 68);
__m512 in1240 = _mm512_shuffle_f32x4(sf519, sf520, 238);
__m512 sf521 = _mm512_loadu_ps(sfPtr7+819456+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf522 = _mm512_loadu_ps(sfPtr7+819584+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1233 = _mm512_shuffle_f32x4(sf521, sf522, 68);
__m512 in1234 = _mm512_shuffle_f32x4(sf521, sf522, 238);
__m512 sf523 = _mm512_loadu_ps(sfPtr7+819520+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf524 = _mm512_loadu_ps(sfPtr7+819648+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1241 = _mm512_shuffle_f32x4(sf523, sf524, 68);
__m512 in1242 = _mm512_shuffle_f32x4(sf523, sf524, 238);
__m512 sf525 = _mm512_loadu_ps(sfPtr7+1229056+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf526 = _mm512_loadu_ps(sfPtr7+1229184+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1235 = _mm512_shuffle_f32x4(sf525, sf526, 68);
__m512 in1236 = _mm512_shuffle_f32x4(sf525, sf526, 238);
__m512 sf527 = _mm512_loadu_ps(sfPtr7+1229120+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf528 = _mm512_loadu_ps(sfPtr7+1229248+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1243 = _mm512_shuffle_f32x4(sf527, sf528, 68);
__m512 in1244 = _mm512_shuffle_f32x4(sf527, sf528, 238);
__m512 tmp8193 = _mm512_add_ps(in1230, in1231);
__m512 tmp8213 = _mm512_add_ps(in1238, in1239);
__m512 tmp8192 = _mm512_add_ps(in1232, in1233);
__m512 tmp8212 = _mm512_add_ps(in1240, in1241);
__m512 tmp8198 = _mm512_sub_ps(in1232, in1233);
__m512 tmp8218 = _mm512_sub_ps(in1240, in1241);
__m512 tmp8197 = _mm512_sub_ps(in1230, in1231);
__m512 tmp8217 = _mm512_sub_ps(in1238, in1239);
__m512 tmp8194 = _mm512_add_ps(in1234, in1235);
__m512 tmp8214 = _mm512_add_ps(in1242, in1243);
__m512 tmp8199 = _mm512_sub_ps(in1234, in1235);
__m512 tmp8219 = _mm512_sub_ps(in1242, in1243);
__m512 tmp8196 = _mm512_fmadd_ps(tmp8198, _mm512_set1_ps(2e+00f), tmp8197);
__m512 tmp8216 = _mm512_fmadd_ps(tmp8218, _mm512_set1_ps(2e+00f), tmp8217);
__m512 tmp8203 = _mm512_fmadd_ps(tmp8198, _mm512_set1_ps(8e+00f), tmp8197);
__m512 tmp8223 = _mm512_fmadd_ps(tmp8218, _mm512_set1_ps(8e+00f), tmp8217);
__m512 tmp8191 = _mm512_add_ps(tmp8192, tmp8193);
__m512 tmp8211 = _mm512_add_ps(tmp8212, tmp8213);
__m512 tmp8195 = _mm512_fmadd_ps(tmp8199, _mm512_set1_ps(1.6e+01f), tmp8196);
__m512 tmp8215 = _mm512_fmadd_ps(tmp8219, _mm512_set1_ps(1.6e+01f), tmp8216);
__m512 tmp8202 = _mm512_fmadd_ps(tmp8199, _mm512_set1_ps(4e+00f), tmp8203);
__m512 tmp8222 = _mm512_fmadd_ps(tmp8219, _mm512_set1_ps(4e+00f), tmp8223);
__m512 tmp8208 = _mm512_add_ps(tmp8199, tmp8197);
__m512 tmp8228 = _mm512_add_ps(tmp8219, tmp8217);
__m512 tmp8201 = _mm512_fmadd_ps(tmp8192, _mm512_set1_ps(4e+00f), tmp8193);
__m512 tmp8221 = _mm512_fmadd_ps(tmp8212, _mm512_set1_ps(4e+00f), tmp8213);
__m512 tmp8205 = _mm512_fmadd_ps(tmp8192, _mm512_set1_ps(1.6e+01f), tmp8193);
__m512 tmp8225 = _mm512_fmadd_ps(tmp8212, _mm512_set1_ps(1.6e+01f), tmp8213);
__m512 tmp8190 = _mm512_add_ps(tmp8191, in1229);
__m512 tmp8210 = _mm512_add_ps(tmp8211, in1237);
__m512 tmp8207 = _mm512_add_ps(tmp8208, in1236);
__m512 tmp8227 = _mm512_add_ps(tmp8228, in1244);
__m512 tmp8189 = _mm512_fmadd_ps(tmp8194, _mm512_set1_ps(3.2e+01f), tmp8190);
__m512 tmp8209 = _mm512_fmadd_ps(tmp8214, _mm512_set1_ps(3.2e+01f), tmp8210);
__m512 tmp8200 = _mm512_fmadd_ps(tmp8194, _mm512_set1_ps(8e+00f), tmp8201);
__m512 tmp8220 = _mm512_fmadd_ps(tmp8214, _mm512_set1_ps(8e+00f), tmp8221);
__m512 tmp8206 = _mm512_fmadd_ps(tmp8198, _mm512_set1_ps(3.2e+01f), tmp8207);
__m512 tmp8226 = _mm512_fmadd_ps(tmp8218, _mm512_set1_ps(3.2e+01f), tmp8227);
__m512 tmp8204 = _mm512_fmadd_ps(tmp8194, _mm512_set1_ps(2e+00f), tmp8205);
__m512 tmp8224 = _mm512_fmadd_ps(tmp8214, _mm512_set1_ps(2e+00f), tmp8225);
__m512 tmp8177 = tmp8189;
__m512 tmp8183 = tmp8209;
__m512 tmp8178 = tmp8195;
__m512 tmp8184 = tmp8215;
__m512 tmp8179 = tmp8200;
__m512 tmp8185 = tmp8220;
__m512 tmp8180 = tmp8202;
__m512 tmp8186 = tmp8222;
__m512 tmp8181 = tmp8204;
__m512 tmp8187 = tmp8224;
__m512 tmp8182 = tmp8206;
__m512 tmp8188 = tmp8226;
__m512 tmp8273 = _mm512_unpacklo_ps(tmp8177, tmp8178);
__m512 tmp8274 = _mm512_unpackhi_ps(tmp8177, tmp8178);
__m512 tmp8275 = _mm512_unpacklo_ps(tmp8179, tmp8180);
__m512 tmp8276 = _mm512_unpackhi_ps(tmp8179, tmp8180);
__m512 tmp8277 = _mm512_unpacklo_ps(tmp8181, tmp8182);
__m512 tmp8278 = _mm512_unpackhi_ps(tmp8181, tmp8182);
__m512 tmp8279 = _mm512_unpacklo_ps(tmp8183, tmp8184);
__m512 tmp8280 = _mm512_unpackhi_ps(tmp8183, tmp8184);
__m512 tmp8281 = _mm512_unpacklo_ps(tmp8185, tmp8186);
__m512 tmp8282 = _mm512_unpackhi_ps(tmp8185, tmp8186);
__m512 tmp8283 = _mm512_unpacklo_ps(tmp8187, tmp8188);
__m512 tmp8284 = _mm512_unpackhi_ps(tmp8187, tmp8188);
__m512 tmp8285 = _mm512_shuffle_ps(tmp8273, tmp8275, 68);
__m512 tmp8286 = _mm512_shuffle_ps(tmp8273, tmp8275, 238);
__m512 tmp8287 = _mm512_shuffle_ps(tmp8274, tmp8276, 68);
__m512 tmp8288 = _mm512_shuffle_ps(tmp8274, tmp8276, 238);
__m512 tmp8289 = _mm512_shuffle_ps(tmp8277, tmp8279, 68);
__m512 tmp8290 = _mm512_shuffle_ps(tmp8277, tmp8279, 238);
__m512 tmp8291 = _mm512_shuffle_ps(tmp8278, tmp8280, 68);
__m512 tmp8292 = _mm512_shuffle_ps(tmp8278, tmp8280, 238);
__m512 tmp8293 = _mm512_shuffle_ps(tmp8281, tmp8283, 68);
__m512 tmp8294 = _mm512_shuffle_ps(tmp8281, tmp8283, 238);
__m512 tmp8295 = _mm512_shuffle_ps(tmp8282, tmp8284, 68);
__m512 tmp8296 = _mm512_shuffle_ps(tmp8282, tmp8284, 238);
__m512 tmp8297 = _mm512_shuffle_f32x4(tmp8285, tmp8289, 136);
__m512 tmp8298 = _mm512_shuffle_f32x4(tmp8285, tmp8289, 221);
__m512 tmp8299 = _mm512_shuffle_f32x4(tmp8286, tmp8290, 136);
__m512 tmp8300 = _mm512_shuffle_f32x4(tmp8286, tmp8290, 221);
__m512 tmp8301 = _mm512_shuffle_f32x4(tmp8287, tmp8291, 136);
__m512 tmp8302 = _mm512_shuffle_f32x4(tmp8287, tmp8291, 221);
__m512 tmp8303 = _mm512_shuffle_f32x4(tmp8288, tmp8292, 136);
__m512 tmp8304 = _mm512_shuffle_f32x4(tmp8288, tmp8292, 221);
__m512 tmp8305 = _mm512_shuffle_f32x4(tmp8293, tmp8293, 136);
__m512 tmp8306 = _mm512_shuffle_f32x4(tmp8293, tmp8293, 221);
__m512 tmp8307 = _mm512_shuffle_f32x4(tmp8294, tmp8294, 136);
__m512 tmp8308 = _mm512_shuffle_f32x4(tmp8294, tmp8294, 221);
__m512 tmp8309 = _mm512_shuffle_f32x4(tmp8295, tmp8295, 136);
__m512 tmp8310 = _mm512_shuffle_f32x4(tmp8295, tmp8295, 221);
__m512 tmp8311 = _mm512_shuffle_f32x4(tmp8296, tmp8296, 136);
__m512 tmp8312 = _mm512_shuffle_f32x4(tmp8296, tmp8296, 221);
tmp8177 = _mm512_shuffle_f32x4(tmp8297, tmp8305, 136);
tmp8185 = _mm512_shuffle_f32x4(tmp8297, tmp8305, 221);
tmp8178 = _mm512_shuffle_f32x4(tmp8299, tmp8307, 136);
tmp8186 = _mm512_shuffle_f32x4(tmp8299, tmp8307, 221);
tmp8179 = _mm512_shuffle_f32x4(tmp8301, tmp8309, 136);
tmp8187 = _mm512_shuffle_f32x4(tmp8301, tmp8309, 221);
tmp8180 = _mm512_shuffle_f32x4(tmp8303, tmp8311, 136);
tmp8188 = _mm512_shuffle_f32x4(tmp8303, tmp8311, 221);
tmp8181 = _mm512_shuffle_f32x4(tmp8298, tmp8306, 136);
__m512 tmp8229 = _mm512_shuffle_f32x4(tmp8298, tmp8306, 221);
tmp8182 = _mm512_shuffle_f32x4(tmp8300, tmp8308, 136);
__m512 tmp8230 = _mm512_shuffle_f32x4(tmp8300, tmp8308, 221);
tmp8183 = _mm512_shuffle_f32x4(tmp8302, tmp8310, 136);
__m512 tmp8231 = _mm512_shuffle_f32x4(tmp8302, tmp8310, 221);
tmp8184 = _mm512_shuffle_f32x4(tmp8304, tmp8312, 136);
__m512 tmp8232 = _mm512_shuffle_f32x4(tmp8304, tmp8312, 221);
__m512 tmp8237 = _mm512_add_ps(tmp8178, tmp8179);
__m512 tmp8257 = _mm512_add_ps(tmp8186, tmp8187);
__m512 tmp8236 = _mm512_add_ps(tmp8180, tmp8181);
__m512 tmp8256 = _mm512_add_ps(tmp8188, tmp8229);
__m512 tmp8242 = _mm512_sub_ps(tmp8180, tmp8181);
__m512 tmp8262 = _mm512_sub_ps(tmp8188, tmp8229);
__m512 tmp8241 = _mm512_sub_ps(tmp8178, tmp8179);
__m512 tmp8261 = _mm512_sub_ps(tmp8186, tmp8187);
__m512 tmp8238 = _mm512_add_ps(tmp8182, tmp8183);
__m512 tmp8258 = _mm512_add_ps(tmp8230, tmp8231);
__m512 tmp8243 = _mm512_sub_ps(tmp8182, tmp8183);
__m512 tmp8263 = _mm512_sub_ps(tmp8230, tmp8231);
__m512 tmp8240 = _mm512_fmadd_ps(tmp8242, _mm512_set1_ps(2e+00f), tmp8241);
__m512 tmp8260 = _mm512_fmadd_ps(tmp8262, _mm512_set1_ps(2e+00f), tmp8261);
__m512 tmp8247 = _mm512_fmadd_ps(tmp8242, _mm512_set1_ps(8e+00f), tmp8241);
__m512 tmp8267 = _mm512_fmadd_ps(tmp8262, _mm512_set1_ps(8e+00f), tmp8261);
__m512 tmp8235 = _mm512_add_ps(tmp8236, tmp8237);
__m512 tmp8255 = _mm512_add_ps(tmp8256, tmp8257);
__m512 tmp8239 = _mm512_fmadd_ps(tmp8243, _mm512_set1_ps(1.6e+01f), tmp8240);
__m512 tmp8259 = _mm512_fmadd_ps(tmp8263, _mm512_set1_ps(1.6e+01f), tmp8260);
__m512 tmp8246 = _mm512_fmadd_ps(tmp8243, _mm512_set1_ps(4e+00f), tmp8247);
__m512 tmp8266 = _mm512_fmadd_ps(tmp8263, _mm512_set1_ps(4e+00f), tmp8267);
__m512 tmp8252 = _mm512_add_ps(tmp8243, tmp8241);
__m512 tmp8272 = _mm512_add_ps(tmp8263, tmp8261);
__m512 tmp8245 = _mm512_fmadd_ps(tmp8236, _mm512_set1_ps(4e+00f), tmp8237);
__m512 tmp8265 = _mm512_fmadd_ps(tmp8256, _mm512_set1_ps(4e+00f), tmp8257);
__m512 tmp8249 = _mm512_fmadd_ps(tmp8236, _mm512_set1_ps(1.6e+01f), tmp8237);
__m512 tmp8269 = _mm512_fmadd_ps(tmp8256, _mm512_set1_ps(1.6e+01f), tmp8257);
__m512 tmp8234 = _mm512_add_ps(tmp8235, tmp8177);
__m512 tmp8254 = _mm512_add_ps(tmp8255, tmp8185);
__m512 tmp8251 = _mm512_add_ps(tmp8252, tmp8184);
__m512 tmp8271 = _mm512_add_ps(tmp8272, tmp8232);
__m512 tmp8233 = _mm512_fmadd_ps(tmp8238, _mm512_set1_ps(3.2e+01f), tmp8234);
__m512 tmp8253 = _mm512_fmadd_ps(tmp8258, _mm512_set1_ps(3.2e+01f), tmp8254);
__m512 tmp8244 = _mm512_fmadd_ps(tmp8238, _mm512_set1_ps(8e+00f), tmp8245);
__m512 tmp8264 = _mm512_fmadd_ps(tmp8258, _mm512_set1_ps(8e+00f), tmp8265);
__m512 tmp8250 = _mm512_fmadd_ps(tmp8242, _mm512_set1_ps(3.2e+01f), tmp8251);
__m512 tmp8270 = _mm512_fmadd_ps(tmp8262, _mm512_set1_ps(3.2e+01f), tmp8271);
__m512 tmp8248 = _mm512_fmadd_ps(tmp8238, _mm512_set1_ps(2e+00f), tmp8249);
__m512 tmp8268 = _mm512_fmadd_ps(tmp8258, _mm512_set1_ps(2e+00f), tmp8269);
__m512 out1191 = tmp8233;
__m512 out1197 = tmp8253;
__m512 out1192 = tmp8239;
__m512 out1198 = tmp8259;
__m512 out1193 = tmp8244;
__m512 out1199 = tmp8264;
__m512 out1194 = tmp8246;
__m512 out1200 = tmp8266;
__m512 out1195 = tmp8248;
__m512 out1201 = tmp8268;
__m512 out1196 = tmp8250;
__m512 out1202 = tmp8270;
out1191 = _mm512_max_ps(_mm512_setzero_ps(), out1191);
out1197 = _mm512_max_ps(_mm512_setzero_ps(), out1197);
out1192 = _mm512_max_ps(_mm512_setzero_ps(), out1192);
out1198 = _mm512_max_ps(_mm512_setzero_ps(), out1198);
out1193 = _mm512_max_ps(_mm512_setzero_ps(), out1193);
out1199 = _mm512_max_ps(_mm512_setzero_ps(), out1199);
out1194 = _mm512_max_ps(_mm512_setzero_ps(), out1194);
out1200 = _mm512_max_ps(_mm512_setzero_ps(), out1200);
out1195 = _mm512_max_ps(_mm512_setzero_ps(), out1195);
out1201 = _mm512_max_ps(_mm512_setzero_ps(), out1201);
out1196 = _mm512_max_ps(_mm512_setzero_ps(), out1196);
out1202 = _mm512_max_ps(_mm512_setzero_ps(), out1202);
_mm512_mask_storeu_ps(datPtr13+96+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1191);
_mm512_mask_storeu_ps(datPtr13+12608+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1197);
_mm512_mask_storeu_ps(datPtr13+320+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1192);
_mm512_mask_storeu_ps(datPtr13+12832+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1198);
_mm512_mask_storeu_ps(datPtr13+544+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1193);
_mm512_mask_storeu_ps(datPtr13+13056+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1199);
_mm512_mask_storeu_ps(datPtr13+768+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1194);
_mm512_mask_storeu_ps(datPtr13+13280+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1200);
_mm512_mask_storeu_ps(datPtr13+992+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1195);
_mm512_mask_storeu_ps(datPtr13+13504+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1201);
_mm512_mask_storeu_ps(datPtr13+1216+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1196);
_mm512_mask_storeu_ps(datPtr13+13728+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1202);
// ---- Tile group: load -> two-pass linear transform -> ReLU -> masked store ----
// Reads 16 vectors from sfPtr7, applies the same 8-input/6-output linear
// combination along both axes (with an in-register transpose between the two
// passes), clamps negative lanes to zero, and writes lanes 0..11 of each of
// the 12 results to datPtr13.
// NOTE(review): the coefficient set (1, 2, 4, 8, 16, 32) looks like a
// Winograd-style output transform -- confirm against the generator.
// i28, j23, k92, l31, toH31 and toW31 are defined by enclosing code.
//
// Load stage. Every pair of loads is split with _mm512_shuffle_f32x4:
// imm 68 joins the low 256 bits of both operands, imm 238 joins the high
// 256 bits. in1245..in1252 feed chain A, in1253..in1260 feed chain B.
__m512 sf529 = _mm512_loadu_ps(sfPtr7+512+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf530 = _mm512_loadu_ps(sfPtr7+640+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1245 = _mm512_shuffle_f32x4(sf529, sf530, 68);
__m512 in1246 = _mm512_shuffle_f32x4(sf529, sf530, 238);
__m512 sf531 = _mm512_loadu_ps(sfPtr7+576+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf532 = _mm512_loadu_ps(sfPtr7+704+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1253 = _mm512_shuffle_f32x4(sf531, sf532, 68);
__m512 in1254 = _mm512_shuffle_f32x4(sf531, sf532, 238);
__m512 sf533 = _mm512_loadu_ps(sfPtr7+410112+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf534 = _mm512_loadu_ps(sfPtr7+410240+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1247 = _mm512_shuffle_f32x4(sf533, sf534, 68);
__m512 in1248 = _mm512_shuffle_f32x4(sf533, sf534, 238);
__m512 sf535 = _mm512_loadu_ps(sfPtr7+410176+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf536 = _mm512_loadu_ps(sfPtr7+410304+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1255 = _mm512_shuffle_f32x4(sf535, sf536, 68);
__m512 in1256 = _mm512_shuffle_f32x4(sf535, sf536, 238);
__m512 sf537 = _mm512_loadu_ps(sfPtr7+819712+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf538 = _mm512_loadu_ps(sfPtr7+819840+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1249 = _mm512_shuffle_f32x4(sf537, sf538, 68);
__m512 in1250 = _mm512_shuffle_f32x4(sf537, sf538, 238);
__m512 sf539 = _mm512_loadu_ps(sfPtr7+819776+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf540 = _mm512_loadu_ps(sfPtr7+819904+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1257 = _mm512_shuffle_f32x4(sf539, sf540, 68);
__m512 in1258 = _mm512_shuffle_f32x4(sf539, sf540, 238);
__m512 sf541 = _mm512_loadu_ps(sfPtr7+1229312+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf542 = _mm512_loadu_ps(sfPtr7+1229440+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1251 = _mm512_shuffle_f32x4(sf541, sf542, 68);
__m512 in1252 = _mm512_shuffle_f32x4(sf541, sf542, 238);
__m512 sf543 = _mm512_loadu_ps(sfPtr7+1229376+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 sf544 = _mm512_loadu_ps(sfPtr7+1229504+1638400*i28+24576*j23+1536*k92+768*l31);
__m512 in1259 = _mm512_shuffle_f32x4(sf543, sf544, 68);
__m512 in1260 = _mm512_shuffle_f32x4(sf543, sf544, 238);
// First pass. Two independent chains are interleaved statement by statement
// (A first, then B). Writing x0..x7 for a chain's eight inputs, with
// s12 = x1+x2, d12 = x1-x2, s34 = x3+x4, d34 = x3-x4, s56 = x5+x6, d56 = x5-x6:
//   y0 = x0 + s12 +    s34 + 32*s56      (tmp8325 / tmp8345)
//   y1 =      d12 +  2*d34 + 16*d56      (tmp8331 / tmp8351)
//   y2 =      s12 +  4*s34 +  8*s56      (tmp8336 / tmp8356)
//   y3 =      d12 +  8*d34 +  4*d56      (tmp8338 / tmp8358)
//   y4 =      s12 + 16*s34 +  2*s56      (tmp8340 / tmp8360)
//   y5 =      d12 + 32*d34 +    d56 + x7 (tmp8342 / tmp8362)
// The scaled terms are accumulated with fused multiply-adds.
__m512 tmp8329 = _mm512_add_ps(in1246, in1247);
__m512 tmp8349 = _mm512_add_ps(in1254, in1255);
__m512 tmp8328 = _mm512_add_ps(in1248, in1249);
__m512 tmp8348 = _mm512_add_ps(in1256, in1257);
__m512 tmp8334 = _mm512_sub_ps(in1248, in1249);
__m512 tmp8354 = _mm512_sub_ps(in1256, in1257);
__m512 tmp8333 = _mm512_sub_ps(in1246, in1247);
__m512 tmp8353 = _mm512_sub_ps(in1254, in1255);
__m512 tmp8330 = _mm512_add_ps(in1250, in1251);
__m512 tmp8350 = _mm512_add_ps(in1258, in1259);
__m512 tmp8335 = _mm512_sub_ps(in1250, in1251);
__m512 tmp8355 = _mm512_sub_ps(in1258, in1259);
__m512 tmp8332 = _mm512_fmadd_ps(tmp8334, _mm512_set1_ps(2e+00f), tmp8333);
__m512 tmp8352 = _mm512_fmadd_ps(tmp8354, _mm512_set1_ps(2e+00f), tmp8353);
__m512 tmp8339 = _mm512_fmadd_ps(tmp8334, _mm512_set1_ps(8e+00f), tmp8333);
__m512 tmp8359 = _mm512_fmadd_ps(tmp8354, _mm512_set1_ps(8e+00f), tmp8353);
__m512 tmp8327 = _mm512_add_ps(tmp8328, tmp8329);
__m512 tmp8347 = _mm512_add_ps(tmp8348, tmp8349);
__m512 tmp8331 = _mm512_fmadd_ps(tmp8335, _mm512_set1_ps(1.6e+01f), tmp8332);
__m512 tmp8351 = _mm512_fmadd_ps(tmp8355, _mm512_set1_ps(1.6e+01f), tmp8352);
__m512 tmp8338 = _mm512_fmadd_ps(tmp8335, _mm512_set1_ps(4e+00f), tmp8339);
__m512 tmp8358 = _mm512_fmadd_ps(tmp8355, _mm512_set1_ps(4e+00f), tmp8359);
__m512 tmp8344 = _mm512_add_ps(tmp8335, tmp8333);
__m512 tmp8364 = _mm512_add_ps(tmp8355, tmp8353);
__m512 tmp8337 = _mm512_fmadd_ps(tmp8328, _mm512_set1_ps(4e+00f), tmp8329);
__m512 tmp8357 = _mm512_fmadd_ps(tmp8348, _mm512_set1_ps(4e+00f), tmp8349);
__m512 tmp8341 = _mm512_fmadd_ps(tmp8328, _mm512_set1_ps(1.6e+01f), tmp8329);
__m512 tmp8361 = _mm512_fmadd_ps(tmp8348, _mm512_set1_ps(1.6e+01f), tmp8349);
__m512 tmp8326 = _mm512_add_ps(tmp8327, in1245);
__m512 tmp8346 = _mm512_add_ps(tmp8347, in1253);
__m512 tmp8343 = _mm512_add_ps(tmp8344, in1252);
__m512 tmp8363 = _mm512_add_ps(tmp8364, in1260);
__m512 tmp8325 = _mm512_fmadd_ps(tmp8330, _mm512_set1_ps(3.2e+01f), tmp8326);
__m512 tmp8345 = _mm512_fmadd_ps(tmp8350, _mm512_set1_ps(3.2e+01f), tmp8346);
__m512 tmp8336 = _mm512_fmadd_ps(tmp8330, _mm512_set1_ps(8e+00f), tmp8337);
__m512 tmp8356 = _mm512_fmadd_ps(tmp8350, _mm512_set1_ps(8e+00f), tmp8357);
__m512 tmp8342 = _mm512_fmadd_ps(tmp8334, _mm512_set1_ps(3.2e+01f), tmp8343);
__m512 tmp8362 = _mm512_fmadd_ps(tmp8354, _mm512_set1_ps(3.2e+01f), tmp8363);
__m512 tmp8340 = _mm512_fmadd_ps(tmp8330, _mm512_set1_ps(2e+00f), tmp8341);
__m512 tmp8360 = _mm512_fmadd_ps(tmp8350, _mm512_set1_ps(2e+00f), tmp8361);
// Gather the 12 first-pass results as matrix rows:
// tmp8313..tmp8318 = chain A y0..y5, tmp8319..tmp8324 = chain B y0..y5.
__m512 tmp8313 = tmp8325;
__m512 tmp8319 = tmp8345;
__m512 tmp8314 = tmp8331;
__m512 tmp8320 = tmp8351;
__m512 tmp8315 = tmp8336;
__m512 tmp8321 = tmp8356;
__m512 tmp8316 = tmp8338;
__m512 tmp8322 = tmp8358;
__m512 tmp8317 = tmp8340;
__m512 tmp8323 = tmp8360;
__m512 tmp8318 = tmp8342;
__m512 tmp8324 = tmp8362;
// In-register transpose of the 12-row x 16-lane matrix.
// Step 1: interleave 32-bit elements of adjacent row pairs within each
// 128-bit lane.
__m512 tmp8409 = _mm512_unpacklo_ps(tmp8313, tmp8314);
__m512 tmp8410 = _mm512_unpackhi_ps(tmp8313, tmp8314);
__m512 tmp8411 = _mm512_unpacklo_ps(tmp8315, tmp8316);
__m512 tmp8412 = _mm512_unpackhi_ps(tmp8315, tmp8316);
__m512 tmp8413 = _mm512_unpacklo_ps(tmp8317, tmp8318);
__m512 tmp8414 = _mm512_unpackhi_ps(tmp8317, tmp8318);
__m512 tmp8415 = _mm512_unpacklo_ps(tmp8319, tmp8320);
__m512 tmp8416 = _mm512_unpackhi_ps(tmp8319, tmp8320);
__m512 tmp8417 = _mm512_unpacklo_ps(tmp8321, tmp8322);
__m512 tmp8418 = _mm512_unpackhi_ps(tmp8321, tmp8322);
__m512 tmp8419 = _mm512_unpacklo_ps(tmp8323, tmp8324);
__m512 tmp8420 = _mm512_unpackhi_ps(tmp8323, tmp8324);
// Step 2: combine 64-bit pairs (imm 68 = low pairs, imm 238 = high pairs) so
// that each 128-bit lane holds one column of four consecutive rows.
__m512 tmp8421 = _mm512_shuffle_ps(tmp8409, tmp8411, 68);
__m512 tmp8422 = _mm512_shuffle_ps(tmp8409, tmp8411, 238);
__m512 tmp8423 = _mm512_shuffle_ps(tmp8410, tmp8412, 68);
__m512 tmp8424 = _mm512_shuffle_ps(tmp8410, tmp8412, 238);
__m512 tmp8425 = _mm512_shuffle_ps(tmp8413, tmp8415, 68);
__m512 tmp8426 = _mm512_shuffle_ps(tmp8413, tmp8415, 238);
__m512 tmp8427 = _mm512_shuffle_ps(tmp8414, tmp8416, 68);
__m512 tmp8428 = _mm512_shuffle_ps(tmp8414, tmp8416, 238);
__m512 tmp8429 = _mm512_shuffle_ps(tmp8417, tmp8419, 68);
__m512 tmp8430 = _mm512_shuffle_ps(tmp8417, tmp8419, 238);
__m512 tmp8431 = _mm512_shuffle_ps(tmp8418, tmp8420, 68);
__m512 tmp8432 = _mm512_shuffle_ps(tmp8418, tmp8420, 238);
// Step 3: select 128-bit lanes (imm 136 = even lanes, imm 221 = odd lanes of
// both operands). Rows 0..3 are paired with rows 4..7; the rows 8..11 group
// has no partner, so it is shuffled with itself.
__m512 tmp8433 = _mm512_shuffle_f32x4(tmp8421, tmp8425, 136);
__m512 tmp8434 = _mm512_shuffle_f32x4(tmp8421, tmp8425, 221);
__m512 tmp8435 = _mm512_shuffle_f32x4(tmp8422, tmp8426, 136);
__m512 tmp8436 = _mm512_shuffle_f32x4(tmp8422, tmp8426, 221);
__m512 tmp8437 = _mm512_shuffle_f32x4(tmp8423, tmp8427, 136);
__m512 tmp8438 = _mm512_shuffle_f32x4(tmp8423, tmp8427, 221);
__m512 tmp8439 = _mm512_shuffle_f32x4(tmp8424, tmp8428, 136);
__m512 tmp8440 = _mm512_shuffle_f32x4(tmp8424, tmp8428, 221);
__m512 tmp8441 = _mm512_shuffle_f32x4(tmp8429, tmp8429, 136);
__m512 tmp8442 = _mm512_shuffle_f32x4(tmp8429, tmp8429, 221);
__m512 tmp8443 = _mm512_shuffle_f32x4(tmp8430, tmp8430, 136);
__m512 tmp8444 = _mm512_shuffle_f32x4(tmp8430, tmp8430, 221);
__m512 tmp8445 = _mm512_shuffle_f32x4(tmp8431, tmp8431, 136);
__m512 tmp8446 = _mm512_shuffle_f32x4(tmp8431, tmp8431, 221);
__m512 tmp8447 = _mm512_shuffle_f32x4(tmp8432, tmp8432, 136);
__m512 tmp8448 = _mm512_shuffle_f32x4(tmp8432, tmp8432, 221);
// Step 4: final 128-bit lane select. tmp8313..tmp8320 receive columns 0..7;
// tmp8321..tmp8324 and tmp8365..tmp8368 receive columns 8..15. Lanes 0..11
// of each result hold rows 0..11; lanes 12..15 repeat lanes 8..11 and are
// masked off by the stores at the end of this group.
tmp8313 = _mm512_shuffle_f32x4(tmp8433, tmp8441, 136);
tmp8321 = _mm512_shuffle_f32x4(tmp8433, tmp8441, 221);
tmp8314 = _mm512_shuffle_f32x4(tmp8435, tmp8443, 136);
tmp8322 = _mm512_shuffle_f32x4(tmp8435, tmp8443, 221);
tmp8315 = _mm512_shuffle_f32x4(tmp8437, tmp8445, 136);
tmp8323 = _mm512_shuffle_f32x4(tmp8437, tmp8445, 221);
tmp8316 = _mm512_shuffle_f32x4(tmp8439, tmp8447, 136);
tmp8324 = _mm512_shuffle_f32x4(tmp8439, tmp8447, 221);
tmp8317 = _mm512_shuffle_f32x4(tmp8434, tmp8442, 136);
__m512 tmp8365 = _mm512_shuffle_f32x4(tmp8434, tmp8442, 221);
tmp8318 = _mm512_shuffle_f32x4(tmp8436, tmp8444, 136);
__m512 tmp8366 = _mm512_shuffle_f32x4(tmp8436, tmp8444, 221);
tmp8319 = _mm512_shuffle_f32x4(tmp8438, tmp8446, 136);
__m512 tmp8367 = _mm512_shuffle_f32x4(tmp8438, tmp8446, 221);
tmp8320 = _mm512_shuffle_f32x4(tmp8440, tmp8448, 136);
__m512 tmp8368 = _mm512_shuffle_f32x4(tmp8440, tmp8448, 221);
// Second pass: the same y0..y5 combination as the first pass, applied to the
// transposed data. The first chain uses columns 0..7 (tmp8313..tmp8320), the
// second uses columns 8..15 (tmp8321..tmp8324, tmp8365..tmp8368).
__m512 tmp8373 = _mm512_add_ps(tmp8314, tmp8315);
__m512 tmp8393 = _mm512_add_ps(tmp8322, tmp8323);
__m512 tmp8372 = _mm512_add_ps(tmp8316, tmp8317);
__m512 tmp8392 = _mm512_add_ps(tmp8324, tmp8365);
__m512 tmp8378 = _mm512_sub_ps(tmp8316, tmp8317);
__m512 tmp8398 = _mm512_sub_ps(tmp8324, tmp8365);
__m512 tmp8377 = _mm512_sub_ps(tmp8314, tmp8315);
__m512 tmp8397 = _mm512_sub_ps(tmp8322, tmp8323);
__m512 tmp8374 = _mm512_add_ps(tmp8318, tmp8319);
__m512 tmp8394 = _mm512_add_ps(tmp8366, tmp8367);
__m512 tmp8379 = _mm512_sub_ps(tmp8318, tmp8319);
__m512 tmp8399 = _mm512_sub_ps(tmp8366, tmp8367);
__m512 tmp8376 = _mm512_fmadd_ps(tmp8378, _mm512_set1_ps(2e+00f), tmp8377);
__m512 tmp8396 = _mm512_fmadd_ps(tmp8398, _mm512_set1_ps(2e+00f), tmp8397);
__m512 tmp8383 = _mm512_fmadd_ps(tmp8378, _mm512_set1_ps(8e+00f), tmp8377);
__m512 tmp8403 = _mm512_fmadd_ps(tmp8398, _mm512_set1_ps(8e+00f), tmp8397);
__m512 tmp8371 = _mm512_add_ps(tmp8372, tmp8373);
__m512 tmp8391 = _mm512_add_ps(tmp8392, tmp8393);
__m512 tmp8375 = _mm512_fmadd_ps(tmp8379, _mm512_set1_ps(1.6e+01f), tmp8376);
__m512 tmp8395 = _mm512_fmadd_ps(tmp8399, _mm512_set1_ps(1.6e+01f), tmp8396);
__m512 tmp8382 = _mm512_fmadd_ps(tmp8379, _mm512_set1_ps(4e+00f), tmp8383);
__m512 tmp8402 = _mm512_fmadd_ps(tmp8399, _mm512_set1_ps(4e+00f), tmp8403);
__m512 tmp8388 = _mm512_add_ps(tmp8379, tmp8377);
__m512 tmp8408 = _mm512_add_ps(tmp8399, tmp8397);
__m512 tmp8381 = _mm512_fmadd_ps(tmp8372, _mm512_set1_ps(4e+00f), tmp8373);
__m512 tmp8401 = _mm512_fmadd_ps(tmp8392, _mm512_set1_ps(4e+00f), tmp8393);
__m512 tmp8385 = _mm512_fmadd_ps(tmp8372, _mm512_set1_ps(1.6e+01f), tmp8373);
__m512 tmp8405 = _mm512_fmadd_ps(tmp8392, _mm512_set1_ps(1.6e+01f), tmp8393);
__m512 tmp8370 = _mm512_add_ps(tmp8371, tmp8313);
__m512 tmp8390 = _mm512_add_ps(tmp8391, tmp8321);
__m512 tmp8387 = _mm512_add_ps(tmp8388, tmp8320);
__m512 tmp8407 = _mm512_add_ps(tmp8408, tmp8368);
__m512 tmp8369 = _mm512_fmadd_ps(tmp8374, _mm512_set1_ps(3.2e+01f), tmp8370);
__m512 tmp8389 = _mm512_fmadd_ps(tmp8394, _mm512_set1_ps(3.2e+01f), tmp8390);
__m512 tmp8380 = _mm512_fmadd_ps(tmp8374, _mm512_set1_ps(8e+00f), tmp8381);
__m512 tmp8400 = _mm512_fmadd_ps(tmp8394, _mm512_set1_ps(8e+00f), tmp8401);
__m512 tmp8386 = _mm512_fmadd_ps(tmp8378, _mm512_set1_ps(3.2e+01f), tmp8387);
__m512 tmp8406 = _mm512_fmadd_ps(tmp8398, _mm512_set1_ps(3.2e+01f), tmp8407);
__m512 tmp8384 = _mm512_fmadd_ps(tmp8374, _mm512_set1_ps(2e+00f), tmp8385);
__m512 tmp8404 = _mm512_fmadd_ps(tmp8394, _mm512_set1_ps(2e+00f), tmp8405);
// Name the 12 second-pass results: out1203..out1208 are y0..y5 of the
// columns-0..7 chain, out1209..out1214 are y0..y5 of the columns-8..15 chain.
__m512 out1203 = tmp8369;
__m512 out1209 = tmp8389;
__m512 out1204 = tmp8375;
__m512 out1210 = tmp8395;
__m512 out1205 = tmp8380;
__m512 out1211 = tmp8400;
__m512 out1206 = tmp8382;
__m512 out1212 = tmp8402;
__m512 out1207 = tmp8384;
__m512 out1213 = tmp8404;
__m512 out1208 = tmp8386;
__m512 out1214 = tmp8406;
// ReLU: take the lane-wise maximum with zero so negative lanes become 0.
out1203 = _mm512_max_ps(_mm512_setzero_ps(), out1203);
out1209 = _mm512_max_ps(_mm512_setzero_ps(), out1209);
out1204 = _mm512_max_ps(_mm512_setzero_ps(), out1204);
out1210 = _mm512_max_ps(_mm512_setzero_ps(), out1210);
out1205 = _mm512_max_ps(_mm512_setzero_ps(), out1205);
out1211 = _mm512_max_ps(_mm512_setzero_ps(), out1211);
out1206 = _mm512_max_ps(_mm512_setzero_ps(), out1206);
out1212 = _mm512_max_ps(_mm512_setzero_ps(), out1212);
out1207 = _mm512_max_ps(_mm512_setzero_ps(), out1207);
out1213 = _mm512_max_ps(_mm512_setzero_ps(), out1213);
out1208 = _mm512_max_ps(_mm512_setzero_ps(), out1208);
out1214 = _mm512_max_ps(_mm512_setzero_ps(), out1214);
// Store lanes 0..11 of each result (mask 4095 == 0x0FFF). Within a chain the
// six destinations are 224 offset units apart; the two chains are 48 apart.
// NOTE(review): presumably datPtr13 is a byte pointer, making these 56-float
// rows and a 12-float column step -- confirm against its declaration.
_mm512_mask_storeu_ps(datPtr13+12656+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1203);
_mm512_mask_storeu_ps(datPtr13+12704+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1209);
_mm512_mask_storeu_ps(datPtr13+12880+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1204);
_mm512_mask_storeu_ps(datPtr13+12928+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1210);
_mm512_mask_storeu_ps(datPtr13+13104+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1205);
_mm512_mask_storeu_ps(datPtr13+13152+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1211);
_mm512_mask_storeu_ps(datPtr13+13328+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1206);
_mm512_mask_storeu_ps(datPtr13+13376+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1212);
_mm512_mask_storeu_ps(datPtr13+13552+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1207);
_mm512_mask_storeu_ps(datPtr13+13600+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1213);
_mm512_mask_storeu_ps(datPtr13+13776+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1208);
_mm512_mask_storeu_ps(datPtr13+13824+806912*i28+224*toH31+4*toW31+50432*k92+25216*l31, 4095, out1214);
}
}
if (j23 >= last6) return;
++j23;
rel17 = 1;
}
ptrdiff_t toH32 = base17+0;
ptrdiff_t toW32 = 48;
ptrdiff_t k93 = 16*w46;
for (; k93 != 16; ++k93) {
ptrdiff_t l32 = 0;
for (; l32 != 2; ++l32) {
__m512 sf545 = _mm512_loadu_ps(sfPtr7+0+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf546 = _mm512_loadu_ps(sfPtr7+128+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1261 = _mm512_shuffle_f32x4(sf545, sf546, 68);
__m512 in1262 = _mm512_shuffle_f32x4(sf545, sf546, 238);
__m512 sf547 = _mm512_loadu_ps(sfPtr7+64+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf548 = _mm512_loadu_ps(sfPtr7+192+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1269 = _mm512_shuffle_f32x4(sf547, sf548, 68);
__m512 in1270 = _mm512_shuffle_f32x4(sf547, sf548, 238);
__m512 sf549 = _mm512_loadu_ps(sfPtr7+409600+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf550 = _mm512_loadu_ps(sfPtr7+409728+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1263 = _mm512_shuffle_f32x4(sf549, sf550, 68);
__m512 in1264 = _mm512_shuffle_f32x4(sf549, sf550, 238);
__m512 sf551 = _mm512_loadu_ps(sfPtr7+409664+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf552 = _mm512_loadu_ps(sfPtr7+409792+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1271 = _mm512_shuffle_f32x4(sf551, sf552, 68);
__m512 in1272 = _mm512_shuffle_f32x4(sf551, sf552, 238);
__m512 sf553 = _mm512_loadu_ps(sfPtr7+819200+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf554 = _mm512_loadu_ps(sfPtr7+819328+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1265 = _mm512_shuffle_f32x4(sf553, sf554, 68);
__m512 in1266 = _mm512_shuffle_f32x4(sf553, sf554, 238);
__m512 sf555 = _mm512_loadu_ps(sfPtr7+819264+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf556 = _mm512_loadu_ps(sfPtr7+819392+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1273 = _mm512_shuffle_f32x4(sf555, sf556, 68);
__m512 in1274 = _mm512_shuffle_f32x4(sf555, sf556, 238);
__m512 sf557 = _mm512_loadu_ps(sfPtr7+1228800+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf558 = _mm512_loadu_ps(sfPtr7+1228928+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1267 = _mm512_shuffle_f32x4(sf557, sf558, 68);
__m512 in1268 = _mm512_shuffle_f32x4(sf557, sf558, 238);
__m512 sf559 = _mm512_loadu_ps(sfPtr7+1228864+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf560 = _mm512_loadu_ps(sfPtr7+1228992+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1275 = _mm512_shuffle_f32x4(sf559, sf560, 68);
__m512 in1276 = _mm512_shuffle_f32x4(sf559, sf560, 238);
__m512 tmp8465 = _mm512_add_ps(in1262, in1263);
__m512 tmp8485 = _mm512_add_ps(in1270, in1271);
__m512 tmp8464 = _mm512_add_ps(in1264, in1265);
__m512 tmp8484 = _mm512_add_ps(in1272, in1273);
__m512 tmp8470 = _mm512_sub_ps(in1264, in1265);
__m512 tmp8490 = _mm512_sub_ps(in1272, in1273);
__m512 tmp8469 = _mm512_sub_ps(in1262, in1263);
__m512 tmp8489 = _mm512_sub_ps(in1270, in1271);
__m512 tmp8466 = _mm512_add_ps(in1266, in1267);
__m512 tmp8486 = _mm512_add_ps(in1274, in1275);
__m512 tmp8471 = _mm512_sub_ps(in1266, in1267);
__m512 tmp8491 = _mm512_sub_ps(in1274, in1275);
__m512 tmp8468 = _mm512_fmadd_ps(tmp8470, _mm512_set1_ps(2e+00f), tmp8469);
__m512 tmp8488 = _mm512_fmadd_ps(tmp8490, _mm512_set1_ps(2e+00f), tmp8489);
__m512 tmp8475 = _mm512_fmadd_ps(tmp8470, _mm512_set1_ps(8e+00f), tmp8469);
__m512 tmp8495 = _mm512_fmadd_ps(tmp8490, _mm512_set1_ps(8e+00f), tmp8489);
__m512 tmp8463 = _mm512_add_ps(tmp8464, tmp8465);
__m512 tmp8483 = _mm512_add_ps(tmp8484, tmp8485);
__m512 tmp8467 = _mm512_fmadd_ps(tmp8471, _mm512_set1_ps(1.6e+01f), tmp8468);
__m512 tmp8487 = _mm512_fmadd_ps(tmp8491, _mm512_set1_ps(1.6e+01f), tmp8488);
__m512 tmp8474 = _mm512_fmadd_ps(tmp8471, _mm512_set1_ps(4e+00f), tmp8475);
__m512 tmp8494 = _mm512_fmadd_ps(tmp8491, _mm512_set1_ps(4e+00f), tmp8495);
__m512 tmp8480 = _mm512_add_ps(tmp8471, tmp8469);
__m512 tmp8500 = _mm512_add_ps(tmp8491, tmp8489);
__m512 tmp8473 = _mm512_fmadd_ps(tmp8464, _mm512_set1_ps(4e+00f), tmp8465);
__m512 tmp8493 = _mm512_fmadd_ps(tmp8484, _mm512_set1_ps(4e+00f), tmp8485);
__m512 tmp8477 = _mm512_fmadd_ps(tmp8464, _mm512_set1_ps(1.6e+01f), tmp8465);
__m512 tmp8497 = _mm512_fmadd_ps(tmp8484, _mm512_set1_ps(1.6e+01f), tmp8485);
__m512 tmp8462 = _mm512_add_ps(tmp8463, in1261);
__m512 tmp8482 = _mm512_add_ps(tmp8483, in1269);
__m512 tmp8479 = _mm512_add_ps(tmp8480, in1268);
__m512 tmp8499 = _mm512_add_ps(tmp8500, in1276);
__m512 tmp8461 = _mm512_fmadd_ps(tmp8466, _mm512_set1_ps(3.2e+01f), tmp8462);
__m512 tmp8481 = _mm512_fmadd_ps(tmp8486, _mm512_set1_ps(3.2e+01f), tmp8482);
__m512 tmp8472 = _mm512_fmadd_ps(tmp8466, _mm512_set1_ps(8e+00f), tmp8473);
__m512 tmp8492 = _mm512_fmadd_ps(tmp8486, _mm512_set1_ps(8e+00f), tmp8493);
__m512 tmp8478 = _mm512_fmadd_ps(tmp8470, _mm512_set1_ps(3.2e+01f), tmp8479);
__m512 tmp8498 = _mm512_fmadd_ps(tmp8490, _mm512_set1_ps(3.2e+01f), tmp8499);
__m512 tmp8476 = _mm512_fmadd_ps(tmp8466, _mm512_set1_ps(2e+00f), tmp8477);
__m512 tmp8496 = _mm512_fmadd_ps(tmp8486, _mm512_set1_ps(2e+00f), tmp8497);
__m512 tmp8449 = tmp8461;
__m512 tmp8455 = tmp8481;
__m512 tmp8450 = tmp8467;
__m512 tmp8456 = tmp8487;
__m512 tmp8451 = tmp8472;
__m512 tmp8457 = tmp8492;
__m512 tmp8452 = tmp8474;
__m512 tmp8458 = tmp8494;
__m512 tmp8453 = tmp8476;
__m512 tmp8459 = tmp8496;
__m512 tmp8454 = tmp8478;
__m512 tmp8460 = tmp8498;
__m512 tmp8545 = _mm512_unpacklo_ps(tmp8449, tmp8450);
__m512 tmp8546 = _mm512_unpackhi_ps(tmp8449, tmp8450);
__m512 tmp8547 = _mm512_unpacklo_ps(tmp8451, tmp8452);
__m512 tmp8548 = _mm512_unpackhi_ps(tmp8451, tmp8452);
__m512 tmp8549 = _mm512_unpacklo_ps(tmp8453, tmp8454);
__m512 tmp8550 = _mm512_unpackhi_ps(tmp8453, tmp8454);
__m512 tmp8551 = _mm512_unpacklo_ps(tmp8455, tmp8456);
__m512 tmp8552 = _mm512_unpackhi_ps(tmp8455, tmp8456);
__m512 tmp8553 = _mm512_unpacklo_ps(tmp8457, tmp8458);
__m512 tmp8554 = _mm512_unpackhi_ps(tmp8457, tmp8458);
__m512 tmp8555 = _mm512_unpacklo_ps(tmp8459, tmp8460);
__m512 tmp8556 = _mm512_unpackhi_ps(tmp8459, tmp8460);
__m512 tmp8557 = _mm512_shuffle_ps(tmp8545, tmp8547, 68);
__m512 tmp8558 = _mm512_shuffle_ps(tmp8545, tmp8547, 238);
__m512 tmp8559 = _mm512_shuffle_ps(tmp8546, tmp8548, 68);
__m512 tmp8560 = _mm512_shuffle_ps(tmp8546, tmp8548, 238);
__m512 tmp8561 = _mm512_shuffle_ps(tmp8549, tmp8551, 68);
__m512 tmp8562 = _mm512_shuffle_ps(tmp8549, tmp8551, 238);
__m512 tmp8563 = _mm512_shuffle_ps(tmp8550, tmp8552, 68);
__m512 tmp8564 = _mm512_shuffle_ps(tmp8550, tmp8552, 238);
__m512 tmp8565 = _mm512_shuffle_ps(tmp8553, tmp8555, 68);
__m512 tmp8566 = _mm512_shuffle_ps(tmp8553, tmp8555, 238);
__m512 tmp8567 = _mm512_shuffle_ps(tmp8554, tmp8556, 68);
__m512 tmp8568 = _mm512_shuffle_ps(tmp8554, tmp8556, 238);
__m512 tmp8569 = _mm512_shuffle_f32x4(tmp8557, tmp8561, 136);
__m512 tmp8570 = _mm512_shuffle_f32x4(tmp8557, tmp8561, 221);
__m512 tmp8571 = _mm512_shuffle_f32x4(tmp8558, tmp8562, 136);
__m512 tmp8572 = _mm512_shuffle_f32x4(tmp8558, tmp8562, 221);
__m512 tmp8573 = _mm512_shuffle_f32x4(tmp8559, tmp8563, 136);
__m512 tmp8574 = _mm512_shuffle_f32x4(tmp8559, tmp8563, 221);
__m512 tmp8575 = _mm512_shuffle_f32x4(tmp8560, tmp8564, 136);
__m512 tmp8576 = _mm512_shuffle_f32x4(tmp8560, tmp8564, 221);
__m512 tmp8577 = _mm512_shuffle_f32x4(tmp8565, tmp8565, 136);
__m512 tmp8578 = _mm512_shuffle_f32x4(tmp8565, tmp8565, 221);
__m512 tmp8579 = _mm512_shuffle_f32x4(tmp8566, tmp8566, 136);
__m512 tmp8580 = _mm512_shuffle_f32x4(tmp8566, tmp8566, 221);
__m512 tmp8581 = _mm512_shuffle_f32x4(tmp8567, tmp8567, 136);
__m512 tmp8582 = _mm512_shuffle_f32x4(tmp8567, tmp8567, 221);
__m512 tmp8583 = _mm512_shuffle_f32x4(tmp8568, tmp8568, 136);
__m512 tmp8584 = _mm512_shuffle_f32x4(tmp8568, tmp8568, 221);
tmp8449 = _mm512_shuffle_f32x4(tmp8569, tmp8577, 136);
tmp8457 = _mm512_shuffle_f32x4(tmp8569, tmp8577, 221);
tmp8450 = _mm512_shuffle_f32x4(tmp8571, tmp8579, 136);
tmp8458 = _mm512_shuffle_f32x4(tmp8571, tmp8579, 221);
tmp8451 = _mm512_shuffle_f32x4(tmp8573, tmp8581, 136);
tmp8459 = _mm512_shuffle_f32x4(tmp8573, tmp8581, 221);
tmp8452 = _mm512_shuffle_f32x4(tmp8575, tmp8583, 136);
tmp8460 = _mm512_shuffle_f32x4(tmp8575, tmp8583, 221);
tmp8453 = _mm512_shuffle_f32x4(tmp8570, tmp8578, 136);
__m512 tmp8501 = _mm512_shuffle_f32x4(tmp8570, tmp8578, 221);
tmp8454 = _mm512_shuffle_f32x4(tmp8572, tmp8580, 136);
__m512 tmp8502 = _mm512_shuffle_f32x4(tmp8572, tmp8580, 221);
tmp8455 = _mm512_shuffle_f32x4(tmp8574, tmp8582, 136);
__m512 tmp8503 = _mm512_shuffle_f32x4(tmp8574, tmp8582, 221);
tmp8456 = _mm512_shuffle_f32x4(tmp8576, tmp8584, 136);
__m512 tmp8504 = _mm512_shuffle_f32x4(tmp8576, tmp8584, 221);
__m512 tmp8509 = _mm512_add_ps(tmp8450, tmp8451);
__m512 tmp8529 = _mm512_add_ps(tmp8458, tmp8459);
__m512 tmp8508 = _mm512_add_ps(tmp8452, tmp8453);
__m512 tmp8528 = _mm512_add_ps(tmp8460, tmp8501);
__m512 tmp8514 = _mm512_sub_ps(tmp8452, tmp8453);
__m512 tmp8534 = _mm512_sub_ps(tmp8460, tmp8501);
__m512 tmp8513 = _mm512_sub_ps(tmp8450, tmp8451);
__m512 tmp8533 = _mm512_sub_ps(tmp8458, tmp8459);
__m512 tmp8510 = _mm512_add_ps(tmp8454, tmp8455);
__m512 tmp8530 = _mm512_add_ps(tmp8502, tmp8503);
__m512 tmp8515 = _mm512_sub_ps(tmp8454, tmp8455);
__m512 tmp8535 = _mm512_sub_ps(tmp8502, tmp8503);
__m512 tmp8512 = _mm512_fmadd_ps(tmp8514, _mm512_set1_ps(2e+00f), tmp8513);
__m512 tmp8532 = _mm512_fmadd_ps(tmp8534, _mm512_set1_ps(2e+00f), tmp8533);
__m512 tmp8519 = _mm512_fmadd_ps(tmp8514, _mm512_set1_ps(8e+00f), tmp8513);
__m512 tmp8539 = _mm512_fmadd_ps(tmp8534, _mm512_set1_ps(8e+00f), tmp8533);
__m512 tmp8507 = _mm512_add_ps(tmp8508, tmp8509);
__m512 tmp8527 = _mm512_add_ps(tmp8528, tmp8529);
__m512 tmp8511 = _mm512_fmadd_ps(tmp8515, _mm512_set1_ps(1.6e+01f), tmp8512);
__m512 tmp8531 = _mm512_fmadd_ps(tmp8535, _mm512_set1_ps(1.6e+01f), tmp8532);
__m512 tmp8518 = _mm512_fmadd_ps(tmp8515, _mm512_set1_ps(4e+00f), tmp8519);
__m512 tmp8538 = _mm512_fmadd_ps(tmp8535, _mm512_set1_ps(4e+00f), tmp8539);
__m512 tmp8524 = _mm512_add_ps(tmp8515, tmp8513);
__m512 tmp8544 = _mm512_add_ps(tmp8535, tmp8533);
__m512 tmp8517 = _mm512_fmadd_ps(tmp8508, _mm512_set1_ps(4e+00f), tmp8509);
__m512 tmp8537 = _mm512_fmadd_ps(tmp8528, _mm512_set1_ps(4e+00f), tmp8529);
__m512 tmp8521 = _mm512_fmadd_ps(tmp8508, _mm512_set1_ps(1.6e+01f), tmp8509);
__m512 tmp8541 = _mm512_fmadd_ps(tmp8528, _mm512_set1_ps(1.6e+01f), tmp8529);
__m512 tmp8506 = _mm512_add_ps(tmp8507, tmp8449);
__m512 tmp8526 = _mm512_add_ps(tmp8527, tmp8457);
__m512 tmp8523 = _mm512_add_ps(tmp8524, tmp8456);
__m512 tmp8543 = _mm512_add_ps(tmp8544, tmp8504);
__m512 tmp8505 = _mm512_fmadd_ps(tmp8510, _mm512_set1_ps(3.2e+01f), tmp8506);
__m512 tmp8525 = _mm512_fmadd_ps(tmp8530, _mm512_set1_ps(3.2e+01f), tmp8526);
__m512 tmp8516 = _mm512_fmadd_ps(tmp8510, _mm512_set1_ps(8e+00f), tmp8517);
__m512 tmp8536 = _mm512_fmadd_ps(tmp8530, _mm512_set1_ps(8e+00f), tmp8537);
__m512 tmp8522 = _mm512_fmadd_ps(tmp8514, _mm512_set1_ps(3.2e+01f), tmp8523);
__m512 tmp8542 = _mm512_fmadd_ps(tmp8534, _mm512_set1_ps(3.2e+01f), tmp8543);
__m512 tmp8520 = _mm512_fmadd_ps(tmp8510, _mm512_set1_ps(2e+00f), tmp8521);
__m512 tmp8540 = _mm512_fmadd_ps(tmp8530, _mm512_set1_ps(2e+00f), tmp8541);
__m512 out1215 = tmp8505;
__m512 out1221 = tmp8525;
__m512 out1216 = tmp8511;
__m512 out1222 = tmp8531;
__m512 out1217 = tmp8516;
__m512 out1223 = tmp8536;
__m512 out1218 = tmp8518;
__m512 out1224 = tmp8538;
__m512 out1219 = tmp8520;
__m512 out1225 = tmp8540;
__m512 out1220 = tmp8522;
__m512 out1226 = tmp8542;
out1215 = _mm512_max_ps(_mm512_setzero_ps(), out1215);
out1221 = _mm512_max_ps(_mm512_setzero_ps(), out1221);
out1216 = _mm512_max_ps(_mm512_setzero_ps(), out1216);
out1222 = _mm512_max_ps(_mm512_setzero_ps(), out1222);
out1217 = _mm512_max_ps(_mm512_setzero_ps(), out1217);
out1223 = _mm512_max_ps(_mm512_setzero_ps(), out1223);
out1218 = _mm512_max_ps(_mm512_setzero_ps(), out1218);
out1224 = _mm512_max_ps(_mm512_setzero_ps(), out1224);
out1219 = _mm512_max_ps(_mm512_setzero_ps(), out1219);
out1225 = _mm512_max_ps(_mm512_setzero_ps(), out1225);
out1220 = _mm512_max_ps(_mm512_setzero_ps(), out1220);
out1226 = _mm512_max_ps(_mm512_setzero_ps(), out1226);
_mm512_mask_storeu_ps(datPtr13+0+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 255, out1215);
_mm512_mask_storeu_ps(datPtr13+1152+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1221);
_mm512_mask_storeu_ps(datPtr13+224+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 255, out1216);
_mm512_mask_storeu_ps(datPtr13+1376+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1222);
_mm512_mask_storeu_ps(datPtr13+448+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 255, out1217);
_mm512_mask_storeu_ps(datPtr13+1600+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1223);
_mm512_mask_storeu_ps(datPtr13+672+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 255, out1218);
_mm512_mask_storeu_ps(datPtr13+1824+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1224);
_mm512_mask_storeu_ps(datPtr13+896+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 255, out1219);
_mm512_mask_storeu_ps(datPtr13+2048+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1225);
_mm512_mask_storeu_ps(datPtr13+1120+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 255, out1220);
_mm512_mask_storeu_ps(datPtr13+2272+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1226);
// Fetch sixteen 16-float vectors from sfPtr7: four regions spaced 409600 apart
// (base offsets 256, 409856, 819456, 1229056), with four loads per region at +0, +64, +128, +192.
// NOTE(review): the 64-unit spacing equals one __m512 only if sfPtr7 is a byte pointer -- confirm at its declaration.
// Each shuffle pair regroups two loads: imm 68 keeps the low 256 bits of both sources,
// imm 238 keeps the high 256 bits of both sources.
// The +0/+128 loads produce in1277..in1284, the +64/+192 loads produce in1285..in1292.
// Region 0.
__m512 sf561 = _mm512_loadu_ps(sfPtr7+256+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf562 = _mm512_loadu_ps(sfPtr7+384+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1277 = _mm512_shuffle_f32x4(sf561, sf562, 68);
__m512 in1278 = _mm512_shuffle_f32x4(sf561, sf562, 238);
__m512 sf563 = _mm512_loadu_ps(sfPtr7+320+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf564 = _mm512_loadu_ps(sfPtr7+448+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1285 = _mm512_shuffle_f32x4(sf563, sf564, 68);
__m512 in1286 = _mm512_shuffle_f32x4(sf563, sf564, 238);
// Region 1 (+409600).
__m512 sf565 = _mm512_loadu_ps(sfPtr7+409856+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf566 = _mm512_loadu_ps(sfPtr7+409984+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1279 = _mm512_shuffle_f32x4(sf565, sf566, 68);
__m512 in1280 = _mm512_shuffle_f32x4(sf565, sf566, 238);
__m512 sf567 = _mm512_loadu_ps(sfPtr7+409920+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf568 = _mm512_loadu_ps(sfPtr7+410048+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1287 = _mm512_shuffle_f32x4(sf567, sf568, 68);
__m512 in1288 = _mm512_shuffle_f32x4(sf567, sf568, 238);
// Region 2 (+819200).
__m512 sf569 = _mm512_loadu_ps(sfPtr7+819456+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf570 = _mm512_loadu_ps(sfPtr7+819584+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1281 = _mm512_shuffle_f32x4(sf569, sf570, 68);
__m512 in1282 = _mm512_shuffle_f32x4(sf569, sf570, 238);
__m512 sf571 = _mm512_loadu_ps(sfPtr7+819520+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf572 = _mm512_loadu_ps(sfPtr7+819648+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1289 = _mm512_shuffle_f32x4(sf571, sf572, 68);
__m512 in1290 = _mm512_shuffle_f32x4(sf571, sf572, 238);
// Region 3 (+1228800).
__m512 sf573 = _mm512_loadu_ps(sfPtr7+1229056+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf574 = _mm512_loadu_ps(sfPtr7+1229184+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1283 = _mm512_shuffle_f32x4(sf573, sf574, 68);
__m512 in1284 = _mm512_shuffle_f32x4(sf573, sf574, 238);
__m512 sf575 = _mm512_loadu_ps(sfPtr7+1229120+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf576 = _mm512_loadu_ps(sfPtr7+1229248+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1291 = _mm512_shuffle_f32x4(sf575, sf576, 68);
__m512 in1292 = _mm512_shuffle_f32x4(sf575, sf576, 238);
// First pass: combine eight input vectors x0..x7 into six results y0..y5, element-wise.
// Two independent chains are interleaved line by line:
//   chain A: x0..x7 = in1277..in1284, chain B: x0..x7 = in1285..in1292.
// Formulas implemented by the add/sub/fmadd sequence below:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): this coefficient pattern looks like a Winograd-style 8-to-6 output transform -- confirm against the generator.
// Pairwise sums and differences shared by several outputs.
__m512 tmp8601 = _mm512_add_ps(in1278, in1279);
__m512 tmp8621 = _mm512_add_ps(in1286, in1287);
__m512 tmp8600 = _mm512_add_ps(in1280, in1281);
__m512 tmp8620 = _mm512_add_ps(in1288, in1289);
__m512 tmp8606 = _mm512_sub_ps(in1280, in1281);
__m512 tmp8626 = _mm512_sub_ps(in1288, in1289);
__m512 tmp8605 = _mm512_sub_ps(in1278, in1279);
__m512 tmp8625 = _mm512_sub_ps(in1286, in1287);
__m512 tmp8602 = _mm512_add_ps(in1282, in1283);
__m512 tmp8622 = _mm512_add_ps(in1290, in1291);
__m512 tmp8607 = _mm512_sub_ps(in1282, in1283);
__m512 tmp8627 = _mm512_sub_ps(in1290, in1291);
// Partial accumulations toward y1, y3, y0, y5, y2 and y4.
__m512 tmp8604 = _mm512_fmadd_ps(tmp8606, _mm512_set1_ps(2e+00f), tmp8605);
__m512 tmp8624 = _mm512_fmadd_ps(tmp8626, _mm512_set1_ps(2e+00f), tmp8625);
__m512 tmp8611 = _mm512_fmadd_ps(tmp8606, _mm512_set1_ps(8e+00f), tmp8605);
__m512 tmp8631 = _mm512_fmadd_ps(tmp8626, _mm512_set1_ps(8e+00f), tmp8625);
__m512 tmp8599 = _mm512_add_ps(tmp8600, tmp8601);
__m512 tmp8619 = _mm512_add_ps(tmp8620, tmp8621);
__m512 tmp8603 = _mm512_fmadd_ps(tmp8607, _mm512_set1_ps(1.6e+01f), tmp8604);
__m512 tmp8623 = _mm512_fmadd_ps(tmp8627, _mm512_set1_ps(1.6e+01f), tmp8624);
__m512 tmp8610 = _mm512_fmadd_ps(tmp8607, _mm512_set1_ps(4e+00f), tmp8611);
__m512 tmp8630 = _mm512_fmadd_ps(tmp8627, _mm512_set1_ps(4e+00f), tmp8631);
__m512 tmp8616 = _mm512_add_ps(tmp8607, tmp8605);
__m512 tmp8636 = _mm512_add_ps(tmp8627, tmp8625);
__m512 tmp8609 = _mm512_fmadd_ps(tmp8600, _mm512_set1_ps(4e+00f), tmp8601);
__m512 tmp8629 = _mm512_fmadd_ps(tmp8620, _mm512_set1_ps(4e+00f), tmp8621);
__m512 tmp8613 = _mm512_fmadd_ps(tmp8600, _mm512_set1_ps(1.6e+01f), tmp8601);
__m512 tmp8633 = _mm512_fmadd_ps(tmp8620, _mm512_set1_ps(1.6e+01f), tmp8621);
// Fold in the end terms: x0 contributes only to y0, x7 only to y5.
__m512 tmp8598 = _mm512_add_ps(tmp8599, in1277);
__m512 tmp8618 = _mm512_add_ps(tmp8619, in1285);
__m512 tmp8615 = _mm512_add_ps(tmp8616, in1284);
__m512 tmp8635 = _mm512_add_ps(tmp8636, in1292);
// Final fmadd for y0, y2, y5 and y4.
__m512 tmp8597 = _mm512_fmadd_ps(tmp8602, _mm512_set1_ps(3.2e+01f), tmp8598);
__m512 tmp8617 = _mm512_fmadd_ps(tmp8622, _mm512_set1_ps(3.2e+01f), tmp8618);
__m512 tmp8608 = _mm512_fmadd_ps(tmp8602, _mm512_set1_ps(8e+00f), tmp8609);
__m512 tmp8628 = _mm512_fmadd_ps(tmp8622, _mm512_set1_ps(8e+00f), tmp8629);
__m512 tmp8614 = _mm512_fmadd_ps(tmp8606, _mm512_set1_ps(3.2e+01f), tmp8615);
__m512 tmp8634 = _mm512_fmadd_ps(tmp8626, _mm512_set1_ps(3.2e+01f), tmp8635);
__m512 tmp8612 = _mm512_fmadd_ps(tmp8602, _mm512_set1_ps(2e+00f), tmp8613);
__m512 tmp8632 = _mm512_fmadd_ps(tmp8622, _mm512_set1_ps(2e+00f), tmp8633);
// Collect the results as twelve row vectors: tmp8585..tmp8590 = y0..y5 of chain A,
// tmp8591..tmp8596 = y0..y5 of chain B. These names are reassigned later in this scope.
__m512 tmp8585 = tmp8597;
__m512 tmp8591 = tmp8617;
__m512 tmp8586 = tmp8603;
__m512 tmp8592 = tmp8623;
__m512 tmp8587 = tmp8608;
__m512 tmp8593 = tmp8628;
__m512 tmp8588 = tmp8610;
__m512 tmp8594 = tmp8630;
__m512 tmp8589 = tmp8612;
__m512 tmp8595 = tmp8632;
__m512 tmp8590 = tmp8614;
__m512 tmp8596 = tmp8634;
// Transpose the twelve 16-float rows tmp8585..tmp8596 into sixteen column vectors.
// Result: tmp8585..tmp8592 hold columns 0..7; tmp8593..tmp8596 and tmp8637..tmp8640 hold columns 8..15.
// In every result, lanes 0..11 carry rows 0..11 and lanes 12..15 repeat lanes 8..11.
// Step 1: interleave the 32-bit elements of adjacent row pairs within each 128-bit lane.
__m512 tmp8681 = _mm512_unpacklo_ps(tmp8585, tmp8586);
__m512 tmp8682 = _mm512_unpackhi_ps(tmp8585, tmp8586);
__m512 tmp8683 = _mm512_unpacklo_ps(tmp8587, tmp8588);
__m512 tmp8684 = _mm512_unpackhi_ps(tmp8587, tmp8588);
__m512 tmp8685 = _mm512_unpacklo_ps(tmp8589, tmp8590);
__m512 tmp8686 = _mm512_unpackhi_ps(tmp8589, tmp8590);
__m512 tmp8687 = _mm512_unpacklo_ps(tmp8591, tmp8592);
__m512 tmp8688 = _mm512_unpackhi_ps(tmp8591, tmp8592);
__m512 tmp8689 = _mm512_unpacklo_ps(tmp8593, tmp8594);
__m512 tmp8690 = _mm512_unpackhi_ps(tmp8593, tmp8594);
__m512 tmp8691 = _mm512_unpacklo_ps(tmp8595, tmp8596);
__m512 tmp8692 = _mm512_unpackhi_ps(tmp8595, tmp8596);
// Step 2: join two row pairs (imm 68 = low element pairs, imm 238 = high element pairs),
// so each 128-bit lane now holds one column across four consecutive rows.
__m512 tmp8693 = _mm512_shuffle_ps(tmp8681, tmp8683, 68);
__m512 tmp8694 = _mm512_shuffle_ps(tmp8681, tmp8683, 238);
__m512 tmp8695 = _mm512_shuffle_ps(tmp8682, tmp8684, 68);
__m512 tmp8696 = _mm512_shuffle_ps(tmp8682, tmp8684, 238);
__m512 tmp8697 = _mm512_shuffle_ps(tmp8685, tmp8687, 68);
__m512 tmp8698 = _mm512_shuffle_ps(tmp8685, tmp8687, 238);
__m512 tmp8699 = _mm512_shuffle_ps(tmp8686, tmp8688, 68);
__m512 tmp8700 = _mm512_shuffle_ps(tmp8686, tmp8688, 238);
__m512 tmp8701 = _mm512_shuffle_ps(tmp8689, tmp8691, 68);
__m512 tmp8702 = _mm512_shuffle_ps(tmp8689, tmp8691, 238);
__m512 tmp8703 = _mm512_shuffle_ps(tmp8690, tmp8692, 68);
__m512 tmp8704 = _mm512_shuffle_ps(tmp8690, tmp8692, 238);
// Step 3: regroup 128-bit lanes (imm 136 = lanes 0 and 2 of each source, imm 221 = lanes 1 and 3),
// pairing the rows 0..3 vectors with the rows 4..7 vectors.
__m512 tmp8705 = _mm512_shuffle_f32x4(tmp8693, tmp8697, 136);
__m512 tmp8706 = _mm512_shuffle_f32x4(tmp8693, tmp8697, 221);
__m512 tmp8707 = _mm512_shuffle_f32x4(tmp8694, tmp8698, 136);
__m512 tmp8708 = _mm512_shuffle_f32x4(tmp8694, tmp8698, 221);
__m512 tmp8709 = _mm512_shuffle_f32x4(tmp8695, tmp8699, 136);
__m512 tmp8710 = _mm512_shuffle_f32x4(tmp8695, tmp8699, 221);
__m512 tmp8711 = _mm512_shuffle_f32x4(tmp8696, tmp8700, 136);
__m512 tmp8712 = _mm512_shuffle_f32x4(tmp8696, tmp8700, 221);
// Rows 8..11 have no partner group, so each vector is shuffled with itself (this creates the duplicated top lanes).
__m512 tmp8713 = _mm512_shuffle_f32x4(tmp8701, tmp8701, 136);
__m512 tmp8714 = _mm512_shuffle_f32x4(tmp8701, tmp8701, 221);
__m512 tmp8715 = _mm512_shuffle_f32x4(tmp8702, tmp8702, 136);
__m512 tmp8716 = _mm512_shuffle_f32x4(tmp8702, tmp8702, 221);
__m512 tmp8717 = _mm512_shuffle_f32x4(tmp8703, tmp8703, 136);
__m512 tmp8718 = _mm512_shuffle_f32x4(tmp8703, tmp8703, 221);
__m512 tmp8719 = _mm512_shuffle_f32x4(tmp8704, tmp8704, 136);
__m512 tmp8720 = _mm512_shuffle_f32x4(tmp8704, tmp8704, 221);
// Step 4: final lane regroup; imm 136 yields column c, imm 221 yields column c+8.
// Columns 0..11 overwrite the row variables, columns 12..15 need four new names.
tmp8585 = _mm512_shuffle_f32x4(tmp8705, tmp8713, 136);
tmp8593 = _mm512_shuffle_f32x4(tmp8705, tmp8713, 221);
tmp8586 = _mm512_shuffle_f32x4(tmp8707, tmp8715, 136);
tmp8594 = _mm512_shuffle_f32x4(tmp8707, tmp8715, 221);
tmp8587 = _mm512_shuffle_f32x4(tmp8709, tmp8717, 136);
tmp8595 = _mm512_shuffle_f32x4(tmp8709, tmp8717, 221);
tmp8588 = _mm512_shuffle_f32x4(tmp8711, tmp8719, 136);
tmp8596 = _mm512_shuffle_f32x4(tmp8711, tmp8719, 221);
tmp8589 = _mm512_shuffle_f32x4(tmp8706, tmp8714, 136);
__m512 tmp8637 = _mm512_shuffle_f32x4(tmp8706, tmp8714, 221);
tmp8590 = _mm512_shuffle_f32x4(tmp8708, tmp8716, 136);
__m512 tmp8638 = _mm512_shuffle_f32x4(tmp8708, tmp8716, 221);
tmp8591 = _mm512_shuffle_f32x4(tmp8710, tmp8718, 136);
__m512 tmp8639 = _mm512_shuffle_f32x4(tmp8710, tmp8718, 221);
tmp8592 = _mm512_shuffle_f32x4(tmp8712, tmp8720, 136);
__m512 tmp8640 = _mm512_shuffle_f32x4(tmp8712, tmp8720, 221);
// Second pass: apply the 8-to-6 combination again, element-wise, to two interleaved chains:
//   chain A: x0..x7 = tmp8585..tmp8592
//   chain B: x0..x7 = tmp8593, tmp8594, tmp8595, tmp8596, tmp8637, tmp8638, tmp8639, tmp8640
// Formulas:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// Pairwise sums and differences shared by several outputs.
__m512 tmp8645 = _mm512_add_ps(tmp8586, tmp8587);
__m512 tmp8665 = _mm512_add_ps(tmp8594, tmp8595);
__m512 tmp8644 = _mm512_add_ps(tmp8588, tmp8589);
__m512 tmp8664 = _mm512_add_ps(tmp8596, tmp8637);
__m512 tmp8650 = _mm512_sub_ps(tmp8588, tmp8589);
__m512 tmp8670 = _mm512_sub_ps(tmp8596, tmp8637);
__m512 tmp8649 = _mm512_sub_ps(tmp8586, tmp8587);
__m512 tmp8669 = _mm512_sub_ps(tmp8594, tmp8595);
__m512 tmp8646 = _mm512_add_ps(tmp8590, tmp8591);
__m512 tmp8666 = _mm512_add_ps(tmp8638, tmp8639);
__m512 tmp8651 = _mm512_sub_ps(tmp8590, tmp8591);
__m512 tmp8671 = _mm512_sub_ps(tmp8638, tmp8639);
// Partial accumulations toward y1, y3, y0, y5, y2 and y4.
__m512 tmp8648 = _mm512_fmadd_ps(tmp8650, _mm512_set1_ps(2e+00f), tmp8649);
__m512 tmp8668 = _mm512_fmadd_ps(tmp8670, _mm512_set1_ps(2e+00f), tmp8669);
__m512 tmp8655 = _mm512_fmadd_ps(tmp8650, _mm512_set1_ps(8e+00f), tmp8649);
__m512 tmp8675 = _mm512_fmadd_ps(tmp8670, _mm512_set1_ps(8e+00f), tmp8669);
__m512 tmp8643 = _mm512_add_ps(tmp8644, tmp8645);
__m512 tmp8663 = _mm512_add_ps(tmp8664, tmp8665);
__m512 tmp8647 = _mm512_fmadd_ps(tmp8651, _mm512_set1_ps(1.6e+01f), tmp8648);
__m512 tmp8667 = _mm512_fmadd_ps(tmp8671, _mm512_set1_ps(1.6e+01f), tmp8668);
__m512 tmp8654 = _mm512_fmadd_ps(tmp8651, _mm512_set1_ps(4e+00f), tmp8655);
__m512 tmp8674 = _mm512_fmadd_ps(tmp8671, _mm512_set1_ps(4e+00f), tmp8675);
__m512 tmp8660 = _mm512_add_ps(tmp8651, tmp8649);
__m512 tmp8680 = _mm512_add_ps(tmp8671, tmp8669);
__m512 tmp8653 = _mm512_fmadd_ps(tmp8644, _mm512_set1_ps(4e+00f), tmp8645);
__m512 tmp8673 = _mm512_fmadd_ps(tmp8664, _mm512_set1_ps(4e+00f), tmp8665);
__m512 tmp8657 = _mm512_fmadd_ps(tmp8644, _mm512_set1_ps(1.6e+01f), tmp8645);
__m512 tmp8677 = _mm512_fmadd_ps(tmp8664, _mm512_set1_ps(1.6e+01f), tmp8665);
// Fold in the end terms: x0 contributes only to y0, x7 only to y5.
__m512 tmp8642 = _mm512_add_ps(tmp8643, tmp8585);
__m512 tmp8662 = _mm512_add_ps(tmp8663, tmp8593);
__m512 tmp8659 = _mm512_add_ps(tmp8660, tmp8592);
__m512 tmp8679 = _mm512_add_ps(tmp8680, tmp8640);
// Final fmadd for y0, y2, y5 and y4.
__m512 tmp8641 = _mm512_fmadd_ps(tmp8646, _mm512_set1_ps(3.2e+01f), tmp8642);
__m512 tmp8661 = _mm512_fmadd_ps(tmp8666, _mm512_set1_ps(3.2e+01f), tmp8662);
__m512 tmp8652 = _mm512_fmadd_ps(tmp8646, _mm512_set1_ps(8e+00f), tmp8653);
__m512 tmp8672 = _mm512_fmadd_ps(tmp8666, _mm512_set1_ps(8e+00f), tmp8673);
__m512 tmp8658 = _mm512_fmadd_ps(tmp8650, _mm512_set1_ps(3.2e+01f), tmp8659);
__m512 tmp8678 = _mm512_fmadd_ps(tmp8670, _mm512_set1_ps(3.2e+01f), tmp8679);
__m512 tmp8656 = _mm512_fmadd_ps(tmp8646, _mm512_set1_ps(2e+00f), tmp8657);
__m512 tmp8676 = _mm512_fmadd_ps(tmp8666, _mm512_set1_ps(2e+00f), tmp8677);
// Name the results: out1227..out1232 = y0..y5 of chain A, out1233..out1238 = y0..y5 of chain B.
__m512 out1227 = tmp8641;
__m512 out1233 = tmp8661;
__m512 out1228 = tmp8647;
__m512 out1234 = tmp8667;
__m512 out1229 = tmp8652;
__m512 out1235 = tmp8672;
__m512 out1230 = tmp8654;
__m512 out1236 = tmp8674;
__m512 out1231 = tmp8656;
__m512 out1237 = tmp8676;
__m512 out1232 = tmp8658;
__m512 out1238 = tmp8678;
// Clamp every element at zero from below (element-wise max with 0.0f).
out1227 = _mm512_max_ps(_mm512_setzero_ps(), out1227);
out1233 = _mm512_max_ps(_mm512_setzero_ps(), out1233);
out1228 = _mm512_max_ps(_mm512_setzero_ps(), out1228);
out1234 = _mm512_max_ps(_mm512_setzero_ps(), out1234);
out1229 = _mm512_max_ps(_mm512_setzero_ps(), out1229);
out1235 = _mm512_max_ps(_mm512_setzero_ps(), out1235);
out1230 = _mm512_max_ps(_mm512_setzero_ps(), out1230);
out1236 = _mm512_max_ps(_mm512_setzero_ps(), out1236);
out1231 = _mm512_max_ps(_mm512_setzero_ps(), out1231);
out1237 = _mm512_max_ps(_mm512_setzero_ps(), out1237);
out1232 = _mm512_max_ps(_mm512_setzero_ps(), out1232);
out1238 = _mm512_max_ps(_mm512_setzero_ps(), out1238);
// Masked stores into datPtr13; within each chain the six destinations are 224 apart.
// Chain A starts at +1200 with mask 4095 (0xFFF), which writes only the low 12 floats.
// Chain B starts at +12608 with mask 255 (0xFF), which writes only the low 8 floats.
// NOTE(review): the 4*toW32 term suggests byte offsets with 4-byte floats -- assumes datPtr13 is a byte pointer; confirm at its declaration.
_mm512_mask_storeu_ps(datPtr13+1200+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1227);
_mm512_mask_storeu_ps(datPtr13+12608+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 255, out1233);
_mm512_mask_storeu_ps(datPtr13+1424+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1228);
_mm512_mask_storeu_ps(datPtr13+12832+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 255, out1234);
_mm512_mask_storeu_ps(datPtr13+1648+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1229);
_mm512_mask_storeu_ps(datPtr13+13056+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 255, out1235);
_mm512_mask_storeu_ps(datPtr13+1872+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1230);
_mm512_mask_storeu_ps(datPtr13+13280+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 255, out1236);
_mm512_mask_storeu_ps(datPtr13+2096+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1231);
_mm512_mask_storeu_ps(datPtr13+13504+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 255, out1237);
_mm512_mask_storeu_ps(datPtr13+2320+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1232);
_mm512_mask_storeu_ps(datPtr13+13728+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 255, out1238);
// Fetch sixteen 16-float vectors from sfPtr7: four regions spaced 409600 apart
// (base offsets 512, 410112, 819712, 1229312), with four loads per region at +0, +64, +128, +192.
// NOTE(review): the 64-unit spacing equals one __m512 only if sfPtr7 is a byte pointer -- confirm at its declaration.
// Each shuffle pair regroups two loads: imm 68 keeps the low 256 bits of both sources,
// imm 238 keeps the high 256 bits of both sources.
// The +0/+128 loads produce in1293..in1300, the +64/+192 loads produce in1301..in1308.
// Region 0.
__m512 sf577 = _mm512_loadu_ps(sfPtr7+512+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf578 = _mm512_loadu_ps(sfPtr7+640+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1293 = _mm512_shuffle_f32x4(sf577, sf578, 68);
__m512 in1294 = _mm512_shuffle_f32x4(sf577, sf578, 238);
__m512 sf579 = _mm512_loadu_ps(sfPtr7+576+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf580 = _mm512_loadu_ps(sfPtr7+704+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1301 = _mm512_shuffle_f32x4(sf579, sf580, 68);
__m512 in1302 = _mm512_shuffle_f32x4(sf579, sf580, 238);
// Region 1 (+409600).
__m512 sf581 = _mm512_loadu_ps(sfPtr7+410112+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf582 = _mm512_loadu_ps(sfPtr7+410240+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1295 = _mm512_shuffle_f32x4(sf581, sf582, 68);
__m512 in1296 = _mm512_shuffle_f32x4(sf581, sf582, 238);
__m512 sf583 = _mm512_loadu_ps(sfPtr7+410176+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf584 = _mm512_loadu_ps(sfPtr7+410304+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1303 = _mm512_shuffle_f32x4(sf583, sf584, 68);
__m512 in1304 = _mm512_shuffle_f32x4(sf583, sf584, 238);
// Region 2 (+819200).
__m512 sf585 = _mm512_loadu_ps(sfPtr7+819712+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf586 = _mm512_loadu_ps(sfPtr7+819840+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1297 = _mm512_shuffle_f32x4(sf585, sf586, 68);
__m512 in1298 = _mm512_shuffle_f32x4(sf585, sf586, 238);
__m512 sf587 = _mm512_loadu_ps(sfPtr7+819776+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf588 = _mm512_loadu_ps(sfPtr7+819904+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1305 = _mm512_shuffle_f32x4(sf587, sf588, 68);
__m512 in1306 = _mm512_shuffle_f32x4(sf587, sf588, 238);
// Region 3 (+1228800).
__m512 sf589 = _mm512_loadu_ps(sfPtr7+1229312+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf590 = _mm512_loadu_ps(sfPtr7+1229440+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1299 = _mm512_shuffle_f32x4(sf589, sf590, 68);
__m512 in1300 = _mm512_shuffle_f32x4(sf589, sf590, 238);
__m512 sf591 = _mm512_loadu_ps(sfPtr7+1229376+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 sf592 = _mm512_loadu_ps(sfPtr7+1229504+1638400*i28+24576*j23+1536*k93+768*l32);
__m512 in1307 = _mm512_shuffle_f32x4(sf591, sf592, 68);
__m512 in1308 = _mm512_shuffle_f32x4(sf591, sf592, 238);
// First pass: combine eight input vectors x0..x7 into six results y0..y5, element-wise.
// Two independent chains are interleaved line by line:
//   chain A: x0..x7 = in1293..in1300, chain B: x0..x7 = in1301..in1308.
// Formulas implemented by the add/sub/fmadd sequence below:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): this coefficient pattern looks like a Winograd-style 8-to-6 output transform -- confirm against the generator.
// Pairwise sums and differences shared by several outputs.
__m512 tmp8737 = _mm512_add_ps(in1294, in1295);
__m512 tmp8757 = _mm512_add_ps(in1302, in1303);
__m512 tmp8736 = _mm512_add_ps(in1296, in1297);
__m512 tmp8756 = _mm512_add_ps(in1304, in1305);
__m512 tmp8742 = _mm512_sub_ps(in1296, in1297);
__m512 tmp8762 = _mm512_sub_ps(in1304, in1305);
__m512 tmp8741 = _mm512_sub_ps(in1294, in1295);
__m512 tmp8761 = _mm512_sub_ps(in1302, in1303);
__m512 tmp8738 = _mm512_add_ps(in1298, in1299);
__m512 tmp8758 = _mm512_add_ps(in1306, in1307);
__m512 tmp8743 = _mm512_sub_ps(in1298, in1299);
__m512 tmp8763 = _mm512_sub_ps(in1306, in1307);
// Partial accumulations toward y1, y3, y0, y5, y2 and y4.
__m512 tmp8740 = _mm512_fmadd_ps(tmp8742, _mm512_set1_ps(2e+00f), tmp8741);
__m512 tmp8760 = _mm512_fmadd_ps(tmp8762, _mm512_set1_ps(2e+00f), tmp8761);
__m512 tmp8747 = _mm512_fmadd_ps(tmp8742, _mm512_set1_ps(8e+00f), tmp8741);
__m512 tmp8767 = _mm512_fmadd_ps(tmp8762, _mm512_set1_ps(8e+00f), tmp8761);
__m512 tmp8735 = _mm512_add_ps(tmp8736, tmp8737);
__m512 tmp8755 = _mm512_add_ps(tmp8756, tmp8757);
__m512 tmp8739 = _mm512_fmadd_ps(tmp8743, _mm512_set1_ps(1.6e+01f), tmp8740);
__m512 tmp8759 = _mm512_fmadd_ps(tmp8763, _mm512_set1_ps(1.6e+01f), tmp8760);
__m512 tmp8746 = _mm512_fmadd_ps(tmp8743, _mm512_set1_ps(4e+00f), tmp8747);
__m512 tmp8766 = _mm512_fmadd_ps(tmp8763, _mm512_set1_ps(4e+00f), tmp8767);
__m512 tmp8752 = _mm512_add_ps(tmp8743, tmp8741);
__m512 tmp8772 = _mm512_add_ps(tmp8763, tmp8761);
__m512 tmp8745 = _mm512_fmadd_ps(tmp8736, _mm512_set1_ps(4e+00f), tmp8737);
__m512 tmp8765 = _mm512_fmadd_ps(tmp8756, _mm512_set1_ps(4e+00f), tmp8757);
__m512 tmp8749 = _mm512_fmadd_ps(tmp8736, _mm512_set1_ps(1.6e+01f), tmp8737);
__m512 tmp8769 = _mm512_fmadd_ps(tmp8756, _mm512_set1_ps(1.6e+01f), tmp8757);
// Fold in the end terms: x0 contributes only to y0, x7 only to y5.
__m512 tmp8734 = _mm512_add_ps(tmp8735, in1293);
__m512 tmp8754 = _mm512_add_ps(tmp8755, in1301);
__m512 tmp8751 = _mm512_add_ps(tmp8752, in1300);
__m512 tmp8771 = _mm512_add_ps(tmp8772, in1308);
// Final fmadd for y0, y2, y5 and y4.
__m512 tmp8733 = _mm512_fmadd_ps(tmp8738, _mm512_set1_ps(3.2e+01f), tmp8734);
__m512 tmp8753 = _mm512_fmadd_ps(tmp8758, _mm512_set1_ps(3.2e+01f), tmp8754);
__m512 tmp8744 = _mm512_fmadd_ps(tmp8738, _mm512_set1_ps(8e+00f), tmp8745);
__m512 tmp8764 = _mm512_fmadd_ps(tmp8758, _mm512_set1_ps(8e+00f), tmp8765);
__m512 tmp8750 = _mm512_fmadd_ps(tmp8742, _mm512_set1_ps(3.2e+01f), tmp8751);
__m512 tmp8770 = _mm512_fmadd_ps(tmp8762, _mm512_set1_ps(3.2e+01f), tmp8771);
__m512 tmp8748 = _mm512_fmadd_ps(tmp8738, _mm512_set1_ps(2e+00f), tmp8749);
__m512 tmp8768 = _mm512_fmadd_ps(tmp8758, _mm512_set1_ps(2e+00f), tmp8769);
// Collect the results as twelve row vectors: tmp8721..tmp8726 = y0..y5 of chain A,
// tmp8727..tmp8732 = y0..y5 of chain B. These names are reassigned later in this scope.
__m512 tmp8721 = tmp8733;
__m512 tmp8727 = tmp8753;
__m512 tmp8722 = tmp8739;
__m512 tmp8728 = tmp8759;
__m512 tmp8723 = tmp8744;
__m512 tmp8729 = tmp8764;
__m512 tmp8724 = tmp8746;
__m512 tmp8730 = tmp8766;
__m512 tmp8725 = tmp8748;
__m512 tmp8731 = tmp8768;
__m512 tmp8726 = tmp8750;
__m512 tmp8732 = tmp8770;
// Transpose the twelve 16-float rows tmp8721..tmp8732 into sixteen column vectors.
// Result: tmp8721..tmp8728 hold columns 0..7; tmp8729..tmp8732 and tmp8773..tmp8776 hold columns 8..15.
// In every result, lanes 0..11 carry rows 0..11 and lanes 12..15 repeat lanes 8..11.
// Step 1: interleave the 32-bit elements of adjacent row pairs within each 128-bit lane.
__m512 tmp8817 = _mm512_unpacklo_ps(tmp8721, tmp8722);
__m512 tmp8818 = _mm512_unpackhi_ps(tmp8721, tmp8722);
__m512 tmp8819 = _mm512_unpacklo_ps(tmp8723, tmp8724);
__m512 tmp8820 = _mm512_unpackhi_ps(tmp8723, tmp8724);
__m512 tmp8821 = _mm512_unpacklo_ps(tmp8725, tmp8726);
__m512 tmp8822 = _mm512_unpackhi_ps(tmp8725, tmp8726);
__m512 tmp8823 = _mm512_unpacklo_ps(tmp8727, tmp8728);
__m512 tmp8824 = _mm512_unpackhi_ps(tmp8727, tmp8728);
__m512 tmp8825 = _mm512_unpacklo_ps(tmp8729, tmp8730);
__m512 tmp8826 = _mm512_unpackhi_ps(tmp8729, tmp8730);
__m512 tmp8827 = _mm512_unpacklo_ps(tmp8731, tmp8732);
__m512 tmp8828 = _mm512_unpackhi_ps(tmp8731, tmp8732);
// Step 2: join two row pairs (imm 68 = low element pairs, imm 238 = high element pairs),
// so each 128-bit lane now holds one column across four consecutive rows.
__m512 tmp8829 = _mm512_shuffle_ps(tmp8817, tmp8819, 68);
__m512 tmp8830 = _mm512_shuffle_ps(tmp8817, tmp8819, 238);
__m512 tmp8831 = _mm512_shuffle_ps(tmp8818, tmp8820, 68);
__m512 tmp8832 = _mm512_shuffle_ps(tmp8818, tmp8820, 238);
__m512 tmp8833 = _mm512_shuffle_ps(tmp8821, tmp8823, 68);
__m512 tmp8834 = _mm512_shuffle_ps(tmp8821, tmp8823, 238);
__m512 tmp8835 = _mm512_shuffle_ps(tmp8822, tmp8824, 68);
__m512 tmp8836 = _mm512_shuffle_ps(tmp8822, tmp8824, 238);
__m512 tmp8837 = _mm512_shuffle_ps(tmp8825, tmp8827, 68);
__m512 tmp8838 = _mm512_shuffle_ps(tmp8825, tmp8827, 238);
__m512 tmp8839 = _mm512_shuffle_ps(tmp8826, tmp8828, 68);
__m512 tmp8840 = _mm512_shuffle_ps(tmp8826, tmp8828, 238);
// Step 3: regroup 128-bit lanes (imm 136 = lanes 0 and 2 of each source, imm 221 = lanes 1 and 3),
// pairing the rows 0..3 vectors with the rows 4..7 vectors.
__m512 tmp8841 = _mm512_shuffle_f32x4(tmp8829, tmp8833, 136);
__m512 tmp8842 = _mm512_shuffle_f32x4(tmp8829, tmp8833, 221);
__m512 tmp8843 = _mm512_shuffle_f32x4(tmp8830, tmp8834, 136);
__m512 tmp8844 = _mm512_shuffle_f32x4(tmp8830, tmp8834, 221);
__m512 tmp8845 = _mm512_shuffle_f32x4(tmp8831, tmp8835, 136);
__m512 tmp8846 = _mm512_shuffle_f32x4(tmp8831, tmp8835, 221);
__m512 tmp8847 = _mm512_shuffle_f32x4(tmp8832, tmp8836, 136);
__m512 tmp8848 = _mm512_shuffle_f32x4(tmp8832, tmp8836, 221);
// Rows 8..11 have no partner group, so each vector is shuffled with itself (this creates the duplicated top lanes).
__m512 tmp8849 = _mm512_shuffle_f32x4(tmp8837, tmp8837, 136);
__m512 tmp8850 = _mm512_shuffle_f32x4(tmp8837, tmp8837, 221);
__m512 tmp8851 = _mm512_shuffle_f32x4(tmp8838, tmp8838, 136);
__m512 tmp8852 = _mm512_shuffle_f32x4(tmp8838, tmp8838, 221);
__m512 tmp8853 = _mm512_shuffle_f32x4(tmp8839, tmp8839, 136);
__m512 tmp8854 = _mm512_shuffle_f32x4(tmp8839, tmp8839, 221);
__m512 tmp8855 = _mm512_shuffle_f32x4(tmp8840, tmp8840, 136);
__m512 tmp8856 = _mm512_shuffle_f32x4(tmp8840, tmp8840, 221);
// Step 4: final lane regroup; imm 136 yields column c, imm 221 yields column c+8.
// Columns 0..11 overwrite the row variables, columns 12..15 need four new names.
tmp8721 = _mm512_shuffle_f32x4(tmp8841, tmp8849, 136);
tmp8729 = _mm512_shuffle_f32x4(tmp8841, tmp8849, 221);
tmp8722 = _mm512_shuffle_f32x4(tmp8843, tmp8851, 136);
tmp8730 = _mm512_shuffle_f32x4(tmp8843, tmp8851, 221);
tmp8723 = _mm512_shuffle_f32x4(tmp8845, tmp8853, 136);
tmp8731 = _mm512_shuffle_f32x4(tmp8845, tmp8853, 221);
tmp8724 = _mm512_shuffle_f32x4(tmp8847, tmp8855, 136);
tmp8732 = _mm512_shuffle_f32x4(tmp8847, tmp8855, 221);
tmp8725 = _mm512_shuffle_f32x4(tmp8842, tmp8850, 136);
__m512 tmp8773 = _mm512_shuffle_f32x4(tmp8842, tmp8850, 221);
tmp8726 = _mm512_shuffle_f32x4(tmp8844, tmp8852, 136);
__m512 tmp8774 = _mm512_shuffle_f32x4(tmp8844, tmp8852, 221);
tmp8727 = _mm512_shuffle_f32x4(tmp8846, tmp8854, 136);
__m512 tmp8775 = _mm512_shuffle_f32x4(tmp8846, tmp8854, 221);
tmp8728 = _mm512_shuffle_f32x4(tmp8848, tmp8856, 136);
__m512 tmp8776 = _mm512_shuffle_f32x4(tmp8848, tmp8856, 221);
// Second pass: apply the 8-to-6 combination again, element-wise, to two interleaved chains:
//   chain A: x0..x7 = tmp8721..tmp8728
//   chain B: x0..x7 = tmp8729, tmp8730, tmp8731, tmp8732, tmp8773, tmp8774, tmp8775, tmp8776
// Formulas:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// Pairwise sums and differences shared by several outputs.
__m512 tmp8781 = _mm512_add_ps(tmp8722, tmp8723);
__m512 tmp8801 = _mm512_add_ps(tmp8730, tmp8731);
__m512 tmp8780 = _mm512_add_ps(tmp8724, tmp8725);
__m512 tmp8800 = _mm512_add_ps(tmp8732, tmp8773);
__m512 tmp8786 = _mm512_sub_ps(tmp8724, tmp8725);
__m512 tmp8806 = _mm512_sub_ps(tmp8732, tmp8773);
__m512 tmp8785 = _mm512_sub_ps(tmp8722, tmp8723);
__m512 tmp8805 = _mm512_sub_ps(tmp8730, tmp8731);
__m512 tmp8782 = _mm512_add_ps(tmp8726, tmp8727);
__m512 tmp8802 = _mm512_add_ps(tmp8774, tmp8775);
__m512 tmp8787 = _mm512_sub_ps(tmp8726, tmp8727);
__m512 tmp8807 = _mm512_sub_ps(tmp8774, tmp8775);
// Partial accumulations toward y1, y3, y0, y5, y2 and y4.
__m512 tmp8784 = _mm512_fmadd_ps(tmp8786, _mm512_set1_ps(2e+00f), tmp8785);
__m512 tmp8804 = _mm512_fmadd_ps(tmp8806, _mm512_set1_ps(2e+00f), tmp8805);
__m512 tmp8791 = _mm512_fmadd_ps(tmp8786, _mm512_set1_ps(8e+00f), tmp8785);
__m512 tmp8811 = _mm512_fmadd_ps(tmp8806, _mm512_set1_ps(8e+00f), tmp8805);
__m512 tmp8779 = _mm512_add_ps(tmp8780, tmp8781);
__m512 tmp8799 = _mm512_add_ps(tmp8800, tmp8801);
__m512 tmp8783 = _mm512_fmadd_ps(tmp8787, _mm512_set1_ps(1.6e+01f), tmp8784);
__m512 tmp8803 = _mm512_fmadd_ps(tmp8807, _mm512_set1_ps(1.6e+01f), tmp8804);
__m512 tmp8790 = _mm512_fmadd_ps(tmp8787, _mm512_set1_ps(4e+00f), tmp8791);
__m512 tmp8810 = _mm512_fmadd_ps(tmp8807, _mm512_set1_ps(4e+00f), tmp8811);
__m512 tmp8796 = _mm512_add_ps(tmp8787, tmp8785);
__m512 tmp8816 = _mm512_add_ps(tmp8807, tmp8805);
__m512 tmp8789 = _mm512_fmadd_ps(tmp8780, _mm512_set1_ps(4e+00f), tmp8781);
__m512 tmp8809 = _mm512_fmadd_ps(tmp8800, _mm512_set1_ps(4e+00f), tmp8801);
__m512 tmp8793 = _mm512_fmadd_ps(tmp8780, _mm512_set1_ps(1.6e+01f), tmp8781);
__m512 tmp8813 = _mm512_fmadd_ps(tmp8800, _mm512_set1_ps(1.6e+01f), tmp8801);
// Fold in the end terms: x0 contributes only to y0, x7 only to y5.
__m512 tmp8778 = _mm512_add_ps(tmp8779, tmp8721);
__m512 tmp8798 = _mm512_add_ps(tmp8799, tmp8729);
__m512 tmp8795 = _mm512_add_ps(tmp8796, tmp8728);
__m512 tmp8815 = _mm512_add_ps(tmp8816, tmp8776);
// Final fmadd for y0, y2, y5 and y4.
__m512 tmp8777 = _mm512_fmadd_ps(tmp8782, _mm512_set1_ps(3.2e+01f), tmp8778);
__m512 tmp8797 = _mm512_fmadd_ps(tmp8802, _mm512_set1_ps(3.2e+01f), tmp8798);
__m512 tmp8788 = _mm512_fmadd_ps(tmp8782, _mm512_set1_ps(8e+00f), tmp8789);
__m512 tmp8808 = _mm512_fmadd_ps(tmp8802, _mm512_set1_ps(8e+00f), tmp8809);
__m512 tmp8794 = _mm512_fmadd_ps(tmp8786, _mm512_set1_ps(3.2e+01f), tmp8795);
__m512 tmp8814 = _mm512_fmadd_ps(tmp8806, _mm512_set1_ps(3.2e+01f), tmp8815);
__m512 tmp8792 = _mm512_fmadd_ps(tmp8782, _mm512_set1_ps(2e+00f), tmp8793);
__m512 tmp8812 = _mm512_fmadd_ps(tmp8802, _mm512_set1_ps(2e+00f), tmp8813);
// Name the results: out1239..out1244 = y0..y5 of chain A, out1245..out1250 = y0..y5 of chain B.
__m512 out1239 = tmp8777;
__m512 out1245 = tmp8797;
__m512 out1240 = tmp8783;
__m512 out1246 = tmp8803;
__m512 out1241 = tmp8788;
__m512 out1247 = tmp8808;
__m512 out1242 = tmp8790;
__m512 out1248 = tmp8810;
__m512 out1243 = tmp8792;
__m512 out1249 = tmp8812;
__m512 out1244 = tmp8794;
__m512 out1250 = tmp8814;
// Clamp every element at zero from below (element-wise max with 0.0f).
out1239 = _mm512_max_ps(_mm512_setzero_ps(), out1239);
out1245 = _mm512_max_ps(_mm512_setzero_ps(), out1245);
out1240 = _mm512_max_ps(_mm512_setzero_ps(), out1240);
out1246 = _mm512_max_ps(_mm512_setzero_ps(), out1246);
out1241 = _mm512_max_ps(_mm512_setzero_ps(), out1241);
out1247 = _mm512_max_ps(_mm512_setzero_ps(), out1247);
out1242 = _mm512_max_ps(_mm512_setzero_ps(), out1242);
out1248 = _mm512_max_ps(_mm512_setzero_ps(), out1248);
out1243 = _mm512_max_ps(_mm512_setzero_ps(), out1243);
out1249 = _mm512_max_ps(_mm512_setzero_ps(), out1249);
out1244 = _mm512_max_ps(_mm512_setzero_ps(), out1244);
out1250 = _mm512_max_ps(_mm512_setzero_ps(), out1250);
// Masked stores into datPtr13; within each chain the six destinations are 224 apart.
// Chain A starts at +13760 and chain B at +13808; both use mask 4095 (0xFFF), which writes only the low 12 floats.
// NOTE(review): the 4*toW32 term suggests byte offsets with 4-byte floats -- assumes datPtr13 is a byte pointer; confirm at its declaration.
_mm512_mask_storeu_ps(datPtr13+13760+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1239);
_mm512_mask_storeu_ps(datPtr13+13808+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1245);
_mm512_mask_storeu_ps(datPtr13+13984+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1240);
_mm512_mask_storeu_ps(datPtr13+14032+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1246);
_mm512_mask_storeu_ps(datPtr13+14208+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1241);
_mm512_mask_storeu_ps(datPtr13+14256+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1247);
_mm512_mask_storeu_ps(datPtr13+14432+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1242);
_mm512_mask_storeu_ps(datPtr13+14480+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1248);
_mm512_mask_storeu_ps(datPtr13+14656+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1243);
_mm512_mask_storeu_ps(datPtr13+14704+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1249);
_mm512_mask_storeu_ps(datPtr13+14880+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1244);
_mm512_mask_storeu_ps(datPtr13+14928+806912*i28+224*toH32+4*toW32+50432*k93+25216*l32, 4095, out1250);
}
}
// Return from the function once j23 has reached last6; otherwise advance j23
// and set rel17 to 2, which satisfies the `rel17 < 3` test guarding the stage that follows.
if (j23 >= last6) return;
++j23;
rel17 = 2;
}
if (rel17 < 3) {
ptrdiff_t toH33 = base17+6;
ptrdiff_t toW33 = 24;
ptrdiff_t k94 = 16*w46;
for (; k94 != 16; ++k94) {
ptrdiff_t l33 = 0;
for (; l33 != 2; ++l33) {
// First load/transform/store group of this l33 iteration: read sixteen 512-bit
// vectors from sfPtr7, reduce eight vectors to six, permute lanes, reduce eight
// to six again, clamp at zero, and write twelve masked vectors to datPtr13.
// NOTE(review): the weights 1/2/4/8/16/32 look like the output transform of a
// Winograd-style convolution followed by ReLU -- confirm against the generator.
//
// Loads: pairs of vectors come from four sfPtr7 regions (constant offsets 0,
// 409600, 819200, 1228800), all indexed by i28, j23, k94 and l33.
// _mm512_shuffle_f32x4 with imm 68 gathers the low 256 bits of both operands;
// imm 238 gathers the high 256 bits of both operands.
// Set A is in1309..in1316, set B is in1317..in1324.
__m512 sf593 = _mm512_loadu_ps(sfPtr7+0+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf594 = _mm512_loadu_ps(sfPtr7+128+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1309 = _mm512_shuffle_f32x4(sf593, sf594, 68);
__m512 in1310 = _mm512_shuffle_f32x4(sf593, sf594, 238);
__m512 sf595 = _mm512_loadu_ps(sfPtr7+64+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf596 = _mm512_loadu_ps(sfPtr7+192+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1317 = _mm512_shuffle_f32x4(sf595, sf596, 68);
__m512 in1318 = _mm512_shuffle_f32x4(sf595, sf596, 238);
__m512 sf597 = _mm512_loadu_ps(sfPtr7+409600+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf598 = _mm512_loadu_ps(sfPtr7+409728+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1311 = _mm512_shuffle_f32x4(sf597, sf598, 68);
__m512 in1312 = _mm512_shuffle_f32x4(sf597, sf598, 238);
__m512 sf599 = _mm512_loadu_ps(sfPtr7+409664+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf600 = _mm512_loadu_ps(sfPtr7+409792+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1319 = _mm512_shuffle_f32x4(sf599, sf600, 68);
__m512 in1320 = _mm512_shuffle_f32x4(sf599, sf600, 238);
__m512 sf601 = _mm512_loadu_ps(sfPtr7+819200+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf602 = _mm512_loadu_ps(sfPtr7+819328+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1313 = _mm512_shuffle_f32x4(sf601, sf602, 68);
__m512 in1314 = _mm512_shuffle_f32x4(sf601, sf602, 238);
__m512 sf603 = _mm512_loadu_ps(sfPtr7+819264+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf604 = _mm512_loadu_ps(sfPtr7+819392+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1321 = _mm512_shuffle_f32x4(sf603, sf604, 68);
__m512 in1322 = _mm512_shuffle_f32x4(sf603, sf604, 238);
__m512 sf605 = _mm512_loadu_ps(sfPtr7+1228800+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf606 = _mm512_loadu_ps(sfPtr7+1228928+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1315 = _mm512_shuffle_f32x4(sf605, sf606, 68);
__m512 in1316 = _mm512_shuffle_f32x4(sf605, sf606, 238);
__m512 sf607 = _mm512_loadu_ps(sfPtr7+1228864+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf608 = _mm512_loadu_ps(sfPtr7+1228992+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1323 = _mm512_shuffle_f32x4(sf607, sf608, 68);
__m512 in1324 = _mm512_shuffle_f32x4(sf607, sf608, 238);
// First 8->6 pass, lane-wise, with the statements for set A and set B
// interleaved. With r0..r7 = in1309..in1316 (or in1317..in1324), and ignoring
// floating-point evaluation order, the six results are:
//   o0 = r0 + (r1+r2) +    (r3+r4) + 32*(r5+r6)
//   o1 =      (r1-r2) +  2*(r3-r4) + 16*(r5-r6)
//   o2 =      (r1+r2) +  4*(r3+r4) +  8*(r5+r6)
//   o3 =      (r1-r2) +  8*(r3-r4) +  4*(r5-r6)
//   o4 =      (r1+r2) + 16*(r3+r4) +  2*(r5+r6)
//   o5 =      (r1-r2) + 32*(r3-r4) +    (r5-r6) + r7
// _mm512_fmadd_ps(a, b, c) computes a*b + c.
__m512 tmp8873 = _mm512_add_ps(in1310, in1311);
__m512 tmp8893 = _mm512_add_ps(in1318, in1319);
__m512 tmp8872 = _mm512_add_ps(in1312, in1313);
__m512 tmp8892 = _mm512_add_ps(in1320, in1321);
__m512 tmp8878 = _mm512_sub_ps(in1312, in1313);
__m512 tmp8898 = _mm512_sub_ps(in1320, in1321);
__m512 tmp8877 = _mm512_sub_ps(in1310, in1311);
__m512 tmp8897 = _mm512_sub_ps(in1318, in1319);
__m512 tmp8874 = _mm512_add_ps(in1314, in1315);
__m512 tmp8894 = _mm512_add_ps(in1322, in1323);
__m512 tmp8879 = _mm512_sub_ps(in1314, in1315);
__m512 tmp8899 = _mm512_sub_ps(in1322, in1323);
__m512 tmp8876 = _mm512_fmadd_ps(tmp8878, _mm512_set1_ps(2e+00f), tmp8877);
__m512 tmp8896 = _mm512_fmadd_ps(tmp8898, _mm512_set1_ps(2e+00f), tmp8897);
__m512 tmp8883 = _mm512_fmadd_ps(tmp8878, _mm512_set1_ps(8e+00f), tmp8877);
__m512 tmp8903 = _mm512_fmadd_ps(tmp8898, _mm512_set1_ps(8e+00f), tmp8897);
__m512 tmp8871 = _mm512_add_ps(tmp8872, tmp8873);
__m512 tmp8891 = _mm512_add_ps(tmp8892, tmp8893);
__m512 tmp8875 = _mm512_fmadd_ps(tmp8879, _mm512_set1_ps(1.6e+01f), tmp8876);
__m512 tmp8895 = _mm512_fmadd_ps(tmp8899, _mm512_set1_ps(1.6e+01f), tmp8896);
__m512 tmp8882 = _mm512_fmadd_ps(tmp8879, _mm512_set1_ps(4e+00f), tmp8883);
__m512 tmp8902 = _mm512_fmadd_ps(tmp8899, _mm512_set1_ps(4e+00f), tmp8903);
__m512 tmp8888 = _mm512_add_ps(tmp8879, tmp8877);
__m512 tmp8908 = _mm512_add_ps(tmp8899, tmp8897);
__m512 tmp8881 = _mm512_fmadd_ps(tmp8872, _mm512_set1_ps(4e+00f), tmp8873);
__m512 tmp8901 = _mm512_fmadd_ps(tmp8892, _mm512_set1_ps(4e+00f), tmp8893);
__m512 tmp8885 = _mm512_fmadd_ps(tmp8872, _mm512_set1_ps(1.6e+01f), tmp8873);
__m512 tmp8905 = _mm512_fmadd_ps(tmp8892, _mm512_set1_ps(1.6e+01f), tmp8893);
__m512 tmp8870 = _mm512_add_ps(tmp8871, in1309);
__m512 tmp8890 = _mm512_add_ps(tmp8891, in1317);
__m512 tmp8887 = _mm512_add_ps(tmp8888, in1316);
__m512 tmp8907 = _mm512_add_ps(tmp8908, in1324);
__m512 tmp8869 = _mm512_fmadd_ps(tmp8874, _mm512_set1_ps(3.2e+01f), tmp8870);
__m512 tmp8889 = _mm512_fmadd_ps(tmp8894, _mm512_set1_ps(3.2e+01f), tmp8890);
__m512 tmp8880 = _mm512_fmadd_ps(tmp8874, _mm512_set1_ps(8e+00f), tmp8881);
__m512 tmp8900 = _mm512_fmadd_ps(tmp8894, _mm512_set1_ps(8e+00f), tmp8901);
__m512 tmp8886 = _mm512_fmadd_ps(tmp8878, _mm512_set1_ps(3.2e+01f), tmp8887);
__m512 tmp8906 = _mm512_fmadd_ps(tmp8898, _mm512_set1_ps(3.2e+01f), tmp8907);
__m512 tmp8884 = _mm512_fmadd_ps(tmp8874, _mm512_set1_ps(2e+00f), tmp8885);
__m512 tmp8904 = _mm512_fmadd_ps(tmp8894, _mm512_set1_ps(2e+00f), tmp8905);
// Collect o0..o5 of set A in tmp8857..tmp8862 and of set B in tmp8863..tmp8868.
__m512 tmp8857 = tmp8869;
__m512 tmp8863 = tmp8889;
__m512 tmp8858 = tmp8875;
__m512 tmp8864 = tmp8895;
__m512 tmp8859 = tmp8880;
__m512 tmp8865 = tmp8900;
__m512 tmp8860 = tmp8882;
__m512 tmp8866 = tmp8902;
__m512 tmp8861 = tmp8884;
__m512 tmp8867 = tmp8904;
__m512 tmp8862 = tmp8886;
__m512 tmp8868 = tmp8906;
// Lane permutation over the twelve vectors: unpacklo/unpackhi, then shuffle_ps,
// then two rounds of shuffle_f32x4. The results overwrite tmp8857..tmp8868 and
// add four new vectors tmp8909..tmp8912, giving two groups of eight vectors.
// NOTE(review): this appears to be a transpose so the second pass runs along
// the other axis of the tile -- confirm.
__m512 tmp8953 = _mm512_unpacklo_ps(tmp8857, tmp8858);
__m512 tmp8954 = _mm512_unpackhi_ps(tmp8857, tmp8858);
__m512 tmp8955 = _mm512_unpacklo_ps(tmp8859, tmp8860);
__m512 tmp8956 = _mm512_unpackhi_ps(tmp8859, tmp8860);
__m512 tmp8957 = _mm512_unpacklo_ps(tmp8861, tmp8862);
__m512 tmp8958 = _mm512_unpackhi_ps(tmp8861, tmp8862);
__m512 tmp8959 = _mm512_unpacklo_ps(tmp8863, tmp8864);
__m512 tmp8960 = _mm512_unpackhi_ps(tmp8863, tmp8864);
__m512 tmp8961 = _mm512_unpacklo_ps(tmp8865, tmp8866);
__m512 tmp8962 = _mm512_unpackhi_ps(tmp8865, tmp8866);
__m512 tmp8963 = _mm512_unpacklo_ps(tmp8867, tmp8868);
__m512 tmp8964 = _mm512_unpackhi_ps(tmp8867, tmp8868);
__m512 tmp8965 = _mm512_shuffle_ps(tmp8953, tmp8955, 68);
__m512 tmp8966 = _mm512_shuffle_ps(tmp8953, tmp8955, 238);
__m512 tmp8967 = _mm512_shuffle_ps(tmp8954, tmp8956, 68);
__m512 tmp8968 = _mm512_shuffle_ps(tmp8954, tmp8956, 238);
__m512 tmp8969 = _mm512_shuffle_ps(tmp8957, tmp8959, 68);
__m512 tmp8970 = _mm512_shuffle_ps(tmp8957, tmp8959, 238);
__m512 tmp8971 = _mm512_shuffle_ps(tmp8958, tmp8960, 68);
__m512 tmp8972 = _mm512_shuffle_ps(tmp8958, tmp8960, 238);
__m512 tmp8973 = _mm512_shuffle_ps(tmp8961, tmp8963, 68);
__m512 tmp8974 = _mm512_shuffle_ps(tmp8961, tmp8963, 238);
__m512 tmp8975 = _mm512_shuffle_ps(tmp8962, tmp8964, 68);
__m512 tmp8976 = _mm512_shuffle_ps(tmp8962, tmp8964, 238);
__m512 tmp8977 = _mm512_shuffle_f32x4(tmp8965, tmp8969, 136);
__m512 tmp8978 = _mm512_shuffle_f32x4(tmp8965, tmp8969, 221);
__m512 tmp8979 = _mm512_shuffle_f32x4(tmp8966, tmp8970, 136);
__m512 tmp8980 = _mm512_shuffle_f32x4(tmp8966, tmp8970, 221);
__m512 tmp8981 = _mm512_shuffle_f32x4(tmp8967, tmp8971, 136);
__m512 tmp8982 = _mm512_shuffle_f32x4(tmp8967, tmp8971, 221);
__m512 tmp8983 = _mm512_shuffle_f32x4(tmp8968, tmp8972, 136);
__m512 tmp8984 = _mm512_shuffle_f32x4(tmp8968, tmp8972, 221);
__m512 tmp8985 = _mm512_shuffle_f32x4(tmp8973, tmp8973, 136);
__m512 tmp8986 = _mm512_shuffle_f32x4(tmp8973, tmp8973, 221);
__m512 tmp8987 = _mm512_shuffle_f32x4(tmp8974, tmp8974, 136);
__m512 tmp8988 = _mm512_shuffle_f32x4(tmp8974, tmp8974, 221);
__m512 tmp8989 = _mm512_shuffle_f32x4(tmp8975, tmp8975, 136);
__m512 tmp8990 = _mm512_shuffle_f32x4(tmp8975, tmp8975, 221);
__m512 tmp8991 = _mm512_shuffle_f32x4(tmp8976, tmp8976, 136);
__m512 tmp8992 = _mm512_shuffle_f32x4(tmp8976, tmp8976, 221);
tmp8857 = _mm512_shuffle_f32x4(tmp8977, tmp8985, 136);
tmp8865 = _mm512_shuffle_f32x4(tmp8977, tmp8985, 221);
tmp8858 = _mm512_shuffle_f32x4(tmp8979, tmp8987, 136);
tmp8866 = _mm512_shuffle_f32x4(tmp8979, tmp8987, 221);
tmp8859 = _mm512_shuffle_f32x4(tmp8981, tmp8989, 136);
tmp8867 = _mm512_shuffle_f32x4(tmp8981, tmp8989, 221);
tmp8860 = _mm512_shuffle_f32x4(tmp8983, tmp8991, 136);
tmp8868 = _mm512_shuffle_f32x4(tmp8983, tmp8991, 221);
tmp8861 = _mm512_shuffle_f32x4(tmp8978, tmp8986, 136);
__m512 tmp8909 = _mm512_shuffle_f32x4(tmp8978, tmp8986, 221);
tmp8862 = _mm512_shuffle_f32x4(tmp8980, tmp8988, 136);
__m512 tmp8910 = _mm512_shuffle_f32x4(tmp8980, tmp8988, 221);
tmp8863 = _mm512_shuffle_f32x4(tmp8982, tmp8990, 136);
__m512 tmp8911 = _mm512_shuffle_f32x4(tmp8982, tmp8990, 221);
tmp8864 = _mm512_shuffle_f32x4(tmp8984, tmp8992, 136);
__m512 tmp8912 = _mm512_shuffle_f32x4(tmp8984, tmp8992, 221);
// Second 8->6 pass with the same weights as the first pass. Its inputs r0..r7
// are tmp8857..tmp8864 for one group and tmp8865, tmp8866, tmp8867, tmp8868,
// tmp8909, tmp8910, tmp8911, tmp8912 for the other.
__m512 tmp8917 = _mm512_add_ps(tmp8858, tmp8859);
__m512 tmp8937 = _mm512_add_ps(tmp8866, tmp8867);
__m512 tmp8916 = _mm512_add_ps(tmp8860, tmp8861);
__m512 tmp8936 = _mm512_add_ps(tmp8868, tmp8909);
__m512 tmp8922 = _mm512_sub_ps(tmp8860, tmp8861);
__m512 tmp8942 = _mm512_sub_ps(tmp8868, tmp8909);
__m512 tmp8921 = _mm512_sub_ps(tmp8858, tmp8859);
__m512 tmp8941 = _mm512_sub_ps(tmp8866, tmp8867);
__m512 tmp8918 = _mm512_add_ps(tmp8862, tmp8863);
__m512 tmp8938 = _mm512_add_ps(tmp8910, tmp8911);
__m512 tmp8923 = _mm512_sub_ps(tmp8862, tmp8863);
__m512 tmp8943 = _mm512_sub_ps(tmp8910, tmp8911);
__m512 tmp8920 = _mm512_fmadd_ps(tmp8922, _mm512_set1_ps(2e+00f), tmp8921);
__m512 tmp8940 = _mm512_fmadd_ps(tmp8942, _mm512_set1_ps(2e+00f), tmp8941);
__m512 tmp8927 = _mm512_fmadd_ps(tmp8922, _mm512_set1_ps(8e+00f), tmp8921);
__m512 tmp8947 = _mm512_fmadd_ps(tmp8942, _mm512_set1_ps(8e+00f), tmp8941);
__m512 tmp8915 = _mm512_add_ps(tmp8916, tmp8917);
__m512 tmp8935 = _mm512_add_ps(tmp8936, tmp8937);
__m512 tmp8919 = _mm512_fmadd_ps(tmp8923, _mm512_set1_ps(1.6e+01f), tmp8920);
__m512 tmp8939 = _mm512_fmadd_ps(tmp8943, _mm512_set1_ps(1.6e+01f), tmp8940);
__m512 tmp8926 = _mm512_fmadd_ps(tmp8923, _mm512_set1_ps(4e+00f), tmp8927);
__m512 tmp8946 = _mm512_fmadd_ps(tmp8943, _mm512_set1_ps(4e+00f), tmp8947);
__m512 tmp8932 = _mm512_add_ps(tmp8923, tmp8921);
__m512 tmp8952 = _mm512_add_ps(tmp8943, tmp8941);
__m512 tmp8925 = _mm512_fmadd_ps(tmp8916, _mm512_set1_ps(4e+00f), tmp8917);
__m512 tmp8945 = _mm512_fmadd_ps(tmp8936, _mm512_set1_ps(4e+00f), tmp8937);
__m512 tmp8929 = _mm512_fmadd_ps(tmp8916, _mm512_set1_ps(1.6e+01f), tmp8917);
__m512 tmp8949 = _mm512_fmadd_ps(tmp8936, _mm512_set1_ps(1.6e+01f), tmp8937);
__m512 tmp8914 = _mm512_add_ps(tmp8915, tmp8857);
__m512 tmp8934 = _mm512_add_ps(tmp8935, tmp8865);
__m512 tmp8931 = _mm512_add_ps(tmp8932, tmp8864);
__m512 tmp8951 = _mm512_add_ps(tmp8952, tmp8912);
__m512 tmp8913 = _mm512_fmadd_ps(tmp8918, _mm512_set1_ps(3.2e+01f), tmp8914);
__m512 tmp8933 = _mm512_fmadd_ps(tmp8938, _mm512_set1_ps(3.2e+01f), tmp8934);
__m512 tmp8924 = _mm512_fmadd_ps(tmp8918, _mm512_set1_ps(8e+00f), tmp8925);
__m512 tmp8944 = _mm512_fmadd_ps(tmp8938, _mm512_set1_ps(8e+00f), tmp8945);
__m512 tmp8930 = _mm512_fmadd_ps(tmp8922, _mm512_set1_ps(3.2e+01f), tmp8931);
__m512 tmp8950 = _mm512_fmadd_ps(tmp8942, _mm512_set1_ps(3.2e+01f), tmp8951);
__m512 tmp8928 = _mm512_fmadd_ps(tmp8918, _mm512_set1_ps(2e+00f), tmp8929);
__m512 tmp8948 = _mm512_fmadd_ps(tmp8938, _mm512_set1_ps(2e+00f), tmp8949);
// Name the twelve results: out1251..out1256 from the first group and
// out1257..out1262 from the second.
__m512 out1251 = tmp8913;
__m512 out1257 = tmp8933;
__m512 out1252 = tmp8919;
__m512 out1258 = tmp8939;
__m512 out1253 = tmp8924;
__m512 out1259 = tmp8944;
__m512 out1254 = tmp8926;
__m512 out1260 = tmp8946;
__m512 out1255 = tmp8928;
__m512 out1261 = tmp8948;
__m512 out1256 = tmp8930;
__m512 out1262 = tmp8950;
// Lane-wise max against zero, so negative values are replaced by 0.
out1251 = _mm512_max_ps(_mm512_setzero_ps(), out1251);
out1257 = _mm512_max_ps(_mm512_setzero_ps(), out1257);
out1252 = _mm512_max_ps(_mm512_setzero_ps(), out1252);
out1258 = _mm512_max_ps(_mm512_setzero_ps(), out1258);
out1253 = _mm512_max_ps(_mm512_setzero_ps(), out1253);
out1259 = _mm512_max_ps(_mm512_setzero_ps(), out1259);
out1254 = _mm512_max_ps(_mm512_setzero_ps(), out1254);
out1260 = _mm512_max_ps(_mm512_setzero_ps(), out1260);
out1255 = _mm512_max_ps(_mm512_setzero_ps(), out1255);
out1261 = _mm512_max_ps(_mm512_setzero_ps(), out1261);
out1256 = _mm512_max_ps(_mm512_setzero_ps(), out1256);
out1262 = _mm512_max_ps(_mm512_setzero_ps(), out1262);
// Masked stores to datPtr13. Mask 4095 (0x0FFF) writes only the low 12 lanes
// of each vector. out1251..out1256 go to constant offsets 0, 224, ..., 1120
// (step 224, the same coefficient that multiplies toH33); out1257..out1262 go
// 48 beyond each of those. All share the i28/toH33/toW33/k94/l33 base.
// NOTE(review): offsets are in units of datPtr13's pointee type, which is
// declared outside this view -- presumably bytes; confirm.
_mm512_mask_storeu_ps(datPtr13+0+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1251);
_mm512_mask_storeu_ps(datPtr13+48+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1257);
_mm512_mask_storeu_ps(datPtr13+224+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1252);
_mm512_mask_storeu_ps(datPtr13+272+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1258);
_mm512_mask_storeu_ps(datPtr13+448+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1253);
_mm512_mask_storeu_ps(datPtr13+496+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1259);
_mm512_mask_storeu_ps(datPtr13+672+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1254);
_mm512_mask_storeu_ps(datPtr13+720+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1260);
_mm512_mask_storeu_ps(datPtr13+896+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1255);
_mm512_mask_storeu_ps(datPtr13+944+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1261);
_mm512_mask_storeu_ps(datPtr13+1120+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1256);
_mm512_mask_storeu_ps(datPtr13+1168+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1262);
// Load/transform/store group that reads sfPtr7 at constant offsets 256..448
// within each of four regions: read sixteen 512-bit vectors, reduce eight
// vectors to six, permute lanes, reduce eight to six again, clamp at zero, and
// write twelve masked vectors to datPtr13.
// NOTE(review): the weights 1/2/4/8/16/32 look like the output transform of a
// Winograd-style convolution followed by ReLU -- confirm against the generator.
//
// Loads: the four sfPtr7 regions start at 256, 409856, 819456 and 1229056, all
// indexed by i28, j23, k94 and l33.
// _mm512_shuffle_f32x4 with imm 68 gathers the low 256 bits of both operands;
// imm 238 gathers the high 256 bits of both operands.
// Set A is in1325..in1332, set B is in1333..in1340.
__m512 sf609 = _mm512_loadu_ps(sfPtr7+256+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf610 = _mm512_loadu_ps(sfPtr7+384+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1325 = _mm512_shuffle_f32x4(sf609, sf610, 68);
__m512 in1326 = _mm512_shuffle_f32x4(sf609, sf610, 238);
__m512 sf611 = _mm512_loadu_ps(sfPtr7+320+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf612 = _mm512_loadu_ps(sfPtr7+448+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1333 = _mm512_shuffle_f32x4(sf611, sf612, 68);
__m512 in1334 = _mm512_shuffle_f32x4(sf611, sf612, 238);
__m512 sf613 = _mm512_loadu_ps(sfPtr7+409856+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf614 = _mm512_loadu_ps(sfPtr7+409984+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1327 = _mm512_shuffle_f32x4(sf613, sf614, 68);
__m512 in1328 = _mm512_shuffle_f32x4(sf613, sf614, 238);
__m512 sf615 = _mm512_loadu_ps(sfPtr7+409920+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf616 = _mm512_loadu_ps(sfPtr7+410048+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1335 = _mm512_shuffle_f32x4(sf615, sf616, 68);
__m512 in1336 = _mm512_shuffle_f32x4(sf615, sf616, 238);
__m512 sf617 = _mm512_loadu_ps(sfPtr7+819456+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf618 = _mm512_loadu_ps(sfPtr7+819584+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1329 = _mm512_shuffle_f32x4(sf617, sf618, 68);
__m512 in1330 = _mm512_shuffle_f32x4(sf617, sf618, 238);
__m512 sf619 = _mm512_loadu_ps(sfPtr7+819520+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf620 = _mm512_loadu_ps(sfPtr7+819648+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1337 = _mm512_shuffle_f32x4(sf619, sf620, 68);
__m512 in1338 = _mm512_shuffle_f32x4(sf619, sf620, 238);
__m512 sf621 = _mm512_loadu_ps(sfPtr7+1229056+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf622 = _mm512_loadu_ps(sfPtr7+1229184+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1331 = _mm512_shuffle_f32x4(sf621, sf622, 68);
__m512 in1332 = _mm512_shuffle_f32x4(sf621, sf622, 238);
__m512 sf623 = _mm512_loadu_ps(sfPtr7+1229120+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf624 = _mm512_loadu_ps(sfPtr7+1229248+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1339 = _mm512_shuffle_f32x4(sf623, sf624, 68);
__m512 in1340 = _mm512_shuffle_f32x4(sf623, sf624, 238);
// First 8->6 pass, lane-wise, with the statements for set A and set B
// interleaved. With r0..r7 = in1325..in1332 (or in1333..in1340), and ignoring
// floating-point evaluation order, the six results are:
//   o0 = r0 + (r1+r2) +    (r3+r4) + 32*(r5+r6)
//   o1 =      (r1-r2) +  2*(r3-r4) + 16*(r5-r6)
//   o2 =      (r1+r2) +  4*(r3+r4) +  8*(r5+r6)
//   o3 =      (r1-r2) +  8*(r3-r4) +  4*(r5-r6)
//   o4 =      (r1+r2) + 16*(r3+r4) +  2*(r5+r6)
//   o5 =      (r1-r2) + 32*(r3-r4) +    (r5-r6) + r7
// _mm512_fmadd_ps(a, b, c) computes a*b + c.
__m512 tmp9009 = _mm512_add_ps(in1326, in1327);
__m512 tmp9029 = _mm512_add_ps(in1334, in1335);
__m512 tmp9008 = _mm512_add_ps(in1328, in1329);
__m512 tmp9028 = _mm512_add_ps(in1336, in1337);
__m512 tmp9014 = _mm512_sub_ps(in1328, in1329);
__m512 tmp9034 = _mm512_sub_ps(in1336, in1337);
__m512 tmp9013 = _mm512_sub_ps(in1326, in1327);
__m512 tmp9033 = _mm512_sub_ps(in1334, in1335);
__m512 tmp9010 = _mm512_add_ps(in1330, in1331);
__m512 tmp9030 = _mm512_add_ps(in1338, in1339);
__m512 tmp9015 = _mm512_sub_ps(in1330, in1331);
__m512 tmp9035 = _mm512_sub_ps(in1338, in1339);
__m512 tmp9012 = _mm512_fmadd_ps(tmp9014, _mm512_set1_ps(2e+00f), tmp9013);
__m512 tmp9032 = _mm512_fmadd_ps(tmp9034, _mm512_set1_ps(2e+00f), tmp9033);
__m512 tmp9019 = _mm512_fmadd_ps(tmp9014, _mm512_set1_ps(8e+00f), tmp9013);
__m512 tmp9039 = _mm512_fmadd_ps(tmp9034, _mm512_set1_ps(8e+00f), tmp9033);
__m512 tmp9007 = _mm512_add_ps(tmp9008, tmp9009);
__m512 tmp9027 = _mm512_add_ps(tmp9028, tmp9029);
__m512 tmp9011 = _mm512_fmadd_ps(tmp9015, _mm512_set1_ps(1.6e+01f), tmp9012);
__m512 tmp9031 = _mm512_fmadd_ps(tmp9035, _mm512_set1_ps(1.6e+01f), tmp9032);
__m512 tmp9018 = _mm512_fmadd_ps(tmp9015, _mm512_set1_ps(4e+00f), tmp9019);
__m512 tmp9038 = _mm512_fmadd_ps(tmp9035, _mm512_set1_ps(4e+00f), tmp9039);
__m512 tmp9024 = _mm512_add_ps(tmp9015, tmp9013);
__m512 tmp9044 = _mm512_add_ps(tmp9035, tmp9033);
__m512 tmp9017 = _mm512_fmadd_ps(tmp9008, _mm512_set1_ps(4e+00f), tmp9009);
__m512 tmp9037 = _mm512_fmadd_ps(tmp9028, _mm512_set1_ps(4e+00f), tmp9029);
__m512 tmp9021 = _mm512_fmadd_ps(tmp9008, _mm512_set1_ps(1.6e+01f), tmp9009);
__m512 tmp9041 = _mm512_fmadd_ps(tmp9028, _mm512_set1_ps(1.6e+01f), tmp9029);
__m512 tmp9006 = _mm512_add_ps(tmp9007, in1325);
__m512 tmp9026 = _mm512_add_ps(tmp9027, in1333);
__m512 tmp9023 = _mm512_add_ps(tmp9024, in1332);
__m512 tmp9043 = _mm512_add_ps(tmp9044, in1340);
__m512 tmp9005 = _mm512_fmadd_ps(tmp9010, _mm512_set1_ps(3.2e+01f), tmp9006);
__m512 tmp9025 = _mm512_fmadd_ps(tmp9030, _mm512_set1_ps(3.2e+01f), tmp9026);
__m512 tmp9016 = _mm512_fmadd_ps(tmp9010, _mm512_set1_ps(8e+00f), tmp9017);
__m512 tmp9036 = _mm512_fmadd_ps(tmp9030, _mm512_set1_ps(8e+00f), tmp9037);
__m512 tmp9022 = _mm512_fmadd_ps(tmp9014, _mm512_set1_ps(3.2e+01f), tmp9023);
__m512 tmp9042 = _mm512_fmadd_ps(tmp9034, _mm512_set1_ps(3.2e+01f), tmp9043);
__m512 tmp9020 = _mm512_fmadd_ps(tmp9010, _mm512_set1_ps(2e+00f), tmp9021);
__m512 tmp9040 = _mm512_fmadd_ps(tmp9030, _mm512_set1_ps(2e+00f), tmp9041);
// Collect o0..o5 of set A in tmp8993..tmp8998 and of set B in tmp8999..tmp9004.
__m512 tmp8993 = tmp9005;
__m512 tmp8999 = tmp9025;
__m512 tmp8994 = tmp9011;
__m512 tmp9000 = tmp9031;
__m512 tmp8995 = tmp9016;
__m512 tmp9001 = tmp9036;
__m512 tmp8996 = tmp9018;
__m512 tmp9002 = tmp9038;
__m512 tmp8997 = tmp9020;
__m512 tmp9003 = tmp9040;
__m512 tmp8998 = tmp9022;
__m512 tmp9004 = tmp9042;
// Lane permutation over the twelve vectors: unpacklo/unpackhi, then shuffle_ps,
// then two rounds of shuffle_f32x4. The results overwrite tmp8993..tmp9004 and
// add four new vectors tmp9045..tmp9048, giving two groups of eight vectors.
// NOTE(review): this appears to be a transpose so the second pass runs along
// the other axis of the tile -- confirm.
__m512 tmp9089 = _mm512_unpacklo_ps(tmp8993, tmp8994);
__m512 tmp9090 = _mm512_unpackhi_ps(tmp8993, tmp8994);
__m512 tmp9091 = _mm512_unpacklo_ps(tmp8995, tmp8996);
__m512 tmp9092 = _mm512_unpackhi_ps(tmp8995, tmp8996);
__m512 tmp9093 = _mm512_unpacklo_ps(tmp8997, tmp8998);
__m512 tmp9094 = _mm512_unpackhi_ps(tmp8997, tmp8998);
__m512 tmp9095 = _mm512_unpacklo_ps(tmp8999, tmp9000);
__m512 tmp9096 = _mm512_unpackhi_ps(tmp8999, tmp9000);
__m512 tmp9097 = _mm512_unpacklo_ps(tmp9001, tmp9002);
__m512 tmp9098 = _mm512_unpackhi_ps(tmp9001, tmp9002);
__m512 tmp9099 = _mm512_unpacklo_ps(tmp9003, tmp9004);
__m512 tmp9100 = _mm512_unpackhi_ps(tmp9003, tmp9004);
__m512 tmp9101 = _mm512_shuffle_ps(tmp9089, tmp9091, 68);
__m512 tmp9102 = _mm512_shuffle_ps(tmp9089, tmp9091, 238);
__m512 tmp9103 = _mm512_shuffle_ps(tmp9090, tmp9092, 68);
__m512 tmp9104 = _mm512_shuffle_ps(tmp9090, tmp9092, 238);
__m512 tmp9105 = _mm512_shuffle_ps(tmp9093, tmp9095, 68);
__m512 tmp9106 = _mm512_shuffle_ps(tmp9093, tmp9095, 238);
__m512 tmp9107 = _mm512_shuffle_ps(tmp9094, tmp9096, 68);
__m512 tmp9108 = _mm512_shuffle_ps(tmp9094, tmp9096, 238);
__m512 tmp9109 = _mm512_shuffle_ps(tmp9097, tmp9099, 68);
__m512 tmp9110 = _mm512_shuffle_ps(tmp9097, tmp9099, 238);
__m512 tmp9111 = _mm512_shuffle_ps(tmp9098, tmp9100, 68);
__m512 tmp9112 = _mm512_shuffle_ps(tmp9098, tmp9100, 238);
__m512 tmp9113 = _mm512_shuffle_f32x4(tmp9101, tmp9105, 136);
__m512 tmp9114 = _mm512_shuffle_f32x4(tmp9101, tmp9105, 221);
__m512 tmp9115 = _mm512_shuffle_f32x4(tmp9102, tmp9106, 136);
__m512 tmp9116 = _mm512_shuffle_f32x4(tmp9102, tmp9106, 221);
__m512 tmp9117 = _mm512_shuffle_f32x4(tmp9103, tmp9107, 136);
__m512 tmp9118 = _mm512_shuffle_f32x4(tmp9103, tmp9107, 221);
__m512 tmp9119 = _mm512_shuffle_f32x4(tmp9104, tmp9108, 136);
__m512 tmp9120 = _mm512_shuffle_f32x4(tmp9104, tmp9108, 221);
__m512 tmp9121 = _mm512_shuffle_f32x4(tmp9109, tmp9109, 136);
__m512 tmp9122 = _mm512_shuffle_f32x4(tmp9109, tmp9109, 221);
__m512 tmp9123 = _mm512_shuffle_f32x4(tmp9110, tmp9110, 136);
__m512 tmp9124 = _mm512_shuffle_f32x4(tmp9110, tmp9110, 221);
__m512 tmp9125 = _mm512_shuffle_f32x4(tmp9111, tmp9111, 136);
__m512 tmp9126 = _mm512_shuffle_f32x4(tmp9111, tmp9111, 221);
__m512 tmp9127 = _mm512_shuffle_f32x4(tmp9112, tmp9112, 136);
__m512 tmp9128 = _mm512_shuffle_f32x4(tmp9112, tmp9112, 221);
tmp8993 = _mm512_shuffle_f32x4(tmp9113, tmp9121, 136);
tmp9001 = _mm512_shuffle_f32x4(tmp9113, tmp9121, 221);
tmp8994 = _mm512_shuffle_f32x4(tmp9115, tmp9123, 136);
tmp9002 = _mm512_shuffle_f32x4(tmp9115, tmp9123, 221);
tmp8995 = _mm512_shuffle_f32x4(tmp9117, tmp9125, 136);
tmp9003 = _mm512_shuffle_f32x4(tmp9117, tmp9125, 221);
tmp8996 = _mm512_shuffle_f32x4(tmp9119, tmp9127, 136);
tmp9004 = _mm512_shuffle_f32x4(tmp9119, tmp9127, 221);
tmp8997 = _mm512_shuffle_f32x4(tmp9114, tmp9122, 136);
__m512 tmp9045 = _mm512_shuffle_f32x4(tmp9114, tmp9122, 221);
tmp8998 = _mm512_shuffle_f32x4(tmp9116, tmp9124, 136);
__m512 tmp9046 = _mm512_shuffle_f32x4(tmp9116, tmp9124, 221);
tmp8999 = _mm512_shuffle_f32x4(tmp9118, tmp9126, 136);
__m512 tmp9047 = _mm512_shuffle_f32x4(tmp9118, tmp9126, 221);
tmp9000 = _mm512_shuffle_f32x4(tmp9120, tmp9128, 136);
__m512 tmp9048 = _mm512_shuffle_f32x4(tmp9120, tmp9128, 221);
// Second 8->6 pass with the same weights as the first pass. Its inputs r0..r7
// are tmp8993..tmp9000 for one group and tmp9001, tmp9002, tmp9003, tmp9004,
// tmp9045, tmp9046, tmp9047, tmp9048 for the other.
__m512 tmp9053 = _mm512_add_ps(tmp8994, tmp8995);
__m512 tmp9073 = _mm512_add_ps(tmp9002, tmp9003);
__m512 tmp9052 = _mm512_add_ps(tmp8996, tmp8997);
__m512 tmp9072 = _mm512_add_ps(tmp9004, tmp9045);
__m512 tmp9058 = _mm512_sub_ps(tmp8996, tmp8997);
__m512 tmp9078 = _mm512_sub_ps(tmp9004, tmp9045);
__m512 tmp9057 = _mm512_sub_ps(tmp8994, tmp8995);
__m512 tmp9077 = _mm512_sub_ps(tmp9002, tmp9003);
__m512 tmp9054 = _mm512_add_ps(tmp8998, tmp8999);
__m512 tmp9074 = _mm512_add_ps(tmp9046, tmp9047);
__m512 tmp9059 = _mm512_sub_ps(tmp8998, tmp8999);
__m512 tmp9079 = _mm512_sub_ps(tmp9046, tmp9047);
__m512 tmp9056 = _mm512_fmadd_ps(tmp9058, _mm512_set1_ps(2e+00f), tmp9057);
__m512 tmp9076 = _mm512_fmadd_ps(tmp9078, _mm512_set1_ps(2e+00f), tmp9077);
__m512 tmp9063 = _mm512_fmadd_ps(tmp9058, _mm512_set1_ps(8e+00f), tmp9057);
__m512 tmp9083 = _mm512_fmadd_ps(tmp9078, _mm512_set1_ps(8e+00f), tmp9077);
__m512 tmp9051 = _mm512_add_ps(tmp9052, tmp9053);
__m512 tmp9071 = _mm512_add_ps(tmp9072, tmp9073);
__m512 tmp9055 = _mm512_fmadd_ps(tmp9059, _mm512_set1_ps(1.6e+01f), tmp9056);
__m512 tmp9075 = _mm512_fmadd_ps(tmp9079, _mm512_set1_ps(1.6e+01f), tmp9076);
__m512 tmp9062 = _mm512_fmadd_ps(tmp9059, _mm512_set1_ps(4e+00f), tmp9063);
__m512 tmp9082 = _mm512_fmadd_ps(tmp9079, _mm512_set1_ps(4e+00f), tmp9083);
__m512 tmp9068 = _mm512_add_ps(tmp9059, tmp9057);
__m512 tmp9088 = _mm512_add_ps(tmp9079, tmp9077);
__m512 tmp9061 = _mm512_fmadd_ps(tmp9052, _mm512_set1_ps(4e+00f), tmp9053);
__m512 tmp9081 = _mm512_fmadd_ps(tmp9072, _mm512_set1_ps(4e+00f), tmp9073);
__m512 tmp9065 = _mm512_fmadd_ps(tmp9052, _mm512_set1_ps(1.6e+01f), tmp9053);
__m512 tmp9085 = _mm512_fmadd_ps(tmp9072, _mm512_set1_ps(1.6e+01f), tmp9073);
__m512 tmp9050 = _mm512_add_ps(tmp9051, tmp8993);
__m512 tmp9070 = _mm512_add_ps(tmp9071, tmp9001);
__m512 tmp9067 = _mm512_add_ps(tmp9068, tmp9000);
__m512 tmp9087 = _mm512_add_ps(tmp9088, tmp9048);
__m512 tmp9049 = _mm512_fmadd_ps(tmp9054, _mm512_set1_ps(3.2e+01f), tmp9050);
__m512 tmp9069 = _mm512_fmadd_ps(tmp9074, _mm512_set1_ps(3.2e+01f), tmp9070);
__m512 tmp9060 = _mm512_fmadd_ps(tmp9054, _mm512_set1_ps(8e+00f), tmp9061);
__m512 tmp9080 = _mm512_fmadd_ps(tmp9074, _mm512_set1_ps(8e+00f), tmp9081);
__m512 tmp9066 = _mm512_fmadd_ps(tmp9058, _mm512_set1_ps(3.2e+01f), tmp9067);
__m512 tmp9086 = _mm512_fmadd_ps(tmp9078, _mm512_set1_ps(3.2e+01f), tmp9087);
__m512 tmp9064 = _mm512_fmadd_ps(tmp9054, _mm512_set1_ps(2e+00f), tmp9065);
__m512 tmp9084 = _mm512_fmadd_ps(tmp9074, _mm512_set1_ps(2e+00f), tmp9085);
// Name the twelve results: out1263..out1268 from the first group and
// out1269..out1274 from the second.
__m512 out1263 = tmp9049;
__m512 out1269 = tmp9069;
__m512 out1264 = tmp9055;
__m512 out1270 = tmp9075;
__m512 out1265 = tmp9060;
__m512 out1271 = tmp9080;
__m512 out1266 = tmp9062;
__m512 out1272 = tmp9082;
__m512 out1267 = tmp9064;
__m512 out1273 = tmp9084;
__m512 out1268 = tmp9066;
__m512 out1274 = tmp9086;
// Lane-wise max against zero, so negative values are replaced by 0.
out1263 = _mm512_max_ps(_mm512_setzero_ps(), out1263);
out1269 = _mm512_max_ps(_mm512_setzero_ps(), out1269);
out1264 = _mm512_max_ps(_mm512_setzero_ps(), out1264);
out1270 = _mm512_max_ps(_mm512_setzero_ps(), out1270);
out1265 = _mm512_max_ps(_mm512_setzero_ps(), out1265);
out1271 = _mm512_max_ps(_mm512_setzero_ps(), out1271);
out1266 = _mm512_max_ps(_mm512_setzero_ps(), out1266);
out1272 = _mm512_max_ps(_mm512_setzero_ps(), out1272);
out1267 = _mm512_max_ps(_mm512_setzero_ps(), out1267);
out1273 = _mm512_max_ps(_mm512_setzero_ps(), out1273);
out1268 = _mm512_max_ps(_mm512_setzero_ps(), out1268);
out1274 = _mm512_max_ps(_mm512_setzero_ps(), out1274);
// Masked stores to datPtr13, with two different masks:
//   out1263..out1268: mask 255 (0x00FF), low 8 lanes only, at constant
//                     offsets 96, 320, ..., 1216 (step 224).
//   out1269..out1274: mask 4095 (0x0FFF), low 12 lanes only, at constant
//                     offsets 12608, 12832, ..., 13728 (step 224).
// All share the i28/toH33/toW33/k94/l33 base.
// NOTE(review): the narrower mask presumably trims the write at the right edge
// of a row, and 12608 (half of the 25216 l33 stride) presumably moves to the
// next output plane -- the tensor layout is not visible here; confirm.
_mm512_mask_storeu_ps(datPtr13+96+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 255, out1263);
_mm512_mask_storeu_ps(datPtr13+12608+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1269);
_mm512_mask_storeu_ps(datPtr13+320+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 255, out1264);
_mm512_mask_storeu_ps(datPtr13+12832+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1270);
_mm512_mask_storeu_ps(datPtr13+544+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 255, out1265);
_mm512_mask_storeu_ps(datPtr13+13056+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1271);
_mm512_mask_storeu_ps(datPtr13+768+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 255, out1266);
_mm512_mask_storeu_ps(datPtr13+13280+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1272);
_mm512_mask_storeu_ps(datPtr13+992+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 255, out1267);
_mm512_mask_storeu_ps(datPtr13+13504+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1273);
_mm512_mask_storeu_ps(datPtr13+1216+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 255, out1268);
_mm512_mask_storeu_ps(datPtr13+13728+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1274);
__m512 sf625 = _mm512_loadu_ps(sfPtr7+512+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf626 = _mm512_loadu_ps(sfPtr7+640+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1341 = _mm512_shuffle_f32x4(sf625, sf626, 68);
__m512 in1342 = _mm512_shuffle_f32x4(sf625, sf626, 238);
__m512 sf627 = _mm512_loadu_ps(sfPtr7+576+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf628 = _mm512_loadu_ps(sfPtr7+704+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1349 = _mm512_shuffle_f32x4(sf627, sf628, 68);
__m512 in1350 = _mm512_shuffle_f32x4(sf627, sf628, 238);
__m512 sf629 = _mm512_loadu_ps(sfPtr7+410112+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf630 = _mm512_loadu_ps(sfPtr7+410240+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1343 = _mm512_shuffle_f32x4(sf629, sf630, 68);
__m512 in1344 = _mm512_shuffle_f32x4(sf629, sf630, 238);
__m512 sf631 = _mm512_loadu_ps(sfPtr7+410176+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf632 = _mm512_loadu_ps(sfPtr7+410304+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1351 = _mm512_shuffle_f32x4(sf631, sf632, 68);
__m512 in1352 = _mm512_shuffle_f32x4(sf631, sf632, 238);
__m512 sf633 = _mm512_loadu_ps(sfPtr7+819712+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf634 = _mm512_loadu_ps(sfPtr7+819840+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1345 = _mm512_shuffle_f32x4(sf633, sf634, 68);
__m512 in1346 = _mm512_shuffle_f32x4(sf633, sf634, 238);
__m512 sf635 = _mm512_loadu_ps(sfPtr7+819776+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf636 = _mm512_loadu_ps(sfPtr7+819904+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1353 = _mm512_shuffle_f32x4(sf635, sf636, 68);
__m512 in1354 = _mm512_shuffle_f32x4(sf635, sf636, 238);
__m512 sf637 = _mm512_loadu_ps(sfPtr7+1229312+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf638 = _mm512_loadu_ps(sfPtr7+1229440+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1347 = _mm512_shuffle_f32x4(sf637, sf638, 68);
__m512 in1348 = _mm512_shuffle_f32x4(sf637, sf638, 238);
__m512 sf639 = _mm512_loadu_ps(sfPtr7+1229376+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 sf640 = _mm512_loadu_ps(sfPtr7+1229504+1638400*i28+24576*j23+1536*k94+768*l33);
__m512 in1355 = _mm512_shuffle_f32x4(sf639, sf640, 68);
__m512 in1356 = _mm512_shuffle_f32x4(sf639, sf640, 238);
__m512 tmp9145 = _mm512_add_ps(in1342, in1343);
__m512 tmp9165 = _mm512_add_ps(in1350, in1351);
__m512 tmp9144 = _mm512_add_ps(in1344, in1345);
__m512 tmp9164 = _mm512_add_ps(in1352, in1353);
__m512 tmp9150 = _mm512_sub_ps(in1344, in1345);
__m512 tmp9170 = _mm512_sub_ps(in1352, in1353);
__m512 tmp9149 = _mm512_sub_ps(in1342, in1343);
__m512 tmp9169 = _mm512_sub_ps(in1350, in1351);
__m512 tmp9146 = _mm512_add_ps(in1346, in1347);
__m512 tmp9166 = _mm512_add_ps(in1354, in1355);
__m512 tmp9151 = _mm512_sub_ps(in1346, in1347);
__m512 tmp9171 = _mm512_sub_ps(in1354, in1355);
__m512 tmp9148 = _mm512_fmadd_ps(tmp9150, _mm512_set1_ps(2e+00f), tmp9149);
__m512 tmp9168 = _mm512_fmadd_ps(tmp9170, _mm512_set1_ps(2e+00f), tmp9169);
__m512 tmp9155 = _mm512_fmadd_ps(tmp9150, _mm512_set1_ps(8e+00f), tmp9149);
__m512 tmp9175 = _mm512_fmadd_ps(tmp9170, _mm512_set1_ps(8e+00f), tmp9169);
__m512 tmp9143 = _mm512_add_ps(tmp9144, tmp9145);
__m512 tmp9163 = _mm512_add_ps(tmp9164, tmp9165);
__m512 tmp9147 = _mm512_fmadd_ps(tmp9151, _mm512_set1_ps(1.6e+01f), tmp9148);
__m512 tmp9167 = _mm512_fmadd_ps(tmp9171, _mm512_set1_ps(1.6e+01f), tmp9168);
__m512 tmp9154 = _mm512_fmadd_ps(tmp9151, _mm512_set1_ps(4e+00f), tmp9155);
__m512 tmp9174 = _mm512_fmadd_ps(tmp9171, _mm512_set1_ps(4e+00f), tmp9175);
__m512 tmp9160 = _mm512_add_ps(tmp9151, tmp9149);
__m512 tmp9180 = _mm512_add_ps(tmp9171, tmp9169);
__m512 tmp9153 = _mm512_fmadd_ps(tmp9144, _mm512_set1_ps(4e+00f), tmp9145);
__m512 tmp9173 = _mm512_fmadd_ps(tmp9164, _mm512_set1_ps(4e+00f), tmp9165);
__m512 tmp9157 = _mm512_fmadd_ps(tmp9144, _mm512_set1_ps(1.6e+01f), tmp9145);
__m512 tmp9177 = _mm512_fmadd_ps(tmp9164, _mm512_set1_ps(1.6e+01f), tmp9165);
__m512 tmp9142 = _mm512_add_ps(tmp9143, in1341);
__m512 tmp9162 = _mm512_add_ps(tmp9163, in1349);
__m512 tmp9159 = _mm512_add_ps(tmp9160, in1348);
__m512 tmp9179 = _mm512_add_ps(tmp9180, in1356);
__m512 tmp9141 = _mm512_fmadd_ps(tmp9146, _mm512_set1_ps(3.2e+01f), tmp9142);
__m512 tmp9161 = _mm512_fmadd_ps(tmp9166, _mm512_set1_ps(3.2e+01f), tmp9162);
__m512 tmp9152 = _mm512_fmadd_ps(tmp9146, _mm512_set1_ps(8e+00f), tmp9153);
__m512 tmp9172 = _mm512_fmadd_ps(tmp9166, _mm512_set1_ps(8e+00f), tmp9173);
__m512 tmp9158 = _mm512_fmadd_ps(tmp9150, _mm512_set1_ps(3.2e+01f), tmp9159);
__m512 tmp9178 = _mm512_fmadd_ps(tmp9170, _mm512_set1_ps(3.2e+01f), tmp9179);
__m512 tmp9156 = _mm512_fmadd_ps(tmp9146, _mm512_set1_ps(2e+00f), tmp9157);
__m512 tmp9176 = _mm512_fmadd_ps(tmp9166, _mm512_set1_ps(2e+00f), tmp9177);
__m512 tmp9129 = tmp9141;
__m512 tmp9135 = tmp9161;
__m512 tmp9130 = tmp9147;
__m512 tmp9136 = tmp9167;
__m512 tmp9131 = tmp9152;
__m512 tmp9137 = tmp9172;
__m512 tmp9132 = tmp9154;
__m512 tmp9138 = tmp9174;
__m512 tmp9133 = tmp9156;
__m512 tmp9139 = tmp9176;
__m512 tmp9134 = tmp9158;
__m512 tmp9140 = tmp9178;
__m512 tmp9225 = _mm512_unpacklo_ps(tmp9129, tmp9130);
__m512 tmp9226 = _mm512_unpackhi_ps(tmp9129, tmp9130);
__m512 tmp9227 = _mm512_unpacklo_ps(tmp9131, tmp9132);
__m512 tmp9228 = _mm512_unpackhi_ps(tmp9131, tmp9132);
__m512 tmp9229 = _mm512_unpacklo_ps(tmp9133, tmp9134);
__m512 tmp9230 = _mm512_unpackhi_ps(tmp9133, tmp9134);
__m512 tmp9231 = _mm512_unpacklo_ps(tmp9135, tmp9136);
__m512 tmp9232 = _mm512_unpackhi_ps(tmp9135, tmp9136);
__m512 tmp9233 = _mm512_unpacklo_ps(tmp9137, tmp9138);
__m512 tmp9234 = _mm512_unpackhi_ps(tmp9137, tmp9138);
__m512 tmp9235 = _mm512_unpacklo_ps(tmp9139, tmp9140);
__m512 tmp9236 = _mm512_unpackhi_ps(tmp9139, tmp9140);
__m512 tmp9237 = _mm512_shuffle_ps(tmp9225, tmp9227, 68);
__m512 tmp9238 = _mm512_shuffle_ps(tmp9225, tmp9227, 238);
__m512 tmp9239 = _mm512_shuffle_ps(tmp9226, tmp9228, 68);
__m512 tmp9240 = _mm512_shuffle_ps(tmp9226, tmp9228, 238);
__m512 tmp9241 = _mm512_shuffle_ps(tmp9229, tmp9231, 68);
__m512 tmp9242 = _mm512_shuffle_ps(tmp9229, tmp9231, 238);
__m512 tmp9243 = _mm512_shuffle_ps(tmp9230, tmp9232, 68);
__m512 tmp9244 = _mm512_shuffle_ps(tmp9230, tmp9232, 238);
__m512 tmp9245 = _mm512_shuffle_ps(tmp9233, tmp9235, 68);
__m512 tmp9246 = _mm512_shuffle_ps(tmp9233, tmp9235, 238);
__m512 tmp9247 = _mm512_shuffle_ps(tmp9234, tmp9236, 68);
__m512 tmp9248 = _mm512_shuffle_ps(tmp9234, tmp9236, 238);
__m512 tmp9249 = _mm512_shuffle_f32x4(tmp9237, tmp9241, 136);
__m512 tmp9250 = _mm512_shuffle_f32x4(tmp9237, tmp9241, 221);
__m512 tmp9251 = _mm512_shuffle_f32x4(tmp9238, tmp9242, 136);
__m512 tmp9252 = _mm512_shuffle_f32x4(tmp9238, tmp9242, 221);
__m512 tmp9253 = _mm512_shuffle_f32x4(tmp9239, tmp9243, 136);
__m512 tmp9254 = _mm512_shuffle_f32x4(tmp9239, tmp9243, 221);
__m512 tmp9255 = _mm512_shuffle_f32x4(tmp9240, tmp9244, 136);
__m512 tmp9256 = _mm512_shuffle_f32x4(tmp9240, tmp9244, 221);
__m512 tmp9257 = _mm512_shuffle_f32x4(tmp9245, tmp9245, 136);
__m512 tmp9258 = _mm512_shuffle_f32x4(tmp9245, tmp9245, 221);
__m512 tmp9259 = _mm512_shuffle_f32x4(tmp9246, tmp9246, 136);
__m512 tmp9260 = _mm512_shuffle_f32x4(tmp9246, tmp9246, 221);
__m512 tmp9261 = _mm512_shuffle_f32x4(tmp9247, tmp9247, 136);
__m512 tmp9262 = _mm512_shuffle_f32x4(tmp9247, tmp9247, 221);
__m512 tmp9263 = _mm512_shuffle_f32x4(tmp9248, tmp9248, 136);
__m512 tmp9264 = _mm512_shuffle_f32x4(tmp9248, tmp9248, 221);
tmp9129 = _mm512_shuffle_f32x4(tmp9249, tmp9257, 136);
tmp9137 = _mm512_shuffle_f32x4(tmp9249, tmp9257, 221);
tmp9130 = _mm512_shuffle_f32x4(tmp9251, tmp9259, 136);
tmp9138 = _mm512_shuffle_f32x4(tmp9251, tmp9259, 221);
tmp9131 = _mm512_shuffle_f32x4(tmp9253, tmp9261, 136);
tmp9139 = _mm512_shuffle_f32x4(tmp9253, tmp9261, 221);
tmp9132 = _mm512_shuffle_f32x4(tmp9255, tmp9263, 136);
tmp9140 = _mm512_shuffle_f32x4(tmp9255, tmp9263, 221);
tmp9133 = _mm512_shuffle_f32x4(tmp9250, tmp9258, 136);
__m512 tmp9181 = _mm512_shuffle_f32x4(tmp9250, tmp9258, 221);
tmp9134 = _mm512_shuffle_f32x4(tmp9252, tmp9260, 136);
__m512 tmp9182 = _mm512_shuffle_f32x4(tmp9252, tmp9260, 221);
tmp9135 = _mm512_shuffle_f32x4(tmp9254, tmp9262, 136);
__m512 tmp9183 = _mm512_shuffle_f32x4(tmp9254, tmp9262, 221);
tmp9136 = _mm512_shuffle_f32x4(tmp9256, tmp9264, 136);
__m512 tmp9184 = _mm512_shuffle_f32x4(tmp9256, tmp9264, 221);
__m512 tmp9189 = _mm512_add_ps(tmp9130, tmp9131);
__m512 tmp9209 = _mm512_add_ps(tmp9138, tmp9139);
__m512 tmp9188 = _mm512_add_ps(tmp9132, tmp9133);
__m512 tmp9208 = _mm512_add_ps(tmp9140, tmp9181);
__m512 tmp9194 = _mm512_sub_ps(tmp9132, tmp9133);
__m512 tmp9214 = _mm512_sub_ps(tmp9140, tmp9181);
__m512 tmp9193 = _mm512_sub_ps(tmp9130, tmp9131);
__m512 tmp9213 = _mm512_sub_ps(tmp9138, tmp9139);
__m512 tmp9190 = _mm512_add_ps(tmp9134, tmp9135);
__m512 tmp9210 = _mm512_add_ps(tmp9182, tmp9183);
__m512 tmp9195 = _mm512_sub_ps(tmp9134, tmp9135);
__m512 tmp9215 = _mm512_sub_ps(tmp9182, tmp9183);
__m512 tmp9192 = _mm512_fmadd_ps(tmp9194, _mm512_set1_ps(2e+00f), tmp9193);
__m512 tmp9212 = _mm512_fmadd_ps(tmp9214, _mm512_set1_ps(2e+00f), tmp9213);
__m512 tmp9199 = _mm512_fmadd_ps(tmp9194, _mm512_set1_ps(8e+00f), tmp9193);
__m512 tmp9219 = _mm512_fmadd_ps(tmp9214, _mm512_set1_ps(8e+00f), tmp9213);
__m512 tmp9187 = _mm512_add_ps(tmp9188, tmp9189);
__m512 tmp9207 = _mm512_add_ps(tmp9208, tmp9209);
__m512 tmp9191 = _mm512_fmadd_ps(tmp9195, _mm512_set1_ps(1.6e+01f), tmp9192);
__m512 tmp9211 = _mm512_fmadd_ps(tmp9215, _mm512_set1_ps(1.6e+01f), tmp9212);
__m512 tmp9198 = _mm512_fmadd_ps(tmp9195, _mm512_set1_ps(4e+00f), tmp9199);
__m512 tmp9218 = _mm512_fmadd_ps(tmp9215, _mm512_set1_ps(4e+00f), tmp9219);
__m512 tmp9204 = _mm512_add_ps(tmp9195, tmp9193);
__m512 tmp9224 = _mm512_add_ps(tmp9215, tmp9213);
__m512 tmp9197 = _mm512_fmadd_ps(tmp9188, _mm512_set1_ps(4e+00f), tmp9189);
__m512 tmp9217 = _mm512_fmadd_ps(tmp9208, _mm512_set1_ps(4e+00f), tmp9209);
__m512 tmp9201 = _mm512_fmadd_ps(tmp9188, _mm512_set1_ps(1.6e+01f), tmp9189);
__m512 tmp9221 = _mm512_fmadd_ps(tmp9208, _mm512_set1_ps(1.6e+01f), tmp9209);
__m512 tmp9186 = _mm512_add_ps(tmp9187, tmp9129);
__m512 tmp9206 = _mm512_add_ps(tmp9207, tmp9137);
__m512 tmp9203 = _mm512_add_ps(tmp9204, tmp9136);
__m512 tmp9223 = _mm512_add_ps(tmp9224, tmp9184);
__m512 tmp9185 = _mm512_fmadd_ps(tmp9190, _mm512_set1_ps(3.2e+01f), tmp9186);
__m512 tmp9205 = _mm512_fmadd_ps(tmp9210, _mm512_set1_ps(3.2e+01f), tmp9206);
__m512 tmp9196 = _mm512_fmadd_ps(tmp9190, _mm512_set1_ps(8e+00f), tmp9197);
__m512 tmp9216 = _mm512_fmadd_ps(tmp9210, _mm512_set1_ps(8e+00f), tmp9217);
__m512 tmp9202 = _mm512_fmadd_ps(tmp9194, _mm512_set1_ps(3.2e+01f), tmp9203);
__m512 tmp9222 = _mm512_fmadd_ps(tmp9214, _mm512_set1_ps(3.2e+01f), tmp9223);
__m512 tmp9200 = _mm512_fmadd_ps(tmp9190, _mm512_set1_ps(2e+00f), tmp9201);
__m512 tmp9220 = _mm512_fmadd_ps(tmp9210, _mm512_set1_ps(2e+00f), tmp9221);
__m512 out1275 = tmp9185;
__m512 out1281 = tmp9205;
__m512 out1276 = tmp9191;
__m512 out1282 = tmp9211;
__m512 out1277 = tmp9196;
__m512 out1283 = tmp9216;
__m512 out1278 = tmp9198;
__m512 out1284 = tmp9218;
__m512 out1279 = tmp9200;
__m512 out1285 = tmp9220;
__m512 out1280 = tmp9202;
__m512 out1286 = tmp9222;
out1275 = _mm512_max_ps(_mm512_setzero_ps(), out1275);
out1281 = _mm512_max_ps(_mm512_setzero_ps(), out1281);
out1276 = _mm512_max_ps(_mm512_setzero_ps(), out1276);
out1282 = _mm512_max_ps(_mm512_setzero_ps(), out1282);
out1277 = _mm512_max_ps(_mm512_setzero_ps(), out1277);
out1283 = _mm512_max_ps(_mm512_setzero_ps(), out1283);
out1278 = _mm512_max_ps(_mm512_setzero_ps(), out1278);
out1284 = _mm512_max_ps(_mm512_setzero_ps(), out1284);
out1279 = _mm512_max_ps(_mm512_setzero_ps(), out1279);
out1285 = _mm512_max_ps(_mm512_setzero_ps(), out1285);
out1280 = _mm512_max_ps(_mm512_setzero_ps(), out1280);
out1286 = _mm512_max_ps(_mm512_setzero_ps(), out1286);
_mm512_mask_storeu_ps(datPtr13+12656+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1275);
_mm512_mask_storeu_ps(datPtr13+12704+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 255, out1281);
_mm512_mask_storeu_ps(datPtr13+12880+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1276);
_mm512_mask_storeu_ps(datPtr13+12928+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 255, out1282);
_mm512_mask_storeu_ps(datPtr13+13104+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1277);
_mm512_mask_storeu_ps(datPtr13+13152+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 255, out1283);
_mm512_mask_storeu_ps(datPtr13+13328+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1278);
_mm512_mask_storeu_ps(datPtr13+13376+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 255, out1284);
_mm512_mask_storeu_ps(datPtr13+13552+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1279);
_mm512_mask_storeu_ps(datPtr13+13600+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 255, out1285);
_mm512_mask_storeu_ps(datPtr13+13776+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 4095, out1280);
_mm512_mask_storeu_ps(datPtr13+13824+806912*i28+224*toH33+4*toW33+50432*k94+25216*l33, 255, out1286);
}
}
if (j23 >= last6) return;
++j23;
if (j23 >= 15) break;
rel17 = 3;
}
if (rel17 < 4) {
ptrdiff_t toH34 = base17+12;
ptrdiff_t toW34 = 0;
ptrdiff_t k95 = 16*w46;
for (; k95 != 16; ++k95) {
ptrdiff_t l34 = 0;
for (; l34 != 2; ++l34) {
// ---- Tile at offsets +0..+192 of the sfPtr7 slice for (i28, j23, k95, l34) ----
// Sixteen 512-bit loads: four per region, with the regions starting at
// +0, +409600, +819200 and +1228800. Twelve masked rows are written to
// datPtr13 at the end of the tile.
// NOTE(review): consecutive 16-float loads are 64 apart, which suggests the
// offsets are byte counts on char-typed pointers — confirm against the
// declarations of sfPtr7/datPtr13.
//
// Each pair of loads (a, b) is regrouped with _mm512_shuffle_f32x4:
//   immediate 68  -> {a[127:0], a[255:128], b[127:0], b[255:128]}  (low halves)
//   immediate 238 -> {a[383:256], a[511:384], b[383:256], b[511:384]} (high halves)
__m512 sf641 = _mm512_loadu_ps(sfPtr7+0+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf642 = _mm512_loadu_ps(sfPtr7+128+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1357 = _mm512_shuffle_f32x4(sf641, sf642, 68);
__m512 in1358 = _mm512_shuffle_f32x4(sf641, sf642, 238);
__m512 sf643 = _mm512_loadu_ps(sfPtr7+64+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf644 = _mm512_loadu_ps(sfPtr7+192+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1365 = _mm512_shuffle_f32x4(sf643, sf644, 68);
__m512 in1366 = _mm512_shuffle_f32x4(sf643, sf644, 238);
__m512 sf645 = _mm512_loadu_ps(sfPtr7+409600+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf646 = _mm512_loadu_ps(sfPtr7+409728+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1359 = _mm512_shuffle_f32x4(sf645, sf646, 68);
__m512 in1360 = _mm512_shuffle_f32x4(sf645, sf646, 238);
__m512 sf647 = _mm512_loadu_ps(sfPtr7+409664+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf648 = _mm512_loadu_ps(sfPtr7+409792+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1367 = _mm512_shuffle_f32x4(sf647, sf648, 68);
__m512 in1368 = _mm512_shuffle_f32x4(sf647, sf648, 238);
__m512 sf649 = _mm512_loadu_ps(sfPtr7+819200+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf650 = _mm512_loadu_ps(sfPtr7+819328+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1361 = _mm512_shuffle_f32x4(sf649, sf650, 68);
__m512 in1362 = _mm512_shuffle_f32x4(sf649, sf650, 238);
__m512 sf651 = _mm512_loadu_ps(sfPtr7+819264+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf652 = _mm512_loadu_ps(sfPtr7+819392+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1369 = _mm512_shuffle_f32x4(sf651, sf652, 68);
__m512 in1370 = _mm512_shuffle_f32x4(sf651, sf652, 238);
__m512 sf653 = _mm512_loadu_ps(sfPtr7+1228800+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf654 = _mm512_loadu_ps(sfPtr7+1228928+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1363 = _mm512_shuffle_f32x4(sf653, sf654, 68);
__m512 in1364 = _mm512_shuffle_f32x4(sf653, sf654, 238);
__m512 sf655 = _mm512_loadu_ps(sfPtr7+1228864+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf656 = _mm512_loadu_ps(sfPtr7+1228992+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1371 = _mm512_shuffle_f32x4(sf655, sf656, 68);
__m512 in1372 = _mm512_shuffle_f32x4(sf655, sf656, 238);
// First pass. For each of two independent groups of eight inputs x0..x7
// (in1357..in1364 and in1365..in1372) compute six results:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// The statements of the two groups are interleaved line by line. The sums
// are built from fused multiply-adds, so rounding depends on this exact
// statement order.
// NOTE(review): the coefficients look like a Winograd-style output transform
// (8 values in, 6 out) — confirm against the generator.
__m512 tmp9281 = _mm512_add_ps(in1358, in1359);
__m512 tmp9301 = _mm512_add_ps(in1366, in1367);
__m512 tmp9280 = _mm512_add_ps(in1360, in1361);
__m512 tmp9300 = _mm512_add_ps(in1368, in1369);
__m512 tmp9286 = _mm512_sub_ps(in1360, in1361);
__m512 tmp9306 = _mm512_sub_ps(in1368, in1369);
__m512 tmp9285 = _mm512_sub_ps(in1358, in1359);
__m512 tmp9305 = _mm512_sub_ps(in1366, in1367);
__m512 tmp9282 = _mm512_add_ps(in1362, in1363);
__m512 tmp9302 = _mm512_add_ps(in1370, in1371);
__m512 tmp9287 = _mm512_sub_ps(in1362, in1363);
__m512 tmp9307 = _mm512_sub_ps(in1370, in1371);
__m512 tmp9284 = _mm512_fmadd_ps(tmp9286, _mm512_set1_ps(2e+00f), tmp9285);
__m512 tmp9304 = _mm512_fmadd_ps(tmp9306, _mm512_set1_ps(2e+00f), tmp9305);
__m512 tmp9291 = _mm512_fmadd_ps(tmp9286, _mm512_set1_ps(8e+00f), tmp9285);
__m512 tmp9311 = _mm512_fmadd_ps(tmp9306, _mm512_set1_ps(8e+00f), tmp9305);
__m512 tmp9279 = _mm512_add_ps(tmp9280, tmp9281);
__m512 tmp9299 = _mm512_add_ps(tmp9300, tmp9301);
__m512 tmp9283 = _mm512_fmadd_ps(tmp9287, _mm512_set1_ps(1.6e+01f), tmp9284);
__m512 tmp9303 = _mm512_fmadd_ps(tmp9307, _mm512_set1_ps(1.6e+01f), tmp9304);
__m512 tmp9290 = _mm512_fmadd_ps(tmp9287, _mm512_set1_ps(4e+00f), tmp9291);
__m512 tmp9310 = _mm512_fmadd_ps(tmp9307, _mm512_set1_ps(4e+00f), tmp9311);
__m512 tmp9296 = _mm512_add_ps(tmp9287, tmp9285);
__m512 tmp9316 = _mm512_add_ps(tmp9307, tmp9305);
__m512 tmp9289 = _mm512_fmadd_ps(tmp9280, _mm512_set1_ps(4e+00f), tmp9281);
__m512 tmp9309 = _mm512_fmadd_ps(tmp9300, _mm512_set1_ps(4e+00f), tmp9301);
__m512 tmp9293 = _mm512_fmadd_ps(tmp9280, _mm512_set1_ps(1.6e+01f), tmp9281);
__m512 tmp9313 = _mm512_fmadd_ps(tmp9300, _mm512_set1_ps(1.6e+01f), tmp9301);
__m512 tmp9278 = _mm512_add_ps(tmp9279, in1357);
__m512 tmp9298 = _mm512_add_ps(tmp9299, in1365);
__m512 tmp9295 = _mm512_add_ps(tmp9296, in1364);
__m512 tmp9315 = _mm512_add_ps(tmp9316, in1372);
__m512 tmp9277 = _mm512_fmadd_ps(tmp9282, _mm512_set1_ps(3.2e+01f), tmp9278);
__m512 tmp9297 = _mm512_fmadd_ps(tmp9302, _mm512_set1_ps(3.2e+01f), tmp9298);
__m512 tmp9288 = _mm512_fmadd_ps(tmp9282, _mm512_set1_ps(8e+00f), tmp9289);
__m512 tmp9308 = _mm512_fmadd_ps(tmp9302, _mm512_set1_ps(8e+00f), tmp9309);
__m512 tmp9294 = _mm512_fmadd_ps(tmp9286, _mm512_set1_ps(3.2e+01f), tmp9295);
__m512 tmp9314 = _mm512_fmadd_ps(tmp9306, _mm512_set1_ps(3.2e+01f), tmp9315);
__m512 tmp9292 = _mm512_fmadd_ps(tmp9282, _mm512_set1_ps(2e+00f), tmp9293);
__m512 tmp9312 = _mm512_fmadd_ps(tmp9302, _mm512_set1_ps(2e+00f), tmp9313);
// Collect the first-pass results: y0..y5 of the first group in
// tmp9265..tmp9270, y0..y5 of the second group in tmp9271..tmp9276.
__m512 tmp9265 = tmp9277;
__m512 tmp9271 = tmp9297;
__m512 tmp9266 = tmp9283;
__m512 tmp9272 = tmp9303;
__m512 tmp9267 = tmp9288;
__m512 tmp9273 = tmp9308;
__m512 tmp9268 = tmp9290;
__m512 tmp9274 = tmp9310;
__m512 tmp9269 = tmp9292;
__m512 tmp9275 = tmp9312;
__m512 tmp9270 = tmp9294;
__m512 tmp9276 = tmp9314;
// Lane permutation network over the twelve vectors: 32-bit interleave
// (unpacklo/unpackhi), 64-bit pair selection (shuffle_ps 68/238), then two
// rounds of 128-bit lane selection (shuffle_f32x4 136 = lanes 0,2 of each
// operand; 221 = lanes 1,3 of each operand).
// NOTE(review): presumably a transpose so the second pass runs along the
// other axis of the tile — confirm.
__m512 tmp9361 = _mm512_unpacklo_ps(tmp9265, tmp9266);
__m512 tmp9362 = _mm512_unpackhi_ps(tmp9265, tmp9266);
__m512 tmp9363 = _mm512_unpacklo_ps(tmp9267, tmp9268);
__m512 tmp9364 = _mm512_unpackhi_ps(tmp9267, tmp9268);
__m512 tmp9365 = _mm512_unpacklo_ps(tmp9269, tmp9270);
__m512 tmp9366 = _mm512_unpackhi_ps(tmp9269, tmp9270);
__m512 tmp9367 = _mm512_unpacklo_ps(tmp9271, tmp9272);
__m512 tmp9368 = _mm512_unpackhi_ps(tmp9271, tmp9272);
__m512 tmp9369 = _mm512_unpacklo_ps(tmp9273, tmp9274);
__m512 tmp9370 = _mm512_unpackhi_ps(tmp9273, tmp9274);
__m512 tmp9371 = _mm512_unpacklo_ps(tmp9275, tmp9276);
__m512 tmp9372 = _mm512_unpackhi_ps(tmp9275, tmp9276);
__m512 tmp9373 = _mm512_shuffle_ps(tmp9361, tmp9363, 68);
__m512 tmp9374 = _mm512_shuffle_ps(tmp9361, tmp9363, 238);
__m512 tmp9375 = _mm512_shuffle_ps(tmp9362, tmp9364, 68);
__m512 tmp9376 = _mm512_shuffle_ps(tmp9362, tmp9364, 238);
__m512 tmp9377 = _mm512_shuffle_ps(tmp9365, tmp9367, 68);
__m512 tmp9378 = _mm512_shuffle_ps(tmp9365, tmp9367, 238);
__m512 tmp9379 = _mm512_shuffle_ps(tmp9366, tmp9368, 68);
__m512 tmp9380 = _mm512_shuffle_ps(tmp9366, tmp9368, 238);
__m512 tmp9381 = _mm512_shuffle_ps(tmp9369, tmp9371, 68);
__m512 tmp9382 = _mm512_shuffle_ps(tmp9369, tmp9371, 238);
__m512 tmp9383 = _mm512_shuffle_ps(tmp9370, tmp9372, 68);
__m512 tmp9384 = _mm512_shuffle_ps(tmp9370, tmp9372, 238);
__m512 tmp9385 = _mm512_shuffle_f32x4(tmp9373, tmp9377, 136);
__m512 tmp9386 = _mm512_shuffle_f32x4(tmp9373, tmp9377, 221);
__m512 tmp9387 = _mm512_shuffle_f32x4(tmp9374, tmp9378, 136);
__m512 tmp9388 = _mm512_shuffle_f32x4(tmp9374, tmp9378, 221);
__m512 tmp9389 = _mm512_shuffle_f32x4(tmp9375, tmp9379, 136);
__m512 tmp9390 = _mm512_shuffle_f32x4(tmp9375, tmp9379, 221);
__m512 tmp9391 = _mm512_shuffle_f32x4(tmp9376, tmp9380, 136);
__m512 tmp9392 = _mm512_shuffle_f32x4(tmp9376, tmp9380, 221);
// The last four source vectors have no partner, so each is shuffled with itself.
__m512 tmp9393 = _mm512_shuffle_f32x4(tmp9381, tmp9381, 136);
__m512 tmp9394 = _mm512_shuffle_f32x4(tmp9381, tmp9381, 221);
__m512 tmp9395 = _mm512_shuffle_f32x4(tmp9382, tmp9382, 136);
__m512 tmp9396 = _mm512_shuffle_f32x4(tmp9382, tmp9382, 221);
__m512 tmp9397 = _mm512_shuffle_f32x4(tmp9383, tmp9383, 136);
__m512 tmp9398 = _mm512_shuffle_f32x4(tmp9383, tmp9383, 221);
__m512 tmp9399 = _mm512_shuffle_f32x4(tmp9384, tmp9384, 136);
__m512 tmp9400 = _mm512_shuffle_f32x4(tmp9384, tmp9384, 221);
// Final round: sixteen permuted vectors. Twelve overwrite tmp9265..tmp9276,
// the remaining four are new (tmp9317..tmp9320).
tmp9265 = _mm512_shuffle_f32x4(tmp9385, tmp9393, 136);
tmp9273 = _mm512_shuffle_f32x4(tmp9385, tmp9393, 221);
tmp9266 = _mm512_shuffle_f32x4(tmp9387, tmp9395, 136);
tmp9274 = _mm512_shuffle_f32x4(tmp9387, tmp9395, 221);
tmp9267 = _mm512_shuffle_f32x4(tmp9389, tmp9397, 136);
tmp9275 = _mm512_shuffle_f32x4(tmp9389, tmp9397, 221);
tmp9268 = _mm512_shuffle_f32x4(tmp9391, tmp9399, 136);
tmp9276 = _mm512_shuffle_f32x4(tmp9391, tmp9399, 221);
tmp9269 = _mm512_shuffle_f32x4(tmp9386, tmp9394, 136);
__m512 tmp9317 = _mm512_shuffle_f32x4(tmp9386, tmp9394, 221);
tmp9270 = _mm512_shuffle_f32x4(tmp9388, tmp9396, 136);
__m512 tmp9318 = _mm512_shuffle_f32x4(tmp9388, tmp9396, 221);
tmp9271 = _mm512_shuffle_f32x4(tmp9390, tmp9398, 136);
__m512 tmp9319 = _mm512_shuffle_f32x4(tmp9390, tmp9398, 221);
tmp9272 = _mm512_shuffle_f32x4(tmp9392, tmp9400, 136);
__m512 tmp9320 = _mm512_shuffle_f32x4(tmp9392, tmp9400, 221);
// Second pass: the same six formulas as the first pass, applied to
//   group one: x0..x7 = tmp9265..tmp9272
//   group two: x0..x7 = tmp9273..tmp9276, tmp9317..tmp9320
__m512 tmp9325 = _mm512_add_ps(tmp9266, tmp9267);
__m512 tmp9345 = _mm512_add_ps(tmp9274, tmp9275);
__m512 tmp9324 = _mm512_add_ps(tmp9268, tmp9269);
__m512 tmp9344 = _mm512_add_ps(tmp9276, tmp9317);
__m512 tmp9330 = _mm512_sub_ps(tmp9268, tmp9269);
__m512 tmp9350 = _mm512_sub_ps(tmp9276, tmp9317);
__m512 tmp9329 = _mm512_sub_ps(tmp9266, tmp9267);
__m512 tmp9349 = _mm512_sub_ps(tmp9274, tmp9275);
__m512 tmp9326 = _mm512_add_ps(tmp9270, tmp9271);
__m512 tmp9346 = _mm512_add_ps(tmp9318, tmp9319);
__m512 tmp9331 = _mm512_sub_ps(tmp9270, tmp9271);
__m512 tmp9351 = _mm512_sub_ps(tmp9318, tmp9319);
__m512 tmp9328 = _mm512_fmadd_ps(tmp9330, _mm512_set1_ps(2e+00f), tmp9329);
__m512 tmp9348 = _mm512_fmadd_ps(tmp9350, _mm512_set1_ps(2e+00f), tmp9349);
__m512 tmp9335 = _mm512_fmadd_ps(tmp9330, _mm512_set1_ps(8e+00f), tmp9329);
__m512 tmp9355 = _mm512_fmadd_ps(tmp9350, _mm512_set1_ps(8e+00f), tmp9349);
__m512 tmp9323 = _mm512_add_ps(tmp9324, tmp9325);
__m512 tmp9343 = _mm512_add_ps(tmp9344, tmp9345);
__m512 tmp9327 = _mm512_fmadd_ps(tmp9331, _mm512_set1_ps(1.6e+01f), tmp9328);
__m512 tmp9347 = _mm512_fmadd_ps(tmp9351, _mm512_set1_ps(1.6e+01f), tmp9348);
__m512 tmp9334 = _mm512_fmadd_ps(tmp9331, _mm512_set1_ps(4e+00f), tmp9335);
__m512 tmp9354 = _mm512_fmadd_ps(tmp9351, _mm512_set1_ps(4e+00f), tmp9355);
__m512 tmp9340 = _mm512_add_ps(tmp9331, tmp9329);
__m512 tmp9360 = _mm512_add_ps(tmp9351, tmp9349);
__m512 tmp9333 = _mm512_fmadd_ps(tmp9324, _mm512_set1_ps(4e+00f), tmp9325);
__m512 tmp9353 = _mm512_fmadd_ps(tmp9344, _mm512_set1_ps(4e+00f), tmp9345);
__m512 tmp9337 = _mm512_fmadd_ps(tmp9324, _mm512_set1_ps(1.6e+01f), tmp9325);
__m512 tmp9357 = _mm512_fmadd_ps(tmp9344, _mm512_set1_ps(1.6e+01f), tmp9345);
__m512 tmp9322 = _mm512_add_ps(tmp9323, tmp9265);
__m512 tmp9342 = _mm512_add_ps(tmp9343, tmp9273);
__m512 tmp9339 = _mm512_add_ps(tmp9340, tmp9272);
__m512 tmp9359 = _mm512_add_ps(tmp9360, tmp9320);
__m512 tmp9321 = _mm512_fmadd_ps(tmp9326, _mm512_set1_ps(3.2e+01f), tmp9322);
__m512 tmp9341 = _mm512_fmadd_ps(tmp9346, _mm512_set1_ps(3.2e+01f), tmp9342);
__m512 tmp9332 = _mm512_fmadd_ps(tmp9326, _mm512_set1_ps(8e+00f), tmp9333);
__m512 tmp9352 = _mm512_fmadd_ps(tmp9346, _mm512_set1_ps(8e+00f), tmp9353);
__m512 tmp9338 = _mm512_fmadd_ps(tmp9330, _mm512_set1_ps(3.2e+01f), tmp9339);
__m512 tmp9358 = _mm512_fmadd_ps(tmp9350, _mm512_set1_ps(3.2e+01f), tmp9359);
__m512 tmp9336 = _mm512_fmadd_ps(tmp9326, _mm512_set1_ps(2e+00f), tmp9337);
__m512 tmp9356 = _mm512_fmadd_ps(tmp9346, _mm512_set1_ps(2e+00f), tmp9357);
// Name the twelve second-pass results: out1287..out1292 are y0..y5 of group
// one, out1293..out1298 are y0..y5 of group two.
__m512 out1287 = tmp9321;
__m512 out1293 = tmp9341;
__m512 out1288 = tmp9327;
__m512 out1294 = tmp9347;
__m512 out1289 = tmp9332;
__m512 out1295 = tmp9352;
__m512 out1290 = tmp9334;
__m512 out1296 = tmp9354;
__m512 out1291 = tmp9336;
__m512 out1297 = tmp9356;
__m512 out1292 = tmp9338;
__m512 out1298 = tmp9358;
// ReLU: elementwise max against zero.
out1287 = _mm512_max_ps(_mm512_setzero_ps(), out1287);
out1293 = _mm512_max_ps(_mm512_setzero_ps(), out1293);
out1288 = _mm512_max_ps(_mm512_setzero_ps(), out1288);
out1294 = _mm512_max_ps(_mm512_setzero_ps(), out1294);
out1289 = _mm512_max_ps(_mm512_setzero_ps(), out1289);
out1295 = _mm512_max_ps(_mm512_setzero_ps(), out1295);
out1290 = _mm512_max_ps(_mm512_setzero_ps(), out1290);
out1296 = _mm512_max_ps(_mm512_setzero_ps(), out1296);
out1291 = _mm512_max_ps(_mm512_setzero_ps(), out1291);
out1297 = _mm512_max_ps(_mm512_setzero_ps(), out1297);
out1292 = _mm512_max_ps(_mm512_setzero_ps(), out1292);
out1298 = _mm512_max_ps(_mm512_setzero_ps(), out1298);
// Masked stores: mask 4095 (0x0FFF) writes only lanes 0..11 of each vector.
// out1287..out1292 land at offsets 0, 224, 448, 672, 896, 1120 (step 224);
// out1293..out1298 land 48 past each of those.
_mm512_mask_storeu_ps(datPtr13+0+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1287);
_mm512_mask_storeu_ps(datPtr13+48+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1293);
_mm512_mask_storeu_ps(datPtr13+224+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1288);
_mm512_mask_storeu_ps(datPtr13+272+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1294);
_mm512_mask_storeu_ps(datPtr13+448+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1289);
_mm512_mask_storeu_ps(datPtr13+496+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1295);
_mm512_mask_storeu_ps(datPtr13+672+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1290);
_mm512_mask_storeu_ps(datPtr13+720+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1296);
_mm512_mask_storeu_ps(datPtr13+896+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1291);
_mm512_mask_storeu_ps(datPtr13+944+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1297);
_mm512_mask_storeu_ps(datPtr13+1120+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1292);
_mm512_mask_storeu_ps(datPtr13+1168+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1298);
// ---- Tile at offsets +256..+448 of the sfPtr7 slice for (i28, j23, k95, l34) ----
// Sixteen 512-bit loads: four per region, at +256, +320, +384 and +448 from
// region bases +0, +409600, +819200 and +1228800. Twelve masked rows are
// written to datPtr13 at the end of the tile.
// NOTE(review): consecutive 16-float loads are 64 apart, which suggests the
// offsets are byte counts on char-typed pointers — confirm against the
// declarations of sfPtr7/datPtr13.
//
// Each pair of loads (a, b) is regrouped with _mm512_shuffle_f32x4:
//   immediate 68  -> {a[127:0], a[255:128], b[127:0], b[255:128]}  (low halves)
//   immediate 238 -> {a[383:256], a[511:384], b[383:256], b[511:384]} (high halves)
__m512 sf657 = _mm512_loadu_ps(sfPtr7+256+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf658 = _mm512_loadu_ps(sfPtr7+384+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1373 = _mm512_shuffle_f32x4(sf657, sf658, 68);
__m512 in1374 = _mm512_shuffle_f32x4(sf657, sf658, 238);
__m512 sf659 = _mm512_loadu_ps(sfPtr7+320+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf660 = _mm512_loadu_ps(sfPtr7+448+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1381 = _mm512_shuffle_f32x4(sf659, sf660, 68);
__m512 in1382 = _mm512_shuffle_f32x4(sf659, sf660, 238);
__m512 sf661 = _mm512_loadu_ps(sfPtr7+409856+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf662 = _mm512_loadu_ps(sfPtr7+409984+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1375 = _mm512_shuffle_f32x4(sf661, sf662, 68);
__m512 in1376 = _mm512_shuffle_f32x4(sf661, sf662, 238);
__m512 sf663 = _mm512_loadu_ps(sfPtr7+409920+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf664 = _mm512_loadu_ps(sfPtr7+410048+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1383 = _mm512_shuffle_f32x4(sf663, sf664, 68);
__m512 in1384 = _mm512_shuffle_f32x4(sf663, sf664, 238);
__m512 sf665 = _mm512_loadu_ps(sfPtr7+819456+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf666 = _mm512_loadu_ps(sfPtr7+819584+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1377 = _mm512_shuffle_f32x4(sf665, sf666, 68);
__m512 in1378 = _mm512_shuffle_f32x4(sf665, sf666, 238);
__m512 sf667 = _mm512_loadu_ps(sfPtr7+819520+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf668 = _mm512_loadu_ps(sfPtr7+819648+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1385 = _mm512_shuffle_f32x4(sf667, sf668, 68);
__m512 in1386 = _mm512_shuffle_f32x4(sf667, sf668, 238);
__m512 sf669 = _mm512_loadu_ps(sfPtr7+1229056+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf670 = _mm512_loadu_ps(sfPtr7+1229184+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1379 = _mm512_shuffle_f32x4(sf669, sf670, 68);
__m512 in1380 = _mm512_shuffle_f32x4(sf669, sf670, 238);
__m512 sf671 = _mm512_loadu_ps(sfPtr7+1229120+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf672 = _mm512_loadu_ps(sfPtr7+1229248+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1387 = _mm512_shuffle_f32x4(sf671, sf672, 68);
__m512 in1388 = _mm512_shuffle_f32x4(sf671, sf672, 238);
// First pass. For each of two independent groups of eight inputs x0..x7
// (in1373..in1380 and in1381..in1388) compute six results:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// The statements of the two groups are interleaved line by line. The sums
// are built from fused multiply-adds, so rounding depends on this exact
// statement order.
// NOTE(review): the coefficients look like a Winograd-style output transform
// (8 values in, 6 out) — confirm against the generator.
__m512 tmp9417 = _mm512_add_ps(in1374, in1375);
__m512 tmp9437 = _mm512_add_ps(in1382, in1383);
__m512 tmp9416 = _mm512_add_ps(in1376, in1377);
__m512 tmp9436 = _mm512_add_ps(in1384, in1385);
__m512 tmp9422 = _mm512_sub_ps(in1376, in1377);
__m512 tmp9442 = _mm512_sub_ps(in1384, in1385);
__m512 tmp9421 = _mm512_sub_ps(in1374, in1375);
__m512 tmp9441 = _mm512_sub_ps(in1382, in1383);
__m512 tmp9418 = _mm512_add_ps(in1378, in1379);
__m512 tmp9438 = _mm512_add_ps(in1386, in1387);
__m512 tmp9423 = _mm512_sub_ps(in1378, in1379);
__m512 tmp9443 = _mm512_sub_ps(in1386, in1387);
__m512 tmp9420 = _mm512_fmadd_ps(tmp9422, _mm512_set1_ps(2e+00f), tmp9421);
__m512 tmp9440 = _mm512_fmadd_ps(tmp9442, _mm512_set1_ps(2e+00f), tmp9441);
__m512 tmp9427 = _mm512_fmadd_ps(tmp9422, _mm512_set1_ps(8e+00f), tmp9421);
__m512 tmp9447 = _mm512_fmadd_ps(tmp9442, _mm512_set1_ps(8e+00f), tmp9441);
__m512 tmp9415 = _mm512_add_ps(tmp9416, tmp9417);
__m512 tmp9435 = _mm512_add_ps(tmp9436, tmp9437);
__m512 tmp9419 = _mm512_fmadd_ps(tmp9423, _mm512_set1_ps(1.6e+01f), tmp9420);
__m512 tmp9439 = _mm512_fmadd_ps(tmp9443, _mm512_set1_ps(1.6e+01f), tmp9440);
__m512 tmp9426 = _mm512_fmadd_ps(tmp9423, _mm512_set1_ps(4e+00f), tmp9427);
__m512 tmp9446 = _mm512_fmadd_ps(tmp9443, _mm512_set1_ps(4e+00f), tmp9447);
__m512 tmp9432 = _mm512_add_ps(tmp9423, tmp9421);
__m512 tmp9452 = _mm512_add_ps(tmp9443, tmp9441);
__m512 tmp9425 = _mm512_fmadd_ps(tmp9416, _mm512_set1_ps(4e+00f), tmp9417);
__m512 tmp9445 = _mm512_fmadd_ps(tmp9436, _mm512_set1_ps(4e+00f), tmp9437);
__m512 tmp9429 = _mm512_fmadd_ps(tmp9416, _mm512_set1_ps(1.6e+01f), tmp9417);
__m512 tmp9449 = _mm512_fmadd_ps(tmp9436, _mm512_set1_ps(1.6e+01f), tmp9437);
__m512 tmp9414 = _mm512_add_ps(tmp9415, in1373);
__m512 tmp9434 = _mm512_add_ps(tmp9435, in1381);
__m512 tmp9431 = _mm512_add_ps(tmp9432, in1380);
__m512 tmp9451 = _mm512_add_ps(tmp9452, in1388);
__m512 tmp9413 = _mm512_fmadd_ps(tmp9418, _mm512_set1_ps(3.2e+01f), tmp9414);
__m512 tmp9433 = _mm512_fmadd_ps(tmp9438, _mm512_set1_ps(3.2e+01f), tmp9434);
__m512 tmp9424 = _mm512_fmadd_ps(tmp9418, _mm512_set1_ps(8e+00f), tmp9425);
__m512 tmp9444 = _mm512_fmadd_ps(tmp9438, _mm512_set1_ps(8e+00f), tmp9445);
__m512 tmp9430 = _mm512_fmadd_ps(tmp9422, _mm512_set1_ps(3.2e+01f), tmp9431);
__m512 tmp9450 = _mm512_fmadd_ps(tmp9442, _mm512_set1_ps(3.2e+01f), tmp9451);
__m512 tmp9428 = _mm512_fmadd_ps(tmp9418, _mm512_set1_ps(2e+00f), tmp9429);
__m512 tmp9448 = _mm512_fmadd_ps(tmp9438, _mm512_set1_ps(2e+00f), tmp9449);
// Collect the first-pass results: y0..y5 of the first group in
// tmp9401..tmp9406, y0..y5 of the second group in tmp9407..tmp9412.
__m512 tmp9401 = tmp9413;
__m512 tmp9407 = tmp9433;
__m512 tmp9402 = tmp9419;
__m512 tmp9408 = tmp9439;
__m512 tmp9403 = tmp9424;
__m512 tmp9409 = tmp9444;
__m512 tmp9404 = tmp9426;
__m512 tmp9410 = tmp9446;
__m512 tmp9405 = tmp9428;
__m512 tmp9411 = tmp9448;
__m512 tmp9406 = tmp9430;
__m512 tmp9412 = tmp9450;
// Lane permutation network over the twelve vectors: 32-bit interleave
// (unpacklo/unpackhi), 64-bit pair selection (shuffle_ps 68/238), then two
// rounds of 128-bit lane selection (shuffle_f32x4 136 = lanes 0,2 of each
// operand; 221 = lanes 1,3 of each operand).
// NOTE(review): presumably a transpose so the second pass runs along the
// other axis of the tile — confirm.
__m512 tmp9497 = _mm512_unpacklo_ps(tmp9401, tmp9402);
__m512 tmp9498 = _mm512_unpackhi_ps(tmp9401, tmp9402);
__m512 tmp9499 = _mm512_unpacklo_ps(tmp9403, tmp9404);
__m512 tmp9500 = _mm512_unpackhi_ps(tmp9403, tmp9404);
__m512 tmp9501 = _mm512_unpacklo_ps(tmp9405, tmp9406);
__m512 tmp9502 = _mm512_unpackhi_ps(tmp9405, tmp9406);
__m512 tmp9503 = _mm512_unpacklo_ps(tmp9407, tmp9408);
__m512 tmp9504 = _mm512_unpackhi_ps(tmp9407, tmp9408);
__m512 tmp9505 = _mm512_unpacklo_ps(tmp9409, tmp9410);
__m512 tmp9506 = _mm512_unpackhi_ps(tmp9409, tmp9410);
__m512 tmp9507 = _mm512_unpacklo_ps(tmp9411, tmp9412);
__m512 tmp9508 = _mm512_unpackhi_ps(tmp9411, tmp9412);
__m512 tmp9509 = _mm512_shuffle_ps(tmp9497, tmp9499, 68);
__m512 tmp9510 = _mm512_shuffle_ps(tmp9497, tmp9499, 238);
__m512 tmp9511 = _mm512_shuffle_ps(tmp9498, tmp9500, 68);
__m512 tmp9512 = _mm512_shuffle_ps(tmp9498, tmp9500, 238);
__m512 tmp9513 = _mm512_shuffle_ps(tmp9501, tmp9503, 68);
__m512 tmp9514 = _mm512_shuffle_ps(tmp9501, tmp9503, 238);
__m512 tmp9515 = _mm512_shuffle_ps(tmp9502, tmp9504, 68);
__m512 tmp9516 = _mm512_shuffle_ps(tmp9502, tmp9504, 238);
__m512 tmp9517 = _mm512_shuffle_ps(tmp9505, tmp9507, 68);
__m512 tmp9518 = _mm512_shuffle_ps(tmp9505, tmp9507, 238);
__m512 tmp9519 = _mm512_shuffle_ps(tmp9506, tmp9508, 68);
__m512 tmp9520 = _mm512_shuffle_ps(tmp9506, tmp9508, 238);
__m512 tmp9521 = _mm512_shuffle_f32x4(tmp9509, tmp9513, 136);
__m512 tmp9522 = _mm512_shuffle_f32x4(tmp9509, tmp9513, 221);
__m512 tmp9523 = _mm512_shuffle_f32x4(tmp9510, tmp9514, 136);
__m512 tmp9524 = _mm512_shuffle_f32x4(tmp9510, tmp9514, 221);
__m512 tmp9525 = _mm512_shuffle_f32x4(tmp9511, tmp9515, 136);
__m512 tmp9526 = _mm512_shuffle_f32x4(tmp9511, tmp9515, 221);
__m512 tmp9527 = _mm512_shuffle_f32x4(tmp9512, tmp9516, 136);
__m512 tmp9528 = _mm512_shuffle_f32x4(tmp9512, tmp9516, 221);
// The last four source vectors have no partner, so each is shuffled with itself.
__m512 tmp9529 = _mm512_shuffle_f32x4(tmp9517, tmp9517, 136);
__m512 tmp9530 = _mm512_shuffle_f32x4(tmp9517, tmp9517, 221);
__m512 tmp9531 = _mm512_shuffle_f32x4(tmp9518, tmp9518, 136);
__m512 tmp9532 = _mm512_shuffle_f32x4(tmp9518, tmp9518, 221);
__m512 tmp9533 = _mm512_shuffle_f32x4(tmp9519, tmp9519, 136);
__m512 tmp9534 = _mm512_shuffle_f32x4(tmp9519, tmp9519, 221);
__m512 tmp9535 = _mm512_shuffle_f32x4(tmp9520, tmp9520, 136);
__m512 tmp9536 = _mm512_shuffle_f32x4(tmp9520, tmp9520, 221);
// Final round: sixteen permuted vectors. Twelve overwrite tmp9401..tmp9412,
// the remaining four are new (tmp9453..tmp9456).
tmp9401 = _mm512_shuffle_f32x4(tmp9521, tmp9529, 136);
tmp9409 = _mm512_shuffle_f32x4(tmp9521, tmp9529, 221);
tmp9402 = _mm512_shuffle_f32x4(tmp9523, tmp9531, 136);
tmp9410 = _mm512_shuffle_f32x4(tmp9523, tmp9531, 221);
tmp9403 = _mm512_shuffle_f32x4(tmp9525, tmp9533, 136);
tmp9411 = _mm512_shuffle_f32x4(tmp9525, tmp9533, 221);
tmp9404 = _mm512_shuffle_f32x4(tmp9527, tmp9535, 136);
tmp9412 = _mm512_shuffle_f32x4(tmp9527, tmp9535, 221);
tmp9405 = _mm512_shuffle_f32x4(tmp9522, tmp9530, 136);
__m512 tmp9453 = _mm512_shuffle_f32x4(tmp9522, tmp9530, 221);
tmp9406 = _mm512_shuffle_f32x4(tmp9524, tmp9532, 136);
__m512 tmp9454 = _mm512_shuffle_f32x4(tmp9524, tmp9532, 221);
tmp9407 = _mm512_shuffle_f32x4(tmp9526, tmp9534, 136);
__m512 tmp9455 = _mm512_shuffle_f32x4(tmp9526, tmp9534, 221);
tmp9408 = _mm512_shuffle_f32x4(tmp9528, tmp9536, 136);
__m512 tmp9456 = _mm512_shuffle_f32x4(tmp9528, tmp9536, 221);
// Second pass: the same six formulas as the first pass, applied to
//   group one: x0..x7 = tmp9401..tmp9408
//   group two: x0..x7 = tmp9409..tmp9412, tmp9453..tmp9456
__m512 tmp9461 = _mm512_add_ps(tmp9402, tmp9403);
__m512 tmp9481 = _mm512_add_ps(tmp9410, tmp9411);
__m512 tmp9460 = _mm512_add_ps(tmp9404, tmp9405);
__m512 tmp9480 = _mm512_add_ps(tmp9412, tmp9453);
__m512 tmp9466 = _mm512_sub_ps(tmp9404, tmp9405);
__m512 tmp9486 = _mm512_sub_ps(tmp9412, tmp9453);
__m512 tmp9465 = _mm512_sub_ps(tmp9402, tmp9403);
__m512 tmp9485 = _mm512_sub_ps(tmp9410, tmp9411);
__m512 tmp9462 = _mm512_add_ps(tmp9406, tmp9407);
__m512 tmp9482 = _mm512_add_ps(tmp9454, tmp9455);
__m512 tmp9467 = _mm512_sub_ps(tmp9406, tmp9407);
__m512 tmp9487 = _mm512_sub_ps(tmp9454, tmp9455);
__m512 tmp9464 = _mm512_fmadd_ps(tmp9466, _mm512_set1_ps(2e+00f), tmp9465);
__m512 tmp9484 = _mm512_fmadd_ps(tmp9486, _mm512_set1_ps(2e+00f), tmp9485);
__m512 tmp9471 = _mm512_fmadd_ps(tmp9466, _mm512_set1_ps(8e+00f), tmp9465);
__m512 tmp9491 = _mm512_fmadd_ps(tmp9486, _mm512_set1_ps(8e+00f), tmp9485);
__m512 tmp9459 = _mm512_add_ps(tmp9460, tmp9461);
__m512 tmp9479 = _mm512_add_ps(tmp9480, tmp9481);
__m512 tmp9463 = _mm512_fmadd_ps(tmp9467, _mm512_set1_ps(1.6e+01f), tmp9464);
__m512 tmp9483 = _mm512_fmadd_ps(tmp9487, _mm512_set1_ps(1.6e+01f), tmp9484);
__m512 tmp9470 = _mm512_fmadd_ps(tmp9467, _mm512_set1_ps(4e+00f), tmp9471);
__m512 tmp9490 = _mm512_fmadd_ps(tmp9487, _mm512_set1_ps(4e+00f), tmp9491);
__m512 tmp9476 = _mm512_add_ps(tmp9467, tmp9465);
__m512 tmp9496 = _mm512_add_ps(tmp9487, tmp9485);
__m512 tmp9469 = _mm512_fmadd_ps(tmp9460, _mm512_set1_ps(4e+00f), tmp9461);
__m512 tmp9489 = _mm512_fmadd_ps(tmp9480, _mm512_set1_ps(4e+00f), tmp9481);
__m512 tmp9473 = _mm512_fmadd_ps(tmp9460, _mm512_set1_ps(1.6e+01f), tmp9461);
__m512 tmp9493 = _mm512_fmadd_ps(tmp9480, _mm512_set1_ps(1.6e+01f), tmp9481);
__m512 tmp9458 = _mm512_add_ps(tmp9459, tmp9401);
__m512 tmp9478 = _mm512_add_ps(tmp9479, tmp9409);
__m512 tmp9475 = _mm512_add_ps(tmp9476, tmp9408);
__m512 tmp9495 = _mm512_add_ps(tmp9496, tmp9456);
__m512 tmp9457 = _mm512_fmadd_ps(tmp9462, _mm512_set1_ps(3.2e+01f), tmp9458);
__m512 tmp9477 = _mm512_fmadd_ps(tmp9482, _mm512_set1_ps(3.2e+01f), tmp9478);
__m512 tmp9468 = _mm512_fmadd_ps(tmp9462, _mm512_set1_ps(8e+00f), tmp9469);
__m512 tmp9488 = _mm512_fmadd_ps(tmp9482, _mm512_set1_ps(8e+00f), tmp9489);
__m512 tmp9474 = _mm512_fmadd_ps(tmp9466, _mm512_set1_ps(3.2e+01f), tmp9475);
__m512 tmp9494 = _mm512_fmadd_ps(tmp9486, _mm512_set1_ps(3.2e+01f), tmp9495);
__m512 tmp9472 = _mm512_fmadd_ps(tmp9462, _mm512_set1_ps(2e+00f), tmp9473);
__m512 tmp9492 = _mm512_fmadd_ps(tmp9482, _mm512_set1_ps(2e+00f), tmp9493);
// Name the twelve second-pass results: out1299..out1304 are y0..y5 of group
// one, out1305..out1310 are y0..y5 of group two.
__m512 out1299 = tmp9457;
__m512 out1305 = tmp9477;
__m512 out1300 = tmp9463;
__m512 out1306 = tmp9483;
__m512 out1301 = tmp9468;
__m512 out1307 = tmp9488;
__m512 out1302 = tmp9470;
__m512 out1308 = tmp9490;
__m512 out1303 = tmp9472;
__m512 out1309 = tmp9492;
__m512 out1304 = tmp9474;
__m512 out1310 = tmp9494;
// ReLU: elementwise max against zero.
out1299 = _mm512_max_ps(_mm512_setzero_ps(), out1299);
out1305 = _mm512_max_ps(_mm512_setzero_ps(), out1305);
out1300 = _mm512_max_ps(_mm512_setzero_ps(), out1300);
out1306 = _mm512_max_ps(_mm512_setzero_ps(), out1306);
out1301 = _mm512_max_ps(_mm512_setzero_ps(), out1301);
out1307 = _mm512_max_ps(_mm512_setzero_ps(), out1307);
out1302 = _mm512_max_ps(_mm512_setzero_ps(), out1302);
out1308 = _mm512_max_ps(_mm512_setzero_ps(), out1308);
out1303 = _mm512_max_ps(_mm512_setzero_ps(), out1303);
out1309 = _mm512_max_ps(_mm512_setzero_ps(), out1309);
out1304 = _mm512_max_ps(_mm512_setzero_ps(), out1304);
out1310 = _mm512_max_ps(_mm512_setzero_ps(), out1310);
// Masked stores: mask 4095 (0x0FFF) writes only lanes 0..11 of each vector.
// out1299..out1304 land at offsets 96, 320, 544, 768, 992, 1216 (step 224);
// out1305..out1310 land at 12608, 12832, ..., 13728 (also step 224).
// NOTE(review): 12608 equals the per-l34 stride 25216 divided by two, so the
// second group presumably targets the next output plane — confirm.
_mm512_mask_storeu_ps(datPtr13+96+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1299);
_mm512_mask_storeu_ps(datPtr13+12608+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1305);
_mm512_mask_storeu_ps(datPtr13+320+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1300);
_mm512_mask_storeu_ps(datPtr13+12832+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1306);
_mm512_mask_storeu_ps(datPtr13+544+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1301);
_mm512_mask_storeu_ps(datPtr13+13056+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1307);
_mm512_mask_storeu_ps(datPtr13+768+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1302);
_mm512_mask_storeu_ps(datPtr13+13280+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1308);
_mm512_mask_storeu_ps(datPtr13+992+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1303);
_mm512_mask_storeu_ps(datPtr13+13504+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1309);
_mm512_mask_storeu_ps(datPtr13+1216+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1304);
_mm512_mask_storeu_ps(datPtr13+13728+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1310);
// Load stage: read sixteen 16-float vectors from sfPtr7 and regroup them into
// two sets of eight inputs, in1389..in1396 and in1397..in1404.
// The loads come in four groups whose constant offsets are 409600 apart
// (512, 410112, 819712, 1229312); inside a group the four loads sit at
// +0, +64, +128 and +192 from the group's start.
// For each (a, b) pair loaded 128 apart, selector 68 yields
// {a.lane0, a.lane1, b.lane0, b.lane1} and selector 238 yields
// {a.lane2, a.lane3, b.lane2, b.lane3} (128-bit lanes).
// NOTE(review): offsets are presumably bytes (64 = one 16-float vector);
// sfPtr7's declaration is outside this excerpt - confirm.
__m512 sf673 = _mm512_loadu_ps(sfPtr7+512+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf674 = _mm512_loadu_ps(sfPtr7+640+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1389 = _mm512_shuffle_f32x4(sf673, sf674, 68);
__m512 in1390 = _mm512_shuffle_f32x4(sf673, sf674, 238);
__m512 sf675 = _mm512_loadu_ps(sfPtr7+576+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf676 = _mm512_loadu_ps(sfPtr7+704+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1397 = _mm512_shuffle_f32x4(sf675, sf676, 68);
__m512 in1398 = _mm512_shuffle_f32x4(sf675, sf676, 238);
__m512 sf677 = _mm512_loadu_ps(sfPtr7+410112+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf678 = _mm512_loadu_ps(sfPtr7+410240+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1391 = _mm512_shuffle_f32x4(sf677, sf678, 68);
__m512 in1392 = _mm512_shuffle_f32x4(sf677, sf678, 238);
__m512 sf679 = _mm512_loadu_ps(sfPtr7+410176+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf680 = _mm512_loadu_ps(sfPtr7+410304+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1399 = _mm512_shuffle_f32x4(sf679, sf680, 68);
__m512 in1400 = _mm512_shuffle_f32x4(sf679, sf680, 238);
__m512 sf681 = _mm512_loadu_ps(sfPtr7+819712+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf682 = _mm512_loadu_ps(sfPtr7+819840+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1393 = _mm512_shuffle_f32x4(sf681, sf682, 68);
__m512 in1394 = _mm512_shuffle_f32x4(sf681, sf682, 238);
__m512 sf683 = _mm512_loadu_ps(sfPtr7+819776+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf684 = _mm512_loadu_ps(sfPtr7+819904+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1401 = _mm512_shuffle_f32x4(sf683, sf684, 68);
__m512 in1402 = _mm512_shuffle_f32x4(sf683, sf684, 238);
__m512 sf685 = _mm512_loadu_ps(sfPtr7+1229312+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf686 = _mm512_loadu_ps(sfPtr7+1229440+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1395 = _mm512_shuffle_f32x4(sf685, sf686, 68);
__m512 in1396 = _mm512_shuffle_f32x4(sf685, sf686, 238);
__m512 sf687 = _mm512_loadu_ps(sfPtr7+1229376+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 sf688 = _mm512_loadu_ps(sfPtr7+1229504+1638400*i28+24576*j23+1536*k95+768*l34);
__m512 in1403 = _mm512_shuffle_f32x4(sf687, sf688, 68);
__m512 in1404 = _mm512_shuffle_f32x4(sf687, sf688, 238);
// First transform pass. With x0..x7 = in1389..in1396 (and, interleaved line by
// line, the same arithmetic on in1397..in1404), compute six combinations:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)        -> tmp9549 / tmp9569
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)        -> tmp9555 / tmp9575
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)        -> tmp9560 / tmp9580
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)        -> tmp9562 / tmp9582
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)        -> tmp9564 / tmp9584
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7   -> tmp9566 / tmp9586
// The pairwise sums and differences are formed once and reused through fused
// multiply-adds (_mm512_fmadd_ps(a, b, c) computes a*b + c).
// NOTE(review): these weights match the 6x8 output-transform matrix usually
// quoted for Winograd F(6,3) - confirm against the generator.
__m512 tmp9553 = _mm512_add_ps(in1390, in1391);
__m512 tmp9573 = _mm512_add_ps(in1398, in1399);
__m512 tmp9552 = _mm512_add_ps(in1392, in1393);
__m512 tmp9572 = _mm512_add_ps(in1400, in1401);
__m512 tmp9558 = _mm512_sub_ps(in1392, in1393);
__m512 tmp9578 = _mm512_sub_ps(in1400, in1401);
__m512 tmp9557 = _mm512_sub_ps(in1390, in1391);
__m512 tmp9577 = _mm512_sub_ps(in1398, in1399);
__m512 tmp9554 = _mm512_add_ps(in1394, in1395);
__m512 tmp9574 = _mm512_add_ps(in1402, in1403);
__m512 tmp9559 = _mm512_sub_ps(in1394, in1395);
__m512 tmp9579 = _mm512_sub_ps(in1402, in1403);
__m512 tmp9556 = _mm512_fmadd_ps(tmp9558, _mm512_set1_ps(2e+00f), tmp9557);
__m512 tmp9576 = _mm512_fmadd_ps(tmp9578, _mm512_set1_ps(2e+00f), tmp9577);
__m512 tmp9563 = _mm512_fmadd_ps(tmp9558, _mm512_set1_ps(8e+00f), tmp9557);
__m512 tmp9583 = _mm512_fmadd_ps(tmp9578, _mm512_set1_ps(8e+00f), tmp9577);
__m512 tmp9551 = _mm512_add_ps(tmp9552, tmp9553);
__m512 tmp9571 = _mm512_add_ps(tmp9572, tmp9573);
__m512 tmp9555 = _mm512_fmadd_ps(tmp9559, _mm512_set1_ps(1.6e+01f), tmp9556);
__m512 tmp9575 = _mm512_fmadd_ps(tmp9579, _mm512_set1_ps(1.6e+01f), tmp9576);
__m512 tmp9562 = _mm512_fmadd_ps(tmp9559, _mm512_set1_ps(4e+00f), tmp9563);
__m512 tmp9582 = _mm512_fmadd_ps(tmp9579, _mm512_set1_ps(4e+00f), tmp9583);
__m512 tmp9568 = _mm512_add_ps(tmp9559, tmp9557);
__m512 tmp9588 = _mm512_add_ps(tmp9579, tmp9577);
__m512 tmp9561 = _mm512_fmadd_ps(tmp9552, _mm512_set1_ps(4e+00f), tmp9553);
__m512 tmp9581 = _mm512_fmadd_ps(tmp9572, _mm512_set1_ps(4e+00f), tmp9573);
__m512 tmp9565 = _mm512_fmadd_ps(tmp9552, _mm512_set1_ps(1.6e+01f), tmp9553);
__m512 tmp9585 = _mm512_fmadd_ps(tmp9572, _mm512_set1_ps(1.6e+01f), tmp9573);
__m512 tmp9550 = _mm512_add_ps(tmp9551, in1389);
__m512 tmp9570 = _mm512_add_ps(tmp9571, in1397);
__m512 tmp9567 = _mm512_add_ps(tmp9568, in1396);
__m512 tmp9587 = _mm512_add_ps(tmp9588, in1404);
__m512 tmp9549 = _mm512_fmadd_ps(tmp9554, _mm512_set1_ps(3.2e+01f), tmp9550);
__m512 tmp9569 = _mm512_fmadd_ps(tmp9574, _mm512_set1_ps(3.2e+01f), tmp9570);
__m512 tmp9560 = _mm512_fmadd_ps(tmp9554, _mm512_set1_ps(8e+00f), tmp9561);
__m512 tmp9580 = _mm512_fmadd_ps(tmp9574, _mm512_set1_ps(8e+00f), tmp9581);
__m512 tmp9566 = _mm512_fmadd_ps(tmp9558, _mm512_set1_ps(3.2e+01f), tmp9567);
__m512 tmp9586 = _mm512_fmadd_ps(tmp9578, _mm512_set1_ps(3.2e+01f), tmp9587);
__m512 tmp9564 = _mm512_fmadd_ps(tmp9554, _mm512_set1_ps(2e+00f), tmp9565);
__m512 tmp9584 = _mm512_fmadd_ps(tmp9574, _mm512_set1_ps(2e+00f), tmp9585);
// Collect y0..y5 of the first set in tmp9537..tmp9542 and of the second set in
// tmp9543..tmp9548; these twelve names are reassigned further down.
__m512 tmp9537 = tmp9549;
__m512 tmp9543 = tmp9569;
__m512 tmp9538 = tmp9555;
__m512 tmp9544 = tmp9575;
__m512 tmp9539 = tmp9560;
__m512 tmp9545 = tmp9580;
__m512 tmp9540 = tmp9562;
__m512 tmp9546 = tmp9582;
__m512 tmp9541 = tmp9564;
__m512 tmp9547 = tmp9584;
__m512 tmp9542 = tmp9566;
__m512 tmp9548 = tmp9586;
// Transpose stage: treat tmp9537..tmp9548 as a 12-row by 16-column matrix and
// produce sixteen vectors that each hold one column (12 meaningful elements).
// Step 1: unpacklo/unpackhi interleave each adjacent pair of rows within
// every 128-bit lane.
__m512 tmp9633 = _mm512_unpacklo_ps(tmp9537, tmp9538);
__m512 tmp9634 = _mm512_unpackhi_ps(tmp9537, tmp9538);
__m512 tmp9635 = _mm512_unpacklo_ps(tmp9539, tmp9540);
__m512 tmp9636 = _mm512_unpackhi_ps(tmp9539, tmp9540);
__m512 tmp9637 = _mm512_unpacklo_ps(tmp9541, tmp9542);
__m512 tmp9638 = _mm512_unpackhi_ps(tmp9541, tmp9542);
__m512 tmp9639 = _mm512_unpacklo_ps(tmp9543, tmp9544);
__m512 tmp9640 = _mm512_unpackhi_ps(tmp9543, tmp9544);
__m512 tmp9641 = _mm512_unpacklo_ps(tmp9545, tmp9546);
__m512 tmp9642 = _mm512_unpackhi_ps(tmp9545, tmp9546);
__m512 tmp9643 = _mm512_unpacklo_ps(tmp9547, tmp9548);
__m512 tmp9644 = _mm512_unpackhi_ps(tmp9547, tmp9548);
// Step 2: shuffle_ps with 68 (low halves) / 238 (high halves) leaves, in each
// 128-bit lane, one column entry from four consecutive rows.
__m512 tmp9645 = _mm512_shuffle_ps(tmp9633, tmp9635, 68);
__m512 tmp9646 = _mm512_shuffle_ps(tmp9633, tmp9635, 238);
__m512 tmp9647 = _mm512_shuffle_ps(tmp9634, tmp9636, 68);
__m512 tmp9648 = _mm512_shuffle_ps(tmp9634, tmp9636, 238);
__m512 tmp9649 = _mm512_shuffle_ps(tmp9637, tmp9639, 68);
__m512 tmp9650 = _mm512_shuffle_ps(tmp9637, tmp9639, 238);
__m512 tmp9651 = _mm512_shuffle_ps(tmp9638, tmp9640, 68);
__m512 tmp9652 = _mm512_shuffle_ps(tmp9638, tmp9640, 238);
__m512 tmp9653 = _mm512_shuffle_ps(tmp9641, tmp9643, 68);
__m512 tmp9654 = _mm512_shuffle_ps(tmp9641, tmp9643, 238);
__m512 tmp9655 = _mm512_shuffle_ps(tmp9642, tmp9644, 68);
__m512 tmp9656 = _mm512_shuffle_ps(tmp9642, tmp9644, 238);
// Step 3: 128-bit lane shuffles. Selector 136 takes lanes 0 and 2 of each
// operand, 221 takes lanes 1 and 3. Rows 0..3 are merged with rows 4..7 here.
__m512 tmp9657 = _mm512_shuffle_f32x4(tmp9645, tmp9649, 136);
__m512 tmp9658 = _mm512_shuffle_f32x4(tmp9645, tmp9649, 221);
__m512 tmp9659 = _mm512_shuffle_f32x4(tmp9646, tmp9650, 136);
__m512 tmp9660 = _mm512_shuffle_f32x4(tmp9646, tmp9650, 221);
__m512 tmp9661 = _mm512_shuffle_f32x4(tmp9647, tmp9651, 136);
__m512 tmp9662 = _mm512_shuffle_f32x4(tmp9647, tmp9651, 221);
__m512 tmp9663 = _mm512_shuffle_f32x4(tmp9648, tmp9652, 136);
__m512 tmp9664 = _mm512_shuffle_f32x4(tmp9648, tmp9652, 221);
// Rows 8..11 have no partner rows, so the same vector is passed as both
// operands; the upper half of each result duplicates the lower half.
__m512 tmp9665 = _mm512_shuffle_f32x4(tmp9653, tmp9653, 136);
__m512 tmp9666 = _mm512_shuffle_f32x4(tmp9653, tmp9653, 221);
__m512 tmp9667 = _mm512_shuffle_f32x4(tmp9654, tmp9654, 136);
__m512 tmp9668 = _mm512_shuffle_f32x4(tmp9654, tmp9654, 221);
__m512 tmp9669 = _mm512_shuffle_f32x4(tmp9655, tmp9655, 136);
__m512 tmp9670 = _mm512_shuffle_f32x4(tmp9655, tmp9655, 221);
__m512 tmp9671 = _mm512_shuffle_f32x4(tmp9656, tmp9656, 136);
__m512 tmp9672 = _mm512_shuffle_f32x4(tmp9656, tmp9656, 221);
// Step 4: final lane shuffles. Columns 0..7 end up in tmp9537..tmp9544,
// columns 8..11 in tmp9545..tmp9548 and columns 12..15 in the new
// tmp9589..tmp9592. In every result, elements 12..15 repeat elements 8..11.
tmp9537 = _mm512_shuffle_f32x4(tmp9657, tmp9665, 136);
tmp9545 = _mm512_shuffle_f32x4(tmp9657, tmp9665, 221);
tmp9538 = _mm512_shuffle_f32x4(tmp9659, tmp9667, 136);
tmp9546 = _mm512_shuffle_f32x4(tmp9659, tmp9667, 221);
tmp9539 = _mm512_shuffle_f32x4(tmp9661, tmp9669, 136);
tmp9547 = _mm512_shuffle_f32x4(tmp9661, tmp9669, 221);
tmp9540 = _mm512_shuffle_f32x4(tmp9663, tmp9671, 136);
tmp9548 = _mm512_shuffle_f32x4(tmp9663, tmp9671, 221);
tmp9541 = _mm512_shuffle_f32x4(tmp9658, tmp9666, 136);
__m512 tmp9589 = _mm512_shuffle_f32x4(tmp9658, tmp9666, 221);
tmp9542 = _mm512_shuffle_f32x4(tmp9660, tmp9668, 136);
__m512 tmp9590 = _mm512_shuffle_f32x4(tmp9660, tmp9668, 221);
tmp9543 = _mm512_shuffle_f32x4(tmp9662, tmp9670, 136);
__m512 tmp9591 = _mm512_shuffle_f32x4(tmp9662, tmp9670, 221);
tmp9544 = _mm512_shuffle_f32x4(tmp9664, tmp9672, 136);
__m512 tmp9592 = _mm512_shuffle_f32x4(tmp9664, tmp9672, 221);
// Second transform pass, applied to the vectors produced by the shuffle
// network above. With x0..x7 = tmp9537..tmp9544 (and, interleaved, x0..x7 =
// tmp9545, tmp9546, tmp9547, tmp9548, tmp9589, tmp9590, tmp9591, tmp9592):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)        -> tmp9593 / tmp9613
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)        -> tmp9599 / tmp9619
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)        -> tmp9604 / tmp9624
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)        -> tmp9606 / tmp9626
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)        -> tmp9608 / tmp9628
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7   -> tmp9610 / tmp9630
// _mm512_fmadd_ps(a, b, c) computes a*b + c.
__m512 tmp9597 = _mm512_add_ps(tmp9538, tmp9539);
__m512 tmp9617 = _mm512_add_ps(tmp9546, tmp9547);
__m512 tmp9596 = _mm512_add_ps(tmp9540, tmp9541);
__m512 tmp9616 = _mm512_add_ps(tmp9548, tmp9589);
__m512 tmp9602 = _mm512_sub_ps(tmp9540, tmp9541);
__m512 tmp9622 = _mm512_sub_ps(tmp9548, tmp9589);
__m512 tmp9601 = _mm512_sub_ps(tmp9538, tmp9539);
__m512 tmp9621 = _mm512_sub_ps(tmp9546, tmp9547);
__m512 tmp9598 = _mm512_add_ps(tmp9542, tmp9543);
__m512 tmp9618 = _mm512_add_ps(tmp9590, tmp9591);
__m512 tmp9603 = _mm512_sub_ps(tmp9542, tmp9543);
__m512 tmp9623 = _mm512_sub_ps(tmp9590, tmp9591);
__m512 tmp9600 = _mm512_fmadd_ps(tmp9602, _mm512_set1_ps(2e+00f), tmp9601);
__m512 tmp9620 = _mm512_fmadd_ps(tmp9622, _mm512_set1_ps(2e+00f), tmp9621);
__m512 tmp9607 = _mm512_fmadd_ps(tmp9602, _mm512_set1_ps(8e+00f), tmp9601);
__m512 tmp9627 = _mm512_fmadd_ps(tmp9622, _mm512_set1_ps(8e+00f), tmp9621);
__m512 tmp9595 = _mm512_add_ps(tmp9596, tmp9597);
__m512 tmp9615 = _mm512_add_ps(tmp9616, tmp9617);
__m512 tmp9599 = _mm512_fmadd_ps(tmp9603, _mm512_set1_ps(1.6e+01f), tmp9600);
__m512 tmp9619 = _mm512_fmadd_ps(tmp9623, _mm512_set1_ps(1.6e+01f), tmp9620);
__m512 tmp9606 = _mm512_fmadd_ps(tmp9603, _mm512_set1_ps(4e+00f), tmp9607);
__m512 tmp9626 = _mm512_fmadd_ps(tmp9623, _mm512_set1_ps(4e+00f), tmp9627);
__m512 tmp9612 = _mm512_add_ps(tmp9603, tmp9601);
__m512 tmp9632 = _mm512_add_ps(tmp9623, tmp9621);
__m512 tmp9605 = _mm512_fmadd_ps(tmp9596, _mm512_set1_ps(4e+00f), tmp9597);
__m512 tmp9625 = _mm512_fmadd_ps(tmp9616, _mm512_set1_ps(4e+00f), tmp9617);
__m512 tmp9609 = _mm512_fmadd_ps(tmp9596, _mm512_set1_ps(1.6e+01f), tmp9597);
__m512 tmp9629 = _mm512_fmadd_ps(tmp9616, _mm512_set1_ps(1.6e+01f), tmp9617);
__m512 tmp9594 = _mm512_add_ps(tmp9595, tmp9537);
__m512 tmp9614 = _mm512_add_ps(tmp9615, tmp9545);
__m512 tmp9611 = _mm512_add_ps(tmp9612, tmp9544);
__m512 tmp9631 = _mm512_add_ps(tmp9632, tmp9592);
__m512 tmp9593 = _mm512_fmadd_ps(tmp9598, _mm512_set1_ps(3.2e+01f), tmp9594);
__m512 tmp9613 = _mm512_fmadd_ps(tmp9618, _mm512_set1_ps(3.2e+01f), tmp9614);
__m512 tmp9604 = _mm512_fmadd_ps(tmp9598, _mm512_set1_ps(8e+00f), tmp9605);
__m512 tmp9624 = _mm512_fmadd_ps(tmp9618, _mm512_set1_ps(8e+00f), tmp9625);
__m512 tmp9610 = _mm512_fmadd_ps(tmp9602, _mm512_set1_ps(3.2e+01f), tmp9611);
__m512 tmp9630 = _mm512_fmadd_ps(tmp9622, _mm512_set1_ps(3.2e+01f), tmp9631);
__m512 tmp9608 = _mm512_fmadd_ps(tmp9598, _mm512_set1_ps(2e+00f), tmp9609);
__m512 tmp9628 = _mm512_fmadd_ps(tmp9618, _mm512_set1_ps(2e+00f), tmp9629);
// Epilogue: copy the twelve second-pass results into out1311..out1322, clamp
// them at zero, and write them out.
__m512 out1311 = tmp9593;
__m512 out1317 = tmp9613;
__m512 out1312 = tmp9599;
__m512 out1318 = tmp9619;
__m512 out1313 = tmp9604;
__m512 out1319 = tmp9624;
__m512 out1314 = tmp9606;
__m512 out1320 = tmp9626;
__m512 out1315 = tmp9608;
__m512 out1321 = tmp9628;
__m512 out1316 = tmp9610;
__m512 out1322 = tmp9630;
// ReLU: element-wise max of each vector with zero.
out1311 = _mm512_max_ps(_mm512_setzero_ps(), out1311);
out1317 = _mm512_max_ps(_mm512_setzero_ps(), out1317);
out1312 = _mm512_max_ps(_mm512_setzero_ps(), out1312);
out1318 = _mm512_max_ps(_mm512_setzero_ps(), out1318);
out1313 = _mm512_max_ps(_mm512_setzero_ps(), out1313);
out1319 = _mm512_max_ps(_mm512_setzero_ps(), out1319);
out1314 = _mm512_max_ps(_mm512_setzero_ps(), out1314);
out1320 = _mm512_max_ps(_mm512_setzero_ps(), out1320);
out1315 = _mm512_max_ps(_mm512_setzero_ps(), out1315);
out1321 = _mm512_max_ps(_mm512_setzero_ps(), out1321);
out1316 = _mm512_max_ps(_mm512_setzero_ps(), out1316);
out1322 = _mm512_max_ps(_mm512_setzero_ps(), out1322);
// Masked stores: mask 4095 (0xFFF) writes only elements 0..11 of each vector.
// out1311..out1316 go to constant offsets 12656 + 224*r and out1317..out1322
// to 12704 + 224*r (r = 0..5), i.e. each pair is 48 apart.
// NOTE(review): presumably byte offsets with 4-byte floats, so 48 would be
// twelve floats and 224 one 56-float row; datPtr13's declaration is outside
// this excerpt - confirm.
_mm512_mask_storeu_ps(datPtr13+12656+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1311);
_mm512_mask_storeu_ps(datPtr13+12704+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1317);
_mm512_mask_storeu_ps(datPtr13+12880+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1312);
_mm512_mask_storeu_ps(datPtr13+12928+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1318);
_mm512_mask_storeu_ps(datPtr13+13104+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1313);
_mm512_mask_storeu_ps(datPtr13+13152+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1319);
_mm512_mask_storeu_ps(datPtr13+13328+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1314);
_mm512_mask_storeu_ps(datPtr13+13376+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1320);
_mm512_mask_storeu_ps(datPtr13+13552+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1315);
_mm512_mask_storeu_ps(datPtr13+13600+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1321);
_mm512_mask_storeu_ps(datPtr13+13776+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1316);
_mm512_mask_storeu_ps(datPtr13+13824+806912*i28+224*toH34+4*toW34+50432*k95+25216*l34, 4095, out1322);
}
}
if (j23 >= last6) return;
++j23;
rel17 = 4;
}
// Setup for the loop nest that follows. toH35 and toW35 enter the store
// addresses below as 224*toH35 and 4*toW35.
// NOTE(review): presumably the output row and column of this tile - confirm.
ptrdiff_t toH35 = base17+12;
ptrdiff_t toW35 = 36;
// Starting value of the outer loop index; the loop below runs until k96 == 16.
ptrdiff_t k96 = 16*w46;
for (; k96 != 16; ++k96) {
ptrdiff_t l35 = 0;
for (; l35 != 2; ++l35) {
// Load stage: read sixteen 16-float vectors from sfPtr7 and regroup them into
// two sets of eight inputs, in1405..in1412 and in1413..in1420.
// The loads come in four groups whose constant offsets are 409600 apart
// (0, 409600, 819200, 1228800); inside a group the four loads sit at
// +0, +64, +128 and +192 from the group's start.
// For each (a, b) pair loaded 128 apart, selector 68 yields
// {a.lane0, a.lane1, b.lane0, b.lane1} and selector 238 yields
// {a.lane2, a.lane3, b.lane2, b.lane3} (128-bit lanes).
// NOTE(review): offsets are presumably bytes (64 = one 16-float vector);
// sfPtr7's declaration is outside this excerpt - confirm.
__m512 sf689 = _mm512_loadu_ps(sfPtr7+0+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf690 = _mm512_loadu_ps(sfPtr7+128+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1405 = _mm512_shuffle_f32x4(sf689, sf690, 68);
__m512 in1406 = _mm512_shuffle_f32x4(sf689, sf690, 238);
__m512 sf691 = _mm512_loadu_ps(sfPtr7+64+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf692 = _mm512_loadu_ps(sfPtr7+192+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1413 = _mm512_shuffle_f32x4(sf691, sf692, 68);
__m512 in1414 = _mm512_shuffle_f32x4(sf691, sf692, 238);
__m512 sf693 = _mm512_loadu_ps(sfPtr7+409600+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf694 = _mm512_loadu_ps(sfPtr7+409728+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1407 = _mm512_shuffle_f32x4(sf693, sf694, 68);
__m512 in1408 = _mm512_shuffle_f32x4(sf693, sf694, 238);
__m512 sf695 = _mm512_loadu_ps(sfPtr7+409664+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf696 = _mm512_loadu_ps(sfPtr7+409792+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1415 = _mm512_shuffle_f32x4(sf695, sf696, 68);
__m512 in1416 = _mm512_shuffle_f32x4(sf695, sf696, 238);
__m512 sf697 = _mm512_loadu_ps(sfPtr7+819200+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf698 = _mm512_loadu_ps(sfPtr7+819328+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1409 = _mm512_shuffle_f32x4(sf697, sf698, 68);
__m512 in1410 = _mm512_shuffle_f32x4(sf697, sf698, 238);
__m512 sf699 = _mm512_loadu_ps(sfPtr7+819264+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf700 = _mm512_loadu_ps(sfPtr7+819392+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1417 = _mm512_shuffle_f32x4(sf699, sf700, 68);
__m512 in1418 = _mm512_shuffle_f32x4(sf699, sf700, 238);
__m512 sf701 = _mm512_loadu_ps(sfPtr7+1228800+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf702 = _mm512_loadu_ps(sfPtr7+1228928+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1411 = _mm512_shuffle_f32x4(sf701, sf702, 68);
__m512 in1412 = _mm512_shuffle_f32x4(sf701, sf702, 238);
__m512 sf703 = _mm512_loadu_ps(sfPtr7+1228864+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf704 = _mm512_loadu_ps(sfPtr7+1228992+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1419 = _mm512_shuffle_f32x4(sf703, sf704, 68);
__m512 in1420 = _mm512_shuffle_f32x4(sf703, sf704, 238);
// First transform pass. With x0..x7 = in1405..in1412 (and, interleaved line by
// line, the same arithmetic on in1413..in1420), compute six combinations:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)        -> tmp9685 / tmp9705
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)        -> tmp9691 / tmp9711
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)        -> tmp9696 / tmp9716
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)        -> tmp9698 / tmp9718
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)        -> tmp9700 / tmp9720
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7   -> tmp9702 / tmp9722
// The pairwise sums and differences are formed once and reused through fused
// multiply-adds (_mm512_fmadd_ps(a, b, c) computes a*b + c).
// NOTE(review): these weights match the 6x8 output-transform matrix usually
// quoted for Winograd F(6,3) - confirm against the generator.
__m512 tmp9689 = _mm512_add_ps(in1406, in1407);
__m512 tmp9709 = _mm512_add_ps(in1414, in1415);
__m512 tmp9688 = _mm512_add_ps(in1408, in1409);
__m512 tmp9708 = _mm512_add_ps(in1416, in1417);
__m512 tmp9694 = _mm512_sub_ps(in1408, in1409);
__m512 tmp9714 = _mm512_sub_ps(in1416, in1417);
__m512 tmp9693 = _mm512_sub_ps(in1406, in1407);
__m512 tmp9713 = _mm512_sub_ps(in1414, in1415);
__m512 tmp9690 = _mm512_add_ps(in1410, in1411);
__m512 tmp9710 = _mm512_add_ps(in1418, in1419);
__m512 tmp9695 = _mm512_sub_ps(in1410, in1411);
__m512 tmp9715 = _mm512_sub_ps(in1418, in1419);
__m512 tmp9692 = _mm512_fmadd_ps(tmp9694, _mm512_set1_ps(2e+00f), tmp9693);
__m512 tmp9712 = _mm512_fmadd_ps(tmp9714, _mm512_set1_ps(2e+00f), tmp9713);
__m512 tmp9699 = _mm512_fmadd_ps(tmp9694, _mm512_set1_ps(8e+00f), tmp9693);
__m512 tmp9719 = _mm512_fmadd_ps(tmp9714, _mm512_set1_ps(8e+00f), tmp9713);
__m512 tmp9687 = _mm512_add_ps(tmp9688, tmp9689);
__m512 tmp9707 = _mm512_add_ps(tmp9708, tmp9709);
__m512 tmp9691 = _mm512_fmadd_ps(tmp9695, _mm512_set1_ps(1.6e+01f), tmp9692);
__m512 tmp9711 = _mm512_fmadd_ps(tmp9715, _mm512_set1_ps(1.6e+01f), tmp9712);
__m512 tmp9698 = _mm512_fmadd_ps(tmp9695, _mm512_set1_ps(4e+00f), tmp9699);
__m512 tmp9718 = _mm512_fmadd_ps(tmp9715, _mm512_set1_ps(4e+00f), tmp9719);
__m512 tmp9704 = _mm512_add_ps(tmp9695, tmp9693);
__m512 tmp9724 = _mm512_add_ps(tmp9715, tmp9713);
__m512 tmp9697 = _mm512_fmadd_ps(tmp9688, _mm512_set1_ps(4e+00f), tmp9689);
__m512 tmp9717 = _mm512_fmadd_ps(tmp9708, _mm512_set1_ps(4e+00f), tmp9709);
__m512 tmp9701 = _mm512_fmadd_ps(tmp9688, _mm512_set1_ps(1.6e+01f), tmp9689);
__m512 tmp9721 = _mm512_fmadd_ps(tmp9708, _mm512_set1_ps(1.6e+01f), tmp9709);
__m512 tmp9686 = _mm512_add_ps(tmp9687, in1405);
__m512 tmp9706 = _mm512_add_ps(tmp9707, in1413);
__m512 tmp9703 = _mm512_add_ps(tmp9704, in1412);
__m512 tmp9723 = _mm512_add_ps(tmp9724, in1420);
__m512 tmp9685 = _mm512_fmadd_ps(tmp9690, _mm512_set1_ps(3.2e+01f), tmp9686);
__m512 tmp9705 = _mm512_fmadd_ps(tmp9710, _mm512_set1_ps(3.2e+01f), tmp9706);
__m512 tmp9696 = _mm512_fmadd_ps(tmp9690, _mm512_set1_ps(8e+00f), tmp9697);
__m512 tmp9716 = _mm512_fmadd_ps(tmp9710, _mm512_set1_ps(8e+00f), tmp9717);
__m512 tmp9702 = _mm512_fmadd_ps(tmp9694, _mm512_set1_ps(3.2e+01f), tmp9703);
__m512 tmp9722 = _mm512_fmadd_ps(tmp9714, _mm512_set1_ps(3.2e+01f), tmp9723);
__m512 tmp9700 = _mm512_fmadd_ps(tmp9690, _mm512_set1_ps(2e+00f), tmp9701);
__m512 tmp9720 = _mm512_fmadd_ps(tmp9710, _mm512_set1_ps(2e+00f), tmp9721);
// Collect y0..y5 of the first set in tmp9673..tmp9678 and of the second set in
// tmp9679..tmp9684; these twelve names are reassigned further down.
__m512 tmp9673 = tmp9685;
__m512 tmp9679 = tmp9705;
__m512 tmp9674 = tmp9691;
__m512 tmp9680 = tmp9711;
__m512 tmp9675 = tmp9696;
__m512 tmp9681 = tmp9716;
__m512 tmp9676 = tmp9698;
__m512 tmp9682 = tmp9718;
__m512 tmp9677 = tmp9700;
__m512 tmp9683 = tmp9720;
__m512 tmp9678 = tmp9702;
__m512 tmp9684 = tmp9722;
// Transpose stage: treat tmp9673..tmp9684 as a 12-row by 16-column matrix and
// produce sixteen vectors that each hold one column (12 meaningful elements).
// Step 1: unpacklo/unpackhi interleave each adjacent pair of rows within
// every 128-bit lane.
__m512 tmp9769 = _mm512_unpacklo_ps(tmp9673, tmp9674);
__m512 tmp9770 = _mm512_unpackhi_ps(tmp9673, tmp9674);
__m512 tmp9771 = _mm512_unpacklo_ps(tmp9675, tmp9676);
__m512 tmp9772 = _mm512_unpackhi_ps(tmp9675, tmp9676);
__m512 tmp9773 = _mm512_unpacklo_ps(tmp9677, tmp9678);
__m512 tmp9774 = _mm512_unpackhi_ps(tmp9677, tmp9678);
__m512 tmp9775 = _mm512_unpacklo_ps(tmp9679, tmp9680);
__m512 tmp9776 = _mm512_unpackhi_ps(tmp9679, tmp9680);
__m512 tmp9777 = _mm512_unpacklo_ps(tmp9681, tmp9682);
__m512 tmp9778 = _mm512_unpackhi_ps(tmp9681, tmp9682);
__m512 tmp9779 = _mm512_unpacklo_ps(tmp9683, tmp9684);
__m512 tmp9780 = _mm512_unpackhi_ps(tmp9683, tmp9684);
// Step 2: shuffle_ps with 68 (low halves) / 238 (high halves) leaves, in each
// 128-bit lane, one column entry from four consecutive rows.
__m512 tmp9781 = _mm512_shuffle_ps(tmp9769, tmp9771, 68);
__m512 tmp9782 = _mm512_shuffle_ps(tmp9769, tmp9771, 238);
__m512 tmp9783 = _mm512_shuffle_ps(tmp9770, tmp9772, 68);
__m512 tmp9784 = _mm512_shuffle_ps(tmp9770, tmp9772, 238);
__m512 tmp9785 = _mm512_shuffle_ps(tmp9773, tmp9775, 68);
__m512 tmp9786 = _mm512_shuffle_ps(tmp9773, tmp9775, 238);
__m512 tmp9787 = _mm512_shuffle_ps(tmp9774, tmp9776, 68);
__m512 tmp9788 = _mm512_shuffle_ps(tmp9774, tmp9776, 238);
__m512 tmp9789 = _mm512_shuffle_ps(tmp9777, tmp9779, 68);
__m512 tmp9790 = _mm512_shuffle_ps(tmp9777, tmp9779, 238);
__m512 tmp9791 = _mm512_shuffle_ps(tmp9778, tmp9780, 68);
__m512 tmp9792 = _mm512_shuffle_ps(tmp9778, tmp9780, 238);
// Step 3: 128-bit lane shuffles. Selector 136 takes lanes 0 and 2 of each
// operand, 221 takes lanes 1 and 3. Rows 0..3 are merged with rows 4..7 here.
__m512 tmp9793 = _mm512_shuffle_f32x4(tmp9781, tmp9785, 136);
__m512 tmp9794 = _mm512_shuffle_f32x4(tmp9781, tmp9785, 221);
__m512 tmp9795 = _mm512_shuffle_f32x4(tmp9782, tmp9786, 136);
__m512 tmp9796 = _mm512_shuffle_f32x4(tmp9782, tmp9786, 221);
__m512 tmp9797 = _mm512_shuffle_f32x4(tmp9783, tmp9787, 136);
__m512 tmp9798 = _mm512_shuffle_f32x4(tmp9783, tmp9787, 221);
__m512 tmp9799 = _mm512_shuffle_f32x4(tmp9784, tmp9788, 136);
__m512 tmp9800 = _mm512_shuffle_f32x4(tmp9784, tmp9788, 221);
// Rows 8..11 have no partner rows, so the same vector is passed as both
// operands; the upper half of each result duplicates the lower half.
__m512 tmp9801 = _mm512_shuffle_f32x4(tmp9789, tmp9789, 136);
__m512 tmp9802 = _mm512_shuffle_f32x4(tmp9789, tmp9789, 221);
__m512 tmp9803 = _mm512_shuffle_f32x4(tmp9790, tmp9790, 136);
__m512 tmp9804 = _mm512_shuffle_f32x4(tmp9790, tmp9790, 221);
__m512 tmp9805 = _mm512_shuffle_f32x4(tmp9791, tmp9791, 136);
__m512 tmp9806 = _mm512_shuffle_f32x4(tmp9791, tmp9791, 221);
__m512 tmp9807 = _mm512_shuffle_f32x4(tmp9792, tmp9792, 136);
__m512 tmp9808 = _mm512_shuffle_f32x4(tmp9792, tmp9792, 221);
// Step 4: final lane shuffles. Columns 0..7 end up in tmp9673..tmp9680,
// columns 8..11 in tmp9681..tmp9684 and columns 12..15 in the new
// tmp9725..tmp9728. In every result, elements 12..15 repeat elements 8..11.
tmp9673 = _mm512_shuffle_f32x4(tmp9793, tmp9801, 136);
tmp9681 = _mm512_shuffle_f32x4(tmp9793, tmp9801, 221);
tmp9674 = _mm512_shuffle_f32x4(tmp9795, tmp9803, 136);
tmp9682 = _mm512_shuffle_f32x4(tmp9795, tmp9803, 221);
tmp9675 = _mm512_shuffle_f32x4(tmp9797, tmp9805, 136);
tmp9683 = _mm512_shuffle_f32x4(tmp9797, tmp9805, 221);
tmp9676 = _mm512_shuffle_f32x4(tmp9799, tmp9807, 136);
tmp9684 = _mm512_shuffle_f32x4(tmp9799, tmp9807, 221);
tmp9677 = _mm512_shuffle_f32x4(tmp9794, tmp9802, 136);
__m512 tmp9725 = _mm512_shuffle_f32x4(tmp9794, tmp9802, 221);
tmp9678 = _mm512_shuffle_f32x4(tmp9796, tmp9804, 136);
__m512 tmp9726 = _mm512_shuffle_f32x4(tmp9796, tmp9804, 221);
tmp9679 = _mm512_shuffle_f32x4(tmp9798, tmp9806, 136);
__m512 tmp9727 = _mm512_shuffle_f32x4(tmp9798, tmp9806, 221);
tmp9680 = _mm512_shuffle_f32x4(tmp9800, tmp9808, 136);
__m512 tmp9728 = _mm512_shuffle_f32x4(tmp9800, tmp9808, 221);
// Second transform pass, applied to the vectors produced by the shuffle
// network above. With x0..x7 = tmp9673..tmp9680 (and, interleaved, x0..x7 =
// tmp9681, tmp9682, tmp9683, tmp9684, tmp9725, tmp9726, tmp9727, tmp9728):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)        -> tmp9729 / tmp9749
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)        -> tmp9735 / tmp9755
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)        -> tmp9740 / tmp9760
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)        -> tmp9742 / tmp9762
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)        -> tmp9744 / tmp9764
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7   -> tmp9746 / tmp9766
// _mm512_fmadd_ps(a, b, c) computes a*b + c.
__m512 tmp9733 = _mm512_add_ps(tmp9674, tmp9675);
__m512 tmp9753 = _mm512_add_ps(tmp9682, tmp9683);
__m512 tmp9732 = _mm512_add_ps(tmp9676, tmp9677);
__m512 tmp9752 = _mm512_add_ps(tmp9684, tmp9725);
__m512 tmp9738 = _mm512_sub_ps(tmp9676, tmp9677);
__m512 tmp9758 = _mm512_sub_ps(tmp9684, tmp9725);
__m512 tmp9737 = _mm512_sub_ps(tmp9674, tmp9675);
__m512 tmp9757 = _mm512_sub_ps(tmp9682, tmp9683);
__m512 tmp9734 = _mm512_add_ps(tmp9678, tmp9679);
__m512 tmp9754 = _mm512_add_ps(tmp9726, tmp9727);
__m512 tmp9739 = _mm512_sub_ps(tmp9678, tmp9679);
__m512 tmp9759 = _mm512_sub_ps(tmp9726, tmp9727);
__m512 tmp9736 = _mm512_fmadd_ps(tmp9738, _mm512_set1_ps(2e+00f), tmp9737);
__m512 tmp9756 = _mm512_fmadd_ps(tmp9758, _mm512_set1_ps(2e+00f), tmp9757);
__m512 tmp9743 = _mm512_fmadd_ps(tmp9738, _mm512_set1_ps(8e+00f), tmp9737);
__m512 tmp9763 = _mm512_fmadd_ps(tmp9758, _mm512_set1_ps(8e+00f), tmp9757);
__m512 tmp9731 = _mm512_add_ps(tmp9732, tmp9733);
__m512 tmp9751 = _mm512_add_ps(tmp9752, tmp9753);
__m512 tmp9735 = _mm512_fmadd_ps(tmp9739, _mm512_set1_ps(1.6e+01f), tmp9736);
__m512 tmp9755 = _mm512_fmadd_ps(tmp9759, _mm512_set1_ps(1.6e+01f), tmp9756);
__m512 tmp9742 = _mm512_fmadd_ps(tmp9739, _mm512_set1_ps(4e+00f), tmp9743);
__m512 tmp9762 = _mm512_fmadd_ps(tmp9759, _mm512_set1_ps(4e+00f), tmp9763);
__m512 tmp9748 = _mm512_add_ps(tmp9739, tmp9737);
__m512 tmp9768 = _mm512_add_ps(tmp9759, tmp9757);
__m512 tmp9741 = _mm512_fmadd_ps(tmp9732, _mm512_set1_ps(4e+00f), tmp9733);
__m512 tmp9761 = _mm512_fmadd_ps(tmp9752, _mm512_set1_ps(4e+00f), tmp9753);
__m512 tmp9745 = _mm512_fmadd_ps(tmp9732, _mm512_set1_ps(1.6e+01f), tmp9733);
__m512 tmp9765 = _mm512_fmadd_ps(tmp9752, _mm512_set1_ps(1.6e+01f), tmp9753);
__m512 tmp9730 = _mm512_add_ps(tmp9731, tmp9673);
__m512 tmp9750 = _mm512_add_ps(tmp9751, tmp9681);
__m512 tmp9747 = _mm512_add_ps(tmp9748, tmp9680);
__m512 tmp9767 = _mm512_add_ps(tmp9768, tmp9728);
__m512 tmp9729 = _mm512_fmadd_ps(tmp9734, _mm512_set1_ps(3.2e+01f), tmp9730);
__m512 tmp9749 = _mm512_fmadd_ps(tmp9754, _mm512_set1_ps(3.2e+01f), tmp9750);
__m512 tmp9740 = _mm512_fmadd_ps(tmp9734, _mm512_set1_ps(8e+00f), tmp9741);
__m512 tmp9760 = _mm512_fmadd_ps(tmp9754, _mm512_set1_ps(8e+00f), tmp9761);
__m512 tmp9746 = _mm512_fmadd_ps(tmp9738, _mm512_set1_ps(3.2e+01f), tmp9747);
__m512 tmp9766 = _mm512_fmadd_ps(tmp9758, _mm512_set1_ps(3.2e+01f), tmp9767);
__m512 tmp9744 = _mm512_fmadd_ps(tmp9734, _mm512_set1_ps(2e+00f), tmp9745);
__m512 tmp9764 = _mm512_fmadd_ps(tmp9754, _mm512_set1_ps(2e+00f), tmp9765);
// Epilogue: copy the twelve second-pass results into out1323..out1334, clamp
// them at zero, and write them out.
__m512 out1323 = tmp9729;
__m512 out1329 = tmp9749;
__m512 out1324 = tmp9735;
__m512 out1330 = tmp9755;
__m512 out1325 = tmp9740;
__m512 out1331 = tmp9760;
__m512 out1326 = tmp9742;
__m512 out1332 = tmp9762;
__m512 out1327 = tmp9744;
__m512 out1333 = tmp9764;
__m512 out1328 = tmp9746;
__m512 out1334 = tmp9766;
// ReLU: element-wise max of each vector with zero.
out1323 = _mm512_max_ps(_mm512_setzero_ps(), out1323);
out1329 = _mm512_max_ps(_mm512_setzero_ps(), out1329);
out1324 = _mm512_max_ps(_mm512_setzero_ps(), out1324);
out1330 = _mm512_max_ps(_mm512_setzero_ps(), out1330);
out1325 = _mm512_max_ps(_mm512_setzero_ps(), out1325);
out1331 = _mm512_max_ps(_mm512_setzero_ps(), out1331);
out1326 = _mm512_max_ps(_mm512_setzero_ps(), out1326);
out1332 = _mm512_max_ps(_mm512_setzero_ps(), out1332);
out1327 = _mm512_max_ps(_mm512_setzero_ps(), out1327);
out1333 = _mm512_max_ps(_mm512_setzero_ps(), out1333);
out1328 = _mm512_max_ps(_mm512_setzero_ps(), out1328);
out1334 = _mm512_max_ps(_mm512_setzero_ps(), out1334);
// Masked stores. out1323..out1328 go to constant offsets 0 + 224*r with mask
// 4095 (elements 0..11); out1329..out1334 go to 48 + 224*r with mask 255
// (elements 0..7 only), r = 0..5.
// NOTE(review): with toW35 = 36 and 4-byte floats this would cover columns
// 36..47 and 48..55 of a 56-float row, so the narrower mask presumably stops
// at the row's right edge - confirm against the tensor layout.
_mm512_mask_storeu_ps(datPtr13+0+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1323);
_mm512_mask_storeu_ps(datPtr13+48+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 255, out1329);
_mm512_mask_storeu_ps(datPtr13+224+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1324);
_mm512_mask_storeu_ps(datPtr13+272+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 255, out1330);
_mm512_mask_storeu_ps(datPtr13+448+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1325);
_mm512_mask_storeu_ps(datPtr13+496+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 255, out1331);
_mm512_mask_storeu_ps(datPtr13+672+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1326);
_mm512_mask_storeu_ps(datPtr13+720+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 255, out1332);
_mm512_mask_storeu_ps(datPtr13+896+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1327);
_mm512_mask_storeu_ps(datPtr13+944+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 255, out1333);
_mm512_mask_storeu_ps(datPtr13+1120+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1328);
_mm512_mask_storeu_ps(datPtr13+1168+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 255, out1334);
// Load stage: read sixteen 16-float vectors from sfPtr7 and regroup them into
// two sets of eight inputs, in1421..in1428 and in1429..in1436.
// The loads come in four groups whose constant offsets are 409600 apart
// (256, 409856, 819456, 1229056); inside a group the four loads sit at
// +0, +64, +128 and +192 from the group's start.
// For each (a, b) pair loaded 128 apart, selector 68 yields
// {a.lane0, a.lane1, b.lane0, b.lane1} and selector 238 yields
// {a.lane2, a.lane3, b.lane2, b.lane3} (128-bit lanes).
// NOTE(review): offsets are presumably bytes (64 = one 16-float vector);
// sfPtr7's declaration is outside this excerpt - confirm.
__m512 sf705 = _mm512_loadu_ps(sfPtr7+256+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf706 = _mm512_loadu_ps(sfPtr7+384+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1421 = _mm512_shuffle_f32x4(sf705, sf706, 68);
__m512 in1422 = _mm512_shuffle_f32x4(sf705, sf706, 238);
__m512 sf707 = _mm512_loadu_ps(sfPtr7+320+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf708 = _mm512_loadu_ps(sfPtr7+448+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1429 = _mm512_shuffle_f32x4(sf707, sf708, 68);
__m512 in1430 = _mm512_shuffle_f32x4(sf707, sf708, 238);
__m512 sf709 = _mm512_loadu_ps(sfPtr7+409856+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf710 = _mm512_loadu_ps(sfPtr7+409984+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1423 = _mm512_shuffle_f32x4(sf709, sf710, 68);
__m512 in1424 = _mm512_shuffle_f32x4(sf709, sf710, 238);
__m512 sf711 = _mm512_loadu_ps(sfPtr7+409920+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf712 = _mm512_loadu_ps(sfPtr7+410048+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1431 = _mm512_shuffle_f32x4(sf711, sf712, 68);
__m512 in1432 = _mm512_shuffle_f32x4(sf711, sf712, 238);
__m512 sf713 = _mm512_loadu_ps(sfPtr7+819456+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf714 = _mm512_loadu_ps(sfPtr7+819584+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1425 = _mm512_shuffle_f32x4(sf713, sf714, 68);
__m512 in1426 = _mm512_shuffle_f32x4(sf713, sf714, 238);
__m512 sf715 = _mm512_loadu_ps(sfPtr7+819520+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf716 = _mm512_loadu_ps(sfPtr7+819648+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1433 = _mm512_shuffle_f32x4(sf715, sf716, 68);
__m512 in1434 = _mm512_shuffle_f32x4(sf715, sf716, 238);
__m512 sf717 = _mm512_loadu_ps(sfPtr7+1229056+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf718 = _mm512_loadu_ps(sfPtr7+1229184+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1427 = _mm512_shuffle_f32x4(sf717, sf718, 68);
__m512 in1428 = _mm512_shuffle_f32x4(sf717, sf718, 238);
__m512 sf719 = _mm512_loadu_ps(sfPtr7+1229120+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf720 = _mm512_loadu_ps(sfPtr7+1229248+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1435 = _mm512_shuffle_f32x4(sf719, sf720, 68);
__m512 in1436 = _mm512_shuffle_f32x4(sf719, sf720, 238);
// First transform pass. With x0..x7 = in1421..in1428 (and, interleaved line by
// line, the same arithmetic on in1429..in1436), compute six combinations:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)        -> tmp9821 / tmp9841
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)        -> tmp9827 / tmp9847
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)        -> tmp9832 / tmp9852
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)        -> tmp9834 / tmp9854
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)        -> tmp9836 / tmp9856
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7   -> tmp9838 / tmp9858
// The pairwise sums and differences are formed once and reused through fused
// multiply-adds (_mm512_fmadd_ps(a, b, c) computes a*b + c).
// NOTE(review): these weights match the 6x8 output-transform matrix usually
// quoted for Winograd F(6,3) - confirm against the generator.
__m512 tmp9825 = _mm512_add_ps(in1422, in1423);
__m512 tmp9845 = _mm512_add_ps(in1430, in1431);
__m512 tmp9824 = _mm512_add_ps(in1424, in1425);
__m512 tmp9844 = _mm512_add_ps(in1432, in1433);
__m512 tmp9830 = _mm512_sub_ps(in1424, in1425);
__m512 tmp9850 = _mm512_sub_ps(in1432, in1433);
__m512 tmp9829 = _mm512_sub_ps(in1422, in1423);
__m512 tmp9849 = _mm512_sub_ps(in1430, in1431);
__m512 tmp9826 = _mm512_add_ps(in1426, in1427);
__m512 tmp9846 = _mm512_add_ps(in1434, in1435);
__m512 tmp9831 = _mm512_sub_ps(in1426, in1427);
__m512 tmp9851 = _mm512_sub_ps(in1434, in1435);
__m512 tmp9828 = _mm512_fmadd_ps(tmp9830, _mm512_set1_ps(2e+00f), tmp9829);
__m512 tmp9848 = _mm512_fmadd_ps(tmp9850, _mm512_set1_ps(2e+00f), tmp9849);
__m512 tmp9835 = _mm512_fmadd_ps(tmp9830, _mm512_set1_ps(8e+00f), tmp9829);
__m512 tmp9855 = _mm512_fmadd_ps(tmp9850, _mm512_set1_ps(8e+00f), tmp9849);
__m512 tmp9823 = _mm512_add_ps(tmp9824, tmp9825);
__m512 tmp9843 = _mm512_add_ps(tmp9844, tmp9845);
__m512 tmp9827 = _mm512_fmadd_ps(tmp9831, _mm512_set1_ps(1.6e+01f), tmp9828);
__m512 tmp9847 = _mm512_fmadd_ps(tmp9851, _mm512_set1_ps(1.6e+01f), tmp9848);
__m512 tmp9834 = _mm512_fmadd_ps(tmp9831, _mm512_set1_ps(4e+00f), tmp9835);
__m512 tmp9854 = _mm512_fmadd_ps(tmp9851, _mm512_set1_ps(4e+00f), tmp9855);
__m512 tmp9840 = _mm512_add_ps(tmp9831, tmp9829);
__m512 tmp9860 = _mm512_add_ps(tmp9851, tmp9849);
__m512 tmp9833 = _mm512_fmadd_ps(tmp9824, _mm512_set1_ps(4e+00f), tmp9825);
__m512 tmp9853 = _mm512_fmadd_ps(tmp9844, _mm512_set1_ps(4e+00f), tmp9845);
__m512 tmp9837 = _mm512_fmadd_ps(tmp9824, _mm512_set1_ps(1.6e+01f), tmp9825);
__m512 tmp9857 = _mm512_fmadd_ps(tmp9844, _mm512_set1_ps(1.6e+01f), tmp9845);
__m512 tmp9822 = _mm512_add_ps(tmp9823, in1421);
__m512 tmp9842 = _mm512_add_ps(tmp9843, in1429);
__m512 tmp9839 = _mm512_add_ps(tmp9840, in1428);
__m512 tmp9859 = _mm512_add_ps(tmp9860, in1436);
__m512 tmp9821 = _mm512_fmadd_ps(tmp9826, _mm512_set1_ps(3.2e+01f), tmp9822);
__m512 tmp9841 = _mm512_fmadd_ps(tmp9846, _mm512_set1_ps(3.2e+01f), tmp9842);
__m512 tmp9832 = _mm512_fmadd_ps(tmp9826, _mm512_set1_ps(8e+00f), tmp9833);
__m512 tmp9852 = _mm512_fmadd_ps(tmp9846, _mm512_set1_ps(8e+00f), tmp9853);
__m512 tmp9838 = _mm512_fmadd_ps(tmp9830, _mm512_set1_ps(3.2e+01f), tmp9839);
__m512 tmp9858 = _mm512_fmadd_ps(tmp9850, _mm512_set1_ps(3.2e+01f), tmp9859);
__m512 tmp9836 = _mm512_fmadd_ps(tmp9826, _mm512_set1_ps(2e+00f), tmp9837);
__m512 tmp9856 = _mm512_fmadd_ps(tmp9846, _mm512_set1_ps(2e+00f), tmp9857);
// Collect y0..y5 of the first set in tmp9809..tmp9814 and of the second set in
// tmp9815..tmp9820 for the shuffle stage that follows.
__m512 tmp9809 = tmp9821;
__m512 tmp9815 = tmp9841;
__m512 tmp9810 = tmp9827;
__m512 tmp9816 = tmp9847;
__m512 tmp9811 = tmp9832;
__m512 tmp9817 = tmp9852;
__m512 tmp9812 = tmp9834;
__m512 tmp9818 = tmp9854;
__m512 tmp9813 = tmp9836;
__m512 tmp9819 = tmp9856;
__m512 tmp9814 = tmp9838;
__m512 tmp9820 = tmp9858;
__m512 tmp9905 = _mm512_unpacklo_ps(tmp9809, tmp9810);
__m512 tmp9906 = _mm512_unpackhi_ps(tmp9809, tmp9810);
__m512 tmp9907 = _mm512_unpacklo_ps(tmp9811, tmp9812);
__m512 tmp9908 = _mm512_unpackhi_ps(tmp9811, tmp9812);
__m512 tmp9909 = _mm512_unpacklo_ps(tmp9813, tmp9814);
__m512 tmp9910 = _mm512_unpackhi_ps(tmp9813, tmp9814);
__m512 tmp9911 = _mm512_unpacklo_ps(tmp9815, tmp9816);
__m512 tmp9912 = _mm512_unpackhi_ps(tmp9815, tmp9816);
__m512 tmp9913 = _mm512_unpacklo_ps(tmp9817, tmp9818);
__m512 tmp9914 = _mm512_unpackhi_ps(tmp9817, tmp9818);
__m512 tmp9915 = _mm512_unpacklo_ps(tmp9819, tmp9820);
__m512 tmp9916 = _mm512_unpackhi_ps(tmp9819, tmp9820);
__m512 tmp9917 = _mm512_shuffle_ps(tmp9905, tmp9907, 68);
__m512 tmp9918 = _mm512_shuffle_ps(tmp9905, tmp9907, 238);
__m512 tmp9919 = _mm512_shuffle_ps(tmp9906, tmp9908, 68);
__m512 tmp9920 = _mm512_shuffle_ps(tmp9906, tmp9908, 238);
__m512 tmp9921 = _mm512_shuffle_ps(tmp9909, tmp9911, 68);
__m512 tmp9922 = _mm512_shuffle_ps(tmp9909, tmp9911, 238);
__m512 tmp9923 = _mm512_shuffle_ps(tmp9910, tmp9912, 68);
__m512 tmp9924 = _mm512_shuffle_ps(tmp9910, tmp9912, 238);
__m512 tmp9925 = _mm512_shuffle_ps(tmp9913, tmp9915, 68);
__m512 tmp9926 = _mm512_shuffle_ps(tmp9913, tmp9915, 238);
__m512 tmp9927 = _mm512_shuffle_ps(tmp9914, tmp9916, 68);
__m512 tmp9928 = _mm512_shuffle_ps(tmp9914, tmp9916, 238);
__m512 tmp9929 = _mm512_shuffle_f32x4(tmp9917, tmp9921, 136);
__m512 tmp9930 = _mm512_shuffle_f32x4(tmp9917, tmp9921, 221);
__m512 tmp9931 = _mm512_shuffle_f32x4(tmp9918, tmp9922, 136);
__m512 tmp9932 = _mm512_shuffle_f32x4(tmp9918, tmp9922, 221);
__m512 tmp9933 = _mm512_shuffle_f32x4(tmp9919, tmp9923, 136);
__m512 tmp9934 = _mm512_shuffle_f32x4(tmp9919, tmp9923, 221);
__m512 tmp9935 = _mm512_shuffle_f32x4(tmp9920, tmp9924, 136);
__m512 tmp9936 = _mm512_shuffle_f32x4(tmp9920, tmp9924, 221);
__m512 tmp9937 = _mm512_shuffle_f32x4(tmp9925, tmp9925, 136);
__m512 tmp9938 = _mm512_shuffle_f32x4(tmp9925, tmp9925, 221);
__m512 tmp9939 = _mm512_shuffle_f32x4(tmp9926, tmp9926, 136);
__m512 tmp9940 = _mm512_shuffle_f32x4(tmp9926, tmp9926, 221);
__m512 tmp9941 = _mm512_shuffle_f32x4(tmp9927, tmp9927, 136);
__m512 tmp9942 = _mm512_shuffle_f32x4(tmp9927, tmp9927, 221);
__m512 tmp9943 = _mm512_shuffle_f32x4(tmp9928, tmp9928, 136);
__m512 tmp9944 = _mm512_shuffle_f32x4(tmp9928, tmp9928, 221);
tmp9809 = _mm512_shuffle_f32x4(tmp9929, tmp9937, 136);
tmp9817 = _mm512_shuffle_f32x4(tmp9929, tmp9937, 221);
tmp9810 = _mm512_shuffle_f32x4(tmp9931, tmp9939, 136);
tmp9818 = _mm512_shuffle_f32x4(tmp9931, tmp9939, 221);
tmp9811 = _mm512_shuffle_f32x4(tmp9933, tmp9941, 136);
tmp9819 = _mm512_shuffle_f32x4(tmp9933, tmp9941, 221);
tmp9812 = _mm512_shuffle_f32x4(tmp9935, tmp9943, 136);
tmp9820 = _mm512_shuffle_f32x4(tmp9935, tmp9943, 221);
tmp9813 = _mm512_shuffle_f32x4(tmp9930, tmp9938, 136);
__m512 tmp9861 = _mm512_shuffle_f32x4(tmp9930, tmp9938, 221);
tmp9814 = _mm512_shuffle_f32x4(tmp9932, tmp9940, 136);
__m512 tmp9862 = _mm512_shuffle_f32x4(tmp9932, tmp9940, 221);
tmp9815 = _mm512_shuffle_f32x4(tmp9934, tmp9942, 136);
__m512 tmp9863 = _mm512_shuffle_f32x4(tmp9934, tmp9942, 221);
tmp9816 = _mm512_shuffle_f32x4(tmp9936, tmp9944, 136);
__m512 tmp9864 = _mm512_shuffle_f32x4(tmp9936, tmp9944, 221);
__m512 tmp9869 = _mm512_add_ps(tmp9810, tmp9811);
__m512 tmp9889 = _mm512_add_ps(tmp9818, tmp9819);
__m512 tmp9868 = _mm512_add_ps(tmp9812, tmp9813);
__m512 tmp9888 = _mm512_add_ps(tmp9820, tmp9861);
__m512 tmp9874 = _mm512_sub_ps(tmp9812, tmp9813);
__m512 tmp9894 = _mm512_sub_ps(tmp9820, tmp9861);
__m512 tmp9873 = _mm512_sub_ps(tmp9810, tmp9811);
__m512 tmp9893 = _mm512_sub_ps(tmp9818, tmp9819);
__m512 tmp9870 = _mm512_add_ps(tmp9814, tmp9815);
__m512 tmp9890 = _mm512_add_ps(tmp9862, tmp9863);
__m512 tmp9875 = _mm512_sub_ps(tmp9814, tmp9815);
__m512 tmp9895 = _mm512_sub_ps(tmp9862, tmp9863);
__m512 tmp9872 = _mm512_fmadd_ps(tmp9874, _mm512_set1_ps(2e+00f), tmp9873);
__m512 tmp9892 = _mm512_fmadd_ps(tmp9894, _mm512_set1_ps(2e+00f), tmp9893);
__m512 tmp9879 = _mm512_fmadd_ps(tmp9874, _mm512_set1_ps(8e+00f), tmp9873);
__m512 tmp9899 = _mm512_fmadd_ps(tmp9894, _mm512_set1_ps(8e+00f), tmp9893);
__m512 tmp9867 = _mm512_add_ps(tmp9868, tmp9869);
__m512 tmp9887 = _mm512_add_ps(tmp9888, tmp9889);
__m512 tmp9871 = _mm512_fmadd_ps(tmp9875, _mm512_set1_ps(1.6e+01f), tmp9872);
__m512 tmp9891 = _mm512_fmadd_ps(tmp9895, _mm512_set1_ps(1.6e+01f), tmp9892);
__m512 tmp9878 = _mm512_fmadd_ps(tmp9875, _mm512_set1_ps(4e+00f), tmp9879);
__m512 tmp9898 = _mm512_fmadd_ps(tmp9895, _mm512_set1_ps(4e+00f), tmp9899);
__m512 tmp9884 = _mm512_add_ps(tmp9875, tmp9873);
__m512 tmp9904 = _mm512_add_ps(tmp9895, tmp9893);
__m512 tmp9877 = _mm512_fmadd_ps(tmp9868, _mm512_set1_ps(4e+00f), tmp9869);
__m512 tmp9897 = _mm512_fmadd_ps(tmp9888, _mm512_set1_ps(4e+00f), tmp9889);
__m512 tmp9881 = _mm512_fmadd_ps(tmp9868, _mm512_set1_ps(1.6e+01f), tmp9869);
__m512 tmp9901 = _mm512_fmadd_ps(tmp9888, _mm512_set1_ps(1.6e+01f), tmp9889);
__m512 tmp9866 = _mm512_add_ps(tmp9867, tmp9809);
__m512 tmp9886 = _mm512_add_ps(tmp9887, tmp9817);
__m512 tmp9883 = _mm512_add_ps(tmp9884, tmp9816);
__m512 tmp9903 = _mm512_add_ps(tmp9904, tmp9864);
__m512 tmp9865 = _mm512_fmadd_ps(tmp9870, _mm512_set1_ps(3.2e+01f), tmp9866);
__m512 tmp9885 = _mm512_fmadd_ps(tmp9890, _mm512_set1_ps(3.2e+01f), tmp9886);
__m512 tmp9876 = _mm512_fmadd_ps(tmp9870, _mm512_set1_ps(8e+00f), tmp9877);
__m512 tmp9896 = _mm512_fmadd_ps(tmp9890, _mm512_set1_ps(8e+00f), tmp9897);
__m512 tmp9882 = _mm512_fmadd_ps(tmp9874, _mm512_set1_ps(3.2e+01f), tmp9883);
__m512 tmp9902 = _mm512_fmadd_ps(tmp9894, _mm512_set1_ps(3.2e+01f), tmp9903);
__m512 tmp9880 = _mm512_fmadd_ps(tmp9870, _mm512_set1_ps(2e+00f), tmp9881);
__m512 tmp9900 = _mm512_fmadd_ps(tmp9890, _mm512_set1_ps(2e+00f), tmp9901);
__m512 out1335 = tmp9865;
__m512 out1341 = tmp9885;
__m512 out1336 = tmp9871;
__m512 out1342 = tmp9891;
__m512 out1337 = tmp9876;
__m512 out1343 = tmp9896;
__m512 out1338 = tmp9878;
__m512 out1344 = tmp9898;
__m512 out1339 = tmp9880;
__m512 out1345 = tmp9900;
__m512 out1340 = tmp9882;
__m512 out1346 = tmp9902;
out1335 = _mm512_max_ps(_mm512_setzero_ps(), out1335);
out1341 = _mm512_max_ps(_mm512_setzero_ps(), out1341);
out1336 = _mm512_max_ps(_mm512_setzero_ps(), out1336);
out1342 = _mm512_max_ps(_mm512_setzero_ps(), out1342);
out1337 = _mm512_max_ps(_mm512_setzero_ps(), out1337);
out1343 = _mm512_max_ps(_mm512_setzero_ps(), out1343);
out1338 = _mm512_max_ps(_mm512_setzero_ps(), out1338);
out1344 = _mm512_max_ps(_mm512_setzero_ps(), out1344);
out1339 = _mm512_max_ps(_mm512_setzero_ps(), out1339);
out1345 = _mm512_max_ps(_mm512_setzero_ps(), out1345);
out1340 = _mm512_max_ps(_mm512_setzero_ps(), out1340);
out1346 = _mm512_max_ps(_mm512_setzero_ps(), out1346);
_mm512_mask_storeu_ps(datPtr13+1200+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1335);
_mm512_mask_storeu_ps(datPtr13+12608+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1341);
_mm512_mask_storeu_ps(datPtr13+1424+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1336);
_mm512_mask_storeu_ps(datPtr13+12832+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1342);
_mm512_mask_storeu_ps(datPtr13+1648+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1337);
_mm512_mask_storeu_ps(datPtr13+13056+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1343);
_mm512_mask_storeu_ps(datPtr13+1872+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1338);
_mm512_mask_storeu_ps(datPtr13+13280+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1344);
_mm512_mask_storeu_ps(datPtr13+2096+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1339);
_mm512_mask_storeu_ps(datPtr13+13504+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1345);
_mm512_mask_storeu_ps(datPtr13+2320+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1340);
_mm512_mask_storeu_ps(datPtr13+13728+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1346);
// Next tile of this (i28, j23, k96, l35) iteration.
// Load 16 vectors from sfPtr7 in four groups (constant offsets starting at
// 512, 410112, 819712 and 1229312) and regroup their 256-bit halves:
// shuffle imm 68 joins the low halves of both sources, imm 238 the high
// halves. in1437..in1444 feed the first combination chain below,
// in1445..in1452 the second.
__m512 sf721 = _mm512_loadu_ps(sfPtr7+512+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf722 = _mm512_loadu_ps(sfPtr7+640+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1437 = _mm512_shuffle_f32x4(sf721, sf722, 68);
__m512 in1438 = _mm512_shuffle_f32x4(sf721, sf722, 238);
__m512 sf723 = _mm512_loadu_ps(sfPtr7+576+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf724 = _mm512_loadu_ps(sfPtr7+704+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1445 = _mm512_shuffle_f32x4(sf723, sf724, 68);
__m512 in1446 = _mm512_shuffle_f32x4(sf723, sf724, 238);
__m512 sf725 = _mm512_loadu_ps(sfPtr7+410112+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf726 = _mm512_loadu_ps(sfPtr7+410240+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1439 = _mm512_shuffle_f32x4(sf725, sf726, 68);
__m512 in1440 = _mm512_shuffle_f32x4(sf725, sf726, 238);
__m512 sf727 = _mm512_loadu_ps(sfPtr7+410176+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf728 = _mm512_loadu_ps(sfPtr7+410304+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1447 = _mm512_shuffle_f32x4(sf727, sf728, 68);
__m512 in1448 = _mm512_shuffle_f32x4(sf727, sf728, 238);
__m512 sf729 = _mm512_loadu_ps(sfPtr7+819712+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf730 = _mm512_loadu_ps(sfPtr7+819840+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1441 = _mm512_shuffle_f32x4(sf729, sf730, 68);
__m512 in1442 = _mm512_shuffle_f32x4(sf729, sf730, 238);
__m512 sf731 = _mm512_loadu_ps(sfPtr7+819776+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf732 = _mm512_loadu_ps(sfPtr7+819904+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1449 = _mm512_shuffle_f32x4(sf731, sf732, 68);
__m512 in1450 = _mm512_shuffle_f32x4(sf731, sf732, 238);
__m512 sf733 = _mm512_loadu_ps(sfPtr7+1229312+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf734 = _mm512_loadu_ps(sfPtr7+1229440+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1443 = _mm512_shuffle_f32x4(sf733, sf734, 68);
__m512 in1444 = _mm512_shuffle_f32x4(sf733, sf734, 238);
__m512 sf735 = _mm512_loadu_ps(sfPtr7+1229376+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 sf736 = _mm512_loadu_ps(sfPtr7+1229504+1638400*i28+24576*j23+1536*k96+768*l35);
__m512 in1451 = _mm512_shuffle_f32x4(sf735, sf736, 68);
__m512 in1452 = _mm512_shuffle_f32x4(sf735, sf736, 238);
// First pass: an 8-input -> 6-output linear combination, run as two
// interleaved chains (x0..x7 = in1437..in1444 and x0..x7 = in1445..in1452):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): these weights look like the output transform of a
// Winograd-style 3x3 convolution -- confirm against the generator.
__m512 tmp9961 = _mm512_add_ps(in1438, in1439);
__m512 tmp9981 = _mm512_add_ps(in1446, in1447);
__m512 tmp9960 = _mm512_add_ps(in1440, in1441);
__m512 tmp9980 = _mm512_add_ps(in1448, in1449);
__m512 tmp9966 = _mm512_sub_ps(in1440, in1441);
__m512 tmp9986 = _mm512_sub_ps(in1448, in1449);
__m512 tmp9965 = _mm512_sub_ps(in1438, in1439);
__m512 tmp9985 = _mm512_sub_ps(in1446, in1447);
__m512 tmp9962 = _mm512_add_ps(in1442, in1443);
__m512 tmp9982 = _mm512_add_ps(in1450, in1451);
__m512 tmp9967 = _mm512_sub_ps(in1442, in1443);
__m512 tmp9987 = _mm512_sub_ps(in1450, in1451);
__m512 tmp9964 = _mm512_fmadd_ps(tmp9966, _mm512_set1_ps(2e+00f), tmp9965);
__m512 tmp9984 = _mm512_fmadd_ps(tmp9986, _mm512_set1_ps(2e+00f), tmp9985);
__m512 tmp9971 = _mm512_fmadd_ps(tmp9966, _mm512_set1_ps(8e+00f), tmp9965);
__m512 tmp9991 = _mm512_fmadd_ps(tmp9986, _mm512_set1_ps(8e+00f), tmp9985);
__m512 tmp9959 = _mm512_add_ps(tmp9960, tmp9961);
__m512 tmp9979 = _mm512_add_ps(tmp9980, tmp9981);
__m512 tmp9963 = _mm512_fmadd_ps(tmp9967, _mm512_set1_ps(1.6e+01f), tmp9964);
__m512 tmp9983 = _mm512_fmadd_ps(tmp9987, _mm512_set1_ps(1.6e+01f), tmp9984);
__m512 tmp9970 = _mm512_fmadd_ps(tmp9967, _mm512_set1_ps(4e+00f), tmp9971);
__m512 tmp9990 = _mm512_fmadd_ps(tmp9987, _mm512_set1_ps(4e+00f), tmp9991);
__m512 tmp9976 = _mm512_add_ps(tmp9967, tmp9965);
__m512 tmp9996 = _mm512_add_ps(tmp9987, tmp9985);
__m512 tmp9969 = _mm512_fmadd_ps(tmp9960, _mm512_set1_ps(4e+00f), tmp9961);
__m512 tmp9989 = _mm512_fmadd_ps(tmp9980, _mm512_set1_ps(4e+00f), tmp9981);
__m512 tmp9973 = _mm512_fmadd_ps(tmp9960, _mm512_set1_ps(1.6e+01f), tmp9961);
__m512 tmp9993 = _mm512_fmadd_ps(tmp9980, _mm512_set1_ps(1.6e+01f), tmp9981);
__m512 tmp9958 = _mm512_add_ps(tmp9959, in1437);
__m512 tmp9978 = _mm512_add_ps(tmp9979, in1445);
__m512 tmp9975 = _mm512_add_ps(tmp9976, in1444);
__m512 tmp9995 = _mm512_add_ps(tmp9996, in1452);
__m512 tmp9957 = _mm512_fmadd_ps(tmp9962, _mm512_set1_ps(3.2e+01f), tmp9958);
__m512 tmp9977 = _mm512_fmadd_ps(tmp9982, _mm512_set1_ps(3.2e+01f), tmp9978);
__m512 tmp9968 = _mm512_fmadd_ps(tmp9962, _mm512_set1_ps(8e+00f), tmp9969);
__m512 tmp9988 = _mm512_fmadd_ps(tmp9982, _mm512_set1_ps(8e+00f), tmp9989);
__m512 tmp9974 = _mm512_fmadd_ps(tmp9966, _mm512_set1_ps(3.2e+01f), tmp9975);
__m512 tmp9994 = _mm512_fmadd_ps(tmp9986, _mm512_set1_ps(3.2e+01f), tmp9995);
__m512 tmp9972 = _mm512_fmadd_ps(tmp9962, _mm512_set1_ps(2e+00f), tmp9973);
__m512 tmp9992 = _mm512_fmadd_ps(tmp9982, _mm512_set1_ps(2e+00f), tmp9993);
// Collect the results as 12 rows: tmp9945..tmp9950 = y0..y5 of the first
// chain, tmp9951..tmp9956 = y0..y5 of the second chain.
__m512 tmp9945 = tmp9957;
__m512 tmp9951 = tmp9977;
__m512 tmp9946 = tmp9963;
__m512 tmp9952 = tmp9983;
__m512 tmp9947 = tmp9968;
__m512 tmp9953 = tmp9988;
__m512 tmp9948 = tmp9970;
__m512 tmp9954 = tmp9990;
__m512 tmp9949 = tmp9972;
__m512 tmp9955 = tmp9992;
__m512 tmp9950 = tmp9974;
__m512 tmp9956 = tmp9994;
// Transpose the 12 rows of 16 floats into 16 column vectors.
// Step 1: interleave 32-bit elements of adjacent row pairs per 128-bit lane.
__m512 tmp10041 = _mm512_unpacklo_ps(tmp9945, tmp9946);
__m512 tmp10042 = _mm512_unpackhi_ps(tmp9945, tmp9946);
__m512 tmp10043 = _mm512_unpacklo_ps(tmp9947, tmp9948);
__m512 tmp10044 = _mm512_unpackhi_ps(tmp9947, tmp9948);
__m512 tmp10045 = _mm512_unpacklo_ps(tmp9949, tmp9950);
__m512 tmp10046 = _mm512_unpackhi_ps(tmp9949, tmp9950);
__m512 tmp10047 = _mm512_unpacklo_ps(tmp9951, tmp9952);
__m512 tmp10048 = _mm512_unpackhi_ps(tmp9951, tmp9952);
__m512 tmp10049 = _mm512_unpacklo_ps(tmp9953, tmp9954);
__m512 tmp10050 = _mm512_unpackhi_ps(tmp9953, tmp9954);
__m512 tmp10051 = _mm512_unpacklo_ps(tmp9955, tmp9956);
__m512 tmp10052 = _mm512_unpackhi_ps(tmp9955, tmp9956);
// Step 2: merge two interleaved pairs (68 = low element pairs, 238 = high
// element pairs) so each 128-bit lane holds one column across four rows.
__m512 tmp10053 = _mm512_shuffle_ps(tmp10041, tmp10043, 68);
__m512 tmp10054 = _mm512_shuffle_ps(tmp10041, tmp10043, 238);
__m512 tmp10055 = _mm512_shuffle_ps(tmp10042, tmp10044, 68);
__m512 tmp10056 = _mm512_shuffle_ps(tmp10042, tmp10044, 238);
__m512 tmp10057 = _mm512_shuffle_ps(tmp10045, tmp10047, 68);
__m512 tmp10058 = _mm512_shuffle_ps(tmp10045, tmp10047, 238);
__m512 tmp10059 = _mm512_shuffle_ps(tmp10046, tmp10048, 68);
__m512 tmp10060 = _mm512_shuffle_ps(tmp10046, tmp10048, 238);
__m512 tmp10061 = _mm512_shuffle_ps(tmp10049, tmp10051, 68);
__m512 tmp10062 = _mm512_shuffle_ps(tmp10049, tmp10051, 238);
__m512 tmp10063 = _mm512_shuffle_ps(tmp10050, tmp10052, 68);
__m512 tmp10064 = _mm512_shuffle_ps(tmp10050, tmp10052, 238);
// Step 3: select even (136) / odd (221) 128-bit lanes, pairing the rows-0..3
// vector with the rows-4..7 vector.
__m512 tmp10065 = _mm512_shuffle_f32x4(tmp10053, tmp10057, 136);
__m512 tmp10066 = _mm512_shuffle_f32x4(tmp10053, tmp10057, 221);
__m512 tmp10067 = _mm512_shuffle_f32x4(tmp10054, tmp10058, 136);
__m512 tmp10068 = _mm512_shuffle_f32x4(tmp10054, tmp10058, 221);
__m512 tmp10069 = _mm512_shuffle_f32x4(tmp10055, tmp10059, 136);
__m512 tmp10070 = _mm512_shuffle_f32x4(tmp10055, tmp10059, 221);
__m512 tmp10071 = _mm512_shuffle_f32x4(tmp10056, tmp10060, 136);
__m512 tmp10072 = _mm512_shuffle_f32x4(tmp10056, tmp10060, 221);
// There is no rows-12..15 vector, so the rows-8..11 vector is paired with
// itself; the top four lanes of the final vectors therefore repeat lanes 8..11.
__m512 tmp10073 = _mm512_shuffle_f32x4(tmp10061, tmp10061, 136);
__m512 tmp10074 = _mm512_shuffle_f32x4(tmp10061, tmp10061, 221);
__m512 tmp10075 = _mm512_shuffle_f32x4(tmp10062, tmp10062, 136);
__m512 tmp10076 = _mm512_shuffle_f32x4(tmp10062, tmp10062, 221);
__m512 tmp10077 = _mm512_shuffle_f32x4(tmp10063, tmp10063, 136);
__m512 tmp10078 = _mm512_shuffle_f32x4(tmp10063, tmp10063, 221);
__m512 tmp10079 = _mm512_shuffle_f32x4(tmp10064, tmp10064, 136);
__m512 tmp10080 = _mm512_shuffle_f32x4(tmp10064, tmp10064, 221);
// Step 4: final lane selection. Afterwards tmp9945..tmp9952 hold columns 0..7
// and tmp9953..tmp9956, tmp9997..tmp10000 hold columns 8..15.
tmp9945 = _mm512_shuffle_f32x4(tmp10065, tmp10073, 136);
tmp9953 = _mm512_shuffle_f32x4(tmp10065, tmp10073, 221);
tmp9946 = _mm512_shuffle_f32x4(tmp10067, tmp10075, 136);
tmp9954 = _mm512_shuffle_f32x4(tmp10067, tmp10075, 221);
tmp9947 = _mm512_shuffle_f32x4(tmp10069, tmp10077, 136);
tmp9955 = _mm512_shuffle_f32x4(tmp10069, tmp10077, 221);
tmp9948 = _mm512_shuffle_f32x4(tmp10071, tmp10079, 136);
tmp9956 = _mm512_shuffle_f32x4(tmp10071, tmp10079, 221);
tmp9949 = _mm512_shuffle_f32x4(tmp10066, tmp10074, 136);
__m512 tmp9997 = _mm512_shuffle_f32x4(tmp10066, tmp10074, 221);
tmp9950 = _mm512_shuffle_f32x4(tmp10068, tmp10076, 136);
__m512 tmp9998 = _mm512_shuffle_f32x4(tmp10068, tmp10076, 221);
tmp9951 = _mm512_shuffle_f32x4(tmp10070, tmp10078, 136);
__m512 tmp9999 = _mm512_shuffle_f32x4(tmp10070, tmp10078, 221);
tmp9952 = _mm512_shuffle_f32x4(tmp10072, tmp10080, 136);
__m512 tmp10000 = _mm512_shuffle_f32x4(tmp10072, tmp10080, 221);
// Second pass: the same y0..y5 combination as the first pass, now applied to
// the transposed data: one chain over columns 0..7, one over columns 8..15.
__m512 tmp10005 = _mm512_add_ps(tmp9946, tmp9947);
__m512 tmp10025 = _mm512_add_ps(tmp9954, tmp9955);
__m512 tmp10004 = _mm512_add_ps(tmp9948, tmp9949);
__m512 tmp10024 = _mm512_add_ps(tmp9956, tmp9997);
__m512 tmp10010 = _mm512_sub_ps(tmp9948, tmp9949);
__m512 tmp10030 = _mm512_sub_ps(tmp9956, tmp9997);
__m512 tmp10009 = _mm512_sub_ps(tmp9946, tmp9947);
__m512 tmp10029 = _mm512_sub_ps(tmp9954, tmp9955);
__m512 tmp10006 = _mm512_add_ps(tmp9950, tmp9951);
__m512 tmp10026 = _mm512_add_ps(tmp9998, tmp9999);
__m512 tmp10011 = _mm512_sub_ps(tmp9950, tmp9951);
__m512 tmp10031 = _mm512_sub_ps(tmp9998, tmp9999);
__m512 tmp10008 = _mm512_fmadd_ps(tmp10010, _mm512_set1_ps(2e+00f), tmp10009);
__m512 tmp10028 = _mm512_fmadd_ps(tmp10030, _mm512_set1_ps(2e+00f), tmp10029);
__m512 tmp10015 = _mm512_fmadd_ps(tmp10010, _mm512_set1_ps(8e+00f), tmp10009);
__m512 tmp10035 = _mm512_fmadd_ps(tmp10030, _mm512_set1_ps(8e+00f), tmp10029);
__m512 tmp10003 = _mm512_add_ps(tmp10004, tmp10005);
__m512 tmp10023 = _mm512_add_ps(tmp10024, tmp10025);
__m512 tmp10007 = _mm512_fmadd_ps(tmp10011, _mm512_set1_ps(1.6e+01f), tmp10008);
__m512 tmp10027 = _mm512_fmadd_ps(tmp10031, _mm512_set1_ps(1.6e+01f), tmp10028);
__m512 tmp10014 = _mm512_fmadd_ps(tmp10011, _mm512_set1_ps(4e+00f), tmp10015);
__m512 tmp10034 = _mm512_fmadd_ps(tmp10031, _mm512_set1_ps(4e+00f), tmp10035);
__m512 tmp10020 = _mm512_add_ps(tmp10011, tmp10009);
__m512 tmp10040 = _mm512_add_ps(tmp10031, tmp10029);
__m512 tmp10013 = _mm512_fmadd_ps(tmp10004, _mm512_set1_ps(4e+00f), tmp10005);
__m512 tmp10033 = _mm512_fmadd_ps(tmp10024, _mm512_set1_ps(4e+00f), tmp10025);
__m512 tmp10017 = _mm512_fmadd_ps(tmp10004, _mm512_set1_ps(1.6e+01f), tmp10005);
__m512 tmp10037 = _mm512_fmadd_ps(tmp10024, _mm512_set1_ps(1.6e+01f), tmp10025);
__m512 tmp10002 = _mm512_add_ps(tmp10003, tmp9945);
__m512 tmp10022 = _mm512_add_ps(tmp10023, tmp9953);
__m512 tmp10019 = _mm512_add_ps(tmp10020, tmp9952);
__m512 tmp10039 = _mm512_add_ps(tmp10040, tmp10000);
__m512 tmp10001 = _mm512_fmadd_ps(tmp10006, _mm512_set1_ps(3.2e+01f), tmp10002);
__m512 tmp10021 = _mm512_fmadd_ps(tmp10026, _mm512_set1_ps(3.2e+01f), tmp10022);
__m512 tmp10012 = _mm512_fmadd_ps(tmp10006, _mm512_set1_ps(8e+00f), tmp10013);
__m512 tmp10032 = _mm512_fmadd_ps(tmp10026, _mm512_set1_ps(8e+00f), tmp10033);
__m512 tmp10018 = _mm512_fmadd_ps(tmp10010, _mm512_set1_ps(3.2e+01f), tmp10019);
__m512 tmp10038 = _mm512_fmadd_ps(tmp10030, _mm512_set1_ps(3.2e+01f), tmp10039);
__m512 tmp10016 = _mm512_fmadd_ps(tmp10006, _mm512_set1_ps(2e+00f), tmp10017);
__m512 tmp10036 = _mm512_fmadd_ps(tmp10026, _mm512_set1_ps(2e+00f), tmp10037);
// Name the results: out1347..out1352 are y0..y5 of the first chain,
// out1353..out1358 are y0..y5 of the second chain.
__m512 out1347 = tmp10001;
__m512 out1353 = tmp10021;
__m512 out1348 = tmp10007;
__m512 out1354 = tmp10027;
__m512 out1349 = tmp10012;
__m512 out1355 = tmp10032;
__m512 out1350 = tmp10014;
__m512 out1356 = tmp10034;
__m512 out1351 = tmp10016;
__m512 out1357 = tmp10036;
__m512 out1352 = tmp10018;
__m512 out1358 = tmp10038;
// ReLU: replace every negative element with zero.
out1347 = _mm512_max_ps(_mm512_setzero_ps(), out1347);
out1353 = _mm512_max_ps(_mm512_setzero_ps(), out1353);
out1348 = _mm512_max_ps(_mm512_setzero_ps(), out1348);
out1354 = _mm512_max_ps(_mm512_setzero_ps(), out1354);
out1349 = _mm512_max_ps(_mm512_setzero_ps(), out1349);
out1355 = _mm512_max_ps(_mm512_setzero_ps(), out1355);
out1350 = _mm512_max_ps(_mm512_setzero_ps(), out1350);
out1356 = _mm512_max_ps(_mm512_setzero_ps(), out1356);
out1351 = _mm512_max_ps(_mm512_setzero_ps(), out1351);
out1357 = _mm512_max_ps(_mm512_setzero_ps(), out1357);
out1352 = _mm512_max_ps(_mm512_setzero_ps(), out1352);
out1358 = _mm512_max_ps(_mm512_setzero_ps(), out1358);
// Masked stores into datPtr13. The first chain (out1347..out1352) writes only
// lanes 0..7 (mask 255); the second chain (out1353..out1358) writes lanes
// 0..11 (mask 4095). Consecutive outputs of a chain are 224 apart in the
// constant offset.
// NOTE(review): the narrower mask presumably clips this tile at an output
// boundary -- confirm against the tensor layout.
_mm512_mask_storeu_ps(datPtr13+12656+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 255, out1347);
_mm512_mask_storeu_ps(datPtr13+13808+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1353);
_mm512_mask_storeu_ps(datPtr13+12880+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 255, out1348);
_mm512_mask_storeu_ps(datPtr13+14032+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1354);
_mm512_mask_storeu_ps(datPtr13+13104+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 255, out1349);
_mm512_mask_storeu_ps(datPtr13+14256+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1355);
_mm512_mask_storeu_ps(datPtr13+13328+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 255, out1350);
_mm512_mask_storeu_ps(datPtr13+14480+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1356);
_mm512_mask_storeu_ps(datPtr13+13552+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 255, out1351);
_mm512_mask_storeu_ps(datPtr13+14704+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1357);
_mm512_mask_storeu_ps(datPtr13+13776+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 255, out1352);
_mm512_mask_storeu_ps(datPtr13+14928+806912*i28+224*toH35+4*toW35+50432*k96+25216*l35, 4095, out1358);
// The two braces below close scopes that open above this point (the l35 and
// k96 loops, judging by the indices used in the preceding statements).
}
}
// Return from the function once j23 has reached last6; otherwise advance to
// the next j23 and go around the enclosing loop again.
if (j23 >= last6) return;
++j23;
}
// Execution falls through to the next stage with j23 set to 15.
j23 = 15;
}
// Next stage: rel18 is j23 measured from 15; base18 (54) seeds toH36, which
// enters the store addresses below as 224*toH36.
ptrdiff_t rel18 = j23-15;
ptrdiff_t base18 = 54;
if (rel18 < 1) {
ptrdiff_t toH36 = base18+0;
ptrdiff_t toW36 = 0;
// k97 starts at 16*w46 and runs up to (not including) 16.
// NOTE(review): this terminates only if 16*w46 <= 16 -- confirm the range of
// w46 at its definition.
ptrdiff_t k97 = 16*w46;
for (; k97 != 16; ++k97) {
// Two iterations per k97: l36 = 0 and l36 = 1.
ptrdiff_t l36 = 0;
for (; l36 != 2; ++l36) {
// First tile of this (i28, j23, k97, l36) iteration.
// Load 16 vectors from sfPtr7 in four groups (constant offsets starting at
// 0, 409600, 819200 and 1228800) and regroup their 256-bit halves: shuffle
// imm 68 joins the low halves of both sources, imm 238 the high halves.
// in1453..in1460 feed the first combination chain below, in1461..in1468 the
// second.
__m512 sf737 = _mm512_loadu_ps(sfPtr7+0+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf738 = _mm512_loadu_ps(sfPtr7+128+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1453 = _mm512_shuffle_f32x4(sf737, sf738, 68);
__m512 in1454 = _mm512_shuffle_f32x4(sf737, sf738, 238);
__m512 sf739 = _mm512_loadu_ps(sfPtr7+64+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf740 = _mm512_loadu_ps(sfPtr7+192+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1461 = _mm512_shuffle_f32x4(sf739, sf740, 68);
__m512 in1462 = _mm512_shuffle_f32x4(sf739, sf740, 238);
__m512 sf741 = _mm512_loadu_ps(sfPtr7+409600+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf742 = _mm512_loadu_ps(sfPtr7+409728+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1455 = _mm512_shuffle_f32x4(sf741, sf742, 68);
__m512 in1456 = _mm512_shuffle_f32x4(sf741, sf742, 238);
__m512 sf743 = _mm512_loadu_ps(sfPtr7+409664+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf744 = _mm512_loadu_ps(sfPtr7+409792+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1463 = _mm512_shuffle_f32x4(sf743, sf744, 68);
__m512 in1464 = _mm512_shuffle_f32x4(sf743, sf744, 238);
__m512 sf745 = _mm512_loadu_ps(sfPtr7+819200+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf746 = _mm512_loadu_ps(sfPtr7+819328+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1457 = _mm512_shuffle_f32x4(sf745, sf746, 68);
__m512 in1458 = _mm512_shuffle_f32x4(sf745, sf746, 238);
__m512 sf747 = _mm512_loadu_ps(sfPtr7+819264+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf748 = _mm512_loadu_ps(sfPtr7+819392+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1465 = _mm512_shuffle_f32x4(sf747, sf748, 68);
__m512 in1466 = _mm512_shuffle_f32x4(sf747, sf748, 238);
__m512 sf749 = _mm512_loadu_ps(sfPtr7+1228800+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf750 = _mm512_loadu_ps(sfPtr7+1228928+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1459 = _mm512_shuffle_f32x4(sf749, sf750, 68);
__m512 in1460 = _mm512_shuffle_f32x4(sf749, sf750, 238);
__m512 sf751 = _mm512_loadu_ps(sfPtr7+1228864+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf752 = _mm512_loadu_ps(sfPtr7+1228992+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1467 = _mm512_shuffle_f32x4(sf751, sf752, 68);
__m512 in1468 = _mm512_shuffle_f32x4(sf751, sf752, 238);
// First pass: an 8-input -> 6-output linear combination, run as two
// interleaved chains (x0..x7 = in1453..in1460 and x0..x7 = in1461..in1468):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): these weights look like the output transform of a
// Winograd-style 3x3 convolution -- confirm against the generator.
__m512 tmp10097 = _mm512_add_ps(in1454, in1455);
__m512 tmp10117 = _mm512_add_ps(in1462, in1463);
__m512 tmp10096 = _mm512_add_ps(in1456, in1457);
__m512 tmp10116 = _mm512_add_ps(in1464, in1465);
__m512 tmp10102 = _mm512_sub_ps(in1456, in1457);
__m512 tmp10122 = _mm512_sub_ps(in1464, in1465);
__m512 tmp10101 = _mm512_sub_ps(in1454, in1455);
__m512 tmp10121 = _mm512_sub_ps(in1462, in1463);
__m512 tmp10098 = _mm512_add_ps(in1458, in1459);
__m512 tmp10118 = _mm512_add_ps(in1466, in1467);
__m512 tmp10103 = _mm512_sub_ps(in1458, in1459);
__m512 tmp10123 = _mm512_sub_ps(in1466, in1467);
__m512 tmp10100 = _mm512_fmadd_ps(tmp10102, _mm512_set1_ps(2e+00f), tmp10101);
__m512 tmp10120 = _mm512_fmadd_ps(tmp10122, _mm512_set1_ps(2e+00f), tmp10121);
__m512 tmp10107 = _mm512_fmadd_ps(tmp10102, _mm512_set1_ps(8e+00f), tmp10101);
__m512 tmp10127 = _mm512_fmadd_ps(tmp10122, _mm512_set1_ps(8e+00f), tmp10121);
__m512 tmp10095 = _mm512_add_ps(tmp10096, tmp10097);
__m512 tmp10115 = _mm512_add_ps(tmp10116, tmp10117);
__m512 tmp10099 = _mm512_fmadd_ps(tmp10103, _mm512_set1_ps(1.6e+01f), tmp10100);
__m512 tmp10119 = _mm512_fmadd_ps(tmp10123, _mm512_set1_ps(1.6e+01f), tmp10120);
__m512 tmp10106 = _mm512_fmadd_ps(tmp10103, _mm512_set1_ps(4e+00f), tmp10107);
__m512 tmp10126 = _mm512_fmadd_ps(tmp10123, _mm512_set1_ps(4e+00f), tmp10127);
__m512 tmp10112 = _mm512_add_ps(tmp10103, tmp10101);
__m512 tmp10132 = _mm512_add_ps(tmp10123, tmp10121);
__m512 tmp10105 = _mm512_fmadd_ps(tmp10096, _mm512_set1_ps(4e+00f), tmp10097);
__m512 tmp10125 = _mm512_fmadd_ps(tmp10116, _mm512_set1_ps(4e+00f), tmp10117);
__m512 tmp10109 = _mm512_fmadd_ps(tmp10096, _mm512_set1_ps(1.6e+01f), tmp10097);
__m512 tmp10129 = _mm512_fmadd_ps(tmp10116, _mm512_set1_ps(1.6e+01f), tmp10117);
__m512 tmp10094 = _mm512_add_ps(tmp10095, in1453);
__m512 tmp10114 = _mm512_add_ps(tmp10115, in1461);
__m512 tmp10111 = _mm512_add_ps(tmp10112, in1460);
__m512 tmp10131 = _mm512_add_ps(tmp10132, in1468);
__m512 tmp10093 = _mm512_fmadd_ps(tmp10098, _mm512_set1_ps(3.2e+01f), tmp10094);
__m512 tmp10113 = _mm512_fmadd_ps(tmp10118, _mm512_set1_ps(3.2e+01f), tmp10114);
__m512 tmp10104 = _mm512_fmadd_ps(tmp10098, _mm512_set1_ps(8e+00f), tmp10105);
__m512 tmp10124 = _mm512_fmadd_ps(tmp10118, _mm512_set1_ps(8e+00f), tmp10125);
__m512 tmp10110 = _mm512_fmadd_ps(tmp10102, _mm512_set1_ps(3.2e+01f), tmp10111);
__m512 tmp10130 = _mm512_fmadd_ps(tmp10122, _mm512_set1_ps(3.2e+01f), tmp10131);
__m512 tmp10108 = _mm512_fmadd_ps(tmp10098, _mm512_set1_ps(2e+00f), tmp10109);
__m512 tmp10128 = _mm512_fmadd_ps(tmp10118, _mm512_set1_ps(2e+00f), tmp10129);
// Collect the results as 12 rows: tmp10081..tmp10086 = y0..y5 of the first
// chain, tmp10087..tmp10092 = y0..y5 of the second chain.
__m512 tmp10081 = tmp10093;
__m512 tmp10087 = tmp10113;
__m512 tmp10082 = tmp10099;
__m512 tmp10088 = tmp10119;
__m512 tmp10083 = tmp10104;
__m512 tmp10089 = tmp10124;
__m512 tmp10084 = tmp10106;
__m512 tmp10090 = tmp10126;
__m512 tmp10085 = tmp10108;
__m512 tmp10091 = tmp10128;
__m512 tmp10086 = tmp10110;
__m512 tmp10092 = tmp10130;
// Transpose the 12 rows of 16 floats into 16 column vectors.
// Step 1: interleave 32-bit elements of adjacent row pairs per 128-bit lane.
__m512 tmp10159 = _mm512_unpacklo_ps(tmp10081, tmp10082);
__m512 tmp10160 = _mm512_unpackhi_ps(tmp10081, tmp10082);
__m512 tmp10161 = _mm512_unpacklo_ps(tmp10083, tmp10084);
__m512 tmp10162 = _mm512_unpackhi_ps(tmp10083, tmp10084);
__m512 tmp10163 = _mm512_unpacklo_ps(tmp10085, tmp10086);
__m512 tmp10164 = _mm512_unpackhi_ps(tmp10085, tmp10086);
__m512 tmp10165 = _mm512_unpacklo_ps(tmp10087, tmp10088);
__m512 tmp10166 = _mm512_unpackhi_ps(tmp10087, tmp10088);
__m512 tmp10167 = _mm512_unpacklo_ps(tmp10089, tmp10090);
__m512 tmp10168 = _mm512_unpackhi_ps(tmp10089, tmp10090);
__m512 tmp10169 = _mm512_unpacklo_ps(tmp10091, tmp10092);
__m512 tmp10170 = _mm512_unpackhi_ps(tmp10091, tmp10092);
// Step 2: merge two interleaved pairs (68 = low element pairs, 238 = high
// element pairs) so each 128-bit lane holds one column across four rows.
__m512 tmp10171 = _mm512_shuffle_ps(tmp10159, tmp10161, 68);
__m512 tmp10172 = _mm512_shuffle_ps(tmp10159, tmp10161, 238);
__m512 tmp10173 = _mm512_shuffle_ps(tmp10160, tmp10162, 68);
__m512 tmp10174 = _mm512_shuffle_ps(tmp10160, tmp10162, 238);
__m512 tmp10175 = _mm512_shuffle_ps(tmp10163, tmp10165, 68);
__m512 tmp10176 = _mm512_shuffle_ps(tmp10163, tmp10165, 238);
__m512 tmp10177 = _mm512_shuffle_ps(tmp10164, tmp10166, 68);
__m512 tmp10178 = _mm512_shuffle_ps(tmp10164, tmp10166, 238);
__m512 tmp10179 = _mm512_shuffle_ps(tmp10167, tmp10169, 68);
__m512 tmp10180 = _mm512_shuffle_ps(tmp10167, tmp10169, 238);
__m512 tmp10181 = _mm512_shuffle_ps(tmp10168, tmp10170, 68);
__m512 tmp10182 = _mm512_shuffle_ps(tmp10168, tmp10170, 238);
// Step 3: select even (136) / odd (221) 128-bit lanes, pairing the rows-0..3
// vector with the rows-4..7 vector.
__m512 tmp10183 = _mm512_shuffle_f32x4(tmp10171, tmp10175, 136);
__m512 tmp10184 = _mm512_shuffle_f32x4(tmp10171, tmp10175, 221);
__m512 tmp10185 = _mm512_shuffle_f32x4(tmp10172, tmp10176, 136);
__m512 tmp10186 = _mm512_shuffle_f32x4(tmp10172, tmp10176, 221);
__m512 tmp10187 = _mm512_shuffle_f32x4(tmp10173, tmp10177, 136);
__m512 tmp10188 = _mm512_shuffle_f32x4(tmp10173, tmp10177, 221);
__m512 tmp10189 = _mm512_shuffle_f32x4(tmp10174, tmp10178, 136);
__m512 tmp10190 = _mm512_shuffle_f32x4(tmp10174, tmp10178, 221);
// There is no rows-12..15 vector, so the rows-8..11 vector is paired with
// itself; the top four lanes of the final vectors therefore repeat lanes 8..11.
__m512 tmp10191 = _mm512_shuffle_f32x4(tmp10179, tmp10179, 136);
__m512 tmp10192 = _mm512_shuffle_f32x4(tmp10179, tmp10179, 221);
__m512 tmp10193 = _mm512_shuffle_f32x4(tmp10180, tmp10180, 136);
__m512 tmp10194 = _mm512_shuffle_f32x4(tmp10180, tmp10180, 221);
__m512 tmp10195 = _mm512_shuffle_f32x4(tmp10181, tmp10181, 136);
__m512 tmp10196 = _mm512_shuffle_f32x4(tmp10181, tmp10181, 221);
__m512 tmp10197 = _mm512_shuffle_f32x4(tmp10182, tmp10182, 136);
__m512 tmp10198 = _mm512_shuffle_f32x4(tmp10182, tmp10182, 221);
// Step 4: final lane selection. Afterwards tmp10081..tmp10088 hold columns
// 0..7 and tmp10089..tmp10092, tmp10133..tmp10136 hold columns 8..15.
tmp10081 = _mm512_shuffle_f32x4(tmp10183, tmp10191, 136);
tmp10089 = _mm512_shuffle_f32x4(tmp10183, tmp10191, 221);
tmp10082 = _mm512_shuffle_f32x4(tmp10185, tmp10193, 136);
tmp10090 = _mm512_shuffle_f32x4(tmp10185, tmp10193, 221);
tmp10083 = _mm512_shuffle_f32x4(tmp10187, tmp10195, 136);
tmp10091 = _mm512_shuffle_f32x4(tmp10187, tmp10195, 221);
tmp10084 = _mm512_shuffle_f32x4(tmp10189, tmp10197, 136);
tmp10092 = _mm512_shuffle_f32x4(tmp10189, tmp10197, 221);
tmp10085 = _mm512_shuffle_f32x4(tmp10184, tmp10192, 136);
__m512 tmp10133 = _mm512_shuffle_f32x4(tmp10184, tmp10192, 221);
tmp10086 = _mm512_shuffle_f32x4(tmp10186, tmp10194, 136);
__m512 tmp10134 = _mm512_shuffle_f32x4(tmp10186, tmp10194, 221);
tmp10087 = _mm512_shuffle_f32x4(tmp10188, tmp10196, 136);
__m512 tmp10135 = _mm512_shuffle_f32x4(tmp10188, tmp10196, 221);
tmp10088 = _mm512_shuffle_f32x4(tmp10190, tmp10198, 136);
__m512 tmp10136 = _mm512_shuffle_f32x4(tmp10190, tmp10198, 221);
// Columns 7 and 15 are only needed by the y5 output, which the reduced second
// pass below does not compute; the casts mark them as deliberately unused.
(void)tmp10088;
(void)tmp10136;
// Reduced second pass: only the first two outputs of each chain are produced
// (c0..c6 = tmp10081..tmp10087 for one chain; tmp10089..tmp10092,
// tmp10133..tmp10135 for the other):
//   y0 = c0 + (c1+c2) +   (c3+c4) + 32*(c5+c6)
//   y1 =      (c1-c2) + 2*(c3-c4) + 16*(c5-c6)
__m512 tmp10141 = _mm512_add_ps(tmp10082, tmp10083);
__m512 tmp10152 = _mm512_add_ps(tmp10090, tmp10091);
__m512 tmp10140 = _mm512_add_ps(tmp10084, tmp10085);
__m512 tmp10151 = _mm512_add_ps(tmp10092, tmp10133);
__m512 tmp10146 = _mm512_sub_ps(tmp10084, tmp10085);
__m512 tmp10157 = _mm512_sub_ps(tmp10092, tmp10133);
__m512 tmp10145 = _mm512_sub_ps(tmp10082, tmp10083);
__m512 tmp10156 = _mm512_sub_ps(tmp10090, tmp10091);
__m512 tmp10142 = _mm512_add_ps(tmp10086, tmp10087);
__m512 tmp10153 = _mm512_add_ps(tmp10134, tmp10135);
__m512 tmp10147 = _mm512_sub_ps(tmp10086, tmp10087);
__m512 tmp10158 = _mm512_sub_ps(tmp10134, tmp10135);
__m512 tmp10144 = _mm512_fmadd_ps(tmp10146, _mm512_set1_ps(2e+00f), tmp10145);
__m512 tmp10155 = _mm512_fmadd_ps(tmp10157, _mm512_set1_ps(2e+00f), tmp10156);
__m512 tmp10139 = _mm512_add_ps(tmp10140, tmp10141);
__m512 tmp10150 = _mm512_add_ps(tmp10151, tmp10152);
__m512 tmp10143 = _mm512_fmadd_ps(tmp10147, _mm512_set1_ps(1.6e+01f), tmp10144);
__m512 tmp10154 = _mm512_fmadd_ps(tmp10158, _mm512_set1_ps(1.6e+01f), tmp10155);
__m512 tmp10138 = _mm512_add_ps(tmp10139, tmp10081);
__m512 tmp10149 = _mm512_add_ps(tmp10150, tmp10089);
__m512 tmp10137 = _mm512_fmadd_ps(tmp10142, _mm512_set1_ps(3.2e+01f), tmp10138);
__m512 tmp10148 = _mm512_fmadd_ps(tmp10153, _mm512_set1_ps(3.2e+01f), tmp10149);
// out1359/out1360 are y0/y1 of the first chain, out1361/out1362 are y0/y1 of
// the second chain.
__m512 out1359 = tmp10137;
__m512 out1361 = tmp10148;
__m512 out1360 = tmp10143;
__m512 out1362 = tmp10154;
// ReLU: replace every negative element with zero.
out1359 = _mm512_max_ps(_mm512_setzero_ps(), out1359);
out1361 = _mm512_max_ps(_mm512_setzero_ps(), out1361);
out1360 = _mm512_max_ps(_mm512_setzero_ps(), out1360);
out1362 = _mm512_max_ps(_mm512_setzero_ps(), out1362);
// Masked stores into datPtr13 (mask 4095 = lanes 0..11). y0 goes to constant
// offsets 0 and 48, y1 to 224 and 272.
// NOTE(review): storing two outputs instead of six presumably clips the tile
// at the last rows of the output (toH36 starts at 54) -- confirm.
_mm512_mask_storeu_ps(datPtr13+0+806912*i28+224*toH36+4*toW36+50432*k97+25216*l36, 4095, out1359);
_mm512_mask_storeu_ps(datPtr13+48+806912*i28+224*toH36+4*toW36+50432*k97+25216*l36, 4095, out1361);
_mm512_mask_storeu_ps(datPtr13+224+806912*i28+224*toH36+4*toW36+50432*k97+25216*l36, 4095, out1360);
_mm512_mask_storeu_ps(datPtr13+272+806912*i28+224*toH36+4*toW36+50432*k97+25216*l36, 4095, out1362);
__m512 sf753 = _mm512_loadu_ps(sfPtr7+256+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf754 = _mm512_loadu_ps(sfPtr7+384+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1469 = _mm512_shuffle_f32x4(sf753, sf754, 68);
__m512 in1470 = _mm512_shuffle_f32x4(sf753, sf754, 238);
__m512 sf755 = _mm512_loadu_ps(sfPtr7+320+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf756 = _mm512_loadu_ps(sfPtr7+448+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1477 = _mm512_shuffle_f32x4(sf755, sf756, 68);
__m512 in1478 = _mm512_shuffle_f32x4(sf755, sf756, 238);
__m512 sf757 = _mm512_loadu_ps(sfPtr7+409856+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf758 = _mm512_loadu_ps(sfPtr7+409984+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1471 = _mm512_shuffle_f32x4(sf757, sf758, 68);
__m512 in1472 = _mm512_shuffle_f32x4(sf757, sf758, 238);
__m512 sf759 = _mm512_loadu_ps(sfPtr7+409920+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf760 = _mm512_loadu_ps(sfPtr7+410048+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1479 = _mm512_shuffle_f32x4(sf759, sf760, 68);
__m512 in1480 = _mm512_shuffle_f32x4(sf759, sf760, 238);
__m512 sf761 = _mm512_loadu_ps(sfPtr7+819456+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf762 = _mm512_loadu_ps(sfPtr7+819584+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1473 = _mm512_shuffle_f32x4(sf761, sf762, 68);
__m512 in1474 = _mm512_shuffle_f32x4(sf761, sf762, 238);
__m512 sf763 = _mm512_loadu_ps(sfPtr7+819520+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf764 = _mm512_loadu_ps(sfPtr7+819648+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1481 = _mm512_shuffle_f32x4(sf763, sf764, 68);
__m512 in1482 = _mm512_shuffle_f32x4(sf763, sf764, 238);
__m512 sf765 = _mm512_loadu_ps(sfPtr7+1229056+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf766 = _mm512_loadu_ps(sfPtr7+1229184+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1475 = _mm512_shuffle_f32x4(sf765, sf766, 68);
__m512 in1476 = _mm512_shuffle_f32x4(sf765, sf766, 238);
__m512 sf767 = _mm512_loadu_ps(sfPtr7+1229120+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf768 = _mm512_loadu_ps(sfPtr7+1229248+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1483 = _mm512_shuffle_f32x4(sf767, sf768, 68);
__m512 in1484 = _mm512_shuffle_f32x4(sf767, sf768, 238);
__m512 tmp10215 = _mm512_add_ps(in1470, in1471);
__m512 tmp10235 = _mm512_add_ps(in1478, in1479);
__m512 tmp10214 = _mm512_add_ps(in1472, in1473);
__m512 tmp10234 = _mm512_add_ps(in1480, in1481);
__m512 tmp10220 = _mm512_sub_ps(in1472, in1473);
__m512 tmp10240 = _mm512_sub_ps(in1480, in1481);
__m512 tmp10219 = _mm512_sub_ps(in1470, in1471);
__m512 tmp10239 = _mm512_sub_ps(in1478, in1479);
__m512 tmp10216 = _mm512_add_ps(in1474, in1475);
__m512 tmp10236 = _mm512_add_ps(in1482, in1483);
__m512 tmp10221 = _mm512_sub_ps(in1474, in1475);
__m512 tmp10241 = _mm512_sub_ps(in1482, in1483);
__m512 tmp10218 = _mm512_fmadd_ps(tmp10220, _mm512_set1_ps(2e+00f), tmp10219);
__m512 tmp10238 = _mm512_fmadd_ps(tmp10240, _mm512_set1_ps(2e+00f), tmp10239);
__m512 tmp10225 = _mm512_fmadd_ps(tmp10220, _mm512_set1_ps(8e+00f), tmp10219);
__m512 tmp10245 = _mm512_fmadd_ps(tmp10240, _mm512_set1_ps(8e+00f), tmp10239);
__m512 tmp10213 = _mm512_add_ps(tmp10214, tmp10215);
__m512 tmp10233 = _mm512_add_ps(tmp10234, tmp10235);
__m512 tmp10217 = _mm512_fmadd_ps(tmp10221, _mm512_set1_ps(1.6e+01f), tmp10218);
__m512 tmp10237 = _mm512_fmadd_ps(tmp10241, _mm512_set1_ps(1.6e+01f), tmp10238);
__m512 tmp10224 = _mm512_fmadd_ps(tmp10221, _mm512_set1_ps(4e+00f), tmp10225);
__m512 tmp10244 = _mm512_fmadd_ps(tmp10241, _mm512_set1_ps(4e+00f), tmp10245);
__m512 tmp10230 = _mm512_add_ps(tmp10221, tmp10219);
__m512 tmp10250 = _mm512_add_ps(tmp10241, tmp10239);
__m512 tmp10223 = _mm512_fmadd_ps(tmp10214, _mm512_set1_ps(4e+00f), tmp10215);
__m512 tmp10243 = _mm512_fmadd_ps(tmp10234, _mm512_set1_ps(4e+00f), tmp10235);
__m512 tmp10227 = _mm512_fmadd_ps(tmp10214, _mm512_set1_ps(1.6e+01f), tmp10215);
__m512 tmp10247 = _mm512_fmadd_ps(tmp10234, _mm512_set1_ps(1.6e+01f), tmp10235);
__m512 tmp10212 = _mm512_add_ps(tmp10213, in1469);
__m512 tmp10232 = _mm512_add_ps(tmp10233, in1477);
__m512 tmp10229 = _mm512_add_ps(tmp10230, in1476);
__m512 tmp10249 = _mm512_add_ps(tmp10250, in1484);
__m512 tmp10211 = _mm512_fmadd_ps(tmp10216, _mm512_set1_ps(3.2e+01f), tmp10212);
__m512 tmp10231 = _mm512_fmadd_ps(tmp10236, _mm512_set1_ps(3.2e+01f), tmp10232);
__m512 tmp10222 = _mm512_fmadd_ps(tmp10216, _mm512_set1_ps(8e+00f), tmp10223);
__m512 tmp10242 = _mm512_fmadd_ps(tmp10236, _mm512_set1_ps(8e+00f), tmp10243);
__m512 tmp10228 = _mm512_fmadd_ps(tmp10220, _mm512_set1_ps(3.2e+01f), tmp10229);
__m512 tmp10248 = _mm512_fmadd_ps(tmp10240, _mm512_set1_ps(3.2e+01f), tmp10249);
__m512 tmp10226 = _mm512_fmadd_ps(tmp10216, _mm512_set1_ps(2e+00f), tmp10227);
__m512 tmp10246 = _mm512_fmadd_ps(tmp10236, _mm512_set1_ps(2e+00f), tmp10247);
__m512 tmp10199 = tmp10211;
__m512 tmp10205 = tmp10231;
__m512 tmp10200 = tmp10217;
__m512 tmp10206 = tmp10237;
__m512 tmp10201 = tmp10222;
__m512 tmp10207 = tmp10242;
__m512 tmp10202 = tmp10224;
__m512 tmp10208 = tmp10244;
__m512 tmp10203 = tmp10226;
__m512 tmp10209 = tmp10246;
__m512 tmp10204 = tmp10228;
__m512 tmp10210 = tmp10248;
__m512 tmp10277 = _mm512_unpacklo_ps(tmp10199, tmp10200);
__m512 tmp10278 = _mm512_unpackhi_ps(tmp10199, tmp10200);
__m512 tmp10279 = _mm512_unpacklo_ps(tmp10201, tmp10202);
__m512 tmp10280 = _mm512_unpackhi_ps(tmp10201, tmp10202);
__m512 tmp10281 = _mm512_unpacklo_ps(tmp10203, tmp10204);
__m512 tmp10282 = _mm512_unpackhi_ps(tmp10203, tmp10204);
__m512 tmp10283 = _mm512_unpacklo_ps(tmp10205, tmp10206);
__m512 tmp10284 = _mm512_unpackhi_ps(tmp10205, tmp10206);
__m512 tmp10285 = _mm512_unpacklo_ps(tmp10207, tmp10208);
__m512 tmp10286 = _mm512_unpackhi_ps(tmp10207, tmp10208);
__m512 tmp10287 = _mm512_unpacklo_ps(tmp10209, tmp10210);
__m512 tmp10288 = _mm512_unpackhi_ps(tmp10209, tmp10210);
__m512 tmp10289 = _mm512_shuffle_ps(tmp10277, tmp10279, 68);
__m512 tmp10290 = _mm512_shuffle_ps(tmp10277, tmp10279, 238);
__m512 tmp10291 = _mm512_shuffle_ps(tmp10278, tmp10280, 68);
__m512 tmp10292 = _mm512_shuffle_ps(tmp10278, tmp10280, 238);
__m512 tmp10293 = _mm512_shuffle_ps(tmp10281, tmp10283, 68);
__m512 tmp10294 = _mm512_shuffle_ps(tmp10281, tmp10283, 238);
__m512 tmp10295 = _mm512_shuffle_ps(tmp10282, tmp10284, 68);
__m512 tmp10296 = _mm512_shuffle_ps(tmp10282, tmp10284, 238);
__m512 tmp10297 = _mm512_shuffle_ps(tmp10285, tmp10287, 68);
__m512 tmp10298 = _mm512_shuffle_ps(tmp10285, tmp10287, 238);
__m512 tmp10299 = _mm512_shuffle_ps(tmp10286, tmp10288, 68);
__m512 tmp10300 = _mm512_shuffle_ps(tmp10286, tmp10288, 238);
__m512 tmp10301 = _mm512_shuffle_f32x4(tmp10289, tmp10293, 136);
__m512 tmp10302 = _mm512_shuffle_f32x4(tmp10289, tmp10293, 221);
__m512 tmp10303 = _mm512_shuffle_f32x4(tmp10290, tmp10294, 136);
__m512 tmp10304 = _mm512_shuffle_f32x4(tmp10290, tmp10294, 221);
__m512 tmp10305 = _mm512_shuffle_f32x4(tmp10291, tmp10295, 136);
__m512 tmp10306 = _mm512_shuffle_f32x4(tmp10291, tmp10295, 221);
__m512 tmp10307 = _mm512_shuffle_f32x4(tmp10292, tmp10296, 136);
__m512 tmp10308 = _mm512_shuffle_f32x4(tmp10292, tmp10296, 221);
__m512 tmp10309 = _mm512_shuffle_f32x4(tmp10297, tmp10297, 136);
__m512 tmp10310 = _mm512_shuffle_f32x4(tmp10297, tmp10297, 221);
__m512 tmp10311 = _mm512_shuffle_f32x4(tmp10298, tmp10298, 136);
__m512 tmp10312 = _mm512_shuffle_f32x4(tmp10298, tmp10298, 221);
__m512 tmp10313 = _mm512_shuffle_f32x4(tmp10299, tmp10299, 136);
__m512 tmp10314 = _mm512_shuffle_f32x4(tmp10299, tmp10299, 221);
__m512 tmp10315 = _mm512_shuffle_f32x4(tmp10300, tmp10300, 136);
__m512 tmp10316 = _mm512_shuffle_f32x4(tmp10300, tmp10300, 221);
tmp10199 = _mm512_shuffle_f32x4(tmp10301, tmp10309, 136);
tmp10207 = _mm512_shuffle_f32x4(tmp10301, tmp10309, 221);
tmp10200 = _mm512_shuffle_f32x4(tmp10303, tmp10311, 136);
tmp10208 = _mm512_shuffle_f32x4(tmp10303, tmp10311, 221);
tmp10201 = _mm512_shuffle_f32x4(tmp10305, tmp10313, 136);
tmp10209 = _mm512_shuffle_f32x4(tmp10305, tmp10313, 221);
tmp10202 = _mm512_shuffle_f32x4(tmp10307, tmp10315, 136);
tmp10210 = _mm512_shuffle_f32x4(tmp10307, tmp10315, 221);
tmp10203 = _mm512_shuffle_f32x4(tmp10302, tmp10310, 136);
__m512 tmp10251 = _mm512_shuffle_f32x4(tmp10302, tmp10310, 221);
tmp10204 = _mm512_shuffle_f32x4(tmp10304, tmp10312, 136);
__m512 tmp10252 = _mm512_shuffle_f32x4(tmp10304, tmp10312, 221);
tmp10205 = _mm512_shuffle_f32x4(tmp10306, tmp10314, 136);
__m512 tmp10253 = _mm512_shuffle_f32x4(tmp10306, tmp10314, 221);
tmp10206 = _mm512_shuffle_f32x4(tmp10308, tmp10316, 136);
__m512 tmp10254 = _mm512_shuffle_f32x4(tmp10308, tmp10316, 221);
(void)tmp10206;
(void)tmp10254;
__m512 tmp10259 = _mm512_add_ps(tmp10200, tmp10201);
__m512 tmp10270 = _mm512_add_ps(tmp10208, tmp10209);
__m512 tmp10258 = _mm512_add_ps(tmp10202, tmp10203);
__m512 tmp10269 = _mm512_add_ps(tmp10210, tmp10251);
__m512 tmp10264 = _mm512_sub_ps(tmp10202, tmp10203);
__m512 tmp10275 = _mm512_sub_ps(tmp10210, tmp10251);
__m512 tmp10263 = _mm512_sub_ps(tmp10200, tmp10201);
__m512 tmp10274 = _mm512_sub_ps(tmp10208, tmp10209);
__m512 tmp10260 = _mm512_add_ps(tmp10204, tmp10205);
__m512 tmp10271 = _mm512_add_ps(tmp10252, tmp10253);
__m512 tmp10265 = _mm512_sub_ps(tmp10204, tmp10205);
__m512 tmp10276 = _mm512_sub_ps(tmp10252, tmp10253);
__m512 tmp10262 = _mm512_fmadd_ps(tmp10264, _mm512_set1_ps(2e+00f), tmp10263);
__m512 tmp10273 = _mm512_fmadd_ps(tmp10275, _mm512_set1_ps(2e+00f), tmp10274);
__m512 tmp10257 = _mm512_add_ps(tmp10258, tmp10259);
__m512 tmp10268 = _mm512_add_ps(tmp10269, tmp10270);
__m512 tmp10261 = _mm512_fmadd_ps(tmp10265, _mm512_set1_ps(1.6e+01f), tmp10262);
__m512 tmp10272 = _mm512_fmadd_ps(tmp10276, _mm512_set1_ps(1.6e+01f), tmp10273);
__m512 tmp10256 = _mm512_add_ps(tmp10257, tmp10199);
__m512 tmp10267 = _mm512_add_ps(tmp10268, tmp10207);
__m512 tmp10255 = _mm512_fmadd_ps(tmp10260, _mm512_set1_ps(3.2e+01f), tmp10256);
__m512 tmp10266 = _mm512_fmadd_ps(tmp10271, _mm512_set1_ps(3.2e+01f), tmp10267);
__m512 out1363 = tmp10255;
__m512 out1365 = tmp10266;
__m512 out1364 = tmp10261;
__m512 out1366 = tmp10272;
out1363 = _mm512_max_ps(_mm512_setzero_ps(), out1363);
out1365 = _mm512_max_ps(_mm512_setzero_ps(), out1365);
out1364 = _mm512_max_ps(_mm512_setzero_ps(), out1364);
out1366 = _mm512_max_ps(_mm512_setzero_ps(), out1366);
_mm512_mask_storeu_ps(datPtr13+96+806912*i28+224*toH36+4*toW36+50432*k97+25216*l36, 4095, out1363);
_mm512_mask_storeu_ps(datPtr13+12608+806912*i28+224*toH36+4*toW36+50432*k97+25216*l36, 4095, out1365);
_mm512_mask_storeu_ps(datPtr13+320+806912*i28+224*toH36+4*toW36+50432*k97+25216*l36, 4095, out1364);
_mm512_mask_storeu_ps(datPtr13+12832+806912*i28+224*toH36+4*toW36+50432*k97+25216*l36, 4095, out1366);
__m512 sf769 = _mm512_loadu_ps(sfPtr7+512+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf770 = _mm512_loadu_ps(sfPtr7+640+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1485 = _mm512_shuffle_f32x4(sf769, sf770, 68);
__m512 in1486 = _mm512_shuffle_f32x4(sf769, sf770, 238);
__m512 sf771 = _mm512_loadu_ps(sfPtr7+576+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf772 = _mm512_loadu_ps(sfPtr7+704+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1493 = _mm512_shuffle_f32x4(sf771, sf772, 68);
__m512 in1494 = _mm512_shuffle_f32x4(sf771, sf772, 238);
__m512 sf773 = _mm512_loadu_ps(sfPtr7+410112+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf774 = _mm512_loadu_ps(sfPtr7+410240+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1487 = _mm512_shuffle_f32x4(sf773, sf774, 68);
__m512 in1488 = _mm512_shuffle_f32x4(sf773, sf774, 238);
__m512 sf775 = _mm512_loadu_ps(sfPtr7+410176+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf776 = _mm512_loadu_ps(sfPtr7+410304+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1495 = _mm512_shuffle_f32x4(sf775, sf776, 68);
__m512 in1496 = _mm512_shuffle_f32x4(sf775, sf776, 238);
__m512 sf777 = _mm512_loadu_ps(sfPtr7+819712+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf778 = _mm512_loadu_ps(sfPtr7+819840+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1489 = _mm512_shuffle_f32x4(sf777, sf778, 68);
__m512 in1490 = _mm512_shuffle_f32x4(sf777, sf778, 238);
__m512 sf779 = _mm512_loadu_ps(sfPtr7+819776+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf780 = _mm512_loadu_ps(sfPtr7+819904+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1497 = _mm512_shuffle_f32x4(sf779, sf780, 68);
__m512 in1498 = _mm512_shuffle_f32x4(sf779, sf780, 238);
__m512 sf781 = _mm512_loadu_ps(sfPtr7+1229312+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf782 = _mm512_loadu_ps(sfPtr7+1229440+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1491 = _mm512_shuffle_f32x4(sf781, sf782, 68);
__m512 in1492 = _mm512_shuffle_f32x4(sf781, sf782, 238);
__m512 sf783 = _mm512_loadu_ps(sfPtr7+1229376+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 sf784 = _mm512_loadu_ps(sfPtr7+1229504+1638400*i28+24576*j23+1536*k97+768*l36);
__m512 in1499 = _mm512_shuffle_f32x4(sf783, sf784, 68);
__m512 in1500 = _mm512_shuffle_f32x4(sf783, sf784, 238);
__m512 tmp10333 = _mm512_add_ps(in1486, in1487);
__m512 tmp10353 = _mm512_add_ps(in1494, in1495);
__m512 tmp10332 = _mm512_add_ps(in1488, in1489);
__m512 tmp10352 = _mm512_add_ps(in1496, in1497);
__m512 tmp10338 = _mm512_sub_ps(in1488, in1489);
__m512 tmp10358 = _mm512_sub_ps(in1496, in1497);
__m512 tmp10337 = _mm512_sub_ps(in1486, in1487);
__m512 tmp10357 = _mm512_sub_ps(in1494, in1495);
__m512 tmp10334 = _mm512_add_ps(in1490, in1491);
__m512 tmp10354 = _mm512_add_ps(in1498, in1499);
__m512 tmp10339 = _mm512_sub_ps(in1490, in1491);
__m512 tmp10359 = _mm512_sub_ps(in1498, in1499);
__m512 tmp10336 = _mm512_fmadd_ps(tmp10338, _mm512_set1_ps(2e+00f), tmp10337);
__m512 tmp10356 = _mm512_fmadd_ps(tmp10358, _mm512_set1_ps(2e+00f), tmp10357);
__m512 tmp10343 = _mm512_fmadd_ps(tmp10338, _mm512_set1_ps(8e+00f), tmp10337);
__m512 tmp10363 = _mm512_fmadd_ps(tmp10358, _mm512_set1_ps(8e+00f), tmp10357);
__m512 tmp10331 = _mm512_add_ps(tmp10332, tmp10333);
__m512 tmp10351 = _mm512_add_ps(tmp10352, tmp10353);
__m512 tmp10335 = _mm512_fmadd_ps(tmp10339, _mm512_set1_ps(1.6e+01f), tmp10336);
__m512 tmp10355 = _mm512_fmadd_ps(tmp10359, _mm512_set1_ps(1.6e+01f), tmp10356);
__m512 tmp10342 = _mm512_fmadd_ps(tmp10339, _mm512_set1_ps(4e+00f), tmp10343);
__m512 tmp10362 = _mm512_fmadd_ps(tmp10359, _mm512_set1_ps(4e+00f), tmp10363);
__m512 tmp10348 = _mm512_add_ps(tmp10339, tmp10337);
__m512 tmp10368 = _mm512_add_ps(tmp10359, tmp10357);
__m512 tmp10341 = _mm512_fmadd_ps(tmp10332, _mm512_set1_ps(4e+00f), tmp10333);
__m512 tmp10361 = _mm512_fmadd_ps(tmp10352, _mm512_set1_ps(4e+00f), tmp10353);
__m512 tmp10345 = _mm512_fmadd_ps(tmp10332, _mm512_set1_ps(1.6e+01f), tmp10333);
__m512 tmp10365 = _mm512_fmadd_ps(tmp10352, _mm512_set1_ps(1.6e+01f), tmp10353);
__m512 tmp10330 = _mm512_add_ps(tmp10331, in1485);
__m512 tmp10350 = _mm512_add_ps(tmp10351, in1493);
__m512 tmp10347 = _mm512_add_ps(tmp10348, in1492);
__m512 tmp10367 = _mm512_add_ps(tmp10368, in1500);
__m512 tmp10329 = _mm512_fmadd_ps(tmp10334, _mm512_set1_ps(3.2e+01f), tmp10330);
__m512 tmp10349 = _mm512_fmadd_ps(tmp10354, _mm512_set1_ps(3.2e+01f), tmp10350);
__m512 tmp10340 = _mm512_fmadd_ps(tmp10334, _mm512_set1_ps(8e+00f), tmp10341);
__m512 tmp10360 = _mm512_fmadd_ps(tmp10354, _mm512_set1_ps(8e+00f), tmp10361);
__m512 tmp10346 = _mm512_fmadd_ps(tmp10338, _mm512_set1_ps(3.2e+01f), tmp10347);
__m512 tmp10366 = _mm512_fmadd_ps(tmp10358, _mm512_set1_ps(3.2e+01f), tmp10367);
__m512 tmp10344 = _mm512_fmadd_ps(tmp10334, _mm512_set1_ps(2e+00f), tmp10345);
__m512 tmp10364 = _mm512_fmadd_ps(tmp10354, _mm512_set1_ps(2e+00f), tmp10365);
__m512 tmp10317 = tmp10329;
__m512 tmp10323 = tmp10349;
__m512 tmp10318 = tmp10335;
__m512 tmp10324 = tmp10355;
__m512 tmp10319 = tmp10340;
__m512 tmp10325 = tmp10360;
__m512 tmp10320 = tmp10342;
__m512 tmp10326 = tmp10362;
__m512 tmp10321 = tmp10344;
__m512 tmp10327 = tmp10364;
__m512 tmp10322 = tmp10346;
__m512 tmp10328 = tmp10366;
__m512 tmp10395 = _mm512_unpacklo_ps(tmp10317, tmp10318);
__m512 tmp10396 = _mm512_unpackhi_ps(tmp10317, tmp10318);
__m512 tmp10397 = _mm512_unpacklo_ps(tmp10319, tmp10320);
__m512 tmp10398 = _mm512_unpackhi_ps(tmp10319, tmp10320);
__m512 tmp10399 = _mm512_unpacklo_ps(tmp10321, tmp10322);
__m512 tmp10400 = _mm512_unpackhi_ps(tmp10321, tmp10322);
__m512 tmp10401 = _mm512_unpacklo_ps(tmp10323, tmp10324);
__m512 tmp10402 = _mm512_unpackhi_ps(tmp10323, tmp10324);
__m512 tmp10403 = _mm512_unpacklo_ps(tmp10325, tmp10326);
__m512 tmp10404 = _mm512_unpackhi_ps(tmp10325, tmp10326);
__m512 tmp10405 = _mm512_unpacklo_ps(tmp10327, tmp10328);
__m512 tmp10406 = _mm512_unpackhi_ps(tmp10327, tmp10328);
__m512 tmp10407 = _mm512_shuffle_ps(tmp10395, tmp10397, 68);
__m512 tmp10408 = _mm512_shuffle_ps(tmp10395, tmp10397, 238);
__m512 tmp10409 = _mm512_shuffle_ps(tmp10396, tmp10398, 68);
__m512 tmp10410 = _mm512_shuffle_ps(tmp10396, tmp10398, 238);
__m512 tmp10411 = _mm512_shuffle_ps(tmp10399, tmp10401, 68);
__m512 tmp10412 = _mm512_shuffle_ps(tmp10399, tmp10401, 238);
__m512 tmp10413 = _mm512_shuffle_ps(tmp10400, tmp10402, 68);
__m512 tmp10414 = _mm512_shuffle_ps(tmp10400, tmp10402, 238);
__m512 tmp10415 = _mm512_shuffle_ps(tmp10403, tmp10405, 68);
__m512 tmp10416 = _mm512_shuffle_ps(tmp10403, tmp10405, 238);
__m512 tmp10417 = _mm512_shuffle_ps(tmp10404, tmp10406, 68);
__m512 tmp10418 = _mm512_shuffle_ps(tmp10404, tmp10406, 238);
__m512 tmp10419 = _mm512_shuffle_f32x4(tmp10407, tmp10411, 136);
__m512 tmp10420 = _mm512_shuffle_f32x4(tmp10407, tmp10411, 221);
__m512 tmp10421 = _mm512_shuffle_f32x4(tmp10408, tmp10412, 136);
__m512 tmp10422 = _mm512_shuffle_f32x4(tmp10408, tmp10412, 221);
__m512 tmp10423 = _mm512_shuffle_f32x4(tmp10409, tmp10413, 136);
__m512 tmp10424 = _mm512_shuffle_f32x4(tmp10409, tmp10413, 221);
__m512 tmp10425 = _mm512_shuffle_f32x4(tmp10410, tmp10414, 136);
__m512 tmp10426 = _mm512_shuffle_f32x4(tmp10410, tmp10414, 221);
__m512 tmp10427 = _mm512_shuffle_f32x4(tmp10415, tmp10415, 136);
__m512 tmp10428 = _mm512_shuffle_f32x4(tmp10415, tmp10415, 221);
__m512 tmp10429 = _mm512_shuffle_f32x4(tmp10416, tmp10416, 136);
__m512 tmp10430 = _mm512_shuffle_f32x4(tmp10416, tmp10416, 221);
__m512 tmp10431 = _mm512_shuffle_f32x4(tmp10417, tmp10417, 136);
__m512 tmp10432 = _mm512_shuffle_f32x4(tmp10417, tmp10417, 221);
__m512 tmp10433 = _mm512_shuffle_f32x4(tmp10418, tmp10418, 136);
__m512 tmp10434 = _mm512_shuffle_f32x4(tmp10418, tmp10418, 221);
tmp10317 = _mm512_shuffle_f32x4(tmp10419, tmp10427, 136);
tmp10325 = _mm512_shuffle_f32x4(tmp10419, tmp10427, 221);
tmp10318 = _mm512_shuffle_f32x4(tmp10421, tmp10429, 136);
tmp10326 = _mm512_shuffle_f32x4(tmp10421, tmp10429, 221);
tmp10319 = _mm512_shuffle_f32x4(tmp10423, tmp10431, 136);
tmp10327 = _mm512_shuffle_f32x4(tmp10423, tmp10431, 221);
tmp10320 = _mm512_shuffle_f32x4(tmp10425, tmp10433, 136);
tmp10328 = _mm512_shuffle_f32x4(tmp10425, tmp10433, 221);
tmp10321 = _mm512_shuffle_f32x4(tmp10420, tmp10428, 136);
__m512 tmp10369 = _mm512_shuffle_f32x4(tmp10420, tmp10428, 221);
tmp10322 = _mm512_shuffle_f32x4(tmp10422, tmp10430, 136);
__m512 tmp10370 = _mm512_shuffle_f32x4(tmp10422, tmp10430, 221);
tmp10323 = _mm512_shuffle_f32x4(tmp10424, tmp10432, 136);
__m512 tmp10371 = _mm512_shuffle_f32x4(tmp10424, tmp10432, 221);
tmp10324 = _mm512_shuffle_f32x4(tmp10426, tmp10434, 136);
__m512 tmp10372 = _mm512_shuffle_f32x4(tmp10426, tmp10434, 221);
(void)tmp10324;
(void)tmp10372;
__m512 tmp10377 = _mm512_add_ps(tmp10318, tmp10319);
__m512 tmp10388 = _mm512_add_ps(tmp10326, tmp10327);
__m512 tmp10376 = _mm512_add_ps(tmp10320, tmp10321);
__m512 tmp10387 = _mm512_add_ps(tmp10328, tmp10369);
__m512 tmp10382 = _mm512_sub_ps(tmp10320, tmp10321);
__m512 tmp10393 = _mm512_sub_ps(tmp10328, tmp10369);
__m512 tmp10381 = _mm512_sub_ps(tmp10318, tmp10319);
__m512 tmp10392 = _mm512_sub_ps(tmp10326, tmp10327);
__m512 tmp10378 = _mm512_add_ps(tmp10322, tmp10323);
__m512 tmp10389 = _mm512_add_ps(tmp10370, tmp10371);
__m512 tmp10383 = _mm512_sub_ps(tmp10322, tmp10323);
__m512 tmp10394 = _mm512_sub_ps(tmp10370, tmp10371);
__m512 tmp10380 = _mm512_fmadd_ps(tmp10382, _mm512_set1_ps(2e+00f), tmp10381);
__m512 tmp10391 = _mm512_fmadd_ps(tmp10393, _mm512_set1_ps(2e+00f), tmp10392);
__m512 tmp10375 = _mm512_add_ps(tmp10376, tmp10377);
__m512 tmp10386 = _mm512_add_ps(tmp10387, tmp10388);
__m512 tmp10379 = _mm512_fmadd_ps(tmp10383, _mm512_set1_ps(1.6e+01f), tmp10380);
__m512 tmp10390 = _mm512_fmadd_ps(tmp10394, _mm512_set1_ps(1.6e+01f), tmp10391);
__m512 tmp10374 = _mm512_add_ps(tmp10375, tmp10317);
__m512 tmp10385 = _mm512_add_ps(tmp10386, tmp10325);
__m512 tmp10373 = _mm512_fmadd_ps(tmp10378, _mm512_set1_ps(3.2e+01f), tmp10374);
__m512 tmp10384 = _mm512_fmadd_ps(tmp10389, _mm512_set1_ps(3.2e+01f), tmp10385);
__m512 out1367 = tmp10373;
__m512 out1369 = tmp10384;
__m512 out1368 = tmp10379;
__m512 out1370 = tmp10390;
out1367 = _mm512_max_ps(_mm512_setzero_ps(), out1367);
out1369 = _mm512_max_ps(_mm512_setzero_ps(), out1369);
out1368 = _mm512_max_ps(_mm512_setzero_ps(), out1368);
out1370 = _mm512_max_ps(_mm512_setzero_ps(), out1370);
_mm512_mask_storeu_ps(datPtr13+12656+806912*i28+224*toH36+4*toW36+50432*k97+25216*l36, 4095, out1367);
_mm512_mask_storeu_ps(datPtr13+12704+806912*i28+224*toH36+4*toW36+50432*k97+25216*l36, 4095, out1369);
_mm512_mask_storeu_ps(datPtr13+12880+806912*i28+224*toH36+4*toW36+50432*k97+25216*l36, 4095, out1368);
_mm512_mask_storeu_ps(datPtr13+12928+806912*i28+224*toH36+4*toW36+50432*k97+25216*l36, 4095, out1370);
}
}
if (j23 >= last6) return;
++j23;
rel18 = 1;
}
ptrdiff_t toH37 = base18+0;
ptrdiff_t toW37 = 36;
ptrdiff_t k98 = 16*w46;
for (; k98 != 16; ++k98) {
ptrdiff_t l37 = 0;
for (; l37 != 4; ++l37) {
__m512 sf785 = _mm512_loadu_ps(sfPtr7+0+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 sf786 = _mm512_loadu_ps(sfPtr7+128+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 in1501 = _mm512_shuffle_f32x4(sf785, sf786, 68);
__m512 in1502 = _mm512_shuffle_f32x4(sf785, sf786, 238);
__m512 sf787 = _mm512_loadu_ps(sfPtr7+64+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 sf788 = _mm512_loadu_ps(sfPtr7+192+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 in1509 = _mm512_shuffle_f32x4(sf787, sf788, 68);
__m512 in1510 = _mm512_shuffle_f32x4(sf787, sf788, 238);
__m512 sf789 = _mm512_loadu_ps(sfPtr7+409600+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 sf790 = _mm512_loadu_ps(sfPtr7+409728+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 in1503 = _mm512_shuffle_f32x4(sf789, sf790, 68);
__m512 in1504 = _mm512_shuffle_f32x4(sf789, sf790, 238);
__m512 sf791 = _mm512_loadu_ps(sfPtr7+409664+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 sf792 = _mm512_loadu_ps(sfPtr7+409792+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 in1511 = _mm512_shuffle_f32x4(sf791, sf792, 68);
__m512 in1512 = _mm512_shuffle_f32x4(sf791, sf792, 238);
__m512 sf793 = _mm512_loadu_ps(sfPtr7+819200+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 sf794 = _mm512_loadu_ps(sfPtr7+819328+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 in1505 = _mm512_shuffle_f32x4(sf793, sf794, 68);
__m512 in1506 = _mm512_shuffle_f32x4(sf793, sf794, 238);
__m512 sf795 = _mm512_loadu_ps(sfPtr7+819264+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 sf796 = _mm512_loadu_ps(sfPtr7+819392+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 in1513 = _mm512_shuffle_f32x4(sf795, sf796, 68);
__m512 in1514 = _mm512_shuffle_f32x4(sf795, sf796, 238);
__m512 sf797 = _mm512_loadu_ps(sfPtr7+1228800+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 sf798 = _mm512_loadu_ps(sfPtr7+1228928+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 in1507 = _mm512_shuffle_f32x4(sf797, sf798, 68);
__m512 in1508 = _mm512_shuffle_f32x4(sf797, sf798, 238);
__m512 sf799 = _mm512_loadu_ps(sfPtr7+1228864+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 sf800 = _mm512_loadu_ps(sfPtr7+1228992+1638400*i28+24576*j23+1024*k98+256*l37);
__m512 in1515 = _mm512_shuffle_f32x4(sf799, sf800, 68);
__m512 in1516 = _mm512_shuffle_f32x4(sf799, sf800, 238);
__m512 tmp10451 = _mm512_add_ps(in1502, in1503);
__m512 tmp10471 = _mm512_add_ps(in1510, in1511);
__m512 tmp10450 = _mm512_add_ps(in1504, in1505);
__m512 tmp10470 = _mm512_add_ps(in1512, in1513);
__m512 tmp10456 = _mm512_sub_ps(in1504, in1505);
__m512 tmp10476 = _mm512_sub_ps(in1512, in1513);
__m512 tmp10455 = _mm512_sub_ps(in1502, in1503);
__m512 tmp10475 = _mm512_sub_ps(in1510, in1511);
__m512 tmp10452 = _mm512_add_ps(in1506, in1507);
__m512 tmp10472 = _mm512_add_ps(in1514, in1515);
__m512 tmp10457 = _mm512_sub_ps(in1506, in1507);
__m512 tmp10477 = _mm512_sub_ps(in1514, in1515);
__m512 tmp10454 = _mm512_fmadd_ps(tmp10456, _mm512_set1_ps(2e+00f), tmp10455);
__m512 tmp10474 = _mm512_fmadd_ps(tmp10476, _mm512_set1_ps(2e+00f), tmp10475);
__m512 tmp10461 = _mm512_fmadd_ps(tmp10456, _mm512_set1_ps(8e+00f), tmp10455);
__m512 tmp10481 = _mm512_fmadd_ps(tmp10476, _mm512_set1_ps(8e+00f), tmp10475);
__m512 tmp10449 = _mm512_add_ps(tmp10450, tmp10451);
__m512 tmp10469 = _mm512_add_ps(tmp10470, tmp10471);
__m512 tmp10453 = _mm512_fmadd_ps(tmp10457, _mm512_set1_ps(1.6e+01f), tmp10454);
__m512 tmp10473 = _mm512_fmadd_ps(tmp10477, _mm512_set1_ps(1.6e+01f), tmp10474);
__m512 tmp10460 = _mm512_fmadd_ps(tmp10457, _mm512_set1_ps(4e+00f), tmp10461);
__m512 tmp10480 = _mm512_fmadd_ps(tmp10477, _mm512_set1_ps(4e+00f), tmp10481);
__m512 tmp10466 = _mm512_add_ps(tmp10457, tmp10455);
__m512 tmp10486 = _mm512_add_ps(tmp10477, tmp10475);
__m512 tmp10459 = _mm512_fmadd_ps(tmp10450, _mm512_set1_ps(4e+00f), tmp10451);
__m512 tmp10479 = _mm512_fmadd_ps(tmp10470, _mm512_set1_ps(4e+00f), tmp10471);
__m512 tmp10463 = _mm512_fmadd_ps(tmp10450, _mm512_set1_ps(1.6e+01f), tmp10451);
__m512 tmp10483 = _mm512_fmadd_ps(tmp10470, _mm512_set1_ps(1.6e+01f), tmp10471);
__m512 tmp10448 = _mm512_add_ps(tmp10449, in1501);
__m512 tmp10468 = _mm512_add_ps(tmp10469, in1509);
__m512 tmp10465 = _mm512_add_ps(tmp10466, in1508);
__m512 tmp10485 = _mm512_add_ps(tmp10486, in1516);
__m512 tmp10447 = _mm512_fmadd_ps(tmp10452, _mm512_set1_ps(3.2e+01f), tmp10448);
__m512 tmp10467 = _mm512_fmadd_ps(tmp10472, _mm512_set1_ps(3.2e+01f), tmp10468);
__m512 tmp10458 = _mm512_fmadd_ps(tmp10452, _mm512_set1_ps(8e+00f), tmp10459);
__m512 tmp10478 = _mm512_fmadd_ps(tmp10472, _mm512_set1_ps(8e+00f), tmp10479);
__m512 tmp10464 = _mm512_fmadd_ps(tmp10456, _mm512_set1_ps(3.2e+01f), tmp10465);
__m512 tmp10484 = _mm512_fmadd_ps(tmp10476, _mm512_set1_ps(3.2e+01f), tmp10485);
__m512 tmp10462 = _mm512_fmadd_ps(tmp10452, _mm512_set1_ps(2e+00f), tmp10463);
__m512 tmp10482 = _mm512_fmadd_ps(tmp10472, _mm512_set1_ps(2e+00f), tmp10483);
__m512 tmp10435 = tmp10447;
__m512 tmp10441 = tmp10467;
__m512 tmp10436 = tmp10453;
__m512 tmp10442 = tmp10473;
__m512 tmp10437 = tmp10458;
__m512 tmp10443 = tmp10478;
__m512 tmp10438 = tmp10460;
__m512 tmp10444 = tmp10480;
__m512 tmp10439 = tmp10462;
__m512 tmp10445 = tmp10482;
__m512 tmp10440 = tmp10464;
__m512 tmp10446 = tmp10484;
__m512 tmp10513 = _mm512_unpacklo_ps(tmp10435, tmp10436);
__m512 tmp10514 = _mm512_unpackhi_ps(tmp10435, tmp10436);
__m512 tmp10515 = _mm512_unpacklo_ps(tmp10437, tmp10438);
__m512 tmp10516 = _mm512_unpackhi_ps(tmp10437, tmp10438);
__m512 tmp10517 = _mm512_unpacklo_ps(tmp10439, tmp10440);
__m512 tmp10518 = _mm512_unpackhi_ps(tmp10439, tmp10440);
__m512 tmp10519 = _mm512_unpacklo_ps(tmp10441, tmp10442);
__m512 tmp10520 = _mm512_unpackhi_ps(tmp10441, tmp10442);
__m512 tmp10521 = _mm512_unpacklo_ps(tmp10443, tmp10444);
__m512 tmp10522 = _mm512_unpackhi_ps(tmp10443, tmp10444);
__m512 tmp10523 = _mm512_unpacklo_ps(tmp10445, tmp10446);
__m512 tmp10524 = _mm512_unpackhi_ps(tmp10445, tmp10446);
__m512 tmp10525 = _mm512_shuffle_ps(tmp10513, tmp10515, 68);
__m512 tmp10526 = _mm512_shuffle_ps(tmp10513, tmp10515, 238);
__m512 tmp10527 = _mm512_shuffle_ps(tmp10514, tmp10516, 68);
__m512 tmp10528 = _mm512_shuffle_ps(tmp10514, tmp10516, 238);
__m512 tmp10529 = _mm512_shuffle_ps(tmp10517, tmp10519, 68);
__m512 tmp10530 = _mm512_shuffle_ps(tmp10517, tmp10519, 238);
__m512 tmp10531 = _mm512_shuffle_ps(tmp10518, tmp10520, 68);
__m512 tmp10532 = _mm512_shuffle_ps(tmp10518, tmp10520, 238);
__m512 tmp10533 = _mm512_shuffle_ps(tmp10521, tmp10523, 68);
__m512 tmp10534 = _mm512_shuffle_ps(tmp10521, tmp10523, 238);
__m512 tmp10535 = _mm512_shuffle_ps(tmp10522, tmp10524, 68);
__m512 tmp10536 = _mm512_shuffle_ps(tmp10522, tmp10524, 238);
__m512 tmp10537 = _mm512_shuffle_f32x4(tmp10525, tmp10529, 136);
__m512 tmp10538 = _mm512_shuffle_f32x4(tmp10525, tmp10529, 221);
__m512 tmp10539 = _mm512_shuffle_f32x4(tmp10526, tmp10530, 136);
__m512 tmp10540 = _mm512_shuffle_f32x4(tmp10526, tmp10530, 221);
__m512 tmp10541 = _mm512_shuffle_f32x4(tmp10527, tmp10531, 136);
__m512 tmp10542 = _mm512_shuffle_f32x4(tmp10527, tmp10531, 221);
__m512 tmp10543 = _mm512_shuffle_f32x4(tmp10528, tmp10532, 136);
__m512 tmp10544 = _mm512_shuffle_f32x4(tmp10528, tmp10532, 221);
__m512 tmp10545 = _mm512_shuffle_f32x4(tmp10533, tmp10533, 136);
__m512 tmp10546 = _mm512_shuffle_f32x4(tmp10533, tmp10533, 221);
__m512 tmp10547 = _mm512_shuffle_f32x4(tmp10534, tmp10534, 136);
__m512 tmp10548 = _mm512_shuffle_f32x4(tmp10534, tmp10534, 221);
__m512 tmp10549 = _mm512_shuffle_f32x4(tmp10535, tmp10535, 136);
__m512 tmp10550 = _mm512_shuffle_f32x4(tmp10535, tmp10535, 221);
__m512 tmp10551 = _mm512_shuffle_f32x4(tmp10536, tmp10536, 136);
__m512 tmp10552 = _mm512_shuffle_f32x4(tmp10536, tmp10536, 221);
tmp10435 = _mm512_shuffle_f32x4(tmp10537, tmp10545, 136);
tmp10443 = _mm512_shuffle_f32x4(tmp10537, tmp10545, 221);
tmp10436 = _mm512_shuffle_f32x4(tmp10539, tmp10547, 136);
tmp10444 = _mm512_shuffle_f32x4(tmp10539, tmp10547, 221);
tmp10437 = _mm512_shuffle_f32x4(tmp10541, tmp10549, 136);
tmp10445 = _mm512_shuffle_f32x4(tmp10541, tmp10549, 221);
tmp10438 = _mm512_shuffle_f32x4(tmp10543, tmp10551, 136);
tmp10446 = _mm512_shuffle_f32x4(tmp10543, tmp10551, 221);
tmp10439 = _mm512_shuffle_f32x4(tmp10538, tmp10546, 136);
__m512 tmp10487 = _mm512_shuffle_f32x4(tmp10538, tmp10546, 221);
tmp10440 = _mm512_shuffle_f32x4(tmp10540, tmp10548, 136);
__m512 tmp10488 = _mm512_shuffle_f32x4(tmp10540, tmp10548, 221);
tmp10441 = _mm512_shuffle_f32x4(tmp10542, tmp10550, 136);
__m512 tmp10489 = _mm512_shuffle_f32x4(tmp10542, tmp10550, 221);
tmp10442 = _mm512_shuffle_f32x4(tmp10544, tmp10552, 136);
__m512 tmp10490 = _mm512_shuffle_f32x4(tmp10544, tmp10552, 221);
(void)tmp10442;
(void)tmp10490;
__m512 tmp10495 = _mm512_add_ps(tmp10436, tmp10437);
__m512 tmp10506 = _mm512_add_ps(tmp10444, tmp10445);
__m512 tmp10494 = _mm512_add_ps(tmp10438, tmp10439);
__m512 tmp10505 = _mm512_add_ps(tmp10446, tmp10487);
__m512 tmp10500 = _mm512_sub_ps(tmp10438, tmp10439);
__m512 tmp10511 = _mm512_sub_ps(tmp10446, tmp10487);
__m512 tmp10499 = _mm512_sub_ps(tmp10436, tmp10437);
__m512 tmp10510 = _mm512_sub_ps(tmp10444, tmp10445);
__m512 tmp10496 = _mm512_add_ps(tmp10440, tmp10441);
__m512 tmp10507 = _mm512_add_ps(tmp10488, tmp10489);
__m512 tmp10501 = _mm512_sub_ps(tmp10440, tmp10441);
__m512 tmp10512 = _mm512_sub_ps(tmp10488, tmp10489);
__m512 tmp10498 = _mm512_fmadd_ps(tmp10500, _mm512_set1_ps(2e+00f), tmp10499);
__m512 tmp10509 = _mm512_fmadd_ps(tmp10511, _mm512_set1_ps(2e+00f), tmp10510);
__m512 tmp10493 = _mm512_add_ps(tmp10494, tmp10495);
__m512 tmp10504 = _mm512_add_ps(tmp10505, tmp10506);
__m512 tmp10497 = _mm512_fmadd_ps(tmp10501, _mm512_set1_ps(1.6e+01f), tmp10498);
__m512 tmp10508 = _mm512_fmadd_ps(tmp10512, _mm512_set1_ps(1.6e+01f), tmp10509);
__m512 tmp10492 = _mm512_add_ps(tmp10493, tmp10435);
__m512 tmp10503 = _mm512_add_ps(tmp10504, tmp10443);
__m512 tmp10491 = _mm512_fmadd_ps(tmp10496, _mm512_set1_ps(3.2e+01f), tmp10492);
__m512 tmp10502 = _mm512_fmadd_ps(tmp10507, _mm512_set1_ps(3.2e+01f), tmp10503);
__m512 out1371 = tmp10491;
__m512 out1373 = tmp10502;
__m512 out1372 = tmp10497;
__m512 out1374 = tmp10508;
out1371 = _mm512_max_ps(_mm512_setzero_ps(), out1371);
out1373 = _mm512_max_ps(_mm512_setzero_ps(), out1373);
out1372 = _mm512_max_ps(_mm512_setzero_ps(), out1372);
out1374 = _mm512_max_ps(_mm512_setzero_ps(), out1374);
_mm512_mask_storeu_ps(datPtr13+0+806912*i28+224*toH37+4*toW37+50432*k98+12608*l37, 4095, out1371);
_mm512_mask_storeu_ps(datPtr13+48+806912*i28+224*toH37+4*toW37+50432*k98+12608*l37, 255, out1373);
_mm512_mask_storeu_ps(datPtr13+224+806912*i28+224*toH37+4*toW37+50432*k98+12608*l37, 4095, out1372);
_mm512_mask_storeu_ps(datPtr13+272+806912*i28+224*toH37+4*toW37+50432*k98+12608*l37, 255, out1374);
}
}
if (j23 >= last6) return;
++j23;
}

// Builds a three-dimensional task with hull 1 x 8 x 1 whose callee is
// ResNet50ThreeConsumeSums2Callee1, and submits it to ResNet50ThreaderDo1
// on the supplied team. `tensors43` is passed through unchanged in the
// task's `any1` field so the callee can recover it.
static void ResNet50ThreeConsumeSums2(ResNet50ThreaderTeam1* team35, char** tensors43) {
    ResNet50ThreaderTask1 job;
    // Iteration space: three dimensions, only the middle one has extent > 1.
    job.nd1 = 3;
    job.hull1[2] = 1;
    job.hull1[1] = 8;
    job.hull1[0] = 1;
    // Payload and entry point for each point of the hull.
    job.any1 = tensors43;
    job.callee1 = ResNet50ThreeConsumeSums2Callee1;
    ResNet50ThreaderDo1(team35, &job);
}

// Task callee that rearranges one group of four filters into a transformed,
// half-precision layout and writes the matching four fused bias values.
//
// task54: task descriptor; task54->any1 is read as a char** tensor table:
//   [0] source weights, [1] source biases, [2] batch-norm parameters,
//   [3] destination buffer (first 512 bytes receive biases, the arranged
//       filters follow at byte offset 512).
// pt32:   task coordinates; only pt32[0] is read and it selects the filter
//         group j27 (presumably in [0, 32), matching hull1[0] = 32 set by
//         ResNet50ThreeArrangeFilts3 — TODO confirm against the threader).
//
// NOTE(review): the 4/2 fmadd pattern followed by constant scaling looks like
// a Winograd-style filter transform applied along both axes — confirm against
// the generator before relying on that description.
static void ResNet50ThreeArrangeFilts3Callee1(ResNet50ThreaderTask1* task54, int64_t* pt32) {
char** tensors52 = task54->any1;
ptrdiff_t b54 = pt32[0];
// g18 and e16 are fixed at zero here, so every term multiplied by i34 or e16
// below contributes nothing to the computed addresses.
ptrdiff_t g18 = 0;
ptrdiff_t e16 = 0;
char*restrict bfPtr8 = tensors52[3]+512*e16;
char*restrict wfPtr8 = tensors52[3]+512+6488064*e16;
char*restrict wtPtr10 = tensors52[0]+14256*e16;
char*restrict biasPtr10 = tensors52[1];
char*restrict bnPtr10 = tensors52[2];
ptrdiff_t i34 = 1*g18;
ptrdiff_t j27 = 1*b54;
// jj33 equals the starting j27, so the `j27 >= jj33` check at the bottom of
// the loop body returns after the first iteration: one filter group per call.
ptrdiff_t jj33 = j27+0;
if (j27 < 32) {
for (; j27 != 32; ++j27) {
ptrdiff_t k104 = 0+1*j27;
ptrdiff_t cut13 = 0;
// Broadcast the float at even index 2*c of bnPtr10 for c = 4*j27 + {0,1,2,3}.
// The bias code below uses the same even-indexed floats as multipliers and
// the odd-indexed ones as addends.
__m512 postMul30 = _mm512_set1_ps(((float*)bnPtr10+(ptrdiff_t)2*(0+128*i34+4*j27))[0]);
__m512 postMul31 = _mm512_set1_ps(((float*)bnPtr10+(ptrdiff_t)2*(1+128*i34+4*j27))[0]);
__m512 postMul32 = _mm512_set1_ps(((float*)bnPtr10+(ptrdiff_t)2*(2+128*i34+4*j27))[0]);
__m512 postMul33 = _mm512_set1_ps(((float*)bnPtr10+(ptrdiff_t)2*(3+128*i34+4*j27))[0]);
ptrdiff_t s24 = 0;
// 128 iterations; each step advances 36 bytes (9 floats) in the source
// weights and 128 bytes in the destination.
for (; s24 != 128; ++s24) {
// Load 9 floats (mask 511) from each of four source rows spaced 4608 bytes
// apart; the remaining lanes are zeroed.
__m512 wt317 = _mm512_maskz_loadu_ps(511, wtPtr10+0+589824*i34+18432*j27+36*s24);
__m512 wt318 = _mm512_maskz_loadu_ps(511, wtPtr10+4608+589824*i34+18432*j27+36*s24);
__m512 wt319 = _mm512_maskz_loadu_ps(511, wtPtr10+9216+589824*i34+18432*j27+36*s24);
__m512 wt320 = _mm512_maskz_loadu_ps(511, wtPtr10+13824+589824*i34+18432*j27+36*s24);
// Fold the per-filter multiplier into the weights.
wt317 = _mm512_mul_ps(wt317, postMul30);
wt318 = _mm512_mul_ps(wt318, postMul31);
wt319 = _mm512_mul_ps(wt319, postMul32);
wt320 = _mm512_mul_ps(wt320, postMul33);
// Two rounds of two-source permutes interleave 3-element runs of the four
// filters into the three vectors in1517, in1518 and in1519.
__m512i pm157 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm158 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp10793 = _mm512_permutex2var_ps(wt317, pm157, wt319);
__m512 tmp10794 = _mm512_permutex2var_ps(wt318, pm157, wt320);
__m512 tmp10795 = _mm512_permutex2var_ps(wt317, pm158, wt319);
__m512 tmp10796 = _mm512_permutex2var_ps(wt318, pm158, wt320);
__m512 in1517 = _mm512_permutex2var_ps(tmp10793, pm157, tmp10794);
__m512 in1518 = _mm512_permutex2var_ps(tmp10793, pm158, tmp10794);
__m512 in1519 = _mm512_permutex2var_ps(tmp10795, pm157, tmp10796);
// First pass: expand the three inputs into eight linear combinations
// (in1517, tmp10800, tmp10798, tmp10801, tmp10799, tmp10797, tmp10802, in1519).
__m512 tmp10797 = _mm512_fmadd_ps(in1517, _mm512_set1_ps(4e+00f), in1519);
__m512 tmp10798 = _mm512_add_ps(in1517, in1519);
__m512 tmp10799 = _mm512_fmadd_ps(in1519, _mm512_set1_ps(4e+00f), in1517);
__m512 tmp10800 = _mm512_add_ps(in1518, tmp10798);
__m512 tmp10801 = _mm512_fmadd_ps(in1518, _mm512_set1_ps(2e+00f), tmp10799);
tmp10799 = _mm512_fnmadd_ps(in1518, _mm512_set1_ps(2e+00f), tmp10799);
__m512 tmp10802 = _mm512_fnmadd_ps(in1518, _mm512_set1_ps(2e+00f), tmp10797);
tmp10797 = _mm512_fmadd_ps(in1518, _mm512_set1_ps(2e+00f), tmp10797);
tmp10798 = _mm512_sub_ps(tmp10798, in1518);
// Transpose-style reshuffle of the eight vectors (unpack, 32-bit shuffle,
// then 128-bit lane shuffles) so the second pass runs along the other axis.
__m512 tmp10819 = _mm512_unpacklo_ps(in1517, tmp10800);
__m512 tmp10820 = _mm512_unpackhi_ps(in1517, tmp10800);
__m512 tmp10821 = _mm512_unpacklo_ps(tmp10798, tmp10801);
__m512 tmp10822 = _mm512_unpackhi_ps(tmp10798, tmp10801);
__m512 tmp10823 = _mm512_unpacklo_ps(tmp10799, tmp10797);
__m512 tmp10824 = _mm512_unpackhi_ps(tmp10799, tmp10797);
__m512 tmp10825 = _mm512_unpacklo_ps(tmp10802, in1519);
__m512 tmp10826 = _mm512_unpackhi_ps(tmp10802, in1519);
__m512 tmp10827 = _mm512_shuffle_ps(tmp10819, tmp10821, 68);
__m512 tmp10828 = _mm512_shuffle_ps(tmp10819, tmp10821, 238);
__m512 tmp10829 = _mm512_shuffle_ps(tmp10820, tmp10822, 68);
__m512 tmp10830 = _mm512_shuffle_ps(tmp10820, tmp10822, 238);
__m512 tmp10831 = _mm512_shuffle_ps(tmp10823, tmp10825, 68);
__m512 tmp10832 = _mm512_shuffle_ps(tmp10823, tmp10825, 238);
__m512 tmp10833 = _mm512_shuffle_ps(tmp10824, tmp10826, 68);
__m512 tmp10834 = _mm512_shuffle_ps(tmp10824, tmp10826, 238);
__m512 tmp10835 = _mm512_shuffle_f32x4(tmp10827, tmp10831, 136);
__m512 tmp10836 = _mm512_shuffle_f32x4(tmp10827, tmp10831, 221);
__m512 tmp10837 = _mm512_shuffle_f32x4(tmp10828, tmp10832, 136);
__m512 tmp10838 = _mm512_shuffle_f32x4(tmp10828, tmp10832, 221);
__m512 tmp10839 = _mm512_shuffle_f32x4(tmp10829, tmp10833, 136);
__m512 tmp10840 = _mm512_shuffle_f32x4(tmp10829, tmp10833, 221);
__m512 tmp10841 = _mm512_shuffle_f32x4(tmp10830, tmp10834, 136);
__m512 tmp10842 = _mm512_shuffle_f32x4(tmp10830, tmp10834, 221);
in1517 = _mm512_shuffle_f32x4(tmp10835, tmp10835, 136);
__m512 tmp10803 = _mm512_shuffle_f32x4(tmp10835, tmp10835, 221);
tmp10800 = _mm512_shuffle_f32x4(tmp10837, tmp10837, 136);
__m512 tmp10804 = _mm512_shuffle_f32x4(tmp10837, tmp10837, 221);
tmp10798 = _mm512_shuffle_f32x4(tmp10839, tmp10839, 136);
__m512 tmp10805 = _mm512_shuffle_f32x4(tmp10839, tmp10839, 221);
tmp10801 = _mm512_shuffle_f32x4(tmp10841, tmp10841, 136);
__m512 tmp10806 = _mm512_shuffle_f32x4(tmp10841, tmp10841, 221);
tmp10799 = _mm512_shuffle_f32x4(tmp10836, tmp10836, 136);
tmp10797 = _mm512_shuffle_f32x4(tmp10838, tmp10838, 136);
tmp10802 = _mm512_shuffle_f32x4(tmp10840, tmp10840, 136);
in1519 = _mm512_shuffle_f32x4(tmp10842, tmp10842, 136);
// Pair up the reshuffled pieces into six vectors: two independent triples
// (in1517, tmp10800, tmp10798) and (tmp10802, in1519, tmp10803).
in1517 = _mm512_shuffle_f32x4(in1517, tmp10801, 68);
tmp10800 = _mm512_shuffle_f32x4(tmp10800, tmp10799, 68);
tmp10798 = _mm512_shuffle_f32x4(tmp10798, tmp10797, 68);
tmp10802 = _mm512_shuffle_f32x4(tmp10802, tmp10804, 68);
in1519 = _mm512_shuffle_f32x4(in1519, tmp10805, 68);
tmp10803 = _mm512_shuffle_f32x4(tmp10803, tmp10806, 68);
// Second pass: the same 3-to-8 combination as the first pass, applied to
// both triples in interleaved statements.
__m512 tmp10807 = _mm512_fmadd_ps(in1517, _mm512_set1_ps(4e+00f), tmp10798);
__m512 tmp10813 = _mm512_fmadd_ps(tmp10802, _mm512_set1_ps(4e+00f), tmp10803);
__m512 tmp10808 = _mm512_add_ps(in1517, tmp10798);
__m512 tmp10814 = _mm512_add_ps(tmp10802, tmp10803);
__m512 tmp10809 = _mm512_fmadd_ps(tmp10798, _mm512_set1_ps(4e+00f), in1517);
__m512 tmp10815 = _mm512_fmadd_ps(tmp10803, _mm512_set1_ps(4e+00f), tmp10802);
__m512 tmp10810 = _mm512_add_ps(tmp10800, tmp10808);
__m512 tmp10816 = _mm512_add_ps(in1519, tmp10814);
__m512 tmp10811 = _mm512_fmadd_ps(tmp10800, _mm512_set1_ps(2e+00f), tmp10809);
__m512 tmp10817 = _mm512_fmadd_ps(in1519, _mm512_set1_ps(2e+00f), tmp10815);
tmp10809 = _mm512_fnmadd_ps(tmp10800, _mm512_set1_ps(2e+00f), tmp10809);
tmp10815 = _mm512_fnmadd_ps(in1519, _mm512_set1_ps(2e+00f), tmp10815);
__m512 tmp10812 = _mm512_fnmadd_ps(tmp10800, _mm512_set1_ps(2e+00f), tmp10807);
__m512 tmp10818 = _mm512_fnmadd_ps(in1519, _mm512_set1_ps(2e+00f), tmp10813);
tmp10807 = _mm512_fmadd_ps(tmp10800, _mm512_set1_ps(2e+00f), tmp10807);
tmp10813 = _mm512_fmadd_ps(in1519, _mm512_set1_ps(2e+00f), tmp10813);
tmp10808 = _mm512_sub_ps(tmp10808, tmp10800);
tmp10814 = _mm512_sub_ps(tmp10814, in1519);
// Per-lane scaling of all sixteen results by constant vectors; each constant
// vector repeats the same 8-lane pattern in its low and high halves.
in1517 = _mm512_mul_ps(in1517, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp10810 = _mm512_mul_ps(tmp10810, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp10808 = _mm512_mul_ps(tmp10808, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp10811 = _mm512_mul_ps(tmp10811, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp10809 = _mm512_mul_ps(tmp10809, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp10807 = _mm512_mul_ps(tmp10807, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp10812 = _mm512_mul_ps(tmp10812, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp10798 = _mm512_mul_ps(tmp10798, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp10802 = _mm512_mul_ps(tmp10802, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp10816 = _mm512_mul_ps(tmp10816, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp10814 = _mm512_mul_ps(tmp10814, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp10817 = _mm512_mul_ps(tmp10817, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp10815 = _mm512_mul_ps(tmp10815, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp10813 = _mm512_mul_ps(tmp10813, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp10818 = _mm512_mul_ps(tmp10818, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp10803 = _mm512_mul_ps(tmp10803, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
// Regroup 256-bit halves: control 68 joins the low halves of both operands,
// control 238 joins the high halves.
__m512 out1375 = _mm512_shuffle_f32x4(in1517, tmp10810, 68);
__m512 out1379 = _mm512_shuffle_f32x4(in1517, tmp10810, 238);
__m512 out1376 = _mm512_shuffle_f32x4(tmp10808, tmp10811, 68);
__m512 out1380 = _mm512_shuffle_f32x4(tmp10808, tmp10811, 238);
__m512 out1377 = _mm512_shuffle_f32x4(tmp10809, tmp10807, 68);
__m512 out1381 = _mm512_shuffle_f32x4(tmp10809, tmp10807, 238);
__m512 out1378 = _mm512_shuffle_f32x4(tmp10812, tmp10798, 68);
__m512 out1382 = _mm512_shuffle_f32x4(tmp10812, tmp10798, 238);
__m512 out1383 = _mm512_shuffle_f32x4(tmp10802, tmp10816, 68);
__m512 out1387 = _mm512_shuffle_f32x4(tmp10802, tmp10816, 238);
__m512 out1384 = _mm512_shuffle_f32x4(tmp10814, tmp10817, 68);
__m512 out1388 = _mm512_shuffle_f32x4(tmp10814, tmp10817, 238);
__m512 out1385 = _mm512_shuffle_f32x4(tmp10815, tmp10813, 68);
__m512 out1389 = _mm512_shuffle_f32x4(tmp10815, tmp10813, 238);
__m512 out1386 = _mm512_shuffle_f32x4(tmp10818, tmp10803, 68);
__m512 out1390 = _mm512_shuffle_f32x4(tmp10818, tmp10803, 238);
// With cut13 == 0 these evaluate to byte offsets 0, 32, 64 and 96: four
// consecutive 32-byte slots inside each 128-byte destination record.
ptrdiff_t off9 = 32*cut13;
ptrdiff_t off10 = (size_t)(cut13+1)/4*16384+(size_t)(cut13+1)%4*32;
ptrdiff_t off11 = (size_t)(cut13+2)/4*16384+(size_t)(cut13+2)%4*32;
ptrdiff_t off12 = (size_t)(cut13+3)/4*16384+(size_t)(cut13+3)%4*32;
// Convert each vector of 16 floats to 16 half-precision values (round to
// nearest, exceptions suppressed); the 256-bit result sits in the low half of
// the 512-bit register.
__m512i wf81 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1375, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf82 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1379, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf83 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1383, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf84 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1387, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf85 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1376, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf86 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1380, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf87 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1384, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf88 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1388, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf89 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1377, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf90 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1381, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf91 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1385, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf92 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1389, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf93 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1378, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf94 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1382, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf95 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1386, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf96 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1390, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
// Mask 255 stores the low eight 32-bit lanes (32 bytes = 16 halves) of each
// register. The output is split over four regions 524288 bytes apart, each
// receiving four 32-byte slots for this (k104, s24) pair.
_mm512_mask_storeu_epi32(wfPtr8+0+2097152*i34+16384*k104+off9+128*s24, 255, wf81);
_mm512_mask_storeu_epi32(wfPtr8+0+2097152*i34+16384*k104+off10+128*s24, 255, wf82);
_mm512_mask_storeu_epi32(wfPtr8+0+2097152*i34+16384*k104+off11+128*s24, 255, wf83);
_mm512_mask_storeu_epi32(wfPtr8+0+2097152*i34+16384*k104+off12+128*s24, 255, wf84);
_mm512_mask_storeu_epi32(wfPtr8+524288+2097152*i34+16384*k104+off9+128*s24, 255, wf85);
_mm512_mask_storeu_epi32(wfPtr8+524288+2097152*i34+16384*k104+off10+128*s24, 255, wf86);
_mm512_mask_storeu_epi32(wfPtr8+524288+2097152*i34+16384*k104+off11+128*s24, 255, wf87);
_mm512_mask_storeu_epi32(wfPtr8+524288+2097152*i34+16384*k104+off12+128*s24, 255, wf88);
_mm512_mask_storeu_epi32(wfPtr8+1048576+2097152*i34+16384*k104+off9+128*s24, 255, wf89);
_mm512_mask_storeu_epi32(wfPtr8+1048576+2097152*i34+16384*k104+off10+128*s24, 255, wf90);
_mm512_mask_storeu_epi32(wfPtr8+1048576+2097152*i34+16384*k104+off11+128*s24, 255, wf91);
_mm512_mask_storeu_epi32(wfPtr8+1048576+2097152*i34+16384*k104+off12+128*s24, 255, wf92);
_mm512_mask_storeu_epi32(wfPtr8+1572864+2097152*i34+16384*k104+off9+128*s24, 255, wf93);
_mm512_mask_storeu_epi32(wfPtr8+1572864+2097152*i34+16384*k104+off10+128*s24, 255, wf94);
_mm512_mask_storeu_epi32(wfPtr8+1572864+2097152*i34+16384*k104+off11+128*s24, 255, wf95);
_mm512_mask_storeu_epi32(wfPtr8+1572864+2097152*i34+16384*k104+off12+128*s24, 255, wf96);
}
// Bias for the four filters of this group: zero unless e16 == 0 (always the
// case here), in which case it is bias * mul + add with mul/add taken from
// the interleaved pairs in bnPtr10.
__m512 bias4 = _mm512_setzero_ps();
if (!e16) {
bias4 = _mm512_maskz_loadu_ps(15, biasPtr10-0+512*i34+16*j27);
// pmMul20 gathers even-indexed lanes, pmAdd20 gathers odd-indexed lanes.
__m512i pmMul20 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd20 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Eight floats = four (mul, add) pairs starting at pair index 4*j27.
__m512 mas8 = _mm512_maskz_loadu_ps(255, bnPtr10+(ptrdiff_t)8*(0+128*i34+4*j27));
__m512 postMul34 = _mm512_permutexvar_ps(pmMul20, mas8);
__m512 postAdd20 = _mm512_permutexvar_ps(pmAdd20, mas8);
bias4 = _mm512_fmadd_ps(bias4, postMul34, postAdd20);
}
// Write the four fused bias floats (mask 15) at byte offset 16*j27.
_mm512_mask_storeu_ps(bfPtr8-0+512*i34+16*j27, 15, bias4);
// Always true on the first iteration (jj33 is the starting j27): stop here.
if (j27 >= jj33) return;
}
}
}

// Builds a three-dimensional task with hull 32 x 1 x 1 whose callee is
// ResNet50ThreeArrangeFilts3Callee1, and submits it to ResNet50ThreaderDo1
// on the supplied team. `tensors51` is passed through unchanged in the
// task's `any1` field so the callee can recover it.
static void ResNet50ThreeArrangeFilts3(ResNet50ThreaderTeam1* team39, char** tensors51) {
    ResNet50ThreaderTask1 job;
    // Iteration space: three dimensions, only the first one has extent > 1.
    job.nd1 = 3;
    job.hull1[2] = 1;
    job.hull1[1] = 1;
    job.hull1[0] = 32;
    // Payload and entry point for each point of the hull.
    job.any1 = tensors51;
    job.callee1 = ResNet50ThreeArrangeFilts3Callee1;
    ResNet50ThreaderDo1(team39, &job);
}

static void ResNet50ThreeArrangeDats3Callee1(ResNet50ThreaderTask1* task56, int64_t* pt33) {
char** tensors54 = task56->any1;
ptrdiff_t s25 = 0;
ptrdiff_t c27 = pt33[1];
ptrdiff_t g19 = 0;
ptrdiff_t e17 = 0;
char*restrict datPtr16 = tensors54[0]-116+1241856*e17;
char*restrict dfPtr8 = tensors54[1]+2534400*e17;
ptrdiff_t i35 = 1*g19;
ptrdiff_t j28 = 1*c27;
ptrdiff_t last7 = j28+0;
ptrdiff_t rel19 = j28-0;
ptrdiff_t base19 = 0;
if (rel19 < 2) {
if (rel19 < 1) {
ptrdiff_t h40 = base19+0;
ptrdiff_t w48 = 0;
ptrdiff_t k105 = 0;
for (; k105 != 64; ++k105) {
__m512 dat1685 = _mm512_maskz_loadu_ps(8191, datPtr16+116+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1685 = _mm512_max_ps(_mm512_setzero_ps(), dat1685);
__m512 dat1686 = _mm512_maskz_loadu_ps(16383, datPtr16+160+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1686 = _mm512_max_ps(_mm512_setzero_ps(), dat1686);
__m512i pm159 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1520 = _mm512_permutexvar_ps(pm159, dat1685);
__m512i pm160 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1527 = _mm512_permutexvar_ps(pm160, dat1686);
__m512 dat1687 = _mm512_maskz_loadu_ps(8191, datPtr16+228+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1687 = _mm512_max_ps(_mm512_setzero_ps(), dat1687);
__m512 dat1688 = _mm512_maskz_loadu_ps(16383, datPtr16+272+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1688 = _mm512_max_ps(_mm512_setzero_ps(), dat1688);
__m512 in1521 = _mm512_permutexvar_ps(pm159, dat1687);
__m512 in1528 = _mm512_permutexvar_ps(pm160, dat1688);
__m512 dat1689 = _mm512_maskz_loadu_ps(8191, datPtr16+340+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1689 = _mm512_max_ps(_mm512_setzero_ps(), dat1689);
__m512 dat1690 = _mm512_maskz_loadu_ps(16383, datPtr16+384+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1690 = _mm512_max_ps(_mm512_setzero_ps(), dat1690);
__m512 in1522 = _mm512_permutexvar_ps(pm159, dat1689);
__m512 in1529 = _mm512_permutexvar_ps(pm160, dat1690);
__m512 dat1691 = _mm512_maskz_loadu_ps(8191, datPtr16+452+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1691 = _mm512_max_ps(_mm512_setzero_ps(), dat1691);
__m512 dat1692 = _mm512_maskz_loadu_ps(16383, datPtr16+496+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1692 = _mm512_max_ps(_mm512_setzero_ps(), dat1692);
__m512 in1523 = _mm512_permutexvar_ps(pm159, dat1691);
__m512 in1530 = _mm512_permutexvar_ps(pm160, dat1692);
__m512 dat1693 = _mm512_maskz_loadu_ps(8191, datPtr16+564+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1693 = _mm512_max_ps(_mm512_setzero_ps(), dat1693);
__m512 dat1694 = _mm512_maskz_loadu_ps(16383, datPtr16+608+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1694 = _mm512_max_ps(_mm512_setzero_ps(), dat1694);
__m512 in1524 = _mm512_permutexvar_ps(pm159, dat1693);
__m512 in1531 = _mm512_permutexvar_ps(pm160, dat1694);
__m512 dat1695 = _mm512_maskz_loadu_ps(8191, datPtr16+676+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1695 = _mm512_max_ps(_mm512_setzero_ps(), dat1695);
__m512 dat1696 = _mm512_maskz_loadu_ps(16383, datPtr16+720+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1696 = _mm512_max_ps(_mm512_setzero_ps(), dat1696);
__m512 in1525 = _mm512_permutexvar_ps(pm159, dat1695);
__m512 in1532 = _mm512_permutexvar_ps(pm160, dat1696);
__m512 dat1697 = _mm512_maskz_loadu_ps(8191, datPtr16+788+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1697 = _mm512_max_ps(_mm512_setzero_ps(), dat1697);
__m512 dat1698 = _mm512_maskz_loadu_ps(16383, datPtr16+832+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1698 = _mm512_max_ps(_mm512_setzero_ps(), dat1698);
__m512 in1526 = _mm512_permutexvar_ps(pm159, dat1697);
__m512 in1533 = _mm512_permutexvar_ps(pm160, dat1698);
__m512 tmp10843 = _mm512_add_ps(in1520, in1524);
__m512 tmp10848 = _mm512_add_ps(in1527, in1531);
__m512 tmp10844 = _mm512_sub_ps(in1523, in1521);
__m512 tmp10849 = _mm512_sub_ps(in1530, in1528);
__m512 tmp10845 = _mm512_add_ps(in1521, in1525);
__m512 tmp10850 = _mm512_add_ps(in1528, in1532);
__m512 tmp10846 = _mm512_sub_ps(_mm512_setzero_ps(), in1525);
__m512 tmp10851 = _mm512_sub_ps(_mm512_setzero_ps(), in1532);
tmp10843 = _mm512_fmadd_ps(in1522, _mm512_set1_ps(-4.25e+00f), tmp10843);
tmp10848 = _mm512_fmadd_ps(in1529, _mm512_set1_ps(-4.25e+00f), tmp10848);
tmp10845 = _mm512_fmadd_ps(in1523, _mm512_set1_ps(-4.25e+00f), tmp10845);
tmp10850 = _mm512_fmadd_ps(in1530, _mm512_set1_ps(-4.25e+00f), tmp10850);
tmp10846 = _mm512_fmadd_ps(tmp10844, _mm512_set1_ps(5.25e+00f), tmp10846);
tmp10851 = _mm512_fmadd_ps(tmp10849, _mm512_set1_ps(5.25e+00f), tmp10851);
tmp10844 = _mm512_fmadd_ps(in1521, _mm512_set1_ps(2.5e-01f), in1525);
tmp10849 = _mm512_fmadd_ps(in1528, _mm512_set1_ps(2.5e-01f), in1532);
in1521 = _mm512_fmadd_ps(in1521, _mm512_set1_ps(4e+00f), in1525);
in1528 = _mm512_fmadd_ps(in1528, _mm512_set1_ps(4e+00f), in1532);
__m512 tmp10847 = _mm512_sub_ps(tmp10845, tmp10843);
__m512 tmp10852 = _mm512_sub_ps(tmp10850, tmp10848);
tmp10845 = _mm512_add_ps(tmp10843, tmp10845);
tmp10850 = _mm512_add_ps(tmp10848, tmp10850);
tmp10843 = _mm512_fmadd_ps(in1520, _mm512_set1_ps(2.5e-01f), in1524);
tmp10848 = _mm512_fmadd_ps(in1527, _mm512_set1_ps(2.5e-01f), in1531);
tmp10844 = _mm512_fmadd_ps(in1523, _mm512_set1_ps(-1.25e+00f), tmp10844);
tmp10849 = _mm512_fmadd_ps(in1530, _mm512_set1_ps(-1.25e+00f), tmp10849);
in1523 = _mm512_fmadd_ps(in1523, _mm512_set1_ps(-5e+00f), in1521);
in1530 = _mm512_fmadd_ps(in1530, _mm512_set1_ps(-5e+00f), in1528);
tmp10843 = _mm512_fmadd_ps(in1522, _mm512_set1_ps(-1.25e+00f), tmp10843);
tmp10848 = _mm512_fmadd_ps(in1529, _mm512_set1_ps(-1.25e+00f), tmp10848);
in1525 = _mm512_fmadd_ps(tmp10843, _mm512_set1_ps(2e+00f), tmp10844);
in1532 = _mm512_fmadd_ps(tmp10848, _mm512_set1_ps(2e+00f), tmp10849);
tmp10844 = _mm512_fnmadd_ps(tmp10843, _mm512_set1_ps(2e+00f), tmp10844);
tmp10849 = _mm512_fnmadd_ps(tmp10848, _mm512_set1_ps(2e+00f), tmp10849);
tmp10843 = _mm512_fmadd_ps(in1524, _mm512_set1_ps(2.5e-01f), in1520);
tmp10848 = _mm512_fmadd_ps(in1531, _mm512_set1_ps(2.5e-01f), in1527);
in1520 = _mm512_sub_ps(in1526, in1520);
in1527 = _mm512_sub_ps(in1533, in1527);
tmp10843 = _mm512_fmadd_ps(in1522, _mm512_set1_ps(-1.25e+00f), tmp10843);
tmp10848 = _mm512_fmadd_ps(in1529, _mm512_set1_ps(-1.25e+00f), tmp10848);
in1522 = _mm512_sub_ps(in1522, in1524);
in1529 = _mm512_sub_ps(in1529, in1531);
in1522 = _mm512_fmadd_ps(in1522, _mm512_set1_ps(5.25e+00f), in1520);
in1529 = _mm512_fmadd_ps(in1529, _mm512_set1_ps(5.25e+00f), in1527);
in1521 = _mm512_fmadd_ps(tmp10843, _mm512_set1_ps(2e+00f), in1523);
in1528 = _mm512_fmadd_ps(tmp10848, _mm512_set1_ps(2e+00f), in1530);
in1523 = _mm512_fnmadd_ps(tmp10843, _mm512_set1_ps(2e+00f), in1523);
in1530 = _mm512_fnmadd_ps(tmp10848, _mm512_set1_ps(2e+00f), in1530);
__m512 tmp10861 = _mm512_unpacklo_ps(tmp10846, tmp10845);
__m512 tmp10862 = _mm512_unpackhi_ps(tmp10846, tmp10845);
__m512 tmp10863 = _mm512_unpacklo_ps(tmp10847, in1525);
__m512 tmp10864 = _mm512_unpackhi_ps(tmp10847, in1525);
__m512 tmp10865 = _mm512_unpacklo_ps(tmp10844, in1521);
__m512 tmp10866 = _mm512_unpackhi_ps(tmp10844, in1521);
__m512 tmp10867 = _mm512_unpacklo_ps(in1523, in1522);
__m512 tmp10868 = _mm512_unpackhi_ps(in1523, in1522);
__m512 tmp10869 = _mm512_unpacklo_ps(tmp10851, tmp10850);
__m512 tmp10870 = _mm512_unpackhi_ps(tmp10851, tmp10850);
__m512 tmp10871 = _mm512_unpacklo_ps(tmp10852, in1532);
__m512 tmp10872 = _mm512_unpackhi_ps(tmp10852, in1532);
__m512 tmp10873 = _mm512_unpacklo_ps(tmp10849, in1528);
__m512 tmp10874 = _mm512_unpackhi_ps(tmp10849, in1528);
__m512 tmp10875 = _mm512_unpacklo_ps(in1530, in1529);
__m512 tmp10876 = _mm512_unpackhi_ps(in1530, in1529);
__m512 tmp10877 = _mm512_shuffle_ps(tmp10861, tmp10863, 68);
__m512 tmp10878 = _mm512_shuffle_ps(tmp10861, tmp10863, 238);
__m512 tmp10879 = _mm512_shuffle_ps(tmp10862, tmp10864, 68);
__m512 tmp10880 = _mm512_shuffle_ps(tmp10862, tmp10864, 238);
__m512 tmp10881 = _mm512_shuffle_ps(tmp10865, tmp10867, 68);
__m512 tmp10882 = _mm512_shuffle_ps(tmp10865, tmp10867, 238);
__m512 tmp10883 = _mm512_shuffle_ps(tmp10866, tmp10868, 68);
__m512 tmp10884 = _mm512_shuffle_ps(tmp10866, tmp10868, 238);
__m512 tmp10885 = _mm512_shuffle_ps(tmp10869, tmp10871, 68);
__m512 tmp10886 = _mm512_shuffle_ps(tmp10869, tmp10871, 238);
__m512 tmp10887 = _mm512_shuffle_ps(tmp10870, tmp10872, 68);
__m512 tmp10888 = _mm512_shuffle_ps(tmp10870, tmp10872, 238);
__m512 tmp10889 = _mm512_shuffle_ps(tmp10873, tmp10875, 68);
__m512 tmp10890 = _mm512_shuffle_ps(tmp10873, tmp10875, 238);
__m512 tmp10891 = _mm512_shuffle_ps(tmp10874, tmp10876, 68);
__m512 tmp10892 = _mm512_shuffle_ps(tmp10874, tmp10876, 238);
__m512 tmp10893 = _mm512_shuffle_f32x4(tmp10877, tmp10881, 136);
__m512 tmp10894 = _mm512_shuffle_f32x4(tmp10877, tmp10881, 221);
__m512 tmp10895 = _mm512_shuffle_f32x4(tmp10878, tmp10882, 136);
__m512 tmp10896 = _mm512_shuffle_f32x4(tmp10878, tmp10882, 221);
__m512 tmp10897 = _mm512_shuffle_f32x4(tmp10879, tmp10883, 136);
__m512 tmp10898 = _mm512_shuffle_f32x4(tmp10879, tmp10883, 221);
__m512 tmp10899 = _mm512_shuffle_f32x4(tmp10880, tmp10884, 136);
__m512 tmp10900 = _mm512_shuffle_f32x4(tmp10880, tmp10884, 221);
__m512 tmp10901 = _mm512_shuffle_f32x4(tmp10885, tmp10889, 136);
__m512 tmp10902 = _mm512_shuffle_f32x4(tmp10885, tmp10889, 221);
__m512 tmp10903 = _mm512_shuffle_f32x4(tmp10886, tmp10890, 136);
__m512 tmp10904 = _mm512_shuffle_f32x4(tmp10886, tmp10890, 221);
__m512 tmp10905 = _mm512_shuffle_f32x4(tmp10887, tmp10891, 136);
__m512 tmp10906 = _mm512_shuffle_f32x4(tmp10887, tmp10891, 221);
__m512 tmp10907 = _mm512_shuffle_f32x4(tmp10888, tmp10892, 136);
__m512 tmp10908 = _mm512_shuffle_f32x4(tmp10888, tmp10892, 221);
tmp10846 = _mm512_shuffle_f32x4(tmp10893, tmp10901, 136);
tmp10851 = _mm512_shuffle_f32x4(tmp10893, tmp10901, 221);
tmp10845 = _mm512_shuffle_f32x4(tmp10895, tmp10903, 136);
tmp10850 = _mm512_shuffle_f32x4(tmp10895, tmp10903, 221);
tmp10847 = _mm512_shuffle_f32x4(tmp10897, tmp10905, 136);
tmp10852 = _mm512_shuffle_f32x4(tmp10897, tmp10905, 221);
in1525 = _mm512_shuffle_f32x4(tmp10899, tmp10907, 136);
in1532 = _mm512_shuffle_f32x4(tmp10899, tmp10907, 221);
tmp10844 = _mm512_shuffle_f32x4(tmp10894, tmp10902, 136);
tmp10849 = _mm512_shuffle_f32x4(tmp10894, tmp10902, 221);
in1521 = _mm512_shuffle_f32x4(tmp10896, tmp10904, 136);
in1528 = _mm512_shuffle_f32x4(tmp10896, tmp10904, 221);
in1523 = _mm512_shuffle_f32x4(tmp10898, tmp10906, 136);
in1530 = _mm512_shuffle_f32x4(tmp10898, tmp10906, 221);
in1522 = _mm512_shuffle_f32x4(tmp10900, tmp10908, 136);
in1529 = _mm512_shuffle_f32x4(tmp10900, tmp10908, 221);
__m512 tmp10853 = _mm512_add_ps(tmp10845, in1521);
__m512 tmp10857 = _mm512_add_ps(tmp10850, in1528);
__m512 tmp10854 = _mm512_sub_ps(tmp10844, tmp10847);
__m512 tmp10858 = _mm512_sub_ps(tmp10849, tmp10852);
__m512 tmp10855 = _mm512_add_ps(tmp10847, in1523);
__m512 tmp10859 = _mm512_add_ps(tmp10852, in1530);
tmp10846 = _mm512_sub_ps(tmp10846, in1523);
tmp10851 = _mm512_sub_ps(tmp10851, in1530);
tmp10853 = _mm512_fmadd_ps(in1525, _mm512_set1_ps(-4.25e+00f), tmp10853);
tmp10857 = _mm512_fmadd_ps(in1532, _mm512_set1_ps(-4.25e+00f), tmp10857);
tmp10855 = _mm512_fmadd_ps(tmp10844, _mm512_set1_ps(-4.25e+00f), tmp10855);
tmp10859 = _mm512_fmadd_ps(tmp10849, _mm512_set1_ps(-4.25e+00f), tmp10859);
tmp10846 = _mm512_fmadd_ps(tmp10854, _mm512_set1_ps(5.25e+00f), tmp10846);
tmp10851 = _mm512_fmadd_ps(tmp10858, _mm512_set1_ps(5.25e+00f), tmp10851);
tmp10854 = _mm512_fmadd_ps(tmp10847, _mm512_set1_ps(2.5e-01f), in1523);
tmp10858 = _mm512_fmadd_ps(tmp10852, _mm512_set1_ps(2.5e-01f), in1530);
tmp10847 = _mm512_fmadd_ps(tmp10847, _mm512_set1_ps(4e+00f), in1523);
tmp10852 = _mm512_fmadd_ps(tmp10852, _mm512_set1_ps(4e+00f), in1530);
__m512 tmp10856 = _mm512_sub_ps(tmp10855, tmp10853);
__m512 tmp10860 = _mm512_sub_ps(tmp10859, tmp10857);
tmp10855 = _mm512_add_ps(tmp10853, tmp10855);
tmp10859 = _mm512_add_ps(tmp10857, tmp10859);
tmp10853 = _mm512_fmadd_ps(tmp10845, _mm512_set1_ps(2.5e-01f), in1521);
tmp10857 = _mm512_fmadd_ps(tmp10850, _mm512_set1_ps(2.5e-01f), in1528);
tmp10854 = _mm512_fmadd_ps(tmp10844, _mm512_set1_ps(-1.25e+00f), tmp10854);
tmp10858 = _mm512_fmadd_ps(tmp10849, _mm512_set1_ps(-1.25e+00f), tmp10858);
tmp10844 = _mm512_fmadd_ps(tmp10844, _mm512_set1_ps(-5e+00f), tmp10847);
tmp10849 = _mm512_fmadd_ps(tmp10849, _mm512_set1_ps(-5e+00f), tmp10852);
tmp10853 = _mm512_fmadd_ps(in1525, _mm512_set1_ps(-1.25e+00f), tmp10853);
tmp10857 = _mm512_fmadd_ps(in1532, _mm512_set1_ps(-1.25e+00f), tmp10857);
in1523 = _mm512_fmadd_ps(tmp10853, _mm512_set1_ps(2e+00f), tmp10854);
in1530 = _mm512_fmadd_ps(tmp10857, _mm512_set1_ps(2e+00f), tmp10858);
tmp10854 = _mm512_fnmadd_ps(tmp10853, _mm512_set1_ps(2e+00f), tmp10854);
tmp10858 = _mm512_fnmadd_ps(tmp10857, _mm512_set1_ps(2e+00f), tmp10858);
tmp10853 = _mm512_fmadd_ps(in1521, _mm512_set1_ps(2.5e-01f), tmp10845);
tmp10857 = _mm512_fmadd_ps(in1528, _mm512_set1_ps(2.5e-01f), tmp10850);
tmp10845 = _mm512_sub_ps(in1522, tmp10845);
tmp10850 = _mm512_sub_ps(in1529, tmp10850);
// NOTE(review): the next ten statements finish a computation that starts above
// this point. They have the same shape as the closing steps of the second
// transform pass used for the following tiles, so they are presumably the end
// of that pass for the preceding tile pair -- confirm against the lines above.
tmp10853 = _mm512_fmadd_ps(in1525, _mm512_set1_ps(-1.25e+00f), tmp10853);
tmp10857 = _mm512_fmadd_ps(in1532, _mm512_set1_ps(-1.25e+00f), tmp10857);
in1525 = _mm512_sub_ps(in1525, in1521);
in1532 = _mm512_sub_ps(in1532, in1528);
in1525 = _mm512_fmadd_ps(in1525, _mm512_set1_ps(5.25e+00f), tmp10845);
in1532 = _mm512_fmadd_ps(in1532, _mm512_set1_ps(5.25e+00f), tmp10850);
tmp10847 = _mm512_fmadd_ps(tmp10853, _mm512_set1_ps(2e+00f), tmp10844);
tmp10852 = _mm512_fmadd_ps(tmp10857, _mm512_set1_ps(2e+00f), tmp10849);
tmp10844 = _mm512_fnmadd_ps(tmp10853, _mm512_set1_ps(2e+00f), tmp10844);
tmp10849 = _mm512_fnmadd_ps(tmp10857, _mm512_set1_ps(2e+00f), tmp10849);
// Pack pairs of result vectors: selector 68 concatenates the low 256 bits of
// both operands, selector 238 concatenates their high 256 bits.
__m512 out1391 = _mm512_shuffle_f32x4(tmp10846, tmp10855, 68);
__m512 out1399 = _mm512_shuffle_f32x4(tmp10846, tmp10855, 238);
__m512 out1392 = _mm512_shuffle_f32x4(tmp10856, in1523, 68);
__m512 out1400 = _mm512_shuffle_f32x4(tmp10856, in1523, 238);
__m512 out1393 = _mm512_shuffle_f32x4(tmp10854, tmp10847, 68);
__m512 out1401 = _mm512_shuffle_f32x4(tmp10854, tmp10847, 238);
__m512 out1394 = _mm512_shuffle_f32x4(tmp10844, in1525, 68);
__m512 out1402 = _mm512_shuffle_f32x4(tmp10844, in1525, 238);
__m512 out1395 = _mm512_shuffle_f32x4(tmp10851, tmp10859, 68);
__m512 out1403 = _mm512_shuffle_f32x4(tmp10851, tmp10859, 238);
__m512 out1396 = _mm512_shuffle_f32x4(tmp10860, in1530, 68);
__m512 out1404 = _mm512_shuffle_f32x4(tmp10860, in1530, 238);
__m512 out1397 = _mm512_shuffle_f32x4(tmp10858, tmp10852, 68);
__m512 out1405 = _mm512_shuffle_f32x4(tmp10858, tmp10852, 238);
__m512 out1398 = _mm512_shuffle_f32x4(tmp10849, in1532, 68);
__m512 out1406 = _mm512_shuffle_f32x4(tmp10849, in1532, 238);
// Store the 16 packed vectors. Each packed pair goes to one of four regions
// whose base offsets are 204800 apart (0, 204800, 409600, 614400); within a
// region the four vectors of this tile pair sit at +0, +64, +128, +192.
// Offsets are presumably in bytes (64 is the size of one __m512) -- TODO
// confirm the declared type of dfPtr8.
_mm512_storeu_ps(dfPtr8+0+819200*i35+49152*j28+49152*s25+768*k105, out1391);
_mm512_storeu_ps(dfPtr8+128+819200*i35+49152*j28+49152*s25+768*k105, out1399);
_mm512_storeu_ps(dfPtr8+64+819200*i35+49152*j28+49152*s25+768*k105, out1395);
_mm512_storeu_ps(dfPtr8+192+819200*i35+49152*j28+49152*s25+768*k105, out1403);
_mm512_storeu_ps(dfPtr8+204800+819200*i35+49152*j28+49152*s25+768*k105, out1392);
_mm512_storeu_ps(dfPtr8+204928+819200*i35+49152*j28+49152*s25+768*k105, out1400);
_mm512_storeu_ps(dfPtr8+204864+819200*i35+49152*j28+49152*s25+768*k105, out1396);
_mm512_storeu_ps(dfPtr8+204992+819200*i35+49152*j28+49152*s25+768*k105, out1404);
_mm512_storeu_ps(dfPtr8+409600+819200*i35+49152*j28+49152*s25+768*k105, out1393);
_mm512_storeu_ps(dfPtr8+409728+819200*i35+49152*j28+49152*s25+768*k105, out1401);
_mm512_storeu_ps(dfPtr8+409664+819200*i35+49152*j28+49152*s25+768*k105, out1397);
_mm512_storeu_ps(dfPtr8+409792+819200*i35+49152*j28+49152*s25+768*k105, out1405);
_mm512_storeu_ps(dfPtr8+614400+819200*i35+49152*j28+49152*s25+768*k105, out1394);
_mm512_storeu_ps(dfPtr8+614528+819200*i35+49152*j28+49152*s25+768*k105, out1402);
_mm512_storeu_ps(dfPtr8+614464+819200*i35+49152*j28+49152*s25+768*k105, out1398);
_mm512_storeu_ps(dfPtr8+614592+819200*i35+49152*j28+49152*s25+768*k105, out1406);
// Gather the input vectors for the next pair of tiles and clamp every loaded
// value with max(0, x) (ReLU). Two 16-lane registers are built per input row:
// group A (in1534..in1541) and group B (in1542..in1548).
// The masked loads zero every lane outside their mask. None of the masks used
// here (127, 31, 8191) covers lane 15, so permute index 15 always yields zero.
// Consecutive rows are 112 apart in the datPtr16 offset, the same multiplier
// as the h40 term (presumably one image row -- TODO confirm).
//
// Row 0, group A: 7 loaded values moved to lanes 9..15, lanes 0..8 zero.
// Group B has no row-0 load; the first transform pass starts it from zero.
__m512 dat1699 = _mm512_maskz_loadu_ps(127, datPtr16+676+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1699 = _mm512_max_ps(_mm512_setzero_ps(), dat1699);
__m512i pm161 = _mm512_set_epi32(6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in1534 = _mm512_permutexvar_ps(pm161, dat1699);
// Row 1: a 5-lane and a 7-lane load feed group A, one 13-lane load feeds group B.
__m512 dat1700 = _mm512_maskz_loadu_ps(31, datPtr16+208+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1700 = _mm512_max_ps(_mm512_setzero_ps(), dat1700);
__m512 dat1701 = _mm512_maskz_loadu_ps(127, datPtr16+788+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1701 = _mm512_max_ps(_mm512_setzero_ps(), dat1701);
__m512 dat1702 = _mm512_maskz_loadu_ps(8191, datPtr16+3252+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1702 = _mm512_max_ps(_mm512_setzero_ps(), dat1702);
// pm162 (two-source permute): lanes 0..4 <- first source lanes 0..4,
// lanes 5..8 <- zero, lanes 9..15 <- second source lanes 0..6.
__m512i pm162 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in1535 = _mm512_permutex2var_ps(dat1700, pm162, dat1701);
// pm163: lane 0 <- zero, lanes 1..7 <- elements 0..6, lanes 8..15 <- elements
// 5..12 (elements 5 and 6 appear in both halves).
__m512i pm163 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1542 = _mm512_permutexvar_ps(pm163, dat1702);
// Rows 2..7 repeat the row-1 pattern with every offset advanced by 112.
__m512 dat1703 = _mm512_maskz_loadu_ps(31, datPtr16+320+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1703 = _mm512_max_ps(_mm512_setzero_ps(), dat1703);
__m512 dat1704 = _mm512_maskz_loadu_ps(127, datPtr16+900+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1704 = _mm512_max_ps(_mm512_setzero_ps(), dat1704);
__m512 dat1705 = _mm512_maskz_loadu_ps(8191, datPtr16+3364+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1705 = _mm512_max_ps(_mm512_setzero_ps(), dat1705);
__m512 in1536 = _mm512_permutex2var_ps(dat1703, pm162, dat1704);
__m512 in1543 = _mm512_permutexvar_ps(pm163, dat1705);
__m512 dat1706 = _mm512_maskz_loadu_ps(31, datPtr16+432+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1706 = _mm512_max_ps(_mm512_setzero_ps(), dat1706);
__m512 dat1707 = _mm512_maskz_loadu_ps(127, datPtr16+1012+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1707 = _mm512_max_ps(_mm512_setzero_ps(), dat1707);
__m512 dat1708 = _mm512_maskz_loadu_ps(8191, datPtr16+3476+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1708 = _mm512_max_ps(_mm512_setzero_ps(), dat1708);
__m512 in1537 = _mm512_permutex2var_ps(dat1706, pm162, dat1707);
__m512 in1544 = _mm512_permutexvar_ps(pm163, dat1708);
__m512 dat1709 = _mm512_maskz_loadu_ps(31, datPtr16+544+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1709 = _mm512_max_ps(_mm512_setzero_ps(), dat1709);
__m512 dat1710 = _mm512_maskz_loadu_ps(127, datPtr16+1124+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1710 = _mm512_max_ps(_mm512_setzero_ps(), dat1710);
__m512 dat1711 = _mm512_maskz_loadu_ps(8191, datPtr16+3588+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1711 = _mm512_max_ps(_mm512_setzero_ps(), dat1711);
__m512 in1538 = _mm512_permutex2var_ps(dat1709, pm162, dat1710);
__m512 in1545 = _mm512_permutexvar_ps(pm163, dat1711);
__m512 dat1712 = _mm512_maskz_loadu_ps(31, datPtr16+656+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1712 = _mm512_max_ps(_mm512_setzero_ps(), dat1712);
__m512 dat1713 = _mm512_maskz_loadu_ps(127, datPtr16+1236+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1713 = _mm512_max_ps(_mm512_setzero_ps(), dat1713);
__m512 dat1714 = _mm512_maskz_loadu_ps(8191, datPtr16+3700+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1714 = _mm512_max_ps(_mm512_setzero_ps(), dat1714);
__m512 in1539 = _mm512_permutex2var_ps(dat1712, pm162, dat1713);
__m512 in1546 = _mm512_permutexvar_ps(pm163, dat1714);
__m512 dat1715 = _mm512_maskz_loadu_ps(31, datPtr16+768+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1715 = _mm512_max_ps(_mm512_setzero_ps(), dat1715);
__m512 dat1716 = _mm512_maskz_loadu_ps(127, datPtr16+1348+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1716 = _mm512_max_ps(_mm512_setzero_ps(), dat1716);
__m512 dat1717 = _mm512_maskz_loadu_ps(8191, datPtr16+3812+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1717 = _mm512_max_ps(_mm512_setzero_ps(), dat1717);
__m512 in1540 = _mm512_permutex2var_ps(dat1715, pm162, dat1716);
__m512 in1547 = _mm512_permutexvar_ps(pm163, dat1717);
__m512 dat1718 = _mm512_maskz_loadu_ps(31, datPtr16+880+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1718 = _mm512_max_ps(_mm512_setzero_ps(), dat1718);
__m512 dat1719 = _mm512_maskz_loadu_ps(127, datPtr16+1460+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1719 = _mm512_max_ps(_mm512_setzero_ps(), dat1719);
__m512 dat1720 = _mm512_maskz_loadu_ps(8191, datPtr16+3924+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1720 = _mm512_max_ps(_mm512_setzero_ps(), dat1720);
__m512 in1541 = _mm512_permutex2var_ps(dat1718, pm162, dat1719);
__m512 in1548 = _mm512_permutexvar_ps(pm163, dat1720);
// First transform pass: an 8-point linear combination across the eight row
// vectors of each register group, with statements for the two groups interleaved.
//   Group A: r0..r7 = in1534..in1541.
//   Group B: r1..r7 = in1542..in1548, r0 treated as zero.
// Results (algebraically; the code evaluates them with fused multiply-adds):
//   t0 =  r0 - 5.25*r2 + 5.25*r4 - r6
//   t1 =  (r1 + r5 - 4.25*r3) + (r2 + r6 - 4.25*r4)
//   t2 = -(r1 + r5 - 4.25*r3) + (r2 + r6 - 4.25*r4)
//   t3 =  2*(0.25*r1 - 1.25*r3 + r5) + (0.25*r2 - 1.25*r4 + r6)
//   t4 = -2*(0.25*r1 - 1.25*r3 + r5) + (0.25*r2 - 1.25*r4 + r6)
//   t5 =  2*(r1 - 1.25*r3 + 0.25*r5) + (4*r2 - 5*r4 + r6)
//   t6 = -2*(r1 - 1.25*r3 + 0.25*r5) + (4*r2 - 5*r4 + r6)
//   t7 = -r1 + 5.25*r3 - 5.25*r5 + r7
// On exit t0..t7 live in
//   group A: in1534, tmp10911, tmp10912, in1540, tmp10910, in1536, in1538, in1537
//   group B: tmp10916, tmp10915, tmp10917, in1547, tmp10914, in1543, in1545, in1544
// NOTE(review): these coefficients equal the 8x8 input-transform matrix of
// Winograd F(6,3) convolution; presumably that is what this implements -- confirm.
__m512 tmp10909 = _mm512_add_ps(in1535, in1539);
__m512 tmp10913 = _mm512_add_ps(in1542, in1546);
__m512 tmp10910 = _mm512_sub_ps(in1538, in1536);
__m512 tmp10914 = _mm512_sub_ps(in1545, in1543);
__m512 tmp10911 = _mm512_add_ps(in1536, in1540);
__m512 tmp10915 = _mm512_add_ps(in1543, in1547);
in1534 = _mm512_sub_ps(in1534, in1540);
// Group B has no r0, so its t0 accumulator starts from 0 - r6.
__m512 tmp10916 = _mm512_sub_ps(_mm512_setzero_ps(), in1547);
tmp10909 = _mm512_fmadd_ps(in1537, _mm512_set1_ps(-4.25e+00f), tmp10909);
tmp10913 = _mm512_fmadd_ps(in1544, _mm512_set1_ps(-4.25e+00f), tmp10913);
tmp10911 = _mm512_fmadd_ps(in1538, _mm512_set1_ps(-4.25e+00f), tmp10911);
tmp10915 = _mm512_fmadd_ps(in1545, _mm512_set1_ps(-4.25e+00f), tmp10915);
in1534 = _mm512_fmadd_ps(tmp10910, _mm512_set1_ps(5.25e+00f), in1534);
tmp10916 = _mm512_fmadd_ps(tmp10914, _mm512_set1_ps(5.25e+00f), tmp10916);
tmp10910 = _mm512_fmadd_ps(in1536, _mm512_set1_ps(2.5e-01f), in1540);
tmp10914 = _mm512_fmadd_ps(in1543, _mm512_set1_ps(2.5e-01f), in1547);
in1536 = _mm512_fmadd_ps(in1536, _mm512_set1_ps(4e+00f), in1540);
in1543 = _mm512_fmadd_ps(in1543, _mm512_set1_ps(4e+00f), in1547);
// t2 (difference) and t1 (sum) of the odd-row and even-row partial sums.
__m512 tmp10912 = _mm512_sub_ps(tmp10911, tmp10909);
__m512 tmp10917 = _mm512_sub_ps(tmp10915, tmp10913);
tmp10911 = _mm512_add_ps(tmp10909, tmp10911);
tmp10915 = _mm512_add_ps(tmp10913, tmp10915);
tmp10909 = _mm512_fmadd_ps(in1535, _mm512_set1_ps(2.5e-01f), in1539);
tmp10913 = _mm512_fmadd_ps(in1542, _mm512_set1_ps(2.5e-01f), in1546);
tmp10910 = _mm512_fmadd_ps(in1538, _mm512_set1_ps(-1.25e+00f), tmp10910);
tmp10914 = _mm512_fmadd_ps(in1545, _mm512_set1_ps(-1.25e+00f), tmp10914);
in1538 = _mm512_fmadd_ps(in1538, _mm512_set1_ps(-5e+00f), in1536);
in1545 = _mm512_fmadd_ps(in1545, _mm512_set1_ps(-5e+00f), in1543);
tmp10909 = _mm512_fmadd_ps(in1537, _mm512_set1_ps(-1.25e+00f), tmp10909);
tmp10913 = _mm512_fmadd_ps(in1544, _mm512_set1_ps(-1.25e+00f), tmp10913);
// t3 and t4: the r6 registers (in1540 / in1547) are reused to hold t3.
in1540 = _mm512_fmadd_ps(tmp10909, _mm512_set1_ps(2e+00f), tmp10910);
in1547 = _mm512_fmadd_ps(tmp10913, _mm512_set1_ps(2e+00f), tmp10914);
tmp10910 = _mm512_fnmadd_ps(tmp10909, _mm512_set1_ps(2e+00f), tmp10910);
tmp10914 = _mm512_fnmadd_ps(tmp10913, _mm512_set1_ps(2e+00f), tmp10914);
tmp10909 = _mm512_fmadd_ps(in1539, _mm512_set1_ps(2.5e-01f), in1535);
tmp10913 = _mm512_fmadd_ps(in1546, _mm512_set1_ps(2.5e-01f), in1542);
in1535 = _mm512_sub_ps(in1541, in1535);
in1542 = _mm512_sub_ps(in1548, in1542);
tmp10909 = _mm512_fmadd_ps(in1537, _mm512_set1_ps(-1.25e+00f), tmp10909);
tmp10913 = _mm512_fmadd_ps(in1544, _mm512_set1_ps(-1.25e+00f), tmp10913);
in1537 = _mm512_sub_ps(in1537, in1539);
in1544 = _mm512_sub_ps(in1544, in1546);
// t7, then t5 and t6.
in1537 = _mm512_fmadd_ps(in1537, _mm512_set1_ps(5.25e+00f), in1535);
in1544 = _mm512_fmadd_ps(in1544, _mm512_set1_ps(5.25e+00f), in1542);
in1536 = _mm512_fmadd_ps(tmp10909, _mm512_set1_ps(2e+00f), in1538);
in1543 = _mm512_fmadd_ps(tmp10913, _mm512_set1_ps(2e+00f), in1545);
in1538 = _mm512_fnmadd_ps(tmp10909, _mm512_set1_ps(2e+00f), in1538);
in1545 = _mm512_fnmadd_ps(tmp10913, _mm512_set1_ps(2e+00f), in1545);
// Transpose the 16x16 float matrix whose rows are the 16 first-pass results
// (group A t0..t7 followed by group B t0..t7), using four shuffle stages.
// Stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp10926 = _mm512_unpacklo_ps(in1534, tmp10911);
__m512 tmp10927 = _mm512_unpackhi_ps(in1534, tmp10911);
__m512 tmp10928 = _mm512_unpacklo_ps(tmp10912, in1540);
__m512 tmp10929 = _mm512_unpackhi_ps(tmp10912, in1540);
__m512 tmp10930 = _mm512_unpacklo_ps(tmp10910, in1536);
__m512 tmp10931 = _mm512_unpackhi_ps(tmp10910, in1536);
__m512 tmp10932 = _mm512_unpacklo_ps(in1538, in1537);
__m512 tmp10933 = _mm512_unpackhi_ps(in1538, in1537);
__m512 tmp10934 = _mm512_unpacklo_ps(tmp10916, tmp10915);
__m512 tmp10935 = _mm512_unpackhi_ps(tmp10916, tmp10915);
__m512 tmp10936 = _mm512_unpacklo_ps(tmp10917, in1547);
__m512 tmp10937 = _mm512_unpackhi_ps(tmp10917, in1547);
__m512 tmp10938 = _mm512_unpacklo_ps(tmp10914, in1543);
__m512 tmp10939 = _mm512_unpackhi_ps(tmp10914, in1543);
__m512 tmp10940 = _mm512_unpacklo_ps(in1545, in1544);
__m512 tmp10941 = _mm512_unpackhi_ps(in1545, in1544);
// Stage 2: each 128-bit lane now holds one column element from four rows
// (selector 68 takes the low halves of both operands, 238 the high halves).
__m512 tmp10942 = _mm512_shuffle_ps(tmp10926, tmp10928, 68);
__m512 tmp10943 = _mm512_shuffle_ps(tmp10926, tmp10928, 238);
__m512 tmp10944 = _mm512_shuffle_ps(tmp10927, tmp10929, 68);
__m512 tmp10945 = _mm512_shuffle_ps(tmp10927, tmp10929, 238);
__m512 tmp10946 = _mm512_shuffle_ps(tmp10930, tmp10932, 68);
__m512 tmp10947 = _mm512_shuffle_ps(tmp10930, tmp10932, 238);
__m512 tmp10948 = _mm512_shuffle_ps(tmp10931, tmp10933, 68);
__m512 tmp10949 = _mm512_shuffle_ps(tmp10931, tmp10933, 238);
__m512 tmp10950 = _mm512_shuffle_ps(tmp10934, tmp10936, 68);
__m512 tmp10951 = _mm512_shuffle_ps(tmp10934, tmp10936, 238);
__m512 tmp10952 = _mm512_shuffle_ps(tmp10935, tmp10937, 68);
__m512 tmp10953 = _mm512_shuffle_ps(tmp10935, tmp10937, 238);
__m512 tmp10954 = _mm512_shuffle_ps(tmp10938, tmp10940, 68);
__m512 tmp10955 = _mm512_shuffle_ps(tmp10938, tmp10940, 238);
__m512 tmp10956 = _mm512_shuffle_ps(tmp10939, tmp10941, 68);
__m512 tmp10957 = _mm512_shuffle_ps(tmp10939, tmp10941, 238);
// Stage 3: regroup 128-bit lanes (136 picks lanes 0 and 2 of each operand,
// 221 picks lanes 1 and 3).
__m512 tmp10958 = _mm512_shuffle_f32x4(tmp10942, tmp10946, 136);
__m512 tmp10959 = _mm512_shuffle_f32x4(tmp10942, tmp10946, 221);
__m512 tmp10960 = _mm512_shuffle_f32x4(tmp10943, tmp10947, 136);
__m512 tmp10961 = _mm512_shuffle_f32x4(tmp10943, tmp10947, 221);
__m512 tmp10962 = _mm512_shuffle_f32x4(tmp10944, tmp10948, 136);
__m512 tmp10963 = _mm512_shuffle_f32x4(tmp10944, tmp10948, 221);
__m512 tmp10964 = _mm512_shuffle_f32x4(tmp10945, tmp10949, 136);
__m512 tmp10965 = _mm512_shuffle_f32x4(tmp10945, tmp10949, 221);
__m512 tmp10966 = _mm512_shuffle_f32x4(tmp10950, tmp10954, 136);
__m512 tmp10967 = _mm512_shuffle_f32x4(tmp10950, tmp10954, 221);
__m512 tmp10968 = _mm512_shuffle_f32x4(tmp10951, tmp10955, 136);
__m512 tmp10969 = _mm512_shuffle_f32x4(tmp10951, tmp10955, 221);
__m512 tmp10970 = _mm512_shuffle_f32x4(tmp10952, tmp10956, 136);
__m512 tmp10971 = _mm512_shuffle_f32x4(tmp10952, tmp10956, 221);
__m512 tmp10972 = _mm512_shuffle_f32x4(tmp10953, tmp10957, 136);
__m512 tmp10973 = _mm512_shuffle_f32x4(tmp10953, tmp10957, 221);
// Stage 4: final lane regroup, writing the transposed matrix back into the
// pass-1 variables. Columns 0..7 land in in1534, tmp10911, tmp10912, in1540,
// tmp10910, in1536, in1538, in1537; columns 8..15 land in tmp10916, tmp10915,
// tmp10917, in1547, tmp10914, in1543, in1545, in1544.
in1534 = _mm512_shuffle_f32x4(tmp10958, tmp10966, 136);
tmp10916 = _mm512_shuffle_f32x4(tmp10958, tmp10966, 221);
tmp10911 = _mm512_shuffle_f32x4(tmp10960, tmp10968, 136);
tmp10915 = _mm512_shuffle_f32x4(tmp10960, tmp10968, 221);
tmp10912 = _mm512_shuffle_f32x4(tmp10962, tmp10970, 136);
tmp10917 = _mm512_shuffle_f32x4(tmp10962, tmp10970, 221);
in1540 = _mm512_shuffle_f32x4(tmp10964, tmp10972, 136);
in1547 = _mm512_shuffle_f32x4(tmp10964, tmp10972, 221);
tmp10910 = _mm512_shuffle_f32x4(tmp10959, tmp10967, 136);
tmp10914 = _mm512_shuffle_f32x4(tmp10959, tmp10967, 221);
in1536 = _mm512_shuffle_f32x4(tmp10961, tmp10969, 136);
in1543 = _mm512_shuffle_f32x4(tmp10961, tmp10969, 221);
in1538 = _mm512_shuffle_f32x4(tmp10963, tmp10971, 136);
in1545 = _mm512_shuffle_f32x4(tmp10963, tmp10971, 221);
in1537 = _mm512_shuffle_f32x4(tmp10965, tmp10973, 136);
in1544 = _mm512_shuffle_f32x4(tmp10965, tmp10973, 221);
// Second transform pass: the same 8-point combination as the first pass,
// applied to the transposed vectors (so it acts along the other tile axis).
//   Set 1: c0..c7 = in1534, tmp10911, tmp10912, in1540, tmp10910, in1536, in1538, in1537
//   Set 2: c0..c7 = tmp10916, tmp10915, tmp10917, in1547, tmp10914, in1543, in1545, in1544
// Unlike the first pass, both sets have a real c0 here.
// On exit the eight results u0..u7 live in
//   set 1: in1534, tmp10920, tmp10921, in1538, tmp10919, tmp10912, tmp10910, in1540
//   set 2: tmp10916, tmp10924, tmp10925, in1545, tmp10923, tmp10917, tmp10914, in1547
__m512 tmp10918 = _mm512_add_ps(tmp10911, in1536);
__m512 tmp10922 = _mm512_add_ps(tmp10915, in1543);
__m512 tmp10919 = _mm512_sub_ps(tmp10910, tmp10912);
__m512 tmp10923 = _mm512_sub_ps(tmp10914, tmp10917);
__m512 tmp10920 = _mm512_add_ps(tmp10912, in1538);
__m512 tmp10924 = _mm512_add_ps(tmp10917, in1545);
in1534 = _mm512_sub_ps(in1534, in1538);
tmp10916 = _mm512_sub_ps(tmp10916, in1545);
tmp10918 = _mm512_fmadd_ps(in1540, _mm512_set1_ps(-4.25e+00f), tmp10918);
tmp10922 = _mm512_fmadd_ps(in1547, _mm512_set1_ps(-4.25e+00f), tmp10922);
tmp10920 = _mm512_fmadd_ps(tmp10910, _mm512_set1_ps(-4.25e+00f), tmp10920);
tmp10924 = _mm512_fmadd_ps(tmp10914, _mm512_set1_ps(-4.25e+00f), tmp10924);
// u0 = c0 - 5.25*c2 + 5.25*c4 - c6
in1534 = _mm512_fmadd_ps(tmp10919, _mm512_set1_ps(5.25e+00f), in1534);
tmp10916 = _mm512_fmadd_ps(tmp10923, _mm512_set1_ps(5.25e+00f), tmp10916);
tmp10919 = _mm512_fmadd_ps(tmp10912, _mm512_set1_ps(2.5e-01f), in1538);
tmp10923 = _mm512_fmadd_ps(tmp10917, _mm512_set1_ps(2.5e-01f), in1545);
tmp10912 = _mm512_fmadd_ps(tmp10912, _mm512_set1_ps(4e+00f), in1538);
tmp10917 = _mm512_fmadd_ps(tmp10917, _mm512_set1_ps(4e+00f), in1545);
// u2 (difference) and u1 (sum) of the even and odd partial sums.
__m512 tmp10921 = _mm512_sub_ps(tmp10920, tmp10918);
__m512 tmp10925 = _mm512_sub_ps(tmp10924, tmp10922);
tmp10920 = _mm512_add_ps(tmp10918, tmp10920);
tmp10924 = _mm512_add_ps(tmp10922, tmp10924);
tmp10918 = _mm512_fmadd_ps(tmp10911, _mm512_set1_ps(2.5e-01f), in1536);
tmp10922 = _mm512_fmadd_ps(tmp10915, _mm512_set1_ps(2.5e-01f), in1543);
tmp10919 = _mm512_fmadd_ps(tmp10910, _mm512_set1_ps(-1.25e+00f), tmp10919);
tmp10923 = _mm512_fmadd_ps(tmp10914, _mm512_set1_ps(-1.25e+00f), tmp10923);
tmp10910 = _mm512_fmadd_ps(tmp10910, _mm512_set1_ps(-5e+00f), tmp10912);
tmp10914 = _mm512_fmadd_ps(tmp10914, _mm512_set1_ps(-5e+00f), tmp10917);
tmp10918 = _mm512_fmadd_ps(in1540, _mm512_set1_ps(-1.25e+00f), tmp10918);
tmp10922 = _mm512_fmadd_ps(in1547, _mm512_set1_ps(-1.25e+00f), tmp10922);
// u3 and u4.
in1538 = _mm512_fmadd_ps(tmp10918, _mm512_set1_ps(2e+00f), tmp10919);
in1545 = _mm512_fmadd_ps(tmp10922, _mm512_set1_ps(2e+00f), tmp10923);
tmp10919 = _mm512_fnmadd_ps(tmp10918, _mm512_set1_ps(2e+00f), tmp10919);
tmp10923 = _mm512_fnmadd_ps(tmp10922, _mm512_set1_ps(2e+00f), tmp10923);
tmp10918 = _mm512_fmadd_ps(in1536, _mm512_set1_ps(2.5e-01f), tmp10911);
tmp10922 = _mm512_fmadd_ps(in1543, _mm512_set1_ps(2.5e-01f), tmp10915);
tmp10911 = _mm512_sub_ps(in1537, tmp10911);
tmp10915 = _mm512_sub_ps(in1544, tmp10915);
tmp10918 = _mm512_fmadd_ps(in1540, _mm512_set1_ps(-1.25e+00f), tmp10918);
tmp10922 = _mm512_fmadd_ps(in1547, _mm512_set1_ps(-1.25e+00f), tmp10922);
in1540 = _mm512_sub_ps(in1540, in1536);
in1547 = _mm512_sub_ps(in1547, in1543);
// u7, then u5 and u6.
in1540 = _mm512_fmadd_ps(in1540, _mm512_set1_ps(5.25e+00f), tmp10911);
in1547 = _mm512_fmadd_ps(in1547, _mm512_set1_ps(5.25e+00f), tmp10915);
tmp10912 = _mm512_fmadd_ps(tmp10918, _mm512_set1_ps(2e+00f), tmp10910);
tmp10917 = _mm512_fmadd_ps(tmp10922, _mm512_set1_ps(2e+00f), tmp10914);
tmp10910 = _mm512_fnmadd_ps(tmp10918, _mm512_set1_ps(2e+00f), tmp10910);
tmp10914 = _mm512_fnmadd_ps(tmp10922, _mm512_set1_ps(2e+00f), tmp10914);
// Pack the 16 second-pass results two at a time: for each pair (u0,u1),
// (u2,u3), (u4,u5), (u6,u7), selector 68 concatenates the low 256 bits of both
// vectors and selector 238 concatenates their high 256 bits.
// out1407..out1410 / out1415..out1418 come from set 1,
// out1411..out1414 / out1419..out1422 from set 2.
__m512 out1407 = _mm512_shuffle_f32x4(in1534, tmp10920, 68);
__m512 out1415 = _mm512_shuffle_f32x4(in1534, tmp10920, 238);
__m512 out1408 = _mm512_shuffle_f32x4(tmp10921, in1538, 68);
__m512 out1416 = _mm512_shuffle_f32x4(tmp10921, in1538, 238);
__m512 out1409 = _mm512_shuffle_f32x4(tmp10919, tmp10912, 68);
__m512 out1417 = _mm512_shuffle_f32x4(tmp10919, tmp10912, 238);
__m512 out1410 = _mm512_shuffle_f32x4(tmp10910, in1540, 68);
__m512 out1418 = _mm512_shuffle_f32x4(tmp10910, in1540, 238);
__m512 out1411 = _mm512_shuffle_f32x4(tmp10916, tmp10924, 68);
__m512 out1419 = _mm512_shuffle_f32x4(tmp10916, tmp10924, 238);
__m512 out1412 = _mm512_shuffle_f32x4(tmp10925, in1545, 68);
__m512 out1420 = _mm512_shuffle_f32x4(tmp10925, in1545, 238);
__m512 out1413 = _mm512_shuffle_f32x4(tmp10923, tmp10917, 68);
__m512 out1421 = _mm512_shuffle_f32x4(tmp10923, tmp10917, 238);
__m512 out1414 = _mm512_shuffle_f32x4(tmp10914, in1547, 68);
__m512 out1422 = _mm512_shuffle_f32x4(tmp10914, in1547, 238);
// Store the packed vectors. The four result pairs go to regions whose bases
// are 204800 apart; within each region this tile pair occupies +256..+448 in
// the order set-1 low, set-2 low, set-1 high, set-2 high (each 64 apart).
// Offsets are presumably in bytes (64 is the size of one __m512) -- TODO
// confirm the declared type of dfPtr8.
_mm512_storeu_ps(dfPtr8+256+819200*i35+49152*j28+49152*s25+768*k105, out1407);
_mm512_storeu_ps(dfPtr8+384+819200*i35+49152*j28+49152*s25+768*k105, out1415);
_mm512_storeu_ps(dfPtr8+320+819200*i35+49152*j28+49152*s25+768*k105, out1411);
_mm512_storeu_ps(dfPtr8+448+819200*i35+49152*j28+49152*s25+768*k105, out1419);
_mm512_storeu_ps(dfPtr8+205056+819200*i35+49152*j28+49152*s25+768*k105, out1408);
_mm512_storeu_ps(dfPtr8+205184+819200*i35+49152*j28+49152*s25+768*k105, out1416);
_mm512_storeu_ps(dfPtr8+205120+819200*i35+49152*j28+49152*s25+768*k105, out1412);
_mm512_storeu_ps(dfPtr8+205248+819200*i35+49152*j28+49152*s25+768*k105, out1420);
_mm512_storeu_ps(dfPtr8+409856+819200*i35+49152*j28+49152*s25+768*k105, out1409);
_mm512_storeu_ps(dfPtr8+409984+819200*i35+49152*j28+49152*s25+768*k105, out1417);
_mm512_storeu_ps(dfPtr8+409920+819200*i35+49152*j28+49152*s25+768*k105, out1413);
_mm512_storeu_ps(dfPtr8+410048+819200*i35+49152*j28+49152*s25+768*k105, out1421);
_mm512_storeu_ps(dfPtr8+614656+819200*i35+49152*j28+49152*s25+768*k105, out1410);
_mm512_storeu_ps(dfPtr8+614784+819200*i35+49152*j28+49152*s25+768*k105, out1418);
_mm512_storeu_ps(dfPtr8+614720+819200*i35+49152*j28+49152*s25+768*k105, out1414);
_mm512_storeu_ps(dfPtr8+614848+819200*i35+49152*j28+49152*s25+768*k105, out1422);
// Gather the input vectors for the next pair of tiles and clamp every loaded
// value with max(0, x) (ReLU). Two 16-lane registers are built per input row:
// group A (in1549..in1555) and group B (in1556..in1563).
// The masked loads zero every lane outside their mask. None of the masks used
// here (127, 16383, 31) covers lane 15, so permute index 15 always yields zero.
// Consecutive rows are 112 apart in the datPtr16 offset.
//
// Row 0, group B: 7 loaded values moved to lanes 9..15, lanes 0..8 zero.
// Group A has no row-0 load; the first transform pass starts it from zero.
__m512 dat1721 = _mm512_maskz_loadu_ps(127, datPtr16+3812+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1721 = _mm512_max_ps(_mm512_setzero_ps(), dat1721);
__m512i pm164 = _mm512_set_epi32(6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in1556 = _mm512_permutexvar_ps(pm164, dat1721);
// Row 1: one 14-lane load feeds group A, a 5-lane and a 7-lane load feed group B.
__m512 dat1722 = _mm512_maskz_loadu_ps(16383, datPtr16+3296+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1722 = _mm512_max_ps(_mm512_setzero_ps(), dat1722);
__m512 dat1723 = _mm512_maskz_loadu_ps(31, datPtr16+3344+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1723 = _mm512_max_ps(_mm512_setzero_ps(), dat1723);
__m512 dat1724 = _mm512_maskz_loadu_ps(127, datPtr16+3924+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1724 = _mm512_max_ps(_mm512_setzero_ps(), dat1724);
// pm165: lanes 0..7 <- elements 0..7, lanes 8..15 <- elements 6..13
// (elements 6 and 7 appear in both halves).
__m512i pm165 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1549 = _mm512_permutexvar_ps(pm165, dat1722);
// pm166 (two-source permute): lanes 0..4 <- first source lanes 0..4,
// lanes 5..8 <- zero, lanes 9..15 <- second source lanes 0..6.
__m512i pm166 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in1557 = _mm512_permutex2var_ps(dat1723, pm166, dat1724);
// Rows 2..7 repeat the row-1 pattern with every offset advanced by 112.
__m512 dat1725 = _mm512_maskz_loadu_ps(16383, datPtr16+3408+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1725 = _mm512_max_ps(_mm512_setzero_ps(), dat1725);
__m512 dat1726 = _mm512_maskz_loadu_ps(31, datPtr16+3456+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1726 = _mm512_max_ps(_mm512_setzero_ps(), dat1726);
__m512 dat1727 = _mm512_maskz_loadu_ps(127, datPtr16+4036+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1727 = _mm512_max_ps(_mm512_setzero_ps(), dat1727);
__m512 in1550 = _mm512_permutexvar_ps(pm165, dat1725);
__m512 in1558 = _mm512_permutex2var_ps(dat1726, pm166, dat1727);
__m512 dat1728 = _mm512_maskz_loadu_ps(16383, datPtr16+3520+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1728 = _mm512_max_ps(_mm512_setzero_ps(), dat1728);
__m512 dat1729 = _mm512_maskz_loadu_ps(31, datPtr16+3568+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1729 = _mm512_max_ps(_mm512_setzero_ps(), dat1729);
__m512 dat1730 = _mm512_maskz_loadu_ps(127, datPtr16+4148+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1730 = _mm512_max_ps(_mm512_setzero_ps(), dat1730);
__m512 in1551 = _mm512_permutexvar_ps(pm165, dat1728);
__m512 in1559 = _mm512_permutex2var_ps(dat1729, pm166, dat1730);
__m512 dat1731 = _mm512_maskz_loadu_ps(16383, datPtr16+3632+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1731 = _mm512_max_ps(_mm512_setzero_ps(), dat1731);
__m512 dat1732 = _mm512_maskz_loadu_ps(31, datPtr16+3680+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1732 = _mm512_max_ps(_mm512_setzero_ps(), dat1732);
__m512 dat1733 = _mm512_maskz_loadu_ps(127, datPtr16+4260+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1733 = _mm512_max_ps(_mm512_setzero_ps(), dat1733);
__m512 in1552 = _mm512_permutexvar_ps(pm165, dat1731);
__m512 in1560 = _mm512_permutex2var_ps(dat1732, pm166, dat1733);
__m512 dat1734 = _mm512_maskz_loadu_ps(16383, datPtr16+3744+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1734 = _mm512_max_ps(_mm512_setzero_ps(), dat1734);
__m512 dat1735 = _mm512_maskz_loadu_ps(31, datPtr16+3792+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1735 = _mm512_max_ps(_mm512_setzero_ps(), dat1735);
__m512 dat1736 = _mm512_maskz_loadu_ps(127, datPtr16+4372+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1736 = _mm512_max_ps(_mm512_setzero_ps(), dat1736);
__m512 in1553 = _mm512_permutexvar_ps(pm165, dat1734);
__m512 in1561 = _mm512_permutex2var_ps(dat1735, pm166, dat1736);
__m512 dat1737 = _mm512_maskz_loadu_ps(16383, datPtr16+3856+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1737 = _mm512_max_ps(_mm512_setzero_ps(), dat1737);
__m512 dat1738 = _mm512_maskz_loadu_ps(31, datPtr16+3904+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1738 = _mm512_max_ps(_mm512_setzero_ps(), dat1738);
__m512 dat1739 = _mm512_maskz_loadu_ps(127, datPtr16+4484+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1739 = _mm512_max_ps(_mm512_setzero_ps(), dat1739);
__m512 in1554 = _mm512_permutexvar_ps(pm165, dat1737);
__m512 in1562 = _mm512_permutex2var_ps(dat1738, pm166, dat1739);
__m512 dat1740 = _mm512_maskz_loadu_ps(16383, datPtr16+3968+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1740 = _mm512_max_ps(_mm512_setzero_ps(), dat1740);
__m512 dat1741 = _mm512_maskz_loadu_ps(31, datPtr16+4016+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1741 = _mm512_max_ps(_mm512_setzero_ps(), dat1741);
__m512 dat1742 = _mm512_maskz_loadu_ps(127, datPtr16+4596+401408*i35+112*h40+4*w48+401408*s25+6272*k105);
dat1742 = _mm512_max_ps(_mm512_setzero_ps(), dat1742);
__m512 in1555 = _mm512_permutexvar_ps(pm165, dat1740);
__m512 in1563 = _mm512_permutex2var_ps(dat1741, pm166, dat1742);
// First transform pass: an 8-point linear combination across the eight row
// vectors of each register group, with statements for the two groups interleaved.
//   Group A: r1..r7 = in1549..in1555, r0 treated as zero.
//   Group B: r0..r7 = in1556..in1563.
// Results (algebraically; the code evaluates them with fused multiply-adds):
//   t0 =  r0 - 5.25*r2 + 5.25*r4 - r6
//   t1 =  (r1 + r5 - 4.25*r3) + (r2 + r6 - 4.25*r4)
//   t2 = -(r1 + r5 - 4.25*r3) + (r2 + r6 - 4.25*r4)
//   t3 =  2*(0.25*r1 - 1.25*r3 + r5) + (0.25*r2 - 1.25*r4 + r6)
//   t4 = -2*(0.25*r1 - 1.25*r3 + r5) + (0.25*r2 - 1.25*r4 + r6)
//   t5 =  2*(r1 - 1.25*r3 + 0.25*r5) + (4*r2 - 5*r4 + r6)
//   t6 = -2*(r1 - 1.25*r3 + 0.25*r5) + (4*r2 - 5*r4 + r6)
//   t7 = -r1 + 5.25*r3 - 5.25*r5 + r7
// On exit t0..t7 live in
//   group A: tmp10977, tmp10976, tmp10978, in1554, tmp10975, in1550, in1552, in1551
//   group B: in1556, tmp10981, tmp10982, in1562, tmp10980, in1558, in1560, in1559
// NOTE(review): these coefficients equal the 8x8 input-transform matrix of
// Winograd F(6,3) convolution; presumably that is what this implements -- confirm.
__m512 tmp10974 = _mm512_add_ps(in1549, in1553);
__m512 tmp10979 = _mm512_add_ps(in1557, in1561);
__m512 tmp10975 = _mm512_sub_ps(in1552, in1550);
__m512 tmp10980 = _mm512_sub_ps(in1560, in1558);
__m512 tmp10976 = _mm512_add_ps(in1550, in1554);
__m512 tmp10981 = _mm512_add_ps(in1558, in1562);
// Group A has no r0, so its t0 accumulator starts from 0 - r6.
__m512 tmp10977 = _mm512_sub_ps(_mm512_setzero_ps(), in1554);
in1556 = _mm512_sub_ps(in1556, in1562);
tmp10974 = _mm512_fmadd_ps(in1551, _mm512_set1_ps(-4.25e+00f), tmp10974);
tmp10979 = _mm512_fmadd_ps(in1559, _mm512_set1_ps(-4.25e+00f), tmp10979);
tmp10976 = _mm512_fmadd_ps(in1552, _mm512_set1_ps(-4.25e+00f), tmp10976);
tmp10981 = _mm512_fmadd_ps(in1560, _mm512_set1_ps(-4.25e+00f), tmp10981);
tmp10977 = _mm512_fmadd_ps(tmp10975, _mm512_set1_ps(5.25e+00f), tmp10977);
in1556 = _mm512_fmadd_ps(tmp10980, _mm512_set1_ps(5.25e+00f), in1556);
tmp10975 = _mm512_fmadd_ps(in1550, _mm512_set1_ps(2.5e-01f), in1554);
tmp10980 = _mm512_fmadd_ps(in1558, _mm512_set1_ps(2.5e-01f), in1562);
in1550 = _mm512_fmadd_ps(in1550, _mm512_set1_ps(4e+00f), in1554);
in1558 = _mm512_fmadd_ps(in1558, _mm512_set1_ps(4e+00f), in1562);
// t2 (difference) and t1 (sum) of the odd-row and even-row partial sums.
__m512 tmp10978 = _mm512_sub_ps(tmp10976, tmp10974);
__m512 tmp10982 = _mm512_sub_ps(tmp10981, tmp10979);
tmp10976 = _mm512_add_ps(tmp10974, tmp10976);
tmp10981 = _mm512_add_ps(tmp10979, tmp10981);
tmp10974 = _mm512_fmadd_ps(in1549, _mm512_set1_ps(2.5e-01f), in1553);
tmp10979 = _mm512_fmadd_ps(in1557, _mm512_set1_ps(2.5e-01f), in1561);
tmp10975 = _mm512_fmadd_ps(in1552, _mm512_set1_ps(-1.25e+00f), tmp10975);
tmp10980 = _mm512_fmadd_ps(in1560, _mm512_set1_ps(-1.25e+00f), tmp10980);
in1552 = _mm512_fmadd_ps(in1552, _mm512_set1_ps(-5e+00f), in1550);
in1560 = _mm512_fmadd_ps(in1560, _mm512_set1_ps(-5e+00f), in1558);
tmp10974 = _mm512_fmadd_ps(in1551, _mm512_set1_ps(-1.25e+00f), tmp10974);
tmp10979 = _mm512_fmadd_ps(in1559, _mm512_set1_ps(-1.25e+00f), tmp10979);
// t3 and t4: the r6 registers (in1554 / in1562) are reused to hold t3.
in1554 = _mm512_fmadd_ps(tmp10974, _mm512_set1_ps(2e+00f), tmp10975);
in1562 = _mm512_fmadd_ps(tmp10979, _mm512_set1_ps(2e+00f), tmp10980);
tmp10975 = _mm512_fnmadd_ps(tmp10974, _mm512_set1_ps(2e+00f), tmp10975);
tmp10980 = _mm512_fnmadd_ps(tmp10979, _mm512_set1_ps(2e+00f), tmp10980);
tmp10974 = _mm512_fmadd_ps(in1553, _mm512_set1_ps(2.5e-01f), in1549);
tmp10979 = _mm512_fmadd_ps(in1561, _mm512_set1_ps(2.5e-01f), in1557);
in1549 = _mm512_sub_ps(in1555, in1549);
in1557 = _mm512_sub_ps(in1563, in1557);
tmp10974 = _mm512_fmadd_ps(in1551, _mm512_set1_ps(-1.25e+00f), tmp10974);
tmp10979 = _mm512_fmadd_ps(in1559, _mm512_set1_ps(-1.25e+00f), tmp10979);
in1551 = _mm512_sub_ps(in1551, in1553);
in1559 = _mm512_sub_ps(in1559, in1561);
// t7, then t5 and t6.
in1551 = _mm512_fmadd_ps(in1551, _mm512_set1_ps(5.25e+00f), in1549);
in1559 = _mm512_fmadd_ps(in1559, _mm512_set1_ps(5.25e+00f), in1557);
in1550 = _mm512_fmadd_ps(tmp10974, _mm512_set1_ps(2e+00f), in1552);
in1558 = _mm512_fmadd_ps(tmp10979, _mm512_set1_ps(2e+00f), in1560);
in1552 = _mm512_fnmadd_ps(tmp10974, _mm512_set1_ps(2e+00f), in1552);
in1560 = _mm512_fnmadd_ps(tmp10979, _mm512_set1_ps(2e+00f), in1560);
// Transpose the 16x16 float matrix whose rows are the 16 first-pass results
// (group A t0..t7 followed by group B t0..t7), using four shuffle stages.
// Stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp10991 = _mm512_unpacklo_ps(tmp10977, tmp10976);
__m512 tmp10992 = _mm512_unpackhi_ps(tmp10977, tmp10976);
__m512 tmp10993 = _mm512_unpacklo_ps(tmp10978, in1554);
__m512 tmp10994 = _mm512_unpackhi_ps(tmp10978, in1554);
__m512 tmp10995 = _mm512_unpacklo_ps(tmp10975, in1550);
__m512 tmp10996 = _mm512_unpackhi_ps(tmp10975, in1550);
__m512 tmp10997 = _mm512_unpacklo_ps(in1552, in1551);
__m512 tmp10998 = _mm512_unpackhi_ps(in1552, in1551);
__m512 tmp10999 = _mm512_unpacklo_ps(in1556, tmp10981);
__m512 tmp11000 = _mm512_unpackhi_ps(in1556, tmp10981);
__m512 tmp11001 = _mm512_unpacklo_ps(tmp10982, in1562);
__m512 tmp11002 = _mm512_unpackhi_ps(tmp10982, in1562);
__m512 tmp11003 = _mm512_unpacklo_ps(tmp10980, in1558);
__m512 tmp11004 = _mm512_unpackhi_ps(tmp10980, in1558);
__m512 tmp11005 = _mm512_unpacklo_ps(in1560, in1559);
__m512 tmp11006 = _mm512_unpackhi_ps(in1560, in1559);
// Stage 2: each 128-bit lane now holds one column element from four rows
// (selector 68 takes the low halves of both operands, 238 the high halves).
__m512 tmp11007 = _mm512_shuffle_ps(tmp10991, tmp10993, 68);
__m512 tmp11008 = _mm512_shuffle_ps(tmp10991, tmp10993, 238);
__m512 tmp11009 = _mm512_shuffle_ps(tmp10992, tmp10994, 68);
__m512 tmp11010 = _mm512_shuffle_ps(tmp10992, tmp10994, 238);
__m512 tmp11011 = _mm512_shuffle_ps(tmp10995, tmp10997, 68);
__m512 tmp11012 = _mm512_shuffle_ps(tmp10995, tmp10997, 238);
__m512 tmp11013 = _mm512_shuffle_ps(tmp10996, tmp10998, 68);
__m512 tmp11014 = _mm512_shuffle_ps(tmp10996, tmp10998, 238);
__m512 tmp11015 = _mm512_shuffle_ps(tmp10999, tmp11001, 68);
__m512 tmp11016 = _mm512_shuffle_ps(tmp10999, tmp11001, 238);
__m512 tmp11017 = _mm512_shuffle_ps(tmp11000, tmp11002, 68);
__m512 tmp11018 = _mm512_shuffle_ps(tmp11000, tmp11002, 238);
__m512 tmp11019 = _mm512_shuffle_ps(tmp11003, tmp11005, 68);
__m512 tmp11020 = _mm512_shuffle_ps(tmp11003, tmp11005, 238);
__m512 tmp11021 = _mm512_shuffle_ps(tmp11004, tmp11006, 68);
__m512 tmp11022 = _mm512_shuffle_ps(tmp11004, tmp11006, 238);
// Stage 3: regroup 128-bit lanes (136 picks lanes 0 and 2 of each operand,
// 221 picks lanes 1 and 3).
__m512 tmp11023 = _mm512_shuffle_f32x4(tmp11007, tmp11011, 136);
__m512 tmp11024 = _mm512_shuffle_f32x4(tmp11007, tmp11011, 221);
__m512 tmp11025 = _mm512_shuffle_f32x4(tmp11008, tmp11012, 136);
__m512 tmp11026 = _mm512_shuffle_f32x4(tmp11008, tmp11012, 221);
__m512 tmp11027 = _mm512_shuffle_f32x4(tmp11009, tmp11013, 136);
__m512 tmp11028 = _mm512_shuffle_f32x4(tmp11009, tmp11013, 221);
__m512 tmp11029 = _mm512_shuffle_f32x4(tmp11010, tmp11014, 136);
__m512 tmp11030 = _mm512_shuffle_f32x4(tmp11010, tmp11014, 221);
__m512 tmp11031 = _mm512_shuffle_f32x4(tmp11015, tmp11019, 136);
__m512 tmp11032 = _mm512_shuffle_f32x4(tmp11015, tmp11019, 221);
__m512 tmp11033 = _mm512_shuffle_f32x4(tmp11016, tmp11020, 136);
__m512 tmp11034 = _mm512_shuffle_f32x4(tmp11016, tmp11020, 221);
__m512 tmp11035 = _mm512_shuffle_f32x4(tmp11017, tmp11021, 136);
__m512 tmp11036 = _mm512_shuffle_f32x4(tmp11017, tmp11021, 221);
__m512 tmp11037 = _mm512_shuffle_f32x4(tmp11018, tmp11022, 136);
__m512 tmp11038 = _mm512_shuffle_f32x4(tmp11018, tmp11022, 221);
// Stage 4: final lane regroup, writing the transposed matrix back into the
// pass-1 variables. Columns 0..7 land in tmp10977, tmp10976, tmp10978, in1554,
// tmp10975, in1550, in1552, in1551; columns 8..15 land in in1556, tmp10981,
// tmp10982, in1562, tmp10980, in1558, in1560, in1559.
tmp10977 = _mm512_shuffle_f32x4(tmp11023, tmp11031, 136);
in1556 = _mm512_shuffle_f32x4(tmp11023, tmp11031, 221);
tmp10976 = _mm512_shuffle_f32x4(tmp11025, tmp11033, 136);
tmp10981 = _mm512_shuffle_f32x4(tmp11025, tmp11033, 221);
tmp10978 = _mm512_shuffle_f32x4(tmp11027, tmp11035, 136);
tmp10982 = _mm512_shuffle_f32x4(tmp11027, tmp11035, 221);
in1554 = _mm512_shuffle_f32x4(tmp11029, tmp11037, 136);
in1562 = _mm512_shuffle_f32x4(tmp11029, tmp11037, 221);
tmp10975 = _mm512_shuffle_f32x4(tmp11024, tmp11032, 136);
tmp10980 = _mm512_shuffle_f32x4(tmp11024, tmp11032, 221);
in1550 = _mm512_shuffle_f32x4(tmp11026, tmp11034, 136);
in1558 = _mm512_shuffle_f32x4(tmp11026, tmp11034, 221);
in1552 = _mm512_shuffle_f32x4(tmp11028, tmp11036, 136);
in1560 = _mm512_shuffle_f32x4(tmp11028, tmp11036, 221);
in1551 = _mm512_shuffle_f32x4(tmp11030, tmp11038, 136);
in1559 = _mm512_shuffle_f32x4(tmp11030, tmp11038, 221);
// Second transform pass: the same 8-point combination as the first pass,
// applied to the transposed vectors (so it acts along the other tile axis).
//   Set 1: c0..c7 = tmp10977, tmp10976, tmp10978, in1554, tmp10975, in1550, in1552, in1551
//   Set 2: c0..c7 = in1556, tmp10981, tmp10982, in1562, tmp10980, in1558, in1560, in1559
// Unlike the first pass, both sets have a real c0 here.
// On exit the eight results u0..u7 live in
//   set 1: tmp10977, tmp10985, tmp10986, in1552, tmp10984, tmp10978, tmp10975, in1554
//   set 2: in1556, tmp10989, tmp10990, in1560, tmp10988, tmp10982, tmp10980, in1562
__m512 tmp10983 = _mm512_add_ps(tmp10976, in1550);
__m512 tmp10987 = _mm512_add_ps(tmp10981, in1558);
__m512 tmp10984 = _mm512_sub_ps(tmp10975, tmp10978);
__m512 tmp10988 = _mm512_sub_ps(tmp10980, tmp10982);
__m512 tmp10985 = _mm512_add_ps(tmp10978, in1552);
__m512 tmp10989 = _mm512_add_ps(tmp10982, in1560);
tmp10977 = _mm512_sub_ps(tmp10977, in1552);
in1556 = _mm512_sub_ps(in1556, in1560);
tmp10983 = _mm512_fmadd_ps(in1554, _mm512_set1_ps(-4.25e+00f), tmp10983);
tmp10987 = _mm512_fmadd_ps(in1562, _mm512_set1_ps(-4.25e+00f), tmp10987);
tmp10985 = _mm512_fmadd_ps(tmp10975, _mm512_set1_ps(-4.25e+00f), tmp10985);
tmp10989 = _mm512_fmadd_ps(tmp10980, _mm512_set1_ps(-4.25e+00f), tmp10989);
// u0 = c0 - 5.25*c2 + 5.25*c4 - c6
tmp10977 = _mm512_fmadd_ps(tmp10984, _mm512_set1_ps(5.25e+00f), tmp10977);
in1556 = _mm512_fmadd_ps(tmp10988, _mm512_set1_ps(5.25e+00f), in1556);
tmp10984 = _mm512_fmadd_ps(tmp10978, _mm512_set1_ps(2.5e-01f), in1552);
tmp10988 = _mm512_fmadd_ps(tmp10982, _mm512_set1_ps(2.5e-01f), in1560);
tmp10978 = _mm512_fmadd_ps(tmp10978, _mm512_set1_ps(4e+00f), in1552);
tmp10982 = _mm512_fmadd_ps(tmp10982, _mm512_set1_ps(4e+00f), in1560);
// u2 (difference) and u1 (sum) of the even and odd partial sums.
__m512 tmp10986 = _mm512_sub_ps(tmp10985, tmp10983);
__m512 tmp10990 = _mm512_sub_ps(tmp10989, tmp10987);
tmp10985 = _mm512_add_ps(tmp10983, tmp10985);
tmp10989 = _mm512_add_ps(tmp10987, tmp10989);
tmp10983 = _mm512_fmadd_ps(tmp10976, _mm512_set1_ps(2.5e-01f), in1550);
tmp10987 = _mm512_fmadd_ps(tmp10981, _mm512_set1_ps(2.5e-01f), in1558);
tmp10984 = _mm512_fmadd_ps(tmp10975, _mm512_set1_ps(-1.25e+00f), tmp10984);
tmp10988 = _mm512_fmadd_ps(tmp10980, _mm512_set1_ps(-1.25e+00f), tmp10988);
tmp10975 = _mm512_fmadd_ps(tmp10975, _mm512_set1_ps(-5e+00f), tmp10978);
tmp10980 = _mm512_fmadd_ps(tmp10980, _mm512_set1_ps(-5e+00f), tmp10982);
tmp10983 = _mm512_fmadd_ps(in1554, _mm512_set1_ps(-1.25e+00f), tmp10983);
tmp10987 = _mm512_fmadd_ps(in1562, _mm512_set1_ps(-1.25e+00f), tmp10987);
// u3 and u4.
in1552 = _mm512_fmadd_ps(tmp10983, _mm512_set1_ps(2e+00f), tmp10984);
in1560 = _mm512_fmadd_ps(tmp10987, _mm512_set1_ps(2e+00f), tmp10988);
tmp10984 = _mm512_fnmadd_ps(tmp10983, _mm512_set1_ps(2e+00f), tmp10984);
tmp10988 = _mm512_fnmadd_ps(tmp10987, _mm512_set1_ps(2e+00f), tmp10988);
tmp10983 = _mm512_fmadd_ps(in1550, _mm512_set1_ps(2.5e-01f), tmp10976);
tmp10987 = _mm512_fmadd_ps(in1558, _mm512_set1_ps(2.5e-01f), tmp10981);
tmp10976 = _mm512_sub_ps(in1551, tmp10976);
tmp10981 = _mm512_sub_ps(in1559, tmp10981);
tmp10983 = _mm512_fmadd_ps(in1554, _mm512_set1_ps(-1.25e+00f), tmp10983);
tmp10987 = _mm512_fmadd_ps(in1562, _mm512_set1_ps(-1.25e+00f), tmp10987);
in1554 = _mm512_sub_ps(in1554, in1550);
in1562 = _mm512_sub_ps(in1562, in1558);
// u7, then u5 and u6.
in1554 = _mm512_fmadd_ps(in1554, _mm512_set1_ps(5.25e+00f), tmp10976);
in1562 = _mm512_fmadd_ps(in1562, _mm512_set1_ps(5.25e+00f), tmp10981);
tmp10978 = _mm512_fmadd_ps(tmp10983, _mm512_set1_ps(2e+00f), tmp10975);
tmp10982 = _mm512_fmadd_ps(tmp10987, _mm512_set1_ps(2e+00f), tmp10980);
tmp10975 = _mm512_fnmadd_ps(tmp10983, _mm512_set1_ps(2e+00f), tmp10975);
tmp10980 = _mm512_fnmadd_ps(tmp10987, _mm512_set1_ps(2e+00f), tmp10980);
__m512 out1423 = _mm512_shuffle_f32x4(tmp10977, tmp10985, 68);
__m512 out1431 = _mm512_shuffle_f32x4(tmp10977, tmp10985, 238);
__m512 out1424 = _mm512_shuffle_f32x4(tmp10986, in1552, 68);
__m512 out1432 = _mm512_shuffle_f32x4(tmp10986, in1552, 238);
__m512 out1425 = _mm512_shuffle_f32x4(tmp10984, tmp10978, 68);
__m512 out1433 = _mm512_shuffle_f32x4(tmp10984, tmp10978, 238);
__m512 out1426 = _mm512_shuffle_f32x4(tmp10975, in1554, 68);
__m512 out1434 = _mm512_shuffle_f32x4(tmp10975, in1554, 238);
__m512 out1427 = _mm512_shuffle_f32x4(in1556, tmp10989, 68);
__m512 out1435 = _mm512_shuffle_f32x4(in1556, tmp10989, 238);
__m512 out1428 = _mm512_shuffle_f32x4(tmp10990, in1560, 68);
__m512 out1436 = _mm512_shuffle_f32x4(tmp10990, in1560, 238);
__m512 out1429 = _mm512_shuffle_f32x4(tmp10988, tmp10982, 68);
__m512 out1437 = _mm512_shuffle_f32x4(tmp10988, tmp10982, 238);
__m512 out1430 = _mm512_shuffle_f32x4(tmp10980, in1562, 68);
__m512 out1438 = _mm512_shuffle_f32x4(tmp10980, in1562, 238);
_mm512_storeu_ps(dfPtr8+512+819200*i35+49152*j28+49152*s25+768*k105, out1423);
_mm512_storeu_ps(dfPtr8+640+819200*i35+49152*j28+49152*s25+768*k105, out1431);
_mm512_storeu_ps(dfPtr8+576+819200*i35+49152*j28+49152*s25+768*k105, out1427);
_mm512_storeu_ps(dfPtr8+704+819200*i35+49152*j28+49152*s25+768*k105, out1435);
_mm512_storeu_ps(dfPtr8+205312+819200*i35+49152*j28+49152*s25+768*k105, out1424);
_mm512_storeu_ps(dfPtr8+205440+819200*i35+49152*j28+49152*s25+768*k105, out1432);
_mm512_storeu_ps(dfPtr8+205376+819200*i35+49152*j28+49152*s25+768*k105, out1428);
_mm512_storeu_ps(dfPtr8+205504+819200*i35+49152*j28+49152*s25+768*k105, out1436);
_mm512_storeu_ps(dfPtr8+410112+819200*i35+49152*j28+49152*s25+768*k105, out1425);
_mm512_storeu_ps(dfPtr8+410240+819200*i35+49152*j28+49152*s25+768*k105, out1433);
_mm512_storeu_ps(dfPtr8+410176+819200*i35+49152*j28+49152*s25+768*k105, out1429);
_mm512_storeu_ps(dfPtr8+410304+819200*i35+49152*j28+49152*s25+768*k105, out1437);
_mm512_storeu_ps(dfPtr8+614912+819200*i35+49152*j28+49152*s25+768*k105, out1426);
_mm512_storeu_ps(dfPtr8+615040+819200*i35+49152*j28+49152*s25+768*k105, out1434);
_mm512_storeu_ps(dfPtr8+614976+819200*i35+49152*j28+49152*s25+768*k105, out1430);
_mm512_storeu_ps(dfPtr8+615104+819200*i35+49152*j28+49152*s25+768*k105, out1438);
}
// Leave the enclosing function once j28 has reached last7; otherwise move on
// to the next j28 value.
if (j28 >= last7) return;
++j28;
// NOTE(review): rel19 is declared outside this excerpt; it is set back to 1
// here just before the enclosing block closes -- confirm its meaning there.
rel19 = 1;
}
// Position used by the address expressions below: h41 = base19+6, w49 = 6.
// NOTE(review): h41/w49 only appear as 112*h41 and 4*w49 in addresses;
// presumably a row index with 112-byte rows and a column index with 4-byte
// elements -- confirm against the type of datPtr16.
ptrdiff_t h41 = base19+6;
ptrdiff_t w49 = 6;
ptrdiff_t k106 = 0;
// k106 runs 0..63; each step moves the load addresses by 6272.
for (; k106 != 64; ++k106) {
// Load stage: eight source rows (address step 112). Each row is read with two
// zero-masking loads, the second starting 48 past the first:
//   mask 16383 keeps lanes 0..13, mask 2047 keeps lanes 0..10.
// Every loaded vector is clamped from below at zero via max(0, x).
__m512 dat1743 = _mm512_maskz_loadu_ps(16383, datPtr16+0+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1743 = _mm512_max_ps(_mm512_setzero_ps(), dat1743);
__m512 dat1744 = _mm512_maskz_loadu_ps(2047, datPtr16+48+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1744 = _mm512_max_ps(_mm512_setzero_ps(), dat1744);
// pm167: lanes 0..7 <- elements 0..7, lanes 8..15 <- elements 6..13, i.e. two
// 8-wide windows that overlap by two elements.
__m512i pm167 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1564 = _mm512_permutexvar_ps(pm167, dat1743);
// pm168: lanes 0..7 <- elements 0..7, lanes 8..12 <- elements 6..10, and
// lanes 13..15 <- element 15, which the 2047 mask has already zeroed (so the
// last three lanes of the second window are zero).
__m512i pm168 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1572 = _mm512_permutexvar_ps(pm168, dat1744);
// Rows 1..7 repeat the same load / clamp / permute pattern. The first load of
// each row feeds in1565..in1571, the second feeds in1573..in1579.
__m512 dat1745 = _mm512_maskz_loadu_ps(16383, datPtr16+112+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1745 = _mm512_max_ps(_mm512_setzero_ps(), dat1745);
__m512 dat1746 = _mm512_maskz_loadu_ps(2047, datPtr16+160+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1746 = _mm512_max_ps(_mm512_setzero_ps(), dat1746);
__m512 in1565 = _mm512_permutexvar_ps(pm167, dat1745);
__m512 in1573 = _mm512_permutexvar_ps(pm168, dat1746);
__m512 dat1747 = _mm512_maskz_loadu_ps(16383, datPtr16+224+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1747 = _mm512_max_ps(_mm512_setzero_ps(), dat1747);
__m512 dat1748 = _mm512_maskz_loadu_ps(2047, datPtr16+272+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1748 = _mm512_max_ps(_mm512_setzero_ps(), dat1748);
__m512 in1566 = _mm512_permutexvar_ps(pm167, dat1747);
__m512 in1574 = _mm512_permutexvar_ps(pm168, dat1748);
__m512 dat1749 = _mm512_maskz_loadu_ps(16383, datPtr16+336+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1749 = _mm512_max_ps(_mm512_setzero_ps(), dat1749);
__m512 dat1750 = _mm512_maskz_loadu_ps(2047, datPtr16+384+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1750 = _mm512_max_ps(_mm512_setzero_ps(), dat1750);
__m512 in1567 = _mm512_permutexvar_ps(pm167, dat1749);
__m512 in1575 = _mm512_permutexvar_ps(pm168, dat1750);
__m512 dat1751 = _mm512_maskz_loadu_ps(16383, datPtr16+448+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1751 = _mm512_max_ps(_mm512_setzero_ps(), dat1751);
__m512 dat1752 = _mm512_maskz_loadu_ps(2047, datPtr16+496+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1752 = _mm512_max_ps(_mm512_setzero_ps(), dat1752);
__m512 in1568 = _mm512_permutexvar_ps(pm167, dat1751);
__m512 in1576 = _mm512_permutexvar_ps(pm168, dat1752);
__m512 dat1753 = _mm512_maskz_loadu_ps(16383, datPtr16+560+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1753 = _mm512_max_ps(_mm512_setzero_ps(), dat1753);
__m512 dat1754 = _mm512_maskz_loadu_ps(2047, datPtr16+608+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1754 = _mm512_max_ps(_mm512_setzero_ps(), dat1754);
__m512 in1569 = _mm512_permutexvar_ps(pm167, dat1753);
__m512 in1577 = _mm512_permutexvar_ps(pm168, dat1754);
__m512 dat1755 = _mm512_maskz_loadu_ps(16383, datPtr16+672+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1755 = _mm512_max_ps(_mm512_setzero_ps(), dat1755);
__m512 dat1756 = _mm512_maskz_loadu_ps(2047, datPtr16+720+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1756 = _mm512_max_ps(_mm512_setzero_ps(), dat1756);
__m512 in1570 = _mm512_permutexvar_ps(pm167, dat1755);
__m512 in1578 = _mm512_permutexvar_ps(pm168, dat1756);
__m512 dat1757 = _mm512_maskz_loadu_ps(16383, datPtr16+784+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1757 = _mm512_max_ps(_mm512_setzero_ps(), dat1757);
__m512 dat1758 = _mm512_maskz_loadu_ps(2047, datPtr16+832+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1758 = _mm512_max_ps(_mm512_setzero_ps(), dat1758);
__m512 in1571 = _mm512_permutexvar_ps(pm167, dat1757);
__m512 in1579 = _mm512_permutexvar_ps(pm168, dat1758);
// First 1-D pass of an 8-point linear transform, applied lane-wise across the
// eight row registers. Two independent register sets are handled in lockstep
// (statements alternate between them): in1564..in1571 and in1572..in1579.
//
// With d0..d7 = in1564..in1571 on entry, the values left behind are
// (ignoring FMA rounding):
//   in1564   = d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp11041 = (d2 - 4.25*d4 + d6) + (d1 - 4.25*d3 + d5)
//   tmp11042 = (d2 - 4.25*d4 + d6) - (d1 - 4.25*d3 + d5)
//   in1570   = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   tmp11040 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   in1566   = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   in1568   = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   in1567   = 5.25*(d3 - d5) + (d7 - d1)
// The second set (d0..d7 = in1572..in1579) ends up, in the same order, in
// in1572, tmp11045, tmp11046, in1578, tmp11044, in1574, in1576, in1575.
//
// NOTE(review): these coefficients look like the Winograd F(6,3) input
// transform -- confirm against the generator.
// The in*/tmp* registers are overwritten and reused as scratch, so the
// statement order below is significant.
__m512 tmp11039 = _mm512_add_ps(in1565, in1569);
__m512 tmp11043 = _mm512_add_ps(in1573, in1577);
__m512 tmp11040 = _mm512_sub_ps(in1568, in1566);
__m512 tmp11044 = _mm512_sub_ps(in1576, in1574);
__m512 tmp11041 = _mm512_add_ps(in1566, in1570);
__m512 tmp11045 = _mm512_add_ps(in1574, in1578);
in1564 = _mm512_sub_ps(in1564, in1570);
in1572 = _mm512_sub_ps(in1572, in1578);
tmp11039 = _mm512_fmadd_ps(in1567, _mm512_set1_ps(-4.25e+00f), tmp11039);
tmp11043 = _mm512_fmadd_ps(in1575, _mm512_set1_ps(-4.25e+00f), tmp11043);
tmp11041 = _mm512_fmadd_ps(in1568, _mm512_set1_ps(-4.25e+00f), tmp11041);
tmp11045 = _mm512_fmadd_ps(in1576, _mm512_set1_ps(-4.25e+00f), tmp11045);
in1564 = _mm512_fmadd_ps(tmp11040, _mm512_set1_ps(5.25e+00f), in1564);
in1572 = _mm512_fmadd_ps(tmp11044, _mm512_set1_ps(5.25e+00f), in1572);
tmp11040 = _mm512_fmadd_ps(in1566, _mm512_set1_ps(2.5e-01f), in1570);
tmp11044 = _mm512_fmadd_ps(in1574, _mm512_set1_ps(2.5e-01f), in1578);
in1566 = _mm512_fmadd_ps(in1566, _mm512_set1_ps(4e+00f), in1570);
in1574 = _mm512_fmadd_ps(in1574, _mm512_set1_ps(4e+00f), in1578);
// Difference and sum of the two partial sums built above.
__m512 tmp11042 = _mm512_sub_ps(tmp11041, tmp11039);
__m512 tmp11046 = _mm512_sub_ps(tmp11045, tmp11043);
tmp11041 = _mm512_add_ps(tmp11039, tmp11041);
tmp11045 = _mm512_add_ps(tmp11043, tmp11045);
tmp11039 = _mm512_fmadd_ps(in1565, _mm512_set1_ps(2.5e-01f), in1569);
tmp11043 = _mm512_fmadd_ps(in1573, _mm512_set1_ps(2.5e-01f), in1577);
tmp11040 = _mm512_fmadd_ps(in1568, _mm512_set1_ps(-1.25e+00f), tmp11040);
tmp11044 = _mm512_fmadd_ps(in1576, _mm512_set1_ps(-1.25e+00f), tmp11044);
in1568 = _mm512_fmadd_ps(in1568, _mm512_set1_ps(-5e+00f), in1566);
in1576 = _mm512_fmadd_ps(in1576, _mm512_set1_ps(-5e+00f), in1574);
tmp11039 = _mm512_fmadd_ps(in1567, _mm512_set1_ps(-1.25e+00f), tmp11039);
tmp11043 = _mm512_fmadd_ps(in1575, _mm512_set1_ps(-1.25e+00f), tmp11043);
// fmadd / fnmadd pair: same two terms combined with +2x and -2x.
in1570 = _mm512_fmadd_ps(tmp11039, _mm512_set1_ps(2e+00f), tmp11040);
in1578 = _mm512_fmadd_ps(tmp11043, _mm512_set1_ps(2e+00f), tmp11044);
tmp11040 = _mm512_fnmadd_ps(tmp11039, _mm512_set1_ps(2e+00f), tmp11040);
tmp11044 = _mm512_fnmadd_ps(tmp11043, _mm512_set1_ps(2e+00f), tmp11044);
tmp11039 = _mm512_fmadd_ps(in1569, _mm512_set1_ps(2.5e-01f), in1565);
tmp11043 = _mm512_fmadd_ps(in1577, _mm512_set1_ps(2.5e-01f), in1573);
in1565 = _mm512_sub_ps(in1571, in1565);
in1573 = _mm512_sub_ps(in1579, in1573);
tmp11039 = _mm512_fmadd_ps(in1567, _mm512_set1_ps(-1.25e+00f), tmp11039);
tmp11043 = _mm512_fmadd_ps(in1575, _mm512_set1_ps(-1.25e+00f), tmp11043);
in1567 = _mm512_sub_ps(in1567, in1569);
in1575 = _mm512_sub_ps(in1575, in1577);
in1567 = _mm512_fmadd_ps(in1567, _mm512_set1_ps(5.25e+00f), in1565);
in1575 = _mm512_fmadd_ps(in1575, _mm512_set1_ps(5.25e+00f), in1573);
in1566 = _mm512_fmadd_ps(tmp11039, _mm512_set1_ps(2e+00f), in1568);
in1574 = _mm512_fmadd_ps(tmp11043, _mm512_set1_ps(2e+00f), in1576);
in1568 = _mm512_fnmadd_ps(tmp11039, _mm512_set1_ps(2e+00f), in1568);
in1576 = _mm512_fnmadd_ps(tmp11043, _mm512_set1_ps(2e+00f), in1576);
// Transpose stage. Inputs, in order:
//   set A: in1564, tmp11041, tmp11042, in1570, tmp11040, in1566, in1568, in1567
//   set B: in1572, tmp11045, tmp11046, in1578, tmp11044, in1574, in1576, in1575
// After this stage each result register holds a single source lane c:
//   lanes 0..7  = the eight set-A registers at lane c,
//   lanes 8..15 = the eight set-B registers at lane c.
// The set-A names receive c = 0..7 (in the order listed above) and the set-B
// names receive c = 8..15.
//
// Step 1: unpacklo/unpackhi interleave register pairs inside each 128-bit lane.
__m512 tmp11055 = _mm512_unpacklo_ps(in1564, tmp11041);
__m512 tmp11056 = _mm512_unpackhi_ps(in1564, tmp11041);
__m512 tmp11057 = _mm512_unpacklo_ps(tmp11042, in1570);
__m512 tmp11058 = _mm512_unpackhi_ps(tmp11042, in1570);
__m512 tmp11059 = _mm512_unpacklo_ps(tmp11040, in1566);
__m512 tmp11060 = _mm512_unpackhi_ps(tmp11040, in1566);
__m512 tmp11061 = _mm512_unpacklo_ps(in1568, in1567);
__m512 tmp11062 = _mm512_unpackhi_ps(in1568, in1567);
__m512 tmp11063 = _mm512_unpacklo_ps(in1572, tmp11045);
__m512 tmp11064 = _mm512_unpackhi_ps(in1572, tmp11045);
__m512 tmp11065 = _mm512_unpacklo_ps(tmp11046, in1578);
__m512 tmp11066 = _mm512_unpackhi_ps(tmp11046, in1578);
__m512 tmp11067 = _mm512_unpacklo_ps(tmp11044, in1574);
__m512 tmp11068 = _mm512_unpackhi_ps(tmp11044, in1574);
__m512 tmp11069 = _mm512_unpacklo_ps(in1576, in1575);
__m512 tmp11070 = _mm512_unpackhi_ps(in1576, in1575);
// Step 2: shuffle_ps with 68 takes elements 0,1 of each operand and 238 takes
// elements 2,3, so every 128-bit lane now holds one source lane of four
// registers.
__m512 tmp11071 = _mm512_shuffle_ps(tmp11055, tmp11057, 68);
__m512 tmp11072 = _mm512_shuffle_ps(tmp11055, tmp11057, 238);
__m512 tmp11073 = _mm512_shuffle_ps(tmp11056, tmp11058, 68);
__m512 tmp11074 = _mm512_shuffle_ps(tmp11056, tmp11058, 238);
__m512 tmp11075 = _mm512_shuffle_ps(tmp11059, tmp11061, 68);
__m512 tmp11076 = _mm512_shuffle_ps(tmp11059, tmp11061, 238);
__m512 tmp11077 = _mm512_shuffle_ps(tmp11060, tmp11062, 68);
__m512 tmp11078 = _mm512_shuffle_ps(tmp11060, tmp11062, 238);
__m512 tmp11079 = _mm512_shuffle_ps(tmp11063, tmp11065, 68);
__m512 tmp11080 = _mm512_shuffle_ps(tmp11063, tmp11065, 238);
__m512 tmp11081 = _mm512_shuffle_ps(tmp11064, tmp11066, 68);
__m512 tmp11082 = _mm512_shuffle_ps(tmp11064, tmp11066, 238);
__m512 tmp11083 = _mm512_shuffle_ps(tmp11067, tmp11069, 68);
__m512 tmp11084 = _mm512_shuffle_ps(tmp11067, tmp11069, 238);
__m512 tmp11085 = _mm512_shuffle_ps(tmp11068, tmp11070, 68);
__m512 tmp11086 = _mm512_shuffle_ps(tmp11068, tmp11070, 238);
// Step 3: shuffle_f32x4 with 136 picks 128-bit lanes 0,2 of each operand and
// 221 picks lanes 1,3; this joins registers 0..3 with registers 4..7.
__m512 tmp11087 = _mm512_shuffle_f32x4(tmp11071, tmp11075, 136);
__m512 tmp11088 = _mm512_shuffle_f32x4(tmp11071, tmp11075, 221);
__m512 tmp11089 = _mm512_shuffle_f32x4(tmp11072, tmp11076, 136);
__m512 tmp11090 = _mm512_shuffle_f32x4(tmp11072, tmp11076, 221);
__m512 tmp11091 = _mm512_shuffle_f32x4(tmp11073, tmp11077, 136);
__m512 tmp11092 = _mm512_shuffle_f32x4(tmp11073, tmp11077, 221);
__m512 tmp11093 = _mm512_shuffle_f32x4(tmp11074, tmp11078, 136);
__m512 tmp11094 = _mm512_shuffle_f32x4(tmp11074, tmp11078, 221);
__m512 tmp11095 = _mm512_shuffle_f32x4(tmp11079, tmp11083, 136);
__m512 tmp11096 = _mm512_shuffle_f32x4(tmp11079, tmp11083, 221);
__m512 tmp11097 = _mm512_shuffle_f32x4(tmp11080, tmp11084, 136);
__m512 tmp11098 = _mm512_shuffle_f32x4(tmp11080, tmp11084, 221);
__m512 tmp11099 = _mm512_shuffle_f32x4(tmp11081, tmp11085, 136);
__m512 tmp11100 = _mm512_shuffle_f32x4(tmp11081, tmp11085, 221);
__m512 tmp11101 = _mm512_shuffle_f32x4(tmp11082, tmp11086, 136);
__m512 tmp11102 = _mm512_shuffle_f32x4(tmp11082, tmp11086, 221);
// Step 4: a second shuffle_f32x4 round joins the set-A half (low 256 bits)
// with the set-B half (high 256 bits).
in1564 = _mm512_shuffle_f32x4(tmp11087, tmp11095, 136);
in1572 = _mm512_shuffle_f32x4(tmp11087, tmp11095, 221);
tmp11041 = _mm512_shuffle_f32x4(tmp11089, tmp11097, 136);
tmp11045 = _mm512_shuffle_f32x4(tmp11089, tmp11097, 221);
tmp11042 = _mm512_shuffle_f32x4(tmp11091, tmp11099, 136);
tmp11046 = _mm512_shuffle_f32x4(tmp11091, tmp11099, 221);
in1570 = _mm512_shuffle_f32x4(tmp11093, tmp11101, 136);
in1578 = _mm512_shuffle_f32x4(tmp11093, tmp11101, 221);
tmp11040 = _mm512_shuffle_f32x4(tmp11088, tmp11096, 136);
tmp11044 = _mm512_shuffle_f32x4(tmp11088, tmp11096, 221);
in1566 = _mm512_shuffle_f32x4(tmp11090, tmp11098, 136);
in1574 = _mm512_shuffle_f32x4(tmp11090, tmp11098, 221);
in1568 = _mm512_shuffle_f32x4(tmp11092, tmp11100, 136);
in1576 = _mm512_shuffle_f32x4(tmp11092, tmp11100, 221);
in1567 = _mm512_shuffle_f32x4(tmp11094, tmp11102, 136);
in1575 = _mm512_shuffle_f32x4(tmp11094, tmp11102, 221);
// Second 1-D pass: the same 8-point transform as the first pass, now applied
// across the transposed registers. Two register sets run in lockstep.
//
// With d0..d7 = in1564, tmp11041, tmp11042, in1570, tmp11040, in1566, in1568,
// in1567 on entry, the values left behind are (ignoring FMA rounding):
//   in1564   = d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp11049 = (d2 - 4.25*d4 + d6) + (d1 - 4.25*d3 + d5)
//   tmp11050 = (d2 - 4.25*d4 + d6) - (d1 - 4.25*d3 + d5)
//   in1568   = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   tmp11048 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   tmp11042 = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   tmp11040 = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   in1570   = 5.25*(d3 - d5) + (d7 - d1)
// The second set (in1572, tmp11045, tmp11046, in1578, tmp11044, in1574,
// in1576, in1575) ends up, in the same order, in in1572, tmp11053, tmp11054,
// in1576, tmp11052, tmp11046, tmp11044, in1578.
// Registers are reused as scratch; statement order is significant.
__m512 tmp11047 = _mm512_add_ps(tmp11041, in1566);
__m512 tmp11051 = _mm512_add_ps(tmp11045, in1574);
__m512 tmp11048 = _mm512_sub_ps(tmp11040, tmp11042);
__m512 tmp11052 = _mm512_sub_ps(tmp11044, tmp11046);
__m512 tmp11049 = _mm512_add_ps(tmp11042, in1568);
__m512 tmp11053 = _mm512_add_ps(tmp11046, in1576);
in1564 = _mm512_sub_ps(in1564, in1568);
in1572 = _mm512_sub_ps(in1572, in1576);
tmp11047 = _mm512_fmadd_ps(in1570, _mm512_set1_ps(-4.25e+00f), tmp11047);
tmp11051 = _mm512_fmadd_ps(in1578, _mm512_set1_ps(-4.25e+00f), tmp11051);
tmp11049 = _mm512_fmadd_ps(tmp11040, _mm512_set1_ps(-4.25e+00f), tmp11049);
tmp11053 = _mm512_fmadd_ps(tmp11044, _mm512_set1_ps(-4.25e+00f), tmp11053);
in1564 = _mm512_fmadd_ps(tmp11048, _mm512_set1_ps(5.25e+00f), in1564);
in1572 = _mm512_fmadd_ps(tmp11052, _mm512_set1_ps(5.25e+00f), in1572);
tmp11048 = _mm512_fmadd_ps(tmp11042, _mm512_set1_ps(2.5e-01f), in1568);
tmp11052 = _mm512_fmadd_ps(tmp11046, _mm512_set1_ps(2.5e-01f), in1576);
tmp11042 = _mm512_fmadd_ps(tmp11042, _mm512_set1_ps(4e+00f), in1568);
tmp11046 = _mm512_fmadd_ps(tmp11046, _mm512_set1_ps(4e+00f), in1576);
// Difference and sum of the two partial sums built above.
__m512 tmp11050 = _mm512_sub_ps(tmp11049, tmp11047);
__m512 tmp11054 = _mm512_sub_ps(tmp11053, tmp11051);
tmp11049 = _mm512_add_ps(tmp11047, tmp11049);
tmp11053 = _mm512_add_ps(tmp11051, tmp11053);
tmp11047 = _mm512_fmadd_ps(tmp11041, _mm512_set1_ps(2.5e-01f), in1566);
tmp11051 = _mm512_fmadd_ps(tmp11045, _mm512_set1_ps(2.5e-01f), in1574);
tmp11048 = _mm512_fmadd_ps(tmp11040, _mm512_set1_ps(-1.25e+00f), tmp11048);
tmp11052 = _mm512_fmadd_ps(tmp11044, _mm512_set1_ps(-1.25e+00f), tmp11052);
tmp11040 = _mm512_fmadd_ps(tmp11040, _mm512_set1_ps(-5e+00f), tmp11042);
tmp11044 = _mm512_fmadd_ps(tmp11044, _mm512_set1_ps(-5e+00f), tmp11046);
tmp11047 = _mm512_fmadd_ps(in1570, _mm512_set1_ps(-1.25e+00f), tmp11047);
tmp11051 = _mm512_fmadd_ps(in1578, _mm512_set1_ps(-1.25e+00f), tmp11051);
// fmadd / fnmadd pair: same two terms combined with +2x and -2x.
in1568 = _mm512_fmadd_ps(tmp11047, _mm512_set1_ps(2e+00f), tmp11048);
in1576 = _mm512_fmadd_ps(tmp11051, _mm512_set1_ps(2e+00f), tmp11052);
tmp11048 = _mm512_fnmadd_ps(tmp11047, _mm512_set1_ps(2e+00f), tmp11048);
tmp11052 = _mm512_fnmadd_ps(tmp11051, _mm512_set1_ps(2e+00f), tmp11052);
tmp11047 = _mm512_fmadd_ps(in1566, _mm512_set1_ps(2.5e-01f), tmp11041);
tmp11051 = _mm512_fmadd_ps(in1574, _mm512_set1_ps(2.5e-01f), tmp11045);
tmp11041 = _mm512_sub_ps(in1567, tmp11041);
tmp11045 = _mm512_sub_ps(in1575, tmp11045);
tmp11047 = _mm512_fmadd_ps(in1570, _mm512_set1_ps(-1.25e+00f), tmp11047);
tmp11051 = _mm512_fmadd_ps(in1578, _mm512_set1_ps(-1.25e+00f), tmp11051);
in1570 = _mm512_sub_ps(in1570, in1566);
in1578 = _mm512_sub_ps(in1578, in1574);
in1570 = _mm512_fmadd_ps(in1570, _mm512_set1_ps(5.25e+00f), tmp11041);
in1578 = _mm512_fmadd_ps(in1578, _mm512_set1_ps(5.25e+00f), tmp11045);
tmp11042 = _mm512_fmadd_ps(tmp11047, _mm512_set1_ps(2e+00f), tmp11040);
tmp11046 = _mm512_fmadd_ps(tmp11051, _mm512_set1_ps(2e+00f), tmp11044);
tmp11040 = _mm512_fnmadd_ps(tmp11047, _mm512_set1_ps(2e+00f), tmp11040);
tmp11044 = _mm512_fnmadd_ps(tmp11051, _mm512_set1_ps(2e+00f), tmp11044);
// Pack stage: pair up the second-pass results two at a time.
// shuffle_f32x4 with 68 concatenates the low 256 bits of both operands;
// with 238 it concatenates their high 256 bits.
//   out1439..out1442 = low halves,  first register set
//   out1447..out1450 = high halves, first register set
//   out1443..out1446 = low halves,  second register set
//   out1451..out1454 = high halves, second register set
__m512 out1439 = _mm512_shuffle_f32x4(in1564, tmp11049, 68);
__m512 out1447 = _mm512_shuffle_f32x4(in1564, tmp11049, 238);
__m512 out1440 = _mm512_shuffle_f32x4(tmp11050, in1568, 68);
__m512 out1448 = _mm512_shuffle_f32x4(tmp11050, in1568, 238);
__m512 out1441 = _mm512_shuffle_f32x4(tmp11048, tmp11042, 68);
__m512 out1449 = _mm512_shuffle_f32x4(tmp11048, tmp11042, 238);
__m512 out1442 = _mm512_shuffle_f32x4(tmp11040, in1570, 68);
__m512 out1450 = _mm512_shuffle_f32x4(tmp11040, in1570, 238);
__m512 out1443 = _mm512_shuffle_f32x4(in1572, tmp11053, 68);
__m512 out1451 = _mm512_shuffle_f32x4(in1572, tmp11053, 238);
__m512 out1444 = _mm512_shuffle_f32x4(tmp11054, in1576, 68);
__m512 out1452 = _mm512_shuffle_f32x4(tmp11054, in1576, 238);
__m512 out1445 = _mm512_shuffle_f32x4(tmp11052, tmp11046, 68);
__m512 out1453 = _mm512_shuffle_f32x4(tmp11052, tmp11046, 238);
__m512 out1446 = _mm512_shuffle_f32x4(tmp11044, in1578, 68);
__m512 out1454 = _mm512_shuffle_f32x4(tmp11044, in1578, 238);
// Store stage: sixteen unaligned 64-byte stores into dfPtr8. The four result
// pairs go to base offsets 0, 204800, 409600 and 614400; inside each base the
// sub-offsets are +0 (first set, low), +64 (second set, low), +128 (first set,
// high) and +192 (second set, high). The address moves by 768 per k106 step.
// NOTE(review): offsets are presumably in bytes (64 = one __m512) -- confirm
// against the declared type of dfPtr8.
_mm512_storeu_ps(dfPtr8+0+819200*i35+49152*j28+49152*s25+768*k106, out1439);
_mm512_storeu_ps(dfPtr8+128+819200*i35+49152*j28+49152*s25+768*k106, out1447);
_mm512_storeu_ps(dfPtr8+64+819200*i35+49152*j28+49152*s25+768*k106, out1443);
_mm512_storeu_ps(dfPtr8+192+819200*i35+49152*j28+49152*s25+768*k106, out1451);
_mm512_storeu_ps(dfPtr8+204800+819200*i35+49152*j28+49152*s25+768*k106, out1440);
_mm512_storeu_ps(dfPtr8+204928+819200*i35+49152*j28+49152*s25+768*k106, out1448);
_mm512_storeu_ps(dfPtr8+204864+819200*i35+49152*j28+49152*s25+768*k106, out1444);
_mm512_storeu_ps(dfPtr8+204992+819200*i35+49152*j28+49152*s25+768*k106, out1452);
_mm512_storeu_ps(dfPtr8+409600+819200*i35+49152*j28+49152*s25+768*k106, out1441);
_mm512_storeu_ps(dfPtr8+409728+819200*i35+49152*j28+49152*s25+768*k106, out1449);
_mm512_storeu_ps(dfPtr8+409664+819200*i35+49152*j28+49152*s25+768*k106, out1445);
_mm512_storeu_ps(dfPtr8+409792+819200*i35+49152*j28+49152*s25+768*k106, out1453);
_mm512_storeu_ps(dfPtr8+614400+819200*i35+49152*j28+49152*s25+768*k106, out1442);
_mm512_storeu_ps(dfPtr8+614528+819200*i35+49152*j28+49152*s25+768*k106, out1450);
_mm512_storeu_ps(dfPtr8+614464+819200*i35+49152*j28+49152*s25+768*k106, out1446);
_mm512_storeu_ps(dfPtr8+614592+819200*i35+49152*j28+49152*s25+768*k106, out1454);
// Load stage for the second register pair: eight rows (address step 112), two
// zero-masking loads per row, each clamped from below at zero via max(0, x).
//   first load:  starts at offset 652,  mask 8191 keeps lanes 0..12
//   second load: starts at offset 3136, mask 16383 keeps lanes 0..13
// NOTE(review): how offsets 652 and 3136 relate to the tensor layout (next
// row group / next channel?) cannot be established from this excerpt.
__m512 dat1759 = _mm512_maskz_loadu_ps(8191, datPtr16+652+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1759 = _mm512_max_ps(_mm512_setzero_ps(), dat1759);
__m512 dat1760 = _mm512_maskz_loadu_ps(16383, datPtr16+3136+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1760 = _mm512_max_ps(_mm512_setzero_ps(), dat1760);
// pm169: lane 0 <- element 15 (zeroed by the 8191 mask), lanes 1..7 <-
// elements 0..6, lanes 8..15 <- elements 5..12. The first 8-wide window thus
// starts with a zero.
// NOTE(review): that leading zero presumably stands in for a padding element
// to the left of the loaded data -- confirm.
__m512i pm169 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1580 = _mm512_permutexvar_ps(pm169, dat1759);
// pm170: lanes 0..7 <- elements 0..7, lanes 8..15 <- elements 6..13.
__m512i pm170 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1588 = _mm512_permutexvar_ps(pm170, dat1760);
// Rows 1..7 repeat the same pattern; first loads feed in1581..in1587, second
// loads feed in1589..in1595.
__m512 dat1761 = _mm512_maskz_loadu_ps(8191, datPtr16+764+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1761 = _mm512_max_ps(_mm512_setzero_ps(), dat1761);
__m512 dat1762 = _mm512_maskz_loadu_ps(16383, datPtr16+3248+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1762 = _mm512_max_ps(_mm512_setzero_ps(), dat1762);
__m512 in1581 = _mm512_permutexvar_ps(pm169, dat1761);
__m512 in1589 = _mm512_permutexvar_ps(pm170, dat1762);
__m512 dat1763 = _mm512_maskz_loadu_ps(8191, datPtr16+876+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1763 = _mm512_max_ps(_mm512_setzero_ps(), dat1763);
__m512 dat1764 = _mm512_maskz_loadu_ps(16383, datPtr16+3360+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1764 = _mm512_max_ps(_mm512_setzero_ps(), dat1764);
__m512 in1582 = _mm512_permutexvar_ps(pm169, dat1763);
__m512 in1590 = _mm512_permutexvar_ps(pm170, dat1764);
__m512 dat1765 = _mm512_maskz_loadu_ps(8191, datPtr16+988+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1765 = _mm512_max_ps(_mm512_setzero_ps(), dat1765);
__m512 dat1766 = _mm512_maskz_loadu_ps(16383, datPtr16+3472+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1766 = _mm512_max_ps(_mm512_setzero_ps(), dat1766);
__m512 in1583 = _mm512_permutexvar_ps(pm169, dat1765);
__m512 in1591 = _mm512_permutexvar_ps(pm170, dat1766);
__m512 dat1767 = _mm512_maskz_loadu_ps(8191, datPtr16+1100+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1767 = _mm512_max_ps(_mm512_setzero_ps(), dat1767);
__m512 dat1768 = _mm512_maskz_loadu_ps(16383, datPtr16+3584+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1768 = _mm512_max_ps(_mm512_setzero_ps(), dat1768);
__m512 in1584 = _mm512_permutexvar_ps(pm169, dat1767);
__m512 in1592 = _mm512_permutexvar_ps(pm170, dat1768);
__m512 dat1769 = _mm512_maskz_loadu_ps(8191, datPtr16+1212+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1769 = _mm512_max_ps(_mm512_setzero_ps(), dat1769);
__m512 dat1770 = _mm512_maskz_loadu_ps(16383, datPtr16+3696+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1770 = _mm512_max_ps(_mm512_setzero_ps(), dat1770);
__m512 in1585 = _mm512_permutexvar_ps(pm169, dat1769);
__m512 in1593 = _mm512_permutexvar_ps(pm170, dat1770);
__m512 dat1771 = _mm512_maskz_loadu_ps(8191, datPtr16+1324+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1771 = _mm512_max_ps(_mm512_setzero_ps(), dat1771);
__m512 dat1772 = _mm512_maskz_loadu_ps(16383, datPtr16+3808+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1772 = _mm512_max_ps(_mm512_setzero_ps(), dat1772);
__m512 in1586 = _mm512_permutexvar_ps(pm169, dat1771);
__m512 in1594 = _mm512_permutexvar_ps(pm170, dat1772);
__m512 dat1773 = _mm512_maskz_loadu_ps(8191, datPtr16+1436+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1773 = _mm512_max_ps(_mm512_setzero_ps(), dat1773);
__m512 dat1774 = _mm512_maskz_loadu_ps(16383, datPtr16+3920+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1774 = _mm512_max_ps(_mm512_setzero_ps(), dat1774);
__m512 in1587 = _mm512_permutexvar_ps(pm169, dat1773);
__m512 in1595 = _mm512_permutexvar_ps(pm170, dat1774);
// First 1-D pass of an 8-point linear transform, applied lane-wise across the
// eight row registers. Two independent register sets are handled in lockstep
// (statements alternate between them): in1580..in1587 and in1588..in1595.
//
// With d0..d7 = in1580..in1587 on entry, the values left behind are
// (ignoring FMA rounding):
//   in1580   = d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp11105 = (d2 - 4.25*d4 + d6) + (d1 - 4.25*d3 + d5)
//   tmp11106 = (d2 - 4.25*d4 + d6) - (d1 - 4.25*d3 + d5)
//   in1586   = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   tmp11104 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   in1582   = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   in1584   = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   in1583   = 5.25*(d3 - d5) + (d7 - d1)
// The second set (d0..d7 = in1588..in1595) ends up, in the same order, in
// in1588, tmp11109, tmp11110, in1594, tmp11108, in1590, in1592, in1591.
//
// NOTE(review): these coefficients look like the Winograd F(6,3) input
// transform -- confirm against the generator.
// Registers are reused as scratch; statement order is significant.
__m512 tmp11103 = _mm512_add_ps(in1581, in1585);
__m512 tmp11107 = _mm512_add_ps(in1589, in1593);
__m512 tmp11104 = _mm512_sub_ps(in1584, in1582);
__m512 tmp11108 = _mm512_sub_ps(in1592, in1590);
__m512 tmp11105 = _mm512_add_ps(in1582, in1586);
__m512 tmp11109 = _mm512_add_ps(in1590, in1594);
in1580 = _mm512_sub_ps(in1580, in1586);
in1588 = _mm512_sub_ps(in1588, in1594);
tmp11103 = _mm512_fmadd_ps(in1583, _mm512_set1_ps(-4.25e+00f), tmp11103);
tmp11107 = _mm512_fmadd_ps(in1591, _mm512_set1_ps(-4.25e+00f), tmp11107);
tmp11105 = _mm512_fmadd_ps(in1584, _mm512_set1_ps(-4.25e+00f), tmp11105);
tmp11109 = _mm512_fmadd_ps(in1592, _mm512_set1_ps(-4.25e+00f), tmp11109);
in1580 = _mm512_fmadd_ps(tmp11104, _mm512_set1_ps(5.25e+00f), in1580);
in1588 = _mm512_fmadd_ps(tmp11108, _mm512_set1_ps(5.25e+00f), in1588);
tmp11104 = _mm512_fmadd_ps(in1582, _mm512_set1_ps(2.5e-01f), in1586);
tmp11108 = _mm512_fmadd_ps(in1590, _mm512_set1_ps(2.5e-01f), in1594);
in1582 = _mm512_fmadd_ps(in1582, _mm512_set1_ps(4e+00f), in1586);
in1590 = _mm512_fmadd_ps(in1590, _mm512_set1_ps(4e+00f), in1594);
// Difference and sum of the two partial sums built above.
__m512 tmp11106 = _mm512_sub_ps(tmp11105, tmp11103);
__m512 tmp11110 = _mm512_sub_ps(tmp11109, tmp11107);
tmp11105 = _mm512_add_ps(tmp11103, tmp11105);
tmp11109 = _mm512_add_ps(tmp11107, tmp11109);
tmp11103 = _mm512_fmadd_ps(in1581, _mm512_set1_ps(2.5e-01f), in1585);
tmp11107 = _mm512_fmadd_ps(in1589, _mm512_set1_ps(2.5e-01f), in1593);
tmp11104 = _mm512_fmadd_ps(in1584, _mm512_set1_ps(-1.25e+00f), tmp11104);
tmp11108 = _mm512_fmadd_ps(in1592, _mm512_set1_ps(-1.25e+00f), tmp11108);
in1584 = _mm512_fmadd_ps(in1584, _mm512_set1_ps(-5e+00f), in1582);
in1592 = _mm512_fmadd_ps(in1592, _mm512_set1_ps(-5e+00f), in1590);
tmp11103 = _mm512_fmadd_ps(in1583, _mm512_set1_ps(-1.25e+00f), tmp11103);
tmp11107 = _mm512_fmadd_ps(in1591, _mm512_set1_ps(-1.25e+00f), tmp11107);
// fmadd / fnmadd pair: same two terms combined with +2x and -2x.
in1586 = _mm512_fmadd_ps(tmp11103, _mm512_set1_ps(2e+00f), tmp11104);
in1594 = _mm512_fmadd_ps(tmp11107, _mm512_set1_ps(2e+00f), tmp11108);
tmp11104 = _mm512_fnmadd_ps(tmp11103, _mm512_set1_ps(2e+00f), tmp11104);
tmp11108 = _mm512_fnmadd_ps(tmp11107, _mm512_set1_ps(2e+00f), tmp11108);
tmp11103 = _mm512_fmadd_ps(in1585, _mm512_set1_ps(2.5e-01f), in1581);
tmp11107 = _mm512_fmadd_ps(in1593, _mm512_set1_ps(2.5e-01f), in1589);
in1581 = _mm512_sub_ps(in1587, in1581);
in1589 = _mm512_sub_ps(in1595, in1589);
tmp11103 = _mm512_fmadd_ps(in1583, _mm512_set1_ps(-1.25e+00f), tmp11103);
tmp11107 = _mm512_fmadd_ps(in1591, _mm512_set1_ps(-1.25e+00f), tmp11107);
in1583 = _mm512_sub_ps(in1583, in1585);
in1591 = _mm512_sub_ps(in1591, in1593);
in1583 = _mm512_fmadd_ps(in1583, _mm512_set1_ps(5.25e+00f), in1581);
in1591 = _mm512_fmadd_ps(in1591, _mm512_set1_ps(5.25e+00f), in1589);
in1582 = _mm512_fmadd_ps(tmp11103, _mm512_set1_ps(2e+00f), in1584);
in1590 = _mm512_fmadd_ps(tmp11107, _mm512_set1_ps(2e+00f), in1592);
in1584 = _mm512_fnmadd_ps(tmp11103, _mm512_set1_ps(2e+00f), in1584);
in1592 = _mm512_fnmadd_ps(tmp11107, _mm512_set1_ps(2e+00f), in1592);
// Transpose stage. Inputs, in order:
//   set A: in1580, tmp11105, tmp11106, in1586, tmp11104, in1582, in1584, in1583
//   set B: in1588, tmp11109, tmp11110, in1594, tmp11108, in1590, in1592, in1591
// After this stage each result register holds a single source lane c:
//   lanes 0..7  = the eight set-A registers at lane c,
//   lanes 8..15 = the eight set-B registers at lane c.
// The set-A names receive c = 0..7 (in the order listed above) and the set-B
// names receive c = 8..15.
//
// Step 1: unpacklo/unpackhi interleave register pairs inside each 128-bit lane.
__m512 tmp11119 = _mm512_unpacklo_ps(in1580, tmp11105);
__m512 tmp11120 = _mm512_unpackhi_ps(in1580, tmp11105);
__m512 tmp11121 = _mm512_unpacklo_ps(tmp11106, in1586);
__m512 tmp11122 = _mm512_unpackhi_ps(tmp11106, in1586);
__m512 tmp11123 = _mm512_unpacklo_ps(tmp11104, in1582);
__m512 tmp11124 = _mm512_unpackhi_ps(tmp11104, in1582);
__m512 tmp11125 = _mm512_unpacklo_ps(in1584, in1583);
__m512 tmp11126 = _mm512_unpackhi_ps(in1584, in1583);
__m512 tmp11127 = _mm512_unpacklo_ps(in1588, tmp11109);
__m512 tmp11128 = _mm512_unpackhi_ps(in1588, tmp11109);
__m512 tmp11129 = _mm512_unpacklo_ps(tmp11110, in1594);
__m512 tmp11130 = _mm512_unpackhi_ps(tmp11110, in1594);
__m512 tmp11131 = _mm512_unpacklo_ps(tmp11108, in1590);
__m512 tmp11132 = _mm512_unpackhi_ps(tmp11108, in1590);
__m512 tmp11133 = _mm512_unpacklo_ps(in1592, in1591);
__m512 tmp11134 = _mm512_unpackhi_ps(in1592, in1591);
// Step 2: shuffle_ps with 68 takes elements 0,1 of each operand and 238 takes
// elements 2,3, so every 128-bit lane now holds one source lane of four
// registers.
__m512 tmp11135 = _mm512_shuffle_ps(tmp11119, tmp11121, 68);
__m512 tmp11136 = _mm512_shuffle_ps(tmp11119, tmp11121, 238);
__m512 tmp11137 = _mm512_shuffle_ps(tmp11120, tmp11122, 68);
__m512 tmp11138 = _mm512_shuffle_ps(tmp11120, tmp11122, 238);
__m512 tmp11139 = _mm512_shuffle_ps(tmp11123, tmp11125, 68);
__m512 tmp11140 = _mm512_shuffle_ps(tmp11123, tmp11125, 238);
__m512 tmp11141 = _mm512_shuffle_ps(tmp11124, tmp11126, 68);
__m512 tmp11142 = _mm512_shuffle_ps(tmp11124, tmp11126, 238);
__m512 tmp11143 = _mm512_shuffle_ps(tmp11127, tmp11129, 68);
__m512 tmp11144 = _mm512_shuffle_ps(tmp11127, tmp11129, 238);
__m512 tmp11145 = _mm512_shuffle_ps(tmp11128, tmp11130, 68);
__m512 tmp11146 = _mm512_shuffle_ps(tmp11128, tmp11130, 238);
__m512 tmp11147 = _mm512_shuffle_ps(tmp11131, tmp11133, 68);
__m512 tmp11148 = _mm512_shuffle_ps(tmp11131, tmp11133, 238);
__m512 tmp11149 = _mm512_shuffle_ps(tmp11132, tmp11134, 68);
__m512 tmp11150 = _mm512_shuffle_ps(tmp11132, tmp11134, 238);
// Step 3: shuffle_f32x4 with 136 picks 128-bit lanes 0,2 of each operand and
// 221 picks lanes 1,3; this joins registers 0..3 with registers 4..7.
__m512 tmp11151 = _mm512_shuffle_f32x4(tmp11135, tmp11139, 136);
__m512 tmp11152 = _mm512_shuffle_f32x4(tmp11135, tmp11139, 221);
__m512 tmp11153 = _mm512_shuffle_f32x4(tmp11136, tmp11140, 136);
__m512 tmp11154 = _mm512_shuffle_f32x4(tmp11136, tmp11140, 221);
__m512 tmp11155 = _mm512_shuffle_f32x4(tmp11137, tmp11141, 136);
__m512 tmp11156 = _mm512_shuffle_f32x4(tmp11137, tmp11141, 221);
__m512 tmp11157 = _mm512_shuffle_f32x4(tmp11138, tmp11142, 136);
__m512 tmp11158 = _mm512_shuffle_f32x4(tmp11138, tmp11142, 221);
__m512 tmp11159 = _mm512_shuffle_f32x4(tmp11143, tmp11147, 136);
__m512 tmp11160 = _mm512_shuffle_f32x4(tmp11143, tmp11147, 221);
__m512 tmp11161 = _mm512_shuffle_f32x4(tmp11144, tmp11148, 136);
__m512 tmp11162 = _mm512_shuffle_f32x4(tmp11144, tmp11148, 221);
__m512 tmp11163 = _mm512_shuffle_f32x4(tmp11145, tmp11149, 136);
__m512 tmp11164 = _mm512_shuffle_f32x4(tmp11145, tmp11149, 221);
__m512 tmp11165 = _mm512_shuffle_f32x4(tmp11146, tmp11150, 136);
__m512 tmp11166 = _mm512_shuffle_f32x4(tmp11146, tmp11150, 221);
// Step 4: a second shuffle_f32x4 round joins the set-A half (low 256 bits)
// with the set-B half (high 256 bits).
in1580 = _mm512_shuffle_f32x4(tmp11151, tmp11159, 136);
in1588 = _mm512_shuffle_f32x4(tmp11151, tmp11159, 221);
tmp11105 = _mm512_shuffle_f32x4(tmp11153, tmp11161, 136);
tmp11109 = _mm512_shuffle_f32x4(tmp11153, tmp11161, 221);
tmp11106 = _mm512_shuffle_f32x4(tmp11155, tmp11163, 136);
tmp11110 = _mm512_shuffle_f32x4(tmp11155, tmp11163, 221);
in1586 = _mm512_shuffle_f32x4(tmp11157, tmp11165, 136);
in1594 = _mm512_shuffle_f32x4(tmp11157, tmp11165, 221);
tmp11104 = _mm512_shuffle_f32x4(tmp11152, tmp11160, 136);
tmp11108 = _mm512_shuffle_f32x4(tmp11152, tmp11160, 221);
in1582 = _mm512_shuffle_f32x4(tmp11154, tmp11162, 136);
in1590 = _mm512_shuffle_f32x4(tmp11154, tmp11162, 221);
in1584 = _mm512_shuffle_f32x4(tmp11156, tmp11164, 136);
in1592 = _mm512_shuffle_f32x4(tmp11156, tmp11164, 221);
in1583 = _mm512_shuffle_f32x4(tmp11158, tmp11166, 136);
in1591 = _mm512_shuffle_f32x4(tmp11158, tmp11166, 221);
// Second 1-D pass: the same 8-point transform as the first pass, now applied
// across the transposed registers. Two register sets run in lockstep.
//
// With d0..d7 = in1580, tmp11105, tmp11106, in1586, tmp11104, in1582, in1584,
// in1583 on entry, the values left behind are (ignoring FMA rounding):
//   in1580   = d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp11113 = (d2 - 4.25*d4 + d6) + (d1 - 4.25*d3 + d5)
//   tmp11114 = (d2 - 4.25*d4 + d6) - (d1 - 4.25*d3 + d5)
//   in1584   = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   tmp11112 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   tmp11106 = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   tmp11104 = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   in1586   = 5.25*(d3 - d5) + (d7 - d1)
// The second set (in1588, tmp11109, tmp11110, in1594, tmp11108, in1590,
// in1592, in1591) ends up, in the same order, in in1588, tmp11117, tmp11118,
// in1592, tmp11116, tmp11110, tmp11108, in1594.
// Registers are reused as scratch; statement order is significant.
__m512 tmp11111 = _mm512_add_ps(tmp11105, in1582);
__m512 tmp11115 = _mm512_add_ps(tmp11109, in1590);
__m512 tmp11112 = _mm512_sub_ps(tmp11104, tmp11106);
__m512 tmp11116 = _mm512_sub_ps(tmp11108, tmp11110);
__m512 tmp11113 = _mm512_add_ps(tmp11106, in1584);
__m512 tmp11117 = _mm512_add_ps(tmp11110, in1592);
in1580 = _mm512_sub_ps(in1580, in1584);
in1588 = _mm512_sub_ps(in1588, in1592);
tmp11111 = _mm512_fmadd_ps(in1586, _mm512_set1_ps(-4.25e+00f), tmp11111);
tmp11115 = _mm512_fmadd_ps(in1594, _mm512_set1_ps(-4.25e+00f), tmp11115);
tmp11113 = _mm512_fmadd_ps(tmp11104, _mm512_set1_ps(-4.25e+00f), tmp11113);
tmp11117 = _mm512_fmadd_ps(tmp11108, _mm512_set1_ps(-4.25e+00f), tmp11117);
in1580 = _mm512_fmadd_ps(tmp11112, _mm512_set1_ps(5.25e+00f), in1580);
in1588 = _mm512_fmadd_ps(tmp11116, _mm512_set1_ps(5.25e+00f), in1588);
tmp11112 = _mm512_fmadd_ps(tmp11106, _mm512_set1_ps(2.5e-01f), in1584);
tmp11116 = _mm512_fmadd_ps(tmp11110, _mm512_set1_ps(2.5e-01f), in1592);
tmp11106 = _mm512_fmadd_ps(tmp11106, _mm512_set1_ps(4e+00f), in1584);
tmp11110 = _mm512_fmadd_ps(tmp11110, _mm512_set1_ps(4e+00f), in1592);
// Difference and sum of the two partial sums built above.
__m512 tmp11114 = _mm512_sub_ps(tmp11113, tmp11111);
__m512 tmp11118 = _mm512_sub_ps(tmp11117, tmp11115);
tmp11113 = _mm512_add_ps(tmp11111, tmp11113);
tmp11117 = _mm512_add_ps(tmp11115, tmp11117);
tmp11111 = _mm512_fmadd_ps(tmp11105, _mm512_set1_ps(2.5e-01f), in1582);
tmp11115 = _mm512_fmadd_ps(tmp11109, _mm512_set1_ps(2.5e-01f), in1590);
tmp11112 = _mm512_fmadd_ps(tmp11104, _mm512_set1_ps(-1.25e+00f), tmp11112);
tmp11116 = _mm512_fmadd_ps(tmp11108, _mm512_set1_ps(-1.25e+00f), tmp11116);
tmp11104 = _mm512_fmadd_ps(tmp11104, _mm512_set1_ps(-5e+00f), tmp11106);
tmp11108 = _mm512_fmadd_ps(tmp11108, _mm512_set1_ps(-5e+00f), tmp11110);
tmp11111 = _mm512_fmadd_ps(in1586, _mm512_set1_ps(-1.25e+00f), tmp11111);
tmp11115 = _mm512_fmadd_ps(in1594, _mm512_set1_ps(-1.25e+00f), tmp11115);
// fmadd / fnmadd pair: same two terms combined with +2x and -2x.
in1584 = _mm512_fmadd_ps(tmp11111, _mm512_set1_ps(2e+00f), tmp11112);
in1592 = _mm512_fmadd_ps(tmp11115, _mm512_set1_ps(2e+00f), tmp11116);
tmp11112 = _mm512_fnmadd_ps(tmp11111, _mm512_set1_ps(2e+00f), tmp11112);
tmp11116 = _mm512_fnmadd_ps(tmp11115, _mm512_set1_ps(2e+00f), tmp11116);
tmp11111 = _mm512_fmadd_ps(in1582, _mm512_set1_ps(2.5e-01f), tmp11105);
tmp11115 = _mm512_fmadd_ps(in1590, _mm512_set1_ps(2.5e-01f), tmp11109);
tmp11105 = _mm512_sub_ps(in1583, tmp11105);
tmp11109 = _mm512_sub_ps(in1591, tmp11109);
tmp11111 = _mm512_fmadd_ps(in1586, _mm512_set1_ps(-1.25e+00f), tmp11111);
tmp11115 = _mm512_fmadd_ps(in1594, _mm512_set1_ps(-1.25e+00f), tmp11115);
in1586 = _mm512_sub_ps(in1586, in1582);
in1594 = _mm512_sub_ps(in1594, in1590);
in1586 = _mm512_fmadd_ps(in1586, _mm512_set1_ps(5.25e+00f), tmp11105);
in1594 = _mm512_fmadd_ps(in1594, _mm512_set1_ps(5.25e+00f), tmp11109);
tmp11106 = _mm512_fmadd_ps(tmp11111, _mm512_set1_ps(2e+00f), tmp11104);
tmp11110 = _mm512_fmadd_ps(tmp11115, _mm512_set1_ps(2e+00f), tmp11108);
tmp11104 = _mm512_fnmadd_ps(tmp11111, _mm512_set1_ps(2e+00f), tmp11104);
tmp11108 = _mm512_fnmadd_ps(tmp11115, _mm512_set1_ps(2e+00f), tmp11108);
__m512 out1455 = _mm512_shuffle_f32x4(in1580, tmp11113, 68);
__m512 out1463 = _mm512_shuffle_f32x4(in1580, tmp11113, 238);
__m512 out1456 = _mm512_shuffle_f32x4(tmp11114, in1584, 68);
__m512 out1464 = _mm512_shuffle_f32x4(tmp11114, in1584, 238);
__m512 out1457 = _mm512_shuffle_f32x4(tmp11112, tmp11106, 68);
__m512 out1465 = _mm512_shuffle_f32x4(tmp11112, tmp11106, 238);
__m512 out1458 = _mm512_shuffle_f32x4(tmp11104, in1586, 68);
__m512 out1466 = _mm512_shuffle_f32x4(tmp11104, in1586, 238);
__m512 out1459 = _mm512_shuffle_f32x4(in1588, tmp11117, 68);
__m512 out1467 = _mm512_shuffle_f32x4(in1588, tmp11117, 238);
__m512 out1460 = _mm512_shuffle_f32x4(tmp11118, in1592, 68);
__m512 out1468 = _mm512_shuffle_f32x4(tmp11118, in1592, 238);
__m512 out1461 = _mm512_shuffle_f32x4(tmp11116, tmp11110, 68);
__m512 out1469 = _mm512_shuffle_f32x4(tmp11116, tmp11110, 238);
__m512 out1462 = _mm512_shuffle_f32x4(tmp11108, in1594, 68);
__m512 out1470 = _mm512_shuffle_f32x4(tmp11108, in1594, 238);
_mm512_storeu_ps(dfPtr8+256+819200*i35+49152*j28+49152*s25+768*k106, out1455);
_mm512_storeu_ps(dfPtr8+384+819200*i35+49152*j28+49152*s25+768*k106, out1463);
_mm512_storeu_ps(dfPtr8+320+819200*i35+49152*j28+49152*s25+768*k106, out1459);
_mm512_storeu_ps(dfPtr8+448+819200*i35+49152*j28+49152*s25+768*k106, out1467);
_mm512_storeu_ps(dfPtr8+205056+819200*i35+49152*j28+49152*s25+768*k106, out1456);
_mm512_storeu_ps(dfPtr8+205184+819200*i35+49152*j28+49152*s25+768*k106, out1464);
_mm512_storeu_ps(dfPtr8+205120+819200*i35+49152*j28+49152*s25+768*k106, out1460);
_mm512_storeu_ps(dfPtr8+205248+819200*i35+49152*j28+49152*s25+768*k106, out1468);
_mm512_storeu_ps(dfPtr8+409856+819200*i35+49152*j28+49152*s25+768*k106, out1457);
_mm512_storeu_ps(dfPtr8+409984+819200*i35+49152*j28+49152*s25+768*k106, out1465);
_mm512_storeu_ps(dfPtr8+409920+819200*i35+49152*j28+49152*s25+768*k106, out1461);
_mm512_storeu_ps(dfPtr8+410048+819200*i35+49152*j28+49152*s25+768*k106, out1469);
_mm512_storeu_ps(dfPtr8+614656+819200*i35+49152*j28+49152*s25+768*k106, out1458);
_mm512_storeu_ps(dfPtr8+614784+819200*i35+49152*j28+49152*s25+768*k106, out1466);
_mm512_storeu_ps(dfPtr8+614720+819200*i35+49152*j28+49152*s25+768*k106, out1462);
_mm512_storeu_ps(dfPtr8+614848+819200*i35+49152*j28+49152*s25+768*k106, out1470);
// Tile group that fills the dfPtr8 slots starting at +512 (the group just
// before it in this loop body fills the slots starting at +256).
// Pipeline: masked loads -> clamp at zero -> lane permute -> first
// arithmetic pass -> lane regrouping -> second arithmetic pass -> 16 stores.
//
// Loads: eight pairs, each pair 112 further along in the datPtr16 offset.
// Mask 2047 keeps lanes 0..10 and mask 8191 keeps lanes 0..12; the maskz
// load zeroes the remaining lanes. _mm512_max_ps against zero then clamps
// negative values to zero.
// NOTE(review): the 112 step looks like one row of 28 floats if these
// offsets are in bytes -- confirm against the declaration of datPtr16.
__m512 dat1775 = _mm512_maskz_loadu_ps(2047, datPtr16+3184+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1775 = _mm512_max_ps(_mm512_setzero_ps(), dat1775);
__m512 dat1776 = _mm512_maskz_loadu_ps(8191, datPtr16+3788+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1776 = _mm512_max_ps(_mm512_setzero_ps(), dat1776);
// pm171: lanes 0..7 take source lanes 0..7, lanes 8..12 take source lanes
// 6..10 (two-lane overlap with the low half), and lanes 13..15 take source
// lane 15, which the 2047 mask left at zero.
__m512i pm171 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1596 = _mm512_permutexvar_ps(pm171, dat1775);
// pm172: lane 0 takes source lane 15 (zero under the 8191 mask), lanes 1..7
// take source lanes 0..6, and lanes 8..15 take source lanes 5..12.
__m512i pm172 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1604 = _mm512_permutexvar_ps(pm172, dat1776);
__m512 dat1777 = _mm512_maskz_loadu_ps(2047, datPtr16+3296+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1777 = _mm512_max_ps(_mm512_setzero_ps(), dat1777);
__m512 dat1778 = _mm512_maskz_loadu_ps(8191, datPtr16+3900+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1778 = _mm512_max_ps(_mm512_setzero_ps(), dat1778);
__m512 in1597 = _mm512_permutexvar_ps(pm171, dat1777);
__m512 in1605 = _mm512_permutexvar_ps(pm172, dat1778);
__m512 dat1779 = _mm512_maskz_loadu_ps(2047, datPtr16+3408+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1779 = _mm512_max_ps(_mm512_setzero_ps(), dat1779);
__m512 dat1780 = _mm512_maskz_loadu_ps(8191, datPtr16+4012+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1780 = _mm512_max_ps(_mm512_setzero_ps(), dat1780);
__m512 in1598 = _mm512_permutexvar_ps(pm171, dat1779);
__m512 in1606 = _mm512_permutexvar_ps(pm172, dat1780);
__m512 dat1781 = _mm512_maskz_loadu_ps(2047, datPtr16+3520+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1781 = _mm512_max_ps(_mm512_setzero_ps(), dat1781);
__m512 dat1782 = _mm512_maskz_loadu_ps(8191, datPtr16+4124+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1782 = _mm512_max_ps(_mm512_setzero_ps(), dat1782);
__m512 in1599 = _mm512_permutexvar_ps(pm171, dat1781);
__m512 in1607 = _mm512_permutexvar_ps(pm172, dat1782);
__m512 dat1783 = _mm512_maskz_loadu_ps(2047, datPtr16+3632+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1783 = _mm512_max_ps(_mm512_setzero_ps(), dat1783);
__m512 dat1784 = _mm512_maskz_loadu_ps(8191, datPtr16+4236+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1784 = _mm512_max_ps(_mm512_setzero_ps(), dat1784);
__m512 in1600 = _mm512_permutexvar_ps(pm171, dat1783);
__m512 in1608 = _mm512_permutexvar_ps(pm172, dat1784);
__m512 dat1785 = _mm512_maskz_loadu_ps(2047, datPtr16+3744+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1785 = _mm512_max_ps(_mm512_setzero_ps(), dat1785);
__m512 dat1786 = _mm512_maskz_loadu_ps(8191, datPtr16+4348+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1786 = _mm512_max_ps(_mm512_setzero_ps(), dat1786);
__m512 in1601 = _mm512_permutexvar_ps(pm171, dat1785);
__m512 in1609 = _mm512_permutexvar_ps(pm172, dat1786);
__m512 dat1787 = _mm512_maskz_loadu_ps(2047, datPtr16+3856+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1787 = _mm512_max_ps(_mm512_setzero_ps(), dat1787);
__m512 dat1788 = _mm512_maskz_loadu_ps(8191, datPtr16+4460+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1788 = _mm512_max_ps(_mm512_setzero_ps(), dat1788);
__m512 in1602 = _mm512_permutexvar_ps(pm171, dat1787);
__m512 in1610 = _mm512_permutexvar_ps(pm172, dat1788);
__m512 dat1789 = _mm512_maskz_loadu_ps(2047, datPtr16+3968+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1789 = _mm512_max_ps(_mm512_setzero_ps(), dat1789);
__m512 dat1790 = _mm512_maskz_loadu_ps(8191, datPtr16+4572+401408*i35+112*h41+4*w49+401408*s25+6272*k106);
dat1790 = _mm512_max_ps(_mm512_setzero_ps(), dat1790);
__m512 in1603 = _mm512_permutexvar_ps(pm171, dat1789);
__m512 in1611 = _mm512_permutexvar_ps(pm172, dat1790);
// First arithmetic pass: combines the eight vectors in1596..in1603 and, in
// lockstep on alternating lines, in1604..in1611, using the fixed weights
// 0.25, 1.25, 2, 4, 4.25, 5 and 5.25. Statement order matters here: several
// inputs (e.g. in1597, in1598, in1599, in1600, in1602) are overwritten in
// place once their original value has been consumed.
// NOTE(review): the weights match the Winograd F(6x6, 3x3) input transform
// -- confirm against the generator before relying on that reading.
__m512 tmp11167 = _mm512_add_ps(in1597, in1601);
__m512 tmp11171 = _mm512_add_ps(in1605, in1609);
__m512 tmp11168 = _mm512_sub_ps(in1600, in1598);
__m512 tmp11172 = _mm512_sub_ps(in1608, in1606);
__m512 tmp11169 = _mm512_add_ps(in1598, in1602);
__m512 tmp11173 = _mm512_add_ps(in1606, in1610);
in1596 = _mm512_sub_ps(in1596, in1602);
in1604 = _mm512_sub_ps(in1604, in1610);
tmp11167 = _mm512_fmadd_ps(in1599, _mm512_set1_ps(-4.25e+00f), tmp11167);
tmp11171 = _mm512_fmadd_ps(in1607, _mm512_set1_ps(-4.25e+00f), tmp11171);
tmp11169 = _mm512_fmadd_ps(in1600, _mm512_set1_ps(-4.25e+00f), tmp11169);
tmp11173 = _mm512_fmadd_ps(in1608, _mm512_set1_ps(-4.25e+00f), tmp11173);
in1596 = _mm512_fmadd_ps(tmp11168, _mm512_set1_ps(5.25e+00f), in1596);
in1604 = _mm512_fmadd_ps(tmp11172, _mm512_set1_ps(5.25e+00f), in1604);
tmp11168 = _mm512_fmadd_ps(in1598, _mm512_set1_ps(2.5e-01f), in1602);
tmp11172 = _mm512_fmadd_ps(in1606, _mm512_set1_ps(2.5e-01f), in1610);
in1598 = _mm512_fmadd_ps(in1598, _mm512_set1_ps(4e+00f), in1602);
in1606 = _mm512_fmadd_ps(in1606, _mm512_set1_ps(4e+00f), in1610);
__m512 tmp11170 = _mm512_sub_ps(tmp11169, tmp11167);
__m512 tmp11174 = _mm512_sub_ps(tmp11173, tmp11171);
tmp11169 = _mm512_add_ps(tmp11167, tmp11169);
tmp11173 = _mm512_add_ps(tmp11171, tmp11173);
tmp11167 = _mm512_fmadd_ps(in1597, _mm512_set1_ps(2.5e-01f), in1601);
tmp11171 = _mm512_fmadd_ps(in1605, _mm512_set1_ps(2.5e-01f), in1609);
tmp11168 = _mm512_fmadd_ps(in1600, _mm512_set1_ps(-1.25e+00f), tmp11168);
tmp11172 = _mm512_fmadd_ps(in1608, _mm512_set1_ps(-1.25e+00f), tmp11172);
in1600 = _mm512_fmadd_ps(in1600, _mm512_set1_ps(-5e+00f), in1598);
in1608 = _mm512_fmadd_ps(in1608, _mm512_set1_ps(-5e+00f), in1606);
tmp11167 = _mm512_fmadd_ps(in1599, _mm512_set1_ps(-1.25e+00f), tmp11167);
tmp11171 = _mm512_fmadd_ps(in1607, _mm512_set1_ps(-1.25e+00f), tmp11171);
in1602 = _mm512_fmadd_ps(tmp11167, _mm512_set1_ps(2e+00f), tmp11168);
in1610 = _mm512_fmadd_ps(tmp11171, _mm512_set1_ps(2e+00f), tmp11172);
tmp11168 = _mm512_fnmadd_ps(tmp11167, _mm512_set1_ps(2e+00f), tmp11168);
tmp11172 = _mm512_fnmadd_ps(tmp11171, _mm512_set1_ps(2e+00f), tmp11172);
tmp11167 = _mm512_fmadd_ps(in1601, _mm512_set1_ps(2.5e-01f), in1597);
tmp11171 = _mm512_fmadd_ps(in1609, _mm512_set1_ps(2.5e-01f), in1605);
in1597 = _mm512_sub_ps(in1603, in1597);
in1605 = _mm512_sub_ps(in1611, in1605);
tmp11167 = _mm512_fmadd_ps(in1599, _mm512_set1_ps(-1.25e+00f), tmp11167);
tmp11171 = _mm512_fmadd_ps(in1607, _mm512_set1_ps(-1.25e+00f), tmp11171);
in1599 = _mm512_sub_ps(in1599, in1601);
in1607 = _mm512_sub_ps(in1607, in1609);
in1599 = _mm512_fmadd_ps(in1599, _mm512_set1_ps(5.25e+00f), in1597);
in1607 = _mm512_fmadd_ps(in1607, _mm512_set1_ps(5.25e+00f), in1605);
in1598 = _mm512_fmadd_ps(tmp11167, _mm512_set1_ps(2e+00f), in1600);
in1606 = _mm512_fmadd_ps(tmp11171, _mm512_set1_ps(2e+00f), in1608);
in1600 = _mm512_fnmadd_ps(tmp11167, _mm512_set1_ps(2e+00f), in1600);
in1608 = _mm512_fnmadd_ps(tmp11171, _mm512_set1_ps(2e+00f), in1608);
// Lane regrouping of the 16 first-pass results: 32-bit unpacklo/unpackhi,
// then shuffle_ps (68 / 238) within each 128-bit lane, then two rounds of
// shuffle_f32x4 (136 picks 128-bit lanes 0 and 2 of each operand, 221 picks
// lanes 1 and 3).
// NOTE(review): this has the shape of a transpose, so that the second pass
// runs along the other axis of each tile -- confirm.
__m512 tmp11183 = _mm512_unpacklo_ps(in1596, tmp11169);
__m512 tmp11184 = _mm512_unpackhi_ps(in1596, tmp11169);
__m512 tmp11185 = _mm512_unpacklo_ps(tmp11170, in1602);
__m512 tmp11186 = _mm512_unpackhi_ps(tmp11170, in1602);
__m512 tmp11187 = _mm512_unpacklo_ps(tmp11168, in1598);
__m512 tmp11188 = _mm512_unpackhi_ps(tmp11168, in1598);
__m512 tmp11189 = _mm512_unpacklo_ps(in1600, in1599);
__m512 tmp11190 = _mm512_unpackhi_ps(in1600, in1599);
__m512 tmp11191 = _mm512_unpacklo_ps(in1604, tmp11173);
__m512 tmp11192 = _mm512_unpackhi_ps(in1604, tmp11173);
__m512 tmp11193 = _mm512_unpacklo_ps(tmp11174, in1610);
__m512 tmp11194 = _mm512_unpackhi_ps(tmp11174, in1610);
__m512 tmp11195 = _mm512_unpacklo_ps(tmp11172, in1606);
__m512 tmp11196 = _mm512_unpackhi_ps(tmp11172, in1606);
__m512 tmp11197 = _mm512_unpacklo_ps(in1608, in1607);
__m512 tmp11198 = _mm512_unpackhi_ps(in1608, in1607);
__m512 tmp11199 = _mm512_shuffle_ps(tmp11183, tmp11185, 68);
__m512 tmp11200 = _mm512_shuffle_ps(tmp11183, tmp11185, 238);
__m512 tmp11201 = _mm512_shuffle_ps(tmp11184, tmp11186, 68);
__m512 tmp11202 = _mm512_shuffle_ps(tmp11184, tmp11186, 238);
__m512 tmp11203 = _mm512_shuffle_ps(tmp11187, tmp11189, 68);
__m512 tmp11204 = _mm512_shuffle_ps(tmp11187, tmp11189, 238);
__m512 tmp11205 = _mm512_shuffle_ps(tmp11188, tmp11190, 68);
__m512 tmp11206 = _mm512_shuffle_ps(tmp11188, tmp11190, 238);
__m512 tmp11207 = _mm512_shuffle_ps(tmp11191, tmp11193, 68);
__m512 tmp11208 = _mm512_shuffle_ps(tmp11191, tmp11193, 238);
__m512 tmp11209 = _mm512_shuffle_ps(tmp11192, tmp11194, 68);
__m512 tmp11210 = _mm512_shuffle_ps(tmp11192, tmp11194, 238);
__m512 tmp11211 = _mm512_shuffle_ps(tmp11195, tmp11197, 68);
__m512 tmp11212 = _mm512_shuffle_ps(tmp11195, tmp11197, 238);
__m512 tmp11213 = _mm512_shuffle_ps(tmp11196, tmp11198, 68);
__m512 tmp11214 = _mm512_shuffle_ps(tmp11196, tmp11198, 238);
__m512 tmp11215 = _mm512_shuffle_f32x4(tmp11199, tmp11203, 136);
__m512 tmp11216 = _mm512_shuffle_f32x4(tmp11199, tmp11203, 221);
__m512 tmp11217 = _mm512_shuffle_f32x4(tmp11200, tmp11204, 136);
__m512 tmp11218 = _mm512_shuffle_f32x4(tmp11200, tmp11204, 221);
__m512 tmp11219 = _mm512_shuffle_f32x4(tmp11201, tmp11205, 136);
__m512 tmp11220 = _mm512_shuffle_f32x4(tmp11201, tmp11205, 221);
__m512 tmp11221 = _mm512_shuffle_f32x4(tmp11202, tmp11206, 136);
__m512 tmp11222 = _mm512_shuffle_f32x4(tmp11202, tmp11206, 221);
__m512 tmp11223 = _mm512_shuffle_f32x4(tmp11207, tmp11211, 136);
__m512 tmp11224 = _mm512_shuffle_f32x4(tmp11207, tmp11211, 221);
__m512 tmp11225 = _mm512_shuffle_f32x4(tmp11208, tmp11212, 136);
__m512 tmp11226 = _mm512_shuffle_f32x4(tmp11208, tmp11212, 221);
__m512 tmp11227 = _mm512_shuffle_f32x4(tmp11209, tmp11213, 136);
__m512 tmp11228 = _mm512_shuffle_f32x4(tmp11209, tmp11213, 221);
__m512 tmp11229 = _mm512_shuffle_f32x4(tmp11210, tmp11214, 136);
__m512 tmp11230 = _mm512_shuffle_f32x4(tmp11210, tmp11214, 221);
// The regrouped vectors are written back into the first-pass variable names
// so that the second pass below can reuse them as its inputs.
in1596 = _mm512_shuffle_f32x4(tmp11215, tmp11223, 136);
in1604 = _mm512_shuffle_f32x4(tmp11215, tmp11223, 221);
tmp11169 = _mm512_shuffle_f32x4(tmp11217, tmp11225, 136);
tmp11173 = _mm512_shuffle_f32x4(tmp11217, tmp11225, 221);
tmp11170 = _mm512_shuffle_f32x4(tmp11219, tmp11227, 136);
tmp11174 = _mm512_shuffle_f32x4(tmp11219, tmp11227, 221);
in1602 = _mm512_shuffle_f32x4(tmp11221, tmp11229, 136);
in1610 = _mm512_shuffle_f32x4(tmp11221, tmp11229, 221);
tmp11168 = _mm512_shuffle_f32x4(tmp11216, tmp11224, 136);
tmp11172 = _mm512_shuffle_f32x4(tmp11216, tmp11224, 221);
in1598 = _mm512_shuffle_f32x4(tmp11218, tmp11226, 136);
in1606 = _mm512_shuffle_f32x4(tmp11218, tmp11226, 221);
in1600 = _mm512_shuffle_f32x4(tmp11220, tmp11228, 136);
in1608 = _mm512_shuffle_f32x4(tmp11220, tmp11228, 221);
in1599 = _mm512_shuffle_f32x4(tmp11222, tmp11230, 136);
in1607 = _mm512_shuffle_f32x4(tmp11222, tmp11230, 221);
// Second arithmetic pass: the same weight pattern as the first pass, applied
// to the regrouped vectors. Inputs are again overwritten in place, so the
// statement order must be preserved.
__m512 tmp11175 = _mm512_add_ps(tmp11169, in1598);
__m512 tmp11179 = _mm512_add_ps(tmp11173, in1606);
__m512 tmp11176 = _mm512_sub_ps(tmp11168, tmp11170);
__m512 tmp11180 = _mm512_sub_ps(tmp11172, tmp11174);
__m512 tmp11177 = _mm512_add_ps(tmp11170, in1600);
__m512 tmp11181 = _mm512_add_ps(tmp11174, in1608);
in1596 = _mm512_sub_ps(in1596, in1600);
in1604 = _mm512_sub_ps(in1604, in1608);
tmp11175 = _mm512_fmadd_ps(in1602, _mm512_set1_ps(-4.25e+00f), tmp11175);
tmp11179 = _mm512_fmadd_ps(in1610, _mm512_set1_ps(-4.25e+00f), tmp11179);
tmp11177 = _mm512_fmadd_ps(tmp11168, _mm512_set1_ps(-4.25e+00f), tmp11177);
tmp11181 = _mm512_fmadd_ps(tmp11172, _mm512_set1_ps(-4.25e+00f), tmp11181);
in1596 = _mm512_fmadd_ps(tmp11176, _mm512_set1_ps(5.25e+00f), in1596);
in1604 = _mm512_fmadd_ps(tmp11180, _mm512_set1_ps(5.25e+00f), in1604);
tmp11176 = _mm512_fmadd_ps(tmp11170, _mm512_set1_ps(2.5e-01f), in1600);
tmp11180 = _mm512_fmadd_ps(tmp11174, _mm512_set1_ps(2.5e-01f), in1608);
tmp11170 = _mm512_fmadd_ps(tmp11170, _mm512_set1_ps(4e+00f), in1600);
tmp11174 = _mm512_fmadd_ps(tmp11174, _mm512_set1_ps(4e+00f), in1608);
__m512 tmp11178 = _mm512_sub_ps(tmp11177, tmp11175);
__m512 tmp11182 = _mm512_sub_ps(tmp11181, tmp11179);
tmp11177 = _mm512_add_ps(tmp11175, tmp11177);
tmp11181 = _mm512_add_ps(tmp11179, tmp11181);
tmp11175 = _mm512_fmadd_ps(tmp11169, _mm512_set1_ps(2.5e-01f), in1598);
tmp11179 = _mm512_fmadd_ps(tmp11173, _mm512_set1_ps(2.5e-01f), in1606);
tmp11176 = _mm512_fmadd_ps(tmp11168, _mm512_set1_ps(-1.25e+00f), tmp11176);
tmp11180 = _mm512_fmadd_ps(tmp11172, _mm512_set1_ps(-1.25e+00f), tmp11180);
tmp11168 = _mm512_fmadd_ps(tmp11168, _mm512_set1_ps(-5e+00f), tmp11170);
tmp11172 = _mm512_fmadd_ps(tmp11172, _mm512_set1_ps(-5e+00f), tmp11174);
tmp11175 = _mm512_fmadd_ps(in1602, _mm512_set1_ps(-1.25e+00f), tmp11175);
tmp11179 = _mm512_fmadd_ps(in1610, _mm512_set1_ps(-1.25e+00f), tmp11179);
in1600 = _mm512_fmadd_ps(tmp11175, _mm512_set1_ps(2e+00f), tmp11176);
in1608 = _mm512_fmadd_ps(tmp11179, _mm512_set1_ps(2e+00f), tmp11180);
tmp11176 = _mm512_fnmadd_ps(tmp11175, _mm512_set1_ps(2e+00f), tmp11176);
tmp11180 = _mm512_fnmadd_ps(tmp11179, _mm512_set1_ps(2e+00f), tmp11180);
tmp11175 = _mm512_fmadd_ps(in1598, _mm512_set1_ps(2.5e-01f), tmp11169);
tmp11179 = _mm512_fmadd_ps(in1606, _mm512_set1_ps(2.5e-01f), tmp11173);
tmp11169 = _mm512_sub_ps(in1599, tmp11169);
tmp11173 = _mm512_sub_ps(in1607, tmp11173);
tmp11175 = _mm512_fmadd_ps(in1602, _mm512_set1_ps(-1.25e+00f), tmp11175);
tmp11179 = _mm512_fmadd_ps(in1610, _mm512_set1_ps(-1.25e+00f), tmp11179);
in1602 = _mm512_sub_ps(in1602, in1598);
in1610 = _mm512_sub_ps(in1610, in1606);
in1602 = _mm512_fmadd_ps(in1602, _mm512_set1_ps(5.25e+00f), tmp11169);
in1610 = _mm512_fmadd_ps(in1610, _mm512_set1_ps(5.25e+00f), tmp11173);
tmp11170 = _mm512_fmadd_ps(tmp11175, _mm512_set1_ps(2e+00f), tmp11168);
tmp11174 = _mm512_fmadd_ps(tmp11179, _mm512_set1_ps(2e+00f), tmp11172);
tmp11168 = _mm512_fnmadd_ps(tmp11175, _mm512_set1_ps(2e+00f), tmp11168);
tmp11172 = _mm512_fnmadd_ps(tmp11179, _mm512_set1_ps(2e+00f), tmp11172);
// Output packing: each pair of second-pass results is merged into two
// vectors. Immediate 68 concatenates the low 256-bit halves of the two
// operands, and immediate 238 concatenates their high 256-bit halves.
__m512 out1471 = _mm512_shuffle_f32x4(in1596, tmp11177, 68);
__m512 out1479 = _mm512_shuffle_f32x4(in1596, tmp11177, 238);
__m512 out1472 = _mm512_shuffle_f32x4(tmp11178, in1600, 68);
__m512 out1480 = _mm512_shuffle_f32x4(tmp11178, in1600, 238);
__m512 out1473 = _mm512_shuffle_f32x4(tmp11176, tmp11170, 68);
__m512 out1481 = _mm512_shuffle_f32x4(tmp11176, tmp11170, 238);
__m512 out1474 = _mm512_shuffle_f32x4(tmp11168, in1602, 68);
__m512 out1482 = _mm512_shuffle_f32x4(tmp11168, in1602, 238);
__m512 out1475 = _mm512_shuffle_f32x4(in1604, tmp11181, 68);
__m512 out1483 = _mm512_shuffle_f32x4(in1604, tmp11181, 238);
__m512 out1476 = _mm512_shuffle_f32x4(tmp11182, in1608, 68);
__m512 out1484 = _mm512_shuffle_f32x4(tmp11182, in1608, 238);
__m512 out1477 = _mm512_shuffle_f32x4(tmp11180, tmp11174, 68);
__m512 out1485 = _mm512_shuffle_f32x4(tmp11180, tmp11174, 238);
__m512 out1478 = _mm512_shuffle_f32x4(tmp11172, in1610, 68);
__m512 out1486 = _mm512_shuffle_f32x4(tmp11172, in1610, 238);
// Stores: four regions of dfPtr8 spaced 204800 apart (base offsets 512,
// 205312, 410112 and 614912). Within each region the four vectors land at
// +0, +128, +64 and +192, in that statement order.
_mm512_storeu_ps(dfPtr8+512+819200*i35+49152*j28+49152*s25+768*k106, out1471);
_mm512_storeu_ps(dfPtr8+640+819200*i35+49152*j28+49152*s25+768*k106, out1479);
_mm512_storeu_ps(dfPtr8+576+819200*i35+49152*j28+49152*s25+768*k106, out1475);
_mm512_storeu_ps(dfPtr8+704+819200*i35+49152*j28+49152*s25+768*k106, out1483);
_mm512_storeu_ps(dfPtr8+205312+819200*i35+49152*j28+49152*s25+768*k106, out1472);
_mm512_storeu_ps(dfPtr8+205440+819200*i35+49152*j28+49152*s25+768*k106, out1480);
_mm512_storeu_ps(dfPtr8+205376+819200*i35+49152*j28+49152*s25+768*k106, out1476);
_mm512_storeu_ps(dfPtr8+205504+819200*i35+49152*j28+49152*s25+768*k106, out1484);
_mm512_storeu_ps(dfPtr8+410112+819200*i35+49152*j28+49152*s25+768*k106, out1473);
_mm512_storeu_ps(dfPtr8+410240+819200*i35+49152*j28+49152*s25+768*k106, out1481);
_mm512_storeu_ps(dfPtr8+410176+819200*i35+49152*j28+49152*s25+768*k106, out1477);
_mm512_storeu_ps(dfPtr8+410304+819200*i35+49152*j28+49152*s25+768*k106, out1485);
_mm512_storeu_ps(dfPtr8+614912+819200*i35+49152*j28+49152*s25+768*k106, out1474);
_mm512_storeu_ps(dfPtr8+615040+819200*i35+49152*j28+49152*s25+768*k106, out1482);
_mm512_storeu_ps(dfPtr8+614976+819200*i35+49152*j28+49152*s25+768*k106, out1478);
_mm512_storeu_ps(dfPtr8+615104+819200*i35+49152*j28+49152*s25+768*k106, out1486);
}
if (j28 >= last7) return;
++j28;
rel19 = 2;
}
if (rel19 < 3) {
ptrdiff_t h42 = base19+12;
ptrdiff_t w50 = 12;
ptrdiff_t k107 = 0;
for (; k107 != 64; ++k107) {
__m512 dat1791 = _mm512_maskz_loadu_ps(16383, datPtr16+0+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1791 = _mm512_max_ps(_mm512_setzero_ps(), dat1791);
__m512 dat1792 = _mm512_maskz_loadu_ps(31, datPtr16+48+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1792 = _mm512_max_ps(_mm512_setzero_ps(), dat1792);
__m512 dat1793 = _mm512_maskz_loadu_ps(127, datPtr16+628+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1793 = _mm512_max_ps(_mm512_setzero_ps(), dat1793);
__m512i pm173 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1612 = _mm512_permutexvar_ps(pm173, dat1791);
__m512i pm174 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in1620 = _mm512_permutex2var_ps(dat1792, pm174, dat1793);
__m512 dat1794 = _mm512_maskz_loadu_ps(16383, datPtr16+112+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1794 = _mm512_max_ps(_mm512_setzero_ps(), dat1794);
__m512 dat1795 = _mm512_maskz_loadu_ps(31, datPtr16+160+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1795 = _mm512_max_ps(_mm512_setzero_ps(), dat1795);
__m512 dat1796 = _mm512_maskz_loadu_ps(127, datPtr16+740+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1796 = _mm512_max_ps(_mm512_setzero_ps(), dat1796);
__m512 in1613 = _mm512_permutexvar_ps(pm173, dat1794);
__m512 in1621 = _mm512_permutex2var_ps(dat1795, pm174, dat1796);
__m512 dat1797 = _mm512_maskz_loadu_ps(16383, datPtr16+224+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1797 = _mm512_max_ps(_mm512_setzero_ps(), dat1797);
__m512 dat1798 = _mm512_maskz_loadu_ps(31, datPtr16+272+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1798 = _mm512_max_ps(_mm512_setzero_ps(), dat1798);
__m512 dat1799 = _mm512_maskz_loadu_ps(127, datPtr16+852+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1799 = _mm512_max_ps(_mm512_setzero_ps(), dat1799);
__m512 in1614 = _mm512_permutexvar_ps(pm173, dat1797);
__m512 in1622 = _mm512_permutex2var_ps(dat1798, pm174, dat1799);
__m512 dat1800 = _mm512_maskz_loadu_ps(16383, datPtr16+336+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1800 = _mm512_max_ps(_mm512_setzero_ps(), dat1800);
__m512 dat1801 = _mm512_maskz_loadu_ps(31, datPtr16+384+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1801 = _mm512_max_ps(_mm512_setzero_ps(), dat1801);
__m512 dat1802 = _mm512_maskz_loadu_ps(127, datPtr16+964+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1802 = _mm512_max_ps(_mm512_setzero_ps(), dat1802);
__m512 in1615 = _mm512_permutexvar_ps(pm173, dat1800);
__m512 in1623 = _mm512_permutex2var_ps(dat1801, pm174, dat1802);
__m512 dat1803 = _mm512_maskz_loadu_ps(16383, datPtr16+448+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1803 = _mm512_max_ps(_mm512_setzero_ps(), dat1803);
__m512 dat1804 = _mm512_maskz_loadu_ps(31, datPtr16+496+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1804 = _mm512_max_ps(_mm512_setzero_ps(), dat1804);
__m512 dat1805 = _mm512_maskz_loadu_ps(127, datPtr16+1076+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1805 = _mm512_max_ps(_mm512_setzero_ps(), dat1805);
__m512 in1616 = _mm512_permutexvar_ps(pm173, dat1803);
__m512 in1624 = _mm512_permutex2var_ps(dat1804, pm174, dat1805);
__m512 dat1806 = _mm512_maskz_loadu_ps(16383, datPtr16+560+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1806 = _mm512_max_ps(_mm512_setzero_ps(), dat1806);
__m512 dat1807 = _mm512_maskz_loadu_ps(31, datPtr16+608+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1807 = _mm512_max_ps(_mm512_setzero_ps(), dat1807);
__m512 dat1808 = _mm512_maskz_loadu_ps(127, datPtr16+1188+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1808 = _mm512_max_ps(_mm512_setzero_ps(), dat1808);
__m512 in1617 = _mm512_permutexvar_ps(pm173, dat1806);
__m512 in1625 = _mm512_permutex2var_ps(dat1807, pm174, dat1808);
__m512 dat1809 = _mm512_maskz_loadu_ps(16383, datPtr16+672+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1809 = _mm512_max_ps(_mm512_setzero_ps(), dat1809);
__m512 dat1810 = _mm512_maskz_loadu_ps(31, datPtr16+720+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1810 = _mm512_max_ps(_mm512_setzero_ps(), dat1810);
__m512 dat1811 = _mm512_maskz_loadu_ps(127, datPtr16+1300+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1811 = _mm512_max_ps(_mm512_setzero_ps(), dat1811);
__m512 in1618 = _mm512_permutexvar_ps(pm173, dat1809);
__m512 in1626 = _mm512_permutex2var_ps(dat1810, pm174, dat1811);
__m512 dat1812 = _mm512_maskz_loadu_ps(16383, datPtr16+784+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1812 = _mm512_max_ps(_mm512_setzero_ps(), dat1812);
__m512 dat1813 = _mm512_maskz_loadu_ps(31, datPtr16+832+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1813 = _mm512_max_ps(_mm512_setzero_ps(), dat1813);
__m512 dat1814 = _mm512_maskz_loadu_ps(127, datPtr16+1412+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1814 = _mm512_max_ps(_mm512_setzero_ps(), dat1814);
__m512 in1619 = _mm512_permutexvar_ps(pm173, dat1812);
__m512 in1627 = _mm512_permutex2var_ps(dat1813, pm174, dat1814);
__m512 tmp11231 = _mm512_add_ps(in1613, in1617);
__m512 tmp11235 = _mm512_add_ps(in1621, in1625);
__m512 tmp11232 = _mm512_sub_ps(in1616, in1614);
__m512 tmp11236 = _mm512_sub_ps(in1624, in1622);
__m512 tmp11233 = _mm512_add_ps(in1614, in1618);
__m512 tmp11237 = _mm512_add_ps(in1622, in1626);
in1612 = _mm512_sub_ps(in1612, in1618);
in1620 = _mm512_sub_ps(in1620, in1626);
tmp11231 = _mm512_fmadd_ps(in1615, _mm512_set1_ps(-4.25e+00f), tmp11231);
tmp11235 = _mm512_fmadd_ps(in1623, _mm512_set1_ps(-4.25e+00f), tmp11235);
tmp11233 = _mm512_fmadd_ps(in1616, _mm512_set1_ps(-4.25e+00f), tmp11233);
tmp11237 = _mm512_fmadd_ps(in1624, _mm512_set1_ps(-4.25e+00f), tmp11237);
in1612 = _mm512_fmadd_ps(tmp11232, _mm512_set1_ps(5.25e+00f), in1612);
in1620 = _mm512_fmadd_ps(tmp11236, _mm512_set1_ps(5.25e+00f), in1620);
tmp11232 = _mm512_fmadd_ps(in1614, _mm512_set1_ps(2.5e-01f), in1618);
tmp11236 = _mm512_fmadd_ps(in1622, _mm512_set1_ps(2.5e-01f), in1626);
in1614 = _mm512_fmadd_ps(in1614, _mm512_set1_ps(4e+00f), in1618);
in1622 = _mm512_fmadd_ps(in1622, _mm512_set1_ps(4e+00f), in1626);
__m512 tmp11234 = _mm512_sub_ps(tmp11233, tmp11231);
__m512 tmp11238 = _mm512_sub_ps(tmp11237, tmp11235);
tmp11233 = _mm512_add_ps(tmp11231, tmp11233);
tmp11237 = _mm512_add_ps(tmp11235, tmp11237);
tmp11231 = _mm512_fmadd_ps(in1613, _mm512_set1_ps(2.5e-01f), in1617);
tmp11235 = _mm512_fmadd_ps(in1621, _mm512_set1_ps(2.5e-01f), in1625);
tmp11232 = _mm512_fmadd_ps(in1616, _mm512_set1_ps(-1.25e+00f), tmp11232);
tmp11236 = _mm512_fmadd_ps(in1624, _mm512_set1_ps(-1.25e+00f), tmp11236);
in1616 = _mm512_fmadd_ps(in1616, _mm512_set1_ps(-5e+00f), in1614);
in1624 = _mm512_fmadd_ps(in1624, _mm512_set1_ps(-5e+00f), in1622);
tmp11231 = _mm512_fmadd_ps(in1615, _mm512_set1_ps(-1.25e+00f), tmp11231);
tmp11235 = _mm512_fmadd_ps(in1623, _mm512_set1_ps(-1.25e+00f), tmp11235);
in1618 = _mm512_fmadd_ps(tmp11231, _mm512_set1_ps(2e+00f), tmp11232);
in1626 = _mm512_fmadd_ps(tmp11235, _mm512_set1_ps(2e+00f), tmp11236);
tmp11232 = _mm512_fnmadd_ps(tmp11231, _mm512_set1_ps(2e+00f), tmp11232);
tmp11236 = _mm512_fnmadd_ps(tmp11235, _mm512_set1_ps(2e+00f), tmp11236);
tmp11231 = _mm512_fmadd_ps(in1617, _mm512_set1_ps(2.5e-01f), in1613);
tmp11235 = _mm512_fmadd_ps(in1625, _mm512_set1_ps(2.5e-01f), in1621);
in1613 = _mm512_sub_ps(in1619, in1613);
in1621 = _mm512_sub_ps(in1627, in1621);
tmp11231 = _mm512_fmadd_ps(in1615, _mm512_set1_ps(-1.25e+00f), tmp11231);
tmp11235 = _mm512_fmadd_ps(in1623, _mm512_set1_ps(-1.25e+00f), tmp11235);
in1615 = _mm512_sub_ps(in1615, in1617);
in1623 = _mm512_sub_ps(in1623, in1625);
in1615 = _mm512_fmadd_ps(in1615, _mm512_set1_ps(5.25e+00f), in1613);
in1623 = _mm512_fmadd_ps(in1623, _mm512_set1_ps(5.25e+00f), in1621);
in1614 = _mm512_fmadd_ps(tmp11231, _mm512_set1_ps(2e+00f), in1616);
in1622 = _mm512_fmadd_ps(tmp11235, _mm512_set1_ps(2e+00f), in1624);
in1616 = _mm512_fnmadd_ps(tmp11231, _mm512_set1_ps(2e+00f), in1616);
in1624 = _mm512_fnmadd_ps(tmp11235, _mm512_set1_ps(2e+00f), in1624);
__m512 tmp11247 = _mm512_unpacklo_ps(in1612, tmp11233);
__m512 tmp11248 = _mm512_unpackhi_ps(in1612, tmp11233);
__m512 tmp11249 = _mm512_unpacklo_ps(tmp11234, in1618);
__m512 tmp11250 = _mm512_unpackhi_ps(tmp11234, in1618);
__m512 tmp11251 = _mm512_unpacklo_ps(tmp11232, in1614);
__m512 tmp11252 = _mm512_unpackhi_ps(tmp11232, in1614);
__m512 tmp11253 = _mm512_unpacklo_ps(in1616, in1615);
__m512 tmp11254 = _mm512_unpackhi_ps(in1616, in1615);
__m512 tmp11255 = _mm512_unpacklo_ps(in1620, tmp11237);
__m512 tmp11256 = _mm512_unpackhi_ps(in1620, tmp11237);
__m512 tmp11257 = _mm512_unpacklo_ps(tmp11238, in1626);
__m512 tmp11258 = _mm512_unpackhi_ps(tmp11238, in1626);
__m512 tmp11259 = _mm512_unpacklo_ps(tmp11236, in1622);
__m512 tmp11260 = _mm512_unpackhi_ps(tmp11236, in1622);
__m512 tmp11261 = _mm512_unpacklo_ps(in1624, in1623);
__m512 tmp11262 = _mm512_unpackhi_ps(in1624, in1623);
__m512 tmp11263 = _mm512_shuffle_ps(tmp11247, tmp11249, 68);
__m512 tmp11264 = _mm512_shuffle_ps(tmp11247, tmp11249, 238);
__m512 tmp11265 = _mm512_shuffle_ps(tmp11248, tmp11250, 68);
__m512 tmp11266 = _mm512_shuffle_ps(tmp11248, tmp11250, 238);
__m512 tmp11267 = _mm512_shuffle_ps(tmp11251, tmp11253, 68);
__m512 tmp11268 = _mm512_shuffle_ps(tmp11251, tmp11253, 238);
__m512 tmp11269 = _mm512_shuffle_ps(tmp11252, tmp11254, 68);
__m512 tmp11270 = _mm512_shuffle_ps(tmp11252, tmp11254, 238);
__m512 tmp11271 = _mm512_shuffle_ps(tmp11255, tmp11257, 68);
__m512 tmp11272 = _mm512_shuffle_ps(tmp11255, tmp11257, 238);
__m512 tmp11273 = _mm512_shuffle_ps(tmp11256, tmp11258, 68);
__m512 tmp11274 = _mm512_shuffle_ps(tmp11256, tmp11258, 238);
__m512 tmp11275 = _mm512_shuffle_ps(tmp11259, tmp11261, 68);
__m512 tmp11276 = _mm512_shuffle_ps(tmp11259, tmp11261, 238);
__m512 tmp11277 = _mm512_shuffle_ps(tmp11260, tmp11262, 68);
__m512 tmp11278 = _mm512_shuffle_ps(tmp11260, tmp11262, 238);
__m512 tmp11279 = _mm512_shuffle_f32x4(tmp11263, tmp11267, 136);
__m512 tmp11280 = _mm512_shuffle_f32x4(tmp11263, tmp11267, 221);
__m512 tmp11281 = _mm512_shuffle_f32x4(tmp11264, tmp11268, 136);
__m512 tmp11282 = _mm512_shuffle_f32x4(tmp11264, tmp11268, 221);
__m512 tmp11283 = _mm512_shuffle_f32x4(tmp11265, tmp11269, 136);
__m512 tmp11284 = _mm512_shuffle_f32x4(tmp11265, tmp11269, 221);
__m512 tmp11285 = _mm512_shuffle_f32x4(tmp11266, tmp11270, 136);
__m512 tmp11286 = _mm512_shuffle_f32x4(tmp11266, tmp11270, 221);
__m512 tmp11287 = _mm512_shuffle_f32x4(tmp11271, tmp11275, 136);
__m512 tmp11288 = _mm512_shuffle_f32x4(tmp11271, tmp11275, 221);
__m512 tmp11289 = _mm512_shuffle_f32x4(tmp11272, tmp11276, 136);
__m512 tmp11290 = _mm512_shuffle_f32x4(tmp11272, tmp11276, 221);
__m512 tmp11291 = _mm512_shuffle_f32x4(tmp11273, tmp11277, 136);
__m512 tmp11292 = _mm512_shuffle_f32x4(tmp11273, tmp11277, 221);
__m512 tmp11293 = _mm512_shuffle_f32x4(tmp11274, tmp11278, 136);
__m512 tmp11294 = _mm512_shuffle_f32x4(tmp11274, tmp11278, 221);
in1612 = _mm512_shuffle_f32x4(tmp11279, tmp11287, 136);
in1620 = _mm512_shuffle_f32x4(tmp11279, tmp11287, 221);
tmp11233 = _mm512_shuffle_f32x4(tmp11281, tmp11289, 136);
tmp11237 = _mm512_shuffle_f32x4(tmp11281, tmp11289, 221);
tmp11234 = _mm512_shuffle_f32x4(tmp11283, tmp11291, 136);
tmp11238 = _mm512_shuffle_f32x4(tmp11283, tmp11291, 221);
in1618 = _mm512_shuffle_f32x4(tmp11285, tmp11293, 136);
in1626 = _mm512_shuffle_f32x4(tmp11285, tmp11293, 221);
tmp11232 = _mm512_shuffle_f32x4(tmp11280, tmp11288, 136);
tmp11236 = _mm512_shuffle_f32x4(tmp11280, tmp11288, 221);
in1614 = _mm512_shuffle_f32x4(tmp11282, tmp11290, 136);
in1622 = _mm512_shuffle_f32x4(tmp11282, tmp11290, 221);
in1616 = _mm512_shuffle_f32x4(tmp11284, tmp11292, 136);
in1624 = _mm512_shuffle_f32x4(tmp11284, tmp11292, 221);
in1615 = _mm512_shuffle_f32x4(tmp11286, tmp11294, 136);
in1623 = _mm512_shuffle_f32x4(tmp11286, tmp11294, 221);
// 8-point linear-combination pass over two independent groups of eight
// vectors. Statements come in pairs: the first of each pair works on group A,
// the second on group B.
//   group A inputs e0..e7 = in1612, tmp11233, tmp11234, in1618, tmp11232, in1614, in1616, in1615
//   group B inputs e0..e7 = in1620, tmp11237, tmp11238, in1626, tmp11236, in1622, in1624, in1623
// Results (group A name / group B name):
//   in1612   / in1620   = e0 - e6 + 5.25*(e4 - e2)
//   tmp11241 / tmp11245 = (e2 + e6 - 4.25*e4) + (e1 + e5 - 4.25*e3)
//   tmp11242 / tmp11246 = (e2 + e6 - 4.25*e4) - (e1 + e5 - 4.25*e3)
//   in1616   / in1624   = (0.25*e2 + e6 - 1.25*e4) + 2*(0.25*e1 + e5 - 1.25*e3)
//   tmp11240 / tmp11244 = (0.25*e2 + e6 - 1.25*e4) - 2*(0.25*e1 + e5 - 1.25*e3)
//   tmp11234 / tmp11238 = (4*e2 + e6 - 5*e4) + 2*(e1 + 0.25*e5 - 1.25*e3)
//   tmp11232 / tmp11236 = (4*e2 + e6 - 5*e4) - 2*(e1 + 0.25*e5 - 1.25*e3)
//   in1618   / in1626   = e7 - e1 + 5.25*(e3 - e5)
// Several inputs are overwritten in place while still needed by later
// statements under their old value, so the statement order is significant.
// NOTE(review): the coefficients match the 8x8 input-transform matrix of
// Winograd F(6x6, 3x3) convolution; presumably that is the intent -- confirm.
__m512 tmp11239 = _mm512_add_ps(tmp11233, in1614);
__m512 tmp11243 = _mm512_add_ps(tmp11237, in1622);
__m512 tmp11240 = _mm512_sub_ps(tmp11232, tmp11234);
__m512 tmp11244 = _mm512_sub_ps(tmp11236, tmp11238);
__m512 tmp11241 = _mm512_add_ps(tmp11234, in1616);
__m512 tmp11245 = _mm512_add_ps(tmp11238, in1624);
in1612 = _mm512_sub_ps(in1612, in1616);
in1620 = _mm512_sub_ps(in1620, in1624);
tmp11239 = _mm512_fmadd_ps(in1618, _mm512_set1_ps(-4.25e+00f), tmp11239);
tmp11243 = _mm512_fmadd_ps(in1626, _mm512_set1_ps(-4.25e+00f), tmp11243);
tmp11241 = _mm512_fmadd_ps(tmp11232, _mm512_set1_ps(-4.25e+00f), tmp11241);
tmp11245 = _mm512_fmadd_ps(tmp11236, _mm512_set1_ps(-4.25e+00f), tmp11245);
in1612 = _mm512_fmadd_ps(tmp11240, _mm512_set1_ps(5.25e+00f), in1612);
in1620 = _mm512_fmadd_ps(tmp11244, _mm512_set1_ps(5.25e+00f), in1620);
// Start the even-index terms 0.25*e2 + e6 and 4*e2 + e6 while e6 is still intact.
tmp11240 = _mm512_fmadd_ps(tmp11234, _mm512_set1_ps(2.5e-01f), in1616);
tmp11244 = _mm512_fmadd_ps(tmp11238, _mm512_set1_ps(2.5e-01f), in1624);
tmp11234 = _mm512_fmadd_ps(tmp11234, _mm512_set1_ps(4e+00f), in1616);
tmp11238 = _mm512_fmadd_ps(tmp11238, _mm512_set1_ps(4e+00f), in1624);
// Difference and sum of the two -4.25 partial sums.
__m512 tmp11242 = _mm512_sub_ps(tmp11241, tmp11239);
__m512 tmp11246 = _mm512_sub_ps(tmp11245, tmp11243);
tmp11241 = _mm512_add_ps(tmp11239, tmp11241);
tmp11245 = _mm512_add_ps(tmp11243, tmp11245);
tmp11239 = _mm512_fmadd_ps(tmp11233, _mm512_set1_ps(2.5e-01f), in1614);
tmp11243 = _mm512_fmadd_ps(tmp11237, _mm512_set1_ps(2.5e-01f), in1622);
tmp11240 = _mm512_fmadd_ps(tmp11232, _mm512_set1_ps(-1.25e+00f), tmp11240);
tmp11244 = _mm512_fmadd_ps(tmp11236, _mm512_set1_ps(-1.25e+00f), tmp11244);
tmp11232 = _mm512_fmadd_ps(tmp11232, _mm512_set1_ps(-5e+00f), tmp11234);
tmp11236 = _mm512_fmadd_ps(tmp11236, _mm512_set1_ps(-5e+00f), tmp11238);
tmp11239 = _mm512_fmadd_ps(in1618, _mm512_set1_ps(-1.25e+00f), tmp11239);
tmp11243 = _mm512_fmadd_ps(in1626, _mm512_set1_ps(-1.25e+00f), tmp11243);
// fmadd gives even + 2*odd, fnmadd gives even - 2*odd.
in1616 = _mm512_fmadd_ps(tmp11239, _mm512_set1_ps(2e+00f), tmp11240);
in1624 = _mm512_fmadd_ps(tmp11243, _mm512_set1_ps(2e+00f), tmp11244);
tmp11240 = _mm512_fnmadd_ps(tmp11239, _mm512_set1_ps(2e+00f), tmp11240);
tmp11244 = _mm512_fnmadd_ps(tmp11243, _mm512_set1_ps(2e+00f), tmp11244);
tmp11239 = _mm512_fmadd_ps(in1614, _mm512_set1_ps(2.5e-01f), tmp11233);
tmp11243 = _mm512_fmadd_ps(in1622, _mm512_set1_ps(2.5e-01f), tmp11237);
tmp11233 = _mm512_sub_ps(in1615, tmp11233);
tmp11237 = _mm512_sub_ps(in1623, tmp11237);
tmp11239 = _mm512_fmadd_ps(in1618, _mm512_set1_ps(-1.25e+00f), tmp11239);
tmp11243 = _mm512_fmadd_ps(in1626, _mm512_set1_ps(-1.25e+00f), tmp11243);
in1618 = _mm512_sub_ps(in1618, in1614);
in1626 = _mm512_sub_ps(in1626, in1622);
in1618 = _mm512_fmadd_ps(in1618, _mm512_set1_ps(5.25e+00f), tmp11233);
in1626 = _mm512_fmadd_ps(in1626, _mm512_set1_ps(5.25e+00f), tmp11237);
tmp11234 = _mm512_fmadd_ps(tmp11239, _mm512_set1_ps(2e+00f), tmp11232);
tmp11238 = _mm512_fmadd_ps(tmp11243, _mm512_set1_ps(2e+00f), tmp11236);
tmp11232 = _mm512_fnmadd_ps(tmp11239, _mm512_set1_ps(2e+00f), tmp11232);
tmp11236 = _mm512_fnmadd_ps(tmp11243, _mm512_set1_ps(2e+00f), tmp11236);
// Pack sixteen result vectors pairwise into out1487..out1502 and store them.
// _mm512_shuffle_f32x4 with imm 68 concatenates the low 256 bits of both
// operands; imm 238 concatenates their high 256 bits.
//   out1487..out1490 / out1495..out1498: low / high halves of the pairs
//     (in1612,tmp11241) (tmp11242,in1616) (tmp11240,tmp11234) (tmp11232,in1618)
//   out1491..out1494 / out1499..out1502: low / high halves of the pairs
//     (in1620,tmp11245) (tmp11246,in1624) (tmp11244,tmp11238) (tmp11236,in1626)
__m512 out1487 = _mm512_shuffle_f32x4(in1612, tmp11241, 68);
__m512 out1495 = _mm512_shuffle_f32x4(in1612, tmp11241, 238);
__m512 out1488 = _mm512_shuffle_f32x4(tmp11242, in1616, 68);
__m512 out1496 = _mm512_shuffle_f32x4(tmp11242, in1616, 238);
__m512 out1489 = _mm512_shuffle_f32x4(tmp11240, tmp11234, 68);
__m512 out1497 = _mm512_shuffle_f32x4(tmp11240, tmp11234, 238);
__m512 out1490 = _mm512_shuffle_f32x4(tmp11232, in1618, 68);
__m512 out1498 = _mm512_shuffle_f32x4(tmp11232, in1618, 238);
__m512 out1491 = _mm512_shuffle_f32x4(in1620, tmp11245, 68);
__m512 out1499 = _mm512_shuffle_f32x4(in1620, tmp11245, 238);
__m512 out1492 = _mm512_shuffle_f32x4(tmp11246, in1624, 68);
__m512 out1500 = _mm512_shuffle_f32x4(tmp11246, in1624, 238);
__m512 out1493 = _mm512_shuffle_f32x4(tmp11244, tmp11238, 68);
__m512 out1501 = _mm512_shuffle_f32x4(tmp11244, tmp11238, 238);
__m512 out1494 = _mm512_shuffle_f32x4(tmp11236, in1626, 68);
__m512 out1502 = _mm512_shuffle_f32x4(tmp11236, in1626, 238);
// Four destination regions at base offsets 0, 204800, 409600 and 614400 from
// dfPtr8, one per packed pair; inside each region the four vectors go to
// +0, +64, +128 and +192. All stores share the same i35/j28/s25/k107 terms.
// NOTE(review): offsets are presumably bytes (64 = one __m512) -- confirm the
// pointer type of dfPtr8, which is declared outside this excerpt.
_mm512_storeu_ps(dfPtr8+0+819200*i35+49152*j28+49152*s25+768*k107, out1487);
_mm512_storeu_ps(dfPtr8+128+819200*i35+49152*j28+49152*s25+768*k107, out1495);
_mm512_storeu_ps(dfPtr8+64+819200*i35+49152*j28+49152*s25+768*k107, out1491);
_mm512_storeu_ps(dfPtr8+192+819200*i35+49152*j28+49152*s25+768*k107, out1499);
_mm512_storeu_ps(dfPtr8+204800+819200*i35+49152*j28+49152*s25+768*k107, out1488);
_mm512_storeu_ps(dfPtr8+204928+819200*i35+49152*j28+49152*s25+768*k107, out1496);
_mm512_storeu_ps(dfPtr8+204864+819200*i35+49152*j28+49152*s25+768*k107, out1492);
_mm512_storeu_ps(dfPtr8+204992+819200*i35+49152*j28+49152*s25+768*k107, out1500);
_mm512_storeu_ps(dfPtr8+409600+819200*i35+49152*j28+49152*s25+768*k107, out1489);
_mm512_storeu_ps(dfPtr8+409728+819200*i35+49152*j28+49152*s25+768*k107, out1497);
_mm512_storeu_ps(dfPtr8+409664+819200*i35+49152*j28+49152*s25+768*k107, out1493);
_mm512_storeu_ps(dfPtr8+409792+819200*i35+49152*j28+49152*s25+768*k107, out1501);
_mm512_storeu_ps(dfPtr8+614400+819200*i35+49152*j28+49152*s25+768*k107, out1490);
_mm512_storeu_ps(dfPtr8+614528+819200*i35+49152*j28+49152*s25+768*k107, out1498);
_mm512_storeu_ps(dfPtr8+614464+819200*i35+49152*j28+49152*s25+768*k107, out1494);
_mm512_storeu_ps(dfPtr8+614592+819200*i35+49152*j28+49152*s25+768*k107, out1502);
// Load sixteen input vectors from datPtr16 and prepare them as in1628..in1643.
// Two series of eight loads each, both stepping by 112 per load:
//   series 1: base offsets 648, 760, ..., 1432   -> in1628..in1635
//   series 2: base offsets 3136, 3248, ..., 3920 -> in1636..in1643
// Mask 16383 (0x3FFF) loads lanes 0..13 and zeroes lanes 14..15.
// _mm512_max_ps against zero clamps negative values to zero (a ReLU).
// NOTE(review): the 4*w50 term suggests byte offsets with 4-byte floats (so
// 112 would be a 28-float row) -- confirm against the declaration of datPtr16.
__m512 dat1815 = _mm512_maskz_loadu_ps(16383, datPtr16+648+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1815 = _mm512_max_ps(_mm512_setzero_ps(), dat1815);
__m512 dat1816 = _mm512_maskz_loadu_ps(16383, datPtr16+3136+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1816 = _mm512_max_ps(_mm512_setzero_ps(), dat1816);
// pm175: output lanes 0..7 take source lanes 0..7 and output lanes 8..15 take
// source lanes 6..13, i.e. two 8-wide windows that overlap by two elements.
__m512i pm175 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1628 = _mm512_permutexvar_ps(pm175, dat1815);
__m512 in1636 = _mm512_permutexvar_ps(pm175, dat1816);
__m512 dat1817 = _mm512_maskz_loadu_ps(16383, datPtr16+760+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1817 = _mm512_max_ps(_mm512_setzero_ps(), dat1817);
__m512 dat1818 = _mm512_maskz_loadu_ps(16383, datPtr16+3248+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1818 = _mm512_max_ps(_mm512_setzero_ps(), dat1818);
__m512 in1629 = _mm512_permutexvar_ps(pm175, dat1817);
__m512 in1637 = _mm512_permutexvar_ps(pm175, dat1818);
__m512 dat1819 = _mm512_maskz_loadu_ps(16383, datPtr16+872+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1819 = _mm512_max_ps(_mm512_setzero_ps(), dat1819);
__m512 dat1820 = _mm512_maskz_loadu_ps(16383, datPtr16+3360+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1820 = _mm512_max_ps(_mm512_setzero_ps(), dat1820);
__m512 in1630 = _mm512_permutexvar_ps(pm175, dat1819);
__m512 in1638 = _mm512_permutexvar_ps(pm175, dat1820);
__m512 dat1821 = _mm512_maskz_loadu_ps(16383, datPtr16+984+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1821 = _mm512_max_ps(_mm512_setzero_ps(), dat1821);
__m512 dat1822 = _mm512_maskz_loadu_ps(16383, datPtr16+3472+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1822 = _mm512_max_ps(_mm512_setzero_ps(), dat1822);
__m512 in1631 = _mm512_permutexvar_ps(pm175, dat1821);
__m512 in1639 = _mm512_permutexvar_ps(pm175, dat1822);
__m512 dat1823 = _mm512_maskz_loadu_ps(16383, datPtr16+1096+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1823 = _mm512_max_ps(_mm512_setzero_ps(), dat1823);
__m512 dat1824 = _mm512_maskz_loadu_ps(16383, datPtr16+3584+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1824 = _mm512_max_ps(_mm512_setzero_ps(), dat1824);
__m512 in1632 = _mm512_permutexvar_ps(pm175, dat1823);
__m512 in1640 = _mm512_permutexvar_ps(pm175, dat1824);
__m512 dat1825 = _mm512_maskz_loadu_ps(16383, datPtr16+1208+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1825 = _mm512_max_ps(_mm512_setzero_ps(), dat1825);
__m512 dat1826 = _mm512_maskz_loadu_ps(16383, datPtr16+3696+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1826 = _mm512_max_ps(_mm512_setzero_ps(), dat1826);
__m512 in1633 = _mm512_permutexvar_ps(pm175, dat1825);
__m512 in1641 = _mm512_permutexvar_ps(pm175, dat1826);
__m512 dat1827 = _mm512_maskz_loadu_ps(16383, datPtr16+1320+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1827 = _mm512_max_ps(_mm512_setzero_ps(), dat1827);
__m512 dat1828 = _mm512_maskz_loadu_ps(16383, datPtr16+3808+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1828 = _mm512_max_ps(_mm512_setzero_ps(), dat1828);
__m512 in1634 = _mm512_permutexvar_ps(pm175, dat1827);
__m512 in1642 = _mm512_permutexvar_ps(pm175, dat1828);
__m512 dat1829 = _mm512_maskz_loadu_ps(16383, datPtr16+1432+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1829 = _mm512_max_ps(_mm512_setzero_ps(), dat1829);
__m512 dat1830 = _mm512_maskz_loadu_ps(16383, datPtr16+3920+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1830 = _mm512_max_ps(_mm512_setzero_ps(), dat1830);
__m512 in1635 = _mm512_permutexvar_ps(pm175, dat1829);
__m512 in1643 = _mm512_permutexvar_ps(pm175, dat1830);
// 8-point linear-combination pass over two independent groups of eight
// vectors. Statements come in pairs: the first of each pair works on group A,
// the second on group B.
//   group A inputs d0..d7 = in1628..in1635
//   group B inputs d0..d7 = in1636..in1643
// Results (group A name / group B name):
//   in1628   / in1636   = d0 - d6 + 5.25*(d4 - d2)
//   tmp11297 / tmp11301 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   tmp11298 / tmp11302 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   in1634   / in1642   = (0.25*d2 + d6 - 1.25*d4) + 2*(0.25*d1 + d5 - 1.25*d3)
//   tmp11296 / tmp11300 = (0.25*d2 + d6 - 1.25*d4) - 2*(0.25*d1 + d5 - 1.25*d3)
//   in1630   / in1638   = (4*d2 + d6 - 5*d4) + 2*(d1 + 0.25*d5 - 1.25*d3)
//   in1632   / in1640   = (4*d2 + d6 - 5*d4) - 2*(d1 + 0.25*d5 - 1.25*d3)
//   in1631   / in1639   = d7 - d1 + 5.25*(d3 - d5)
// Several inputs are overwritten in place while still needed by later
// statements under their old value, so the statement order is significant.
// NOTE(review): the coefficients match the 8x8 input-transform matrix of
// Winograd F(6x6, 3x3) convolution; presumably that is the intent -- confirm.
__m512 tmp11295 = _mm512_add_ps(in1629, in1633);
__m512 tmp11299 = _mm512_add_ps(in1637, in1641);
__m512 tmp11296 = _mm512_sub_ps(in1632, in1630);
__m512 tmp11300 = _mm512_sub_ps(in1640, in1638);
__m512 tmp11297 = _mm512_add_ps(in1630, in1634);
__m512 tmp11301 = _mm512_add_ps(in1638, in1642);
in1628 = _mm512_sub_ps(in1628, in1634);
in1636 = _mm512_sub_ps(in1636, in1642);
tmp11295 = _mm512_fmadd_ps(in1631, _mm512_set1_ps(-4.25e+00f), tmp11295);
tmp11299 = _mm512_fmadd_ps(in1639, _mm512_set1_ps(-4.25e+00f), tmp11299);
tmp11297 = _mm512_fmadd_ps(in1632, _mm512_set1_ps(-4.25e+00f), tmp11297);
tmp11301 = _mm512_fmadd_ps(in1640, _mm512_set1_ps(-4.25e+00f), tmp11301);
in1628 = _mm512_fmadd_ps(tmp11296, _mm512_set1_ps(5.25e+00f), in1628);
in1636 = _mm512_fmadd_ps(tmp11300, _mm512_set1_ps(5.25e+00f), in1636);
// Start the even-index terms 0.25*d2 + d6 and 4*d2 + d6 while d6 is still intact.
tmp11296 = _mm512_fmadd_ps(in1630, _mm512_set1_ps(2.5e-01f), in1634);
tmp11300 = _mm512_fmadd_ps(in1638, _mm512_set1_ps(2.5e-01f), in1642);
in1630 = _mm512_fmadd_ps(in1630, _mm512_set1_ps(4e+00f), in1634);
in1638 = _mm512_fmadd_ps(in1638, _mm512_set1_ps(4e+00f), in1642);
// Difference and sum of the two -4.25 partial sums.
__m512 tmp11298 = _mm512_sub_ps(tmp11297, tmp11295);
__m512 tmp11302 = _mm512_sub_ps(tmp11301, tmp11299);
tmp11297 = _mm512_add_ps(tmp11295, tmp11297);
tmp11301 = _mm512_add_ps(tmp11299, tmp11301);
tmp11295 = _mm512_fmadd_ps(in1629, _mm512_set1_ps(2.5e-01f), in1633);
tmp11299 = _mm512_fmadd_ps(in1637, _mm512_set1_ps(2.5e-01f), in1641);
tmp11296 = _mm512_fmadd_ps(in1632, _mm512_set1_ps(-1.25e+00f), tmp11296);
tmp11300 = _mm512_fmadd_ps(in1640, _mm512_set1_ps(-1.25e+00f), tmp11300);
in1632 = _mm512_fmadd_ps(in1632, _mm512_set1_ps(-5e+00f), in1630);
in1640 = _mm512_fmadd_ps(in1640, _mm512_set1_ps(-5e+00f), in1638);
tmp11295 = _mm512_fmadd_ps(in1631, _mm512_set1_ps(-1.25e+00f), tmp11295);
tmp11299 = _mm512_fmadd_ps(in1639, _mm512_set1_ps(-1.25e+00f), tmp11299);
// fmadd gives even + 2*odd, fnmadd gives even - 2*odd.
in1634 = _mm512_fmadd_ps(tmp11295, _mm512_set1_ps(2e+00f), tmp11296);
in1642 = _mm512_fmadd_ps(tmp11299, _mm512_set1_ps(2e+00f), tmp11300);
tmp11296 = _mm512_fnmadd_ps(tmp11295, _mm512_set1_ps(2e+00f), tmp11296);
tmp11300 = _mm512_fnmadd_ps(tmp11299, _mm512_set1_ps(2e+00f), tmp11300);
tmp11295 = _mm512_fmadd_ps(in1633, _mm512_set1_ps(2.5e-01f), in1629);
tmp11299 = _mm512_fmadd_ps(in1641, _mm512_set1_ps(2.5e-01f), in1637);
in1629 = _mm512_sub_ps(in1635, in1629);
in1637 = _mm512_sub_ps(in1643, in1637);
tmp11295 = _mm512_fmadd_ps(in1631, _mm512_set1_ps(-1.25e+00f), tmp11295);
tmp11299 = _mm512_fmadd_ps(in1639, _mm512_set1_ps(-1.25e+00f), tmp11299);
in1631 = _mm512_sub_ps(in1631, in1633);
in1639 = _mm512_sub_ps(in1639, in1641);
in1631 = _mm512_fmadd_ps(in1631, _mm512_set1_ps(5.25e+00f), in1629);
in1639 = _mm512_fmadd_ps(in1639, _mm512_set1_ps(5.25e+00f), in1637);
in1630 = _mm512_fmadd_ps(tmp11295, _mm512_set1_ps(2e+00f), in1632);
in1638 = _mm512_fmadd_ps(tmp11299, _mm512_set1_ps(2e+00f), in1640);
in1632 = _mm512_fnmadd_ps(tmp11295, _mm512_set1_ps(2e+00f), in1632);
in1640 = _mm512_fnmadd_ps(tmp11299, _mm512_set1_ps(2e+00f), in1640);
// Transpose sixteen vectors through a three-stage shuffle network.
// Inputs, in order:
//   group A: in1628, tmp11297, tmp11298, in1634, tmp11296, in1630, in1632, in1631
//   group B: in1636, tmp11301, tmp11302, in1642, tmp11300, in1638, in1640, in1639
// Outputs are written back into the same sixteen names. Afterwards the group-A
// names, in the order listed, hold source lanes 0..7 (one source lane per
// vector) and the group-B names hold source lanes 8..15. In every output
// vector, lanes 0..7 come from the eight group-A inputs (in input order) and
// lanes 8..15 from the eight group-B inputs.
// Stage 1: interleave 32-bit elements of adjacent input pairs within each
// 128-bit lane.
__m512 tmp11311 = _mm512_unpacklo_ps(in1628, tmp11297);
__m512 tmp11312 = _mm512_unpackhi_ps(in1628, tmp11297);
__m512 tmp11313 = _mm512_unpacklo_ps(tmp11298, in1634);
__m512 tmp11314 = _mm512_unpackhi_ps(tmp11298, in1634);
__m512 tmp11315 = _mm512_unpacklo_ps(tmp11296, in1630);
__m512 tmp11316 = _mm512_unpackhi_ps(tmp11296, in1630);
__m512 tmp11317 = _mm512_unpacklo_ps(in1632, in1631);
__m512 tmp11318 = _mm512_unpackhi_ps(in1632, in1631);
__m512 tmp11319 = _mm512_unpacklo_ps(in1636, tmp11301);
__m512 tmp11320 = _mm512_unpackhi_ps(in1636, tmp11301);
__m512 tmp11321 = _mm512_unpacklo_ps(tmp11302, in1642);
__m512 tmp11322 = _mm512_unpackhi_ps(tmp11302, in1642);
__m512 tmp11323 = _mm512_unpacklo_ps(tmp11300, in1638);
__m512 tmp11324 = _mm512_unpackhi_ps(tmp11300, in1638);
__m512 tmp11325 = _mm512_unpacklo_ps(in1640, in1639);
__m512 tmp11326 = _mm512_unpackhi_ps(in1640, in1639);
// Stage 2: imm 68 joins the low element pairs of both operands, imm 238 the
// high pairs, so each 128-bit lane holds one source lane from four inputs.
__m512 tmp11327 = _mm512_shuffle_ps(tmp11311, tmp11313, 68);
__m512 tmp11328 = _mm512_shuffle_ps(tmp11311, tmp11313, 238);
__m512 tmp11329 = _mm512_shuffle_ps(tmp11312, tmp11314, 68);
__m512 tmp11330 = _mm512_shuffle_ps(tmp11312, tmp11314, 238);
__m512 tmp11331 = _mm512_shuffle_ps(tmp11315, tmp11317, 68);
__m512 tmp11332 = _mm512_shuffle_ps(tmp11315, tmp11317, 238);
__m512 tmp11333 = _mm512_shuffle_ps(tmp11316, tmp11318, 68);
__m512 tmp11334 = _mm512_shuffle_ps(tmp11316, tmp11318, 238);
__m512 tmp11335 = _mm512_shuffle_ps(tmp11319, tmp11321, 68);
__m512 tmp11336 = _mm512_shuffle_ps(tmp11319, tmp11321, 238);
__m512 tmp11337 = _mm512_shuffle_ps(tmp11320, tmp11322, 68);
__m512 tmp11338 = _mm512_shuffle_ps(tmp11320, tmp11322, 238);
__m512 tmp11339 = _mm512_shuffle_ps(tmp11323, tmp11325, 68);
__m512 tmp11340 = _mm512_shuffle_ps(tmp11323, tmp11325, 238);
__m512 tmp11341 = _mm512_shuffle_ps(tmp11324, tmp11326, 68);
__m512 tmp11342 = _mm512_shuffle_ps(tmp11324, tmp11326, 238);
// Stage 3a: imm 136 gathers 128-bit lanes 0 and 2 of each operand, imm 221
// gathers lanes 1 and 3; this joins the first four and last four inputs of a group.
__m512 tmp11343 = _mm512_shuffle_f32x4(tmp11327, tmp11331, 136);
__m512 tmp11344 = _mm512_shuffle_f32x4(tmp11327, tmp11331, 221);
__m512 tmp11345 = _mm512_shuffle_f32x4(tmp11328, tmp11332, 136);
__m512 tmp11346 = _mm512_shuffle_f32x4(tmp11328, tmp11332, 221);
__m512 tmp11347 = _mm512_shuffle_f32x4(tmp11329, tmp11333, 136);
__m512 tmp11348 = _mm512_shuffle_f32x4(tmp11329, tmp11333, 221);
__m512 tmp11349 = _mm512_shuffle_f32x4(tmp11330, tmp11334, 136);
__m512 tmp11350 = _mm512_shuffle_f32x4(tmp11330, tmp11334, 221);
__m512 tmp11351 = _mm512_shuffle_f32x4(tmp11335, tmp11339, 136);
__m512 tmp11352 = _mm512_shuffle_f32x4(tmp11335, tmp11339, 221);
__m512 tmp11353 = _mm512_shuffle_f32x4(tmp11336, tmp11340, 136);
__m512 tmp11354 = _mm512_shuffle_f32x4(tmp11336, tmp11340, 221);
__m512 tmp11355 = _mm512_shuffle_f32x4(tmp11337, tmp11341, 136);
__m512 tmp11356 = _mm512_shuffle_f32x4(tmp11337, tmp11341, 221);
__m512 tmp11357 = _mm512_shuffle_f32x4(tmp11338, tmp11342, 136);
__m512 tmp11358 = _mm512_shuffle_f32x4(tmp11338, tmp11342, 221);
// Stage 3b: same immediates again, now joining group A (low 256 bits of the
// result) with group B (high 256 bits).
in1628 = _mm512_shuffle_f32x4(tmp11343, tmp11351, 136);
in1636 = _mm512_shuffle_f32x4(tmp11343, tmp11351, 221);
tmp11297 = _mm512_shuffle_f32x4(tmp11345, tmp11353, 136);
tmp11301 = _mm512_shuffle_f32x4(tmp11345, tmp11353, 221);
tmp11298 = _mm512_shuffle_f32x4(tmp11347, tmp11355, 136);
tmp11302 = _mm512_shuffle_f32x4(tmp11347, tmp11355, 221);
in1634 = _mm512_shuffle_f32x4(tmp11349, tmp11357, 136);
in1642 = _mm512_shuffle_f32x4(tmp11349, tmp11357, 221);
tmp11296 = _mm512_shuffle_f32x4(tmp11344, tmp11352, 136);
tmp11300 = _mm512_shuffle_f32x4(tmp11344, tmp11352, 221);
in1630 = _mm512_shuffle_f32x4(tmp11346, tmp11354, 136);
in1638 = _mm512_shuffle_f32x4(tmp11346, tmp11354, 221);
in1632 = _mm512_shuffle_f32x4(tmp11348, tmp11356, 136);
in1640 = _mm512_shuffle_f32x4(tmp11348, tmp11356, 221);
in1631 = _mm512_shuffle_f32x4(tmp11350, tmp11358, 136);
in1639 = _mm512_shuffle_f32x4(tmp11350, tmp11358, 221);
// 8-point linear-combination pass over two independent groups of eight
// vectors. Statements come in pairs: the first of each pair works on group A,
// the second on group B.
//   group A inputs e0..e7 = in1628, tmp11297, tmp11298, in1634, tmp11296, in1630, in1632, in1631
//   group B inputs e0..e7 = in1636, tmp11301, tmp11302, in1642, tmp11300, in1638, in1640, in1639
// Results (group A name / group B name):
//   in1628   / in1636   = e0 - e6 + 5.25*(e4 - e2)
//   tmp11305 / tmp11309 = (e2 + e6 - 4.25*e4) + (e1 + e5 - 4.25*e3)
//   tmp11306 / tmp11310 = (e2 + e6 - 4.25*e4) - (e1 + e5 - 4.25*e3)
//   in1632   / in1640   = (0.25*e2 + e6 - 1.25*e4) + 2*(0.25*e1 + e5 - 1.25*e3)
//   tmp11304 / tmp11308 = (0.25*e2 + e6 - 1.25*e4) - 2*(0.25*e1 + e5 - 1.25*e3)
//   tmp11298 / tmp11302 = (4*e2 + e6 - 5*e4) + 2*(e1 + 0.25*e5 - 1.25*e3)
//   tmp11296 / tmp11300 = (4*e2 + e6 - 5*e4) - 2*(e1 + 0.25*e5 - 1.25*e3)
//   in1634   / in1642   = e7 - e1 + 5.25*(e3 - e5)
// Several inputs are overwritten in place while still needed by later
// statements under their old value, so the statement order is significant.
// NOTE(review): the coefficients match the 8x8 input-transform matrix of
// Winograd F(6x6, 3x3) convolution; presumably that is the intent -- confirm.
__m512 tmp11303 = _mm512_add_ps(tmp11297, in1630);
__m512 tmp11307 = _mm512_add_ps(tmp11301, in1638);
__m512 tmp11304 = _mm512_sub_ps(tmp11296, tmp11298);
__m512 tmp11308 = _mm512_sub_ps(tmp11300, tmp11302);
__m512 tmp11305 = _mm512_add_ps(tmp11298, in1632);
__m512 tmp11309 = _mm512_add_ps(tmp11302, in1640);
in1628 = _mm512_sub_ps(in1628, in1632);
in1636 = _mm512_sub_ps(in1636, in1640);
tmp11303 = _mm512_fmadd_ps(in1634, _mm512_set1_ps(-4.25e+00f), tmp11303);
tmp11307 = _mm512_fmadd_ps(in1642, _mm512_set1_ps(-4.25e+00f), tmp11307);
tmp11305 = _mm512_fmadd_ps(tmp11296, _mm512_set1_ps(-4.25e+00f), tmp11305);
tmp11309 = _mm512_fmadd_ps(tmp11300, _mm512_set1_ps(-4.25e+00f), tmp11309);
in1628 = _mm512_fmadd_ps(tmp11304, _mm512_set1_ps(5.25e+00f), in1628);
in1636 = _mm512_fmadd_ps(tmp11308, _mm512_set1_ps(5.25e+00f), in1636);
// Start the even-index terms 0.25*e2 + e6 and 4*e2 + e6 while e6 is still intact.
tmp11304 = _mm512_fmadd_ps(tmp11298, _mm512_set1_ps(2.5e-01f), in1632);
tmp11308 = _mm512_fmadd_ps(tmp11302, _mm512_set1_ps(2.5e-01f), in1640);
tmp11298 = _mm512_fmadd_ps(tmp11298, _mm512_set1_ps(4e+00f), in1632);
tmp11302 = _mm512_fmadd_ps(tmp11302, _mm512_set1_ps(4e+00f), in1640);
// Difference and sum of the two -4.25 partial sums.
__m512 tmp11306 = _mm512_sub_ps(tmp11305, tmp11303);
__m512 tmp11310 = _mm512_sub_ps(tmp11309, tmp11307);
tmp11305 = _mm512_add_ps(tmp11303, tmp11305);
tmp11309 = _mm512_add_ps(tmp11307, tmp11309);
tmp11303 = _mm512_fmadd_ps(tmp11297, _mm512_set1_ps(2.5e-01f), in1630);
tmp11307 = _mm512_fmadd_ps(tmp11301, _mm512_set1_ps(2.5e-01f), in1638);
tmp11304 = _mm512_fmadd_ps(tmp11296, _mm512_set1_ps(-1.25e+00f), tmp11304);
tmp11308 = _mm512_fmadd_ps(tmp11300, _mm512_set1_ps(-1.25e+00f), tmp11308);
tmp11296 = _mm512_fmadd_ps(tmp11296, _mm512_set1_ps(-5e+00f), tmp11298);
tmp11300 = _mm512_fmadd_ps(tmp11300, _mm512_set1_ps(-5e+00f), tmp11302);
tmp11303 = _mm512_fmadd_ps(in1634, _mm512_set1_ps(-1.25e+00f), tmp11303);
tmp11307 = _mm512_fmadd_ps(in1642, _mm512_set1_ps(-1.25e+00f), tmp11307);
// fmadd gives even + 2*odd, fnmadd gives even - 2*odd.
in1632 = _mm512_fmadd_ps(tmp11303, _mm512_set1_ps(2e+00f), tmp11304);
in1640 = _mm512_fmadd_ps(tmp11307, _mm512_set1_ps(2e+00f), tmp11308);
tmp11304 = _mm512_fnmadd_ps(tmp11303, _mm512_set1_ps(2e+00f), tmp11304);
tmp11308 = _mm512_fnmadd_ps(tmp11307, _mm512_set1_ps(2e+00f), tmp11308);
tmp11303 = _mm512_fmadd_ps(in1630, _mm512_set1_ps(2.5e-01f), tmp11297);
tmp11307 = _mm512_fmadd_ps(in1638, _mm512_set1_ps(2.5e-01f), tmp11301);
tmp11297 = _mm512_sub_ps(in1631, tmp11297);
tmp11301 = _mm512_sub_ps(in1639, tmp11301);
tmp11303 = _mm512_fmadd_ps(in1634, _mm512_set1_ps(-1.25e+00f), tmp11303);
tmp11307 = _mm512_fmadd_ps(in1642, _mm512_set1_ps(-1.25e+00f), tmp11307);
in1634 = _mm512_sub_ps(in1634, in1630);
in1642 = _mm512_sub_ps(in1642, in1638);
in1634 = _mm512_fmadd_ps(in1634, _mm512_set1_ps(5.25e+00f), tmp11297);
in1642 = _mm512_fmadd_ps(in1642, _mm512_set1_ps(5.25e+00f), tmp11301);
tmp11298 = _mm512_fmadd_ps(tmp11303, _mm512_set1_ps(2e+00f), tmp11296);
tmp11302 = _mm512_fmadd_ps(tmp11307, _mm512_set1_ps(2e+00f), tmp11300);
tmp11296 = _mm512_fnmadd_ps(tmp11303, _mm512_set1_ps(2e+00f), tmp11296);
tmp11300 = _mm512_fnmadd_ps(tmp11307, _mm512_set1_ps(2e+00f), tmp11300);
// Pack sixteen result vectors pairwise into out1503..out1518 and store them.
// _mm512_shuffle_f32x4 with imm 68 concatenates the low 256 bits of both
// operands; imm 238 concatenates their high 256 bits.
//   out1503..out1506 / out1511..out1514: low / high halves of the pairs
//     (in1628,tmp11305) (tmp11306,in1632) (tmp11304,tmp11298) (tmp11296,in1634)
//   out1507..out1510 / out1515..out1518: low / high halves of the pairs
//     (in1636,tmp11309) (tmp11310,in1640) (tmp11308,tmp11302) (tmp11300,in1642)
__m512 out1503 = _mm512_shuffle_f32x4(in1628, tmp11305, 68);
__m512 out1511 = _mm512_shuffle_f32x4(in1628, tmp11305, 238);
__m512 out1504 = _mm512_shuffle_f32x4(tmp11306, in1632, 68);
__m512 out1512 = _mm512_shuffle_f32x4(tmp11306, in1632, 238);
__m512 out1505 = _mm512_shuffle_f32x4(tmp11304, tmp11298, 68);
__m512 out1513 = _mm512_shuffle_f32x4(tmp11304, tmp11298, 238);
__m512 out1506 = _mm512_shuffle_f32x4(tmp11296, in1634, 68);
__m512 out1514 = _mm512_shuffle_f32x4(tmp11296, in1634, 238);
__m512 out1507 = _mm512_shuffle_f32x4(in1636, tmp11309, 68);
__m512 out1515 = _mm512_shuffle_f32x4(in1636, tmp11309, 238);
__m512 out1508 = _mm512_shuffle_f32x4(tmp11310, in1640, 68);
__m512 out1516 = _mm512_shuffle_f32x4(tmp11310, in1640, 238);
__m512 out1509 = _mm512_shuffle_f32x4(tmp11308, tmp11302, 68);
__m512 out1517 = _mm512_shuffle_f32x4(tmp11308, tmp11302, 238);
__m512 out1510 = _mm512_shuffle_f32x4(tmp11300, in1642, 68);
__m512 out1518 = _mm512_shuffle_f32x4(tmp11300, in1642, 238);
// Four destination regions at base offsets 0, 204800, 409600 and 614400 from
// dfPtr8, one per packed pair; inside each region the four vectors go to
// +256, +320, +384 and +448. All stores share the same i35/j28/s25/k107 terms.
// NOTE(review): offsets are presumably bytes (64 = one __m512) -- confirm the
// pointer type of dfPtr8, which is declared outside this excerpt.
_mm512_storeu_ps(dfPtr8+256+819200*i35+49152*j28+49152*s25+768*k107, out1503);
_mm512_storeu_ps(dfPtr8+384+819200*i35+49152*j28+49152*s25+768*k107, out1511);
_mm512_storeu_ps(dfPtr8+320+819200*i35+49152*j28+49152*s25+768*k107, out1507);
_mm512_storeu_ps(dfPtr8+448+819200*i35+49152*j28+49152*s25+768*k107, out1515);
_mm512_storeu_ps(dfPtr8+205056+819200*i35+49152*j28+49152*s25+768*k107, out1504);
_mm512_storeu_ps(dfPtr8+205184+819200*i35+49152*j28+49152*s25+768*k107, out1512);
_mm512_storeu_ps(dfPtr8+205120+819200*i35+49152*j28+49152*s25+768*k107, out1508);
_mm512_storeu_ps(dfPtr8+205248+819200*i35+49152*j28+49152*s25+768*k107, out1516);
_mm512_storeu_ps(dfPtr8+409856+819200*i35+49152*j28+49152*s25+768*k107, out1505);
_mm512_storeu_ps(dfPtr8+409984+819200*i35+49152*j28+49152*s25+768*k107, out1513);
_mm512_storeu_ps(dfPtr8+409920+819200*i35+49152*j28+49152*s25+768*k107, out1509);
_mm512_storeu_ps(dfPtr8+410048+819200*i35+49152*j28+49152*s25+768*k107, out1517);
_mm512_storeu_ps(dfPtr8+614656+819200*i35+49152*j28+49152*s25+768*k107, out1506);
_mm512_storeu_ps(dfPtr8+614784+819200*i35+49152*j28+49152*s25+768*k107, out1514);
_mm512_storeu_ps(dfPtr8+614720+819200*i35+49152*j28+49152*s25+768*k107, out1510);
_mm512_storeu_ps(dfPtr8+614848+819200*i35+49152*j28+49152*s25+768*k107, out1518);
// Build sixteen input vectors in1644..in1659 from three masked loads per step.
// Eight steps, each advancing all three base offsets by 112:
//   load 1: mask 31   (lanes 0..4),  base offsets 3184, 3296, ..., 3968
//   load 2: mask 8191 (lanes 0..12), base offsets 3764, 3876, ..., 4548
//   load 3: mask 255  (lanes 0..7),  base offsets 3808, 3920, ..., 4592
// Lanes outside a mask are zeroed by the load. _mm512_max_ps against zero then
// clamps negative values to zero (a ReLU).
// NOTE(review): the zero lanes that pm176 inserts look like spatial edge
// padding, and the 4*w50 term suggests byte offsets with 4-byte floats --
// confirm both against the generator and the declaration of datPtr16.
__m512 dat1831 = _mm512_maskz_loadu_ps(31, datPtr16+3184+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1831 = _mm512_max_ps(_mm512_setzero_ps(), dat1831);
__m512 dat1832 = _mm512_maskz_loadu_ps(8191, datPtr16+3764+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1832 = _mm512_max_ps(_mm512_setzero_ps(), dat1832);
__m512 dat1833 = _mm512_maskz_loadu_ps(255, datPtr16+3808+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1833 = _mm512_max_ps(_mm512_setzero_ps(), dat1833);
// pm176 (first operand = load 1, second = load 2): lanes 0..4 take load-1
// lanes 0..4; lanes 5..8 take load-1 lane 15, which the mask left at zero;
// lanes 9..15 take load-2 lanes 0..6 (indices 16..22 select the second operand).
__m512i pm176 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in1644 = _mm512_permutex2var_ps(dat1831, pm176, dat1832);
// pm177 (first operand = load 2, second = load 3): lanes 0..7 take load-2
// lanes 5..12; lanes 8..15 take load-3 lanes 0..7.
__m512i pm177 = _mm512_set_epi32(23, 22, 21, 20, 19, 18, 17, 16, 12, 11, 10, 9, 8, 7, 6, 5);
__m512 in1652 = _mm512_permutex2var_ps(dat1832, pm177, dat1833);
__m512 dat1834 = _mm512_maskz_loadu_ps(31, datPtr16+3296+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1834 = _mm512_max_ps(_mm512_setzero_ps(), dat1834);
__m512 dat1835 = _mm512_maskz_loadu_ps(8191, datPtr16+3876+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1835 = _mm512_max_ps(_mm512_setzero_ps(), dat1835);
__m512 dat1836 = _mm512_maskz_loadu_ps(255, datPtr16+3920+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1836 = _mm512_max_ps(_mm512_setzero_ps(), dat1836);
__m512 in1645 = _mm512_permutex2var_ps(dat1834, pm176, dat1835);
__m512 in1653 = _mm512_permutex2var_ps(dat1835, pm177, dat1836);
__m512 dat1837 = _mm512_maskz_loadu_ps(31, datPtr16+3408+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1837 = _mm512_max_ps(_mm512_setzero_ps(), dat1837);
__m512 dat1838 = _mm512_maskz_loadu_ps(8191, datPtr16+3988+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1838 = _mm512_max_ps(_mm512_setzero_ps(), dat1838);
__m512 dat1839 = _mm512_maskz_loadu_ps(255, datPtr16+4032+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1839 = _mm512_max_ps(_mm512_setzero_ps(), dat1839);
__m512 in1646 = _mm512_permutex2var_ps(dat1837, pm176, dat1838);
__m512 in1654 = _mm512_permutex2var_ps(dat1838, pm177, dat1839);
__m512 dat1840 = _mm512_maskz_loadu_ps(31, datPtr16+3520+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1840 = _mm512_max_ps(_mm512_setzero_ps(), dat1840);
__m512 dat1841 = _mm512_maskz_loadu_ps(8191, datPtr16+4100+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1841 = _mm512_max_ps(_mm512_setzero_ps(), dat1841);
__m512 dat1842 = _mm512_maskz_loadu_ps(255, datPtr16+4144+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1842 = _mm512_max_ps(_mm512_setzero_ps(), dat1842);
__m512 in1647 = _mm512_permutex2var_ps(dat1840, pm176, dat1841);
__m512 in1655 = _mm512_permutex2var_ps(dat1841, pm177, dat1842);
__m512 dat1843 = _mm512_maskz_loadu_ps(31, datPtr16+3632+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1843 = _mm512_max_ps(_mm512_setzero_ps(), dat1843);
__m512 dat1844 = _mm512_maskz_loadu_ps(8191, datPtr16+4212+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1844 = _mm512_max_ps(_mm512_setzero_ps(), dat1844);
__m512 dat1845 = _mm512_maskz_loadu_ps(255, datPtr16+4256+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1845 = _mm512_max_ps(_mm512_setzero_ps(), dat1845);
__m512 in1648 = _mm512_permutex2var_ps(dat1843, pm176, dat1844);
__m512 in1656 = _mm512_permutex2var_ps(dat1844, pm177, dat1845);
__m512 dat1846 = _mm512_maskz_loadu_ps(31, datPtr16+3744+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1846 = _mm512_max_ps(_mm512_setzero_ps(), dat1846);
__m512 dat1847 = _mm512_maskz_loadu_ps(8191, datPtr16+4324+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1847 = _mm512_max_ps(_mm512_setzero_ps(), dat1847);
__m512 dat1848 = _mm512_maskz_loadu_ps(255, datPtr16+4368+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1848 = _mm512_max_ps(_mm512_setzero_ps(), dat1848);
__m512 in1649 = _mm512_permutex2var_ps(dat1846, pm176, dat1847);
__m512 in1657 = _mm512_permutex2var_ps(dat1847, pm177, dat1848);
__m512 dat1849 = _mm512_maskz_loadu_ps(31, datPtr16+3856+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1849 = _mm512_max_ps(_mm512_setzero_ps(), dat1849);
__m512 dat1850 = _mm512_maskz_loadu_ps(8191, datPtr16+4436+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1850 = _mm512_max_ps(_mm512_setzero_ps(), dat1850);
__m512 dat1851 = _mm512_maskz_loadu_ps(255, datPtr16+4480+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1851 = _mm512_max_ps(_mm512_setzero_ps(), dat1851);
__m512 in1650 = _mm512_permutex2var_ps(dat1849, pm176, dat1850);
__m512 in1658 = _mm512_permutex2var_ps(dat1850, pm177, dat1851);
__m512 dat1852 = _mm512_maskz_loadu_ps(31, datPtr16+3968+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1852 = _mm512_max_ps(_mm512_setzero_ps(), dat1852);
__m512 dat1853 = _mm512_maskz_loadu_ps(8191, datPtr16+4548+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1853 = _mm512_max_ps(_mm512_setzero_ps(), dat1853);
__m512 dat1854 = _mm512_maskz_loadu_ps(255, datPtr16+4592+401408*i35+112*h42+4*w50+401408*s25+6272*k107);
dat1854 = _mm512_max_ps(_mm512_setzero_ps(), dat1854);
__m512 in1651 = _mm512_permutex2var_ps(dat1852, pm176, dat1853);
__m512 in1659 = _mm512_permutex2var_ps(dat1853, pm177, dat1854);
// 8-point linear-combination pass over two independent groups of eight
// vectors. Statements come in pairs: the first of each pair works on group A,
// the second on group B.
//   group A inputs d0..d7 = in1644..in1651
//   group B inputs d0..d7 = in1652..in1659
// Results (group A name / group B name):
//   in1644   / in1652   = d0 - d6 + 5.25*(d4 - d2)
//   tmp11361 / tmp11365 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   tmp11362 / tmp11366 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   in1650   / in1658   = (0.25*d2 + d6 - 1.25*d4) + 2*(0.25*d1 + d5 - 1.25*d3)
//   tmp11360 / tmp11364 = (0.25*d2 + d6 - 1.25*d4) - 2*(0.25*d1 + d5 - 1.25*d3)
//   in1646   / in1654   = (4*d2 + d6 - 5*d4) + 2*(d1 + 0.25*d5 - 1.25*d3)
//   in1648   / in1656   = (4*d2 + d6 - 5*d4) - 2*(d1 + 0.25*d5 - 1.25*d3)
//   in1647   / in1655   = d7 - d1 + 5.25*(d3 - d5)
// Several inputs are overwritten in place while still needed by later
// statements under their old value, so the statement order is significant.
// NOTE(review): the coefficients match the 8x8 input-transform matrix of
// Winograd F(6x6, 3x3) convolution; presumably that is the intent -- confirm.
__m512 tmp11359 = _mm512_add_ps(in1645, in1649);
__m512 tmp11363 = _mm512_add_ps(in1653, in1657);
__m512 tmp11360 = _mm512_sub_ps(in1648, in1646);
__m512 tmp11364 = _mm512_sub_ps(in1656, in1654);
__m512 tmp11361 = _mm512_add_ps(in1646, in1650);
__m512 tmp11365 = _mm512_add_ps(in1654, in1658);
in1644 = _mm512_sub_ps(in1644, in1650);
in1652 = _mm512_sub_ps(in1652, in1658);
tmp11359 = _mm512_fmadd_ps(in1647, _mm512_set1_ps(-4.25e+00f), tmp11359);
tmp11363 = _mm512_fmadd_ps(in1655, _mm512_set1_ps(-4.25e+00f), tmp11363);
tmp11361 = _mm512_fmadd_ps(in1648, _mm512_set1_ps(-4.25e+00f), tmp11361);
tmp11365 = _mm512_fmadd_ps(in1656, _mm512_set1_ps(-4.25e+00f), tmp11365);
in1644 = _mm512_fmadd_ps(tmp11360, _mm512_set1_ps(5.25e+00f), in1644);
in1652 = _mm512_fmadd_ps(tmp11364, _mm512_set1_ps(5.25e+00f), in1652);
// Start the even-index terms 0.25*d2 + d6 and 4*d2 + d6 while d6 is still intact.
tmp11360 = _mm512_fmadd_ps(in1646, _mm512_set1_ps(2.5e-01f), in1650);
tmp11364 = _mm512_fmadd_ps(in1654, _mm512_set1_ps(2.5e-01f), in1658);
in1646 = _mm512_fmadd_ps(in1646, _mm512_set1_ps(4e+00f), in1650);
in1654 = _mm512_fmadd_ps(in1654, _mm512_set1_ps(4e+00f), in1658);
// Difference and sum of the two -4.25 partial sums.
__m512 tmp11362 = _mm512_sub_ps(tmp11361, tmp11359);
__m512 tmp11366 = _mm512_sub_ps(tmp11365, tmp11363);
tmp11361 = _mm512_add_ps(tmp11359, tmp11361);
tmp11365 = _mm512_add_ps(tmp11363, tmp11365);
tmp11359 = _mm512_fmadd_ps(in1645, _mm512_set1_ps(2.5e-01f), in1649);
tmp11363 = _mm512_fmadd_ps(in1653, _mm512_set1_ps(2.5e-01f), in1657);
tmp11360 = _mm512_fmadd_ps(in1648, _mm512_set1_ps(-1.25e+00f), tmp11360);
tmp11364 = _mm512_fmadd_ps(in1656, _mm512_set1_ps(-1.25e+00f), tmp11364);
in1648 = _mm512_fmadd_ps(in1648, _mm512_set1_ps(-5e+00f), in1646);
in1656 = _mm512_fmadd_ps(in1656, _mm512_set1_ps(-5e+00f), in1654);
tmp11359 = _mm512_fmadd_ps(in1647, _mm512_set1_ps(-1.25e+00f), tmp11359);
tmp11363 = _mm512_fmadd_ps(in1655, _mm512_set1_ps(-1.25e+00f), tmp11363);
// fmadd gives even + 2*odd, fnmadd gives even - 2*odd.
in1650 = _mm512_fmadd_ps(tmp11359, _mm512_set1_ps(2e+00f), tmp11360);
in1658 = _mm512_fmadd_ps(tmp11363, _mm512_set1_ps(2e+00f), tmp11364);
tmp11360 = _mm512_fnmadd_ps(tmp11359, _mm512_set1_ps(2e+00f), tmp11360);
tmp11364 = _mm512_fnmadd_ps(tmp11363, _mm512_set1_ps(2e+00f), tmp11364);
tmp11359 = _mm512_fmadd_ps(in1649, _mm512_set1_ps(2.5e-01f), in1645);
tmp11363 = _mm512_fmadd_ps(in1657, _mm512_set1_ps(2.5e-01f), in1653);
in1645 = _mm512_sub_ps(in1651, in1645);
in1653 = _mm512_sub_ps(in1659, in1653);
tmp11359 = _mm512_fmadd_ps(in1647, _mm512_set1_ps(-1.25e+00f), tmp11359);
tmp11363 = _mm512_fmadd_ps(in1655, _mm512_set1_ps(-1.25e+00f), tmp11363);
in1647 = _mm512_sub_ps(in1647, in1649);
in1655 = _mm512_sub_ps(in1655, in1657);
in1647 = _mm512_fmadd_ps(in1647, _mm512_set1_ps(5.25e+00f), in1645);
in1655 = _mm512_fmadd_ps(in1655, _mm512_set1_ps(5.25e+00f), in1653);
in1646 = _mm512_fmadd_ps(tmp11359, _mm512_set1_ps(2e+00f), in1648);
in1654 = _mm512_fmadd_ps(tmp11363, _mm512_set1_ps(2e+00f), in1656);
in1648 = _mm512_fnmadd_ps(tmp11359, _mm512_set1_ps(2e+00f), in1648);
in1656 = _mm512_fnmadd_ps(tmp11363, _mm512_set1_ps(2e+00f), in1656);
// Transpose sixteen vectors through a three-stage shuffle network.
// Inputs, in order:
//   group A: in1644, tmp11361, tmp11362, in1650, tmp11360, in1646, in1648, in1647
//   group B: in1652, tmp11365, tmp11366, in1658, tmp11364, in1654, in1656, in1655
// Outputs are written back into the same sixteen names. Afterwards the group-A
// names, in the order listed, hold source lanes 0..7 (one source lane per
// vector) and the group-B names hold source lanes 8..15. In every output
// vector, lanes 0..7 come from the eight group-A inputs (in input order) and
// lanes 8..15 from the eight group-B inputs.
// Stage 1: interleave 32-bit elements of adjacent input pairs within each
// 128-bit lane.
__m512 tmp11375 = _mm512_unpacklo_ps(in1644, tmp11361);
__m512 tmp11376 = _mm512_unpackhi_ps(in1644, tmp11361);
__m512 tmp11377 = _mm512_unpacklo_ps(tmp11362, in1650);
__m512 tmp11378 = _mm512_unpackhi_ps(tmp11362, in1650);
__m512 tmp11379 = _mm512_unpacklo_ps(tmp11360, in1646);
__m512 tmp11380 = _mm512_unpackhi_ps(tmp11360, in1646);
__m512 tmp11381 = _mm512_unpacklo_ps(in1648, in1647);
__m512 tmp11382 = _mm512_unpackhi_ps(in1648, in1647);
__m512 tmp11383 = _mm512_unpacklo_ps(in1652, tmp11365);
__m512 tmp11384 = _mm512_unpackhi_ps(in1652, tmp11365);
__m512 tmp11385 = _mm512_unpacklo_ps(tmp11366, in1658);
__m512 tmp11386 = _mm512_unpackhi_ps(tmp11366, in1658);
__m512 tmp11387 = _mm512_unpacklo_ps(tmp11364, in1654);
__m512 tmp11388 = _mm512_unpackhi_ps(tmp11364, in1654);
__m512 tmp11389 = _mm512_unpacklo_ps(in1656, in1655);
__m512 tmp11390 = _mm512_unpackhi_ps(in1656, in1655);
// Stage 2: imm 68 joins the low element pairs of both operands, imm 238 the
// high pairs, so each 128-bit lane holds one source lane from four inputs.
__m512 tmp11391 = _mm512_shuffle_ps(tmp11375, tmp11377, 68);
__m512 tmp11392 = _mm512_shuffle_ps(tmp11375, tmp11377, 238);
__m512 tmp11393 = _mm512_shuffle_ps(tmp11376, tmp11378, 68);
__m512 tmp11394 = _mm512_shuffle_ps(tmp11376, tmp11378, 238);
__m512 tmp11395 = _mm512_shuffle_ps(tmp11379, tmp11381, 68);
__m512 tmp11396 = _mm512_shuffle_ps(tmp11379, tmp11381, 238);
__m512 tmp11397 = _mm512_shuffle_ps(tmp11380, tmp11382, 68);
__m512 tmp11398 = _mm512_shuffle_ps(tmp11380, tmp11382, 238);
__m512 tmp11399 = _mm512_shuffle_ps(tmp11383, tmp11385, 68);
__m512 tmp11400 = _mm512_shuffle_ps(tmp11383, tmp11385, 238);
__m512 tmp11401 = _mm512_shuffle_ps(tmp11384, tmp11386, 68);
__m512 tmp11402 = _mm512_shuffle_ps(tmp11384, tmp11386, 238);
__m512 tmp11403 = _mm512_shuffle_ps(tmp11387, tmp11389, 68);
__m512 tmp11404 = _mm512_shuffle_ps(tmp11387, tmp11389, 238);
__m512 tmp11405 = _mm512_shuffle_ps(tmp11388, tmp11390, 68);
__m512 tmp11406 = _mm512_shuffle_ps(tmp11388, tmp11390, 238);
// Stage 3a: imm 136 gathers 128-bit lanes 0 and 2 of each operand, imm 221
// gathers lanes 1 and 3; this joins the first four and last four inputs of a group.
__m512 tmp11407 = _mm512_shuffle_f32x4(tmp11391, tmp11395, 136);
__m512 tmp11408 = _mm512_shuffle_f32x4(tmp11391, tmp11395, 221);
__m512 tmp11409 = _mm512_shuffle_f32x4(tmp11392, tmp11396, 136);
__m512 tmp11410 = _mm512_shuffle_f32x4(tmp11392, tmp11396, 221);
__m512 tmp11411 = _mm512_shuffle_f32x4(tmp11393, tmp11397, 136);
__m512 tmp11412 = _mm512_shuffle_f32x4(tmp11393, tmp11397, 221);
__m512 tmp11413 = _mm512_shuffle_f32x4(tmp11394, tmp11398, 136);
__m512 tmp11414 = _mm512_shuffle_f32x4(tmp11394, tmp11398, 221);
__m512 tmp11415 = _mm512_shuffle_f32x4(tmp11399, tmp11403, 136);
__m512 tmp11416 = _mm512_shuffle_f32x4(tmp11399, tmp11403, 221);
__m512 tmp11417 = _mm512_shuffle_f32x4(tmp11400, tmp11404, 136);
__m512 tmp11418 = _mm512_shuffle_f32x4(tmp11400, tmp11404, 221);
__m512 tmp11419 = _mm512_shuffle_f32x4(tmp11401, tmp11405, 136);
__m512 tmp11420 = _mm512_shuffle_f32x4(tmp11401, tmp11405, 221);
__m512 tmp11421 = _mm512_shuffle_f32x4(tmp11402, tmp11406, 136);
__m512 tmp11422 = _mm512_shuffle_f32x4(tmp11402, tmp11406, 221);
// Stage 3b: same immediates again, now joining group A (low 256 bits of the
// result) with group B (high 256 bits).
in1644 = _mm512_shuffle_f32x4(tmp11407, tmp11415, 136);
in1652 = _mm512_shuffle_f32x4(tmp11407, tmp11415, 221);
tmp11361 = _mm512_shuffle_f32x4(tmp11409, tmp11417, 136);
tmp11365 = _mm512_shuffle_f32x4(tmp11409, tmp11417, 221);
tmp11362 = _mm512_shuffle_f32x4(tmp11411, tmp11419, 136);
tmp11366 = _mm512_shuffle_f32x4(tmp11411, tmp11419, 221);
in1650 = _mm512_shuffle_f32x4(tmp11413, tmp11421, 136);
in1658 = _mm512_shuffle_f32x4(tmp11413, tmp11421, 221);
tmp11360 = _mm512_shuffle_f32x4(tmp11408, tmp11416, 136);
tmp11364 = _mm512_shuffle_f32x4(tmp11408, tmp11416, 221);
in1646 = _mm512_shuffle_f32x4(tmp11410, tmp11418, 136);
in1654 = _mm512_shuffle_f32x4(tmp11410, tmp11418, 221);
in1648 = _mm512_shuffle_f32x4(tmp11412, tmp11420, 136);
in1656 = _mm512_shuffle_f32x4(tmp11412, tmp11420, 221);
in1647 = _mm512_shuffle_f32x4(tmp11414, tmp11422, 136);
in1655 = _mm512_shuffle_f32x4(tmp11414, tmp11422, 221);
__m512 tmp11367 = _mm512_add_ps(tmp11361, in1646);
__m512 tmp11371 = _mm512_add_ps(tmp11365, in1654);
__m512 tmp11368 = _mm512_sub_ps(tmp11360, tmp11362);
__m512 tmp11372 = _mm512_sub_ps(tmp11364, tmp11366);
__m512 tmp11369 = _mm512_add_ps(tmp11362, in1648);
__m512 tmp11373 = _mm512_add_ps(tmp11366, in1656);
in1644 = _mm512_sub_ps(in1644, in1648);
in1652 = _mm512_sub_ps(in1652, in1656);
tmp11367 = _mm512_fmadd_ps(in1650, _mm512_set1_ps(-4.25e+00f), tmp11367);
tmp11371 = _mm512_fmadd_ps(in1658, _mm512_set1_ps(-4.25e+00f), tmp11371);
tmp11369 = _mm512_fmadd_ps(tmp11360, _mm512_set1_ps(-4.25e+00f), tmp11369);
tmp11373 = _mm512_fmadd_ps(tmp11364, _mm512_set1_ps(-4.25e+00f), tmp11373);
in1644 = _mm512_fmadd_ps(tmp11368, _mm512_set1_ps(5.25e+00f), in1644);
in1652 = _mm512_fmadd_ps(tmp11372, _mm512_set1_ps(5.25e+00f), in1652);
tmp11368 = _mm512_fmadd_ps(tmp11362, _mm512_set1_ps(2.5e-01f), in1648);
tmp11372 = _mm512_fmadd_ps(tmp11366, _mm512_set1_ps(2.5e-01f), in1656);
tmp11362 = _mm512_fmadd_ps(tmp11362, _mm512_set1_ps(4e+00f), in1648);
tmp11366 = _mm512_fmadd_ps(tmp11366, _mm512_set1_ps(4e+00f), in1656);
__m512 tmp11370 = _mm512_sub_ps(tmp11369, tmp11367);
__m512 tmp11374 = _mm512_sub_ps(tmp11373, tmp11371);
tmp11369 = _mm512_add_ps(tmp11367, tmp11369);
tmp11373 = _mm512_add_ps(tmp11371, tmp11373);
tmp11367 = _mm512_fmadd_ps(tmp11361, _mm512_set1_ps(2.5e-01f), in1646);
tmp11371 = _mm512_fmadd_ps(tmp11365, _mm512_set1_ps(2.5e-01f), in1654);
tmp11368 = _mm512_fmadd_ps(tmp11360, _mm512_set1_ps(-1.25e+00f), tmp11368);
tmp11372 = _mm512_fmadd_ps(tmp11364, _mm512_set1_ps(-1.25e+00f), tmp11372);
tmp11360 = _mm512_fmadd_ps(tmp11360, _mm512_set1_ps(-5e+00f), tmp11362);
tmp11364 = _mm512_fmadd_ps(tmp11364, _mm512_set1_ps(-5e+00f), tmp11366);
tmp11367 = _mm512_fmadd_ps(in1650, _mm512_set1_ps(-1.25e+00f), tmp11367);
tmp11371 = _mm512_fmadd_ps(in1658, _mm512_set1_ps(-1.25e+00f), tmp11371);
in1648 = _mm512_fmadd_ps(tmp11367, _mm512_set1_ps(2e+00f), tmp11368);
in1656 = _mm512_fmadd_ps(tmp11371, _mm512_set1_ps(2e+00f), tmp11372);
tmp11368 = _mm512_fnmadd_ps(tmp11367, _mm512_set1_ps(2e+00f), tmp11368);
tmp11372 = _mm512_fnmadd_ps(tmp11371, _mm512_set1_ps(2e+00f), tmp11372);
tmp11367 = _mm512_fmadd_ps(in1646, _mm512_set1_ps(2.5e-01f), tmp11361);
tmp11371 = _mm512_fmadd_ps(in1654, _mm512_set1_ps(2.5e-01f), tmp11365);
tmp11361 = _mm512_sub_ps(in1647, tmp11361);
tmp11365 = _mm512_sub_ps(in1655, tmp11365);
tmp11367 = _mm512_fmadd_ps(in1650, _mm512_set1_ps(-1.25e+00f), tmp11367);
tmp11371 = _mm512_fmadd_ps(in1658, _mm512_set1_ps(-1.25e+00f), tmp11371);
in1650 = _mm512_sub_ps(in1650, in1646);
in1658 = _mm512_sub_ps(in1658, in1654);
in1650 = _mm512_fmadd_ps(in1650, _mm512_set1_ps(5.25e+00f), tmp11361);
in1658 = _mm512_fmadd_ps(in1658, _mm512_set1_ps(5.25e+00f), tmp11365);
tmp11362 = _mm512_fmadd_ps(tmp11367, _mm512_set1_ps(2e+00f), tmp11360);
tmp11366 = _mm512_fmadd_ps(tmp11371, _mm512_set1_ps(2e+00f), tmp11364);
tmp11360 = _mm512_fnmadd_ps(tmp11367, _mm512_set1_ps(2e+00f), tmp11360);
tmp11364 = _mm512_fnmadd_ps(tmp11371, _mm512_set1_ps(2e+00f), tmp11364);
__m512 out1519 = _mm512_shuffle_f32x4(in1644, tmp11369, 68);
__m512 out1527 = _mm512_shuffle_f32x4(in1644, tmp11369, 238);
__m512 out1520 = _mm512_shuffle_f32x4(tmp11370, in1648, 68);
__m512 out1528 = _mm512_shuffle_f32x4(tmp11370, in1648, 238);
__m512 out1521 = _mm512_shuffle_f32x4(tmp11368, tmp11362, 68);
__m512 out1529 = _mm512_shuffle_f32x4(tmp11368, tmp11362, 238);
__m512 out1522 = _mm512_shuffle_f32x4(tmp11360, in1650, 68);
__m512 out1530 = _mm512_shuffle_f32x4(tmp11360, in1650, 238);
__m512 out1523 = _mm512_shuffle_f32x4(in1652, tmp11373, 68);
__m512 out1531 = _mm512_shuffle_f32x4(in1652, tmp11373, 238);
__m512 out1524 = _mm512_shuffle_f32x4(tmp11374, in1656, 68);
__m512 out1532 = _mm512_shuffle_f32x4(tmp11374, in1656, 238);
__m512 out1525 = _mm512_shuffle_f32x4(tmp11372, tmp11366, 68);
__m512 out1533 = _mm512_shuffle_f32x4(tmp11372, tmp11366, 238);
__m512 out1526 = _mm512_shuffle_f32x4(tmp11364, in1658, 68);
__m512 out1534 = _mm512_shuffle_f32x4(tmp11364, in1658, 238);
_mm512_storeu_ps(dfPtr8+512+819200*i35+49152*j28+49152*s25+768*k107, out1519);
_mm512_storeu_ps(dfPtr8+640+819200*i35+49152*j28+49152*s25+768*k107, out1527);
_mm512_storeu_ps(dfPtr8+576+819200*i35+49152*j28+49152*s25+768*k107, out1523);
_mm512_storeu_ps(dfPtr8+704+819200*i35+49152*j28+49152*s25+768*k107, out1531);
_mm512_storeu_ps(dfPtr8+205312+819200*i35+49152*j28+49152*s25+768*k107, out1520);
_mm512_storeu_ps(dfPtr8+205440+819200*i35+49152*j28+49152*s25+768*k107, out1528);
_mm512_storeu_ps(dfPtr8+205376+819200*i35+49152*j28+49152*s25+768*k107, out1524);
_mm512_storeu_ps(dfPtr8+205504+819200*i35+49152*j28+49152*s25+768*k107, out1532);
_mm512_storeu_ps(dfPtr8+410112+819200*i35+49152*j28+49152*s25+768*k107, out1521);
_mm512_storeu_ps(dfPtr8+410240+819200*i35+49152*j28+49152*s25+768*k107, out1529);
_mm512_storeu_ps(dfPtr8+410176+819200*i35+49152*j28+49152*s25+768*k107, out1525);
_mm512_storeu_ps(dfPtr8+410304+819200*i35+49152*j28+49152*s25+768*k107, out1533);
_mm512_storeu_ps(dfPtr8+614912+819200*i35+49152*j28+49152*s25+768*k107, out1522);
_mm512_storeu_ps(dfPtr8+615040+819200*i35+49152*j28+49152*s25+768*k107, out1530);
_mm512_storeu_ps(dfPtr8+614976+819200*i35+49152*j28+49152*s25+768*k107, out1526);
_mm512_storeu_ps(dfPtr8+615104+819200*i35+49152*j28+49152*s25+768*k107, out1534);
}
// Leave the function once j28 has reached last7; otherwise step to the next
// j28 and set rel19 to 3, which satisfies the `rel19 < 4` test that follows
// this block.
if (j28 >= last7) return;
++j28;
rel19 = 3;
}
if (rel19 < 4) {
ptrdiff_t h43 = base19+18;
ptrdiff_t w51 = 18;
ptrdiff_t k108 = 0;
for (; k108 != 64; ++k108) {
// Load stage for the first pair of tiles handled in this k108 iteration.
// Every load is a zero-masking load followed by max(0, x), so lanes outside
// the mask and all negative values become 0 (a ReLU applied while loading;
// presumably the fused Activation of the graph -- TODO confirm).
//   mask 2047 keeps lanes 0-10, mask 8191 keeps lanes 0-12.
// Two families of loads are interleaved:
//   in1660..in1667: eight loads at offsets 0, 112, ..., 784 (mask 2047, pm178)
//   in1668..in1672: five loads at offsets 604, 716, ..., 1052 (mask 8191, pm179)
__m512 dat1855 = _mm512_maskz_loadu_ps(2047, datPtr16+0+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1855 = _mm512_max_ps(_mm512_setzero_ps(), dat1855);
__m512 dat1856 = _mm512_maskz_loadu_ps(8191, datPtr16+604+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1856 = _mm512_max_ps(_mm512_setzero_ps(), dat1856);
// pm178 (arguments are listed from lane 15 down to lane 0): lanes 0-7 take
// elements 0-7, lanes 8-12 take elements 6-10, lanes 13-15 take element 15,
// which the 2047 mask has already zeroed. The two 8-lane halves therefore
// overlap by two elements (6 and 7).
__m512i pm178 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1660 = _mm512_permutexvar_ps(pm178, dat1855);
// pm179: lane 0 takes element 15 (zeroed by the 8191 mask), lanes 1-7 take
// elements 0-6, lanes 8-15 take elements 5-12.
__m512i pm179 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1668 = _mm512_permutexvar_ps(pm179, dat1856);
__m512 dat1857 = _mm512_maskz_loadu_ps(2047, datPtr16+112+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1857 = _mm512_max_ps(_mm512_setzero_ps(), dat1857);
__m512 dat1858 = _mm512_maskz_loadu_ps(8191, datPtr16+716+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1858 = _mm512_max_ps(_mm512_setzero_ps(), dat1858);
__m512 in1661 = _mm512_permutexvar_ps(pm178, dat1857);
__m512 in1669 = _mm512_permutexvar_ps(pm179, dat1858);
__m512 dat1859 = _mm512_maskz_loadu_ps(2047, datPtr16+224+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1859 = _mm512_max_ps(_mm512_setzero_ps(), dat1859);
__m512 dat1860 = _mm512_maskz_loadu_ps(8191, datPtr16+828+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1860 = _mm512_max_ps(_mm512_setzero_ps(), dat1860);
__m512 in1662 = _mm512_permutexvar_ps(pm178, dat1859);
__m512 in1670 = _mm512_permutexvar_ps(pm179, dat1860);
__m512 dat1861 = _mm512_maskz_loadu_ps(2047, datPtr16+336+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1861 = _mm512_max_ps(_mm512_setzero_ps(), dat1861);
__m512 dat1862 = _mm512_maskz_loadu_ps(8191, datPtr16+940+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1862 = _mm512_max_ps(_mm512_setzero_ps(), dat1862);
__m512 in1663 = _mm512_permutexvar_ps(pm178, dat1861);
__m512 in1671 = _mm512_permutexvar_ps(pm179, dat1862);
__m512 dat1863 = _mm512_maskz_loadu_ps(2047, datPtr16+448+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1863 = _mm512_max_ps(_mm512_setzero_ps(), dat1863);
__m512 dat1864 = _mm512_maskz_loadu_ps(8191, datPtr16+1052+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1864 = _mm512_max_ps(_mm512_setzero_ps(), dat1864);
__m512 in1664 = _mm512_permutexvar_ps(pm178, dat1863);
__m512 in1672 = _mm512_permutexvar_ps(pm179, dat1864);
// From here on only the pm178 family continues; the pm179 family has no
// sixth, seventh or eighth load.
__m512 dat1865 = _mm512_maskz_loadu_ps(2047, datPtr16+560+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1865 = _mm512_max_ps(_mm512_setzero_ps(), dat1865);
__m512 in1665 = _mm512_permutexvar_ps(pm178, dat1865);
__m512 dat1866 = _mm512_maskz_loadu_ps(2047, datPtr16+672+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1866 = _mm512_max_ps(_mm512_setzero_ps(), dat1866);
__m512 in1666 = _mm512_permutexvar_ps(pm178, dat1866);
__m512 dat1867 = _mm512_maskz_loadu_ps(2047, datPtr16+784+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1867 = _mm512_max_ps(_mm512_setzero_ps(), dat1867);
__m512 in1667 = _mm512_permutexvar_ps(pm178, dat1867);
// First arithmetic pass. Two independent chains are interleaved line by line:
//   chain A works on the eight vectors in1660..in1667,
//   chain B works on the five vectors in1668..in1672.
// Chain B mirrors chain A with the terms for its three absent inputs (the
// counterparts of in1665, in1666, in1667) removed: an add becomes a plain
// copy, an fmadd becomes a mul, a subtraction from the absent input becomes
// 0 - x, and lines such as `in1668 = in1668;` are no-op placeholders.
// NOTE(review): the constants (5.25, 4.25, 0.25, 4, 1.25, 5, 2) match the
// Winograd F(6x6, 3x3) input-transform matrix -- confirm against the generator.
// Results used by the next stage:
//   chain A: in1660, tmp11425, tmp11426, in1666, tmp11424, in1662, in1664, in1663
//   chain B: in1668, tmp11429, tmp11430, tmp11431, tmp11428, in1670, in1672, in1671
__m512 tmp11423 = _mm512_add_ps(in1661, in1665);
__m512 tmp11427 = in1669;
__m512 tmp11424 = _mm512_sub_ps(in1664, in1662);
__m512 tmp11428 = _mm512_sub_ps(in1672, in1670);
__m512 tmp11425 = _mm512_add_ps(in1662, in1666);
__m512 tmp11429 = in1670;
in1660 = _mm512_sub_ps(in1660, in1666);
in1668 = in1668;
tmp11423 = _mm512_fmadd_ps(in1663, _mm512_set1_ps(-4.25e+00f), tmp11423);
tmp11427 = _mm512_fmadd_ps(in1671, _mm512_set1_ps(-4.25e+00f), tmp11427);
tmp11425 = _mm512_fmadd_ps(in1664, _mm512_set1_ps(-4.25e+00f), tmp11425);
tmp11429 = _mm512_fmadd_ps(in1672, _mm512_set1_ps(-4.25e+00f), tmp11429);
in1660 = _mm512_fmadd_ps(tmp11424, _mm512_set1_ps(5.25e+00f), in1660);
in1668 = _mm512_fmadd_ps(tmp11428, _mm512_set1_ps(5.25e+00f), in1668);
tmp11424 = _mm512_fmadd_ps(in1662, _mm512_set1_ps(2.5e-01f), in1666);
tmp11428 = _mm512_mul_ps(in1670, _mm512_set1_ps(2.5e-01f));
in1662 = _mm512_fmadd_ps(in1662, _mm512_set1_ps(4e+00f), in1666);
in1670 = _mm512_mul_ps(in1670, _mm512_set1_ps(4e+00f));
// Difference and sum of the two -4.25 accumulators.
__m512 tmp11426 = _mm512_sub_ps(tmp11425, tmp11423);
__m512 tmp11430 = _mm512_sub_ps(tmp11429, tmp11427);
tmp11425 = _mm512_add_ps(tmp11423, tmp11425);
tmp11429 = _mm512_add_ps(tmp11427, tmp11429);
tmp11423 = _mm512_fmadd_ps(in1661, _mm512_set1_ps(2.5e-01f), in1665);
tmp11427 = _mm512_mul_ps(in1669, _mm512_set1_ps(2.5e-01f));
tmp11424 = _mm512_fmadd_ps(in1664, _mm512_set1_ps(-1.25e+00f), tmp11424);
tmp11428 = _mm512_fmadd_ps(in1672, _mm512_set1_ps(-1.25e+00f), tmp11428);
in1664 = _mm512_fmadd_ps(in1664, _mm512_set1_ps(-5e+00f), in1662);
in1672 = _mm512_fmadd_ps(in1672, _mm512_set1_ps(-5e+00f), in1670);
tmp11423 = _mm512_fmadd_ps(in1663, _mm512_set1_ps(-1.25e+00f), tmp11423);
tmp11427 = _mm512_fmadd_ps(in1671, _mm512_set1_ps(-1.25e+00f), tmp11427);
// fmadd / fnmadd pair: (even-style term) +/- 2 * (odd-style term).
// Chain B needs a fresh variable (tmp11431) because it has no in1666 counterpart.
in1666 = _mm512_fmadd_ps(tmp11423, _mm512_set1_ps(2e+00f), tmp11424);
__m512 tmp11431 = _mm512_fmadd_ps(tmp11427, _mm512_set1_ps(2e+00f), tmp11428);
tmp11424 = _mm512_fnmadd_ps(tmp11423, _mm512_set1_ps(2e+00f), tmp11424);
tmp11428 = _mm512_fnmadd_ps(tmp11427, _mm512_set1_ps(2e+00f), tmp11428);
tmp11423 = _mm512_fmadd_ps(in1665, _mm512_set1_ps(2.5e-01f), in1661);
tmp11427 = in1669;
in1661 = _mm512_sub_ps(in1667, in1661);
in1669 = _mm512_sub_ps(_mm512_setzero_ps(), in1669);
tmp11423 = _mm512_fmadd_ps(in1663, _mm512_set1_ps(-1.25e+00f), tmp11423);
tmp11427 = _mm512_fmadd_ps(in1671, _mm512_set1_ps(-1.25e+00f), tmp11427);
in1663 = _mm512_sub_ps(in1663, in1665);
in1671 = in1671;
in1663 = _mm512_fmadd_ps(in1663, _mm512_set1_ps(5.25e+00f), in1661);
in1671 = _mm512_fmadd_ps(in1671, _mm512_set1_ps(5.25e+00f), in1669);
in1662 = _mm512_fmadd_ps(tmp11423, _mm512_set1_ps(2e+00f), in1664);
in1670 = _mm512_fmadd_ps(tmp11427, _mm512_set1_ps(2e+00f), in1672);
in1664 = _mm512_fnmadd_ps(tmp11423, _mm512_set1_ps(2e+00f), in1664);
in1672 = _mm512_fnmadd_ps(tmp11427, _mm512_set1_ps(2e+00f), in1672);
// Shuffle network that rearranges the 16 first-pass result vectors before the
// second arithmetic pass (presumably an in-register transpose so the second
// pass runs along the other tile dimension -- TODO confirm).
// Stage 1: interleave 32-bit elements of vector pairs (unpacklo / unpackhi).
__m512 tmp11440 = _mm512_unpacklo_ps(in1660, tmp11425);
__m512 tmp11441 = _mm512_unpackhi_ps(in1660, tmp11425);
__m512 tmp11442 = _mm512_unpacklo_ps(tmp11426, in1666);
__m512 tmp11443 = _mm512_unpackhi_ps(tmp11426, in1666);
__m512 tmp11444 = _mm512_unpacklo_ps(tmp11424, in1662);
__m512 tmp11445 = _mm512_unpackhi_ps(tmp11424, in1662);
__m512 tmp11446 = _mm512_unpacklo_ps(in1664, in1663);
__m512 tmp11447 = _mm512_unpackhi_ps(in1664, in1663);
__m512 tmp11448 = _mm512_unpacklo_ps(in1668, tmp11429);
__m512 tmp11449 = _mm512_unpackhi_ps(in1668, tmp11429);
__m512 tmp11450 = _mm512_unpacklo_ps(tmp11430, tmp11431);
__m512 tmp11451 = _mm512_unpackhi_ps(tmp11430, tmp11431);
__m512 tmp11452 = _mm512_unpacklo_ps(tmp11428, in1670);
__m512 tmp11453 = _mm512_unpackhi_ps(tmp11428, in1670);
__m512 tmp11454 = _mm512_unpacklo_ps(in1672, in1671);
__m512 tmp11455 = _mm512_unpackhi_ps(in1672, in1671);
// Stage 2: within each 128-bit lane, imm 68 takes the low element pair of
// both operands and imm 238 takes the high element pair of both operands.
__m512 tmp11456 = _mm512_shuffle_ps(tmp11440, tmp11442, 68);
__m512 tmp11457 = _mm512_shuffle_ps(tmp11440, tmp11442, 238);
__m512 tmp11458 = _mm512_shuffle_ps(tmp11441, tmp11443, 68);
__m512 tmp11459 = _mm512_shuffle_ps(tmp11441, tmp11443, 238);
__m512 tmp11460 = _mm512_shuffle_ps(tmp11444, tmp11446, 68);
__m512 tmp11461 = _mm512_shuffle_ps(tmp11444, tmp11446, 238);
__m512 tmp11462 = _mm512_shuffle_ps(tmp11445, tmp11447, 68);
__m512 tmp11463 = _mm512_shuffle_ps(tmp11445, tmp11447, 238);
__m512 tmp11464 = _mm512_shuffle_ps(tmp11448, tmp11450, 68);
__m512 tmp11465 = _mm512_shuffle_ps(tmp11448, tmp11450, 238);
__m512 tmp11466 = _mm512_shuffle_ps(tmp11449, tmp11451, 68);
__m512 tmp11467 = _mm512_shuffle_ps(tmp11449, tmp11451, 238);
__m512 tmp11468 = _mm512_shuffle_ps(tmp11452, tmp11454, 68);
__m512 tmp11469 = _mm512_shuffle_ps(tmp11452, tmp11454, 238);
__m512 tmp11470 = _mm512_shuffle_ps(tmp11453, tmp11455, 68);
__m512 tmp11471 = _mm512_shuffle_ps(tmp11453, tmp11455, 238);
// Stage 3: 128-bit lane shuffle; imm 136 picks lanes 0 and 2 of each operand,
// imm 221 picks lanes 1 and 3 of each operand.
__m512 tmp11472 = _mm512_shuffle_f32x4(tmp11456, tmp11460, 136);
__m512 tmp11473 = _mm512_shuffle_f32x4(tmp11456, tmp11460, 221);
__m512 tmp11474 = _mm512_shuffle_f32x4(tmp11457, tmp11461, 136);
__m512 tmp11475 = _mm512_shuffle_f32x4(tmp11457, tmp11461, 221);
__m512 tmp11476 = _mm512_shuffle_f32x4(tmp11458, tmp11462, 136);
__m512 tmp11477 = _mm512_shuffle_f32x4(tmp11458, tmp11462, 221);
__m512 tmp11478 = _mm512_shuffle_f32x4(tmp11459, tmp11463, 136);
__m512 tmp11479 = _mm512_shuffle_f32x4(tmp11459, tmp11463, 221);
__m512 tmp11480 = _mm512_shuffle_f32x4(tmp11464, tmp11468, 136);
__m512 tmp11481 = _mm512_shuffle_f32x4(tmp11464, tmp11468, 221);
__m512 tmp11482 = _mm512_shuffle_f32x4(tmp11465, tmp11469, 136);
__m512 tmp11483 = _mm512_shuffle_f32x4(tmp11465, tmp11469, 221);
__m512 tmp11484 = _mm512_shuffle_f32x4(tmp11466, tmp11470, 136);
__m512 tmp11485 = _mm512_shuffle_f32x4(tmp11466, tmp11470, 221);
__m512 tmp11486 = _mm512_shuffle_f32x4(tmp11467, tmp11471, 136);
__m512 tmp11487 = _mm512_shuffle_f32x4(tmp11467, tmp11471, 221);
// Stage 4: a second 128-bit lane shuffle with the same immediates; results
// are written back into the variables the first pass produced so the second
// pass can reuse the names.
in1660 = _mm512_shuffle_f32x4(tmp11472, tmp11480, 136);
in1668 = _mm512_shuffle_f32x4(tmp11472, tmp11480, 221);
tmp11425 = _mm512_shuffle_f32x4(tmp11474, tmp11482, 136);
tmp11429 = _mm512_shuffle_f32x4(tmp11474, tmp11482, 221);
tmp11426 = _mm512_shuffle_f32x4(tmp11476, tmp11484, 136);
tmp11430 = _mm512_shuffle_f32x4(tmp11476, tmp11484, 221);
in1666 = _mm512_shuffle_f32x4(tmp11478, tmp11486, 136);
tmp11431 = _mm512_shuffle_f32x4(tmp11478, tmp11486, 221);
tmp11424 = _mm512_shuffle_f32x4(tmp11473, tmp11481, 136);
tmp11428 = _mm512_shuffle_f32x4(tmp11473, tmp11481, 221);
in1662 = _mm512_shuffle_f32x4(tmp11475, tmp11483, 136);
in1670 = _mm512_shuffle_f32x4(tmp11475, tmp11483, 221);
in1664 = _mm512_shuffle_f32x4(tmp11477, tmp11485, 136);
in1672 = _mm512_shuffle_f32x4(tmp11477, tmp11485, 221);
in1663 = _mm512_shuffle_f32x4(tmp11479, tmp11487, 136);
in1671 = _mm512_shuffle_f32x4(tmp11479, tmp11487, 221);
// Second arithmetic pass over the shuffled vectors. It applies the same
// constant pattern as the first pass (5.25, -4.25, 0.25, 4, -1.25, -5, +/-2),
// again as two interleaved chains, but here both chains use the full
// eight-input formulas (no dropped terms).
// Results used by the store stage:
//   chain A: in1660, tmp11434, tmp11435, in1664, tmp11433, tmp11426, tmp11424, in1666
//   chain B: in1668, tmp11438, tmp11439, in1672, tmp11437, tmp11430, tmp11428, tmp11431
__m512 tmp11432 = _mm512_add_ps(tmp11425, in1662);
__m512 tmp11436 = _mm512_add_ps(tmp11429, in1670);
__m512 tmp11433 = _mm512_sub_ps(tmp11424, tmp11426);
__m512 tmp11437 = _mm512_sub_ps(tmp11428, tmp11430);
__m512 tmp11434 = _mm512_add_ps(tmp11426, in1664);
__m512 tmp11438 = _mm512_add_ps(tmp11430, in1672);
in1660 = _mm512_sub_ps(in1660, in1664);
in1668 = _mm512_sub_ps(in1668, in1672);
tmp11432 = _mm512_fmadd_ps(in1666, _mm512_set1_ps(-4.25e+00f), tmp11432);
tmp11436 = _mm512_fmadd_ps(tmp11431, _mm512_set1_ps(-4.25e+00f), tmp11436);
tmp11434 = _mm512_fmadd_ps(tmp11424, _mm512_set1_ps(-4.25e+00f), tmp11434);
tmp11438 = _mm512_fmadd_ps(tmp11428, _mm512_set1_ps(-4.25e+00f), tmp11438);
in1660 = _mm512_fmadd_ps(tmp11433, _mm512_set1_ps(5.25e+00f), in1660);
in1668 = _mm512_fmadd_ps(tmp11437, _mm512_set1_ps(5.25e+00f), in1668);
tmp11433 = _mm512_fmadd_ps(tmp11426, _mm512_set1_ps(2.5e-01f), in1664);
tmp11437 = _mm512_fmadd_ps(tmp11430, _mm512_set1_ps(2.5e-01f), in1672);
tmp11426 = _mm512_fmadd_ps(tmp11426, _mm512_set1_ps(4e+00f), in1664);
tmp11430 = _mm512_fmadd_ps(tmp11430, _mm512_set1_ps(4e+00f), in1672);
// Difference and sum of the two -4.25 accumulators.
__m512 tmp11435 = _mm512_sub_ps(tmp11434, tmp11432);
__m512 tmp11439 = _mm512_sub_ps(tmp11438, tmp11436);
tmp11434 = _mm512_add_ps(tmp11432, tmp11434);
tmp11438 = _mm512_add_ps(tmp11436, tmp11438);
tmp11432 = _mm512_fmadd_ps(tmp11425, _mm512_set1_ps(2.5e-01f), in1662);
tmp11436 = _mm512_fmadd_ps(tmp11429, _mm512_set1_ps(2.5e-01f), in1670);
tmp11433 = _mm512_fmadd_ps(tmp11424, _mm512_set1_ps(-1.25e+00f), tmp11433);
tmp11437 = _mm512_fmadd_ps(tmp11428, _mm512_set1_ps(-1.25e+00f), tmp11437);
tmp11424 = _mm512_fmadd_ps(tmp11424, _mm512_set1_ps(-5e+00f), tmp11426);
tmp11428 = _mm512_fmadd_ps(tmp11428, _mm512_set1_ps(-5e+00f), tmp11430);
tmp11432 = _mm512_fmadd_ps(in1666, _mm512_set1_ps(-1.25e+00f), tmp11432);
tmp11436 = _mm512_fmadd_ps(tmp11431, _mm512_set1_ps(-1.25e+00f), tmp11436);
// fmadd / fnmadd pair: base term plus and minus twice the other term.
in1664 = _mm512_fmadd_ps(tmp11432, _mm512_set1_ps(2e+00f), tmp11433);
in1672 = _mm512_fmadd_ps(tmp11436, _mm512_set1_ps(2e+00f), tmp11437);
tmp11433 = _mm512_fnmadd_ps(tmp11432, _mm512_set1_ps(2e+00f), tmp11433);
tmp11437 = _mm512_fnmadd_ps(tmp11436, _mm512_set1_ps(2e+00f), tmp11437);
tmp11432 = _mm512_fmadd_ps(in1662, _mm512_set1_ps(2.5e-01f), tmp11425);
tmp11436 = _mm512_fmadd_ps(in1670, _mm512_set1_ps(2.5e-01f), tmp11429);
tmp11425 = _mm512_sub_ps(in1663, tmp11425);
tmp11429 = _mm512_sub_ps(in1671, tmp11429);
tmp11432 = _mm512_fmadd_ps(in1666, _mm512_set1_ps(-1.25e+00f), tmp11432);
tmp11436 = _mm512_fmadd_ps(tmp11431, _mm512_set1_ps(-1.25e+00f), tmp11436);
in1666 = _mm512_sub_ps(in1666, in1662);
tmp11431 = _mm512_sub_ps(tmp11431, in1670);
in1666 = _mm512_fmadd_ps(in1666, _mm512_set1_ps(5.25e+00f), tmp11425);
tmp11431 = _mm512_fmadd_ps(tmp11431, _mm512_set1_ps(5.25e+00f), tmp11429);
tmp11426 = _mm512_fmadd_ps(tmp11432, _mm512_set1_ps(2e+00f), tmp11424);
tmp11430 = _mm512_fmadd_ps(tmp11436, _mm512_set1_ps(2e+00f), tmp11428);
tmp11424 = _mm512_fnmadd_ps(tmp11432, _mm512_set1_ps(2e+00f), tmp11424);
tmp11428 = _mm512_fnmadd_ps(tmp11436, _mm512_set1_ps(2e+00f), tmp11428);
// Pack the second-pass results two at a time: imm 68 concatenates the low
// 256 bits of both operands, imm 238 concatenates their high 256 bits.
// out1535..out1538 / out1543..out1546 come from chain A,
// out1539..out1542 / out1547..out1550 come from chain B.
__m512 out1535 = _mm512_shuffle_f32x4(in1660, tmp11434, 68);
__m512 out1543 = _mm512_shuffle_f32x4(in1660, tmp11434, 238);
__m512 out1536 = _mm512_shuffle_f32x4(tmp11435, in1664, 68);
__m512 out1544 = _mm512_shuffle_f32x4(tmp11435, in1664, 238);
__m512 out1537 = _mm512_shuffle_f32x4(tmp11433, tmp11426, 68);
__m512 out1545 = _mm512_shuffle_f32x4(tmp11433, tmp11426, 238);
__m512 out1538 = _mm512_shuffle_f32x4(tmp11424, in1666, 68);
__m512 out1546 = _mm512_shuffle_f32x4(tmp11424, in1666, 238);
__m512 out1539 = _mm512_shuffle_f32x4(in1668, tmp11438, 68);
__m512 out1547 = _mm512_shuffle_f32x4(in1668, tmp11438, 238);
__m512 out1540 = _mm512_shuffle_f32x4(tmp11439, in1672, 68);
__m512 out1548 = _mm512_shuffle_f32x4(tmp11439, in1672, 238);
__m512 out1541 = _mm512_shuffle_f32x4(tmp11437, tmp11430, 68);
__m512 out1549 = _mm512_shuffle_f32x4(tmp11437, tmp11430, 238);
__m512 out1542 = _mm512_shuffle_f32x4(tmp11428, tmp11431, 68);
__m512 out1550 = _mm512_shuffle_f32x4(tmp11428, tmp11431, 238);
// Sixteen unaligned stores into dfPtr8, in four groups whose base offsets are
// 0, 204800, 409600 and 614400. Inside a group the order is chain-A low (+0),
// chain-A high (+128), chain-B low (+64), chain-B high (+192), so each group
// fills one contiguous 256-offset window.
_mm512_storeu_ps(dfPtr8+0+819200*i35+49152*j28+49152*s25+768*k108, out1535);
_mm512_storeu_ps(dfPtr8+128+819200*i35+49152*j28+49152*s25+768*k108, out1543);
_mm512_storeu_ps(dfPtr8+64+819200*i35+49152*j28+49152*s25+768*k108, out1539);
_mm512_storeu_ps(dfPtr8+192+819200*i35+49152*j28+49152*s25+768*k108, out1547);
_mm512_storeu_ps(dfPtr8+204800+819200*i35+49152*j28+49152*s25+768*k108, out1536);
_mm512_storeu_ps(dfPtr8+204928+819200*i35+49152*j28+49152*s25+768*k108, out1544);
_mm512_storeu_ps(dfPtr8+204864+819200*i35+49152*j28+49152*s25+768*k108, out1540);
_mm512_storeu_ps(dfPtr8+204992+819200*i35+49152*j28+49152*s25+768*k108, out1548);
_mm512_storeu_ps(dfPtr8+409600+819200*i35+49152*j28+49152*s25+768*k108, out1537);
_mm512_storeu_ps(dfPtr8+409728+819200*i35+49152*j28+49152*s25+768*k108, out1545);
_mm512_storeu_ps(dfPtr8+409664+819200*i35+49152*j28+49152*s25+768*k108, out1541);
_mm512_storeu_ps(dfPtr8+409792+819200*i35+49152*j28+49152*s25+768*k108, out1549);
_mm512_storeu_ps(dfPtr8+614400+819200*i35+49152*j28+49152*s25+768*k108, out1538);
_mm512_storeu_ps(dfPtr8+614528+819200*i35+49152*j28+49152*s25+768*k108, out1546);
_mm512_storeu_ps(dfPtr8+614464+819200*i35+49152*j28+49152*s25+768*k108, out1542);
_mm512_storeu_ps(dfPtr8+614592+819200*i35+49152*j28+49152*s25+768*k108, out1550);
// Load stage for the second pair of tiles in this k108 iteration. As in the
// first pair, every load is zero-masked and then clamped with max(0, x).
//   mask 16383 keeps lanes 0-13, mask 2047 keeps lanes 0-10.
// Two families of loads are interleaved:
//   in1673..in1677: five loads at offsets 648, 760, ..., 1096 (mask 16383, pm180)
//   in1678..in1685: eight loads at offsets 3136, 3248, ..., 3920 (mask 2047, pm181)
__m512 dat1868 = _mm512_maskz_loadu_ps(16383, datPtr16+648+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1868 = _mm512_max_ps(_mm512_setzero_ps(), dat1868);
__m512 dat1869 = _mm512_maskz_loadu_ps(2047, datPtr16+3136+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1869 = _mm512_max_ps(_mm512_setzero_ps(), dat1869);
// pm180 (arguments listed from lane 15 down to lane 0): lanes 0-7 take
// elements 0-7 and lanes 8-15 take elements 6-13.
__m512i pm180 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1673 = _mm512_permutexvar_ps(pm180, dat1868);
// pm181: lanes 0-7 take elements 0-7, lanes 8-12 take elements 6-10, lanes
// 13-15 take element 15, which the 2047 mask has already zeroed.
__m512i pm181 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1678 = _mm512_permutexvar_ps(pm181, dat1869);
__m512 dat1870 = _mm512_maskz_loadu_ps(16383, datPtr16+760+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1870 = _mm512_max_ps(_mm512_setzero_ps(), dat1870);
__m512 dat1871 = _mm512_maskz_loadu_ps(2047, datPtr16+3248+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1871 = _mm512_max_ps(_mm512_setzero_ps(), dat1871);
__m512 in1674 = _mm512_permutexvar_ps(pm180, dat1870);
__m512 in1679 = _mm512_permutexvar_ps(pm181, dat1871);
__m512 dat1872 = _mm512_maskz_loadu_ps(16383, datPtr16+872+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1872 = _mm512_max_ps(_mm512_setzero_ps(), dat1872);
__m512 dat1873 = _mm512_maskz_loadu_ps(2047, datPtr16+3360+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1873 = _mm512_max_ps(_mm512_setzero_ps(), dat1873);
__m512 in1675 = _mm512_permutexvar_ps(pm180, dat1872);
__m512 in1680 = _mm512_permutexvar_ps(pm181, dat1873);
__m512 dat1874 = _mm512_maskz_loadu_ps(16383, datPtr16+984+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1874 = _mm512_max_ps(_mm512_setzero_ps(), dat1874);
__m512 dat1875 = _mm512_maskz_loadu_ps(2047, datPtr16+3472+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1875 = _mm512_max_ps(_mm512_setzero_ps(), dat1875);
__m512 in1676 = _mm512_permutexvar_ps(pm180, dat1874);
__m512 in1681 = _mm512_permutexvar_ps(pm181, dat1875);
__m512 dat1876 = _mm512_maskz_loadu_ps(16383, datPtr16+1096+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1876 = _mm512_max_ps(_mm512_setzero_ps(), dat1876);
__m512 dat1877 = _mm512_maskz_loadu_ps(2047, datPtr16+3584+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1877 = _mm512_max_ps(_mm512_setzero_ps(), dat1877);
__m512 in1677 = _mm512_permutexvar_ps(pm180, dat1876);
__m512 in1682 = _mm512_permutexvar_ps(pm181, dat1877);
// From here on only the pm181 family continues; the pm180 family has no
// sixth, seventh or eighth load.
__m512 dat1878 = _mm512_maskz_loadu_ps(2047, datPtr16+3696+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1878 = _mm512_max_ps(_mm512_setzero_ps(), dat1878);
__m512 in1683 = _mm512_permutexvar_ps(pm181, dat1878);
__m512 dat1879 = _mm512_maskz_loadu_ps(2047, datPtr16+3808+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1879 = _mm512_max_ps(_mm512_setzero_ps(), dat1879);
__m512 in1684 = _mm512_permutexvar_ps(pm181, dat1879);
__m512 dat1880 = _mm512_maskz_loadu_ps(2047, datPtr16+3920+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1880 = _mm512_max_ps(_mm512_setzero_ps(), dat1880);
__m512 in1685 = _mm512_permutexvar_ps(pm181, dat1880);
// First arithmetic pass for the second tile pair. Two chains are interleaved:
//   chain A works on the five vectors in1673..in1677,
//   chain B works on the eight vectors in1678..in1685.
// Here it is chain A that has three absent inputs (the counterparts of
// in1683, in1684, in1685), so its lines are the reduced forms: plain copies
// instead of adds, mul instead of fmadd, 0 - x instead of a subtraction, and
// no-op self-assignments (`in1673 = in1673;`, `in1676 = in1676;`).
// NOTE(review): the constants match the Winograd F(6x6, 3x3) input-transform
// matrix -- confirm against the generator.
// Results used by the next stage:
//   chain A: in1673, tmp11490, tmp11491, tmp11492, tmp11489, in1675, in1677, in1676
//   chain B: in1678, tmp11495, tmp11496, in1684, tmp11494, in1680, in1682, in1681
__m512 tmp11488 = in1674;
__m512 tmp11493 = _mm512_add_ps(in1679, in1683);
__m512 tmp11489 = _mm512_sub_ps(in1677, in1675);
__m512 tmp11494 = _mm512_sub_ps(in1682, in1680);
__m512 tmp11490 = in1675;
__m512 tmp11495 = _mm512_add_ps(in1680, in1684);
in1673 = in1673;
in1678 = _mm512_sub_ps(in1678, in1684);
tmp11488 = _mm512_fmadd_ps(in1676, _mm512_set1_ps(-4.25e+00f), tmp11488);
tmp11493 = _mm512_fmadd_ps(in1681, _mm512_set1_ps(-4.25e+00f), tmp11493);
tmp11490 = _mm512_fmadd_ps(in1677, _mm512_set1_ps(-4.25e+00f), tmp11490);
tmp11495 = _mm512_fmadd_ps(in1682, _mm512_set1_ps(-4.25e+00f), tmp11495);
in1673 = _mm512_fmadd_ps(tmp11489, _mm512_set1_ps(5.25e+00f), in1673);
in1678 = _mm512_fmadd_ps(tmp11494, _mm512_set1_ps(5.25e+00f), in1678);
tmp11489 = _mm512_mul_ps(in1675, _mm512_set1_ps(2.5e-01f));
tmp11494 = _mm512_fmadd_ps(in1680, _mm512_set1_ps(2.5e-01f), in1684);
in1675 = _mm512_mul_ps(in1675, _mm512_set1_ps(4e+00f));
in1680 = _mm512_fmadd_ps(in1680, _mm512_set1_ps(4e+00f), in1684);
// Difference and sum of the two -4.25 accumulators.
__m512 tmp11491 = _mm512_sub_ps(tmp11490, tmp11488);
__m512 tmp11496 = _mm512_sub_ps(tmp11495, tmp11493);
tmp11490 = _mm512_add_ps(tmp11488, tmp11490);
tmp11495 = _mm512_add_ps(tmp11493, tmp11495);
tmp11488 = _mm512_mul_ps(in1674, _mm512_set1_ps(2.5e-01f));
tmp11493 = _mm512_fmadd_ps(in1679, _mm512_set1_ps(2.5e-01f), in1683);
tmp11489 = _mm512_fmadd_ps(in1677, _mm512_set1_ps(-1.25e+00f), tmp11489);
tmp11494 = _mm512_fmadd_ps(in1682, _mm512_set1_ps(-1.25e+00f), tmp11494);
in1677 = _mm512_fmadd_ps(in1677, _mm512_set1_ps(-5e+00f), in1675);
in1682 = _mm512_fmadd_ps(in1682, _mm512_set1_ps(-5e+00f), in1680);
tmp11488 = _mm512_fmadd_ps(in1676, _mm512_set1_ps(-1.25e+00f), tmp11488);
tmp11493 = _mm512_fmadd_ps(in1681, _mm512_set1_ps(-1.25e+00f), tmp11493);
// Chain A needs a fresh variable (tmp11492) because it has no in1684 counterpart.
__m512 tmp11492 = _mm512_fmadd_ps(tmp11488, _mm512_set1_ps(2e+00f), tmp11489);
in1684 = _mm512_fmadd_ps(tmp11493, _mm512_set1_ps(2e+00f), tmp11494);
tmp11489 = _mm512_fnmadd_ps(tmp11488, _mm512_set1_ps(2e+00f), tmp11489);
tmp11494 = _mm512_fnmadd_ps(tmp11493, _mm512_set1_ps(2e+00f), tmp11494);
tmp11488 = in1674;
tmp11493 = _mm512_fmadd_ps(in1683, _mm512_set1_ps(2.5e-01f), in1679);
in1674 = _mm512_sub_ps(_mm512_setzero_ps(), in1674);
in1679 = _mm512_sub_ps(in1685, in1679);
tmp11488 = _mm512_fmadd_ps(in1676, _mm512_set1_ps(-1.25e+00f), tmp11488);
tmp11493 = _mm512_fmadd_ps(in1681, _mm512_set1_ps(-1.25e+00f), tmp11493);
in1676 = in1676;
in1681 = _mm512_sub_ps(in1681, in1683);
in1676 = _mm512_fmadd_ps(in1676, _mm512_set1_ps(5.25e+00f), in1674);
in1681 = _mm512_fmadd_ps(in1681, _mm512_set1_ps(5.25e+00f), in1679);
in1675 = _mm512_fmadd_ps(tmp11488, _mm512_set1_ps(2e+00f), in1677);
in1680 = _mm512_fmadd_ps(tmp11493, _mm512_set1_ps(2e+00f), in1682);
in1677 = _mm512_fnmadd_ps(tmp11488, _mm512_set1_ps(2e+00f), in1677);
in1682 = _mm512_fnmadd_ps(tmp11493, _mm512_set1_ps(2e+00f), in1682);
// Shuffle network that rearranges the 16 first-pass result vectors of the
// second tile pair before the second arithmetic pass (presumably an
// in-register transpose -- TODO confirm).
// Stage 1: interleave 32-bit elements of vector pairs (unpacklo / unpackhi).
__m512 tmp11505 = _mm512_unpacklo_ps(in1673, tmp11490);
__m512 tmp11506 = _mm512_unpackhi_ps(in1673, tmp11490);
__m512 tmp11507 = _mm512_unpacklo_ps(tmp11491, tmp11492);
__m512 tmp11508 = _mm512_unpackhi_ps(tmp11491, tmp11492);
__m512 tmp11509 = _mm512_unpacklo_ps(tmp11489, in1675);
__m512 tmp11510 = _mm512_unpackhi_ps(tmp11489, in1675);
__m512 tmp11511 = _mm512_unpacklo_ps(in1677, in1676);
__m512 tmp11512 = _mm512_unpackhi_ps(in1677, in1676);
__m512 tmp11513 = _mm512_unpacklo_ps(in1678, tmp11495);
__m512 tmp11514 = _mm512_unpackhi_ps(in1678, tmp11495);
__m512 tmp11515 = _mm512_unpacklo_ps(tmp11496, in1684);
__m512 tmp11516 = _mm512_unpackhi_ps(tmp11496, in1684);
__m512 tmp11517 = _mm512_unpacklo_ps(tmp11494, in1680);
__m512 tmp11518 = _mm512_unpackhi_ps(tmp11494, in1680);
__m512 tmp11519 = _mm512_unpacklo_ps(in1682, in1681);
__m512 tmp11520 = _mm512_unpackhi_ps(in1682, in1681);
// Stage 2: within each 128-bit lane, imm 68 takes the low element pair of
// both operands and imm 238 takes the high element pair of both operands.
__m512 tmp11521 = _mm512_shuffle_ps(tmp11505, tmp11507, 68);
__m512 tmp11522 = _mm512_shuffle_ps(tmp11505, tmp11507, 238);
__m512 tmp11523 = _mm512_shuffle_ps(tmp11506, tmp11508, 68);
__m512 tmp11524 = _mm512_shuffle_ps(tmp11506, tmp11508, 238);
__m512 tmp11525 = _mm512_shuffle_ps(tmp11509, tmp11511, 68);
__m512 tmp11526 = _mm512_shuffle_ps(tmp11509, tmp11511, 238);
__m512 tmp11527 = _mm512_shuffle_ps(tmp11510, tmp11512, 68);
__m512 tmp11528 = _mm512_shuffle_ps(tmp11510, tmp11512, 238);
__m512 tmp11529 = _mm512_shuffle_ps(tmp11513, tmp11515, 68);
__m512 tmp11530 = _mm512_shuffle_ps(tmp11513, tmp11515, 238);
__m512 tmp11531 = _mm512_shuffle_ps(tmp11514, tmp11516, 68);
__m512 tmp11532 = _mm512_shuffle_ps(tmp11514, tmp11516, 238);
__m512 tmp11533 = _mm512_shuffle_ps(tmp11517, tmp11519, 68);
__m512 tmp11534 = _mm512_shuffle_ps(tmp11517, tmp11519, 238);
__m512 tmp11535 = _mm512_shuffle_ps(tmp11518, tmp11520, 68);
__m512 tmp11536 = _mm512_shuffle_ps(tmp11518, tmp11520, 238);
// Stage 3: 128-bit lane shuffle; imm 136 picks lanes 0 and 2 of each operand,
// imm 221 picks lanes 1 and 3 of each operand.
__m512 tmp11537 = _mm512_shuffle_f32x4(tmp11521, tmp11525, 136);
__m512 tmp11538 = _mm512_shuffle_f32x4(tmp11521, tmp11525, 221);
__m512 tmp11539 = _mm512_shuffle_f32x4(tmp11522, tmp11526, 136);
__m512 tmp11540 = _mm512_shuffle_f32x4(tmp11522, tmp11526, 221);
__m512 tmp11541 = _mm512_shuffle_f32x4(tmp11523, tmp11527, 136);
__m512 tmp11542 = _mm512_shuffle_f32x4(tmp11523, tmp11527, 221);
__m512 tmp11543 = _mm512_shuffle_f32x4(tmp11524, tmp11528, 136);
__m512 tmp11544 = _mm512_shuffle_f32x4(tmp11524, tmp11528, 221);
__m512 tmp11545 = _mm512_shuffle_f32x4(tmp11529, tmp11533, 136);
__m512 tmp11546 = _mm512_shuffle_f32x4(tmp11529, tmp11533, 221);
__m512 tmp11547 = _mm512_shuffle_f32x4(tmp11530, tmp11534, 136);
__m512 tmp11548 = _mm512_shuffle_f32x4(tmp11530, tmp11534, 221);
__m512 tmp11549 = _mm512_shuffle_f32x4(tmp11531, tmp11535, 136);
__m512 tmp11550 = _mm512_shuffle_f32x4(tmp11531, tmp11535, 221);
__m512 tmp11551 = _mm512_shuffle_f32x4(tmp11532, tmp11536, 136);
__m512 tmp11552 = _mm512_shuffle_f32x4(tmp11532, tmp11536, 221);
// Stage 4: a second 128-bit lane shuffle with the same immediates; results
// are written back into the variables the first pass produced so the second
// pass can reuse the names.
in1673 = _mm512_shuffle_f32x4(tmp11537, tmp11545, 136);
in1678 = _mm512_shuffle_f32x4(tmp11537, tmp11545, 221);
tmp11490 = _mm512_shuffle_f32x4(tmp11539, tmp11547, 136);
tmp11495 = _mm512_shuffle_f32x4(tmp11539, tmp11547, 221);
tmp11491 = _mm512_shuffle_f32x4(tmp11541, tmp11549, 136);
tmp11496 = _mm512_shuffle_f32x4(tmp11541, tmp11549, 221);
tmp11492 = _mm512_shuffle_f32x4(tmp11543, tmp11551, 136);
in1684 = _mm512_shuffle_f32x4(tmp11543, tmp11551, 221);
tmp11489 = _mm512_shuffle_f32x4(tmp11538, tmp11546, 136);
tmp11494 = _mm512_shuffle_f32x4(tmp11538, tmp11546, 221);
in1675 = _mm512_shuffle_f32x4(tmp11540, tmp11548, 136);
in1680 = _mm512_shuffle_f32x4(tmp11540, tmp11548, 221);
in1677 = _mm512_shuffle_f32x4(tmp11542, tmp11550, 136);
in1682 = _mm512_shuffle_f32x4(tmp11542, tmp11550, 221);
in1676 = _mm512_shuffle_f32x4(tmp11544, tmp11552, 136);
in1681 = _mm512_shuffle_f32x4(tmp11544, tmp11552, 221);
// Second arithmetic pass for the second tile pair. Same constant pattern as
// the first pass (5.25, -4.25, 0.25, 4, -1.25, -5, +/-2), two interleaved
// chains, and both chains use the full eight-input formulas.
// Results used by the packing stage:
//   chain A: in1673, tmp11499, tmp11500, in1677, tmp11498, tmp11491, tmp11489, tmp11492
//   chain B: in1678, tmp11503, tmp11504, in1682, tmp11502, tmp11496, tmp11494, in1684
__m512 tmp11497 = _mm512_add_ps(tmp11490, in1675);
__m512 tmp11501 = _mm512_add_ps(tmp11495, in1680);
__m512 tmp11498 = _mm512_sub_ps(tmp11489, tmp11491);
__m512 tmp11502 = _mm512_sub_ps(tmp11494, tmp11496);
__m512 tmp11499 = _mm512_add_ps(tmp11491, in1677);
__m512 tmp11503 = _mm512_add_ps(tmp11496, in1682);
in1673 = _mm512_sub_ps(in1673, in1677);
in1678 = _mm512_sub_ps(in1678, in1682);
tmp11497 = _mm512_fmadd_ps(tmp11492, _mm512_set1_ps(-4.25e+00f), tmp11497);
tmp11501 = _mm512_fmadd_ps(in1684, _mm512_set1_ps(-4.25e+00f), tmp11501);
tmp11499 = _mm512_fmadd_ps(tmp11489, _mm512_set1_ps(-4.25e+00f), tmp11499);
tmp11503 = _mm512_fmadd_ps(tmp11494, _mm512_set1_ps(-4.25e+00f), tmp11503);
in1673 = _mm512_fmadd_ps(tmp11498, _mm512_set1_ps(5.25e+00f), in1673);
in1678 = _mm512_fmadd_ps(tmp11502, _mm512_set1_ps(5.25e+00f), in1678);
tmp11498 = _mm512_fmadd_ps(tmp11491, _mm512_set1_ps(2.5e-01f), in1677);
tmp11502 = _mm512_fmadd_ps(tmp11496, _mm512_set1_ps(2.5e-01f), in1682);
tmp11491 = _mm512_fmadd_ps(tmp11491, _mm512_set1_ps(4e+00f), in1677);
tmp11496 = _mm512_fmadd_ps(tmp11496, _mm512_set1_ps(4e+00f), in1682);
// Difference and sum of the two -4.25 accumulators.
__m512 tmp11500 = _mm512_sub_ps(tmp11499, tmp11497);
__m512 tmp11504 = _mm512_sub_ps(tmp11503, tmp11501);
tmp11499 = _mm512_add_ps(tmp11497, tmp11499);
tmp11503 = _mm512_add_ps(tmp11501, tmp11503);
tmp11497 = _mm512_fmadd_ps(tmp11490, _mm512_set1_ps(2.5e-01f), in1675);
tmp11501 = _mm512_fmadd_ps(tmp11495, _mm512_set1_ps(2.5e-01f), in1680);
tmp11498 = _mm512_fmadd_ps(tmp11489, _mm512_set1_ps(-1.25e+00f), tmp11498);
tmp11502 = _mm512_fmadd_ps(tmp11494, _mm512_set1_ps(-1.25e+00f), tmp11502);
tmp11489 = _mm512_fmadd_ps(tmp11489, _mm512_set1_ps(-5e+00f), tmp11491);
tmp11494 = _mm512_fmadd_ps(tmp11494, _mm512_set1_ps(-5e+00f), tmp11496);
tmp11497 = _mm512_fmadd_ps(tmp11492, _mm512_set1_ps(-1.25e+00f), tmp11497);
tmp11501 = _mm512_fmadd_ps(in1684, _mm512_set1_ps(-1.25e+00f), tmp11501);
// fmadd / fnmadd pair: base term plus and minus twice the other term.
in1677 = _mm512_fmadd_ps(tmp11497, _mm512_set1_ps(2e+00f), tmp11498);
in1682 = _mm512_fmadd_ps(tmp11501, _mm512_set1_ps(2e+00f), tmp11502);
tmp11498 = _mm512_fnmadd_ps(tmp11497, _mm512_set1_ps(2e+00f), tmp11498);
tmp11502 = _mm512_fnmadd_ps(tmp11501, _mm512_set1_ps(2e+00f), tmp11502);
tmp11497 = _mm512_fmadd_ps(in1675, _mm512_set1_ps(2.5e-01f), tmp11490);
tmp11501 = _mm512_fmadd_ps(in1680, _mm512_set1_ps(2.5e-01f), tmp11495);
tmp11490 = _mm512_sub_ps(in1676, tmp11490);
tmp11495 = _mm512_sub_ps(in1681, tmp11495);
tmp11497 = _mm512_fmadd_ps(tmp11492, _mm512_set1_ps(-1.25e+00f), tmp11497);
tmp11501 = _mm512_fmadd_ps(in1684, _mm512_set1_ps(-1.25e+00f), tmp11501);
tmp11492 = _mm512_sub_ps(tmp11492, in1675);
in1684 = _mm512_sub_ps(in1684, in1680);
tmp11492 = _mm512_fmadd_ps(tmp11492, _mm512_set1_ps(5.25e+00f), tmp11490);
in1684 = _mm512_fmadd_ps(in1684, _mm512_set1_ps(5.25e+00f), tmp11495);
tmp11491 = _mm512_fmadd_ps(tmp11497, _mm512_set1_ps(2e+00f), tmp11489);
tmp11496 = _mm512_fmadd_ps(tmp11501, _mm512_set1_ps(2e+00f), tmp11494);
tmp11489 = _mm512_fnmadd_ps(tmp11497, _mm512_set1_ps(2e+00f), tmp11489);
tmp11494 = _mm512_fnmadd_ps(tmp11501, _mm512_set1_ps(2e+00f), tmp11494);
__m512 out1551 = _mm512_shuffle_f32x4(in1673, tmp11499, 68);
__m512 out1559 = _mm512_shuffle_f32x4(in1673, tmp11499, 238);
__m512 out1552 = _mm512_shuffle_f32x4(tmp11500, in1677, 68);
__m512 out1560 = _mm512_shuffle_f32x4(tmp11500, in1677, 238);
__m512 out1553 = _mm512_shuffle_f32x4(tmp11498, tmp11491, 68);
__m512 out1561 = _mm512_shuffle_f32x4(tmp11498, tmp11491, 238);
__m512 out1554 = _mm512_shuffle_f32x4(tmp11489, tmp11492, 68);
__m512 out1562 = _mm512_shuffle_f32x4(tmp11489, tmp11492, 238);
__m512 out1555 = _mm512_shuffle_f32x4(in1678, tmp11503, 68);
__m512 out1563 = _mm512_shuffle_f32x4(in1678, tmp11503, 238);
__m512 out1556 = _mm512_shuffle_f32x4(tmp11504, in1682, 68);
__m512 out1564 = _mm512_shuffle_f32x4(tmp11504, in1682, 238);
__m512 out1557 = _mm512_shuffle_f32x4(tmp11502, tmp11496, 68);
__m512 out1565 = _mm512_shuffle_f32x4(tmp11502, tmp11496, 238);
__m512 out1558 = _mm512_shuffle_f32x4(tmp11494, in1684, 68);
__m512 out1566 = _mm512_shuffle_f32x4(tmp11494, in1684, 238);
_mm512_storeu_ps(dfPtr8+256+819200*i35+49152*j28+49152*s25+768*k108, out1551);
_mm512_storeu_ps(dfPtr8+384+819200*i35+49152*j28+49152*s25+768*k108, out1559);
_mm512_storeu_ps(dfPtr8+320+819200*i35+49152*j28+49152*s25+768*k108, out1555);
_mm512_storeu_ps(dfPtr8+448+819200*i35+49152*j28+49152*s25+768*k108, out1563);
_mm512_storeu_ps(dfPtr8+205056+819200*i35+49152*j28+49152*s25+768*k108, out1552);
_mm512_storeu_ps(dfPtr8+205184+819200*i35+49152*j28+49152*s25+768*k108, out1560);
_mm512_storeu_ps(dfPtr8+205120+819200*i35+49152*j28+49152*s25+768*k108, out1556);
_mm512_storeu_ps(dfPtr8+205248+819200*i35+49152*j28+49152*s25+768*k108, out1564);
_mm512_storeu_ps(dfPtr8+409856+819200*i35+49152*j28+49152*s25+768*k108, out1553);
_mm512_storeu_ps(dfPtr8+409984+819200*i35+49152*j28+49152*s25+768*k108, out1561);
_mm512_storeu_ps(dfPtr8+409920+819200*i35+49152*j28+49152*s25+768*k108, out1557);
_mm512_storeu_ps(dfPtr8+410048+819200*i35+49152*j28+49152*s25+768*k108, out1565);
_mm512_storeu_ps(dfPtr8+614656+819200*i35+49152*j28+49152*s25+768*k108, out1554);
_mm512_storeu_ps(dfPtr8+614784+819200*i35+49152*j28+49152*s25+768*k108, out1562);
_mm512_storeu_ps(dfPtr8+614720+819200*i35+49152*j28+49152*s25+768*k108, out1558);
_mm512_storeu_ps(dfPtr8+614848+819200*i35+49152*j28+49152*s25+768*k108, out1566);
__m512 dat1881 = _mm512_maskz_loadu_ps(8191, datPtr16+3740+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1881 = _mm512_max_ps(_mm512_setzero_ps(), dat1881);
__m512 dat1882 = _mm512_maskz_loadu_ps(16383, datPtr16+3784+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1882 = _mm512_max_ps(_mm512_setzero_ps(), dat1882);
__m512i pm182 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1686 = _mm512_permutexvar_ps(pm182, dat1881);
__m512i pm183 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1691 = _mm512_permutexvar_ps(pm183, dat1882);
__m512 dat1883 = _mm512_maskz_loadu_ps(8191, datPtr16+3852+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1883 = _mm512_max_ps(_mm512_setzero_ps(), dat1883);
__m512 dat1884 = _mm512_maskz_loadu_ps(16383, datPtr16+3896+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1884 = _mm512_max_ps(_mm512_setzero_ps(), dat1884);
__m512 in1687 = _mm512_permutexvar_ps(pm182, dat1883);
__m512 in1692 = _mm512_permutexvar_ps(pm183, dat1884);
__m512 dat1885 = _mm512_maskz_loadu_ps(8191, datPtr16+3964+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1885 = _mm512_max_ps(_mm512_setzero_ps(), dat1885);
__m512 dat1886 = _mm512_maskz_loadu_ps(16383, datPtr16+4008+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1886 = _mm512_max_ps(_mm512_setzero_ps(), dat1886);
__m512 in1688 = _mm512_permutexvar_ps(pm182, dat1885);
__m512 in1693 = _mm512_permutexvar_ps(pm183, dat1886);
__m512 dat1887 = _mm512_maskz_loadu_ps(8191, datPtr16+4076+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1887 = _mm512_max_ps(_mm512_setzero_ps(), dat1887);
__m512 dat1888 = _mm512_maskz_loadu_ps(16383, datPtr16+4120+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1888 = _mm512_max_ps(_mm512_setzero_ps(), dat1888);
__m512 in1689 = _mm512_permutexvar_ps(pm182, dat1887);
__m512 in1694 = _mm512_permutexvar_ps(pm183, dat1888);
__m512 dat1889 = _mm512_maskz_loadu_ps(8191, datPtr16+4188+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1889 = _mm512_max_ps(_mm512_setzero_ps(), dat1889);
__m512 dat1890 = _mm512_maskz_loadu_ps(16383, datPtr16+4232+401408*i35+112*h43+4*w51+401408*s25+6272*k108);
dat1890 = _mm512_max_ps(_mm512_setzero_ps(), dat1890);
__m512 in1690 = _mm512_permutexvar_ps(pm182, dat1889);
__m512 in1695 = _mm512_permutexvar_ps(pm183, dat1890);
__m512 tmp11553 = in1687;
__m512 tmp11558 = in1692;
__m512 tmp11554 = _mm512_sub_ps(in1690, in1688);
__m512 tmp11559 = _mm512_sub_ps(in1695, in1693);
__m512 tmp11555 = in1688;
__m512 tmp11560 = in1693;
in1686 = in1686;
in1691 = in1691;
tmp11553 = _mm512_fmadd_ps(in1689, _mm512_set1_ps(-4.25e+00f), tmp11553);
tmp11558 = _mm512_fmadd_ps(in1694, _mm512_set1_ps(-4.25e+00f), tmp11558);
tmp11555 = _mm512_fmadd_ps(in1690, _mm512_set1_ps(-4.25e+00f), tmp11555);
tmp11560 = _mm512_fmadd_ps(in1695, _mm512_set1_ps(-4.25e+00f), tmp11560);
in1686 = _mm512_fmadd_ps(tmp11554, _mm512_set1_ps(5.25e+00f), in1686);
in1691 = _mm512_fmadd_ps(tmp11559, _mm512_set1_ps(5.25e+00f), in1691);
tmp11554 = _mm512_mul_ps(in1688, _mm512_set1_ps(2.5e-01f));
tmp11559 = _mm512_mul_ps(in1693, _mm512_set1_ps(2.5e-01f));
in1688 = _mm512_mul_ps(in1688, _mm512_set1_ps(4e+00f));
in1693 = _mm512_mul_ps(in1693, _mm512_set1_ps(4e+00f));
__m512 tmp11556 = _mm512_sub_ps(tmp11555, tmp11553);
__m512 tmp11561 = _mm512_sub_ps(tmp11560, tmp11558);
tmp11555 = _mm512_add_ps(tmp11553, tmp11555);
tmp11560 = _mm512_add_ps(tmp11558, tmp11560);
tmp11553 = _mm512_mul_ps(in1687, _mm512_set1_ps(2.5e-01f));
tmp11558 = _mm512_mul_ps(in1692, _mm512_set1_ps(2.5e-01f));
tmp11554 = _mm512_fmadd_ps(in1690, _mm512_set1_ps(-1.25e+00f), tmp11554);
tmp11559 = _mm512_fmadd_ps(in1695, _mm512_set1_ps(-1.25e+00f), tmp11559);
in1690 = _mm512_fmadd_ps(in1690, _mm512_set1_ps(-5e+00f), in1688);
in1695 = _mm512_fmadd_ps(in1695, _mm512_set1_ps(-5e+00f), in1693);
tmp11553 = _mm512_fmadd_ps(in1689, _mm512_set1_ps(-1.25e+00f), tmp11553);
tmp11558 = _mm512_fmadd_ps(in1694, _mm512_set1_ps(-1.25e+00f), tmp11558);
__m512 tmp11557 = _mm512_fmadd_ps(tmp11553, _mm512_set1_ps(2e+00f), tmp11554);
__m512 tmp11562 = _mm512_fmadd_ps(tmp11558, _mm512_set1_ps(2e+00f), tmp11559);
tmp11554 = _mm512_fnmadd_ps(tmp11553, _mm512_set1_ps(2e+00f), tmp11554);
tmp11559 = _mm512_fnmadd_ps(tmp11558, _mm512_set1_ps(2e+00f), tmp11559);
tmp11553 = in1687;
tmp11558 = in1692;
in1687 = _mm512_sub_ps(_mm512_setzero_ps(), in1687);
in1692 = _mm512_sub_ps(_mm512_setzero_ps(), in1692);
tmp11553 = _mm512_fmadd_ps(in1689, _mm512_set1_ps(-1.25e+00f), tmp11553);
tmp11558 = _mm512_fmadd_ps(in1694, _mm512_set1_ps(-1.25e+00f), tmp11558);
in1689 = in1689;
in1694 = in1694;
in1689 = _mm512_fmadd_ps(in1689, _mm512_set1_ps(5.25e+00f), in1687);
in1694 = _mm512_fmadd_ps(in1694, _mm512_set1_ps(5.25e+00f), in1692);
in1688 = _mm512_fmadd_ps(tmp11553, _mm512_set1_ps(2e+00f), in1690);
in1693 = _mm512_fmadd_ps(tmp11558, _mm512_set1_ps(2e+00f), in1695);
in1690 = _mm512_fnmadd_ps(tmp11553, _mm512_set1_ps(2e+00f), in1690);
in1695 = _mm512_fnmadd_ps(tmp11558, _mm512_set1_ps(2e+00f), in1695);
__m512 tmp11571 = _mm512_unpacklo_ps(in1686, tmp11555);
__m512 tmp11572 = _mm512_unpackhi_ps(in1686, tmp11555);
__m512 tmp11573 = _mm512_unpacklo_ps(tmp11556, tmp11557);
__m512 tmp11574 = _mm512_unpackhi_ps(tmp11556, tmp11557);
__m512 tmp11575 = _mm512_unpacklo_ps(tmp11554, in1688);
__m512 tmp11576 = _mm512_unpackhi_ps(tmp11554, in1688);
__m512 tmp11577 = _mm512_unpacklo_ps(in1690, in1689);
__m512 tmp11578 = _mm512_unpackhi_ps(in1690, in1689);
__m512 tmp11579 = _mm512_unpacklo_ps(in1691, tmp11560);
__m512 tmp11580 = _mm512_unpackhi_ps(in1691, tmp11560);
__m512 tmp11581 = _mm512_unpacklo_ps(tmp11561, tmp11562);
__m512 tmp11582 = _mm512_unpackhi_ps(tmp11561, tmp11562);
__m512 tmp11583 = _mm512_unpacklo_ps(tmp11559, in1693);
__m512 tmp11584 = _mm512_unpackhi_ps(tmp11559, in1693);
__m512 tmp11585 = _mm512_unpacklo_ps(in1695, in1694);
__m512 tmp11586 = _mm512_unpackhi_ps(in1695, in1694);
__m512 tmp11587 = _mm512_shuffle_ps(tmp11571, tmp11573, 68);
__m512 tmp11588 = _mm512_shuffle_ps(tmp11571, tmp11573, 238);
__m512 tmp11589 = _mm512_shuffle_ps(tmp11572, tmp11574, 68);
__m512 tmp11590 = _mm512_shuffle_ps(tmp11572, tmp11574, 238);
__m512 tmp11591 = _mm512_shuffle_ps(tmp11575, tmp11577, 68);
__m512 tmp11592 = _mm512_shuffle_ps(tmp11575, tmp11577, 238);
__m512 tmp11593 = _mm512_shuffle_ps(tmp11576, tmp11578, 68);
__m512 tmp11594 = _mm512_shuffle_ps(tmp11576, tmp11578, 238);
__m512 tmp11595 = _mm512_shuffle_ps(tmp11579, tmp11581, 68);
__m512 tmp11596 = _mm512_shuffle_ps(tmp11579, tmp11581, 238);
__m512 tmp11597 = _mm512_shuffle_ps(tmp11580, tmp11582, 68);
__m512 tmp11598 = _mm512_shuffle_ps(tmp11580, tmp11582, 238);
__m512 tmp11599 = _mm512_shuffle_ps(tmp11583, tmp11585, 68);
__m512 tmp11600 = _mm512_shuffle_ps(tmp11583, tmp11585, 238);
__m512 tmp11601 = _mm512_shuffle_ps(tmp11584, tmp11586, 68);
__m512 tmp11602 = _mm512_shuffle_ps(tmp11584, tmp11586, 238);
__m512 tmp11603 = _mm512_shuffle_f32x4(tmp11587, tmp11591, 136);
__m512 tmp11604 = _mm512_shuffle_f32x4(tmp11587, tmp11591, 221);
__m512 tmp11605 = _mm512_shuffle_f32x4(tmp11588, tmp11592, 136);
__m512 tmp11606 = _mm512_shuffle_f32x4(tmp11588, tmp11592, 221);
__m512 tmp11607 = _mm512_shuffle_f32x4(tmp11589, tmp11593, 136);
__m512 tmp11608 = _mm512_shuffle_f32x4(tmp11589, tmp11593, 221);
__m512 tmp11609 = _mm512_shuffle_f32x4(tmp11590, tmp11594, 136);
__m512 tmp11610 = _mm512_shuffle_f32x4(tmp11590, tmp11594, 221);
__m512 tmp11611 = _mm512_shuffle_f32x4(tmp11595, tmp11599, 136);
__m512 tmp11612 = _mm512_shuffle_f32x4(tmp11595, tmp11599, 221);
__m512 tmp11613 = _mm512_shuffle_f32x4(tmp11596, tmp11600, 136);
__m512 tmp11614 = _mm512_shuffle_f32x4(tmp11596, tmp11600, 221);
__m512 tmp11615 = _mm512_shuffle_f32x4(tmp11597, tmp11601, 136);
__m512 tmp11616 = _mm512_shuffle_f32x4(tmp11597, tmp11601, 221);
__m512 tmp11617 = _mm512_shuffle_f32x4(tmp11598, tmp11602, 136);
__m512 tmp11618 = _mm512_shuffle_f32x4(tmp11598, tmp11602, 221);
in1686 = _mm512_shuffle_f32x4(tmp11603, tmp11611, 136);
in1691 = _mm512_shuffle_f32x4(tmp11603, tmp11611, 221);
tmp11555 = _mm512_shuffle_f32x4(tmp11605, tmp11613, 136);
tmp11560 = _mm512_shuffle_f32x4(tmp11605, tmp11613, 221);
tmp11556 = _mm512_shuffle_f32x4(tmp11607, tmp11615, 136);
tmp11561 = _mm512_shuffle_f32x4(tmp11607, tmp11615, 221);
tmp11557 = _mm512_shuffle_f32x4(tmp11609, tmp11617, 136);
tmp11562 = _mm512_shuffle_f32x4(tmp11609, tmp11617, 221);
tmp11554 = _mm512_shuffle_f32x4(tmp11604, tmp11612, 136);
tmp11559 = _mm512_shuffle_f32x4(tmp11604, tmp11612, 221);
in1688 = _mm512_shuffle_f32x4(tmp11606, tmp11614, 136);
in1693 = _mm512_shuffle_f32x4(tmp11606, tmp11614, 221);
in1690 = _mm512_shuffle_f32x4(tmp11608, tmp11616, 136);
in1695 = _mm512_shuffle_f32x4(tmp11608, tmp11616, 221);
in1689 = _mm512_shuffle_f32x4(tmp11610, tmp11618, 136);
in1694 = _mm512_shuffle_f32x4(tmp11610, tmp11618, 221);
__m512 tmp11563 = _mm512_add_ps(tmp11555, in1688);
__m512 tmp11567 = _mm512_add_ps(tmp11560, in1693);
__m512 tmp11564 = _mm512_sub_ps(tmp11554, tmp11556);
__m512 tmp11568 = _mm512_sub_ps(tmp11559, tmp11561);
__m512 tmp11565 = _mm512_add_ps(tmp11556, in1690);
__m512 tmp11569 = _mm512_add_ps(tmp11561, in1695);
in1686 = _mm512_sub_ps(in1686, in1690);
in1691 = _mm512_sub_ps(in1691, in1695);
tmp11563 = _mm512_fmadd_ps(tmp11557, _mm512_set1_ps(-4.25e+00f), tmp11563);
tmp11567 = _mm512_fmadd_ps(tmp11562, _mm512_set1_ps(-4.25e+00f), tmp11567);
tmp11565 = _mm512_fmadd_ps(tmp11554, _mm512_set1_ps(-4.25e+00f), tmp11565);
tmp11569 = _mm512_fmadd_ps(tmp11559, _mm512_set1_ps(-4.25e+00f), tmp11569);
in1686 = _mm512_fmadd_ps(tmp11564, _mm512_set1_ps(5.25e+00f), in1686);
in1691 = _mm512_fmadd_ps(tmp11568, _mm512_set1_ps(5.25e+00f), in1691);
tmp11564 = _mm512_fmadd_ps(tmp11556, _mm512_set1_ps(2.5e-01f), in1690);
tmp11568 = _mm512_fmadd_ps(tmp11561, _mm512_set1_ps(2.5e-01f), in1695);
tmp11556 = _mm512_fmadd_ps(tmp11556, _mm512_set1_ps(4e+00f), in1690);
tmp11561 = _mm512_fmadd_ps(tmp11561, _mm512_set1_ps(4e+00f), in1695);
__m512 tmp11566 = _mm512_sub_ps(tmp11565, tmp11563);
__m512 tmp11570 = _mm512_sub_ps(tmp11569, tmp11567);
tmp11565 = _mm512_add_ps(tmp11563, tmp11565);
tmp11569 = _mm512_add_ps(tmp11567, tmp11569);
tmp11563 = _mm512_fmadd_ps(tmp11555, _mm512_set1_ps(2.5e-01f), in1688);
tmp11567 = _mm512_fmadd_ps(tmp11560, _mm512_set1_ps(2.5e-01f), in1693);
tmp11564 = _mm512_fmadd_ps(tmp11554, _mm512_set1_ps(-1.25e+00f), tmp11564);
tmp11568 = _mm512_fmadd_ps(tmp11559, _mm512_set1_ps(-1.25e+00f), tmp11568);
tmp11554 = _mm512_fmadd_ps(tmp11554, _mm512_set1_ps(-5e+00f), tmp11556);
tmp11559 = _mm512_fmadd_ps(tmp11559, _mm512_set1_ps(-5e+00f), tmp11561);
tmp11563 = _mm512_fmadd_ps(tmp11557, _mm512_set1_ps(-1.25e+00f), tmp11563);
tmp11567 = _mm512_fmadd_ps(tmp11562, _mm512_set1_ps(-1.25e+00f), tmp11567);
in1690 = _mm512_fmadd_ps(tmp11563, _mm512_set1_ps(2e+00f), tmp11564);
in1695 = _mm512_fmadd_ps(tmp11567, _mm512_set1_ps(2e+00f), tmp11568);
tmp11564 = _mm512_fnmadd_ps(tmp11563, _mm512_set1_ps(2e+00f), tmp11564);
tmp11568 = _mm512_fnmadd_ps(tmp11567, _mm512_set1_ps(2e+00f), tmp11568);
tmp11563 = _mm512_fmadd_ps(in1688, _mm512_set1_ps(2.5e-01f), tmp11555);
tmp11567 = _mm512_fmadd_ps(in1693, _mm512_set1_ps(2.5e-01f), tmp11560);
tmp11555 = _mm512_sub_ps(in1689, tmp11555);
tmp11560 = _mm512_sub_ps(in1694, tmp11560);
tmp11563 = _mm512_fmadd_ps(tmp11557, _mm512_set1_ps(-1.25e+00f), tmp11563);
tmp11567 = _mm512_fmadd_ps(tmp11562, _mm512_set1_ps(-1.25e+00f), tmp11567);
tmp11557 = _mm512_sub_ps(tmp11557, in1688);
tmp11562 = _mm512_sub_ps(tmp11562, in1693);
tmp11557 = _mm512_fmadd_ps(tmp11557, _mm512_set1_ps(5.25e+00f), tmp11555);
tmp11562 = _mm512_fmadd_ps(tmp11562, _mm512_set1_ps(5.25e+00f), tmp11560);
tmp11556 = _mm512_fmadd_ps(tmp11563, _mm512_set1_ps(2e+00f), tmp11554);
tmp11561 = _mm512_fmadd_ps(tmp11567, _mm512_set1_ps(2e+00f), tmp11559);
tmp11554 = _mm512_fnmadd_ps(tmp11563, _mm512_set1_ps(2e+00f), tmp11554);
tmp11559 = _mm512_fnmadd_ps(tmp11567, _mm512_set1_ps(2e+00f), tmp11559);
__m512 out1567 = _mm512_shuffle_f32x4(in1686, tmp11565, 68);
__m512 out1575 = _mm512_shuffle_f32x4(in1686, tmp11565, 238);
__m512 out1568 = _mm512_shuffle_f32x4(tmp11566, in1690, 68);
__m512 out1576 = _mm512_shuffle_f32x4(tmp11566, in1690, 238);
__m512 out1569 = _mm512_shuffle_f32x4(tmp11564, tmp11556, 68);
__m512 out1577 = _mm512_shuffle_f32x4(tmp11564, tmp11556, 238);
__m512 out1570 = _mm512_shuffle_f32x4(tmp11554, tmp11557, 68);
__m512 out1578 = _mm512_shuffle_f32x4(tmp11554, tmp11557, 238);
__m512 out1571 = _mm512_shuffle_f32x4(in1691, tmp11569, 68);
__m512 out1579 = _mm512_shuffle_f32x4(in1691, tmp11569, 238);
__m512 out1572 = _mm512_shuffle_f32x4(tmp11570, in1695, 68);
__m512 out1580 = _mm512_shuffle_f32x4(tmp11570, in1695, 238);
__m512 out1573 = _mm512_shuffle_f32x4(tmp11568, tmp11561, 68);
__m512 out1581 = _mm512_shuffle_f32x4(tmp11568, tmp11561, 238);
__m512 out1574 = _mm512_shuffle_f32x4(tmp11559, tmp11562, 68);
__m512 out1582 = _mm512_shuffle_f32x4(tmp11559, tmp11562, 238);
_mm512_storeu_ps(dfPtr8+512+819200*i35+49152*j28+49152*s25+768*k108, out1567);
_mm512_storeu_ps(dfPtr8+640+819200*i35+49152*j28+49152*s25+768*k108, out1575);
_mm512_storeu_ps(dfPtr8+576+819200*i35+49152*j28+49152*s25+768*k108, out1571);
_mm512_storeu_ps(dfPtr8+704+819200*i35+49152*j28+49152*s25+768*k108, out1579);
_mm512_storeu_ps(dfPtr8+205312+819200*i35+49152*j28+49152*s25+768*k108, out1568);
_mm512_storeu_ps(dfPtr8+205440+819200*i35+49152*j28+49152*s25+768*k108, out1576);
_mm512_storeu_ps(dfPtr8+205376+819200*i35+49152*j28+49152*s25+768*k108, out1572);
_mm512_storeu_ps(dfPtr8+205504+819200*i35+49152*j28+49152*s25+768*k108, out1580);
_mm512_storeu_ps(dfPtr8+410112+819200*i35+49152*j28+49152*s25+768*k108, out1569);
_mm512_storeu_ps(dfPtr8+410240+819200*i35+49152*j28+49152*s25+768*k108, out1577);
_mm512_storeu_ps(dfPtr8+410176+819200*i35+49152*j28+49152*s25+768*k108, out1573);
_mm512_storeu_ps(dfPtr8+410304+819200*i35+49152*j28+49152*s25+768*k108, out1581);
_mm512_storeu_ps(dfPtr8+614912+819200*i35+49152*j28+49152*s25+768*k108, out1570);
_mm512_storeu_ps(dfPtr8+615040+819200*i35+49152*j28+49152*s25+768*k108, out1578);
_mm512_storeu_ps(dfPtr8+614976+819200*i35+49152*j28+49152*s25+768*k108, out1574);
_mm512_storeu_ps(dfPtr8+615104+819200*i35+49152*j28+49152*s25+768*k108, out1582);
}
if (j28 >= last7) return;
++j28;
rel19 = 4;
}
ptrdiff_t h44 = base19+24;
ptrdiff_t w52 = 24;
ptrdiff_t k109 = 0;
for (; k109 != 32; ++k109) {
__m512 dat1891 = _mm512_maskz_loadu_ps(31, datPtr16+0+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1891 = _mm512_max_ps(_mm512_setzero_ps(), dat1891);
__m512 dat1892 = _mm512_maskz_loadu_ps(31, datPtr16+3136+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1892 = _mm512_max_ps(_mm512_setzero_ps(), dat1892);
__m512 dat1893 = _mm512_maskz_loadu_ps(31, datPtr16+6272+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1893 = _mm512_max_ps(_mm512_setzero_ps(), dat1893);
__m512 dat1894 = _mm512_maskz_loadu_ps(31, datPtr16+9408+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1894 = _mm512_max_ps(_mm512_setzero_ps(), dat1894);
__m512i pm184 = _mm512_set_epi32(15, 15, 15, 20, 19, 18, 17, 16, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in1696 = _mm512_permutex2var_ps(dat1891, pm184, dat1892);
__m512 in1701 = _mm512_permutex2var_ps(dat1893, pm184, dat1894);
__m512 dat1895 = _mm512_maskz_loadu_ps(31, datPtr16+112+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1895 = _mm512_max_ps(_mm512_setzero_ps(), dat1895);
__m512 dat1896 = _mm512_maskz_loadu_ps(31, datPtr16+3248+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1896 = _mm512_max_ps(_mm512_setzero_ps(), dat1896);
__m512 dat1897 = _mm512_maskz_loadu_ps(31, datPtr16+6384+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1897 = _mm512_max_ps(_mm512_setzero_ps(), dat1897);
__m512 dat1898 = _mm512_maskz_loadu_ps(31, datPtr16+9520+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1898 = _mm512_max_ps(_mm512_setzero_ps(), dat1898);
__m512 in1697 = _mm512_permutex2var_ps(dat1895, pm184, dat1896);
__m512 in1702 = _mm512_permutex2var_ps(dat1897, pm184, dat1898);
__m512 dat1899 = _mm512_maskz_loadu_ps(31, datPtr16+224+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1899 = _mm512_max_ps(_mm512_setzero_ps(), dat1899);
__m512 dat1900 = _mm512_maskz_loadu_ps(31, datPtr16+3360+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1900 = _mm512_max_ps(_mm512_setzero_ps(), dat1900);
__m512 dat1901 = _mm512_maskz_loadu_ps(31, datPtr16+6496+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1901 = _mm512_max_ps(_mm512_setzero_ps(), dat1901);
__m512 dat1902 = _mm512_maskz_loadu_ps(31, datPtr16+9632+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1902 = _mm512_max_ps(_mm512_setzero_ps(), dat1902);
__m512 in1698 = _mm512_permutex2var_ps(dat1899, pm184, dat1900);
__m512 in1703 = _mm512_permutex2var_ps(dat1901, pm184, dat1902);
__m512 dat1903 = _mm512_maskz_loadu_ps(31, datPtr16+336+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1903 = _mm512_max_ps(_mm512_setzero_ps(), dat1903);
__m512 dat1904 = _mm512_maskz_loadu_ps(31, datPtr16+3472+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1904 = _mm512_max_ps(_mm512_setzero_ps(), dat1904);
__m512 dat1905 = _mm512_maskz_loadu_ps(31, datPtr16+6608+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1905 = _mm512_max_ps(_mm512_setzero_ps(), dat1905);
__m512 dat1906 = _mm512_maskz_loadu_ps(31, datPtr16+9744+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1906 = _mm512_max_ps(_mm512_setzero_ps(), dat1906);
__m512 in1699 = _mm512_permutex2var_ps(dat1903, pm184, dat1904);
__m512 in1704 = _mm512_permutex2var_ps(dat1905, pm184, dat1906);
__m512 dat1907 = _mm512_maskz_loadu_ps(31, datPtr16+448+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1907 = _mm512_max_ps(_mm512_setzero_ps(), dat1907);
__m512 dat1908 = _mm512_maskz_loadu_ps(31, datPtr16+3584+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1908 = _mm512_max_ps(_mm512_setzero_ps(), dat1908);
__m512 dat1909 = _mm512_maskz_loadu_ps(31, datPtr16+6720+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1909 = _mm512_max_ps(_mm512_setzero_ps(), dat1909);
__m512 dat1910 = _mm512_maskz_loadu_ps(31, datPtr16+9856+401408*i35+112*h44+4*w52+401408*s25+12544*k109);
dat1910 = _mm512_max_ps(_mm512_setzero_ps(), dat1910);
__m512 in1700 = _mm512_permutex2var_ps(dat1907, pm184, dat1908);
__m512 in1705 = _mm512_permutex2var_ps(dat1909, pm184, dat1910);
__m512 tmp11619 = in1697;
__m512 tmp11624 = in1702;
__m512 tmp11620 = _mm512_sub_ps(in1700, in1698);
__m512 tmp11625 = _mm512_sub_ps(in1705, in1703);
__m512 tmp11621 = in1698;
__m512 tmp11626 = in1703;
in1696 = in1696;
in1701 = in1701;
tmp11619 = _mm512_fmadd_ps(in1699, _mm512_set1_ps(-4.25e+00f), tmp11619);
tmp11624 = _mm512_fmadd_ps(in1704, _mm512_set1_ps(-4.25e+00f), tmp11624);
tmp11621 = _mm512_fmadd_ps(in1700, _mm512_set1_ps(-4.25e+00f), tmp11621);
tmp11626 = _mm512_fmadd_ps(in1705, _mm512_set1_ps(-4.25e+00f), tmp11626);
in1696 = _mm512_fmadd_ps(tmp11620, _mm512_set1_ps(5.25e+00f), in1696);
in1701 = _mm512_fmadd_ps(tmp11625, _mm512_set1_ps(5.25e+00f), in1701);
tmp11620 = _mm512_mul_ps(in1698, _mm512_set1_ps(2.5e-01f));
tmp11625 = _mm512_mul_ps(in1703, _mm512_set1_ps(2.5e-01f));
in1698 = _mm512_mul_ps(in1698, _mm512_set1_ps(4e+00f));
in1703 = _mm512_mul_ps(in1703, _mm512_set1_ps(4e+00f));
__m512 tmp11622 = _mm512_sub_ps(tmp11621, tmp11619);
__m512 tmp11627 = _mm512_sub_ps(tmp11626, tmp11624);
tmp11621 = _mm512_add_ps(tmp11619, tmp11621);
tmp11626 = _mm512_add_ps(tmp11624, tmp11626);
tmp11619 = _mm512_mul_ps(in1697, _mm512_set1_ps(2.5e-01f));
tmp11624 = _mm512_mul_ps(in1702, _mm512_set1_ps(2.5e-01f));
tmp11620 = _mm512_fmadd_ps(in1700, _mm512_set1_ps(-1.25e+00f), tmp11620);
tmp11625 = _mm512_fmadd_ps(in1705, _mm512_set1_ps(-1.25e+00f), tmp11625);
in1700 = _mm512_fmadd_ps(in1700, _mm512_set1_ps(-5e+00f), in1698);
in1705 = _mm512_fmadd_ps(in1705, _mm512_set1_ps(-5e+00f), in1703);
tmp11619 = _mm512_fmadd_ps(in1699, _mm512_set1_ps(-1.25e+00f), tmp11619);
tmp11624 = _mm512_fmadd_ps(in1704, _mm512_set1_ps(-1.25e+00f), tmp11624);
__m512 tmp11623 = _mm512_fmadd_ps(tmp11619, _mm512_set1_ps(2e+00f), tmp11620);
__m512 tmp11628 = _mm512_fmadd_ps(tmp11624, _mm512_set1_ps(2e+00f), tmp11625);
tmp11620 = _mm512_fnmadd_ps(tmp11619, _mm512_set1_ps(2e+00f), tmp11620);
tmp11625 = _mm512_fnmadd_ps(tmp11624, _mm512_set1_ps(2e+00f), tmp11625);
tmp11619 = in1697;
tmp11624 = in1702;
in1697 = _mm512_sub_ps(_mm512_setzero_ps(), in1697);
in1702 = _mm512_sub_ps(_mm512_setzero_ps(), in1702);
tmp11619 = _mm512_fmadd_ps(in1699, _mm512_set1_ps(-1.25e+00f), tmp11619);
tmp11624 = _mm512_fmadd_ps(in1704, _mm512_set1_ps(-1.25e+00f), tmp11624);
in1699 = in1699;
in1704 = in1704;
in1699 = _mm512_fmadd_ps(in1699, _mm512_set1_ps(5.25e+00f), in1697);
in1704 = _mm512_fmadd_ps(in1704, _mm512_set1_ps(5.25e+00f), in1702);
in1698 = _mm512_fmadd_ps(tmp11619, _mm512_set1_ps(2e+00f), in1700);
in1703 = _mm512_fmadd_ps(tmp11624, _mm512_set1_ps(2e+00f), in1705);
in1700 = _mm512_fnmadd_ps(tmp11619, _mm512_set1_ps(2e+00f), in1700);
in1705 = _mm512_fnmadd_ps(tmp11624, _mm512_set1_ps(2e+00f), in1705);
__m512 tmp11639 = _mm512_unpacklo_ps(in1696, tmp11621);
__m512 tmp11640 = _mm512_unpackhi_ps(in1696, tmp11621);
__m512 tmp11641 = _mm512_unpacklo_ps(tmp11622, tmp11623);
__m512 tmp11642 = _mm512_unpackhi_ps(tmp11622, tmp11623);
__m512 tmp11643 = _mm512_unpacklo_ps(tmp11620, in1698);
__m512 tmp11644 = _mm512_unpackhi_ps(tmp11620, in1698);
__m512 tmp11645 = _mm512_unpacklo_ps(in1700, in1699);
__m512 tmp11646 = _mm512_unpackhi_ps(in1700, in1699);
__m512 tmp11647 = _mm512_unpacklo_ps(in1701, tmp11626);
__m512 tmp11648 = _mm512_unpackhi_ps(in1701, tmp11626);
__m512 tmp11649 = _mm512_unpacklo_ps(tmp11627, tmp11628);
__m512 tmp11650 = _mm512_unpackhi_ps(tmp11627, tmp11628);
__m512 tmp11651 = _mm512_unpacklo_ps(tmp11625, in1703);
__m512 tmp11652 = _mm512_unpackhi_ps(tmp11625, in1703);
__m512 tmp11653 = _mm512_unpacklo_ps(in1705, in1704);
__m512 tmp11654 = _mm512_unpackhi_ps(in1705, in1704);
__m512 tmp11655 = _mm512_shuffle_ps(tmp11639, tmp11641, 68);
__m512 tmp11656 = _mm512_shuffle_ps(tmp11639, tmp11641, 238);
__m512 tmp11657 = _mm512_shuffle_ps(tmp11640, tmp11642, 68);
__m512 tmp11658 = _mm512_shuffle_ps(tmp11640, tmp11642, 238);
__m512 tmp11659 = _mm512_shuffle_ps(tmp11643, tmp11645, 68);
__m512 tmp11660 = _mm512_shuffle_ps(tmp11643, tmp11645, 238);
__m512 tmp11661 = _mm512_shuffle_ps(tmp11644, tmp11646, 68);
__m512 tmp11662 = _mm512_shuffle_ps(tmp11644, tmp11646, 238);
__m512 tmp11663 = _mm512_shuffle_ps(tmp11647, tmp11649, 68);
__m512 tmp11664 = _mm512_shuffle_ps(tmp11647, tmp11649, 238);
__m512 tmp11665 = _mm512_shuffle_ps(tmp11648, tmp11650, 68);
__m512 tmp11666 = _mm512_shuffle_ps(tmp11648, tmp11650, 238);
__m512 tmp11667 = _mm512_shuffle_ps(tmp11651, tmp11653, 68);
__m512 tmp11668 = _mm512_shuffle_ps(tmp11651, tmp11653, 238);
__m512 tmp11669 = _mm512_shuffle_ps(tmp11652, tmp11654, 68);
__m512 tmp11670 = _mm512_shuffle_ps(tmp11652, tmp11654, 238);
__m512 tmp11671 = _mm512_shuffle_f32x4(tmp11655, tmp11659, 136);
__m512 tmp11672 = _mm512_shuffle_f32x4(tmp11655, tmp11659, 221);
__m512 tmp11673 = _mm512_shuffle_f32x4(tmp11656, tmp11660, 136);
__m512 tmp11674 = _mm512_shuffle_f32x4(tmp11656, tmp11660, 221);
__m512 tmp11675 = _mm512_shuffle_f32x4(tmp11657, tmp11661, 136);
__m512 tmp11676 = _mm512_shuffle_f32x4(tmp11657, tmp11661, 221);
__m512 tmp11677 = _mm512_shuffle_f32x4(tmp11658, tmp11662, 136);
__m512 tmp11678 = _mm512_shuffle_f32x4(tmp11658, tmp11662, 221);
__m512 tmp11679 = _mm512_shuffle_f32x4(tmp11663, tmp11667, 136);
__m512 tmp11680 = _mm512_shuffle_f32x4(tmp11663, tmp11667, 221);
__m512 tmp11681 = _mm512_shuffle_f32x4(tmp11664, tmp11668, 136);
__m512 tmp11682 = _mm512_shuffle_f32x4(tmp11664, tmp11668, 221);
__m512 tmp11683 = _mm512_shuffle_f32x4(tmp11665, tmp11669, 136);
__m512 tmp11684 = _mm512_shuffle_f32x4(tmp11665, tmp11669, 221);
__m512 tmp11685 = _mm512_shuffle_f32x4(tmp11666, tmp11670, 136);
__m512 tmp11686 = _mm512_shuffle_f32x4(tmp11666, tmp11670, 221);
in1696 = _mm512_shuffle_f32x4(tmp11671, tmp11679, 136);
in1701 = _mm512_shuffle_f32x4(tmp11671, tmp11679, 221);
tmp11621 = _mm512_shuffle_f32x4(tmp11673, tmp11681, 136);
tmp11626 = _mm512_shuffle_f32x4(tmp11673, tmp11681, 221);
tmp11622 = _mm512_shuffle_f32x4(tmp11675, tmp11683, 136);
tmp11627 = _mm512_shuffle_f32x4(tmp11675, tmp11683, 221);
tmp11623 = _mm512_shuffle_f32x4(tmp11677, tmp11685, 136);
tmp11628 = _mm512_shuffle_f32x4(tmp11677, tmp11685, 221);
tmp11620 = _mm512_shuffle_f32x4(tmp11672, tmp11680, 136);
tmp11625 = _mm512_shuffle_f32x4(tmp11672, tmp11680, 221);
in1698 = _mm512_shuffle_f32x4(tmp11674, tmp11682, 136);
in1700 = _mm512_shuffle_f32x4(tmp11676, tmp11684, 136);
in1699 = _mm512_shuffle_f32x4(tmp11678, tmp11686, 136);
(void)in1698;
(void)in1700;
(void)in1699;
__m512 tmp11629 = tmp11621;
__m512 tmp11634 = tmp11626;
__m512 tmp11630 = _mm512_sub_ps(tmp11620, tmp11622);
__m512 tmp11635 = _mm512_sub_ps(tmp11625, tmp11627);
__m512 tmp11631 = tmp11622;
__m512 tmp11636 = tmp11627;
in1696 = in1696;
in1701 = in1701;
tmp11629 = _mm512_fmadd_ps(tmp11623, _mm512_set1_ps(-4.25e+00f), tmp11629);
tmp11634 = _mm512_fmadd_ps(tmp11628, _mm512_set1_ps(-4.25e+00f), tmp11634);
tmp11631 = _mm512_fmadd_ps(tmp11620, _mm512_set1_ps(-4.25e+00f), tmp11631);
tmp11636 = _mm512_fmadd_ps(tmp11625, _mm512_set1_ps(-4.25e+00f), tmp11636);
in1696 = _mm512_fmadd_ps(tmp11630, _mm512_set1_ps(5.25e+00f), in1696);
in1701 = _mm512_fmadd_ps(tmp11635, _mm512_set1_ps(5.25e+00f), in1701);
tmp11630 = _mm512_mul_ps(tmp11622, _mm512_set1_ps(2.5e-01f));
tmp11635 = _mm512_mul_ps(tmp11627, _mm512_set1_ps(2.5e-01f));
tmp11622 = _mm512_mul_ps(tmp11622, _mm512_set1_ps(4e+00f));
tmp11627 = _mm512_mul_ps(tmp11627, _mm512_set1_ps(4e+00f));
__m512 tmp11632 = _mm512_sub_ps(tmp11631, tmp11629);
__m512 tmp11637 = _mm512_sub_ps(tmp11636, tmp11634);
tmp11631 = _mm512_add_ps(tmp11629, tmp11631);
tmp11636 = _mm512_add_ps(tmp11634, tmp11636);
tmp11629 = _mm512_mul_ps(tmp11621, _mm512_set1_ps(2.5e-01f));
tmp11634 = _mm512_mul_ps(tmp11626, _mm512_set1_ps(2.5e-01f));
tmp11630 = _mm512_fmadd_ps(tmp11620, _mm512_set1_ps(-1.25e+00f), tmp11630);
tmp11635 = _mm512_fmadd_ps(tmp11625, _mm512_set1_ps(-1.25e+00f), tmp11635);
tmp11620 = _mm512_fmadd_ps(tmp11620, _mm512_set1_ps(-5e+00f), tmp11622);
tmp11625 = _mm512_fmadd_ps(tmp11625, _mm512_set1_ps(-5e+00f), tmp11627);
tmp11629 = _mm512_fmadd_ps(tmp11623, _mm512_set1_ps(-1.25e+00f), tmp11629);
tmp11634 = _mm512_fmadd_ps(tmp11628, _mm512_set1_ps(-1.25e+00f), tmp11634);
__m512 tmp11633 = _mm512_fmadd_ps(tmp11629, _mm512_set1_ps(2e+00f), tmp11630);
__m512 tmp11638 = _mm512_fmadd_ps(tmp11634, _mm512_set1_ps(2e+00f), tmp11635);
tmp11630 = _mm512_fnmadd_ps(tmp11629, _mm512_set1_ps(2e+00f), tmp11630);
tmp11635 = _mm512_fnmadd_ps(tmp11634, _mm512_set1_ps(2e+00f), tmp11635);
tmp11629 = tmp11621;
tmp11634 = tmp11626;
tmp11621 = _mm512_sub_ps(_mm512_setzero_ps(), tmp11621);
tmp11626 = _mm512_sub_ps(_mm512_setzero_ps(), tmp11626);
tmp11629 = _mm512_fmadd_ps(tmp11623, _mm512_set1_ps(-1.25e+00f), tmp11629);
tmp11634 = _mm512_fmadd_ps(tmp11628, _mm512_set1_ps(-1.25e+00f), tmp11634);
tmp11623 = tmp11623;
tmp11628 = tmp11628;
tmp11623 = _mm512_fmadd_ps(tmp11623, _mm512_set1_ps(5.25e+00f), tmp11621);
tmp11628 = _mm512_fmadd_ps(tmp11628, _mm512_set1_ps(5.25e+00f), tmp11626);
tmp11622 = _mm512_fmadd_ps(tmp11629, _mm512_set1_ps(2e+00f), tmp11620);
tmp11627 = _mm512_fmadd_ps(tmp11634, _mm512_set1_ps(2e+00f), tmp11625);
tmp11620 = _mm512_fnmadd_ps(tmp11629, _mm512_set1_ps(2e+00f), tmp11620);
tmp11625 = _mm512_fnmadd_ps(tmp11634, _mm512_set1_ps(2e+00f), tmp11625);
__m512 out1583 = _mm512_shuffle_f32x4(in1696, tmp11631, 68);
__m512 out1591 = _mm512_shuffle_f32x4(in1696, tmp11631, 238);
__m512 out1584 = _mm512_shuffle_f32x4(tmp11632, tmp11633, 68);
__m512 out1592 = _mm512_shuffle_f32x4(tmp11632, tmp11633, 238);
__m512 out1585 = _mm512_shuffle_f32x4(tmp11630, tmp11622, 68);
__m512 out1593 = _mm512_shuffle_f32x4(tmp11630, tmp11622, 238);
__m512 out1586 = _mm512_shuffle_f32x4(tmp11620, tmp11623, 68);
__m512 out1594 = _mm512_shuffle_f32x4(tmp11620, tmp11623, 238);
__m512 out1587 = _mm512_shuffle_f32x4(in1701, tmp11636, 68);
__m512 out1595 = _mm512_shuffle_f32x4(in1701, tmp11636, 238);
__m512 out1588 = _mm512_shuffle_f32x4(tmp11637, tmp11638, 68);
__m512 out1596 = _mm512_shuffle_f32x4(tmp11637, tmp11638, 238);
__m512 out1589 = _mm512_shuffle_f32x4(tmp11635, tmp11627, 68);
__m512 out1597 = _mm512_shuffle_f32x4(tmp11635, tmp11627, 238);
__m512 out1590 = _mm512_shuffle_f32x4(tmp11625, tmp11628, 68);
__m512 out1598 = _mm512_shuffle_f32x4(tmp11625, tmp11628, 238);
_mm512_storeu_ps(dfPtr8+0+819200*i35+49152*j28+8192*s25+256*k109, out1583);
_mm512_storeu_ps(dfPtr8+128+819200*i35+49152*j28+8192*s25+256*k109, out1591);
_mm512_storeu_ps(dfPtr8+64+819200*i35+49152*j28+8192*s25+256*k109, out1587);
_mm512_storeu_ps(dfPtr8+192+819200*i35+49152*j28+8192*s25+256*k109, out1595);
_mm512_storeu_ps(dfPtr8+204800+819200*i35+49152*j28+8192*s25+256*k109, out1584);
_mm512_storeu_ps(dfPtr8+204928+819200*i35+49152*j28+8192*s25+256*k109, out1592);
_mm512_storeu_ps(dfPtr8+204864+819200*i35+49152*j28+8192*s25+256*k109, out1588);
_mm512_storeu_ps(dfPtr8+204992+819200*i35+49152*j28+8192*s25+256*k109, out1596);
_mm512_storeu_ps(dfPtr8+409600+819200*i35+49152*j28+8192*s25+256*k109, out1585);
_mm512_storeu_ps(dfPtr8+409728+819200*i35+49152*j28+8192*s25+256*k109, out1593);
_mm512_storeu_ps(dfPtr8+409664+819200*i35+49152*j28+8192*s25+256*k109, out1589);
_mm512_storeu_ps(dfPtr8+409792+819200*i35+49152*j28+8192*s25+256*k109, out1597);
_mm512_storeu_ps(dfPtr8+614400+819200*i35+49152*j28+8192*s25+256*k109, out1586);
_mm512_storeu_ps(dfPtr8+614528+819200*i35+49152*j28+8192*s25+256*k109, out1594);
_mm512_storeu_ps(dfPtr8+614464+819200*i35+49152*j28+8192*s25+256*k109, out1590);
_mm512_storeu_ps(dfPtr8+614592+819200*i35+49152*j28+8192*s25+256*k109, out1598);
}
if (j28 >= last7) return;
++j28;
}

static void ResNet50ThreeArrangeDats3(ResNet50ThreaderTeam1* team40, char** tensors53) {
    /*
     * Builds a 4-dimensional task with extents 1 x 5 x 1 x 1 whose callee is
     * ResNet50ThreeArrangeDats3Callee1 and whose any1 payload is tensors53,
     * then passes it to ResNet50ThreaderDo1 together with team40.
     */
    static const int extents[4] = {1, 5, 1, 1};
    ResNet50ThreaderTask1 job;
    int axis;
    job.callee1 = ResNet50ThreeArrangeDats3Callee1;
    job.any1 = tensors53;
    job.nd1 = 4;
    for (axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    ResNet50ThreaderDo1(team40, &job);
}

static void ResNet50ThreeProduceSums3Callee1(ResNet50ThreaderTask1* task58, int64_t* pt34) {
    /*
     * Threader callee for the ProduceSums3 stage: widens half-precision
     * weights to float, multiply-accumulates them against float data vectors
     * over 128 steps, and stores the resulting sums.
     *
     * task58->any1 is read as an array of pointers whose element 0 is the
     * tensor pointer table. From that table:
     *   [0]  bias floats at byte 0, packed half-precision weights from byte 512
     *   [1]  data vectors consumed by the FMA loops
     *   [2]  destination for the sums
     *
     * pt34[0] selects a run of four consecutive rows (4*pt34[0] .. 4*pt34[0]+3).
     * pt34[1] selects the data/sum tile: any value other than 4 takes the
     *         six-vector-wide path, the value 4 takes the one-vector-wide path.
     * pt34[2] selects the weight/data/sum slab; when it is zero the
     *         accumulators are seeded with the bias instead of all zeros.
     */
    void** args = task58->any1;
    char** tensorTable = args[0];
    const ptrdiff_t eIdx = 0;
    const ptrdiff_t gIdx = 0;
    const ptrdiff_t wCoord = pt34[0];
    const ptrdiff_t dCoord = pt34[1];
    const ptrdiff_t fCoord = pt34[2];
    char*restrict biasBase = tensorTable[0]+512*eIdx;
    char*restrict wtBase = tensorTable[0]+512+6488064*eIdx;
    char*restrict datBase = tensorTable[1]+2534400*eIdx;
    char*restrict sumBase = tensorTable[2];
    const ptrdiff_t i = gIdx;
    const ptrdiff_t j = fCoord;
    ptrdiff_t k = dCoord;
    const ptrdiff_t kStop = k;

    /* Wide path: each row produces a 4 x 6 grid of accumulators. */
    while (k != 4) {
        ptrdiff_t row = 4*wCoord;
        const ptrdiff_t rowLast = row+3;
        while (row != 32) {
            const char* wtRow = wtBase+2097152*i+524288*j+16384*row;
            const char* datTile = datBase+819200*i+204800*j+49152*k;
            char* sumRow = sumBase+819200*i+204800*j+49152*k+1536*row;
            __m512 acc00;
            __m512 acc10;
            __m512 acc20;
            __m512 acc30;
            if (__builtin_expect(j != 0, 1)) {
                /* Expected case: start from zero. */
                acc00 = _mm512_setzero_ps();
                acc10 = _mm512_setzero_ps();
                acc20 = _mm512_setzero_ps();
                acc30 = _mm512_setzero_ps();
            } else {
                /* Mask 512 == 1<<9: the bias value lands in lane 9 only, all other lanes stay zero. */
                const char* biasRow = biasBase+512*i+16*row;
                acc00 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(const float*)(biasRow+0)));
                acc10 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(const float*)(biasRow+4)));
                acc20 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(const float*)(biasRow+8)));
                acc30 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(const float*)(biasRow+12)));
            }
            /* Every accumulator of a weight row starts from that row's seed. */
            __m512 acc01 = acc00, acc02 = acc00, acc03 = acc00, acc04 = acc00, acc05 = acc00;
            __m512 acc11 = acc10, acc12 = acc10, acc13 = acc10, acc14 = acc10, acc15 = acc10;
            __m512 acc21 = acc20, acc22 = acc20, acc23 = acc20, acc24 = acc20, acc25 = acc20;
            __m512 acc31 = acc30, acc32 = acc30, acc33 = acc30, acc34 = acc30, acc35 = acc30;
            for (ptrdiff_t b = 0; b != 128; ++b) {
                const char* wtCol = wtRow+128*b;
                const char* datCol = datTile+384*b;
                /* Six consecutive data vectors shared by all four weight rows. */
                const __m512 d0 = _mm512_loadu_ps(datCol+0);
                const __m512 d1 = _mm512_loadu_ps(datCol+64);
                const __m512 d2 = _mm512_loadu_ps(datCol+128);
                const __m512 d3 = _mm512_loadu_ps(datCol+192);
                const __m512 d4 = _mm512_loadu_ps(datCol+256);
                const __m512 d5 = _mm512_loadu_ps(datCol+320);
                /* Each 64-byte load carries two 16-lane half-precision weight vectors. */
                const __m512i packedA = _mm512_maskz_loadu_epi32(65535, wtCol+0);
                const __m512 w0 = _mm512_cvtph_ps(_mm512_castsi512_si256(packedA));
                acc00 = _mm512_fmadd_ps(w0, d0, acc00);
                acc01 = _mm512_fmadd_ps(w0, d1, acc01);
                acc02 = _mm512_fmadd_ps(w0, d2, acc02);
                acc03 = _mm512_fmadd_ps(w0, d3, acc03);
                acc04 = _mm512_fmadd_ps(w0, d4, acc04);
                acc05 = _mm512_fmadd_ps(w0, d5, acc05);
                const __m512 w1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedA, 1));
                acc10 = _mm512_fmadd_ps(w1, d0, acc10);
                acc11 = _mm512_fmadd_ps(w1, d1, acc11);
                acc12 = _mm512_fmadd_ps(w1, d2, acc12);
                acc13 = _mm512_fmadd_ps(w1, d3, acc13);
                acc14 = _mm512_fmadd_ps(w1, d4, acc14);
                acc15 = _mm512_fmadd_ps(w1, d5, acc15);
                const __m512i packedB = _mm512_maskz_loadu_epi32(65535, wtCol+64);
                const __m512 w2 = _mm512_cvtph_ps(_mm512_castsi512_si256(packedB));
                acc20 = _mm512_fmadd_ps(w2, d0, acc20);
                acc21 = _mm512_fmadd_ps(w2, d1, acc21);
                acc22 = _mm512_fmadd_ps(w2, d2, acc22);
                acc23 = _mm512_fmadd_ps(w2, d3, acc23);
                acc24 = _mm512_fmadd_ps(w2, d4, acc24);
                acc25 = _mm512_fmadd_ps(w2, d5, acc25);
                const __m512 w3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedB, 1));
                acc30 = _mm512_fmadd_ps(w3, d0, acc30);
                acc31 = _mm512_fmadd_ps(w3, d1, acc31);
                acc32 = _mm512_fmadd_ps(w3, d2, acc32);
                acc33 = _mm512_fmadd_ps(w3, d3, acc33);
                acc34 = _mm512_fmadd_ps(w3, d4, acc34);
                acc35 = _mm512_fmadd_ps(w3, d5, acc35);
            }
            /* Write the 24 sums back to back: weight row r, data column c at byte 384*r+64*c. */
            _mm512_storeu_ps(sumRow+0, acc00);
            _mm512_storeu_ps(sumRow+64, acc01);
            _mm512_storeu_ps(sumRow+128, acc02);
            _mm512_storeu_ps(sumRow+192, acc03);
            _mm512_storeu_ps(sumRow+256, acc04);
            _mm512_storeu_ps(sumRow+320, acc05);
            _mm512_storeu_ps(sumRow+384, acc10);
            _mm512_storeu_ps(sumRow+448, acc11);
            _mm512_storeu_ps(sumRow+512, acc12);
            _mm512_storeu_ps(sumRow+576, acc13);
            _mm512_storeu_ps(sumRow+640, acc14);
            _mm512_storeu_ps(sumRow+704, acc15);
            _mm512_storeu_ps(sumRow+768, acc20);
            _mm512_storeu_ps(sumRow+832, acc21);
            _mm512_storeu_ps(sumRow+896, acc22);
            _mm512_storeu_ps(sumRow+960, acc23);
            _mm512_storeu_ps(sumRow+1024, acc24);
            _mm512_storeu_ps(sumRow+1088, acc25);
            _mm512_storeu_ps(sumRow+1152, acc30);
            _mm512_storeu_ps(sumRow+1216, acc31);
            _mm512_storeu_ps(sumRow+1280, acc32);
            _mm512_storeu_ps(sumRow+1344, acc33);
            _mm512_storeu_ps(sumRow+1408, acc34);
            _mm512_storeu_ps(sumRow+1472, acc35);
            /* This call owns rows up to rowLast only. */
            if (row >= rowLast) return;
            ++row;
        }
        /* This call owns a single tile index. */
        if (k >= kStop) return;
        ++k;
    }

    /* Narrow path (k == 4 here): one data vector per step, 4 x 1 accumulators per row. */
    ptrdiff_t tailRow = 4*wCoord;
    const ptrdiff_t tailLast = tailRow+3;
    while (tailRow != 32) {
        const char* wtRow = wtBase+2097152*i+524288*j+16384*tailRow;
        const char* datTile = datBase+819200*i+204800*j+49152*k;
        char* sumRow = sumBase+819200*i+204800*j+49152*k+256*tailRow;
        __m512 acc0;
        __m512 acc1;
        __m512 acc2;
        __m512 acc3;
        if (__builtin_expect(j != 0, 1)) {
            acc0 = _mm512_setzero_ps();
            acc1 = _mm512_setzero_ps();
            acc2 = _mm512_setzero_ps();
            acc3 = _mm512_setzero_ps();
        } else {
            /* Same lane-9 bias seeding as the wide path. */
            const char* biasRow = biasBase+512*i+16*tailRow;
            acc0 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(const float*)(biasRow+0)));
            acc1 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(const float*)(biasRow+4)));
            acc2 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(const float*)(biasRow+8)));
            acc3 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(const float*)(biasRow+12)));
        }
        for (ptrdiff_t b = 0; b != 128; ++b) {
            const char* wtCol = wtRow+128*b;
            const __m512 d = _mm512_loadu_ps(datTile+64*b);
            const __m512i packedA = _mm512_maskz_loadu_epi32(65535, wtCol+0);
            const __m512i packedB = _mm512_maskz_loadu_epi32(65535, wtCol+64);
            acc0 = _mm512_fmadd_ps(_mm512_cvtph_ps(_mm512_castsi512_si256(packedA)), d, acc0);
            acc1 = _mm512_fmadd_ps(_mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedA, 1)), d, acc1);
            acc2 = _mm512_fmadd_ps(_mm512_cvtph_ps(_mm512_castsi512_si256(packedB)), d, acc2);
            acc3 = _mm512_fmadd_ps(_mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedB, 1)), d, acc3);
        }
        _mm512_storeu_ps(sumRow+0, acc0);
        _mm512_storeu_ps(sumRow+64, acc1);
        _mm512_storeu_ps(sumRow+128, acc2);
        _mm512_storeu_ps(sumRow+192, acc3);
        if (tailRow >= tailLast) return;
        ++tailRow;
    }
}

static void ResNet50ThreeProduceSums3(ResNet50ThreaderTeam1* team41, char** tensors55) {
    /*
     * Builds a 4-dimensional task with extents 8 x 5 x 4 x 1 whose callee is
     * ResNet50ThreeProduceSums3Callee1, then passes it to ResNet50ThreaderDo1
     * together with team41. The any1 payload is a two-slot pointer array:
     * slot 0 is tensors55, slot 1 is null.
     */
    static const int extents[4] = {8, 5, 4, 1};
    void* payload[2];
    ResNet50ThreaderTask1 job;
    int axis;
    payload[0] = tensors55;
    payload[1] = 0;
    job.callee1 = ResNet50ThreeProduceSums3Callee1;
    job.any1 = payload;
    job.nd1 = 4;
    for (axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    ResNet50ThreaderDo1(team41, &job);
}

static void ResNet50ThreeConsumeSums3Callee1(ResNet50ThreaderTask1* task60, int64_t* pt35) {
char** tensors58 = task60->any1;
ptrdiff_t w54 = 0;
ptrdiff_t d12 = pt35[1];
ptrdiff_t g21 = 0;
char*restrict sfPtr9 = tensors58[0];
char*restrict datPtr17 = tensors58[1];
ptrdiff_t i37 = 1*g21;
ptrdiff_t j30 = 1*d12;
ptrdiff_t last8 = j30+0;
ptrdiff_t rel20 = j30-0;
ptrdiff_t base20 = 0;
if (rel20 < 2) {
if (rel20 < 1) {
ptrdiff_t toH38 = base20+0;
ptrdiff_t toW38 = 0;
ptrdiff_t k111 = 32*w54;
for (; k111 != 32; ++k111) {
ptrdiff_t l43 = 0;
for (; l43 != 2; ++l43) {
__m512 sf801 = _mm512_loadu_ps(sfPtr9+0+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf802 = _mm512_loadu_ps(sfPtr9+128+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1706 = _mm512_shuffle_f32x4(sf801, sf802, 68);
__m512 in1707 = _mm512_shuffle_f32x4(sf801, sf802, 238);
__m512 sf803 = _mm512_loadu_ps(sfPtr9+64+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf804 = _mm512_loadu_ps(sfPtr9+192+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1714 = _mm512_shuffle_f32x4(sf803, sf804, 68);
__m512 in1715 = _mm512_shuffle_f32x4(sf803, sf804, 238);
__m512 sf805 = _mm512_loadu_ps(sfPtr9+204800+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf806 = _mm512_loadu_ps(sfPtr9+204928+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1708 = _mm512_shuffle_f32x4(sf805, sf806, 68);
__m512 in1709 = _mm512_shuffle_f32x4(sf805, sf806, 238);
__m512 sf807 = _mm512_loadu_ps(sfPtr9+204864+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf808 = _mm512_loadu_ps(sfPtr9+204992+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1716 = _mm512_shuffle_f32x4(sf807, sf808, 68);
__m512 in1717 = _mm512_shuffle_f32x4(sf807, sf808, 238);
__m512 sf809 = _mm512_loadu_ps(sfPtr9+409600+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf810 = _mm512_loadu_ps(sfPtr9+409728+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1710 = _mm512_shuffle_f32x4(sf809, sf810, 68);
__m512 in1711 = _mm512_shuffle_f32x4(sf809, sf810, 238);
__m512 sf811 = _mm512_loadu_ps(sfPtr9+409664+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf812 = _mm512_loadu_ps(sfPtr9+409792+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1718 = _mm512_shuffle_f32x4(sf811, sf812, 68);
__m512 in1719 = _mm512_shuffle_f32x4(sf811, sf812, 238);
__m512 sf813 = _mm512_loadu_ps(sfPtr9+614400+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf814 = _mm512_loadu_ps(sfPtr9+614528+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1712 = _mm512_shuffle_f32x4(sf813, sf814, 68);
__m512 in1713 = _mm512_shuffle_f32x4(sf813, sf814, 238);
__m512 sf815 = _mm512_loadu_ps(sfPtr9+614464+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf816 = _mm512_loadu_ps(sfPtr9+614592+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1720 = _mm512_shuffle_f32x4(sf815, sf816, 68);
__m512 in1721 = _mm512_shuffle_f32x4(sf815, sf816, 238);
__m512 tmp11703 = _mm512_add_ps(in1707, in1708);
__m512 tmp11723 = _mm512_add_ps(in1715, in1716);
__m512 tmp11702 = _mm512_add_ps(in1709, in1710);
__m512 tmp11722 = _mm512_add_ps(in1717, in1718);
__m512 tmp11708 = _mm512_sub_ps(in1709, in1710);
__m512 tmp11728 = _mm512_sub_ps(in1717, in1718);
__m512 tmp11707 = _mm512_sub_ps(in1707, in1708);
__m512 tmp11727 = _mm512_sub_ps(in1715, in1716);
__m512 tmp11704 = _mm512_add_ps(in1711, in1712);
__m512 tmp11724 = _mm512_add_ps(in1719, in1720);
__m512 tmp11709 = _mm512_sub_ps(in1711, in1712);
__m512 tmp11729 = _mm512_sub_ps(in1719, in1720);
__m512 tmp11706 = _mm512_fmadd_ps(tmp11708, _mm512_set1_ps(2e+00f), tmp11707);
__m512 tmp11726 = _mm512_fmadd_ps(tmp11728, _mm512_set1_ps(2e+00f), tmp11727);
__m512 tmp11713 = _mm512_fmadd_ps(tmp11708, _mm512_set1_ps(8e+00f), tmp11707);
__m512 tmp11733 = _mm512_fmadd_ps(tmp11728, _mm512_set1_ps(8e+00f), tmp11727);
__m512 tmp11701 = _mm512_add_ps(tmp11702, tmp11703);
__m512 tmp11721 = _mm512_add_ps(tmp11722, tmp11723);
__m512 tmp11705 = _mm512_fmadd_ps(tmp11709, _mm512_set1_ps(1.6e+01f), tmp11706);
__m512 tmp11725 = _mm512_fmadd_ps(tmp11729, _mm512_set1_ps(1.6e+01f), tmp11726);
__m512 tmp11712 = _mm512_fmadd_ps(tmp11709, _mm512_set1_ps(4e+00f), tmp11713);
__m512 tmp11732 = _mm512_fmadd_ps(tmp11729, _mm512_set1_ps(4e+00f), tmp11733);
__m512 tmp11718 = _mm512_add_ps(tmp11709, tmp11707);
__m512 tmp11738 = _mm512_add_ps(tmp11729, tmp11727);
__m512 tmp11711 = _mm512_fmadd_ps(tmp11702, _mm512_set1_ps(4e+00f), tmp11703);
__m512 tmp11731 = _mm512_fmadd_ps(tmp11722, _mm512_set1_ps(4e+00f), tmp11723);
__m512 tmp11715 = _mm512_fmadd_ps(tmp11702, _mm512_set1_ps(1.6e+01f), tmp11703);
__m512 tmp11735 = _mm512_fmadd_ps(tmp11722, _mm512_set1_ps(1.6e+01f), tmp11723);
__m512 tmp11700 = _mm512_add_ps(tmp11701, in1706);
__m512 tmp11720 = _mm512_add_ps(tmp11721, in1714);
__m512 tmp11717 = _mm512_add_ps(tmp11718, in1713);
__m512 tmp11737 = _mm512_add_ps(tmp11738, in1721);
__m512 tmp11699 = _mm512_fmadd_ps(tmp11704, _mm512_set1_ps(3.2e+01f), tmp11700);
__m512 tmp11719 = _mm512_fmadd_ps(tmp11724, _mm512_set1_ps(3.2e+01f), tmp11720);
__m512 tmp11710 = _mm512_fmadd_ps(tmp11704, _mm512_set1_ps(8e+00f), tmp11711);
__m512 tmp11730 = _mm512_fmadd_ps(tmp11724, _mm512_set1_ps(8e+00f), tmp11731);
__m512 tmp11716 = _mm512_fmadd_ps(tmp11708, _mm512_set1_ps(3.2e+01f), tmp11717);
__m512 tmp11736 = _mm512_fmadd_ps(tmp11728, _mm512_set1_ps(3.2e+01f), tmp11737);
__m512 tmp11714 = _mm512_fmadd_ps(tmp11704, _mm512_set1_ps(2e+00f), tmp11715);
__m512 tmp11734 = _mm512_fmadd_ps(tmp11724, _mm512_set1_ps(2e+00f), tmp11735);
__m512 tmp11687 = tmp11699;
__m512 tmp11693 = tmp11719;
__m512 tmp11688 = tmp11705;
__m512 tmp11694 = tmp11725;
__m512 tmp11689 = tmp11710;
__m512 tmp11695 = tmp11730;
__m512 tmp11690 = tmp11712;
__m512 tmp11696 = tmp11732;
__m512 tmp11691 = tmp11714;
__m512 tmp11697 = tmp11734;
__m512 tmp11692 = tmp11716;
__m512 tmp11698 = tmp11736;
__m512 tmp11783 = _mm512_unpacklo_ps(tmp11687, tmp11688);
__m512 tmp11784 = _mm512_unpackhi_ps(tmp11687, tmp11688);
__m512 tmp11785 = _mm512_unpacklo_ps(tmp11689, tmp11690);
__m512 tmp11786 = _mm512_unpackhi_ps(tmp11689, tmp11690);
__m512 tmp11787 = _mm512_unpacklo_ps(tmp11691, tmp11692);
__m512 tmp11788 = _mm512_unpackhi_ps(tmp11691, tmp11692);
__m512 tmp11789 = _mm512_unpacklo_ps(tmp11693, tmp11694);
__m512 tmp11790 = _mm512_unpackhi_ps(tmp11693, tmp11694);
__m512 tmp11791 = _mm512_unpacklo_ps(tmp11695, tmp11696);
__m512 tmp11792 = _mm512_unpackhi_ps(tmp11695, tmp11696);
__m512 tmp11793 = _mm512_unpacklo_ps(tmp11697, tmp11698);
__m512 tmp11794 = _mm512_unpackhi_ps(tmp11697, tmp11698);
__m512 tmp11795 = _mm512_shuffle_ps(tmp11783, tmp11785, 68);
__m512 tmp11796 = _mm512_shuffle_ps(tmp11783, tmp11785, 238);
__m512 tmp11797 = _mm512_shuffle_ps(tmp11784, tmp11786, 68);
__m512 tmp11798 = _mm512_shuffle_ps(tmp11784, tmp11786, 238);
__m512 tmp11799 = _mm512_shuffle_ps(tmp11787, tmp11789, 68);
__m512 tmp11800 = _mm512_shuffle_ps(tmp11787, tmp11789, 238);
__m512 tmp11801 = _mm512_shuffle_ps(tmp11788, tmp11790, 68);
__m512 tmp11802 = _mm512_shuffle_ps(tmp11788, tmp11790, 238);
__m512 tmp11803 = _mm512_shuffle_ps(tmp11791, tmp11793, 68);
__m512 tmp11804 = _mm512_shuffle_ps(tmp11791, tmp11793, 238);
__m512 tmp11805 = _mm512_shuffle_ps(tmp11792, tmp11794, 68);
__m512 tmp11806 = _mm512_shuffle_ps(tmp11792, tmp11794, 238);
__m512 tmp11807 = _mm512_shuffle_f32x4(tmp11795, tmp11799, 136);
__m512 tmp11808 = _mm512_shuffle_f32x4(tmp11795, tmp11799, 221);
__m512 tmp11809 = _mm512_shuffle_f32x4(tmp11796, tmp11800, 136);
__m512 tmp11810 = _mm512_shuffle_f32x4(tmp11796, tmp11800, 221);
__m512 tmp11811 = _mm512_shuffle_f32x4(tmp11797, tmp11801, 136);
__m512 tmp11812 = _mm512_shuffle_f32x4(tmp11797, tmp11801, 221);
__m512 tmp11813 = _mm512_shuffle_f32x4(tmp11798, tmp11802, 136);
__m512 tmp11814 = _mm512_shuffle_f32x4(tmp11798, tmp11802, 221);
__m512 tmp11815 = _mm512_shuffle_f32x4(tmp11803, tmp11803, 136);
__m512 tmp11816 = _mm512_shuffle_f32x4(tmp11803, tmp11803, 221);
__m512 tmp11817 = _mm512_shuffle_f32x4(tmp11804, tmp11804, 136);
__m512 tmp11818 = _mm512_shuffle_f32x4(tmp11804, tmp11804, 221);
__m512 tmp11819 = _mm512_shuffle_f32x4(tmp11805, tmp11805, 136);
__m512 tmp11820 = _mm512_shuffle_f32x4(tmp11805, tmp11805, 221);
__m512 tmp11821 = _mm512_shuffle_f32x4(tmp11806, tmp11806, 136);
__m512 tmp11822 = _mm512_shuffle_f32x4(tmp11806, tmp11806, 221);
tmp11687 = _mm512_shuffle_f32x4(tmp11807, tmp11815, 136);
tmp11695 = _mm512_shuffle_f32x4(tmp11807, tmp11815, 221);
tmp11688 = _mm512_shuffle_f32x4(tmp11809, tmp11817, 136);
tmp11696 = _mm512_shuffle_f32x4(tmp11809, tmp11817, 221);
tmp11689 = _mm512_shuffle_f32x4(tmp11811, tmp11819, 136);
tmp11697 = _mm512_shuffle_f32x4(tmp11811, tmp11819, 221);
tmp11690 = _mm512_shuffle_f32x4(tmp11813, tmp11821, 136);
tmp11698 = _mm512_shuffle_f32x4(tmp11813, tmp11821, 221);
tmp11691 = _mm512_shuffle_f32x4(tmp11808, tmp11816, 136);
__m512 tmp11739 = _mm512_shuffle_f32x4(tmp11808, tmp11816, 221);
tmp11692 = _mm512_shuffle_f32x4(tmp11810, tmp11818, 136);
__m512 tmp11740 = _mm512_shuffle_f32x4(tmp11810, tmp11818, 221);
tmp11693 = _mm512_shuffle_f32x4(tmp11812, tmp11820, 136);
__m512 tmp11741 = _mm512_shuffle_f32x4(tmp11812, tmp11820, 221);
tmp11694 = _mm512_shuffle_f32x4(tmp11814, tmp11822, 136);
__m512 tmp11742 = _mm512_shuffle_f32x4(tmp11814, tmp11822, 221);
__m512 tmp11747 = _mm512_add_ps(tmp11688, tmp11689);
__m512 tmp11767 = _mm512_add_ps(tmp11696, tmp11697);
__m512 tmp11746 = _mm512_add_ps(tmp11690, tmp11691);
__m512 tmp11766 = _mm512_add_ps(tmp11698, tmp11739);
__m512 tmp11752 = _mm512_sub_ps(tmp11690, tmp11691);
__m512 tmp11772 = _mm512_sub_ps(tmp11698, tmp11739);
__m512 tmp11751 = _mm512_sub_ps(tmp11688, tmp11689);
__m512 tmp11771 = _mm512_sub_ps(tmp11696, tmp11697);
__m512 tmp11748 = _mm512_add_ps(tmp11692, tmp11693);
__m512 tmp11768 = _mm512_add_ps(tmp11740, tmp11741);
__m512 tmp11753 = _mm512_sub_ps(tmp11692, tmp11693);
__m512 tmp11773 = _mm512_sub_ps(tmp11740, tmp11741);
__m512 tmp11750 = _mm512_fmadd_ps(tmp11752, _mm512_set1_ps(2e+00f), tmp11751);
__m512 tmp11770 = _mm512_fmadd_ps(tmp11772, _mm512_set1_ps(2e+00f), tmp11771);
__m512 tmp11757 = _mm512_fmadd_ps(tmp11752, _mm512_set1_ps(8e+00f), tmp11751);
__m512 tmp11777 = _mm512_fmadd_ps(tmp11772, _mm512_set1_ps(8e+00f), tmp11771);
__m512 tmp11745 = _mm512_add_ps(tmp11746, tmp11747);
__m512 tmp11765 = _mm512_add_ps(tmp11766, tmp11767);
__m512 tmp11749 = _mm512_fmadd_ps(tmp11753, _mm512_set1_ps(1.6e+01f), tmp11750);
__m512 tmp11769 = _mm512_fmadd_ps(tmp11773, _mm512_set1_ps(1.6e+01f), tmp11770);
__m512 tmp11756 = _mm512_fmadd_ps(tmp11753, _mm512_set1_ps(4e+00f), tmp11757);
__m512 tmp11776 = _mm512_fmadd_ps(tmp11773, _mm512_set1_ps(4e+00f), tmp11777);
__m512 tmp11762 = _mm512_add_ps(tmp11753, tmp11751);
__m512 tmp11782 = _mm512_add_ps(tmp11773, tmp11771);
__m512 tmp11755 = _mm512_fmadd_ps(tmp11746, _mm512_set1_ps(4e+00f), tmp11747);
__m512 tmp11775 = _mm512_fmadd_ps(tmp11766, _mm512_set1_ps(4e+00f), tmp11767);
__m512 tmp11759 = _mm512_fmadd_ps(tmp11746, _mm512_set1_ps(1.6e+01f), tmp11747);
__m512 tmp11779 = _mm512_fmadd_ps(tmp11766, _mm512_set1_ps(1.6e+01f), tmp11767);
__m512 tmp11744 = _mm512_add_ps(tmp11745, tmp11687);
__m512 tmp11764 = _mm512_add_ps(tmp11765, tmp11695);
__m512 tmp11761 = _mm512_add_ps(tmp11762, tmp11694);
__m512 tmp11781 = _mm512_add_ps(tmp11782, tmp11742);
__m512 tmp11743 = _mm512_fmadd_ps(tmp11748, _mm512_set1_ps(3.2e+01f), tmp11744);
__m512 tmp11763 = _mm512_fmadd_ps(tmp11768, _mm512_set1_ps(3.2e+01f), tmp11764);
__m512 tmp11754 = _mm512_fmadd_ps(tmp11748, _mm512_set1_ps(8e+00f), tmp11755);
__m512 tmp11774 = _mm512_fmadd_ps(tmp11768, _mm512_set1_ps(8e+00f), tmp11775);
__m512 tmp11760 = _mm512_fmadd_ps(tmp11752, _mm512_set1_ps(3.2e+01f), tmp11761);
__m512 tmp11780 = _mm512_fmadd_ps(tmp11772, _mm512_set1_ps(3.2e+01f), tmp11781);
__m512 tmp11758 = _mm512_fmadd_ps(tmp11748, _mm512_set1_ps(2e+00f), tmp11759);
__m512 tmp11778 = _mm512_fmadd_ps(tmp11768, _mm512_set1_ps(2e+00f), tmp11779);
__m512 out1599 = tmp11743;
__m512 out1605 = tmp11763;
__m512 out1600 = tmp11749;
__m512 out1606 = tmp11769;
__m512 out1601 = tmp11754;
__m512 out1607 = tmp11774;
__m512 out1602 = tmp11756;
__m512 out1608 = tmp11776;
__m512 out1603 = tmp11758;
__m512 out1609 = tmp11778;
__m512 out1604 = tmp11760;
__m512 out1610 = tmp11780;
out1599 = _mm512_max_ps(_mm512_setzero_ps(), out1599);
out1605 = _mm512_max_ps(_mm512_setzero_ps(), out1605);
out1600 = _mm512_max_ps(_mm512_setzero_ps(), out1600);
out1606 = _mm512_max_ps(_mm512_setzero_ps(), out1606);
out1601 = _mm512_max_ps(_mm512_setzero_ps(), out1601);
out1607 = _mm512_max_ps(_mm512_setzero_ps(), out1607);
out1602 = _mm512_max_ps(_mm512_setzero_ps(), out1602);
out1608 = _mm512_max_ps(_mm512_setzero_ps(), out1608);
out1603 = _mm512_max_ps(_mm512_setzero_ps(), out1603);
out1609 = _mm512_max_ps(_mm512_setzero_ps(), out1609);
out1604 = _mm512_max_ps(_mm512_setzero_ps(), out1604);
out1610 = _mm512_max_ps(_mm512_setzero_ps(), out1610);
_mm512_mask_storeu_ps(datPtr17+0+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1599);
_mm512_mask_storeu_ps(datPtr17+48+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1605);
_mm512_mask_storeu_ps(datPtr17+112+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1600);
_mm512_mask_storeu_ps(datPtr17+160+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1606);
_mm512_mask_storeu_ps(datPtr17+224+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1601);
_mm512_mask_storeu_ps(datPtr17+272+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1607);
_mm512_mask_storeu_ps(datPtr17+336+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1602);
_mm512_mask_storeu_ps(datPtr17+384+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1608);
_mm512_mask_storeu_ps(datPtr17+448+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1603);
_mm512_mask_storeu_ps(datPtr17+496+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1609);
_mm512_mask_storeu_ps(datPtr17+560+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1604);
_mm512_mask_storeu_ps(datPtr17+608+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1610);
__m512 sf817 = _mm512_loadu_ps(sfPtr9+256+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf818 = _mm512_loadu_ps(sfPtr9+384+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1722 = _mm512_shuffle_f32x4(sf818, sf817, 68);
__m512 in1723 = _mm512_shuffle_f32x4(sf818, sf817, 238);
__m512 sf819 = _mm512_loadu_ps(sfPtr9+320+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf820 = _mm512_loadu_ps(sfPtr9+448+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1730 = _mm512_shuffle_f32x4(sf820, sf819, 68);
__m512 in1731 = _mm512_shuffle_f32x4(sf820, sf819, 238);
__m512 sf821 = _mm512_loadu_ps(sfPtr9+205056+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf822 = _mm512_loadu_ps(sfPtr9+205184+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1724 = _mm512_shuffle_f32x4(sf822, sf821, 68);
__m512 in1725 = _mm512_shuffle_f32x4(sf822, sf821, 238);
__m512 sf823 = _mm512_loadu_ps(sfPtr9+205120+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf824 = _mm512_loadu_ps(sfPtr9+205248+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1732 = _mm512_shuffle_f32x4(sf824, sf823, 68);
__m512 in1733 = _mm512_shuffle_f32x4(sf824, sf823, 238);
__m512 sf825 = _mm512_loadu_ps(sfPtr9+409856+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf826 = _mm512_loadu_ps(sfPtr9+409984+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1726 = _mm512_shuffle_f32x4(sf826, sf825, 68);
__m512 in1727 = _mm512_shuffle_f32x4(sf826, sf825, 238);
__m512 sf827 = _mm512_loadu_ps(sfPtr9+409920+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf828 = _mm512_loadu_ps(sfPtr9+410048+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1734 = _mm512_shuffle_f32x4(sf828, sf827, 68);
__m512 in1735 = _mm512_shuffle_f32x4(sf828, sf827, 238);
__m512 sf829 = _mm512_loadu_ps(sfPtr9+614656+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf830 = _mm512_loadu_ps(sfPtr9+614784+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1728 = _mm512_shuffle_f32x4(sf830, sf829, 68);
__m512 in1729 = _mm512_shuffle_f32x4(sf830, sf829, 238);
__m512 sf831 = _mm512_loadu_ps(sfPtr9+614720+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf832 = _mm512_loadu_ps(sfPtr9+614848+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1736 = _mm512_shuffle_f32x4(sf832, sf831, 68);
__m512 in1737 = _mm512_shuffle_f32x4(sf832, sf831, 238);
__m512 tmp11839 = _mm512_add_ps(in1723, in1724);
__m512 tmp11859 = _mm512_add_ps(in1731, in1732);
__m512 tmp11838 = _mm512_add_ps(in1725, in1726);
__m512 tmp11858 = _mm512_add_ps(in1733, in1734);
__m512 tmp11844 = _mm512_sub_ps(in1725, in1726);
__m512 tmp11864 = _mm512_sub_ps(in1733, in1734);
__m512 tmp11843 = _mm512_sub_ps(in1723, in1724);
__m512 tmp11863 = _mm512_sub_ps(in1731, in1732);
__m512 tmp11840 = _mm512_add_ps(in1727, in1728);
__m512 tmp11860 = _mm512_add_ps(in1735, in1736);
__m512 tmp11845 = _mm512_sub_ps(in1727, in1728);
__m512 tmp11865 = _mm512_sub_ps(in1735, in1736);
__m512 tmp11842 = _mm512_fmadd_ps(tmp11844, _mm512_set1_ps(2e+00f), tmp11843);
__m512 tmp11862 = _mm512_fmadd_ps(tmp11864, _mm512_set1_ps(2e+00f), tmp11863);
__m512 tmp11849 = _mm512_fmadd_ps(tmp11844, _mm512_set1_ps(8e+00f), tmp11843);
__m512 tmp11869 = _mm512_fmadd_ps(tmp11864, _mm512_set1_ps(8e+00f), tmp11863);
__m512 tmp11837 = _mm512_add_ps(tmp11838, tmp11839);
__m512 tmp11857 = _mm512_add_ps(tmp11858, tmp11859);
__m512 tmp11841 = _mm512_fmadd_ps(tmp11845, _mm512_set1_ps(1.6e+01f), tmp11842);
__m512 tmp11861 = _mm512_fmadd_ps(tmp11865, _mm512_set1_ps(1.6e+01f), tmp11862);
__m512 tmp11848 = _mm512_fmadd_ps(tmp11845, _mm512_set1_ps(4e+00f), tmp11849);
__m512 tmp11868 = _mm512_fmadd_ps(tmp11865, _mm512_set1_ps(4e+00f), tmp11869);
__m512 tmp11854 = _mm512_add_ps(tmp11845, tmp11843);
__m512 tmp11874 = _mm512_add_ps(tmp11865, tmp11863);
__m512 tmp11847 = _mm512_fmadd_ps(tmp11838, _mm512_set1_ps(4e+00f), tmp11839);
__m512 tmp11867 = _mm512_fmadd_ps(tmp11858, _mm512_set1_ps(4e+00f), tmp11859);
__m512 tmp11851 = _mm512_fmadd_ps(tmp11838, _mm512_set1_ps(1.6e+01f), tmp11839);
__m512 tmp11871 = _mm512_fmadd_ps(tmp11858, _mm512_set1_ps(1.6e+01f), tmp11859);
__m512 tmp11836 = _mm512_add_ps(tmp11837, in1722);
__m512 tmp11856 = _mm512_add_ps(tmp11857, in1730);
__m512 tmp11853 = _mm512_add_ps(tmp11854, in1729);
__m512 tmp11873 = _mm512_add_ps(tmp11874, in1737);
__m512 tmp11835 = _mm512_fmadd_ps(tmp11840, _mm512_set1_ps(3.2e+01f), tmp11836);
__m512 tmp11855 = _mm512_fmadd_ps(tmp11860, _mm512_set1_ps(3.2e+01f), tmp11856);
__m512 tmp11846 = _mm512_fmadd_ps(tmp11840, _mm512_set1_ps(8e+00f), tmp11847);
__m512 tmp11866 = _mm512_fmadd_ps(tmp11860, _mm512_set1_ps(8e+00f), tmp11867);
__m512 tmp11852 = _mm512_fmadd_ps(tmp11844, _mm512_set1_ps(3.2e+01f), tmp11853);
__m512 tmp11872 = _mm512_fmadd_ps(tmp11864, _mm512_set1_ps(3.2e+01f), tmp11873);
__m512 tmp11850 = _mm512_fmadd_ps(tmp11840, _mm512_set1_ps(2e+00f), tmp11851);
__m512 tmp11870 = _mm512_fmadd_ps(tmp11860, _mm512_set1_ps(2e+00f), tmp11871);
__m512 tmp11823 = tmp11835;
__m512 tmp11829 = tmp11855;
__m512 tmp11824 = tmp11841;
__m512 tmp11830 = tmp11861;
__m512 tmp11825 = tmp11846;
__m512 tmp11831 = tmp11866;
__m512 tmp11826 = tmp11848;
__m512 tmp11832 = tmp11868;
__m512 tmp11827 = tmp11850;
__m512 tmp11833 = tmp11870;
__m512 tmp11828 = tmp11852;
__m512 tmp11834 = tmp11872;
__m512 tmp11919 = _mm512_unpacklo_ps(tmp11823, tmp11824);
__m512 tmp11920 = _mm512_unpackhi_ps(tmp11823, tmp11824);
__m512 tmp11921 = _mm512_unpacklo_ps(tmp11825, tmp11826);
__m512 tmp11922 = _mm512_unpackhi_ps(tmp11825, tmp11826);
__m512 tmp11923 = _mm512_unpacklo_ps(tmp11827, tmp11828);
__m512 tmp11924 = _mm512_unpackhi_ps(tmp11827, tmp11828);
__m512 tmp11925 = _mm512_unpacklo_ps(tmp11829, tmp11830);
__m512 tmp11926 = _mm512_unpackhi_ps(tmp11829, tmp11830);
__m512 tmp11927 = _mm512_unpacklo_ps(tmp11831, tmp11832);
__m512 tmp11928 = _mm512_unpackhi_ps(tmp11831, tmp11832);
__m512 tmp11929 = _mm512_unpacklo_ps(tmp11833, tmp11834);
__m512 tmp11930 = _mm512_unpackhi_ps(tmp11833, tmp11834);
__m512 tmp11931 = _mm512_shuffle_ps(tmp11919, tmp11921, 68);
__m512 tmp11932 = _mm512_shuffle_ps(tmp11919, tmp11921, 238);
__m512 tmp11933 = _mm512_shuffle_ps(tmp11920, tmp11922, 68);
__m512 tmp11934 = _mm512_shuffle_ps(tmp11920, tmp11922, 238);
__m512 tmp11935 = _mm512_shuffle_ps(tmp11923, tmp11925, 68);
__m512 tmp11936 = _mm512_shuffle_ps(tmp11923, tmp11925, 238);
__m512 tmp11937 = _mm512_shuffle_ps(tmp11924, tmp11926, 68);
__m512 tmp11938 = _mm512_shuffle_ps(tmp11924, tmp11926, 238);
__m512 tmp11939 = _mm512_shuffle_ps(tmp11927, tmp11929, 68);
__m512 tmp11940 = _mm512_shuffle_ps(tmp11927, tmp11929, 238);
__m512 tmp11941 = _mm512_shuffle_ps(tmp11928, tmp11930, 68);
__m512 tmp11942 = _mm512_shuffle_ps(tmp11928, tmp11930, 238);
__m512 tmp11943 = _mm512_shuffle_f32x4(tmp11931, tmp11935, 136);
__m512 tmp11944 = _mm512_shuffle_f32x4(tmp11931, tmp11935, 221);
__m512 tmp11945 = _mm512_shuffle_f32x4(tmp11932, tmp11936, 136);
__m512 tmp11946 = _mm512_shuffle_f32x4(tmp11932, tmp11936, 221);
__m512 tmp11947 = _mm512_shuffle_f32x4(tmp11933, tmp11937, 136);
__m512 tmp11948 = _mm512_shuffle_f32x4(tmp11933, tmp11937, 221);
__m512 tmp11949 = _mm512_shuffle_f32x4(tmp11934, tmp11938, 136);
__m512 tmp11950 = _mm512_shuffle_f32x4(tmp11934, tmp11938, 221);
__m512 tmp11951 = _mm512_shuffle_f32x4(tmp11939, tmp11939, 136);
__m512 tmp11952 = _mm512_shuffle_f32x4(tmp11939, tmp11939, 221);
__m512 tmp11953 = _mm512_shuffle_f32x4(tmp11940, tmp11940, 136);
__m512 tmp11954 = _mm512_shuffle_f32x4(tmp11940, tmp11940, 221);
__m512 tmp11955 = _mm512_shuffle_f32x4(tmp11941, tmp11941, 136);
__m512 tmp11956 = _mm512_shuffle_f32x4(tmp11941, tmp11941, 221);
__m512 tmp11957 = _mm512_shuffle_f32x4(tmp11942, tmp11942, 136);
__m512 tmp11958 = _mm512_shuffle_f32x4(tmp11942, tmp11942, 221);
tmp11823 = _mm512_shuffle_f32x4(tmp11943, tmp11951, 136);
tmp11831 = _mm512_shuffle_f32x4(tmp11943, tmp11951, 221);
tmp11824 = _mm512_shuffle_f32x4(tmp11945, tmp11953, 136);
tmp11832 = _mm512_shuffle_f32x4(tmp11945, tmp11953, 221);
tmp11825 = _mm512_shuffle_f32x4(tmp11947, tmp11955, 136);
tmp11833 = _mm512_shuffle_f32x4(tmp11947, tmp11955, 221);
tmp11826 = _mm512_shuffle_f32x4(tmp11949, tmp11957, 136);
tmp11834 = _mm512_shuffle_f32x4(tmp11949, tmp11957, 221);
tmp11827 = _mm512_shuffle_f32x4(tmp11944, tmp11952, 136);
__m512 tmp11875 = _mm512_shuffle_f32x4(tmp11944, tmp11952, 221);
tmp11828 = _mm512_shuffle_f32x4(tmp11946, tmp11954, 136);
__m512 tmp11876 = _mm512_shuffle_f32x4(tmp11946, tmp11954, 221);
tmp11829 = _mm512_shuffle_f32x4(tmp11948, tmp11956, 136);
__m512 tmp11877 = _mm512_shuffle_f32x4(tmp11948, tmp11956, 221);
tmp11830 = _mm512_shuffle_f32x4(tmp11950, tmp11958, 136);
__m512 tmp11878 = _mm512_shuffle_f32x4(tmp11950, tmp11958, 221);
__m512 tmp11883 = _mm512_add_ps(tmp11824, tmp11825);
__m512 tmp11903 = _mm512_add_ps(tmp11832, tmp11833);
__m512 tmp11882 = _mm512_add_ps(tmp11826, tmp11827);
__m512 tmp11902 = _mm512_add_ps(tmp11834, tmp11875);
__m512 tmp11888 = _mm512_sub_ps(tmp11826, tmp11827);
__m512 tmp11908 = _mm512_sub_ps(tmp11834, tmp11875);
__m512 tmp11887 = _mm512_sub_ps(tmp11824, tmp11825);
__m512 tmp11907 = _mm512_sub_ps(tmp11832, tmp11833);
__m512 tmp11884 = _mm512_add_ps(tmp11828, tmp11829);
__m512 tmp11904 = _mm512_add_ps(tmp11876, tmp11877);
__m512 tmp11889 = _mm512_sub_ps(tmp11828, tmp11829);
__m512 tmp11909 = _mm512_sub_ps(tmp11876, tmp11877);
__m512 tmp11886 = _mm512_fmadd_ps(tmp11888, _mm512_set1_ps(2e+00f), tmp11887);
__m512 tmp11906 = _mm512_fmadd_ps(tmp11908, _mm512_set1_ps(2e+00f), tmp11907);
__m512 tmp11893 = _mm512_fmadd_ps(tmp11888, _mm512_set1_ps(8e+00f), tmp11887);
__m512 tmp11913 = _mm512_fmadd_ps(tmp11908, _mm512_set1_ps(8e+00f), tmp11907);
__m512 tmp11881 = _mm512_add_ps(tmp11882, tmp11883);
__m512 tmp11901 = _mm512_add_ps(tmp11902, tmp11903);
__m512 tmp11885 = _mm512_fmadd_ps(tmp11889, _mm512_set1_ps(1.6e+01f), tmp11886);
__m512 tmp11905 = _mm512_fmadd_ps(tmp11909, _mm512_set1_ps(1.6e+01f), tmp11906);
__m512 tmp11892 = _mm512_fmadd_ps(tmp11889, _mm512_set1_ps(4e+00f), tmp11893);
__m512 tmp11912 = _mm512_fmadd_ps(tmp11909, _mm512_set1_ps(4e+00f), tmp11913);
__m512 tmp11898 = _mm512_add_ps(tmp11889, tmp11887);
__m512 tmp11918 = _mm512_add_ps(tmp11909, tmp11907);
__m512 tmp11891 = _mm512_fmadd_ps(tmp11882, _mm512_set1_ps(4e+00f), tmp11883);
__m512 tmp11911 = _mm512_fmadd_ps(tmp11902, _mm512_set1_ps(4e+00f), tmp11903);
__m512 tmp11895 = _mm512_fmadd_ps(tmp11882, _mm512_set1_ps(1.6e+01f), tmp11883);
__m512 tmp11915 = _mm512_fmadd_ps(tmp11902, _mm512_set1_ps(1.6e+01f), tmp11903);
__m512 tmp11880 = _mm512_add_ps(tmp11881, tmp11823);
__m512 tmp11900 = _mm512_add_ps(tmp11901, tmp11831);
__m512 tmp11897 = _mm512_add_ps(tmp11898, tmp11830);
__m512 tmp11917 = _mm512_add_ps(tmp11918, tmp11878);
__m512 tmp11879 = _mm512_fmadd_ps(tmp11884, _mm512_set1_ps(3.2e+01f), tmp11880);
__m512 tmp11899 = _mm512_fmadd_ps(tmp11904, _mm512_set1_ps(3.2e+01f), tmp11900);
__m512 tmp11890 = _mm512_fmadd_ps(tmp11884, _mm512_set1_ps(8e+00f), tmp11891);
__m512 tmp11910 = _mm512_fmadd_ps(tmp11904, _mm512_set1_ps(8e+00f), tmp11911);
__m512 tmp11896 = _mm512_fmadd_ps(tmp11888, _mm512_set1_ps(3.2e+01f), tmp11897);
__m512 tmp11916 = _mm512_fmadd_ps(tmp11908, _mm512_set1_ps(3.2e+01f), tmp11917);
__m512 tmp11894 = _mm512_fmadd_ps(tmp11884, _mm512_set1_ps(2e+00f), tmp11895);
__m512 tmp11914 = _mm512_fmadd_ps(tmp11904, _mm512_set1_ps(2e+00f), tmp11915);
__m512 out1617 = tmp11879;
__m512 out1611 = tmp11899;
__m512 out1618 = tmp11885;
__m512 out1612 = tmp11905;
__m512 out1619 = tmp11890;
__m512 out1613 = tmp11910;
__m512 out1620 = tmp11892;
__m512 out1614 = tmp11912;
__m512 out1621 = tmp11894;
__m512 out1615 = tmp11914;
__m512 out1622 = tmp11896;
__m512 out1616 = tmp11916;
out1617 = _mm512_max_ps(_mm512_setzero_ps(), out1617);
out1611 = _mm512_max_ps(_mm512_setzero_ps(), out1611);
out1618 = _mm512_max_ps(_mm512_setzero_ps(), out1618);
out1612 = _mm512_max_ps(_mm512_setzero_ps(), out1612);
out1619 = _mm512_max_ps(_mm512_setzero_ps(), out1619);
out1613 = _mm512_max_ps(_mm512_setzero_ps(), out1613);
out1620 = _mm512_max_ps(_mm512_setzero_ps(), out1620);
out1614 = _mm512_max_ps(_mm512_setzero_ps(), out1614);
out1621 = _mm512_max_ps(_mm512_setzero_ps(), out1621);
out1615 = _mm512_max_ps(_mm512_setzero_ps(), out1615);
out1622 = _mm512_max_ps(_mm512_setzero_ps(), out1622);
out1616 = _mm512_max_ps(_mm512_setzero_ps(), out1616);
_mm512_mask_storeu_ps(datPtr17+3136+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1617);
_mm512_mask_storeu_ps(datPtr17+96+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 15, out1611);
_mm512_mask_storeu_ps(datPtr17+648+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4032, out1611);
_mm512_mask_storeu_ps(datPtr17+3248+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1618);
_mm512_mask_storeu_ps(datPtr17+208+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 15, out1612);
_mm512_mask_storeu_ps(datPtr17+760+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4032, out1612);
_mm512_mask_storeu_ps(datPtr17+3360+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1619);
_mm512_mask_storeu_ps(datPtr17+320+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 15, out1613);
_mm512_mask_storeu_ps(datPtr17+872+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4032, out1613);
_mm512_mask_storeu_ps(datPtr17+3472+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1620);
_mm512_mask_storeu_ps(datPtr17+432+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 15, out1614);
_mm512_mask_storeu_ps(datPtr17+984+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4032, out1614);
_mm512_mask_storeu_ps(datPtr17+3584+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1621);
_mm512_mask_storeu_ps(datPtr17+544+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 15, out1615);
_mm512_mask_storeu_ps(datPtr17+1096+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4032, out1615);
_mm512_mask_storeu_ps(datPtr17+3696+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1622);
_mm512_mask_storeu_ps(datPtr17+656+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 15, out1616);
_mm512_mask_storeu_ps(datPtr17+1208+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4032, out1616);
// ---- Tile group fed by sf833..sf848 --------------------------------------
// Straight-line sequence: load 16 vectors from sfPtr9, apply an 8 -> 6 linear
// combination, rearrange lanes, apply the same 8 -> 6 combination again,
// clamp at zero, and write the results to datPtr17 with masked stores.
// NOTE(review): the 8 -> 6 combination with weights 1/2/4/8/16/32 has the
// shape of a Winograd-style output transform for a 3x3 convolution -- confirm
// against the generator.
// NOTE(review): the literal offsets look like byte offsets (consecutive loads
// are 64 apart, i.e. one __m512) -- confirm sfPtr9/datPtr17 are byte pointers.
//
// Loads: four regions spaced 204800 apart each supply four vectors.
// _mm512_shuffle_f32x4(a, b, 68) yields {a.lo256, b.lo256} and
// _mm512_shuffle_f32x4(a, b, 238) yields {a.hi256, b.hi256}, so every pair of
// loads produces two inputs (in17xx even/odd) of the combination below.
__m512 sf833 = _mm512_loadu_ps(sfPtr9+512+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf834 = _mm512_loadu_ps(sfPtr9+640+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1738 = _mm512_shuffle_f32x4(sf833, sf834, 68);
__m512 in1739 = _mm512_shuffle_f32x4(sf833, sf834, 238);
__m512 sf835 = _mm512_loadu_ps(sfPtr9+576+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf836 = _mm512_loadu_ps(sfPtr9+704+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1746 = _mm512_shuffle_f32x4(sf835, sf836, 68);
__m512 in1747 = _mm512_shuffle_f32x4(sf835, sf836, 238);
__m512 sf837 = _mm512_loadu_ps(sfPtr9+205312+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf838 = _mm512_loadu_ps(sfPtr9+205440+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1740 = _mm512_shuffle_f32x4(sf837, sf838, 68);
__m512 in1741 = _mm512_shuffle_f32x4(sf837, sf838, 238);
__m512 sf839 = _mm512_loadu_ps(sfPtr9+205376+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf840 = _mm512_loadu_ps(sfPtr9+205504+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1748 = _mm512_shuffle_f32x4(sf839, sf840, 68);
__m512 in1749 = _mm512_shuffle_f32x4(sf839, sf840, 238);
__m512 sf841 = _mm512_loadu_ps(sfPtr9+410112+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf842 = _mm512_loadu_ps(sfPtr9+410240+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1742 = _mm512_shuffle_f32x4(sf841, sf842, 68);
__m512 in1743 = _mm512_shuffle_f32x4(sf841, sf842, 238);
__m512 sf843 = _mm512_loadu_ps(sfPtr9+410176+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf844 = _mm512_loadu_ps(sfPtr9+410304+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1750 = _mm512_shuffle_f32x4(sf843, sf844, 68);
__m512 in1751 = _mm512_shuffle_f32x4(sf843, sf844, 238);
__m512 sf845 = _mm512_loadu_ps(sfPtr9+614912+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf846 = _mm512_loadu_ps(sfPtr9+615040+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1744 = _mm512_shuffle_f32x4(sf845, sf846, 68);
__m512 in1745 = _mm512_shuffle_f32x4(sf845, sf846, 238);
__m512 sf847 = _mm512_loadu_ps(sfPtr9+614976+819200*i37+49152*j30+1536*k111+768*l43);
__m512 sf848 = _mm512_loadu_ps(sfPtr9+615104+819200*i37+49152*j30+1536*k111+768*l43);
__m512 in1752 = _mm512_shuffle_f32x4(sf847, sf848, 68);
__m512 in1753 = _mm512_shuffle_f32x4(sf847, sf848, 238);
// First 8 -> 6 combination, done for two interleaved input sets.
// With x0..x7 = in1738..in1745 (second set: in1746..in1753) the results are
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// The formulas are the mathematical values; the rounding follows the exact
// add/sub/fmadd sequence written below.
__m512 tmp11975 = _mm512_add_ps(in1739, in1740);
__m512 tmp11995 = _mm512_add_ps(in1747, in1748);
__m512 tmp11974 = _mm512_add_ps(in1741, in1742);
__m512 tmp11994 = _mm512_add_ps(in1749, in1750);
__m512 tmp11980 = _mm512_sub_ps(in1741, in1742);
__m512 tmp12000 = _mm512_sub_ps(in1749, in1750);
__m512 tmp11979 = _mm512_sub_ps(in1739, in1740);
__m512 tmp11999 = _mm512_sub_ps(in1747, in1748);
__m512 tmp11976 = _mm512_add_ps(in1743, in1744);
__m512 tmp11996 = _mm512_add_ps(in1751, in1752);
__m512 tmp11981 = _mm512_sub_ps(in1743, in1744);
__m512 tmp12001 = _mm512_sub_ps(in1751, in1752);
__m512 tmp11978 = _mm512_fmadd_ps(tmp11980, _mm512_set1_ps(2e+00f), tmp11979);
__m512 tmp11998 = _mm512_fmadd_ps(tmp12000, _mm512_set1_ps(2e+00f), tmp11999);
__m512 tmp11985 = _mm512_fmadd_ps(tmp11980, _mm512_set1_ps(8e+00f), tmp11979);
__m512 tmp12005 = _mm512_fmadd_ps(tmp12000, _mm512_set1_ps(8e+00f), tmp11999);
__m512 tmp11973 = _mm512_add_ps(tmp11974, tmp11975);
__m512 tmp11993 = _mm512_add_ps(tmp11994, tmp11995);
__m512 tmp11977 = _mm512_fmadd_ps(tmp11981, _mm512_set1_ps(1.6e+01f), tmp11978);
__m512 tmp11997 = _mm512_fmadd_ps(tmp12001, _mm512_set1_ps(1.6e+01f), tmp11998);
__m512 tmp11984 = _mm512_fmadd_ps(tmp11981, _mm512_set1_ps(4e+00f), tmp11985);
__m512 tmp12004 = _mm512_fmadd_ps(tmp12001, _mm512_set1_ps(4e+00f), tmp12005);
__m512 tmp11990 = _mm512_add_ps(tmp11981, tmp11979);
__m512 tmp12010 = _mm512_add_ps(tmp12001, tmp11999);
__m512 tmp11983 = _mm512_fmadd_ps(tmp11974, _mm512_set1_ps(4e+00f), tmp11975);
__m512 tmp12003 = _mm512_fmadd_ps(tmp11994, _mm512_set1_ps(4e+00f), tmp11995);
__m512 tmp11987 = _mm512_fmadd_ps(tmp11974, _mm512_set1_ps(1.6e+01f), tmp11975);
__m512 tmp12007 = _mm512_fmadd_ps(tmp11994, _mm512_set1_ps(1.6e+01f), tmp11995);
__m512 tmp11972 = _mm512_add_ps(tmp11973, in1738);
__m512 tmp11992 = _mm512_add_ps(tmp11993, in1746);
__m512 tmp11989 = _mm512_add_ps(tmp11990, in1745);
__m512 tmp12009 = _mm512_add_ps(tmp12010, in1753);
__m512 tmp11971 = _mm512_fmadd_ps(tmp11976, _mm512_set1_ps(3.2e+01f), tmp11972);
__m512 tmp11991 = _mm512_fmadd_ps(tmp11996, _mm512_set1_ps(3.2e+01f), tmp11992);
__m512 tmp11982 = _mm512_fmadd_ps(tmp11976, _mm512_set1_ps(8e+00f), tmp11983);
__m512 tmp12002 = _mm512_fmadd_ps(tmp11996, _mm512_set1_ps(8e+00f), tmp12003);
__m512 tmp11988 = _mm512_fmadd_ps(tmp11980, _mm512_set1_ps(3.2e+01f), tmp11989);
__m512 tmp12008 = _mm512_fmadd_ps(tmp12000, _mm512_set1_ps(3.2e+01f), tmp12009);
__m512 tmp11986 = _mm512_fmadd_ps(tmp11976, _mm512_set1_ps(2e+00f), tmp11987);
__m512 tmp12006 = _mm512_fmadd_ps(tmp11996, _mm512_set1_ps(2e+00f), tmp12007);
// Collect the twelve results: tmp11959..tmp11964 = y0..y5 of the first set,
// tmp11965..tmp11970 = y0..y5 of the second set.
__m512 tmp11959 = tmp11971;
__m512 tmp11965 = tmp11991;
__m512 tmp11960 = tmp11977;
__m512 tmp11966 = tmp11997;
__m512 tmp11961 = tmp11982;
__m512 tmp11967 = tmp12002;
__m512 tmp11962 = tmp11984;
__m512 tmp11968 = tmp12004;
__m512 tmp11963 = tmp11986;
__m512 tmp11969 = tmp12006;
__m512 tmp11964 = tmp11988;
__m512 tmp11970 = tmp12008;
// Lane rearrangement: unpacklo/unpackhi, then shuffle_ps (68/238), then
// shuffle_f32x4 (136 = even 128-bit lanes, 221 = odd 128-bit lanes) turn the
// 12 vectors above into 16 vectors (tmp11959..tmp11970, tmp12011..tmp12014).
// NOTE(review): the pattern appears to be a 12x16 -> 16x12 transpose, leaving
// 12 meaningful lanes per vector (the stores below use 12-bit masks) -- confirm.
__m512 tmp12055 = _mm512_unpacklo_ps(tmp11959, tmp11960);
__m512 tmp12056 = _mm512_unpackhi_ps(tmp11959, tmp11960);
__m512 tmp12057 = _mm512_unpacklo_ps(tmp11961, tmp11962);
__m512 tmp12058 = _mm512_unpackhi_ps(tmp11961, tmp11962);
__m512 tmp12059 = _mm512_unpacklo_ps(tmp11963, tmp11964);
__m512 tmp12060 = _mm512_unpackhi_ps(tmp11963, tmp11964);
__m512 tmp12061 = _mm512_unpacklo_ps(tmp11965, tmp11966);
__m512 tmp12062 = _mm512_unpackhi_ps(tmp11965, tmp11966);
__m512 tmp12063 = _mm512_unpacklo_ps(tmp11967, tmp11968);
__m512 tmp12064 = _mm512_unpackhi_ps(tmp11967, tmp11968);
__m512 tmp12065 = _mm512_unpacklo_ps(tmp11969, tmp11970);
__m512 tmp12066 = _mm512_unpackhi_ps(tmp11969, tmp11970);
__m512 tmp12067 = _mm512_shuffle_ps(tmp12055, tmp12057, 68);
__m512 tmp12068 = _mm512_shuffle_ps(tmp12055, tmp12057, 238);
__m512 tmp12069 = _mm512_shuffle_ps(tmp12056, tmp12058, 68);
__m512 tmp12070 = _mm512_shuffle_ps(tmp12056, tmp12058, 238);
__m512 tmp12071 = _mm512_shuffle_ps(tmp12059, tmp12061, 68);
__m512 tmp12072 = _mm512_shuffle_ps(tmp12059, tmp12061, 238);
__m512 tmp12073 = _mm512_shuffle_ps(tmp12060, tmp12062, 68);
__m512 tmp12074 = _mm512_shuffle_ps(tmp12060, tmp12062, 238);
__m512 tmp12075 = _mm512_shuffle_ps(tmp12063, tmp12065, 68);
__m512 tmp12076 = _mm512_shuffle_ps(tmp12063, tmp12065, 238);
__m512 tmp12077 = _mm512_shuffle_ps(tmp12064, tmp12066, 68);
__m512 tmp12078 = _mm512_shuffle_ps(tmp12064, tmp12066, 238);
__m512 tmp12079 = _mm512_shuffle_f32x4(tmp12067, tmp12071, 136);
__m512 tmp12080 = _mm512_shuffle_f32x4(tmp12067, tmp12071, 221);
__m512 tmp12081 = _mm512_shuffle_f32x4(tmp12068, tmp12072, 136);
__m512 tmp12082 = _mm512_shuffle_f32x4(tmp12068, tmp12072, 221);
__m512 tmp12083 = _mm512_shuffle_f32x4(tmp12069, tmp12073, 136);
__m512 tmp12084 = _mm512_shuffle_f32x4(tmp12069, tmp12073, 221);
__m512 tmp12085 = _mm512_shuffle_f32x4(tmp12070, tmp12074, 136);
__m512 tmp12086 = _mm512_shuffle_f32x4(tmp12070, tmp12074, 221);
__m512 tmp12087 = _mm512_shuffle_f32x4(tmp12075, tmp12075, 136);
__m512 tmp12088 = _mm512_shuffle_f32x4(tmp12075, tmp12075, 221);
__m512 tmp12089 = _mm512_shuffle_f32x4(tmp12076, tmp12076, 136);
__m512 tmp12090 = _mm512_shuffle_f32x4(tmp12076, tmp12076, 221);
__m512 tmp12091 = _mm512_shuffle_f32x4(tmp12077, tmp12077, 136);
__m512 tmp12092 = _mm512_shuffle_f32x4(tmp12077, tmp12077, 221);
__m512 tmp12093 = _mm512_shuffle_f32x4(tmp12078, tmp12078, 136);
__m512 tmp12094 = _mm512_shuffle_f32x4(tmp12078, tmp12078, 221);
tmp11959 = _mm512_shuffle_f32x4(tmp12079, tmp12087, 136);
tmp11967 = _mm512_shuffle_f32x4(tmp12079, tmp12087, 221);
tmp11960 = _mm512_shuffle_f32x4(tmp12081, tmp12089, 136);
tmp11968 = _mm512_shuffle_f32x4(tmp12081, tmp12089, 221);
tmp11961 = _mm512_shuffle_f32x4(tmp12083, tmp12091, 136);
tmp11969 = _mm512_shuffle_f32x4(tmp12083, tmp12091, 221);
tmp11962 = _mm512_shuffle_f32x4(tmp12085, tmp12093, 136);
tmp11970 = _mm512_shuffle_f32x4(tmp12085, tmp12093, 221);
tmp11963 = _mm512_shuffle_f32x4(tmp12080, tmp12088, 136);
__m512 tmp12011 = _mm512_shuffle_f32x4(tmp12080, tmp12088, 221);
tmp11964 = _mm512_shuffle_f32x4(tmp12082, tmp12090, 136);
__m512 tmp12012 = _mm512_shuffle_f32x4(tmp12082, tmp12090, 221);
tmp11965 = _mm512_shuffle_f32x4(tmp12084, tmp12092, 136);
__m512 tmp12013 = _mm512_shuffle_f32x4(tmp12084, tmp12092, 221);
tmp11966 = _mm512_shuffle_f32x4(tmp12086, tmp12094, 136);
__m512 tmp12014 = _mm512_shuffle_f32x4(tmp12086, tmp12094, 221);
// Second 8 -> 6 combination, same coefficients as the first one.
// First set:  x0..x7 = tmp11959..tmp11966.
// Second set: x0..x7 = tmp11967..tmp11970, tmp12011..tmp12014.
__m512 tmp12019 = _mm512_add_ps(tmp11960, tmp11961);
__m512 tmp12039 = _mm512_add_ps(tmp11968, tmp11969);
__m512 tmp12018 = _mm512_add_ps(tmp11962, tmp11963);
__m512 tmp12038 = _mm512_add_ps(tmp11970, tmp12011);
__m512 tmp12024 = _mm512_sub_ps(tmp11962, tmp11963);
__m512 tmp12044 = _mm512_sub_ps(tmp11970, tmp12011);
__m512 tmp12023 = _mm512_sub_ps(tmp11960, tmp11961);
__m512 tmp12043 = _mm512_sub_ps(tmp11968, tmp11969);
__m512 tmp12020 = _mm512_add_ps(tmp11964, tmp11965);
__m512 tmp12040 = _mm512_add_ps(tmp12012, tmp12013);
__m512 tmp12025 = _mm512_sub_ps(tmp11964, tmp11965);
__m512 tmp12045 = _mm512_sub_ps(tmp12012, tmp12013);
__m512 tmp12022 = _mm512_fmadd_ps(tmp12024, _mm512_set1_ps(2e+00f), tmp12023);
__m512 tmp12042 = _mm512_fmadd_ps(tmp12044, _mm512_set1_ps(2e+00f), tmp12043);
__m512 tmp12029 = _mm512_fmadd_ps(tmp12024, _mm512_set1_ps(8e+00f), tmp12023);
__m512 tmp12049 = _mm512_fmadd_ps(tmp12044, _mm512_set1_ps(8e+00f), tmp12043);
__m512 tmp12017 = _mm512_add_ps(tmp12018, tmp12019);
__m512 tmp12037 = _mm512_add_ps(tmp12038, tmp12039);
__m512 tmp12021 = _mm512_fmadd_ps(tmp12025, _mm512_set1_ps(1.6e+01f), tmp12022);
__m512 tmp12041 = _mm512_fmadd_ps(tmp12045, _mm512_set1_ps(1.6e+01f), tmp12042);
__m512 tmp12028 = _mm512_fmadd_ps(tmp12025, _mm512_set1_ps(4e+00f), tmp12029);
__m512 tmp12048 = _mm512_fmadd_ps(tmp12045, _mm512_set1_ps(4e+00f), tmp12049);
__m512 tmp12034 = _mm512_add_ps(tmp12025, tmp12023);
__m512 tmp12054 = _mm512_add_ps(tmp12045, tmp12043);
__m512 tmp12027 = _mm512_fmadd_ps(tmp12018, _mm512_set1_ps(4e+00f), tmp12019);
__m512 tmp12047 = _mm512_fmadd_ps(tmp12038, _mm512_set1_ps(4e+00f), tmp12039);
__m512 tmp12031 = _mm512_fmadd_ps(tmp12018, _mm512_set1_ps(1.6e+01f), tmp12019);
__m512 tmp12051 = _mm512_fmadd_ps(tmp12038, _mm512_set1_ps(1.6e+01f), tmp12039);
__m512 tmp12016 = _mm512_add_ps(tmp12017, tmp11959);
__m512 tmp12036 = _mm512_add_ps(tmp12037, tmp11967);
__m512 tmp12033 = _mm512_add_ps(tmp12034, tmp11966);
__m512 tmp12053 = _mm512_add_ps(tmp12054, tmp12014);
__m512 tmp12015 = _mm512_fmadd_ps(tmp12020, _mm512_set1_ps(3.2e+01f), tmp12016);
__m512 tmp12035 = _mm512_fmadd_ps(tmp12040, _mm512_set1_ps(3.2e+01f), tmp12036);
__m512 tmp12026 = _mm512_fmadd_ps(tmp12020, _mm512_set1_ps(8e+00f), tmp12027);
__m512 tmp12046 = _mm512_fmadd_ps(tmp12040, _mm512_set1_ps(8e+00f), tmp12047);
__m512 tmp12032 = _mm512_fmadd_ps(tmp12024, _mm512_set1_ps(3.2e+01f), tmp12033);
__m512 tmp12052 = _mm512_fmadd_ps(tmp12044, _mm512_set1_ps(3.2e+01f), tmp12053);
__m512 tmp12030 = _mm512_fmadd_ps(tmp12020, _mm512_set1_ps(2e+00f), tmp12031);
__m512 tmp12050 = _mm512_fmadd_ps(tmp12040, _mm512_set1_ps(2e+00f), tmp12051);
// out1623..out1628 = y0..y5 of the first set, out1629..out1634 = y0..y5 of
// the second set.
__m512 out1623 = tmp12015;
__m512 out1629 = tmp12035;
__m512 out1624 = tmp12021;
__m512 out1630 = tmp12041;
__m512 out1625 = tmp12026;
__m512 out1631 = tmp12046;
__m512 out1626 = tmp12028;
__m512 out1632 = tmp12048;
__m512 out1627 = tmp12030;
__m512 out1633 = tmp12050;
__m512 out1628 = tmp12032;
__m512 out1634 = tmp12052;
// Clamp every lane at zero: out = max(0, out).
out1623 = _mm512_max_ps(_mm512_setzero_ps(), out1623);
out1629 = _mm512_max_ps(_mm512_setzero_ps(), out1629);
out1624 = _mm512_max_ps(_mm512_setzero_ps(), out1624);
out1630 = _mm512_max_ps(_mm512_setzero_ps(), out1630);
out1625 = _mm512_max_ps(_mm512_setzero_ps(), out1625);
out1631 = _mm512_max_ps(_mm512_setzero_ps(), out1631);
out1626 = _mm512_max_ps(_mm512_setzero_ps(), out1626);
out1632 = _mm512_max_ps(_mm512_setzero_ps(), out1632);
out1627 = _mm512_max_ps(_mm512_setzero_ps(), out1627);
out1633 = _mm512_max_ps(_mm512_setzero_ps(), out1633);
out1628 = _mm512_max_ps(_mm512_setzero_ps(), out1628);
out1634 = _mm512_max_ps(_mm512_setzero_ps(), out1634);
// Masked stores to datPtr17; successive y-indices are 112 apart.
//   mask 4095 -> lanes 0..11 (first-set vectors, stored whole)
//   mask 15   -> lanes 0..3  (second-set vectors, first piece)
//   mask 4032 -> lanes 6..11 (second-set vectors, second piece; lane 6 lands
//                24 bytes past the stated address)
// Lanes 4..5 of the second-set vectors are never written.
// NOTE(review): the split second-set store presumably continues the data at
// the start of a later row (3784 + 24 = 3136 + 6*112) -- confirm.
_mm512_mask_storeu_ps(datPtr17+3184+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1623);
_mm512_mask_storeu_ps(datPtr17+3232+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 15, out1629);
_mm512_mask_storeu_ps(datPtr17+3784+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4032, out1629);
_mm512_mask_storeu_ps(datPtr17+3296+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1624);
_mm512_mask_storeu_ps(datPtr17+3344+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 15, out1630);
_mm512_mask_storeu_ps(datPtr17+3896+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4032, out1630);
_mm512_mask_storeu_ps(datPtr17+3408+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1625);
_mm512_mask_storeu_ps(datPtr17+3456+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 15, out1631);
_mm512_mask_storeu_ps(datPtr17+4008+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4032, out1631);
_mm512_mask_storeu_ps(datPtr17+3520+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1626);
_mm512_mask_storeu_ps(datPtr17+3568+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 15, out1632);
_mm512_mask_storeu_ps(datPtr17+4120+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4032, out1632);
_mm512_mask_storeu_ps(datPtr17+3632+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1627);
_mm512_mask_storeu_ps(datPtr17+3680+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 15, out1633);
_mm512_mask_storeu_ps(datPtr17+4232+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4032, out1633);
_mm512_mask_storeu_ps(datPtr17+3744+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4095, out1628);
_mm512_mask_storeu_ps(datPtr17+3792+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 15, out1634);
_mm512_mask_storeu_ps(datPtr17+4344+401408*i37+112*toH38+4*toW38+12544*k111+6272*l43, 4032, out1634);
}
}
// Stop once j30 has reached last8; otherwise move on to the next j30.
// rel20 is set to 1 for the following pass; its meaning is defined outside
// this excerpt.
if (j30 >= last8) return;
++j30;
rel20 = 1;
}
// Output coordinates used by the stores of the following loop nest:
// toH39 is scaled by 112 and toW39 by 4 in the store addresses.
ptrdiff_t toH39 = base20+6;
ptrdiff_t toW39 = 6;
// The k112 loop runs from 32*w54 up to (not including) 32.
// NOTE(review): presumably w54 is 0 or 1, so the loop either runs all 32
// iterations or none -- confirm; any other value would never hit k112 == 32.
ptrdiff_t k112 = 32*w54;
for (; k112 != 32; ++k112) {
ptrdiff_t l44 = 0;
for (; l44 != 2; ++l44) {
// ---- Tile group fed by sf849..sf864 --------------------------------------
// Straight-line sequence: load 16 vectors from sfPtr9, apply an 8 -> 6 linear
// combination, rearrange lanes, apply the same 8 -> 6 combination again,
// clamp at zero, and write the results to datPtr17 with masked stores.
// NOTE(review): the 8 -> 6 combination with weights 1/2/4/8/16/32 has the
// shape of a Winograd-style output transform for a 3x3 convolution -- confirm
// against the generator.
// NOTE(review): the literal offsets look like byte offsets (consecutive loads
// are 64 apart, i.e. one __m512) -- confirm sfPtr9/datPtr17 are byte pointers.
//
// Loads: four regions spaced 204800 apart each supply four vectors.
// _mm512_shuffle_f32x4(a, b, 68) yields {a.lo256, b.lo256} and
// _mm512_shuffle_f32x4(a, b, 238) yields {a.hi256, b.hi256}, so every pair of
// loads produces two inputs (in17xx even/odd) of the combination below.
__m512 sf849 = _mm512_loadu_ps(sfPtr9+0+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf850 = _mm512_loadu_ps(sfPtr9+128+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1754 = _mm512_shuffle_f32x4(sf849, sf850, 68);
__m512 in1755 = _mm512_shuffle_f32x4(sf849, sf850, 238);
__m512 sf851 = _mm512_loadu_ps(sfPtr9+64+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf852 = _mm512_loadu_ps(sfPtr9+192+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1762 = _mm512_shuffle_f32x4(sf851, sf852, 68);
__m512 in1763 = _mm512_shuffle_f32x4(sf851, sf852, 238);
__m512 sf853 = _mm512_loadu_ps(sfPtr9+204800+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf854 = _mm512_loadu_ps(sfPtr9+204928+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1756 = _mm512_shuffle_f32x4(sf853, sf854, 68);
__m512 in1757 = _mm512_shuffle_f32x4(sf853, sf854, 238);
__m512 sf855 = _mm512_loadu_ps(sfPtr9+204864+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf856 = _mm512_loadu_ps(sfPtr9+204992+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1764 = _mm512_shuffle_f32x4(sf855, sf856, 68);
__m512 in1765 = _mm512_shuffle_f32x4(sf855, sf856, 238);
__m512 sf857 = _mm512_loadu_ps(sfPtr9+409600+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf858 = _mm512_loadu_ps(sfPtr9+409728+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1758 = _mm512_shuffle_f32x4(sf857, sf858, 68);
__m512 in1759 = _mm512_shuffle_f32x4(sf857, sf858, 238);
__m512 sf859 = _mm512_loadu_ps(sfPtr9+409664+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf860 = _mm512_loadu_ps(sfPtr9+409792+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1766 = _mm512_shuffle_f32x4(sf859, sf860, 68);
__m512 in1767 = _mm512_shuffle_f32x4(sf859, sf860, 238);
__m512 sf861 = _mm512_loadu_ps(sfPtr9+614400+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf862 = _mm512_loadu_ps(sfPtr9+614528+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1760 = _mm512_shuffle_f32x4(sf861, sf862, 68);
__m512 in1761 = _mm512_shuffle_f32x4(sf861, sf862, 238);
__m512 sf863 = _mm512_loadu_ps(sfPtr9+614464+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf864 = _mm512_loadu_ps(sfPtr9+614592+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1768 = _mm512_shuffle_f32x4(sf863, sf864, 68);
__m512 in1769 = _mm512_shuffle_f32x4(sf863, sf864, 238);
// First 8 -> 6 combination, done for two interleaved input sets.
// With x0..x7 = in1754..in1761 (second set: in1762..in1769) the results are
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// The formulas are the mathematical values; the rounding follows the exact
// add/sub/fmadd sequence written below.
__m512 tmp12111 = _mm512_add_ps(in1755, in1756);
__m512 tmp12131 = _mm512_add_ps(in1763, in1764);
__m512 tmp12110 = _mm512_add_ps(in1757, in1758);
__m512 tmp12130 = _mm512_add_ps(in1765, in1766);
__m512 tmp12116 = _mm512_sub_ps(in1757, in1758);
__m512 tmp12136 = _mm512_sub_ps(in1765, in1766);
__m512 tmp12115 = _mm512_sub_ps(in1755, in1756);
__m512 tmp12135 = _mm512_sub_ps(in1763, in1764);
__m512 tmp12112 = _mm512_add_ps(in1759, in1760);
__m512 tmp12132 = _mm512_add_ps(in1767, in1768);
__m512 tmp12117 = _mm512_sub_ps(in1759, in1760);
__m512 tmp12137 = _mm512_sub_ps(in1767, in1768);
__m512 tmp12114 = _mm512_fmadd_ps(tmp12116, _mm512_set1_ps(2e+00f), tmp12115);
__m512 tmp12134 = _mm512_fmadd_ps(tmp12136, _mm512_set1_ps(2e+00f), tmp12135);
__m512 tmp12121 = _mm512_fmadd_ps(tmp12116, _mm512_set1_ps(8e+00f), tmp12115);
__m512 tmp12141 = _mm512_fmadd_ps(tmp12136, _mm512_set1_ps(8e+00f), tmp12135);
__m512 tmp12109 = _mm512_add_ps(tmp12110, tmp12111);
__m512 tmp12129 = _mm512_add_ps(tmp12130, tmp12131);
__m512 tmp12113 = _mm512_fmadd_ps(tmp12117, _mm512_set1_ps(1.6e+01f), tmp12114);
__m512 tmp12133 = _mm512_fmadd_ps(tmp12137, _mm512_set1_ps(1.6e+01f), tmp12134);
__m512 tmp12120 = _mm512_fmadd_ps(tmp12117, _mm512_set1_ps(4e+00f), tmp12121);
__m512 tmp12140 = _mm512_fmadd_ps(tmp12137, _mm512_set1_ps(4e+00f), tmp12141);
__m512 tmp12126 = _mm512_add_ps(tmp12117, tmp12115);
__m512 tmp12146 = _mm512_add_ps(tmp12137, tmp12135);
__m512 tmp12119 = _mm512_fmadd_ps(tmp12110, _mm512_set1_ps(4e+00f), tmp12111);
__m512 tmp12139 = _mm512_fmadd_ps(tmp12130, _mm512_set1_ps(4e+00f), tmp12131);
__m512 tmp12123 = _mm512_fmadd_ps(tmp12110, _mm512_set1_ps(1.6e+01f), tmp12111);
__m512 tmp12143 = _mm512_fmadd_ps(tmp12130, _mm512_set1_ps(1.6e+01f), tmp12131);
__m512 tmp12108 = _mm512_add_ps(tmp12109, in1754);
__m512 tmp12128 = _mm512_add_ps(tmp12129, in1762);
__m512 tmp12125 = _mm512_add_ps(tmp12126, in1761);
__m512 tmp12145 = _mm512_add_ps(tmp12146, in1769);
__m512 tmp12107 = _mm512_fmadd_ps(tmp12112, _mm512_set1_ps(3.2e+01f), tmp12108);
__m512 tmp12127 = _mm512_fmadd_ps(tmp12132, _mm512_set1_ps(3.2e+01f), tmp12128);
__m512 tmp12118 = _mm512_fmadd_ps(tmp12112, _mm512_set1_ps(8e+00f), tmp12119);
__m512 tmp12138 = _mm512_fmadd_ps(tmp12132, _mm512_set1_ps(8e+00f), tmp12139);
__m512 tmp12124 = _mm512_fmadd_ps(tmp12116, _mm512_set1_ps(3.2e+01f), tmp12125);
__m512 tmp12144 = _mm512_fmadd_ps(tmp12136, _mm512_set1_ps(3.2e+01f), tmp12145);
__m512 tmp12122 = _mm512_fmadd_ps(tmp12112, _mm512_set1_ps(2e+00f), tmp12123);
__m512 tmp12142 = _mm512_fmadd_ps(tmp12132, _mm512_set1_ps(2e+00f), tmp12143);
// Collect the twelve results: tmp12095..tmp12100 = y0..y5 of the first set,
// tmp12101..tmp12106 = y0..y5 of the second set.
__m512 tmp12095 = tmp12107;
__m512 tmp12101 = tmp12127;
__m512 tmp12096 = tmp12113;
__m512 tmp12102 = tmp12133;
__m512 tmp12097 = tmp12118;
__m512 tmp12103 = tmp12138;
__m512 tmp12098 = tmp12120;
__m512 tmp12104 = tmp12140;
__m512 tmp12099 = tmp12122;
__m512 tmp12105 = tmp12142;
__m512 tmp12100 = tmp12124;
__m512 tmp12106 = tmp12144;
// Lane rearrangement: unpacklo/unpackhi, then shuffle_ps (68/238), then
// shuffle_f32x4 (136 = even 128-bit lanes, 221 = odd 128-bit lanes) turn the
// 12 vectors above into 16 vectors (tmp12095..tmp12106, tmp12147..tmp12150).
// NOTE(review): the pattern appears to be a 12x16 -> 16x12 transpose, leaving
// 12 meaningful lanes per vector (the stores below use masks of at most 12
// bits) -- confirm.
__m512 tmp12191 = _mm512_unpacklo_ps(tmp12095, tmp12096);
__m512 tmp12192 = _mm512_unpackhi_ps(tmp12095, tmp12096);
__m512 tmp12193 = _mm512_unpacklo_ps(tmp12097, tmp12098);
__m512 tmp12194 = _mm512_unpackhi_ps(tmp12097, tmp12098);
__m512 tmp12195 = _mm512_unpacklo_ps(tmp12099, tmp12100);
__m512 tmp12196 = _mm512_unpackhi_ps(tmp12099, tmp12100);
__m512 tmp12197 = _mm512_unpacklo_ps(tmp12101, tmp12102);
__m512 tmp12198 = _mm512_unpackhi_ps(tmp12101, tmp12102);
__m512 tmp12199 = _mm512_unpacklo_ps(tmp12103, tmp12104);
__m512 tmp12200 = _mm512_unpackhi_ps(tmp12103, tmp12104);
__m512 tmp12201 = _mm512_unpacklo_ps(tmp12105, tmp12106);
__m512 tmp12202 = _mm512_unpackhi_ps(tmp12105, tmp12106);
__m512 tmp12203 = _mm512_shuffle_ps(tmp12191, tmp12193, 68);
__m512 tmp12204 = _mm512_shuffle_ps(tmp12191, tmp12193, 238);
__m512 tmp12205 = _mm512_shuffle_ps(tmp12192, tmp12194, 68);
__m512 tmp12206 = _mm512_shuffle_ps(tmp12192, tmp12194, 238);
__m512 tmp12207 = _mm512_shuffle_ps(tmp12195, tmp12197, 68);
__m512 tmp12208 = _mm512_shuffle_ps(tmp12195, tmp12197, 238);
__m512 tmp12209 = _mm512_shuffle_ps(tmp12196, tmp12198, 68);
__m512 tmp12210 = _mm512_shuffle_ps(tmp12196, tmp12198, 238);
__m512 tmp12211 = _mm512_shuffle_ps(tmp12199, tmp12201, 68);
__m512 tmp12212 = _mm512_shuffle_ps(tmp12199, tmp12201, 238);
__m512 tmp12213 = _mm512_shuffle_ps(tmp12200, tmp12202, 68);
__m512 tmp12214 = _mm512_shuffle_ps(tmp12200, tmp12202, 238);
__m512 tmp12215 = _mm512_shuffle_f32x4(tmp12203, tmp12207, 136);
__m512 tmp12216 = _mm512_shuffle_f32x4(tmp12203, tmp12207, 221);
__m512 tmp12217 = _mm512_shuffle_f32x4(tmp12204, tmp12208, 136);
__m512 tmp12218 = _mm512_shuffle_f32x4(tmp12204, tmp12208, 221);
__m512 tmp12219 = _mm512_shuffle_f32x4(tmp12205, tmp12209, 136);
__m512 tmp12220 = _mm512_shuffle_f32x4(tmp12205, tmp12209, 221);
__m512 tmp12221 = _mm512_shuffle_f32x4(tmp12206, tmp12210, 136);
__m512 tmp12222 = _mm512_shuffle_f32x4(tmp12206, tmp12210, 221);
__m512 tmp12223 = _mm512_shuffle_f32x4(tmp12211, tmp12211, 136);
__m512 tmp12224 = _mm512_shuffle_f32x4(tmp12211, tmp12211, 221);
__m512 tmp12225 = _mm512_shuffle_f32x4(tmp12212, tmp12212, 136);
__m512 tmp12226 = _mm512_shuffle_f32x4(tmp12212, tmp12212, 221);
__m512 tmp12227 = _mm512_shuffle_f32x4(tmp12213, tmp12213, 136);
__m512 tmp12228 = _mm512_shuffle_f32x4(tmp12213, tmp12213, 221);
__m512 tmp12229 = _mm512_shuffle_f32x4(tmp12214, tmp12214, 136);
__m512 tmp12230 = _mm512_shuffle_f32x4(tmp12214, tmp12214, 221);
tmp12095 = _mm512_shuffle_f32x4(tmp12215, tmp12223, 136);
tmp12103 = _mm512_shuffle_f32x4(tmp12215, tmp12223, 221);
tmp12096 = _mm512_shuffle_f32x4(tmp12217, tmp12225, 136);
tmp12104 = _mm512_shuffle_f32x4(tmp12217, tmp12225, 221);
tmp12097 = _mm512_shuffle_f32x4(tmp12219, tmp12227, 136);
tmp12105 = _mm512_shuffle_f32x4(tmp12219, tmp12227, 221);
tmp12098 = _mm512_shuffle_f32x4(tmp12221, tmp12229, 136);
tmp12106 = _mm512_shuffle_f32x4(tmp12221, tmp12229, 221);
tmp12099 = _mm512_shuffle_f32x4(tmp12216, tmp12224, 136);
__m512 tmp12147 = _mm512_shuffle_f32x4(tmp12216, tmp12224, 221);
tmp12100 = _mm512_shuffle_f32x4(tmp12218, tmp12226, 136);
__m512 tmp12148 = _mm512_shuffle_f32x4(tmp12218, tmp12226, 221);
tmp12101 = _mm512_shuffle_f32x4(tmp12220, tmp12228, 136);
__m512 tmp12149 = _mm512_shuffle_f32x4(tmp12220, tmp12228, 221);
tmp12102 = _mm512_shuffle_f32x4(tmp12222, tmp12230, 136);
__m512 tmp12150 = _mm512_shuffle_f32x4(tmp12222, tmp12230, 221);
// Second 8 -> 6 combination, same coefficients as the first one.
// First set:  x0..x7 = tmp12095..tmp12102.
// Second set: x0..x7 = tmp12103..tmp12106, tmp12147..tmp12150.
__m512 tmp12155 = _mm512_add_ps(tmp12096, tmp12097);
__m512 tmp12175 = _mm512_add_ps(tmp12104, tmp12105);
__m512 tmp12154 = _mm512_add_ps(tmp12098, tmp12099);
__m512 tmp12174 = _mm512_add_ps(tmp12106, tmp12147);
__m512 tmp12160 = _mm512_sub_ps(tmp12098, tmp12099);
__m512 tmp12180 = _mm512_sub_ps(tmp12106, tmp12147);
__m512 tmp12159 = _mm512_sub_ps(tmp12096, tmp12097);
__m512 tmp12179 = _mm512_sub_ps(tmp12104, tmp12105);
__m512 tmp12156 = _mm512_add_ps(tmp12100, tmp12101);
__m512 tmp12176 = _mm512_add_ps(tmp12148, tmp12149);
__m512 tmp12161 = _mm512_sub_ps(tmp12100, tmp12101);
__m512 tmp12181 = _mm512_sub_ps(tmp12148, tmp12149);
__m512 tmp12158 = _mm512_fmadd_ps(tmp12160, _mm512_set1_ps(2e+00f), tmp12159);
__m512 tmp12178 = _mm512_fmadd_ps(tmp12180, _mm512_set1_ps(2e+00f), tmp12179);
__m512 tmp12165 = _mm512_fmadd_ps(tmp12160, _mm512_set1_ps(8e+00f), tmp12159);
__m512 tmp12185 = _mm512_fmadd_ps(tmp12180, _mm512_set1_ps(8e+00f), tmp12179);
__m512 tmp12153 = _mm512_add_ps(tmp12154, tmp12155);
__m512 tmp12173 = _mm512_add_ps(tmp12174, tmp12175);
__m512 tmp12157 = _mm512_fmadd_ps(tmp12161, _mm512_set1_ps(1.6e+01f), tmp12158);
__m512 tmp12177 = _mm512_fmadd_ps(tmp12181, _mm512_set1_ps(1.6e+01f), tmp12178);
__m512 tmp12164 = _mm512_fmadd_ps(tmp12161, _mm512_set1_ps(4e+00f), tmp12165);
__m512 tmp12184 = _mm512_fmadd_ps(tmp12181, _mm512_set1_ps(4e+00f), tmp12185);
__m512 tmp12170 = _mm512_add_ps(tmp12161, tmp12159);
__m512 tmp12190 = _mm512_add_ps(tmp12181, tmp12179);
__m512 tmp12163 = _mm512_fmadd_ps(tmp12154, _mm512_set1_ps(4e+00f), tmp12155);
__m512 tmp12183 = _mm512_fmadd_ps(tmp12174, _mm512_set1_ps(4e+00f), tmp12175);
__m512 tmp12167 = _mm512_fmadd_ps(tmp12154, _mm512_set1_ps(1.6e+01f), tmp12155);
__m512 tmp12187 = _mm512_fmadd_ps(tmp12174, _mm512_set1_ps(1.6e+01f), tmp12175);
__m512 tmp12152 = _mm512_add_ps(tmp12153, tmp12095);
__m512 tmp12172 = _mm512_add_ps(tmp12173, tmp12103);
__m512 tmp12169 = _mm512_add_ps(tmp12170, tmp12102);
__m512 tmp12189 = _mm512_add_ps(tmp12190, tmp12150);
__m512 tmp12151 = _mm512_fmadd_ps(tmp12156, _mm512_set1_ps(3.2e+01f), tmp12152);
__m512 tmp12171 = _mm512_fmadd_ps(tmp12176, _mm512_set1_ps(3.2e+01f), tmp12172);
__m512 tmp12162 = _mm512_fmadd_ps(tmp12156, _mm512_set1_ps(8e+00f), tmp12163);
__m512 tmp12182 = _mm512_fmadd_ps(tmp12176, _mm512_set1_ps(8e+00f), tmp12183);
__m512 tmp12168 = _mm512_fmadd_ps(tmp12160, _mm512_set1_ps(3.2e+01f), tmp12169);
__m512 tmp12188 = _mm512_fmadd_ps(tmp12180, _mm512_set1_ps(3.2e+01f), tmp12189);
__m512 tmp12166 = _mm512_fmadd_ps(tmp12156, _mm512_set1_ps(2e+00f), tmp12167);
__m512 tmp12186 = _mm512_fmadd_ps(tmp12176, _mm512_set1_ps(2e+00f), tmp12187);
// out1635..out1640 = y0..y5 of the first set, out1641..out1646 = y0..y5 of
// the second set.
__m512 out1635 = tmp12151;
__m512 out1641 = tmp12171;
__m512 out1636 = tmp12157;
__m512 out1642 = tmp12177;
__m512 out1637 = tmp12162;
__m512 out1643 = tmp12182;
__m512 out1638 = tmp12164;
__m512 out1644 = tmp12184;
__m512 out1639 = tmp12166;
__m512 out1645 = tmp12186;
__m512 out1640 = tmp12168;
__m512 out1646 = tmp12188;
// Clamp every lane at zero: out = max(0, out).
out1635 = _mm512_max_ps(_mm512_setzero_ps(), out1635);
out1641 = _mm512_max_ps(_mm512_setzero_ps(), out1641);
out1636 = _mm512_max_ps(_mm512_setzero_ps(), out1636);
out1642 = _mm512_max_ps(_mm512_setzero_ps(), out1642);
out1637 = _mm512_max_ps(_mm512_setzero_ps(), out1637);
out1643 = _mm512_max_ps(_mm512_setzero_ps(), out1643);
out1638 = _mm512_max_ps(_mm512_setzero_ps(), out1638);
out1644 = _mm512_max_ps(_mm512_setzero_ps(), out1644);
out1639 = _mm512_max_ps(_mm512_setzero_ps(), out1639);
out1645 = _mm512_max_ps(_mm512_setzero_ps(), out1645);
out1640 = _mm512_max_ps(_mm512_setzero_ps(), out1640);
out1646 = _mm512_max_ps(_mm512_setzero_ps(), out1646);
// Masked stores to datPtr17; successive y-indices are 112 apart.
//   mask 4095 -> lanes 0..11 (first-set vectors)
//   mask 1023 -> lanes 0..9  (second-set vectors, written 48 past the
//                first-set vector of the same y-index; lanes 10..11 dropped)
// NOTE(review): with toW39 = 6 this covers 6 + 12 + 10 = 28 floats, which
// presumably is the full row width (112 bytes) -- confirm.
_mm512_mask_storeu_ps(datPtr17+0+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1635);
_mm512_mask_storeu_ps(datPtr17+48+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 1023, out1641);
_mm512_mask_storeu_ps(datPtr17+112+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1636);
_mm512_mask_storeu_ps(datPtr17+160+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 1023, out1642);
_mm512_mask_storeu_ps(datPtr17+224+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1637);
_mm512_mask_storeu_ps(datPtr17+272+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 1023, out1643);
_mm512_mask_storeu_ps(datPtr17+336+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1638);
_mm512_mask_storeu_ps(datPtr17+384+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 1023, out1644);
_mm512_mask_storeu_ps(datPtr17+448+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1639);
_mm512_mask_storeu_ps(datPtr17+496+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 1023, out1645);
_mm512_mask_storeu_ps(datPtr17+560+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1640);
_mm512_mask_storeu_ps(datPtr17+608+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 1023, out1646);
// ---- One tile: load -> linear-combination pass -> lane permutation -> same pass again -> clamp at zero -> masked store.
// Reads sixteen vectors through sfPtr9 and writes twelve vectors through datPtr17.
// NOTE(review): the 1/2/4/8/16/32 coefficient pattern looks like a Winograd-style output transform — confirm against the generator.
//
// Loads. Four groups at constant offsets 256, 205056, 409856, 614656 (204800 apart), four loads per group
// at +0, +64, +128, +192. Loads 128 apart are paired: shuffle_f32x4 with 68 keeps the low 256 bits of both
// loads, 238 keeps the high 256 bits of both.
__m512 sf865 = _mm512_loadu_ps(sfPtr9+256+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf866 = _mm512_loadu_ps(sfPtr9+384+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1770 = _mm512_shuffle_f32x4(sf865, sf866, 68);
__m512 in1771 = _mm512_shuffle_f32x4(sf865, sf866, 238);
__m512 sf867 = _mm512_loadu_ps(sfPtr9+320+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf868 = _mm512_loadu_ps(sfPtr9+448+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1778 = _mm512_shuffle_f32x4(sf867, sf868, 68);
__m512 in1779 = _mm512_shuffle_f32x4(sf867, sf868, 238);
__m512 sf869 = _mm512_loadu_ps(sfPtr9+205056+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf870 = _mm512_loadu_ps(sfPtr9+205184+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1772 = _mm512_shuffle_f32x4(sf869, sf870, 68);
__m512 in1773 = _mm512_shuffle_f32x4(sf869, sf870, 238);
__m512 sf871 = _mm512_loadu_ps(sfPtr9+205120+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf872 = _mm512_loadu_ps(sfPtr9+205248+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1780 = _mm512_shuffle_f32x4(sf871, sf872, 68);
__m512 in1781 = _mm512_shuffle_f32x4(sf871, sf872, 238);
__m512 sf873 = _mm512_loadu_ps(sfPtr9+409856+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf874 = _mm512_loadu_ps(sfPtr9+409984+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1774 = _mm512_shuffle_f32x4(sf873, sf874, 68);
__m512 in1775 = _mm512_shuffle_f32x4(sf873, sf874, 238);
__m512 sf875 = _mm512_loadu_ps(sfPtr9+409920+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf876 = _mm512_loadu_ps(sfPtr9+410048+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1782 = _mm512_shuffle_f32x4(sf875, sf876, 68);
__m512 in1783 = _mm512_shuffle_f32x4(sf875, sf876, 238);
__m512 sf877 = _mm512_loadu_ps(sfPtr9+614656+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf878 = _mm512_loadu_ps(sfPtr9+614784+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1776 = _mm512_shuffle_f32x4(sf877, sf878, 68);
__m512 in1777 = _mm512_shuffle_f32x4(sf877, sf878, 238);
__m512 sf879 = _mm512_loadu_ps(sfPtr9+614720+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf880 = _mm512_loadu_ps(sfPtr9+614848+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1784 = _mm512_shuffle_f32x4(sf879, sf880, 68);
__m512 in1785 = _mm512_shuffle_f32x4(sf879, sf880, 238);
// First pass: 8 inputs -> 6 outputs, done twice in interleaved fashion
// (a0..a7 = in1770..in1777, and in parallel a0..a7 = in1778..in1785).
// With s1=a1+a2, s2=a3+a4, s3=a5+a6 and d1=a1-a2, d2=a3-a4, d3=a5-a6:
//   r0 = a0 + s1 +    s2 + 32*s3      (tmp12243 / tmp12263)
//   r1 =      d1 +  2*d2 + 16*d3      (tmp12249 / tmp12269)
//   r2 =      s1 +  4*s2 +  8*s3      (tmp12254 / tmp12274)
//   r3 =      d1 +  8*d2 +  4*d3      (tmp12256 / tmp12276)
//   r4 =      s1 + 16*s2 +  2*s3      (tmp12258 / tmp12278)
//   r5 =      d1 + 32*d2 +    d3 + a7 (tmp12260 / tmp12280)
__m512 tmp12247 = _mm512_add_ps(in1771, in1772);
__m512 tmp12267 = _mm512_add_ps(in1779, in1780);
__m512 tmp12246 = _mm512_add_ps(in1773, in1774);
__m512 tmp12266 = _mm512_add_ps(in1781, in1782);
__m512 tmp12252 = _mm512_sub_ps(in1773, in1774);
__m512 tmp12272 = _mm512_sub_ps(in1781, in1782);
__m512 tmp12251 = _mm512_sub_ps(in1771, in1772);
__m512 tmp12271 = _mm512_sub_ps(in1779, in1780);
__m512 tmp12248 = _mm512_add_ps(in1775, in1776);
__m512 tmp12268 = _mm512_add_ps(in1783, in1784);
__m512 tmp12253 = _mm512_sub_ps(in1775, in1776);
__m512 tmp12273 = _mm512_sub_ps(in1783, in1784);
__m512 tmp12250 = _mm512_fmadd_ps(tmp12252, _mm512_set1_ps(2e+00f), tmp12251);
__m512 tmp12270 = _mm512_fmadd_ps(tmp12272, _mm512_set1_ps(2e+00f), tmp12271);
__m512 tmp12257 = _mm512_fmadd_ps(tmp12252, _mm512_set1_ps(8e+00f), tmp12251);
__m512 tmp12277 = _mm512_fmadd_ps(tmp12272, _mm512_set1_ps(8e+00f), tmp12271);
__m512 tmp12245 = _mm512_add_ps(tmp12246, tmp12247);
__m512 tmp12265 = _mm512_add_ps(tmp12266, tmp12267);
__m512 tmp12249 = _mm512_fmadd_ps(tmp12253, _mm512_set1_ps(1.6e+01f), tmp12250);
__m512 tmp12269 = _mm512_fmadd_ps(tmp12273, _mm512_set1_ps(1.6e+01f), tmp12270);
__m512 tmp12256 = _mm512_fmadd_ps(tmp12253, _mm512_set1_ps(4e+00f), tmp12257);
__m512 tmp12276 = _mm512_fmadd_ps(tmp12273, _mm512_set1_ps(4e+00f), tmp12277);
__m512 tmp12262 = _mm512_add_ps(tmp12253, tmp12251);
__m512 tmp12282 = _mm512_add_ps(tmp12273, tmp12271);
__m512 tmp12255 = _mm512_fmadd_ps(tmp12246, _mm512_set1_ps(4e+00f), tmp12247);
__m512 tmp12275 = _mm512_fmadd_ps(tmp12266, _mm512_set1_ps(4e+00f), tmp12267);
__m512 tmp12259 = _mm512_fmadd_ps(tmp12246, _mm512_set1_ps(1.6e+01f), tmp12247);
__m512 tmp12279 = _mm512_fmadd_ps(tmp12266, _mm512_set1_ps(1.6e+01f), tmp12267);
__m512 tmp12244 = _mm512_add_ps(tmp12245, in1770);
__m512 tmp12264 = _mm512_add_ps(tmp12265, in1778);
__m512 tmp12261 = _mm512_add_ps(tmp12262, in1777);
__m512 tmp12281 = _mm512_add_ps(tmp12282, in1785);
__m512 tmp12243 = _mm512_fmadd_ps(tmp12248, _mm512_set1_ps(3.2e+01f), tmp12244);
__m512 tmp12263 = _mm512_fmadd_ps(tmp12268, _mm512_set1_ps(3.2e+01f), tmp12264);
__m512 tmp12254 = _mm512_fmadd_ps(tmp12248, _mm512_set1_ps(8e+00f), tmp12255);
__m512 tmp12274 = _mm512_fmadd_ps(tmp12268, _mm512_set1_ps(8e+00f), tmp12275);
__m512 tmp12260 = _mm512_fmadd_ps(tmp12252, _mm512_set1_ps(3.2e+01f), tmp12261);
__m512 tmp12280 = _mm512_fmadd_ps(tmp12272, _mm512_set1_ps(3.2e+01f), tmp12281);
__m512 tmp12258 = _mm512_fmadd_ps(tmp12248, _mm512_set1_ps(2e+00f), tmp12259);
__m512 tmp12278 = _mm512_fmadd_ps(tmp12268, _mm512_set1_ps(2e+00f), tmp12279);
// Stage the twelve first-pass results in tmp12231..tmp12242; these names are overwritten by the permutation below.
__m512 tmp12231 = tmp12243;
__m512 tmp12237 = tmp12263;
__m512 tmp12232 = tmp12249;
__m512 tmp12238 = tmp12269;
__m512 tmp12233 = tmp12254;
__m512 tmp12239 = tmp12274;
__m512 tmp12234 = tmp12256;
__m512 tmp12240 = tmp12276;
__m512 tmp12235 = tmp12258;
__m512 tmp12241 = tmp12278;
__m512 tmp12236 = tmp12260;
__m512 tmp12242 = tmp12280;
// Lane permutation, step 1: interleave 32-bit elements of adjacent vector pairs.
__m512 tmp12327 = _mm512_unpacklo_ps(tmp12231, tmp12232);
__m512 tmp12328 = _mm512_unpackhi_ps(tmp12231, tmp12232);
__m512 tmp12329 = _mm512_unpacklo_ps(tmp12233, tmp12234);
__m512 tmp12330 = _mm512_unpackhi_ps(tmp12233, tmp12234);
__m512 tmp12331 = _mm512_unpacklo_ps(tmp12235, tmp12236);
__m512 tmp12332 = _mm512_unpackhi_ps(tmp12235, tmp12236);
__m512 tmp12333 = _mm512_unpacklo_ps(tmp12237, tmp12238);
__m512 tmp12334 = _mm512_unpackhi_ps(tmp12237, tmp12238);
__m512 tmp12335 = _mm512_unpacklo_ps(tmp12239, tmp12240);
__m512 tmp12336 = _mm512_unpackhi_ps(tmp12239, tmp12240);
__m512 tmp12337 = _mm512_unpacklo_ps(tmp12241, tmp12242);
__m512 tmp12338 = _mm512_unpackhi_ps(tmp12241, tmp12242);
// Step 2: within each 128-bit lane, 68 takes the low 64 bits of both operands, 238 the high 64 bits.
__m512 tmp12339 = _mm512_shuffle_ps(tmp12327, tmp12329, 68);
__m512 tmp12340 = _mm512_shuffle_ps(tmp12327, tmp12329, 238);
__m512 tmp12341 = _mm512_shuffle_ps(tmp12328, tmp12330, 68);
__m512 tmp12342 = _mm512_shuffle_ps(tmp12328, tmp12330, 238);
__m512 tmp12343 = _mm512_shuffle_ps(tmp12331, tmp12333, 68);
__m512 tmp12344 = _mm512_shuffle_ps(tmp12331, tmp12333, 238);
__m512 tmp12345 = _mm512_shuffle_ps(tmp12332, tmp12334, 68);
__m512 tmp12346 = _mm512_shuffle_ps(tmp12332, tmp12334, 238);
__m512 tmp12347 = _mm512_shuffle_ps(tmp12335, tmp12337, 68);
__m512 tmp12348 = _mm512_shuffle_ps(tmp12335, tmp12337, 238);
__m512 tmp12349 = _mm512_shuffle_ps(tmp12336, tmp12338, 68);
__m512 tmp12350 = _mm512_shuffle_ps(tmp12336, tmp12338, 238);
// Step 3: 128-bit lane shuffles; 136 picks lanes 0 and 2 of each operand, 221 picks lanes 1 and 3.
__m512 tmp12351 = _mm512_shuffle_f32x4(tmp12339, tmp12343, 136);
__m512 tmp12352 = _mm512_shuffle_f32x4(tmp12339, tmp12343, 221);
__m512 tmp12353 = _mm512_shuffle_f32x4(tmp12340, tmp12344, 136);
__m512 tmp12354 = _mm512_shuffle_f32x4(tmp12340, tmp12344, 221);
__m512 tmp12355 = _mm512_shuffle_f32x4(tmp12341, tmp12345, 136);
__m512 tmp12356 = _mm512_shuffle_f32x4(tmp12341, tmp12345, 221);
__m512 tmp12357 = _mm512_shuffle_f32x4(tmp12342, tmp12346, 136);
__m512 tmp12358 = _mm512_shuffle_f32x4(tmp12342, tmp12346, 221);
// The remaining four vectors are shuffled against themselves (same vector passed as both operands).
__m512 tmp12359 = _mm512_shuffle_f32x4(tmp12347, tmp12347, 136);
__m512 tmp12360 = _mm512_shuffle_f32x4(tmp12347, tmp12347, 221);
__m512 tmp12361 = _mm512_shuffle_f32x4(tmp12348, tmp12348, 136);
__m512 tmp12362 = _mm512_shuffle_f32x4(tmp12348, tmp12348, 221);
__m512 tmp12363 = _mm512_shuffle_f32x4(tmp12349, tmp12349, 136);
__m512 tmp12364 = _mm512_shuffle_f32x4(tmp12349, tmp12349, 221);
__m512 tmp12365 = _mm512_shuffle_f32x4(tmp12350, tmp12350, 136);
__m512 tmp12366 = _mm512_shuffle_f32x4(tmp12350, tmp12350, 221);
// Step 4: final lane shuffle. Yields sixteen vectors: tmp12231..tmp12242 are overwritten,
// tmp12283..tmp12286 are newly declared.
// NOTE(review): presumably this whole network is a transpose so the second pass runs along the other tile axis — confirm.
tmp12231 = _mm512_shuffle_f32x4(tmp12351, tmp12359, 136);
tmp12239 = _mm512_shuffle_f32x4(tmp12351, tmp12359, 221);
tmp12232 = _mm512_shuffle_f32x4(tmp12353, tmp12361, 136);
tmp12240 = _mm512_shuffle_f32x4(tmp12353, tmp12361, 221);
tmp12233 = _mm512_shuffle_f32x4(tmp12355, tmp12363, 136);
tmp12241 = _mm512_shuffle_f32x4(tmp12355, tmp12363, 221);
tmp12234 = _mm512_shuffle_f32x4(tmp12357, tmp12365, 136);
tmp12242 = _mm512_shuffle_f32x4(tmp12357, tmp12365, 221);
tmp12235 = _mm512_shuffle_f32x4(tmp12352, tmp12360, 136);
__m512 tmp12283 = _mm512_shuffle_f32x4(tmp12352, tmp12360, 221);
tmp12236 = _mm512_shuffle_f32x4(tmp12354, tmp12362, 136);
__m512 tmp12284 = _mm512_shuffle_f32x4(tmp12354, tmp12362, 221);
tmp12237 = _mm512_shuffle_f32x4(tmp12356, tmp12364, 136);
__m512 tmp12285 = _mm512_shuffle_f32x4(tmp12356, tmp12364, 221);
tmp12238 = _mm512_shuffle_f32x4(tmp12358, tmp12366, 136);
__m512 tmp12286 = _mm512_shuffle_f32x4(tmp12358, tmp12366, 221);
// Second pass: the same 8 -> 6 combination (same coefficients as the first pass), applied to the permuted vectors.
// First set:  a0..a7 = tmp12231..tmp12238.
// Second set: a0..a7 = tmp12239, tmp12240, tmp12241, tmp12242, tmp12283, tmp12284, tmp12285, tmp12286.
__m512 tmp12291 = _mm512_add_ps(tmp12232, tmp12233);
__m512 tmp12311 = _mm512_add_ps(tmp12240, tmp12241);
__m512 tmp12290 = _mm512_add_ps(tmp12234, tmp12235);
__m512 tmp12310 = _mm512_add_ps(tmp12242, tmp12283);
__m512 tmp12296 = _mm512_sub_ps(tmp12234, tmp12235);
__m512 tmp12316 = _mm512_sub_ps(tmp12242, tmp12283);
__m512 tmp12295 = _mm512_sub_ps(tmp12232, tmp12233);
__m512 tmp12315 = _mm512_sub_ps(tmp12240, tmp12241);
__m512 tmp12292 = _mm512_add_ps(tmp12236, tmp12237);
__m512 tmp12312 = _mm512_add_ps(tmp12284, tmp12285);
__m512 tmp12297 = _mm512_sub_ps(tmp12236, tmp12237);
__m512 tmp12317 = _mm512_sub_ps(tmp12284, tmp12285);
__m512 tmp12294 = _mm512_fmadd_ps(tmp12296, _mm512_set1_ps(2e+00f), tmp12295);
__m512 tmp12314 = _mm512_fmadd_ps(tmp12316, _mm512_set1_ps(2e+00f), tmp12315);
__m512 tmp12301 = _mm512_fmadd_ps(tmp12296, _mm512_set1_ps(8e+00f), tmp12295);
__m512 tmp12321 = _mm512_fmadd_ps(tmp12316, _mm512_set1_ps(8e+00f), tmp12315);
__m512 tmp12289 = _mm512_add_ps(tmp12290, tmp12291);
__m512 tmp12309 = _mm512_add_ps(tmp12310, tmp12311);
__m512 tmp12293 = _mm512_fmadd_ps(tmp12297, _mm512_set1_ps(1.6e+01f), tmp12294);
__m512 tmp12313 = _mm512_fmadd_ps(tmp12317, _mm512_set1_ps(1.6e+01f), tmp12314);
__m512 tmp12300 = _mm512_fmadd_ps(tmp12297, _mm512_set1_ps(4e+00f), tmp12301);
__m512 tmp12320 = _mm512_fmadd_ps(tmp12317, _mm512_set1_ps(4e+00f), tmp12321);
__m512 tmp12306 = _mm512_add_ps(tmp12297, tmp12295);
__m512 tmp12326 = _mm512_add_ps(tmp12317, tmp12315);
__m512 tmp12299 = _mm512_fmadd_ps(tmp12290, _mm512_set1_ps(4e+00f), tmp12291);
__m512 tmp12319 = _mm512_fmadd_ps(tmp12310, _mm512_set1_ps(4e+00f), tmp12311);
__m512 tmp12303 = _mm512_fmadd_ps(tmp12290, _mm512_set1_ps(1.6e+01f), tmp12291);
__m512 tmp12323 = _mm512_fmadd_ps(tmp12310, _mm512_set1_ps(1.6e+01f), tmp12311);
__m512 tmp12288 = _mm512_add_ps(tmp12289, tmp12231);
__m512 tmp12308 = _mm512_add_ps(tmp12309, tmp12239);
__m512 tmp12305 = _mm512_add_ps(tmp12306, tmp12238);
__m512 tmp12325 = _mm512_add_ps(tmp12326, tmp12286);
__m512 tmp12287 = _mm512_fmadd_ps(tmp12292, _mm512_set1_ps(3.2e+01f), tmp12288);
__m512 tmp12307 = _mm512_fmadd_ps(tmp12312, _mm512_set1_ps(3.2e+01f), tmp12308);
__m512 tmp12298 = _mm512_fmadd_ps(tmp12292, _mm512_set1_ps(8e+00f), tmp12299);
__m512 tmp12318 = _mm512_fmadd_ps(tmp12312, _mm512_set1_ps(8e+00f), tmp12319);
__m512 tmp12304 = _mm512_fmadd_ps(tmp12296, _mm512_set1_ps(3.2e+01f), tmp12305);
__m512 tmp12324 = _mm512_fmadd_ps(tmp12316, _mm512_set1_ps(3.2e+01f), tmp12325);
__m512 tmp12302 = _mm512_fmadd_ps(tmp12292, _mm512_set1_ps(2e+00f), tmp12303);
__m512 tmp12322 = _mm512_fmadd_ps(tmp12312, _mm512_set1_ps(2e+00f), tmp12323);
// Name the twelve second-pass results: out1647..out1652 from the first set, out1653..out1658 from the second.
__m512 out1647 = tmp12287;
__m512 out1653 = tmp12307;
__m512 out1648 = tmp12293;
__m512 out1654 = tmp12313;
__m512 out1649 = tmp12298;
__m512 out1655 = tmp12318;
__m512 out1650 = tmp12300;
__m512 out1656 = tmp12320;
__m512 out1651 = tmp12302;
__m512 out1657 = tmp12322;
__m512 out1652 = tmp12304;
__m512 out1658 = tmp12324;
// Clamp every result from below at zero (ReLU-style: max(0, x)).
out1647 = _mm512_max_ps(_mm512_setzero_ps(), out1647);
out1653 = _mm512_max_ps(_mm512_setzero_ps(), out1653);
out1648 = _mm512_max_ps(_mm512_setzero_ps(), out1648);
out1654 = _mm512_max_ps(_mm512_setzero_ps(), out1654);
out1649 = _mm512_max_ps(_mm512_setzero_ps(), out1649);
out1655 = _mm512_max_ps(_mm512_setzero_ps(), out1655);
out1650 = _mm512_max_ps(_mm512_setzero_ps(), out1650);
out1656 = _mm512_max_ps(_mm512_setzero_ps(), out1656);
out1651 = _mm512_max_ps(_mm512_setzero_ps(), out1651);
out1657 = _mm512_max_ps(_mm512_setzero_ps(), out1657);
out1652 = _mm512_max_ps(_mm512_setzero_ps(), out1652);
out1658 = _mm512_max_ps(_mm512_setzero_ps(), out1658);
// Masked unaligned stores through datPtr17. Mask 4095 (0xFFF) writes only the low 12 lanes.
// out1647..out1652 use constant offsets 648 + 112*r, out1653..out1658 use 3136 + 112*r (r = 0..5),
// each added to the shared index expression in i37, toH39, toW39, k112 and l44.
_mm512_mask_storeu_ps(datPtr17+648+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1647);
_mm512_mask_storeu_ps(datPtr17+3136+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1653);
_mm512_mask_storeu_ps(datPtr17+760+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1648);
_mm512_mask_storeu_ps(datPtr17+3248+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1654);
_mm512_mask_storeu_ps(datPtr17+872+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1649);
_mm512_mask_storeu_ps(datPtr17+3360+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1655);
_mm512_mask_storeu_ps(datPtr17+984+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1650);
_mm512_mask_storeu_ps(datPtr17+3472+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1656);
_mm512_mask_storeu_ps(datPtr17+1096+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1651);
_mm512_mask_storeu_ps(datPtr17+3584+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1657);
_mm512_mask_storeu_ps(datPtr17+1208+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1652);
_mm512_mask_storeu_ps(datPtr17+3696+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1658);
// ---- One tile: load -> linear-combination pass -> lane permutation -> same pass again -> clamp at zero -> masked store.
// Reads sixteen vectors through sfPtr9 and writes twelve vectors through datPtr17.
// NOTE(review): the 1/2/4/8/16/32 coefficient pattern looks like a Winograd-style output transform — confirm against the generator.
//
// Loads. Four groups at constant offsets 512, 205312, 410112, 614912 (204800 apart), four loads per group
// at +0, +64, +128, +192. Loads 128 apart are paired: shuffle_f32x4 with 68 keeps the low 256 bits of both
// loads, 238 keeps the high 256 bits of both.
__m512 sf881 = _mm512_loadu_ps(sfPtr9+512+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf882 = _mm512_loadu_ps(sfPtr9+640+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1786 = _mm512_shuffle_f32x4(sf881, sf882, 68);
__m512 in1787 = _mm512_shuffle_f32x4(sf881, sf882, 238);
__m512 sf883 = _mm512_loadu_ps(sfPtr9+576+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf884 = _mm512_loadu_ps(sfPtr9+704+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1794 = _mm512_shuffle_f32x4(sf883, sf884, 68);
__m512 in1795 = _mm512_shuffle_f32x4(sf883, sf884, 238);
__m512 sf885 = _mm512_loadu_ps(sfPtr9+205312+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf886 = _mm512_loadu_ps(sfPtr9+205440+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1788 = _mm512_shuffle_f32x4(sf885, sf886, 68);
__m512 in1789 = _mm512_shuffle_f32x4(sf885, sf886, 238);
__m512 sf887 = _mm512_loadu_ps(sfPtr9+205376+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf888 = _mm512_loadu_ps(sfPtr9+205504+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1796 = _mm512_shuffle_f32x4(sf887, sf888, 68);
__m512 in1797 = _mm512_shuffle_f32x4(sf887, sf888, 238);
__m512 sf889 = _mm512_loadu_ps(sfPtr9+410112+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf890 = _mm512_loadu_ps(sfPtr9+410240+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1790 = _mm512_shuffle_f32x4(sf889, sf890, 68);
__m512 in1791 = _mm512_shuffle_f32x4(sf889, sf890, 238);
__m512 sf891 = _mm512_loadu_ps(sfPtr9+410176+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf892 = _mm512_loadu_ps(sfPtr9+410304+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1798 = _mm512_shuffle_f32x4(sf891, sf892, 68);
__m512 in1799 = _mm512_shuffle_f32x4(sf891, sf892, 238);
__m512 sf893 = _mm512_loadu_ps(sfPtr9+614912+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf894 = _mm512_loadu_ps(sfPtr9+615040+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1792 = _mm512_shuffle_f32x4(sf893, sf894, 68);
__m512 in1793 = _mm512_shuffle_f32x4(sf893, sf894, 238);
__m512 sf895 = _mm512_loadu_ps(sfPtr9+614976+819200*i37+49152*j30+1536*k112+768*l44);
__m512 sf896 = _mm512_loadu_ps(sfPtr9+615104+819200*i37+49152*j30+1536*k112+768*l44);
__m512 in1800 = _mm512_shuffle_f32x4(sf895, sf896, 68);
__m512 in1801 = _mm512_shuffle_f32x4(sf895, sf896, 238);
// First pass: 8 inputs -> 6 outputs, done twice in interleaved fashion
// (a0..a7 = in1786..in1793, and in parallel a0..a7 = in1794..in1801).
// With s1=a1+a2, s2=a3+a4, s3=a5+a6 and d1=a1-a2, d2=a3-a4, d3=a5-a6:
//   r0 = a0 + s1 +    s2 + 32*s3      (tmp12379 / tmp12399)
//   r1 =      d1 +  2*d2 + 16*d3      (tmp12385 / tmp12405)
//   r2 =      s1 +  4*s2 +  8*s3      (tmp12390 / tmp12410)
//   r3 =      d1 +  8*d2 +  4*d3      (tmp12392 / tmp12412)
//   r4 =      s1 + 16*s2 +  2*s3      (tmp12394 / tmp12414)
//   r5 =      d1 + 32*d2 +    d3 + a7 (tmp12396 / tmp12416)
__m512 tmp12383 = _mm512_add_ps(in1787, in1788);
__m512 tmp12403 = _mm512_add_ps(in1795, in1796);
__m512 tmp12382 = _mm512_add_ps(in1789, in1790);
__m512 tmp12402 = _mm512_add_ps(in1797, in1798);
__m512 tmp12388 = _mm512_sub_ps(in1789, in1790);
__m512 tmp12408 = _mm512_sub_ps(in1797, in1798);
__m512 tmp12387 = _mm512_sub_ps(in1787, in1788);
__m512 tmp12407 = _mm512_sub_ps(in1795, in1796);
__m512 tmp12384 = _mm512_add_ps(in1791, in1792);
__m512 tmp12404 = _mm512_add_ps(in1799, in1800);
__m512 tmp12389 = _mm512_sub_ps(in1791, in1792);
__m512 tmp12409 = _mm512_sub_ps(in1799, in1800);
__m512 tmp12386 = _mm512_fmadd_ps(tmp12388, _mm512_set1_ps(2e+00f), tmp12387);
__m512 tmp12406 = _mm512_fmadd_ps(tmp12408, _mm512_set1_ps(2e+00f), tmp12407);
__m512 tmp12393 = _mm512_fmadd_ps(tmp12388, _mm512_set1_ps(8e+00f), tmp12387);
__m512 tmp12413 = _mm512_fmadd_ps(tmp12408, _mm512_set1_ps(8e+00f), tmp12407);
__m512 tmp12381 = _mm512_add_ps(tmp12382, tmp12383);
__m512 tmp12401 = _mm512_add_ps(tmp12402, tmp12403);
__m512 tmp12385 = _mm512_fmadd_ps(tmp12389, _mm512_set1_ps(1.6e+01f), tmp12386);
__m512 tmp12405 = _mm512_fmadd_ps(tmp12409, _mm512_set1_ps(1.6e+01f), tmp12406);
__m512 tmp12392 = _mm512_fmadd_ps(tmp12389, _mm512_set1_ps(4e+00f), tmp12393);
__m512 tmp12412 = _mm512_fmadd_ps(tmp12409, _mm512_set1_ps(4e+00f), tmp12413);
__m512 tmp12398 = _mm512_add_ps(tmp12389, tmp12387);
__m512 tmp12418 = _mm512_add_ps(tmp12409, tmp12407);
__m512 tmp12391 = _mm512_fmadd_ps(tmp12382, _mm512_set1_ps(4e+00f), tmp12383);
__m512 tmp12411 = _mm512_fmadd_ps(tmp12402, _mm512_set1_ps(4e+00f), tmp12403);
__m512 tmp12395 = _mm512_fmadd_ps(tmp12382, _mm512_set1_ps(1.6e+01f), tmp12383);
__m512 tmp12415 = _mm512_fmadd_ps(tmp12402, _mm512_set1_ps(1.6e+01f), tmp12403);
__m512 tmp12380 = _mm512_add_ps(tmp12381, in1786);
__m512 tmp12400 = _mm512_add_ps(tmp12401, in1794);
__m512 tmp12397 = _mm512_add_ps(tmp12398, in1793);
__m512 tmp12417 = _mm512_add_ps(tmp12418, in1801);
__m512 tmp12379 = _mm512_fmadd_ps(tmp12384, _mm512_set1_ps(3.2e+01f), tmp12380);
__m512 tmp12399 = _mm512_fmadd_ps(tmp12404, _mm512_set1_ps(3.2e+01f), tmp12400);
__m512 tmp12390 = _mm512_fmadd_ps(tmp12384, _mm512_set1_ps(8e+00f), tmp12391);
__m512 tmp12410 = _mm512_fmadd_ps(tmp12404, _mm512_set1_ps(8e+00f), tmp12411);
__m512 tmp12396 = _mm512_fmadd_ps(tmp12388, _mm512_set1_ps(3.2e+01f), tmp12397);
__m512 tmp12416 = _mm512_fmadd_ps(tmp12408, _mm512_set1_ps(3.2e+01f), tmp12417);
__m512 tmp12394 = _mm512_fmadd_ps(tmp12384, _mm512_set1_ps(2e+00f), tmp12395);
__m512 tmp12414 = _mm512_fmadd_ps(tmp12404, _mm512_set1_ps(2e+00f), tmp12415);
// Stage the twelve first-pass results in tmp12367..tmp12378; these names are overwritten by the permutation below.
__m512 tmp12367 = tmp12379;
__m512 tmp12373 = tmp12399;
__m512 tmp12368 = tmp12385;
__m512 tmp12374 = tmp12405;
__m512 tmp12369 = tmp12390;
__m512 tmp12375 = tmp12410;
__m512 tmp12370 = tmp12392;
__m512 tmp12376 = tmp12412;
__m512 tmp12371 = tmp12394;
__m512 tmp12377 = tmp12414;
__m512 tmp12372 = tmp12396;
__m512 tmp12378 = tmp12416;
// Lane permutation, step 1: interleave 32-bit elements of adjacent vector pairs.
__m512 tmp12463 = _mm512_unpacklo_ps(tmp12367, tmp12368);
__m512 tmp12464 = _mm512_unpackhi_ps(tmp12367, tmp12368);
__m512 tmp12465 = _mm512_unpacklo_ps(tmp12369, tmp12370);
__m512 tmp12466 = _mm512_unpackhi_ps(tmp12369, tmp12370);
__m512 tmp12467 = _mm512_unpacklo_ps(tmp12371, tmp12372);
__m512 tmp12468 = _mm512_unpackhi_ps(tmp12371, tmp12372);
__m512 tmp12469 = _mm512_unpacklo_ps(tmp12373, tmp12374);
__m512 tmp12470 = _mm512_unpackhi_ps(tmp12373, tmp12374);
__m512 tmp12471 = _mm512_unpacklo_ps(tmp12375, tmp12376);
__m512 tmp12472 = _mm512_unpackhi_ps(tmp12375, tmp12376);
__m512 tmp12473 = _mm512_unpacklo_ps(tmp12377, tmp12378);
__m512 tmp12474 = _mm512_unpackhi_ps(tmp12377, tmp12378);
// Step 2: within each 128-bit lane, 68 takes the low 64 bits of both operands, 238 the high 64 bits.
__m512 tmp12475 = _mm512_shuffle_ps(tmp12463, tmp12465, 68);
__m512 tmp12476 = _mm512_shuffle_ps(tmp12463, tmp12465, 238);
__m512 tmp12477 = _mm512_shuffle_ps(tmp12464, tmp12466, 68);
__m512 tmp12478 = _mm512_shuffle_ps(tmp12464, tmp12466, 238);
__m512 tmp12479 = _mm512_shuffle_ps(tmp12467, tmp12469, 68);
__m512 tmp12480 = _mm512_shuffle_ps(tmp12467, tmp12469, 238);
__m512 tmp12481 = _mm512_shuffle_ps(tmp12468, tmp12470, 68);
__m512 tmp12482 = _mm512_shuffle_ps(tmp12468, tmp12470, 238);
__m512 tmp12483 = _mm512_shuffle_ps(tmp12471, tmp12473, 68);
__m512 tmp12484 = _mm512_shuffle_ps(tmp12471, tmp12473, 238);
__m512 tmp12485 = _mm512_shuffle_ps(tmp12472, tmp12474, 68);
__m512 tmp12486 = _mm512_shuffle_ps(tmp12472, tmp12474, 238);
// Step 3: 128-bit lane shuffles; 136 picks lanes 0 and 2 of each operand, 221 picks lanes 1 and 3.
__m512 tmp12487 = _mm512_shuffle_f32x4(tmp12475, tmp12479, 136);
__m512 tmp12488 = _mm512_shuffle_f32x4(tmp12475, tmp12479, 221);
__m512 tmp12489 = _mm512_shuffle_f32x4(tmp12476, tmp12480, 136);
__m512 tmp12490 = _mm512_shuffle_f32x4(tmp12476, tmp12480, 221);
__m512 tmp12491 = _mm512_shuffle_f32x4(tmp12477, tmp12481, 136);
__m512 tmp12492 = _mm512_shuffle_f32x4(tmp12477, tmp12481, 221);
__m512 tmp12493 = _mm512_shuffle_f32x4(tmp12478, tmp12482, 136);
__m512 tmp12494 = _mm512_shuffle_f32x4(tmp12478, tmp12482, 221);
// The remaining four vectors are shuffled against themselves (same vector passed as both operands).
__m512 tmp12495 = _mm512_shuffle_f32x4(tmp12483, tmp12483, 136);
__m512 tmp12496 = _mm512_shuffle_f32x4(tmp12483, tmp12483, 221);
__m512 tmp12497 = _mm512_shuffle_f32x4(tmp12484, tmp12484, 136);
__m512 tmp12498 = _mm512_shuffle_f32x4(tmp12484, tmp12484, 221);
__m512 tmp12499 = _mm512_shuffle_f32x4(tmp12485, tmp12485, 136);
__m512 tmp12500 = _mm512_shuffle_f32x4(tmp12485, tmp12485, 221);
__m512 tmp12501 = _mm512_shuffle_f32x4(tmp12486, tmp12486, 136);
__m512 tmp12502 = _mm512_shuffle_f32x4(tmp12486, tmp12486, 221);
// Step 4: final lane shuffle. Yields sixteen vectors: tmp12367..tmp12378 are overwritten,
// tmp12419..tmp12422 are newly declared.
// NOTE(review): presumably this whole network is a transpose so the second pass runs along the other tile axis — confirm.
tmp12367 = _mm512_shuffle_f32x4(tmp12487, tmp12495, 136);
tmp12375 = _mm512_shuffle_f32x4(tmp12487, tmp12495, 221);
tmp12368 = _mm512_shuffle_f32x4(tmp12489, tmp12497, 136);
tmp12376 = _mm512_shuffle_f32x4(tmp12489, tmp12497, 221);
tmp12369 = _mm512_shuffle_f32x4(tmp12491, tmp12499, 136);
tmp12377 = _mm512_shuffle_f32x4(tmp12491, tmp12499, 221);
tmp12370 = _mm512_shuffle_f32x4(tmp12493, tmp12501, 136);
tmp12378 = _mm512_shuffle_f32x4(tmp12493, tmp12501, 221);
tmp12371 = _mm512_shuffle_f32x4(tmp12488, tmp12496, 136);
__m512 tmp12419 = _mm512_shuffle_f32x4(tmp12488, tmp12496, 221);
tmp12372 = _mm512_shuffle_f32x4(tmp12490, tmp12498, 136);
__m512 tmp12420 = _mm512_shuffle_f32x4(tmp12490, tmp12498, 221);
tmp12373 = _mm512_shuffle_f32x4(tmp12492, tmp12500, 136);
__m512 tmp12421 = _mm512_shuffle_f32x4(tmp12492, tmp12500, 221);
tmp12374 = _mm512_shuffle_f32x4(tmp12494, tmp12502, 136);
__m512 tmp12422 = _mm512_shuffle_f32x4(tmp12494, tmp12502, 221);
// Second pass: the same 8 -> 6 combination (same coefficients as the first pass), applied to the permuted vectors.
// First set:  a0..a7 = tmp12367..tmp12374.
// Second set: a0..a7 = tmp12375, tmp12376, tmp12377, tmp12378, tmp12419, tmp12420, tmp12421, tmp12422.
__m512 tmp12427 = _mm512_add_ps(tmp12368, tmp12369);
__m512 tmp12447 = _mm512_add_ps(tmp12376, tmp12377);
__m512 tmp12426 = _mm512_add_ps(tmp12370, tmp12371);
__m512 tmp12446 = _mm512_add_ps(tmp12378, tmp12419);
__m512 tmp12432 = _mm512_sub_ps(tmp12370, tmp12371);
__m512 tmp12452 = _mm512_sub_ps(tmp12378, tmp12419);
__m512 tmp12431 = _mm512_sub_ps(tmp12368, tmp12369);
__m512 tmp12451 = _mm512_sub_ps(tmp12376, tmp12377);
__m512 tmp12428 = _mm512_add_ps(tmp12372, tmp12373);
__m512 tmp12448 = _mm512_add_ps(tmp12420, tmp12421);
__m512 tmp12433 = _mm512_sub_ps(tmp12372, tmp12373);
__m512 tmp12453 = _mm512_sub_ps(tmp12420, tmp12421);
__m512 tmp12430 = _mm512_fmadd_ps(tmp12432, _mm512_set1_ps(2e+00f), tmp12431);
__m512 tmp12450 = _mm512_fmadd_ps(tmp12452, _mm512_set1_ps(2e+00f), tmp12451);
__m512 tmp12437 = _mm512_fmadd_ps(tmp12432, _mm512_set1_ps(8e+00f), tmp12431);
__m512 tmp12457 = _mm512_fmadd_ps(tmp12452, _mm512_set1_ps(8e+00f), tmp12451);
__m512 tmp12425 = _mm512_add_ps(tmp12426, tmp12427);
__m512 tmp12445 = _mm512_add_ps(tmp12446, tmp12447);
__m512 tmp12429 = _mm512_fmadd_ps(tmp12433, _mm512_set1_ps(1.6e+01f), tmp12430);
__m512 tmp12449 = _mm512_fmadd_ps(tmp12453, _mm512_set1_ps(1.6e+01f), tmp12450);
__m512 tmp12436 = _mm512_fmadd_ps(tmp12433, _mm512_set1_ps(4e+00f), tmp12437);
__m512 tmp12456 = _mm512_fmadd_ps(tmp12453, _mm512_set1_ps(4e+00f), tmp12457);
__m512 tmp12442 = _mm512_add_ps(tmp12433, tmp12431);
__m512 tmp12462 = _mm512_add_ps(tmp12453, tmp12451);
__m512 tmp12435 = _mm512_fmadd_ps(tmp12426, _mm512_set1_ps(4e+00f), tmp12427);
__m512 tmp12455 = _mm512_fmadd_ps(tmp12446, _mm512_set1_ps(4e+00f), tmp12447);
__m512 tmp12439 = _mm512_fmadd_ps(tmp12426, _mm512_set1_ps(1.6e+01f), tmp12427);
__m512 tmp12459 = _mm512_fmadd_ps(tmp12446, _mm512_set1_ps(1.6e+01f), tmp12447);
__m512 tmp12424 = _mm512_add_ps(tmp12425, tmp12367);
__m512 tmp12444 = _mm512_add_ps(tmp12445, tmp12375);
__m512 tmp12441 = _mm512_add_ps(tmp12442, tmp12374);
__m512 tmp12461 = _mm512_add_ps(tmp12462, tmp12422);
__m512 tmp12423 = _mm512_fmadd_ps(tmp12428, _mm512_set1_ps(3.2e+01f), tmp12424);
__m512 tmp12443 = _mm512_fmadd_ps(tmp12448, _mm512_set1_ps(3.2e+01f), tmp12444);
__m512 tmp12434 = _mm512_fmadd_ps(tmp12428, _mm512_set1_ps(8e+00f), tmp12435);
__m512 tmp12454 = _mm512_fmadd_ps(tmp12448, _mm512_set1_ps(8e+00f), tmp12455);
__m512 tmp12440 = _mm512_fmadd_ps(tmp12432, _mm512_set1_ps(3.2e+01f), tmp12441);
__m512 tmp12460 = _mm512_fmadd_ps(tmp12452, _mm512_set1_ps(3.2e+01f), tmp12461);
__m512 tmp12438 = _mm512_fmadd_ps(tmp12428, _mm512_set1_ps(2e+00f), tmp12439);
__m512 tmp12458 = _mm512_fmadd_ps(tmp12448, _mm512_set1_ps(2e+00f), tmp12459);
// Name the twelve second-pass results: out1659..out1664 from the first set, out1665..out1670 from the second.
__m512 out1659 = tmp12423;
__m512 out1665 = tmp12443;
__m512 out1660 = tmp12429;
__m512 out1666 = tmp12449;
__m512 out1661 = tmp12434;
__m512 out1667 = tmp12454;
__m512 out1662 = tmp12436;
__m512 out1668 = tmp12456;
__m512 out1663 = tmp12438;
__m512 out1669 = tmp12458;
__m512 out1664 = tmp12440;
__m512 out1670 = tmp12460;
// Clamp every result from below at zero (ReLU-style: max(0, x)).
out1659 = _mm512_max_ps(_mm512_setzero_ps(), out1659);
out1665 = _mm512_max_ps(_mm512_setzero_ps(), out1665);
out1660 = _mm512_max_ps(_mm512_setzero_ps(), out1660);
out1666 = _mm512_max_ps(_mm512_setzero_ps(), out1666);
out1661 = _mm512_max_ps(_mm512_setzero_ps(), out1661);
out1667 = _mm512_max_ps(_mm512_setzero_ps(), out1667);
out1662 = _mm512_max_ps(_mm512_setzero_ps(), out1662);
out1668 = _mm512_max_ps(_mm512_setzero_ps(), out1668);
out1663 = _mm512_max_ps(_mm512_setzero_ps(), out1663);
out1669 = _mm512_max_ps(_mm512_setzero_ps(), out1669);
out1664 = _mm512_max_ps(_mm512_setzero_ps(), out1664);
out1670 = _mm512_max_ps(_mm512_setzero_ps(), out1670);
// Masked unaligned stores through datPtr17.
// out1659..out1664 use constant offsets 3184 + 112*r with mask 1023 (0x3FF: only the low 10 lanes are written);
// out1665..out1670 use 3784 + 112*r with mask 4095 (0xFFF: low 12 lanes), r = 0..5,
// each added to the shared index expression in i37, toH39, toW39, k112 and l44.
_mm512_mask_storeu_ps(datPtr17+3184+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 1023, out1659);
_mm512_mask_storeu_ps(datPtr17+3784+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1665);
_mm512_mask_storeu_ps(datPtr17+3296+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 1023, out1660);
_mm512_mask_storeu_ps(datPtr17+3896+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1666);
_mm512_mask_storeu_ps(datPtr17+3408+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 1023, out1661);
_mm512_mask_storeu_ps(datPtr17+4008+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1667);
_mm512_mask_storeu_ps(datPtr17+3520+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 1023, out1662);
_mm512_mask_storeu_ps(datPtr17+4120+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1668);
_mm512_mask_storeu_ps(datPtr17+3632+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 1023, out1663);
_mm512_mask_storeu_ps(datPtr17+4232+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1669);
_mm512_mask_storeu_ps(datPtr17+3744+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 1023, out1664);
_mm512_mask_storeu_ps(datPtr17+4344+401408*i37+112*toH39+4*toW39+12544*k112+6272*l44, 4095, out1670);
}
}
if (j30 >= last8) return;
++j30;
rel20 = 2;
}
if (rel20 < 3) {
ptrdiff_t toH40 = base20+12;
ptrdiff_t toW40 = 12;
ptrdiff_t k113 = 32*w54;
for (; k113 != 32; ++k113) {
ptrdiff_t l45 = 0;
for (; l45 != 2; ++l45) {
__m512 sf897 = _mm512_loadu_ps(sfPtr9+0+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf898 = _mm512_loadu_ps(sfPtr9+128+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1802 = _mm512_shuffle_f32x4(sf897, sf898, 68);
__m512 in1803 = _mm512_shuffle_f32x4(sf897, sf898, 238);
__m512 sf899 = _mm512_loadu_ps(sfPtr9+64+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf900 = _mm512_loadu_ps(sfPtr9+192+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1810 = _mm512_shuffle_f32x4(sf899, sf900, 68);
__m512 in1811 = _mm512_shuffle_f32x4(sf899, sf900, 238);
__m512 sf901 = _mm512_loadu_ps(sfPtr9+204800+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf902 = _mm512_loadu_ps(sfPtr9+204928+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1804 = _mm512_shuffle_f32x4(sf901, sf902, 68);
__m512 in1805 = _mm512_shuffle_f32x4(sf901, sf902, 238);
__m512 sf903 = _mm512_loadu_ps(sfPtr9+204864+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf904 = _mm512_loadu_ps(sfPtr9+204992+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1812 = _mm512_shuffle_f32x4(sf903, sf904, 68);
__m512 in1813 = _mm512_shuffle_f32x4(sf903, sf904, 238);
__m512 sf905 = _mm512_loadu_ps(sfPtr9+409600+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf906 = _mm512_loadu_ps(sfPtr9+409728+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1806 = _mm512_shuffle_f32x4(sf905, sf906, 68);
__m512 in1807 = _mm512_shuffle_f32x4(sf905, sf906, 238);
__m512 sf907 = _mm512_loadu_ps(sfPtr9+409664+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf908 = _mm512_loadu_ps(sfPtr9+409792+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1814 = _mm512_shuffle_f32x4(sf907, sf908, 68);
__m512 in1815 = _mm512_shuffle_f32x4(sf907, sf908, 238);
__m512 sf909 = _mm512_loadu_ps(sfPtr9+614400+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf910 = _mm512_loadu_ps(sfPtr9+614528+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1808 = _mm512_shuffle_f32x4(sf909, sf910, 68);
__m512 in1809 = _mm512_shuffle_f32x4(sf909, sf910, 238);
__m512 sf911 = _mm512_loadu_ps(sfPtr9+614464+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf912 = _mm512_loadu_ps(sfPtr9+614592+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1816 = _mm512_shuffle_f32x4(sf911, sf912, 68);
__m512 in1817 = _mm512_shuffle_f32x4(sf911, sf912, 238);
__m512 tmp12519 = _mm512_add_ps(in1803, in1804);
__m512 tmp12539 = _mm512_add_ps(in1811, in1812);
__m512 tmp12518 = _mm512_add_ps(in1805, in1806);
__m512 tmp12538 = _mm512_add_ps(in1813, in1814);
__m512 tmp12524 = _mm512_sub_ps(in1805, in1806);
__m512 tmp12544 = _mm512_sub_ps(in1813, in1814);
__m512 tmp12523 = _mm512_sub_ps(in1803, in1804);
__m512 tmp12543 = _mm512_sub_ps(in1811, in1812);
__m512 tmp12520 = _mm512_add_ps(in1807, in1808);
__m512 tmp12540 = _mm512_add_ps(in1815, in1816);
__m512 tmp12525 = _mm512_sub_ps(in1807, in1808);
__m512 tmp12545 = _mm512_sub_ps(in1815, in1816);
__m512 tmp12522 = _mm512_fmadd_ps(tmp12524, _mm512_set1_ps(2e+00f), tmp12523);
__m512 tmp12542 = _mm512_fmadd_ps(tmp12544, _mm512_set1_ps(2e+00f), tmp12543);
__m512 tmp12529 = _mm512_fmadd_ps(tmp12524, _mm512_set1_ps(8e+00f), tmp12523);
__m512 tmp12549 = _mm512_fmadd_ps(tmp12544, _mm512_set1_ps(8e+00f), tmp12543);
__m512 tmp12517 = _mm512_add_ps(tmp12518, tmp12519);
__m512 tmp12537 = _mm512_add_ps(tmp12538, tmp12539);
__m512 tmp12521 = _mm512_fmadd_ps(tmp12525, _mm512_set1_ps(1.6e+01f), tmp12522);
__m512 tmp12541 = _mm512_fmadd_ps(tmp12545, _mm512_set1_ps(1.6e+01f), tmp12542);
__m512 tmp12528 = _mm512_fmadd_ps(tmp12525, _mm512_set1_ps(4e+00f), tmp12529);
__m512 tmp12548 = _mm512_fmadd_ps(tmp12545, _mm512_set1_ps(4e+00f), tmp12549);
__m512 tmp12534 = _mm512_add_ps(tmp12525, tmp12523);
__m512 tmp12554 = _mm512_add_ps(tmp12545, tmp12543);
__m512 tmp12527 = _mm512_fmadd_ps(tmp12518, _mm512_set1_ps(4e+00f), tmp12519);
__m512 tmp12547 = _mm512_fmadd_ps(tmp12538, _mm512_set1_ps(4e+00f), tmp12539);
__m512 tmp12531 = _mm512_fmadd_ps(tmp12518, _mm512_set1_ps(1.6e+01f), tmp12519);
__m512 tmp12551 = _mm512_fmadd_ps(tmp12538, _mm512_set1_ps(1.6e+01f), tmp12539);
__m512 tmp12516 = _mm512_add_ps(tmp12517, in1802);
__m512 tmp12536 = _mm512_add_ps(tmp12537, in1810);
__m512 tmp12533 = _mm512_add_ps(tmp12534, in1809);
__m512 tmp12553 = _mm512_add_ps(tmp12554, in1817);
__m512 tmp12515 = _mm512_fmadd_ps(tmp12520, _mm512_set1_ps(3.2e+01f), tmp12516);
__m512 tmp12535 = _mm512_fmadd_ps(tmp12540, _mm512_set1_ps(3.2e+01f), tmp12536);
__m512 tmp12526 = _mm512_fmadd_ps(tmp12520, _mm512_set1_ps(8e+00f), tmp12527);
__m512 tmp12546 = _mm512_fmadd_ps(tmp12540, _mm512_set1_ps(8e+00f), tmp12547);
__m512 tmp12532 = _mm512_fmadd_ps(tmp12524, _mm512_set1_ps(3.2e+01f), tmp12533);
__m512 tmp12552 = _mm512_fmadd_ps(tmp12544, _mm512_set1_ps(3.2e+01f), tmp12553);
__m512 tmp12530 = _mm512_fmadd_ps(tmp12520, _mm512_set1_ps(2e+00f), tmp12531);
__m512 tmp12550 = _mm512_fmadd_ps(tmp12540, _mm512_set1_ps(2e+00f), tmp12551);
__m512 tmp12503 = tmp12515;
__m512 tmp12509 = tmp12535;
__m512 tmp12504 = tmp12521;
__m512 tmp12510 = tmp12541;
__m512 tmp12505 = tmp12526;
__m512 tmp12511 = tmp12546;
__m512 tmp12506 = tmp12528;
__m512 tmp12512 = tmp12548;
__m512 tmp12507 = tmp12530;
__m512 tmp12513 = tmp12550;
__m512 tmp12508 = tmp12532;
__m512 tmp12514 = tmp12552;
__m512 tmp12599 = _mm512_unpacklo_ps(tmp12503, tmp12504);
__m512 tmp12600 = _mm512_unpackhi_ps(tmp12503, tmp12504);
__m512 tmp12601 = _mm512_unpacklo_ps(tmp12505, tmp12506);
__m512 tmp12602 = _mm512_unpackhi_ps(tmp12505, tmp12506);
__m512 tmp12603 = _mm512_unpacklo_ps(tmp12507, tmp12508);
__m512 tmp12604 = _mm512_unpackhi_ps(tmp12507, tmp12508);
__m512 tmp12605 = _mm512_unpacklo_ps(tmp12509, tmp12510);
__m512 tmp12606 = _mm512_unpackhi_ps(tmp12509, tmp12510);
__m512 tmp12607 = _mm512_unpacklo_ps(tmp12511, tmp12512);
__m512 tmp12608 = _mm512_unpackhi_ps(tmp12511, tmp12512);
__m512 tmp12609 = _mm512_unpacklo_ps(tmp12513, tmp12514);
__m512 tmp12610 = _mm512_unpackhi_ps(tmp12513, tmp12514);
__m512 tmp12611 = _mm512_shuffle_ps(tmp12599, tmp12601, 68);
__m512 tmp12612 = _mm512_shuffle_ps(tmp12599, tmp12601, 238);
__m512 tmp12613 = _mm512_shuffle_ps(tmp12600, tmp12602, 68);
__m512 tmp12614 = _mm512_shuffle_ps(tmp12600, tmp12602, 238);
__m512 tmp12615 = _mm512_shuffle_ps(tmp12603, tmp12605, 68);
__m512 tmp12616 = _mm512_shuffle_ps(tmp12603, tmp12605, 238);
__m512 tmp12617 = _mm512_shuffle_ps(tmp12604, tmp12606, 68);
__m512 tmp12618 = _mm512_shuffle_ps(tmp12604, tmp12606, 238);
__m512 tmp12619 = _mm512_shuffle_ps(tmp12607, tmp12609, 68);
__m512 tmp12620 = _mm512_shuffle_ps(tmp12607, tmp12609, 238);
__m512 tmp12621 = _mm512_shuffle_ps(tmp12608, tmp12610, 68);
__m512 tmp12622 = _mm512_shuffle_ps(tmp12608, tmp12610, 238);
__m512 tmp12623 = _mm512_shuffle_f32x4(tmp12611, tmp12615, 136);
__m512 tmp12624 = _mm512_shuffle_f32x4(tmp12611, tmp12615, 221);
__m512 tmp12625 = _mm512_shuffle_f32x4(tmp12612, tmp12616, 136);
__m512 tmp12626 = _mm512_shuffle_f32x4(tmp12612, tmp12616, 221);
__m512 tmp12627 = _mm512_shuffle_f32x4(tmp12613, tmp12617, 136);
__m512 tmp12628 = _mm512_shuffle_f32x4(tmp12613, tmp12617, 221);
__m512 tmp12629 = _mm512_shuffle_f32x4(tmp12614, tmp12618, 136);
__m512 tmp12630 = _mm512_shuffle_f32x4(tmp12614, tmp12618, 221);
__m512 tmp12631 = _mm512_shuffle_f32x4(tmp12619, tmp12619, 136);
__m512 tmp12632 = _mm512_shuffle_f32x4(tmp12619, tmp12619, 221);
__m512 tmp12633 = _mm512_shuffle_f32x4(tmp12620, tmp12620, 136);
__m512 tmp12634 = _mm512_shuffle_f32x4(tmp12620, tmp12620, 221);
__m512 tmp12635 = _mm512_shuffle_f32x4(tmp12621, tmp12621, 136);
__m512 tmp12636 = _mm512_shuffle_f32x4(tmp12621, tmp12621, 221);
__m512 tmp12637 = _mm512_shuffle_f32x4(tmp12622, tmp12622, 136);
__m512 tmp12638 = _mm512_shuffle_f32x4(tmp12622, tmp12622, 221);
tmp12503 = _mm512_shuffle_f32x4(tmp12623, tmp12631, 136);
tmp12511 = _mm512_shuffle_f32x4(tmp12623, tmp12631, 221);
tmp12504 = _mm512_shuffle_f32x4(tmp12625, tmp12633, 136);
tmp12512 = _mm512_shuffle_f32x4(tmp12625, tmp12633, 221);
tmp12505 = _mm512_shuffle_f32x4(tmp12627, tmp12635, 136);
tmp12513 = _mm512_shuffle_f32x4(tmp12627, tmp12635, 221);
tmp12506 = _mm512_shuffle_f32x4(tmp12629, tmp12637, 136);
tmp12514 = _mm512_shuffle_f32x4(tmp12629, tmp12637, 221);
tmp12507 = _mm512_shuffle_f32x4(tmp12624, tmp12632, 136);
__m512 tmp12555 = _mm512_shuffle_f32x4(tmp12624, tmp12632, 221);
tmp12508 = _mm512_shuffle_f32x4(tmp12626, tmp12634, 136);
__m512 tmp12556 = _mm512_shuffle_f32x4(tmp12626, tmp12634, 221);
tmp12509 = _mm512_shuffle_f32x4(tmp12628, tmp12636, 136);
__m512 tmp12557 = _mm512_shuffle_f32x4(tmp12628, tmp12636, 221);
tmp12510 = _mm512_shuffle_f32x4(tmp12630, tmp12638, 136);
__m512 tmp12558 = _mm512_shuffle_f32x4(tmp12630, tmp12638, 221);
__m512 tmp12563 = _mm512_add_ps(tmp12504, tmp12505);
__m512 tmp12583 = _mm512_add_ps(tmp12512, tmp12513);
__m512 tmp12562 = _mm512_add_ps(tmp12506, tmp12507);
__m512 tmp12582 = _mm512_add_ps(tmp12514, tmp12555);
__m512 tmp12568 = _mm512_sub_ps(tmp12506, tmp12507);
__m512 tmp12588 = _mm512_sub_ps(tmp12514, tmp12555);
__m512 tmp12567 = _mm512_sub_ps(tmp12504, tmp12505);
__m512 tmp12587 = _mm512_sub_ps(tmp12512, tmp12513);
__m512 tmp12564 = _mm512_add_ps(tmp12508, tmp12509);
__m512 tmp12584 = _mm512_add_ps(tmp12556, tmp12557);
__m512 tmp12569 = _mm512_sub_ps(tmp12508, tmp12509);
__m512 tmp12589 = _mm512_sub_ps(tmp12556, tmp12557);
__m512 tmp12566 = _mm512_fmadd_ps(tmp12568, _mm512_set1_ps(2e+00f), tmp12567);
__m512 tmp12586 = _mm512_fmadd_ps(tmp12588, _mm512_set1_ps(2e+00f), tmp12587);
__m512 tmp12573 = _mm512_fmadd_ps(tmp12568, _mm512_set1_ps(8e+00f), tmp12567);
__m512 tmp12593 = _mm512_fmadd_ps(tmp12588, _mm512_set1_ps(8e+00f), tmp12587);
__m512 tmp12561 = _mm512_add_ps(tmp12562, tmp12563);
__m512 tmp12581 = _mm512_add_ps(tmp12582, tmp12583);
__m512 tmp12565 = _mm512_fmadd_ps(tmp12569, _mm512_set1_ps(1.6e+01f), tmp12566);
__m512 tmp12585 = _mm512_fmadd_ps(tmp12589, _mm512_set1_ps(1.6e+01f), tmp12586);
__m512 tmp12572 = _mm512_fmadd_ps(tmp12569, _mm512_set1_ps(4e+00f), tmp12573);
__m512 tmp12592 = _mm512_fmadd_ps(tmp12589, _mm512_set1_ps(4e+00f), tmp12593);
__m512 tmp12578 = _mm512_add_ps(tmp12569, tmp12567);
__m512 tmp12598 = _mm512_add_ps(tmp12589, tmp12587);
__m512 tmp12571 = _mm512_fmadd_ps(tmp12562, _mm512_set1_ps(4e+00f), tmp12563);
__m512 tmp12591 = _mm512_fmadd_ps(tmp12582, _mm512_set1_ps(4e+00f), tmp12583);
__m512 tmp12575 = _mm512_fmadd_ps(tmp12562, _mm512_set1_ps(1.6e+01f), tmp12563);
__m512 tmp12595 = _mm512_fmadd_ps(tmp12582, _mm512_set1_ps(1.6e+01f), tmp12583);
__m512 tmp12560 = _mm512_add_ps(tmp12561, tmp12503);
__m512 tmp12580 = _mm512_add_ps(tmp12581, tmp12511);
__m512 tmp12577 = _mm512_add_ps(tmp12578, tmp12510);
__m512 tmp12597 = _mm512_add_ps(tmp12598, tmp12558);
__m512 tmp12559 = _mm512_fmadd_ps(tmp12564, _mm512_set1_ps(3.2e+01f), tmp12560);
__m512 tmp12579 = _mm512_fmadd_ps(tmp12584, _mm512_set1_ps(3.2e+01f), tmp12580);
__m512 tmp12570 = _mm512_fmadd_ps(tmp12564, _mm512_set1_ps(8e+00f), tmp12571);
__m512 tmp12590 = _mm512_fmadd_ps(tmp12584, _mm512_set1_ps(8e+00f), tmp12591);
__m512 tmp12576 = _mm512_fmadd_ps(tmp12568, _mm512_set1_ps(3.2e+01f), tmp12577);
__m512 tmp12596 = _mm512_fmadd_ps(tmp12588, _mm512_set1_ps(3.2e+01f), tmp12597);
__m512 tmp12574 = _mm512_fmadd_ps(tmp12564, _mm512_set1_ps(2e+00f), tmp12575);
__m512 tmp12594 = _mm512_fmadd_ps(tmp12584, _mm512_set1_ps(2e+00f), tmp12595);
__m512 out1671 = tmp12559;
__m512 out1677 = tmp12579;
__m512 out1672 = tmp12565;
__m512 out1678 = tmp12585;
__m512 out1673 = tmp12570;
__m512 out1679 = tmp12590;
__m512 out1674 = tmp12572;
__m512 out1680 = tmp12592;
__m512 out1675 = tmp12574;
__m512 out1681 = tmp12594;
__m512 out1676 = tmp12576;
__m512 out1682 = tmp12596;
out1671 = _mm512_max_ps(_mm512_setzero_ps(), out1671);
out1677 = _mm512_max_ps(_mm512_setzero_ps(), out1677);
out1672 = _mm512_max_ps(_mm512_setzero_ps(), out1672);
out1678 = _mm512_max_ps(_mm512_setzero_ps(), out1678);
out1673 = _mm512_max_ps(_mm512_setzero_ps(), out1673);
out1679 = _mm512_max_ps(_mm512_setzero_ps(), out1679);
out1674 = _mm512_max_ps(_mm512_setzero_ps(), out1674);
out1680 = _mm512_max_ps(_mm512_setzero_ps(), out1680);
out1675 = _mm512_max_ps(_mm512_setzero_ps(), out1675);
out1681 = _mm512_max_ps(_mm512_setzero_ps(), out1681);
out1676 = _mm512_max_ps(_mm512_setzero_ps(), out1676);
out1682 = _mm512_max_ps(_mm512_setzero_ps(), out1682);
_mm512_mask_storeu_ps(datPtr17+0+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1671);
_mm512_mask_storeu_ps(datPtr17+48+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 15, out1677);
_mm512_mask_storeu_ps(datPtr17+600+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4032, out1677);
_mm512_mask_storeu_ps(datPtr17+112+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1672);
_mm512_mask_storeu_ps(datPtr17+160+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 15, out1678);
_mm512_mask_storeu_ps(datPtr17+712+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4032, out1678);
_mm512_mask_storeu_ps(datPtr17+224+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1673);
_mm512_mask_storeu_ps(datPtr17+272+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 15, out1679);
_mm512_mask_storeu_ps(datPtr17+824+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4032, out1679);
_mm512_mask_storeu_ps(datPtr17+336+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1674);
_mm512_mask_storeu_ps(datPtr17+384+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 15, out1680);
_mm512_mask_storeu_ps(datPtr17+936+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4032, out1680);
_mm512_mask_storeu_ps(datPtr17+448+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1675);
_mm512_mask_storeu_ps(datPtr17+496+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 15, out1681);
_mm512_mask_storeu_ps(datPtr17+1048+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4032, out1681);
_mm512_mask_storeu_ps(datPtr17+560+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1676);
_mm512_mask_storeu_ps(datPtr17+608+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 15, out1682);
_mm512_mask_storeu_ps(datPtr17+1160+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4032, out1682);
// Load / transform / store group for the sfPtr9 slots at +256, +320, +384, +448
// and the same four slots shifted by 204800, 409600 and 614400.
// Each pair of 16-float loads is merged with _mm512_shuffle_f32x4:
// immediate 68 keeps 128-bit lanes 0-1 of both operands, 238 keeps lanes 2-3.
// This yields two sets of eight input vectors: in1818..in1825 and in1826..in1833.
__m512 sf913 = _mm512_loadu_ps(sfPtr9+256+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf914 = _mm512_loadu_ps(sfPtr9+384+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1818 = _mm512_shuffle_f32x4(sf913, sf914, 68);
__m512 in1819 = _mm512_shuffle_f32x4(sf913, sf914, 238);
__m512 sf915 = _mm512_loadu_ps(sfPtr9+320+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf916 = _mm512_loadu_ps(sfPtr9+448+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1826 = _mm512_shuffle_f32x4(sf915, sf916, 68);
__m512 in1827 = _mm512_shuffle_f32x4(sf915, sf916, 238);
__m512 sf917 = _mm512_loadu_ps(sfPtr9+205056+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf918 = _mm512_loadu_ps(sfPtr9+205184+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1820 = _mm512_shuffle_f32x4(sf917, sf918, 68);
__m512 in1821 = _mm512_shuffle_f32x4(sf917, sf918, 238);
__m512 sf919 = _mm512_loadu_ps(sfPtr9+205120+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf920 = _mm512_loadu_ps(sfPtr9+205248+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1828 = _mm512_shuffle_f32x4(sf919, sf920, 68);
__m512 in1829 = _mm512_shuffle_f32x4(sf919, sf920, 238);
__m512 sf921 = _mm512_loadu_ps(sfPtr9+409856+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf922 = _mm512_loadu_ps(sfPtr9+409984+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1822 = _mm512_shuffle_f32x4(sf921, sf922, 68);
__m512 in1823 = _mm512_shuffle_f32x4(sf921, sf922, 238);
__m512 sf923 = _mm512_loadu_ps(sfPtr9+409920+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf924 = _mm512_loadu_ps(sfPtr9+410048+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1830 = _mm512_shuffle_f32x4(sf923, sf924, 68);
__m512 in1831 = _mm512_shuffle_f32x4(sf923, sf924, 238);
__m512 sf925 = _mm512_loadu_ps(sfPtr9+614656+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf926 = _mm512_loadu_ps(sfPtr9+614784+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1824 = _mm512_shuffle_f32x4(sf925, sf926, 68);
__m512 in1825 = _mm512_shuffle_f32x4(sf925, sf926, 238);
__m512 sf927 = _mm512_loadu_ps(sfPtr9+614720+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf928 = _mm512_loadu_ps(sfPtr9+614848+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1832 = _mm512_shuffle_f32x4(sf927, sf928, 68);
__m512 in1833 = _mm512_shuffle_f32x4(sf927, sf928, 238);
// First pass: reduce the eight inputs x0..x7 of each set to six vectors.
// With s12=x1+x2, d12=x1-x2, s34=x3+x4, d34=x3-x4, s56=x5+x6, d56=x5-x6:
//   r0 = x0 + s12 +    s34 + 32*s56
//   r1 =      d12 +  2*d34 + 16*d56
//   r2 =      s12 +  4*s34 +  8*s56
//   r3 =      d12 +  8*d34 +  4*d56
//   r4 =      s12 + 16*s34 +  2*s56
//   r5 =      d12 + 32*d34 +    d56 + x7
// The statements alternate between the in1818.. set and the in1826.. set.
// NOTE(review): the coefficients look like a Winograd six-output inverse
// transform; confirm against the generator.
__m512 tmp12655 = _mm512_add_ps(in1819, in1820);
__m512 tmp12675 = _mm512_add_ps(in1827, in1828);
__m512 tmp12654 = _mm512_add_ps(in1821, in1822);
__m512 tmp12674 = _mm512_add_ps(in1829, in1830);
__m512 tmp12660 = _mm512_sub_ps(in1821, in1822);
__m512 tmp12680 = _mm512_sub_ps(in1829, in1830);
__m512 tmp12659 = _mm512_sub_ps(in1819, in1820);
__m512 tmp12679 = _mm512_sub_ps(in1827, in1828);
__m512 tmp12656 = _mm512_add_ps(in1823, in1824);
__m512 tmp12676 = _mm512_add_ps(in1831, in1832);
__m512 tmp12661 = _mm512_sub_ps(in1823, in1824);
__m512 tmp12681 = _mm512_sub_ps(in1831, in1832);
__m512 tmp12658 = _mm512_fmadd_ps(tmp12660, _mm512_set1_ps(2e+00f), tmp12659);
__m512 tmp12678 = _mm512_fmadd_ps(tmp12680, _mm512_set1_ps(2e+00f), tmp12679);
__m512 tmp12665 = _mm512_fmadd_ps(tmp12660, _mm512_set1_ps(8e+00f), tmp12659);
__m512 tmp12685 = _mm512_fmadd_ps(tmp12680, _mm512_set1_ps(8e+00f), tmp12679);
__m512 tmp12653 = _mm512_add_ps(tmp12654, tmp12655);
__m512 tmp12673 = _mm512_add_ps(tmp12674, tmp12675);
__m512 tmp12657 = _mm512_fmadd_ps(tmp12661, _mm512_set1_ps(1.6e+01f), tmp12658);
__m512 tmp12677 = _mm512_fmadd_ps(tmp12681, _mm512_set1_ps(1.6e+01f), tmp12678);
__m512 tmp12664 = _mm512_fmadd_ps(tmp12661, _mm512_set1_ps(4e+00f), tmp12665);
__m512 tmp12684 = _mm512_fmadd_ps(tmp12681, _mm512_set1_ps(4e+00f), tmp12685);
__m512 tmp12670 = _mm512_add_ps(tmp12661, tmp12659);
__m512 tmp12690 = _mm512_add_ps(tmp12681, tmp12679);
__m512 tmp12663 = _mm512_fmadd_ps(tmp12654, _mm512_set1_ps(4e+00f), tmp12655);
__m512 tmp12683 = _mm512_fmadd_ps(tmp12674, _mm512_set1_ps(4e+00f), tmp12675);
__m512 tmp12667 = _mm512_fmadd_ps(tmp12654, _mm512_set1_ps(1.6e+01f), tmp12655);
__m512 tmp12687 = _mm512_fmadd_ps(tmp12674, _mm512_set1_ps(1.6e+01f), tmp12675);
__m512 tmp12652 = _mm512_add_ps(tmp12653, in1818);
__m512 tmp12672 = _mm512_add_ps(tmp12673, in1826);
__m512 tmp12669 = _mm512_add_ps(tmp12670, in1825);
__m512 tmp12689 = _mm512_add_ps(tmp12690, in1833);
__m512 tmp12651 = _mm512_fmadd_ps(tmp12656, _mm512_set1_ps(3.2e+01f), tmp12652);
__m512 tmp12671 = _mm512_fmadd_ps(tmp12676, _mm512_set1_ps(3.2e+01f), tmp12672);
__m512 tmp12662 = _mm512_fmadd_ps(tmp12656, _mm512_set1_ps(8e+00f), tmp12663);
__m512 tmp12682 = _mm512_fmadd_ps(tmp12676, _mm512_set1_ps(8e+00f), tmp12683);
__m512 tmp12668 = _mm512_fmadd_ps(tmp12660, _mm512_set1_ps(3.2e+01f), tmp12669);
__m512 tmp12688 = _mm512_fmadd_ps(tmp12680, _mm512_set1_ps(3.2e+01f), tmp12689);
__m512 tmp12666 = _mm512_fmadd_ps(tmp12656, _mm512_set1_ps(2e+00f), tmp12667);
__m512 tmp12686 = _mm512_fmadd_ps(tmp12676, _mm512_set1_ps(2e+00f), tmp12687);
// Gather r0..r5 of the first set into tmp12639..tmp12644 and of the second
// set into tmp12645..tmp12650, in that order.
__m512 tmp12639 = tmp12651;
__m512 tmp12645 = tmp12671;
__m512 tmp12640 = tmp12657;
__m512 tmp12646 = tmp12677;
__m512 tmp12641 = tmp12662;
__m512 tmp12647 = tmp12682;
__m512 tmp12642 = tmp12664;
__m512 tmp12648 = tmp12684;
__m512 tmp12643 = tmp12666;
__m512 tmp12649 = tmp12686;
__m512 tmp12644 = tmp12668;
__m512 tmp12650 = tmp12688;
// Rearrange the twelve vectors: 32-bit interleave (unpacklo/unpackhi),
// then 64-bit pair selection (shuffle_ps 68/238), then 128-bit lane
// selection (shuffle_f32x4 136/221).
// NOTE(review): presumably a transpose so the second pass can run along the
// other tile axis; verify against the generator.
__m512 tmp12735 = _mm512_unpacklo_ps(tmp12639, tmp12640);
__m512 tmp12736 = _mm512_unpackhi_ps(tmp12639, tmp12640);
__m512 tmp12737 = _mm512_unpacklo_ps(tmp12641, tmp12642);
__m512 tmp12738 = _mm512_unpackhi_ps(tmp12641, tmp12642);
__m512 tmp12739 = _mm512_unpacklo_ps(tmp12643, tmp12644);
__m512 tmp12740 = _mm512_unpackhi_ps(tmp12643, tmp12644);
__m512 tmp12741 = _mm512_unpacklo_ps(tmp12645, tmp12646);
__m512 tmp12742 = _mm512_unpackhi_ps(tmp12645, tmp12646);
__m512 tmp12743 = _mm512_unpacklo_ps(tmp12647, tmp12648);
__m512 tmp12744 = _mm512_unpackhi_ps(tmp12647, tmp12648);
__m512 tmp12745 = _mm512_unpacklo_ps(tmp12649, tmp12650);
__m512 tmp12746 = _mm512_unpackhi_ps(tmp12649, tmp12650);
__m512 tmp12747 = _mm512_shuffle_ps(tmp12735, tmp12737, 68);
__m512 tmp12748 = _mm512_shuffle_ps(tmp12735, tmp12737, 238);
__m512 tmp12749 = _mm512_shuffle_ps(tmp12736, tmp12738, 68);
__m512 tmp12750 = _mm512_shuffle_ps(tmp12736, tmp12738, 238);
__m512 tmp12751 = _mm512_shuffle_ps(tmp12739, tmp12741, 68);
__m512 tmp12752 = _mm512_shuffle_ps(tmp12739, tmp12741, 238);
__m512 tmp12753 = _mm512_shuffle_ps(tmp12740, tmp12742, 68);
__m512 tmp12754 = _mm512_shuffle_ps(tmp12740, tmp12742, 238);
__m512 tmp12755 = _mm512_shuffle_ps(tmp12743, tmp12745, 68);
__m512 tmp12756 = _mm512_shuffle_ps(tmp12743, tmp12745, 238);
__m512 tmp12757 = _mm512_shuffle_ps(tmp12744, tmp12746, 68);
__m512 tmp12758 = _mm512_shuffle_ps(tmp12744, tmp12746, 238);
__m512 tmp12759 = _mm512_shuffle_f32x4(tmp12747, tmp12751, 136);
__m512 tmp12760 = _mm512_shuffle_f32x4(tmp12747, tmp12751, 221);
__m512 tmp12761 = _mm512_shuffle_f32x4(tmp12748, tmp12752, 136);
__m512 tmp12762 = _mm512_shuffle_f32x4(tmp12748, tmp12752, 221);
__m512 tmp12763 = _mm512_shuffle_f32x4(tmp12749, tmp12753, 136);
__m512 tmp12764 = _mm512_shuffle_f32x4(tmp12749, tmp12753, 221);
__m512 tmp12765 = _mm512_shuffle_f32x4(tmp12750, tmp12754, 136);
__m512 tmp12766 = _mm512_shuffle_f32x4(tmp12750, tmp12754, 221);
// These eight use the same vector for both operands, so only that one
// source contributes to each result.
__m512 tmp12767 = _mm512_shuffle_f32x4(tmp12755, tmp12755, 136);
__m512 tmp12768 = _mm512_shuffle_f32x4(tmp12755, tmp12755, 221);
__m512 tmp12769 = _mm512_shuffle_f32x4(tmp12756, tmp12756, 136);
__m512 tmp12770 = _mm512_shuffle_f32x4(tmp12756, tmp12756, 221);
__m512 tmp12771 = _mm512_shuffle_f32x4(tmp12757, tmp12757, 136);
__m512 tmp12772 = _mm512_shuffle_f32x4(tmp12757, tmp12757, 221);
__m512 tmp12773 = _mm512_shuffle_f32x4(tmp12758, tmp12758, 136);
__m512 tmp12774 = _mm512_shuffle_f32x4(tmp12758, tmp12758, 221);
// Final shuffle level: twelve results overwrite tmp12639..tmp12650 and four
// more are declared as tmp12691..tmp12694, giving sixteen vectors in total.
tmp12639 = _mm512_shuffle_f32x4(tmp12759, tmp12767, 136);
tmp12647 = _mm512_shuffle_f32x4(tmp12759, tmp12767, 221);
tmp12640 = _mm512_shuffle_f32x4(tmp12761, tmp12769, 136);
tmp12648 = _mm512_shuffle_f32x4(tmp12761, tmp12769, 221);
tmp12641 = _mm512_shuffle_f32x4(tmp12763, tmp12771, 136);
tmp12649 = _mm512_shuffle_f32x4(tmp12763, tmp12771, 221);
tmp12642 = _mm512_shuffle_f32x4(tmp12765, tmp12773, 136);
tmp12650 = _mm512_shuffle_f32x4(tmp12765, tmp12773, 221);
tmp12643 = _mm512_shuffle_f32x4(tmp12760, tmp12768, 136);
__m512 tmp12691 = _mm512_shuffle_f32x4(tmp12760, tmp12768, 221);
tmp12644 = _mm512_shuffle_f32x4(tmp12762, tmp12770, 136);
__m512 tmp12692 = _mm512_shuffle_f32x4(tmp12762, tmp12770, 221);
tmp12645 = _mm512_shuffle_f32x4(tmp12764, tmp12772, 136);
__m512 tmp12693 = _mm512_shuffle_f32x4(tmp12764, tmp12772, 221);
tmp12646 = _mm512_shuffle_f32x4(tmp12766, tmp12774, 136);
__m512 tmp12694 = _mm512_shuffle_f32x4(tmp12766, tmp12774, 221);
// Second pass: the same six-from-eight combination as the first pass.
// One set uses tmp12639..tmp12646 as x0..x7; the other uses
// tmp12647..tmp12650 followed by tmp12691..tmp12694.
__m512 tmp12699 = _mm512_add_ps(tmp12640, tmp12641);
__m512 tmp12719 = _mm512_add_ps(tmp12648, tmp12649);
__m512 tmp12698 = _mm512_add_ps(tmp12642, tmp12643);
__m512 tmp12718 = _mm512_add_ps(tmp12650, tmp12691);
__m512 tmp12704 = _mm512_sub_ps(tmp12642, tmp12643);
__m512 tmp12724 = _mm512_sub_ps(tmp12650, tmp12691);
__m512 tmp12703 = _mm512_sub_ps(tmp12640, tmp12641);
__m512 tmp12723 = _mm512_sub_ps(tmp12648, tmp12649);
__m512 tmp12700 = _mm512_add_ps(tmp12644, tmp12645);
__m512 tmp12720 = _mm512_add_ps(tmp12692, tmp12693);
__m512 tmp12705 = _mm512_sub_ps(tmp12644, tmp12645);
__m512 tmp12725 = _mm512_sub_ps(tmp12692, tmp12693);
__m512 tmp12702 = _mm512_fmadd_ps(tmp12704, _mm512_set1_ps(2e+00f), tmp12703);
__m512 tmp12722 = _mm512_fmadd_ps(tmp12724, _mm512_set1_ps(2e+00f), tmp12723);
__m512 tmp12709 = _mm512_fmadd_ps(tmp12704, _mm512_set1_ps(8e+00f), tmp12703);
__m512 tmp12729 = _mm512_fmadd_ps(tmp12724, _mm512_set1_ps(8e+00f), tmp12723);
__m512 tmp12697 = _mm512_add_ps(tmp12698, tmp12699);
__m512 tmp12717 = _mm512_add_ps(tmp12718, tmp12719);
__m512 tmp12701 = _mm512_fmadd_ps(tmp12705, _mm512_set1_ps(1.6e+01f), tmp12702);
__m512 tmp12721 = _mm512_fmadd_ps(tmp12725, _mm512_set1_ps(1.6e+01f), tmp12722);
__m512 tmp12708 = _mm512_fmadd_ps(tmp12705, _mm512_set1_ps(4e+00f), tmp12709);
__m512 tmp12728 = _mm512_fmadd_ps(tmp12725, _mm512_set1_ps(4e+00f), tmp12729);
__m512 tmp12714 = _mm512_add_ps(tmp12705, tmp12703);
__m512 tmp12734 = _mm512_add_ps(tmp12725, tmp12723);
__m512 tmp12707 = _mm512_fmadd_ps(tmp12698, _mm512_set1_ps(4e+00f), tmp12699);
__m512 tmp12727 = _mm512_fmadd_ps(tmp12718, _mm512_set1_ps(4e+00f), tmp12719);
__m512 tmp12711 = _mm512_fmadd_ps(tmp12698, _mm512_set1_ps(1.6e+01f), tmp12699);
__m512 tmp12731 = _mm512_fmadd_ps(tmp12718, _mm512_set1_ps(1.6e+01f), tmp12719);
__m512 tmp12696 = _mm512_add_ps(tmp12697, tmp12639);
__m512 tmp12716 = _mm512_add_ps(tmp12717, tmp12647);
__m512 tmp12713 = _mm512_add_ps(tmp12714, tmp12646);
__m512 tmp12733 = _mm512_add_ps(tmp12734, tmp12694);
__m512 tmp12695 = _mm512_fmadd_ps(tmp12700, _mm512_set1_ps(3.2e+01f), tmp12696);
__m512 tmp12715 = _mm512_fmadd_ps(tmp12720, _mm512_set1_ps(3.2e+01f), tmp12716);
__m512 tmp12706 = _mm512_fmadd_ps(tmp12700, _mm512_set1_ps(8e+00f), tmp12707);
__m512 tmp12726 = _mm512_fmadd_ps(tmp12720, _mm512_set1_ps(8e+00f), tmp12727);
__m512 tmp12712 = _mm512_fmadd_ps(tmp12704, _mm512_set1_ps(3.2e+01f), tmp12713);
__m512 tmp12732 = _mm512_fmadd_ps(tmp12724, _mm512_set1_ps(3.2e+01f), tmp12733);
__m512 tmp12710 = _mm512_fmadd_ps(tmp12700, _mm512_set1_ps(2e+00f), tmp12711);
__m512 tmp12730 = _mm512_fmadd_ps(tmp12720, _mm512_set1_ps(2e+00f), tmp12731);
// Name the second-pass results: out1683..out1688 hold r0..r5 of the first
// set, out1689..out1694 hold r0..r5 of the second set.
__m512 out1683 = tmp12695;
__m512 out1689 = tmp12715;
__m512 out1684 = tmp12701;
__m512 out1690 = tmp12721;
__m512 out1685 = tmp12706;
__m512 out1691 = tmp12726;
__m512 out1686 = tmp12708;
__m512 out1692 = tmp12728;
__m512 out1687 = tmp12710;
__m512 out1693 = tmp12730;
__m512 out1688 = tmp12712;
__m512 out1694 = tmp12732;
// Lane-wise max against zero, so negative values become 0 (ReLU-style clamp).
out1683 = _mm512_max_ps(_mm512_setzero_ps(), out1683);
out1689 = _mm512_max_ps(_mm512_setzero_ps(), out1689);
out1684 = _mm512_max_ps(_mm512_setzero_ps(), out1684);
out1690 = _mm512_max_ps(_mm512_setzero_ps(), out1690);
out1685 = _mm512_max_ps(_mm512_setzero_ps(), out1685);
out1691 = _mm512_max_ps(_mm512_setzero_ps(), out1691);
out1686 = _mm512_max_ps(_mm512_setzero_ps(), out1686);
out1692 = _mm512_max_ps(_mm512_setzero_ps(), out1692);
out1687 = _mm512_max_ps(_mm512_setzero_ps(), out1687);
out1693 = _mm512_max_ps(_mm512_setzero_ps(), out1693);
out1688 = _mm512_max_ps(_mm512_setzero_ps(), out1688);
out1694 = _mm512_max_ps(_mm512_setzero_ps(), out1694);
// Masked stores to datPtr17: mask 4095 (0xFFF) writes only lanes 0-11.
// out1683..out1688 go to base offsets 648, 760, ... (step 112);
// out1689..out1694 go to base offsets 3136, 3248, ... (step 112).
// NOTE(review): the unit of these offsets depends on datPtr17's declared
// type, which is outside this view; confirm before changing any constant.
_mm512_mask_storeu_ps(datPtr17+648+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1683);
_mm512_mask_storeu_ps(datPtr17+3136+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1689);
_mm512_mask_storeu_ps(datPtr17+760+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1684);
_mm512_mask_storeu_ps(datPtr17+3248+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1690);
_mm512_mask_storeu_ps(datPtr17+872+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1685);
_mm512_mask_storeu_ps(datPtr17+3360+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1691);
_mm512_mask_storeu_ps(datPtr17+984+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1686);
_mm512_mask_storeu_ps(datPtr17+3472+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1692);
_mm512_mask_storeu_ps(datPtr17+1096+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1687);
_mm512_mask_storeu_ps(datPtr17+3584+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1693);
_mm512_mask_storeu_ps(datPtr17+1208+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1688);
_mm512_mask_storeu_ps(datPtr17+3696+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1694);
__m512 sf929 = _mm512_loadu_ps(sfPtr9+512+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf930 = _mm512_loadu_ps(sfPtr9+576+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1834 = _mm512_shuffle_f32x4(sf930, sf929, 68);
__m512 in1835 = _mm512_shuffle_f32x4(sf930, sf929, 238);
__m512 sf931 = _mm512_loadu_ps(sfPtr9+640+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf932 = _mm512_loadu_ps(sfPtr9+704+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1842 = _mm512_shuffle_f32x4(sf931, sf932, 68);
__m512 in1843 = _mm512_shuffle_f32x4(sf931, sf932, 238);
__m512 sf933 = _mm512_loadu_ps(sfPtr9+205312+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf934 = _mm512_loadu_ps(sfPtr9+205376+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1836 = _mm512_shuffle_f32x4(sf934, sf933, 68);
__m512 in1837 = _mm512_shuffle_f32x4(sf934, sf933, 238);
__m512 sf935 = _mm512_loadu_ps(sfPtr9+205440+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf936 = _mm512_loadu_ps(sfPtr9+205504+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1844 = _mm512_shuffle_f32x4(sf935, sf936, 68);
__m512 in1845 = _mm512_shuffle_f32x4(sf935, sf936, 238);
__m512 sf937 = _mm512_loadu_ps(sfPtr9+410112+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf938 = _mm512_loadu_ps(sfPtr9+410176+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1838 = _mm512_shuffle_f32x4(sf938, sf937, 68);
__m512 in1839 = _mm512_shuffle_f32x4(sf938, sf937, 238);
__m512 sf939 = _mm512_loadu_ps(sfPtr9+410240+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf940 = _mm512_loadu_ps(sfPtr9+410304+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1846 = _mm512_shuffle_f32x4(sf939, sf940, 68);
__m512 in1847 = _mm512_shuffle_f32x4(sf939, sf940, 238);
__m512 sf941 = _mm512_loadu_ps(sfPtr9+614912+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf942 = _mm512_loadu_ps(sfPtr9+614976+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1840 = _mm512_shuffle_f32x4(sf942, sf941, 68);
__m512 in1841 = _mm512_shuffle_f32x4(sf942, sf941, 238);
__m512 sf943 = _mm512_loadu_ps(sfPtr9+615040+819200*i37+49152*j30+1536*k113+768*l45);
__m512 sf944 = _mm512_loadu_ps(sfPtr9+615104+819200*i37+49152*j30+1536*k113+768*l45);
__m512 in1848 = _mm512_shuffle_f32x4(sf943, sf944, 68);
__m512 in1849 = _mm512_shuffle_f32x4(sf943, sf944, 238);
__m512 tmp12791 = _mm512_add_ps(in1835, in1836);
__m512 tmp12811 = _mm512_add_ps(in1843, in1844);
__m512 tmp12790 = _mm512_add_ps(in1837, in1838);
__m512 tmp12810 = _mm512_add_ps(in1845, in1846);
__m512 tmp12796 = _mm512_sub_ps(in1837, in1838);
__m512 tmp12816 = _mm512_sub_ps(in1845, in1846);
__m512 tmp12795 = _mm512_sub_ps(in1835, in1836);
__m512 tmp12815 = _mm512_sub_ps(in1843, in1844);
__m512 tmp12792 = _mm512_add_ps(in1839, in1840);
__m512 tmp12812 = _mm512_add_ps(in1847, in1848);
__m512 tmp12797 = _mm512_sub_ps(in1839, in1840);
__m512 tmp12817 = _mm512_sub_ps(in1847, in1848);
__m512 tmp12794 = _mm512_fmadd_ps(tmp12796, _mm512_set1_ps(2e+00f), tmp12795);
__m512 tmp12814 = _mm512_fmadd_ps(tmp12816, _mm512_set1_ps(2e+00f), tmp12815);
__m512 tmp12801 = _mm512_fmadd_ps(tmp12796, _mm512_set1_ps(8e+00f), tmp12795);
__m512 tmp12821 = _mm512_fmadd_ps(tmp12816, _mm512_set1_ps(8e+00f), tmp12815);
__m512 tmp12789 = _mm512_add_ps(tmp12790, tmp12791);
__m512 tmp12809 = _mm512_add_ps(tmp12810, tmp12811);
__m512 tmp12793 = _mm512_fmadd_ps(tmp12797, _mm512_set1_ps(1.6e+01f), tmp12794);
__m512 tmp12813 = _mm512_fmadd_ps(tmp12817, _mm512_set1_ps(1.6e+01f), tmp12814);
__m512 tmp12800 = _mm512_fmadd_ps(tmp12797, _mm512_set1_ps(4e+00f), tmp12801);
__m512 tmp12820 = _mm512_fmadd_ps(tmp12817, _mm512_set1_ps(4e+00f), tmp12821);
__m512 tmp12806 = _mm512_add_ps(tmp12797, tmp12795);
__m512 tmp12826 = _mm512_add_ps(tmp12817, tmp12815);
__m512 tmp12799 = _mm512_fmadd_ps(tmp12790, _mm512_set1_ps(4e+00f), tmp12791);
__m512 tmp12819 = _mm512_fmadd_ps(tmp12810, _mm512_set1_ps(4e+00f), tmp12811);
__m512 tmp12803 = _mm512_fmadd_ps(tmp12790, _mm512_set1_ps(1.6e+01f), tmp12791);
__m512 tmp12823 = _mm512_fmadd_ps(tmp12810, _mm512_set1_ps(1.6e+01f), tmp12811);
__m512 tmp12788 = _mm512_add_ps(tmp12789, in1834);
__m512 tmp12808 = _mm512_add_ps(tmp12809, in1842);
__m512 tmp12805 = _mm512_add_ps(tmp12806, in1841);
__m512 tmp12825 = _mm512_add_ps(tmp12826, in1849);
__m512 tmp12787 = _mm512_fmadd_ps(tmp12792, _mm512_set1_ps(3.2e+01f), tmp12788);
__m512 tmp12807 = _mm512_fmadd_ps(tmp12812, _mm512_set1_ps(3.2e+01f), tmp12808);
__m512 tmp12798 = _mm512_fmadd_ps(tmp12792, _mm512_set1_ps(8e+00f), tmp12799);
__m512 tmp12818 = _mm512_fmadd_ps(tmp12812, _mm512_set1_ps(8e+00f), tmp12819);
__m512 tmp12804 = _mm512_fmadd_ps(tmp12796, _mm512_set1_ps(3.2e+01f), tmp12805);
__m512 tmp12824 = _mm512_fmadd_ps(tmp12816, _mm512_set1_ps(3.2e+01f), tmp12825);
__m512 tmp12802 = _mm512_fmadd_ps(tmp12792, _mm512_set1_ps(2e+00f), tmp12803);
__m512 tmp12822 = _mm512_fmadd_ps(tmp12812, _mm512_set1_ps(2e+00f), tmp12823);
__m512 tmp12775 = tmp12787;
__m512 tmp12781 = tmp12807;
__m512 tmp12776 = tmp12793;
__m512 tmp12782 = tmp12813;
__m512 tmp12777 = tmp12798;
__m512 tmp12783 = tmp12818;
__m512 tmp12778 = tmp12800;
__m512 tmp12784 = tmp12820;
__m512 tmp12779 = tmp12802;
__m512 tmp12785 = tmp12822;
__m512 tmp12780 = tmp12804;
__m512 tmp12786 = tmp12824;
__m512 tmp12871 = _mm512_unpacklo_ps(tmp12775, tmp12776);
__m512 tmp12872 = _mm512_unpackhi_ps(tmp12775, tmp12776);
__m512 tmp12873 = _mm512_unpacklo_ps(tmp12777, tmp12778);
__m512 tmp12874 = _mm512_unpackhi_ps(tmp12777, tmp12778);
__m512 tmp12875 = _mm512_unpacklo_ps(tmp12779, tmp12780);
__m512 tmp12876 = _mm512_unpackhi_ps(tmp12779, tmp12780);
__m512 tmp12877 = _mm512_unpacklo_ps(tmp12781, tmp12782);
__m512 tmp12878 = _mm512_unpackhi_ps(tmp12781, tmp12782);
__m512 tmp12879 = _mm512_unpacklo_ps(tmp12783, tmp12784);
__m512 tmp12880 = _mm512_unpackhi_ps(tmp12783, tmp12784);
__m512 tmp12881 = _mm512_unpacklo_ps(tmp12785, tmp12786);
__m512 tmp12882 = _mm512_unpackhi_ps(tmp12785, tmp12786);
__m512 tmp12883 = _mm512_shuffle_ps(tmp12871, tmp12873, 68);
__m512 tmp12884 = _mm512_shuffle_ps(tmp12871, tmp12873, 238);
__m512 tmp12885 = _mm512_shuffle_ps(tmp12872, tmp12874, 68);
__m512 tmp12886 = _mm512_shuffle_ps(tmp12872, tmp12874, 238);
__m512 tmp12887 = _mm512_shuffle_ps(tmp12875, tmp12877, 68);
__m512 tmp12888 = _mm512_shuffle_ps(tmp12875, tmp12877, 238);
__m512 tmp12889 = _mm512_shuffle_ps(tmp12876, tmp12878, 68);
__m512 tmp12890 = _mm512_shuffle_ps(tmp12876, tmp12878, 238);
__m512 tmp12891 = _mm512_shuffle_ps(tmp12879, tmp12881, 68);
__m512 tmp12892 = _mm512_shuffle_ps(tmp12879, tmp12881, 238);
__m512 tmp12893 = _mm512_shuffle_ps(tmp12880, tmp12882, 68);
__m512 tmp12894 = _mm512_shuffle_ps(tmp12880, tmp12882, 238);
__m512 tmp12895 = _mm512_shuffle_f32x4(tmp12883, tmp12887, 136);
__m512 tmp12896 = _mm512_shuffle_f32x4(tmp12883, tmp12887, 221);
__m512 tmp12897 = _mm512_shuffle_f32x4(tmp12884, tmp12888, 136);
__m512 tmp12898 = _mm512_shuffle_f32x4(tmp12884, tmp12888, 221);
__m512 tmp12899 = _mm512_shuffle_f32x4(tmp12885, tmp12889, 136);
__m512 tmp12900 = _mm512_shuffle_f32x4(tmp12885, tmp12889, 221);
__m512 tmp12901 = _mm512_shuffle_f32x4(tmp12886, tmp12890, 136);
__m512 tmp12902 = _mm512_shuffle_f32x4(tmp12886, tmp12890, 221);
__m512 tmp12903 = _mm512_shuffle_f32x4(tmp12891, tmp12891, 136);
__m512 tmp12904 = _mm512_shuffle_f32x4(tmp12891, tmp12891, 221);
__m512 tmp12905 = _mm512_shuffle_f32x4(tmp12892, tmp12892, 136);
__m512 tmp12906 = _mm512_shuffle_f32x4(tmp12892, tmp12892, 221);
__m512 tmp12907 = _mm512_shuffle_f32x4(tmp12893, tmp12893, 136);
__m512 tmp12908 = _mm512_shuffle_f32x4(tmp12893, tmp12893, 221);
__m512 tmp12909 = _mm512_shuffle_f32x4(tmp12894, tmp12894, 136);
__m512 tmp12910 = _mm512_shuffle_f32x4(tmp12894, tmp12894, 221);
tmp12775 = _mm512_shuffle_f32x4(tmp12895, tmp12903, 136);
tmp12783 = _mm512_shuffle_f32x4(tmp12895, tmp12903, 221);
tmp12776 = _mm512_shuffle_f32x4(tmp12897, tmp12905, 136);
tmp12784 = _mm512_shuffle_f32x4(tmp12897, tmp12905, 221);
tmp12777 = _mm512_shuffle_f32x4(tmp12899, tmp12907, 136);
tmp12785 = _mm512_shuffle_f32x4(tmp12899, tmp12907, 221);
tmp12778 = _mm512_shuffle_f32x4(tmp12901, tmp12909, 136);
tmp12786 = _mm512_shuffle_f32x4(tmp12901, tmp12909, 221);
tmp12779 = _mm512_shuffle_f32x4(tmp12896, tmp12904, 136);
__m512 tmp12827 = _mm512_shuffle_f32x4(tmp12896, tmp12904, 221);
tmp12780 = _mm512_shuffle_f32x4(tmp12898, tmp12906, 136);
__m512 tmp12828 = _mm512_shuffle_f32x4(tmp12898, tmp12906, 221);
tmp12781 = _mm512_shuffle_f32x4(tmp12900, tmp12908, 136);
__m512 tmp12829 = _mm512_shuffle_f32x4(tmp12900, tmp12908, 221);
tmp12782 = _mm512_shuffle_f32x4(tmp12902, tmp12910, 136);
__m512 tmp12830 = _mm512_shuffle_f32x4(tmp12902, tmp12910, 221);
__m512 tmp12835 = _mm512_add_ps(tmp12776, tmp12777);
__m512 tmp12855 = _mm512_add_ps(tmp12784, tmp12785);
__m512 tmp12834 = _mm512_add_ps(tmp12778, tmp12779);
__m512 tmp12854 = _mm512_add_ps(tmp12786, tmp12827);
__m512 tmp12840 = _mm512_sub_ps(tmp12778, tmp12779);
__m512 tmp12860 = _mm512_sub_ps(tmp12786, tmp12827);
__m512 tmp12839 = _mm512_sub_ps(tmp12776, tmp12777);
__m512 tmp12859 = _mm512_sub_ps(tmp12784, tmp12785);
__m512 tmp12836 = _mm512_add_ps(tmp12780, tmp12781);
__m512 tmp12856 = _mm512_add_ps(tmp12828, tmp12829);
__m512 tmp12841 = _mm512_sub_ps(tmp12780, tmp12781);
__m512 tmp12861 = _mm512_sub_ps(tmp12828, tmp12829);
__m512 tmp12838 = _mm512_fmadd_ps(tmp12840, _mm512_set1_ps(2e+00f), tmp12839);
__m512 tmp12858 = _mm512_fmadd_ps(tmp12860, _mm512_set1_ps(2e+00f), tmp12859);
__m512 tmp12845 = _mm512_fmadd_ps(tmp12840, _mm512_set1_ps(8e+00f), tmp12839);
__m512 tmp12865 = _mm512_fmadd_ps(tmp12860, _mm512_set1_ps(8e+00f), tmp12859);
__m512 tmp12833 = _mm512_add_ps(tmp12834, tmp12835);
__m512 tmp12853 = _mm512_add_ps(tmp12854, tmp12855);
__m512 tmp12837 = _mm512_fmadd_ps(tmp12841, _mm512_set1_ps(1.6e+01f), tmp12838);
__m512 tmp12857 = _mm512_fmadd_ps(tmp12861, _mm512_set1_ps(1.6e+01f), tmp12858);
__m512 tmp12844 = _mm512_fmadd_ps(tmp12841, _mm512_set1_ps(4e+00f), tmp12845);
__m512 tmp12864 = _mm512_fmadd_ps(tmp12861, _mm512_set1_ps(4e+00f), tmp12865);
__m512 tmp12850 = _mm512_add_ps(tmp12841, tmp12839);
__m512 tmp12870 = _mm512_add_ps(tmp12861, tmp12859);
__m512 tmp12843 = _mm512_fmadd_ps(tmp12834, _mm512_set1_ps(4e+00f), tmp12835);
__m512 tmp12863 = _mm512_fmadd_ps(tmp12854, _mm512_set1_ps(4e+00f), tmp12855);
__m512 tmp12847 = _mm512_fmadd_ps(tmp12834, _mm512_set1_ps(1.6e+01f), tmp12835);
__m512 tmp12867 = _mm512_fmadd_ps(tmp12854, _mm512_set1_ps(1.6e+01f), tmp12855);
__m512 tmp12832 = _mm512_add_ps(tmp12833, tmp12775);
__m512 tmp12852 = _mm512_add_ps(tmp12853, tmp12783);
__m512 tmp12849 = _mm512_add_ps(tmp12850, tmp12782);
__m512 tmp12869 = _mm512_add_ps(tmp12870, tmp12830);
__m512 tmp12831 = _mm512_fmadd_ps(tmp12836, _mm512_set1_ps(3.2e+01f), tmp12832);
__m512 tmp12851 = _mm512_fmadd_ps(tmp12856, _mm512_set1_ps(3.2e+01f), tmp12852);
__m512 tmp12842 = _mm512_fmadd_ps(tmp12836, _mm512_set1_ps(8e+00f), tmp12843);
__m512 tmp12862 = _mm512_fmadd_ps(tmp12856, _mm512_set1_ps(8e+00f), tmp12863);
__m512 tmp12848 = _mm512_fmadd_ps(tmp12840, _mm512_set1_ps(3.2e+01f), tmp12849);
__m512 tmp12868 = _mm512_fmadd_ps(tmp12860, _mm512_set1_ps(3.2e+01f), tmp12869);
__m512 tmp12846 = _mm512_fmadd_ps(tmp12836, _mm512_set1_ps(2e+00f), tmp12847);
__m512 tmp12866 = _mm512_fmadd_ps(tmp12856, _mm512_set1_ps(2e+00f), tmp12867);
__m512 out1701 = tmp12831;
__m512 out1695 = tmp12851;
__m512 out1702 = tmp12837;
__m512 out1696 = tmp12857;
__m512 out1703 = tmp12842;
__m512 out1697 = tmp12862;
__m512 out1704 = tmp12844;
__m512 out1698 = tmp12864;
__m512 out1705 = tmp12846;
__m512 out1699 = tmp12866;
__m512 out1706 = tmp12848;
__m512 out1700 = tmp12868;
out1701 = _mm512_max_ps(_mm512_setzero_ps(), out1701);
out1695 = _mm512_max_ps(_mm512_setzero_ps(), out1695);
out1702 = _mm512_max_ps(_mm512_setzero_ps(), out1702);
out1696 = _mm512_max_ps(_mm512_setzero_ps(), out1696);
out1703 = _mm512_max_ps(_mm512_setzero_ps(), out1703);
out1697 = _mm512_max_ps(_mm512_setzero_ps(), out1697);
out1704 = _mm512_max_ps(_mm512_setzero_ps(), out1704);
out1698 = _mm512_max_ps(_mm512_setzero_ps(), out1698);
out1705 = _mm512_max_ps(_mm512_setzero_ps(), out1705);
out1699 = _mm512_max_ps(_mm512_setzero_ps(), out1699);
out1706 = _mm512_max_ps(_mm512_setzero_ps(), out1706);
out1700 = _mm512_max_ps(_mm512_setzero_ps(), out1700);
_mm512_mask_storeu_ps(datPtr17+3760+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1701);
_mm512_mask_storeu_ps(datPtr17+3184+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 15, out1695);
_mm512_mask_storeu_ps(datPtr17+3784+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4032, out1695);
_mm512_mask_storeu_ps(datPtr17+3872+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1702);
_mm512_mask_storeu_ps(datPtr17+3296+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 15, out1696);
_mm512_mask_storeu_ps(datPtr17+3896+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4032, out1696);
_mm512_mask_storeu_ps(datPtr17+3984+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1703);
_mm512_mask_storeu_ps(datPtr17+3408+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 15, out1697);
_mm512_mask_storeu_ps(datPtr17+4008+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4032, out1697);
_mm512_mask_storeu_ps(datPtr17+4096+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1704);
_mm512_mask_storeu_ps(datPtr17+3520+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 15, out1698);
_mm512_mask_storeu_ps(datPtr17+4120+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4032, out1698);
_mm512_mask_storeu_ps(datPtr17+4208+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1705);
_mm512_mask_storeu_ps(datPtr17+3632+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 15, out1699);
_mm512_mask_storeu_ps(datPtr17+4232+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4032, out1699);
_mm512_mask_storeu_ps(datPtr17+4320+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4095, out1706);
_mm512_mask_storeu_ps(datPtr17+3744+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 15, out1700);
_mm512_mask_storeu_ps(datPtr17+4344+401408*i37+112*toH40+4*toW40+12544*k113+6272*l45, 4032, out1700);
}
}
if (j30 >= last8) return;
++j30;
rel20 = 3;
}
if (rel20 < 4) {
ptrdiff_t toH41 = base20+18;
ptrdiff_t toW41 = 18;
ptrdiff_t k114 = 32*w54;
for (; k114 != 32; ++k114) {
ptrdiff_t l46 = 0;
for (; l46 != 2; ++l46) {
__m512 sf945 = _mm512_loadu_ps(sfPtr9+0+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf946 = _mm512_loadu_ps(sfPtr9+128+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1850 = _mm512_shuffle_f32x4(sf945, sf946, 68);
__m512 in1851 = _mm512_shuffle_f32x4(sf945, sf946, 238);
__m512 sf947 = _mm512_loadu_ps(sfPtr9+64+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf948 = _mm512_loadu_ps(sfPtr9+192+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1858 = _mm512_shuffle_f32x4(sf947, sf948, 68);
__m512 in1859 = _mm512_shuffle_f32x4(sf947, sf948, 238);
__m512 sf949 = _mm512_loadu_ps(sfPtr9+204800+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf950 = _mm512_loadu_ps(sfPtr9+204928+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1852 = _mm512_shuffle_f32x4(sf949, sf950, 68);
__m512 in1853 = _mm512_shuffle_f32x4(sf949, sf950, 238);
__m512 sf951 = _mm512_loadu_ps(sfPtr9+204864+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf952 = _mm512_loadu_ps(sfPtr9+204992+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1860 = _mm512_shuffle_f32x4(sf951, sf952, 68);
__m512 in1861 = _mm512_shuffle_f32x4(sf951, sf952, 238);
__m512 sf953 = _mm512_loadu_ps(sfPtr9+409600+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf954 = _mm512_loadu_ps(sfPtr9+409728+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1854 = _mm512_shuffle_f32x4(sf953, sf954, 68);
__m512 in1855 = _mm512_shuffle_f32x4(sf953, sf954, 238);
__m512 sf955 = _mm512_loadu_ps(sfPtr9+409664+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf956 = _mm512_loadu_ps(sfPtr9+409792+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1862 = _mm512_shuffle_f32x4(sf955, sf956, 68);
__m512 in1863 = _mm512_shuffle_f32x4(sf955, sf956, 238);
__m512 sf957 = _mm512_loadu_ps(sfPtr9+614400+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf958 = _mm512_loadu_ps(sfPtr9+614528+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1856 = _mm512_shuffle_f32x4(sf957, sf958, 68);
__m512 in1857 = _mm512_shuffle_f32x4(sf957, sf958, 238);
__m512 sf959 = _mm512_loadu_ps(sfPtr9+614464+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf960 = _mm512_loadu_ps(sfPtr9+614592+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1864 = _mm512_shuffle_f32x4(sf959, sf960, 68);
__m512 in1865 = _mm512_shuffle_f32x4(sf959, sf960, 238);
__m512 tmp12927 = _mm512_add_ps(in1851, in1852);
__m512 tmp12947 = _mm512_add_ps(in1859, in1860);
__m512 tmp12926 = _mm512_add_ps(in1853, in1854);
__m512 tmp12946 = _mm512_add_ps(in1861, in1862);
__m512 tmp12932 = _mm512_sub_ps(in1853, in1854);
__m512 tmp12952 = _mm512_sub_ps(in1861, in1862);
__m512 tmp12931 = _mm512_sub_ps(in1851, in1852);
__m512 tmp12951 = _mm512_sub_ps(in1859, in1860);
__m512 tmp12928 = _mm512_add_ps(in1855, in1856);
__m512 tmp12948 = _mm512_add_ps(in1863, in1864);
__m512 tmp12933 = _mm512_sub_ps(in1855, in1856);
__m512 tmp12953 = _mm512_sub_ps(in1863, in1864);
__m512 tmp12930 = _mm512_fmadd_ps(tmp12932, _mm512_set1_ps(2e+00f), tmp12931);
__m512 tmp12950 = _mm512_fmadd_ps(tmp12952, _mm512_set1_ps(2e+00f), tmp12951);
__m512 tmp12937 = _mm512_fmadd_ps(tmp12932, _mm512_set1_ps(8e+00f), tmp12931);
__m512 tmp12957 = _mm512_fmadd_ps(tmp12952, _mm512_set1_ps(8e+00f), tmp12951);
__m512 tmp12925 = _mm512_add_ps(tmp12926, tmp12927);
__m512 tmp12945 = _mm512_add_ps(tmp12946, tmp12947);
__m512 tmp12929 = _mm512_fmadd_ps(tmp12933, _mm512_set1_ps(1.6e+01f), tmp12930);
__m512 tmp12949 = _mm512_fmadd_ps(tmp12953, _mm512_set1_ps(1.6e+01f), tmp12950);
__m512 tmp12936 = _mm512_fmadd_ps(tmp12933, _mm512_set1_ps(4e+00f), tmp12937);
__m512 tmp12956 = _mm512_fmadd_ps(tmp12953, _mm512_set1_ps(4e+00f), tmp12957);
__m512 tmp12942 = _mm512_add_ps(tmp12933, tmp12931);
__m512 tmp12962 = _mm512_add_ps(tmp12953, tmp12951);
__m512 tmp12935 = _mm512_fmadd_ps(tmp12926, _mm512_set1_ps(4e+00f), tmp12927);
__m512 tmp12955 = _mm512_fmadd_ps(tmp12946, _mm512_set1_ps(4e+00f), tmp12947);
__m512 tmp12939 = _mm512_fmadd_ps(tmp12926, _mm512_set1_ps(1.6e+01f), tmp12927);
__m512 tmp12959 = _mm512_fmadd_ps(tmp12946, _mm512_set1_ps(1.6e+01f), tmp12947);
__m512 tmp12924 = _mm512_add_ps(tmp12925, in1850);
__m512 tmp12944 = _mm512_add_ps(tmp12945, in1858);
__m512 tmp12941 = _mm512_add_ps(tmp12942, in1857);
__m512 tmp12961 = _mm512_add_ps(tmp12962, in1865);
__m512 tmp12923 = _mm512_fmadd_ps(tmp12928, _mm512_set1_ps(3.2e+01f), tmp12924);
__m512 tmp12943 = _mm512_fmadd_ps(tmp12948, _mm512_set1_ps(3.2e+01f), tmp12944);
__m512 tmp12934 = _mm512_fmadd_ps(tmp12928, _mm512_set1_ps(8e+00f), tmp12935);
__m512 tmp12954 = _mm512_fmadd_ps(tmp12948, _mm512_set1_ps(8e+00f), tmp12955);
__m512 tmp12940 = _mm512_fmadd_ps(tmp12932, _mm512_set1_ps(3.2e+01f), tmp12941);
__m512 tmp12960 = _mm512_fmadd_ps(tmp12952, _mm512_set1_ps(3.2e+01f), tmp12961);
__m512 tmp12938 = _mm512_fmadd_ps(tmp12928, _mm512_set1_ps(2e+00f), tmp12939);
__m512 tmp12958 = _mm512_fmadd_ps(tmp12948, _mm512_set1_ps(2e+00f), tmp12959);
__m512 tmp12911 = tmp12923;
__m512 tmp12917 = tmp12943;
__m512 tmp12912 = tmp12929;
__m512 tmp12918 = tmp12949;
__m512 tmp12913 = tmp12934;
__m512 tmp12919 = tmp12954;
__m512 tmp12914 = tmp12936;
__m512 tmp12920 = tmp12956;
__m512 tmp12915 = tmp12938;
__m512 tmp12921 = tmp12958;
__m512 tmp12916 = tmp12940;
__m512 tmp12922 = tmp12960;
__m512 tmp13002 = _mm512_unpacklo_ps(tmp12911, tmp12912);
__m512 tmp13003 = _mm512_unpackhi_ps(tmp12911, tmp12912);
__m512 tmp13004 = _mm512_unpacklo_ps(tmp12913, tmp12914);
__m512 tmp13005 = _mm512_unpackhi_ps(tmp12913, tmp12914);
__m512 tmp13006 = _mm512_unpacklo_ps(tmp12915, tmp12916);
__m512 tmp13007 = _mm512_unpackhi_ps(tmp12915, tmp12916);
__m512 tmp13008 = _mm512_unpacklo_ps(tmp12917, tmp12918);
__m512 tmp13009 = _mm512_unpackhi_ps(tmp12917, tmp12918);
__m512 tmp13010 = _mm512_unpacklo_ps(tmp12919, tmp12920);
__m512 tmp13011 = _mm512_unpackhi_ps(tmp12919, tmp12920);
__m512 tmp13012 = _mm512_unpacklo_ps(tmp12921, tmp12922);
__m512 tmp13013 = _mm512_unpackhi_ps(tmp12921, tmp12922);
__m512 tmp13014 = _mm512_shuffle_ps(tmp13002, tmp13004, 68);
__m512 tmp13015 = _mm512_shuffle_ps(tmp13002, tmp13004, 238);
__m512 tmp13016 = _mm512_shuffle_ps(tmp13003, tmp13005, 68);
__m512 tmp13017 = _mm512_shuffle_ps(tmp13003, tmp13005, 238);
__m512 tmp13018 = _mm512_shuffle_ps(tmp13006, tmp13008, 68);
__m512 tmp13019 = _mm512_shuffle_ps(tmp13006, tmp13008, 238);
__m512 tmp13020 = _mm512_shuffle_ps(tmp13007, tmp13009, 68);
__m512 tmp13021 = _mm512_shuffle_ps(tmp13007, tmp13009, 238);
__m512 tmp13022 = _mm512_shuffle_ps(tmp13010, tmp13012, 68);
__m512 tmp13023 = _mm512_shuffle_ps(tmp13010, tmp13012, 238);
__m512 tmp13024 = _mm512_shuffle_ps(tmp13011, tmp13013, 68);
__m512 tmp13025 = _mm512_shuffle_ps(tmp13011, tmp13013, 238);
__m512 tmp13026 = _mm512_shuffle_f32x4(tmp13014, tmp13018, 136);
__m512 tmp13027 = _mm512_shuffle_f32x4(tmp13014, tmp13018, 221);
__m512 tmp13028 = _mm512_shuffle_f32x4(tmp13015, tmp13019, 136);
__m512 tmp13029 = _mm512_shuffle_f32x4(tmp13015, tmp13019, 221);
__m512 tmp13030 = _mm512_shuffle_f32x4(tmp13016, tmp13020, 136);
__m512 tmp13031 = _mm512_shuffle_f32x4(tmp13016, tmp13020, 221);
__m512 tmp13032 = _mm512_shuffle_f32x4(tmp13017, tmp13021, 136);
__m512 tmp13033 = _mm512_shuffle_f32x4(tmp13017, tmp13021, 221);
__m512 tmp13034 = _mm512_shuffle_f32x4(tmp13022, tmp13022, 136);
__m512 tmp13035 = _mm512_shuffle_f32x4(tmp13022, tmp13022, 221);
__m512 tmp13036 = _mm512_shuffle_f32x4(tmp13023, tmp13023, 136);
__m512 tmp13037 = _mm512_shuffle_f32x4(tmp13023, tmp13023, 221);
__m512 tmp13038 = _mm512_shuffle_f32x4(tmp13024, tmp13024, 136);
__m512 tmp13039 = _mm512_shuffle_f32x4(tmp13024, tmp13024, 221);
__m512 tmp13040 = _mm512_shuffle_f32x4(tmp13025, tmp13025, 136);
__m512 tmp13041 = _mm512_shuffle_f32x4(tmp13025, tmp13025, 221);
tmp12911 = _mm512_shuffle_f32x4(tmp13026, tmp13034, 136);
tmp12919 = _mm512_shuffle_f32x4(tmp13026, tmp13034, 221);
tmp12912 = _mm512_shuffle_f32x4(tmp13028, tmp13036, 136);
tmp12920 = _mm512_shuffle_f32x4(tmp13028, tmp13036, 221);
tmp12913 = _mm512_shuffle_f32x4(tmp13030, tmp13038, 136);
tmp12921 = _mm512_shuffle_f32x4(tmp13030, tmp13038, 221);
tmp12914 = _mm512_shuffle_f32x4(tmp13032, tmp13040, 136);
tmp12922 = _mm512_shuffle_f32x4(tmp13032, tmp13040, 221);
tmp12915 = _mm512_shuffle_f32x4(tmp13027, tmp13035, 136);
__m512 tmp12963 = _mm512_shuffle_f32x4(tmp13027, tmp13035, 221);
tmp12916 = _mm512_shuffle_f32x4(tmp13029, tmp13037, 136);
__m512 tmp12964 = _mm512_shuffle_f32x4(tmp13029, tmp13037, 221);
tmp12917 = _mm512_shuffle_f32x4(tmp13031, tmp13039, 136);
__m512 tmp12965 = _mm512_shuffle_f32x4(tmp13031, tmp13039, 221);
tmp12918 = _mm512_shuffle_f32x4(tmp13033, tmp13041, 136);
__m512 tmp12966 = _mm512_shuffle_f32x4(tmp13033, tmp13041, 221);
(void)tmp12966;
__m512 tmp12971 = _mm512_add_ps(tmp12912, tmp12913);
__m512 tmp12991 = _mm512_add_ps(tmp12920, tmp12921);
__m512 tmp12970 = _mm512_add_ps(tmp12914, tmp12915);
__m512 tmp12990 = _mm512_add_ps(tmp12922, tmp12963);
__m512 tmp12976 = _mm512_sub_ps(tmp12914, tmp12915);
__m512 tmp12996 = _mm512_sub_ps(tmp12922, tmp12963);
__m512 tmp12975 = _mm512_sub_ps(tmp12912, tmp12913);
__m512 tmp12995 = _mm512_sub_ps(tmp12920, tmp12921);
__m512 tmp12972 = _mm512_add_ps(tmp12916, tmp12917);
__m512 tmp12992 = _mm512_add_ps(tmp12964, tmp12965);
__m512 tmp12977 = _mm512_sub_ps(tmp12916, tmp12917);
__m512 tmp12997 = _mm512_sub_ps(tmp12964, tmp12965);
__m512 tmp12974 = _mm512_fmadd_ps(tmp12976, _mm512_set1_ps(2e+00f), tmp12975);
__m512 tmp12994 = _mm512_fmadd_ps(tmp12996, _mm512_set1_ps(2e+00f), tmp12995);
__m512 tmp12981 = _mm512_fmadd_ps(tmp12976, _mm512_set1_ps(8e+00f), tmp12975);
__m512 tmp13001 = _mm512_fmadd_ps(tmp12996, _mm512_set1_ps(8e+00f), tmp12995);
__m512 tmp12969 = _mm512_add_ps(tmp12970, tmp12971);
__m512 tmp12989 = _mm512_add_ps(tmp12990, tmp12991);
__m512 tmp12973 = _mm512_fmadd_ps(tmp12977, _mm512_set1_ps(1.6e+01f), tmp12974);
__m512 tmp12993 = _mm512_fmadd_ps(tmp12997, _mm512_set1_ps(1.6e+01f), tmp12994);
__m512 tmp12980 = _mm512_fmadd_ps(tmp12977, _mm512_set1_ps(4e+00f), tmp12981);
__m512 tmp13000 = _mm512_fmadd_ps(tmp12997, _mm512_set1_ps(4e+00f), tmp13001);
__m512 tmp12986 = _mm512_add_ps(tmp12977, tmp12975);
__m512 tmp12979 = _mm512_fmadd_ps(tmp12970, _mm512_set1_ps(4e+00f), tmp12971);
__m512 tmp12999 = _mm512_fmadd_ps(tmp12990, _mm512_set1_ps(4e+00f), tmp12991);
__m512 tmp12983 = _mm512_fmadd_ps(tmp12970, _mm512_set1_ps(1.6e+01f), tmp12971);
__m512 tmp12968 = _mm512_add_ps(tmp12969, tmp12911);
__m512 tmp12988 = _mm512_add_ps(tmp12989, tmp12919);
__m512 tmp12985 = _mm512_add_ps(tmp12986, tmp12918);
__m512 tmp12967 = _mm512_fmadd_ps(tmp12972, _mm512_set1_ps(3.2e+01f), tmp12968);
__m512 tmp12987 = _mm512_fmadd_ps(tmp12992, _mm512_set1_ps(3.2e+01f), tmp12988);
__m512 tmp12978 = _mm512_fmadd_ps(tmp12972, _mm512_set1_ps(8e+00f), tmp12979);
__m512 tmp12998 = _mm512_fmadd_ps(tmp12992, _mm512_set1_ps(8e+00f), tmp12999);
__m512 tmp12984 = _mm512_fmadd_ps(tmp12976, _mm512_set1_ps(3.2e+01f), tmp12985);
__m512 tmp12982 = _mm512_fmadd_ps(tmp12972, _mm512_set1_ps(2e+00f), tmp12983);
__m512 out1707 = tmp12967;
__m512 out1713 = tmp12987;
__m512 out1708 = tmp12973;
__m512 out1714 = tmp12993;
__m512 out1709 = tmp12978;
__m512 out1715 = tmp12998;
__m512 out1710 = tmp12980;
__m512 out1716 = tmp13000;
__m512 out1711 = tmp12982;
__m512 out1712 = tmp12984;
out1707 = _mm512_max_ps(_mm512_setzero_ps(), out1707);
out1713 = _mm512_max_ps(_mm512_setzero_ps(), out1713);
out1708 = _mm512_max_ps(_mm512_setzero_ps(), out1708);
out1714 = _mm512_max_ps(_mm512_setzero_ps(), out1714);
out1709 = _mm512_max_ps(_mm512_setzero_ps(), out1709);
out1715 = _mm512_max_ps(_mm512_setzero_ps(), out1715);
out1710 = _mm512_max_ps(_mm512_setzero_ps(), out1710);
out1716 = _mm512_max_ps(_mm512_setzero_ps(), out1716);
out1711 = _mm512_max_ps(_mm512_setzero_ps(), out1711);
out1712 = _mm512_max_ps(_mm512_setzero_ps(), out1712);
_mm512_mask_storeu_ps(datPtr17+0+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 1023, out1707);
_mm512_mask_storeu_ps(datPtr17+600+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1713);
_mm512_mask_storeu_ps(datPtr17+112+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 1023, out1708);
_mm512_mask_storeu_ps(datPtr17+712+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1714);
_mm512_mask_storeu_ps(datPtr17+224+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 1023, out1709);
_mm512_mask_storeu_ps(datPtr17+824+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1715);
_mm512_mask_storeu_ps(datPtr17+336+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 1023, out1710);
_mm512_mask_storeu_ps(datPtr17+936+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1716);
_mm512_mask_storeu_ps(datPtr17+448+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 1023, out1711);
_mm512_mask_storeu_ps(datPtr17+560+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 1023, out1712);
__m512 sf961 = _mm512_loadu_ps(sfPtr9+256+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf962 = _mm512_loadu_ps(sfPtr9+384+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1866 = _mm512_shuffle_f32x4(sf961, sf962, 68);
__m512 in1867 = _mm512_shuffle_f32x4(sf961, sf962, 238);
__m512 sf963 = _mm512_loadu_ps(sfPtr9+320+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf964 = _mm512_loadu_ps(sfPtr9+448+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1874 = _mm512_shuffle_f32x4(sf963, sf964, 68);
__m512 in1875 = _mm512_shuffle_f32x4(sf963, sf964, 238);
__m512 sf965 = _mm512_loadu_ps(sfPtr9+205056+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf966 = _mm512_loadu_ps(sfPtr9+205184+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1868 = _mm512_shuffle_f32x4(sf965, sf966, 68);
__m512 in1869 = _mm512_shuffle_f32x4(sf965, sf966, 238);
__m512 sf967 = _mm512_loadu_ps(sfPtr9+205120+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf968 = _mm512_loadu_ps(sfPtr9+205248+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1876 = _mm512_shuffle_f32x4(sf967, sf968, 68);
__m512 in1877 = _mm512_shuffle_f32x4(sf967, sf968, 238);
__m512 sf969 = _mm512_loadu_ps(sfPtr9+409856+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf970 = _mm512_loadu_ps(sfPtr9+409984+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1870 = _mm512_shuffle_f32x4(sf969, sf970, 68);
__m512 in1871 = _mm512_shuffle_f32x4(sf969, sf970, 238);
__m512 sf971 = _mm512_loadu_ps(sfPtr9+409920+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf972 = _mm512_loadu_ps(sfPtr9+410048+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1878 = _mm512_shuffle_f32x4(sf971, sf972, 68);
__m512 in1879 = _mm512_shuffle_f32x4(sf971, sf972, 238);
__m512 sf973 = _mm512_loadu_ps(sfPtr9+614656+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf974 = _mm512_loadu_ps(sfPtr9+614784+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1872 = _mm512_shuffle_f32x4(sf973, sf974, 68);
__m512 in1873 = _mm512_shuffle_f32x4(sf973, sf974, 238);
__m512 sf975 = _mm512_loadu_ps(sfPtr9+614720+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf976 = _mm512_loadu_ps(sfPtr9+614848+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1880 = _mm512_shuffle_f32x4(sf975, sf976, 68);
__m512 in1881 = _mm512_shuffle_f32x4(sf975, sf976, 238);
__m512 tmp13058 = _mm512_add_ps(in1867, in1868);
__m512 tmp13078 = _mm512_add_ps(in1875, in1876);
__m512 tmp13057 = _mm512_add_ps(in1869, in1870);
__m512 tmp13077 = _mm512_add_ps(in1877, in1878);
__m512 tmp13063 = _mm512_sub_ps(in1869, in1870);
__m512 tmp13083 = _mm512_sub_ps(in1877, in1878);
__m512 tmp13062 = _mm512_sub_ps(in1867, in1868);
__m512 tmp13082 = _mm512_sub_ps(in1875, in1876);
__m512 tmp13059 = _mm512_add_ps(in1871, in1872);
__m512 tmp13079 = _mm512_add_ps(in1879, in1880);
__m512 tmp13064 = _mm512_sub_ps(in1871, in1872);
__m512 tmp13084 = _mm512_sub_ps(in1879, in1880);
__m512 tmp13061 = _mm512_fmadd_ps(tmp13063, _mm512_set1_ps(2e+00f), tmp13062);
__m512 tmp13081 = _mm512_fmadd_ps(tmp13083, _mm512_set1_ps(2e+00f), tmp13082);
__m512 tmp13068 = _mm512_fmadd_ps(tmp13063, _mm512_set1_ps(8e+00f), tmp13062);
__m512 tmp13088 = _mm512_fmadd_ps(tmp13083, _mm512_set1_ps(8e+00f), tmp13082);
__m512 tmp13056 = _mm512_add_ps(tmp13057, tmp13058);
__m512 tmp13076 = _mm512_add_ps(tmp13077, tmp13078);
__m512 tmp13060 = _mm512_fmadd_ps(tmp13064, _mm512_set1_ps(1.6e+01f), tmp13061);
__m512 tmp13080 = _mm512_fmadd_ps(tmp13084, _mm512_set1_ps(1.6e+01f), tmp13081);
__m512 tmp13067 = _mm512_fmadd_ps(tmp13064, _mm512_set1_ps(4e+00f), tmp13068);
__m512 tmp13087 = _mm512_fmadd_ps(tmp13084, _mm512_set1_ps(4e+00f), tmp13088);
__m512 tmp13073 = _mm512_add_ps(tmp13064, tmp13062);
__m512 tmp13093 = _mm512_add_ps(tmp13084, tmp13082);
__m512 tmp13066 = _mm512_fmadd_ps(tmp13057, _mm512_set1_ps(4e+00f), tmp13058);
__m512 tmp13086 = _mm512_fmadd_ps(tmp13077, _mm512_set1_ps(4e+00f), tmp13078);
__m512 tmp13070 = _mm512_fmadd_ps(tmp13057, _mm512_set1_ps(1.6e+01f), tmp13058);
__m512 tmp13090 = _mm512_fmadd_ps(tmp13077, _mm512_set1_ps(1.6e+01f), tmp13078);
__m512 tmp13055 = _mm512_add_ps(tmp13056, in1866);
__m512 tmp13075 = _mm512_add_ps(tmp13076, in1874);
__m512 tmp13072 = _mm512_add_ps(tmp13073, in1873);
__m512 tmp13092 = _mm512_add_ps(tmp13093, in1881);
__m512 tmp13054 = _mm512_fmadd_ps(tmp13059, _mm512_set1_ps(3.2e+01f), tmp13055);
__m512 tmp13074 = _mm512_fmadd_ps(tmp13079, _mm512_set1_ps(3.2e+01f), tmp13075);
__m512 tmp13065 = _mm512_fmadd_ps(tmp13059, _mm512_set1_ps(8e+00f), tmp13066);
__m512 tmp13085 = _mm512_fmadd_ps(tmp13079, _mm512_set1_ps(8e+00f), tmp13086);
__m512 tmp13071 = _mm512_fmadd_ps(tmp13063, _mm512_set1_ps(3.2e+01f), tmp13072);
__m512 tmp13091 = _mm512_fmadd_ps(tmp13083, _mm512_set1_ps(3.2e+01f), tmp13092);
__m512 tmp13069 = _mm512_fmadd_ps(tmp13059, _mm512_set1_ps(2e+00f), tmp13070);
__m512 tmp13089 = _mm512_fmadd_ps(tmp13079, _mm512_set1_ps(2e+00f), tmp13090);
__m512 tmp13042 = tmp13054;
__m512 tmp13048 = tmp13074;
__m512 tmp13043 = tmp13060;
__m512 tmp13049 = tmp13080;
__m512 tmp13044 = tmp13065;
__m512 tmp13050 = tmp13085;
__m512 tmp13045 = tmp13067;
__m512 tmp13051 = tmp13087;
__m512 tmp13046 = tmp13069;
__m512 tmp13052 = tmp13089;
__m512 tmp13047 = tmp13071;
__m512 tmp13053 = tmp13091;
__m512 tmp13133 = _mm512_unpacklo_ps(tmp13042, tmp13043);
__m512 tmp13134 = _mm512_unpackhi_ps(tmp13042, tmp13043);
__m512 tmp13135 = _mm512_unpacklo_ps(tmp13044, tmp13045);
__m512 tmp13136 = _mm512_unpackhi_ps(tmp13044, tmp13045);
__m512 tmp13137 = _mm512_unpacklo_ps(tmp13046, tmp13047);
__m512 tmp13138 = _mm512_unpackhi_ps(tmp13046, tmp13047);
__m512 tmp13139 = _mm512_unpacklo_ps(tmp13048, tmp13049);
__m512 tmp13140 = _mm512_unpackhi_ps(tmp13048, tmp13049);
__m512 tmp13141 = _mm512_unpacklo_ps(tmp13050, tmp13051);
__m512 tmp13142 = _mm512_unpackhi_ps(tmp13050, tmp13051);
__m512 tmp13143 = _mm512_unpacklo_ps(tmp13052, tmp13053);
__m512 tmp13144 = _mm512_unpackhi_ps(tmp13052, tmp13053);
__m512 tmp13145 = _mm512_shuffle_ps(tmp13133, tmp13135, 68);
__m512 tmp13146 = _mm512_shuffle_ps(tmp13133, tmp13135, 238);
__m512 tmp13147 = _mm512_shuffle_ps(tmp13134, tmp13136, 68);
__m512 tmp13148 = _mm512_shuffle_ps(tmp13134, tmp13136, 238);
__m512 tmp13149 = _mm512_shuffle_ps(tmp13137, tmp13139, 68);
__m512 tmp13150 = _mm512_shuffle_ps(tmp13137, tmp13139, 238);
__m512 tmp13151 = _mm512_shuffle_ps(tmp13138, tmp13140, 68);
__m512 tmp13152 = _mm512_shuffle_ps(tmp13138, tmp13140, 238);
__m512 tmp13153 = _mm512_shuffle_ps(tmp13141, tmp13143, 68);
__m512 tmp13154 = _mm512_shuffle_ps(tmp13141, tmp13143, 238);
__m512 tmp13155 = _mm512_shuffle_ps(tmp13142, tmp13144, 68);
__m512 tmp13156 = _mm512_shuffle_ps(tmp13142, tmp13144, 238);
__m512 tmp13157 = _mm512_shuffle_f32x4(tmp13145, tmp13149, 136);
__m512 tmp13158 = _mm512_shuffle_f32x4(tmp13145, tmp13149, 221);
__m512 tmp13159 = _mm512_shuffle_f32x4(tmp13146, tmp13150, 136);
__m512 tmp13160 = _mm512_shuffle_f32x4(tmp13146, tmp13150, 221);
__m512 tmp13161 = _mm512_shuffle_f32x4(tmp13147, tmp13151, 136);
__m512 tmp13162 = _mm512_shuffle_f32x4(tmp13147, tmp13151, 221);
__m512 tmp13163 = _mm512_shuffle_f32x4(tmp13148, tmp13152, 136);
__m512 tmp13164 = _mm512_shuffle_f32x4(tmp13148, tmp13152, 221);
__m512 tmp13165 = _mm512_shuffle_f32x4(tmp13153, tmp13153, 136);
__m512 tmp13166 = _mm512_shuffle_f32x4(tmp13153, tmp13153, 221);
__m512 tmp13167 = _mm512_shuffle_f32x4(tmp13154, tmp13154, 136);
__m512 tmp13168 = _mm512_shuffle_f32x4(tmp13154, tmp13154, 221);
__m512 tmp13169 = _mm512_shuffle_f32x4(tmp13155, tmp13155, 136);
__m512 tmp13170 = _mm512_shuffle_f32x4(tmp13155, tmp13155, 221);
__m512 tmp13171 = _mm512_shuffle_f32x4(tmp13156, tmp13156, 136);
__m512 tmp13172 = _mm512_shuffle_f32x4(tmp13156, tmp13156, 221);
tmp13042 = _mm512_shuffle_f32x4(tmp13157, tmp13165, 136);
tmp13050 = _mm512_shuffle_f32x4(tmp13157, tmp13165, 221);
tmp13043 = _mm512_shuffle_f32x4(tmp13159, tmp13167, 136);
tmp13051 = _mm512_shuffle_f32x4(tmp13159, tmp13167, 221);
tmp13044 = _mm512_shuffle_f32x4(tmp13161, tmp13169, 136);
tmp13052 = _mm512_shuffle_f32x4(tmp13161, tmp13169, 221);
tmp13045 = _mm512_shuffle_f32x4(tmp13163, tmp13171, 136);
tmp13053 = _mm512_shuffle_f32x4(tmp13163, tmp13171, 221);
tmp13046 = _mm512_shuffle_f32x4(tmp13158, tmp13166, 136);
__m512 tmp13094 = _mm512_shuffle_f32x4(tmp13158, tmp13166, 221);
tmp13047 = _mm512_shuffle_f32x4(tmp13160, tmp13168, 136);
__m512 tmp13095 = _mm512_shuffle_f32x4(tmp13160, tmp13168, 221);
tmp13048 = _mm512_shuffle_f32x4(tmp13162, tmp13170, 136);
__m512 tmp13096 = _mm512_shuffle_f32x4(tmp13162, tmp13170, 221);
tmp13049 = _mm512_shuffle_f32x4(tmp13164, tmp13172, 136);
__m512 tmp13097 = _mm512_shuffle_f32x4(tmp13164, tmp13172, 221);
(void)tmp13049;
__m512 tmp13102 = _mm512_add_ps(tmp13043, tmp13044);
__m512 tmp13117 = _mm512_add_ps(tmp13051, tmp13052);
__m512 tmp13101 = _mm512_add_ps(tmp13045, tmp13046);
__m512 tmp13116 = _mm512_add_ps(tmp13053, tmp13094);
__m512 tmp13107 = _mm512_sub_ps(tmp13045, tmp13046);
__m512 tmp13122 = _mm512_sub_ps(tmp13053, tmp13094);
__m512 tmp13106 = _mm512_sub_ps(tmp13043, tmp13044);
__m512 tmp13121 = _mm512_sub_ps(tmp13051, tmp13052);
__m512 tmp13103 = _mm512_add_ps(tmp13047, tmp13048);
__m512 tmp13118 = _mm512_add_ps(tmp13095, tmp13096);
__m512 tmp13108 = _mm512_sub_ps(tmp13047, tmp13048);
__m512 tmp13123 = _mm512_sub_ps(tmp13095, tmp13096);
__m512 tmp13105 = _mm512_fmadd_ps(tmp13107, _mm512_set1_ps(2e+00f), tmp13106);
__m512 tmp13120 = _mm512_fmadd_ps(tmp13122, _mm512_set1_ps(2e+00f), tmp13121);
__m512 tmp13112 = _mm512_fmadd_ps(tmp13107, _mm512_set1_ps(8e+00f), tmp13106);
__m512 tmp13127 = _mm512_fmadd_ps(tmp13122, _mm512_set1_ps(8e+00f), tmp13121);
__m512 tmp13100 = _mm512_add_ps(tmp13101, tmp13102);
__m512 tmp13115 = _mm512_add_ps(tmp13116, tmp13117);
__m512 tmp13104 = _mm512_fmadd_ps(tmp13108, _mm512_set1_ps(1.6e+01f), tmp13105);
__m512 tmp13119 = _mm512_fmadd_ps(tmp13123, _mm512_set1_ps(1.6e+01f), tmp13120);
__m512 tmp13111 = _mm512_fmadd_ps(tmp13108, _mm512_set1_ps(4e+00f), tmp13112);
__m512 tmp13126 = _mm512_fmadd_ps(tmp13123, _mm512_set1_ps(4e+00f), tmp13127);
__m512 tmp13132 = _mm512_add_ps(tmp13123, tmp13121);
__m512 tmp13110 = _mm512_fmadd_ps(tmp13101, _mm512_set1_ps(4e+00f), tmp13102);
__m512 tmp13125 = _mm512_fmadd_ps(tmp13116, _mm512_set1_ps(4e+00f), tmp13117);
__m512 tmp13129 = _mm512_fmadd_ps(tmp13116, _mm512_set1_ps(1.6e+01f), tmp13117);
__m512 tmp13099 = _mm512_add_ps(tmp13100, tmp13042);
__m512 tmp13114 = _mm512_add_ps(tmp13115, tmp13050);
__m512 tmp13131 = _mm512_add_ps(tmp13132, tmp13097);
__m512 tmp13098 = _mm512_fmadd_ps(tmp13103, _mm512_set1_ps(3.2e+01f), tmp13099);
__m512 tmp13113 = _mm512_fmadd_ps(tmp13118, _mm512_set1_ps(3.2e+01f), tmp13114);
__m512 tmp13109 = _mm512_fmadd_ps(tmp13103, _mm512_set1_ps(8e+00f), tmp13110);
__m512 tmp13124 = _mm512_fmadd_ps(tmp13118, _mm512_set1_ps(8e+00f), tmp13125);
__m512 tmp13130 = _mm512_fmadd_ps(tmp13122, _mm512_set1_ps(3.2e+01f), tmp13131);
__m512 tmp13128 = _mm512_fmadd_ps(tmp13118, _mm512_set1_ps(2e+00f), tmp13129);
__m512 out1717 = tmp13098;
__m512 out1721 = tmp13113;
__m512 out1718 = tmp13104;
__m512 out1722 = tmp13119;
__m512 out1719 = tmp13109;
__m512 out1723 = tmp13124;
__m512 out1720 = tmp13111;
__m512 out1724 = tmp13126;
__m512 out1725 = tmp13128;
__m512 out1726 = tmp13130;
out1717 = _mm512_max_ps(_mm512_setzero_ps(), out1717);
out1721 = _mm512_max_ps(_mm512_setzero_ps(), out1721);
out1718 = _mm512_max_ps(_mm512_setzero_ps(), out1718);
out1722 = _mm512_max_ps(_mm512_setzero_ps(), out1722);
out1719 = _mm512_max_ps(_mm512_setzero_ps(), out1719);
out1723 = _mm512_max_ps(_mm512_setzero_ps(), out1723);
out1720 = _mm512_max_ps(_mm512_setzero_ps(), out1720);
out1724 = _mm512_max_ps(_mm512_setzero_ps(), out1724);
out1725 = _mm512_max_ps(_mm512_setzero_ps(), out1725);
out1726 = _mm512_max_ps(_mm512_setzero_ps(), out1726);
_mm512_mask_storeu_ps(datPtr17+648+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1717);
_mm512_mask_storeu_ps(datPtr17+3136+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 1023, out1721);
_mm512_mask_storeu_ps(datPtr17+760+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1718);
_mm512_mask_storeu_ps(datPtr17+3248+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 1023, out1722);
_mm512_mask_storeu_ps(datPtr17+872+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1719);
_mm512_mask_storeu_ps(datPtr17+3360+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 1023, out1723);
_mm512_mask_storeu_ps(datPtr17+984+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1720);
_mm512_mask_storeu_ps(datPtr17+3472+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 1023, out1724);
_mm512_mask_storeu_ps(datPtr17+3584+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 1023, out1725);
_mm512_mask_storeu_ps(datPtr17+3696+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 1023, out1726);
__m512 sf977 = _mm512_loadu_ps(sfPtr9+512+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf978 = _mm512_loadu_ps(sfPtr9+640+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1882 = _mm512_shuffle_f32x4(sf977, sf978, 68);
__m512 in1883 = _mm512_shuffle_f32x4(sf977, sf978, 238);
__m512 sf979 = _mm512_loadu_ps(sfPtr9+576+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf980 = _mm512_loadu_ps(sfPtr9+704+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1890 = _mm512_shuffle_f32x4(sf979, sf980, 68);
__m512 in1891 = _mm512_shuffle_f32x4(sf979, sf980, 238);
__m512 sf981 = _mm512_loadu_ps(sfPtr9+205312+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf982 = _mm512_loadu_ps(sfPtr9+205440+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1884 = _mm512_shuffle_f32x4(sf981, sf982, 68);
__m512 in1885 = _mm512_shuffle_f32x4(sf981, sf982, 238);
__m512 sf983 = _mm512_loadu_ps(sfPtr9+205376+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf984 = _mm512_loadu_ps(sfPtr9+205504+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1892 = _mm512_shuffle_f32x4(sf983, sf984, 68);
__m512 in1893 = _mm512_shuffle_f32x4(sf983, sf984, 238);
__m512 sf985 = _mm512_loadu_ps(sfPtr9+410112+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf986 = _mm512_loadu_ps(sfPtr9+410240+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1886 = _mm512_shuffle_f32x4(sf985, sf986, 68);
__m512 in1887 = _mm512_shuffle_f32x4(sf985, sf986, 238);
__m512 sf987 = _mm512_loadu_ps(sfPtr9+410176+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf988 = _mm512_loadu_ps(sfPtr9+410304+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1894 = _mm512_shuffle_f32x4(sf987, sf988, 68);
__m512 in1895 = _mm512_shuffle_f32x4(sf987, sf988, 238);
__m512 sf989 = _mm512_loadu_ps(sfPtr9+614912+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf990 = _mm512_loadu_ps(sfPtr9+615040+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1888 = _mm512_shuffle_f32x4(sf989, sf990, 68);
__m512 in1889 = _mm512_shuffle_f32x4(sf989, sf990, 238);
__m512 sf991 = _mm512_loadu_ps(sfPtr9+614976+819200*i37+49152*j30+1536*k114+768*l46);
__m512 sf992 = _mm512_loadu_ps(sfPtr9+615104+819200*i37+49152*j30+1536*k114+768*l46);
__m512 in1896 = _mm512_shuffle_f32x4(sf991, sf992, 68);
__m512 in1897 = _mm512_shuffle_f32x4(sf991, sf992, 238);
__m512 tmp13189 = _mm512_add_ps(in1883, in1884);
__m512 tmp13209 = _mm512_add_ps(in1891, in1892);
__m512 tmp13188 = _mm512_add_ps(in1885, in1886);
__m512 tmp13208 = _mm512_add_ps(in1893, in1894);
__m512 tmp13194 = _mm512_sub_ps(in1885, in1886);
__m512 tmp13214 = _mm512_sub_ps(in1893, in1894);
__m512 tmp13193 = _mm512_sub_ps(in1883, in1884);
__m512 tmp13213 = _mm512_sub_ps(in1891, in1892);
__m512 tmp13190 = _mm512_add_ps(in1887, in1888);
__m512 tmp13210 = _mm512_add_ps(in1895, in1896);
__m512 tmp13195 = _mm512_sub_ps(in1887, in1888);
__m512 tmp13215 = _mm512_sub_ps(in1895, in1896);
__m512 tmp13192 = _mm512_fmadd_ps(tmp13194, _mm512_set1_ps(2e+00f), tmp13193);
__m512 tmp13212 = _mm512_fmadd_ps(tmp13214, _mm512_set1_ps(2e+00f), tmp13213);
__m512 tmp13199 = _mm512_fmadd_ps(tmp13194, _mm512_set1_ps(8e+00f), tmp13193);
__m512 tmp13219 = _mm512_fmadd_ps(tmp13214, _mm512_set1_ps(8e+00f), tmp13213);
__m512 tmp13187 = _mm512_add_ps(tmp13188, tmp13189);
__m512 tmp13207 = _mm512_add_ps(tmp13208, tmp13209);
__m512 tmp13191 = _mm512_fmadd_ps(tmp13195, _mm512_set1_ps(1.6e+01f), tmp13192);
__m512 tmp13211 = _mm512_fmadd_ps(tmp13215, _mm512_set1_ps(1.6e+01f), tmp13212);
__m512 tmp13198 = _mm512_fmadd_ps(tmp13195, _mm512_set1_ps(4e+00f), tmp13199);
__m512 tmp13218 = _mm512_fmadd_ps(tmp13215, _mm512_set1_ps(4e+00f), tmp13219);
__m512 tmp13204 = _mm512_add_ps(tmp13195, tmp13193);
__m512 tmp13224 = _mm512_add_ps(tmp13215, tmp13213);
__m512 tmp13197 = _mm512_fmadd_ps(tmp13188, _mm512_set1_ps(4e+00f), tmp13189);
__m512 tmp13217 = _mm512_fmadd_ps(tmp13208, _mm512_set1_ps(4e+00f), tmp13209);
__m512 tmp13201 = _mm512_fmadd_ps(tmp13188, _mm512_set1_ps(1.6e+01f), tmp13189);
__m512 tmp13221 = _mm512_fmadd_ps(tmp13208, _mm512_set1_ps(1.6e+01f), tmp13209);
__m512 tmp13186 = _mm512_add_ps(tmp13187, in1882);
__m512 tmp13206 = _mm512_add_ps(tmp13207, in1890);
__m512 tmp13203 = _mm512_add_ps(tmp13204, in1889);
__m512 tmp13223 = _mm512_add_ps(tmp13224, in1897);
__m512 tmp13185 = _mm512_fmadd_ps(tmp13190, _mm512_set1_ps(3.2e+01f), tmp13186);
__m512 tmp13205 = _mm512_fmadd_ps(tmp13210, _mm512_set1_ps(3.2e+01f), tmp13206);
__m512 tmp13196 = _mm512_fmadd_ps(tmp13190, _mm512_set1_ps(8e+00f), tmp13197);
__m512 tmp13216 = _mm512_fmadd_ps(tmp13210, _mm512_set1_ps(8e+00f), tmp13217);
__m512 tmp13202 = _mm512_fmadd_ps(tmp13194, _mm512_set1_ps(3.2e+01f), tmp13203);
__m512 tmp13222 = _mm512_fmadd_ps(tmp13214, _mm512_set1_ps(3.2e+01f), tmp13223);
__m512 tmp13200 = _mm512_fmadd_ps(tmp13190, _mm512_set1_ps(2e+00f), tmp13201);
__m512 tmp13220 = _mm512_fmadd_ps(tmp13210, _mm512_set1_ps(2e+00f), tmp13221);
__m512 tmp13173 = tmp13185;
__m512 tmp13179 = tmp13205;
__m512 tmp13174 = tmp13191;
__m512 tmp13180 = tmp13211;
__m512 tmp13175 = tmp13196;
__m512 tmp13181 = tmp13216;
__m512 tmp13176 = tmp13198;
__m512 tmp13182 = tmp13218;
__m512 tmp13177 = tmp13200;
__m512 tmp13183 = tmp13220;
__m512 tmp13178 = tmp13202;
__m512 tmp13184 = tmp13222;
__m512 tmp13259 = _mm512_unpacklo_ps(tmp13173, tmp13174);
__m512 tmp13260 = _mm512_unpackhi_ps(tmp13173, tmp13174);
__m512 tmp13261 = _mm512_unpacklo_ps(tmp13175, tmp13176);
__m512 tmp13262 = _mm512_unpackhi_ps(tmp13175, tmp13176);
__m512 tmp13263 = _mm512_unpacklo_ps(tmp13177, tmp13178);
__m512 tmp13264 = _mm512_unpackhi_ps(tmp13177, tmp13178);
__m512 tmp13265 = _mm512_unpacklo_ps(tmp13179, tmp13180);
__m512 tmp13266 = _mm512_unpackhi_ps(tmp13179, tmp13180);
__m512 tmp13267 = _mm512_unpacklo_ps(tmp13181, tmp13182);
__m512 tmp13268 = _mm512_unpackhi_ps(tmp13181, tmp13182);
__m512 tmp13269 = _mm512_unpacklo_ps(tmp13183, tmp13184);
__m512 tmp13270 = _mm512_unpackhi_ps(tmp13183, tmp13184);
__m512 tmp13271 = _mm512_shuffle_ps(tmp13259, tmp13261, 68);
__m512 tmp13272 = _mm512_shuffle_ps(tmp13259, tmp13261, 238);
__m512 tmp13273 = _mm512_shuffle_ps(tmp13260, tmp13262, 68);
__m512 tmp13274 = _mm512_shuffle_ps(tmp13260, tmp13262, 238);
__m512 tmp13275 = _mm512_shuffle_ps(tmp13263, tmp13265, 68);
__m512 tmp13276 = _mm512_shuffle_ps(tmp13263, tmp13265, 238);
__m512 tmp13277 = _mm512_shuffle_ps(tmp13264, tmp13266, 68);
__m512 tmp13278 = _mm512_shuffle_ps(tmp13264, tmp13266, 238);
__m512 tmp13279 = _mm512_shuffle_ps(tmp13267, tmp13269, 68);
__m512 tmp13280 = _mm512_shuffle_ps(tmp13267, tmp13269, 238);
__m512 tmp13281 = _mm512_shuffle_ps(tmp13268, tmp13270, 68);
__m512 tmp13282 = _mm512_shuffle_ps(tmp13268, tmp13270, 238);
__m512 tmp13283 = _mm512_shuffle_f32x4(tmp13271, tmp13275, 136);
__m512 tmp13284 = _mm512_shuffle_f32x4(tmp13271, tmp13275, 221);
__m512 tmp13285 = _mm512_shuffle_f32x4(tmp13272, tmp13276, 136);
__m512 tmp13286 = _mm512_shuffle_f32x4(tmp13272, tmp13276, 221);
__m512 tmp13287 = _mm512_shuffle_f32x4(tmp13273, tmp13277, 136);
__m512 tmp13288 = _mm512_shuffle_f32x4(tmp13273, tmp13277, 221);
__m512 tmp13289 = _mm512_shuffle_f32x4(tmp13274, tmp13278, 136);
__m512 tmp13290 = _mm512_shuffle_f32x4(tmp13274, tmp13278, 221);
__m512 tmp13291 = _mm512_shuffle_f32x4(tmp13279, tmp13279, 136);
__m512 tmp13292 = _mm512_shuffle_f32x4(tmp13279, tmp13279, 221);
__m512 tmp13293 = _mm512_shuffle_f32x4(tmp13280, tmp13280, 136);
__m512 tmp13294 = _mm512_shuffle_f32x4(tmp13280, tmp13280, 221);
__m512 tmp13295 = _mm512_shuffle_f32x4(tmp13281, tmp13281, 136);
__m512 tmp13296 = _mm512_shuffle_f32x4(tmp13281, tmp13281, 221);
__m512 tmp13297 = _mm512_shuffle_f32x4(tmp13282, tmp13282, 136);
__m512 tmp13298 = _mm512_shuffle_f32x4(tmp13282, tmp13282, 221);
tmp13173 = _mm512_shuffle_f32x4(tmp13283, tmp13291, 136);
tmp13181 = _mm512_shuffle_f32x4(tmp13283, tmp13291, 221);
tmp13174 = _mm512_shuffle_f32x4(tmp13285, tmp13293, 136);
tmp13182 = _mm512_shuffle_f32x4(tmp13285, tmp13293, 221);
tmp13175 = _mm512_shuffle_f32x4(tmp13287, tmp13295, 136);
tmp13183 = _mm512_shuffle_f32x4(tmp13287, tmp13295, 221);
tmp13176 = _mm512_shuffle_f32x4(tmp13289, tmp13297, 136);
tmp13184 = _mm512_shuffle_f32x4(tmp13289, tmp13297, 221);
tmp13177 = _mm512_shuffle_f32x4(tmp13284, tmp13292, 136);
__m512 tmp13225 = _mm512_shuffle_f32x4(tmp13284, tmp13292, 221);
tmp13178 = _mm512_shuffle_f32x4(tmp13286, tmp13294, 136);
__m512 tmp13226 = _mm512_shuffle_f32x4(tmp13286, tmp13294, 221);
tmp13179 = _mm512_shuffle_f32x4(tmp13288, tmp13296, 136);
__m512 tmp13227 = _mm512_shuffle_f32x4(tmp13288, tmp13296, 221);
tmp13180 = _mm512_shuffle_f32x4(tmp13290, tmp13298, 136);
__m512 tmp13228 = _mm512_shuffle_f32x4(tmp13290, tmp13298, 221);
(void)tmp13180;
(void)tmp13228;
__m512 tmp13233 = _mm512_add_ps(tmp13174, tmp13175);
__m512 tmp13248 = _mm512_add_ps(tmp13182, tmp13183);
__m512 tmp13232 = _mm512_add_ps(tmp13176, tmp13177);
__m512 tmp13247 = _mm512_add_ps(tmp13184, tmp13225);
__m512 tmp13238 = _mm512_sub_ps(tmp13176, tmp13177);
__m512 tmp13253 = _mm512_sub_ps(tmp13184, tmp13225);
__m512 tmp13237 = _mm512_sub_ps(tmp13174, tmp13175);
__m512 tmp13252 = _mm512_sub_ps(tmp13182, tmp13183);
__m512 tmp13234 = _mm512_add_ps(tmp13178, tmp13179);
__m512 tmp13249 = _mm512_add_ps(tmp13226, tmp13227);
__m512 tmp13239 = _mm512_sub_ps(tmp13178, tmp13179);
__m512 tmp13254 = _mm512_sub_ps(tmp13226, tmp13227);
__m512 tmp13236 = _mm512_fmadd_ps(tmp13238, _mm512_set1_ps(2e+00f), tmp13237);
__m512 tmp13251 = _mm512_fmadd_ps(tmp13253, _mm512_set1_ps(2e+00f), tmp13252);
__m512 tmp13243 = _mm512_fmadd_ps(tmp13238, _mm512_set1_ps(8e+00f), tmp13237);
__m512 tmp13258 = _mm512_fmadd_ps(tmp13253, _mm512_set1_ps(8e+00f), tmp13252);
__m512 tmp13231 = _mm512_add_ps(tmp13232, tmp13233);
__m512 tmp13246 = _mm512_add_ps(tmp13247, tmp13248);
__m512 tmp13235 = _mm512_fmadd_ps(tmp13239, _mm512_set1_ps(1.6e+01f), tmp13236);
__m512 tmp13250 = _mm512_fmadd_ps(tmp13254, _mm512_set1_ps(1.6e+01f), tmp13251);
__m512 tmp13242 = _mm512_fmadd_ps(tmp13239, _mm512_set1_ps(4e+00f), tmp13243);
__m512 tmp13257 = _mm512_fmadd_ps(tmp13254, _mm512_set1_ps(4e+00f), tmp13258);
__m512 tmp13241 = _mm512_fmadd_ps(tmp13232, _mm512_set1_ps(4e+00f), tmp13233);
__m512 tmp13256 = _mm512_fmadd_ps(tmp13247, _mm512_set1_ps(4e+00f), tmp13248);
__m512 tmp13230 = _mm512_add_ps(tmp13231, tmp13173);
__m512 tmp13245 = _mm512_add_ps(tmp13246, tmp13181);
__m512 tmp13229 = _mm512_fmadd_ps(tmp13234, _mm512_set1_ps(3.2e+01f), tmp13230);
__m512 tmp13244 = _mm512_fmadd_ps(tmp13249, _mm512_set1_ps(3.2e+01f), tmp13245);
__m512 tmp13240 = _mm512_fmadd_ps(tmp13234, _mm512_set1_ps(8e+00f), tmp13241);
__m512 tmp13255 = _mm512_fmadd_ps(tmp13249, _mm512_set1_ps(8e+00f), tmp13256);
__m512 out1727 = tmp13229;
__m512 out1731 = tmp13244;
__m512 out1728 = tmp13235;
__m512 out1732 = tmp13250;
__m512 out1729 = tmp13240;
__m512 out1733 = tmp13255;
__m512 out1730 = tmp13242;
__m512 out1734 = tmp13257;
out1727 = _mm512_max_ps(_mm512_setzero_ps(), out1727);
out1731 = _mm512_max_ps(_mm512_setzero_ps(), out1731);
out1728 = _mm512_max_ps(_mm512_setzero_ps(), out1728);
out1732 = _mm512_max_ps(_mm512_setzero_ps(), out1732);
out1729 = _mm512_max_ps(_mm512_setzero_ps(), out1729);
out1733 = _mm512_max_ps(_mm512_setzero_ps(), out1733);
out1730 = _mm512_max_ps(_mm512_setzero_ps(), out1730);
out1734 = _mm512_max_ps(_mm512_setzero_ps(), out1734);
_mm512_mask_storeu_ps(datPtr17+3736+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1727);
_mm512_mask_storeu_ps(datPtr17+3784+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1731);
_mm512_mask_storeu_ps(datPtr17+3848+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1728);
_mm512_mask_storeu_ps(datPtr17+3896+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1732);
_mm512_mask_storeu_ps(datPtr17+3960+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1729);
_mm512_mask_storeu_ps(datPtr17+4008+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1733);
_mm512_mask_storeu_ps(datPtr17+4072+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1730);
_mm512_mask_storeu_ps(datPtr17+4120+401408*i37+112*toH41+4*toW41+12544*k114+6272*l46, 4095, out1734);
}
}
if (j30 >= last8) return;
++j30;
rel20 = 4;
}
ptrdiff_t toH42 = base20+24;
ptrdiff_t toW42 = 24;
ptrdiff_t k115 = 32*w54;
for (; k115 != 32; ++k115) {
ptrdiff_t l47 = 0;
for (; l47 != 1; ++l47) {
__m512 sf993 = _mm512_loadu_ps(sfPtr9+0+819200*i37+49152*j30+256*k115+256*l47);
__m512 sf994 = _mm512_loadu_ps(sfPtr9+64+819200*i37+49152*j30+256*k115+256*l47);
__m512 in1898 = _mm512_shuffle_f32x4(sf993, sf994, 68);
__m512 in1899 = _mm512_shuffle_f32x4(sf993, sf994, 238);
__m512 sf995 = _mm512_loadu_ps(sfPtr9+128+819200*i37+49152*j30+256*k115+256*l47);
__m512 sf996 = _mm512_loadu_ps(sfPtr9+192+819200*i37+49152*j30+256*k115+256*l47);
__m512 in1906 = _mm512_shuffle_f32x4(sf995, sf996, 68);
__m512 in1907 = _mm512_shuffle_f32x4(sf995, sf996, 238);
__m512 sf997 = _mm512_loadu_ps(sfPtr9+204800+819200*i37+49152*j30+256*k115+256*l47);
__m512 sf998 = _mm512_loadu_ps(sfPtr9+204864+819200*i37+49152*j30+256*k115+256*l47);
__m512 in1900 = _mm512_shuffle_f32x4(sf997, sf998, 68);
__m512 in1901 = _mm512_shuffle_f32x4(sf997, sf998, 238);
__m512 sf999 = _mm512_loadu_ps(sfPtr9+204928+819200*i37+49152*j30+256*k115+256*l47);
__m512 sf1000 = _mm512_loadu_ps(sfPtr9+204992+819200*i37+49152*j30+256*k115+256*l47);
__m512 in1908 = _mm512_shuffle_f32x4(sf999, sf1000, 68);
__m512 in1909 = _mm512_shuffle_f32x4(sf999, sf1000, 238);
__m512 sf1001 = _mm512_loadu_ps(sfPtr9+409600+819200*i37+49152*j30+256*k115+256*l47);
__m512 sf1002 = _mm512_loadu_ps(sfPtr9+409664+819200*i37+49152*j30+256*k115+256*l47);
__m512 in1902 = _mm512_shuffle_f32x4(sf1001, sf1002, 68);
__m512 in1903 = _mm512_shuffle_f32x4(sf1001, sf1002, 238);
__m512 sf1003 = _mm512_loadu_ps(sfPtr9+409728+819200*i37+49152*j30+256*k115+256*l47);
__m512 sf1004 = _mm512_loadu_ps(sfPtr9+409792+819200*i37+49152*j30+256*k115+256*l47);
__m512 in1910 = _mm512_shuffle_f32x4(sf1003, sf1004, 68);
__m512 in1911 = _mm512_shuffle_f32x4(sf1003, sf1004, 238);
__m512 sf1005 = _mm512_loadu_ps(sfPtr9+614400+819200*i37+49152*j30+256*k115+256*l47);
__m512 sf1006 = _mm512_loadu_ps(sfPtr9+614464+819200*i37+49152*j30+256*k115+256*l47);
__m512 in1904 = _mm512_shuffle_f32x4(sf1005, sf1006, 68);
__m512 in1905 = _mm512_shuffle_f32x4(sf1005, sf1006, 238);
__m512 sf1007 = _mm512_loadu_ps(sfPtr9+614528+819200*i37+49152*j30+256*k115+256*l47);
__m512 sf1008 = _mm512_loadu_ps(sfPtr9+614592+819200*i37+49152*j30+256*k115+256*l47);
__m512 in1912 = _mm512_shuffle_f32x4(sf1007, sf1008, 68);
__m512 in1913 = _mm512_shuffle_f32x4(sf1007, sf1008, 238);
(void)in1905;
(void)in1913;
__m512 tmp13311 = _mm512_add_ps(in1899, in1900);
__m512 tmp13326 = _mm512_add_ps(in1907, in1908);
__m512 tmp13310 = _mm512_add_ps(in1901, in1902);
__m512 tmp13325 = _mm512_add_ps(in1909, in1910);
__m512 tmp13316 = _mm512_sub_ps(in1901, in1902);
__m512 tmp13331 = _mm512_sub_ps(in1909, in1910);
__m512 tmp13315 = _mm512_sub_ps(in1899, in1900);
__m512 tmp13330 = _mm512_sub_ps(in1907, in1908);
__m512 tmp13312 = _mm512_add_ps(in1903, in1904);
__m512 tmp13327 = _mm512_add_ps(in1911, in1912);
__m512 tmp13317 = _mm512_sub_ps(in1903, in1904);
__m512 tmp13332 = _mm512_sub_ps(in1911, in1912);
__m512 tmp13314 = _mm512_fmadd_ps(tmp13316, _mm512_set1_ps(2e+00f), tmp13315);
__m512 tmp13329 = _mm512_fmadd_ps(tmp13331, _mm512_set1_ps(2e+00f), tmp13330);
__m512 tmp13321 = _mm512_fmadd_ps(tmp13316, _mm512_set1_ps(8e+00f), tmp13315);
__m512 tmp13336 = _mm512_fmadd_ps(tmp13331, _mm512_set1_ps(8e+00f), tmp13330);
__m512 tmp13309 = _mm512_add_ps(tmp13310, tmp13311);
__m512 tmp13324 = _mm512_add_ps(tmp13325, tmp13326);
__m512 tmp13313 = _mm512_fmadd_ps(tmp13317, _mm512_set1_ps(1.6e+01f), tmp13314);
__m512 tmp13328 = _mm512_fmadd_ps(tmp13332, _mm512_set1_ps(1.6e+01f), tmp13329);
__m512 tmp13320 = _mm512_fmadd_ps(tmp13317, _mm512_set1_ps(4e+00f), tmp13321);
__m512 tmp13335 = _mm512_fmadd_ps(tmp13332, _mm512_set1_ps(4e+00f), tmp13336);
__m512 tmp13319 = _mm512_fmadd_ps(tmp13310, _mm512_set1_ps(4e+00f), tmp13311);
__m512 tmp13334 = _mm512_fmadd_ps(tmp13325, _mm512_set1_ps(4e+00f), tmp13326);
__m512 tmp13308 = _mm512_add_ps(tmp13309, in1898);
__m512 tmp13323 = _mm512_add_ps(tmp13324, in1906);
__m512 tmp13307 = _mm512_fmadd_ps(tmp13312, _mm512_set1_ps(3.2e+01f), tmp13308);
__m512 tmp13322 = _mm512_fmadd_ps(tmp13327, _mm512_set1_ps(3.2e+01f), tmp13323);
__m512 tmp13318 = _mm512_fmadd_ps(tmp13312, _mm512_set1_ps(8e+00f), tmp13319);
__m512 tmp13333 = _mm512_fmadd_ps(tmp13327, _mm512_set1_ps(8e+00f), tmp13334);
__m512 tmp13299 = tmp13307;
__m512 tmp13303 = tmp13322;
__m512 tmp13300 = tmp13313;
__m512 tmp13304 = tmp13328;
__m512 tmp13301 = tmp13318;
__m512 tmp13305 = tmp13333;
__m512 tmp13302 = tmp13320;
__m512 tmp13306 = tmp13335;
__m512 tmp13337 = _mm512_setzero_ps();
__m512 tmp13338 = _mm512_setzero_ps();
__m512 tmp13375 = _mm512_unpacklo_ps(tmp13299, tmp13300);
__m512 tmp13376 = _mm512_unpackhi_ps(tmp13299, tmp13300);
__m512 tmp13377 = _mm512_unpacklo_ps(tmp13301, tmp13302);
__m512 tmp13378 = _mm512_unpackhi_ps(tmp13301, tmp13302);
__m512 tmp13379 = _mm512_unpacklo_ps(tmp13337, tmp13338);
__m512 tmp13380 = _mm512_unpackhi_ps(tmp13337, tmp13338);
__m512 tmp13381 = _mm512_unpacklo_ps(tmp13303, tmp13304);
__m512 tmp13382 = _mm512_unpackhi_ps(tmp13303, tmp13304);
__m512 tmp13383 = _mm512_unpacklo_ps(tmp13305, tmp13306);
__m512 tmp13384 = _mm512_unpackhi_ps(tmp13305, tmp13306);
__m512 tmp13385 = _mm512_shuffle_ps(tmp13375, tmp13377, 68);
__m512 tmp13386 = _mm512_shuffle_ps(tmp13375, tmp13377, 238);
__m512 tmp13387 = _mm512_shuffle_ps(tmp13376, tmp13378, 68);
__m512 tmp13388 = _mm512_shuffle_ps(tmp13376, tmp13378, 238);
__m512 tmp13389 = _mm512_shuffle_ps(tmp13379, tmp13381, 68);
__m512 tmp13390 = _mm512_shuffle_ps(tmp13379, tmp13381, 238);
__m512 tmp13391 = _mm512_shuffle_ps(tmp13380, tmp13382, 68);
__m512 tmp13392 = _mm512_shuffle_ps(tmp13380, tmp13382, 238);
__m512 tmp13393 = _mm512_shuffle_ps(tmp13383, tmp13383, 238);
__m512 tmp13394 = _mm512_shuffle_ps(tmp13384, tmp13384, 238);
__m512 tmp13395 = _mm512_shuffle_f32x4(tmp13385, tmp13389, 136);
__m512 tmp13396 = _mm512_shuffle_f32x4(tmp13385, tmp13389, 221);
__m512 tmp13397 = _mm512_shuffle_f32x4(tmp13386, tmp13390, 136);
__m512 tmp13398 = _mm512_shuffle_f32x4(tmp13386, tmp13390, 221);
__m512 tmp13399 = _mm512_shuffle_f32x4(tmp13387, tmp13391, 136);
__m512 tmp13400 = _mm512_shuffle_f32x4(tmp13387, tmp13391, 221);
__m512 tmp13401 = _mm512_shuffle_f32x4(tmp13388, tmp13392, 136);
__m512 tmp13402 = _mm512_shuffle_f32x4(tmp13388, tmp13392, 221);
__m512 tmp13403 = _mm512_shuffle_f32x4(tmp13383, tmp13383, 136);
__m512 tmp13404 = _mm512_shuffle_f32x4(tmp13383, tmp13383, 221);
__m512 tmp13405 = _mm512_shuffle_f32x4(tmp13393, tmp13393, 136);
__m512 tmp13406 = _mm512_shuffle_f32x4(tmp13393, tmp13393, 221);
__m512 tmp13407 = _mm512_shuffle_f32x4(tmp13384, tmp13384, 136);
__m512 tmp13408 = _mm512_shuffle_f32x4(tmp13384, tmp13384, 221);
__m512 tmp13409 = _mm512_shuffle_f32x4(tmp13394, tmp13394, 136);
__m512 tmp13410 = _mm512_shuffle_f32x4(tmp13394, tmp13394, 221);
tmp13299 = _mm512_shuffle_f32x4(tmp13395, tmp13403, 136);
tmp13305 = _mm512_shuffle_f32x4(tmp13395, tmp13403, 221);
tmp13300 = _mm512_shuffle_f32x4(tmp13397, tmp13405, 136);
tmp13306 = _mm512_shuffle_f32x4(tmp13397, tmp13405, 221);
tmp13301 = _mm512_shuffle_f32x4(tmp13399, tmp13407, 136);
__m512 tmp13339 = _mm512_shuffle_f32x4(tmp13399, tmp13407, 221);
tmp13302 = _mm512_shuffle_f32x4(tmp13401, tmp13409, 136);
__m512 tmp13340 = _mm512_shuffle_f32x4(tmp13401, tmp13409, 221);
tmp13337 = _mm512_shuffle_f32x4(tmp13396, tmp13404, 136);
__m512 tmp13341 = _mm512_shuffle_f32x4(tmp13396, tmp13404, 221);
tmp13338 = _mm512_shuffle_f32x4(tmp13398, tmp13406, 136);
__m512 tmp13342 = _mm512_shuffle_f32x4(tmp13398, tmp13406, 221);
tmp13303 = _mm512_shuffle_f32x4(tmp13400, tmp13408, 136);
__m512 tmp13343 = _mm512_shuffle_f32x4(tmp13400, tmp13408, 221);
tmp13304 = _mm512_shuffle_f32x4(tmp13402, tmp13410, 136);
__m512 tmp13344 = _mm512_shuffle_f32x4(tmp13402, tmp13410, 221);
(void)tmp13304;
(void)tmp13344;
__m512 tmp13349 = _mm512_add_ps(tmp13300, tmp13301);
__m512 tmp13364 = _mm512_add_ps(tmp13306, tmp13339);
__m512 tmp13348 = _mm512_add_ps(tmp13302, tmp13337);
__m512 tmp13363 = _mm512_add_ps(tmp13340, tmp13341);
__m512 tmp13354 = _mm512_sub_ps(tmp13302, tmp13337);
__m512 tmp13369 = _mm512_sub_ps(tmp13340, tmp13341);
__m512 tmp13353 = _mm512_sub_ps(tmp13300, tmp13301);
__m512 tmp13368 = _mm512_sub_ps(tmp13306, tmp13339);
__m512 tmp13350 = _mm512_add_ps(tmp13338, tmp13303);
__m512 tmp13365 = _mm512_add_ps(tmp13342, tmp13343);
__m512 tmp13355 = _mm512_sub_ps(tmp13338, tmp13303);
__m512 tmp13370 = _mm512_sub_ps(tmp13342, tmp13343);
__m512 tmp13352 = _mm512_fmadd_ps(tmp13354, _mm512_set1_ps(2e+00f), tmp13353);
__m512 tmp13367 = _mm512_fmadd_ps(tmp13369, _mm512_set1_ps(2e+00f), tmp13368);
__m512 tmp13359 = _mm512_fmadd_ps(tmp13354, _mm512_set1_ps(8e+00f), tmp13353);
__m512 tmp13374 = _mm512_fmadd_ps(tmp13369, _mm512_set1_ps(8e+00f), tmp13368);
__m512 tmp13347 = _mm512_add_ps(tmp13348, tmp13349);
__m512 tmp13362 = _mm512_add_ps(tmp13363, tmp13364);
__m512 tmp13351 = _mm512_fmadd_ps(tmp13355, _mm512_set1_ps(1.6e+01f), tmp13352);
__m512 tmp13366 = _mm512_fmadd_ps(tmp13370, _mm512_set1_ps(1.6e+01f), tmp13367);
__m512 tmp13358 = _mm512_fmadd_ps(tmp13355, _mm512_set1_ps(4e+00f), tmp13359);
__m512 tmp13373 = _mm512_fmadd_ps(tmp13370, _mm512_set1_ps(4e+00f), tmp13374);
__m512 tmp13357 = _mm512_fmadd_ps(tmp13348, _mm512_set1_ps(4e+00f), tmp13349);
__m512 tmp13372 = _mm512_fmadd_ps(tmp13363, _mm512_set1_ps(4e+00f), tmp13364);
__m512 tmp13346 = _mm512_add_ps(tmp13347, tmp13299);
__m512 tmp13361 = _mm512_add_ps(tmp13362, tmp13305);
__m512 tmp13345 = _mm512_fmadd_ps(tmp13350, _mm512_set1_ps(3.2e+01f), tmp13346);
__m512 tmp13360 = _mm512_fmadd_ps(tmp13365, _mm512_set1_ps(3.2e+01f), tmp13361);
__m512 tmp13356 = _mm512_fmadd_ps(tmp13350, _mm512_set1_ps(8e+00f), tmp13357);
__m512 tmp13371 = _mm512_fmadd_ps(tmp13365, _mm512_set1_ps(8e+00f), tmp13372);
__m512 out1735 = tmp13345;
__m512 out1739 = tmp13360;
__m512 out1736 = tmp13351;
__m512 out1740 = tmp13366;
__m512 out1737 = tmp13356;
__m512 out1741 = tmp13371;
__m512 out1738 = tmp13358;
__m512 out1742 = tmp13373;
out1735 = _mm512_max_ps(_mm512_setzero_ps(), out1735);
out1739 = _mm512_max_ps(_mm512_setzero_ps(), out1739);
out1736 = _mm512_max_ps(_mm512_setzero_ps(), out1736);
out1740 = _mm512_max_ps(_mm512_setzero_ps(), out1740);
out1737 = _mm512_max_ps(_mm512_setzero_ps(), out1737);
out1741 = _mm512_max_ps(_mm512_setzero_ps(), out1741);
out1738 = _mm512_max_ps(_mm512_setzero_ps(), out1738);
out1742 = _mm512_max_ps(_mm512_setzero_ps(), out1742);
_mm512_mask_storeu_ps(datPtr17+0+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 15, out1735);
_mm512_mask_storeu_ps(datPtr17+6248+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 960, out1735);
_mm512_mask_storeu_ps(datPtr17+3136+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 15, out1739);
_mm512_mask_storeu_ps(datPtr17+9384+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 960, out1739);
_mm512_mask_storeu_ps(datPtr17+112+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 15, out1736);
_mm512_mask_storeu_ps(datPtr17+6360+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 960, out1736);
_mm512_mask_storeu_ps(datPtr17+3248+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 15, out1740);
_mm512_mask_storeu_ps(datPtr17+9496+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 960, out1740);
_mm512_mask_storeu_ps(datPtr17+224+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 15, out1737);
_mm512_mask_storeu_ps(datPtr17+6472+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 960, out1737);
_mm512_mask_storeu_ps(datPtr17+3360+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 15, out1741);
_mm512_mask_storeu_ps(datPtr17+9608+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 960, out1741);
_mm512_mask_storeu_ps(datPtr17+336+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 15, out1738);
_mm512_mask_storeu_ps(datPtr17+6584+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 960, out1738);
_mm512_mask_storeu_ps(datPtr17+3472+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 15, out1742);
_mm512_mask_storeu_ps(datPtr17+9720+401408*i37+112*toH42+4*toW42+12544*k115+12544*l47, 960, out1742);
}
}
if (j30 >= last8) return;
++j30;
}

// Hands ResNet50ThreeConsumeSums3Callee1 to the threader team as a
// three-dimensional task and runs it through ResNet50ThreaderDo1.
//
// team42:    threader team that the task is submitted to.
// tensors57: pointer array stored in the task's any1 field; it is
//            forwarded untouched (presumably the callee casts it back
//            to char** to locate its buffers -- confirm in the callee).
static void ResNet50ThreeConsumeSums3(ResNet50ThreaderTeam1* team42, char** tensors57) {
    // Left without an initializer on purpose: only the fields assigned
    // below are written, exactly as in the generated original.
    ResNet50ThreaderTask1 job;

    // Task shape: 3 dimensions with hull values 1, 5 and 1
    // (presumably the per-dimension extents the threader iterates over).
    job.nd1 = 3;
    job.hull1[0] = 1;
    job.hull1[1] = 5;
    job.hull1[2] = 1;

    // Work function and the opaque argument that travels with the task.
    job.any1 = tensors57;
    job.callee1 = ResNet50ThreeConsumeSums3Callee1;

    ResNet50ThreaderDo1(team42, &job);
}

static void ResNet50ThreeArrangeFilts4Callee1(ResNet50ThreaderTask1* task74, int64_t* pt42) {
char** tensors72 = task74->any1;
ptrdiff_t b59 = pt42[0];
ptrdiff_t g24 = 0;
ptrdiff_t e21 = 0;
char*restrict bfPtr10 = tensors72[3]+512*e21;
char*restrict wfPtr10 = tensors72[3]+512+6488064*e21;
char*restrict wtPtr13 = tensors72[0]+14256*e21;
char*restrict biasPtr13 = tensors72[1];
char*restrict bnPtr13 = tensors72[2];
ptrdiff_t i44 = 1*g24;
ptrdiff_t j37 = 1*b59;
ptrdiff_t jj40 = j37+0;
if (j37 < 32) {
for (; j37 != 32; ++j37) {
ptrdiff_t k128 = 0+1*j37;
ptrdiff_t cut18 = 0;
__m512 postMul41 = _mm512_set1_ps(((float*)bnPtr13+(ptrdiff_t)2*(0+128*i44+4*j37))[0]);
__m512 postMul42 = _mm512_set1_ps(((float*)bnPtr13+(ptrdiff_t)2*(1+128*i44+4*j37))[0]);
__m512 postMul43 = _mm512_set1_ps(((float*)bnPtr13+(ptrdiff_t)2*(2+128*i44+4*j37))[0]);
__m512 postMul44 = _mm512_set1_ps(((float*)bnPtr13+(ptrdiff_t)2*(3+128*i44+4*j37))[0]);
ptrdiff_t s35 = 0;
for (; s35 != 128; ++s35) {
__m512 wt449 = _mm512_maskz_loadu_ps(511, wtPtr13+0+589824*i44+18432*j37+36*s35);
__m512 wt450 = _mm512_maskz_loadu_ps(511, wtPtr13+4608+589824*i44+18432*j37+36*s35);
__m512 wt451 = _mm512_maskz_loadu_ps(511, wtPtr13+9216+589824*i44+18432*j37+36*s35);
__m512 wt452 = _mm512_maskz_loadu_ps(511, wtPtr13+13824+589824*i44+18432*j37+36*s35);
wt449 = _mm512_mul_ps(wt449, postMul41);
wt450 = _mm512_mul_ps(wt450, postMul42);
wt451 = _mm512_mul_ps(wt451, postMul43);
wt452 = _mm512_mul_ps(wt452, postMul44);
__m512i pm185 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm186 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp13699 = _mm512_permutex2var_ps(wt449, pm185, wt451);
__m512 tmp13700 = _mm512_permutex2var_ps(wt450, pm185, wt452);
__m512 tmp13701 = _mm512_permutex2var_ps(wt449, pm186, wt451);
__m512 tmp13702 = _mm512_permutex2var_ps(wt450, pm186, wt452);
__m512 in1914 = _mm512_permutex2var_ps(tmp13699, pm185, tmp13700);
__m512 in1915 = _mm512_permutex2var_ps(tmp13699, pm186, tmp13700);
__m512 in1916 = _mm512_permutex2var_ps(tmp13701, pm185, tmp13702);
__m512 tmp13703 = _mm512_fmadd_ps(in1914, _mm512_set1_ps(4e+00f), in1916);
__m512 tmp13704 = _mm512_add_ps(in1914, in1916);
__m512 tmp13705 = _mm512_fmadd_ps(in1916, _mm512_set1_ps(4e+00f), in1914);
__m512 tmp13706 = _mm512_add_ps(in1915, tmp13704);
__m512 tmp13707 = _mm512_fmadd_ps(in1915, _mm512_set1_ps(2e+00f), tmp13705);
tmp13705 = _mm512_fnmadd_ps(in1915, _mm512_set1_ps(2e+00f), tmp13705);
__m512 tmp13708 = _mm512_fnmadd_ps(in1915, _mm512_set1_ps(2e+00f), tmp13703);
tmp13703 = _mm512_fmadd_ps(in1915, _mm512_set1_ps(2e+00f), tmp13703);
tmp13704 = _mm512_sub_ps(tmp13704, in1915);
__m512 tmp13725 = _mm512_unpacklo_ps(in1914, tmp13706);
__m512 tmp13726 = _mm512_unpackhi_ps(in1914, tmp13706);
__m512 tmp13727 = _mm512_unpacklo_ps(tmp13704, tmp13707);
__m512 tmp13728 = _mm512_unpackhi_ps(tmp13704, tmp13707);
__m512 tmp13729 = _mm512_unpacklo_ps(tmp13705, tmp13703);
__m512 tmp13730 = _mm512_unpackhi_ps(tmp13705, tmp13703);
__m512 tmp13731 = _mm512_unpacklo_ps(tmp13708, in1916);
__m512 tmp13732 = _mm512_unpackhi_ps(tmp13708, in1916);
__m512 tmp13733 = _mm512_shuffle_ps(tmp13725, tmp13727, 68);
__m512 tmp13734 = _mm512_shuffle_ps(tmp13725, tmp13727, 238);
__m512 tmp13735 = _mm512_shuffle_ps(tmp13726, tmp13728, 68);
__m512 tmp13736 = _mm512_shuffle_ps(tmp13726, tmp13728, 238);
__m512 tmp13737 = _mm512_shuffle_ps(tmp13729, tmp13731, 68);
__m512 tmp13738 = _mm512_shuffle_ps(tmp13729, tmp13731, 238);
__m512 tmp13739 = _mm512_shuffle_ps(tmp13730, tmp13732, 68);
__m512 tmp13740 = _mm512_shuffle_ps(tmp13730, tmp13732, 238);
__m512 tmp13741 = _mm512_shuffle_f32x4(tmp13733, tmp13737, 136);
__m512 tmp13742 = _mm512_shuffle_f32x4(tmp13733, tmp13737, 221);
__m512 tmp13743 = _mm512_shuffle_f32x4(tmp13734, tmp13738, 136);
__m512 tmp13744 = _mm512_shuffle_f32x4(tmp13734, tmp13738, 221);
__m512 tmp13745 = _mm512_shuffle_f32x4(tmp13735, tmp13739, 136);
__m512 tmp13746 = _mm512_shuffle_f32x4(tmp13735, tmp13739, 221);
__m512 tmp13747 = _mm512_shuffle_f32x4(tmp13736, tmp13740, 136);
__m512 tmp13748 = _mm512_shuffle_f32x4(tmp13736, tmp13740, 221);
in1914 = _mm512_shuffle_f32x4(tmp13741, tmp13741, 136);
__m512 tmp13709 = _mm512_shuffle_f32x4(tmp13741, tmp13741, 221);
tmp13706 = _mm512_shuffle_f32x4(tmp13743, tmp13743, 136);
__m512 tmp13710 = _mm512_shuffle_f32x4(tmp13743, tmp13743, 221);
tmp13704 = _mm512_shuffle_f32x4(tmp13745, tmp13745, 136);
__m512 tmp13711 = _mm512_shuffle_f32x4(tmp13745, tmp13745, 221);
tmp13707 = _mm512_shuffle_f32x4(tmp13747, tmp13747, 136);
__m512 tmp13712 = _mm512_shuffle_f32x4(tmp13747, tmp13747, 221);
tmp13705 = _mm512_shuffle_f32x4(tmp13742, tmp13742, 136);
tmp13703 = _mm512_shuffle_f32x4(tmp13744, tmp13744, 136);
tmp13708 = _mm512_shuffle_f32x4(tmp13746, tmp13746, 136);
in1916 = _mm512_shuffle_f32x4(tmp13748, tmp13748, 136);
in1914 = _mm512_shuffle_f32x4(in1914, tmp13707, 68);
tmp13706 = _mm512_shuffle_f32x4(tmp13706, tmp13705, 68);
tmp13704 = _mm512_shuffle_f32x4(tmp13704, tmp13703, 68);
tmp13708 = _mm512_shuffle_f32x4(tmp13708, tmp13710, 68);
in1916 = _mm512_shuffle_f32x4(in1916, tmp13711, 68);
tmp13709 = _mm512_shuffle_f32x4(tmp13709, tmp13712, 68);
__m512 tmp13713 = _mm512_fmadd_ps(in1914, _mm512_set1_ps(4e+00f), tmp13704);
__m512 tmp13719 = _mm512_fmadd_ps(tmp13708, _mm512_set1_ps(4e+00f), tmp13709);
__m512 tmp13714 = _mm512_add_ps(in1914, tmp13704);
__m512 tmp13720 = _mm512_add_ps(tmp13708, tmp13709);
__m512 tmp13715 = _mm512_fmadd_ps(tmp13704, _mm512_set1_ps(4e+00f), in1914);
__m512 tmp13721 = _mm512_fmadd_ps(tmp13709, _mm512_set1_ps(4e+00f), tmp13708);
__m512 tmp13716 = _mm512_add_ps(tmp13706, tmp13714);
__m512 tmp13722 = _mm512_add_ps(in1916, tmp13720);
__m512 tmp13717 = _mm512_fmadd_ps(tmp13706, _mm512_set1_ps(2e+00f), tmp13715);
__m512 tmp13723 = _mm512_fmadd_ps(in1916, _mm512_set1_ps(2e+00f), tmp13721);
tmp13715 = _mm512_fnmadd_ps(tmp13706, _mm512_set1_ps(2e+00f), tmp13715);
tmp13721 = _mm512_fnmadd_ps(in1916, _mm512_set1_ps(2e+00f), tmp13721);
__m512 tmp13718 = _mm512_fnmadd_ps(tmp13706, _mm512_set1_ps(2e+00f), tmp13713);
__m512 tmp13724 = _mm512_fnmadd_ps(in1916, _mm512_set1_ps(2e+00f), tmp13719);
tmp13713 = _mm512_fmadd_ps(tmp13706, _mm512_set1_ps(2e+00f), tmp13713);
tmp13719 = _mm512_fmadd_ps(in1916, _mm512_set1_ps(2e+00f), tmp13719);
tmp13714 = _mm512_sub_ps(tmp13714, tmp13706);
tmp13720 = _mm512_sub_ps(tmp13720, in1916);
in1914 = _mm512_mul_ps(in1914, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp13716 = _mm512_mul_ps(tmp13716, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp13714 = _mm512_mul_ps(tmp13714, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp13717 = _mm512_mul_ps(tmp13717, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp13715 = _mm512_mul_ps(tmp13715, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp13713 = _mm512_mul_ps(tmp13713, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp13718 = _mm512_mul_ps(tmp13718, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp13704 = _mm512_mul_ps(tmp13704, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp13708 = _mm512_mul_ps(tmp13708, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp13722 = _mm512_mul_ps(tmp13722, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp13720 = _mm512_mul_ps(tmp13720, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp13723 = _mm512_mul_ps(tmp13723, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp13721 = _mm512_mul_ps(tmp13721, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp13719 = _mm512_mul_ps(tmp13719, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp13724 = _mm512_mul_ps(tmp13724, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp13709 = _mm512_mul_ps(tmp13709, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
__m512 out1743 = _mm512_shuffle_f32x4(in1914, tmp13716, 68);
__m512 out1747 = _mm512_shuffle_f32x4(in1914, tmp13716, 238);
__m512 out1744 = _mm512_shuffle_f32x4(tmp13714, tmp13717, 68);
__m512 out1748 = _mm512_shuffle_f32x4(tmp13714, tmp13717, 238);
__m512 out1745 = _mm512_shuffle_f32x4(tmp13715, tmp13713, 68);
__m512 out1749 = _mm512_shuffle_f32x4(tmp13715, tmp13713, 238);
__m512 out1746 = _mm512_shuffle_f32x4(tmp13718, tmp13704, 68);
__m512 out1750 = _mm512_shuffle_f32x4(tmp13718, tmp13704, 238);
__m512 out1751 = _mm512_shuffle_f32x4(tmp13708, tmp13722, 68);
__m512 out1755 = _mm512_shuffle_f32x4(tmp13708, tmp13722, 238);
__m512 out1752 = _mm512_shuffle_f32x4(tmp13720, tmp13723, 68);
__m512 out1756 = _mm512_shuffle_f32x4(tmp13720, tmp13723, 238);
__m512 out1753 = _mm512_shuffle_f32x4(tmp13721, tmp13719, 68);
__m512 out1757 = _mm512_shuffle_f32x4(tmp13721, tmp13719, 238);
__m512 out1754 = _mm512_shuffle_f32x4(tmp13724, tmp13709, 68);
__m512 out1758 = _mm512_shuffle_f32x4(tmp13724, tmp13709, 238);
ptrdiff_t off13 = 32*cut18;
ptrdiff_t off14 = (size_t)(cut18+1)/4*16384+(size_t)(cut18+1)%4*32;
ptrdiff_t off15 = (size_t)(cut18+2)/4*16384+(size_t)(cut18+2)%4*32;
ptrdiff_t off16 = (size_t)(cut18+3)/4*16384+(size_t)(cut18+3)%4*32;
__m512i wf105 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1743, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf106 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1747, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf107 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1751, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf108 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1755, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf109 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1744, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf110 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1748, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf111 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1752, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf112 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1756, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf113 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1745, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf114 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1749, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf115 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1753, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf116 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1757, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf117 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1746, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf118 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1750, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf119 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1754, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf120 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1758, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
_mm512_mask_storeu_epi32(wfPtr10+0+2097152*i44+16384*k128+off13+128*s35, 255, wf105);
_mm512_mask_storeu_epi32(wfPtr10+0+2097152*i44+16384*k128+off14+128*s35, 255, wf106);
_mm512_mask_storeu_epi32(wfPtr10+0+2097152*i44+16384*k128+off15+128*s35, 255, wf107);
_mm512_mask_storeu_epi32(wfPtr10+0+2097152*i44+16384*k128+off16+128*s35, 255, wf108);
_mm512_mask_storeu_epi32(wfPtr10+524288+2097152*i44+16384*k128+off13+128*s35, 255, wf109);
_mm512_mask_storeu_epi32(wfPtr10+524288+2097152*i44+16384*k128+off14+128*s35, 255, wf110);
_mm512_mask_storeu_epi32(wfPtr10+524288+2097152*i44+16384*k128+off15+128*s35, 255, wf111);
_mm512_mask_storeu_epi32(wfPtr10+524288+2097152*i44+16384*k128+off16+128*s35, 255, wf112);
_mm512_mask_storeu_epi32(wfPtr10+1048576+2097152*i44+16384*k128+off13+128*s35, 255, wf113);
_mm512_mask_storeu_epi32(wfPtr10+1048576+2097152*i44+16384*k128+off14+128*s35, 255, wf114);
_mm512_mask_storeu_epi32(wfPtr10+1048576+2097152*i44+16384*k128+off15+128*s35, 255, wf115);
_mm512_mask_storeu_epi32(wfPtr10+1048576+2097152*i44+16384*k128+off16+128*s35, 255, wf116);
_mm512_mask_storeu_epi32(wfPtr10+1572864+2097152*i44+16384*k128+off13+128*s35, 255, wf117);
_mm512_mask_storeu_epi32(wfPtr10+1572864+2097152*i44+16384*k128+off14+128*s35, 255, wf118);
_mm512_mask_storeu_epi32(wfPtr10+1572864+2097152*i44+16384*k128+off15+128*s35, 255, wf119);
_mm512_mask_storeu_epi32(wfPtr10+1572864+2097152*i44+16384*k128+off16+128*s35, 255, wf120);
}
__m512 bias5 = _mm512_setzero_ps();
if (!e21) {
bias5 = _mm512_maskz_loadu_ps(15, biasPtr13-0+512*i44+16*j37);
__m512i pmMul27 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd27 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas9 = _mm512_maskz_loadu_ps(255, bnPtr13+(ptrdiff_t)8*(0+128*i44+4*j37));
__m512 postMul45 = _mm512_permutexvar_ps(pmMul27, mas9);
__m512 postAdd27 = _mm512_permutexvar_ps(pmAdd27, mas9);
bias5 = _mm512_fmadd_ps(bias5, postMul45, postAdd27);
}
_mm512_mask_storeu_ps(bfPtr10-0+512*i44+16*j37, 15, bias5);
if (j37 >= jj40) return;
}
}
}

/*
 * Entry point for the "ThreeArrangeFilts4" stage.
 *
 * Fills in a threader task descriptor and submits it through
 * ResNet50ThreaderDo1:
 *   - callee1: ResNet50ThreeArrangeFilts4Callee1 (the per-point worker)
 *   - any1:    the tensors71 pointer array, forwarded untouched
 *   - nd1:     3 dimensions
 *   - hull1:   extents {32, 1, 1}
 *
 * team49    threader team handed straight to ResNet50ThreaderDo1.
 * tensors71 array of raw tensor pointers; this wrapper never dereferences it.
 *
 * NOTE(review): presumably ResNet50ThreaderDo1 invokes the callee once per
 * point of the 32x1x1 hull and returns after all points are done -- confirm
 * against the threader implementation.
 */
static void ResNet50ThreeArrangeFilts4(ResNet50ThreaderTeam1* team49, char** tensors71) {
    ResNet50ThreaderTask1 job;

    /* Iteration space: three dimensions, only the first is wider than 1. */
    job.nd1 = 3;
    job.hull1[2] = 1;
    job.hull1[1] = 1;
    job.hull1[0] = 32;

    /* Worker routine and the opaque payload it will receive. */
    job.any1 = tensors71;
    job.callee1 = ResNet50ThreeArrangeFilts4Callee1;

    ResNet50ThreaderDo1(team49, &job);
}

static void ResNet50ThreeArrangeDats4Callee1(ResNet50ThreaderTask1* task76, int64_t* pt43) {
char** tensors74 = task76->any1;
ptrdiff_t s36 = 0;
ptrdiff_t c36 = pt43[1];
ptrdiff_t g25 = 0;
ptrdiff_t e22 = 0;
char*restrict datPtr23 = tensors74[0]-116+1241856*e22;
char*restrict dfPtr10 = tensors74[1]+2534400*e22;
ptrdiff_t i45 = 1*g25;
ptrdiff_t j38 = 1*c36;
ptrdiff_t last9 = j38+0;
ptrdiff_t rel21 = j38-0;
ptrdiff_t base21 = 0;
if (rel21 < 2) {
if (rel21 < 1) {
ptrdiff_t h45 = base21+0;
ptrdiff_t w57 = 0;
ptrdiff_t k129 = 0;
for (; k129 != 64; ++k129) {
__m512 dat1941 = _mm512_maskz_loadu_ps(8191, datPtr23+116+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1942 = _mm512_maskz_loadu_ps(16383, datPtr23+160+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512i pm187 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1917 = _mm512_permutexvar_ps(pm187, dat1941);
__m512i pm188 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1924 = _mm512_permutexvar_ps(pm188, dat1942);
__m512 dat1943 = _mm512_maskz_loadu_ps(8191, datPtr23+228+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1944 = _mm512_maskz_loadu_ps(16383, datPtr23+272+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1918 = _mm512_permutexvar_ps(pm187, dat1943);
__m512 in1925 = _mm512_permutexvar_ps(pm188, dat1944);
__m512 dat1945 = _mm512_maskz_loadu_ps(8191, datPtr23+340+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1946 = _mm512_maskz_loadu_ps(16383, datPtr23+384+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1919 = _mm512_permutexvar_ps(pm187, dat1945);
__m512 in1926 = _mm512_permutexvar_ps(pm188, dat1946);
__m512 dat1947 = _mm512_maskz_loadu_ps(8191, datPtr23+452+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1948 = _mm512_maskz_loadu_ps(16383, datPtr23+496+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1920 = _mm512_permutexvar_ps(pm187, dat1947);
__m512 in1927 = _mm512_permutexvar_ps(pm188, dat1948);
__m512 dat1949 = _mm512_maskz_loadu_ps(8191, datPtr23+564+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1950 = _mm512_maskz_loadu_ps(16383, datPtr23+608+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1921 = _mm512_permutexvar_ps(pm187, dat1949);
__m512 in1928 = _mm512_permutexvar_ps(pm188, dat1950);
__m512 dat1951 = _mm512_maskz_loadu_ps(8191, datPtr23+676+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1952 = _mm512_maskz_loadu_ps(16383, datPtr23+720+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1922 = _mm512_permutexvar_ps(pm187, dat1951);
__m512 in1929 = _mm512_permutexvar_ps(pm188, dat1952);
__m512 dat1953 = _mm512_maskz_loadu_ps(8191, datPtr23+788+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1954 = _mm512_maskz_loadu_ps(16383, datPtr23+832+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1923 = _mm512_permutexvar_ps(pm187, dat1953);
__m512 in1930 = _mm512_permutexvar_ps(pm188, dat1954);
__m512 tmp13749 = _mm512_add_ps(in1917, in1921);
__m512 tmp13754 = _mm512_add_ps(in1924, in1928);
__m512 tmp13750 = _mm512_sub_ps(in1920, in1918);
__m512 tmp13755 = _mm512_sub_ps(in1927, in1925);
__m512 tmp13751 = _mm512_add_ps(in1918, in1922);
__m512 tmp13756 = _mm512_add_ps(in1925, in1929);
__m512 tmp13752 = _mm512_sub_ps(_mm512_setzero_ps(), in1922);
__m512 tmp13757 = _mm512_sub_ps(_mm512_setzero_ps(), in1929);
tmp13749 = _mm512_fmadd_ps(in1919, _mm512_set1_ps(-4.25e+00f), tmp13749);
tmp13754 = _mm512_fmadd_ps(in1926, _mm512_set1_ps(-4.25e+00f), tmp13754);
tmp13751 = _mm512_fmadd_ps(in1920, _mm512_set1_ps(-4.25e+00f), tmp13751);
tmp13756 = _mm512_fmadd_ps(in1927, _mm512_set1_ps(-4.25e+00f), tmp13756);
tmp13752 = _mm512_fmadd_ps(tmp13750, _mm512_set1_ps(5.25e+00f), tmp13752);
tmp13757 = _mm512_fmadd_ps(tmp13755, _mm512_set1_ps(5.25e+00f), tmp13757);
tmp13750 = _mm512_fmadd_ps(in1918, _mm512_set1_ps(2.5e-01f), in1922);
tmp13755 = _mm512_fmadd_ps(in1925, _mm512_set1_ps(2.5e-01f), in1929);
in1918 = _mm512_fmadd_ps(in1918, _mm512_set1_ps(4e+00f), in1922);
in1925 = _mm512_fmadd_ps(in1925, _mm512_set1_ps(4e+00f), in1929);
__m512 tmp13753 = _mm512_sub_ps(tmp13751, tmp13749);
__m512 tmp13758 = _mm512_sub_ps(tmp13756, tmp13754);
tmp13751 = _mm512_add_ps(tmp13749, tmp13751);
tmp13756 = _mm512_add_ps(tmp13754, tmp13756);
tmp13749 = _mm512_fmadd_ps(in1917, _mm512_set1_ps(2.5e-01f), in1921);
tmp13754 = _mm512_fmadd_ps(in1924, _mm512_set1_ps(2.5e-01f), in1928);
tmp13750 = _mm512_fmadd_ps(in1920, _mm512_set1_ps(-1.25e+00f), tmp13750);
tmp13755 = _mm512_fmadd_ps(in1927, _mm512_set1_ps(-1.25e+00f), tmp13755);
in1920 = _mm512_fmadd_ps(in1920, _mm512_set1_ps(-5e+00f), in1918);
in1927 = _mm512_fmadd_ps(in1927, _mm512_set1_ps(-5e+00f), in1925);
tmp13749 = _mm512_fmadd_ps(in1919, _mm512_set1_ps(-1.25e+00f), tmp13749);
tmp13754 = _mm512_fmadd_ps(in1926, _mm512_set1_ps(-1.25e+00f), tmp13754);
in1922 = _mm512_fmadd_ps(tmp13749, _mm512_set1_ps(2e+00f), tmp13750);
in1929 = _mm512_fmadd_ps(tmp13754, _mm512_set1_ps(2e+00f), tmp13755);
tmp13750 = _mm512_fnmadd_ps(tmp13749, _mm512_set1_ps(2e+00f), tmp13750);
tmp13755 = _mm512_fnmadd_ps(tmp13754, _mm512_set1_ps(2e+00f), tmp13755);
tmp13749 = _mm512_fmadd_ps(in1921, _mm512_set1_ps(2.5e-01f), in1917);
tmp13754 = _mm512_fmadd_ps(in1928, _mm512_set1_ps(2.5e-01f), in1924);
in1917 = _mm512_sub_ps(in1923, in1917);
in1924 = _mm512_sub_ps(in1930, in1924);
tmp13749 = _mm512_fmadd_ps(in1919, _mm512_set1_ps(-1.25e+00f), tmp13749);
tmp13754 = _mm512_fmadd_ps(in1926, _mm512_set1_ps(-1.25e+00f), tmp13754);
in1919 = _mm512_sub_ps(in1919, in1921);
in1926 = _mm512_sub_ps(in1926, in1928);
in1919 = _mm512_fmadd_ps(in1919, _mm512_set1_ps(5.25e+00f), in1917);
in1926 = _mm512_fmadd_ps(in1926, _mm512_set1_ps(5.25e+00f), in1924);
in1918 = _mm512_fmadd_ps(tmp13749, _mm512_set1_ps(2e+00f), in1920);
in1925 = _mm512_fmadd_ps(tmp13754, _mm512_set1_ps(2e+00f), in1927);
in1920 = _mm512_fnmadd_ps(tmp13749, _mm512_set1_ps(2e+00f), in1920);
in1927 = _mm512_fnmadd_ps(tmp13754, _mm512_set1_ps(2e+00f), in1927);
__m512 tmp13767 = _mm512_unpacklo_ps(tmp13752, tmp13751);
__m512 tmp13768 = _mm512_unpackhi_ps(tmp13752, tmp13751);
__m512 tmp13769 = _mm512_unpacklo_ps(tmp13753, in1922);
__m512 tmp13770 = _mm512_unpackhi_ps(tmp13753, in1922);
__m512 tmp13771 = _mm512_unpacklo_ps(tmp13750, in1918);
__m512 tmp13772 = _mm512_unpackhi_ps(tmp13750, in1918);
__m512 tmp13773 = _mm512_unpacklo_ps(in1920, in1919);
__m512 tmp13774 = _mm512_unpackhi_ps(in1920, in1919);
__m512 tmp13775 = _mm512_unpacklo_ps(tmp13757, tmp13756);
__m512 tmp13776 = _mm512_unpackhi_ps(tmp13757, tmp13756);
__m512 tmp13777 = _mm512_unpacklo_ps(tmp13758, in1929);
__m512 tmp13778 = _mm512_unpackhi_ps(tmp13758, in1929);
__m512 tmp13779 = _mm512_unpacklo_ps(tmp13755, in1925);
__m512 tmp13780 = _mm512_unpackhi_ps(tmp13755, in1925);
__m512 tmp13781 = _mm512_unpacklo_ps(in1927, in1926);
__m512 tmp13782 = _mm512_unpackhi_ps(in1927, in1926);
__m512 tmp13783 = _mm512_shuffle_ps(tmp13767, tmp13769, 68);
__m512 tmp13784 = _mm512_shuffle_ps(tmp13767, tmp13769, 238);
__m512 tmp13785 = _mm512_shuffle_ps(tmp13768, tmp13770, 68);
__m512 tmp13786 = _mm512_shuffle_ps(tmp13768, tmp13770, 238);
__m512 tmp13787 = _mm512_shuffle_ps(tmp13771, tmp13773, 68);
__m512 tmp13788 = _mm512_shuffle_ps(tmp13771, tmp13773, 238);
__m512 tmp13789 = _mm512_shuffle_ps(tmp13772, tmp13774, 68);
__m512 tmp13790 = _mm512_shuffle_ps(tmp13772, tmp13774, 238);
__m512 tmp13791 = _mm512_shuffle_ps(tmp13775, tmp13777, 68);
__m512 tmp13792 = _mm512_shuffle_ps(tmp13775, tmp13777, 238);
__m512 tmp13793 = _mm512_shuffle_ps(tmp13776, tmp13778, 68);
__m512 tmp13794 = _mm512_shuffle_ps(tmp13776, tmp13778, 238);
__m512 tmp13795 = _mm512_shuffle_ps(tmp13779, tmp13781, 68);
__m512 tmp13796 = _mm512_shuffle_ps(tmp13779, tmp13781, 238);
__m512 tmp13797 = _mm512_shuffle_ps(tmp13780, tmp13782, 68);
__m512 tmp13798 = _mm512_shuffle_ps(tmp13780, tmp13782, 238);
__m512 tmp13799 = _mm512_shuffle_f32x4(tmp13783, tmp13787, 136);
__m512 tmp13800 = _mm512_shuffle_f32x4(tmp13783, tmp13787, 221);
__m512 tmp13801 = _mm512_shuffle_f32x4(tmp13784, tmp13788, 136);
__m512 tmp13802 = _mm512_shuffle_f32x4(tmp13784, tmp13788, 221);
__m512 tmp13803 = _mm512_shuffle_f32x4(tmp13785, tmp13789, 136);
__m512 tmp13804 = _mm512_shuffle_f32x4(tmp13785, tmp13789, 221);
__m512 tmp13805 = _mm512_shuffle_f32x4(tmp13786, tmp13790, 136);
__m512 tmp13806 = _mm512_shuffle_f32x4(tmp13786, tmp13790, 221);
__m512 tmp13807 = _mm512_shuffle_f32x4(tmp13791, tmp13795, 136);
__m512 tmp13808 = _mm512_shuffle_f32x4(tmp13791, tmp13795, 221);
__m512 tmp13809 = _mm512_shuffle_f32x4(tmp13792, tmp13796, 136);
__m512 tmp13810 = _mm512_shuffle_f32x4(tmp13792, tmp13796, 221);
__m512 tmp13811 = _mm512_shuffle_f32x4(tmp13793, tmp13797, 136);
__m512 tmp13812 = _mm512_shuffle_f32x4(tmp13793, tmp13797, 221);
__m512 tmp13813 = _mm512_shuffle_f32x4(tmp13794, tmp13798, 136);
__m512 tmp13814 = _mm512_shuffle_f32x4(tmp13794, tmp13798, 221);
tmp13752 = _mm512_shuffle_f32x4(tmp13799, tmp13807, 136);
tmp13757 = _mm512_shuffle_f32x4(tmp13799, tmp13807, 221);
tmp13751 = _mm512_shuffle_f32x4(tmp13801, tmp13809, 136);
tmp13756 = _mm512_shuffle_f32x4(tmp13801, tmp13809, 221);
tmp13753 = _mm512_shuffle_f32x4(tmp13803, tmp13811, 136);
tmp13758 = _mm512_shuffle_f32x4(tmp13803, tmp13811, 221);
in1922 = _mm512_shuffle_f32x4(tmp13805, tmp13813, 136);
in1929 = _mm512_shuffle_f32x4(tmp13805, tmp13813, 221);
tmp13750 = _mm512_shuffle_f32x4(tmp13800, tmp13808, 136);
tmp13755 = _mm512_shuffle_f32x4(tmp13800, tmp13808, 221);
in1918 = _mm512_shuffle_f32x4(tmp13802, tmp13810, 136);
in1925 = _mm512_shuffle_f32x4(tmp13802, tmp13810, 221);
in1920 = _mm512_shuffle_f32x4(tmp13804, tmp13812, 136);
in1927 = _mm512_shuffle_f32x4(tmp13804, tmp13812, 221);
in1919 = _mm512_shuffle_f32x4(tmp13806, tmp13814, 136);
in1926 = _mm512_shuffle_f32x4(tmp13806, tmp13814, 221);
__m512 tmp13759 = _mm512_add_ps(tmp13751, in1918);
__m512 tmp13763 = _mm512_add_ps(tmp13756, in1925);
__m512 tmp13760 = _mm512_sub_ps(tmp13750, tmp13753);
__m512 tmp13764 = _mm512_sub_ps(tmp13755, tmp13758);
__m512 tmp13761 = _mm512_add_ps(tmp13753, in1920);
__m512 tmp13765 = _mm512_add_ps(tmp13758, in1927);
tmp13752 = _mm512_sub_ps(tmp13752, in1920);
tmp13757 = _mm512_sub_ps(tmp13757, in1927);
tmp13759 = _mm512_fmadd_ps(in1922, _mm512_set1_ps(-4.25e+00f), tmp13759);
tmp13763 = _mm512_fmadd_ps(in1929, _mm512_set1_ps(-4.25e+00f), tmp13763);
tmp13761 = _mm512_fmadd_ps(tmp13750, _mm512_set1_ps(-4.25e+00f), tmp13761);
tmp13765 = _mm512_fmadd_ps(tmp13755, _mm512_set1_ps(-4.25e+00f), tmp13765);
tmp13752 = _mm512_fmadd_ps(tmp13760, _mm512_set1_ps(5.25e+00f), tmp13752);
tmp13757 = _mm512_fmadd_ps(tmp13764, _mm512_set1_ps(5.25e+00f), tmp13757);
tmp13760 = _mm512_fmadd_ps(tmp13753, _mm512_set1_ps(2.5e-01f), in1920);
tmp13764 = _mm512_fmadd_ps(tmp13758, _mm512_set1_ps(2.5e-01f), in1927);
tmp13753 = _mm512_fmadd_ps(tmp13753, _mm512_set1_ps(4e+00f), in1920);
tmp13758 = _mm512_fmadd_ps(tmp13758, _mm512_set1_ps(4e+00f), in1927);
__m512 tmp13762 = _mm512_sub_ps(tmp13761, tmp13759);
__m512 tmp13766 = _mm512_sub_ps(tmp13765, tmp13763);
tmp13761 = _mm512_add_ps(tmp13759, tmp13761);
tmp13765 = _mm512_add_ps(tmp13763, tmp13765);
tmp13759 = _mm512_fmadd_ps(tmp13751, _mm512_set1_ps(2.5e-01f), in1918);
tmp13763 = _mm512_fmadd_ps(tmp13756, _mm512_set1_ps(2.5e-01f), in1925);
tmp13760 = _mm512_fmadd_ps(tmp13750, _mm512_set1_ps(-1.25e+00f), tmp13760);
tmp13764 = _mm512_fmadd_ps(tmp13755, _mm512_set1_ps(-1.25e+00f), tmp13764);
tmp13750 = _mm512_fmadd_ps(tmp13750, _mm512_set1_ps(-5e+00f), tmp13753);
tmp13755 = _mm512_fmadd_ps(tmp13755, _mm512_set1_ps(-5e+00f), tmp13758);
tmp13759 = _mm512_fmadd_ps(in1922, _mm512_set1_ps(-1.25e+00f), tmp13759);
tmp13763 = _mm512_fmadd_ps(in1929, _mm512_set1_ps(-1.25e+00f), tmp13763);
in1920 = _mm512_fmadd_ps(tmp13759, _mm512_set1_ps(2e+00f), tmp13760);
in1927 = _mm512_fmadd_ps(tmp13763, _mm512_set1_ps(2e+00f), tmp13764);
tmp13760 = _mm512_fnmadd_ps(tmp13759, _mm512_set1_ps(2e+00f), tmp13760);
tmp13764 = _mm512_fnmadd_ps(tmp13763, _mm512_set1_ps(2e+00f), tmp13764);
tmp13759 = _mm512_fmadd_ps(in1918, _mm512_set1_ps(2.5e-01f), tmp13751);
tmp13763 = _mm512_fmadd_ps(in1925, _mm512_set1_ps(2.5e-01f), tmp13756);
tmp13751 = _mm512_sub_ps(in1919, tmp13751);
tmp13756 = _mm512_sub_ps(in1926, tmp13756);
tmp13759 = _mm512_fmadd_ps(in1922, _mm512_set1_ps(-1.25e+00f), tmp13759);
tmp13763 = _mm512_fmadd_ps(in1929, _mm512_set1_ps(-1.25e+00f), tmp13763);
in1922 = _mm512_sub_ps(in1922, in1918);
in1929 = _mm512_sub_ps(in1929, in1925);
in1922 = _mm512_fmadd_ps(in1922, _mm512_set1_ps(5.25e+00f), tmp13751);
in1929 = _mm512_fmadd_ps(in1929, _mm512_set1_ps(5.25e+00f), tmp13756);
tmp13753 = _mm512_fmadd_ps(tmp13759, _mm512_set1_ps(2e+00f), tmp13750);
tmp13758 = _mm512_fmadd_ps(tmp13763, _mm512_set1_ps(2e+00f), tmp13755);
tmp13750 = _mm512_fnmadd_ps(tmp13759, _mm512_set1_ps(2e+00f), tmp13750);
tmp13755 = _mm512_fnmadd_ps(tmp13763, _mm512_set1_ps(2e+00f), tmp13755);
__m512 out1759 = _mm512_shuffle_f32x4(tmp13752, tmp13761, 68);
__m512 out1767 = _mm512_shuffle_f32x4(tmp13752, tmp13761, 238);
__m512 out1760 = _mm512_shuffle_f32x4(tmp13762, in1920, 68);
__m512 out1768 = _mm512_shuffle_f32x4(tmp13762, in1920, 238);
__m512 out1761 = _mm512_shuffle_f32x4(tmp13760, tmp13753, 68);
__m512 out1769 = _mm512_shuffle_f32x4(tmp13760, tmp13753, 238);
__m512 out1762 = _mm512_shuffle_f32x4(tmp13750, in1922, 68);
__m512 out1770 = _mm512_shuffle_f32x4(tmp13750, in1922, 238);
__m512 out1763 = _mm512_shuffle_f32x4(tmp13757, tmp13765, 68);
__m512 out1771 = _mm512_shuffle_f32x4(tmp13757, tmp13765, 238);
__m512 out1764 = _mm512_shuffle_f32x4(tmp13766, in1927, 68);
__m512 out1772 = _mm512_shuffle_f32x4(tmp13766, in1927, 238);
__m512 out1765 = _mm512_shuffle_f32x4(tmp13764, tmp13758, 68);
__m512 out1773 = _mm512_shuffle_f32x4(tmp13764, tmp13758, 238);
__m512 out1766 = _mm512_shuffle_f32x4(tmp13755, in1929, 68);
__m512 out1774 = _mm512_shuffle_f32x4(tmp13755, in1929, 238);
_mm512_storeu_ps(dfPtr10+0+819200*i45+49152*j38+49152*s36+768*k129, out1759);
_mm512_storeu_ps(dfPtr10+128+819200*i45+49152*j38+49152*s36+768*k129, out1767);
_mm512_storeu_ps(dfPtr10+64+819200*i45+49152*j38+49152*s36+768*k129, out1763);
_mm512_storeu_ps(dfPtr10+192+819200*i45+49152*j38+49152*s36+768*k129, out1771);
_mm512_storeu_ps(dfPtr10+204800+819200*i45+49152*j38+49152*s36+768*k129, out1760);
_mm512_storeu_ps(dfPtr10+204928+819200*i45+49152*j38+49152*s36+768*k129, out1768);
_mm512_storeu_ps(dfPtr10+204864+819200*i45+49152*j38+49152*s36+768*k129, out1764);
_mm512_storeu_ps(dfPtr10+204992+819200*i45+49152*j38+49152*s36+768*k129, out1772);
_mm512_storeu_ps(dfPtr10+409600+819200*i45+49152*j38+49152*s36+768*k129, out1761);
_mm512_storeu_ps(dfPtr10+409728+819200*i45+49152*j38+49152*s36+768*k129, out1769);
_mm512_storeu_ps(dfPtr10+409664+819200*i45+49152*j38+49152*s36+768*k129, out1765);
_mm512_storeu_ps(dfPtr10+409792+819200*i45+49152*j38+49152*s36+768*k129, out1773);
_mm512_storeu_ps(dfPtr10+614400+819200*i45+49152*j38+49152*s36+768*k129, out1762);
_mm512_storeu_ps(dfPtr10+614528+819200*i45+49152*j38+49152*s36+768*k129, out1770);
_mm512_storeu_ps(dfPtr10+614464+819200*i45+49152*j38+49152*s36+768*k129, out1766);
_mm512_storeu_ps(dfPtr10+614592+819200*i45+49152*j38+49152*s36+768*k129, out1774);
__m512 dat1955 = _mm512_maskz_loadu_ps(127, datPtr23+676+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512i pm189 = _mm512_set_epi32(6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in1931 = _mm512_permutexvar_ps(pm189, dat1955);
__m512 dat1956 = _mm512_maskz_loadu_ps(31, datPtr23+208+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1957 = _mm512_maskz_loadu_ps(127, datPtr23+788+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1958 = _mm512_maskz_loadu_ps(8191, datPtr23+3252+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512i pm190 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in1932 = _mm512_permutex2var_ps(dat1956, pm190, dat1957);
__m512i pm191 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1939 = _mm512_permutexvar_ps(pm191, dat1958);
__m512 dat1959 = _mm512_maskz_loadu_ps(31, datPtr23+320+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1960 = _mm512_maskz_loadu_ps(127, datPtr23+900+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1961 = _mm512_maskz_loadu_ps(8191, datPtr23+3364+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1933 = _mm512_permutex2var_ps(dat1959, pm190, dat1960);
__m512 in1940 = _mm512_permutexvar_ps(pm191, dat1961);
__m512 dat1962 = _mm512_maskz_loadu_ps(31, datPtr23+432+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1963 = _mm512_maskz_loadu_ps(127, datPtr23+1012+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1964 = _mm512_maskz_loadu_ps(8191, datPtr23+3476+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1934 = _mm512_permutex2var_ps(dat1962, pm190, dat1963);
__m512 in1941 = _mm512_permutexvar_ps(pm191, dat1964);
__m512 dat1965 = _mm512_maskz_loadu_ps(31, datPtr23+544+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1966 = _mm512_maskz_loadu_ps(127, datPtr23+1124+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1967 = _mm512_maskz_loadu_ps(8191, datPtr23+3588+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1935 = _mm512_permutex2var_ps(dat1965, pm190, dat1966);
__m512 in1942 = _mm512_permutexvar_ps(pm191, dat1967);
__m512 dat1968 = _mm512_maskz_loadu_ps(31, datPtr23+656+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1969 = _mm512_maskz_loadu_ps(127, datPtr23+1236+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1970 = _mm512_maskz_loadu_ps(8191, datPtr23+3700+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1936 = _mm512_permutex2var_ps(dat1968, pm190, dat1969);
__m512 in1943 = _mm512_permutexvar_ps(pm191, dat1970);
__m512 dat1971 = _mm512_maskz_loadu_ps(31, datPtr23+768+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1972 = _mm512_maskz_loadu_ps(127, datPtr23+1348+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1973 = _mm512_maskz_loadu_ps(8191, datPtr23+3812+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1937 = _mm512_permutex2var_ps(dat1971, pm190, dat1972);
__m512 in1944 = _mm512_permutexvar_ps(pm191, dat1973);
__m512 dat1974 = _mm512_maskz_loadu_ps(31, datPtr23+880+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1975 = _mm512_maskz_loadu_ps(127, datPtr23+1460+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1976 = _mm512_maskz_loadu_ps(8191, datPtr23+3924+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1938 = _mm512_permutex2var_ps(dat1974, pm190, dat1975);
__m512 in1945 = _mm512_permutexvar_ps(pm191, dat1976);
__m512 tmp13815 = _mm512_add_ps(in1932, in1936);
__m512 tmp13819 = _mm512_add_ps(in1939, in1943);
__m512 tmp13816 = _mm512_sub_ps(in1935, in1933);
__m512 tmp13820 = _mm512_sub_ps(in1942, in1940);
__m512 tmp13817 = _mm512_add_ps(in1933, in1937);
__m512 tmp13821 = _mm512_add_ps(in1940, in1944);
in1931 = _mm512_sub_ps(in1931, in1937);
__m512 tmp13822 = _mm512_sub_ps(_mm512_setzero_ps(), in1944);
tmp13815 = _mm512_fmadd_ps(in1934, _mm512_set1_ps(-4.25e+00f), tmp13815);
tmp13819 = _mm512_fmadd_ps(in1941, _mm512_set1_ps(-4.25e+00f), tmp13819);
tmp13817 = _mm512_fmadd_ps(in1935, _mm512_set1_ps(-4.25e+00f), tmp13817);
tmp13821 = _mm512_fmadd_ps(in1942, _mm512_set1_ps(-4.25e+00f), tmp13821);
in1931 = _mm512_fmadd_ps(tmp13816, _mm512_set1_ps(5.25e+00f), in1931);
tmp13822 = _mm512_fmadd_ps(tmp13820, _mm512_set1_ps(5.25e+00f), tmp13822);
tmp13816 = _mm512_fmadd_ps(in1933, _mm512_set1_ps(2.5e-01f), in1937);
tmp13820 = _mm512_fmadd_ps(in1940, _mm512_set1_ps(2.5e-01f), in1944);
in1933 = _mm512_fmadd_ps(in1933, _mm512_set1_ps(4e+00f), in1937);
in1940 = _mm512_fmadd_ps(in1940, _mm512_set1_ps(4e+00f), in1944);
__m512 tmp13818 = _mm512_sub_ps(tmp13817, tmp13815);
__m512 tmp13823 = _mm512_sub_ps(tmp13821, tmp13819);
tmp13817 = _mm512_add_ps(tmp13815, tmp13817);
tmp13821 = _mm512_add_ps(tmp13819, tmp13821);
tmp13815 = _mm512_fmadd_ps(in1932, _mm512_set1_ps(2.5e-01f), in1936);
tmp13819 = _mm512_fmadd_ps(in1939, _mm512_set1_ps(2.5e-01f), in1943);
tmp13816 = _mm512_fmadd_ps(in1935, _mm512_set1_ps(-1.25e+00f), tmp13816);
tmp13820 = _mm512_fmadd_ps(in1942, _mm512_set1_ps(-1.25e+00f), tmp13820);
in1935 = _mm512_fmadd_ps(in1935, _mm512_set1_ps(-5e+00f), in1933);
in1942 = _mm512_fmadd_ps(in1942, _mm512_set1_ps(-5e+00f), in1940);
tmp13815 = _mm512_fmadd_ps(in1934, _mm512_set1_ps(-1.25e+00f), tmp13815);
tmp13819 = _mm512_fmadd_ps(in1941, _mm512_set1_ps(-1.25e+00f), tmp13819);
in1937 = _mm512_fmadd_ps(tmp13815, _mm512_set1_ps(2e+00f), tmp13816);
in1944 = _mm512_fmadd_ps(tmp13819, _mm512_set1_ps(2e+00f), tmp13820);
tmp13816 = _mm512_fnmadd_ps(tmp13815, _mm512_set1_ps(2e+00f), tmp13816);
tmp13820 = _mm512_fnmadd_ps(tmp13819, _mm512_set1_ps(2e+00f), tmp13820);
tmp13815 = _mm512_fmadd_ps(in1936, _mm512_set1_ps(2.5e-01f), in1932);
tmp13819 = _mm512_fmadd_ps(in1943, _mm512_set1_ps(2.5e-01f), in1939);
in1932 = _mm512_sub_ps(in1938, in1932);
in1939 = _mm512_sub_ps(in1945, in1939);
tmp13815 = _mm512_fmadd_ps(in1934, _mm512_set1_ps(-1.25e+00f), tmp13815);
tmp13819 = _mm512_fmadd_ps(in1941, _mm512_set1_ps(-1.25e+00f), tmp13819);
in1934 = _mm512_sub_ps(in1934, in1936);
in1941 = _mm512_sub_ps(in1941, in1943);
in1934 = _mm512_fmadd_ps(in1934, _mm512_set1_ps(5.25e+00f), in1932);
in1941 = _mm512_fmadd_ps(in1941, _mm512_set1_ps(5.25e+00f), in1939);
in1933 = _mm512_fmadd_ps(tmp13815, _mm512_set1_ps(2e+00f), in1935);
in1940 = _mm512_fmadd_ps(tmp13819, _mm512_set1_ps(2e+00f), in1942);
in1935 = _mm512_fnmadd_ps(tmp13815, _mm512_set1_ps(2e+00f), in1935);
in1942 = _mm512_fnmadd_ps(tmp13819, _mm512_set1_ps(2e+00f), in1942);
__m512 tmp13832 = _mm512_unpacklo_ps(in1931, tmp13817);
__m512 tmp13833 = _mm512_unpackhi_ps(in1931, tmp13817);
__m512 tmp13834 = _mm512_unpacklo_ps(tmp13818, in1937);
__m512 tmp13835 = _mm512_unpackhi_ps(tmp13818, in1937);
__m512 tmp13836 = _mm512_unpacklo_ps(tmp13816, in1933);
__m512 tmp13837 = _mm512_unpackhi_ps(tmp13816, in1933);
__m512 tmp13838 = _mm512_unpacklo_ps(in1935, in1934);
__m512 tmp13839 = _mm512_unpackhi_ps(in1935, in1934);
__m512 tmp13840 = _mm512_unpacklo_ps(tmp13822, tmp13821);
__m512 tmp13841 = _mm512_unpackhi_ps(tmp13822, tmp13821);
__m512 tmp13842 = _mm512_unpacklo_ps(tmp13823, in1944);
__m512 tmp13843 = _mm512_unpackhi_ps(tmp13823, in1944);
__m512 tmp13844 = _mm512_unpacklo_ps(tmp13820, in1940);
__m512 tmp13845 = _mm512_unpackhi_ps(tmp13820, in1940);
__m512 tmp13846 = _mm512_unpacklo_ps(in1942, in1941);
__m512 tmp13847 = _mm512_unpackhi_ps(in1942, in1941);
__m512 tmp13848 = _mm512_shuffle_ps(tmp13832, tmp13834, 68);
__m512 tmp13849 = _mm512_shuffle_ps(tmp13832, tmp13834, 238);
__m512 tmp13850 = _mm512_shuffle_ps(tmp13833, tmp13835, 68);
__m512 tmp13851 = _mm512_shuffle_ps(tmp13833, tmp13835, 238);
__m512 tmp13852 = _mm512_shuffle_ps(tmp13836, tmp13838, 68);
__m512 tmp13853 = _mm512_shuffle_ps(tmp13836, tmp13838, 238);
__m512 tmp13854 = _mm512_shuffle_ps(tmp13837, tmp13839, 68);
__m512 tmp13855 = _mm512_shuffle_ps(tmp13837, tmp13839, 238);
__m512 tmp13856 = _mm512_shuffle_ps(tmp13840, tmp13842, 68);
__m512 tmp13857 = _mm512_shuffle_ps(tmp13840, tmp13842, 238);
__m512 tmp13858 = _mm512_shuffle_ps(tmp13841, tmp13843, 68);
__m512 tmp13859 = _mm512_shuffle_ps(tmp13841, tmp13843, 238);
__m512 tmp13860 = _mm512_shuffle_ps(tmp13844, tmp13846, 68);
__m512 tmp13861 = _mm512_shuffle_ps(tmp13844, tmp13846, 238);
__m512 tmp13862 = _mm512_shuffle_ps(tmp13845, tmp13847, 68);
__m512 tmp13863 = _mm512_shuffle_ps(tmp13845, tmp13847, 238);
__m512 tmp13864 = _mm512_shuffle_f32x4(tmp13848, tmp13852, 136);
__m512 tmp13865 = _mm512_shuffle_f32x4(tmp13848, tmp13852, 221);
__m512 tmp13866 = _mm512_shuffle_f32x4(tmp13849, tmp13853, 136);
__m512 tmp13867 = _mm512_shuffle_f32x4(tmp13849, tmp13853, 221);
__m512 tmp13868 = _mm512_shuffle_f32x4(tmp13850, tmp13854, 136);
__m512 tmp13869 = _mm512_shuffle_f32x4(tmp13850, tmp13854, 221);
__m512 tmp13870 = _mm512_shuffle_f32x4(tmp13851, tmp13855, 136);
__m512 tmp13871 = _mm512_shuffle_f32x4(tmp13851, tmp13855, 221);
__m512 tmp13872 = _mm512_shuffle_f32x4(tmp13856, tmp13860, 136);
__m512 tmp13873 = _mm512_shuffle_f32x4(tmp13856, tmp13860, 221);
__m512 tmp13874 = _mm512_shuffle_f32x4(tmp13857, tmp13861, 136);
__m512 tmp13875 = _mm512_shuffle_f32x4(tmp13857, tmp13861, 221);
__m512 tmp13876 = _mm512_shuffle_f32x4(tmp13858, tmp13862, 136);
__m512 tmp13877 = _mm512_shuffle_f32x4(tmp13858, tmp13862, 221);
__m512 tmp13878 = _mm512_shuffle_f32x4(tmp13859, tmp13863, 136);
__m512 tmp13879 = _mm512_shuffle_f32x4(tmp13859, tmp13863, 221);
in1931 = _mm512_shuffle_f32x4(tmp13864, tmp13872, 136);
tmp13822 = _mm512_shuffle_f32x4(tmp13864, tmp13872, 221);
tmp13817 = _mm512_shuffle_f32x4(tmp13866, tmp13874, 136);
tmp13821 = _mm512_shuffle_f32x4(tmp13866, tmp13874, 221);
tmp13818 = _mm512_shuffle_f32x4(tmp13868, tmp13876, 136);
tmp13823 = _mm512_shuffle_f32x4(tmp13868, tmp13876, 221);
in1937 = _mm512_shuffle_f32x4(tmp13870, tmp13878, 136);
in1944 = _mm512_shuffle_f32x4(tmp13870, tmp13878, 221);
tmp13816 = _mm512_shuffle_f32x4(tmp13865, tmp13873, 136);
tmp13820 = _mm512_shuffle_f32x4(tmp13865, tmp13873, 221);
in1933 = _mm512_shuffle_f32x4(tmp13867, tmp13875, 136);
in1940 = _mm512_shuffle_f32x4(tmp13867, tmp13875, 221);
in1935 = _mm512_shuffle_f32x4(tmp13869, tmp13877, 136);
in1942 = _mm512_shuffle_f32x4(tmp13869, tmp13877, 221);
in1934 = _mm512_shuffle_f32x4(tmp13871, tmp13879, 136);
in1941 = _mm512_shuffle_f32x4(tmp13871, tmp13879, 221);
// Second pass of an 8-input linear transform. Two independent chains are
// interleaved statement by statement (odd statements = chain 1, even = chain 2).
//   chain 1 inputs x0..x7: in1931, tmp13817, tmp13818, in1937, tmp13816, in1933, in1935, in1934
//   chain 2 inputs x0..x7: tmp13822, tmp13821, tmp13823, in1944, tmp13820, in1940, in1942, in1941
// Results, built in place with FMAs (chain 1 name / chain 2 name):
//   y0 = x0 - 5.25*x2 + 5.25*x4 - x6                            -> in1931   / tmp13822
//   y1 = (x2 - 4.25*x4 + x6) + (x1 - 4.25*x3 + x5)              -> tmp13826 / tmp13830
//   y2 = (x2 - 4.25*x4 + x6) - (x1 - 4.25*x3 + x5)              -> tmp13827 / tmp13831
//   y3 = (0.25*x2 - 1.25*x4 + x6) + 2*(0.25*x1 - 1.25*x3 + x5)  -> in1935   / in1942
//   y4 = (0.25*x2 - 1.25*x4 + x6) - 2*(0.25*x1 - 1.25*x3 + x5)  -> tmp13825 / tmp13829
//   y5 = (4*x2 - 5*x4 + x6) + 2*(x1 - 1.25*x3 + 0.25*x5)        -> tmp13818 / tmp13823
//   y6 = (4*x2 - 5*x4 + x6) - 2*(x1 - 1.25*x3 + 0.25*x5)        -> tmp13816 / tmp13820
//   y7 = (x7 - x1) + 5.25*(x3 - x5)                             -> in1937   / in1944
// NOTE(review): these coefficients match the B^T matrix of the Winograd
// F(6x6, 3x3) input transform -- presumably that is the intent; confirm against the generator.
// Statement order matters: several inputs are overwritten once they are no longer needed.
__m512 tmp13824 = _mm512_add_ps(tmp13817, in1933);
__m512 tmp13828 = _mm512_add_ps(tmp13821, in1940);
__m512 tmp13825 = _mm512_sub_ps(tmp13816, tmp13818);
__m512 tmp13829 = _mm512_sub_ps(tmp13820, tmp13823);
__m512 tmp13826 = _mm512_add_ps(tmp13818, in1935);
__m512 tmp13830 = _mm512_add_ps(tmp13823, in1942);
in1931 = _mm512_sub_ps(in1931, in1935);
tmp13822 = _mm512_sub_ps(tmp13822, in1942);
tmp13824 = _mm512_fmadd_ps(in1937, _mm512_set1_ps(-4.25e+00f), tmp13824);
tmp13828 = _mm512_fmadd_ps(in1944, _mm512_set1_ps(-4.25e+00f), tmp13828);
tmp13826 = _mm512_fmadd_ps(tmp13816, _mm512_set1_ps(-4.25e+00f), tmp13826);
tmp13830 = _mm512_fmadd_ps(tmp13820, _mm512_set1_ps(-4.25e+00f), tmp13830);
// y0 is complete after the next two statements.
in1931 = _mm512_fmadd_ps(tmp13825, _mm512_set1_ps(5.25e+00f), in1931);
tmp13822 = _mm512_fmadd_ps(tmp13829, _mm512_set1_ps(5.25e+00f), tmp13822);
tmp13825 = _mm512_fmadd_ps(tmp13818, _mm512_set1_ps(2.5e-01f), in1935);
tmp13829 = _mm512_fmadd_ps(tmp13823, _mm512_set1_ps(2.5e-01f), in1942);
tmp13818 = _mm512_fmadd_ps(tmp13818, _mm512_set1_ps(4e+00f), in1935);
tmp13823 = _mm512_fmadd_ps(tmp13823, _mm512_set1_ps(4e+00f), in1942);
// y2 (difference) and y1 (sum) of the two partial sums.
__m512 tmp13827 = _mm512_sub_ps(tmp13826, tmp13824);
__m512 tmp13831 = _mm512_sub_ps(tmp13830, tmp13828);
tmp13826 = _mm512_add_ps(tmp13824, tmp13826);
tmp13830 = _mm512_add_ps(tmp13828, tmp13830);
tmp13824 = _mm512_fmadd_ps(tmp13817, _mm512_set1_ps(2.5e-01f), in1933);
tmp13828 = _mm512_fmadd_ps(tmp13821, _mm512_set1_ps(2.5e-01f), in1940);
tmp13825 = _mm512_fmadd_ps(tmp13816, _mm512_set1_ps(-1.25e+00f), tmp13825);
tmp13829 = _mm512_fmadd_ps(tmp13820, _mm512_set1_ps(-1.25e+00f), tmp13829);
tmp13816 = _mm512_fmadd_ps(tmp13816, _mm512_set1_ps(-5e+00f), tmp13818);
tmp13820 = _mm512_fmadd_ps(tmp13820, _mm512_set1_ps(-5e+00f), tmp13823);
tmp13824 = _mm512_fmadd_ps(in1937, _mm512_set1_ps(-1.25e+00f), tmp13824);
tmp13828 = _mm512_fmadd_ps(in1944, _mm512_set1_ps(-1.25e+00f), tmp13828);
// y3 (fmadd: +2*t) and y4 (fnmadd: -2*t).
in1935 = _mm512_fmadd_ps(tmp13824, _mm512_set1_ps(2e+00f), tmp13825);
in1942 = _mm512_fmadd_ps(tmp13828, _mm512_set1_ps(2e+00f), tmp13829);
tmp13825 = _mm512_fnmadd_ps(tmp13824, _mm512_set1_ps(2e+00f), tmp13825);
tmp13829 = _mm512_fnmadd_ps(tmp13828, _mm512_set1_ps(2e+00f), tmp13829);
tmp13824 = _mm512_fmadd_ps(in1933, _mm512_set1_ps(2.5e-01f), tmp13817);
tmp13828 = _mm512_fmadd_ps(in1940, _mm512_set1_ps(2.5e-01f), tmp13821);
tmp13817 = _mm512_sub_ps(in1934, tmp13817);
tmp13821 = _mm512_sub_ps(in1941, tmp13821);
tmp13824 = _mm512_fmadd_ps(in1937, _mm512_set1_ps(-1.25e+00f), tmp13824);
tmp13828 = _mm512_fmadd_ps(in1944, _mm512_set1_ps(-1.25e+00f), tmp13828);
in1937 = _mm512_sub_ps(in1937, in1933);
in1944 = _mm512_sub_ps(in1944, in1940);
// y7.
in1937 = _mm512_fmadd_ps(in1937, _mm512_set1_ps(5.25e+00f), tmp13817);
in1944 = _mm512_fmadd_ps(in1944, _mm512_set1_ps(5.25e+00f), tmp13821);
// y5 (fmadd) and y6 (fnmadd).
tmp13818 = _mm512_fmadd_ps(tmp13824, _mm512_set1_ps(2e+00f), tmp13816);
tmp13823 = _mm512_fmadd_ps(tmp13828, _mm512_set1_ps(2e+00f), tmp13820);
tmp13816 = _mm512_fnmadd_ps(tmp13824, _mm512_set1_ps(2e+00f), tmp13816);
tmp13820 = _mm512_fnmadd_ps(tmp13828, _mm512_set1_ps(2e+00f), tmp13820);
// Pair up the 16 transformed vectors and regroup them by 256-bit half.
// _mm512_shuffle_f32x4(a, b, 68)  yields [a.lanes0-7, b.lanes0-7];
// _mm512_shuffle_f32x4(a, b, 238) yields [a.lanes8-15, b.lanes8-15].
// out1775..out1778 / out1783..out1786 combine the in1931-chain results,
// out1779..out1782 / out1787..out1790 combine the tmp13822-chain results.
__m512 out1775 = _mm512_shuffle_f32x4(in1931, tmp13826, 68);
__m512 out1783 = _mm512_shuffle_f32x4(in1931, tmp13826, 238);
__m512 out1776 = _mm512_shuffle_f32x4(tmp13827, in1935, 68);
__m512 out1784 = _mm512_shuffle_f32x4(tmp13827, in1935, 238);
__m512 out1777 = _mm512_shuffle_f32x4(tmp13825, tmp13818, 68);
__m512 out1785 = _mm512_shuffle_f32x4(tmp13825, tmp13818, 238);
__m512 out1778 = _mm512_shuffle_f32x4(tmp13816, in1937, 68);
__m512 out1786 = _mm512_shuffle_f32x4(tmp13816, in1937, 238);
__m512 out1779 = _mm512_shuffle_f32x4(tmp13822, tmp13830, 68);
__m512 out1787 = _mm512_shuffle_f32x4(tmp13822, tmp13830, 238);
__m512 out1780 = _mm512_shuffle_f32x4(tmp13831, in1942, 68);
__m512 out1788 = _mm512_shuffle_f32x4(tmp13831, in1942, 238);
__m512 out1781 = _mm512_shuffle_f32x4(tmp13829, tmp13823, 68);
__m512 out1789 = _mm512_shuffle_f32x4(tmp13829, tmp13823, 238);
__m512 out1782 = _mm512_shuffle_f32x4(tmp13820, in1944, 68);
__m512 out1790 = _mm512_shuffle_f32x4(tmp13820, in1944, 238);
// Unaligned stores into dfPtr10. Four groups of four vectors; group bases are
// 256, 205056, 409856 and 614656 (204800 apart), and within a group the offsets
// are +0, +64, +128, +192. The i45, j38, s36 and k129 terms select the destination slice.
// NOTE(review): the 64-unit step equals sizeof(__m512), so dfPtr10 is presumably a
// byte pointer -- confirm at its declaration.
_mm512_storeu_ps(dfPtr10+256+819200*i45+49152*j38+49152*s36+768*k129, out1775);
_mm512_storeu_ps(dfPtr10+384+819200*i45+49152*j38+49152*s36+768*k129, out1783);
_mm512_storeu_ps(dfPtr10+320+819200*i45+49152*j38+49152*s36+768*k129, out1779);
_mm512_storeu_ps(dfPtr10+448+819200*i45+49152*j38+49152*s36+768*k129, out1787);
_mm512_storeu_ps(dfPtr10+205056+819200*i45+49152*j38+49152*s36+768*k129, out1776);
_mm512_storeu_ps(dfPtr10+205184+819200*i45+49152*j38+49152*s36+768*k129, out1784);
_mm512_storeu_ps(dfPtr10+205120+819200*i45+49152*j38+49152*s36+768*k129, out1780);
_mm512_storeu_ps(dfPtr10+205248+819200*i45+49152*j38+49152*s36+768*k129, out1788);
_mm512_storeu_ps(dfPtr10+409856+819200*i45+49152*j38+49152*s36+768*k129, out1777);
_mm512_storeu_ps(dfPtr10+409984+819200*i45+49152*j38+49152*s36+768*k129, out1785);
_mm512_storeu_ps(dfPtr10+409920+819200*i45+49152*j38+49152*s36+768*k129, out1781);
_mm512_storeu_ps(dfPtr10+410048+819200*i45+49152*j38+49152*s36+768*k129, out1789);
_mm512_storeu_ps(dfPtr10+614656+819200*i45+49152*j38+49152*s36+768*k129, out1778);
_mm512_storeu_ps(dfPtr10+614784+819200*i45+49152*j38+49152*s36+768*k129, out1786);
_mm512_storeu_ps(dfPtr10+614720+819200*i45+49152*j38+49152*s36+768*k129, out1782);
_mm512_storeu_ps(dfPtr10+614848+819200*i45+49152*j38+49152*s36+768*k129, out1790);
// Gather the inputs for the next tile with zero-masked loads and lane permutations.
// Load masks: 127 = lanes 0-6, 16383 = lanes 0-13, 31 = lanes 0-4; unselected lanes read as 0.
// Successive loads of the same kind are 112 address units apart (3296, 3408, ... and so on).
// NOTE(review): the 4*w57 and 112*h45 terms suggest byte addressing of 4-byte floats
// with 28 floats per row -- confirm at datPtr23's declaration.
__m512 dat1977 = _mm512_maskz_loadu_ps(127, datPtr23+3812+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
// pm192: lanes 9-15 take loaded elements 0-6; lanes 0-8 take element 15, which the mask zeroed.
__m512i pm192 = _mm512_set_epi32(6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in1953 = _mm512_permutexvar_ps(pm192, dat1977);
__m512 dat1978 = _mm512_maskz_loadu_ps(16383, datPtr23+3296+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1979 = _mm512_maskz_loadu_ps(31, datPtr23+3344+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1980 = _mm512_maskz_loadu_ps(127, datPtr23+3924+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
// pm193: lanes 0-7 = elements 0-7, lanes 8-15 = elements 6-13, i.e. two 8-wide
// windows that overlap by two elements.
__m512i pm193 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1946 = _mm512_permutexvar_ps(pm193, dat1978);
// pm194 (two-source permute; indices 16+ pick from the second source):
// lanes 0-4 = first source elements 0-4, lanes 5-8 = first source element 15 (zero),
// lanes 9-15 = second source elements 0-6.
__m512i pm194 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in1954 = _mm512_permutex2var_ps(dat1979, pm194, dat1980);
// The remaining six groups repeat the same three loads and two permutes, each
// 112 address units further on.
__m512 dat1981 = _mm512_maskz_loadu_ps(16383, datPtr23+3408+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1982 = _mm512_maskz_loadu_ps(31, datPtr23+3456+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1983 = _mm512_maskz_loadu_ps(127, datPtr23+4036+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1947 = _mm512_permutexvar_ps(pm193, dat1981);
__m512 in1955 = _mm512_permutex2var_ps(dat1982, pm194, dat1983);
__m512 dat1984 = _mm512_maskz_loadu_ps(16383, datPtr23+3520+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1985 = _mm512_maskz_loadu_ps(31, datPtr23+3568+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1986 = _mm512_maskz_loadu_ps(127, datPtr23+4148+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1948 = _mm512_permutexvar_ps(pm193, dat1984);
__m512 in1956 = _mm512_permutex2var_ps(dat1985, pm194, dat1986);
__m512 dat1987 = _mm512_maskz_loadu_ps(16383, datPtr23+3632+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1988 = _mm512_maskz_loadu_ps(31, datPtr23+3680+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1989 = _mm512_maskz_loadu_ps(127, datPtr23+4260+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1949 = _mm512_permutexvar_ps(pm193, dat1987);
__m512 in1957 = _mm512_permutex2var_ps(dat1988, pm194, dat1989);
__m512 dat1990 = _mm512_maskz_loadu_ps(16383, datPtr23+3744+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1991 = _mm512_maskz_loadu_ps(31, datPtr23+3792+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1992 = _mm512_maskz_loadu_ps(127, datPtr23+4372+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1950 = _mm512_permutexvar_ps(pm193, dat1990);
__m512 in1958 = _mm512_permutex2var_ps(dat1991, pm194, dat1992);
__m512 dat1993 = _mm512_maskz_loadu_ps(16383, datPtr23+3856+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1994 = _mm512_maskz_loadu_ps(31, datPtr23+3904+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1995 = _mm512_maskz_loadu_ps(127, datPtr23+4484+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1951 = _mm512_permutexvar_ps(pm193, dat1993);
__m512 in1959 = _mm512_permutex2var_ps(dat1994, pm194, dat1995);
__m512 dat1996 = _mm512_maskz_loadu_ps(16383, datPtr23+3968+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1997 = _mm512_maskz_loadu_ps(31, datPtr23+4016+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 dat1998 = _mm512_maskz_loadu_ps(127, datPtr23+4596+401408*i45+112*h45+4*w57+401408*s36+6272*k129);
__m512 in1952 = _mm512_permutexvar_ps(pm193, dat1996);
__m512 in1960 = _mm512_permutex2var_ps(dat1997, pm194, dat1998);
// First pass of an 8-input linear transform. Two independent chains are
// interleaved statement by statement (odd statements = chain A, even = chain B).
//   chain A inputs: x0 = 0 (see the _mm512_setzero_ps() below), x1..x7 = in1946..in1952
//   chain B inputs: x0 = in1953, x1..x7 = in1954..in1960
// Results, built in place with FMAs (chain A name / chain B name):
//   y0 = x0 - 5.25*x2 + 5.25*x4 - x6                            -> tmp13883 / in1953
//   y1 = (x2 - 4.25*x4 + x6) + (x1 - 4.25*x3 + x5)              -> tmp13882 / tmp13887
//   y2 = (x2 - 4.25*x4 + x6) - (x1 - 4.25*x3 + x5)              -> tmp13884 / tmp13888
//   y3 = (0.25*x2 - 1.25*x4 + x6) + 2*(0.25*x1 - 1.25*x3 + x5)  -> in1951   / in1959
//   y4 = (0.25*x2 - 1.25*x4 + x6) - 2*(0.25*x1 - 1.25*x3 + x5)  -> tmp13881 / tmp13886
//   y5 = (4*x2 - 5*x4 + x6) + 2*(x1 - 1.25*x3 + 0.25*x5)        -> in1947   / in1955
//   y6 = (4*x2 - 5*x4 + x6) - 2*(x1 - 1.25*x3 + 0.25*x5)        -> in1949   / in1957
//   y7 = (x7 - x1) + 5.25*(x3 - x5)                             -> in1948   / in1956
// NOTE(review): these coefficients match the B^T matrix of the Winograd
// F(6x6, 3x3) input transform -- presumably that is the intent; confirm against the generator.
// Statement order matters: several inputs are overwritten once they are no longer needed.
__m512 tmp13880 = _mm512_add_ps(in1946, in1950);
__m512 tmp13885 = _mm512_add_ps(in1954, in1958);
__m512 tmp13881 = _mm512_sub_ps(in1949, in1947);
__m512 tmp13886 = _mm512_sub_ps(in1957, in1955);
__m512 tmp13882 = _mm512_add_ps(in1947, in1951);
__m512 tmp13887 = _mm512_add_ps(in1955, in1959);
// Chain A has no x0 vector: zero stands in for it, so this is (0 - x6).
// NOTE(review): presumably an all-padding row -- confirm.
__m512 tmp13883 = _mm512_sub_ps(_mm512_setzero_ps(), in1951);
in1953 = _mm512_sub_ps(in1953, in1959);
tmp13880 = _mm512_fmadd_ps(in1948, _mm512_set1_ps(-4.25e+00f), tmp13880);
tmp13885 = _mm512_fmadd_ps(in1956, _mm512_set1_ps(-4.25e+00f), tmp13885);
tmp13882 = _mm512_fmadd_ps(in1949, _mm512_set1_ps(-4.25e+00f), tmp13882);
tmp13887 = _mm512_fmadd_ps(in1957, _mm512_set1_ps(-4.25e+00f), tmp13887);
// y0 is complete after the next two statements.
tmp13883 = _mm512_fmadd_ps(tmp13881, _mm512_set1_ps(5.25e+00f), tmp13883);
in1953 = _mm512_fmadd_ps(tmp13886, _mm512_set1_ps(5.25e+00f), in1953);
tmp13881 = _mm512_fmadd_ps(in1947, _mm512_set1_ps(2.5e-01f), in1951);
tmp13886 = _mm512_fmadd_ps(in1955, _mm512_set1_ps(2.5e-01f), in1959);
in1947 = _mm512_fmadd_ps(in1947, _mm512_set1_ps(4e+00f), in1951);
in1955 = _mm512_fmadd_ps(in1955, _mm512_set1_ps(4e+00f), in1959);
// y2 (difference) and y1 (sum) of the two partial sums.
__m512 tmp13884 = _mm512_sub_ps(tmp13882, tmp13880);
__m512 tmp13888 = _mm512_sub_ps(tmp13887, tmp13885);
tmp13882 = _mm512_add_ps(tmp13880, tmp13882);
tmp13887 = _mm512_add_ps(tmp13885, tmp13887);
tmp13880 = _mm512_fmadd_ps(in1946, _mm512_set1_ps(2.5e-01f), in1950);
tmp13885 = _mm512_fmadd_ps(in1954, _mm512_set1_ps(2.5e-01f), in1958);
tmp13881 = _mm512_fmadd_ps(in1949, _mm512_set1_ps(-1.25e+00f), tmp13881);
tmp13886 = _mm512_fmadd_ps(in1957, _mm512_set1_ps(-1.25e+00f), tmp13886);
in1949 = _mm512_fmadd_ps(in1949, _mm512_set1_ps(-5e+00f), in1947);
in1957 = _mm512_fmadd_ps(in1957, _mm512_set1_ps(-5e+00f), in1955);
tmp13880 = _mm512_fmadd_ps(in1948, _mm512_set1_ps(-1.25e+00f), tmp13880);
tmp13885 = _mm512_fmadd_ps(in1956, _mm512_set1_ps(-1.25e+00f), tmp13885);
// y3 (fmadd: +2*t) and y4 (fnmadd: -2*t).
in1951 = _mm512_fmadd_ps(tmp13880, _mm512_set1_ps(2e+00f), tmp13881);
in1959 = _mm512_fmadd_ps(tmp13885, _mm512_set1_ps(2e+00f), tmp13886);
tmp13881 = _mm512_fnmadd_ps(tmp13880, _mm512_set1_ps(2e+00f), tmp13881);
tmp13886 = _mm512_fnmadd_ps(tmp13885, _mm512_set1_ps(2e+00f), tmp13886);
tmp13880 = _mm512_fmadd_ps(in1950, _mm512_set1_ps(2.5e-01f), in1946);
tmp13885 = _mm512_fmadd_ps(in1958, _mm512_set1_ps(2.5e-01f), in1954);
in1946 = _mm512_sub_ps(in1952, in1946);
in1954 = _mm512_sub_ps(in1960, in1954);
tmp13880 = _mm512_fmadd_ps(in1948, _mm512_set1_ps(-1.25e+00f), tmp13880);
tmp13885 = _mm512_fmadd_ps(in1956, _mm512_set1_ps(-1.25e+00f), tmp13885);
in1948 = _mm512_sub_ps(in1948, in1950);
in1956 = _mm512_sub_ps(in1956, in1958);
// y7.
in1948 = _mm512_fmadd_ps(in1948, _mm512_set1_ps(5.25e+00f), in1946);
in1956 = _mm512_fmadd_ps(in1956, _mm512_set1_ps(5.25e+00f), in1954);
// y5 (fmadd) and y6 (fnmadd).
in1947 = _mm512_fmadd_ps(tmp13880, _mm512_set1_ps(2e+00f), in1949);
in1955 = _mm512_fmadd_ps(tmp13885, _mm512_set1_ps(2e+00f), in1957);
in1949 = _mm512_fnmadd_ps(tmp13880, _mm512_set1_ps(2e+00f), in1949);
in1957 = _mm512_fnmadd_ps(tmp13885, _mm512_set1_ps(2e+00f), in1957);
// 16x16 transpose of the first-pass results, done as a four-stage shuffle network.
// Input rows, in order: tmp13883, tmp13882, tmp13884, in1951, tmp13881, in1947, in1949, in1948
// (rows 0-7) and in1953, tmp13887, tmp13888, in1959, tmp13886, in1955, in1957, in1956 (rows 8-15).
// Stage 1: interleave adjacent row pairs within each 128-bit quarter.
__m512 tmp13897 = _mm512_unpacklo_ps(tmp13883, tmp13882);
__m512 tmp13898 = _mm512_unpackhi_ps(tmp13883, tmp13882);
__m512 tmp13899 = _mm512_unpacklo_ps(tmp13884, in1951);
__m512 tmp13900 = _mm512_unpackhi_ps(tmp13884, in1951);
__m512 tmp13901 = _mm512_unpacklo_ps(tmp13881, in1947);
__m512 tmp13902 = _mm512_unpackhi_ps(tmp13881, in1947);
__m512 tmp13903 = _mm512_unpacklo_ps(in1949, in1948);
__m512 tmp13904 = _mm512_unpackhi_ps(in1949, in1948);
__m512 tmp13905 = _mm512_unpacklo_ps(in1953, tmp13887);
__m512 tmp13906 = _mm512_unpackhi_ps(in1953, tmp13887);
__m512 tmp13907 = _mm512_unpacklo_ps(tmp13888, in1959);
__m512 tmp13908 = _mm512_unpackhi_ps(tmp13888, in1959);
__m512 tmp13909 = _mm512_unpacklo_ps(tmp13886, in1955);
__m512 tmp13910 = _mm512_unpackhi_ps(tmp13886, in1955);
__m512 tmp13911 = _mm512_unpacklo_ps(in1957, in1956);
__m512 tmp13912 = _mm512_unpackhi_ps(in1957, in1956);
// Stage 2: immediate 68 takes the low two elements of each operand's quarter,
// 238 the high two. Each 128-bit quarter now holds one column of four consecutive rows.
__m512 tmp13913 = _mm512_shuffle_ps(tmp13897, tmp13899, 68);
__m512 tmp13914 = _mm512_shuffle_ps(tmp13897, tmp13899, 238);
__m512 tmp13915 = _mm512_shuffle_ps(tmp13898, tmp13900, 68);
__m512 tmp13916 = _mm512_shuffle_ps(tmp13898, tmp13900, 238);
__m512 tmp13917 = _mm512_shuffle_ps(tmp13901, tmp13903, 68);
__m512 tmp13918 = _mm512_shuffle_ps(tmp13901, tmp13903, 238);
__m512 tmp13919 = _mm512_shuffle_ps(tmp13902, tmp13904, 68);
__m512 tmp13920 = _mm512_shuffle_ps(tmp13902, tmp13904, 238);
__m512 tmp13921 = _mm512_shuffle_ps(tmp13905, tmp13907, 68);
__m512 tmp13922 = _mm512_shuffle_ps(tmp13905, tmp13907, 238);
__m512 tmp13923 = _mm512_shuffle_ps(tmp13906, tmp13908, 68);
__m512 tmp13924 = _mm512_shuffle_ps(tmp13906, tmp13908, 238);
__m512 tmp13925 = _mm512_shuffle_ps(tmp13909, tmp13911, 68);
__m512 tmp13926 = _mm512_shuffle_ps(tmp13909, tmp13911, 238);
__m512 tmp13927 = _mm512_shuffle_ps(tmp13910, tmp13912, 68);
__m512 tmp13928 = _mm512_shuffle_ps(tmp13910, tmp13912, 238);
// Stage 3: immediate 136 picks quarters 0 and 2 of each operand, 221 picks quarters 1 and 3.
__m512 tmp13929 = _mm512_shuffle_f32x4(tmp13913, tmp13917, 136);
__m512 tmp13930 = _mm512_shuffle_f32x4(tmp13913, tmp13917, 221);
__m512 tmp13931 = _mm512_shuffle_f32x4(tmp13914, tmp13918, 136);
__m512 tmp13932 = _mm512_shuffle_f32x4(tmp13914, tmp13918, 221);
__m512 tmp13933 = _mm512_shuffle_f32x4(tmp13915, tmp13919, 136);
__m512 tmp13934 = _mm512_shuffle_f32x4(tmp13915, tmp13919, 221);
__m512 tmp13935 = _mm512_shuffle_f32x4(tmp13916, tmp13920, 136);
__m512 tmp13936 = _mm512_shuffle_f32x4(tmp13916, tmp13920, 221);
__m512 tmp13937 = _mm512_shuffle_f32x4(tmp13921, tmp13925, 136);
__m512 tmp13938 = _mm512_shuffle_f32x4(tmp13921, tmp13925, 221);
__m512 tmp13939 = _mm512_shuffle_f32x4(tmp13922, tmp13926, 136);
__m512 tmp13940 = _mm512_shuffle_f32x4(tmp13922, tmp13926, 221);
__m512 tmp13941 = _mm512_shuffle_f32x4(tmp13923, tmp13927, 136);
__m512 tmp13942 = _mm512_shuffle_f32x4(tmp13923, tmp13927, 221);
__m512 tmp13943 = _mm512_shuffle_f32x4(tmp13924, tmp13928, 136);
__m512 tmp13944 = _mm512_shuffle_f32x4(tmp13924, tmp13928, 221);
// Stage 4: same quarter selection across the rows 0-7 and rows 8-15 halves. The
// results reuse the input names; each now holds one input lane position (column)
// across all 16 rows, rows 0-7 in lanes 0-7 and rows 8-15 in lanes 8-15.
// The comment on each pair gives the column held by the first / second result.
// columns 0 / 8
tmp13883 = _mm512_shuffle_f32x4(tmp13929, tmp13937, 136);
in1953 = _mm512_shuffle_f32x4(tmp13929, tmp13937, 221);
// columns 1 / 9
tmp13882 = _mm512_shuffle_f32x4(tmp13931, tmp13939, 136);
tmp13887 = _mm512_shuffle_f32x4(tmp13931, tmp13939, 221);
// columns 2 / 10
tmp13884 = _mm512_shuffle_f32x4(tmp13933, tmp13941, 136);
tmp13888 = _mm512_shuffle_f32x4(tmp13933, tmp13941, 221);
// columns 3 / 11
in1951 = _mm512_shuffle_f32x4(tmp13935, tmp13943, 136);
in1959 = _mm512_shuffle_f32x4(tmp13935, tmp13943, 221);
// columns 4 / 12
tmp13881 = _mm512_shuffle_f32x4(tmp13930, tmp13938, 136);
tmp13886 = _mm512_shuffle_f32x4(tmp13930, tmp13938, 221);
// columns 5 / 13
in1947 = _mm512_shuffle_f32x4(tmp13932, tmp13940, 136);
in1955 = _mm512_shuffle_f32x4(tmp13932, tmp13940, 221);
// columns 6 / 14
in1949 = _mm512_shuffle_f32x4(tmp13934, tmp13942, 136);
in1957 = _mm512_shuffle_f32x4(tmp13934, tmp13942, 221);
// columns 7 / 15
in1948 = _mm512_shuffle_f32x4(tmp13936, tmp13944, 136);
in1956 = _mm512_shuffle_f32x4(tmp13936, tmp13944, 221);
// Second pass of the 8-input linear transform, applied to the transposed vectors.
// Two independent chains are interleaved statement by statement.
//   chain 1 inputs x0..x7: tmp13883, tmp13882, tmp13884, in1951, tmp13881, in1947, in1949, in1948
//   chain 2 inputs x0..x7: in1953, tmp13887, tmp13888, in1959, tmp13886, in1955, in1957, in1956
// Results, built in place with FMAs (chain 1 name / chain 2 name):
//   y0 = x0 - 5.25*x2 + 5.25*x4 - x6                            -> tmp13883 / in1953
//   y1 = (x2 - 4.25*x4 + x6) + (x1 - 4.25*x3 + x5)              -> tmp13891 / tmp13895
//   y2 = (x2 - 4.25*x4 + x6) - (x1 - 4.25*x3 + x5)              -> tmp13892 / tmp13896
//   y3 = (0.25*x2 - 1.25*x4 + x6) + 2*(0.25*x1 - 1.25*x3 + x5)  -> in1949   / in1957
//   y4 = (0.25*x2 - 1.25*x4 + x6) - 2*(0.25*x1 - 1.25*x3 + x5)  -> tmp13890 / tmp13894
//   y5 = (4*x2 - 5*x4 + x6) + 2*(x1 - 1.25*x3 + 0.25*x5)        -> tmp13884 / tmp13888
//   y6 = (4*x2 - 5*x4 + x6) - 2*(x1 - 1.25*x3 + 0.25*x5)        -> tmp13881 / tmp13886
//   y7 = (x7 - x1) + 5.25*(x3 - x5)                             -> in1951   / in1959
// NOTE(review): these coefficients match the B^T matrix of the Winograd
// F(6x6, 3x3) input transform -- presumably that is the intent; confirm against the generator.
// Statement order matters: several inputs are overwritten once they are no longer needed.
__m512 tmp13889 = _mm512_add_ps(tmp13882, in1947);
__m512 tmp13893 = _mm512_add_ps(tmp13887, in1955);
__m512 tmp13890 = _mm512_sub_ps(tmp13881, tmp13884);
__m512 tmp13894 = _mm512_sub_ps(tmp13886, tmp13888);
__m512 tmp13891 = _mm512_add_ps(tmp13884, in1949);
__m512 tmp13895 = _mm512_add_ps(tmp13888, in1957);
tmp13883 = _mm512_sub_ps(tmp13883, in1949);
in1953 = _mm512_sub_ps(in1953, in1957);
tmp13889 = _mm512_fmadd_ps(in1951, _mm512_set1_ps(-4.25e+00f), tmp13889);
tmp13893 = _mm512_fmadd_ps(in1959, _mm512_set1_ps(-4.25e+00f), tmp13893);
tmp13891 = _mm512_fmadd_ps(tmp13881, _mm512_set1_ps(-4.25e+00f), tmp13891);
tmp13895 = _mm512_fmadd_ps(tmp13886, _mm512_set1_ps(-4.25e+00f), tmp13895);
// y0 is complete after the next two statements.
tmp13883 = _mm512_fmadd_ps(tmp13890, _mm512_set1_ps(5.25e+00f), tmp13883);
in1953 = _mm512_fmadd_ps(tmp13894, _mm512_set1_ps(5.25e+00f), in1953);
tmp13890 = _mm512_fmadd_ps(tmp13884, _mm512_set1_ps(2.5e-01f), in1949);
tmp13894 = _mm512_fmadd_ps(tmp13888, _mm512_set1_ps(2.5e-01f), in1957);
tmp13884 = _mm512_fmadd_ps(tmp13884, _mm512_set1_ps(4e+00f), in1949);
tmp13888 = _mm512_fmadd_ps(tmp13888, _mm512_set1_ps(4e+00f), in1957);
// y2 (difference) and y1 (sum) of the two partial sums.
__m512 tmp13892 = _mm512_sub_ps(tmp13891, tmp13889);
__m512 tmp13896 = _mm512_sub_ps(tmp13895, tmp13893);
tmp13891 = _mm512_add_ps(tmp13889, tmp13891);
tmp13895 = _mm512_add_ps(tmp13893, tmp13895);
tmp13889 = _mm512_fmadd_ps(tmp13882, _mm512_set1_ps(2.5e-01f), in1947);
tmp13893 = _mm512_fmadd_ps(tmp13887, _mm512_set1_ps(2.5e-01f), in1955);
tmp13890 = _mm512_fmadd_ps(tmp13881, _mm512_set1_ps(-1.25e+00f), tmp13890);
tmp13894 = _mm512_fmadd_ps(tmp13886, _mm512_set1_ps(-1.25e+00f), tmp13894);
tmp13881 = _mm512_fmadd_ps(tmp13881, _mm512_set1_ps(-5e+00f), tmp13884);
tmp13886 = _mm512_fmadd_ps(tmp13886, _mm512_set1_ps(-5e+00f), tmp13888);
tmp13889 = _mm512_fmadd_ps(in1951, _mm512_set1_ps(-1.25e+00f), tmp13889);
tmp13893 = _mm512_fmadd_ps(in1959, _mm512_set1_ps(-1.25e+00f), tmp13893);
// y3 (fmadd: +2*t) and y4 (fnmadd: -2*t).
in1949 = _mm512_fmadd_ps(tmp13889, _mm512_set1_ps(2e+00f), tmp13890);
in1957 = _mm512_fmadd_ps(tmp13893, _mm512_set1_ps(2e+00f), tmp13894);
tmp13890 = _mm512_fnmadd_ps(tmp13889, _mm512_set1_ps(2e+00f), tmp13890);
tmp13894 = _mm512_fnmadd_ps(tmp13893, _mm512_set1_ps(2e+00f), tmp13894);
tmp13889 = _mm512_fmadd_ps(in1947, _mm512_set1_ps(2.5e-01f), tmp13882);
tmp13893 = _mm512_fmadd_ps(in1955, _mm512_set1_ps(2.5e-01f), tmp13887);
tmp13882 = _mm512_sub_ps(in1948, tmp13882);
tmp13887 = _mm512_sub_ps(in1956, tmp13887);
tmp13889 = _mm512_fmadd_ps(in1951, _mm512_set1_ps(-1.25e+00f), tmp13889);
tmp13893 = _mm512_fmadd_ps(in1959, _mm512_set1_ps(-1.25e+00f), tmp13893);
in1951 = _mm512_sub_ps(in1951, in1947);
in1959 = _mm512_sub_ps(in1959, in1955);
// y7.
in1951 = _mm512_fmadd_ps(in1951, _mm512_set1_ps(5.25e+00f), tmp13882);
in1959 = _mm512_fmadd_ps(in1959, _mm512_set1_ps(5.25e+00f), tmp13887);
// y5 (fmadd) and y6 (fnmadd).
tmp13884 = _mm512_fmadd_ps(tmp13889, _mm512_set1_ps(2e+00f), tmp13881);
tmp13888 = _mm512_fmadd_ps(tmp13893, _mm512_set1_ps(2e+00f), tmp13886);
tmp13881 = _mm512_fnmadd_ps(tmp13889, _mm512_set1_ps(2e+00f), tmp13881);
tmp13886 = _mm512_fnmadd_ps(tmp13893, _mm512_set1_ps(2e+00f), tmp13886);
// Pair up the 16 transformed vectors and regroup them by 256-bit half.
// _mm512_shuffle_f32x4(a, b, 68)  yields [a.lanes0-7, b.lanes0-7];
// _mm512_shuffle_f32x4(a, b, 238) yields [a.lanes8-15, b.lanes8-15].
// out1791..out1794 / out1799..out1802 combine the tmp13883-chain results,
// out1795..out1798 / out1803..out1806 combine the in1953-chain results.
__m512 out1791 = _mm512_shuffle_f32x4(tmp13883, tmp13891, 68);
__m512 out1799 = _mm512_shuffle_f32x4(tmp13883, tmp13891, 238);
__m512 out1792 = _mm512_shuffle_f32x4(tmp13892, in1949, 68);
__m512 out1800 = _mm512_shuffle_f32x4(tmp13892, in1949, 238);
__m512 out1793 = _mm512_shuffle_f32x4(tmp13890, tmp13884, 68);
__m512 out1801 = _mm512_shuffle_f32x4(tmp13890, tmp13884, 238);
__m512 out1794 = _mm512_shuffle_f32x4(tmp13881, in1951, 68);
__m512 out1802 = _mm512_shuffle_f32x4(tmp13881, in1951, 238);
__m512 out1795 = _mm512_shuffle_f32x4(in1953, tmp13895, 68);
__m512 out1803 = _mm512_shuffle_f32x4(in1953, tmp13895, 238);
__m512 out1796 = _mm512_shuffle_f32x4(tmp13896, in1957, 68);
__m512 out1804 = _mm512_shuffle_f32x4(tmp13896, in1957, 238);
__m512 out1797 = _mm512_shuffle_f32x4(tmp13894, tmp13888, 68);
__m512 out1805 = _mm512_shuffle_f32x4(tmp13894, tmp13888, 238);
__m512 out1798 = _mm512_shuffle_f32x4(tmp13886, in1959, 68);
__m512 out1806 = _mm512_shuffle_f32x4(tmp13886, in1959, 238);
// Unaligned stores into dfPtr10. Four groups of four vectors; group bases are
// 512, 205312, 410112 and 614912 (204800 apart), and within a group the offsets
// are +0, +64, +128, +192. The i45, j38, s36 and k129 terms select the destination slice.
// NOTE(review): the 64-unit step equals sizeof(__m512), so dfPtr10 is presumably a
// byte pointer -- confirm at its declaration.
_mm512_storeu_ps(dfPtr10+512+819200*i45+49152*j38+49152*s36+768*k129, out1791);
_mm512_storeu_ps(dfPtr10+640+819200*i45+49152*j38+49152*s36+768*k129, out1799);
_mm512_storeu_ps(dfPtr10+576+819200*i45+49152*j38+49152*s36+768*k129, out1795);
_mm512_storeu_ps(dfPtr10+704+819200*i45+49152*j38+49152*s36+768*k129, out1803);
_mm512_storeu_ps(dfPtr10+205312+819200*i45+49152*j38+49152*s36+768*k129, out1792);
_mm512_storeu_ps(dfPtr10+205440+819200*i45+49152*j38+49152*s36+768*k129, out1800);
_mm512_storeu_ps(dfPtr10+205376+819200*i45+49152*j38+49152*s36+768*k129, out1796);
_mm512_storeu_ps(dfPtr10+205504+819200*i45+49152*j38+49152*s36+768*k129, out1804);
_mm512_storeu_ps(dfPtr10+410112+819200*i45+49152*j38+49152*s36+768*k129, out1793);
_mm512_storeu_ps(dfPtr10+410240+819200*i45+49152*j38+49152*s36+768*k129, out1801);
_mm512_storeu_ps(dfPtr10+410176+819200*i45+49152*j38+49152*s36+768*k129, out1797);
_mm512_storeu_ps(dfPtr10+410304+819200*i45+49152*j38+49152*s36+768*k129, out1805);
_mm512_storeu_ps(dfPtr10+614912+819200*i45+49152*j38+49152*s36+768*k129, out1794);
_mm512_storeu_ps(dfPtr10+615040+819200*i45+49152*j38+49152*s36+768*k129, out1802);
_mm512_storeu_ps(dfPtr10+614976+819200*i45+49152*j38+49152*s36+768*k129, out1798);
_mm512_storeu_ps(dfPtr10+615104+819200*i45+49152*j38+49152*s36+768*k129, out1806);
}
// Leave the function once j38 has reached last9; otherwise advance to the next j38.
if (j38 >= last9) return;
++j38;
// NOTE(review): rel21 is set to 1 for the next pass; its meaning is not visible
// in this span -- confirm where rel21 is read.
rel21 = 1;
}
// Index variables for the loop that follows. h46 and w58 enter its load
// addresses as 112*h46 and 4*w58; k130 is its counter (runs while k130 != 64).
// NOTE(review): h46/w58 look like a row/column position -- confirm.
ptrdiff_t h46 = base21+6;
ptrdiff_t w58 = 6;
ptrdiff_t k130 = 0;
for (; k130 != 64; ++k130) {
// Gather eight pairs of input vectors with zero-masked loads and lane permutations.
// Load masks: 16383 = lanes 0-13, 2047 = lanes 0-10; unselected lanes read as 0.
// Each pair is 112 address units after the previous one; the second load of a
// pair starts 48 units after the first.
// NOTE(review): the 4*w58 and 112*h46 terms suggest byte addressing of 4-byte floats
// with 28 floats per row -- confirm at datPtr23's declaration.
__m512 dat1999 = _mm512_maskz_loadu_ps(16383, datPtr23+0+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2000 = _mm512_maskz_loadu_ps(2047, datPtr23+48+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
// pm195: lanes 0-7 = elements 0-7, lanes 8-15 = elements 6-13, i.e. two 8-wide
// windows that overlap by two elements.
__m512i pm195 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1961 = _mm512_permutexvar_ps(pm195, dat1999);
// pm196: lanes 0-7 = elements 0-7, lanes 8-12 = elements 6-10, lanes 13-15 =
// element 15, which the 2047 mask zeroed.
__m512i pm196 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1969 = _mm512_permutexvar_ps(pm196, dat2000);
__m512 dat2001 = _mm512_maskz_loadu_ps(16383, datPtr23+112+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2002 = _mm512_maskz_loadu_ps(2047, datPtr23+160+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1962 = _mm512_permutexvar_ps(pm195, dat2001);
__m512 in1970 = _mm512_permutexvar_ps(pm196, dat2002);
__m512 dat2003 = _mm512_maskz_loadu_ps(16383, datPtr23+224+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2004 = _mm512_maskz_loadu_ps(2047, datPtr23+272+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1963 = _mm512_permutexvar_ps(pm195, dat2003);
__m512 in1971 = _mm512_permutexvar_ps(pm196, dat2004);
__m512 dat2005 = _mm512_maskz_loadu_ps(16383, datPtr23+336+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2006 = _mm512_maskz_loadu_ps(2047, datPtr23+384+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1964 = _mm512_permutexvar_ps(pm195, dat2005);
__m512 in1972 = _mm512_permutexvar_ps(pm196, dat2006);
__m512 dat2007 = _mm512_maskz_loadu_ps(16383, datPtr23+448+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2008 = _mm512_maskz_loadu_ps(2047, datPtr23+496+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1965 = _mm512_permutexvar_ps(pm195, dat2007);
__m512 in1973 = _mm512_permutexvar_ps(pm196, dat2008);
__m512 dat2009 = _mm512_maskz_loadu_ps(16383, datPtr23+560+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2010 = _mm512_maskz_loadu_ps(2047, datPtr23+608+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1966 = _mm512_permutexvar_ps(pm195, dat2009);
__m512 in1974 = _mm512_permutexvar_ps(pm196, dat2010);
__m512 dat2011 = _mm512_maskz_loadu_ps(16383, datPtr23+672+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2012 = _mm512_maskz_loadu_ps(2047, datPtr23+720+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1967 = _mm512_permutexvar_ps(pm195, dat2011);
__m512 in1975 = _mm512_permutexvar_ps(pm196, dat2012);
__m512 dat2013 = _mm512_maskz_loadu_ps(16383, datPtr23+784+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2014 = _mm512_maskz_loadu_ps(2047, datPtr23+832+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1968 = _mm512_permutexvar_ps(pm195, dat2013);
__m512 in1976 = _mm512_permutexvar_ps(pm196, dat2014);
// First pass of an 8-input linear transform. Two independent chains are
// interleaved statement by statement (odd statements = chain A, even = chain B).
//   chain A inputs x0..x7 = in1961..in1968
//   chain B inputs x0..x7 = in1969..in1976
// Results, built in place with FMAs (chain A name / chain B name):
//   y0 = x0 - 5.25*x2 + 5.25*x4 - x6                            -> in1961   / in1969
//   y1 = (x2 - 4.25*x4 + x6) + (x1 - 4.25*x3 + x5)              -> tmp13947 / tmp13951
//   y2 = (x2 - 4.25*x4 + x6) - (x1 - 4.25*x3 + x5)              -> tmp13948 / tmp13952
//   y3 = (0.25*x2 - 1.25*x4 + x6) + 2*(0.25*x1 - 1.25*x3 + x5)  -> in1967   / in1975
//   y4 = (0.25*x2 - 1.25*x4 + x6) - 2*(0.25*x1 - 1.25*x3 + x5)  -> tmp13946 / tmp13950
//   y5 = (4*x2 - 5*x4 + x6) + 2*(x1 - 1.25*x3 + 0.25*x5)        -> in1963   / in1971
//   y6 = (4*x2 - 5*x4 + x6) - 2*(x1 - 1.25*x3 + 0.25*x5)        -> in1965   / in1973
//   y7 = (x7 - x1) + 5.25*(x3 - x5)                             -> in1964   / in1972
// NOTE(review): these coefficients match the B^T matrix of the Winograd
// F(6x6, 3x3) input transform -- presumably that is the intent; confirm against the generator.
// Statement order matters: several inputs are overwritten once they are no longer needed.
__m512 tmp13945 = _mm512_add_ps(in1962, in1966);
__m512 tmp13949 = _mm512_add_ps(in1970, in1974);
__m512 tmp13946 = _mm512_sub_ps(in1965, in1963);
__m512 tmp13950 = _mm512_sub_ps(in1973, in1971);
__m512 tmp13947 = _mm512_add_ps(in1963, in1967);
__m512 tmp13951 = _mm512_add_ps(in1971, in1975);
in1961 = _mm512_sub_ps(in1961, in1967);
in1969 = _mm512_sub_ps(in1969, in1975);
tmp13945 = _mm512_fmadd_ps(in1964, _mm512_set1_ps(-4.25e+00f), tmp13945);
tmp13949 = _mm512_fmadd_ps(in1972, _mm512_set1_ps(-4.25e+00f), tmp13949);
tmp13947 = _mm512_fmadd_ps(in1965, _mm512_set1_ps(-4.25e+00f), tmp13947);
tmp13951 = _mm512_fmadd_ps(in1973, _mm512_set1_ps(-4.25e+00f), tmp13951);
// y0 is complete after the next two statements.
in1961 = _mm512_fmadd_ps(tmp13946, _mm512_set1_ps(5.25e+00f), in1961);
in1969 = _mm512_fmadd_ps(tmp13950, _mm512_set1_ps(5.25e+00f), in1969);
tmp13946 = _mm512_fmadd_ps(in1963, _mm512_set1_ps(2.5e-01f), in1967);
tmp13950 = _mm512_fmadd_ps(in1971, _mm512_set1_ps(2.5e-01f), in1975);
in1963 = _mm512_fmadd_ps(in1963, _mm512_set1_ps(4e+00f), in1967);
in1971 = _mm512_fmadd_ps(in1971, _mm512_set1_ps(4e+00f), in1975);
// y2 (difference) and y1 (sum) of the two partial sums.
__m512 tmp13948 = _mm512_sub_ps(tmp13947, tmp13945);
__m512 tmp13952 = _mm512_sub_ps(tmp13951, tmp13949);
tmp13947 = _mm512_add_ps(tmp13945, tmp13947);
tmp13951 = _mm512_add_ps(tmp13949, tmp13951);
tmp13945 = _mm512_fmadd_ps(in1962, _mm512_set1_ps(2.5e-01f), in1966);
tmp13949 = _mm512_fmadd_ps(in1970, _mm512_set1_ps(2.5e-01f), in1974);
tmp13946 = _mm512_fmadd_ps(in1965, _mm512_set1_ps(-1.25e+00f), tmp13946);
tmp13950 = _mm512_fmadd_ps(in1973, _mm512_set1_ps(-1.25e+00f), tmp13950);
in1965 = _mm512_fmadd_ps(in1965, _mm512_set1_ps(-5e+00f), in1963);
in1973 = _mm512_fmadd_ps(in1973, _mm512_set1_ps(-5e+00f), in1971);
tmp13945 = _mm512_fmadd_ps(in1964, _mm512_set1_ps(-1.25e+00f), tmp13945);
tmp13949 = _mm512_fmadd_ps(in1972, _mm512_set1_ps(-1.25e+00f), tmp13949);
// y3 (fmadd: +2*t) and y4 (fnmadd: -2*t).
in1967 = _mm512_fmadd_ps(tmp13945, _mm512_set1_ps(2e+00f), tmp13946);
in1975 = _mm512_fmadd_ps(tmp13949, _mm512_set1_ps(2e+00f), tmp13950);
tmp13946 = _mm512_fnmadd_ps(tmp13945, _mm512_set1_ps(2e+00f), tmp13946);
tmp13950 = _mm512_fnmadd_ps(tmp13949, _mm512_set1_ps(2e+00f), tmp13950);
tmp13945 = _mm512_fmadd_ps(in1966, _mm512_set1_ps(2.5e-01f), in1962);
tmp13949 = _mm512_fmadd_ps(in1974, _mm512_set1_ps(2.5e-01f), in1970);
in1962 = _mm512_sub_ps(in1968, in1962);
in1970 = _mm512_sub_ps(in1976, in1970);
tmp13945 = _mm512_fmadd_ps(in1964, _mm512_set1_ps(-1.25e+00f), tmp13945);
tmp13949 = _mm512_fmadd_ps(in1972, _mm512_set1_ps(-1.25e+00f), tmp13949);
in1964 = _mm512_sub_ps(in1964, in1966);
in1972 = _mm512_sub_ps(in1972, in1974);
// y7.
in1964 = _mm512_fmadd_ps(in1964, _mm512_set1_ps(5.25e+00f), in1962);
in1972 = _mm512_fmadd_ps(in1972, _mm512_set1_ps(5.25e+00f), in1970);
// y5 (fmadd) and y6 (fnmadd).
in1963 = _mm512_fmadd_ps(tmp13945, _mm512_set1_ps(2e+00f), in1965);
in1971 = _mm512_fmadd_ps(tmp13949, _mm512_set1_ps(2e+00f), in1973);
in1965 = _mm512_fnmadd_ps(tmp13945, _mm512_set1_ps(2e+00f), in1965);
in1973 = _mm512_fnmadd_ps(tmp13949, _mm512_set1_ps(2e+00f), in1973);
// 16x16 transpose of the first-pass results, done as a four-stage shuffle network.
// Input rows, in order: in1961, tmp13947, tmp13948, in1967, tmp13946, in1963, in1965, in1964
// (rows 0-7) and in1969, tmp13951, tmp13952, in1975, tmp13950, in1971, in1973, in1972 (rows 8-15).
// Stage 1: interleave adjacent row pairs within each 128-bit quarter.
__m512 tmp13961 = _mm512_unpacklo_ps(in1961, tmp13947);
__m512 tmp13962 = _mm512_unpackhi_ps(in1961, tmp13947);
__m512 tmp13963 = _mm512_unpacklo_ps(tmp13948, in1967);
__m512 tmp13964 = _mm512_unpackhi_ps(tmp13948, in1967);
__m512 tmp13965 = _mm512_unpacklo_ps(tmp13946, in1963);
__m512 tmp13966 = _mm512_unpackhi_ps(tmp13946, in1963);
__m512 tmp13967 = _mm512_unpacklo_ps(in1965, in1964);
__m512 tmp13968 = _mm512_unpackhi_ps(in1965, in1964);
__m512 tmp13969 = _mm512_unpacklo_ps(in1969, tmp13951);
__m512 tmp13970 = _mm512_unpackhi_ps(in1969, tmp13951);
__m512 tmp13971 = _mm512_unpacklo_ps(tmp13952, in1975);
__m512 tmp13972 = _mm512_unpackhi_ps(tmp13952, in1975);
__m512 tmp13973 = _mm512_unpacklo_ps(tmp13950, in1971);
__m512 tmp13974 = _mm512_unpackhi_ps(tmp13950, in1971);
__m512 tmp13975 = _mm512_unpacklo_ps(in1973, in1972);
__m512 tmp13976 = _mm512_unpackhi_ps(in1973, in1972);
// Stage 2: immediate 68 takes the low two elements of each operand's quarter,
// 238 the high two. Each 128-bit quarter now holds one column of four consecutive rows.
__m512 tmp13977 = _mm512_shuffle_ps(tmp13961, tmp13963, 68);
__m512 tmp13978 = _mm512_shuffle_ps(tmp13961, tmp13963, 238);
__m512 tmp13979 = _mm512_shuffle_ps(tmp13962, tmp13964, 68);
__m512 tmp13980 = _mm512_shuffle_ps(tmp13962, tmp13964, 238);
__m512 tmp13981 = _mm512_shuffle_ps(tmp13965, tmp13967, 68);
__m512 tmp13982 = _mm512_shuffle_ps(tmp13965, tmp13967, 238);
__m512 tmp13983 = _mm512_shuffle_ps(tmp13966, tmp13968, 68);
__m512 tmp13984 = _mm512_shuffle_ps(tmp13966, tmp13968, 238);
__m512 tmp13985 = _mm512_shuffle_ps(tmp13969, tmp13971, 68);
__m512 tmp13986 = _mm512_shuffle_ps(tmp13969, tmp13971, 238);
__m512 tmp13987 = _mm512_shuffle_ps(tmp13970, tmp13972, 68);
__m512 tmp13988 = _mm512_shuffle_ps(tmp13970, tmp13972, 238);
__m512 tmp13989 = _mm512_shuffle_ps(tmp13973, tmp13975, 68);
__m512 tmp13990 = _mm512_shuffle_ps(tmp13973, tmp13975, 238);
__m512 tmp13991 = _mm512_shuffle_ps(tmp13974, tmp13976, 68);
__m512 tmp13992 = _mm512_shuffle_ps(tmp13974, tmp13976, 238);
// Stage 3: immediate 136 picks quarters 0 and 2 of each operand, 221 picks quarters 1 and 3.
__m512 tmp13993 = _mm512_shuffle_f32x4(tmp13977, tmp13981, 136);
__m512 tmp13994 = _mm512_shuffle_f32x4(tmp13977, tmp13981, 221);
__m512 tmp13995 = _mm512_shuffle_f32x4(tmp13978, tmp13982, 136);
__m512 tmp13996 = _mm512_shuffle_f32x4(tmp13978, tmp13982, 221);
__m512 tmp13997 = _mm512_shuffle_f32x4(tmp13979, tmp13983, 136);
__m512 tmp13998 = _mm512_shuffle_f32x4(tmp13979, tmp13983, 221);
__m512 tmp13999 = _mm512_shuffle_f32x4(tmp13980, tmp13984, 136);
__m512 tmp14000 = _mm512_shuffle_f32x4(tmp13980, tmp13984, 221);
__m512 tmp14001 = _mm512_shuffle_f32x4(tmp13985, tmp13989, 136);
__m512 tmp14002 = _mm512_shuffle_f32x4(tmp13985, tmp13989, 221);
__m512 tmp14003 = _mm512_shuffle_f32x4(tmp13986, tmp13990, 136);
__m512 tmp14004 = _mm512_shuffle_f32x4(tmp13986, tmp13990, 221);
__m512 tmp14005 = _mm512_shuffle_f32x4(tmp13987, tmp13991, 136);
__m512 tmp14006 = _mm512_shuffle_f32x4(tmp13987, tmp13991, 221);
__m512 tmp14007 = _mm512_shuffle_f32x4(tmp13988, tmp13992, 136);
__m512 tmp14008 = _mm512_shuffle_f32x4(tmp13988, tmp13992, 221);
// Stage 4: same quarter selection across the rows 0-7 and rows 8-15 halves. The
// results reuse the input names; each now holds one input lane position (column)
// across all 16 rows, rows 0-7 in lanes 0-7 and rows 8-15 in lanes 8-15.
// The comment on each pair gives the column held by the first / second result.
// columns 0 / 8
in1961 = _mm512_shuffle_f32x4(tmp13993, tmp14001, 136);
in1969 = _mm512_shuffle_f32x4(tmp13993, tmp14001, 221);
// columns 1 / 9
tmp13947 = _mm512_shuffle_f32x4(tmp13995, tmp14003, 136);
tmp13951 = _mm512_shuffle_f32x4(tmp13995, tmp14003, 221);
// columns 2 / 10
tmp13948 = _mm512_shuffle_f32x4(tmp13997, tmp14005, 136);
tmp13952 = _mm512_shuffle_f32x4(tmp13997, tmp14005, 221);
// columns 3 / 11
in1967 = _mm512_shuffle_f32x4(tmp13999, tmp14007, 136);
in1975 = _mm512_shuffle_f32x4(tmp13999, tmp14007, 221);
// columns 4 / 12
tmp13946 = _mm512_shuffle_f32x4(tmp13994, tmp14002, 136);
tmp13950 = _mm512_shuffle_f32x4(tmp13994, tmp14002, 221);
// columns 5 / 13
in1963 = _mm512_shuffle_f32x4(tmp13996, tmp14004, 136);
in1971 = _mm512_shuffle_f32x4(tmp13996, tmp14004, 221);
// columns 6 / 14
in1965 = _mm512_shuffle_f32x4(tmp13998, tmp14006, 136);
in1973 = _mm512_shuffle_f32x4(tmp13998, tmp14006, 221);
// columns 7 / 15
in1964 = _mm512_shuffle_f32x4(tmp14000, tmp14008, 136);
in1972 = _mm512_shuffle_f32x4(tmp14000, tmp14008, 221);
// Lane-wise 8-input / 8-output linear transform, applied to two independent
// groups whose statements are interleaved line by line.
// Group 1 inputs on entry (c0..c7):
//   in1961, tmp13947, tmp13948, in1967, tmp13946, in1963, in1965, in1964
// Group 2 inputs on entry:
//   in1969, tmp13951, tmp13952, in1975, tmp13950, in1971, in1973, in1972
// Algebraic results for group 1 (group 2 is analogous, landing in in1969,
// tmp13959, tmp13960, in1973, tmp13958, tmp13952, tmp13950, in1975):
//   in1961   = c0 - c6 + 5.25*(c4 - c2)
//   tmp13955 =  c1 + c2 - 4.25*c3 - 4.25*c4 + c5 + c6
//   tmp13956 = -c1 + c2 + 4.25*c3 - 4.25*c4 - c5 + c6
//   in1965   =  0.5*c1 + 0.25*c2 - 2.5*c3 - 1.25*c4 + 2*c5 + c6
//   tmp13954 = -0.5*c1 + 0.25*c2 + 2.5*c3 - 1.25*c4 - 2*c5 + c6
//   tmp13948 =  2*c1 + 4*c2 - 2.5*c3 - 5*c4 + 0.5*c5 + c6
//   tmp13946 = -2*c1 + 4*c2 + 2.5*c3 - 5*c4 - 0.5*c5 + c6
//   in1967   = -c1 + 5.25*(c3 - c5) + c7
// tmp13953/tmp13957 are scratch accumulators that are reused three times.
// Inputs are overwritten in place, so the statement order must be kept.
// NOTE(review): the coefficients look like the 8-point Winograd F(6,3)
// input-transform matrix -- confirm against the generator.
__m512 tmp13953 = _mm512_add_ps(tmp13947, in1963);
__m512 tmp13957 = _mm512_add_ps(tmp13951, in1971);
__m512 tmp13954 = _mm512_sub_ps(tmp13946, tmp13948);
__m512 tmp13958 = _mm512_sub_ps(tmp13950, tmp13952);
__m512 tmp13955 = _mm512_add_ps(tmp13948, in1965);
__m512 tmp13959 = _mm512_add_ps(tmp13952, in1973);
in1961 = _mm512_sub_ps(in1961, in1965);
in1969 = _mm512_sub_ps(in1969, in1973);
tmp13953 = _mm512_fmadd_ps(in1967, _mm512_set1_ps(-4.25e+00f), tmp13953);
tmp13957 = _mm512_fmadd_ps(in1975, _mm512_set1_ps(-4.25e+00f), tmp13957);
tmp13955 = _mm512_fmadd_ps(tmp13946, _mm512_set1_ps(-4.25e+00f), tmp13955);
tmp13959 = _mm512_fmadd_ps(tmp13950, _mm512_set1_ps(-4.25e+00f), tmp13959);
in1961 = _mm512_fmadd_ps(tmp13954, _mm512_set1_ps(5.25e+00f), in1961);
in1969 = _mm512_fmadd_ps(tmp13958, _mm512_set1_ps(5.25e+00f), in1969);
tmp13954 = _mm512_fmadd_ps(tmp13948, _mm512_set1_ps(2.5e-01f), in1965);
tmp13958 = _mm512_fmadd_ps(tmp13952, _mm512_set1_ps(2.5e-01f), in1973);
tmp13948 = _mm512_fmadd_ps(tmp13948, _mm512_set1_ps(4e+00f), in1965);
tmp13952 = _mm512_fmadd_ps(tmp13952, _mm512_set1_ps(4e+00f), in1973);
__m512 tmp13956 = _mm512_sub_ps(tmp13955, tmp13953);
__m512 tmp13960 = _mm512_sub_ps(tmp13959, tmp13957);
tmp13955 = _mm512_add_ps(tmp13953, tmp13955);
tmp13959 = _mm512_add_ps(tmp13957, tmp13959);
tmp13953 = _mm512_fmadd_ps(tmp13947, _mm512_set1_ps(2.5e-01f), in1963);
tmp13957 = _mm512_fmadd_ps(tmp13951, _mm512_set1_ps(2.5e-01f), in1971);
tmp13954 = _mm512_fmadd_ps(tmp13946, _mm512_set1_ps(-1.25e+00f), tmp13954);
tmp13958 = _mm512_fmadd_ps(tmp13950, _mm512_set1_ps(-1.25e+00f), tmp13958);
tmp13946 = _mm512_fmadd_ps(tmp13946, _mm512_set1_ps(-5e+00f), tmp13948);
tmp13950 = _mm512_fmadd_ps(tmp13950, _mm512_set1_ps(-5e+00f), tmp13952);
tmp13953 = _mm512_fmadd_ps(in1967, _mm512_set1_ps(-1.25e+00f), tmp13953);
tmp13957 = _mm512_fmadd_ps(in1975, _mm512_set1_ps(-1.25e+00f), tmp13957);
in1965 = _mm512_fmadd_ps(tmp13953, _mm512_set1_ps(2e+00f), tmp13954);
in1973 = _mm512_fmadd_ps(tmp13957, _mm512_set1_ps(2e+00f), tmp13958);
tmp13954 = _mm512_fnmadd_ps(tmp13953, _mm512_set1_ps(2e+00f), tmp13954);
tmp13958 = _mm512_fnmadd_ps(tmp13957, _mm512_set1_ps(2e+00f), tmp13958);
tmp13953 = _mm512_fmadd_ps(in1963, _mm512_set1_ps(2.5e-01f), tmp13947);
tmp13957 = _mm512_fmadd_ps(in1971, _mm512_set1_ps(2.5e-01f), tmp13951);
tmp13947 = _mm512_sub_ps(in1964, tmp13947);
tmp13951 = _mm512_sub_ps(in1972, tmp13951);
tmp13953 = _mm512_fmadd_ps(in1967, _mm512_set1_ps(-1.25e+00f), tmp13953);
tmp13957 = _mm512_fmadd_ps(in1975, _mm512_set1_ps(-1.25e+00f), tmp13957);
in1967 = _mm512_sub_ps(in1967, in1963);
in1975 = _mm512_sub_ps(in1975, in1971);
in1967 = _mm512_fmadd_ps(in1967, _mm512_set1_ps(5.25e+00f), tmp13947);
in1975 = _mm512_fmadd_ps(in1975, _mm512_set1_ps(5.25e+00f), tmp13951);
tmp13948 = _mm512_fmadd_ps(tmp13953, _mm512_set1_ps(2e+00f), tmp13946);
tmp13952 = _mm512_fmadd_ps(tmp13957, _mm512_set1_ps(2e+00f), tmp13950);
tmp13946 = _mm512_fnmadd_ps(tmp13953, _mm512_set1_ps(2e+00f), tmp13946);
tmp13950 = _mm512_fnmadd_ps(tmp13957, _mm512_set1_ps(2e+00f), tmp13950);
// Pack the sixteen transform results into sixteen output vectors and store
// them through dfPtr10.
// _mm512_shuffle_f32x4 imm 68 concatenates lanes 0..7 of both operands;
// imm 238 concatenates lanes 8..15 of both operands. Each out vector thus
// pairs the same half of two consecutive results:
//   out1807/out1815 <- (in1961, tmp13955)    out1811/out1819 <- (in1969, tmp13959)
//   out1808/out1816 <- (tmp13956, in1965)    out1812/out1820 <- (tmp13960, in1973)
//   out1809/out1817 <- (tmp13954, tmp13948)  out1813/out1821 <- (tmp13958, tmp13952)
//   out1810/out1818 <- (tmp13946, in1967)    out1814/out1822 <- (tmp13950, in1975)
// Store offsets: four regions at +0, +204800, +409600, +614400; inside each
// region the vectors go to +0, +64, +128, +192 in the order
// (low halves group 1, low halves group 2, high halves group 1, high halves
// group 2). i45, j38, s36 and k130 come from the enclosing code (not visible
// here).
// NOTE(review): the 64-unit spacing suggests dfPtr10 is a byte pointer and
// the four slots are contiguous 64-byte vectors -- confirm at its declaration.
__m512 out1807 = _mm512_shuffle_f32x4(in1961, tmp13955, 68);
__m512 out1815 = _mm512_shuffle_f32x4(in1961, tmp13955, 238);
__m512 out1808 = _mm512_shuffle_f32x4(tmp13956, in1965, 68);
__m512 out1816 = _mm512_shuffle_f32x4(tmp13956, in1965, 238);
__m512 out1809 = _mm512_shuffle_f32x4(tmp13954, tmp13948, 68);
__m512 out1817 = _mm512_shuffle_f32x4(tmp13954, tmp13948, 238);
__m512 out1810 = _mm512_shuffle_f32x4(tmp13946, in1967, 68);
__m512 out1818 = _mm512_shuffle_f32x4(tmp13946, in1967, 238);
__m512 out1811 = _mm512_shuffle_f32x4(in1969, tmp13959, 68);
__m512 out1819 = _mm512_shuffle_f32x4(in1969, tmp13959, 238);
__m512 out1812 = _mm512_shuffle_f32x4(tmp13960, in1973, 68);
__m512 out1820 = _mm512_shuffle_f32x4(tmp13960, in1973, 238);
__m512 out1813 = _mm512_shuffle_f32x4(tmp13958, tmp13952, 68);
__m512 out1821 = _mm512_shuffle_f32x4(tmp13958, tmp13952, 238);
__m512 out1814 = _mm512_shuffle_f32x4(tmp13950, in1975, 68);
__m512 out1822 = _mm512_shuffle_f32x4(tmp13950, in1975, 238);
_mm512_storeu_ps(dfPtr10+0+819200*i45+49152*j38+49152*s36+768*k130, out1807);
_mm512_storeu_ps(dfPtr10+128+819200*i45+49152*j38+49152*s36+768*k130, out1815);
_mm512_storeu_ps(dfPtr10+64+819200*i45+49152*j38+49152*s36+768*k130, out1811);
_mm512_storeu_ps(dfPtr10+192+819200*i45+49152*j38+49152*s36+768*k130, out1819);
_mm512_storeu_ps(dfPtr10+204800+819200*i45+49152*j38+49152*s36+768*k130, out1808);
_mm512_storeu_ps(dfPtr10+204928+819200*i45+49152*j38+49152*s36+768*k130, out1816);
_mm512_storeu_ps(dfPtr10+204864+819200*i45+49152*j38+49152*s36+768*k130, out1812);
_mm512_storeu_ps(dfPtr10+204992+819200*i45+49152*j38+49152*s36+768*k130, out1820);
_mm512_storeu_ps(dfPtr10+409600+819200*i45+49152*j38+49152*s36+768*k130, out1809);
_mm512_storeu_ps(dfPtr10+409728+819200*i45+49152*j38+49152*s36+768*k130, out1817);
_mm512_storeu_ps(dfPtr10+409664+819200*i45+49152*j38+49152*s36+768*k130, out1813);
_mm512_storeu_ps(dfPtr10+409792+819200*i45+49152*j38+49152*s36+768*k130, out1821);
_mm512_storeu_ps(dfPtr10+614400+819200*i45+49152*j38+49152*s36+768*k130, out1810);
_mm512_storeu_ps(dfPtr10+614528+819200*i45+49152*j38+49152*s36+768*k130, out1818);
_mm512_storeu_ps(dfPtr10+614464+819200*i45+49152*j38+49152*s36+768*k130, out1814);
_mm512_storeu_ps(dfPtr10+614592+819200*i45+49152*j38+49152*s36+768*k130, out1822);
// Load two sets of eight vectors from datPtr23 and rearrange their lanes.
// Set 1 (in1977..in1984): base offsets 652, 764, ..., 1436 (step 112),
//   mask 8191 keeps lanes 0..12, all other lanes are zeroed by maskz.
// Set 2 (in1985..in1992): base offsets 3136, 3248, ..., 3920 (step 112),
//   mask 16383 keeps lanes 0..13.
// _mm512_set_epi32 lists indices from lane 15 down to lane 0, so:
//   pm197: lane 0 <- source lane 15 (always zero under mask 8191),
//          lanes 1..7 <- source 0..6, lanes 8..15 <- source 5..12.
//   pm198: lanes 0..7 <- source 0..7, lanes 8..15 <- source 6..13.
// Each permuted vector therefore holds two 8-lane windows of the loaded
// data whose start positions differ by 6 source elements (2-element overlap).
// NOTE(review): the zero lane presumably supplies padding at an image edge,
// and the 112 step presumably is one image row if datPtr23 is a byte
// pointer -- confirm against the enclosing loop and pointer declaration.
__m512 dat2015 = _mm512_maskz_loadu_ps(8191, datPtr23+652+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2016 = _mm512_maskz_loadu_ps(16383, datPtr23+3136+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512i pm197 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1977 = _mm512_permutexvar_ps(pm197, dat2015);
__m512i pm198 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1985 = _mm512_permutexvar_ps(pm198, dat2016);
__m512 dat2017 = _mm512_maskz_loadu_ps(8191, datPtr23+764+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2018 = _mm512_maskz_loadu_ps(16383, datPtr23+3248+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1978 = _mm512_permutexvar_ps(pm197, dat2017);
__m512 in1986 = _mm512_permutexvar_ps(pm198, dat2018);
__m512 dat2019 = _mm512_maskz_loadu_ps(8191, datPtr23+876+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2020 = _mm512_maskz_loadu_ps(16383, datPtr23+3360+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1979 = _mm512_permutexvar_ps(pm197, dat2019);
__m512 in1987 = _mm512_permutexvar_ps(pm198, dat2020);
__m512 dat2021 = _mm512_maskz_loadu_ps(8191, datPtr23+988+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2022 = _mm512_maskz_loadu_ps(16383, datPtr23+3472+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1980 = _mm512_permutexvar_ps(pm197, dat2021);
__m512 in1988 = _mm512_permutexvar_ps(pm198, dat2022);
__m512 dat2023 = _mm512_maskz_loadu_ps(8191, datPtr23+1100+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2024 = _mm512_maskz_loadu_ps(16383, datPtr23+3584+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1981 = _mm512_permutexvar_ps(pm197, dat2023);
__m512 in1989 = _mm512_permutexvar_ps(pm198, dat2024);
__m512 dat2025 = _mm512_maskz_loadu_ps(8191, datPtr23+1212+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2026 = _mm512_maskz_loadu_ps(16383, datPtr23+3696+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1982 = _mm512_permutexvar_ps(pm197, dat2025);
__m512 in1990 = _mm512_permutexvar_ps(pm198, dat2026);
__m512 dat2027 = _mm512_maskz_loadu_ps(8191, datPtr23+1324+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2028 = _mm512_maskz_loadu_ps(16383, datPtr23+3808+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1983 = _mm512_permutexvar_ps(pm197, dat2027);
__m512 in1991 = _mm512_permutexvar_ps(pm198, dat2028);
__m512 dat2029 = _mm512_maskz_loadu_ps(8191, datPtr23+1436+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2030 = _mm512_maskz_loadu_ps(16383, datPtr23+3920+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1984 = _mm512_permutexvar_ps(pm197, dat2029);
__m512 in1992 = _mm512_permutexvar_ps(pm198, dat2030);
// First transform pass: a lane-wise 8-input / 8-output linear transform
// across the eight loaded vectors of each set; the two sets are processed
// in lock step on alternating lines.
// With d0..d7 = in1977..in1984 on entry, the algebraic results are
// (the set in1985..in1992 is analogous, landing in in1985, tmp14015,
// tmp14016, in1991, tmp14014, in1987, in1989, in1988):
//   in1977   = d0 - d6 + 5.25*(d4 - d2)
//   tmp14011 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
//   tmp14012 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
//   in1983   =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
//   tmp14010 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
//   in1979   =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
//   in1981   = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
//   in1980   = -d1 + 5.25*(d3 - d5) + d7
// tmp14009/tmp14013 are scratch accumulators reused three times, and
// in1978/in1986 end up holding the intermediate d7 - d1.
// Inputs are overwritten in place, so the statement order must be kept.
// NOTE(review): the coefficients look like the 8-point Winograd F(6,3)
// input-transform matrix -- confirm against the generator.
__m512 tmp14009 = _mm512_add_ps(in1978, in1982);
__m512 tmp14013 = _mm512_add_ps(in1986, in1990);
__m512 tmp14010 = _mm512_sub_ps(in1981, in1979);
__m512 tmp14014 = _mm512_sub_ps(in1989, in1987);
__m512 tmp14011 = _mm512_add_ps(in1979, in1983);
__m512 tmp14015 = _mm512_add_ps(in1987, in1991);
in1977 = _mm512_sub_ps(in1977, in1983);
in1985 = _mm512_sub_ps(in1985, in1991);
tmp14009 = _mm512_fmadd_ps(in1980, _mm512_set1_ps(-4.25e+00f), tmp14009);
tmp14013 = _mm512_fmadd_ps(in1988, _mm512_set1_ps(-4.25e+00f), tmp14013);
tmp14011 = _mm512_fmadd_ps(in1981, _mm512_set1_ps(-4.25e+00f), tmp14011);
tmp14015 = _mm512_fmadd_ps(in1989, _mm512_set1_ps(-4.25e+00f), tmp14015);
in1977 = _mm512_fmadd_ps(tmp14010, _mm512_set1_ps(5.25e+00f), in1977);
in1985 = _mm512_fmadd_ps(tmp14014, _mm512_set1_ps(5.25e+00f), in1985);
tmp14010 = _mm512_fmadd_ps(in1979, _mm512_set1_ps(2.5e-01f), in1983);
tmp14014 = _mm512_fmadd_ps(in1987, _mm512_set1_ps(2.5e-01f), in1991);
in1979 = _mm512_fmadd_ps(in1979, _mm512_set1_ps(4e+00f), in1983);
in1987 = _mm512_fmadd_ps(in1987, _mm512_set1_ps(4e+00f), in1991);
__m512 tmp14012 = _mm512_sub_ps(tmp14011, tmp14009);
__m512 tmp14016 = _mm512_sub_ps(tmp14015, tmp14013);
tmp14011 = _mm512_add_ps(tmp14009, tmp14011);
tmp14015 = _mm512_add_ps(tmp14013, tmp14015);
tmp14009 = _mm512_fmadd_ps(in1978, _mm512_set1_ps(2.5e-01f), in1982);
tmp14013 = _mm512_fmadd_ps(in1986, _mm512_set1_ps(2.5e-01f), in1990);
tmp14010 = _mm512_fmadd_ps(in1981, _mm512_set1_ps(-1.25e+00f), tmp14010);
tmp14014 = _mm512_fmadd_ps(in1989, _mm512_set1_ps(-1.25e+00f), tmp14014);
in1981 = _mm512_fmadd_ps(in1981, _mm512_set1_ps(-5e+00f), in1979);
in1989 = _mm512_fmadd_ps(in1989, _mm512_set1_ps(-5e+00f), in1987);
tmp14009 = _mm512_fmadd_ps(in1980, _mm512_set1_ps(-1.25e+00f), tmp14009);
tmp14013 = _mm512_fmadd_ps(in1988, _mm512_set1_ps(-1.25e+00f), tmp14013);
in1983 = _mm512_fmadd_ps(tmp14009, _mm512_set1_ps(2e+00f), tmp14010);
in1991 = _mm512_fmadd_ps(tmp14013, _mm512_set1_ps(2e+00f), tmp14014);
tmp14010 = _mm512_fnmadd_ps(tmp14009, _mm512_set1_ps(2e+00f), tmp14010);
tmp14014 = _mm512_fnmadd_ps(tmp14013, _mm512_set1_ps(2e+00f), tmp14014);
tmp14009 = _mm512_fmadd_ps(in1982, _mm512_set1_ps(2.5e-01f), in1978);
tmp14013 = _mm512_fmadd_ps(in1990, _mm512_set1_ps(2.5e-01f), in1986);
in1978 = _mm512_sub_ps(in1984, in1978);
in1986 = _mm512_sub_ps(in1992, in1986);
tmp14009 = _mm512_fmadd_ps(in1980, _mm512_set1_ps(-1.25e+00f), tmp14009);
tmp14013 = _mm512_fmadd_ps(in1988, _mm512_set1_ps(-1.25e+00f), tmp14013);
in1980 = _mm512_sub_ps(in1980, in1982);
in1988 = _mm512_sub_ps(in1988, in1990);
in1980 = _mm512_fmadd_ps(in1980, _mm512_set1_ps(5.25e+00f), in1978);
in1988 = _mm512_fmadd_ps(in1988, _mm512_set1_ps(5.25e+00f), in1986);
in1979 = _mm512_fmadd_ps(tmp14009, _mm512_set1_ps(2e+00f), in1981);
in1987 = _mm512_fmadd_ps(tmp14013, _mm512_set1_ps(2e+00f), in1989);
in1981 = _mm512_fnmadd_ps(tmp14009, _mm512_set1_ps(2e+00f), in1981);
in1989 = _mm512_fnmadd_ps(tmp14013, _mm512_set1_ps(2e+00f), in1989);
// Transpose stage between the two transform passes, built from three layers
// of shuffles over the sixteen first-pass results
// (set A: in1977, tmp14011, tmp14012, in1983, tmp14010, in1979, in1981, in1980;
//  set B: in1985, tmp14015, tmp14016, in1991, tmp14014, in1987, in1989, in1988).
//  1. unpacklo/unpackhi interleave two results element-wise inside each
//     128-bit lane.
//  2. shuffle_ps imm 68 (elements 0,1 of each operand) / 238 (elements 2,3)
//     gathers the same element position of four results into a 128-bit lane.
//  3. shuffle_f32x4 imm 136 (128-bit lanes 0,2 of each operand) / 221
//     (lanes 1,3), applied twice, regroups the 128-bit lanes.
// Net effect: each final vector holds one source lane position k of all
// eight set-A results in its lanes 0..7 and of all eight set-B results in its
// lanes 8..15. Source lanes 0..7 land in in1977, tmp14011, tmp14012, in1983,
// tmp14010, in1979, in1981, in1980 (k = 0..7 in that order); source lanes
// 8..15 land in in1985, tmp14015, tmp14016, in1991, tmp14014, in1987, in1989,
// in1988.
__m512 tmp14025 = _mm512_unpacklo_ps(in1977, tmp14011);
__m512 tmp14026 = _mm512_unpackhi_ps(in1977, tmp14011);
__m512 tmp14027 = _mm512_unpacklo_ps(tmp14012, in1983);
__m512 tmp14028 = _mm512_unpackhi_ps(tmp14012, in1983);
__m512 tmp14029 = _mm512_unpacklo_ps(tmp14010, in1979);
__m512 tmp14030 = _mm512_unpackhi_ps(tmp14010, in1979);
__m512 tmp14031 = _mm512_unpacklo_ps(in1981, in1980);
__m512 tmp14032 = _mm512_unpackhi_ps(in1981, in1980);
__m512 tmp14033 = _mm512_unpacklo_ps(in1985, tmp14015);
__m512 tmp14034 = _mm512_unpackhi_ps(in1985, tmp14015);
__m512 tmp14035 = _mm512_unpacklo_ps(tmp14016, in1991);
__m512 tmp14036 = _mm512_unpackhi_ps(tmp14016, in1991);
__m512 tmp14037 = _mm512_unpacklo_ps(tmp14014, in1987);
__m512 tmp14038 = _mm512_unpackhi_ps(tmp14014, in1987);
__m512 tmp14039 = _mm512_unpacklo_ps(in1989, in1988);
__m512 tmp14040 = _mm512_unpackhi_ps(in1989, in1988);
__m512 tmp14041 = _mm512_shuffle_ps(tmp14025, tmp14027, 68);
__m512 tmp14042 = _mm512_shuffle_ps(tmp14025, tmp14027, 238);
__m512 tmp14043 = _mm512_shuffle_ps(tmp14026, tmp14028, 68);
__m512 tmp14044 = _mm512_shuffle_ps(tmp14026, tmp14028, 238);
__m512 tmp14045 = _mm512_shuffle_ps(tmp14029, tmp14031, 68);
__m512 tmp14046 = _mm512_shuffle_ps(tmp14029, tmp14031, 238);
__m512 tmp14047 = _mm512_shuffle_ps(tmp14030, tmp14032, 68);
__m512 tmp14048 = _mm512_shuffle_ps(tmp14030, tmp14032, 238);
__m512 tmp14049 = _mm512_shuffle_ps(tmp14033, tmp14035, 68);
__m512 tmp14050 = _mm512_shuffle_ps(tmp14033, tmp14035, 238);
__m512 tmp14051 = _mm512_shuffle_ps(tmp14034, tmp14036, 68);
__m512 tmp14052 = _mm512_shuffle_ps(tmp14034, tmp14036, 238);
__m512 tmp14053 = _mm512_shuffle_ps(tmp14037, tmp14039, 68);
__m512 tmp14054 = _mm512_shuffle_ps(tmp14037, tmp14039, 238);
__m512 tmp14055 = _mm512_shuffle_ps(tmp14038, tmp14040, 68);
__m512 tmp14056 = _mm512_shuffle_ps(tmp14038, tmp14040, 238);
__m512 tmp14057 = _mm512_shuffle_f32x4(tmp14041, tmp14045, 136);
__m512 tmp14058 = _mm512_shuffle_f32x4(tmp14041, tmp14045, 221);
__m512 tmp14059 = _mm512_shuffle_f32x4(tmp14042, tmp14046, 136);
__m512 tmp14060 = _mm512_shuffle_f32x4(tmp14042, tmp14046, 221);
__m512 tmp14061 = _mm512_shuffle_f32x4(tmp14043, tmp14047, 136);
__m512 tmp14062 = _mm512_shuffle_f32x4(tmp14043, tmp14047, 221);
__m512 tmp14063 = _mm512_shuffle_f32x4(tmp14044, tmp14048, 136);
__m512 tmp14064 = _mm512_shuffle_f32x4(tmp14044, tmp14048, 221);
__m512 tmp14065 = _mm512_shuffle_f32x4(tmp14049, tmp14053, 136);
__m512 tmp14066 = _mm512_shuffle_f32x4(tmp14049, tmp14053, 221);
__m512 tmp14067 = _mm512_shuffle_f32x4(tmp14050, tmp14054, 136);
__m512 tmp14068 = _mm512_shuffle_f32x4(tmp14050, tmp14054, 221);
__m512 tmp14069 = _mm512_shuffle_f32x4(tmp14051, tmp14055, 136);
__m512 tmp14070 = _mm512_shuffle_f32x4(tmp14051, tmp14055, 221);
__m512 tmp14071 = _mm512_shuffle_f32x4(tmp14052, tmp14056, 136);
__m512 tmp14072 = _mm512_shuffle_f32x4(tmp14052, tmp14056, 221);
in1977 = _mm512_shuffle_f32x4(tmp14057, tmp14065, 136);
in1985 = _mm512_shuffle_f32x4(tmp14057, tmp14065, 221);
tmp14011 = _mm512_shuffle_f32x4(tmp14059, tmp14067, 136);
tmp14015 = _mm512_shuffle_f32x4(tmp14059, tmp14067, 221);
tmp14012 = _mm512_shuffle_f32x4(tmp14061, tmp14069, 136);
tmp14016 = _mm512_shuffle_f32x4(tmp14061, tmp14069, 221);
in1983 = _mm512_shuffle_f32x4(tmp14063, tmp14071, 136);
in1991 = _mm512_shuffle_f32x4(tmp14063, tmp14071, 221);
tmp14010 = _mm512_shuffle_f32x4(tmp14058, tmp14066, 136);
tmp14014 = _mm512_shuffle_f32x4(tmp14058, tmp14066, 221);
in1979 = _mm512_shuffle_f32x4(tmp14060, tmp14068, 136);
in1987 = _mm512_shuffle_f32x4(tmp14060, tmp14068, 221);
in1981 = _mm512_shuffle_f32x4(tmp14062, tmp14070, 136);
in1989 = _mm512_shuffle_f32x4(tmp14062, tmp14070, 221);
in1980 = _mm512_shuffle_f32x4(tmp14064, tmp14072, 136);
in1988 = _mm512_shuffle_f32x4(tmp14064, tmp14072, 221);
// Second transform pass: the same lane-wise 8-input / 8-output linear
// transform, applied to two independent groups on alternating lines.
// Group 1 inputs on entry (c0..c7):
//   in1977, tmp14011, tmp14012, in1983, tmp14010, in1979, in1981, in1980
// Group 2 inputs on entry:
//   in1985, tmp14015, tmp14016, in1991, tmp14014, in1987, in1989, in1988
// Algebraic results for group 1 (group 2 is analogous, landing in in1985,
// tmp14023, tmp14024, in1989, tmp14022, tmp14016, tmp14014, in1991):
//   in1977   = c0 - c6 + 5.25*(c4 - c2)
//   tmp14019 =  c1 + c2 - 4.25*c3 - 4.25*c4 + c5 + c6
//   tmp14020 = -c1 + c2 + 4.25*c3 - 4.25*c4 - c5 + c6
//   in1981   =  0.5*c1 + 0.25*c2 - 2.5*c3 - 1.25*c4 + 2*c5 + c6
//   tmp14018 = -0.5*c1 + 0.25*c2 + 2.5*c3 - 1.25*c4 - 2*c5 + c6
//   tmp14012 =  2*c1 + 4*c2 - 2.5*c3 - 5*c4 + 0.5*c5 + c6
//   tmp14010 = -2*c1 + 4*c2 + 2.5*c3 - 5*c4 - 0.5*c5 + c6
//   in1983   = -c1 + 5.25*(c3 - c5) + c7
// tmp14017/tmp14021 are scratch accumulators that are reused three times.
// Inputs are overwritten in place, so the statement order must be kept.
__m512 tmp14017 = _mm512_add_ps(tmp14011, in1979);
__m512 tmp14021 = _mm512_add_ps(tmp14015, in1987);
__m512 tmp14018 = _mm512_sub_ps(tmp14010, tmp14012);
__m512 tmp14022 = _mm512_sub_ps(tmp14014, tmp14016);
__m512 tmp14019 = _mm512_add_ps(tmp14012, in1981);
__m512 tmp14023 = _mm512_add_ps(tmp14016, in1989);
in1977 = _mm512_sub_ps(in1977, in1981);
in1985 = _mm512_sub_ps(in1985, in1989);
tmp14017 = _mm512_fmadd_ps(in1983, _mm512_set1_ps(-4.25e+00f), tmp14017);
tmp14021 = _mm512_fmadd_ps(in1991, _mm512_set1_ps(-4.25e+00f), tmp14021);
tmp14019 = _mm512_fmadd_ps(tmp14010, _mm512_set1_ps(-4.25e+00f), tmp14019);
tmp14023 = _mm512_fmadd_ps(tmp14014, _mm512_set1_ps(-4.25e+00f), tmp14023);
in1977 = _mm512_fmadd_ps(tmp14018, _mm512_set1_ps(5.25e+00f), in1977);
in1985 = _mm512_fmadd_ps(tmp14022, _mm512_set1_ps(5.25e+00f), in1985);
tmp14018 = _mm512_fmadd_ps(tmp14012, _mm512_set1_ps(2.5e-01f), in1981);
tmp14022 = _mm512_fmadd_ps(tmp14016, _mm512_set1_ps(2.5e-01f), in1989);
tmp14012 = _mm512_fmadd_ps(tmp14012, _mm512_set1_ps(4e+00f), in1981);
tmp14016 = _mm512_fmadd_ps(tmp14016, _mm512_set1_ps(4e+00f), in1989);
__m512 tmp14020 = _mm512_sub_ps(tmp14019, tmp14017);
__m512 tmp14024 = _mm512_sub_ps(tmp14023, tmp14021);
tmp14019 = _mm512_add_ps(tmp14017, tmp14019);
tmp14023 = _mm512_add_ps(tmp14021, tmp14023);
tmp14017 = _mm512_fmadd_ps(tmp14011, _mm512_set1_ps(2.5e-01f), in1979);
tmp14021 = _mm512_fmadd_ps(tmp14015, _mm512_set1_ps(2.5e-01f), in1987);
tmp14018 = _mm512_fmadd_ps(tmp14010, _mm512_set1_ps(-1.25e+00f), tmp14018);
tmp14022 = _mm512_fmadd_ps(tmp14014, _mm512_set1_ps(-1.25e+00f), tmp14022);
tmp14010 = _mm512_fmadd_ps(tmp14010, _mm512_set1_ps(-5e+00f), tmp14012);
tmp14014 = _mm512_fmadd_ps(tmp14014, _mm512_set1_ps(-5e+00f), tmp14016);
tmp14017 = _mm512_fmadd_ps(in1983, _mm512_set1_ps(-1.25e+00f), tmp14017);
tmp14021 = _mm512_fmadd_ps(in1991, _mm512_set1_ps(-1.25e+00f), tmp14021);
in1981 = _mm512_fmadd_ps(tmp14017, _mm512_set1_ps(2e+00f), tmp14018);
in1989 = _mm512_fmadd_ps(tmp14021, _mm512_set1_ps(2e+00f), tmp14022);
tmp14018 = _mm512_fnmadd_ps(tmp14017, _mm512_set1_ps(2e+00f), tmp14018);
tmp14022 = _mm512_fnmadd_ps(tmp14021, _mm512_set1_ps(2e+00f), tmp14022);
tmp14017 = _mm512_fmadd_ps(in1979, _mm512_set1_ps(2.5e-01f), tmp14011);
tmp14021 = _mm512_fmadd_ps(in1987, _mm512_set1_ps(2.5e-01f), tmp14015);
tmp14011 = _mm512_sub_ps(in1980, tmp14011);
tmp14015 = _mm512_sub_ps(in1988, tmp14015);
tmp14017 = _mm512_fmadd_ps(in1983, _mm512_set1_ps(-1.25e+00f), tmp14017);
tmp14021 = _mm512_fmadd_ps(in1991, _mm512_set1_ps(-1.25e+00f), tmp14021);
in1983 = _mm512_sub_ps(in1983, in1979);
in1991 = _mm512_sub_ps(in1991, in1987);
in1983 = _mm512_fmadd_ps(in1983, _mm512_set1_ps(5.25e+00f), tmp14011);
in1991 = _mm512_fmadd_ps(in1991, _mm512_set1_ps(5.25e+00f), tmp14015);
tmp14012 = _mm512_fmadd_ps(tmp14017, _mm512_set1_ps(2e+00f), tmp14010);
tmp14016 = _mm512_fmadd_ps(tmp14021, _mm512_set1_ps(2e+00f), tmp14014);
tmp14010 = _mm512_fnmadd_ps(tmp14017, _mm512_set1_ps(2e+00f), tmp14010);
tmp14014 = _mm512_fnmadd_ps(tmp14021, _mm512_set1_ps(2e+00f), tmp14014);
// Pack the sixteen second-pass results into sixteen output vectors and store
// them through dfPtr10.
// _mm512_shuffle_f32x4 imm 68 concatenates lanes 0..7 of both operands;
// imm 238 concatenates lanes 8..15 of both operands. Each out vector thus
// pairs the same half of two consecutive results:
//   out1823/out1831 <- (in1977, tmp14019)    out1827/out1835 <- (in1985, tmp14023)
//   out1824/out1832 <- (tmp14020, in1981)    out1828/out1836 <- (tmp14024, in1989)
//   out1825/out1833 <- (tmp14018, tmp14012)  out1829/out1837 <- (tmp14022, tmp14016)
//   out1826/out1834 <- (tmp14010, in1983)    out1830/out1838 <- (tmp14014, in1991)
// Store offsets: four regions at +0, +204800, +409600, +614400; inside each
// region the vectors go to +256, +320, +384, +448 in the order
// (low halves group 1, low halves group 2, high halves group 1, high halves
// group 2). i45, j38, s36 and k130 come from the enclosing code (not visible
// here).
// NOTE(review): the 64-unit spacing suggests dfPtr10 is a byte pointer and
// the four slots are contiguous 64-byte vectors -- confirm at its declaration.
__m512 out1823 = _mm512_shuffle_f32x4(in1977, tmp14019, 68);
__m512 out1831 = _mm512_shuffle_f32x4(in1977, tmp14019, 238);
__m512 out1824 = _mm512_shuffle_f32x4(tmp14020, in1981, 68);
__m512 out1832 = _mm512_shuffle_f32x4(tmp14020, in1981, 238);
__m512 out1825 = _mm512_shuffle_f32x4(tmp14018, tmp14012, 68);
__m512 out1833 = _mm512_shuffle_f32x4(tmp14018, tmp14012, 238);
__m512 out1826 = _mm512_shuffle_f32x4(tmp14010, in1983, 68);
__m512 out1834 = _mm512_shuffle_f32x4(tmp14010, in1983, 238);
__m512 out1827 = _mm512_shuffle_f32x4(in1985, tmp14023, 68);
__m512 out1835 = _mm512_shuffle_f32x4(in1985, tmp14023, 238);
__m512 out1828 = _mm512_shuffle_f32x4(tmp14024, in1989, 68);
__m512 out1836 = _mm512_shuffle_f32x4(tmp14024, in1989, 238);
__m512 out1829 = _mm512_shuffle_f32x4(tmp14022, tmp14016, 68);
__m512 out1837 = _mm512_shuffle_f32x4(tmp14022, tmp14016, 238);
__m512 out1830 = _mm512_shuffle_f32x4(tmp14014, in1991, 68);
__m512 out1838 = _mm512_shuffle_f32x4(tmp14014, in1991, 238);
_mm512_storeu_ps(dfPtr10+256+819200*i45+49152*j38+49152*s36+768*k130, out1823);
_mm512_storeu_ps(dfPtr10+384+819200*i45+49152*j38+49152*s36+768*k130, out1831);
_mm512_storeu_ps(dfPtr10+320+819200*i45+49152*j38+49152*s36+768*k130, out1827);
_mm512_storeu_ps(dfPtr10+448+819200*i45+49152*j38+49152*s36+768*k130, out1835);
_mm512_storeu_ps(dfPtr10+205056+819200*i45+49152*j38+49152*s36+768*k130, out1824);
_mm512_storeu_ps(dfPtr10+205184+819200*i45+49152*j38+49152*s36+768*k130, out1832);
_mm512_storeu_ps(dfPtr10+205120+819200*i45+49152*j38+49152*s36+768*k130, out1828);
_mm512_storeu_ps(dfPtr10+205248+819200*i45+49152*j38+49152*s36+768*k130, out1836);
_mm512_storeu_ps(dfPtr10+409856+819200*i45+49152*j38+49152*s36+768*k130, out1825);
_mm512_storeu_ps(dfPtr10+409984+819200*i45+49152*j38+49152*s36+768*k130, out1833);
_mm512_storeu_ps(dfPtr10+409920+819200*i45+49152*j38+49152*s36+768*k130, out1829);
_mm512_storeu_ps(dfPtr10+410048+819200*i45+49152*j38+49152*s36+768*k130, out1837);
_mm512_storeu_ps(dfPtr10+614656+819200*i45+49152*j38+49152*s36+768*k130, out1826);
_mm512_storeu_ps(dfPtr10+614784+819200*i45+49152*j38+49152*s36+768*k130, out1834);
_mm512_storeu_ps(dfPtr10+614720+819200*i45+49152*j38+49152*s36+768*k130, out1830);
_mm512_storeu_ps(dfPtr10+614848+819200*i45+49152*j38+49152*s36+768*k130, out1838);
// Load two sets of eight vectors from datPtr23 and rearrange their lanes.
// Set 1 (in1993..in2000): base offsets 3184, 3296, ..., 3968 (step 112),
//   mask 2047 keeps lanes 0..10, all other lanes are zeroed by maskz.
// Set 2 (in2001..in2008): base offsets 3788, 3900, ..., 4572 (step 112),
//   mask 8191 keeps lanes 0..12.
// _mm512_set_epi32 lists indices from lane 15 down to lane 0, so:
//   pm199: lanes 0..7 <- source 0..7, lanes 8..12 <- source 6..10,
//          lanes 13..15 <- source lane 15 (always zero under mask 2047).
//   pm200: lane 0 <- source lane 15 (always zero under mask 8191),
//          lanes 1..7 <- source 0..6, lanes 8..15 <- source 5..12.
// Each permuted vector therefore holds two 8-lane windows of the loaded
// data whose start positions differ by 6 source elements (2-element overlap).
// NOTE(review): the zero lanes presumably supply padding at image edges,
// and the 112 step presumably is one image row if datPtr23 is a byte
// pointer -- confirm against the enclosing loop and pointer declaration.
__m512 dat2031 = _mm512_maskz_loadu_ps(2047, datPtr23+3184+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2032 = _mm512_maskz_loadu_ps(8191, datPtr23+3788+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512i pm199 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1993 = _mm512_permutexvar_ps(pm199, dat2031);
__m512i pm200 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in2001 = _mm512_permutexvar_ps(pm200, dat2032);
__m512 dat2033 = _mm512_maskz_loadu_ps(2047, datPtr23+3296+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2034 = _mm512_maskz_loadu_ps(8191, datPtr23+3900+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1994 = _mm512_permutexvar_ps(pm199, dat2033);
__m512 in2002 = _mm512_permutexvar_ps(pm200, dat2034);
__m512 dat2035 = _mm512_maskz_loadu_ps(2047, datPtr23+3408+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2036 = _mm512_maskz_loadu_ps(8191, datPtr23+4012+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1995 = _mm512_permutexvar_ps(pm199, dat2035);
__m512 in2003 = _mm512_permutexvar_ps(pm200, dat2036);
__m512 dat2037 = _mm512_maskz_loadu_ps(2047, datPtr23+3520+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2038 = _mm512_maskz_loadu_ps(8191, datPtr23+4124+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1996 = _mm512_permutexvar_ps(pm199, dat2037);
__m512 in2004 = _mm512_permutexvar_ps(pm200, dat2038);
__m512 dat2039 = _mm512_maskz_loadu_ps(2047, datPtr23+3632+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2040 = _mm512_maskz_loadu_ps(8191, datPtr23+4236+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1997 = _mm512_permutexvar_ps(pm199, dat2039);
__m512 in2005 = _mm512_permutexvar_ps(pm200, dat2040);
__m512 dat2041 = _mm512_maskz_loadu_ps(2047, datPtr23+3744+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2042 = _mm512_maskz_loadu_ps(8191, datPtr23+4348+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1998 = _mm512_permutexvar_ps(pm199, dat2041);
__m512 in2006 = _mm512_permutexvar_ps(pm200, dat2042);
__m512 dat2043 = _mm512_maskz_loadu_ps(2047, datPtr23+3856+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2044 = _mm512_maskz_loadu_ps(8191, datPtr23+4460+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in1999 = _mm512_permutexvar_ps(pm199, dat2043);
__m512 in2007 = _mm512_permutexvar_ps(pm200, dat2044);
__m512 dat2045 = _mm512_maskz_loadu_ps(2047, datPtr23+3968+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 dat2046 = _mm512_maskz_loadu_ps(8191, datPtr23+4572+401408*i45+112*h46+4*w58+401408*s36+6272*k130);
__m512 in2000 = _mm512_permutexvar_ps(pm199, dat2045);
__m512 in2008 = _mm512_permutexvar_ps(pm200, dat2046);
// First transform pass: a lane-wise 8-input / 8-output linear transform
// across the eight loaded vectors of each set; the two sets are processed
// in lock step on alternating lines.
// With d0..d7 = in1993..in2000 on entry, the algebraic results are
// (the set in2001..in2008 is analogous, landing in in2001, tmp14079,
// tmp14080, in2007, tmp14078, in2003, in2005, in2004):
//   in1993   = d0 - d6 + 5.25*(d4 - d2)
//   tmp14075 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
//   tmp14076 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
//   in1999   =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
//   tmp14074 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
//   in1995   =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
//   in1997   = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
//   in1996   = -d1 + 5.25*(d3 - d5) + d7
// tmp14073/tmp14077 are scratch accumulators reused three times, and
// in1994/in2002 end up holding the intermediate d7 - d1.
// Inputs are overwritten in place, so the statement order must be kept.
// NOTE(review): the coefficients look like the 8-point Winograd F(6,3)
// input-transform matrix -- confirm against the generator.
__m512 tmp14073 = _mm512_add_ps(in1994, in1998);
__m512 tmp14077 = _mm512_add_ps(in2002, in2006);
__m512 tmp14074 = _mm512_sub_ps(in1997, in1995);
__m512 tmp14078 = _mm512_sub_ps(in2005, in2003);
__m512 tmp14075 = _mm512_add_ps(in1995, in1999);
__m512 tmp14079 = _mm512_add_ps(in2003, in2007);
in1993 = _mm512_sub_ps(in1993, in1999);
in2001 = _mm512_sub_ps(in2001, in2007);
tmp14073 = _mm512_fmadd_ps(in1996, _mm512_set1_ps(-4.25e+00f), tmp14073);
tmp14077 = _mm512_fmadd_ps(in2004, _mm512_set1_ps(-4.25e+00f), tmp14077);
tmp14075 = _mm512_fmadd_ps(in1997, _mm512_set1_ps(-4.25e+00f), tmp14075);
tmp14079 = _mm512_fmadd_ps(in2005, _mm512_set1_ps(-4.25e+00f), tmp14079);
in1993 = _mm512_fmadd_ps(tmp14074, _mm512_set1_ps(5.25e+00f), in1993);
in2001 = _mm512_fmadd_ps(tmp14078, _mm512_set1_ps(5.25e+00f), in2001);
tmp14074 = _mm512_fmadd_ps(in1995, _mm512_set1_ps(2.5e-01f), in1999);
tmp14078 = _mm512_fmadd_ps(in2003, _mm512_set1_ps(2.5e-01f), in2007);
in1995 = _mm512_fmadd_ps(in1995, _mm512_set1_ps(4e+00f), in1999);
in2003 = _mm512_fmadd_ps(in2003, _mm512_set1_ps(4e+00f), in2007);
__m512 tmp14076 = _mm512_sub_ps(tmp14075, tmp14073);
__m512 tmp14080 = _mm512_sub_ps(tmp14079, tmp14077);
tmp14075 = _mm512_add_ps(tmp14073, tmp14075);
tmp14079 = _mm512_add_ps(tmp14077, tmp14079);
tmp14073 = _mm512_fmadd_ps(in1994, _mm512_set1_ps(2.5e-01f), in1998);
tmp14077 = _mm512_fmadd_ps(in2002, _mm512_set1_ps(2.5e-01f), in2006);
tmp14074 = _mm512_fmadd_ps(in1997, _mm512_set1_ps(-1.25e+00f), tmp14074);
tmp14078 = _mm512_fmadd_ps(in2005, _mm512_set1_ps(-1.25e+00f), tmp14078);
in1997 = _mm512_fmadd_ps(in1997, _mm512_set1_ps(-5e+00f), in1995);
in2005 = _mm512_fmadd_ps(in2005, _mm512_set1_ps(-5e+00f), in2003);
tmp14073 = _mm512_fmadd_ps(in1996, _mm512_set1_ps(-1.25e+00f), tmp14073);
tmp14077 = _mm512_fmadd_ps(in2004, _mm512_set1_ps(-1.25e+00f), tmp14077);
in1999 = _mm512_fmadd_ps(tmp14073, _mm512_set1_ps(2e+00f), tmp14074);
in2007 = _mm512_fmadd_ps(tmp14077, _mm512_set1_ps(2e+00f), tmp14078);
tmp14074 = _mm512_fnmadd_ps(tmp14073, _mm512_set1_ps(2e+00f), tmp14074);
tmp14078 = _mm512_fnmadd_ps(tmp14077, _mm512_set1_ps(2e+00f), tmp14078);
tmp14073 = _mm512_fmadd_ps(in1998, _mm512_set1_ps(2.5e-01f), in1994);
tmp14077 = _mm512_fmadd_ps(in2006, _mm512_set1_ps(2.5e-01f), in2002);
in1994 = _mm512_sub_ps(in2000, in1994);
in2002 = _mm512_sub_ps(in2008, in2002);
tmp14073 = _mm512_fmadd_ps(in1996, _mm512_set1_ps(-1.25e+00f), tmp14073);
tmp14077 = _mm512_fmadd_ps(in2004, _mm512_set1_ps(-1.25e+00f), tmp14077);
in1996 = _mm512_sub_ps(in1996, in1998);
in2004 = _mm512_sub_ps(in2004, in2006);
in1996 = _mm512_fmadd_ps(in1996, _mm512_set1_ps(5.25e+00f), in1994);
in2004 = _mm512_fmadd_ps(in2004, _mm512_set1_ps(5.25e+00f), in2002);
in1995 = _mm512_fmadd_ps(tmp14073, _mm512_set1_ps(2e+00f), in1997);
in2003 = _mm512_fmadd_ps(tmp14077, _mm512_set1_ps(2e+00f), in2005);
in1997 = _mm512_fnmadd_ps(tmp14073, _mm512_set1_ps(2e+00f), in1997);
in2005 = _mm512_fnmadd_ps(tmp14077, _mm512_set1_ps(2e+00f), in2005);
// Transpose stage between the two transform passes, built from three layers
// of shuffles over the sixteen first-pass results
// (set A: in1993, tmp14075, tmp14076, in1999, tmp14074, in1995, in1997, in1996;
//  set B: in2001, tmp14079, tmp14080, in2007, tmp14078, in2003, in2005, in2004).
//  1. unpacklo/unpackhi interleave two results element-wise inside each
//     128-bit lane.
//  2. shuffle_ps imm 68 (elements 0,1 of each operand) / 238 (elements 2,3)
//     gathers the same element position of four results into a 128-bit lane.
//  3. shuffle_f32x4 imm 136 (128-bit lanes 0,2 of each operand) / 221
//     (lanes 1,3), applied twice, regroups the 128-bit lanes.
// Net effect: each final vector holds one source lane position k of all
// eight set-A results in its lanes 0..7 and of all eight set-B results in its
// lanes 8..15. Source lanes 0..7 land in in1993, tmp14075, tmp14076, in1999,
// tmp14074, in1995, in1997, in1996 (k = 0..7 in that order); source lanes
// 8..15 land in in2001, tmp14079, tmp14080, in2007, tmp14078, in2003, in2005,
// in2004.
__m512 tmp14089 = _mm512_unpacklo_ps(in1993, tmp14075);
__m512 tmp14090 = _mm512_unpackhi_ps(in1993, tmp14075);
__m512 tmp14091 = _mm512_unpacklo_ps(tmp14076, in1999);
__m512 tmp14092 = _mm512_unpackhi_ps(tmp14076, in1999);
__m512 tmp14093 = _mm512_unpacklo_ps(tmp14074, in1995);
__m512 tmp14094 = _mm512_unpackhi_ps(tmp14074, in1995);
__m512 tmp14095 = _mm512_unpacklo_ps(in1997, in1996);
__m512 tmp14096 = _mm512_unpackhi_ps(in1997, in1996);
__m512 tmp14097 = _mm512_unpacklo_ps(in2001, tmp14079);
__m512 tmp14098 = _mm512_unpackhi_ps(in2001, tmp14079);
__m512 tmp14099 = _mm512_unpacklo_ps(tmp14080, in2007);
__m512 tmp14100 = _mm512_unpackhi_ps(tmp14080, in2007);
__m512 tmp14101 = _mm512_unpacklo_ps(tmp14078, in2003);
__m512 tmp14102 = _mm512_unpackhi_ps(tmp14078, in2003);
__m512 tmp14103 = _mm512_unpacklo_ps(in2005, in2004);
__m512 tmp14104 = _mm512_unpackhi_ps(in2005, in2004);
__m512 tmp14105 = _mm512_shuffle_ps(tmp14089, tmp14091, 68);
__m512 tmp14106 = _mm512_shuffle_ps(tmp14089, tmp14091, 238);
__m512 tmp14107 = _mm512_shuffle_ps(tmp14090, tmp14092, 68);
__m512 tmp14108 = _mm512_shuffle_ps(tmp14090, tmp14092, 238);
__m512 tmp14109 = _mm512_shuffle_ps(tmp14093, tmp14095, 68);
__m512 tmp14110 = _mm512_shuffle_ps(tmp14093, tmp14095, 238);
__m512 tmp14111 = _mm512_shuffle_ps(tmp14094, tmp14096, 68);
__m512 tmp14112 = _mm512_shuffle_ps(tmp14094, tmp14096, 238);
__m512 tmp14113 = _mm512_shuffle_ps(tmp14097, tmp14099, 68);
__m512 tmp14114 = _mm512_shuffle_ps(tmp14097, tmp14099, 238);
__m512 tmp14115 = _mm512_shuffle_ps(tmp14098, tmp14100, 68);
__m512 tmp14116 = _mm512_shuffle_ps(tmp14098, tmp14100, 238);
__m512 tmp14117 = _mm512_shuffle_ps(tmp14101, tmp14103, 68);
__m512 tmp14118 = _mm512_shuffle_ps(tmp14101, tmp14103, 238);
__m512 tmp14119 = _mm512_shuffle_ps(tmp14102, tmp14104, 68);
__m512 tmp14120 = _mm512_shuffle_ps(tmp14102, tmp14104, 238);
__m512 tmp14121 = _mm512_shuffle_f32x4(tmp14105, tmp14109, 136);
__m512 tmp14122 = _mm512_shuffle_f32x4(tmp14105, tmp14109, 221);
__m512 tmp14123 = _mm512_shuffle_f32x4(tmp14106, tmp14110, 136);
__m512 tmp14124 = _mm512_shuffle_f32x4(tmp14106, tmp14110, 221);
__m512 tmp14125 = _mm512_shuffle_f32x4(tmp14107, tmp14111, 136);
__m512 tmp14126 = _mm512_shuffle_f32x4(tmp14107, tmp14111, 221);
__m512 tmp14127 = _mm512_shuffle_f32x4(tmp14108, tmp14112, 136);
__m512 tmp14128 = _mm512_shuffle_f32x4(tmp14108, tmp14112, 221);
__m512 tmp14129 = _mm512_shuffle_f32x4(tmp14113, tmp14117, 136);
__m512 tmp14130 = _mm512_shuffle_f32x4(tmp14113, tmp14117, 221);
__m512 tmp14131 = _mm512_shuffle_f32x4(tmp14114, tmp14118, 136);
__m512 tmp14132 = _mm512_shuffle_f32x4(tmp14114, tmp14118, 221);
__m512 tmp14133 = _mm512_shuffle_f32x4(tmp14115, tmp14119, 136);
__m512 tmp14134 = _mm512_shuffle_f32x4(tmp14115, tmp14119, 221);
__m512 tmp14135 = _mm512_shuffle_f32x4(tmp14116, tmp14120, 136);
__m512 tmp14136 = _mm512_shuffle_f32x4(tmp14116, tmp14120, 221);
in1993 = _mm512_shuffle_f32x4(tmp14121, tmp14129, 136);
in2001 = _mm512_shuffle_f32x4(tmp14121, tmp14129, 221);
tmp14075 = _mm512_shuffle_f32x4(tmp14123, tmp14131, 136);
tmp14079 = _mm512_shuffle_f32x4(tmp14123, tmp14131, 221);
tmp14076 = _mm512_shuffle_f32x4(tmp14125, tmp14133, 136);
tmp14080 = _mm512_shuffle_f32x4(tmp14125, tmp14133, 221);
in1999 = _mm512_shuffle_f32x4(tmp14127, tmp14135, 136);
in2007 = _mm512_shuffle_f32x4(tmp14127, tmp14135, 221);
tmp14074 = _mm512_shuffle_f32x4(tmp14122, tmp14130, 136);
tmp14078 = _mm512_shuffle_f32x4(tmp14122, tmp14130, 221);
in1995 = _mm512_shuffle_f32x4(tmp14124, tmp14132, 136);
in2003 = _mm512_shuffle_f32x4(tmp14124, tmp14132, 221);
in1997 = _mm512_shuffle_f32x4(tmp14126, tmp14134, 136);
in2005 = _mm512_shuffle_f32x4(tmp14126, tmp14134, 221);
in1996 = _mm512_shuffle_f32x4(tmp14128, tmp14136, 136);
in2004 = _mm512_shuffle_f32x4(tmp14128, tmp14136, 221);
// Second transform pass: the same lane-wise 8-input / 8-output linear
// transform, applied to two independent groups on alternating lines.
// Group 1 inputs on entry (c0..c7):
//   in1993, tmp14075, tmp14076, in1999, tmp14074, in1995, in1997, in1996
// Group 2 inputs on entry:
//   in2001, tmp14079, tmp14080, in2007, tmp14078, in2003, in2005, in2004
// Algebraic results for group 1 (group 2 is analogous, landing in in2001,
// tmp14087, tmp14088, in2005, tmp14086, tmp14080, tmp14078, in2007):
//   in1993   = c0 - c6 + 5.25*(c4 - c2)
//   tmp14083 =  c1 + c2 - 4.25*c3 - 4.25*c4 + c5 + c6
//   tmp14084 = -c1 + c2 + 4.25*c3 - 4.25*c4 - c5 + c6
//   in1997   =  0.5*c1 + 0.25*c2 - 2.5*c3 - 1.25*c4 + 2*c5 + c6
//   tmp14082 = -0.5*c1 + 0.25*c2 + 2.5*c3 - 1.25*c4 - 2*c5 + c6
//   tmp14076 =  2*c1 + 4*c2 - 2.5*c3 - 5*c4 + 0.5*c5 + c6
//   tmp14074 = -2*c1 + 4*c2 + 2.5*c3 - 5*c4 - 0.5*c5 + c6
//   in1999   = -c1 + 5.25*(c3 - c5) + c7
// tmp14081/tmp14085 are scratch accumulators that are reused three times.
// Inputs are overwritten in place, so the statement order must be kept.
__m512 tmp14081 = _mm512_add_ps(tmp14075, in1995);
__m512 tmp14085 = _mm512_add_ps(tmp14079, in2003);
__m512 tmp14082 = _mm512_sub_ps(tmp14074, tmp14076);
__m512 tmp14086 = _mm512_sub_ps(tmp14078, tmp14080);
__m512 tmp14083 = _mm512_add_ps(tmp14076, in1997);
__m512 tmp14087 = _mm512_add_ps(tmp14080, in2005);
in1993 = _mm512_sub_ps(in1993, in1997);
in2001 = _mm512_sub_ps(in2001, in2005);
tmp14081 = _mm512_fmadd_ps(in1999, _mm512_set1_ps(-4.25e+00f), tmp14081);
tmp14085 = _mm512_fmadd_ps(in2007, _mm512_set1_ps(-4.25e+00f), tmp14085);
tmp14083 = _mm512_fmadd_ps(tmp14074, _mm512_set1_ps(-4.25e+00f), tmp14083);
tmp14087 = _mm512_fmadd_ps(tmp14078, _mm512_set1_ps(-4.25e+00f), tmp14087);
in1993 = _mm512_fmadd_ps(tmp14082, _mm512_set1_ps(5.25e+00f), in1993);
in2001 = _mm512_fmadd_ps(tmp14086, _mm512_set1_ps(5.25e+00f), in2001);
tmp14082 = _mm512_fmadd_ps(tmp14076, _mm512_set1_ps(2.5e-01f), in1997);
tmp14086 = _mm512_fmadd_ps(tmp14080, _mm512_set1_ps(2.5e-01f), in2005);
tmp14076 = _mm512_fmadd_ps(tmp14076, _mm512_set1_ps(4e+00f), in1997);
tmp14080 = _mm512_fmadd_ps(tmp14080, _mm512_set1_ps(4e+00f), in2005);
__m512 tmp14084 = _mm512_sub_ps(tmp14083, tmp14081);
__m512 tmp14088 = _mm512_sub_ps(tmp14087, tmp14085);
tmp14083 = _mm512_add_ps(tmp14081, tmp14083);
tmp14087 = _mm512_add_ps(tmp14085, tmp14087);
tmp14081 = _mm512_fmadd_ps(tmp14075, _mm512_set1_ps(2.5e-01f), in1995);
tmp14085 = _mm512_fmadd_ps(tmp14079, _mm512_set1_ps(2.5e-01f), in2003);
tmp14082 = _mm512_fmadd_ps(tmp14074, _mm512_set1_ps(-1.25e+00f), tmp14082);
tmp14086 = _mm512_fmadd_ps(tmp14078, _mm512_set1_ps(-1.25e+00f), tmp14086);
tmp14074 = _mm512_fmadd_ps(tmp14074, _mm512_set1_ps(-5e+00f), tmp14076);
tmp14078 = _mm512_fmadd_ps(tmp14078, _mm512_set1_ps(-5e+00f), tmp14080);
tmp14081 = _mm512_fmadd_ps(in1999, _mm512_set1_ps(-1.25e+00f), tmp14081);
tmp14085 = _mm512_fmadd_ps(in2007, _mm512_set1_ps(-1.25e+00f), tmp14085);
in1997 = _mm512_fmadd_ps(tmp14081, _mm512_set1_ps(2e+00f), tmp14082);
in2005 = _mm512_fmadd_ps(tmp14085, _mm512_set1_ps(2e+00f), tmp14086);
tmp14082 = _mm512_fnmadd_ps(tmp14081, _mm512_set1_ps(2e+00f), tmp14082);
tmp14086 = _mm512_fnmadd_ps(tmp14085, _mm512_set1_ps(2e+00f), tmp14086);
tmp14081 = _mm512_fmadd_ps(in1995, _mm512_set1_ps(2.5e-01f), tmp14075);
tmp14085 = _mm512_fmadd_ps(in2003, _mm512_set1_ps(2.5e-01f), tmp14079);
tmp14075 = _mm512_sub_ps(in1996, tmp14075);
tmp14079 = _mm512_sub_ps(in2004, tmp14079);
tmp14081 = _mm512_fmadd_ps(in1999, _mm512_set1_ps(-1.25e+00f), tmp14081);
tmp14085 = _mm512_fmadd_ps(in2007, _mm512_set1_ps(-1.25e+00f), tmp14085);
in1999 = _mm512_sub_ps(in1999, in1995);
in2007 = _mm512_sub_ps(in2007, in2003);
in1999 = _mm512_fmadd_ps(in1999, _mm512_set1_ps(5.25e+00f), tmp14075);
in2007 = _mm512_fmadd_ps(in2007, _mm512_set1_ps(5.25e+00f), tmp14079);
tmp14076 = _mm512_fmadd_ps(tmp14081, _mm512_set1_ps(2e+00f), tmp14074);
tmp14080 = _mm512_fmadd_ps(tmp14085, _mm512_set1_ps(2e+00f), tmp14078);
tmp14074 = _mm512_fnmadd_ps(tmp14081, _mm512_set1_ps(2e+00f), tmp14074);
tmp14078 = _mm512_fnmadd_ps(tmp14085, _mm512_set1_ps(2e+00f), tmp14078);
__m512 out1839 = _mm512_shuffle_f32x4(in1993, tmp14083, 68);
__m512 out1847 = _mm512_shuffle_f32x4(in1993, tmp14083, 238);
__m512 out1840 = _mm512_shuffle_f32x4(tmp14084, in1997, 68);
__m512 out1848 = _mm512_shuffle_f32x4(tmp14084, in1997, 238);
__m512 out1841 = _mm512_shuffle_f32x4(tmp14082, tmp14076, 68);
__m512 out1849 = _mm512_shuffle_f32x4(tmp14082, tmp14076, 238);
__m512 out1842 = _mm512_shuffle_f32x4(tmp14074, in1999, 68);
__m512 out1850 = _mm512_shuffle_f32x4(tmp14074, in1999, 238);
__m512 out1843 = _mm512_shuffle_f32x4(in2001, tmp14087, 68);
__m512 out1851 = _mm512_shuffle_f32x4(in2001, tmp14087, 238);
__m512 out1844 = _mm512_shuffle_f32x4(tmp14088, in2005, 68);
__m512 out1852 = _mm512_shuffle_f32x4(tmp14088, in2005, 238);
__m512 out1845 = _mm512_shuffle_f32x4(tmp14086, tmp14080, 68);
__m512 out1853 = _mm512_shuffle_f32x4(tmp14086, tmp14080, 238);
__m512 out1846 = _mm512_shuffle_f32x4(tmp14078, in2007, 68);
__m512 out1854 = _mm512_shuffle_f32x4(tmp14078, in2007, 238);
_mm512_storeu_ps(dfPtr10+512+819200*i45+49152*j38+49152*s36+768*k130, out1839);
_mm512_storeu_ps(dfPtr10+640+819200*i45+49152*j38+49152*s36+768*k130, out1847);
_mm512_storeu_ps(dfPtr10+576+819200*i45+49152*j38+49152*s36+768*k130, out1843);
_mm512_storeu_ps(dfPtr10+704+819200*i45+49152*j38+49152*s36+768*k130, out1851);
_mm512_storeu_ps(dfPtr10+205312+819200*i45+49152*j38+49152*s36+768*k130, out1840);
_mm512_storeu_ps(dfPtr10+205440+819200*i45+49152*j38+49152*s36+768*k130, out1848);
_mm512_storeu_ps(dfPtr10+205376+819200*i45+49152*j38+49152*s36+768*k130, out1844);
_mm512_storeu_ps(dfPtr10+205504+819200*i45+49152*j38+49152*s36+768*k130, out1852);
_mm512_storeu_ps(dfPtr10+410112+819200*i45+49152*j38+49152*s36+768*k130, out1841);
_mm512_storeu_ps(dfPtr10+410240+819200*i45+49152*j38+49152*s36+768*k130, out1849);
_mm512_storeu_ps(dfPtr10+410176+819200*i45+49152*j38+49152*s36+768*k130, out1845);
_mm512_storeu_ps(dfPtr10+410304+819200*i45+49152*j38+49152*s36+768*k130, out1853);
_mm512_storeu_ps(dfPtr10+614912+819200*i45+49152*j38+49152*s36+768*k130, out1842);
_mm512_storeu_ps(dfPtr10+615040+819200*i45+49152*j38+49152*s36+768*k130, out1850);
_mm512_storeu_ps(dfPtr10+614976+819200*i45+49152*j38+49152*s36+768*k130, out1846);
_mm512_storeu_ps(dfPtr10+615104+819200*i45+49152*j38+49152*s36+768*k130, out1854);
}
// Stop once j38 has reached last9; otherwise advance j38 and set rel21 to 2,
// which satisfies the `rel21 < 3` test that immediately follows this block.
if (j38 >= last9) return;
++j38;
rel21 = 2;
}
// Stage taken when rel21 < 3. h47 and w59 feed the `112*h47+4*w59` term of every
// datPtr23 address in the loop body (presumably the tile origin's row and column
// -- NOTE(review): confirm against the generator). The loop runs k131 = 0..63.
if (rel21 < 3) {
ptrdiff_t h47 = base21+12;
ptrdiff_t w59 = 12;
ptrdiff_t k131 = 0;
for (; k131 != 64; ++k131) {
// Load phase: builds eight vectors in2009..in2016 and eight vectors in2017..in2024.
// Successive groups of loads step the datPtr23 offset by 112 (+0, +112, ..., +784).
// Load masks: 16383 (0x3FFF) keeps lanes 0-13, 31 keeps lanes 0-4, 127 keeps lanes 0-6;
// all other lanes are zeroed by the maskz load.
__m512 dat2047 = _mm512_maskz_loadu_ps(16383, datPtr23+0+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2048 = _mm512_maskz_loadu_ps(31, datPtr23+48+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2049 = _mm512_maskz_loadu_ps(127, datPtr23+628+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
// pm201 (arguments are listed from lane 15 down to lane 0): lanes 0-7 take source
// elements 0-7 and lanes 8-15 take source elements 6-13, i.e. two 8-element windows
// that overlap by two elements.
__m512i pm201 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in2009 = _mm512_permutexvar_ps(pm201, dat2047);
// pm202 for the two-source permute: lanes 0-4 take elements 0-4 of the first source;
// lanes 5-8 use index 15 (element 15 of the first source, which the mask-31 load
// zeroed); lanes 9-15 use indices 16-22, i.e. elements 0-6 of the second source.
__m512i pm202 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in2017 = _mm512_permutex2var_ps(dat2048, pm202, dat2049);
// The remaining seven groups repeat the same three loads and two permutes at +112 each.
__m512 dat2050 = _mm512_maskz_loadu_ps(16383, datPtr23+112+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2051 = _mm512_maskz_loadu_ps(31, datPtr23+160+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2052 = _mm512_maskz_loadu_ps(127, datPtr23+740+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2010 = _mm512_permutexvar_ps(pm201, dat2050);
__m512 in2018 = _mm512_permutex2var_ps(dat2051, pm202, dat2052);
__m512 dat2053 = _mm512_maskz_loadu_ps(16383, datPtr23+224+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2054 = _mm512_maskz_loadu_ps(31, datPtr23+272+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2055 = _mm512_maskz_loadu_ps(127, datPtr23+852+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2011 = _mm512_permutexvar_ps(pm201, dat2053);
__m512 in2019 = _mm512_permutex2var_ps(dat2054, pm202, dat2055);
__m512 dat2056 = _mm512_maskz_loadu_ps(16383, datPtr23+336+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2057 = _mm512_maskz_loadu_ps(31, datPtr23+384+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2058 = _mm512_maskz_loadu_ps(127, datPtr23+964+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2012 = _mm512_permutexvar_ps(pm201, dat2056);
__m512 in2020 = _mm512_permutex2var_ps(dat2057, pm202, dat2058);
__m512 dat2059 = _mm512_maskz_loadu_ps(16383, datPtr23+448+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2060 = _mm512_maskz_loadu_ps(31, datPtr23+496+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2061 = _mm512_maskz_loadu_ps(127, datPtr23+1076+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2013 = _mm512_permutexvar_ps(pm201, dat2059);
__m512 in2021 = _mm512_permutex2var_ps(dat2060, pm202, dat2061);
__m512 dat2062 = _mm512_maskz_loadu_ps(16383, datPtr23+560+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2063 = _mm512_maskz_loadu_ps(31, datPtr23+608+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2064 = _mm512_maskz_loadu_ps(127, datPtr23+1188+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2014 = _mm512_permutexvar_ps(pm201, dat2062);
__m512 in2022 = _mm512_permutex2var_ps(dat2063, pm202, dat2064);
__m512 dat2065 = _mm512_maskz_loadu_ps(16383, datPtr23+672+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2066 = _mm512_maskz_loadu_ps(31, datPtr23+720+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2067 = _mm512_maskz_loadu_ps(127, datPtr23+1300+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2015 = _mm512_permutexvar_ps(pm201, dat2065);
__m512 in2023 = _mm512_permutex2var_ps(dat2066, pm202, dat2067);
__m512 dat2068 = _mm512_maskz_loadu_ps(16383, datPtr23+784+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2069 = _mm512_maskz_loadu_ps(31, datPtr23+832+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2070 = _mm512_maskz_loadu_ps(127, datPtr23+1412+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2016 = _mm512_permutexvar_ps(pm201, dat2068);
__m512 in2024 = _mm512_permutex2var_ps(dat2069, pm202, dat2070);
// First 1-D pass: an 8-input / 8-output linear combination applied lane-wise across
// the eight vectors. Two independent groups are interleaved statement by statement:
// in2009..in2016 (tmp14137..tmp14140) and in2017..in2024 (tmp14141..tmp14144).
// With d0..d7 = in2009..in2016 on entry, the values left at the end are (up to FMA
// rounding):
//   in2009   = (d0 - d6) + 5.25*(d4 - d2)
//   tmp14139 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   tmp14140 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   in2015   = (0.25*d2 + d6 - 1.25*d4) + 2*(0.25*d1 + d5 - 1.25*d3)
//   tmp14138 = (0.25*d2 + d6 - 1.25*d4) - 2*(0.25*d1 + d5 - 1.25*d3)
//   in2011   = (4*d2 + d6 - 5*d4) + 2*(d1 + 0.25*d5 - 1.25*d3)
//   in2013   = (4*d2 + d6 - 5*d4) - 2*(d1 + 0.25*d5 - 1.25*d3)
//   in2012   = (d7 - d1) + 5.25*(d3 - d5)
// The second group yields the same expressions with in2017..in2024.
// NOTE(review): these coefficients match the 8-point Winograd F(6,3) input transform;
// presumably that is what this pass implements -- confirm against the generator.
// Statement order matters: inputs are overwritten in place once no longer needed.
__m512 tmp14137 = _mm512_add_ps(in2010, in2014);
__m512 tmp14141 = _mm512_add_ps(in2018, in2022);
__m512 tmp14138 = _mm512_sub_ps(in2013, in2011);
__m512 tmp14142 = _mm512_sub_ps(in2021, in2019);
__m512 tmp14139 = _mm512_add_ps(in2011, in2015);
__m512 tmp14143 = _mm512_add_ps(in2019, in2023);
in2009 = _mm512_sub_ps(in2009, in2015);
in2017 = _mm512_sub_ps(in2017, in2023);
tmp14137 = _mm512_fmadd_ps(in2012, _mm512_set1_ps(-4.25e+00f), tmp14137);
tmp14141 = _mm512_fmadd_ps(in2020, _mm512_set1_ps(-4.25e+00f), tmp14141);
tmp14139 = _mm512_fmadd_ps(in2013, _mm512_set1_ps(-4.25e+00f), tmp14139);
tmp14143 = _mm512_fmadd_ps(in2021, _mm512_set1_ps(-4.25e+00f), tmp14143);
in2009 = _mm512_fmadd_ps(tmp14138, _mm512_set1_ps(5.25e+00f), in2009);
in2017 = _mm512_fmadd_ps(tmp14142, _mm512_set1_ps(5.25e+00f), in2017);
tmp14138 = _mm512_fmadd_ps(in2011, _mm512_set1_ps(2.5e-01f), in2015);
tmp14142 = _mm512_fmadd_ps(in2019, _mm512_set1_ps(2.5e-01f), in2023);
in2011 = _mm512_fmadd_ps(in2011, _mm512_set1_ps(4e+00f), in2015);
in2019 = _mm512_fmadd_ps(in2019, _mm512_set1_ps(4e+00f), in2023);
__m512 tmp14140 = _mm512_sub_ps(tmp14139, tmp14137);
__m512 tmp14144 = _mm512_sub_ps(tmp14143, tmp14141);
tmp14139 = _mm512_add_ps(tmp14137, tmp14139);
tmp14143 = _mm512_add_ps(tmp14141, tmp14143);
tmp14137 = _mm512_fmadd_ps(in2010, _mm512_set1_ps(2.5e-01f), in2014);
tmp14141 = _mm512_fmadd_ps(in2018, _mm512_set1_ps(2.5e-01f), in2022);
tmp14138 = _mm512_fmadd_ps(in2013, _mm512_set1_ps(-1.25e+00f), tmp14138);
tmp14142 = _mm512_fmadd_ps(in2021, _mm512_set1_ps(-1.25e+00f), tmp14142);
in2013 = _mm512_fmadd_ps(in2013, _mm512_set1_ps(-5e+00f), in2011);
in2021 = _mm512_fmadd_ps(in2021, _mm512_set1_ps(-5e+00f), in2019);
tmp14137 = _mm512_fmadd_ps(in2012, _mm512_set1_ps(-1.25e+00f), tmp14137);
tmp14141 = _mm512_fmadd_ps(in2020, _mm512_set1_ps(-1.25e+00f), tmp14141);
in2015 = _mm512_fmadd_ps(tmp14137, _mm512_set1_ps(2e+00f), tmp14138);
in2023 = _mm512_fmadd_ps(tmp14141, _mm512_set1_ps(2e+00f), tmp14142);
tmp14138 = _mm512_fnmadd_ps(tmp14137, _mm512_set1_ps(2e+00f), tmp14138);
tmp14142 = _mm512_fnmadd_ps(tmp14141, _mm512_set1_ps(2e+00f), tmp14142);
tmp14137 = _mm512_fmadd_ps(in2014, _mm512_set1_ps(2.5e-01f), in2010);
tmp14141 = _mm512_fmadd_ps(in2022, _mm512_set1_ps(2.5e-01f), in2018);
in2010 = _mm512_sub_ps(in2016, in2010);
in2018 = _mm512_sub_ps(in2024, in2018);
tmp14137 = _mm512_fmadd_ps(in2012, _mm512_set1_ps(-1.25e+00f), tmp14137);
tmp14141 = _mm512_fmadd_ps(in2020, _mm512_set1_ps(-1.25e+00f), tmp14141);
in2012 = _mm512_sub_ps(in2012, in2014);
in2020 = _mm512_sub_ps(in2020, in2022);
in2012 = _mm512_fmadd_ps(in2012, _mm512_set1_ps(5.25e+00f), in2010);
in2020 = _mm512_fmadd_ps(in2020, _mm512_set1_ps(5.25e+00f), in2018);
in2011 = _mm512_fmadd_ps(tmp14137, _mm512_set1_ps(2e+00f), in2013);
in2019 = _mm512_fmadd_ps(tmp14141, _mm512_set1_ps(2e+00f), in2021);
in2013 = _mm512_fnmadd_ps(tmp14137, _mm512_set1_ps(2e+00f), in2013);
in2021 = _mm512_fnmadd_ps(tmp14141, _mm512_set1_ps(2e+00f), in2021);
// Lane rearrangement between the two passes, done in four steps over the 16 vectors.
// NOTE(review): the net effect appears to be a transpose, so that the second pass
// combines along the other tile axis -- confirm against the generator.
// Step 1: interleave 32-bit elements of vector pairs within each 128-bit lane.
__m512 tmp14153 = _mm512_unpacklo_ps(in2009, tmp14139);
__m512 tmp14154 = _mm512_unpackhi_ps(in2009, tmp14139);
__m512 tmp14155 = _mm512_unpacklo_ps(tmp14140, in2015);
__m512 tmp14156 = _mm512_unpackhi_ps(tmp14140, in2015);
__m512 tmp14157 = _mm512_unpacklo_ps(tmp14138, in2011);
__m512 tmp14158 = _mm512_unpackhi_ps(tmp14138, in2011);
__m512 tmp14159 = _mm512_unpacklo_ps(in2013, in2012);
__m512 tmp14160 = _mm512_unpackhi_ps(in2013, in2012);
__m512 tmp14161 = _mm512_unpacklo_ps(in2017, tmp14143);
__m512 tmp14162 = _mm512_unpackhi_ps(in2017, tmp14143);
__m512 tmp14163 = _mm512_unpacklo_ps(tmp14144, in2023);
__m512 tmp14164 = _mm512_unpackhi_ps(tmp14144, in2023);
__m512 tmp14165 = _mm512_unpacklo_ps(tmp14142, in2019);
__m512 tmp14166 = _mm512_unpackhi_ps(tmp14142, in2019);
__m512 tmp14167 = _mm512_unpacklo_ps(in2021, in2020);
__m512 tmp14168 = _mm512_unpackhi_ps(in2021, in2020);
// Step 2: per 128-bit lane, imm 68 (0x44) joins the low 64-bit halves of both
// operands and imm 238 (0xEE) joins the high 64-bit halves.
__m512 tmp14169 = _mm512_shuffle_ps(tmp14153, tmp14155, 68);
__m512 tmp14170 = _mm512_shuffle_ps(tmp14153, tmp14155, 238);
__m512 tmp14171 = _mm512_shuffle_ps(tmp14154, tmp14156, 68);
__m512 tmp14172 = _mm512_shuffle_ps(tmp14154, tmp14156, 238);
__m512 tmp14173 = _mm512_shuffle_ps(tmp14157, tmp14159, 68);
__m512 tmp14174 = _mm512_shuffle_ps(tmp14157, tmp14159, 238);
__m512 tmp14175 = _mm512_shuffle_ps(tmp14158, tmp14160, 68);
__m512 tmp14176 = _mm512_shuffle_ps(tmp14158, tmp14160, 238);
__m512 tmp14177 = _mm512_shuffle_ps(tmp14161, tmp14163, 68);
__m512 tmp14178 = _mm512_shuffle_ps(tmp14161, tmp14163, 238);
__m512 tmp14179 = _mm512_shuffle_ps(tmp14162, tmp14164, 68);
__m512 tmp14180 = _mm512_shuffle_ps(tmp14162, tmp14164, 238);
__m512 tmp14181 = _mm512_shuffle_ps(tmp14165, tmp14167, 68);
__m512 tmp14182 = _mm512_shuffle_ps(tmp14165, tmp14167, 238);
__m512 tmp14183 = _mm512_shuffle_ps(tmp14166, tmp14168, 68);
__m512 tmp14184 = _mm512_shuffle_ps(tmp14166, tmp14168, 238);
// Step 3: 128-bit lane shuffle; imm 136 (0x88) takes lanes 0 and 2 of each operand,
// imm 221 (0xDD) takes lanes 1 and 3 of each operand.
__m512 tmp14185 = _mm512_shuffle_f32x4(tmp14169, tmp14173, 136);
__m512 tmp14186 = _mm512_shuffle_f32x4(tmp14169, tmp14173, 221);
__m512 tmp14187 = _mm512_shuffle_f32x4(tmp14170, tmp14174, 136);
__m512 tmp14188 = _mm512_shuffle_f32x4(tmp14170, tmp14174, 221);
__m512 tmp14189 = _mm512_shuffle_f32x4(tmp14171, tmp14175, 136);
__m512 tmp14190 = _mm512_shuffle_f32x4(tmp14171, tmp14175, 221);
__m512 tmp14191 = _mm512_shuffle_f32x4(tmp14172, tmp14176, 136);
__m512 tmp14192 = _mm512_shuffle_f32x4(tmp14172, tmp14176, 221);
__m512 tmp14193 = _mm512_shuffle_f32x4(tmp14177, tmp14181, 136);
__m512 tmp14194 = _mm512_shuffle_f32x4(tmp14177, tmp14181, 221);
__m512 tmp14195 = _mm512_shuffle_f32x4(tmp14178, tmp14182, 136);
__m512 tmp14196 = _mm512_shuffle_f32x4(tmp14178, tmp14182, 221);
__m512 tmp14197 = _mm512_shuffle_f32x4(tmp14179, tmp14183, 136);
__m512 tmp14198 = _mm512_shuffle_f32x4(tmp14179, tmp14183, 221);
__m512 tmp14199 = _mm512_shuffle_f32x4(tmp14180, tmp14184, 136);
__m512 tmp14200 = _mm512_shuffle_f32x4(tmp14180, tmp14184, 221);
// Step 4: the same lane shuffle again, now pairing the first group with the second;
// results are written back into the variables the second pass reads.
in2009 = _mm512_shuffle_f32x4(tmp14185, tmp14193, 136);
in2017 = _mm512_shuffle_f32x4(tmp14185, tmp14193, 221);
tmp14139 = _mm512_shuffle_f32x4(tmp14187, tmp14195, 136);
tmp14143 = _mm512_shuffle_f32x4(tmp14187, tmp14195, 221);
tmp14140 = _mm512_shuffle_f32x4(tmp14189, tmp14197, 136);
tmp14144 = _mm512_shuffle_f32x4(tmp14189, tmp14197, 221);
in2015 = _mm512_shuffle_f32x4(tmp14191, tmp14199, 136);
in2023 = _mm512_shuffle_f32x4(tmp14191, tmp14199, 221);
tmp14138 = _mm512_shuffle_f32x4(tmp14186, tmp14194, 136);
tmp14142 = _mm512_shuffle_f32x4(tmp14186, tmp14194, 221);
in2011 = _mm512_shuffle_f32x4(tmp14188, tmp14196, 136);
in2019 = _mm512_shuffle_f32x4(tmp14188, tmp14196, 221);
in2013 = _mm512_shuffle_f32x4(tmp14190, tmp14198, 136);
in2021 = _mm512_shuffle_f32x4(tmp14190, tmp14198, 221);
in2012 = _mm512_shuffle_f32x4(tmp14192, tmp14200, 136);
in2020 = _mm512_shuffle_f32x4(tmp14192, tmp14200, 221);
// Second 1-D pass: the same 8-input / 8-output linear combination as the first pass,
// now applied to the rearranged vectors. Two groups are interleaved statement by
// statement (tmp14145..tmp14148 and tmp14149..tmp14152).
// With e0..e7 = in2009, tmp14139, tmp14140, in2015, tmp14138, in2011, in2013, in2012
// on entry, the values left at the end are (up to FMA rounding):
//   in2009   = (e0 - e6) + 5.25*(e4 - e2)
//   tmp14147 = (e2 + e6 - 4.25*e4) + (e1 + e5 - 4.25*e3)
//   tmp14148 = (e2 + e6 - 4.25*e4) - (e1 + e5 - 4.25*e3)
//   in2013   = (0.25*e2 + e6 - 1.25*e4) + 2*(0.25*e1 + e5 - 1.25*e3)
//   tmp14146 = (0.25*e2 + e6 - 1.25*e4) - 2*(0.25*e1 + e5 - 1.25*e3)
//   tmp14140 = (4*e2 + e6 - 5*e4) + 2*(e1 + 0.25*e5 - 1.25*e3)
//   tmp14138 = (4*e2 + e6 - 5*e4) - 2*(e1 + 0.25*e5 - 1.25*e3)
//   in2015   = (e7 - e1) + 5.25*(e3 - e5)
// The second group yields the same expressions with in2017, tmp14143, tmp14144,
// in2023, tmp14142, in2019, in2021, in2020.
// Statement order matters: inputs are overwritten in place once no longer needed.
__m512 tmp14145 = _mm512_add_ps(tmp14139, in2011);
__m512 tmp14149 = _mm512_add_ps(tmp14143, in2019);
__m512 tmp14146 = _mm512_sub_ps(tmp14138, tmp14140);
__m512 tmp14150 = _mm512_sub_ps(tmp14142, tmp14144);
__m512 tmp14147 = _mm512_add_ps(tmp14140, in2013);
__m512 tmp14151 = _mm512_add_ps(tmp14144, in2021);
in2009 = _mm512_sub_ps(in2009, in2013);
in2017 = _mm512_sub_ps(in2017, in2021);
tmp14145 = _mm512_fmadd_ps(in2015, _mm512_set1_ps(-4.25e+00f), tmp14145);
tmp14149 = _mm512_fmadd_ps(in2023, _mm512_set1_ps(-4.25e+00f), tmp14149);
tmp14147 = _mm512_fmadd_ps(tmp14138, _mm512_set1_ps(-4.25e+00f), tmp14147);
tmp14151 = _mm512_fmadd_ps(tmp14142, _mm512_set1_ps(-4.25e+00f), tmp14151);
in2009 = _mm512_fmadd_ps(tmp14146, _mm512_set1_ps(5.25e+00f), in2009);
in2017 = _mm512_fmadd_ps(tmp14150, _mm512_set1_ps(5.25e+00f), in2017);
tmp14146 = _mm512_fmadd_ps(tmp14140, _mm512_set1_ps(2.5e-01f), in2013);
tmp14150 = _mm512_fmadd_ps(tmp14144, _mm512_set1_ps(2.5e-01f), in2021);
tmp14140 = _mm512_fmadd_ps(tmp14140, _mm512_set1_ps(4e+00f), in2013);
tmp14144 = _mm512_fmadd_ps(tmp14144, _mm512_set1_ps(4e+00f), in2021);
__m512 tmp14148 = _mm512_sub_ps(tmp14147, tmp14145);
__m512 tmp14152 = _mm512_sub_ps(tmp14151, tmp14149);
tmp14147 = _mm512_add_ps(tmp14145, tmp14147);
tmp14151 = _mm512_add_ps(tmp14149, tmp14151);
tmp14145 = _mm512_fmadd_ps(tmp14139, _mm512_set1_ps(2.5e-01f), in2011);
tmp14149 = _mm512_fmadd_ps(tmp14143, _mm512_set1_ps(2.5e-01f), in2019);
tmp14146 = _mm512_fmadd_ps(tmp14138, _mm512_set1_ps(-1.25e+00f), tmp14146);
tmp14150 = _mm512_fmadd_ps(tmp14142, _mm512_set1_ps(-1.25e+00f), tmp14150);
tmp14138 = _mm512_fmadd_ps(tmp14138, _mm512_set1_ps(-5e+00f), tmp14140);
tmp14142 = _mm512_fmadd_ps(tmp14142, _mm512_set1_ps(-5e+00f), tmp14144);
tmp14145 = _mm512_fmadd_ps(in2015, _mm512_set1_ps(-1.25e+00f), tmp14145);
tmp14149 = _mm512_fmadd_ps(in2023, _mm512_set1_ps(-1.25e+00f), tmp14149);
in2013 = _mm512_fmadd_ps(tmp14145, _mm512_set1_ps(2e+00f), tmp14146);
in2021 = _mm512_fmadd_ps(tmp14149, _mm512_set1_ps(2e+00f), tmp14150);
tmp14146 = _mm512_fnmadd_ps(tmp14145, _mm512_set1_ps(2e+00f), tmp14146);
tmp14150 = _mm512_fnmadd_ps(tmp14149, _mm512_set1_ps(2e+00f), tmp14150);
tmp14145 = _mm512_fmadd_ps(in2011, _mm512_set1_ps(2.5e-01f), tmp14139);
tmp14149 = _mm512_fmadd_ps(in2019, _mm512_set1_ps(2.5e-01f), tmp14143);
tmp14139 = _mm512_sub_ps(in2012, tmp14139);
tmp14143 = _mm512_sub_ps(in2020, tmp14143);
tmp14145 = _mm512_fmadd_ps(in2015, _mm512_set1_ps(-1.25e+00f), tmp14145);
tmp14149 = _mm512_fmadd_ps(in2023, _mm512_set1_ps(-1.25e+00f), tmp14149);
in2015 = _mm512_sub_ps(in2015, in2011);
in2023 = _mm512_sub_ps(in2023, in2019);
in2015 = _mm512_fmadd_ps(in2015, _mm512_set1_ps(5.25e+00f), tmp14139);
in2023 = _mm512_fmadd_ps(in2023, _mm512_set1_ps(5.25e+00f), tmp14143);
tmp14140 = _mm512_fmadd_ps(tmp14145, _mm512_set1_ps(2e+00f), tmp14138);
tmp14144 = _mm512_fmadd_ps(tmp14149, _mm512_set1_ps(2e+00f), tmp14142);
tmp14138 = _mm512_fnmadd_ps(tmp14145, _mm512_set1_ps(2e+00f), tmp14138);
tmp14142 = _mm512_fnmadd_ps(tmp14149, _mm512_set1_ps(2e+00f), tmp14142);
// Pack pairs of second-pass results for storing. With imm 68 (0x44) the result holds
// the low 256 bits of the first operand followed by the low 256 bits of the second;
// with imm 238 (0xEE) it holds the two high 256-bit halves instead.
__m512 out1855 = _mm512_shuffle_f32x4(in2009, tmp14147, 68);
__m512 out1863 = _mm512_shuffle_f32x4(in2009, tmp14147, 238);
__m512 out1856 = _mm512_shuffle_f32x4(tmp14148, in2013, 68);
__m512 out1864 = _mm512_shuffle_f32x4(tmp14148, in2013, 238);
__m512 out1857 = _mm512_shuffle_f32x4(tmp14146, tmp14140, 68);
__m512 out1865 = _mm512_shuffle_f32x4(tmp14146, tmp14140, 238);
__m512 out1858 = _mm512_shuffle_f32x4(tmp14138, in2015, 68);
__m512 out1866 = _mm512_shuffle_f32x4(tmp14138, in2015, 238);
__m512 out1859 = _mm512_shuffle_f32x4(in2017, tmp14151, 68);
__m512 out1867 = _mm512_shuffle_f32x4(in2017, tmp14151, 238);
__m512 out1860 = _mm512_shuffle_f32x4(tmp14152, in2021, 68);
__m512 out1868 = _mm512_shuffle_f32x4(tmp14152, in2021, 238);
__m512 out1861 = _mm512_shuffle_f32x4(tmp14150, tmp14144, 68);
__m512 out1869 = _mm512_shuffle_f32x4(tmp14150, tmp14144, 238);
__m512 out1862 = _mm512_shuffle_f32x4(tmp14142, in2023, 68);
__m512 out1870 = _mm512_shuffle_f32x4(tmp14142, in2023, 238);
// Sixteen unaligned stores to dfPtr10: four regions whose base offsets are 204800
// apart (0, 204800, 409600, 614400), each receiving four vectors at +0, +128, +64
// and +192 from the region base. All addresses share the
// 819200*i45 + 49152*j38 + 49152*s36 + 768*k131 term.
_mm512_storeu_ps(dfPtr10+0+819200*i45+49152*j38+49152*s36+768*k131, out1855);
_mm512_storeu_ps(dfPtr10+128+819200*i45+49152*j38+49152*s36+768*k131, out1863);
_mm512_storeu_ps(dfPtr10+64+819200*i45+49152*j38+49152*s36+768*k131, out1859);
_mm512_storeu_ps(dfPtr10+192+819200*i45+49152*j38+49152*s36+768*k131, out1867);
_mm512_storeu_ps(dfPtr10+204800+819200*i45+49152*j38+49152*s36+768*k131, out1856);
_mm512_storeu_ps(dfPtr10+204928+819200*i45+49152*j38+49152*s36+768*k131, out1864);
_mm512_storeu_ps(dfPtr10+204864+819200*i45+49152*j38+49152*s36+768*k131, out1860);
_mm512_storeu_ps(dfPtr10+204992+819200*i45+49152*j38+49152*s36+768*k131, out1868);
_mm512_storeu_ps(dfPtr10+409600+819200*i45+49152*j38+49152*s36+768*k131, out1857);
_mm512_storeu_ps(dfPtr10+409728+819200*i45+49152*j38+49152*s36+768*k131, out1865);
_mm512_storeu_ps(dfPtr10+409664+819200*i45+49152*j38+49152*s36+768*k131, out1861);
_mm512_storeu_ps(dfPtr10+409792+819200*i45+49152*j38+49152*s36+768*k131, out1869);
_mm512_storeu_ps(dfPtr10+614400+819200*i45+49152*j38+49152*s36+768*k131, out1858);
_mm512_storeu_ps(dfPtr10+614528+819200*i45+49152*j38+49152*s36+768*k131, out1866);
_mm512_storeu_ps(dfPtr10+614464+819200*i45+49152*j38+49152*s36+768*k131, out1862);
_mm512_storeu_ps(dfPtr10+614592+819200*i45+49152*j38+49152*s36+768*k131, out1870);
// Load phase for the next pair of groups: in2025..in2032 come from datPtr23 offsets
// +648, +760, ..., +1432 and in2033..in2040 from +3136, +3248, ..., +3920 (both
// series step by 112). Every load uses mask 16383 (0x3FFF): lanes 0-13 are read,
// lanes 14-15 are zeroed.
__m512 dat2071 = _mm512_maskz_loadu_ps(16383, datPtr23+648+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2072 = _mm512_maskz_loadu_ps(16383, datPtr23+3136+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
// pm203 (arguments are listed from lane 15 down to lane 0): lanes 0-7 take source
// elements 0-7 and lanes 8-15 take source elements 6-13, i.e. two 8-element windows
// that overlap by two elements.
__m512i pm203 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in2025 = _mm512_permutexvar_ps(pm203, dat2071);
__m512 in2033 = _mm512_permutexvar_ps(pm203, dat2072);
__m512 dat2073 = _mm512_maskz_loadu_ps(16383, datPtr23+760+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2074 = _mm512_maskz_loadu_ps(16383, datPtr23+3248+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2026 = _mm512_permutexvar_ps(pm203, dat2073);
__m512 in2034 = _mm512_permutexvar_ps(pm203, dat2074);
__m512 dat2075 = _mm512_maskz_loadu_ps(16383, datPtr23+872+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2076 = _mm512_maskz_loadu_ps(16383, datPtr23+3360+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2027 = _mm512_permutexvar_ps(pm203, dat2075);
__m512 in2035 = _mm512_permutexvar_ps(pm203, dat2076);
__m512 dat2077 = _mm512_maskz_loadu_ps(16383, datPtr23+984+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2078 = _mm512_maskz_loadu_ps(16383, datPtr23+3472+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2028 = _mm512_permutexvar_ps(pm203, dat2077);
__m512 in2036 = _mm512_permutexvar_ps(pm203, dat2078);
__m512 dat2079 = _mm512_maskz_loadu_ps(16383, datPtr23+1096+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2080 = _mm512_maskz_loadu_ps(16383, datPtr23+3584+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2029 = _mm512_permutexvar_ps(pm203, dat2079);
__m512 in2037 = _mm512_permutexvar_ps(pm203, dat2080);
__m512 dat2081 = _mm512_maskz_loadu_ps(16383, datPtr23+1208+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2082 = _mm512_maskz_loadu_ps(16383, datPtr23+3696+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2030 = _mm512_permutexvar_ps(pm203, dat2081);
__m512 in2038 = _mm512_permutexvar_ps(pm203, dat2082);
__m512 dat2083 = _mm512_maskz_loadu_ps(16383, datPtr23+1320+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2084 = _mm512_maskz_loadu_ps(16383, datPtr23+3808+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2031 = _mm512_permutexvar_ps(pm203, dat2083);
__m512 in2039 = _mm512_permutexvar_ps(pm203, dat2084);
__m512 dat2085 = _mm512_maskz_loadu_ps(16383, datPtr23+1432+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2086 = _mm512_maskz_loadu_ps(16383, datPtr23+3920+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2032 = _mm512_permutexvar_ps(pm203, dat2085);
__m512 in2040 = _mm512_permutexvar_ps(pm203, dat2086);
// First 1-D pass: an 8-input / 8-output linear combination applied lane-wise across
// the eight vectors. Two independent groups are interleaved statement by statement:
// in2025..in2032 (tmp14201..tmp14204) and in2033..in2040 (tmp14205..tmp14208).
// With d0..d7 = in2025..in2032 on entry, the values left at the end are (up to FMA
// rounding):
//   in2025   = (d0 - d6) + 5.25*(d4 - d2)
//   tmp14203 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   tmp14204 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   in2031   = (0.25*d2 + d6 - 1.25*d4) + 2*(0.25*d1 + d5 - 1.25*d3)
//   tmp14202 = (0.25*d2 + d6 - 1.25*d4) - 2*(0.25*d1 + d5 - 1.25*d3)
//   in2027   = (4*d2 + d6 - 5*d4) + 2*(d1 + 0.25*d5 - 1.25*d3)
//   in2029   = (4*d2 + d6 - 5*d4) - 2*(d1 + 0.25*d5 - 1.25*d3)
//   in2028   = (d7 - d1) + 5.25*(d3 - d5)
// The second group yields the same expressions with in2033..in2040.
// NOTE(review): these coefficients match the 8-point Winograd F(6,3) input transform;
// presumably that is what this pass implements -- confirm against the generator.
// Statement order matters: inputs are overwritten in place once no longer needed.
__m512 tmp14201 = _mm512_add_ps(in2026, in2030);
__m512 tmp14205 = _mm512_add_ps(in2034, in2038);
__m512 tmp14202 = _mm512_sub_ps(in2029, in2027);
__m512 tmp14206 = _mm512_sub_ps(in2037, in2035);
__m512 tmp14203 = _mm512_add_ps(in2027, in2031);
__m512 tmp14207 = _mm512_add_ps(in2035, in2039);
in2025 = _mm512_sub_ps(in2025, in2031);
in2033 = _mm512_sub_ps(in2033, in2039);
tmp14201 = _mm512_fmadd_ps(in2028, _mm512_set1_ps(-4.25e+00f), tmp14201);
tmp14205 = _mm512_fmadd_ps(in2036, _mm512_set1_ps(-4.25e+00f), tmp14205);
tmp14203 = _mm512_fmadd_ps(in2029, _mm512_set1_ps(-4.25e+00f), tmp14203);
tmp14207 = _mm512_fmadd_ps(in2037, _mm512_set1_ps(-4.25e+00f), tmp14207);
in2025 = _mm512_fmadd_ps(tmp14202, _mm512_set1_ps(5.25e+00f), in2025);
in2033 = _mm512_fmadd_ps(tmp14206, _mm512_set1_ps(5.25e+00f), in2033);
tmp14202 = _mm512_fmadd_ps(in2027, _mm512_set1_ps(2.5e-01f), in2031);
tmp14206 = _mm512_fmadd_ps(in2035, _mm512_set1_ps(2.5e-01f), in2039);
in2027 = _mm512_fmadd_ps(in2027, _mm512_set1_ps(4e+00f), in2031);
in2035 = _mm512_fmadd_ps(in2035, _mm512_set1_ps(4e+00f), in2039);
__m512 tmp14204 = _mm512_sub_ps(tmp14203, tmp14201);
__m512 tmp14208 = _mm512_sub_ps(tmp14207, tmp14205);
tmp14203 = _mm512_add_ps(tmp14201, tmp14203);
tmp14207 = _mm512_add_ps(tmp14205, tmp14207);
tmp14201 = _mm512_fmadd_ps(in2026, _mm512_set1_ps(2.5e-01f), in2030);
tmp14205 = _mm512_fmadd_ps(in2034, _mm512_set1_ps(2.5e-01f), in2038);
tmp14202 = _mm512_fmadd_ps(in2029, _mm512_set1_ps(-1.25e+00f), tmp14202);
tmp14206 = _mm512_fmadd_ps(in2037, _mm512_set1_ps(-1.25e+00f), tmp14206);
in2029 = _mm512_fmadd_ps(in2029, _mm512_set1_ps(-5e+00f), in2027);
in2037 = _mm512_fmadd_ps(in2037, _mm512_set1_ps(-5e+00f), in2035);
tmp14201 = _mm512_fmadd_ps(in2028, _mm512_set1_ps(-1.25e+00f), tmp14201);
tmp14205 = _mm512_fmadd_ps(in2036, _mm512_set1_ps(-1.25e+00f), tmp14205);
in2031 = _mm512_fmadd_ps(tmp14201, _mm512_set1_ps(2e+00f), tmp14202);
in2039 = _mm512_fmadd_ps(tmp14205, _mm512_set1_ps(2e+00f), tmp14206);
tmp14202 = _mm512_fnmadd_ps(tmp14201, _mm512_set1_ps(2e+00f), tmp14202);
tmp14206 = _mm512_fnmadd_ps(tmp14205, _mm512_set1_ps(2e+00f), tmp14206);
tmp14201 = _mm512_fmadd_ps(in2030, _mm512_set1_ps(2.5e-01f), in2026);
tmp14205 = _mm512_fmadd_ps(in2038, _mm512_set1_ps(2.5e-01f), in2034);
in2026 = _mm512_sub_ps(in2032, in2026);
in2034 = _mm512_sub_ps(in2040, in2034);
tmp14201 = _mm512_fmadd_ps(in2028, _mm512_set1_ps(-1.25e+00f), tmp14201);
tmp14205 = _mm512_fmadd_ps(in2036, _mm512_set1_ps(-1.25e+00f), tmp14205);
in2028 = _mm512_sub_ps(in2028, in2030);
in2036 = _mm512_sub_ps(in2036, in2038);
in2028 = _mm512_fmadd_ps(in2028, _mm512_set1_ps(5.25e+00f), in2026);
in2036 = _mm512_fmadd_ps(in2036, _mm512_set1_ps(5.25e+00f), in2034);
in2027 = _mm512_fmadd_ps(tmp14201, _mm512_set1_ps(2e+00f), in2029);
in2035 = _mm512_fmadd_ps(tmp14205, _mm512_set1_ps(2e+00f), in2037);
in2029 = _mm512_fnmadd_ps(tmp14201, _mm512_set1_ps(2e+00f), in2029);
in2037 = _mm512_fnmadd_ps(tmp14205, _mm512_set1_ps(2e+00f), in2037);
// Lane rearrangement between the two passes, done in four steps over the 16 vectors.
// NOTE(review): the net effect appears to be a transpose, so that the second pass
// combines along the other tile axis -- confirm against the generator.
// Step 1: interleave 32-bit elements of vector pairs within each 128-bit lane.
__m512 tmp14217 = _mm512_unpacklo_ps(in2025, tmp14203);
__m512 tmp14218 = _mm512_unpackhi_ps(in2025, tmp14203);
__m512 tmp14219 = _mm512_unpacklo_ps(tmp14204, in2031);
__m512 tmp14220 = _mm512_unpackhi_ps(tmp14204, in2031);
__m512 tmp14221 = _mm512_unpacklo_ps(tmp14202, in2027);
__m512 tmp14222 = _mm512_unpackhi_ps(tmp14202, in2027);
__m512 tmp14223 = _mm512_unpacklo_ps(in2029, in2028);
__m512 tmp14224 = _mm512_unpackhi_ps(in2029, in2028);
__m512 tmp14225 = _mm512_unpacklo_ps(in2033, tmp14207);
__m512 tmp14226 = _mm512_unpackhi_ps(in2033, tmp14207);
__m512 tmp14227 = _mm512_unpacklo_ps(tmp14208, in2039);
__m512 tmp14228 = _mm512_unpackhi_ps(tmp14208, in2039);
__m512 tmp14229 = _mm512_unpacklo_ps(tmp14206, in2035);
__m512 tmp14230 = _mm512_unpackhi_ps(tmp14206, in2035);
__m512 tmp14231 = _mm512_unpacklo_ps(in2037, in2036);
__m512 tmp14232 = _mm512_unpackhi_ps(in2037, in2036);
// Step 2: per 128-bit lane, imm 68 (0x44) joins the low 64-bit halves of both
// operands and imm 238 (0xEE) joins the high 64-bit halves.
__m512 tmp14233 = _mm512_shuffle_ps(tmp14217, tmp14219, 68);
__m512 tmp14234 = _mm512_shuffle_ps(tmp14217, tmp14219, 238);
__m512 tmp14235 = _mm512_shuffle_ps(tmp14218, tmp14220, 68);
__m512 tmp14236 = _mm512_shuffle_ps(tmp14218, tmp14220, 238);
__m512 tmp14237 = _mm512_shuffle_ps(tmp14221, tmp14223, 68);
__m512 tmp14238 = _mm512_shuffle_ps(tmp14221, tmp14223, 238);
__m512 tmp14239 = _mm512_shuffle_ps(tmp14222, tmp14224, 68);
__m512 tmp14240 = _mm512_shuffle_ps(tmp14222, tmp14224, 238);
__m512 tmp14241 = _mm512_shuffle_ps(tmp14225, tmp14227, 68);
__m512 tmp14242 = _mm512_shuffle_ps(tmp14225, tmp14227, 238);
__m512 tmp14243 = _mm512_shuffle_ps(tmp14226, tmp14228, 68);
__m512 tmp14244 = _mm512_shuffle_ps(tmp14226, tmp14228, 238);
__m512 tmp14245 = _mm512_shuffle_ps(tmp14229, tmp14231, 68);
__m512 tmp14246 = _mm512_shuffle_ps(tmp14229, tmp14231, 238);
__m512 tmp14247 = _mm512_shuffle_ps(tmp14230, tmp14232, 68);
__m512 tmp14248 = _mm512_shuffle_ps(tmp14230, tmp14232, 238);
// Step 3: 128-bit lane shuffle; imm 136 (0x88) takes lanes 0 and 2 of each operand,
// imm 221 (0xDD) takes lanes 1 and 3 of each operand.
__m512 tmp14249 = _mm512_shuffle_f32x4(tmp14233, tmp14237, 136);
__m512 tmp14250 = _mm512_shuffle_f32x4(tmp14233, tmp14237, 221);
__m512 tmp14251 = _mm512_shuffle_f32x4(tmp14234, tmp14238, 136);
__m512 tmp14252 = _mm512_shuffle_f32x4(tmp14234, tmp14238, 221);
__m512 tmp14253 = _mm512_shuffle_f32x4(tmp14235, tmp14239, 136);
__m512 tmp14254 = _mm512_shuffle_f32x4(tmp14235, tmp14239, 221);
__m512 tmp14255 = _mm512_shuffle_f32x4(tmp14236, tmp14240, 136);
__m512 tmp14256 = _mm512_shuffle_f32x4(tmp14236, tmp14240, 221);
__m512 tmp14257 = _mm512_shuffle_f32x4(tmp14241, tmp14245, 136);
__m512 tmp14258 = _mm512_shuffle_f32x4(tmp14241, tmp14245, 221);
__m512 tmp14259 = _mm512_shuffle_f32x4(tmp14242, tmp14246, 136);
__m512 tmp14260 = _mm512_shuffle_f32x4(tmp14242, tmp14246, 221);
__m512 tmp14261 = _mm512_shuffle_f32x4(tmp14243, tmp14247, 136);
__m512 tmp14262 = _mm512_shuffle_f32x4(tmp14243, tmp14247, 221);
__m512 tmp14263 = _mm512_shuffle_f32x4(tmp14244, tmp14248, 136);
__m512 tmp14264 = _mm512_shuffle_f32x4(tmp14244, tmp14248, 221);
// Step 4: the same lane shuffle again, now pairing the first group with the second;
// results are written back into the variables the second pass reads.
in2025 = _mm512_shuffle_f32x4(tmp14249, tmp14257, 136);
in2033 = _mm512_shuffle_f32x4(tmp14249, tmp14257, 221);
tmp14203 = _mm512_shuffle_f32x4(tmp14251, tmp14259, 136);
tmp14207 = _mm512_shuffle_f32x4(tmp14251, tmp14259, 221);
tmp14204 = _mm512_shuffle_f32x4(tmp14253, tmp14261, 136);
tmp14208 = _mm512_shuffle_f32x4(tmp14253, tmp14261, 221);
in2031 = _mm512_shuffle_f32x4(tmp14255, tmp14263, 136);
in2039 = _mm512_shuffle_f32x4(tmp14255, tmp14263, 221);
tmp14202 = _mm512_shuffle_f32x4(tmp14250, tmp14258, 136);
tmp14206 = _mm512_shuffle_f32x4(tmp14250, tmp14258, 221);
in2027 = _mm512_shuffle_f32x4(tmp14252, tmp14260, 136);
in2035 = _mm512_shuffle_f32x4(tmp14252, tmp14260, 221);
in2029 = _mm512_shuffle_f32x4(tmp14254, tmp14262, 136);
in2037 = _mm512_shuffle_f32x4(tmp14254, tmp14262, 221);
in2028 = _mm512_shuffle_f32x4(tmp14256, tmp14264, 136);
in2036 = _mm512_shuffle_f32x4(tmp14256, tmp14264, 221);
// Second 1-D pass: the same 8-input / 8-output linear combination as the first pass,
// now applied to the rearranged vectors. Two groups are interleaved statement by
// statement (tmp14209..tmp14212 and tmp14213..tmp14216).
// With e0..e7 = in2025, tmp14203, tmp14204, in2031, tmp14202, in2027, in2029, in2028
// on entry, the values left at the end are (up to FMA rounding):
//   in2025   = (e0 - e6) + 5.25*(e4 - e2)
//   tmp14211 = (e2 + e6 - 4.25*e4) + (e1 + e5 - 4.25*e3)
//   tmp14212 = (e2 + e6 - 4.25*e4) - (e1 + e5 - 4.25*e3)
//   in2029   = (0.25*e2 + e6 - 1.25*e4) + 2*(0.25*e1 + e5 - 1.25*e3)
//   tmp14210 = (0.25*e2 + e6 - 1.25*e4) - 2*(0.25*e1 + e5 - 1.25*e3)
//   tmp14204 = (4*e2 + e6 - 5*e4) + 2*(e1 + 0.25*e5 - 1.25*e3)
//   tmp14202 = (4*e2 + e6 - 5*e4) - 2*(e1 + 0.25*e5 - 1.25*e3)
//   in2031   = (e7 - e1) + 5.25*(e3 - e5)
// The second group yields the same expressions with in2033, tmp14207, tmp14208,
// in2039, tmp14206, in2035, in2037, in2036.
// Statement order matters: inputs are overwritten in place once no longer needed.
__m512 tmp14209 = _mm512_add_ps(tmp14203, in2027);
__m512 tmp14213 = _mm512_add_ps(tmp14207, in2035);
__m512 tmp14210 = _mm512_sub_ps(tmp14202, tmp14204);
__m512 tmp14214 = _mm512_sub_ps(tmp14206, tmp14208);
__m512 tmp14211 = _mm512_add_ps(tmp14204, in2029);
__m512 tmp14215 = _mm512_add_ps(tmp14208, in2037);
in2025 = _mm512_sub_ps(in2025, in2029);
in2033 = _mm512_sub_ps(in2033, in2037);
tmp14209 = _mm512_fmadd_ps(in2031, _mm512_set1_ps(-4.25e+00f), tmp14209);
tmp14213 = _mm512_fmadd_ps(in2039, _mm512_set1_ps(-4.25e+00f), tmp14213);
tmp14211 = _mm512_fmadd_ps(tmp14202, _mm512_set1_ps(-4.25e+00f), tmp14211);
tmp14215 = _mm512_fmadd_ps(tmp14206, _mm512_set1_ps(-4.25e+00f), tmp14215);
in2025 = _mm512_fmadd_ps(tmp14210, _mm512_set1_ps(5.25e+00f), in2025);
in2033 = _mm512_fmadd_ps(tmp14214, _mm512_set1_ps(5.25e+00f), in2033);
tmp14210 = _mm512_fmadd_ps(tmp14204, _mm512_set1_ps(2.5e-01f), in2029);
tmp14214 = _mm512_fmadd_ps(tmp14208, _mm512_set1_ps(2.5e-01f), in2037);
tmp14204 = _mm512_fmadd_ps(tmp14204, _mm512_set1_ps(4e+00f), in2029);
tmp14208 = _mm512_fmadd_ps(tmp14208, _mm512_set1_ps(4e+00f), in2037);
__m512 tmp14212 = _mm512_sub_ps(tmp14211, tmp14209);
__m512 tmp14216 = _mm512_sub_ps(tmp14215, tmp14213);
tmp14211 = _mm512_add_ps(tmp14209, tmp14211);
tmp14215 = _mm512_add_ps(tmp14213, tmp14215);
tmp14209 = _mm512_fmadd_ps(tmp14203, _mm512_set1_ps(2.5e-01f), in2027);
tmp14213 = _mm512_fmadd_ps(tmp14207, _mm512_set1_ps(2.5e-01f), in2035);
tmp14210 = _mm512_fmadd_ps(tmp14202, _mm512_set1_ps(-1.25e+00f), tmp14210);
tmp14214 = _mm512_fmadd_ps(tmp14206, _mm512_set1_ps(-1.25e+00f), tmp14214);
tmp14202 = _mm512_fmadd_ps(tmp14202, _mm512_set1_ps(-5e+00f), tmp14204);
tmp14206 = _mm512_fmadd_ps(tmp14206, _mm512_set1_ps(-5e+00f), tmp14208);
tmp14209 = _mm512_fmadd_ps(in2031, _mm512_set1_ps(-1.25e+00f), tmp14209);
tmp14213 = _mm512_fmadd_ps(in2039, _mm512_set1_ps(-1.25e+00f), tmp14213);
in2029 = _mm512_fmadd_ps(tmp14209, _mm512_set1_ps(2e+00f), tmp14210);
in2037 = _mm512_fmadd_ps(tmp14213, _mm512_set1_ps(2e+00f), tmp14214);
tmp14210 = _mm512_fnmadd_ps(tmp14209, _mm512_set1_ps(2e+00f), tmp14210);
tmp14214 = _mm512_fnmadd_ps(tmp14213, _mm512_set1_ps(2e+00f), tmp14214);
tmp14209 = _mm512_fmadd_ps(in2027, _mm512_set1_ps(2.5e-01f), tmp14203);
tmp14213 = _mm512_fmadd_ps(in2035, _mm512_set1_ps(2.5e-01f), tmp14207);
tmp14203 = _mm512_sub_ps(in2028, tmp14203);
tmp14207 = _mm512_sub_ps(in2036, tmp14207);
tmp14209 = _mm512_fmadd_ps(in2031, _mm512_set1_ps(-1.25e+00f), tmp14209);
tmp14213 = _mm512_fmadd_ps(in2039, _mm512_set1_ps(-1.25e+00f), tmp14213);
in2031 = _mm512_sub_ps(in2031, in2027);
in2039 = _mm512_sub_ps(in2039, in2035);
in2031 = _mm512_fmadd_ps(in2031, _mm512_set1_ps(5.25e+00f), tmp14203);
in2039 = _mm512_fmadd_ps(in2039, _mm512_set1_ps(5.25e+00f), tmp14207);
tmp14204 = _mm512_fmadd_ps(tmp14209, _mm512_set1_ps(2e+00f), tmp14202);
tmp14208 = _mm512_fmadd_ps(tmp14213, _mm512_set1_ps(2e+00f), tmp14206);
tmp14202 = _mm512_fnmadd_ps(tmp14209, _mm512_set1_ps(2e+00f), tmp14202);
tmp14206 = _mm512_fnmadd_ps(tmp14213, _mm512_set1_ps(2e+00f), tmp14206);
// Regroup the sixteen result vectors into store order.
// _mm512_shuffle_f32x4(a, b, 68)  -> { a[0..7],  b[0..7]  } (low 256 bits of a, then of b)
// _mm512_shuffle_f32x4(a, b, 238) -> { a[8..15], b[8..15] } (high 256 bits of a, then of b)
// Each out* therefore pairs the same half of two result vectors.
__m512 out1871 = _mm512_shuffle_f32x4(in2025, tmp14211, 68);
__m512 out1879 = _mm512_shuffle_f32x4(in2025, tmp14211, 238);
__m512 out1872 = _mm512_shuffle_f32x4(tmp14212, in2029, 68);
__m512 out1880 = _mm512_shuffle_f32x4(tmp14212, in2029, 238);
__m512 out1873 = _mm512_shuffle_f32x4(tmp14210, tmp14204, 68);
__m512 out1881 = _mm512_shuffle_f32x4(tmp14210, tmp14204, 238);
__m512 out1874 = _mm512_shuffle_f32x4(tmp14202, in2031, 68);
__m512 out1882 = _mm512_shuffle_f32x4(tmp14202, in2031, 238);
// Same regrouping for the second set of eight result vectors.
__m512 out1875 = _mm512_shuffle_f32x4(in2033, tmp14215, 68);
__m512 out1883 = _mm512_shuffle_f32x4(in2033, tmp14215, 238);
__m512 out1876 = _mm512_shuffle_f32x4(tmp14216, in2037, 68);
__m512 out1884 = _mm512_shuffle_f32x4(tmp14216, in2037, 238);
__m512 out1877 = _mm512_shuffle_f32x4(tmp14214, tmp14208, 68);
__m512 out1885 = _mm512_shuffle_f32x4(tmp14214, tmp14208, 238);
__m512 out1878 = _mm512_shuffle_f32x4(tmp14206, in2039, 68);
__m512 out1886 = _mm512_shuffle_f32x4(tmp14206, in2039, 238);
// Unaligned stores into dfPtr10. All sixteen share the variable part
// 819200*i45+49152*j38+49152*s36+768*k131; only the leading constant differs.
// Within a group the constants step by 64 (256, 320, 384, 448), and each
// following group of four is 204800 further on.
// NOTE(review): offsets look like byte offsets (64 = one __m512) -- confirm dfPtr10's type.
_mm512_storeu_ps(dfPtr10+256+819200*i45+49152*j38+49152*s36+768*k131, out1871);
_mm512_storeu_ps(dfPtr10+384+819200*i45+49152*j38+49152*s36+768*k131, out1879);
_mm512_storeu_ps(dfPtr10+320+819200*i45+49152*j38+49152*s36+768*k131, out1875);
_mm512_storeu_ps(dfPtr10+448+819200*i45+49152*j38+49152*s36+768*k131, out1883);
// Second group: constants 204800 + {256, 384, 320, 448}.
_mm512_storeu_ps(dfPtr10+205056+819200*i45+49152*j38+49152*s36+768*k131, out1872);
_mm512_storeu_ps(dfPtr10+205184+819200*i45+49152*j38+49152*s36+768*k131, out1880);
_mm512_storeu_ps(dfPtr10+205120+819200*i45+49152*j38+49152*s36+768*k131, out1876);
_mm512_storeu_ps(dfPtr10+205248+819200*i45+49152*j38+49152*s36+768*k131, out1884);
// Third group: constants 409600 + {256, 384, 320, 448}.
_mm512_storeu_ps(dfPtr10+409856+819200*i45+49152*j38+49152*s36+768*k131, out1873);
_mm512_storeu_ps(dfPtr10+409984+819200*i45+49152*j38+49152*s36+768*k131, out1881);
_mm512_storeu_ps(dfPtr10+409920+819200*i45+49152*j38+49152*s36+768*k131, out1877);
_mm512_storeu_ps(dfPtr10+410048+819200*i45+49152*j38+49152*s36+768*k131, out1885);
// Fourth group: constants 614400 + {256, 384, 320, 448}.
_mm512_storeu_ps(dfPtr10+614656+819200*i45+49152*j38+49152*s36+768*k131, out1874);
_mm512_storeu_ps(dfPtr10+614784+819200*i45+49152*j38+49152*s36+768*k131, out1882);
_mm512_storeu_ps(dfPtr10+614720+819200*i45+49152*j38+49152*s36+768*k131, out1878);
_mm512_storeu_ps(dfPtr10+614848+819200*i45+49152*j38+49152*s36+768*k131, out1886);
// Gather eight rows of input into two sets of row vectors (in2041..in2048 and
// in2049..in2056). Each row uses three zero-masked loads whose constant offsets
// advance by 112 from one row to the next:
//   mask 31   -> lanes 0..4 loaded,  lanes 5..15 zeroed
//   mask 8191 -> lanes 0..12 loaded, lanes 13..15 zeroed
//   mask 255  -> lanes 0..7 loaded,  lanes 8..15 zeroed
// NOTE(review): 112 presumably is the row pitch of datPtr23 -- confirm against the caller.
__m512 dat2087 = _mm512_maskz_loadu_ps(31, datPtr23+3184+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2088 = _mm512_maskz_loadu_ps(8191, datPtr23+3764+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2089 = _mm512_maskz_loadu_ps(255, datPtr23+3808+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
// pm204 (arguments are listed from lane 15 down to lane 0): lanes 0..4 take
// lanes 0..4 of the first source; lanes 5..8 take its lane 15, which the
// mask-31 load zeroed; lanes 9..15 take lanes 0..6 of the second source
// (indices 16..22).
__m512i pm204 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in2041 = _mm512_permutex2var_ps(dat2087, pm204, dat2088);
// pm205: lanes 0..7 take lanes 5..12 of the first source; lanes 8..15 take
// lanes 0..7 of the second source (indices 16..23).
__m512i pm205 = _mm512_set_epi32(23, 22, 21, 20, 19, 18, 17, 16, 12, 11, 10, 9, 8, 7, 6, 5);
__m512 in2049 = _mm512_permutex2var_ps(dat2088, pm205, dat2089);
// Rows 1..7: identical load/permute pattern, constants advanced by 112 per row.
__m512 dat2090 = _mm512_maskz_loadu_ps(31, datPtr23+3296+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2091 = _mm512_maskz_loadu_ps(8191, datPtr23+3876+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2092 = _mm512_maskz_loadu_ps(255, datPtr23+3920+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2042 = _mm512_permutex2var_ps(dat2090, pm204, dat2091);
__m512 in2050 = _mm512_permutex2var_ps(dat2091, pm205, dat2092);
__m512 dat2093 = _mm512_maskz_loadu_ps(31, datPtr23+3408+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2094 = _mm512_maskz_loadu_ps(8191, datPtr23+3988+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2095 = _mm512_maskz_loadu_ps(255, datPtr23+4032+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2043 = _mm512_permutex2var_ps(dat2093, pm204, dat2094);
__m512 in2051 = _mm512_permutex2var_ps(dat2094, pm205, dat2095);
__m512 dat2096 = _mm512_maskz_loadu_ps(31, datPtr23+3520+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2097 = _mm512_maskz_loadu_ps(8191, datPtr23+4100+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2098 = _mm512_maskz_loadu_ps(255, datPtr23+4144+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2044 = _mm512_permutex2var_ps(dat2096, pm204, dat2097);
__m512 in2052 = _mm512_permutex2var_ps(dat2097, pm205, dat2098);
__m512 dat2099 = _mm512_maskz_loadu_ps(31, datPtr23+3632+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2100 = _mm512_maskz_loadu_ps(8191, datPtr23+4212+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2101 = _mm512_maskz_loadu_ps(255, datPtr23+4256+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2045 = _mm512_permutex2var_ps(dat2099, pm204, dat2100);
__m512 in2053 = _mm512_permutex2var_ps(dat2100, pm205, dat2101);
__m512 dat2102 = _mm512_maskz_loadu_ps(31, datPtr23+3744+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2103 = _mm512_maskz_loadu_ps(8191, datPtr23+4324+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2104 = _mm512_maskz_loadu_ps(255, datPtr23+4368+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2046 = _mm512_permutex2var_ps(dat2102, pm204, dat2103);
__m512 in2054 = _mm512_permutex2var_ps(dat2103, pm205, dat2104);
__m512 dat2105 = _mm512_maskz_loadu_ps(31, datPtr23+3856+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2106 = _mm512_maskz_loadu_ps(8191, datPtr23+4436+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2107 = _mm512_maskz_loadu_ps(255, datPtr23+4480+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2047 = _mm512_permutex2var_ps(dat2105, pm204, dat2106);
__m512 in2055 = _mm512_permutex2var_ps(dat2106, pm205, dat2107);
__m512 dat2108 = _mm512_maskz_loadu_ps(31, datPtr23+3968+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2109 = _mm512_maskz_loadu_ps(8191, datPtr23+4548+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 dat2110 = _mm512_maskz_loadu_ps(255, datPtr23+4592+401408*i45+112*h47+4*w59+401408*s36+6272*k131);
__m512 in2048 = _mm512_permutex2var_ps(dat2108, pm204, dat2109);
__m512 in2056 = _mm512_permutex2var_ps(dat2109, pm205, dat2110);
// First 1-D transform pass, applied lane-wise across the eight row vectors of
// each set. Statements come in pairs: the first of each pair handles the set
// x0..x7 = in2041..in2048, the second the set x0..x7 = in2049..in2056.
// Registers are reused in place, so statement order matters.
// Results (up to FMA rounding), first-set name / second-set name:
//   y0 =  x0 - 5.25*x2 + 5.25*x4 - x6                        -> in2041   / in2049
//   y1 =  x1 + x2 - 4.25*x3 - 4.25*x4 + x5 + x6              -> tmp14267 / tmp14271
//   y2 = -x1 + x2 + 4.25*x3 - 4.25*x4 - x5 + x6              -> tmp14268 / tmp14272
//   y3 =  0.5*x1 + 0.25*x2 - 2.5*x3 - 1.25*x4 + 2*x5 + x6    -> in2047   / in2055
//   y4 = -0.5*x1 + 0.25*x2 + 2.5*x3 - 1.25*x4 - 2*x5 + x6    -> tmp14266 / tmp14270
//   y5 =  2*x1 + 4*x2 - 2.5*x3 - 5*x4 + 0.5*x5 + x6          -> in2043   / in2051
//   y6 = -2*x1 + 4*x2 + 2.5*x3 - 5*x4 - 0.5*x5 + x6          -> in2045   / in2053
//   y7 = -x1 + 5.25*x3 - 5.25*x5 + x7                        -> in2044   / in2052
// NOTE(review): these coefficients coincide with the 8-point Winograd F(6,3)
// input-transform matrix; confirm against the generator before relying on that name.
__m512 tmp14265 = _mm512_add_ps(in2042, in2046);
__m512 tmp14269 = _mm512_add_ps(in2050, in2054);
__m512 tmp14266 = _mm512_sub_ps(in2045, in2043);
__m512 tmp14270 = _mm512_sub_ps(in2053, in2051);
__m512 tmp14267 = _mm512_add_ps(in2043, in2047);
__m512 tmp14271 = _mm512_add_ps(in2051, in2055);
in2041 = _mm512_sub_ps(in2041, in2047);
in2049 = _mm512_sub_ps(in2049, in2055);
tmp14265 = _mm512_fmadd_ps(in2044, _mm512_set1_ps(-4.25e+00f), tmp14265);
tmp14269 = _mm512_fmadd_ps(in2052, _mm512_set1_ps(-4.25e+00f), tmp14269);
tmp14267 = _mm512_fmadd_ps(in2045, _mm512_set1_ps(-4.25e+00f), tmp14267);
tmp14271 = _mm512_fmadd_ps(in2053, _mm512_set1_ps(-4.25e+00f), tmp14271);
// y0 is complete after this pair.
in2041 = _mm512_fmadd_ps(tmp14266, _mm512_set1_ps(5.25e+00f), in2041);
in2049 = _mm512_fmadd_ps(tmp14270, _mm512_set1_ps(5.25e+00f), in2049);
tmp14266 = _mm512_fmadd_ps(in2043, _mm512_set1_ps(2.5e-01f), in2047);
tmp14270 = _mm512_fmadd_ps(in2051, _mm512_set1_ps(2.5e-01f), in2055);
in2043 = _mm512_fmadd_ps(in2043, _mm512_set1_ps(4e+00f), in2047);
in2051 = _mm512_fmadd_ps(in2051, _mm512_set1_ps(4e+00f), in2055);
// y2 (difference) and y1 (sum) of the two partial sums built above.
__m512 tmp14268 = _mm512_sub_ps(tmp14267, tmp14265);
__m512 tmp14272 = _mm512_sub_ps(tmp14271, tmp14269);
tmp14267 = _mm512_add_ps(tmp14265, tmp14267);
tmp14271 = _mm512_add_ps(tmp14269, tmp14271);
tmp14265 = _mm512_fmadd_ps(in2042, _mm512_set1_ps(2.5e-01f), in2046);
tmp14269 = _mm512_fmadd_ps(in2050, _mm512_set1_ps(2.5e-01f), in2054);
tmp14266 = _mm512_fmadd_ps(in2045, _mm512_set1_ps(-1.25e+00f), tmp14266);
tmp14270 = _mm512_fmadd_ps(in2053, _mm512_set1_ps(-1.25e+00f), tmp14270);
in2045 = _mm512_fmadd_ps(in2045, _mm512_set1_ps(-5e+00f), in2043);
in2053 = _mm512_fmadd_ps(in2053, _mm512_set1_ps(-5e+00f), in2051);
tmp14265 = _mm512_fmadd_ps(in2044, _mm512_set1_ps(-1.25e+00f), tmp14265);
tmp14269 = _mm512_fmadd_ps(in2052, _mm512_set1_ps(-1.25e+00f), tmp14269);
// y3 = even part + 2*odd part, y4 = even part - 2*odd part.
in2047 = _mm512_fmadd_ps(tmp14265, _mm512_set1_ps(2e+00f), tmp14266);
in2055 = _mm512_fmadd_ps(tmp14269, _mm512_set1_ps(2e+00f), tmp14270);
tmp14266 = _mm512_fnmadd_ps(tmp14265, _mm512_set1_ps(2e+00f), tmp14266);
tmp14270 = _mm512_fnmadd_ps(tmp14269, _mm512_set1_ps(2e+00f), tmp14270);
tmp14265 = _mm512_fmadd_ps(in2046, _mm512_set1_ps(2.5e-01f), in2042);
tmp14269 = _mm512_fmadd_ps(in2054, _mm512_set1_ps(2.5e-01f), in2050);
// x7 - x1, consumed by y7 below; x1 is no longer needed afterwards.
in2042 = _mm512_sub_ps(in2048, in2042);
in2050 = _mm512_sub_ps(in2056, in2050);
tmp14265 = _mm512_fmadd_ps(in2044, _mm512_set1_ps(-1.25e+00f), tmp14265);
tmp14269 = _mm512_fmadd_ps(in2052, _mm512_set1_ps(-1.25e+00f), tmp14269);
in2044 = _mm512_sub_ps(in2044, in2046);
in2052 = _mm512_sub_ps(in2052, in2054);
// y7 is complete after this pair.
in2044 = _mm512_fmadd_ps(in2044, _mm512_set1_ps(5.25e+00f), in2042);
in2052 = _mm512_fmadd_ps(in2052, _mm512_set1_ps(5.25e+00f), in2050);
// y5 = even part + 2*odd part, y6 = even part - 2*odd part.
in2043 = _mm512_fmadd_ps(tmp14265, _mm512_set1_ps(2e+00f), in2045);
in2051 = _mm512_fmadd_ps(tmp14269, _mm512_set1_ps(2e+00f), in2053);
in2045 = _mm512_fnmadd_ps(tmp14265, _mm512_set1_ps(2e+00f), in2045);
in2053 = _mm512_fnmadd_ps(tmp14269, _mm512_set1_ps(2e+00f), in2053);
// 16x16 transpose of 32-bit lanes. Input rows r0..r15, in order:
//   in2041, tmp14267, tmp14268, in2047, tmp14266, in2043, in2045, in2044,
//   in2049, tmp14271, tmp14272, in2055, tmp14270, in2051, in2053, in2052.
// After the block the same sixteen names, in the same order, hold columns
// c0..c15, where lane j of c_k equals lane k of r_j.
//
// Stage 1: interleave 32-bit elements of adjacent row pairs within each 128-bit lane.
__m512 tmp14281 = _mm512_unpacklo_ps(in2041, tmp14267);
__m512 tmp14282 = _mm512_unpackhi_ps(in2041, tmp14267);
__m512 tmp14283 = _mm512_unpacklo_ps(tmp14268, in2047);
__m512 tmp14284 = _mm512_unpackhi_ps(tmp14268, in2047);
__m512 tmp14285 = _mm512_unpacklo_ps(tmp14266, in2043);
__m512 tmp14286 = _mm512_unpackhi_ps(tmp14266, in2043);
__m512 tmp14287 = _mm512_unpacklo_ps(in2045, in2044);
__m512 tmp14288 = _mm512_unpackhi_ps(in2045, in2044);
__m512 tmp14289 = _mm512_unpacklo_ps(in2049, tmp14271);
__m512 tmp14290 = _mm512_unpackhi_ps(in2049, tmp14271);
__m512 tmp14291 = _mm512_unpacklo_ps(tmp14272, in2055);
__m512 tmp14292 = _mm512_unpackhi_ps(tmp14272, in2055);
__m512 tmp14293 = _mm512_unpacklo_ps(tmp14270, in2051);
__m512 tmp14294 = _mm512_unpackhi_ps(tmp14270, in2051);
__m512 tmp14295 = _mm512_unpacklo_ps(in2053, in2052);
__m512 tmp14296 = _mm512_unpackhi_ps(in2053, in2052);
// Stage 2: combine 64-bit halves. Immediate 68 keeps elements {a0,a1,b0,b1} of
// every 128-bit lane, 238 keeps {a2,a3,b2,b3}; each lane then holds one column
// restricted to four consecutive rows.
__m512 tmp14297 = _mm512_shuffle_ps(tmp14281, tmp14283, 68);
__m512 tmp14298 = _mm512_shuffle_ps(tmp14281, tmp14283, 238);
__m512 tmp14299 = _mm512_shuffle_ps(tmp14282, tmp14284, 68);
__m512 tmp14300 = _mm512_shuffle_ps(tmp14282, tmp14284, 238);
__m512 tmp14301 = _mm512_shuffle_ps(tmp14285, tmp14287, 68);
__m512 tmp14302 = _mm512_shuffle_ps(tmp14285, tmp14287, 238);
__m512 tmp14303 = _mm512_shuffle_ps(tmp14286, tmp14288, 68);
__m512 tmp14304 = _mm512_shuffle_ps(tmp14286, tmp14288, 238);
__m512 tmp14305 = _mm512_shuffle_ps(tmp14289, tmp14291, 68);
__m512 tmp14306 = _mm512_shuffle_ps(tmp14289, tmp14291, 238);
__m512 tmp14307 = _mm512_shuffle_ps(tmp14290, tmp14292, 68);
__m512 tmp14308 = _mm512_shuffle_ps(tmp14290, tmp14292, 238);
__m512 tmp14309 = _mm512_shuffle_ps(tmp14293, tmp14295, 68);
__m512 tmp14310 = _mm512_shuffle_ps(tmp14293, tmp14295, 238);
__m512 tmp14311 = _mm512_shuffle_ps(tmp14294, tmp14296, 68);
__m512 tmp14312 = _mm512_shuffle_ps(tmp14294, tmp14296, 238);
// Stage 3: rearrange whole 128-bit lanes. Immediate 136 selects lanes
// {a0,a2,b0,b2}, 221 selects {a1,a3,b1,b3}.
__m512 tmp14313 = _mm512_shuffle_f32x4(tmp14297, tmp14301, 136);
__m512 tmp14314 = _mm512_shuffle_f32x4(tmp14297, tmp14301, 221);
__m512 tmp14315 = _mm512_shuffle_f32x4(tmp14298, tmp14302, 136);
__m512 tmp14316 = _mm512_shuffle_f32x4(tmp14298, tmp14302, 221);
__m512 tmp14317 = _mm512_shuffle_f32x4(tmp14299, tmp14303, 136);
__m512 tmp14318 = _mm512_shuffle_f32x4(tmp14299, tmp14303, 221);
__m512 tmp14319 = _mm512_shuffle_f32x4(tmp14300, tmp14304, 136);
__m512 tmp14320 = _mm512_shuffle_f32x4(tmp14300, tmp14304, 221);
__m512 tmp14321 = _mm512_shuffle_f32x4(tmp14305, tmp14309, 136);
__m512 tmp14322 = _mm512_shuffle_f32x4(tmp14305, tmp14309, 221);
__m512 tmp14323 = _mm512_shuffle_f32x4(tmp14306, tmp14310, 136);
__m512 tmp14324 = _mm512_shuffle_f32x4(tmp14306, tmp14310, 221);
__m512 tmp14325 = _mm512_shuffle_f32x4(tmp14307, tmp14311, 136);
__m512 tmp14326 = _mm512_shuffle_f32x4(tmp14307, tmp14311, 221);
__m512 tmp14327 = _mm512_shuffle_f32x4(tmp14308, tmp14312, 136);
__m512 tmp14328 = _mm512_shuffle_f32x4(tmp14308, tmp14312, 221);
// Stage 4: final 128-bit lane rearrangement. In each pair the 136 result is
// column k (k = 0..7) and the 221 result is column k+8.
in2041 = _mm512_shuffle_f32x4(tmp14313, tmp14321, 136);
in2049 = _mm512_shuffle_f32x4(tmp14313, tmp14321, 221);
tmp14267 = _mm512_shuffle_f32x4(tmp14315, tmp14323, 136);
tmp14271 = _mm512_shuffle_f32x4(tmp14315, tmp14323, 221);
tmp14268 = _mm512_shuffle_f32x4(tmp14317, tmp14325, 136);
tmp14272 = _mm512_shuffle_f32x4(tmp14317, tmp14325, 221);
in2047 = _mm512_shuffle_f32x4(tmp14319, tmp14327, 136);
in2055 = _mm512_shuffle_f32x4(tmp14319, tmp14327, 221);
tmp14266 = _mm512_shuffle_f32x4(tmp14314, tmp14322, 136);
tmp14270 = _mm512_shuffle_f32x4(tmp14314, tmp14322, 221);
in2043 = _mm512_shuffle_f32x4(tmp14316, tmp14324, 136);
in2051 = _mm512_shuffle_f32x4(tmp14316, tmp14324, 221);
in2045 = _mm512_shuffle_f32x4(tmp14318, tmp14326, 136);
in2053 = _mm512_shuffle_f32x4(tmp14318, tmp14326, 221);
in2044 = _mm512_shuffle_f32x4(tmp14320, tmp14328, 136);
in2052 = _mm512_shuffle_f32x4(tmp14320, tmp14328, 221);
// Second 1-D transform pass. It applies the same eight linear combinations as
// listed below, lane-wise, to the vectors as they stand on entry to this block.
// Statements come in pairs, one per set:
//   first set : x0..x7 = in2041, tmp14267, tmp14268, in2047, tmp14266, in2043, in2045, in2044
//   second set: x0..x7 = in2049, tmp14271, tmp14272, in2055, tmp14270, in2051, in2053, in2052
// Registers are reused in place, so statement order matters.
// Results (up to FMA rounding), first-set name / second-set name:
//   y0 =  x0 - 5.25*x2 + 5.25*x4 - x6                        -> in2041   / in2049
//   y1 =  x1 + x2 - 4.25*x3 - 4.25*x4 + x5 + x6              -> tmp14275 / tmp14279
//   y2 = -x1 + x2 + 4.25*x3 - 4.25*x4 - x5 + x6              -> tmp14276 / tmp14280
//   y3 =  0.5*x1 + 0.25*x2 - 2.5*x3 - 1.25*x4 + 2*x5 + x6    -> in2045   / in2053
//   y4 = -0.5*x1 + 0.25*x2 + 2.5*x3 - 1.25*x4 - 2*x5 + x6    -> tmp14274 / tmp14278
//   y5 =  2*x1 + 4*x2 - 2.5*x3 - 5*x4 + 0.5*x5 + x6          -> tmp14268 / tmp14272
//   y6 = -2*x1 + 4*x2 + 2.5*x3 - 5*x4 - 0.5*x5 + x6          -> tmp14266 / tmp14270
//   y7 = -x1 + 5.25*x3 - 5.25*x5 + x7                        -> in2047   / in2055
__m512 tmp14273 = _mm512_add_ps(tmp14267, in2043);
__m512 tmp14277 = _mm512_add_ps(tmp14271, in2051);
__m512 tmp14274 = _mm512_sub_ps(tmp14266, tmp14268);
__m512 tmp14278 = _mm512_sub_ps(tmp14270, tmp14272);
__m512 tmp14275 = _mm512_add_ps(tmp14268, in2045);
__m512 tmp14279 = _mm512_add_ps(tmp14272, in2053);
in2041 = _mm512_sub_ps(in2041, in2045);
in2049 = _mm512_sub_ps(in2049, in2053);
tmp14273 = _mm512_fmadd_ps(in2047, _mm512_set1_ps(-4.25e+00f), tmp14273);
tmp14277 = _mm512_fmadd_ps(in2055, _mm512_set1_ps(-4.25e+00f), tmp14277);
tmp14275 = _mm512_fmadd_ps(tmp14266, _mm512_set1_ps(-4.25e+00f), tmp14275);
tmp14279 = _mm512_fmadd_ps(tmp14270, _mm512_set1_ps(-4.25e+00f), tmp14279);
// y0 is complete after this pair.
in2041 = _mm512_fmadd_ps(tmp14274, _mm512_set1_ps(5.25e+00f), in2041);
in2049 = _mm512_fmadd_ps(tmp14278, _mm512_set1_ps(5.25e+00f), in2049);
tmp14274 = _mm512_fmadd_ps(tmp14268, _mm512_set1_ps(2.5e-01f), in2045);
tmp14278 = _mm512_fmadd_ps(tmp14272, _mm512_set1_ps(2.5e-01f), in2053);
tmp14268 = _mm512_fmadd_ps(tmp14268, _mm512_set1_ps(4e+00f), in2045);
tmp14272 = _mm512_fmadd_ps(tmp14272, _mm512_set1_ps(4e+00f), in2053);
// y2 (difference) and y1 (sum) of the two partial sums built above.
__m512 tmp14276 = _mm512_sub_ps(tmp14275, tmp14273);
__m512 tmp14280 = _mm512_sub_ps(tmp14279, tmp14277);
tmp14275 = _mm512_add_ps(tmp14273, tmp14275);
tmp14279 = _mm512_add_ps(tmp14277, tmp14279);
tmp14273 = _mm512_fmadd_ps(tmp14267, _mm512_set1_ps(2.5e-01f), in2043);
tmp14277 = _mm512_fmadd_ps(tmp14271, _mm512_set1_ps(2.5e-01f), in2051);
tmp14274 = _mm512_fmadd_ps(tmp14266, _mm512_set1_ps(-1.25e+00f), tmp14274);
tmp14278 = _mm512_fmadd_ps(tmp14270, _mm512_set1_ps(-1.25e+00f), tmp14278);
tmp14266 = _mm512_fmadd_ps(tmp14266, _mm512_set1_ps(-5e+00f), tmp14268);
tmp14270 = _mm512_fmadd_ps(tmp14270, _mm512_set1_ps(-5e+00f), tmp14272);
tmp14273 = _mm512_fmadd_ps(in2047, _mm512_set1_ps(-1.25e+00f), tmp14273);
tmp14277 = _mm512_fmadd_ps(in2055, _mm512_set1_ps(-1.25e+00f), tmp14277);
// y3 = even part + 2*odd part, y4 = even part - 2*odd part.
in2045 = _mm512_fmadd_ps(tmp14273, _mm512_set1_ps(2e+00f), tmp14274);
in2053 = _mm512_fmadd_ps(tmp14277, _mm512_set1_ps(2e+00f), tmp14278);
tmp14274 = _mm512_fnmadd_ps(tmp14273, _mm512_set1_ps(2e+00f), tmp14274);
tmp14278 = _mm512_fnmadd_ps(tmp14277, _mm512_set1_ps(2e+00f), tmp14278);
tmp14273 = _mm512_fmadd_ps(in2043, _mm512_set1_ps(2.5e-01f), tmp14267);
tmp14277 = _mm512_fmadd_ps(in2051, _mm512_set1_ps(2.5e-01f), tmp14271);
// x7 - x1, consumed by y7 below; x1 is no longer needed afterwards.
tmp14267 = _mm512_sub_ps(in2044, tmp14267);
tmp14271 = _mm512_sub_ps(in2052, tmp14271);
tmp14273 = _mm512_fmadd_ps(in2047, _mm512_set1_ps(-1.25e+00f), tmp14273);
tmp14277 = _mm512_fmadd_ps(in2055, _mm512_set1_ps(-1.25e+00f), tmp14277);
in2047 = _mm512_sub_ps(in2047, in2043);
in2055 = _mm512_sub_ps(in2055, in2051);
// y7 is complete after this pair.
in2047 = _mm512_fmadd_ps(in2047, _mm512_set1_ps(5.25e+00f), tmp14267);
in2055 = _mm512_fmadd_ps(in2055, _mm512_set1_ps(5.25e+00f), tmp14271);
// y5 = even part + 2*odd part, y6 = even part - 2*odd part.
tmp14268 = _mm512_fmadd_ps(tmp14273, _mm512_set1_ps(2e+00f), tmp14266);
tmp14272 = _mm512_fmadd_ps(tmp14277, _mm512_set1_ps(2e+00f), tmp14270);
tmp14266 = _mm512_fnmadd_ps(tmp14273, _mm512_set1_ps(2e+00f), tmp14266);
tmp14270 = _mm512_fnmadd_ps(tmp14277, _mm512_set1_ps(2e+00f), tmp14270);
// Regroup the sixteen result vectors into store order.
// _mm512_shuffle_f32x4(a, b, 68)  -> { a[0..7],  b[0..7]  } (low 256 bits of a, then of b)
// _mm512_shuffle_f32x4(a, b, 238) -> { a[8..15], b[8..15] } (high 256 bits of a, then of b)
// Each out* therefore pairs the same half of two result vectors.
__m512 out1887 = _mm512_shuffle_f32x4(in2041, tmp14275, 68);
__m512 out1895 = _mm512_shuffle_f32x4(in2041, tmp14275, 238);
__m512 out1888 = _mm512_shuffle_f32x4(tmp14276, in2045, 68);
__m512 out1896 = _mm512_shuffle_f32x4(tmp14276, in2045, 238);
__m512 out1889 = _mm512_shuffle_f32x4(tmp14274, tmp14268, 68);
__m512 out1897 = _mm512_shuffle_f32x4(tmp14274, tmp14268, 238);
__m512 out1890 = _mm512_shuffle_f32x4(tmp14266, in2047, 68);
__m512 out1898 = _mm512_shuffle_f32x4(tmp14266, in2047, 238);
// Same regrouping for the second set of eight result vectors.
__m512 out1891 = _mm512_shuffle_f32x4(in2049, tmp14279, 68);
__m512 out1899 = _mm512_shuffle_f32x4(in2049, tmp14279, 238);
__m512 out1892 = _mm512_shuffle_f32x4(tmp14280, in2053, 68);
__m512 out1900 = _mm512_shuffle_f32x4(tmp14280, in2053, 238);
__m512 out1893 = _mm512_shuffle_f32x4(tmp14278, tmp14272, 68);
__m512 out1901 = _mm512_shuffle_f32x4(tmp14278, tmp14272, 238);
__m512 out1894 = _mm512_shuffle_f32x4(tmp14270, in2055, 68);
__m512 out1902 = _mm512_shuffle_f32x4(tmp14270, in2055, 238);
// Unaligned stores into dfPtr10. All sixteen share the variable part
// 819200*i45+49152*j38+49152*s36+768*k131; only the leading constant differs.
// Within a group the constants step by 64 (512, 576, 640, 704), and each
// following group of four is 204800 further on.
// NOTE(review): offsets look like byte offsets (64 = one __m512) -- confirm dfPtr10's type.
_mm512_storeu_ps(dfPtr10+512+819200*i45+49152*j38+49152*s36+768*k131, out1887);
_mm512_storeu_ps(dfPtr10+640+819200*i45+49152*j38+49152*s36+768*k131, out1895);
_mm512_storeu_ps(dfPtr10+576+819200*i45+49152*j38+49152*s36+768*k131, out1891);
_mm512_storeu_ps(dfPtr10+704+819200*i45+49152*j38+49152*s36+768*k131, out1899);
// Second group: constants 204800 + {512, 640, 576, 704}.
_mm512_storeu_ps(dfPtr10+205312+819200*i45+49152*j38+49152*s36+768*k131, out1888);
_mm512_storeu_ps(dfPtr10+205440+819200*i45+49152*j38+49152*s36+768*k131, out1896);
_mm512_storeu_ps(dfPtr10+205376+819200*i45+49152*j38+49152*s36+768*k131, out1892);
_mm512_storeu_ps(dfPtr10+205504+819200*i45+49152*j38+49152*s36+768*k131, out1900);
// Third group: constants 409600 + {512, 640, 576, 704}.
_mm512_storeu_ps(dfPtr10+410112+819200*i45+49152*j38+49152*s36+768*k131, out1889);
_mm512_storeu_ps(dfPtr10+410240+819200*i45+49152*j38+49152*s36+768*k131, out1897);
_mm512_storeu_ps(dfPtr10+410176+819200*i45+49152*j38+49152*s36+768*k131, out1893);
_mm512_storeu_ps(dfPtr10+410304+819200*i45+49152*j38+49152*s36+768*k131, out1901);
// Fourth group: constants 614400 + {512, 640, 576, 704}.
_mm512_storeu_ps(dfPtr10+614912+819200*i45+49152*j38+49152*s36+768*k131, out1890);
_mm512_storeu_ps(dfPtr10+615040+819200*i45+49152*j38+49152*s36+768*k131, out1898);
_mm512_storeu_ps(dfPtr10+614976+819200*i45+49152*j38+49152*s36+768*k131, out1894);
_mm512_storeu_ps(dfPtr10+615104+819200*i45+49152*j38+49152*s36+768*k131, out1902);
}
if (j38 >= last9) return;
++j38;
rel21 = 3;
}
if (rel21 < 4) {
ptrdiff_t h48 = base21+18;
ptrdiff_t w60 = 18;
ptrdiff_t k132 = 0;
for (; k132 != 64; ++k132) {
// Gather the input rows for this tile into two sets of row vectors.
// First set : eight rows, in2057..in2064, loaded at constants 0, 112, ..., 784.
// Second set: only five rows, in2065..in2069, loaded at constants 604, 716, ..., 1052.
// Zero-masked loads: mask 2047 keeps lanes 0..10 (lanes 11..15 become 0);
// mask 8191 keeps lanes 0..12 (lanes 13..15 become 0).
// NOTE(review): the missing rows 5..7 of the second set presumably fall outside
// the image (padding) -- confirm against the generator.
__m512 dat2111 = _mm512_maskz_loadu_ps(2047, datPtr23+0+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2112 = _mm512_maskz_loadu_ps(8191, datPtr23+604+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
// pm206 (arguments are listed from lane 15 down to lane 0): lanes 0..7 take
// source lanes 0..7, lanes 8..12 take source lanes 6..10, and lanes 13..15
// take source lane 15, which the mask-2047 load zeroed.
__m512i pm206 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in2057 = _mm512_permutexvar_ps(pm206, dat2111);
// pm207: lane 0 takes source lane 15 (zeroed by the mask-8191 load), lanes 1..7
// take source lanes 0..6, lanes 8..15 take source lanes 5..12.
__m512i pm207 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in2065 = _mm512_permutexvar_ps(pm207, dat2112);
// Rows 1..4: one load and permute per set, constants advanced by 112 per row.
__m512 dat2113 = _mm512_maskz_loadu_ps(2047, datPtr23+112+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2114 = _mm512_maskz_loadu_ps(8191, datPtr23+716+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2058 = _mm512_permutexvar_ps(pm206, dat2113);
__m512 in2066 = _mm512_permutexvar_ps(pm207, dat2114);
__m512 dat2115 = _mm512_maskz_loadu_ps(2047, datPtr23+224+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2116 = _mm512_maskz_loadu_ps(8191, datPtr23+828+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2059 = _mm512_permutexvar_ps(pm206, dat2115);
__m512 in2067 = _mm512_permutexvar_ps(pm207, dat2116);
__m512 dat2117 = _mm512_maskz_loadu_ps(2047, datPtr23+336+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2118 = _mm512_maskz_loadu_ps(8191, datPtr23+940+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2060 = _mm512_permutexvar_ps(pm206, dat2117);
__m512 in2068 = _mm512_permutexvar_ps(pm207, dat2118);
__m512 dat2119 = _mm512_maskz_loadu_ps(2047, datPtr23+448+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2120 = _mm512_maskz_loadu_ps(8191, datPtr23+1052+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2061 = _mm512_permutexvar_ps(pm206, dat2119);
__m512 in2069 = _mm512_permutexvar_ps(pm207, dat2120);
// Rows 5..7 exist only for the first set.
__m512 dat2121 = _mm512_maskz_loadu_ps(2047, datPtr23+560+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2062 = _mm512_permutexvar_ps(pm206, dat2121);
__m512 dat2122 = _mm512_maskz_loadu_ps(2047, datPtr23+672+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2063 = _mm512_permutexvar_ps(pm206, dat2122);
__m512 dat2123 = _mm512_maskz_loadu_ps(2047, datPtr23+784+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2064 = _mm512_permutexvar_ps(pm206, dat2123);
// First 1-D transform pass, applied lane-wise across the row vectors of each
// set. Statements come in pairs: the first of each pair handles the full set
// x0..x7 = in2057..in2064; the second handles the set that has only five rows,
// x0..x4 = in2065..in2069. For that second set every term involving x5, x6 or
// x7 is simply omitted (equivalent to treating those rows as zero), which is
// why some of its statements degenerate into plain copies, multiplies or
// self-assignments. Registers are reused in place, so statement order matters.
// Results (up to FMA rounding), first-set name / second-set name:
//   y0 =  x0 - 5.25*x2 + 5.25*x4 - x6                        -> in2057   / in2065
//   y1 =  x1 + x2 - 4.25*x3 - 4.25*x4 + x5 + x6              -> tmp14331 / tmp14335
//   y2 = -x1 + x2 + 4.25*x3 - 4.25*x4 - x5 + x6              -> tmp14332 / tmp14336
//   y3 =  0.5*x1 + 0.25*x2 - 2.5*x3 - 1.25*x4 + 2*x5 + x6    -> in2063   / tmp14337
//   y4 = -0.5*x1 + 0.25*x2 + 2.5*x3 - 1.25*x4 - 2*x5 + x6    -> tmp14330 / tmp14334
//   y5 =  2*x1 + 4*x2 - 2.5*x3 - 5*x4 + 0.5*x5 + x6          -> in2059   / in2067
//   y6 = -2*x1 + 4*x2 + 2.5*x3 - 5*x4 - 0.5*x5 + x6          -> in2061   / in2069
//   y7 = -x1 + 5.25*x3 - 5.25*x5 + x7                        -> in2060   / in2068
// NOTE(review): these coefficients coincide with the 8-point Winograd F(6,3)
// input-transform matrix; confirm against the generator before relying on that name.
__m512 tmp14329 = _mm512_add_ps(in2058, in2062);
// Second set: x1 + x5 with x5 omitted.
__m512 tmp14333 = in2066;
__m512 tmp14330 = _mm512_sub_ps(in2061, in2059);
__m512 tmp14334 = _mm512_sub_ps(in2069, in2067);
__m512 tmp14331 = _mm512_add_ps(in2059, in2063);
// Second set: x2 + x6 with x6 omitted.
__m512 tmp14335 = in2067;
in2057 = _mm512_sub_ps(in2057, in2063);
// Second set: x0 - x6 with x6 omitted; the self-assignment is a no-op.
in2065 = in2065;
tmp14329 = _mm512_fmadd_ps(in2060, _mm512_set1_ps(-4.25e+00f), tmp14329);
tmp14333 = _mm512_fmadd_ps(in2068, _mm512_set1_ps(-4.25e+00f), tmp14333);
tmp14331 = _mm512_fmadd_ps(in2061, _mm512_set1_ps(-4.25e+00f), tmp14331);
tmp14335 = _mm512_fmadd_ps(in2069, _mm512_set1_ps(-4.25e+00f), tmp14335);
// y0 is complete after this pair.
in2057 = _mm512_fmadd_ps(tmp14330, _mm512_set1_ps(5.25e+00f), in2057);
in2065 = _mm512_fmadd_ps(tmp14334, _mm512_set1_ps(5.25e+00f), in2065);
tmp14330 = _mm512_fmadd_ps(in2059, _mm512_set1_ps(2.5e-01f), in2063);
// Second set: 0.25*x2 + x6 and 4*x2 + x6 reduce to plain multiplies.
tmp14334 = _mm512_mul_ps(in2067, _mm512_set1_ps(2.5e-01f));
in2059 = _mm512_fmadd_ps(in2059, _mm512_set1_ps(4e+00f), in2063);
in2067 = _mm512_mul_ps(in2067, _mm512_set1_ps(4e+00f));
// y2 (difference) and y1 (sum) of the two partial sums built above.
__m512 tmp14332 = _mm512_sub_ps(tmp14331, tmp14329);
__m512 tmp14336 = _mm512_sub_ps(tmp14335, tmp14333);
tmp14331 = _mm512_add_ps(tmp14329, tmp14331);
tmp14335 = _mm512_add_ps(tmp14333, tmp14335);
tmp14329 = _mm512_fmadd_ps(in2058, _mm512_set1_ps(2.5e-01f), in2062);
tmp14333 = _mm512_mul_ps(in2066, _mm512_set1_ps(2.5e-01f));
tmp14330 = _mm512_fmadd_ps(in2061, _mm512_set1_ps(-1.25e+00f), tmp14330);
tmp14334 = _mm512_fmadd_ps(in2069, _mm512_set1_ps(-1.25e+00f), tmp14334);
in2061 = _mm512_fmadd_ps(in2061, _mm512_set1_ps(-5e+00f), in2059);
in2069 = _mm512_fmadd_ps(in2069, _mm512_set1_ps(-5e+00f), in2067);
tmp14329 = _mm512_fmadd_ps(in2060, _mm512_set1_ps(-1.25e+00f), tmp14329);
tmp14333 = _mm512_fmadd_ps(in2068, _mm512_set1_ps(-1.25e+00f), tmp14333);
// y3 = even part + 2*odd part. The second set has no x6 register to reuse, so
// its y3 goes into the newly declared tmp14337.
in2063 = _mm512_fmadd_ps(tmp14329, _mm512_set1_ps(2e+00f), tmp14330);
__m512 tmp14337 = _mm512_fmadd_ps(tmp14333, _mm512_set1_ps(2e+00f), tmp14334);
// y4 = even part - 2*odd part.
tmp14330 = _mm512_fnmadd_ps(tmp14329, _mm512_set1_ps(2e+00f), tmp14330);
tmp14334 = _mm512_fnmadd_ps(tmp14333, _mm512_set1_ps(2e+00f), tmp14334);
tmp14329 = _mm512_fmadd_ps(in2062, _mm512_set1_ps(2.5e-01f), in2058);
// Second set: 0.25*x5 + x1 with x5 omitted.
tmp14333 = in2066;
in2058 = _mm512_sub_ps(in2064, in2058);
// Second set: x7 - x1 with x7 omitted, i.e. 0 - x1.
in2066 = _mm512_sub_ps(_mm512_setzero_ps(), in2066);
tmp14329 = _mm512_fmadd_ps(in2060, _mm512_set1_ps(-1.25e+00f), tmp14329);
tmp14333 = _mm512_fmadd_ps(in2068, _mm512_set1_ps(-1.25e+00f), tmp14333);
in2060 = _mm512_sub_ps(in2060, in2062);
// Second set: x3 - x5 with x5 omitted; the self-assignment is a no-op.
in2068 = in2068;
// y7 is complete after this pair.
in2060 = _mm512_fmadd_ps(in2060, _mm512_set1_ps(5.25e+00f), in2058);
in2068 = _mm512_fmadd_ps(in2068, _mm512_set1_ps(5.25e+00f), in2066);
// y5 = even part + 2*odd part, y6 = even part - 2*odd part.
in2059 = _mm512_fmadd_ps(tmp14329, _mm512_set1_ps(2e+00f), in2061);
in2067 = _mm512_fmadd_ps(tmp14333, _mm512_set1_ps(2e+00f), in2069);
in2061 = _mm512_fnmadd_ps(tmp14329, _mm512_set1_ps(2e+00f), in2061);
in2069 = _mm512_fnmadd_ps(tmp14333, _mm512_set1_ps(2e+00f), in2069);
// 16x16 transpose of 32-bit lanes. Input rows r0..r15, in order:
//   in2057, tmp14331, tmp14332, in2063, tmp14330, in2059, in2061, in2060,
//   in2065, tmp14335, tmp14336, tmp14337, tmp14334, in2067, in2069, in2068.
// After the block the same sixteen names, in the same order, hold columns
// c0..c15, where lane j of c_k equals lane k of r_j.
//
// Stage 1: interleave 32-bit elements of adjacent row pairs within each 128-bit lane.
__m512 tmp14346 = _mm512_unpacklo_ps(in2057, tmp14331);
__m512 tmp14347 = _mm512_unpackhi_ps(in2057, tmp14331);
__m512 tmp14348 = _mm512_unpacklo_ps(tmp14332, in2063);
__m512 tmp14349 = _mm512_unpackhi_ps(tmp14332, in2063);
__m512 tmp14350 = _mm512_unpacklo_ps(tmp14330, in2059);
__m512 tmp14351 = _mm512_unpackhi_ps(tmp14330, in2059);
__m512 tmp14352 = _mm512_unpacklo_ps(in2061, in2060);
__m512 tmp14353 = _mm512_unpackhi_ps(in2061, in2060);
__m512 tmp14354 = _mm512_unpacklo_ps(in2065, tmp14335);
__m512 tmp14355 = _mm512_unpackhi_ps(in2065, tmp14335);
__m512 tmp14356 = _mm512_unpacklo_ps(tmp14336, tmp14337);
__m512 tmp14357 = _mm512_unpackhi_ps(tmp14336, tmp14337);
__m512 tmp14358 = _mm512_unpacklo_ps(tmp14334, in2067);
__m512 tmp14359 = _mm512_unpackhi_ps(tmp14334, in2067);
__m512 tmp14360 = _mm512_unpacklo_ps(in2069, in2068);
__m512 tmp14361 = _mm512_unpackhi_ps(in2069, in2068);
// Stage 2: combine 64-bit halves. Immediate 68 keeps elements {a0,a1,b0,b1} of
// every 128-bit lane, 238 keeps {a2,a3,b2,b3}; each lane then holds one column
// restricted to four consecutive rows.
__m512 tmp14362 = _mm512_shuffle_ps(tmp14346, tmp14348, 68);
__m512 tmp14363 = _mm512_shuffle_ps(tmp14346, tmp14348, 238);
__m512 tmp14364 = _mm512_shuffle_ps(tmp14347, tmp14349, 68);
__m512 tmp14365 = _mm512_shuffle_ps(tmp14347, tmp14349, 238);
__m512 tmp14366 = _mm512_shuffle_ps(tmp14350, tmp14352, 68);
__m512 tmp14367 = _mm512_shuffle_ps(tmp14350, tmp14352, 238);
__m512 tmp14368 = _mm512_shuffle_ps(tmp14351, tmp14353, 68);
__m512 tmp14369 = _mm512_shuffle_ps(tmp14351, tmp14353, 238);
__m512 tmp14370 = _mm512_shuffle_ps(tmp14354, tmp14356, 68);
__m512 tmp14371 = _mm512_shuffle_ps(tmp14354, tmp14356, 238);
__m512 tmp14372 = _mm512_shuffle_ps(tmp14355, tmp14357, 68);
__m512 tmp14373 = _mm512_shuffle_ps(tmp14355, tmp14357, 238);
__m512 tmp14374 = _mm512_shuffle_ps(tmp14358, tmp14360, 68);
__m512 tmp14375 = _mm512_shuffle_ps(tmp14358, tmp14360, 238);
__m512 tmp14376 = _mm512_shuffle_ps(tmp14359, tmp14361, 68);
__m512 tmp14377 = _mm512_shuffle_ps(tmp14359, tmp14361, 238);
// Stage 3: rearrange whole 128-bit lanes. Immediate 136 selects lanes
// {a0,a2,b0,b2}, 221 selects {a1,a3,b1,b3}.
__m512 tmp14378 = _mm512_shuffle_f32x4(tmp14362, tmp14366, 136);
__m512 tmp14379 = _mm512_shuffle_f32x4(tmp14362, tmp14366, 221);
__m512 tmp14380 = _mm512_shuffle_f32x4(tmp14363, tmp14367, 136);
__m512 tmp14381 = _mm512_shuffle_f32x4(tmp14363, tmp14367, 221);
__m512 tmp14382 = _mm512_shuffle_f32x4(tmp14364, tmp14368, 136);
__m512 tmp14383 = _mm512_shuffle_f32x4(tmp14364, tmp14368, 221);
__m512 tmp14384 = _mm512_shuffle_f32x4(tmp14365, tmp14369, 136);
__m512 tmp14385 = _mm512_shuffle_f32x4(tmp14365, tmp14369, 221);
__m512 tmp14386 = _mm512_shuffle_f32x4(tmp14370, tmp14374, 136);
__m512 tmp14387 = _mm512_shuffle_f32x4(tmp14370, tmp14374, 221);
__m512 tmp14388 = _mm512_shuffle_f32x4(tmp14371, tmp14375, 136);
__m512 tmp14389 = _mm512_shuffle_f32x4(tmp14371, tmp14375, 221);
__m512 tmp14390 = _mm512_shuffle_f32x4(tmp14372, tmp14376, 136);
__m512 tmp14391 = _mm512_shuffle_f32x4(tmp14372, tmp14376, 221);
__m512 tmp14392 = _mm512_shuffle_f32x4(tmp14373, tmp14377, 136);
__m512 tmp14393 = _mm512_shuffle_f32x4(tmp14373, tmp14377, 221);
// Stage 4: final 128-bit lane rearrangement. In each pair the 136 result is
// column k (k = 0..7) and the 221 result is column k+8.
in2057 = _mm512_shuffle_f32x4(tmp14378, tmp14386, 136);
in2065 = _mm512_shuffle_f32x4(tmp14378, tmp14386, 221);
tmp14331 = _mm512_shuffle_f32x4(tmp14380, tmp14388, 136);
tmp14335 = _mm512_shuffle_f32x4(tmp14380, tmp14388, 221);
tmp14332 = _mm512_shuffle_f32x4(tmp14382, tmp14390, 136);
tmp14336 = _mm512_shuffle_f32x4(tmp14382, tmp14390, 221);
in2063 = _mm512_shuffle_f32x4(tmp14384, tmp14392, 136);
tmp14337 = _mm512_shuffle_f32x4(tmp14384, tmp14392, 221);
tmp14330 = _mm512_shuffle_f32x4(tmp14379, tmp14387, 136);
tmp14334 = _mm512_shuffle_f32x4(tmp14379, tmp14387, 221);
in2059 = _mm512_shuffle_f32x4(tmp14381, tmp14389, 136);
in2067 = _mm512_shuffle_f32x4(tmp14381, tmp14389, 221);
in2061 = _mm512_shuffle_f32x4(tmp14383, tmp14391, 136);
in2069 = _mm512_shuffle_f32x4(tmp14383, tmp14391, 221);
in2060 = _mm512_shuffle_f32x4(tmp14385, tmp14393, 136);
in2068 = _mm512_shuffle_f32x4(tmp14385, tmp14393, 221);
// Second 1-D transform pass. It applies the eight linear combinations listed
// below, lane-wise, to the vectors as they stand on entry to this block.
// Statements come in pairs, one per set; both sets use the full eight-term form here:
//   first set : x0..x7 = in2057, tmp14331, tmp14332, in2063, tmp14330, in2059, in2061, in2060
//   second set: x0..x7 = in2065, tmp14335, tmp14336, tmp14337, tmp14334, in2067, in2069, in2068
// Registers are reused in place, so statement order matters.
// Results (up to FMA rounding), first-set name / second-set name:
//   y0 =  x0 - 5.25*x2 + 5.25*x4 - x6                        -> in2057   / in2065
//   y1 =  x1 + x2 - 4.25*x3 - 4.25*x4 + x5 + x6              -> tmp14340 / tmp14344
//   y2 = -x1 + x2 + 4.25*x3 - 4.25*x4 - x5 + x6              -> tmp14341 / tmp14345
//   y3 =  0.5*x1 + 0.25*x2 - 2.5*x3 - 1.25*x4 + 2*x5 + x6    -> in2061   / in2069
//   y4 = -0.5*x1 + 0.25*x2 + 2.5*x3 - 1.25*x4 - 2*x5 + x6    -> tmp14339 / tmp14343
//   y5 =  2*x1 + 4*x2 - 2.5*x3 - 5*x4 + 0.5*x5 + x6          -> tmp14332 / tmp14336
//   y6 = -2*x1 + 4*x2 + 2.5*x3 - 5*x4 - 0.5*x5 + x6          -> tmp14330 / tmp14334
//   y7 = -x1 + 5.25*x3 - 5.25*x5 + x7                        -> in2063   / tmp14337
__m512 tmp14338 = _mm512_add_ps(tmp14331, in2059);
__m512 tmp14342 = _mm512_add_ps(tmp14335, in2067);
__m512 tmp14339 = _mm512_sub_ps(tmp14330, tmp14332);
__m512 tmp14343 = _mm512_sub_ps(tmp14334, tmp14336);
__m512 tmp14340 = _mm512_add_ps(tmp14332, in2061);
__m512 tmp14344 = _mm512_add_ps(tmp14336, in2069);
in2057 = _mm512_sub_ps(in2057, in2061);
in2065 = _mm512_sub_ps(in2065, in2069);
tmp14338 = _mm512_fmadd_ps(in2063, _mm512_set1_ps(-4.25e+00f), tmp14338);
tmp14342 = _mm512_fmadd_ps(tmp14337, _mm512_set1_ps(-4.25e+00f), tmp14342);
tmp14340 = _mm512_fmadd_ps(tmp14330, _mm512_set1_ps(-4.25e+00f), tmp14340);
tmp14344 = _mm512_fmadd_ps(tmp14334, _mm512_set1_ps(-4.25e+00f), tmp14344);
// y0 is complete after this pair.
in2057 = _mm512_fmadd_ps(tmp14339, _mm512_set1_ps(5.25e+00f), in2057);
in2065 = _mm512_fmadd_ps(tmp14343, _mm512_set1_ps(5.25e+00f), in2065);
tmp14339 = _mm512_fmadd_ps(tmp14332, _mm512_set1_ps(2.5e-01f), in2061);
tmp14343 = _mm512_fmadd_ps(tmp14336, _mm512_set1_ps(2.5e-01f), in2069);
tmp14332 = _mm512_fmadd_ps(tmp14332, _mm512_set1_ps(4e+00f), in2061);
tmp14336 = _mm512_fmadd_ps(tmp14336, _mm512_set1_ps(4e+00f), in2069);
// y2 (difference) and y1 (sum) of the two partial sums built above.
__m512 tmp14341 = _mm512_sub_ps(tmp14340, tmp14338);
__m512 tmp14345 = _mm512_sub_ps(tmp14344, tmp14342);
tmp14340 = _mm512_add_ps(tmp14338, tmp14340);
tmp14344 = _mm512_add_ps(tmp14342, tmp14344);
tmp14338 = _mm512_fmadd_ps(tmp14331, _mm512_set1_ps(2.5e-01f), in2059);
tmp14342 = _mm512_fmadd_ps(tmp14335, _mm512_set1_ps(2.5e-01f), in2067);
tmp14339 = _mm512_fmadd_ps(tmp14330, _mm512_set1_ps(-1.25e+00f), tmp14339);
tmp14343 = _mm512_fmadd_ps(tmp14334, _mm512_set1_ps(-1.25e+00f), tmp14343);
tmp14330 = _mm512_fmadd_ps(tmp14330, _mm512_set1_ps(-5e+00f), tmp14332);
tmp14334 = _mm512_fmadd_ps(tmp14334, _mm512_set1_ps(-5e+00f), tmp14336);
tmp14338 = _mm512_fmadd_ps(in2063, _mm512_set1_ps(-1.25e+00f), tmp14338);
tmp14342 = _mm512_fmadd_ps(tmp14337, _mm512_set1_ps(-1.25e+00f), tmp14342);
// y3 = even part + 2*odd part, y4 = even part - 2*odd part.
in2061 = _mm512_fmadd_ps(tmp14338, _mm512_set1_ps(2e+00f), tmp14339);
in2069 = _mm512_fmadd_ps(tmp14342, _mm512_set1_ps(2e+00f), tmp14343);
tmp14339 = _mm512_fnmadd_ps(tmp14338, _mm512_set1_ps(2e+00f), tmp14339);
tmp14343 = _mm512_fnmadd_ps(tmp14342, _mm512_set1_ps(2e+00f), tmp14343);
tmp14338 = _mm512_fmadd_ps(in2059, _mm512_set1_ps(2.5e-01f), tmp14331);
tmp14342 = _mm512_fmadd_ps(in2067, _mm512_set1_ps(2.5e-01f), tmp14335);
// x7 - x1, consumed by y7 below; x1 is no longer needed afterwards.
tmp14331 = _mm512_sub_ps(in2060, tmp14331);
tmp14335 = _mm512_sub_ps(in2068, tmp14335);
tmp14338 = _mm512_fmadd_ps(in2063, _mm512_set1_ps(-1.25e+00f), tmp14338);
tmp14342 = _mm512_fmadd_ps(tmp14337, _mm512_set1_ps(-1.25e+00f), tmp14342);
in2063 = _mm512_sub_ps(in2063, in2059);
tmp14337 = _mm512_sub_ps(tmp14337, in2067);
// y7 is complete after this pair.
in2063 = _mm512_fmadd_ps(in2063, _mm512_set1_ps(5.25e+00f), tmp14331);
tmp14337 = _mm512_fmadd_ps(tmp14337, _mm512_set1_ps(5.25e+00f), tmp14335);
// y5 = even part + 2*odd part, y6 = even part - 2*odd part.
tmp14332 = _mm512_fmadd_ps(tmp14338, _mm512_set1_ps(2e+00f), tmp14330);
tmp14336 = _mm512_fmadd_ps(tmp14342, _mm512_set1_ps(2e+00f), tmp14334);
tmp14330 = _mm512_fnmadd_ps(tmp14338, _mm512_set1_ps(2e+00f), tmp14330);
tmp14334 = _mm512_fnmadd_ps(tmp14342, _mm512_set1_ps(2e+00f), tmp14334);
// Regroup the sixteen result vectors into store order.
// _mm512_shuffle_f32x4(a, b, 68)  -> { a[0..7],  b[0..7]  } (low 256 bits of a, then of b)
// _mm512_shuffle_f32x4(a, b, 238) -> { a[8..15], b[8..15] } (high 256 bits of a, then of b)
// Each out* therefore pairs the same half of two result vectors.
__m512 out1903 = _mm512_shuffle_f32x4(in2057, tmp14340, 68);
__m512 out1911 = _mm512_shuffle_f32x4(in2057, tmp14340, 238);
__m512 out1904 = _mm512_shuffle_f32x4(tmp14341, in2061, 68);
__m512 out1912 = _mm512_shuffle_f32x4(tmp14341, in2061, 238);
__m512 out1905 = _mm512_shuffle_f32x4(tmp14339, tmp14332, 68);
__m512 out1913 = _mm512_shuffle_f32x4(tmp14339, tmp14332, 238);
__m512 out1906 = _mm512_shuffle_f32x4(tmp14330, in2063, 68);
__m512 out1914 = _mm512_shuffle_f32x4(tmp14330, in2063, 238);
// Same regrouping for the second set of eight result vectors.
__m512 out1907 = _mm512_shuffle_f32x4(in2065, tmp14344, 68);
__m512 out1915 = _mm512_shuffle_f32x4(in2065, tmp14344, 238);
__m512 out1908 = _mm512_shuffle_f32x4(tmp14345, in2069, 68);
__m512 out1916 = _mm512_shuffle_f32x4(tmp14345, in2069, 238);
__m512 out1909 = _mm512_shuffle_f32x4(tmp14343, tmp14336, 68);
__m512 out1917 = _mm512_shuffle_f32x4(tmp14343, tmp14336, 238);
__m512 out1910 = _mm512_shuffle_f32x4(tmp14334, tmp14337, 68);
__m512 out1918 = _mm512_shuffle_f32x4(tmp14334, tmp14337, 238);
// Unaligned stores into dfPtr10. All sixteen share the variable part
// 819200*i45+49152*j38+49152*s36+768*k132; only the leading constant differs.
// Within a group the constants step by 64 (0, 64, 128, 192), and each
// following group of four is 204800 further on.
// NOTE(review): offsets look like byte offsets (64 = one __m512) -- confirm dfPtr10's type.
_mm512_storeu_ps(dfPtr10+0+819200*i45+49152*j38+49152*s36+768*k132, out1903);
_mm512_storeu_ps(dfPtr10+128+819200*i45+49152*j38+49152*s36+768*k132, out1911);
_mm512_storeu_ps(dfPtr10+64+819200*i45+49152*j38+49152*s36+768*k132, out1907);
_mm512_storeu_ps(dfPtr10+192+819200*i45+49152*j38+49152*s36+768*k132, out1915);
// Second group: constants 204800 + {0, 128, 64, 192}.
_mm512_storeu_ps(dfPtr10+204800+819200*i45+49152*j38+49152*s36+768*k132, out1904);
_mm512_storeu_ps(dfPtr10+204928+819200*i45+49152*j38+49152*s36+768*k132, out1912);
_mm512_storeu_ps(dfPtr10+204864+819200*i45+49152*j38+49152*s36+768*k132, out1908);
_mm512_storeu_ps(dfPtr10+204992+819200*i45+49152*j38+49152*s36+768*k132, out1916);
// Third group: constants 409600 + {0, 128, 64, 192}.
_mm512_storeu_ps(dfPtr10+409600+819200*i45+49152*j38+49152*s36+768*k132, out1905);
_mm512_storeu_ps(dfPtr10+409728+819200*i45+49152*j38+49152*s36+768*k132, out1913);
_mm512_storeu_ps(dfPtr10+409664+819200*i45+49152*j38+49152*s36+768*k132, out1909);
_mm512_storeu_ps(dfPtr10+409792+819200*i45+49152*j38+49152*s36+768*k132, out1917);
// Fourth group: constants 614400 + {0, 128, 64, 192}.
_mm512_storeu_ps(dfPtr10+614400+819200*i45+49152*j38+49152*s36+768*k132, out1906);
_mm512_storeu_ps(dfPtr10+614528+819200*i45+49152*j38+49152*s36+768*k132, out1914);
_mm512_storeu_ps(dfPtr10+614464+819200*i45+49152*j38+49152*s36+768*k132, out1910);
_mm512_storeu_ps(dfPtr10+614592+819200*i45+49152*j38+49152*s36+768*k132, out1918);
// Load the input rows for two 16-lane working sets:
//   in2070..in2074 (five rows, offsets 648..1096 in steps of 112) and
//   in2075..in2082 (eight rows, offsets 3136..3920 in steps of 112).
// The masked loads zero every lane outside the mask: 16383 (0x3FFF) keeps
// lanes 0-13, 2047 (0x7FF) keeps lanes 0-10.
// NOTE(review): the offsets look like byte offsets (4*w60 per element,
// 112 per row) -- confirm that datPtr23 is a byte pointer at its declaration.
__m512 dat2124 = _mm512_maskz_loadu_ps(16383, datPtr23+648+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2125 = _mm512_maskz_loadu_ps(2047, datPtr23+3136+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
// pm208: lanes 0-7 <- source lanes 0-7, lanes 8-15 <- source lanes 6-13
// (two 8-wide windows that overlap by two elements).
__m512i pm208 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in2070 = _mm512_permutexvar_ps(pm208, dat2124);
// pm209: lanes 0-7 <- source lanes 0-7, lanes 8-12 <- source lanes 6-10,
// lanes 13-15 <- source lane 15, which the 2047 mask has already zeroed.
__m512i pm209 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in2075 = _mm512_permutexvar_ps(pm209, dat2125);
__m512 dat2126 = _mm512_maskz_loadu_ps(16383, datPtr23+760+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2127 = _mm512_maskz_loadu_ps(2047, datPtr23+3248+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2071 = _mm512_permutexvar_ps(pm208, dat2126);
__m512 in2076 = _mm512_permutexvar_ps(pm209, dat2127);
__m512 dat2128 = _mm512_maskz_loadu_ps(16383, datPtr23+872+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2129 = _mm512_maskz_loadu_ps(2047, datPtr23+3360+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2072 = _mm512_permutexvar_ps(pm208, dat2128);
__m512 in2077 = _mm512_permutexvar_ps(pm209, dat2129);
__m512 dat2130 = _mm512_maskz_loadu_ps(16383, datPtr23+984+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2131 = _mm512_maskz_loadu_ps(2047, datPtr23+3472+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2073 = _mm512_permutexvar_ps(pm208, dat2130);
__m512 in2078 = _mm512_permutexvar_ps(pm209, dat2131);
__m512 dat2132 = _mm512_maskz_loadu_ps(16383, datPtr23+1096+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2133 = _mm512_maskz_loadu_ps(2047, datPtr23+3584+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2074 = _mm512_permutexvar_ps(pm208, dat2132);
__m512 in2079 = _mm512_permutexvar_ps(pm209, dat2133);
// The remaining three rows exist only for the second working set.
__m512 dat2134 = _mm512_maskz_loadu_ps(2047, datPtr23+3696+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2080 = _mm512_permutexvar_ps(pm209, dat2134);
__m512 dat2135 = _mm512_maskz_loadu_ps(2047, datPtr23+3808+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2081 = _mm512_permutexvar_ps(pm209, dat2135);
__m512 dat2136 = _mm512_maskz_loadu_ps(2047, datPtr23+3920+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2082 = _mm512_permutexvar_ps(pm209, dat2136);
// First 1-D pass: combine the loaded rows with fixed coefficients
// (-4.25, 5.25, 0.25, 4, -1.25, -5, 2). Two independent chains are
// interleaved statement by statement:
//   chain A reads in2070..in2074 (five inputs),
//   chain B reads in2075..in2082 (eight inputs).
// Wherever chain B folds in a term from in2080..in2082, chain A has a plain
// copy, a mul instead of an fmadd, or a subtraction from zero.
// NOTE(review): the coefficients match the 8-point Winograd F(6,3) input
// transform, with chain A's three missing rows presumably treated as zero
// padding -- confirm against the generator.
// Variables are reused heavily; the statement order must be preserved.
// Final results, in order:
//   A: in2070, tmp14396, tmp14397, tmp14398, tmp14395, in2072, in2074, in2073
//   B: in2075, tmp14401, tmp14402, in2081,  tmp14400, in2077, in2079, in2078
__m512 tmp14394 = in2071;
__m512 tmp14399 = _mm512_add_ps(in2076, in2080);
__m512 tmp14395 = _mm512_sub_ps(in2074, in2072);
__m512 tmp14400 = _mm512_sub_ps(in2079, in2077);
__m512 tmp14396 = in2072;
__m512 tmp14401 = _mm512_add_ps(in2077, in2081);
// Self-assignment is a no-op; it is chain A's counterpart of the subtraction below.
in2070 = in2070;
in2075 = _mm512_sub_ps(in2075, in2081);
tmp14394 = _mm512_fmadd_ps(in2073, _mm512_set1_ps(-4.25e+00f), tmp14394);
tmp14399 = _mm512_fmadd_ps(in2078, _mm512_set1_ps(-4.25e+00f), tmp14399);
tmp14396 = _mm512_fmadd_ps(in2074, _mm512_set1_ps(-4.25e+00f), tmp14396);
tmp14401 = _mm512_fmadd_ps(in2079, _mm512_set1_ps(-4.25e+00f), tmp14401);
in2070 = _mm512_fmadd_ps(tmp14395, _mm512_set1_ps(5.25e+00f), in2070);
in2075 = _mm512_fmadd_ps(tmp14400, _mm512_set1_ps(5.25e+00f), in2075);
tmp14395 = _mm512_mul_ps(in2072, _mm512_set1_ps(2.5e-01f));
tmp14400 = _mm512_fmadd_ps(in2077, _mm512_set1_ps(2.5e-01f), in2081);
in2072 = _mm512_mul_ps(in2072, _mm512_set1_ps(4e+00f));
in2077 = _mm512_fmadd_ps(in2077, _mm512_set1_ps(4e+00f), in2081);
// Difference and sum of the two partial combinations computed so far.
__m512 tmp14397 = _mm512_sub_ps(tmp14396, tmp14394);
__m512 tmp14402 = _mm512_sub_ps(tmp14401, tmp14399);
tmp14396 = _mm512_add_ps(tmp14394, tmp14396);
tmp14401 = _mm512_add_ps(tmp14399, tmp14401);
tmp14394 = _mm512_mul_ps(in2071, _mm512_set1_ps(2.5e-01f));
tmp14399 = _mm512_fmadd_ps(in2076, _mm512_set1_ps(2.5e-01f), in2080);
tmp14395 = _mm512_fmadd_ps(in2074, _mm512_set1_ps(-1.25e+00f), tmp14395);
tmp14400 = _mm512_fmadd_ps(in2079, _mm512_set1_ps(-1.25e+00f), tmp14400);
in2074 = _mm512_fmadd_ps(in2074, _mm512_set1_ps(-5e+00f), in2072);
in2079 = _mm512_fmadd_ps(in2079, _mm512_set1_ps(-5e+00f), in2077);
tmp14394 = _mm512_fmadd_ps(in2073, _mm512_set1_ps(-1.25e+00f), tmp14394);
tmp14399 = _mm512_fmadd_ps(in2078, _mm512_set1_ps(-1.25e+00f), tmp14399);
// fmadd / fnmadd pair: x + 2*y and x - 2*y from the same operands.
__m512 tmp14398 = _mm512_fmadd_ps(tmp14394, _mm512_set1_ps(2e+00f), tmp14395);
in2081 = _mm512_fmadd_ps(tmp14399, _mm512_set1_ps(2e+00f), tmp14400);
tmp14395 = _mm512_fnmadd_ps(tmp14394, _mm512_set1_ps(2e+00f), tmp14395);
tmp14400 = _mm512_fnmadd_ps(tmp14399, _mm512_set1_ps(2e+00f), tmp14400);
tmp14394 = in2071;
tmp14399 = _mm512_fmadd_ps(in2080, _mm512_set1_ps(2.5e-01f), in2076);
// Chain A negates its row; chain B subtracts it from in2082.
in2071 = _mm512_sub_ps(_mm512_setzero_ps(), in2071);
in2076 = _mm512_sub_ps(in2082, in2076);
tmp14394 = _mm512_fmadd_ps(in2073, _mm512_set1_ps(-1.25e+00f), tmp14394);
tmp14399 = _mm512_fmadd_ps(in2078, _mm512_set1_ps(-1.25e+00f), tmp14399);
// No-op self-assignment; counterpart of the in2078 subtraction below.
in2073 = in2073;
in2078 = _mm512_sub_ps(in2078, in2080);
in2073 = _mm512_fmadd_ps(in2073, _mm512_set1_ps(5.25e+00f), in2071);
in2078 = _mm512_fmadd_ps(in2078, _mm512_set1_ps(5.25e+00f), in2076);
in2072 = _mm512_fmadd_ps(tmp14394, _mm512_set1_ps(2e+00f), in2074);
in2077 = _mm512_fmadd_ps(tmp14399, _mm512_set1_ps(2e+00f), in2079);
in2074 = _mm512_fnmadd_ps(tmp14394, _mm512_set1_ps(2e+00f), in2074);
in2079 = _mm512_fnmadd_ps(tmp14399, _mm512_set1_ps(2e+00f), in2079);
// Transpose the 16 first-pass vectors as a 16x16 float matrix.
// Input row order: in2070, tmp14396, tmp14397, tmp14398, tmp14395, in2072,
// in2074, in2073, then in2075, tmp14401, tmp14402, in2081, tmp14400, in2077,
// in2079, in2078.
// Stage 1: unpacklo/unpackhi interleave adjacent row pairs inside each
// 128-bit lane.
__m512 tmp14411 = _mm512_unpacklo_ps(in2070, tmp14396);
__m512 tmp14412 = _mm512_unpackhi_ps(in2070, tmp14396);
__m512 tmp14413 = _mm512_unpacklo_ps(tmp14397, tmp14398);
__m512 tmp14414 = _mm512_unpackhi_ps(tmp14397, tmp14398);
__m512 tmp14415 = _mm512_unpacklo_ps(tmp14395, in2072);
__m512 tmp14416 = _mm512_unpackhi_ps(tmp14395, in2072);
__m512 tmp14417 = _mm512_unpacklo_ps(in2074, in2073);
__m512 tmp14418 = _mm512_unpackhi_ps(in2074, in2073);
__m512 tmp14419 = _mm512_unpacklo_ps(in2075, tmp14401);
__m512 tmp14420 = _mm512_unpackhi_ps(in2075, tmp14401);
__m512 tmp14421 = _mm512_unpacklo_ps(tmp14402, in2081);
__m512 tmp14422 = _mm512_unpackhi_ps(tmp14402, in2081);
__m512 tmp14423 = _mm512_unpacklo_ps(tmp14400, in2077);
__m512 tmp14424 = _mm512_unpackhi_ps(tmp14400, in2077);
__m512 tmp14425 = _mm512_unpacklo_ps(in2079, in2078);
__m512 tmp14426 = _mm512_unpackhi_ps(in2079, in2078);
// Stage 2: shuffle_ps with 68 (0x44) takes the low two elements of each
// operand's 128-bit lane, 238 (0xEE) the high two. Each lane now holds one
// column across four consecutive rows.
__m512 tmp14427 = _mm512_shuffle_ps(tmp14411, tmp14413, 68);
__m512 tmp14428 = _mm512_shuffle_ps(tmp14411, tmp14413, 238);
__m512 tmp14429 = _mm512_shuffle_ps(tmp14412, tmp14414, 68);
__m512 tmp14430 = _mm512_shuffle_ps(tmp14412, tmp14414, 238);
__m512 tmp14431 = _mm512_shuffle_ps(tmp14415, tmp14417, 68);
__m512 tmp14432 = _mm512_shuffle_ps(tmp14415, tmp14417, 238);
__m512 tmp14433 = _mm512_shuffle_ps(tmp14416, tmp14418, 68);
__m512 tmp14434 = _mm512_shuffle_ps(tmp14416, tmp14418, 238);
__m512 tmp14435 = _mm512_shuffle_ps(tmp14419, tmp14421, 68);
__m512 tmp14436 = _mm512_shuffle_ps(tmp14419, tmp14421, 238);
__m512 tmp14437 = _mm512_shuffle_ps(tmp14420, tmp14422, 68);
__m512 tmp14438 = _mm512_shuffle_ps(tmp14420, tmp14422, 238);
__m512 tmp14439 = _mm512_shuffle_ps(tmp14423, tmp14425, 68);
__m512 tmp14440 = _mm512_shuffle_ps(tmp14423, tmp14425, 238);
__m512 tmp14441 = _mm512_shuffle_ps(tmp14424, tmp14426, 68);
__m512 tmp14442 = _mm512_shuffle_ps(tmp14424, tmp14426, 238);
// Stage 3: shuffle_f32x4 with 136 (0x88) picks 128-bit lanes 0 and 2 of each
// operand, 221 (0xDD) picks lanes 1 and 3. This merges rows 0-3 with rows 4-7
// and rows 8-11 with rows 12-15.
__m512 tmp14443 = _mm512_shuffle_f32x4(tmp14427, tmp14431, 136);
__m512 tmp14444 = _mm512_shuffle_f32x4(tmp14427, tmp14431, 221);
__m512 tmp14445 = _mm512_shuffle_f32x4(tmp14428, tmp14432, 136);
__m512 tmp14446 = _mm512_shuffle_f32x4(tmp14428, tmp14432, 221);
__m512 tmp14447 = _mm512_shuffle_f32x4(tmp14429, tmp14433, 136);
__m512 tmp14448 = _mm512_shuffle_f32x4(tmp14429, tmp14433, 221);
__m512 tmp14449 = _mm512_shuffle_f32x4(tmp14430, tmp14434, 136);
__m512 tmp14450 = _mm512_shuffle_f32x4(tmp14430, tmp14434, 221);
__m512 tmp14451 = _mm512_shuffle_f32x4(tmp14435, tmp14439, 136);
__m512 tmp14452 = _mm512_shuffle_f32x4(tmp14435, tmp14439, 221);
__m512 tmp14453 = _mm512_shuffle_f32x4(tmp14436, tmp14440, 136);
__m512 tmp14454 = _mm512_shuffle_f32x4(tmp14436, tmp14440, 221);
__m512 tmp14455 = _mm512_shuffle_f32x4(tmp14437, tmp14441, 136);
__m512 tmp14456 = _mm512_shuffle_f32x4(tmp14437, tmp14441, 221);
__m512 tmp14457 = _mm512_shuffle_f32x4(tmp14438, tmp14442, 136);
__m512 tmp14458 = _mm512_shuffle_f32x4(tmp14438, tmp14442, 221);
// Stage 4: same selectors again, merging the rows 0-7 halves with the rows
// 8-15 halves. Each result is one full column of the 16x16 matrix:
//   in2070=col 0, tmp14396=1, tmp14397=2, tmp14398=3,
//   tmp14395=4, in2072=5, in2074=6, in2073=7,
//   in2075=col 8, tmp14401=9, tmp14402=10, in2081=11,
//   tmp14400=12, in2077=13, in2079=14, in2078=15.
in2070 = _mm512_shuffle_f32x4(tmp14443, tmp14451, 136);
in2075 = _mm512_shuffle_f32x4(tmp14443, tmp14451, 221);
tmp14396 = _mm512_shuffle_f32x4(tmp14445, tmp14453, 136);
tmp14401 = _mm512_shuffle_f32x4(tmp14445, tmp14453, 221);
tmp14397 = _mm512_shuffle_f32x4(tmp14447, tmp14455, 136);
tmp14402 = _mm512_shuffle_f32x4(tmp14447, tmp14455, 221);
tmp14398 = _mm512_shuffle_f32x4(tmp14449, tmp14457, 136);
in2081 = _mm512_shuffle_f32x4(tmp14449, tmp14457, 221);
tmp14395 = _mm512_shuffle_f32x4(tmp14444, tmp14452, 136);
tmp14400 = _mm512_shuffle_f32x4(tmp14444, tmp14452, 221);
in2072 = _mm512_shuffle_f32x4(tmp14446, tmp14454, 136);
in2077 = _mm512_shuffle_f32x4(tmp14446, tmp14454, 221);
in2074 = _mm512_shuffle_f32x4(tmp14448, tmp14456, 136);
in2079 = _mm512_shuffle_f32x4(tmp14448, tmp14456, 221);
in2073 = _mm512_shuffle_f32x4(tmp14450, tmp14458, 136);
in2078 = _mm512_shuffle_f32x4(tmp14450, tmp14458, 221);
// Second 1-D pass: the same coefficient pattern (-4.25, 5.25, 0.25, 4, -1.25,
// -5, 2) applied to the transposed vectors. Two chains are interleaved:
//   chain 1 inputs: in2070, tmp14396, tmp14397, tmp14398, tmp14395, in2072, in2074, in2073
//   chain 2 inputs: in2075, tmp14401, tmp14402, in2081,  tmp14400, in2077, in2079, in2078
// Both chains use all eight inputs here, so every step is an add/sub/fmadd.
// Variables are overwritten in place; the statement order must be preserved.
// Final results, in order:
//   chain 1: in2070, tmp14405, tmp14406, in2074, tmp14404, tmp14397, tmp14395, tmp14398
//   chain 2: in2075, tmp14409, tmp14410, in2079, tmp14408, tmp14402, tmp14400, in2081
__m512 tmp14403 = _mm512_add_ps(tmp14396, in2072);
__m512 tmp14407 = _mm512_add_ps(tmp14401, in2077);
__m512 tmp14404 = _mm512_sub_ps(tmp14395, tmp14397);
__m512 tmp14408 = _mm512_sub_ps(tmp14400, tmp14402);
__m512 tmp14405 = _mm512_add_ps(tmp14397, in2074);
__m512 tmp14409 = _mm512_add_ps(tmp14402, in2079);
in2070 = _mm512_sub_ps(in2070, in2074);
in2075 = _mm512_sub_ps(in2075, in2079);
tmp14403 = _mm512_fmadd_ps(tmp14398, _mm512_set1_ps(-4.25e+00f), tmp14403);
tmp14407 = _mm512_fmadd_ps(in2081, _mm512_set1_ps(-4.25e+00f), tmp14407);
tmp14405 = _mm512_fmadd_ps(tmp14395, _mm512_set1_ps(-4.25e+00f), tmp14405);
tmp14409 = _mm512_fmadd_ps(tmp14400, _mm512_set1_ps(-4.25e+00f), tmp14409);
in2070 = _mm512_fmadd_ps(tmp14404, _mm512_set1_ps(5.25e+00f), in2070);
in2075 = _mm512_fmadd_ps(tmp14408, _mm512_set1_ps(5.25e+00f), in2075);
tmp14404 = _mm512_fmadd_ps(tmp14397, _mm512_set1_ps(2.5e-01f), in2074);
tmp14408 = _mm512_fmadd_ps(tmp14402, _mm512_set1_ps(2.5e-01f), in2079);
tmp14397 = _mm512_fmadd_ps(tmp14397, _mm512_set1_ps(4e+00f), in2074);
tmp14402 = _mm512_fmadd_ps(tmp14402, _mm512_set1_ps(4e+00f), in2079);
// Difference and sum of the two partial combinations computed so far.
__m512 tmp14406 = _mm512_sub_ps(tmp14405, tmp14403);
__m512 tmp14410 = _mm512_sub_ps(tmp14409, tmp14407);
tmp14405 = _mm512_add_ps(tmp14403, tmp14405);
tmp14409 = _mm512_add_ps(tmp14407, tmp14409);
tmp14403 = _mm512_fmadd_ps(tmp14396, _mm512_set1_ps(2.5e-01f), in2072);
tmp14407 = _mm512_fmadd_ps(tmp14401, _mm512_set1_ps(2.5e-01f), in2077);
tmp14404 = _mm512_fmadd_ps(tmp14395, _mm512_set1_ps(-1.25e+00f), tmp14404);
tmp14408 = _mm512_fmadd_ps(tmp14400, _mm512_set1_ps(-1.25e+00f), tmp14408);
tmp14395 = _mm512_fmadd_ps(tmp14395, _mm512_set1_ps(-5e+00f), tmp14397);
tmp14400 = _mm512_fmadd_ps(tmp14400, _mm512_set1_ps(-5e+00f), tmp14402);
tmp14403 = _mm512_fmadd_ps(tmp14398, _mm512_set1_ps(-1.25e+00f), tmp14403);
tmp14407 = _mm512_fmadd_ps(in2081, _mm512_set1_ps(-1.25e+00f), tmp14407);
// fmadd / fnmadd pair: x + 2*y and x - 2*y from the same operands.
in2074 = _mm512_fmadd_ps(tmp14403, _mm512_set1_ps(2e+00f), tmp14404);
in2079 = _mm512_fmadd_ps(tmp14407, _mm512_set1_ps(2e+00f), tmp14408);
tmp14404 = _mm512_fnmadd_ps(tmp14403, _mm512_set1_ps(2e+00f), tmp14404);
tmp14408 = _mm512_fnmadd_ps(tmp14407, _mm512_set1_ps(2e+00f), tmp14408);
tmp14403 = _mm512_fmadd_ps(in2072, _mm512_set1_ps(2.5e-01f), tmp14396);
tmp14407 = _mm512_fmadd_ps(in2077, _mm512_set1_ps(2.5e-01f), tmp14401);
tmp14396 = _mm512_sub_ps(in2073, tmp14396);
tmp14401 = _mm512_sub_ps(in2078, tmp14401);
tmp14403 = _mm512_fmadd_ps(tmp14398, _mm512_set1_ps(-1.25e+00f), tmp14403);
tmp14407 = _mm512_fmadd_ps(in2081, _mm512_set1_ps(-1.25e+00f), tmp14407);
tmp14398 = _mm512_sub_ps(tmp14398, in2072);
in2081 = _mm512_sub_ps(in2081, in2077);
tmp14398 = _mm512_fmadd_ps(tmp14398, _mm512_set1_ps(5.25e+00f), tmp14396);
in2081 = _mm512_fmadd_ps(in2081, _mm512_set1_ps(5.25e+00f), tmp14401);
tmp14397 = _mm512_fmadd_ps(tmp14403, _mm512_set1_ps(2e+00f), tmp14395);
tmp14402 = _mm512_fmadd_ps(tmp14407, _mm512_set1_ps(2e+00f), tmp14400);
tmp14395 = _mm512_fnmadd_ps(tmp14403, _mm512_set1_ps(2e+00f), tmp14395);
tmp14400 = _mm512_fnmadd_ps(tmp14407, _mm512_set1_ps(2e+00f), tmp14400);
// Pack pairs of second-pass results into output vectors.
// shuffle_f32x4 with 68 (0x44) concatenates the low 256 bits of both operands;
// 238 (0xEE) concatenates their high 256 bits.
__m512 out1919 = _mm512_shuffle_f32x4(in2070, tmp14405, 68);
__m512 out1927 = _mm512_shuffle_f32x4(in2070, tmp14405, 238);
__m512 out1920 = _mm512_shuffle_f32x4(tmp14406, in2074, 68);
__m512 out1928 = _mm512_shuffle_f32x4(tmp14406, in2074, 238);
__m512 out1921 = _mm512_shuffle_f32x4(tmp14404, tmp14397, 68);
__m512 out1929 = _mm512_shuffle_f32x4(tmp14404, tmp14397, 238);
__m512 out1922 = _mm512_shuffle_f32x4(tmp14395, tmp14398, 68);
__m512 out1930 = _mm512_shuffle_f32x4(tmp14395, tmp14398, 238);
__m512 out1923 = _mm512_shuffle_f32x4(in2075, tmp14409, 68);
__m512 out1931 = _mm512_shuffle_f32x4(in2075, tmp14409, 238);
__m512 out1924 = _mm512_shuffle_f32x4(tmp14410, in2079, 68);
__m512 out1932 = _mm512_shuffle_f32x4(tmp14410, in2079, 238);
__m512 out1925 = _mm512_shuffle_f32x4(tmp14408, tmp14402, 68);
__m512 out1933 = _mm512_shuffle_f32x4(tmp14408, tmp14402, 238);
__m512 out1926 = _mm512_shuffle_f32x4(tmp14400, in2081, 68);
__m512 out1934 = _mm512_shuffle_f32x4(tmp14400, in2081, 238);
// Sixteen unaligned 512-bit stores into dfPtr10, in four groups whose base
// offsets are 204800 apart (256, 205056, 409856, 614656). Within each group
// the four vectors go to +0, +128, +64, +192, in that statement order.
// NOTE(review): the 64-apart offsets suggest byte addressing (one 512-bit
// vector is 64 bytes) -- confirm dfPtr10's type at its declaration.
_mm512_storeu_ps(dfPtr10+256+819200*i45+49152*j38+49152*s36+768*k132, out1919);
_mm512_storeu_ps(dfPtr10+384+819200*i45+49152*j38+49152*s36+768*k132, out1927);
_mm512_storeu_ps(dfPtr10+320+819200*i45+49152*j38+49152*s36+768*k132, out1923);
_mm512_storeu_ps(dfPtr10+448+819200*i45+49152*j38+49152*s36+768*k132, out1931);
_mm512_storeu_ps(dfPtr10+205056+819200*i45+49152*j38+49152*s36+768*k132, out1920);
_mm512_storeu_ps(dfPtr10+205184+819200*i45+49152*j38+49152*s36+768*k132, out1928);
_mm512_storeu_ps(dfPtr10+205120+819200*i45+49152*j38+49152*s36+768*k132, out1924);
_mm512_storeu_ps(dfPtr10+205248+819200*i45+49152*j38+49152*s36+768*k132, out1932);
_mm512_storeu_ps(dfPtr10+409856+819200*i45+49152*j38+49152*s36+768*k132, out1921);
_mm512_storeu_ps(dfPtr10+409984+819200*i45+49152*j38+49152*s36+768*k132, out1929);
_mm512_storeu_ps(dfPtr10+409920+819200*i45+49152*j38+49152*s36+768*k132, out1925);
_mm512_storeu_ps(dfPtr10+410048+819200*i45+49152*j38+49152*s36+768*k132, out1933);
_mm512_storeu_ps(dfPtr10+614656+819200*i45+49152*j38+49152*s36+768*k132, out1922);
_mm512_storeu_ps(dfPtr10+614784+819200*i45+49152*j38+49152*s36+768*k132, out1930);
_mm512_storeu_ps(dfPtr10+614720+819200*i45+49152*j38+49152*s36+768*k132, out1926);
_mm512_storeu_ps(dfPtr10+614848+819200*i45+49152*j38+49152*s36+768*k132, out1934);
// Load five input rows for each of two 16-lane working sets:
//   in2083..in2087 from offsets 3740..4188 (steps of 112) and
//   in2088..in2092 from offsets 3784..4232 (steps of 112).
// The masked loads zero every lane outside the mask: 8191 (0x1FFF) keeps
// lanes 0-12, 16383 (0x3FFF) keeps lanes 0-13.
// NOTE(review): the offsets look like byte offsets (4*w60 per element,
// 112 per row) -- confirm that datPtr23 is a byte pointer at its declaration.
__m512 dat2137 = _mm512_maskz_loadu_ps(8191, datPtr23+3740+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2138 = _mm512_maskz_loadu_ps(16383, datPtr23+3784+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
// pm210: lane 0 <- source lane 15 (zeroed by the 8191 mask), lanes 1-7 <-
// source lanes 0-6, lanes 8-15 <- source lanes 5-12. The first window
// therefore starts with one zero element.
__m512i pm210 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in2083 = _mm512_permutexvar_ps(pm210, dat2137);
// pm211: lanes 0-7 <- source lanes 0-7, lanes 8-15 <- source lanes 6-13.
__m512i pm211 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in2088 = _mm512_permutexvar_ps(pm211, dat2138);
__m512 dat2139 = _mm512_maskz_loadu_ps(8191, datPtr23+3852+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2140 = _mm512_maskz_loadu_ps(16383, datPtr23+3896+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2084 = _mm512_permutexvar_ps(pm210, dat2139);
__m512 in2089 = _mm512_permutexvar_ps(pm211, dat2140);
__m512 dat2141 = _mm512_maskz_loadu_ps(8191, datPtr23+3964+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2142 = _mm512_maskz_loadu_ps(16383, datPtr23+4008+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2085 = _mm512_permutexvar_ps(pm210, dat2141);
__m512 in2090 = _mm512_permutexvar_ps(pm211, dat2142);
__m512 dat2143 = _mm512_maskz_loadu_ps(8191, datPtr23+4076+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2144 = _mm512_maskz_loadu_ps(16383, datPtr23+4120+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2086 = _mm512_permutexvar_ps(pm210, dat2143);
__m512 in2091 = _mm512_permutexvar_ps(pm211, dat2144);
__m512 dat2145 = _mm512_maskz_loadu_ps(8191, datPtr23+4188+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 dat2146 = _mm512_maskz_loadu_ps(16383, datPtr23+4232+401408*i45+112*h48+4*w60+401408*s36+6272*k132);
__m512 in2087 = _mm512_permutexvar_ps(pm210, dat2145);
__m512 in2092 = _mm512_permutexvar_ps(pm211, dat2146);
// First 1-D pass: combine the loaded rows with fixed coefficients
// (-4.25, 5.25, 0.25, 4, -1.25, -5, 2). Two independent chains are
// interleaved statement by statement:
//   chain 1 reads in2083..in2087, chain 2 reads in2088..in2092.
// Both chains have only five input rows, so the steps are plain copies,
// muls and subtractions from zero rather than adds with further rows.
// NOTE(review): the coefficients match the 8-point Winograd F(6,3) input
// transform, with the three absent rows presumably treated as zero padding
// -- confirm against the generator.
// Variables are reused heavily; the statement order must be preserved.
// Final results, in order:
//   chain 1: in2083, tmp14461, tmp14462, tmp14463, tmp14460, in2085, in2087, in2086
//   chain 2: in2088, tmp14466, tmp14467, tmp14468, tmp14465, in2090, in2092, in2091
__m512 tmp14459 = in2084;
__m512 tmp14464 = in2089;
__m512 tmp14460 = _mm512_sub_ps(in2087, in2085);
__m512 tmp14465 = _mm512_sub_ps(in2092, in2090);
__m512 tmp14461 = in2085;
__m512 tmp14466 = in2090;
// The two self-assignments below are no-ops.
in2083 = in2083;
in2088 = in2088;
tmp14459 = _mm512_fmadd_ps(in2086, _mm512_set1_ps(-4.25e+00f), tmp14459);
tmp14464 = _mm512_fmadd_ps(in2091, _mm512_set1_ps(-4.25e+00f), tmp14464);
tmp14461 = _mm512_fmadd_ps(in2087, _mm512_set1_ps(-4.25e+00f), tmp14461);
tmp14466 = _mm512_fmadd_ps(in2092, _mm512_set1_ps(-4.25e+00f), tmp14466);
in2083 = _mm512_fmadd_ps(tmp14460, _mm512_set1_ps(5.25e+00f), in2083);
in2088 = _mm512_fmadd_ps(tmp14465, _mm512_set1_ps(5.25e+00f), in2088);
tmp14460 = _mm512_mul_ps(in2085, _mm512_set1_ps(2.5e-01f));
tmp14465 = _mm512_mul_ps(in2090, _mm512_set1_ps(2.5e-01f));
in2085 = _mm512_mul_ps(in2085, _mm512_set1_ps(4e+00f));
in2090 = _mm512_mul_ps(in2090, _mm512_set1_ps(4e+00f));
// Difference and sum of the two partial combinations computed so far.
__m512 tmp14462 = _mm512_sub_ps(tmp14461, tmp14459);
__m512 tmp14467 = _mm512_sub_ps(tmp14466, tmp14464);
tmp14461 = _mm512_add_ps(tmp14459, tmp14461);
tmp14466 = _mm512_add_ps(tmp14464, tmp14466);
tmp14459 = _mm512_mul_ps(in2084, _mm512_set1_ps(2.5e-01f));
tmp14464 = _mm512_mul_ps(in2089, _mm512_set1_ps(2.5e-01f));
tmp14460 = _mm512_fmadd_ps(in2087, _mm512_set1_ps(-1.25e+00f), tmp14460);
tmp14465 = _mm512_fmadd_ps(in2092, _mm512_set1_ps(-1.25e+00f), tmp14465);
in2087 = _mm512_fmadd_ps(in2087, _mm512_set1_ps(-5e+00f), in2085);
in2092 = _mm512_fmadd_ps(in2092, _mm512_set1_ps(-5e+00f), in2090);
tmp14459 = _mm512_fmadd_ps(in2086, _mm512_set1_ps(-1.25e+00f), tmp14459);
tmp14464 = _mm512_fmadd_ps(in2091, _mm512_set1_ps(-1.25e+00f), tmp14464);
// fmadd / fnmadd pair: x + 2*y and x - 2*y from the same operands.
__m512 tmp14463 = _mm512_fmadd_ps(tmp14459, _mm512_set1_ps(2e+00f), tmp14460);
__m512 tmp14468 = _mm512_fmadd_ps(tmp14464, _mm512_set1_ps(2e+00f), tmp14465);
tmp14460 = _mm512_fnmadd_ps(tmp14459, _mm512_set1_ps(2e+00f), tmp14460);
tmp14465 = _mm512_fnmadd_ps(tmp14464, _mm512_set1_ps(2e+00f), tmp14465);
tmp14459 = in2084;
tmp14464 = in2089;
// Negate the row (0 - x).
in2084 = _mm512_sub_ps(_mm512_setzero_ps(), in2084);
in2089 = _mm512_sub_ps(_mm512_setzero_ps(), in2089);
tmp14459 = _mm512_fmadd_ps(in2086, _mm512_set1_ps(-1.25e+00f), tmp14459);
tmp14464 = _mm512_fmadd_ps(in2091, _mm512_set1_ps(-1.25e+00f), tmp14464);
// The two self-assignments below are no-ops.
in2086 = in2086;
in2091 = in2091;
in2086 = _mm512_fmadd_ps(in2086, _mm512_set1_ps(5.25e+00f), in2084);
in2091 = _mm512_fmadd_ps(in2091, _mm512_set1_ps(5.25e+00f), in2089);
in2085 = _mm512_fmadd_ps(tmp14459, _mm512_set1_ps(2e+00f), in2087);
in2090 = _mm512_fmadd_ps(tmp14464, _mm512_set1_ps(2e+00f), in2092);
in2087 = _mm512_fnmadd_ps(tmp14459, _mm512_set1_ps(2e+00f), in2087);
in2092 = _mm512_fnmadd_ps(tmp14464, _mm512_set1_ps(2e+00f), in2092);
// Transpose the 16 first-pass vectors as a 16x16 float matrix.
// Input row order: in2083, tmp14461, tmp14462, tmp14463, tmp14460, in2085,
// in2087, in2086, then in2088, tmp14466, tmp14467, tmp14468, tmp14465, in2090,
// in2092, in2091.
// Stage 1: unpacklo/unpackhi interleave adjacent row pairs inside each
// 128-bit lane.
__m512 tmp14477 = _mm512_unpacklo_ps(in2083, tmp14461);
__m512 tmp14478 = _mm512_unpackhi_ps(in2083, tmp14461);
__m512 tmp14479 = _mm512_unpacklo_ps(tmp14462, tmp14463);
__m512 tmp14480 = _mm512_unpackhi_ps(tmp14462, tmp14463);
__m512 tmp14481 = _mm512_unpacklo_ps(tmp14460, in2085);
__m512 tmp14482 = _mm512_unpackhi_ps(tmp14460, in2085);
__m512 tmp14483 = _mm512_unpacklo_ps(in2087, in2086);
__m512 tmp14484 = _mm512_unpackhi_ps(in2087, in2086);
__m512 tmp14485 = _mm512_unpacklo_ps(in2088, tmp14466);
__m512 tmp14486 = _mm512_unpackhi_ps(in2088, tmp14466);
__m512 tmp14487 = _mm512_unpacklo_ps(tmp14467, tmp14468);
__m512 tmp14488 = _mm512_unpackhi_ps(tmp14467, tmp14468);
__m512 tmp14489 = _mm512_unpacklo_ps(tmp14465, in2090);
__m512 tmp14490 = _mm512_unpackhi_ps(tmp14465, in2090);
__m512 tmp14491 = _mm512_unpacklo_ps(in2092, in2091);
__m512 tmp14492 = _mm512_unpackhi_ps(in2092, in2091);
// Stage 2: shuffle_ps with 68 (0x44) takes the low two elements of each
// operand's 128-bit lane, 238 (0xEE) the high two. Each lane now holds one
// column across four consecutive rows.
__m512 tmp14493 = _mm512_shuffle_ps(tmp14477, tmp14479, 68);
__m512 tmp14494 = _mm512_shuffle_ps(tmp14477, tmp14479, 238);
__m512 tmp14495 = _mm512_shuffle_ps(tmp14478, tmp14480, 68);
__m512 tmp14496 = _mm512_shuffle_ps(tmp14478, tmp14480, 238);
__m512 tmp14497 = _mm512_shuffle_ps(tmp14481, tmp14483, 68);
__m512 tmp14498 = _mm512_shuffle_ps(tmp14481, tmp14483, 238);
__m512 tmp14499 = _mm512_shuffle_ps(tmp14482, tmp14484, 68);
__m512 tmp14500 = _mm512_shuffle_ps(tmp14482, tmp14484, 238);
__m512 tmp14501 = _mm512_shuffle_ps(tmp14485, tmp14487, 68);
__m512 tmp14502 = _mm512_shuffle_ps(tmp14485, tmp14487, 238);
__m512 tmp14503 = _mm512_shuffle_ps(tmp14486, tmp14488, 68);
__m512 tmp14504 = _mm512_shuffle_ps(tmp14486, tmp14488, 238);
__m512 tmp14505 = _mm512_shuffle_ps(tmp14489, tmp14491, 68);
__m512 tmp14506 = _mm512_shuffle_ps(tmp14489, tmp14491, 238);
__m512 tmp14507 = _mm512_shuffle_ps(tmp14490, tmp14492, 68);
__m512 tmp14508 = _mm512_shuffle_ps(tmp14490, tmp14492, 238);
// Stage 3: shuffle_f32x4 with 136 (0x88) picks 128-bit lanes 0 and 2 of each
// operand, 221 (0xDD) picks lanes 1 and 3. This merges rows 0-3 with rows 4-7
// and rows 8-11 with rows 12-15.
__m512 tmp14509 = _mm512_shuffle_f32x4(tmp14493, tmp14497, 136);
__m512 tmp14510 = _mm512_shuffle_f32x4(tmp14493, tmp14497, 221);
__m512 tmp14511 = _mm512_shuffle_f32x4(tmp14494, tmp14498, 136);
__m512 tmp14512 = _mm512_shuffle_f32x4(tmp14494, tmp14498, 221);
__m512 tmp14513 = _mm512_shuffle_f32x4(tmp14495, tmp14499, 136);
__m512 tmp14514 = _mm512_shuffle_f32x4(tmp14495, tmp14499, 221);
__m512 tmp14515 = _mm512_shuffle_f32x4(tmp14496, tmp14500, 136);
__m512 tmp14516 = _mm512_shuffle_f32x4(tmp14496, tmp14500, 221);
__m512 tmp14517 = _mm512_shuffle_f32x4(tmp14501, tmp14505, 136);
__m512 tmp14518 = _mm512_shuffle_f32x4(tmp14501, tmp14505, 221);
__m512 tmp14519 = _mm512_shuffle_f32x4(tmp14502, tmp14506, 136);
__m512 tmp14520 = _mm512_shuffle_f32x4(tmp14502, tmp14506, 221);
__m512 tmp14521 = _mm512_shuffle_f32x4(tmp14503, tmp14507, 136);
__m512 tmp14522 = _mm512_shuffle_f32x4(tmp14503, tmp14507, 221);
__m512 tmp14523 = _mm512_shuffle_f32x4(tmp14504, tmp14508, 136);
__m512 tmp14524 = _mm512_shuffle_f32x4(tmp14504, tmp14508, 221);
// Stage 4: same selectors again, merging the rows 0-7 halves with the rows
// 8-15 halves. Each result is one full column of the 16x16 matrix:
//   in2083=col 0, tmp14461=1, tmp14462=2, tmp14463=3,
//   tmp14460=4, in2085=5, in2087=6, in2086=7,
//   in2088=col 8, tmp14466=9, tmp14467=10, tmp14468=11,
//   tmp14465=12, in2090=13, in2092=14, in2091=15.
in2083 = _mm512_shuffle_f32x4(tmp14509, tmp14517, 136);
in2088 = _mm512_shuffle_f32x4(tmp14509, tmp14517, 221);
tmp14461 = _mm512_shuffle_f32x4(tmp14511, tmp14519, 136);
tmp14466 = _mm512_shuffle_f32x4(tmp14511, tmp14519, 221);
tmp14462 = _mm512_shuffle_f32x4(tmp14513, tmp14521, 136);
tmp14467 = _mm512_shuffle_f32x4(tmp14513, tmp14521, 221);
tmp14463 = _mm512_shuffle_f32x4(tmp14515, tmp14523, 136);
tmp14468 = _mm512_shuffle_f32x4(tmp14515, tmp14523, 221);
tmp14460 = _mm512_shuffle_f32x4(tmp14510, tmp14518, 136);
tmp14465 = _mm512_shuffle_f32x4(tmp14510, tmp14518, 221);
in2085 = _mm512_shuffle_f32x4(tmp14512, tmp14520, 136);
in2090 = _mm512_shuffle_f32x4(tmp14512, tmp14520, 221);
in2087 = _mm512_shuffle_f32x4(tmp14514, tmp14522, 136);
in2092 = _mm512_shuffle_f32x4(tmp14514, tmp14522, 221);
in2086 = _mm512_shuffle_f32x4(tmp14516, tmp14524, 136);
in2091 = _mm512_shuffle_f32x4(tmp14516, tmp14524, 221);
// Second 1-D pass: the same coefficient pattern (-4.25, 5.25, 0.25, 4, -1.25,
// -5, 2) applied to the transposed vectors. Two chains are interleaved:
//   chain 1 inputs: in2083, tmp14461, tmp14462, tmp14463, tmp14460, in2085, in2087, in2086
//   chain 2 inputs: in2088, tmp14466, tmp14467, tmp14468, tmp14465, in2090, in2092, in2091
// Both chains use all eight inputs here, so every step is an add/sub/fmadd.
// Variables are overwritten in place; the statement order must be preserved.
// Final results, in order:
//   chain 1: in2083, tmp14471, tmp14472, in2087, tmp14470, tmp14462, tmp14460, tmp14463
//   chain 2: in2088, tmp14475, tmp14476, in2092, tmp14474, tmp14467, tmp14465, tmp14468
__m512 tmp14469 = _mm512_add_ps(tmp14461, in2085);
__m512 tmp14473 = _mm512_add_ps(tmp14466, in2090);
__m512 tmp14470 = _mm512_sub_ps(tmp14460, tmp14462);
__m512 tmp14474 = _mm512_sub_ps(tmp14465, tmp14467);
__m512 tmp14471 = _mm512_add_ps(tmp14462, in2087);
__m512 tmp14475 = _mm512_add_ps(tmp14467, in2092);
in2083 = _mm512_sub_ps(in2083, in2087);
in2088 = _mm512_sub_ps(in2088, in2092);
tmp14469 = _mm512_fmadd_ps(tmp14463, _mm512_set1_ps(-4.25e+00f), tmp14469);
tmp14473 = _mm512_fmadd_ps(tmp14468, _mm512_set1_ps(-4.25e+00f), tmp14473);
tmp14471 = _mm512_fmadd_ps(tmp14460, _mm512_set1_ps(-4.25e+00f), tmp14471);
tmp14475 = _mm512_fmadd_ps(tmp14465, _mm512_set1_ps(-4.25e+00f), tmp14475);
in2083 = _mm512_fmadd_ps(tmp14470, _mm512_set1_ps(5.25e+00f), in2083);
in2088 = _mm512_fmadd_ps(tmp14474, _mm512_set1_ps(5.25e+00f), in2088);
tmp14470 = _mm512_fmadd_ps(tmp14462, _mm512_set1_ps(2.5e-01f), in2087);
tmp14474 = _mm512_fmadd_ps(tmp14467, _mm512_set1_ps(2.5e-01f), in2092);
tmp14462 = _mm512_fmadd_ps(tmp14462, _mm512_set1_ps(4e+00f), in2087);
tmp14467 = _mm512_fmadd_ps(tmp14467, _mm512_set1_ps(4e+00f), in2092);
// Difference and sum of the two partial combinations computed so far.
__m512 tmp14472 = _mm512_sub_ps(tmp14471, tmp14469);
__m512 tmp14476 = _mm512_sub_ps(tmp14475, tmp14473);
tmp14471 = _mm512_add_ps(tmp14469, tmp14471);
tmp14475 = _mm512_add_ps(tmp14473, tmp14475);
tmp14469 = _mm512_fmadd_ps(tmp14461, _mm512_set1_ps(2.5e-01f), in2085);
tmp14473 = _mm512_fmadd_ps(tmp14466, _mm512_set1_ps(2.5e-01f), in2090);
tmp14470 = _mm512_fmadd_ps(tmp14460, _mm512_set1_ps(-1.25e+00f), tmp14470);
tmp14474 = _mm512_fmadd_ps(tmp14465, _mm512_set1_ps(-1.25e+00f), tmp14474);
tmp14460 = _mm512_fmadd_ps(tmp14460, _mm512_set1_ps(-5e+00f), tmp14462);
tmp14465 = _mm512_fmadd_ps(tmp14465, _mm512_set1_ps(-5e+00f), tmp14467);
tmp14469 = _mm512_fmadd_ps(tmp14463, _mm512_set1_ps(-1.25e+00f), tmp14469);
tmp14473 = _mm512_fmadd_ps(tmp14468, _mm512_set1_ps(-1.25e+00f), tmp14473);
// fmadd / fnmadd pair: x + 2*y and x - 2*y from the same operands.
in2087 = _mm512_fmadd_ps(tmp14469, _mm512_set1_ps(2e+00f), tmp14470);
in2092 = _mm512_fmadd_ps(tmp14473, _mm512_set1_ps(2e+00f), tmp14474);
tmp14470 = _mm512_fnmadd_ps(tmp14469, _mm512_set1_ps(2e+00f), tmp14470);
tmp14474 = _mm512_fnmadd_ps(tmp14473, _mm512_set1_ps(2e+00f), tmp14474);
tmp14469 = _mm512_fmadd_ps(in2085, _mm512_set1_ps(2.5e-01f), tmp14461);
tmp14473 = _mm512_fmadd_ps(in2090, _mm512_set1_ps(2.5e-01f), tmp14466);
tmp14461 = _mm512_sub_ps(in2086, tmp14461);
tmp14466 = _mm512_sub_ps(in2091, tmp14466);
tmp14469 = _mm512_fmadd_ps(tmp14463, _mm512_set1_ps(-1.25e+00f), tmp14469);
tmp14473 = _mm512_fmadd_ps(tmp14468, _mm512_set1_ps(-1.25e+00f), tmp14473);
tmp14463 = _mm512_sub_ps(tmp14463, in2085);
tmp14468 = _mm512_sub_ps(tmp14468, in2090);
tmp14463 = _mm512_fmadd_ps(tmp14463, _mm512_set1_ps(5.25e+00f), tmp14461);
tmp14468 = _mm512_fmadd_ps(tmp14468, _mm512_set1_ps(5.25e+00f), tmp14466);
tmp14462 = _mm512_fmadd_ps(tmp14469, _mm512_set1_ps(2e+00f), tmp14460);
tmp14467 = _mm512_fmadd_ps(tmp14473, _mm512_set1_ps(2e+00f), tmp14465);
tmp14460 = _mm512_fnmadd_ps(tmp14469, _mm512_set1_ps(2e+00f), tmp14460);
tmp14465 = _mm512_fnmadd_ps(tmp14473, _mm512_set1_ps(2e+00f), tmp14465);
// Pack pairs of second-pass results into output vectors.
// shuffle_f32x4 with 68 (0x44) concatenates the low 256 bits of both operands;
// 238 (0xEE) concatenates their high 256 bits.
__m512 out1935 = _mm512_shuffle_f32x4(in2083, tmp14471, 68);
__m512 out1943 = _mm512_shuffle_f32x4(in2083, tmp14471, 238);
__m512 out1936 = _mm512_shuffle_f32x4(tmp14472, in2087, 68);
__m512 out1944 = _mm512_shuffle_f32x4(tmp14472, in2087, 238);
__m512 out1937 = _mm512_shuffle_f32x4(tmp14470, tmp14462, 68);
__m512 out1945 = _mm512_shuffle_f32x4(tmp14470, tmp14462, 238);
__m512 out1938 = _mm512_shuffle_f32x4(tmp14460, tmp14463, 68);
__m512 out1946 = _mm512_shuffle_f32x4(tmp14460, tmp14463, 238);
__m512 out1939 = _mm512_shuffle_f32x4(in2088, tmp14475, 68);
__m512 out1947 = _mm512_shuffle_f32x4(in2088, tmp14475, 238);
__m512 out1940 = _mm512_shuffle_f32x4(tmp14476, in2092, 68);
__m512 out1948 = _mm512_shuffle_f32x4(tmp14476, in2092, 238);
__m512 out1941 = _mm512_shuffle_f32x4(tmp14474, tmp14467, 68);
__m512 out1949 = _mm512_shuffle_f32x4(tmp14474, tmp14467, 238);
__m512 out1942 = _mm512_shuffle_f32x4(tmp14465, tmp14468, 68);
__m512 out1950 = _mm512_shuffle_f32x4(tmp14465, tmp14468, 238);
// Sixteen unaligned 512-bit stores into dfPtr10, in four groups whose base
// offsets are 204800 apart (512, 205312, 410112, 614912). Within each group
// the four vectors go to +0, +128, +64, +192, in that statement order.
// NOTE(review): the 64-apart offsets suggest byte addressing (one 512-bit
// vector is 64 bytes) -- confirm dfPtr10's type at its declaration.
_mm512_storeu_ps(dfPtr10+512+819200*i45+49152*j38+49152*s36+768*k132, out1935);
_mm512_storeu_ps(dfPtr10+640+819200*i45+49152*j38+49152*s36+768*k132, out1943);
_mm512_storeu_ps(dfPtr10+576+819200*i45+49152*j38+49152*s36+768*k132, out1939);
_mm512_storeu_ps(dfPtr10+704+819200*i45+49152*j38+49152*s36+768*k132, out1947);
_mm512_storeu_ps(dfPtr10+205312+819200*i45+49152*j38+49152*s36+768*k132, out1936);
_mm512_storeu_ps(dfPtr10+205440+819200*i45+49152*j38+49152*s36+768*k132, out1944);
_mm512_storeu_ps(dfPtr10+205376+819200*i45+49152*j38+49152*s36+768*k132, out1940);
_mm512_storeu_ps(dfPtr10+205504+819200*i45+49152*j38+49152*s36+768*k132, out1948);
_mm512_storeu_ps(dfPtr10+410112+819200*i45+49152*j38+49152*s36+768*k132, out1937);
_mm512_storeu_ps(dfPtr10+410240+819200*i45+49152*j38+49152*s36+768*k132, out1945);
_mm512_storeu_ps(dfPtr10+410176+819200*i45+49152*j38+49152*s36+768*k132, out1941);
_mm512_storeu_ps(dfPtr10+410304+819200*i45+49152*j38+49152*s36+768*k132, out1949);
_mm512_storeu_ps(dfPtr10+614912+819200*i45+49152*j38+49152*s36+768*k132, out1938);
_mm512_storeu_ps(dfPtr10+615040+819200*i45+49152*j38+49152*s36+768*k132, out1946);
_mm512_storeu_ps(dfPtr10+614976+819200*i45+49152*j38+49152*s36+768*k132, out1942);
_mm512_storeu_ps(dfPtr10+615104+819200*i45+49152*j38+49152*s36+768*k132, out1950);
}
if (j38 >= last9) return;
++j38;
rel21 = 4;
}
ptrdiff_t h49 = base21+24;
ptrdiff_t w61 = 24;
ptrdiff_t k133 = 0;
for (; k133 != 32; ++k133) {
__m512 dat2147 = _mm512_maskz_loadu_ps(31, datPtr23+0+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2148 = _mm512_maskz_loadu_ps(31, datPtr23+3136+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2149 = _mm512_maskz_loadu_ps(31, datPtr23+6272+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2150 = _mm512_maskz_loadu_ps(31, datPtr23+9408+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512i pm212 = _mm512_set_epi32(15, 15, 15, 20, 19, 18, 17, 16, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in2093 = _mm512_permutex2var_ps(dat2147, pm212, dat2148);
__m512 in2098 = _mm512_permutex2var_ps(dat2149, pm212, dat2150);
__m512 dat2151 = _mm512_maskz_loadu_ps(31, datPtr23+112+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2152 = _mm512_maskz_loadu_ps(31, datPtr23+3248+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2153 = _mm512_maskz_loadu_ps(31, datPtr23+6384+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2154 = _mm512_maskz_loadu_ps(31, datPtr23+9520+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 in2094 = _mm512_permutex2var_ps(dat2151, pm212, dat2152);
__m512 in2099 = _mm512_permutex2var_ps(dat2153, pm212, dat2154);
__m512 dat2155 = _mm512_maskz_loadu_ps(31, datPtr23+224+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2156 = _mm512_maskz_loadu_ps(31, datPtr23+3360+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2157 = _mm512_maskz_loadu_ps(31, datPtr23+6496+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2158 = _mm512_maskz_loadu_ps(31, datPtr23+9632+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 in2095 = _mm512_permutex2var_ps(dat2155, pm212, dat2156);
__m512 in2100 = _mm512_permutex2var_ps(dat2157, pm212, dat2158);
__m512 dat2159 = _mm512_maskz_loadu_ps(31, datPtr23+336+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2160 = _mm512_maskz_loadu_ps(31, datPtr23+3472+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2161 = _mm512_maskz_loadu_ps(31, datPtr23+6608+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2162 = _mm512_maskz_loadu_ps(31, datPtr23+9744+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 in2096 = _mm512_permutex2var_ps(dat2159, pm212, dat2160);
__m512 in2101 = _mm512_permutex2var_ps(dat2161, pm212, dat2162);
__m512 dat2163 = _mm512_maskz_loadu_ps(31, datPtr23+448+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2164 = _mm512_maskz_loadu_ps(31, datPtr23+3584+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2165 = _mm512_maskz_loadu_ps(31, datPtr23+6720+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 dat2166 = _mm512_maskz_loadu_ps(31, datPtr23+9856+401408*i45+112*h49+4*w61+401408*s36+12544*k133);
__m512 in2097 = _mm512_permutex2var_ps(dat2163, pm212, dat2164);
__m512 in2102 = _mm512_permutex2var_ps(dat2165, pm212, dat2166);
__m512 tmp14525 = in2094;
__m512 tmp14530 = in2099;
__m512 tmp14526 = _mm512_sub_ps(in2097, in2095);
__m512 tmp14531 = _mm512_sub_ps(in2102, in2100);
__m512 tmp14527 = in2095;
__m512 tmp14532 = in2100;
in2093 = in2093;
in2098 = in2098;
tmp14525 = _mm512_fmadd_ps(in2096, _mm512_set1_ps(-4.25e+00f), tmp14525);
tmp14530 = _mm512_fmadd_ps(in2101, _mm512_set1_ps(-4.25e+00f), tmp14530);
tmp14527 = _mm512_fmadd_ps(in2097, _mm512_set1_ps(-4.25e+00f), tmp14527);
tmp14532 = _mm512_fmadd_ps(in2102, _mm512_set1_ps(-4.25e+00f), tmp14532);
in2093 = _mm512_fmadd_ps(tmp14526, _mm512_set1_ps(5.25e+00f), in2093);
in2098 = _mm512_fmadd_ps(tmp14531, _mm512_set1_ps(5.25e+00f), in2098);
tmp14526 = _mm512_mul_ps(in2095, _mm512_set1_ps(2.5e-01f));
tmp14531 = _mm512_mul_ps(in2100, _mm512_set1_ps(2.5e-01f));
in2095 = _mm512_mul_ps(in2095, _mm512_set1_ps(4e+00f));
in2100 = _mm512_mul_ps(in2100, _mm512_set1_ps(4e+00f));
__m512 tmp14528 = _mm512_sub_ps(tmp14527, tmp14525);
__m512 tmp14533 = _mm512_sub_ps(tmp14532, tmp14530);
tmp14527 = _mm512_add_ps(tmp14525, tmp14527);
tmp14532 = _mm512_add_ps(tmp14530, tmp14532);
tmp14525 = _mm512_mul_ps(in2094, _mm512_set1_ps(2.5e-01f));
tmp14530 = _mm512_mul_ps(in2099, _mm512_set1_ps(2.5e-01f));
tmp14526 = _mm512_fmadd_ps(in2097, _mm512_set1_ps(-1.25e+00f), tmp14526);
tmp14531 = _mm512_fmadd_ps(in2102, _mm512_set1_ps(-1.25e+00f), tmp14531);
in2097 = _mm512_fmadd_ps(in2097, _mm512_set1_ps(-5e+00f), in2095);
in2102 = _mm512_fmadd_ps(in2102, _mm512_set1_ps(-5e+00f), in2100);
tmp14525 = _mm512_fmadd_ps(in2096, _mm512_set1_ps(-1.25e+00f), tmp14525);
tmp14530 = _mm512_fmadd_ps(in2101, _mm512_set1_ps(-1.25e+00f), tmp14530);
__m512 tmp14529 = _mm512_fmadd_ps(tmp14525, _mm512_set1_ps(2e+00f), tmp14526);
__m512 tmp14534 = _mm512_fmadd_ps(tmp14530, _mm512_set1_ps(2e+00f), tmp14531);
tmp14526 = _mm512_fnmadd_ps(tmp14525, _mm512_set1_ps(2e+00f), tmp14526);
tmp14531 = _mm512_fnmadd_ps(tmp14530, _mm512_set1_ps(2e+00f), tmp14531);
tmp14525 = in2094;
tmp14530 = in2099;
in2094 = _mm512_sub_ps(_mm512_setzero_ps(), in2094);
in2099 = _mm512_sub_ps(_mm512_setzero_ps(), in2099);
tmp14525 = _mm512_fmadd_ps(in2096, _mm512_set1_ps(-1.25e+00f), tmp14525);
tmp14530 = _mm512_fmadd_ps(in2101, _mm512_set1_ps(-1.25e+00f), tmp14530);
in2096 = in2096;
in2101 = in2101;
in2096 = _mm512_fmadd_ps(in2096, _mm512_set1_ps(5.25e+00f), in2094);
in2101 = _mm512_fmadd_ps(in2101, _mm512_set1_ps(5.25e+00f), in2099);
in2095 = _mm512_fmadd_ps(tmp14525, _mm512_set1_ps(2e+00f), in2097);
in2100 = _mm512_fmadd_ps(tmp14530, _mm512_set1_ps(2e+00f), in2102);
in2097 = _mm512_fnmadd_ps(tmp14525, _mm512_set1_ps(2e+00f), in2097);
in2102 = _mm512_fnmadd_ps(tmp14530, _mm512_set1_ps(2e+00f), in2102);
__m512 tmp14545 = _mm512_unpacklo_ps(in2093, tmp14527);
__m512 tmp14546 = _mm512_unpackhi_ps(in2093, tmp14527);
__m512 tmp14547 = _mm512_unpacklo_ps(tmp14528, tmp14529);
__m512 tmp14548 = _mm512_unpackhi_ps(tmp14528, tmp14529);
__m512 tmp14549 = _mm512_unpacklo_ps(tmp14526, in2095);
__m512 tmp14550 = _mm512_unpackhi_ps(tmp14526, in2095);
__m512 tmp14551 = _mm512_unpacklo_ps(in2097, in2096);
__m512 tmp14552 = _mm512_unpackhi_ps(in2097, in2096);
__m512 tmp14553 = _mm512_unpacklo_ps(in2098, tmp14532);
__m512 tmp14554 = _mm512_unpackhi_ps(in2098, tmp14532);
__m512 tmp14555 = _mm512_unpacklo_ps(tmp14533, tmp14534);
__m512 tmp14556 = _mm512_unpackhi_ps(tmp14533, tmp14534);
__m512 tmp14557 = _mm512_unpacklo_ps(tmp14531, in2100);
__m512 tmp14558 = _mm512_unpackhi_ps(tmp14531, in2100);
__m512 tmp14559 = _mm512_unpacklo_ps(in2102, in2101);
__m512 tmp14560 = _mm512_unpackhi_ps(in2102, in2101);
__m512 tmp14561 = _mm512_shuffle_ps(tmp14545, tmp14547, 68);
__m512 tmp14562 = _mm512_shuffle_ps(tmp14545, tmp14547, 238);
__m512 tmp14563 = _mm512_shuffle_ps(tmp14546, tmp14548, 68);
__m512 tmp14564 = _mm512_shuffle_ps(tmp14546, tmp14548, 238);
__m512 tmp14565 = _mm512_shuffle_ps(tmp14549, tmp14551, 68);
__m512 tmp14566 = _mm512_shuffle_ps(tmp14549, tmp14551, 238);
__m512 tmp14567 = _mm512_shuffle_ps(tmp14550, tmp14552, 68);
__m512 tmp14568 = _mm512_shuffle_ps(tmp14550, tmp14552, 238);
__m512 tmp14569 = _mm512_shuffle_ps(tmp14553, tmp14555, 68);
__m512 tmp14570 = _mm512_shuffle_ps(tmp14553, tmp14555, 238);
__m512 tmp14571 = _mm512_shuffle_ps(tmp14554, tmp14556, 68);
__m512 tmp14572 = _mm512_shuffle_ps(tmp14554, tmp14556, 238);
__m512 tmp14573 = _mm512_shuffle_ps(tmp14557, tmp14559, 68);
__m512 tmp14574 = _mm512_shuffle_ps(tmp14557, tmp14559, 238);
__m512 tmp14575 = _mm512_shuffle_ps(tmp14558, tmp14560, 68);
__m512 tmp14576 = _mm512_shuffle_ps(tmp14558, tmp14560, 238);
__m512 tmp14577 = _mm512_shuffle_f32x4(tmp14561, tmp14565, 136);
__m512 tmp14578 = _mm512_shuffle_f32x4(tmp14561, tmp14565, 221);
__m512 tmp14579 = _mm512_shuffle_f32x4(tmp14562, tmp14566, 136);
__m512 tmp14580 = _mm512_shuffle_f32x4(tmp14562, tmp14566, 221);
__m512 tmp14581 = _mm512_shuffle_f32x4(tmp14563, tmp14567, 136);
__m512 tmp14582 = _mm512_shuffle_f32x4(tmp14563, tmp14567, 221);
__m512 tmp14583 = _mm512_shuffle_f32x4(tmp14564, tmp14568, 136);
__m512 tmp14584 = _mm512_shuffle_f32x4(tmp14564, tmp14568, 221);
__m512 tmp14585 = _mm512_shuffle_f32x4(tmp14569, tmp14573, 136);
__m512 tmp14586 = _mm512_shuffle_f32x4(tmp14569, tmp14573, 221);
__m512 tmp14587 = _mm512_shuffle_f32x4(tmp14570, tmp14574, 136);
__m512 tmp14588 = _mm512_shuffle_f32x4(tmp14570, tmp14574, 221);
__m512 tmp14589 = _mm512_shuffle_f32x4(tmp14571, tmp14575, 136);
__m512 tmp14590 = _mm512_shuffle_f32x4(tmp14571, tmp14575, 221);
__m512 tmp14591 = _mm512_shuffle_f32x4(tmp14572, tmp14576, 136);
__m512 tmp14592 = _mm512_shuffle_f32x4(tmp14572, tmp14576, 221);
in2093 = _mm512_shuffle_f32x4(tmp14577, tmp14585, 136);
in2098 = _mm512_shuffle_f32x4(tmp14577, tmp14585, 221);
tmp14527 = _mm512_shuffle_f32x4(tmp14579, tmp14587, 136);
tmp14532 = _mm512_shuffle_f32x4(tmp14579, tmp14587, 221);
tmp14528 = _mm512_shuffle_f32x4(tmp14581, tmp14589, 136);
tmp14533 = _mm512_shuffle_f32x4(tmp14581, tmp14589, 221);
tmp14529 = _mm512_shuffle_f32x4(tmp14583, tmp14591, 136);
tmp14534 = _mm512_shuffle_f32x4(tmp14583, tmp14591, 221);
tmp14526 = _mm512_shuffle_f32x4(tmp14578, tmp14586, 136);
tmp14531 = _mm512_shuffle_f32x4(tmp14578, tmp14586, 221);
in2095 = _mm512_shuffle_f32x4(tmp14580, tmp14588, 136);
in2097 = _mm512_shuffle_f32x4(tmp14582, tmp14590, 136);
in2096 = _mm512_shuffle_f32x4(tmp14584, tmp14592, 136);
(void)in2095;
(void)in2097;
(void)in2096;
__m512 tmp14535 = tmp14527;
__m512 tmp14540 = tmp14532;
__m512 tmp14536 = _mm512_sub_ps(tmp14526, tmp14528);
__m512 tmp14541 = _mm512_sub_ps(tmp14531, tmp14533);
__m512 tmp14537 = tmp14528;
__m512 tmp14542 = tmp14533;
in2093 = in2093;
in2098 = in2098;
tmp14535 = _mm512_fmadd_ps(tmp14529, _mm512_set1_ps(-4.25e+00f), tmp14535);
tmp14540 = _mm512_fmadd_ps(tmp14534, _mm512_set1_ps(-4.25e+00f), tmp14540);
tmp14537 = _mm512_fmadd_ps(tmp14526, _mm512_set1_ps(-4.25e+00f), tmp14537);
tmp14542 = _mm512_fmadd_ps(tmp14531, _mm512_set1_ps(-4.25e+00f), tmp14542);
in2093 = _mm512_fmadd_ps(tmp14536, _mm512_set1_ps(5.25e+00f), in2093);
in2098 = _mm512_fmadd_ps(tmp14541, _mm512_set1_ps(5.25e+00f), in2098);
tmp14536 = _mm512_mul_ps(tmp14528, _mm512_set1_ps(2.5e-01f));
tmp14541 = _mm512_mul_ps(tmp14533, _mm512_set1_ps(2.5e-01f));
tmp14528 = _mm512_mul_ps(tmp14528, _mm512_set1_ps(4e+00f));
tmp14533 = _mm512_mul_ps(tmp14533, _mm512_set1_ps(4e+00f));
__m512 tmp14538 = _mm512_sub_ps(tmp14537, tmp14535);
__m512 tmp14543 = _mm512_sub_ps(tmp14542, tmp14540);
tmp14537 = _mm512_add_ps(tmp14535, tmp14537);
tmp14542 = _mm512_add_ps(tmp14540, tmp14542);
tmp14535 = _mm512_mul_ps(tmp14527, _mm512_set1_ps(2.5e-01f));
tmp14540 = _mm512_mul_ps(tmp14532, _mm512_set1_ps(2.5e-01f));
tmp14536 = _mm512_fmadd_ps(tmp14526, _mm512_set1_ps(-1.25e+00f), tmp14536);
tmp14541 = _mm512_fmadd_ps(tmp14531, _mm512_set1_ps(-1.25e+00f), tmp14541);
tmp14526 = _mm512_fmadd_ps(tmp14526, _mm512_set1_ps(-5e+00f), tmp14528);
tmp14531 = _mm512_fmadd_ps(tmp14531, _mm512_set1_ps(-5e+00f), tmp14533);
tmp14535 = _mm512_fmadd_ps(tmp14529, _mm512_set1_ps(-1.25e+00f), tmp14535);
tmp14540 = _mm512_fmadd_ps(tmp14534, _mm512_set1_ps(-1.25e+00f), tmp14540);
__m512 tmp14539 = _mm512_fmadd_ps(tmp14535, _mm512_set1_ps(2e+00f), tmp14536);
__m512 tmp14544 = _mm512_fmadd_ps(tmp14540, _mm512_set1_ps(2e+00f), tmp14541);
tmp14536 = _mm512_fnmadd_ps(tmp14535, _mm512_set1_ps(2e+00f), tmp14536);
tmp14541 = _mm512_fnmadd_ps(tmp14540, _mm512_set1_ps(2e+00f), tmp14541);
tmp14535 = tmp14527;
tmp14540 = tmp14532;
tmp14527 = _mm512_sub_ps(_mm512_setzero_ps(), tmp14527);
tmp14532 = _mm512_sub_ps(_mm512_setzero_ps(), tmp14532);
tmp14535 = _mm512_fmadd_ps(tmp14529, _mm512_set1_ps(-1.25e+00f), tmp14535);
tmp14540 = _mm512_fmadd_ps(tmp14534, _mm512_set1_ps(-1.25e+00f), tmp14540);
tmp14529 = tmp14529;
tmp14534 = tmp14534;
tmp14529 = _mm512_fmadd_ps(tmp14529, _mm512_set1_ps(5.25e+00f), tmp14527);
tmp14534 = _mm512_fmadd_ps(tmp14534, _mm512_set1_ps(5.25e+00f), tmp14532);
tmp14528 = _mm512_fmadd_ps(tmp14535, _mm512_set1_ps(2e+00f), tmp14526);
tmp14533 = _mm512_fmadd_ps(tmp14540, _mm512_set1_ps(2e+00f), tmp14531);
tmp14526 = _mm512_fnmadd_ps(tmp14535, _mm512_set1_ps(2e+00f), tmp14526);
tmp14531 = _mm512_fnmadd_ps(tmp14540, _mm512_set1_ps(2e+00f), tmp14531);
__m512 out1951 = _mm512_shuffle_f32x4(in2093, tmp14537, 68);
__m512 out1959 = _mm512_shuffle_f32x4(in2093, tmp14537, 238);
__m512 out1952 = _mm512_shuffle_f32x4(tmp14538, tmp14539, 68);
__m512 out1960 = _mm512_shuffle_f32x4(tmp14538, tmp14539, 238);
__m512 out1953 = _mm512_shuffle_f32x4(tmp14536, tmp14528, 68);
__m512 out1961 = _mm512_shuffle_f32x4(tmp14536, tmp14528, 238);
__m512 out1954 = _mm512_shuffle_f32x4(tmp14526, tmp14529, 68);
__m512 out1962 = _mm512_shuffle_f32x4(tmp14526, tmp14529, 238);
__m512 out1955 = _mm512_shuffle_f32x4(in2098, tmp14542, 68);
__m512 out1963 = _mm512_shuffle_f32x4(in2098, tmp14542, 238);
__m512 out1956 = _mm512_shuffle_f32x4(tmp14543, tmp14544, 68);
__m512 out1964 = _mm512_shuffle_f32x4(tmp14543, tmp14544, 238);
__m512 out1957 = _mm512_shuffle_f32x4(tmp14541, tmp14533, 68);
__m512 out1965 = _mm512_shuffle_f32x4(tmp14541, tmp14533, 238);
__m512 out1958 = _mm512_shuffle_f32x4(tmp14531, tmp14534, 68);
__m512 out1966 = _mm512_shuffle_f32x4(tmp14531, tmp14534, 238);
_mm512_storeu_ps(dfPtr10+0+819200*i45+49152*j38+8192*s36+256*k133, out1951);
_mm512_storeu_ps(dfPtr10+128+819200*i45+49152*j38+8192*s36+256*k133, out1959);
_mm512_storeu_ps(dfPtr10+64+819200*i45+49152*j38+8192*s36+256*k133, out1955);
_mm512_storeu_ps(dfPtr10+192+819200*i45+49152*j38+8192*s36+256*k133, out1963);
_mm512_storeu_ps(dfPtr10+204800+819200*i45+49152*j38+8192*s36+256*k133, out1952);
_mm512_storeu_ps(dfPtr10+204928+819200*i45+49152*j38+8192*s36+256*k133, out1960);
_mm512_storeu_ps(dfPtr10+204864+819200*i45+49152*j38+8192*s36+256*k133, out1956);
_mm512_storeu_ps(dfPtr10+204992+819200*i45+49152*j38+8192*s36+256*k133, out1964);
_mm512_storeu_ps(dfPtr10+409600+819200*i45+49152*j38+8192*s36+256*k133, out1953);
_mm512_storeu_ps(dfPtr10+409728+819200*i45+49152*j38+8192*s36+256*k133, out1961);
_mm512_storeu_ps(dfPtr10+409664+819200*i45+49152*j38+8192*s36+256*k133, out1957);
_mm512_storeu_ps(dfPtr10+409792+819200*i45+49152*j38+8192*s36+256*k133, out1965);
_mm512_storeu_ps(dfPtr10+614400+819200*i45+49152*j38+8192*s36+256*k133, out1954);
_mm512_storeu_ps(dfPtr10+614528+819200*i45+49152*j38+8192*s36+256*k133, out1962);
_mm512_storeu_ps(dfPtr10+614464+819200*i45+49152*j38+8192*s36+256*k133, out1958);
_mm512_storeu_ps(dfPtr10+614592+819200*i45+49152*j38+8192*s36+256*k133, out1966);
}
if (j38 >= last9) return;
++j38;
}

// Describes the data-arrangement step as a 4-dimensional task whose
// per-dimension extents are {1, 5, 1, 1}, with
// ResNet50ThreeArrangeDats4Callee1 as the worker function and the tensor
// pointer table as its opaque argument, then passes the task to
// ResNet50ThreaderDo1 together with the thread team.
static void ResNet50ThreeArrangeDats4(ResNet50ThreaderTeam1* team50, char** tensors73) {
	// Extent of each task dimension, in hull1 index order.
	const int extents[4] = {1, 5, 1, 1};
	ResNet50ThreaderTask1 job;
	int dim;
	job.nd1 = 4;
	for (dim = 0; dim < 4; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = tensors73;
	job.callee1 = ResNet50ThreeArrangeDats4Callee1;
	ResNet50ThreaderDo1(team50, &job);
}

// Worker function for one work item of the "produce sums" step.
//
// task78->any1 is an array of pointers whose slot 0 is the tensor pointer
// table (char**). pt44[0], pt44[1] and pt44[2] are the work-item coordinates
// w62, d15 and f47 respectively.
//
// For the selected (j39, k134) slice the function accumulates, over 128
// steps of b, products of weight vectors (stored as fp16 in wfPtr11 and
// widened to fp32) with fp32 data vectors from dfPtr11, and stores the fp32
// accumulators to sfPtr10. Each call handles four consecutive values of l
// starting at 4*w62.
// NOTE(review): the df/sf buffers presumably hold transform-domain
// (Winograd-style) tiles produced/consumed by the neighbouring steps —
// confirm against the generator before relying on that.
static void ResNet50ThreeProduceSums4Callee1(ResNet50ThreaderTask1* task78, int64_t* pt44) {
void** pair20 = task78->any1;
char** tensors76 = pair20[0];
// e23 and g26 are fixed at zero here, so every term scaled by them (and by
// i46 below) contributes nothing to the computed addresses.
ptrdiff_t e23 = 0;
ptrdiff_t g26 = 0;
ptrdiff_t f47 = pt44[2];
ptrdiff_t d15 = pt44[1];
ptrdiff_t w62 = pt44[0];
// tensors76[0] starts with a 512-byte region read as floats (bias values),
// followed at byte 512 by the fp16 weights. tensors76[1] is the data input
// and tensors76[2] is the sums output.
char*restrict bfPtr11 = tensors76[0]+512*e23;
char*restrict wfPtr11 = tensors76[0]+512+6488064*e23;
char*restrict dfPtr11 = tensors76[1]+2534400*e23;
char*restrict sfPtr10 = tensors76[2];
ptrdiff_t i46 = 1*g26;
ptrdiff_t j39 = 1*f47;
ptrdiff_t k134 = 1*d15;
// kk43 equals the starting k134, so the "k134 >= kk43" test at the bottom of
// the loop returns after the first iteration: one k slice per call.
ptrdiff_t kk43 = k134+0;
// Wide path, taken when d15 is not 4: six data vectors per b step.
for (; k134 != 4; ++k134) {
// This call covers l52 = 4*w62 .. 4*w62+3 (ll7 is the last one handled).
ptrdiff_t l52 = 4*w62;
ptrdiff_t ll7 = l52+3;
for (; l52 != 32; ++l52) {
// 4 weight rows x 6 data vectors = 24 accumulators. sum364/370/376/382 are
// the per-row seeds; the remaining five accumulators of each row are copied
// from the seed below.
__m512 sum364;
__m512 sum370;
__m512 sum376;
__m512 sum382;
// Only the j39 == 0 slice (hinted as the unlikely case) is seeded with bias:
// mask 512 (bit 9) writes the broadcast bias float into lane 9 only and
// leaves every other lane zero. The four floats at byte offsets 0/4/8/12
// from 16*l52 feed the four rows.
if (__builtin_expect(!j39, 0)) {
sum364 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr11+0+512*i46+16*l52)));
sum370 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr11+4+512*i46+16*l52)));
sum376 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr11+8+512*i46+16*l52)));
sum382 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr11+12+512*i46+16*l52)));
} else {
sum364 = _mm512_setzero_ps();
sum370 = _mm512_setzero_ps();
sum376 = _mm512_setzero_ps();
sum382 = _mm512_setzero_ps();
}
__m512 sum365 = sum364;
__m512 sum366 = sum364;
__m512 sum367 = sum364;
__m512 sum368 = sum364;
__m512 sum369 = sum364;
__m512 sum371 = sum370;
__m512 sum372 = sum370;
__m512 sum373 = sum370;
__m512 sum374 = sum370;
__m512 sum375 = sum370;
__m512 sum377 = sum376;
__m512 sum378 = sum376;
__m512 sum379 = sum376;
__m512 sum380 = sum376;
__m512 sum381 = sum376;
__m512 sum383 = sum382;
__m512 sum384 = sum382;
__m512 sum385 = sum382;
__m512 sum386 = sum382;
__m512 sum387 = sum382;
// Reduction over b: each step reads 128 bytes of weights (two 64-byte loads)
// and 384 bytes of data (six 64-byte vectors).
ptrdiff_t b60 = 0;
for (; b60 != 128; ++b60) {
// Mask 65535 enables all 16 dword lanes, i.e. a full 64-byte load. The low
// and high 256-bit halves are each widened from 16 fp16 values to 16 floats.
__m512i wfs29 = _mm512_maskz_loadu_epi32(65535, wfPtr11+0+2097152*i46+524288*j39+16384*l52+128*b60);
__m512 wf121 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs29));
// Row 0: lane-wise multiply-accumulate of wf121 with each of the six data
// vectors; the data vectors are reused by the other three rows.
__m512 df668 = _mm512_loadu_ps(dfPtr11+0+819200*i46+204800*j39+49152*k134+384*b60);
sum364 = _mm512_fmadd_ps(wf121, df668, sum364);
__m512 df669 = _mm512_loadu_ps(dfPtr11+64+819200*i46+204800*j39+49152*k134+384*b60);
sum365 = _mm512_fmadd_ps(wf121, df669, sum365);
__m512 df670 = _mm512_loadu_ps(dfPtr11+128+819200*i46+204800*j39+49152*k134+384*b60);
sum366 = _mm512_fmadd_ps(wf121, df670, sum366);
__m512 df671 = _mm512_loadu_ps(dfPtr11+192+819200*i46+204800*j39+49152*k134+384*b60);
sum367 = _mm512_fmadd_ps(wf121, df671, sum367);
__m512 df672 = _mm512_loadu_ps(dfPtr11+256+819200*i46+204800*j39+49152*k134+384*b60);
sum368 = _mm512_fmadd_ps(wf121, df672, sum368);
__m512 df673 = _mm512_loadu_ps(dfPtr11+320+819200*i46+204800*j39+49152*k134+384*b60);
sum369 = _mm512_fmadd_ps(wf121, df673, sum369);
// Row 1: upper half of the first weight load.
__m512 wf122 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs29, 1));
sum370 = _mm512_fmadd_ps(wf122, df668, sum370);
sum371 = _mm512_fmadd_ps(wf122, df669, sum371);
sum372 = _mm512_fmadd_ps(wf122, df670, sum372);
sum373 = _mm512_fmadd_ps(wf122, df671, sum373);
sum374 = _mm512_fmadd_ps(wf122, df672, sum374);
sum375 = _mm512_fmadd_ps(wf122, df673, sum375);
// Rows 2 and 3: lower and upper halves of the second weight load (+64 bytes).
__m512i wfs30 = _mm512_maskz_loadu_epi32(65535, wfPtr11+64+2097152*i46+524288*j39+16384*l52+128*b60);
__m512 wf123 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs30));
sum376 = _mm512_fmadd_ps(wf123, df668, sum376);
sum377 = _mm512_fmadd_ps(wf123, df669, sum377);
sum378 = _mm512_fmadd_ps(wf123, df670, sum378);
sum379 = _mm512_fmadd_ps(wf123, df671, sum379);
sum380 = _mm512_fmadd_ps(wf123, df672, sum380);
sum381 = _mm512_fmadd_ps(wf123, df673, sum381);
__m512 wf124 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs30, 1));
sum382 = _mm512_fmadd_ps(wf124, df668, sum382);
sum383 = _mm512_fmadd_ps(wf124, df669, sum383);
sum384 = _mm512_fmadd_ps(wf124, df670, sum384);
sum385 = _mm512_fmadd_ps(wf124, df671, sum385);
sum386 = _mm512_fmadd_ps(wf124, df672, sum386);
sum387 = _mm512_fmadd_ps(wf124, df673, sum387);
}
// Store the 24 accumulators contiguously: 1536 bytes per l52, row-major
// (row 0's six vectors, then row 1's, and so on).
_mm512_storeu_ps(sfPtr10+0+819200*i46+204800*j39+49152*k134+1536*l52, sum364);
_mm512_storeu_ps(sfPtr10+64+819200*i46+204800*j39+49152*k134+1536*l52, sum365);
_mm512_storeu_ps(sfPtr10+128+819200*i46+204800*j39+49152*k134+1536*l52, sum366);
_mm512_storeu_ps(sfPtr10+192+819200*i46+204800*j39+49152*k134+1536*l52, sum367);
_mm512_storeu_ps(sfPtr10+256+819200*i46+204800*j39+49152*k134+1536*l52, sum368);
_mm512_storeu_ps(sfPtr10+320+819200*i46+204800*j39+49152*k134+1536*l52, sum369);
_mm512_storeu_ps(sfPtr10+384+819200*i46+204800*j39+49152*k134+1536*l52, sum370);
_mm512_storeu_ps(sfPtr10+448+819200*i46+204800*j39+49152*k134+1536*l52, sum371);
_mm512_storeu_ps(sfPtr10+512+819200*i46+204800*j39+49152*k134+1536*l52, sum372);
_mm512_storeu_ps(sfPtr10+576+819200*i46+204800*j39+49152*k134+1536*l52, sum373);
_mm512_storeu_ps(sfPtr10+640+819200*i46+204800*j39+49152*k134+1536*l52, sum374);
_mm512_storeu_ps(sfPtr10+704+819200*i46+204800*j39+49152*k134+1536*l52, sum375);
_mm512_storeu_ps(sfPtr10+768+819200*i46+204800*j39+49152*k134+1536*l52, sum376);
_mm512_storeu_ps(sfPtr10+832+819200*i46+204800*j39+49152*k134+1536*l52, sum377);
_mm512_storeu_ps(sfPtr10+896+819200*i46+204800*j39+49152*k134+1536*l52, sum378);
_mm512_storeu_ps(sfPtr10+960+819200*i46+204800*j39+49152*k134+1536*l52, sum379);
_mm512_storeu_ps(sfPtr10+1024+819200*i46+204800*j39+49152*k134+1536*l52, sum380);
_mm512_storeu_ps(sfPtr10+1088+819200*i46+204800*j39+49152*k134+1536*l52, sum381);
_mm512_storeu_ps(sfPtr10+1152+819200*i46+204800*j39+49152*k134+1536*l52, sum382);
_mm512_storeu_ps(sfPtr10+1216+819200*i46+204800*j39+49152*k134+1536*l52, sum383);
_mm512_storeu_ps(sfPtr10+1280+819200*i46+204800*j39+49152*k134+1536*l52, sum384);
_mm512_storeu_ps(sfPtr10+1344+819200*i46+204800*j39+49152*k134+1536*l52, sum385);
_mm512_storeu_ps(sfPtr10+1408+819200*i46+204800*j39+49152*k134+1536*l52, sum386);
_mm512_storeu_ps(sfPtr10+1472+819200*i46+204800*j39+49152*k134+1536*l52, sum387);
// Stop once the fourth l52 of this work item has been written.
if (l52 >= ll7) return;
}
if (k134 >= kk43) return;
}
// Narrow path, reached only when the loop above was skipped (d15 == 4, so
// k134 is 4 here): this last k slice has a single 64-byte data vector per b
// step (stride 64 instead of 384), hence one accumulator per weight row and
// 256 bytes of output per l53.
ptrdiff_t l53 = 4*w62;
ptrdiff_t ll8 = l53+3;
for (; l53 != 32; ++l53) {
__m512 sum388;
__m512 sum389;
__m512 sum390;
__m512 sum391;
// Same bias seeding as the wide path: lane 9 only, and only for j39 == 0.
if (__builtin_expect(!j39, 0)) {
sum388 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr11+0+512*i46+16*l53)));
sum389 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr11+4+512*i46+16*l53)));
sum390 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr11+8+512*i46+16*l53)));
sum391 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr11+12+512*i46+16*l53)));
} else {
sum388 = _mm512_setzero_ps();
sum389 = _mm512_setzero_ps();
sum390 = _mm512_setzero_ps();
sum391 = _mm512_setzero_ps();
}
ptrdiff_t b61 = 0;
for (; b61 != 128; ++b61) {
// Two 64-byte fp16 weight loads give four fp32 weight rows, each multiplied
// with the single data vector df674.
__m512i wfs31 = _mm512_maskz_loadu_epi32(65535, wfPtr11+0+2097152*i46+524288*j39+16384*l53+128*b61);
__m512 wf125 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs31));
__m512 df674 = _mm512_loadu_ps(dfPtr11+0+819200*i46+204800*j39+49152*k134+64*b61);
sum388 = _mm512_fmadd_ps(wf125, df674, sum388);
__m512 wf126 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs31, 1));
sum389 = _mm512_fmadd_ps(wf126, df674, sum389);
__m512i wfs32 = _mm512_maskz_loadu_epi32(65535, wfPtr11+64+2097152*i46+524288*j39+16384*l53+128*b61);
__m512 wf127 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs32));
sum390 = _mm512_fmadd_ps(wf127, df674, sum390);
__m512 wf128 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs32, 1));
sum391 = _mm512_fmadd_ps(wf128, df674, sum391);
}
_mm512_storeu_ps(sfPtr10+0+819200*i46+204800*j39+49152*k134+256*l53, sum388);
_mm512_storeu_ps(sfPtr10+64+819200*i46+204800*j39+49152*k134+256*l53, sum389);
_mm512_storeu_ps(sfPtr10+128+819200*i46+204800*j39+49152*k134+256*l53, sum390);
_mm512_storeu_ps(sfPtr10+192+819200*i46+204800*j39+49152*k134+256*l53, sum391);
// Stop once the fourth l53 of this work item has been written.
if (l53 >= ll8) return;
}
}

// Describes the sum-production step as a 4-dimensional task whose
// per-dimension extents are {8, 5, 4, 1}, with
// ResNet50ThreeProduceSums4Callee1 as the worker function, then passes the
// task to ResNet50ThreaderDo1 together with the thread team. The worker's
// opaque argument is a two-slot pointer array: slot 0 is the tensor pointer
// table and slot 1 is null.
static void ResNet50ThreeProduceSums4(ResNet50ThreaderTeam1* team51, char** tensors75) {
	// Extent of each task dimension, in hull1 index order.
	const int extents[4] = {8, 5, 4, 1};
	void* args[2];
	ResNet50ThreaderTask1 job;
	int dim;
	args[0] = tensors75;
	args[1] = 0;
	job.nd1 = 4;
	for (dim = 0; dim < 4; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = args;
	job.callee1 = ResNet50ThreeProduceSums4Callee1;
	ResNet50ThreaderDo1(team51, &job);
}

static void ResNet50ThreeConsumeSums4Callee1(ResNet50ThreaderTask1* task80, int64_t* pt45) {
char** tensors78 = task80->any1;
ptrdiff_t w63 = 0;
ptrdiff_t d16 = pt45[1];
ptrdiff_t g27 = 0;
char*restrict sfPtr11 = tensors78[0];
char*restrict datPtr24 = tensors78[1];
ptrdiff_t i47 = 1*g27;
ptrdiff_t j40 = 1*d16;
ptrdiff_t last10 = j40+0;
ptrdiff_t rel22 = j40-0;
ptrdiff_t base22 = 0;
if (rel22 < 2) {
if (rel22 < 1) {
ptrdiff_t toH43 = base22+0;
ptrdiff_t toW43 = 0;
ptrdiff_t k135 = 32*w63;
for (; k135 != 32; ++k135) {
ptrdiff_t l54 = 0;
for (; l54 != 2; ++l54) {
__m512 sf1009 = _mm512_loadu_ps(sfPtr11+0+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1010 = _mm512_loadu_ps(sfPtr11+128+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2103 = _mm512_shuffle_f32x4(sf1009, sf1010, 68);
__m512 in2104 = _mm512_shuffle_f32x4(sf1009, sf1010, 238);
__m512 sf1011 = _mm512_loadu_ps(sfPtr11+64+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1012 = _mm512_loadu_ps(sfPtr11+192+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2111 = _mm512_shuffle_f32x4(sf1011, sf1012, 68);
__m512 in2112 = _mm512_shuffle_f32x4(sf1011, sf1012, 238);
__m512 sf1013 = _mm512_loadu_ps(sfPtr11+204800+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1014 = _mm512_loadu_ps(sfPtr11+204928+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2105 = _mm512_shuffle_f32x4(sf1013, sf1014, 68);
__m512 in2106 = _mm512_shuffle_f32x4(sf1013, sf1014, 238);
__m512 sf1015 = _mm512_loadu_ps(sfPtr11+204864+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1016 = _mm512_loadu_ps(sfPtr11+204992+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2113 = _mm512_shuffle_f32x4(sf1015, sf1016, 68);
__m512 in2114 = _mm512_shuffle_f32x4(sf1015, sf1016, 238);
__m512 sf1017 = _mm512_loadu_ps(sfPtr11+409600+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1018 = _mm512_loadu_ps(sfPtr11+409728+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2107 = _mm512_shuffle_f32x4(sf1017, sf1018, 68);
__m512 in2108 = _mm512_shuffle_f32x4(sf1017, sf1018, 238);
__m512 sf1019 = _mm512_loadu_ps(sfPtr11+409664+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1020 = _mm512_loadu_ps(sfPtr11+409792+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2115 = _mm512_shuffle_f32x4(sf1019, sf1020, 68);
__m512 in2116 = _mm512_shuffle_f32x4(sf1019, sf1020, 238);
__m512 sf1021 = _mm512_loadu_ps(sfPtr11+614400+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1022 = _mm512_loadu_ps(sfPtr11+614528+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2109 = _mm512_shuffle_f32x4(sf1021, sf1022, 68);
__m512 in2110 = _mm512_shuffle_f32x4(sf1021, sf1022, 238);
__m512 sf1023 = _mm512_loadu_ps(sfPtr11+614464+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1024 = _mm512_loadu_ps(sfPtr11+614592+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2117 = _mm512_shuffle_f32x4(sf1023, sf1024, 68);
__m512 in2118 = _mm512_shuffle_f32x4(sf1023, sf1024, 238);
__m512 tmp14609 = _mm512_add_ps(in2104, in2105);
__m512 tmp14629 = _mm512_add_ps(in2112, in2113);
__m512 tmp14608 = _mm512_add_ps(in2106, in2107);
__m512 tmp14628 = _mm512_add_ps(in2114, in2115);
__m512 tmp14614 = _mm512_sub_ps(in2106, in2107);
__m512 tmp14634 = _mm512_sub_ps(in2114, in2115);
__m512 tmp14613 = _mm512_sub_ps(in2104, in2105);
__m512 tmp14633 = _mm512_sub_ps(in2112, in2113);
__m512 tmp14610 = _mm512_add_ps(in2108, in2109);
__m512 tmp14630 = _mm512_add_ps(in2116, in2117);
__m512 tmp14615 = _mm512_sub_ps(in2108, in2109);
__m512 tmp14635 = _mm512_sub_ps(in2116, in2117);
__m512 tmp14612 = _mm512_fmadd_ps(tmp14614, _mm512_set1_ps(2e+00f), tmp14613);
__m512 tmp14632 = _mm512_fmadd_ps(tmp14634, _mm512_set1_ps(2e+00f), tmp14633);
__m512 tmp14619 = _mm512_fmadd_ps(tmp14614, _mm512_set1_ps(8e+00f), tmp14613);
__m512 tmp14639 = _mm512_fmadd_ps(tmp14634, _mm512_set1_ps(8e+00f), tmp14633);
__m512 tmp14607 = _mm512_add_ps(tmp14608, tmp14609);
__m512 tmp14627 = _mm512_add_ps(tmp14628, tmp14629);
__m512 tmp14611 = _mm512_fmadd_ps(tmp14615, _mm512_set1_ps(1.6e+01f), tmp14612);
__m512 tmp14631 = _mm512_fmadd_ps(tmp14635, _mm512_set1_ps(1.6e+01f), tmp14632);
__m512 tmp14618 = _mm512_fmadd_ps(tmp14615, _mm512_set1_ps(4e+00f), tmp14619);
__m512 tmp14638 = _mm512_fmadd_ps(tmp14635, _mm512_set1_ps(4e+00f), tmp14639);
__m512 tmp14624 = _mm512_add_ps(tmp14615, tmp14613);
__m512 tmp14644 = _mm512_add_ps(tmp14635, tmp14633);
__m512 tmp14617 = _mm512_fmadd_ps(tmp14608, _mm512_set1_ps(4e+00f), tmp14609);
__m512 tmp14637 = _mm512_fmadd_ps(tmp14628, _mm512_set1_ps(4e+00f), tmp14629);
__m512 tmp14621 = _mm512_fmadd_ps(tmp14608, _mm512_set1_ps(1.6e+01f), tmp14609);
__m512 tmp14641 = _mm512_fmadd_ps(tmp14628, _mm512_set1_ps(1.6e+01f), tmp14629);
__m512 tmp14606 = _mm512_add_ps(tmp14607, in2103);
__m512 tmp14626 = _mm512_add_ps(tmp14627, in2111);
__m512 tmp14623 = _mm512_add_ps(tmp14624, in2110);
__m512 tmp14643 = _mm512_add_ps(tmp14644, in2118);
__m512 tmp14605 = _mm512_fmadd_ps(tmp14610, _mm512_set1_ps(3.2e+01f), tmp14606);
__m512 tmp14625 = _mm512_fmadd_ps(tmp14630, _mm512_set1_ps(3.2e+01f), tmp14626);
__m512 tmp14616 = _mm512_fmadd_ps(tmp14610, _mm512_set1_ps(8e+00f), tmp14617);
__m512 tmp14636 = _mm512_fmadd_ps(tmp14630, _mm512_set1_ps(8e+00f), tmp14637);
__m512 tmp14622 = _mm512_fmadd_ps(tmp14614, _mm512_set1_ps(3.2e+01f), tmp14623);
__m512 tmp14642 = _mm512_fmadd_ps(tmp14634, _mm512_set1_ps(3.2e+01f), tmp14643);
__m512 tmp14620 = _mm512_fmadd_ps(tmp14610, _mm512_set1_ps(2e+00f), tmp14621);
__m512 tmp14640 = _mm512_fmadd_ps(tmp14630, _mm512_set1_ps(2e+00f), tmp14641);
__m512 tmp14593 = tmp14605;
__m512 tmp14599 = tmp14625;
__m512 tmp14594 = tmp14611;
__m512 tmp14600 = tmp14631;
__m512 tmp14595 = tmp14616;
__m512 tmp14601 = tmp14636;
__m512 tmp14596 = tmp14618;
__m512 tmp14602 = tmp14638;
__m512 tmp14597 = tmp14620;
__m512 tmp14603 = tmp14640;
__m512 tmp14598 = tmp14622;
__m512 tmp14604 = tmp14642;
__m512 tmp14689 = _mm512_unpacklo_ps(tmp14593, tmp14594);
__m512 tmp14690 = _mm512_unpackhi_ps(tmp14593, tmp14594);
__m512 tmp14691 = _mm512_unpacklo_ps(tmp14595, tmp14596);
__m512 tmp14692 = _mm512_unpackhi_ps(tmp14595, tmp14596);
__m512 tmp14693 = _mm512_unpacklo_ps(tmp14597, tmp14598);
__m512 tmp14694 = _mm512_unpackhi_ps(tmp14597, tmp14598);
__m512 tmp14695 = _mm512_unpacklo_ps(tmp14599, tmp14600);
__m512 tmp14696 = _mm512_unpackhi_ps(tmp14599, tmp14600);
__m512 tmp14697 = _mm512_unpacklo_ps(tmp14601, tmp14602);
__m512 tmp14698 = _mm512_unpackhi_ps(tmp14601, tmp14602);
__m512 tmp14699 = _mm512_unpacklo_ps(tmp14603, tmp14604);
__m512 tmp14700 = _mm512_unpackhi_ps(tmp14603, tmp14604);
__m512 tmp14701 = _mm512_shuffle_ps(tmp14689, tmp14691, 68);
__m512 tmp14702 = _mm512_shuffle_ps(tmp14689, tmp14691, 238);
__m512 tmp14703 = _mm512_shuffle_ps(tmp14690, tmp14692, 68);
__m512 tmp14704 = _mm512_shuffle_ps(tmp14690, tmp14692, 238);
__m512 tmp14705 = _mm512_shuffle_ps(tmp14693, tmp14695, 68);
__m512 tmp14706 = _mm512_shuffle_ps(tmp14693, tmp14695, 238);
__m512 tmp14707 = _mm512_shuffle_ps(tmp14694, tmp14696, 68);
__m512 tmp14708 = _mm512_shuffle_ps(tmp14694, tmp14696, 238);
__m512 tmp14709 = _mm512_shuffle_ps(tmp14697, tmp14699, 68);
__m512 tmp14710 = _mm512_shuffle_ps(tmp14697, tmp14699, 238);
__m512 tmp14711 = _mm512_shuffle_ps(tmp14698, tmp14700, 68);
__m512 tmp14712 = _mm512_shuffle_ps(tmp14698, tmp14700, 238);
__m512 tmp14713 = _mm512_shuffle_f32x4(tmp14701, tmp14705, 136);
__m512 tmp14714 = _mm512_shuffle_f32x4(tmp14701, tmp14705, 221);
__m512 tmp14715 = _mm512_shuffle_f32x4(tmp14702, tmp14706, 136);
__m512 tmp14716 = _mm512_shuffle_f32x4(tmp14702, tmp14706, 221);
__m512 tmp14717 = _mm512_shuffle_f32x4(tmp14703, tmp14707, 136);
__m512 tmp14718 = _mm512_shuffle_f32x4(tmp14703, tmp14707, 221);
__m512 tmp14719 = _mm512_shuffle_f32x4(tmp14704, tmp14708, 136);
__m512 tmp14720 = _mm512_shuffle_f32x4(tmp14704, tmp14708, 221);
__m512 tmp14721 = _mm512_shuffle_f32x4(tmp14709, tmp14709, 136);
__m512 tmp14722 = _mm512_shuffle_f32x4(tmp14709, tmp14709, 221);
__m512 tmp14723 = _mm512_shuffle_f32x4(tmp14710, tmp14710, 136);
__m512 tmp14724 = _mm512_shuffle_f32x4(tmp14710, tmp14710, 221);
__m512 tmp14725 = _mm512_shuffle_f32x4(tmp14711, tmp14711, 136);
__m512 tmp14726 = _mm512_shuffle_f32x4(tmp14711, tmp14711, 221);
__m512 tmp14727 = _mm512_shuffle_f32x4(tmp14712, tmp14712, 136);
__m512 tmp14728 = _mm512_shuffle_f32x4(tmp14712, tmp14712, 221);
tmp14593 = _mm512_shuffle_f32x4(tmp14713, tmp14721, 136);
tmp14601 = _mm512_shuffle_f32x4(tmp14713, tmp14721, 221);
tmp14594 = _mm512_shuffle_f32x4(tmp14715, tmp14723, 136);
tmp14602 = _mm512_shuffle_f32x4(tmp14715, tmp14723, 221);
tmp14595 = _mm512_shuffle_f32x4(tmp14717, tmp14725, 136);
tmp14603 = _mm512_shuffle_f32x4(tmp14717, tmp14725, 221);
tmp14596 = _mm512_shuffle_f32x4(tmp14719, tmp14727, 136);
tmp14604 = _mm512_shuffle_f32x4(tmp14719, tmp14727, 221);
tmp14597 = _mm512_shuffle_f32x4(tmp14714, tmp14722, 136);
__m512 tmp14645 = _mm512_shuffle_f32x4(tmp14714, tmp14722, 221);
tmp14598 = _mm512_shuffle_f32x4(tmp14716, tmp14724, 136);
__m512 tmp14646 = _mm512_shuffle_f32x4(tmp14716, tmp14724, 221);
tmp14599 = _mm512_shuffle_f32x4(tmp14718, tmp14726, 136);
__m512 tmp14647 = _mm512_shuffle_f32x4(tmp14718, tmp14726, 221);
tmp14600 = _mm512_shuffle_f32x4(tmp14720, tmp14728, 136);
__m512 tmp14648 = _mm512_shuffle_f32x4(tmp14720, tmp14728, 221);
__m512 tmp14653 = _mm512_add_ps(tmp14594, tmp14595);
__m512 tmp14673 = _mm512_add_ps(tmp14602, tmp14603);
__m512 tmp14652 = _mm512_add_ps(tmp14596, tmp14597);
__m512 tmp14672 = _mm512_add_ps(tmp14604, tmp14645);
__m512 tmp14658 = _mm512_sub_ps(tmp14596, tmp14597);
__m512 tmp14678 = _mm512_sub_ps(tmp14604, tmp14645);
__m512 tmp14657 = _mm512_sub_ps(tmp14594, tmp14595);
__m512 tmp14677 = _mm512_sub_ps(tmp14602, tmp14603);
__m512 tmp14654 = _mm512_add_ps(tmp14598, tmp14599);
__m512 tmp14674 = _mm512_add_ps(tmp14646, tmp14647);
__m512 tmp14659 = _mm512_sub_ps(tmp14598, tmp14599);
__m512 tmp14679 = _mm512_sub_ps(tmp14646, tmp14647);
__m512 tmp14656 = _mm512_fmadd_ps(tmp14658, _mm512_set1_ps(2e+00f), tmp14657);
__m512 tmp14676 = _mm512_fmadd_ps(tmp14678, _mm512_set1_ps(2e+00f), tmp14677);
__m512 tmp14663 = _mm512_fmadd_ps(tmp14658, _mm512_set1_ps(8e+00f), tmp14657);
__m512 tmp14683 = _mm512_fmadd_ps(tmp14678, _mm512_set1_ps(8e+00f), tmp14677);
__m512 tmp14651 = _mm512_add_ps(tmp14652, tmp14653);
__m512 tmp14671 = _mm512_add_ps(tmp14672, tmp14673);
__m512 tmp14655 = _mm512_fmadd_ps(tmp14659, _mm512_set1_ps(1.6e+01f), tmp14656);
__m512 tmp14675 = _mm512_fmadd_ps(tmp14679, _mm512_set1_ps(1.6e+01f), tmp14676);
__m512 tmp14662 = _mm512_fmadd_ps(tmp14659, _mm512_set1_ps(4e+00f), tmp14663);
__m512 tmp14682 = _mm512_fmadd_ps(tmp14679, _mm512_set1_ps(4e+00f), tmp14683);
__m512 tmp14668 = _mm512_add_ps(tmp14659, tmp14657);
__m512 tmp14688 = _mm512_add_ps(tmp14679, tmp14677);
__m512 tmp14661 = _mm512_fmadd_ps(tmp14652, _mm512_set1_ps(4e+00f), tmp14653);
__m512 tmp14681 = _mm512_fmadd_ps(tmp14672, _mm512_set1_ps(4e+00f), tmp14673);
__m512 tmp14665 = _mm512_fmadd_ps(tmp14652, _mm512_set1_ps(1.6e+01f), tmp14653);
__m512 tmp14685 = _mm512_fmadd_ps(tmp14672, _mm512_set1_ps(1.6e+01f), tmp14673);
__m512 tmp14650 = _mm512_add_ps(tmp14651, tmp14593);
__m512 tmp14670 = _mm512_add_ps(tmp14671, tmp14601);
__m512 tmp14667 = _mm512_add_ps(tmp14668, tmp14600);
__m512 tmp14687 = _mm512_add_ps(tmp14688, tmp14648);
__m512 tmp14649 = _mm512_fmadd_ps(tmp14654, _mm512_set1_ps(3.2e+01f), tmp14650);
__m512 tmp14669 = _mm512_fmadd_ps(tmp14674, _mm512_set1_ps(3.2e+01f), tmp14670);
__m512 tmp14660 = _mm512_fmadd_ps(tmp14654, _mm512_set1_ps(8e+00f), tmp14661);
__m512 tmp14680 = _mm512_fmadd_ps(tmp14674, _mm512_set1_ps(8e+00f), tmp14681);
__m512 tmp14666 = _mm512_fmadd_ps(tmp14658, _mm512_set1_ps(3.2e+01f), tmp14667);
__m512 tmp14686 = _mm512_fmadd_ps(tmp14678, _mm512_set1_ps(3.2e+01f), tmp14687);
__m512 tmp14664 = _mm512_fmadd_ps(tmp14654, _mm512_set1_ps(2e+00f), tmp14665);
__m512 tmp14684 = _mm512_fmadd_ps(tmp14674, _mm512_set1_ps(2e+00f), tmp14685);
__m512 out1967 = tmp14649;
__m512 out1973 = tmp14669;
__m512 out1968 = tmp14655;
__m512 out1974 = tmp14675;
__m512 out1969 = tmp14660;
__m512 out1975 = tmp14680;
__m512 out1970 = tmp14662;
__m512 out1976 = tmp14682;
__m512 out1971 = tmp14664;
__m512 out1977 = tmp14684;
__m512 out1972 = tmp14666;
__m512 out1978 = tmp14686;
out1967 = _mm512_max_ps(_mm512_setzero_ps(), out1967);
out1973 = _mm512_max_ps(_mm512_setzero_ps(), out1973);
out1968 = _mm512_max_ps(_mm512_setzero_ps(), out1968);
out1974 = _mm512_max_ps(_mm512_setzero_ps(), out1974);
out1969 = _mm512_max_ps(_mm512_setzero_ps(), out1969);
out1975 = _mm512_max_ps(_mm512_setzero_ps(), out1975);
out1970 = _mm512_max_ps(_mm512_setzero_ps(), out1970);
out1976 = _mm512_max_ps(_mm512_setzero_ps(), out1976);
out1971 = _mm512_max_ps(_mm512_setzero_ps(), out1971);
out1977 = _mm512_max_ps(_mm512_setzero_ps(), out1977);
out1972 = _mm512_max_ps(_mm512_setzero_ps(), out1972);
out1978 = _mm512_max_ps(_mm512_setzero_ps(), out1978);
_mm512_mask_storeu_ps(datPtr24+0+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1967);
_mm512_mask_storeu_ps(datPtr24+48+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1973);
_mm512_mask_storeu_ps(datPtr24+112+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1968);
_mm512_mask_storeu_ps(datPtr24+160+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1974);
_mm512_mask_storeu_ps(datPtr24+224+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1969);
_mm512_mask_storeu_ps(datPtr24+272+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1975);
_mm512_mask_storeu_ps(datPtr24+336+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1970);
_mm512_mask_storeu_ps(datPtr24+384+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1976);
_mm512_mask_storeu_ps(datPtr24+448+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1971);
_mm512_mask_storeu_ps(datPtr24+496+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1977);
_mm512_mask_storeu_ps(datPtr24+560+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1972);
_mm512_mask_storeu_ps(datPtr24+608+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1978);
// Gather the inputs for the next sub-tile's output transform.
// Two streams are read: one from constant offsets +256/+384 (-> in2119..in2126) and one
// from +320/+448 (-> in2127..in2134). Each stream takes four load pairs whose constant
// offsets are 204800 apart.
// NOTE(review): the offsets look like byte offsets (64 = one __m512) - confirm the
// pointee type of sfPtr11.
// _mm512_shuffle_f32x4(a, b, 68) yields {a[255:0], b[255:0]}; immediate 238 yields
// {a[511:256], b[511:256]}. Here the operands are (higher-address load, lower-address
// load), so the higher-address load supplies the low 256 bits of every in21xx value.
// This is the reverse of the operand order used by the following group (sf1041...);
// presumably intentional in the generated layout - confirm before changing.
__m512 sf1025 = _mm512_loadu_ps(sfPtr11+256+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1026 = _mm512_loadu_ps(sfPtr11+384+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2119 = _mm512_shuffle_f32x4(sf1026, sf1025, 68);
__m512 in2120 = _mm512_shuffle_f32x4(sf1026, sf1025, 238);
__m512 sf1027 = _mm512_loadu_ps(sfPtr11+320+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1028 = _mm512_loadu_ps(sfPtr11+448+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2127 = _mm512_shuffle_f32x4(sf1028, sf1027, 68);
__m512 in2128 = _mm512_shuffle_f32x4(sf1028, sf1027, 238);
// Second load pair of each stream (constant offset +204800).
__m512 sf1029 = _mm512_loadu_ps(sfPtr11+205056+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1030 = _mm512_loadu_ps(sfPtr11+205184+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2121 = _mm512_shuffle_f32x4(sf1030, sf1029, 68);
__m512 in2122 = _mm512_shuffle_f32x4(sf1030, sf1029, 238);
__m512 sf1031 = _mm512_loadu_ps(sfPtr11+205120+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1032 = _mm512_loadu_ps(sfPtr11+205248+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2129 = _mm512_shuffle_f32x4(sf1032, sf1031, 68);
__m512 in2130 = _mm512_shuffle_f32x4(sf1032, sf1031, 238);
// Third load pair of each stream (constant offset +409600).
__m512 sf1033 = _mm512_loadu_ps(sfPtr11+409856+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1034 = _mm512_loadu_ps(sfPtr11+409984+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2123 = _mm512_shuffle_f32x4(sf1034, sf1033, 68);
__m512 in2124 = _mm512_shuffle_f32x4(sf1034, sf1033, 238);
__m512 sf1035 = _mm512_loadu_ps(sfPtr11+409920+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1036 = _mm512_loadu_ps(sfPtr11+410048+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2131 = _mm512_shuffle_f32x4(sf1036, sf1035, 68);
__m512 in2132 = _mm512_shuffle_f32x4(sf1036, sf1035, 238);
// Fourth load pair of each stream (constant offset +614400).
__m512 sf1037 = _mm512_loadu_ps(sfPtr11+614656+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1038 = _mm512_loadu_ps(sfPtr11+614784+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2125 = _mm512_shuffle_f32x4(sf1038, sf1037, 68);
__m512 in2126 = _mm512_shuffle_f32x4(sf1038, sf1037, 238);
__m512 sf1039 = _mm512_loadu_ps(sfPtr11+614720+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1040 = _mm512_loadu_ps(sfPtr11+614848+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2133 = _mm512_shuffle_f32x4(sf1040, sf1039, 68);
__m512 in2134 = _mm512_shuffle_f32x4(sf1040, sf1039, 238);
// First transform pass, applied lane-wise to two independent, interleaved streams:
//   stream A: r0..r7 = in2119..in2126  -> tmp14729..tmp14734
//   stream B: r0..r7 = in2127..in2134  -> tmp14735..tmp14740
// Each stream reduces eight inputs to six outputs (each step is a fused multiply-add):
//   o0 = r0 + (r1+r2) +    (r3+r4) + 32*(r5+r6)
//   o1 =      (r1-r2) +  2*(r3-r4) + 16*(r5-r6)
//   o2 =      (r1+r2) +  4*(r3+r4) +  8*(r5+r6)
//   o3 =      (r1-r2) +  8*(r3-r4) +  4*(r5-r6)
//   o4 =      (r1+r2) + 16*(r3+r4) +  2*(r5+r6)
//   o5 =      (r1-r2) + 32*(r3-r4) +    (r5-r6) + r7
// NOTE(review): the 1/2/4/8/16/32 coefficient pattern looks like a Winograd-style
// 8-to-6 output transform for the 3x3 convolutions - confirm against the generator.
// Pairwise sums and differences shared by several outputs.
__m512 tmp14745 = _mm512_add_ps(in2120, in2121);
__m512 tmp14765 = _mm512_add_ps(in2128, in2129);
__m512 tmp14744 = _mm512_add_ps(in2122, in2123);
__m512 tmp14764 = _mm512_add_ps(in2130, in2131);
__m512 tmp14750 = _mm512_sub_ps(in2122, in2123);
__m512 tmp14770 = _mm512_sub_ps(in2130, in2131);
__m512 tmp14749 = _mm512_sub_ps(in2120, in2121);
__m512 tmp14769 = _mm512_sub_ps(in2128, in2129);
__m512 tmp14746 = _mm512_add_ps(in2124, in2125);
__m512 tmp14766 = _mm512_add_ps(in2132, in2133);
__m512 tmp14751 = _mm512_sub_ps(in2124, in2125);
__m512 tmp14771 = _mm512_sub_ps(in2132, in2133);
// Accumulate the weighted combinations; the odd outputs (o1, o3, o5) use the
// differences and the even outputs (o0, o2, o4) use the sums.
__m512 tmp14748 = _mm512_fmadd_ps(tmp14750, _mm512_set1_ps(2e+00f), tmp14749);
__m512 tmp14768 = _mm512_fmadd_ps(tmp14770, _mm512_set1_ps(2e+00f), tmp14769);
__m512 tmp14755 = _mm512_fmadd_ps(tmp14750, _mm512_set1_ps(8e+00f), tmp14749);
__m512 tmp14775 = _mm512_fmadd_ps(tmp14770, _mm512_set1_ps(8e+00f), tmp14769);
__m512 tmp14743 = _mm512_add_ps(tmp14744, tmp14745);
__m512 tmp14763 = _mm512_add_ps(tmp14764, tmp14765);
__m512 tmp14747 = _mm512_fmadd_ps(tmp14751, _mm512_set1_ps(1.6e+01f), tmp14748);
__m512 tmp14767 = _mm512_fmadd_ps(tmp14771, _mm512_set1_ps(1.6e+01f), tmp14768);
__m512 tmp14754 = _mm512_fmadd_ps(tmp14751, _mm512_set1_ps(4e+00f), tmp14755);
__m512 tmp14774 = _mm512_fmadd_ps(tmp14771, _mm512_set1_ps(4e+00f), tmp14775);
__m512 tmp14760 = _mm512_add_ps(tmp14751, tmp14749);
__m512 tmp14780 = _mm512_add_ps(tmp14771, tmp14769);
__m512 tmp14753 = _mm512_fmadd_ps(tmp14744, _mm512_set1_ps(4e+00f), tmp14745);
__m512 tmp14773 = _mm512_fmadd_ps(tmp14764, _mm512_set1_ps(4e+00f), tmp14765);
__m512 tmp14757 = _mm512_fmadd_ps(tmp14744, _mm512_set1_ps(1.6e+01f), tmp14745);
__m512 tmp14777 = _mm512_fmadd_ps(tmp14764, _mm512_set1_ps(1.6e+01f), tmp14765);
// Only o0 takes the first input (r0) and only o5 takes the last input (r7).
__m512 tmp14742 = _mm512_add_ps(tmp14743, in2119);
__m512 tmp14762 = _mm512_add_ps(tmp14763, in2127);
__m512 tmp14759 = _mm512_add_ps(tmp14760, in2126);
__m512 tmp14779 = _mm512_add_ps(tmp14780, in2134);
__m512 tmp14741 = _mm512_fmadd_ps(tmp14746, _mm512_set1_ps(3.2e+01f), tmp14742);
__m512 tmp14761 = _mm512_fmadd_ps(tmp14766, _mm512_set1_ps(3.2e+01f), tmp14762);
__m512 tmp14752 = _mm512_fmadd_ps(tmp14746, _mm512_set1_ps(8e+00f), tmp14753);
__m512 tmp14772 = _mm512_fmadd_ps(tmp14766, _mm512_set1_ps(8e+00f), tmp14773);
__m512 tmp14758 = _mm512_fmadd_ps(tmp14750, _mm512_set1_ps(3.2e+01f), tmp14759);
__m512 tmp14778 = _mm512_fmadd_ps(tmp14770, _mm512_set1_ps(3.2e+01f), tmp14779);
__m512 tmp14756 = _mm512_fmadd_ps(tmp14746, _mm512_set1_ps(2e+00f), tmp14757);
__m512 tmp14776 = _mm512_fmadd_ps(tmp14766, _mm512_set1_ps(2e+00f), tmp14777);
// Collect the results in output order: tmp14729..tmp14734 = stream A o0..o5,
// tmp14735..tmp14740 = stream B o0..o5. These names are reassigned later.
__m512 tmp14729 = tmp14741;
__m512 tmp14735 = tmp14761;
__m512 tmp14730 = tmp14747;
__m512 tmp14736 = tmp14767;
__m512 tmp14731 = tmp14752;
__m512 tmp14737 = tmp14772;
__m512 tmp14732 = tmp14754;
__m512 tmp14738 = tmp14774;
__m512 tmp14733 = tmp14756;
__m512 tmp14739 = tmp14776;
__m512 tmp14734 = tmp14758;
__m512 tmp14740 = tmp14778;
// Transpose the twelve 16-lane vectors tmp14729..tmp14740 into sixteen vectors.
// Afterwards lane j (j = 0..11) of every result holds a value taken from the j-th input
// vector (in the order tmp14729..tmp14740), and lanes 12..15 duplicate lanes 8..11.
// The results are:
//   tmp14729..tmp14736                    = input lanes 0..7
//   tmp14737..tmp14740, tmp14781..tmp14784 = input lanes 8..15
// Stage 1: interleave adjacent vector pairs inside each 128-bit lane.
__m512 tmp14825 = _mm512_unpacklo_ps(tmp14729, tmp14730);
__m512 tmp14826 = _mm512_unpackhi_ps(tmp14729, tmp14730);
__m512 tmp14827 = _mm512_unpacklo_ps(tmp14731, tmp14732);
__m512 tmp14828 = _mm512_unpackhi_ps(tmp14731, tmp14732);
__m512 tmp14829 = _mm512_unpacklo_ps(tmp14733, tmp14734);
__m512 tmp14830 = _mm512_unpackhi_ps(tmp14733, tmp14734);
__m512 tmp14831 = _mm512_unpacklo_ps(tmp14735, tmp14736);
__m512 tmp14832 = _mm512_unpackhi_ps(tmp14735, tmp14736);
__m512 tmp14833 = _mm512_unpacklo_ps(tmp14737, tmp14738);
__m512 tmp14834 = _mm512_unpackhi_ps(tmp14737, tmp14738);
__m512 tmp14835 = _mm512_unpacklo_ps(tmp14739, tmp14740);
__m512 tmp14836 = _mm512_unpackhi_ps(tmp14739, tmp14740);
// Stage 2: combine pairs of pairs so each 128-bit lane holds one element index from
// four consecutive input vectors (68 = low 64-bit halves, 238 = high 64-bit halves).
__m512 tmp14837 = _mm512_shuffle_ps(tmp14825, tmp14827, 68);
__m512 tmp14838 = _mm512_shuffle_ps(tmp14825, tmp14827, 238);
__m512 tmp14839 = _mm512_shuffle_ps(tmp14826, tmp14828, 68);
__m512 tmp14840 = _mm512_shuffle_ps(tmp14826, tmp14828, 238);
__m512 tmp14841 = _mm512_shuffle_ps(tmp14829, tmp14831, 68);
__m512 tmp14842 = _mm512_shuffle_ps(tmp14829, tmp14831, 238);
__m512 tmp14843 = _mm512_shuffle_ps(tmp14830, tmp14832, 68);
__m512 tmp14844 = _mm512_shuffle_ps(tmp14830, tmp14832, 238);
__m512 tmp14845 = _mm512_shuffle_ps(tmp14833, tmp14835, 68);
__m512 tmp14846 = _mm512_shuffle_ps(tmp14833, tmp14835, 238);
__m512 tmp14847 = _mm512_shuffle_ps(tmp14834, tmp14836, 68);
__m512 tmp14848 = _mm512_shuffle_ps(tmp14834, tmp14836, 238);
// Stage 3: regroup 128-bit lanes (136 picks lanes 0 and 2 of each operand, 221 picks
// lanes 1 and 3). The last four-vector group has no partner, so it is shuffled with
// itself, which is what produces the duplicated upper lanes.
__m512 tmp14849 = _mm512_shuffle_f32x4(tmp14837, tmp14841, 136);
__m512 tmp14850 = _mm512_shuffle_f32x4(tmp14837, tmp14841, 221);
__m512 tmp14851 = _mm512_shuffle_f32x4(tmp14838, tmp14842, 136);
__m512 tmp14852 = _mm512_shuffle_f32x4(tmp14838, tmp14842, 221);
__m512 tmp14853 = _mm512_shuffle_f32x4(tmp14839, tmp14843, 136);
__m512 tmp14854 = _mm512_shuffle_f32x4(tmp14839, tmp14843, 221);
__m512 tmp14855 = _mm512_shuffle_f32x4(tmp14840, tmp14844, 136);
__m512 tmp14856 = _mm512_shuffle_f32x4(tmp14840, tmp14844, 221);
__m512 tmp14857 = _mm512_shuffle_f32x4(tmp14845, tmp14845, 136);
__m512 tmp14858 = _mm512_shuffle_f32x4(tmp14845, tmp14845, 221);
__m512 tmp14859 = _mm512_shuffle_f32x4(tmp14846, tmp14846, 136);
__m512 tmp14860 = _mm512_shuffle_f32x4(tmp14846, tmp14846, 221);
__m512 tmp14861 = _mm512_shuffle_f32x4(tmp14847, tmp14847, 136);
__m512 tmp14862 = _mm512_shuffle_f32x4(tmp14847, tmp14847, 221);
__m512 tmp14863 = _mm512_shuffle_f32x4(tmp14848, tmp14848, 136);
__m512 tmp14864 = _mm512_shuffle_f32x4(tmp14848, tmp14848, 221);
// Stage 4: final 128-bit regroup. The twelve input names are overwritten in place and
// four new vectors (tmp14781..tmp14784) receive input lanes 12..15.
tmp14729 = _mm512_shuffle_f32x4(tmp14849, tmp14857, 136);
tmp14737 = _mm512_shuffle_f32x4(tmp14849, tmp14857, 221);
tmp14730 = _mm512_shuffle_f32x4(tmp14851, tmp14859, 136);
tmp14738 = _mm512_shuffle_f32x4(tmp14851, tmp14859, 221);
tmp14731 = _mm512_shuffle_f32x4(tmp14853, tmp14861, 136);
tmp14739 = _mm512_shuffle_f32x4(tmp14853, tmp14861, 221);
tmp14732 = _mm512_shuffle_f32x4(tmp14855, tmp14863, 136);
tmp14740 = _mm512_shuffle_f32x4(tmp14855, tmp14863, 221);
tmp14733 = _mm512_shuffle_f32x4(tmp14850, tmp14858, 136);
__m512 tmp14781 = _mm512_shuffle_f32x4(tmp14850, tmp14858, 221);
tmp14734 = _mm512_shuffle_f32x4(tmp14852, tmp14860, 136);
__m512 tmp14782 = _mm512_shuffle_f32x4(tmp14852, tmp14860, 221);
tmp14735 = _mm512_shuffle_f32x4(tmp14854, tmp14862, 136);
__m512 tmp14783 = _mm512_shuffle_f32x4(tmp14854, tmp14862, 221);
tmp14736 = _mm512_shuffle_f32x4(tmp14856, tmp14864, 136);
__m512 tmp14784 = _mm512_shuffle_f32x4(tmp14856, tmp14864, 221);
// Second transform pass: the same 8-to-6 reduction as the first pass, now applied to
// the transposed vectors, again as two interleaved streams:
//   s0..s7 = tmp14729..tmp14736                      -> tmp14785, 14791, 14796, 14798, 14800, 14802
//   t0..t7 = tmp14737..tmp14740, tmp14781..tmp14784  -> tmp14805, 14811, 14816, 14818, 14820, 14822
// With r standing for s or t:
//   o0 = r0 + (r1+r2) +    (r3+r4) + 32*(r5+r6)
//   o1 =      (r1-r2) +  2*(r3-r4) + 16*(r5-r6)
//   o2 =      (r1+r2) +  4*(r3+r4) +  8*(r5+r6)
//   o3 =      (r1-r2) +  8*(r3-r4) +  4*(r5-r6)
//   o4 =      (r1+r2) + 16*(r3+r4) +  2*(r5+r6)
//   o5 =      (r1-r2) + 32*(r3-r4) +    (r5-r6) + r7
// Pairwise sums and differences shared by several outputs.
__m512 tmp14789 = _mm512_add_ps(tmp14730, tmp14731);
__m512 tmp14809 = _mm512_add_ps(tmp14738, tmp14739);
__m512 tmp14788 = _mm512_add_ps(tmp14732, tmp14733);
__m512 tmp14808 = _mm512_add_ps(tmp14740, tmp14781);
__m512 tmp14794 = _mm512_sub_ps(tmp14732, tmp14733);
__m512 tmp14814 = _mm512_sub_ps(tmp14740, tmp14781);
__m512 tmp14793 = _mm512_sub_ps(tmp14730, tmp14731);
__m512 tmp14813 = _mm512_sub_ps(tmp14738, tmp14739);
__m512 tmp14790 = _mm512_add_ps(tmp14734, tmp14735);
__m512 tmp14810 = _mm512_add_ps(tmp14782, tmp14783);
__m512 tmp14795 = _mm512_sub_ps(tmp14734, tmp14735);
__m512 tmp14815 = _mm512_sub_ps(tmp14782, tmp14783);
// Weighted accumulation via fused multiply-add.
__m512 tmp14792 = _mm512_fmadd_ps(tmp14794, _mm512_set1_ps(2e+00f), tmp14793);
__m512 tmp14812 = _mm512_fmadd_ps(tmp14814, _mm512_set1_ps(2e+00f), tmp14813);
__m512 tmp14799 = _mm512_fmadd_ps(tmp14794, _mm512_set1_ps(8e+00f), tmp14793);
__m512 tmp14819 = _mm512_fmadd_ps(tmp14814, _mm512_set1_ps(8e+00f), tmp14813);
__m512 tmp14787 = _mm512_add_ps(tmp14788, tmp14789);
__m512 tmp14807 = _mm512_add_ps(tmp14808, tmp14809);
__m512 tmp14791 = _mm512_fmadd_ps(tmp14795, _mm512_set1_ps(1.6e+01f), tmp14792);
__m512 tmp14811 = _mm512_fmadd_ps(tmp14815, _mm512_set1_ps(1.6e+01f), tmp14812);
__m512 tmp14798 = _mm512_fmadd_ps(tmp14795, _mm512_set1_ps(4e+00f), tmp14799);
__m512 tmp14818 = _mm512_fmadd_ps(tmp14815, _mm512_set1_ps(4e+00f), tmp14819);
__m512 tmp14804 = _mm512_add_ps(tmp14795, tmp14793);
__m512 tmp14824 = _mm512_add_ps(tmp14815, tmp14813);
__m512 tmp14797 = _mm512_fmadd_ps(tmp14788, _mm512_set1_ps(4e+00f), tmp14789);
__m512 tmp14817 = _mm512_fmadd_ps(tmp14808, _mm512_set1_ps(4e+00f), tmp14809);
__m512 tmp14801 = _mm512_fmadd_ps(tmp14788, _mm512_set1_ps(1.6e+01f), tmp14789);
__m512 tmp14821 = _mm512_fmadd_ps(tmp14808, _mm512_set1_ps(1.6e+01f), tmp14809);
// Only o0 takes the first input (r0) and only o5 takes the last input (r7).
__m512 tmp14786 = _mm512_add_ps(tmp14787, tmp14729);
__m512 tmp14806 = _mm512_add_ps(tmp14807, tmp14737);
__m512 tmp14803 = _mm512_add_ps(tmp14804, tmp14736);
__m512 tmp14823 = _mm512_add_ps(tmp14824, tmp14784);
__m512 tmp14785 = _mm512_fmadd_ps(tmp14790, _mm512_set1_ps(3.2e+01f), tmp14786);
__m512 tmp14805 = _mm512_fmadd_ps(tmp14810, _mm512_set1_ps(3.2e+01f), tmp14806);
__m512 tmp14796 = _mm512_fmadd_ps(tmp14790, _mm512_set1_ps(8e+00f), tmp14797);
__m512 tmp14816 = _mm512_fmadd_ps(tmp14810, _mm512_set1_ps(8e+00f), tmp14817);
__m512 tmp14802 = _mm512_fmadd_ps(tmp14794, _mm512_set1_ps(3.2e+01f), tmp14803);
__m512 tmp14822 = _mm512_fmadd_ps(tmp14814, _mm512_set1_ps(3.2e+01f), tmp14823);
__m512 tmp14800 = _mm512_fmadd_ps(tmp14790, _mm512_set1_ps(2e+00f), tmp14801);
__m512 tmp14820 = _mm512_fmadd_ps(tmp14810, _mm512_set1_ps(2e+00f), tmp14821);
// Name the twelve second-pass results: out1985..out1990 are the six outputs of the
// first stream (tmp14785...), out1979..out1984 the six outputs of the second (tmp14805...).
__m512 out1985 = tmp14785;
__m512 out1979 = tmp14805;
__m512 out1986 = tmp14791;
__m512 out1980 = tmp14811;
__m512 out1987 = tmp14796;
__m512 out1981 = tmp14816;
__m512 out1988 = tmp14798;
__m512 out1982 = tmp14818;
__m512 out1989 = tmp14800;
__m512 out1983 = tmp14820;
__m512 out1990 = tmp14802;
__m512 out1984 = tmp14822;
// ReLU: clamp every lane to be non-negative (max with zero).
out1985 = _mm512_max_ps(_mm512_setzero_ps(), out1985);
out1979 = _mm512_max_ps(_mm512_setzero_ps(), out1979);
out1986 = _mm512_max_ps(_mm512_setzero_ps(), out1986);
out1980 = _mm512_max_ps(_mm512_setzero_ps(), out1980);
out1987 = _mm512_max_ps(_mm512_setzero_ps(), out1987);
out1981 = _mm512_max_ps(_mm512_setzero_ps(), out1981);
out1988 = _mm512_max_ps(_mm512_setzero_ps(), out1988);
out1982 = _mm512_max_ps(_mm512_setzero_ps(), out1982);
out1989 = _mm512_max_ps(_mm512_setzero_ps(), out1989);
out1983 = _mm512_max_ps(_mm512_setzero_ps(), out1983);
out1990 = _mm512_max_ps(_mm512_setzero_ps(), out1990);
out1984 = _mm512_max_ps(_mm512_setzero_ps(), out1984);
// Masked stores into datPtr24. Successive outputs of a stream advance the constant
// offset by 112.
//   mask 4095 (0xFFF): lanes 0..11 - used for out1985..out1990 at +3136, +3248, ...
//   mask   15 (0x00F): lanes 0..3  - first part of out1979..out1984 at +96, +208, ...
//   mask 4032 (0xFC0): lanes 6..11 - second part of out1979..out1984 at +648, +760, ...
// Lanes 4..5 of out1979..out1984 (and lanes 12..15 of every vector) are never written.
// NOTE(review): if datPtr24 is byte-addressed, lane 6 of the "+648" store lands at
// +672 = 6*112, i.e. one 112-stride past the six offsets +96..+656 - presumably the
// next group of rows; confirm against the output layout.
_mm512_mask_storeu_ps(datPtr24+3136+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1985);
_mm512_mask_storeu_ps(datPtr24+96+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 15, out1979);
_mm512_mask_storeu_ps(datPtr24+648+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4032, out1979);
_mm512_mask_storeu_ps(datPtr24+3248+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1986);
_mm512_mask_storeu_ps(datPtr24+208+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 15, out1980);
_mm512_mask_storeu_ps(datPtr24+760+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4032, out1980);
_mm512_mask_storeu_ps(datPtr24+3360+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1987);
_mm512_mask_storeu_ps(datPtr24+320+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 15, out1981);
_mm512_mask_storeu_ps(datPtr24+872+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4032, out1981);
_mm512_mask_storeu_ps(datPtr24+3472+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1988);
_mm512_mask_storeu_ps(datPtr24+432+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 15, out1982);
_mm512_mask_storeu_ps(datPtr24+984+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4032, out1982);
_mm512_mask_storeu_ps(datPtr24+3584+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1989);
_mm512_mask_storeu_ps(datPtr24+544+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 15, out1983);
_mm512_mask_storeu_ps(datPtr24+1096+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4032, out1983);
_mm512_mask_storeu_ps(datPtr24+3696+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1990);
_mm512_mask_storeu_ps(datPtr24+656+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 15, out1984);
_mm512_mask_storeu_ps(datPtr24+1208+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4032, out1984);
// Gather the inputs for the next sub-tile's output transform.
// Two streams are read: one from constant offsets +512/+640 (-> in2135..in2142) and one
// from +576/+704 (-> in2143..in2150). Each stream takes four load pairs whose constant
// offsets are 204800 apart.
// NOTE(review): the offsets look like byte offsets (64 = one __m512) - confirm the
// pointee type of sfPtr11.
// _mm512_shuffle_f32x4(a, b, 68) yields {a[255:0], b[255:0]}; immediate 238 yields
// {a[511:256], b[511:256]}. Here the operands are (lower-address load, higher-address
// load), so the lower-address load supplies the low 256 bits of every in21xx value.
__m512 sf1041 = _mm512_loadu_ps(sfPtr11+512+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1042 = _mm512_loadu_ps(sfPtr11+640+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2135 = _mm512_shuffle_f32x4(sf1041, sf1042, 68);
__m512 in2136 = _mm512_shuffle_f32x4(sf1041, sf1042, 238);
__m512 sf1043 = _mm512_loadu_ps(sfPtr11+576+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1044 = _mm512_loadu_ps(sfPtr11+704+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2143 = _mm512_shuffle_f32x4(sf1043, sf1044, 68);
__m512 in2144 = _mm512_shuffle_f32x4(sf1043, sf1044, 238);
// Second load pair of each stream (constant offset +204800).
__m512 sf1045 = _mm512_loadu_ps(sfPtr11+205312+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1046 = _mm512_loadu_ps(sfPtr11+205440+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2137 = _mm512_shuffle_f32x4(sf1045, sf1046, 68);
__m512 in2138 = _mm512_shuffle_f32x4(sf1045, sf1046, 238);
__m512 sf1047 = _mm512_loadu_ps(sfPtr11+205376+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1048 = _mm512_loadu_ps(sfPtr11+205504+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2145 = _mm512_shuffle_f32x4(sf1047, sf1048, 68);
__m512 in2146 = _mm512_shuffle_f32x4(sf1047, sf1048, 238);
// Third load pair of each stream (constant offset +409600).
__m512 sf1049 = _mm512_loadu_ps(sfPtr11+410112+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1050 = _mm512_loadu_ps(sfPtr11+410240+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2139 = _mm512_shuffle_f32x4(sf1049, sf1050, 68);
__m512 in2140 = _mm512_shuffle_f32x4(sf1049, sf1050, 238);
__m512 sf1051 = _mm512_loadu_ps(sfPtr11+410176+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1052 = _mm512_loadu_ps(sfPtr11+410304+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2147 = _mm512_shuffle_f32x4(sf1051, sf1052, 68);
__m512 in2148 = _mm512_shuffle_f32x4(sf1051, sf1052, 238);
// Fourth load pair of each stream (constant offset +614400).
__m512 sf1053 = _mm512_loadu_ps(sfPtr11+614912+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1054 = _mm512_loadu_ps(sfPtr11+615040+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2141 = _mm512_shuffle_f32x4(sf1053, sf1054, 68);
__m512 in2142 = _mm512_shuffle_f32x4(sf1053, sf1054, 238);
__m512 sf1055 = _mm512_loadu_ps(sfPtr11+614976+819200*i47+49152*j40+1536*k135+768*l54);
__m512 sf1056 = _mm512_loadu_ps(sfPtr11+615104+819200*i47+49152*j40+1536*k135+768*l54);
__m512 in2149 = _mm512_shuffle_f32x4(sf1055, sf1056, 68);
__m512 in2150 = _mm512_shuffle_f32x4(sf1055, sf1056, 238);
// First transform pass, applied lane-wise to two independent, interleaved streams:
//   stream A: r0..r7 = in2135..in2142  -> tmp14865..tmp14870
//   stream B: r0..r7 = in2143..in2150  -> tmp14871..tmp14876
// Each stream reduces eight inputs to six outputs (each step is a fused multiply-add):
//   o0 = r0 + (r1+r2) +    (r3+r4) + 32*(r5+r6)
//   o1 =      (r1-r2) +  2*(r3-r4) + 16*(r5-r6)
//   o2 =      (r1+r2) +  4*(r3+r4) +  8*(r5+r6)
//   o3 =      (r1-r2) +  8*(r3-r4) +  4*(r5-r6)
//   o4 =      (r1+r2) + 16*(r3+r4) +  2*(r5+r6)
//   o5 =      (r1-r2) + 32*(r3-r4) +    (r5-r6) + r7
// NOTE(review): the 1/2/4/8/16/32 coefficient pattern looks like a Winograd-style
// 8-to-6 output transform for the 3x3 convolutions - confirm against the generator.
// Pairwise sums and differences shared by several outputs.
__m512 tmp14881 = _mm512_add_ps(in2136, in2137);
__m512 tmp14901 = _mm512_add_ps(in2144, in2145);
__m512 tmp14880 = _mm512_add_ps(in2138, in2139);
__m512 tmp14900 = _mm512_add_ps(in2146, in2147);
__m512 tmp14886 = _mm512_sub_ps(in2138, in2139);
__m512 tmp14906 = _mm512_sub_ps(in2146, in2147);
__m512 tmp14885 = _mm512_sub_ps(in2136, in2137);
__m512 tmp14905 = _mm512_sub_ps(in2144, in2145);
__m512 tmp14882 = _mm512_add_ps(in2140, in2141);
__m512 tmp14902 = _mm512_add_ps(in2148, in2149);
__m512 tmp14887 = _mm512_sub_ps(in2140, in2141);
__m512 tmp14907 = _mm512_sub_ps(in2148, in2149);
// Accumulate the weighted combinations; the odd outputs (o1, o3, o5) use the
// differences and the even outputs (o0, o2, o4) use the sums.
__m512 tmp14884 = _mm512_fmadd_ps(tmp14886, _mm512_set1_ps(2e+00f), tmp14885);
__m512 tmp14904 = _mm512_fmadd_ps(tmp14906, _mm512_set1_ps(2e+00f), tmp14905);
__m512 tmp14891 = _mm512_fmadd_ps(tmp14886, _mm512_set1_ps(8e+00f), tmp14885);
__m512 tmp14911 = _mm512_fmadd_ps(tmp14906, _mm512_set1_ps(8e+00f), tmp14905);
__m512 tmp14879 = _mm512_add_ps(tmp14880, tmp14881);
__m512 tmp14899 = _mm512_add_ps(tmp14900, tmp14901);
__m512 tmp14883 = _mm512_fmadd_ps(tmp14887, _mm512_set1_ps(1.6e+01f), tmp14884);
__m512 tmp14903 = _mm512_fmadd_ps(tmp14907, _mm512_set1_ps(1.6e+01f), tmp14904);
__m512 tmp14890 = _mm512_fmadd_ps(tmp14887, _mm512_set1_ps(4e+00f), tmp14891);
__m512 tmp14910 = _mm512_fmadd_ps(tmp14907, _mm512_set1_ps(4e+00f), tmp14911);
__m512 tmp14896 = _mm512_add_ps(tmp14887, tmp14885);
__m512 tmp14916 = _mm512_add_ps(tmp14907, tmp14905);
__m512 tmp14889 = _mm512_fmadd_ps(tmp14880, _mm512_set1_ps(4e+00f), tmp14881);
__m512 tmp14909 = _mm512_fmadd_ps(tmp14900, _mm512_set1_ps(4e+00f), tmp14901);
__m512 tmp14893 = _mm512_fmadd_ps(tmp14880, _mm512_set1_ps(1.6e+01f), tmp14881);
__m512 tmp14913 = _mm512_fmadd_ps(tmp14900, _mm512_set1_ps(1.6e+01f), tmp14901);
// Only o0 takes the first input (r0) and only o5 takes the last input (r7).
__m512 tmp14878 = _mm512_add_ps(tmp14879, in2135);
__m512 tmp14898 = _mm512_add_ps(tmp14899, in2143);
__m512 tmp14895 = _mm512_add_ps(tmp14896, in2142);
__m512 tmp14915 = _mm512_add_ps(tmp14916, in2150);
__m512 tmp14877 = _mm512_fmadd_ps(tmp14882, _mm512_set1_ps(3.2e+01f), tmp14878);
__m512 tmp14897 = _mm512_fmadd_ps(tmp14902, _mm512_set1_ps(3.2e+01f), tmp14898);
__m512 tmp14888 = _mm512_fmadd_ps(tmp14882, _mm512_set1_ps(8e+00f), tmp14889);
__m512 tmp14908 = _mm512_fmadd_ps(tmp14902, _mm512_set1_ps(8e+00f), tmp14909);
__m512 tmp14894 = _mm512_fmadd_ps(tmp14886, _mm512_set1_ps(3.2e+01f), tmp14895);
__m512 tmp14914 = _mm512_fmadd_ps(tmp14906, _mm512_set1_ps(3.2e+01f), tmp14915);
__m512 tmp14892 = _mm512_fmadd_ps(tmp14882, _mm512_set1_ps(2e+00f), tmp14893);
__m512 tmp14912 = _mm512_fmadd_ps(tmp14902, _mm512_set1_ps(2e+00f), tmp14913);
// Collect the results in output order: tmp14865..tmp14870 = stream A o0..o5,
// tmp14871..tmp14876 = stream B o0..o5. These names are reassigned later.
__m512 tmp14865 = tmp14877;
__m512 tmp14871 = tmp14897;
__m512 tmp14866 = tmp14883;
__m512 tmp14872 = tmp14903;
__m512 tmp14867 = tmp14888;
__m512 tmp14873 = tmp14908;
__m512 tmp14868 = tmp14890;
__m512 tmp14874 = tmp14910;
__m512 tmp14869 = tmp14892;
__m512 tmp14875 = tmp14912;
__m512 tmp14870 = tmp14894;
__m512 tmp14876 = tmp14914;
// Transpose the twelve 16-lane vectors tmp14865..tmp14876 into sixteen vectors.
// Afterwards lane j (j = 0..11) of every result holds a value taken from the j-th input
// vector (in the order tmp14865..tmp14876), and lanes 12..15 duplicate lanes 8..11.
// The results are:
//   tmp14865..tmp14872                    = input lanes 0..7
//   tmp14873..tmp14876, tmp14917..tmp14920 = input lanes 8..15
// Stage 1: interleave adjacent vector pairs inside each 128-bit lane.
__m512 tmp14961 = _mm512_unpacklo_ps(tmp14865, tmp14866);
__m512 tmp14962 = _mm512_unpackhi_ps(tmp14865, tmp14866);
__m512 tmp14963 = _mm512_unpacklo_ps(tmp14867, tmp14868);
__m512 tmp14964 = _mm512_unpackhi_ps(tmp14867, tmp14868);
__m512 tmp14965 = _mm512_unpacklo_ps(tmp14869, tmp14870);
__m512 tmp14966 = _mm512_unpackhi_ps(tmp14869, tmp14870);
__m512 tmp14967 = _mm512_unpacklo_ps(tmp14871, tmp14872);
__m512 tmp14968 = _mm512_unpackhi_ps(tmp14871, tmp14872);
__m512 tmp14969 = _mm512_unpacklo_ps(tmp14873, tmp14874);
__m512 tmp14970 = _mm512_unpackhi_ps(tmp14873, tmp14874);
__m512 tmp14971 = _mm512_unpacklo_ps(tmp14875, tmp14876);
__m512 tmp14972 = _mm512_unpackhi_ps(tmp14875, tmp14876);
// Stage 2: combine pairs of pairs so each 128-bit lane holds one element index from
// four consecutive input vectors (68 = low 64-bit halves, 238 = high 64-bit halves).
__m512 tmp14973 = _mm512_shuffle_ps(tmp14961, tmp14963, 68);
__m512 tmp14974 = _mm512_shuffle_ps(tmp14961, tmp14963, 238);
__m512 tmp14975 = _mm512_shuffle_ps(tmp14962, tmp14964, 68);
__m512 tmp14976 = _mm512_shuffle_ps(tmp14962, tmp14964, 238);
__m512 tmp14977 = _mm512_shuffle_ps(tmp14965, tmp14967, 68);
__m512 tmp14978 = _mm512_shuffle_ps(tmp14965, tmp14967, 238);
__m512 tmp14979 = _mm512_shuffle_ps(tmp14966, tmp14968, 68);
__m512 tmp14980 = _mm512_shuffle_ps(tmp14966, tmp14968, 238);
__m512 tmp14981 = _mm512_shuffle_ps(tmp14969, tmp14971, 68);
__m512 tmp14982 = _mm512_shuffle_ps(tmp14969, tmp14971, 238);
__m512 tmp14983 = _mm512_shuffle_ps(tmp14970, tmp14972, 68);
__m512 tmp14984 = _mm512_shuffle_ps(tmp14970, tmp14972, 238);
// Stage 3: regroup 128-bit lanes (136 picks lanes 0 and 2 of each operand, 221 picks
// lanes 1 and 3). The last four-vector group has no partner, so it is shuffled with
// itself, which is what produces the duplicated upper lanes.
__m512 tmp14985 = _mm512_shuffle_f32x4(tmp14973, tmp14977, 136);
__m512 tmp14986 = _mm512_shuffle_f32x4(tmp14973, tmp14977, 221);
__m512 tmp14987 = _mm512_shuffle_f32x4(tmp14974, tmp14978, 136);
__m512 tmp14988 = _mm512_shuffle_f32x4(tmp14974, tmp14978, 221);
__m512 tmp14989 = _mm512_shuffle_f32x4(tmp14975, tmp14979, 136);
__m512 tmp14990 = _mm512_shuffle_f32x4(tmp14975, tmp14979, 221);
__m512 tmp14991 = _mm512_shuffle_f32x4(tmp14976, tmp14980, 136);
__m512 tmp14992 = _mm512_shuffle_f32x4(tmp14976, tmp14980, 221);
__m512 tmp14993 = _mm512_shuffle_f32x4(tmp14981, tmp14981, 136);
__m512 tmp14994 = _mm512_shuffle_f32x4(tmp14981, tmp14981, 221);
__m512 tmp14995 = _mm512_shuffle_f32x4(tmp14982, tmp14982, 136);
__m512 tmp14996 = _mm512_shuffle_f32x4(tmp14982, tmp14982, 221);
__m512 tmp14997 = _mm512_shuffle_f32x4(tmp14983, tmp14983, 136);
__m512 tmp14998 = _mm512_shuffle_f32x4(tmp14983, tmp14983, 221);
__m512 tmp14999 = _mm512_shuffle_f32x4(tmp14984, tmp14984, 136);
__m512 tmp15000 = _mm512_shuffle_f32x4(tmp14984, tmp14984, 221);
// Stage 4: final 128-bit regroup. The twelve input names are overwritten in place and
// four new vectors (tmp14917..tmp14920) receive input lanes 12..15.
tmp14865 = _mm512_shuffle_f32x4(tmp14985, tmp14993, 136);
tmp14873 = _mm512_shuffle_f32x4(tmp14985, tmp14993, 221);
tmp14866 = _mm512_shuffle_f32x4(tmp14987, tmp14995, 136);
tmp14874 = _mm512_shuffle_f32x4(tmp14987, tmp14995, 221);
tmp14867 = _mm512_shuffle_f32x4(tmp14989, tmp14997, 136);
tmp14875 = _mm512_shuffle_f32x4(tmp14989, tmp14997, 221);
tmp14868 = _mm512_shuffle_f32x4(tmp14991, tmp14999, 136);
tmp14876 = _mm512_shuffle_f32x4(tmp14991, tmp14999, 221);
tmp14869 = _mm512_shuffle_f32x4(tmp14986, tmp14994, 136);
__m512 tmp14917 = _mm512_shuffle_f32x4(tmp14986, tmp14994, 221);
tmp14870 = _mm512_shuffle_f32x4(tmp14988, tmp14996, 136);
__m512 tmp14918 = _mm512_shuffle_f32x4(tmp14988, tmp14996, 221);
tmp14871 = _mm512_shuffle_f32x4(tmp14990, tmp14998, 136);
__m512 tmp14919 = _mm512_shuffle_f32x4(tmp14990, tmp14998, 221);
tmp14872 = _mm512_shuffle_f32x4(tmp14992, tmp15000, 136);
__m512 tmp14920 = _mm512_shuffle_f32x4(tmp14992, tmp15000, 221);
// Second transform pass: the same 8-to-6 reduction as the first pass, now applied to
// the transposed vectors, again as two interleaved streams:
//   s0..s7 = tmp14865..tmp14872                      -> tmp14921, 14927, 14932, 14934, 14936, 14938
//   t0..t7 = tmp14873..tmp14876, tmp14917..tmp14920  -> tmp14941, 14947, 14952, 14954, 14956, 14958
// With r standing for s or t:
//   o0 = r0 + (r1+r2) +    (r3+r4) + 32*(r5+r6)
//   o1 =      (r1-r2) +  2*(r3-r4) + 16*(r5-r6)
//   o2 =      (r1+r2) +  4*(r3+r4) +  8*(r5+r6)
//   o3 =      (r1-r2) +  8*(r3-r4) +  4*(r5-r6)
//   o4 =      (r1+r2) + 16*(r3+r4) +  2*(r5+r6)
//   o5 =      (r1-r2) + 32*(r3-r4) +    (r5-r6) + r7
// Pairwise sums and differences shared by several outputs.
__m512 tmp14925 = _mm512_add_ps(tmp14866, tmp14867);
__m512 tmp14945 = _mm512_add_ps(tmp14874, tmp14875);
__m512 tmp14924 = _mm512_add_ps(tmp14868, tmp14869);
__m512 tmp14944 = _mm512_add_ps(tmp14876, tmp14917);
__m512 tmp14930 = _mm512_sub_ps(tmp14868, tmp14869);
__m512 tmp14950 = _mm512_sub_ps(tmp14876, tmp14917);
__m512 tmp14929 = _mm512_sub_ps(tmp14866, tmp14867);
__m512 tmp14949 = _mm512_sub_ps(tmp14874, tmp14875);
__m512 tmp14926 = _mm512_add_ps(tmp14870, tmp14871);
__m512 tmp14946 = _mm512_add_ps(tmp14918, tmp14919);
__m512 tmp14931 = _mm512_sub_ps(tmp14870, tmp14871);
__m512 tmp14951 = _mm512_sub_ps(tmp14918, tmp14919);
// Weighted accumulation via fused multiply-add.
__m512 tmp14928 = _mm512_fmadd_ps(tmp14930, _mm512_set1_ps(2e+00f), tmp14929);
__m512 tmp14948 = _mm512_fmadd_ps(tmp14950, _mm512_set1_ps(2e+00f), tmp14949);
__m512 tmp14935 = _mm512_fmadd_ps(tmp14930, _mm512_set1_ps(8e+00f), tmp14929);
__m512 tmp14955 = _mm512_fmadd_ps(tmp14950, _mm512_set1_ps(8e+00f), tmp14949);
__m512 tmp14923 = _mm512_add_ps(tmp14924, tmp14925);
__m512 tmp14943 = _mm512_add_ps(tmp14944, tmp14945);
__m512 tmp14927 = _mm512_fmadd_ps(tmp14931, _mm512_set1_ps(1.6e+01f), tmp14928);
__m512 tmp14947 = _mm512_fmadd_ps(tmp14951, _mm512_set1_ps(1.6e+01f), tmp14948);
__m512 tmp14934 = _mm512_fmadd_ps(tmp14931, _mm512_set1_ps(4e+00f), tmp14935);
__m512 tmp14954 = _mm512_fmadd_ps(tmp14951, _mm512_set1_ps(4e+00f), tmp14955);
__m512 tmp14940 = _mm512_add_ps(tmp14931, tmp14929);
__m512 tmp14960 = _mm512_add_ps(tmp14951, tmp14949);
__m512 tmp14933 = _mm512_fmadd_ps(tmp14924, _mm512_set1_ps(4e+00f), tmp14925);
__m512 tmp14953 = _mm512_fmadd_ps(tmp14944, _mm512_set1_ps(4e+00f), tmp14945);
__m512 tmp14937 = _mm512_fmadd_ps(tmp14924, _mm512_set1_ps(1.6e+01f), tmp14925);
__m512 tmp14957 = _mm512_fmadd_ps(tmp14944, _mm512_set1_ps(1.6e+01f), tmp14945);
// Only o0 takes the first input (r0) and only o5 takes the last input (r7).
__m512 tmp14922 = _mm512_add_ps(tmp14923, tmp14865);
__m512 tmp14942 = _mm512_add_ps(tmp14943, tmp14873);
__m512 tmp14939 = _mm512_add_ps(tmp14940, tmp14872);
__m512 tmp14959 = _mm512_add_ps(tmp14960, tmp14920);
__m512 tmp14921 = _mm512_fmadd_ps(tmp14926, _mm512_set1_ps(3.2e+01f), tmp14922);
__m512 tmp14941 = _mm512_fmadd_ps(tmp14946, _mm512_set1_ps(3.2e+01f), tmp14942);
__m512 tmp14932 = _mm512_fmadd_ps(tmp14926, _mm512_set1_ps(8e+00f), tmp14933);
__m512 tmp14952 = _mm512_fmadd_ps(tmp14946, _mm512_set1_ps(8e+00f), tmp14953);
__m512 tmp14938 = _mm512_fmadd_ps(tmp14930, _mm512_set1_ps(3.2e+01f), tmp14939);
__m512 tmp14958 = _mm512_fmadd_ps(tmp14950, _mm512_set1_ps(3.2e+01f), tmp14959);
__m512 tmp14936 = _mm512_fmadd_ps(tmp14926, _mm512_set1_ps(2e+00f), tmp14937);
__m512 tmp14956 = _mm512_fmadd_ps(tmp14946, _mm512_set1_ps(2e+00f), tmp14957);
// Name the twelve second-pass results: out1991..out1996 are the six outputs of the
// first stream (tmp14921...), out1997..out2002 the six outputs of the second (tmp14941...).
__m512 out1991 = tmp14921;
__m512 out1997 = tmp14941;
__m512 out1992 = tmp14927;
__m512 out1998 = tmp14947;
__m512 out1993 = tmp14932;
__m512 out1999 = tmp14952;
__m512 out1994 = tmp14934;
__m512 out2000 = tmp14954;
__m512 out1995 = tmp14936;
__m512 out2001 = tmp14956;
__m512 out1996 = tmp14938;
__m512 out2002 = tmp14958;
// ReLU: clamp every lane to be non-negative (max with zero).
out1991 = _mm512_max_ps(_mm512_setzero_ps(), out1991);
out1997 = _mm512_max_ps(_mm512_setzero_ps(), out1997);
out1992 = _mm512_max_ps(_mm512_setzero_ps(), out1992);
out1998 = _mm512_max_ps(_mm512_setzero_ps(), out1998);
out1993 = _mm512_max_ps(_mm512_setzero_ps(), out1993);
out1999 = _mm512_max_ps(_mm512_setzero_ps(), out1999);
out1994 = _mm512_max_ps(_mm512_setzero_ps(), out1994);
out2000 = _mm512_max_ps(_mm512_setzero_ps(), out2000);
out1995 = _mm512_max_ps(_mm512_setzero_ps(), out1995);
out2001 = _mm512_max_ps(_mm512_setzero_ps(), out2001);
out1996 = _mm512_max_ps(_mm512_setzero_ps(), out1996);
out2002 = _mm512_max_ps(_mm512_setzero_ps(), out2002);
// Masked stores into datPtr24. Successive outputs of a stream advance the constant
// offset by 112.
//   mask 4095 (0xFFF): lanes 0..11 - used for out1991..out1996 at +3184, +3296, ...
//   mask   15 (0x00F): lanes 0..3  - first part of out1997..out2002 at +3232, +3344, ...
//   mask 4032 (0xFC0): lanes 6..11 - second part of out1997..out2002 at +3784, +3896, ...
// Lanes 4..5 of out1997..out2002 (and lanes 12..15 of every vector) are never written.
// NOTE(review): if datPtr24 is byte-addressed, lane 6 of the "+3784" store lands at
// +3808 = 3136 + 6*112, i.e. one 112-stride past the six offsets +3232..+3792 -
// presumably the next group of rows; confirm against the output layout.
_mm512_mask_storeu_ps(datPtr24+3184+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1991);
_mm512_mask_storeu_ps(datPtr24+3232+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 15, out1997);
_mm512_mask_storeu_ps(datPtr24+3784+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4032, out1997);
_mm512_mask_storeu_ps(datPtr24+3296+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1992);
_mm512_mask_storeu_ps(datPtr24+3344+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 15, out1998);
_mm512_mask_storeu_ps(datPtr24+3896+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4032, out1998);
_mm512_mask_storeu_ps(datPtr24+3408+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1993);
_mm512_mask_storeu_ps(datPtr24+3456+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 15, out1999);
_mm512_mask_storeu_ps(datPtr24+4008+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4032, out1999);
_mm512_mask_storeu_ps(datPtr24+3520+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1994);
_mm512_mask_storeu_ps(datPtr24+3568+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 15, out2000);
_mm512_mask_storeu_ps(datPtr24+4120+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4032, out2000);
_mm512_mask_storeu_ps(datPtr24+3632+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1995);
_mm512_mask_storeu_ps(datPtr24+3680+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 15, out2001);
_mm512_mask_storeu_ps(datPtr24+4232+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4032, out2001);
_mm512_mask_storeu_ps(datPtr24+3744+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4095, out1996);
_mm512_mask_storeu_ps(datPtr24+3792+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 15, out2002);
_mm512_mask_storeu_ps(datPtr24+4344+401408*i47+112*toH43+4*toW43+12544*k135+6272*l54, 4032, out2002);
}
}
if (j40 >= last10) return;
++j40;
rel22 = 1;
}
ptrdiff_t toH44 = base22+6;
ptrdiff_t toW44 = 6;
ptrdiff_t k136 = 32*w63;
for (; k136 != 32; ++k136) {
ptrdiff_t l55 = 0;
for (; l55 != 2; ++l55) {
// First load/transform/store group of this (k136, l55) iteration:
//   1. load 16 vectors from sfPtr11 and regroup their 128-bit lanes,
//   2. apply an 8-input -> 6-output linear combination to two sets of vectors,
//   3. rearrange lanes across the 12 results,
//   4. apply the same 8 -> 6 combination again,
//   5. clamp at zero and write 12 masked vectors to datPtr24.
//
// Load stage. Loads come in four groups whose base offsets are 204800 apart
// (0, 204800, 409600, 614400); each group reads at +0, +64, +128 and +192.
// NOTE(review): the 64-unit spacing matches one 64-byte __m512 if sfPtr11 is a
// byte pointer -- presumably so; confirm at its declaration.
// _mm512_shuffle_f32x4(a, b, 68)  yields 128-bit lanes [a0, a1, b0, b1];
// _mm512_shuffle_f32x4(a, b, 238) yields 128-bit lanes [a2, a3, b2, b3].
// So each load pair (+0/+128 or +64/+192) is split into a "low halves" vector
// and a "high halves" vector.
__m512 sf1057 = _mm512_loadu_ps(sfPtr11+0+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1058 = _mm512_loadu_ps(sfPtr11+128+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2151 = _mm512_shuffle_f32x4(sf1057, sf1058, 68);
__m512 in2152 = _mm512_shuffle_f32x4(sf1057, sf1058, 238);
__m512 sf1059 = _mm512_loadu_ps(sfPtr11+64+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1060 = _mm512_loadu_ps(sfPtr11+192+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2159 = _mm512_shuffle_f32x4(sf1059, sf1060, 68);
__m512 in2160 = _mm512_shuffle_f32x4(sf1059, sf1060, 238);
__m512 sf1061 = _mm512_loadu_ps(sfPtr11+204800+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1062 = _mm512_loadu_ps(sfPtr11+204928+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2153 = _mm512_shuffle_f32x4(sf1061, sf1062, 68);
__m512 in2154 = _mm512_shuffle_f32x4(sf1061, sf1062, 238);
__m512 sf1063 = _mm512_loadu_ps(sfPtr11+204864+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1064 = _mm512_loadu_ps(sfPtr11+204992+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2161 = _mm512_shuffle_f32x4(sf1063, sf1064, 68);
__m512 in2162 = _mm512_shuffle_f32x4(sf1063, sf1064, 238);
__m512 sf1065 = _mm512_loadu_ps(sfPtr11+409600+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1066 = _mm512_loadu_ps(sfPtr11+409728+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2155 = _mm512_shuffle_f32x4(sf1065, sf1066, 68);
__m512 in2156 = _mm512_shuffle_f32x4(sf1065, sf1066, 238);
__m512 sf1067 = _mm512_loadu_ps(sfPtr11+409664+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1068 = _mm512_loadu_ps(sfPtr11+409792+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2163 = _mm512_shuffle_f32x4(sf1067, sf1068, 68);
__m512 in2164 = _mm512_shuffle_f32x4(sf1067, sf1068, 238);
__m512 sf1069 = _mm512_loadu_ps(sfPtr11+614400+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1070 = _mm512_loadu_ps(sfPtr11+614528+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2157 = _mm512_shuffle_f32x4(sf1069, sf1070, 68);
__m512 in2158 = _mm512_shuffle_f32x4(sf1069, sf1070, 238);
__m512 sf1071 = _mm512_loadu_ps(sfPtr11+614464+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1072 = _mm512_loadu_ps(sfPtr11+614592+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2165 = _mm512_shuffle_f32x4(sf1071, sf1072, 68);
__m512 in2166 = _mm512_shuffle_f32x4(sf1071, sf1072, 238);
// First 8 -> 6 combination, computed for two independent input sets whose
// statements are interleaved: x0..x7 = in2151..in2158 (odd-numbered lines of
// each pair) and x0..x7 = in2159..in2166 (the line that follows each).
// With s1=x1+x2, s2=x3+x4, s3=x5+x6 and d1=x1-x2, d2=x3-x4, d3=x5-x6:
//   tmp15013 / tmp15033 = x0 + s1 +    s2 + 32*s3
//   tmp15019 / tmp15039 =      d1 +  2*d2 + 16*d3
//   tmp15024 / tmp15044 =      s1 +  4*s2 +  8*s3
//   tmp15026 / tmp15046 =      d1 +  8*d2 +  4*d3
//   tmp15028 / tmp15048 =      s1 + 16*s2 +  2*s3
//   tmp15030 / tmp15050 = x7 + d1 + 32*d2 +    d3
// NOTE(review): these coefficients match the 6x8 output-transform matrix of
// Winograd F(6,3) -- presumably that is what this is; confirm against the
// generator.
__m512 tmp15017 = _mm512_add_ps(in2152, in2153);
__m512 tmp15037 = _mm512_add_ps(in2160, in2161);
__m512 tmp15016 = _mm512_add_ps(in2154, in2155);
__m512 tmp15036 = _mm512_add_ps(in2162, in2163);
__m512 tmp15022 = _mm512_sub_ps(in2154, in2155);
__m512 tmp15042 = _mm512_sub_ps(in2162, in2163);
__m512 tmp15021 = _mm512_sub_ps(in2152, in2153);
__m512 tmp15041 = _mm512_sub_ps(in2160, in2161);
__m512 tmp15018 = _mm512_add_ps(in2156, in2157);
__m512 tmp15038 = _mm512_add_ps(in2164, in2165);
__m512 tmp15023 = _mm512_sub_ps(in2156, in2157);
__m512 tmp15043 = _mm512_sub_ps(in2164, in2165);
__m512 tmp15020 = _mm512_fmadd_ps(tmp15022, _mm512_set1_ps(2e+00f), tmp15021);
__m512 tmp15040 = _mm512_fmadd_ps(tmp15042, _mm512_set1_ps(2e+00f), tmp15041);
__m512 tmp15027 = _mm512_fmadd_ps(tmp15022, _mm512_set1_ps(8e+00f), tmp15021);
__m512 tmp15047 = _mm512_fmadd_ps(tmp15042, _mm512_set1_ps(8e+00f), tmp15041);
__m512 tmp15015 = _mm512_add_ps(tmp15016, tmp15017);
__m512 tmp15035 = _mm512_add_ps(tmp15036, tmp15037);
__m512 tmp15019 = _mm512_fmadd_ps(tmp15023, _mm512_set1_ps(1.6e+01f), tmp15020);
__m512 tmp15039 = _mm512_fmadd_ps(tmp15043, _mm512_set1_ps(1.6e+01f), tmp15040);
__m512 tmp15026 = _mm512_fmadd_ps(tmp15023, _mm512_set1_ps(4e+00f), tmp15027);
__m512 tmp15046 = _mm512_fmadd_ps(tmp15043, _mm512_set1_ps(4e+00f), tmp15047);
__m512 tmp15032 = _mm512_add_ps(tmp15023, tmp15021);
__m512 tmp15052 = _mm512_add_ps(tmp15043, tmp15041);
__m512 tmp15025 = _mm512_fmadd_ps(tmp15016, _mm512_set1_ps(4e+00f), tmp15017);
__m512 tmp15045 = _mm512_fmadd_ps(tmp15036, _mm512_set1_ps(4e+00f), tmp15037);
__m512 tmp15029 = _mm512_fmadd_ps(tmp15016, _mm512_set1_ps(1.6e+01f), tmp15017);
__m512 tmp15049 = _mm512_fmadd_ps(tmp15036, _mm512_set1_ps(1.6e+01f), tmp15037);
__m512 tmp15014 = _mm512_add_ps(tmp15015, in2151);
__m512 tmp15034 = _mm512_add_ps(tmp15035, in2159);
__m512 tmp15031 = _mm512_add_ps(tmp15032, in2158);
__m512 tmp15051 = _mm512_add_ps(tmp15052, in2166);
__m512 tmp15013 = _mm512_fmadd_ps(tmp15018, _mm512_set1_ps(3.2e+01f), tmp15014);
__m512 tmp15033 = _mm512_fmadd_ps(tmp15038, _mm512_set1_ps(3.2e+01f), tmp15034);
__m512 tmp15024 = _mm512_fmadd_ps(tmp15018, _mm512_set1_ps(8e+00f), tmp15025);
__m512 tmp15044 = _mm512_fmadd_ps(tmp15038, _mm512_set1_ps(8e+00f), tmp15045);
__m512 tmp15030 = _mm512_fmadd_ps(tmp15022, _mm512_set1_ps(3.2e+01f), tmp15031);
__m512 tmp15050 = _mm512_fmadd_ps(tmp15042, _mm512_set1_ps(3.2e+01f), tmp15051);
__m512 tmp15028 = _mm512_fmadd_ps(tmp15018, _mm512_set1_ps(2e+00f), tmp15029);
__m512 tmp15048 = _mm512_fmadd_ps(tmp15038, _mm512_set1_ps(2e+00f), tmp15049);
// Collect the 12 results under consecutive names: tmp15001..tmp15006 hold the
// six outputs of the first set, tmp15007..tmp15012 those of the second set.
__m512 tmp15001 = tmp15013;
__m512 tmp15007 = tmp15033;
__m512 tmp15002 = tmp15019;
__m512 tmp15008 = tmp15039;
__m512 tmp15003 = tmp15024;
__m512 tmp15009 = tmp15044;
__m512 tmp15004 = tmp15026;
__m512 tmp15010 = tmp15046;
__m512 tmp15005 = tmp15028;
__m512 tmp15011 = tmp15048;
__m512 tmp15006 = tmp15030;
__m512 tmp15012 = tmp15050;
// Lane rearrangement: 32-bit interleave (unpacklo/unpackhi), then 64-bit
// pair selection (shuffle_ps 68/238), then two rounds of 128-bit lane
// selection (shuffle_f32x4 136 picks lanes 0,2 of each operand; 221 picks
// lanes 1,3). The 12 input vectors become 16 output vectors.
// NOTE(review): this network looks like a transpose of the 12 vectors -- confirm.
__m512 tmp15097 = _mm512_unpacklo_ps(tmp15001, tmp15002);
__m512 tmp15098 = _mm512_unpackhi_ps(tmp15001, tmp15002);
__m512 tmp15099 = _mm512_unpacklo_ps(tmp15003, tmp15004);
__m512 tmp15100 = _mm512_unpackhi_ps(tmp15003, tmp15004);
__m512 tmp15101 = _mm512_unpacklo_ps(tmp15005, tmp15006);
__m512 tmp15102 = _mm512_unpackhi_ps(tmp15005, tmp15006);
__m512 tmp15103 = _mm512_unpacklo_ps(tmp15007, tmp15008);
__m512 tmp15104 = _mm512_unpackhi_ps(tmp15007, tmp15008);
__m512 tmp15105 = _mm512_unpacklo_ps(tmp15009, tmp15010);
__m512 tmp15106 = _mm512_unpackhi_ps(tmp15009, tmp15010);
__m512 tmp15107 = _mm512_unpacklo_ps(tmp15011, tmp15012);
__m512 tmp15108 = _mm512_unpackhi_ps(tmp15011, tmp15012);
__m512 tmp15109 = _mm512_shuffle_ps(tmp15097, tmp15099, 68);
__m512 tmp15110 = _mm512_shuffle_ps(tmp15097, tmp15099, 238);
__m512 tmp15111 = _mm512_shuffle_ps(tmp15098, tmp15100, 68);
__m512 tmp15112 = _mm512_shuffle_ps(tmp15098, tmp15100, 238);
__m512 tmp15113 = _mm512_shuffle_ps(tmp15101, tmp15103, 68);
__m512 tmp15114 = _mm512_shuffle_ps(tmp15101, tmp15103, 238);
__m512 tmp15115 = _mm512_shuffle_ps(tmp15102, tmp15104, 68);
__m512 tmp15116 = _mm512_shuffle_ps(tmp15102, tmp15104, 238);
__m512 tmp15117 = _mm512_shuffle_ps(tmp15105, tmp15107, 68);
__m512 tmp15118 = _mm512_shuffle_ps(tmp15105, tmp15107, 238);
__m512 tmp15119 = _mm512_shuffle_ps(tmp15106, tmp15108, 68);
__m512 tmp15120 = _mm512_shuffle_ps(tmp15106, tmp15108, 238);
__m512 tmp15121 = _mm512_shuffle_f32x4(tmp15109, tmp15113, 136);
__m512 tmp15122 = _mm512_shuffle_f32x4(tmp15109, tmp15113, 221);
__m512 tmp15123 = _mm512_shuffle_f32x4(tmp15110, tmp15114, 136);
__m512 tmp15124 = _mm512_shuffle_f32x4(tmp15110, tmp15114, 221);
__m512 tmp15125 = _mm512_shuffle_f32x4(tmp15111, tmp15115, 136);
__m512 tmp15126 = _mm512_shuffle_f32x4(tmp15111, tmp15115, 221);
__m512 tmp15127 = _mm512_shuffle_f32x4(tmp15112, tmp15116, 136);
__m512 tmp15128 = _mm512_shuffle_f32x4(tmp15112, tmp15116, 221);
// These eight use the same vector for both operands, so the upper two
// 128-bit lanes of each result duplicate the lower two.
__m512 tmp15129 = _mm512_shuffle_f32x4(tmp15117, tmp15117, 136);
__m512 tmp15130 = _mm512_shuffle_f32x4(tmp15117, tmp15117, 221);
__m512 tmp15131 = _mm512_shuffle_f32x4(tmp15118, tmp15118, 136);
__m512 tmp15132 = _mm512_shuffle_f32x4(tmp15118, tmp15118, 221);
__m512 tmp15133 = _mm512_shuffle_f32x4(tmp15119, tmp15119, 136);
__m512 tmp15134 = _mm512_shuffle_f32x4(tmp15119, tmp15119, 221);
__m512 tmp15135 = _mm512_shuffle_f32x4(tmp15120, tmp15120, 136);
__m512 tmp15136 = _mm512_shuffle_f32x4(tmp15120, tmp15120, 221);
// Final round: the 12 existing names tmp15001..tmp15012 are overwritten and
// four new names (tmp15053..tmp15056) receive the remaining outputs.
tmp15001 = _mm512_shuffle_f32x4(tmp15121, tmp15129, 136);
tmp15009 = _mm512_shuffle_f32x4(tmp15121, tmp15129, 221);
tmp15002 = _mm512_shuffle_f32x4(tmp15123, tmp15131, 136);
tmp15010 = _mm512_shuffle_f32x4(tmp15123, tmp15131, 221);
tmp15003 = _mm512_shuffle_f32x4(tmp15125, tmp15133, 136);
tmp15011 = _mm512_shuffle_f32x4(tmp15125, tmp15133, 221);
tmp15004 = _mm512_shuffle_f32x4(tmp15127, tmp15135, 136);
tmp15012 = _mm512_shuffle_f32x4(tmp15127, tmp15135, 221);
tmp15005 = _mm512_shuffle_f32x4(tmp15122, tmp15130, 136);
__m512 tmp15053 = _mm512_shuffle_f32x4(tmp15122, tmp15130, 221);
tmp15006 = _mm512_shuffle_f32x4(tmp15124, tmp15132, 136);
__m512 tmp15054 = _mm512_shuffle_f32x4(tmp15124, tmp15132, 221);
tmp15007 = _mm512_shuffle_f32x4(tmp15126, tmp15134, 136);
__m512 tmp15055 = _mm512_shuffle_f32x4(tmp15126, tmp15134, 221);
tmp15008 = _mm512_shuffle_f32x4(tmp15128, tmp15136, 136);
__m512 tmp15056 = _mm512_shuffle_f32x4(tmp15128, tmp15136, 221);
// Second 8 -> 6 combination, same coefficients as the first. Input sets:
// x0..x7 = tmp15001..tmp15008, and x0..x7 = tmp15009..tmp15012 followed by
// tmp15053..tmp15056. Results, in output order:
//   tmp15057, tmp15063, tmp15068, tmp15070, tmp15072, tmp15074 (first set)
//   tmp15077, tmp15083, tmp15088, tmp15090, tmp15092, tmp15094 (second set)
__m512 tmp15061 = _mm512_add_ps(tmp15002, tmp15003);
__m512 tmp15081 = _mm512_add_ps(tmp15010, tmp15011);
__m512 tmp15060 = _mm512_add_ps(tmp15004, tmp15005);
__m512 tmp15080 = _mm512_add_ps(tmp15012, tmp15053);
__m512 tmp15066 = _mm512_sub_ps(tmp15004, tmp15005);
__m512 tmp15086 = _mm512_sub_ps(tmp15012, tmp15053);
__m512 tmp15065 = _mm512_sub_ps(tmp15002, tmp15003);
__m512 tmp15085 = _mm512_sub_ps(tmp15010, tmp15011);
__m512 tmp15062 = _mm512_add_ps(tmp15006, tmp15007);
__m512 tmp15082 = _mm512_add_ps(tmp15054, tmp15055);
__m512 tmp15067 = _mm512_sub_ps(tmp15006, tmp15007);
__m512 tmp15087 = _mm512_sub_ps(tmp15054, tmp15055);
__m512 tmp15064 = _mm512_fmadd_ps(tmp15066, _mm512_set1_ps(2e+00f), tmp15065);
__m512 tmp15084 = _mm512_fmadd_ps(tmp15086, _mm512_set1_ps(2e+00f), tmp15085);
__m512 tmp15071 = _mm512_fmadd_ps(tmp15066, _mm512_set1_ps(8e+00f), tmp15065);
__m512 tmp15091 = _mm512_fmadd_ps(tmp15086, _mm512_set1_ps(8e+00f), tmp15085);
__m512 tmp15059 = _mm512_add_ps(tmp15060, tmp15061);
__m512 tmp15079 = _mm512_add_ps(tmp15080, tmp15081);
__m512 tmp15063 = _mm512_fmadd_ps(tmp15067, _mm512_set1_ps(1.6e+01f), tmp15064);
__m512 tmp15083 = _mm512_fmadd_ps(tmp15087, _mm512_set1_ps(1.6e+01f), tmp15084);
__m512 tmp15070 = _mm512_fmadd_ps(tmp15067, _mm512_set1_ps(4e+00f), tmp15071);
__m512 tmp15090 = _mm512_fmadd_ps(tmp15087, _mm512_set1_ps(4e+00f), tmp15091);
__m512 tmp15076 = _mm512_add_ps(tmp15067, tmp15065);
__m512 tmp15096 = _mm512_add_ps(tmp15087, tmp15085);
__m512 tmp15069 = _mm512_fmadd_ps(tmp15060, _mm512_set1_ps(4e+00f), tmp15061);
__m512 tmp15089 = _mm512_fmadd_ps(tmp15080, _mm512_set1_ps(4e+00f), tmp15081);
__m512 tmp15073 = _mm512_fmadd_ps(tmp15060, _mm512_set1_ps(1.6e+01f), tmp15061);
__m512 tmp15093 = _mm512_fmadd_ps(tmp15080, _mm512_set1_ps(1.6e+01f), tmp15081);
__m512 tmp15058 = _mm512_add_ps(tmp15059, tmp15001);
__m512 tmp15078 = _mm512_add_ps(tmp15079, tmp15009);
__m512 tmp15075 = _mm512_add_ps(tmp15076, tmp15008);
__m512 tmp15095 = _mm512_add_ps(tmp15096, tmp15056);
__m512 tmp15057 = _mm512_fmadd_ps(tmp15062, _mm512_set1_ps(3.2e+01f), tmp15058);
__m512 tmp15077 = _mm512_fmadd_ps(tmp15082, _mm512_set1_ps(3.2e+01f), tmp15078);
__m512 tmp15068 = _mm512_fmadd_ps(tmp15062, _mm512_set1_ps(8e+00f), tmp15069);
__m512 tmp15088 = _mm512_fmadd_ps(tmp15082, _mm512_set1_ps(8e+00f), tmp15089);
__m512 tmp15074 = _mm512_fmadd_ps(tmp15066, _mm512_set1_ps(3.2e+01f), tmp15075);
__m512 tmp15094 = _mm512_fmadd_ps(tmp15086, _mm512_set1_ps(3.2e+01f), tmp15095);
__m512 tmp15072 = _mm512_fmadd_ps(tmp15062, _mm512_set1_ps(2e+00f), tmp15073);
__m512 tmp15092 = _mm512_fmadd_ps(tmp15082, _mm512_set1_ps(2e+00f), tmp15093);
// Name the 12 results as outputs: out2003..out2008 come from the first set,
// out2009..out2014 from the second.
__m512 out2003 = tmp15057;
__m512 out2009 = tmp15077;
__m512 out2004 = tmp15063;
__m512 out2010 = tmp15083;
__m512 out2005 = tmp15068;
__m512 out2011 = tmp15088;
__m512 out2006 = tmp15070;
__m512 out2012 = tmp15090;
__m512 out2007 = tmp15072;
__m512 out2013 = tmp15092;
__m512 out2008 = tmp15074;
__m512 out2014 = tmp15094;
// Element-wise max against zero on every output vector (ReLU-style clamp).
out2003 = _mm512_max_ps(_mm512_setzero_ps(), out2003);
out2009 = _mm512_max_ps(_mm512_setzero_ps(), out2009);
out2004 = _mm512_max_ps(_mm512_setzero_ps(), out2004);
out2010 = _mm512_max_ps(_mm512_setzero_ps(), out2010);
out2005 = _mm512_max_ps(_mm512_setzero_ps(), out2005);
out2011 = _mm512_max_ps(_mm512_setzero_ps(), out2011);
out2006 = _mm512_max_ps(_mm512_setzero_ps(), out2006);
out2012 = _mm512_max_ps(_mm512_setzero_ps(), out2012);
out2007 = _mm512_max_ps(_mm512_setzero_ps(), out2007);
out2013 = _mm512_max_ps(_mm512_setzero_ps(), out2013);
out2008 = _mm512_max_ps(_mm512_setzero_ps(), out2008);
out2014 = _mm512_max_ps(_mm512_setzero_ps(), out2014);
// Masked stores to datPtr24. Mask 4095 (0xFFF) writes elements 0..11 and
// mask 1023 (0x3FF) writes elements 0..9; the remaining elements of each
// vector are not written. Six store pairs at base offsets 0, 112, ..., 560
// (step 112), with the second store of each pair 48 past the first.
// NOTE(review): 48 equals 12 floats if datPtr24 is a byte pointer, which
// would place the second store directly after the 12 elements of the first --
// presumably so; confirm at the declaration of datPtr24.
_mm512_mask_storeu_ps(datPtr24+0+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2003);
_mm512_mask_storeu_ps(datPtr24+48+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 1023, out2009);
_mm512_mask_storeu_ps(datPtr24+112+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2004);
_mm512_mask_storeu_ps(datPtr24+160+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 1023, out2010);
_mm512_mask_storeu_ps(datPtr24+224+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2005);
_mm512_mask_storeu_ps(datPtr24+272+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 1023, out2011);
_mm512_mask_storeu_ps(datPtr24+336+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2006);
_mm512_mask_storeu_ps(datPtr24+384+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 1023, out2012);
_mm512_mask_storeu_ps(datPtr24+448+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2007);
_mm512_mask_storeu_ps(datPtr24+496+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 1023, out2013);
_mm512_mask_storeu_ps(datPtr24+560+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2008);
_mm512_mask_storeu_ps(datPtr24+608+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 1023, out2014);
// Second load/transform/store group of this (k136, l55) iteration. The steps
// are: load and regroup 16 vectors, apply an 8 -> 6 linear combination,
// rearrange lanes, apply the same combination again, clamp at zero, store.
//
// Load stage. The four load groups are again 204800 apart, but each starts
// 256 past its base (256, 205056, 409856, 614656) and reads at +0, +64, +128
// and +192 from there.
// _mm512_shuffle_f32x4(a, b, 68)  yields 128-bit lanes [a0, a1, b0, b1];
// _mm512_shuffle_f32x4(a, b, 238) yields 128-bit lanes [a2, a3, b2, b3].
__m512 sf1073 = _mm512_loadu_ps(sfPtr11+256+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1074 = _mm512_loadu_ps(sfPtr11+384+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2167 = _mm512_shuffle_f32x4(sf1073, sf1074, 68);
__m512 in2168 = _mm512_shuffle_f32x4(sf1073, sf1074, 238);
__m512 sf1075 = _mm512_loadu_ps(sfPtr11+320+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1076 = _mm512_loadu_ps(sfPtr11+448+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2175 = _mm512_shuffle_f32x4(sf1075, sf1076, 68);
__m512 in2176 = _mm512_shuffle_f32x4(sf1075, sf1076, 238);
__m512 sf1077 = _mm512_loadu_ps(sfPtr11+205056+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1078 = _mm512_loadu_ps(sfPtr11+205184+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2169 = _mm512_shuffle_f32x4(sf1077, sf1078, 68);
__m512 in2170 = _mm512_shuffle_f32x4(sf1077, sf1078, 238);
__m512 sf1079 = _mm512_loadu_ps(sfPtr11+205120+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1080 = _mm512_loadu_ps(sfPtr11+205248+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2177 = _mm512_shuffle_f32x4(sf1079, sf1080, 68);
__m512 in2178 = _mm512_shuffle_f32x4(sf1079, sf1080, 238);
__m512 sf1081 = _mm512_loadu_ps(sfPtr11+409856+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1082 = _mm512_loadu_ps(sfPtr11+409984+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2171 = _mm512_shuffle_f32x4(sf1081, sf1082, 68);
__m512 in2172 = _mm512_shuffle_f32x4(sf1081, sf1082, 238);
__m512 sf1083 = _mm512_loadu_ps(sfPtr11+409920+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1084 = _mm512_loadu_ps(sfPtr11+410048+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2179 = _mm512_shuffle_f32x4(sf1083, sf1084, 68);
__m512 in2180 = _mm512_shuffle_f32x4(sf1083, sf1084, 238);
__m512 sf1085 = _mm512_loadu_ps(sfPtr11+614656+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1086 = _mm512_loadu_ps(sfPtr11+614784+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2173 = _mm512_shuffle_f32x4(sf1085, sf1086, 68);
__m512 in2174 = _mm512_shuffle_f32x4(sf1085, sf1086, 238);
__m512 sf1087 = _mm512_loadu_ps(sfPtr11+614720+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1088 = _mm512_loadu_ps(sfPtr11+614848+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2181 = _mm512_shuffle_f32x4(sf1087, sf1088, 68);
__m512 in2182 = _mm512_shuffle_f32x4(sf1087, sf1088, 238);
// First 8 -> 6 combination for two interleaved input sets:
// x0..x7 = in2167..in2174 and x0..x7 = in2175..in2182.
// With s1=x1+x2, s2=x3+x4, s3=x5+x6 and d1=x1-x2, d2=x3-x4, d3=x5-x6:
//   tmp15149 / tmp15169 = x0 + s1 +    s2 + 32*s3
//   tmp15155 / tmp15175 =      d1 +  2*d2 + 16*d3
//   tmp15160 / tmp15180 =      s1 +  4*s2 +  8*s3
//   tmp15162 / tmp15182 =      d1 +  8*d2 +  4*d3
//   tmp15164 / tmp15184 =      s1 + 16*s2 +  2*s3
//   tmp15166 / tmp15186 = x7 + d1 + 32*d2 +    d3
// NOTE(review): these coefficients match the 6x8 output-transform matrix of
// Winograd F(6,3) -- presumably that is what this is; confirm against the
// generator.
__m512 tmp15153 = _mm512_add_ps(in2168, in2169);
__m512 tmp15173 = _mm512_add_ps(in2176, in2177);
__m512 tmp15152 = _mm512_add_ps(in2170, in2171);
__m512 tmp15172 = _mm512_add_ps(in2178, in2179);
__m512 tmp15158 = _mm512_sub_ps(in2170, in2171);
__m512 tmp15178 = _mm512_sub_ps(in2178, in2179);
__m512 tmp15157 = _mm512_sub_ps(in2168, in2169);
__m512 tmp15177 = _mm512_sub_ps(in2176, in2177);
__m512 tmp15154 = _mm512_add_ps(in2172, in2173);
__m512 tmp15174 = _mm512_add_ps(in2180, in2181);
__m512 tmp15159 = _mm512_sub_ps(in2172, in2173);
__m512 tmp15179 = _mm512_sub_ps(in2180, in2181);
__m512 tmp15156 = _mm512_fmadd_ps(tmp15158, _mm512_set1_ps(2e+00f), tmp15157);
__m512 tmp15176 = _mm512_fmadd_ps(tmp15178, _mm512_set1_ps(2e+00f), tmp15177);
__m512 tmp15163 = _mm512_fmadd_ps(tmp15158, _mm512_set1_ps(8e+00f), tmp15157);
__m512 tmp15183 = _mm512_fmadd_ps(tmp15178, _mm512_set1_ps(8e+00f), tmp15177);
__m512 tmp15151 = _mm512_add_ps(tmp15152, tmp15153);
__m512 tmp15171 = _mm512_add_ps(tmp15172, tmp15173);
__m512 tmp15155 = _mm512_fmadd_ps(tmp15159, _mm512_set1_ps(1.6e+01f), tmp15156);
__m512 tmp15175 = _mm512_fmadd_ps(tmp15179, _mm512_set1_ps(1.6e+01f), tmp15176);
__m512 tmp15162 = _mm512_fmadd_ps(tmp15159, _mm512_set1_ps(4e+00f), tmp15163);
__m512 tmp15182 = _mm512_fmadd_ps(tmp15179, _mm512_set1_ps(4e+00f), tmp15183);
__m512 tmp15168 = _mm512_add_ps(tmp15159, tmp15157);
__m512 tmp15188 = _mm512_add_ps(tmp15179, tmp15177);
__m512 tmp15161 = _mm512_fmadd_ps(tmp15152, _mm512_set1_ps(4e+00f), tmp15153);
__m512 tmp15181 = _mm512_fmadd_ps(tmp15172, _mm512_set1_ps(4e+00f), tmp15173);
__m512 tmp15165 = _mm512_fmadd_ps(tmp15152, _mm512_set1_ps(1.6e+01f), tmp15153);
__m512 tmp15185 = _mm512_fmadd_ps(tmp15172, _mm512_set1_ps(1.6e+01f), tmp15173);
__m512 tmp15150 = _mm512_add_ps(tmp15151, in2167);
__m512 tmp15170 = _mm512_add_ps(tmp15171, in2175);
__m512 tmp15167 = _mm512_add_ps(tmp15168, in2174);
__m512 tmp15187 = _mm512_add_ps(tmp15188, in2182);
__m512 tmp15149 = _mm512_fmadd_ps(tmp15154, _mm512_set1_ps(3.2e+01f), tmp15150);
__m512 tmp15169 = _mm512_fmadd_ps(tmp15174, _mm512_set1_ps(3.2e+01f), tmp15170);
__m512 tmp15160 = _mm512_fmadd_ps(tmp15154, _mm512_set1_ps(8e+00f), tmp15161);
__m512 tmp15180 = _mm512_fmadd_ps(tmp15174, _mm512_set1_ps(8e+00f), tmp15181);
__m512 tmp15166 = _mm512_fmadd_ps(tmp15158, _mm512_set1_ps(3.2e+01f), tmp15167);
__m512 tmp15186 = _mm512_fmadd_ps(tmp15178, _mm512_set1_ps(3.2e+01f), tmp15187);
__m512 tmp15164 = _mm512_fmadd_ps(tmp15154, _mm512_set1_ps(2e+00f), tmp15165);
__m512 tmp15184 = _mm512_fmadd_ps(tmp15174, _mm512_set1_ps(2e+00f), tmp15185);
// Collect the 12 results under consecutive names: tmp15137..tmp15142 hold the
// six outputs of the first set, tmp15143..tmp15148 those of the second set.
__m512 tmp15137 = tmp15149;
__m512 tmp15143 = tmp15169;
__m512 tmp15138 = tmp15155;
__m512 tmp15144 = tmp15175;
__m512 tmp15139 = tmp15160;
__m512 tmp15145 = tmp15180;
__m512 tmp15140 = tmp15162;
__m512 tmp15146 = tmp15182;
__m512 tmp15141 = tmp15164;
__m512 tmp15147 = tmp15184;
__m512 tmp15142 = tmp15166;
__m512 tmp15148 = tmp15186;
// Lane rearrangement: 32-bit interleave (unpacklo/unpackhi), then 64-bit
// pair selection (shuffle_ps 68/238), then two rounds of 128-bit lane
// selection (shuffle_f32x4 136 picks lanes 0,2 of each operand; 221 picks
// lanes 1,3). The 12 input vectors become 16 output vectors.
// NOTE(review): this network looks like a transpose of the 12 vectors -- confirm.
__m512 tmp15233 = _mm512_unpacklo_ps(tmp15137, tmp15138);
__m512 tmp15234 = _mm512_unpackhi_ps(tmp15137, tmp15138);
__m512 tmp15235 = _mm512_unpacklo_ps(tmp15139, tmp15140);
__m512 tmp15236 = _mm512_unpackhi_ps(tmp15139, tmp15140);
__m512 tmp15237 = _mm512_unpacklo_ps(tmp15141, tmp15142);
__m512 tmp15238 = _mm512_unpackhi_ps(tmp15141, tmp15142);
__m512 tmp15239 = _mm512_unpacklo_ps(tmp15143, tmp15144);
__m512 tmp15240 = _mm512_unpackhi_ps(tmp15143, tmp15144);
__m512 tmp15241 = _mm512_unpacklo_ps(tmp15145, tmp15146);
__m512 tmp15242 = _mm512_unpackhi_ps(tmp15145, tmp15146);
__m512 tmp15243 = _mm512_unpacklo_ps(tmp15147, tmp15148);
__m512 tmp15244 = _mm512_unpackhi_ps(tmp15147, tmp15148);
__m512 tmp15245 = _mm512_shuffle_ps(tmp15233, tmp15235, 68);
__m512 tmp15246 = _mm512_shuffle_ps(tmp15233, tmp15235, 238);
__m512 tmp15247 = _mm512_shuffle_ps(tmp15234, tmp15236, 68);
__m512 tmp15248 = _mm512_shuffle_ps(tmp15234, tmp15236, 238);
__m512 tmp15249 = _mm512_shuffle_ps(tmp15237, tmp15239, 68);
__m512 tmp15250 = _mm512_shuffle_ps(tmp15237, tmp15239, 238);
__m512 tmp15251 = _mm512_shuffle_ps(tmp15238, tmp15240, 68);
__m512 tmp15252 = _mm512_shuffle_ps(tmp15238, tmp15240, 238);
__m512 tmp15253 = _mm512_shuffle_ps(tmp15241, tmp15243, 68);
__m512 tmp15254 = _mm512_shuffle_ps(tmp15241, tmp15243, 238);
__m512 tmp15255 = _mm512_shuffle_ps(tmp15242, tmp15244, 68);
__m512 tmp15256 = _mm512_shuffle_ps(tmp15242, tmp15244, 238);
__m512 tmp15257 = _mm512_shuffle_f32x4(tmp15245, tmp15249, 136);
__m512 tmp15258 = _mm512_shuffle_f32x4(tmp15245, tmp15249, 221);
__m512 tmp15259 = _mm512_shuffle_f32x4(tmp15246, tmp15250, 136);
__m512 tmp15260 = _mm512_shuffle_f32x4(tmp15246, tmp15250, 221);
__m512 tmp15261 = _mm512_shuffle_f32x4(tmp15247, tmp15251, 136);
__m512 tmp15262 = _mm512_shuffle_f32x4(tmp15247, tmp15251, 221);
__m512 tmp15263 = _mm512_shuffle_f32x4(tmp15248, tmp15252, 136);
__m512 tmp15264 = _mm512_shuffle_f32x4(tmp15248, tmp15252, 221);
// These eight use the same vector for both operands, so the upper two
// 128-bit lanes of each result duplicate the lower two.
__m512 tmp15265 = _mm512_shuffle_f32x4(tmp15253, tmp15253, 136);
__m512 tmp15266 = _mm512_shuffle_f32x4(tmp15253, tmp15253, 221);
__m512 tmp15267 = _mm512_shuffle_f32x4(tmp15254, tmp15254, 136);
__m512 tmp15268 = _mm512_shuffle_f32x4(tmp15254, tmp15254, 221);
__m512 tmp15269 = _mm512_shuffle_f32x4(tmp15255, tmp15255, 136);
__m512 tmp15270 = _mm512_shuffle_f32x4(tmp15255, tmp15255, 221);
__m512 tmp15271 = _mm512_shuffle_f32x4(tmp15256, tmp15256, 136);
__m512 tmp15272 = _mm512_shuffle_f32x4(tmp15256, tmp15256, 221);
// Final round: the 12 existing names tmp15137..tmp15148 are overwritten and
// four new names (tmp15189..tmp15192) receive the remaining outputs.
tmp15137 = _mm512_shuffle_f32x4(tmp15257, tmp15265, 136);
tmp15145 = _mm512_shuffle_f32x4(tmp15257, tmp15265, 221);
tmp15138 = _mm512_shuffle_f32x4(tmp15259, tmp15267, 136);
tmp15146 = _mm512_shuffle_f32x4(tmp15259, tmp15267, 221);
tmp15139 = _mm512_shuffle_f32x4(tmp15261, tmp15269, 136);
tmp15147 = _mm512_shuffle_f32x4(tmp15261, tmp15269, 221);
tmp15140 = _mm512_shuffle_f32x4(tmp15263, tmp15271, 136);
tmp15148 = _mm512_shuffle_f32x4(tmp15263, tmp15271, 221);
tmp15141 = _mm512_shuffle_f32x4(tmp15258, tmp15266, 136);
__m512 tmp15189 = _mm512_shuffle_f32x4(tmp15258, tmp15266, 221);
tmp15142 = _mm512_shuffle_f32x4(tmp15260, tmp15268, 136);
__m512 tmp15190 = _mm512_shuffle_f32x4(tmp15260, tmp15268, 221);
tmp15143 = _mm512_shuffle_f32x4(tmp15262, tmp15270, 136);
__m512 tmp15191 = _mm512_shuffle_f32x4(tmp15262, tmp15270, 221);
tmp15144 = _mm512_shuffle_f32x4(tmp15264, tmp15272, 136);
__m512 tmp15192 = _mm512_shuffle_f32x4(tmp15264, tmp15272, 221);
// Second 8 -> 6 combination, same coefficients as the first. Input sets:
// x0..x7 = tmp15137..tmp15144, and x0..x7 = tmp15145..tmp15148 followed by
// tmp15189..tmp15192. Results, in output order:
//   tmp15193, tmp15199, tmp15204, tmp15206, tmp15208, tmp15210 (first set)
//   tmp15213, tmp15219, tmp15224, tmp15226, tmp15228, tmp15230 (second set)
__m512 tmp15197 = _mm512_add_ps(tmp15138, tmp15139);
__m512 tmp15217 = _mm512_add_ps(tmp15146, tmp15147);
__m512 tmp15196 = _mm512_add_ps(tmp15140, tmp15141);
__m512 tmp15216 = _mm512_add_ps(tmp15148, tmp15189);
__m512 tmp15202 = _mm512_sub_ps(tmp15140, tmp15141);
__m512 tmp15222 = _mm512_sub_ps(tmp15148, tmp15189);
__m512 tmp15201 = _mm512_sub_ps(tmp15138, tmp15139);
__m512 tmp15221 = _mm512_sub_ps(tmp15146, tmp15147);
__m512 tmp15198 = _mm512_add_ps(tmp15142, tmp15143);
__m512 tmp15218 = _mm512_add_ps(tmp15190, tmp15191);
__m512 tmp15203 = _mm512_sub_ps(tmp15142, tmp15143);
__m512 tmp15223 = _mm512_sub_ps(tmp15190, tmp15191);
__m512 tmp15200 = _mm512_fmadd_ps(tmp15202, _mm512_set1_ps(2e+00f), tmp15201);
__m512 tmp15220 = _mm512_fmadd_ps(tmp15222, _mm512_set1_ps(2e+00f), tmp15221);
__m512 tmp15207 = _mm512_fmadd_ps(tmp15202, _mm512_set1_ps(8e+00f), tmp15201);
__m512 tmp15227 = _mm512_fmadd_ps(tmp15222, _mm512_set1_ps(8e+00f), tmp15221);
__m512 tmp15195 = _mm512_add_ps(tmp15196, tmp15197);
__m512 tmp15215 = _mm512_add_ps(tmp15216, tmp15217);
__m512 tmp15199 = _mm512_fmadd_ps(tmp15203, _mm512_set1_ps(1.6e+01f), tmp15200);
__m512 tmp15219 = _mm512_fmadd_ps(tmp15223, _mm512_set1_ps(1.6e+01f), tmp15220);
__m512 tmp15206 = _mm512_fmadd_ps(tmp15203, _mm512_set1_ps(4e+00f), tmp15207);
__m512 tmp15226 = _mm512_fmadd_ps(tmp15223, _mm512_set1_ps(4e+00f), tmp15227);
__m512 tmp15212 = _mm512_add_ps(tmp15203, tmp15201);
__m512 tmp15232 = _mm512_add_ps(tmp15223, tmp15221);
__m512 tmp15205 = _mm512_fmadd_ps(tmp15196, _mm512_set1_ps(4e+00f), tmp15197);
__m512 tmp15225 = _mm512_fmadd_ps(tmp15216, _mm512_set1_ps(4e+00f), tmp15217);
__m512 tmp15209 = _mm512_fmadd_ps(tmp15196, _mm512_set1_ps(1.6e+01f), tmp15197);
__m512 tmp15229 = _mm512_fmadd_ps(tmp15216, _mm512_set1_ps(1.6e+01f), tmp15217);
__m512 tmp15194 = _mm512_add_ps(tmp15195, tmp15137);
__m512 tmp15214 = _mm512_add_ps(tmp15215, tmp15145);
__m512 tmp15211 = _mm512_add_ps(tmp15212, tmp15144);
__m512 tmp15231 = _mm512_add_ps(tmp15232, tmp15192);
__m512 tmp15193 = _mm512_fmadd_ps(tmp15198, _mm512_set1_ps(3.2e+01f), tmp15194);
__m512 tmp15213 = _mm512_fmadd_ps(tmp15218, _mm512_set1_ps(3.2e+01f), tmp15214);
__m512 tmp15204 = _mm512_fmadd_ps(tmp15198, _mm512_set1_ps(8e+00f), tmp15205);
__m512 tmp15224 = _mm512_fmadd_ps(tmp15218, _mm512_set1_ps(8e+00f), tmp15225);
__m512 tmp15210 = _mm512_fmadd_ps(tmp15202, _mm512_set1_ps(3.2e+01f), tmp15211);
__m512 tmp15230 = _mm512_fmadd_ps(tmp15222, _mm512_set1_ps(3.2e+01f), tmp15231);
__m512 tmp15208 = _mm512_fmadd_ps(tmp15198, _mm512_set1_ps(2e+00f), tmp15209);
__m512 tmp15228 = _mm512_fmadd_ps(tmp15218, _mm512_set1_ps(2e+00f), tmp15229);
// Name the 12 results as outputs: out2015..out2020 come from the first set,
// out2021..out2026 from the second.
__m512 out2015 = tmp15193;
__m512 out2021 = tmp15213;
__m512 out2016 = tmp15199;
__m512 out2022 = tmp15219;
__m512 out2017 = tmp15204;
__m512 out2023 = tmp15224;
__m512 out2018 = tmp15206;
__m512 out2024 = tmp15226;
__m512 out2019 = tmp15208;
__m512 out2025 = tmp15228;
__m512 out2020 = tmp15210;
__m512 out2026 = tmp15230;
// Element-wise max against zero on every output vector (ReLU-style clamp).
out2015 = _mm512_max_ps(_mm512_setzero_ps(), out2015);
out2021 = _mm512_max_ps(_mm512_setzero_ps(), out2021);
out2016 = _mm512_max_ps(_mm512_setzero_ps(), out2016);
out2022 = _mm512_max_ps(_mm512_setzero_ps(), out2022);
out2017 = _mm512_max_ps(_mm512_setzero_ps(), out2017);
out2023 = _mm512_max_ps(_mm512_setzero_ps(), out2023);
out2018 = _mm512_max_ps(_mm512_setzero_ps(), out2018);
out2024 = _mm512_max_ps(_mm512_setzero_ps(), out2024);
out2019 = _mm512_max_ps(_mm512_setzero_ps(), out2019);
out2025 = _mm512_max_ps(_mm512_setzero_ps(), out2025);
out2020 = _mm512_max_ps(_mm512_setzero_ps(), out2020);
out2026 = _mm512_max_ps(_mm512_setzero_ps(), out2026);
// Masked stores to datPtr24; every store here uses mask 4095 (0xFFF), i.e.
// only elements 0..11 of each vector are written. out2015..out2020 go to
// base offsets 648, 760, ..., 1208 and out2021..out2026 to 3136, 3248, ...,
// 3696 (both sequences step by 112).
// NOTE(review): 3136 is half of the 6272 multiplier applied to l55, so the
// second sequence presumably targets a different output plane than the
// first -- confirm against the datPtr24 layout.
_mm512_mask_storeu_ps(datPtr24+648+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2015);
_mm512_mask_storeu_ps(datPtr24+3136+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2021);
_mm512_mask_storeu_ps(datPtr24+760+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2016);
_mm512_mask_storeu_ps(datPtr24+3248+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2022);
_mm512_mask_storeu_ps(datPtr24+872+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2017);
_mm512_mask_storeu_ps(datPtr24+3360+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2023);
_mm512_mask_storeu_ps(datPtr24+984+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2018);
_mm512_mask_storeu_ps(datPtr24+3472+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2024);
_mm512_mask_storeu_ps(datPtr24+1096+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2019);
_mm512_mask_storeu_ps(datPtr24+3584+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2025);
_mm512_mask_storeu_ps(datPtr24+1208+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2020);
_mm512_mask_storeu_ps(datPtr24+3696+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2026);
__m512 sf1089 = _mm512_loadu_ps(sfPtr11+512+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1090 = _mm512_loadu_ps(sfPtr11+640+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2183 = _mm512_shuffle_f32x4(sf1089, sf1090, 68);
__m512 in2184 = _mm512_shuffle_f32x4(sf1089, sf1090, 238);
__m512 sf1091 = _mm512_loadu_ps(sfPtr11+576+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1092 = _mm512_loadu_ps(sfPtr11+704+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2191 = _mm512_shuffle_f32x4(sf1091, sf1092, 68);
__m512 in2192 = _mm512_shuffle_f32x4(sf1091, sf1092, 238);
__m512 sf1093 = _mm512_loadu_ps(sfPtr11+205312+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1094 = _mm512_loadu_ps(sfPtr11+205440+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2185 = _mm512_shuffle_f32x4(sf1093, sf1094, 68);
__m512 in2186 = _mm512_shuffle_f32x4(sf1093, sf1094, 238);
__m512 sf1095 = _mm512_loadu_ps(sfPtr11+205376+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1096 = _mm512_loadu_ps(sfPtr11+205504+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2193 = _mm512_shuffle_f32x4(sf1095, sf1096, 68);
__m512 in2194 = _mm512_shuffle_f32x4(sf1095, sf1096, 238);
__m512 sf1097 = _mm512_loadu_ps(sfPtr11+410112+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1098 = _mm512_loadu_ps(sfPtr11+410240+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2187 = _mm512_shuffle_f32x4(sf1097, sf1098, 68);
__m512 in2188 = _mm512_shuffle_f32x4(sf1097, sf1098, 238);
__m512 sf1099 = _mm512_loadu_ps(sfPtr11+410176+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1100 = _mm512_loadu_ps(sfPtr11+410304+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2195 = _mm512_shuffle_f32x4(sf1099, sf1100, 68);
__m512 in2196 = _mm512_shuffle_f32x4(sf1099, sf1100, 238);
__m512 sf1101 = _mm512_loadu_ps(sfPtr11+614912+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1102 = _mm512_loadu_ps(sfPtr11+615040+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2189 = _mm512_shuffle_f32x4(sf1101, sf1102, 68);
__m512 in2190 = _mm512_shuffle_f32x4(sf1101, sf1102, 238);
__m512 sf1103 = _mm512_loadu_ps(sfPtr11+614976+819200*i47+49152*j40+1536*k136+768*l55);
__m512 sf1104 = _mm512_loadu_ps(sfPtr11+615104+819200*i47+49152*j40+1536*k136+768*l55);
__m512 in2197 = _mm512_shuffle_f32x4(sf1103, sf1104, 68);
__m512 in2198 = _mm512_shuffle_f32x4(sf1103, sf1104, 238);
__m512 tmp15289 = _mm512_add_ps(in2184, in2185);
__m512 tmp15309 = _mm512_add_ps(in2192, in2193);
__m512 tmp15288 = _mm512_add_ps(in2186, in2187);
__m512 tmp15308 = _mm512_add_ps(in2194, in2195);
__m512 tmp15294 = _mm512_sub_ps(in2186, in2187);
__m512 tmp15314 = _mm512_sub_ps(in2194, in2195);
__m512 tmp15293 = _mm512_sub_ps(in2184, in2185);
__m512 tmp15313 = _mm512_sub_ps(in2192, in2193);
__m512 tmp15290 = _mm512_add_ps(in2188, in2189);
__m512 tmp15310 = _mm512_add_ps(in2196, in2197);
__m512 tmp15295 = _mm512_sub_ps(in2188, in2189);
__m512 tmp15315 = _mm512_sub_ps(in2196, in2197);
__m512 tmp15292 = _mm512_fmadd_ps(tmp15294, _mm512_set1_ps(2e+00f), tmp15293);
__m512 tmp15312 = _mm512_fmadd_ps(tmp15314, _mm512_set1_ps(2e+00f), tmp15313);
__m512 tmp15299 = _mm512_fmadd_ps(tmp15294, _mm512_set1_ps(8e+00f), tmp15293);
__m512 tmp15319 = _mm512_fmadd_ps(tmp15314, _mm512_set1_ps(8e+00f), tmp15313);
__m512 tmp15287 = _mm512_add_ps(tmp15288, tmp15289);
__m512 tmp15307 = _mm512_add_ps(tmp15308, tmp15309);
__m512 tmp15291 = _mm512_fmadd_ps(tmp15295, _mm512_set1_ps(1.6e+01f), tmp15292);
__m512 tmp15311 = _mm512_fmadd_ps(tmp15315, _mm512_set1_ps(1.6e+01f), tmp15312);
__m512 tmp15298 = _mm512_fmadd_ps(tmp15295, _mm512_set1_ps(4e+00f), tmp15299);
__m512 tmp15318 = _mm512_fmadd_ps(tmp15315, _mm512_set1_ps(4e+00f), tmp15319);
__m512 tmp15304 = _mm512_add_ps(tmp15295, tmp15293);
__m512 tmp15324 = _mm512_add_ps(tmp15315, tmp15313);
__m512 tmp15297 = _mm512_fmadd_ps(tmp15288, _mm512_set1_ps(4e+00f), tmp15289);
__m512 tmp15317 = _mm512_fmadd_ps(tmp15308, _mm512_set1_ps(4e+00f), tmp15309);
__m512 tmp15301 = _mm512_fmadd_ps(tmp15288, _mm512_set1_ps(1.6e+01f), tmp15289);
__m512 tmp15321 = _mm512_fmadd_ps(tmp15308, _mm512_set1_ps(1.6e+01f), tmp15309);
__m512 tmp15286 = _mm512_add_ps(tmp15287, in2183);
__m512 tmp15306 = _mm512_add_ps(tmp15307, in2191);
__m512 tmp15303 = _mm512_add_ps(tmp15304, in2190);
__m512 tmp15323 = _mm512_add_ps(tmp15324, in2198);
__m512 tmp15285 = _mm512_fmadd_ps(tmp15290, _mm512_set1_ps(3.2e+01f), tmp15286);
__m512 tmp15305 = _mm512_fmadd_ps(tmp15310, _mm512_set1_ps(3.2e+01f), tmp15306);
__m512 tmp15296 = _mm512_fmadd_ps(tmp15290, _mm512_set1_ps(8e+00f), tmp15297);
__m512 tmp15316 = _mm512_fmadd_ps(tmp15310, _mm512_set1_ps(8e+00f), tmp15317);
__m512 tmp15302 = _mm512_fmadd_ps(tmp15294, _mm512_set1_ps(3.2e+01f), tmp15303);
__m512 tmp15322 = _mm512_fmadd_ps(tmp15314, _mm512_set1_ps(3.2e+01f), tmp15323);
__m512 tmp15300 = _mm512_fmadd_ps(tmp15290, _mm512_set1_ps(2e+00f), tmp15301);
__m512 tmp15320 = _mm512_fmadd_ps(tmp15310, _mm512_set1_ps(2e+00f), tmp15321);
__m512 tmp15273 = tmp15285;
__m512 tmp15279 = tmp15305;
__m512 tmp15274 = tmp15291;
__m512 tmp15280 = tmp15311;
__m512 tmp15275 = tmp15296;
__m512 tmp15281 = tmp15316;
__m512 tmp15276 = tmp15298;
__m512 tmp15282 = tmp15318;
__m512 tmp15277 = tmp15300;
__m512 tmp15283 = tmp15320;
__m512 tmp15278 = tmp15302;
__m512 tmp15284 = tmp15322;
// Transposes the twelve 16-float vectors tmp15273..tmp15284 (viewed as the rows
// of a 12x16 matrix) in four stages: unpacklo/unpackhi, shuffle_ps, and two
// rounds of shuffle_f32x4. Each result vector holds one column in its elements
// 0..11; elements 12..15 repeat elements 8..11 because rows 8..11 are shuffled
// with themselves in the third stage.
// Results: columns 0..7  -> tmp15273..tmp15280 (reassigned),
//          columns 8..11 -> tmp15281..tmp15284 (reassigned),
//          columns 12..15 -> tmp15325..tmp15328 (new).
// Stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp15369 = _mm512_unpacklo_ps(tmp15273, tmp15274);
__m512 tmp15370 = _mm512_unpackhi_ps(tmp15273, tmp15274);
__m512 tmp15371 = _mm512_unpacklo_ps(tmp15275, tmp15276);
__m512 tmp15372 = _mm512_unpackhi_ps(tmp15275, tmp15276);
__m512 tmp15373 = _mm512_unpacklo_ps(tmp15277, tmp15278);
__m512 tmp15374 = _mm512_unpackhi_ps(tmp15277, tmp15278);
__m512 tmp15375 = _mm512_unpacklo_ps(tmp15279, tmp15280);
__m512 tmp15376 = _mm512_unpackhi_ps(tmp15279, tmp15280);
__m512 tmp15377 = _mm512_unpacklo_ps(tmp15281, tmp15282);
__m512 tmp15378 = _mm512_unpackhi_ps(tmp15281, tmp15282);
__m512 tmp15379 = _mm512_unpacklo_ps(tmp15283, tmp15284);
__m512 tmp15380 = _mm512_unpackhi_ps(tmp15283, tmp15284);
// Stage 2: combine pairs so each 128-bit lane holds one column of four rows
// (immediate 68 picks the low two floats of each operand, 238 the high two).
__m512 tmp15381 = _mm512_shuffle_ps(tmp15369, tmp15371, 68);
__m512 tmp15382 = _mm512_shuffle_ps(tmp15369, tmp15371, 238);
__m512 tmp15383 = _mm512_shuffle_ps(tmp15370, tmp15372, 68);
__m512 tmp15384 = _mm512_shuffle_ps(tmp15370, tmp15372, 238);
__m512 tmp15385 = _mm512_shuffle_ps(tmp15373, tmp15375, 68);
__m512 tmp15386 = _mm512_shuffle_ps(tmp15373, tmp15375, 238);
__m512 tmp15387 = _mm512_shuffle_ps(tmp15374, tmp15376, 68);
__m512 tmp15388 = _mm512_shuffle_ps(tmp15374, tmp15376, 238);
__m512 tmp15389 = _mm512_shuffle_ps(tmp15377, tmp15379, 68);
__m512 tmp15390 = _mm512_shuffle_ps(tmp15377, tmp15379, 238);
__m512 tmp15391 = _mm512_shuffle_ps(tmp15378, tmp15380, 68);
__m512 tmp15392 = _mm512_shuffle_ps(tmp15378, tmp15380, 238);
// Stage 3: gather 128-bit lanes (136 selects lanes 0 and 2 of each operand,
// 221 selects lanes 1 and 3).
__m512 tmp15393 = _mm512_shuffle_f32x4(tmp15381, tmp15385, 136);
__m512 tmp15394 = _mm512_shuffle_f32x4(tmp15381, tmp15385, 221);
__m512 tmp15395 = _mm512_shuffle_f32x4(tmp15382, tmp15386, 136);
__m512 tmp15396 = _mm512_shuffle_f32x4(tmp15382, tmp15386, 221);
__m512 tmp15397 = _mm512_shuffle_f32x4(tmp15383, tmp15387, 136);
__m512 tmp15398 = _mm512_shuffle_f32x4(tmp15383, tmp15387, 221);
__m512 tmp15399 = _mm512_shuffle_f32x4(tmp15384, tmp15388, 136);
__m512 tmp15400 = _mm512_shuffle_f32x4(tmp15384, tmp15388, 221);
__m512 tmp15401 = _mm512_shuffle_f32x4(tmp15389, tmp15389, 136);
__m512 tmp15402 = _mm512_shuffle_f32x4(tmp15389, tmp15389, 221);
__m512 tmp15403 = _mm512_shuffle_f32x4(tmp15390, tmp15390, 136);
__m512 tmp15404 = _mm512_shuffle_f32x4(tmp15390, tmp15390, 221);
__m512 tmp15405 = _mm512_shuffle_f32x4(tmp15391, tmp15391, 136);
__m512 tmp15406 = _mm512_shuffle_f32x4(tmp15391, tmp15391, 221);
__m512 tmp15407 = _mm512_shuffle_f32x4(tmp15392, tmp15392, 136);
__m512 tmp15408 = _mm512_shuffle_f32x4(tmp15392, tmp15392, 221);
// Stage 4: final lane gather producing one column per vector.
tmp15273 = _mm512_shuffle_f32x4(tmp15393, tmp15401, 136);
tmp15281 = _mm512_shuffle_f32x4(tmp15393, tmp15401, 221);
tmp15274 = _mm512_shuffle_f32x4(tmp15395, tmp15403, 136);
tmp15282 = _mm512_shuffle_f32x4(tmp15395, tmp15403, 221);
tmp15275 = _mm512_shuffle_f32x4(tmp15397, tmp15405, 136);
tmp15283 = _mm512_shuffle_f32x4(tmp15397, tmp15405, 221);
tmp15276 = _mm512_shuffle_f32x4(tmp15399, tmp15407, 136);
tmp15284 = _mm512_shuffle_f32x4(tmp15399, tmp15407, 221);
tmp15277 = _mm512_shuffle_f32x4(tmp15394, tmp15402, 136);
__m512 tmp15325 = _mm512_shuffle_f32x4(tmp15394, tmp15402, 221);
tmp15278 = _mm512_shuffle_f32x4(tmp15396, tmp15404, 136);
__m512 tmp15326 = _mm512_shuffle_f32x4(tmp15396, tmp15404, 221);
tmp15279 = _mm512_shuffle_f32x4(tmp15398, tmp15406, 136);
__m512 tmp15327 = _mm512_shuffle_f32x4(tmp15398, tmp15406, 221);
tmp15280 = _mm512_shuffle_f32x4(tmp15400, tmp15408, 136);
__m512 tmp15328 = _mm512_shuffle_f32x4(tmp15400, tmp15408, 221);
// Applies one 8-input -> 6-output linear combination, element-wise, to two
// groups of eight vectors (the two groups are interleaved line by line):
//   group A: x0..x7 = tmp15273..tmp15280                    -> out2027..out2032
//   group B: x0..x7 = tmp15281..tmp15284, tmp15325..tmp15328 -> out2033..out2038
// With s1=x1+x2, s2=x3+x4, s3=x5+x6, d1=x1-x2, d2=x3-x4, d3=x5-x6:
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 = x7 + d1 + 32*d2 +     d3
// NOTE(review): these weights look like a Winograd-style output transform —
// confirm against the generator.
__m512 tmp15333 = _mm512_add_ps(tmp15274, tmp15275);
__m512 tmp15353 = _mm512_add_ps(tmp15282, tmp15283);
__m512 tmp15332 = _mm512_add_ps(tmp15276, tmp15277);
__m512 tmp15352 = _mm512_add_ps(tmp15284, tmp15325);
__m512 tmp15338 = _mm512_sub_ps(tmp15276, tmp15277);
__m512 tmp15358 = _mm512_sub_ps(tmp15284, tmp15325);
__m512 tmp15337 = _mm512_sub_ps(tmp15274, tmp15275);
__m512 tmp15357 = _mm512_sub_ps(tmp15282, tmp15283);
__m512 tmp15334 = _mm512_add_ps(tmp15278, tmp15279);
__m512 tmp15354 = _mm512_add_ps(tmp15326, tmp15327);
__m512 tmp15339 = _mm512_sub_ps(tmp15278, tmp15279);
__m512 tmp15359 = _mm512_sub_ps(tmp15326, tmp15327);
__m512 tmp15336 = _mm512_fmadd_ps(tmp15338, _mm512_set1_ps(2e+00f), tmp15337);
__m512 tmp15356 = _mm512_fmadd_ps(tmp15358, _mm512_set1_ps(2e+00f), tmp15357);
__m512 tmp15343 = _mm512_fmadd_ps(tmp15338, _mm512_set1_ps(8e+00f), tmp15337);
__m512 tmp15363 = _mm512_fmadd_ps(tmp15358, _mm512_set1_ps(8e+00f), tmp15357);
__m512 tmp15331 = _mm512_add_ps(tmp15332, tmp15333);
__m512 tmp15351 = _mm512_add_ps(tmp15352, tmp15353);
__m512 tmp15335 = _mm512_fmadd_ps(tmp15339, _mm512_set1_ps(1.6e+01f), tmp15336);
__m512 tmp15355 = _mm512_fmadd_ps(tmp15359, _mm512_set1_ps(1.6e+01f), tmp15356);
__m512 tmp15342 = _mm512_fmadd_ps(tmp15339, _mm512_set1_ps(4e+00f), tmp15343);
__m512 tmp15362 = _mm512_fmadd_ps(tmp15359, _mm512_set1_ps(4e+00f), tmp15363);
__m512 tmp15348 = _mm512_add_ps(tmp15339, tmp15337);
__m512 tmp15368 = _mm512_add_ps(tmp15359, tmp15357);
__m512 tmp15341 = _mm512_fmadd_ps(tmp15332, _mm512_set1_ps(4e+00f), tmp15333);
__m512 tmp15361 = _mm512_fmadd_ps(tmp15352, _mm512_set1_ps(4e+00f), tmp15353);
__m512 tmp15345 = _mm512_fmadd_ps(tmp15332, _mm512_set1_ps(1.6e+01f), tmp15333);
__m512 tmp15365 = _mm512_fmadd_ps(tmp15352, _mm512_set1_ps(1.6e+01f), tmp15353);
__m512 tmp15330 = _mm512_add_ps(tmp15331, tmp15273);
__m512 tmp15350 = _mm512_add_ps(tmp15351, tmp15281);
__m512 tmp15347 = _mm512_add_ps(tmp15348, tmp15280);
__m512 tmp15367 = _mm512_add_ps(tmp15368, tmp15328);
__m512 tmp15329 = _mm512_fmadd_ps(tmp15334, _mm512_set1_ps(3.2e+01f), tmp15330);
__m512 tmp15349 = _mm512_fmadd_ps(tmp15354, _mm512_set1_ps(3.2e+01f), tmp15350);
__m512 tmp15340 = _mm512_fmadd_ps(tmp15334, _mm512_set1_ps(8e+00f), tmp15341);
__m512 tmp15360 = _mm512_fmadd_ps(tmp15354, _mm512_set1_ps(8e+00f), tmp15361);
__m512 tmp15346 = _mm512_fmadd_ps(tmp15338, _mm512_set1_ps(3.2e+01f), tmp15347);
__m512 tmp15366 = _mm512_fmadd_ps(tmp15358, _mm512_set1_ps(3.2e+01f), tmp15367);
__m512 tmp15344 = _mm512_fmadd_ps(tmp15334, _mm512_set1_ps(2e+00f), tmp15345);
__m512 tmp15364 = _mm512_fmadd_ps(tmp15354, _mm512_set1_ps(2e+00f), tmp15365);
// Name the results y0..y5 of each group as output vectors.
__m512 out2027 = tmp15329;
__m512 out2033 = tmp15349;
__m512 out2028 = tmp15335;
__m512 out2034 = tmp15355;
__m512 out2029 = tmp15340;
__m512 out2035 = tmp15360;
__m512 out2030 = tmp15342;
__m512 out2036 = tmp15362;
__m512 out2031 = tmp15344;
__m512 out2037 = tmp15364;
__m512 out2032 = tmp15346;
__m512 out2038 = tmp15366;
// Clamp every element of the twelve output vectors to be non-negative:
// out = max(0, out), i.e. a ReLU.
out2027 = _mm512_max_ps(_mm512_setzero_ps(), out2027);
out2033 = _mm512_max_ps(_mm512_setzero_ps(), out2033);
out2028 = _mm512_max_ps(_mm512_setzero_ps(), out2028);
out2034 = _mm512_max_ps(_mm512_setzero_ps(), out2034);
out2029 = _mm512_max_ps(_mm512_setzero_ps(), out2029);
out2035 = _mm512_max_ps(_mm512_setzero_ps(), out2035);
out2030 = _mm512_max_ps(_mm512_setzero_ps(), out2030);
out2036 = _mm512_max_ps(_mm512_setzero_ps(), out2036);
out2031 = _mm512_max_ps(_mm512_setzero_ps(), out2031);
out2037 = _mm512_max_ps(_mm512_setzero_ps(), out2037);
out2032 = _mm512_max_ps(_mm512_setzero_ps(), out2032);
out2038 = _mm512_max_ps(_mm512_setzero_ps(), out2038);
// Write the twelve output vectors to datPtr24 with masked unaligned stores.
//   out2027..out2032: constant offset 3184 + 112*r (r = 0..5), mask 1023
//                     (elements 0..9 are written).
//   out2033..out2038: constant offset 3784 + 112*r (r = 0..5), mask 4095
//                     (elements 0..11 are written).
// The remaining address terms are identical for all stores and depend on i47,
// toH44, toW44, k136 and l55.
// NOTE(review): offsets are presumably in bytes — confirm datPtr24's declared type.
_mm512_mask_storeu_ps(datPtr24+3184+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 1023, out2027);
_mm512_mask_storeu_ps(datPtr24+3784+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2033);
_mm512_mask_storeu_ps(datPtr24+3296+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 1023, out2028);
_mm512_mask_storeu_ps(datPtr24+3896+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2034);
_mm512_mask_storeu_ps(datPtr24+3408+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 1023, out2029);
_mm512_mask_storeu_ps(datPtr24+4008+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2035);
_mm512_mask_storeu_ps(datPtr24+3520+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 1023, out2030);
_mm512_mask_storeu_ps(datPtr24+4120+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2036);
_mm512_mask_storeu_ps(datPtr24+3632+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 1023, out2031);
_mm512_mask_storeu_ps(datPtr24+4232+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2037);
_mm512_mask_storeu_ps(datPtr24+3744+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 1023, out2032);
_mm512_mask_storeu_ps(datPtr24+4344+401408*i47+112*toH44+4*toW44+12544*k136+6272*l55, 4095, out2038);
}
}
// Return once j40 has reached last10; otherwise advance j40 and set rel22 to 2,
// which satisfies the `rel22 < 3` test that follows this section.
if (j40 >= last10) return;
++j40;
rel22 = 2;
}
// Section executed when rel22 < 3.
if (rel22 < 3) {
// toH45 and toW45 feed the 112*toH45 and 4*toW45 terms of the store addresses
// used in this section.
ptrdiff_t toH45 = base22+12;
ptrdiff_t toW45 = 12;
// k137 starts at 32*w63 and the loop runs until k137 equals 32.
ptrdiff_t k137 = 32*w63;
for (; k137 != 32; ++k137) {
// Two l56 steps (0 and 1) per k137.
ptrdiff_t l56 = 0;
for (; l56 != 2; ++l56) {
// Load sixteen 16-float vectors from sfPtr11 and regroup them into the inputs
// in2199..in2214. Vectors are read in pairs whose constant offsets differ by 128.
// For each pair, _mm512_shuffle_f32x4 with immediate 68 joins the low 256 bits of
// both loads and immediate 238 joins the high 256 bits of both.
// Each constant base (0, 204800, 409600, 614400) yields two inputs of the first
// group (in2199..in2206) and, from offsets 64 higher, two inputs of the second
// group (in2207..in2214). All loads share the terms in i47, j40, k137 and l56.
// NOTE(review): offsets are presumably in bytes (64 = one 512-bit vector) —
// confirm sfPtr11's declared type.
__m512 sf1105 = _mm512_loadu_ps(sfPtr11+0+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1106 = _mm512_loadu_ps(sfPtr11+128+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2199 = _mm512_shuffle_f32x4(sf1105, sf1106, 68);
__m512 in2200 = _mm512_shuffle_f32x4(sf1105, sf1106, 238);
__m512 sf1107 = _mm512_loadu_ps(sfPtr11+64+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1108 = _mm512_loadu_ps(sfPtr11+192+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2207 = _mm512_shuffle_f32x4(sf1107, sf1108, 68);
__m512 in2208 = _mm512_shuffle_f32x4(sf1107, sf1108, 238);
__m512 sf1109 = _mm512_loadu_ps(sfPtr11+204800+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1110 = _mm512_loadu_ps(sfPtr11+204928+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2201 = _mm512_shuffle_f32x4(sf1109, sf1110, 68);
__m512 in2202 = _mm512_shuffle_f32x4(sf1109, sf1110, 238);
__m512 sf1111 = _mm512_loadu_ps(sfPtr11+204864+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1112 = _mm512_loadu_ps(sfPtr11+204992+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2209 = _mm512_shuffle_f32x4(sf1111, sf1112, 68);
__m512 in2210 = _mm512_shuffle_f32x4(sf1111, sf1112, 238);
__m512 sf1113 = _mm512_loadu_ps(sfPtr11+409600+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1114 = _mm512_loadu_ps(sfPtr11+409728+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2203 = _mm512_shuffle_f32x4(sf1113, sf1114, 68);
__m512 in2204 = _mm512_shuffle_f32x4(sf1113, sf1114, 238);
__m512 sf1115 = _mm512_loadu_ps(sfPtr11+409664+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1116 = _mm512_loadu_ps(sfPtr11+409792+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2211 = _mm512_shuffle_f32x4(sf1115, sf1116, 68);
__m512 in2212 = _mm512_shuffle_f32x4(sf1115, sf1116, 238);
__m512 sf1117 = _mm512_loadu_ps(sfPtr11+614400+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1118 = _mm512_loadu_ps(sfPtr11+614528+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2205 = _mm512_shuffle_f32x4(sf1117, sf1118, 68);
__m512 in2206 = _mm512_shuffle_f32x4(sf1117, sf1118, 238);
__m512 sf1119 = _mm512_loadu_ps(sfPtr11+614464+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1120 = _mm512_loadu_ps(sfPtr11+614592+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2213 = _mm512_shuffle_f32x4(sf1119, sf1120, 68);
__m512 in2214 = _mm512_shuffle_f32x4(sf1119, sf1120, 238);
// First pass: applies one 8-input -> 6-output linear combination, element-wise,
// to two groups of eight vectors (the two groups are interleaved line by line):
//   group A: x0..x7 = in2199..in2206 -> tmp15409..tmp15414
//   group B: x0..x7 = in2207..in2214 -> tmp15415..tmp15420
// With s1=x1+x2, s2=x3+x4, s3=x5+x6, d1=x1-x2, d2=x3-x4, d3=x5-x6:
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 = x7 + d1 + 32*d2 +     d3
// NOTE(review): these weights look like a Winograd-style output transform —
// confirm against the generator.
__m512 tmp15425 = _mm512_add_ps(in2200, in2201);
__m512 tmp15445 = _mm512_add_ps(in2208, in2209);
__m512 tmp15424 = _mm512_add_ps(in2202, in2203);
__m512 tmp15444 = _mm512_add_ps(in2210, in2211);
__m512 tmp15430 = _mm512_sub_ps(in2202, in2203);
__m512 tmp15450 = _mm512_sub_ps(in2210, in2211);
__m512 tmp15429 = _mm512_sub_ps(in2200, in2201);
__m512 tmp15449 = _mm512_sub_ps(in2208, in2209);
__m512 tmp15426 = _mm512_add_ps(in2204, in2205);
__m512 tmp15446 = _mm512_add_ps(in2212, in2213);
__m512 tmp15431 = _mm512_sub_ps(in2204, in2205);
__m512 tmp15451 = _mm512_sub_ps(in2212, in2213);
__m512 tmp15428 = _mm512_fmadd_ps(tmp15430, _mm512_set1_ps(2e+00f), tmp15429);
__m512 tmp15448 = _mm512_fmadd_ps(tmp15450, _mm512_set1_ps(2e+00f), tmp15449);
__m512 tmp15435 = _mm512_fmadd_ps(tmp15430, _mm512_set1_ps(8e+00f), tmp15429);
__m512 tmp15455 = _mm512_fmadd_ps(tmp15450, _mm512_set1_ps(8e+00f), tmp15449);
__m512 tmp15423 = _mm512_add_ps(tmp15424, tmp15425);
__m512 tmp15443 = _mm512_add_ps(tmp15444, tmp15445);
__m512 tmp15427 = _mm512_fmadd_ps(tmp15431, _mm512_set1_ps(1.6e+01f), tmp15428);
__m512 tmp15447 = _mm512_fmadd_ps(tmp15451, _mm512_set1_ps(1.6e+01f), tmp15448);
__m512 tmp15434 = _mm512_fmadd_ps(tmp15431, _mm512_set1_ps(4e+00f), tmp15435);
__m512 tmp15454 = _mm512_fmadd_ps(tmp15451, _mm512_set1_ps(4e+00f), tmp15455);
__m512 tmp15440 = _mm512_add_ps(tmp15431, tmp15429);
__m512 tmp15460 = _mm512_add_ps(tmp15451, tmp15449);
__m512 tmp15433 = _mm512_fmadd_ps(tmp15424, _mm512_set1_ps(4e+00f), tmp15425);
__m512 tmp15453 = _mm512_fmadd_ps(tmp15444, _mm512_set1_ps(4e+00f), tmp15445);
__m512 tmp15437 = _mm512_fmadd_ps(tmp15424, _mm512_set1_ps(1.6e+01f), tmp15425);
__m512 tmp15457 = _mm512_fmadd_ps(tmp15444, _mm512_set1_ps(1.6e+01f), tmp15445);
__m512 tmp15422 = _mm512_add_ps(tmp15423, in2199);
__m512 tmp15442 = _mm512_add_ps(tmp15443, in2207);
__m512 tmp15439 = _mm512_add_ps(tmp15440, in2206);
__m512 tmp15459 = _mm512_add_ps(tmp15460, in2214);
__m512 tmp15421 = _mm512_fmadd_ps(tmp15426, _mm512_set1_ps(3.2e+01f), tmp15422);
__m512 tmp15441 = _mm512_fmadd_ps(tmp15446, _mm512_set1_ps(3.2e+01f), tmp15442);
__m512 tmp15432 = _mm512_fmadd_ps(tmp15426, _mm512_set1_ps(8e+00f), tmp15433);
__m512 tmp15452 = _mm512_fmadd_ps(tmp15446, _mm512_set1_ps(8e+00f), tmp15453);
__m512 tmp15438 = _mm512_fmadd_ps(tmp15430, _mm512_set1_ps(3.2e+01f), tmp15439);
__m512 tmp15458 = _mm512_fmadd_ps(tmp15450, _mm512_set1_ps(3.2e+01f), tmp15459);
__m512 tmp15436 = _mm512_fmadd_ps(tmp15426, _mm512_set1_ps(2e+00f), tmp15437);
__m512 tmp15456 = _mm512_fmadd_ps(tmp15446, _mm512_set1_ps(2e+00f), tmp15457);
// Working copies of y0..y5 for each group; several are overwritten by the
// shuffle stage below.
__m512 tmp15409 = tmp15421;
__m512 tmp15415 = tmp15441;
__m512 tmp15410 = tmp15427;
__m512 tmp15416 = tmp15447;
__m512 tmp15411 = tmp15432;
__m512 tmp15417 = tmp15452;
__m512 tmp15412 = tmp15434;
__m512 tmp15418 = tmp15454;
__m512 tmp15413 = tmp15436;
__m512 tmp15419 = tmp15456;
__m512 tmp15414 = tmp15438;
__m512 tmp15420 = tmp15458;
// Transposes the twelve 16-float vectors tmp15409..tmp15420 (viewed as the rows
// of a 12x16 matrix) in four stages: unpacklo/unpackhi, shuffle_ps, and two
// rounds of shuffle_f32x4. Each result vector holds one column in its elements
// 0..11; elements 12..15 repeat elements 8..11 because rows 8..11 are shuffled
// with themselves in the third stage.
// Results: columns 0..7  -> tmp15409..tmp15416 (reassigned),
//          columns 8..11 -> tmp15417..tmp15420 (reassigned),
//          columns 12..15 -> tmp15461..tmp15464 (new).
// Stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp15505 = _mm512_unpacklo_ps(tmp15409, tmp15410);
__m512 tmp15506 = _mm512_unpackhi_ps(tmp15409, tmp15410);
__m512 tmp15507 = _mm512_unpacklo_ps(tmp15411, tmp15412);
__m512 tmp15508 = _mm512_unpackhi_ps(tmp15411, tmp15412);
__m512 tmp15509 = _mm512_unpacklo_ps(tmp15413, tmp15414);
__m512 tmp15510 = _mm512_unpackhi_ps(tmp15413, tmp15414);
__m512 tmp15511 = _mm512_unpacklo_ps(tmp15415, tmp15416);
__m512 tmp15512 = _mm512_unpackhi_ps(tmp15415, tmp15416);
__m512 tmp15513 = _mm512_unpacklo_ps(tmp15417, tmp15418);
__m512 tmp15514 = _mm512_unpackhi_ps(tmp15417, tmp15418);
__m512 tmp15515 = _mm512_unpacklo_ps(tmp15419, tmp15420);
__m512 tmp15516 = _mm512_unpackhi_ps(tmp15419, tmp15420);
// Stage 2: combine pairs so each 128-bit lane holds one column of four rows
// (immediate 68 picks the low two floats of each operand, 238 the high two).
__m512 tmp15517 = _mm512_shuffle_ps(tmp15505, tmp15507, 68);
__m512 tmp15518 = _mm512_shuffle_ps(tmp15505, tmp15507, 238);
__m512 tmp15519 = _mm512_shuffle_ps(tmp15506, tmp15508, 68);
__m512 tmp15520 = _mm512_shuffle_ps(tmp15506, tmp15508, 238);
__m512 tmp15521 = _mm512_shuffle_ps(tmp15509, tmp15511, 68);
__m512 tmp15522 = _mm512_shuffle_ps(tmp15509, tmp15511, 238);
__m512 tmp15523 = _mm512_shuffle_ps(tmp15510, tmp15512, 68);
__m512 tmp15524 = _mm512_shuffle_ps(tmp15510, tmp15512, 238);
__m512 tmp15525 = _mm512_shuffle_ps(tmp15513, tmp15515, 68);
__m512 tmp15526 = _mm512_shuffle_ps(tmp15513, tmp15515, 238);
__m512 tmp15527 = _mm512_shuffle_ps(tmp15514, tmp15516, 68);
__m512 tmp15528 = _mm512_shuffle_ps(tmp15514, tmp15516, 238);
// Stage 3: gather 128-bit lanes (136 selects lanes 0 and 2 of each operand,
// 221 selects lanes 1 and 3).
__m512 tmp15529 = _mm512_shuffle_f32x4(tmp15517, tmp15521, 136);
__m512 tmp15530 = _mm512_shuffle_f32x4(tmp15517, tmp15521, 221);
__m512 tmp15531 = _mm512_shuffle_f32x4(tmp15518, tmp15522, 136);
__m512 tmp15532 = _mm512_shuffle_f32x4(tmp15518, tmp15522, 221);
__m512 tmp15533 = _mm512_shuffle_f32x4(tmp15519, tmp15523, 136);
__m512 tmp15534 = _mm512_shuffle_f32x4(tmp15519, tmp15523, 221);
__m512 tmp15535 = _mm512_shuffle_f32x4(tmp15520, tmp15524, 136);
__m512 tmp15536 = _mm512_shuffle_f32x4(tmp15520, tmp15524, 221);
__m512 tmp15537 = _mm512_shuffle_f32x4(tmp15525, tmp15525, 136);
__m512 tmp15538 = _mm512_shuffle_f32x4(tmp15525, tmp15525, 221);
__m512 tmp15539 = _mm512_shuffle_f32x4(tmp15526, tmp15526, 136);
__m512 tmp15540 = _mm512_shuffle_f32x4(tmp15526, tmp15526, 221);
__m512 tmp15541 = _mm512_shuffle_f32x4(tmp15527, tmp15527, 136);
__m512 tmp15542 = _mm512_shuffle_f32x4(tmp15527, tmp15527, 221);
__m512 tmp15543 = _mm512_shuffle_f32x4(tmp15528, tmp15528, 136);
__m512 tmp15544 = _mm512_shuffle_f32x4(tmp15528, tmp15528, 221);
// Stage 4: final lane gather producing one column per vector.
tmp15409 = _mm512_shuffle_f32x4(tmp15529, tmp15537, 136);
tmp15417 = _mm512_shuffle_f32x4(tmp15529, tmp15537, 221);
tmp15410 = _mm512_shuffle_f32x4(tmp15531, tmp15539, 136);
tmp15418 = _mm512_shuffle_f32x4(tmp15531, tmp15539, 221);
tmp15411 = _mm512_shuffle_f32x4(tmp15533, tmp15541, 136);
tmp15419 = _mm512_shuffle_f32x4(tmp15533, tmp15541, 221);
tmp15412 = _mm512_shuffle_f32x4(tmp15535, tmp15543, 136);
tmp15420 = _mm512_shuffle_f32x4(tmp15535, tmp15543, 221);
tmp15413 = _mm512_shuffle_f32x4(tmp15530, tmp15538, 136);
__m512 tmp15461 = _mm512_shuffle_f32x4(tmp15530, tmp15538, 221);
tmp15414 = _mm512_shuffle_f32x4(tmp15532, tmp15540, 136);
__m512 tmp15462 = _mm512_shuffle_f32x4(tmp15532, tmp15540, 221);
tmp15415 = _mm512_shuffle_f32x4(tmp15534, tmp15542, 136);
__m512 tmp15463 = _mm512_shuffle_f32x4(tmp15534, tmp15542, 221);
tmp15416 = _mm512_shuffle_f32x4(tmp15536, tmp15544, 136);
__m512 tmp15464 = _mm512_shuffle_f32x4(tmp15536, tmp15544, 221);
// Second pass: applies one 8-input -> 6-output linear combination, element-wise,
// to two groups of eight vectors (the two groups are interleaved line by line):
//   group A: x0..x7 = tmp15409..tmp15416                    -> out2039..out2044
//   group B: x0..x7 = tmp15417..tmp15420, tmp15461..tmp15464 -> out2045..out2050
// With s1=x1+x2, s2=x3+x4, s3=x5+x6, d1=x1-x2, d2=x3-x4, d3=x5-x6:
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 = x7 + d1 + 32*d2 +     d3
// NOTE(review): these weights look like a Winograd-style output transform —
// confirm against the generator.
__m512 tmp15469 = _mm512_add_ps(tmp15410, tmp15411);
__m512 tmp15489 = _mm512_add_ps(tmp15418, tmp15419);
__m512 tmp15468 = _mm512_add_ps(tmp15412, tmp15413);
__m512 tmp15488 = _mm512_add_ps(tmp15420, tmp15461);
__m512 tmp15474 = _mm512_sub_ps(tmp15412, tmp15413);
__m512 tmp15494 = _mm512_sub_ps(tmp15420, tmp15461);
__m512 tmp15473 = _mm512_sub_ps(tmp15410, tmp15411);
__m512 tmp15493 = _mm512_sub_ps(tmp15418, tmp15419);
__m512 tmp15470 = _mm512_add_ps(tmp15414, tmp15415);
__m512 tmp15490 = _mm512_add_ps(tmp15462, tmp15463);
__m512 tmp15475 = _mm512_sub_ps(tmp15414, tmp15415);
__m512 tmp15495 = _mm512_sub_ps(tmp15462, tmp15463);
__m512 tmp15472 = _mm512_fmadd_ps(tmp15474, _mm512_set1_ps(2e+00f), tmp15473);
__m512 tmp15492 = _mm512_fmadd_ps(tmp15494, _mm512_set1_ps(2e+00f), tmp15493);
__m512 tmp15479 = _mm512_fmadd_ps(tmp15474, _mm512_set1_ps(8e+00f), tmp15473);
__m512 tmp15499 = _mm512_fmadd_ps(tmp15494, _mm512_set1_ps(8e+00f), tmp15493);
__m512 tmp15467 = _mm512_add_ps(tmp15468, tmp15469);
__m512 tmp15487 = _mm512_add_ps(tmp15488, tmp15489);
__m512 tmp15471 = _mm512_fmadd_ps(tmp15475, _mm512_set1_ps(1.6e+01f), tmp15472);
__m512 tmp15491 = _mm512_fmadd_ps(tmp15495, _mm512_set1_ps(1.6e+01f), tmp15492);
__m512 tmp15478 = _mm512_fmadd_ps(tmp15475, _mm512_set1_ps(4e+00f), tmp15479);
__m512 tmp15498 = _mm512_fmadd_ps(tmp15495, _mm512_set1_ps(4e+00f), tmp15499);
__m512 tmp15484 = _mm512_add_ps(tmp15475, tmp15473);
__m512 tmp15504 = _mm512_add_ps(tmp15495, tmp15493);
__m512 tmp15477 = _mm512_fmadd_ps(tmp15468, _mm512_set1_ps(4e+00f), tmp15469);
__m512 tmp15497 = _mm512_fmadd_ps(tmp15488, _mm512_set1_ps(4e+00f), tmp15489);
__m512 tmp15481 = _mm512_fmadd_ps(tmp15468, _mm512_set1_ps(1.6e+01f), tmp15469);
__m512 tmp15501 = _mm512_fmadd_ps(tmp15488, _mm512_set1_ps(1.6e+01f), tmp15489);
__m512 tmp15466 = _mm512_add_ps(tmp15467, tmp15409);
__m512 tmp15486 = _mm512_add_ps(tmp15487, tmp15417);
__m512 tmp15483 = _mm512_add_ps(tmp15484, tmp15416);
__m512 tmp15503 = _mm512_add_ps(tmp15504, tmp15464);
__m512 tmp15465 = _mm512_fmadd_ps(tmp15470, _mm512_set1_ps(3.2e+01f), tmp15466);
__m512 tmp15485 = _mm512_fmadd_ps(tmp15490, _mm512_set1_ps(3.2e+01f), tmp15486);
__m512 tmp15476 = _mm512_fmadd_ps(tmp15470, _mm512_set1_ps(8e+00f), tmp15477);
__m512 tmp15496 = _mm512_fmadd_ps(tmp15490, _mm512_set1_ps(8e+00f), tmp15497);
__m512 tmp15482 = _mm512_fmadd_ps(tmp15474, _mm512_set1_ps(3.2e+01f), tmp15483);
__m512 tmp15502 = _mm512_fmadd_ps(tmp15494, _mm512_set1_ps(3.2e+01f), tmp15503);
__m512 tmp15480 = _mm512_fmadd_ps(tmp15470, _mm512_set1_ps(2e+00f), tmp15481);
__m512 tmp15500 = _mm512_fmadd_ps(tmp15490, _mm512_set1_ps(2e+00f), tmp15501);
// Name the results y0..y5 of each group as output vectors.
__m512 out2039 = tmp15465;
__m512 out2045 = tmp15485;
__m512 out2040 = tmp15471;
__m512 out2046 = tmp15491;
__m512 out2041 = tmp15476;
__m512 out2047 = tmp15496;
__m512 out2042 = tmp15478;
__m512 out2048 = tmp15498;
__m512 out2043 = tmp15480;
__m512 out2049 = tmp15500;
__m512 out2044 = tmp15482;
__m512 out2050 = tmp15502;
// Clamp every element of the twelve output vectors to be non-negative:
// out = max(0, out), i.e. a ReLU.
out2039 = _mm512_max_ps(_mm512_setzero_ps(), out2039);
out2045 = _mm512_max_ps(_mm512_setzero_ps(), out2045);
out2040 = _mm512_max_ps(_mm512_setzero_ps(), out2040);
out2046 = _mm512_max_ps(_mm512_setzero_ps(), out2046);
out2041 = _mm512_max_ps(_mm512_setzero_ps(), out2041);
out2047 = _mm512_max_ps(_mm512_setzero_ps(), out2047);
out2042 = _mm512_max_ps(_mm512_setzero_ps(), out2042);
out2048 = _mm512_max_ps(_mm512_setzero_ps(), out2048);
out2043 = _mm512_max_ps(_mm512_setzero_ps(), out2043);
out2049 = _mm512_max_ps(_mm512_setzero_ps(), out2049);
out2044 = _mm512_max_ps(_mm512_setzero_ps(), out2044);
out2050 = _mm512_max_ps(_mm512_setzero_ps(), out2050);
// Write the twelve output vectors to datPtr24 with masked unaligned stores.
//   out2039..out2044: constant offset 0 + 112*r (r = 0..5), mask 4095
//                     (elements 0..11 are written).
//   out2045..out2050: each is stored twice —
//                     at 48 + 112*r with mask 15 (elements 0..3), and
//                     at 600 + 112*r with mask 4032 (elements 6..11).
//                     Elements 4 and 5 of these vectors are not written by
//                     either store.
// The remaining address terms are identical for all stores and depend on i47,
// toH45, toW45, k137 and l56.
// NOTE(review): offsets are presumably in bytes — confirm datPtr24's declared type.
_mm512_mask_storeu_ps(datPtr24+0+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2039);
_mm512_mask_storeu_ps(datPtr24+48+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 15, out2045);
_mm512_mask_storeu_ps(datPtr24+600+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4032, out2045);
_mm512_mask_storeu_ps(datPtr24+112+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2040);
_mm512_mask_storeu_ps(datPtr24+160+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 15, out2046);
_mm512_mask_storeu_ps(datPtr24+712+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4032, out2046);
_mm512_mask_storeu_ps(datPtr24+224+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2041);
_mm512_mask_storeu_ps(datPtr24+272+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 15, out2047);
_mm512_mask_storeu_ps(datPtr24+824+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4032, out2047);
_mm512_mask_storeu_ps(datPtr24+336+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2042);
_mm512_mask_storeu_ps(datPtr24+384+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 15, out2048);
_mm512_mask_storeu_ps(datPtr24+936+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4032, out2048);
_mm512_mask_storeu_ps(datPtr24+448+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2043);
_mm512_mask_storeu_ps(datPtr24+496+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 15, out2049);
_mm512_mask_storeu_ps(datPtr24+1048+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4032, out2049);
_mm512_mask_storeu_ps(datPtr24+560+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2044);
_mm512_mask_storeu_ps(datPtr24+608+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 15, out2050);
_mm512_mask_storeu_ps(datPtr24+1160+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4032, out2050);
// Load sixteen more 16-float vectors from sfPtr11 and regroup them into the
// inputs in2215..in2230. Vectors are read in pairs whose constant offsets differ
// by 128. For each pair, _mm512_shuffle_f32x4 with immediate 68 joins the low
// 256 bits of both loads and immediate 238 joins the high 256 bits of both.
// Each constant base (256, 205056, 409856, 614656) yields two inputs of the
// first group (in2215..in2222) and, from offsets 64 higher, two inputs of the
// second group (in2223..in2230). All loads share the terms in i47, j40, k137
// and l56.
// NOTE(review): offsets are presumably in bytes (64 = one 512-bit vector) —
// confirm sfPtr11's declared type.
__m512 sf1121 = _mm512_loadu_ps(sfPtr11+256+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1122 = _mm512_loadu_ps(sfPtr11+384+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2215 = _mm512_shuffle_f32x4(sf1121, sf1122, 68);
__m512 in2216 = _mm512_shuffle_f32x4(sf1121, sf1122, 238);
__m512 sf1123 = _mm512_loadu_ps(sfPtr11+320+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1124 = _mm512_loadu_ps(sfPtr11+448+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2223 = _mm512_shuffle_f32x4(sf1123, sf1124, 68);
__m512 in2224 = _mm512_shuffle_f32x4(sf1123, sf1124, 238);
__m512 sf1125 = _mm512_loadu_ps(sfPtr11+205056+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1126 = _mm512_loadu_ps(sfPtr11+205184+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2217 = _mm512_shuffle_f32x4(sf1125, sf1126, 68);
__m512 in2218 = _mm512_shuffle_f32x4(sf1125, sf1126, 238);
__m512 sf1127 = _mm512_loadu_ps(sfPtr11+205120+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1128 = _mm512_loadu_ps(sfPtr11+205248+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2225 = _mm512_shuffle_f32x4(sf1127, sf1128, 68);
__m512 in2226 = _mm512_shuffle_f32x4(sf1127, sf1128, 238);
__m512 sf1129 = _mm512_loadu_ps(sfPtr11+409856+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1130 = _mm512_loadu_ps(sfPtr11+409984+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2219 = _mm512_shuffle_f32x4(sf1129, sf1130, 68);
__m512 in2220 = _mm512_shuffle_f32x4(sf1129, sf1130, 238);
__m512 sf1131 = _mm512_loadu_ps(sfPtr11+409920+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1132 = _mm512_loadu_ps(sfPtr11+410048+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2227 = _mm512_shuffle_f32x4(sf1131, sf1132, 68);
__m512 in2228 = _mm512_shuffle_f32x4(sf1131, sf1132, 238);
__m512 sf1133 = _mm512_loadu_ps(sfPtr11+614656+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1134 = _mm512_loadu_ps(sfPtr11+614784+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2221 = _mm512_shuffle_f32x4(sf1133, sf1134, 68);
__m512 in2222 = _mm512_shuffle_f32x4(sf1133, sf1134, 238);
__m512 sf1135 = _mm512_loadu_ps(sfPtr11+614720+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1136 = _mm512_loadu_ps(sfPtr11+614848+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2229 = _mm512_shuffle_f32x4(sf1135, sf1136, 68);
__m512 in2230 = _mm512_shuffle_f32x4(sf1135, sf1136, 238);
// First pass: applies one 8-input -> 6-output linear combination, element-wise,
// to two groups of eight vectors (the two groups are interleaved line by line):
//   group A: x0..x7 = in2215..in2222 -> tmp15545..tmp15550
//   group B: x0..x7 = in2223..in2230 -> tmp15551..tmp15556
// With s1=x1+x2, s2=x3+x4, s3=x5+x6, d1=x1-x2, d2=x3-x4, d3=x5-x6:
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 = x7 + d1 + 32*d2 +     d3
// NOTE(review): these weights look like a Winograd-style output transform —
// confirm against the generator.
__m512 tmp15561 = _mm512_add_ps(in2216, in2217);
__m512 tmp15581 = _mm512_add_ps(in2224, in2225);
__m512 tmp15560 = _mm512_add_ps(in2218, in2219);
__m512 tmp15580 = _mm512_add_ps(in2226, in2227);
__m512 tmp15566 = _mm512_sub_ps(in2218, in2219);
__m512 tmp15586 = _mm512_sub_ps(in2226, in2227);
__m512 tmp15565 = _mm512_sub_ps(in2216, in2217);
__m512 tmp15585 = _mm512_sub_ps(in2224, in2225);
__m512 tmp15562 = _mm512_add_ps(in2220, in2221);
__m512 tmp15582 = _mm512_add_ps(in2228, in2229);
__m512 tmp15567 = _mm512_sub_ps(in2220, in2221);
__m512 tmp15587 = _mm512_sub_ps(in2228, in2229);
__m512 tmp15564 = _mm512_fmadd_ps(tmp15566, _mm512_set1_ps(2e+00f), tmp15565);
__m512 tmp15584 = _mm512_fmadd_ps(tmp15586, _mm512_set1_ps(2e+00f), tmp15585);
__m512 tmp15571 = _mm512_fmadd_ps(tmp15566, _mm512_set1_ps(8e+00f), tmp15565);
__m512 tmp15591 = _mm512_fmadd_ps(tmp15586, _mm512_set1_ps(8e+00f), tmp15585);
__m512 tmp15559 = _mm512_add_ps(tmp15560, tmp15561);
__m512 tmp15579 = _mm512_add_ps(tmp15580, tmp15581);
__m512 tmp15563 = _mm512_fmadd_ps(tmp15567, _mm512_set1_ps(1.6e+01f), tmp15564);
__m512 tmp15583 = _mm512_fmadd_ps(tmp15587, _mm512_set1_ps(1.6e+01f), tmp15584);
__m512 tmp15570 = _mm512_fmadd_ps(tmp15567, _mm512_set1_ps(4e+00f), tmp15571);
__m512 tmp15590 = _mm512_fmadd_ps(tmp15587, _mm512_set1_ps(4e+00f), tmp15591);
__m512 tmp15576 = _mm512_add_ps(tmp15567, tmp15565);
__m512 tmp15596 = _mm512_add_ps(tmp15587, tmp15585);
__m512 tmp15569 = _mm512_fmadd_ps(tmp15560, _mm512_set1_ps(4e+00f), tmp15561);
__m512 tmp15589 = _mm512_fmadd_ps(tmp15580, _mm512_set1_ps(4e+00f), tmp15581);
__m512 tmp15573 = _mm512_fmadd_ps(tmp15560, _mm512_set1_ps(1.6e+01f), tmp15561);
__m512 tmp15593 = _mm512_fmadd_ps(tmp15580, _mm512_set1_ps(1.6e+01f), tmp15581);
__m512 tmp15558 = _mm512_add_ps(tmp15559, in2215);
__m512 tmp15578 = _mm512_add_ps(tmp15579, in2223);
__m512 tmp15575 = _mm512_add_ps(tmp15576, in2222);
__m512 tmp15595 = _mm512_add_ps(tmp15596, in2230);
__m512 tmp15557 = _mm512_fmadd_ps(tmp15562, _mm512_set1_ps(3.2e+01f), tmp15558);
__m512 tmp15577 = _mm512_fmadd_ps(tmp15582, _mm512_set1_ps(3.2e+01f), tmp15578);
__m512 tmp15568 = _mm512_fmadd_ps(tmp15562, _mm512_set1_ps(8e+00f), tmp15569);
__m512 tmp15588 = _mm512_fmadd_ps(tmp15582, _mm512_set1_ps(8e+00f), tmp15589);
__m512 tmp15574 = _mm512_fmadd_ps(tmp15566, _mm512_set1_ps(3.2e+01f), tmp15575);
__m512 tmp15594 = _mm512_fmadd_ps(tmp15586, _mm512_set1_ps(3.2e+01f), tmp15595);
__m512 tmp15572 = _mm512_fmadd_ps(tmp15562, _mm512_set1_ps(2e+00f), tmp15573);
__m512 tmp15592 = _mm512_fmadd_ps(tmp15582, _mm512_set1_ps(2e+00f), tmp15593);
// Working copies of y0..y5 for each group; several are overwritten by the
// shuffle stage below.
__m512 tmp15545 = tmp15557;
__m512 tmp15551 = tmp15577;
__m512 tmp15546 = tmp15563;
__m512 tmp15552 = tmp15583;
__m512 tmp15547 = tmp15568;
__m512 tmp15553 = tmp15588;
__m512 tmp15548 = tmp15570;
__m512 tmp15554 = tmp15590;
__m512 tmp15549 = tmp15572;
__m512 tmp15555 = tmp15592;
__m512 tmp15550 = tmp15574;
__m512 tmp15556 = tmp15594;
// Transposes the twelve 16-float vectors tmp15545..tmp15556 (viewed as the rows
// of a 12x16 matrix) in four stages: unpacklo/unpackhi, shuffle_ps, and two
// rounds of shuffle_f32x4. Each result vector holds one column in its elements
// 0..11; elements 12..15 repeat elements 8..11 because rows 8..11 are shuffled
// with themselves in the third stage.
// Results: columns 0..7  -> tmp15545..tmp15552 (reassigned),
//          columns 8..11 -> tmp15553..tmp15556 (reassigned),
//          columns 12..15 -> tmp15597..tmp15600 (new).
// Stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp15641 = _mm512_unpacklo_ps(tmp15545, tmp15546);
__m512 tmp15642 = _mm512_unpackhi_ps(tmp15545, tmp15546);
__m512 tmp15643 = _mm512_unpacklo_ps(tmp15547, tmp15548);
__m512 tmp15644 = _mm512_unpackhi_ps(tmp15547, tmp15548);
__m512 tmp15645 = _mm512_unpacklo_ps(tmp15549, tmp15550);
__m512 tmp15646 = _mm512_unpackhi_ps(tmp15549, tmp15550);
__m512 tmp15647 = _mm512_unpacklo_ps(tmp15551, tmp15552);
__m512 tmp15648 = _mm512_unpackhi_ps(tmp15551, tmp15552);
__m512 tmp15649 = _mm512_unpacklo_ps(tmp15553, tmp15554);
__m512 tmp15650 = _mm512_unpackhi_ps(tmp15553, tmp15554);
__m512 tmp15651 = _mm512_unpacklo_ps(tmp15555, tmp15556);
__m512 tmp15652 = _mm512_unpackhi_ps(tmp15555, tmp15556);
// Stage 2: combine pairs so each 128-bit lane holds one column of four rows
// (immediate 68 picks the low two floats of each operand, 238 the high two).
__m512 tmp15653 = _mm512_shuffle_ps(tmp15641, tmp15643, 68);
__m512 tmp15654 = _mm512_shuffle_ps(tmp15641, tmp15643, 238);
__m512 tmp15655 = _mm512_shuffle_ps(tmp15642, tmp15644, 68);
__m512 tmp15656 = _mm512_shuffle_ps(tmp15642, tmp15644, 238);
__m512 tmp15657 = _mm512_shuffle_ps(tmp15645, tmp15647, 68);
__m512 tmp15658 = _mm512_shuffle_ps(tmp15645, tmp15647, 238);
__m512 tmp15659 = _mm512_shuffle_ps(tmp15646, tmp15648, 68);
__m512 tmp15660 = _mm512_shuffle_ps(tmp15646, tmp15648, 238);
__m512 tmp15661 = _mm512_shuffle_ps(tmp15649, tmp15651, 68);
__m512 tmp15662 = _mm512_shuffle_ps(tmp15649, tmp15651, 238);
__m512 tmp15663 = _mm512_shuffle_ps(tmp15650, tmp15652, 68);
__m512 tmp15664 = _mm512_shuffle_ps(tmp15650, tmp15652, 238);
// Stage 3: gather 128-bit lanes (136 selects lanes 0 and 2 of each operand,
// 221 selects lanes 1 and 3).
__m512 tmp15665 = _mm512_shuffle_f32x4(tmp15653, tmp15657, 136);
__m512 tmp15666 = _mm512_shuffle_f32x4(tmp15653, tmp15657, 221);
__m512 tmp15667 = _mm512_shuffle_f32x4(tmp15654, tmp15658, 136);
__m512 tmp15668 = _mm512_shuffle_f32x4(tmp15654, tmp15658, 221);
__m512 tmp15669 = _mm512_shuffle_f32x4(tmp15655, tmp15659, 136);
__m512 tmp15670 = _mm512_shuffle_f32x4(tmp15655, tmp15659, 221);
__m512 tmp15671 = _mm512_shuffle_f32x4(tmp15656, tmp15660, 136);
__m512 tmp15672 = _mm512_shuffle_f32x4(tmp15656, tmp15660, 221);
__m512 tmp15673 = _mm512_shuffle_f32x4(tmp15661, tmp15661, 136);
__m512 tmp15674 = _mm512_shuffle_f32x4(tmp15661, tmp15661, 221);
__m512 tmp15675 = _mm512_shuffle_f32x4(tmp15662, tmp15662, 136);
__m512 tmp15676 = _mm512_shuffle_f32x4(tmp15662, tmp15662, 221);
__m512 tmp15677 = _mm512_shuffle_f32x4(tmp15663, tmp15663, 136);
__m512 tmp15678 = _mm512_shuffle_f32x4(tmp15663, tmp15663, 221);
__m512 tmp15679 = _mm512_shuffle_f32x4(tmp15664, tmp15664, 136);
__m512 tmp15680 = _mm512_shuffle_f32x4(tmp15664, tmp15664, 221);
// Stage 4: final lane gather producing one column per vector.
tmp15545 = _mm512_shuffle_f32x4(tmp15665, tmp15673, 136);
tmp15553 = _mm512_shuffle_f32x4(tmp15665, tmp15673, 221);
tmp15546 = _mm512_shuffle_f32x4(tmp15667, tmp15675, 136);
tmp15554 = _mm512_shuffle_f32x4(tmp15667, tmp15675, 221);
tmp15547 = _mm512_shuffle_f32x4(tmp15669, tmp15677, 136);
tmp15555 = _mm512_shuffle_f32x4(tmp15669, tmp15677, 221);
tmp15548 = _mm512_shuffle_f32x4(tmp15671, tmp15679, 136);
tmp15556 = _mm512_shuffle_f32x4(tmp15671, tmp15679, 221);
tmp15549 = _mm512_shuffle_f32x4(tmp15666, tmp15674, 136);
__m512 tmp15597 = _mm512_shuffle_f32x4(tmp15666, tmp15674, 221);
tmp15550 = _mm512_shuffle_f32x4(tmp15668, tmp15676, 136);
__m512 tmp15598 = _mm512_shuffle_f32x4(tmp15668, tmp15676, 221);
tmp15551 = _mm512_shuffle_f32x4(tmp15670, tmp15678, 136);
__m512 tmp15599 = _mm512_shuffle_f32x4(tmp15670, tmp15678, 221);
tmp15552 = _mm512_shuffle_f32x4(tmp15672, tmp15680, 136);
__m512 tmp15600 = _mm512_shuffle_f32x4(tmp15672, tmp15680, 221);
__m512 tmp15605 = _mm512_add_ps(tmp15546, tmp15547);
__m512 tmp15625 = _mm512_add_ps(tmp15554, tmp15555);
__m512 tmp15604 = _mm512_add_ps(tmp15548, tmp15549);
__m512 tmp15624 = _mm512_add_ps(tmp15556, tmp15597);
__m512 tmp15610 = _mm512_sub_ps(tmp15548, tmp15549);
__m512 tmp15630 = _mm512_sub_ps(tmp15556, tmp15597);
__m512 tmp15609 = _mm512_sub_ps(tmp15546, tmp15547);
__m512 tmp15629 = _mm512_sub_ps(tmp15554, tmp15555);
__m512 tmp15606 = _mm512_add_ps(tmp15550, tmp15551);
__m512 tmp15626 = _mm512_add_ps(tmp15598, tmp15599);
__m512 tmp15611 = _mm512_sub_ps(tmp15550, tmp15551);
__m512 tmp15631 = _mm512_sub_ps(tmp15598, tmp15599);
__m512 tmp15608 = _mm512_fmadd_ps(tmp15610, _mm512_set1_ps(2e+00f), tmp15609);
__m512 tmp15628 = _mm512_fmadd_ps(tmp15630, _mm512_set1_ps(2e+00f), tmp15629);
__m512 tmp15615 = _mm512_fmadd_ps(tmp15610, _mm512_set1_ps(8e+00f), tmp15609);
__m512 tmp15635 = _mm512_fmadd_ps(tmp15630, _mm512_set1_ps(8e+00f), tmp15629);
__m512 tmp15603 = _mm512_add_ps(tmp15604, tmp15605);
__m512 tmp15623 = _mm512_add_ps(tmp15624, tmp15625);
__m512 tmp15607 = _mm512_fmadd_ps(tmp15611, _mm512_set1_ps(1.6e+01f), tmp15608);
__m512 tmp15627 = _mm512_fmadd_ps(tmp15631, _mm512_set1_ps(1.6e+01f), tmp15628);
__m512 tmp15614 = _mm512_fmadd_ps(tmp15611, _mm512_set1_ps(4e+00f), tmp15615);
__m512 tmp15634 = _mm512_fmadd_ps(tmp15631, _mm512_set1_ps(4e+00f), tmp15635);
__m512 tmp15620 = _mm512_add_ps(tmp15611, tmp15609);
__m512 tmp15640 = _mm512_add_ps(tmp15631, tmp15629);
__m512 tmp15613 = _mm512_fmadd_ps(tmp15604, _mm512_set1_ps(4e+00f), tmp15605);
__m512 tmp15633 = _mm512_fmadd_ps(tmp15624, _mm512_set1_ps(4e+00f), tmp15625);
__m512 tmp15617 = _mm512_fmadd_ps(tmp15604, _mm512_set1_ps(1.6e+01f), tmp15605);
__m512 tmp15637 = _mm512_fmadd_ps(tmp15624, _mm512_set1_ps(1.6e+01f), tmp15625);
__m512 tmp15602 = _mm512_add_ps(tmp15603, tmp15545);
__m512 tmp15622 = _mm512_add_ps(tmp15623, tmp15553);
__m512 tmp15619 = _mm512_add_ps(tmp15620, tmp15552);
__m512 tmp15639 = _mm512_add_ps(tmp15640, tmp15600);
__m512 tmp15601 = _mm512_fmadd_ps(tmp15606, _mm512_set1_ps(3.2e+01f), tmp15602);
__m512 tmp15621 = _mm512_fmadd_ps(tmp15626, _mm512_set1_ps(3.2e+01f), tmp15622);
__m512 tmp15612 = _mm512_fmadd_ps(tmp15606, _mm512_set1_ps(8e+00f), tmp15613);
__m512 tmp15632 = _mm512_fmadd_ps(tmp15626, _mm512_set1_ps(8e+00f), tmp15633);
__m512 tmp15618 = _mm512_fmadd_ps(tmp15610, _mm512_set1_ps(3.2e+01f), tmp15619);
__m512 tmp15638 = _mm512_fmadd_ps(tmp15630, _mm512_set1_ps(3.2e+01f), tmp15639);
__m512 tmp15616 = _mm512_fmadd_ps(tmp15606, _mm512_set1_ps(2e+00f), tmp15617);
__m512 tmp15636 = _mm512_fmadd_ps(tmp15626, _mm512_set1_ps(2e+00f), tmp15637);
// Give the twelve result vectors of the preceding arithmetic pass their
// output names: out2051..out2056 take the first series of results,
// out2057..out2062 the second series.
__m512 out2051 = tmp15601;
__m512 out2057 = tmp15621;
__m512 out2052 = tmp15607;
__m512 out2058 = tmp15627;
__m512 out2053 = tmp15612;
__m512 out2059 = tmp15632;
__m512 out2054 = tmp15614;
__m512 out2060 = tmp15634;
__m512 out2055 = tmp15616;
__m512 out2061 = tmp15636;
__m512 out2056 = tmp15618;
__m512 out2062 = tmp15638;
// Element-wise max against zero: every negative lane becomes 0 (ReLU).
out2051 = _mm512_max_ps(_mm512_setzero_ps(), out2051);
out2057 = _mm512_max_ps(_mm512_setzero_ps(), out2057);
out2052 = _mm512_max_ps(_mm512_setzero_ps(), out2052);
out2058 = _mm512_max_ps(_mm512_setzero_ps(), out2058);
out2053 = _mm512_max_ps(_mm512_setzero_ps(), out2053);
out2059 = _mm512_max_ps(_mm512_setzero_ps(), out2059);
out2054 = _mm512_max_ps(_mm512_setzero_ps(), out2054);
out2060 = _mm512_max_ps(_mm512_setzero_ps(), out2060);
out2055 = _mm512_max_ps(_mm512_setzero_ps(), out2055);
out2061 = _mm512_max_ps(_mm512_setzero_ps(), out2061);
out2056 = _mm512_max_ps(_mm512_setzero_ps(), out2056);
out2062 = _mm512_max_ps(_mm512_setzero_ps(), out2062);
// Masked unaligned stores: mask 4095 (0x0FFF) writes lanes 0..11 only and
// leaves memory behind lanes 12..15 untouched.
// The stores alternate between two groups. out2051..out2056 use constant
// offsets 648, 760, ..., 1208 and out2057..out2062 use 3136, 3248, ..., 3696;
// inside each group the constant advances by 112 per vector.
// NOTE(review): presumably datPtr24 is a byte pointer, 112 is one 28-float
// output row and 3136 is one 28x28 channel plane — confirm against the
// declaration of datPtr24, which is not visible here.
_mm512_mask_storeu_ps(datPtr24+648+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2051);
_mm512_mask_storeu_ps(datPtr24+3136+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2057);
_mm512_mask_storeu_ps(datPtr24+760+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2052);
_mm512_mask_storeu_ps(datPtr24+3248+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2058);
_mm512_mask_storeu_ps(datPtr24+872+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2053);
_mm512_mask_storeu_ps(datPtr24+3360+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2059);
_mm512_mask_storeu_ps(datPtr24+984+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2054);
_mm512_mask_storeu_ps(datPtr24+3472+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2060);
_mm512_mask_storeu_ps(datPtr24+1096+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2055);
_mm512_mask_storeu_ps(datPtr24+3584+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2061);
_mm512_mask_storeu_ps(datPtr24+1208+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2056);
_mm512_mask_storeu_ps(datPtr24+3696+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2062);
// Load sixteen vectors from sfPtr11 for the current (i47, j40, k137, l56)
// position and regroup their 128-bit lanes into the in2231..in2246 operands
// consumed by the arithmetic pass that follows.
// The loads come in four groups whose constant offsets start at 512, 205312,
// 410112 and 614912 (204800 apart); each group reads four vectors whose
// offsets are 64 apart.
// _mm512_shuffle_f32x4 immediates used here:
//   68  -> lanes 0,1 of the first operand followed by lanes 0,1 of the second
//   238 -> lanes 2,3 of the first operand followed by lanes 2,3 of the second
// In every group the first pair passes the higher-offset vector as the first
// operand (e.g. sf1138, sf1137) while the second pair passes them in load
// order (e.g. sf1139, sf1140).
// NOTE(review): this operand order differs from the later `rel22 < 4` tile
// and presumably reflects how this tile's columns are laid out in sfPtr11 —
// confirm against the code that writes sfPtr11.
__m512 sf1137 = _mm512_loadu_ps(sfPtr11+512+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1138 = _mm512_loadu_ps(sfPtr11+576+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2231 = _mm512_shuffle_f32x4(sf1138, sf1137, 68);
__m512 in2232 = _mm512_shuffle_f32x4(sf1138, sf1137, 238);
__m512 sf1139 = _mm512_loadu_ps(sfPtr11+640+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1140 = _mm512_loadu_ps(sfPtr11+704+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2239 = _mm512_shuffle_f32x4(sf1139, sf1140, 68);
__m512 in2240 = _mm512_shuffle_f32x4(sf1139, sf1140, 238);
// Second group (constant offsets 205312..205504).
__m512 sf1141 = _mm512_loadu_ps(sfPtr11+205312+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1142 = _mm512_loadu_ps(sfPtr11+205376+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2233 = _mm512_shuffle_f32x4(sf1142, sf1141, 68);
__m512 in2234 = _mm512_shuffle_f32x4(sf1142, sf1141, 238);
__m512 sf1143 = _mm512_loadu_ps(sfPtr11+205440+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1144 = _mm512_loadu_ps(sfPtr11+205504+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2241 = _mm512_shuffle_f32x4(sf1143, sf1144, 68);
__m512 in2242 = _mm512_shuffle_f32x4(sf1143, sf1144, 238);
// Third group (constant offsets 410112..410304).
__m512 sf1145 = _mm512_loadu_ps(sfPtr11+410112+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1146 = _mm512_loadu_ps(sfPtr11+410176+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2235 = _mm512_shuffle_f32x4(sf1146, sf1145, 68);
__m512 in2236 = _mm512_shuffle_f32x4(sf1146, sf1145, 238);
__m512 sf1147 = _mm512_loadu_ps(sfPtr11+410240+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1148 = _mm512_loadu_ps(sfPtr11+410304+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2243 = _mm512_shuffle_f32x4(sf1147, sf1148, 68);
__m512 in2244 = _mm512_shuffle_f32x4(sf1147, sf1148, 238);
// Fourth group (constant offsets 614912..615104).
__m512 sf1149 = _mm512_loadu_ps(sfPtr11+614912+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1150 = _mm512_loadu_ps(sfPtr11+614976+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2237 = _mm512_shuffle_f32x4(sf1150, sf1149, 68);
__m512 in2238 = _mm512_shuffle_f32x4(sf1150, sf1149, 238);
__m512 sf1151 = _mm512_loadu_ps(sfPtr11+615040+819200*i47+49152*j40+1536*k137+768*l56);
__m512 sf1152 = _mm512_loadu_ps(sfPtr11+615104+819200*i47+49152*j40+1536*k137+768*l56);
__m512 in2245 = _mm512_shuffle_f32x4(sf1151, sf1152, 68);
__m512 in2246 = _mm512_shuffle_f32x4(sf1151, sf1152, 238);
__m512 tmp15697 = _mm512_add_ps(in2232, in2233);
__m512 tmp15717 = _mm512_add_ps(in2240, in2241);
__m512 tmp15696 = _mm512_add_ps(in2234, in2235);
__m512 tmp15716 = _mm512_add_ps(in2242, in2243);
__m512 tmp15702 = _mm512_sub_ps(in2234, in2235);
__m512 tmp15722 = _mm512_sub_ps(in2242, in2243);
__m512 tmp15701 = _mm512_sub_ps(in2232, in2233);
__m512 tmp15721 = _mm512_sub_ps(in2240, in2241);
__m512 tmp15698 = _mm512_add_ps(in2236, in2237);
__m512 tmp15718 = _mm512_add_ps(in2244, in2245);
__m512 tmp15703 = _mm512_sub_ps(in2236, in2237);
__m512 tmp15723 = _mm512_sub_ps(in2244, in2245);
__m512 tmp15700 = _mm512_fmadd_ps(tmp15702, _mm512_set1_ps(2e+00f), tmp15701);
__m512 tmp15720 = _mm512_fmadd_ps(tmp15722, _mm512_set1_ps(2e+00f), tmp15721);
__m512 tmp15707 = _mm512_fmadd_ps(tmp15702, _mm512_set1_ps(8e+00f), tmp15701);
__m512 tmp15727 = _mm512_fmadd_ps(tmp15722, _mm512_set1_ps(8e+00f), tmp15721);
__m512 tmp15695 = _mm512_add_ps(tmp15696, tmp15697);
__m512 tmp15715 = _mm512_add_ps(tmp15716, tmp15717);
__m512 tmp15699 = _mm512_fmadd_ps(tmp15703, _mm512_set1_ps(1.6e+01f), tmp15700);
__m512 tmp15719 = _mm512_fmadd_ps(tmp15723, _mm512_set1_ps(1.6e+01f), tmp15720);
__m512 tmp15706 = _mm512_fmadd_ps(tmp15703, _mm512_set1_ps(4e+00f), tmp15707);
__m512 tmp15726 = _mm512_fmadd_ps(tmp15723, _mm512_set1_ps(4e+00f), tmp15727);
__m512 tmp15712 = _mm512_add_ps(tmp15703, tmp15701);
__m512 tmp15732 = _mm512_add_ps(tmp15723, tmp15721);
__m512 tmp15705 = _mm512_fmadd_ps(tmp15696, _mm512_set1_ps(4e+00f), tmp15697);
__m512 tmp15725 = _mm512_fmadd_ps(tmp15716, _mm512_set1_ps(4e+00f), tmp15717);
__m512 tmp15709 = _mm512_fmadd_ps(tmp15696, _mm512_set1_ps(1.6e+01f), tmp15697);
__m512 tmp15729 = _mm512_fmadd_ps(tmp15716, _mm512_set1_ps(1.6e+01f), tmp15717);
__m512 tmp15694 = _mm512_add_ps(tmp15695, in2231);
__m512 tmp15714 = _mm512_add_ps(tmp15715, in2239);
__m512 tmp15711 = _mm512_add_ps(tmp15712, in2238);
__m512 tmp15731 = _mm512_add_ps(tmp15732, in2246);
__m512 tmp15693 = _mm512_fmadd_ps(tmp15698, _mm512_set1_ps(3.2e+01f), tmp15694);
__m512 tmp15713 = _mm512_fmadd_ps(tmp15718, _mm512_set1_ps(3.2e+01f), tmp15714);
__m512 tmp15704 = _mm512_fmadd_ps(tmp15698, _mm512_set1_ps(8e+00f), tmp15705);
__m512 tmp15724 = _mm512_fmadd_ps(tmp15718, _mm512_set1_ps(8e+00f), tmp15725);
__m512 tmp15710 = _mm512_fmadd_ps(tmp15702, _mm512_set1_ps(3.2e+01f), tmp15711);
__m512 tmp15730 = _mm512_fmadd_ps(tmp15722, _mm512_set1_ps(3.2e+01f), tmp15731);
__m512 tmp15708 = _mm512_fmadd_ps(tmp15698, _mm512_set1_ps(2e+00f), tmp15709);
__m512 tmp15728 = _mm512_fmadd_ps(tmp15718, _mm512_set1_ps(2e+00f), tmp15729);
__m512 tmp15681 = tmp15693;
__m512 tmp15687 = tmp15713;
__m512 tmp15682 = tmp15699;
__m512 tmp15688 = tmp15719;
__m512 tmp15683 = tmp15704;
__m512 tmp15689 = tmp15724;
__m512 tmp15684 = tmp15706;
__m512 tmp15690 = tmp15726;
__m512 tmp15685 = tmp15708;
__m512 tmp15691 = tmp15728;
__m512 tmp15686 = tmp15710;
__m512 tmp15692 = tmp15730;
// Shuffle network over the twelve vectors tmp15681..tmp15692. It rewrites
// those twelve names and additionally produces tmp15733..tmp15736, i.e.
// sixteen result vectors in total.
// NOTE(review): the unpack / shuffle_ps / shuffle_f32x4 cascade looks like a
// register-block transpose (12 rows x 16 lanes -> 16 rows x 12 used lanes) —
// confirm against the generator.
//
// Stage 1: interleave 32-bit elements of adjacent vector pairs within each
// 128-bit lane (low halves via unpacklo, high halves via unpackhi).
__m512 tmp15777 = _mm512_unpacklo_ps(tmp15681, tmp15682);
__m512 tmp15778 = _mm512_unpackhi_ps(tmp15681, tmp15682);
__m512 tmp15779 = _mm512_unpacklo_ps(tmp15683, tmp15684);
__m512 tmp15780 = _mm512_unpackhi_ps(tmp15683, tmp15684);
__m512 tmp15781 = _mm512_unpacklo_ps(tmp15685, tmp15686);
__m512 tmp15782 = _mm512_unpackhi_ps(tmp15685, tmp15686);
__m512 tmp15783 = _mm512_unpacklo_ps(tmp15687, tmp15688);
__m512 tmp15784 = _mm512_unpackhi_ps(tmp15687, tmp15688);
__m512 tmp15785 = _mm512_unpacklo_ps(tmp15689, tmp15690);
__m512 tmp15786 = _mm512_unpackhi_ps(tmp15689, tmp15690);
__m512 tmp15787 = _mm512_unpacklo_ps(tmp15691, tmp15692);
__m512 tmp15788 = _mm512_unpackhi_ps(tmp15691, tmp15692);
// Stage 2: within each 128-bit lane, immediate 68 takes elements 0,1 of both
// operands and immediate 238 takes elements 2,3 of both operands.
__m512 tmp15789 = _mm512_shuffle_ps(tmp15777, tmp15779, 68);
__m512 tmp15790 = _mm512_shuffle_ps(tmp15777, tmp15779, 238);
__m512 tmp15791 = _mm512_shuffle_ps(tmp15778, tmp15780, 68);
__m512 tmp15792 = _mm512_shuffle_ps(tmp15778, tmp15780, 238);
__m512 tmp15793 = _mm512_shuffle_ps(tmp15781, tmp15783, 68);
__m512 tmp15794 = _mm512_shuffle_ps(tmp15781, tmp15783, 238);
__m512 tmp15795 = _mm512_shuffle_ps(tmp15782, tmp15784, 68);
__m512 tmp15796 = _mm512_shuffle_ps(tmp15782, tmp15784, 238);
__m512 tmp15797 = _mm512_shuffle_ps(tmp15785, tmp15787, 68);
__m512 tmp15798 = _mm512_shuffle_ps(tmp15785, tmp15787, 238);
__m512 tmp15799 = _mm512_shuffle_ps(tmp15786, tmp15788, 68);
__m512 tmp15800 = _mm512_shuffle_ps(tmp15786, tmp15788, 238);
// Stage 3: regroup whole 128-bit lanes. Immediate 136 selects lanes 0 and 2
// of each operand, immediate 221 selects lanes 1 and 3 of each operand.
__m512 tmp15801 = _mm512_shuffle_f32x4(tmp15789, tmp15793, 136);
__m512 tmp15802 = _mm512_shuffle_f32x4(tmp15789, tmp15793, 221);
__m512 tmp15803 = _mm512_shuffle_f32x4(tmp15790, tmp15794, 136);
__m512 tmp15804 = _mm512_shuffle_f32x4(tmp15790, tmp15794, 221);
__m512 tmp15805 = _mm512_shuffle_f32x4(tmp15791, tmp15795, 136);
__m512 tmp15806 = _mm512_shuffle_f32x4(tmp15791, tmp15795, 221);
__m512 tmp15807 = _mm512_shuffle_f32x4(tmp15792, tmp15796, 136);
__m512 tmp15808 = _mm512_shuffle_f32x4(tmp15792, tmp15796, 221);
// The last four stage-2 vectors have no partner (only twelve inputs), so
// each one is passed as both operands of the lane shuffle.
__m512 tmp15809 = _mm512_shuffle_f32x4(tmp15797, tmp15797, 136);
__m512 tmp15810 = _mm512_shuffle_f32x4(tmp15797, tmp15797, 221);
__m512 tmp15811 = _mm512_shuffle_f32x4(tmp15798, tmp15798, 136);
__m512 tmp15812 = _mm512_shuffle_f32x4(tmp15798, tmp15798, 221);
__m512 tmp15813 = _mm512_shuffle_f32x4(tmp15799, tmp15799, 136);
__m512 tmp15814 = _mm512_shuffle_f32x4(tmp15799, tmp15799, 221);
__m512 tmp15815 = _mm512_shuffle_f32x4(tmp15800, tmp15800, 136);
__m512 tmp15816 = _mm512_shuffle_f32x4(tmp15800, tmp15800, 221);
// Stage 4: final lane regrouping. The twelve original names are overwritten
// in place and four new vectors (tmp15733..tmp15736) are declared for the
// remaining results.
tmp15681 = _mm512_shuffle_f32x4(tmp15801, tmp15809, 136);
tmp15689 = _mm512_shuffle_f32x4(tmp15801, tmp15809, 221);
tmp15682 = _mm512_shuffle_f32x4(tmp15803, tmp15811, 136);
tmp15690 = _mm512_shuffle_f32x4(tmp15803, tmp15811, 221);
tmp15683 = _mm512_shuffle_f32x4(tmp15805, tmp15813, 136);
tmp15691 = _mm512_shuffle_f32x4(tmp15805, tmp15813, 221);
tmp15684 = _mm512_shuffle_f32x4(tmp15807, tmp15815, 136);
tmp15692 = _mm512_shuffle_f32x4(tmp15807, tmp15815, 221);
tmp15685 = _mm512_shuffle_f32x4(tmp15802, tmp15810, 136);
__m512 tmp15733 = _mm512_shuffle_f32x4(tmp15802, tmp15810, 221);
tmp15686 = _mm512_shuffle_f32x4(tmp15804, tmp15812, 136);
__m512 tmp15734 = _mm512_shuffle_f32x4(tmp15804, tmp15812, 221);
tmp15687 = _mm512_shuffle_f32x4(tmp15806, tmp15814, 136);
__m512 tmp15735 = _mm512_shuffle_f32x4(tmp15806, tmp15814, 221);
tmp15688 = _mm512_shuffle_f32x4(tmp15808, tmp15816, 136);
__m512 tmp15736 = _mm512_shuffle_f32x4(tmp15808, tmp15816, 221);
// Second arithmetic pass: reduce two series of eight vectors to six vectors
// each. Statements for the two series are interleaved line by line.
//   series A inputs a0..a7 = tmp15681..tmp15688
//   series B inputs b0..b7 = tmp15689..tmp15692, tmp15733..tmp15736
// With x0..x7 standing for either series, the six results are:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): these coefficients match the 8 -> 6 output transform of a
// Winograd F(6,3)-style convolution — presumably that is what this is;
// confirm against the generator.
//
// Pairwise sums and differences.
__m512 tmp15741 = _mm512_add_ps(tmp15682, tmp15683);
__m512 tmp15761 = _mm512_add_ps(tmp15690, tmp15691);
__m512 tmp15740 = _mm512_add_ps(tmp15684, tmp15685);
__m512 tmp15760 = _mm512_add_ps(tmp15692, tmp15733);
__m512 tmp15746 = _mm512_sub_ps(tmp15684, tmp15685);
__m512 tmp15766 = _mm512_sub_ps(tmp15692, tmp15733);
__m512 tmp15745 = _mm512_sub_ps(tmp15682, tmp15683);
__m512 tmp15765 = _mm512_sub_ps(tmp15690, tmp15691);
__m512 tmp15742 = _mm512_add_ps(tmp15686, tmp15687);
__m512 tmp15762 = _mm512_add_ps(tmp15734, tmp15735);
__m512 tmp15747 = _mm512_sub_ps(tmp15686, tmp15687);
__m512 tmp15767 = _mm512_sub_ps(tmp15734, tmp15735);
// Partial results for y1 (weight 2) and y3 (weight 8) from the differences.
__m512 tmp15744 = _mm512_fmadd_ps(tmp15746, _mm512_set1_ps(2e+00f), tmp15745);
__m512 tmp15764 = _mm512_fmadd_ps(tmp15766, _mm512_set1_ps(2e+00f), tmp15765);
__m512 tmp15751 = _mm512_fmadd_ps(tmp15746, _mm512_set1_ps(8e+00f), tmp15745);
__m512 tmp15771 = _mm512_fmadd_ps(tmp15766, _mm512_set1_ps(8e+00f), tmp15765);
// Partial result for y0: (x1+x2) + (x3+x4).
__m512 tmp15739 = _mm512_add_ps(tmp15740, tmp15741);
__m512 tmp15759 = _mm512_add_ps(tmp15760, tmp15761);
// y1 and y3 complete.
__m512 tmp15743 = _mm512_fmadd_ps(tmp15747, _mm512_set1_ps(1.6e+01f), tmp15744);
__m512 tmp15763 = _mm512_fmadd_ps(tmp15767, _mm512_set1_ps(1.6e+01f), tmp15764);
__m512 tmp15750 = _mm512_fmadd_ps(tmp15747, _mm512_set1_ps(4e+00f), tmp15751);
__m512 tmp15770 = _mm512_fmadd_ps(tmp15767, _mm512_set1_ps(4e+00f), tmp15771);
// Partial result for y5: (x5-x6) + (x1-x2).
__m512 tmp15756 = _mm512_add_ps(tmp15747, tmp15745);
__m512 tmp15776 = _mm512_add_ps(tmp15767, tmp15765);
// Partial results for y2 (weight 4) and y4 (weight 16) from the sums.
__m512 tmp15749 = _mm512_fmadd_ps(tmp15740, _mm512_set1_ps(4e+00f), tmp15741);
__m512 tmp15769 = _mm512_fmadd_ps(tmp15760, _mm512_set1_ps(4e+00f), tmp15761);
__m512 tmp15753 = _mm512_fmadd_ps(tmp15740, _mm512_set1_ps(1.6e+01f), tmp15741);
__m512 tmp15773 = _mm512_fmadd_ps(tmp15760, _mm512_set1_ps(1.6e+01f), tmp15761);
// Fold in the end vectors: x0 goes into y0, x7 goes into y5.
__m512 tmp15738 = _mm512_add_ps(tmp15739, tmp15681);
__m512 tmp15758 = _mm512_add_ps(tmp15759, tmp15689);
__m512 tmp15755 = _mm512_add_ps(tmp15756, tmp15688);
__m512 tmp15775 = _mm512_add_ps(tmp15776, tmp15736);
// y0, y2, y5 and y4 complete (in that statement order).
__m512 tmp15737 = _mm512_fmadd_ps(tmp15742, _mm512_set1_ps(3.2e+01f), tmp15738);
__m512 tmp15757 = _mm512_fmadd_ps(tmp15762, _mm512_set1_ps(3.2e+01f), tmp15758);
__m512 tmp15748 = _mm512_fmadd_ps(tmp15742, _mm512_set1_ps(8e+00f), tmp15749);
__m512 tmp15768 = _mm512_fmadd_ps(tmp15762, _mm512_set1_ps(8e+00f), tmp15769);
__m512 tmp15754 = _mm512_fmadd_ps(tmp15746, _mm512_set1_ps(3.2e+01f), tmp15755);
__m512 tmp15774 = _mm512_fmadd_ps(tmp15766, _mm512_set1_ps(3.2e+01f), tmp15775);
__m512 tmp15752 = _mm512_fmadd_ps(tmp15742, _mm512_set1_ps(2e+00f), tmp15753);
__m512 tmp15772 = _mm512_fmadd_ps(tmp15762, _mm512_set1_ps(2e+00f), tmp15773);
// Give the twelve result vectors their output names. Note the numbering:
// the first series of results becomes out2069..out2074 and the second
// series becomes out2063..out2068.
__m512 out2069 = tmp15737;
__m512 out2063 = tmp15757;
__m512 out2070 = tmp15743;
__m512 out2064 = tmp15763;
__m512 out2071 = tmp15748;
__m512 out2065 = tmp15768;
__m512 out2072 = tmp15750;
__m512 out2066 = tmp15770;
__m512 out2073 = tmp15752;
__m512 out2067 = tmp15772;
__m512 out2074 = tmp15754;
__m512 out2068 = tmp15774;
// Element-wise max against zero: every negative lane becomes 0 (ReLU).
out2069 = _mm512_max_ps(_mm512_setzero_ps(), out2069);
out2063 = _mm512_max_ps(_mm512_setzero_ps(), out2063);
out2070 = _mm512_max_ps(_mm512_setzero_ps(), out2070);
out2064 = _mm512_max_ps(_mm512_setzero_ps(), out2064);
out2071 = _mm512_max_ps(_mm512_setzero_ps(), out2071);
out2065 = _mm512_max_ps(_mm512_setzero_ps(), out2065);
out2072 = _mm512_max_ps(_mm512_setzero_ps(), out2072);
out2066 = _mm512_max_ps(_mm512_setzero_ps(), out2066);
out2073 = _mm512_max_ps(_mm512_setzero_ps(), out2073);
out2067 = _mm512_max_ps(_mm512_setzero_ps(), out2067);
out2074 = _mm512_max_ps(_mm512_setzero_ps(), out2074);
out2068 = _mm512_max_ps(_mm512_setzero_ps(), out2068);
// Masked unaligned stores, three per step:
//   * out2069..out2074 are written whole, lanes 0..11 (mask 4095 == 0x0FFF).
//   * out2063..out2068 are each written twice with disjoint masks to two
//     different addresses: lanes 0..3 (mask 15 == 0x000F) to the lower
//     address, lanes 6..11 (mask 4032 == 0x0FC0) to the higher one.
//     Lanes 4, 5 and 12..15 of those vectors are never stored.
// NOTE(review): the split looks like a tile that wraps from the end of one
// group of output rows to the start of another — confirm the intended
// layout against the generator.
_mm512_mask_storeu_ps(datPtr24+3760+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2069);
_mm512_mask_storeu_ps(datPtr24+3184+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 15, out2063);
_mm512_mask_storeu_ps(datPtr24+3784+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4032, out2063);
_mm512_mask_storeu_ps(datPtr24+3872+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2070);
_mm512_mask_storeu_ps(datPtr24+3296+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 15, out2064);
_mm512_mask_storeu_ps(datPtr24+3896+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4032, out2064);
_mm512_mask_storeu_ps(datPtr24+3984+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2071);
_mm512_mask_storeu_ps(datPtr24+3408+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 15, out2065);
_mm512_mask_storeu_ps(datPtr24+4008+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4032, out2065);
_mm512_mask_storeu_ps(datPtr24+4096+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2072);
_mm512_mask_storeu_ps(datPtr24+3520+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 15, out2066);
_mm512_mask_storeu_ps(datPtr24+4120+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4032, out2066);
_mm512_mask_storeu_ps(datPtr24+4208+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2073);
_mm512_mask_storeu_ps(datPtr24+3632+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 15, out2067);
_mm512_mask_storeu_ps(datPtr24+4232+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4032, out2067);
_mm512_mask_storeu_ps(datPtr24+4320+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4095, out2074);
_mm512_mask_storeu_ps(datPtr24+3744+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 15, out2068);
_mm512_mask_storeu_ps(datPtr24+4344+401408*i47+112*toH45+4*toW45+12544*k137+6272*l56, 4032, out2068);
}
}
// The tile above is finished. Leave the function once j40 has reached
// last10; otherwise advance j40 and set rel22 to 3, which satisfies the
// `rel22 < 4` test immediately below.
if (j40 >= last10) return;
++j40;
rel22 = 3;
}
// Next tile: fixed column origin 18, row origin 18 rows past base22.
if (rel22 < 4) {
ptrdiff_t toH46 = base22+18;
ptrdiff_t toW46 = 18;
// k138 runs from 32*w63 up to (not including) 32. The `!=` test only
// terminates when the start value is <= 32.
// NOTE(review): presumably w63 is 0 or 1 — verify where w63 is set.
ptrdiff_t k138 = 32*w63;
for (; k138 != 32; ++k138) {
// Two inner iterations per k138 (l57 = 0, 1).
ptrdiff_t l57 = 0;
for (; l57 != 2; ++l57) {
__m512 sf1153 = _mm512_loadu_ps(sfPtr11+0+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1154 = _mm512_loadu_ps(sfPtr11+128+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2247 = _mm512_shuffle_f32x4(sf1153, sf1154, 68);
__m512 in2248 = _mm512_shuffle_f32x4(sf1153, sf1154, 238);
__m512 sf1155 = _mm512_loadu_ps(sfPtr11+64+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1156 = _mm512_loadu_ps(sfPtr11+192+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2255 = _mm512_shuffle_f32x4(sf1155, sf1156, 68);
__m512 in2256 = _mm512_shuffle_f32x4(sf1155, sf1156, 238);
__m512 sf1157 = _mm512_loadu_ps(sfPtr11+204800+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1158 = _mm512_loadu_ps(sfPtr11+204928+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2249 = _mm512_shuffle_f32x4(sf1157, sf1158, 68);
__m512 in2250 = _mm512_shuffle_f32x4(sf1157, sf1158, 238);
__m512 sf1159 = _mm512_loadu_ps(sfPtr11+204864+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1160 = _mm512_loadu_ps(sfPtr11+204992+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2257 = _mm512_shuffle_f32x4(sf1159, sf1160, 68);
__m512 in2258 = _mm512_shuffle_f32x4(sf1159, sf1160, 238);
__m512 sf1161 = _mm512_loadu_ps(sfPtr11+409600+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1162 = _mm512_loadu_ps(sfPtr11+409728+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2251 = _mm512_shuffle_f32x4(sf1161, sf1162, 68);
__m512 in2252 = _mm512_shuffle_f32x4(sf1161, sf1162, 238);
__m512 sf1163 = _mm512_loadu_ps(sfPtr11+409664+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1164 = _mm512_loadu_ps(sfPtr11+409792+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2259 = _mm512_shuffle_f32x4(sf1163, sf1164, 68);
__m512 in2260 = _mm512_shuffle_f32x4(sf1163, sf1164, 238);
__m512 sf1165 = _mm512_loadu_ps(sfPtr11+614400+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1166 = _mm512_loadu_ps(sfPtr11+614528+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2253 = _mm512_shuffle_f32x4(sf1165, sf1166, 68);
__m512 in2254 = _mm512_shuffle_f32x4(sf1165, sf1166, 238);
__m512 sf1167 = _mm512_loadu_ps(sfPtr11+614464+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1168 = _mm512_loadu_ps(sfPtr11+614592+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2261 = _mm512_shuffle_f32x4(sf1167, sf1168, 68);
__m512 in2262 = _mm512_shuffle_f32x4(sf1167, sf1168, 238);
__m512 tmp15833 = _mm512_add_ps(in2248, in2249);
__m512 tmp15853 = _mm512_add_ps(in2256, in2257);
__m512 tmp15832 = _mm512_add_ps(in2250, in2251);
__m512 tmp15852 = _mm512_add_ps(in2258, in2259);
__m512 tmp15838 = _mm512_sub_ps(in2250, in2251);
__m512 tmp15858 = _mm512_sub_ps(in2258, in2259);
__m512 tmp15837 = _mm512_sub_ps(in2248, in2249);
__m512 tmp15857 = _mm512_sub_ps(in2256, in2257);
__m512 tmp15834 = _mm512_add_ps(in2252, in2253);
__m512 tmp15854 = _mm512_add_ps(in2260, in2261);
__m512 tmp15839 = _mm512_sub_ps(in2252, in2253);
__m512 tmp15859 = _mm512_sub_ps(in2260, in2261);
__m512 tmp15836 = _mm512_fmadd_ps(tmp15838, _mm512_set1_ps(2e+00f), tmp15837);
__m512 tmp15856 = _mm512_fmadd_ps(tmp15858, _mm512_set1_ps(2e+00f), tmp15857);
__m512 tmp15843 = _mm512_fmadd_ps(tmp15838, _mm512_set1_ps(8e+00f), tmp15837);
__m512 tmp15863 = _mm512_fmadd_ps(tmp15858, _mm512_set1_ps(8e+00f), tmp15857);
__m512 tmp15831 = _mm512_add_ps(tmp15832, tmp15833);
__m512 tmp15851 = _mm512_add_ps(tmp15852, tmp15853);
__m512 tmp15835 = _mm512_fmadd_ps(tmp15839, _mm512_set1_ps(1.6e+01f), tmp15836);
__m512 tmp15855 = _mm512_fmadd_ps(tmp15859, _mm512_set1_ps(1.6e+01f), tmp15856);
__m512 tmp15842 = _mm512_fmadd_ps(tmp15839, _mm512_set1_ps(4e+00f), tmp15843);
__m512 tmp15862 = _mm512_fmadd_ps(tmp15859, _mm512_set1_ps(4e+00f), tmp15863);
__m512 tmp15848 = _mm512_add_ps(tmp15839, tmp15837);
__m512 tmp15868 = _mm512_add_ps(tmp15859, tmp15857);
__m512 tmp15841 = _mm512_fmadd_ps(tmp15832, _mm512_set1_ps(4e+00f), tmp15833);
__m512 tmp15861 = _mm512_fmadd_ps(tmp15852, _mm512_set1_ps(4e+00f), tmp15853);
__m512 tmp15845 = _mm512_fmadd_ps(tmp15832, _mm512_set1_ps(1.6e+01f), tmp15833);
__m512 tmp15865 = _mm512_fmadd_ps(tmp15852, _mm512_set1_ps(1.6e+01f), tmp15853);
__m512 tmp15830 = _mm512_add_ps(tmp15831, in2247);
__m512 tmp15850 = _mm512_add_ps(tmp15851, in2255);
__m512 tmp15847 = _mm512_add_ps(tmp15848, in2254);
__m512 tmp15867 = _mm512_add_ps(tmp15868, in2262);
__m512 tmp15829 = _mm512_fmadd_ps(tmp15834, _mm512_set1_ps(3.2e+01f), tmp15830);
__m512 tmp15849 = _mm512_fmadd_ps(tmp15854, _mm512_set1_ps(3.2e+01f), tmp15850);
__m512 tmp15840 = _mm512_fmadd_ps(tmp15834, _mm512_set1_ps(8e+00f), tmp15841);
__m512 tmp15860 = _mm512_fmadd_ps(tmp15854, _mm512_set1_ps(8e+00f), tmp15861);
__m512 tmp15846 = _mm512_fmadd_ps(tmp15838, _mm512_set1_ps(3.2e+01f), tmp15847);
__m512 tmp15866 = _mm512_fmadd_ps(tmp15858, _mm512_set1_ps(3.2e+01f), tmp15867);
__m512 tmp15844 = _mm512_fmadd_ps(tmp15834, _mm512_set1_ps(2e+00f), tmp15845);
__m512 tmp15864 = _mm512_fmadd_ps(tmp15854, _mm512_set1_ps(2e+00f), tmp15865);
__m512 tmp15817 = tmp15829;
__m512 tmp15823 = tmp15849;
__m512 tmp15818 = tmp15835;
__m512 tmp15824 = tmp15855;
__m512 tmp15819 = tmp15840;
__m512 tmp15825 = tmp15860;
__m512 tmp15820 = tmp15842;
__m512 tmp15826 = tmp15862;
__m512 tmp15821 = tmp15844;
__m512 tmp15827 = tmp15864;
__m512 tmp15822 = tmp15846;
__m512 tmp15828 = tmp15866;
__m512 tmp15908 = _mm512_unpacklo_ps(tmp15817, tmp15818);
__m512 tmp15909 = _mm512_unpackhi_ps(tmp15817, tmp15818);
__m512 tmp15910 = _mm512_unpacklo_ps(tmp15819, tmp15820);
__m512 tmp15911 = _mm512_unpackhi_ps(tmp15819, tmp15820);
__m512 tmp15912 = _mm512_unpacklo_ps(tmp15821, tmp15822);
__m512 tmp15913 = _mm512_unpackhi_ps(tmp15821, tmp15822);
__m512 tmp15914 = _mm512_unpacklo_ps(tmp15823, tmp15824);
__m512 tmp15915 = _mm512_unpackhi_ps(tmp15823, tmp15824);
__m512 tmp15916 = _mm512_unpacklo_ps(tmp15825, tmp15826);
__m512 tmp15917 = _mm512_unpackhi_ps(tmp15825, tmp15826);
__m512 tmp15918 = _mm512_unpacklo_ps(tmp15827, tmp15828);
__m512 tmp15919 = _mm512_unpackhi_ps(tmp15827, tmp15828);
__m512 tmp15920 = _mm512_shuffle_ps(tmp15908, tmp15910, 68);
__m512 tmp15921 = _mm512_shuffle_ps(tmp15908, tmp15910, 238);
__m512 tmp15922 = _mm512_shuffle_ps(tmp15909, tmp15911, 68);
__m512 tmp15923 = _mm512_shuffle_ps(tmp15909, tmp15911, 238);
__m512 tmp15924 = _mm512_shuffle_ps(tmp15912, tmp15914, 68);
__m512 tmp15925 = _mm512_shuffle_ps(tmp15912, tmp15914, 238);
__m512 tmp15926 = _mm512_shuffle_ps(tmp15913, tmp15915, 68);
__m512 tmp15927 = _mm512_shuffle_ps(tmp15913, tmp15915, 238);
__m512 tmp15928 = _mm512_shuffle_ps(tmp15916, tmp15918, 68);
__m512 tmp15929 = _mm512_shuffle_ps(tmp15916, tmp15918, 238);
__m512 tmp15930 = _mm512_shuffle_ps(tmp15917, tmp15919, 68);
__m512 tmp15931 = _mm512_shuffle_ps(tmp15917, tmp15919, 238);
__m512 tmp15932 = _mm512_shuffle_f32x4(tmp15920, tmp15924, 136);
__m512 tmp15933 = _mm512_shuffle_f32x4(tmp15920, tmp15924, 221);
__m512 tmp15934 = _mm512_shuffle_f32x4(tmp15921, tmp15925, 136);
__m512 tmp15935 = _mm512_shuffle_f32x4(tmp15921, tmp15925, 221);
__m512 tmp15936 = _mm512_shuffle_f32x4(tmp15922, tmp15926, 136);
__m512 tmp15937 = _mm512_shuffle_f32x4(tmp15922, tmp15926, 221);
__m512 tmp15938 = _mm512_shuffle_f32x4(tmp15923, tmp15927, 136);
__m512 tmp15939 = _mm512_shuffle_f32x4(tmp15923, tmp15927, 221);
__m512 tmp15940 = _mm512_shuffle_f32x4(tmp15928, tmp15928, 136);
__m512 tmp15941 = _mm512_shuffle_f32x4(tmp15928, tmp15928, 221);
__m512 tmp15942 = _mm512_shuffle_f32x4(tmp15929, tmp15929, 136);
__m512 tmp15943 = _mm512_shuffle_f32x4(tmp15929, tmp15929, 221);
__m512 tmp15944 = _mm512_shuffle_f32x4(tmp15930, tmp15930, 136);
__m512 tmp15945 = _mm512_shuffle_f32x4(tmp15930, tmp15930, 221);
__m512 tmp15946 = _mm512_shuffle_f32x4(tmp15931, tmp15931, 136);
__m512 tmp15947 = _mm512_shuffle_f32x4(tmp15931, tmp15931, 221);
tmp15817 = _mm512_shuffle_f32x4(tmp15932, tmp15940, 136);
tmp15825 = _mm512_shuffle_f32x4(tmp15932, tmp15940, 221);
tmp15818 = _mm512_shuffle_f32x4(tmp15934, tmp15942, 136);
tmp15826 = _mm512_shuffle_f32x4(tmp15934, tmp15942, 221);
tmp15819 = _mm512_shuffle_f32x4(tmp15936, tmp15944, 136);
tmp15827 = _mm512_shuffle_f32x4(tmp15936, tmp15944, 221);
tmp15820 = _mm512_shuffle_f32x4(tmp15938, tmp15946, 136);
tmp15828 = _mm512_shuffle_f32x4(tmp15938, tmp15946, 221);
tmp15821 = _mm512_shuffle_f32x4(tmp15933, tmp15941, 136);
__m512 tmp15869 = _mm512_shuffle_f32x4(tmp15933, tmp15941, 221);
tmp15822 = _mm512_shuffle_f32x4(tmp15935, tmp15943, 136);
__m512 tmp15870 = _mm512_shuffle_f32x4(tmp15935, tmp15943, 221);
tmp15823 = _mm512_shuffle_f32x4(tmp15937, tmp15945, 136);
__m512 tmp15871 = _mm512_shuffle_f32x4(tmp15937, tmp15945, 221);
tmp15824 = _mm512_shuffle_f32x4(tmp15939, tmp15947, 136);
__m512 tmp15872 = _mm512_shuffle_f32x4(tmp15939, tmp15947, 221);
(void)tmp15872;
__m512 tmp15877 = _mm512_add_ps(tmp15818, tmp15819);
__m512 tmp15897 = _mm512_add_ps(tmp15826, tmp15827);
__m512 tmp15876 = _mm512_add_ps(tmp15820, tmp15821);
__m512 tmp15896 = _mm512_add_ps(tmp15828, tmp15869);
__m512 tmp15882 = _mm512_sub_ps(tmp15820, tmp15821);
__m512 tmp15902 = _mm512_sub_ps(tmp15828, tmp15869);
__m512 tmp15881 = _mm512_sub_ps(tmp15818, tmp15819);
__m512 tmp15901 = _mm512_sub_ps(tmp15826, tmp15827);
__m512 tmp15878 = _mm512_add_ps(tmp15822, tmp15823);
__m512 tmp15898 = _mm512_add_ps(tmp15870, tmp15871);
__m512 tmp15883 = _mm512_sub_ps(tmp15822, tmp15823);
__m512 tmp15903 = _mm512_sub_ps(tmp15870, tmp15871);
__m512 tmp15880 = _mm512_fmadd_ps(tmp15882, _mm512_set1_ps(2e+00f), tmp15881);
__m512 tmp15900 = _mm512_fmadd_ps(tmp15902, _mm512_set1_ps(2e+00f), tmp15901);
__m512 tmp15887 = _mm512_fmadd_ps(tmp15882, _mm512_set1_ps(8e+00f), tmp15881);
__m512 tmp15907 = _mm512_fmadd_ps(tmp15902, _mm512_set1_ps(8e+00f), tmp15901);
__m512 tmp15875 = _mm512_add_ps(tmp15876, tmp15877);
__m512 tmp15895 = _mm512_add_ps(tmp15896, tmp15897);
__m512 tmp15879 = _mm512_fmadd_ps(tmp15883, _mm512_set1_ps(1.6e+01f), tmp15880);
__m512 tmp15899 = _mm512_fmadd_ps(tmp15903, _mm512_set1_ps(1.6e+01f), tmp15900);
__m512 tmp15886 = _mm512_fmadd_ps(tmp15883, _mm512_set1_ps(4e+00f), tmp15887);
__m512 tmp15906 = _mm512_fmadd_ps(tmp15903, _mm512_set1_ps(4e+00f), tmp15907);
__m512 tmp15892 = _mm512_add_ps(tmp15883, tmp15881);
__m512 tmp15885 = _mm512_fmadd_ps(tmp15876, _mm512_set1_ps(4e+00f), tmp15877);
__m512 tmp15905 = _mm512_fmadd_ps(tmp15896, _mm512_set1_ps(4e+00f), tmp15897);
__m512 tmp15889 = _mm512_fmadd_ps(tmp15876, _mm512_set1_ps(1.6e+01f), tmp15877);
__m512 tmp15874 = _mm512_add_ps(tmp15875, tmp15817);
__m512 tmp15894 = _mm512_add_ps(tmp15895, tmp15825);
__m512 tmp15891 = _mm512_add_ps(tmp15892, tmp15824);
__m512 tmp15873 = _mm512_fmadd_ps(tmp15878, _mm512_set1_ps(3.2e+01f), tmp15874);
__m512 tmp15893 = _mm512_fmadd_ps(tmp15898, _mm512_set1_ps(3.2e+01f), tmp15894);
__m512 tmp15884 = _mm512_fmadd_ps(tmp15878, _mm512_set1_ps(8e+00f), tmp15885);
__m512 tmp15904 = _mm512_fmadd_ps(tmp15898, _mm512_set1_ps(8e+00f), tmp15905);
__m512 tmp15890 = _mm512_fmadd_ps(tmp15882, _mm512_set1_ps(3.2e+01f), tmp15891);
__m512 tmp15888 = _mm512_fmadd_ps(tmp15878, _mm512_set1_ps(2e+00f), tmp15889);
// Give the ten result vectors of this tile their output names. The first
// series contributes six vectors (out2075..out2080), the second series only
// four (out2081..out2084).
__m512 out2075 = tmp15873;
__m512 out2081 = tmp15893;
__m512 out2076 = tmp15879;
__m512 out2082 = tmp15899;
__m512 out2077 = tmp15884;
__m512 out2083 = tmp15904;
__m512 out2078 = tmp15886;
__m512 out2084 = tmp15906;
__m512 out2079 = tmp15888;
__m512 out2080 = tmp15890;
// Element-wise max against zero: every negative lane becomes 0 (ReLU).
out2075 = _mm512_max_ps(_mm512_setzero_ps(), out2075);
out2081 = _mm512_max_ps(_mm512_setzero_ps(), out2081);
out2076 = _mm512_max_ps(_mm512_setzero_ps(), out2076);
out2082 = _mm512_max_ps(_mm512_setzero_ps(), out2082);
out2077 = _mm512_max_ps(_mm512_setzero_ps(), out2077);
out2083 = _mm512_max_ps(_mm512_setzero_ps(), out2083);
out2078 = _mm512_max_ps(_mm512_setzero_ps(), out2078);
out2084 = _mm512_max_ps(_mm512_setzero_ps(), out2084);
out2079 = _mm512_max_ps(_mm512_setzero_ps(), out2079);
out2080 = _mm512_max_ps(_mm512_setzero_ps(), out2080);
// Masked unaligned stores:
//   * out2075..out2080 write lanes 0..9 only (mask 1023 == 0x03FF) at
//     constant offsets 0, 112, ..., 560.
//   * out2081..out2084 write lanes 0..11 (mask 4095 == 0x0FFF) at constant
//     offsets 600, 712, 824, 936.
// NOTE(review): with toW46 == 18, ten lanes presumably fill the remainder of
// a 28-float output row — confirm the row width against the tensor layout.
_mm512_mask_storeu_ps(datPtr24+0+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 1023, out2075);
_mm512_mask_storeu_ps(datPtr24+600+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2081);
_mm512_mask_storeu_ps(datPtr24+112+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 1023, out2076);
_mm512_mask_storeu_ps(datPtr24+712+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2082);
_mm512_mask_storeu_ps(datPtr24+224+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 1023, out2077);
_mm512_mask_storeu_ps(datPtr24+824+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2083);
_mm512_mask_storeu_ps(datPtr24+336+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 1023, out2078);
_mm512_mask_storeu_ps(datPtr24+936+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2084);
_mm512_mask_storeu_ps(datPtr24+448+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 1023, out2079);
_mm512_mask_storeu_ps(datPtr24+560+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 1023, out2080);
__m512 sf1169 = _mm512_loadu_ps(sfPtr11+256+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1170 = _mm512_loadu_ps(sfPtr11+384+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2263 = _mm512_shuffle_f32x4(sf1169, sf1170, 68);
__m512 in2264 = _mm512_shuffle_f32x4(sf1169, sf1170, 238);
__m512 sf1171 = _mm512_loadu_ps(sfPtr11+320+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1172 = _mm512_loadu_ps(sfPtr11+448+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2271 = _mm512_shuffle_f32x4(sf1171, sf1172, 68);
__m512 in2272 = _mm512_shuffle_f32x4(sf1171, sf1172, 238);
__m512 sf1173 = _mm512_loadu_ps(sfPtr11+205056+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1174 = _mm512_loadu_ps(sfPtr11+205184+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2265 = _mm512_shuffle_f32x4(sf1173, sf1174, 68);
__m512 in2266 = _mm512_shuffle_f32x4(sf1173, sf1174, 238);
__m512 sf1175 = _mm512_loadu_ps(sfPtr11+205120+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1176 = _mm512_loadu_ps(sfPtr11+205248+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2273 = _mm512_shuffle_f32x4(sf1175, sf1176, 68);
__m512 in2274 = _mm512_shuffle_f32x4(sf1175, sf1176, 238);
__m512 sf1177 = _mm512_loadu_ps(sfPtr11+409856+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1178 = _mm512_loadu_ps(sfPtr11+409984+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2267 = _mm512_shuffle_f32x4(sf1177, sf1178, 68);
__m512 in2268 = _mm512_shuffle_f32x4(sf1177, sf1178, 238);
__m512 sf1179 = _mm512_loadu_ps(sfPtr11+409920+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1180 = _mm512_loadu_ps(sfPtr11+410048+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2275 = _mm512_shuffle_f32x4(sf1179, sf1180, 68);
__m512 in2276 = _mm512_shuffle_f32x4(sf1179, sf1180, 238);
__m512 sf1181 = _mm512_loadu_ps(sfPtr11+614656+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1182 = _mm512_loadu_ps(sfPtr11+614784+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2269 = _mm512_shuffle_f32x4(sf1181, sf1182, 68);
__m512 in2270 = _mm512_shuffle_f32x4(sf1181, sf1182, 238);
__m512 sf1183 = _mm512_loadu_ps(sfPtr11+614720+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1184 = _mm512_loadu_ps(sfPtr11+614848+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2277 = _mm512_shuffle_f32x4(sf1183, sf1184, 68);
__m512 in2278 = _mm512_shuffle_f32x4(sf1183, sf1184, 238);
__m512 tmp15964 = _mm512_add_ps(in2264, in2265);
__m512 tmp15984 = _mm512_add_ps(in2272, in2273);
__m512 tmp15963 = _mm512_add_ps(in2266, in2267);
__m512 tmp15983 = _mm512_add_ps(in2274, in2275);
__m512 tmp15969 = _mm512_sub_ps(in2266, in2267);
__m512 tmp15989 = _mm512_sub_ps(in2274, in2275);
__m512 tmp15968 = _mm512_sub_ps(in2264, in2265);
__m512 tmp15988 = _mm512_sub_ps(in2272, in2273);
__m512 tmp15965 = _mm512_add_ps(in2268, in2269);
__m512 tmp15985 = _mm512_add_ps(in2276, in2277);
__m512 tmp15970 = _mm512_sub_ps(in2268, in2269);
__m512 tmp15990 = _mm512_sub_ps(in2276, in2277);
__m512 tmp15967 = _mm512_fmadd_ps(tmp15969, _mm512_set1_ps(2e+00f), tmp15968);
__m512 tmp15987 = _mm512_fmadd_ps(tmp15989, _mm512_set1_ps(2e+00f), tmp15988);
__m512 tmp15974 = _mm512_fmadd_ps(tmp15969, _mm512_set1_ps(8e+00f), tmp15968);
__m512 tmp15994 = _mm512_fmadd_ps(tmp15989, _mm512_set1_ps(8e+00f), tmp15988);
__m512 tmp15962 = _mm512_add_ps(tmp15963, tmp15964);
__m512 tmp15982 = _mm512_add_ps(tmp15983, tmp15984);
__m512 tmp15966 = _mm512_fmadd_ps(tmp15970, _mm512_set1_ps(1.6e+01f), tmp15967);
__m512 tmp15986 = _mm512_fmadd_ps(tmp15990, _mm512_set1_ps(1.6e+01f), tmp15987);
__m512 tmp15973 = _mm512_fmadd_ps(tmp15970, _mm512_set1_ps(4e+00f), tmp15974);
__m512 tmp15993 = _mm512_fmadd_ps(tmp15990, _mm512_set1_ps(4e+00f), tmp15994);
__m512 tmp15979 = _mm512_add_ps(tmp15970, tmp15968);
__m512 tmp15999 = _mm512_add_ps(tmp15990, tmp15988);
__m512 tmp15972 = _mm512_fmadd_ps(tmp15963, _mm512_set1_ps(4e+00f), tmp15964);
__m512 tmp15992 = _mm512_fmadd_ps(tmp15983, _mm512_set1_ps(4e+00f), tmp15984);
__m512 tmp15976 = _mm512_fmadd_ps(tmp15963, _mm512_set1_ps(1.6e+01f), tmp15964);
__m512 tmp15996 = _mm512_fmadd_ps(tmp15983, _mm512_set1_ps(1.6e+01f), tmp15984);
__m512 tmp15961 = _mm512_add_ps(tmp15962, in2263);
__m512 tmp15981 = _mm512_add_ps(tmp15982, in2271);
__m512 tmp15978 = _mm512_add_ps(tmp15979, in2270);
__m512 tmp15998 = _mm512_add_ps(tmp15999, in2278);
__m512 tmp15960 = _mm512_fmadd_ps(tmp15965, _mm512_set1_ps(3.2e+01f), tmp15961);
__m512 tmp15980 = _mm512_fmadd_ps(tmp15985, _mm512_set1_ps(3.2e+01f), tmp15981);
__m512 tmp15971 = _mm512_fmadd_ps(tmp15965, _mm512_set1_ps(8e+00f), tmp15972);
__m512 tmp15991 = _mm512_fmadd_ps(tmp15985, _mm512_set1_ps(8e+00f), tmp15992);
__m512 tmp15977 = _mm512_fmadd_ps(tmp15969, _mm512_set1_ps(3.2e+01f), tmp15978);
__m512 tmp15997 = _mm512_fmadd_ps(tmp15989, _mm512_set1_ps(3.2e+01f), tmp15998);
__m512 tmp15975 = _mm512_fmadd_ps(tmp15965, _mm512_set1_ps(2e+00f), tmp15976);
__m512 tmp15995 = _mm512_fmadd_ps(tmp15985, _mm512_set1_ps(2e+00f), tmp15996);
__m512 tmp15948 = tmp15960;
__m512 tmp15954 = tmp15980;
__m512 tmp15949 = tmp15966;
__m512 tmp15955 = tmp15986;
__m512 tmp15950 = tmp15971;
__m512 tmp15956 = tmp15991;
__m512 tmp15951 = tmp15973;
__m512 tmp15957 = tmp15993;
__m512 tmp15952 = tmp15975;
__m512 tmp15958 = tmp15995;
__m512 tmp15953 = tmp15977;
__m512 tmp15959 = tmp15997;
__m512 tmp16039 = _mm512_unpacklo_ps(tmp15948, tmp15949);
__m512 tmp16040 = _mm512_unpackhi_ps(tmp15948, tmp15949);
__m512 tmp16041 = _mm512_unpacklo_ps(tmp15950, tmp15951);
__m512 tmp16042 = _mm512_unpackhi_ps(tmp15950, tmp15951);
__m512 tmp16043 = _mm512_unpacklo_ps(tmp15952, tmp15953);
__m512 tmp16044 = _mm512_unpackhi_ps(tmp15952, tmp15953);
__m512 tmp16045 = _mm512_unpacklo_ps(tmp15954, tmp15955);
__m512 tmp16046 = _mm512_unpackhi_ps(tmp15954, tmp15955);
__m512 tmp16047 = _mm512_unpacklo_ps(tmp15956, tmp15957);
__m512 tmp16048 = _mm512_unpackhi_ps(tmp15956, tmp15957);
__m512 tmp16049 = _mm512_unpacklo_ps(tmp15958, tmp15959);
__m512 tmp16050 = _mm512_unpackhi_ps(tmp15958, tmp15959);
__m512 tmp16051 = _mm512_shuffle_ps(tmp16039, tmp16041, 68);
__m512 tmp16052 = _mm512_shuffle_ps(tmp16039, tmp16041, 238);
__m512 tmp16053 = _mm512_shuffle_ps(tmp16040, tmp16042, 68);
__m512 tmp16054 = _mm512_shuffle_ps(tmp16040, tmp16042, 238);
__m512 tmp16055 = _mm512_shuffle_ps(tmp16043, tmp16045, 68);
__m512 tmp16056 = _mm512_shuffle_ps(tmp16043, tmp16045, 238);
__m512 tmp16057 = _mm512_shuffle_ps(tmp16044, tmp16046, 68);
__m512 tmp16058 = _mm512_shuffle_ps(tmp16044, tmp16046, 238);
__m512 tmp16059 = _mm512_shuffle_ps(tmp16047, tmp16049, 68);
__m512 tmp16060 = _mm512_shuffle_ps(tmp16047, tmp16049, 238);
__m512 tmp16061 = _mm512_shuffle_ps(tmp16048, tmp16050, 68);
__m512 tmp16062 = _mm512_shuffle_ps(tmp16048, tmp16050, 238);
__m512 tmp16063 = _mm512_shuffle_f32x4(tmp16051, tmp16055, 136);
__m512 tmp16064 = _mm512_shuffle_f32x4(tmp16051, tmp16055, 221);
__m512 tmp16065 = _mm512_shuffle_f32x4(tmp16052, tmp16056, 136);
__m512 tmp16066 = _mm512_shuffle_f32x4(tmp16052, tmp16056, 221);
__m512 tmp16067 = _mm512_shuffle_f32x4(tmp16053, tmp16057, 136);
__m512 tmp16068 = _mm512_shuffle_f32x4(tmp16053, tmp16057, 221);
__m512 tmp16069 = _mm512_shuffle_f32x4(tmp16054, tmp16058, 136);
__m512 tmp16070 = _mm512_shuffle_f32x4(tmp16054, tmp16058, 221);
__m512 tmp16071 = _mm512_shuffle_f32x4(tmp16059, tmp16059, 136);
__m512 tmp16072 = _mm512_shuffle_f32x4(tmp16059, tmp16059, 221);
__m512 tmp16073 = _mm512_shuffle_f32x4(tmp16060, tmp16060, 136);
__m512 tmp16074 = _mm512_shuffle_f32x4(tmp16060, tmp16060, 221);
__m512 tmp16075 = _mm512_shuffle_f32x4(tmp16061, tmp16061, 136);
__m512 tmp16076 = _mm512_shuffle_f32x4(tmp16061, tmp16061, 221);
__m512 tmp16077 = _mm512_shuffle_f32x4(tmp16062, tmp16062, 136);
__m512 tmp16078 = _mm512_shuffle_f32x4(tmp16062, tmp16062, 221);
tmp15948 = _mm512_shuffle_f32x4(tmp16063, tmp16071, 136);
tmp15956 = _mm512_shuffle_f32x4(tmp16063, tmp16071, 221);
tmp15949 = _mm512_shuffle_f32x4(tmp16065, tmp16073, 136);
tmp15957 = _mm512_shuffle_f32x4(tmp16065, tmp16073, 221);
tmp15950 = _mm512_shuffle_f32x4(tmp16067, tmp16075, 136);
tmp15958 = _mm512_shuffle_f32x4(tmp16067, tmp16075, 221);
tmp15951 = _mm512_shuffle_f32x4(tmp16069, tmp16077, 136);
tmp15959 = _mm512_shuffle_f32x4(tmp16069, tmp16077, 221);
tmp15952 = _mm512_shuffle_f32x4(tmp16064, tmp16072, 136);
__m512 tmp16000 = _mm512_shuffle_f32x4(tmp16064, tmp16072, 221);
tmp15953 = _mm512_shuffle_f32x4(tmp16066, tmp16074, 136);
__m512 tmp16001 = _mm512_shuffle_f32x4(tmp16066, tmp16074, 221);
tmp15954 = _mm512_shuffle_f32x4(tmp16068, tmp16076, 136);
__m512 tmp16002 = _mm512_shuffle_f32x4(tmp16068, tmp16076, 221);
tmp15955 = _mm512_shuffle_f32x4(tmp16070, tmp16078, 136);
__m512 tmp16003 = _mm512_shuffle_f32x4(tmp16070, tmp16078, 221);
(void)tmp15955;
__m512 tmp16008 = _mm512_add_ps(tmp15949, tmp15950);
__m512 tmp16023 = _mm512_add_ps(tmp15957, tmp15958);
__m512 tmp16007 = _mm512_add_ps(tmp15951, tmp15952);
__m512 tmp16022 = _mm512_add_ps(tmp15959, tmp16000);
__m512 tmp16013 = _mm512_sub_ps(tmp15951, tmp15952);
__m512 tmp16028 = _mm512_sub_ps(tmp15959, tmp16000);
__m512 tmp16012 = _mm512_sub_ps(tmp15949, tmp15950);
__m512 tmp16027 = _mm512_sub_ps(tmp15957, tmp15958);
__m512 tmp16009 = _mm512_add_ps(tmp15953, tmp15954);
__m512 tmp16024 = _mm512_add_ps(tmp16001, tmp16002);
__m512 tmp16014 = _mm512_sub_ps(tmp15953, tmp15954);
__m512 tmp16029 = _mm512_sub_ps(tmp16001, tmp16002);
__m512 tmp16011 = _mm512_fmadd_ps(tmp16013, _mm512_set1_ps(2e+00f), tmp16012);
__m512 tmp16026 = _mm512_fmadd_ps(tmp16028, _mm512_set1_ps(2e+00f), tmp16027);
__m512 tmp16018 = _mm512_fmadd_ps(tmp16013, _mm512_set1_ps(8e+00f), tmp16012);
__m512 tmp16033 = _mm512_fmadd_ps(tmp16028, _mm512_set1_ps(8e+00f), tmp16027);
__m512 tmp16006 = _mm512_add_ps(tmp16007, tmp16008);
__m512 tmp16021 = _mm512_add_ps(tmp16022, tmp16023);
__m512 tmp16010 = _mm512_fmadd_ps(tmp16014, _mm512_set1_ps(1.6e+01f), tmp16011);
__m512 tmp16025 = _mm512_fmadd_ps(tmp16029, _mm512_set1_ps(1.6e+01f), tmp16026);
__m512 tmp16017 = _mm512_fmadd_ps(tmp16014, _mm512_set1_ps(4e+00f), tmp16018);
__m512 tmp16032 = _mm512_fmadd_ps(tmp16029, _mm512_set1_ps(4e+00f), tmp16033);
__m512 tmp16038 = _mm512_add_ps(tmp16029, tmp16027);
__m512 tmp16016 = _mm512_fmadd_ps(tmp16007, _mm512_set1_ps(4e+00f), tmp16008);
__m512 tmp16031 = _mm512_fmadd_ps(tmp16022, _mm512_set1_ps(4e+00f), tmp16023);
__m512 tmp16035 = _mm512_fmadd_ps(tmp16022, _mm512_set1_ps(1.6e+01f), tmp16023);
__m512 tmp16005 = _mm512_add_ps(tmp16006, tmp15948);
__m512 tmp16020 = _mm512_add_ps(tmp16021, tmp15956);
__m512 tmp16037 = _mm512_add_ps(tmp16038, tmp16003);
__m512 tmp16004 = _mm512_fmadd_ps(tmp16009, _mm512_set1_ps(3.2e+01f), tmp16005);
__m512 tmp16019 = _mm512_fmadd_ps(tmp16024, _mm512_set1_ps(3.2e+01f), tmp16020);
__m512 tmp16015 = _mm512_fmadd_ps(tmp16009, _mm512_set1_ps(8e+00f), tmp16016);
__m512 tmp16030 = _mm512_fmadd_ps(tmp16024, _mm512_set1_ps(8e+00f), tmp16031);
__m512 tmp16036 = _mm512_fmadd_ps(tmp16028, _mm512_set1_ps(3.2e+01f), tmp16037);
__m512 tmp16034 = _mm512_fmadd_ps(tmp16024, _mm512_set1_ps(2e+00f), tmp16035);
__m512 out2085 = tmp16004;
__m512 out2089 = tmp16019;
__m512 out2086 = tmp16010;
__m512 out2090 = tmp16025;
__m512 out2087 = tmp16015;
__m512 out2091 = tmp16030;
__m512 out2088 = tmp16017;
__m512 out2092 = tmp16032;
__m512 out2093 = tmp16034;
__m512 out2094 = tmp16036;
out2085 = _mm512_max_ps(_mm512_setzero_ps(), out2085);
out2089 = _mm512_max_ps(_mm512_setzero_ps(), out2089);
out2086 = _mm512_max_ps(_mm512_setzero_ps(), out2086);
out2090 = _mm512_max_ps(_mm512_setzero_ps(), out2090);
out2087 = _mm512_max_ps(_mm512_setzero_ps(), out2087);
out2091 = _mm512_max_ps(_mm512_setzero_ps(), out2091);
out2088 = _mm512_max_ps(_mm512_setzero_ps(), out2088);
out2092 = _mm512_max_ps(_mm512_setzero_ps(), out2092);
out2093 = _mm512_max_ps(_mm512_setzero_ps(), out2093);
out2094 = _mm512_max_ps(_mm512_setzero_ps(), out2094);
_mm512_mask_storeu_ps(datPtr24+648+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2085);
_mm512_mask_storeu_ps(datPtr24+3136+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 1023, out2089);
_mm512_mask_storeu_ps(datPtr24+760+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2086);
_mm512_mask_storeu_ps(datPtr24+3248+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 1023, out2090);
_mm512_mask_storeu_ps(datPtr24+872+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2087);
_mm512_mask_storeu_ps(datPtr24+3360+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 1023, out2091);
_mm512_mask_storeu_ps(datPtr24+984+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2088);
_mm512_mask_storeu_ps(datPtr24+3472+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 1023, out2092);
_mm512_mask_storeu_ps(datPtr24+3584+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 1023, out2093);
_mm512_mask_storeu_ps(datPtr24+3696+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 1023, out2094);
__m512 sf1185 = _mm512_loadu_ps(sfPtr11+512+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1186 = _mm512_loadu_ps(sfPtr11+640+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2279 = _mm512_shuffle_f32x4(sf1185, sf1186, 68);
__m512 in2280 = _mm512_shuffle_f32x4(sf1185, sf1186, 238);
__m512 sf1187 = _mm512_loadu_ps(sfPtr11+576+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1188 = _mm512_loadu_ps(sfPtr11+704+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2287 = _mm512_shuffle_f32x4(sf1187, sf1188, 68);
__m512 in2288 = _mm512_shuffle_f32x4(sf1187, sf1188, 238);
__m512 sf1189 = _mm512_loadu_ps(sfPtr11+205312+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1190 = _mm512_loadu_ps(sfPtr11+205440+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2281 = _mm512_shuffle_f32x4(sf1189, sf1190, 68);
__m512 in2282 = _mm512_shuffle_f32x4(sf1189, sf1190, 238);
__m512 sf1191 = _mm512_loadu_ps(sfPtr11+205376+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1192 = _mm512_loadu_ps(sfPtr11+205504+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2289 = _mm512_shuffle_f32x4(sf1191, sf1192, 68);
__m512 in2290 = _mm512_shuffle_f32x4(sf1191, sf1192, 238);
__m512 sf1193 = _mm512_loadu_ps(sfPtr11+410112+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1194 = _mm512_loadu_ps(sfPtr11+410240+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2283 = _mm512_shuffle_f32x4(sf1193, sf1194, 68);
__m512 in2284 = _mm512_shuffle_f32x4(sf1193, sf1194, 238);
__m512 sf1195 = _mm512_loadu_ps(sfPtr11+410176+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1196 = _mm512_loadu_ps(sfPtr11+410304+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2291 = _mm512_shuffle_f32x4(sf1195, sf1196, 68);
__m512 in2292 = _mm512_shuffle_f32x4(sf1195, sf1196, 238);
__m512 sf1197 = _mm512_loadu_ps(sfPtr11+614912+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1198 = _mm512_loadu_ps(sfPtr11+615040+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2285 = _mm512_shuffle_f32x4(sf1197, sf1198, 68);
__m512 in2286 = _mm512_shuffle_f32x4(sf1197, sf1198, 238);
__m512 sf1199 = _mm512_loadu_ps(sfPtr11+614976+819200*i47+49152*j40+1536*k138+768*l57);
__m512 sf1200 = _mm512_loadu_ps(sfPtr11+615104+819200*i47+49152*j40+1536*k138+768*l57);
__m512 in2293 = _mm512_shuffle_f32x4(sf1199, sf1200, 68);
__m512 in2294 = _mm512_shuffle_f32x4(sf1199, sf1200, 238);
__m512 tmp16095 = _mm512_add_ps(in2280, in2281);
__m512 tmp16115 = _mm512_add_ps(in2288, in2289);
__m512 tmp16094 = _mm512_add_ps(in2282, in2283);
__m512 tmp16114 = _mm512_add_ps(in2290, in2291);
__m512 tmp16100 = _mm512_sub_ps(in2282, in2283);
__m512 tmp16120 = _mm512_sub_ps(in2290, in2291);
__m512 tmp16099 = _mm512_sub_ps(in2280, in2281);
__m512 tmp16119 = _mm512_sub_ps(in2288, in2289);
__m512 tmp16096 = _mm512_add_ps(in2284, in2285);
__m512 tmp16116 = _mm512_add_ps(in2292, in2293);
__m512 tmp16101 = _mm512_sub_ps(in2284, in2285);
__m512 tmp16121 = _mm512_sub_ps(in2292, in2293);
__m512 tmp16098 = _mm512_fmadd_ps(tmp16100, _mm512_set1_ps(2e+00f), tmp16099);
__m512 tmp16118 = _mm512_fmadd_ps(tmp16120, _mm512_set1_ps(2e+00f), tmp16119);
__m512 tmp16105 = _mm512_fmadd_ps(tmp16100, _mm512_set1_ps(8e+00f), tmp16099);
__m512 tmp16125 = _mm512_fmadd_ps(tmp16120, _mm512_set1_ps(8e+00f), tmp16119);
__m512 tmp16093 = _mm512_add_ps(tmp16094, tmp16095);
__m512 tmp16113 = _mm512_add_ps(tmp16114, tmp16115);
__m512 tmp16097 = _mm512_fmadd_ps(tmp16101, _mm512_set1_ps(1.6e+01f), tmp16098);
__m512 tmp16117 = _mm512_fmadd_ps(tmp16121, _mm512_set1_ps(1.6e+01f), tmp16118);
__m512 tmp16104 = _mm512_fmadd_ps(tmp16101, _mm512_set1_ps(4e+00f), tmp16105);
__m512 tmp16124 = _mm512_fmadd_ps(tmp16121, _mm512_set1_ps(4e+00f), tmp16125);
__m512 tmp16110 = _mm512_add_ps(tmp16101, tmp16099);
__m512 tmp16130 = _mm512_add_ps(tmp16121, tmp16119);
__m512 tmp16103 = _mm512_fmadd_ps(tmp16094, _mm512_set1_ps(4e+00f), tmp16095);
__m512 tmp16123 = _mm512_fmadd_ps(tmp16114, _mm512_set1_ps(4e+00f), tmp16115);
__m512 tmp16107 = _mm512_fmadd_ps(tmp16094, _mm512_set1_ps(1.6e+01f), tmp16095);
__m512 tmp16127 = _mm512_fmadd_ps(tmp16114, _mm512_set1_ps(1.6e+01f), tmp16115);
__m512 tmp16092 = _mm512_add_ps(tmp16093, in2279);
__m512 tmp16112 = _mm512_add_ps(tmp16113, in2287);
__m512 tmp16109 = _mm512_add_ps(tmp16110, in2286);
__m512 tmp16129 = _mm512_add_ps(tmp16130, in2294);
__m512 tmp16091 = _mm512_fmadd_ps(tmp16096, _mm512_set1_ps(3.2e+01f), tmp16092);
__m512 tmp16111 = _mm512_fmadd_ps(tmp16116, _mm512_set1_ps(3.2e+01f), tmp16112);
__m512 tmp16102 = _mm512_fmadd_ps(tmp16096, _mm512_set1_ps(8e+00f), tmp16103);
__m512 tmp16122 = _mm512_fmadd_ps(tmp16116, _mm512_set1_ps(8e+00f), tmp16123);
__m512 tmp16108 = _mm512_fmadd_ps(tmp16100, _mm512_set1_ps(3.2e+01f), tmp16109);
__m512 tmp16128 = _mm512_fmadd_ps(tmp16120, _mm512_set1_ps(3.2e+01f), tmp16129);
__m512 tmp16106 = _mm512_fmadd_ps(tmp16096, _mm512_set1_ps(2e+00f), tmp16107);
__m512 tmp16126 = _mm512_fmadd_ps(tmp16116, _mm512_set1_ps(2e+00f), tmp16127);
__m512 tmp16079 = tmp16091;
__m512 tmp16085 = tmp16111;
__m512 tmp16080 = tmp16097;
__m512 tmp16086 = tmp16117;
__m512 tmp16081 = tmp16102;
__m512 tmp16087 = tmp16122;
__m512 tmp16082 = tmp16104;
__m512 tmp16088 = tmp16124;
__m512 tmp16083 = tmp16106;
__m512 tmp16089 = tmp16126;
__m512 tmp16084 = tmp16108;
__m512 tmp16090 = tmp16128;
__m512 tmp16165 = _mm512_unpacklo_ps(tmp16079, tmp16080);
__m512 tmp16166 = _mm512_unpackhi_ps(tmp16079, tmp16080);
__m512 tmp16167 = _mm512_unpacklo_ps(tmp16081, tmp16082);
__m512 tmp16168 = _mm512_unpackhi_ps(tmp16081, tmp16082);
__m512 tmp16169 = _mm512_unpacklo_ps(tmp16083, tmp16084);
__m512 tmp16170 = _mm512_unpackhi_ps(tmp16083, tmp16084);
__m512 tmp16171 = _mm512_unpacklo_ps(tmp16085, tmp16086);
__m512 tmp16172 = _mm512_unpackhi_ps(tmp16085, tmp16086);
__m512 tmp16173 = _mm512_unpacklo_ps(tmp16087, tmp16088);
__m512 tmp16174 = _mm512_unpackhi_ps(tmp16087, tmp16088);
__m512 tmp16175 = _mm512_unpacklo_ps(tmp16089, tmp16090);
__m512 tmp16176 = _mm512_unpackhi_ps(tmp16089, tmp16090);
__m512 tmp16177 = _mm512_shuffle_ps(tmp16165, tmp16167, 68);
__m512 tmp16178 = _mm512_shuffle_ps(tmp16165, tmp16167, 238);
__m512 tmp16179 = _mm512_shuffle_ps(tmp16166, tmp16168, 68);
__m512 tmp16180 = _mm512_shuffle_ps(tmp16166, tmp16168, 238);
__m512 tmp16181 = _mm512_shuffle_ps(tmp16169, tmp16171, 68);
__m512 tmp16182 = _mm512_shuffle_ps(tmp16169, tmp16171, 238);
__m512 tmp16183 = _mm512_shuffle_ps(tmp16170, tmp16172, 68);
__m512 tmp16184 = _mm512_shuffle_ps(tmp16170, tmp16172, 238);
__m512 tmp16185 = _mm512_shuffle_ps(tmp16173, tmp16175, 68);
__m512 tmp16186 = _mm512_shuffle_ps(tmp16173, tmp16175, 238);
__m512 tmp16187 = _mm512_shuffle_ps(tmp16174, tmp16176, 68);
__m512 tmp16188 = _mm512_shuffle_ps(tmp16174, tmp16176, 238);
__m512 tmp16189 = _mm512_shuffle_f32x4(tmp16177, tmp16181, 136);
__m512 tmp16190 = _mm512_shuffle_f32x4(tmp16177, tmp16181, 221);
__m512 tmp16191 = _mm512_shuffle_f32x4(tmp16178, tmp16182, 136);
__m512 tmp16192 = _mm512_shuffle_f32x4(tmp16178, tmp16182, 221);
__m512 tmp16193 = _mm512_shuffle_f32x4(tmp16179, tmp16183, 136);
__m512 tmp16194 = _mm512_shuffle_f32x4(tmp16179, tmp16183, 221);
__m512 tmp16195 = _mm512_shuffle_f32x4(tmp16180, tmp16184, 136);
__m512 tmp16196 = _mm512_shuffle_f32x4(tmp16180, tmp16184, 221);
__m512 tmp16197 = _mm512_shuffle_f32x4(tmp16185, tmp16185, 136);
__m512 tmp16198 = _mm512_shuffle_f32x4(tmp16185, tmp16185, 221);
__m512 tmp16199 = _mm512_shuffle_f32x4(tmp16186, tmp16186, 136);
__m512 tmp16200 = _mm512_shuffle_f32x4(tmp16186, tmp16186, 221);
__m512 tmp16201 = _mm512_shuffle_f32x4(tmp16187, tmp16187, 136);
__m512 tmp16202 = _mm512_shuffle_f32x4(tmp16187, tmp16187, 221);
__m512 tmp16203 = _mm512_shuffle_f32x4(tmp16188, tmp16188, 136);
__m512 tmp16204 = _mm512_shuffle_f32x4(tmp16188, tmp16188, 221);
tmp16079 = _mm512_shuffle_f32x4(tmp16189, tmp16197, 136);
tmp16087 = _mm512_shuffle_f32x4(tmp16189, tmp16197, 221);
tmp16080 = _mm512_shuffle_f32x4(tmp16191, tmp16199, 136);
tmp16088 = _mm512_shuffle_f32x4(tmp16191, tmp16199, 221);
tmp16081 = _mm512_shuffle_f32x4(tmp16193, tmp16201, 136);
tmp16089 = _mm512_shuffle_f32x4(tmp16193, tmp16201, 221);
tmp16082 = _mm512_shuffle_f32x4(tmp16195, tmp16203, 136);
tmp16090 = _mm512_shuffle_f32x4(tmp16195, tmp16203, 221);
tmp16083 = _mm512_shuffle_f32x4(tmp16190, tmp16198, 136);
__m512 tmp16131 = _mm512_shuffle_f32x4(tmp16190, tmp16198, 221);
tmp16084 = _mm512_shuffle_f32x4(tmp16192, tmp16200, 136);
__m512 tmp16132 = _mm512_shuffle_f32x4(tmp16192, tmp16200, 221);
tmp16085 = _mm512_shuffle_f32x4(tmp16194, tmp16202, 136);
__m512 tmp16133 = _mm512_shuffle_f32x4(tmp16194, tmp16202, 221);
tmp16086 = _mm512_shuffle_f32x4(tmp16196, tmp16204, 136);
__m512 tmp16134 = _mm512_shuffle_f32x4(tmp16196, tmp16204, 221);
(void)tmp16086;
(void)tmp16134;
__m512 tmp16139 = _mm512_add_ps(tmp16080, tmp16081);
__m512 tmp16154 = _mm512_add_ps(tmp16088, tmp16089);
__m512 tmp16138 = _mm512_add_ps(tmp16082, tmp16083);
__m512 tmp16153 = _mm512_add_ps(tmp16090, tmp16131);
__m512 tmp16144 = _mm512_sub_ps(tmp16082, tmp16083);
__m512 tmp16159 = _mm512_sub_ps(tmp16090, tmp16131);
__m512 tmp16143 = _mm512_sub_ps(tmp16080, tmp16081);
__m512 tmp16158 = _mm512_sub_ps(tmp16088, tmp16089);
__m512 tmp16140 = _mm512_add_ps(tmp16084, tmp16085);
__m512 tmp16155 = _mm512_add_ps(tmp16132, tmp16133);
__m512 tmp16145 = _mm512_sub_ps(tmp16084, tmp16085);
__m512 tmp16160 = _mm512_sub_ps(tmp16132, tmp16133);
__m512 tmp16142 = _mm512_fmadd_ps(tmp16144, _mm512_set1_ps(2e+00f), tmp16143);
__m512 tmp16157 = _mm512_fmadd_ps(tmp16159, _mm512_set1_ps(2e+00f), tmp16158);
__m512 tmp16149 = _mm512_fmadd_ps(tmp16144, _mm512_set1_ps(8e+00f), tmp16143);
__m512 tmp16164 = _mm512_fmadd_ps(tmp16159, _mm512_set1_ps(8e+00f), tmp16158);
__m512 tmp16137 = _mm512_add_ps(tmp16138, tmp16139);
__m512 tmp16152 = _mm512_add_ps(tmp16153, tmp16154);
__m512 tmp16141 = _mm512_fmadd_ps(tmp16145, _mm512_set1_ps(1.6e+01f), tmp16142);
__m512 tmp16156 = _mm512_fmadd_ps(tmp16160, _mm512_set1_ps(1.6e+01f), tmp16157);
__m512 tmp16148 = _mm512_fmadd_ps(tmp16145, _mm512_set1_ps(4e+00f), tmp16149);
__m512 tmp16163 = _mm512_fmadd_ps(tmp16160, _mm512_set1_ps(4e+00f), tmp16164);
__m512 tmp16147 = _mm512_fmadd_ps(tmp16138, _mm512_set1_ps(4e+00f), tmp16139);
__m512 tmp16162 = _mm512_fmadd_ps(tmp16153, _mm512_set1_ps(4e+00f), tmp16154);
__m512 tmp16136 = _mm512_add_ps(tmp16137, tmp16079);
__m512 tmp16151 = _mm512_add_ps(tmp16152, tmp16087);
__m512 tmp16135 = _mm512_fmadd_ps(tmp16140, _mm512_set1_ps(3.2e+01f), tmp16136);
__m512 tmp16150 = _mm512_fmadd_ps(tmp16155, _mm512_set1_ps(3.2e+01f), tmp16151);
__m512 tmp16146 = _mm512_fmadd_ps(tmp16140, _mm512_set1_ps(8e+00f), tmp16147);
__m512 tmp16161 = _mm512_fmadd_ps(tmp16155, _mm512_set1_ps(8e+00f), tmp16162);
__m512 out2095 = tmp16135;
__m512 out2099 = tmp16150;
__m512 out2096 = tmp16141;
__m512 out2100 = tmp16156;
__m512 out2097 = tmp16146;
__m512 out2101 = tmp16161;
__m512 out2098 = tmp16148;
__m512 out2102 = tmp16163;
out2095 = _mm512_max_ps(_mm512_setzero_ps(), out2095);
out2099 = _mm512_max_ps(_mm512_setzero_ps(), out2099);
out2096 = _mm512_max_ps(_mm512_setzero_ps(), out2096);
out2100 = _mm512_max_ps(_mm512_setzero_ps(), out2100);
out2097 = _mm512_max_ps(_mm512_setzero_ps(), out2097);
out2101 = _mm512_max_ps(_mm512_setzero_ps(), out2101);
out2098 = _mm512_max_ps(_mm512_setzero_ps(), out2098);
out2102 = _mm512_max_ps(_mm512_setzero_ps(), out2102);
_mm512_mask_storeu_ps(datPtr24+3736+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2095);
_mm512_mask_storeu_ps(datPtr24+3784+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2099);
_mm512_mask_storeu_ps(datPtr24+3848+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2096);
_mm512_mask_storeu_ps(datPtr24+3896+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2100);
_mm512_mask_storeu_ps(datPtr24+3960+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2097);
_mm512_mask_storeu_ps(datPtr24+4008+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2101);
_mm512_mask_storeu_ps(datPtr24+4072+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2098);
_mm512_mask_storeu_ps(datPtr24+4120+401408*i47+112*toH46+4*toW46+12544*k138+6272*l57, 4095, out2102);
}
}
if (j40 >= last10) return;
++j40;
rel22 = 4;
}
ptrdiff_t toH47 = base22+24;
ptrdiff_t toW47 = 24;
ptrdiff_t k139 = 32*w63;
for (; k139 != 32; ++k139) {
ptrdiff_t l58 = 0;
for (; l58 != 1; ++l58) {
__m512 sf1201 = _mm512_loadu_ps(sfPtr11+0+819200*i47+49152*j40+256*k139+256*l58);
__m512 sf1202 = _mm512_loadu_ps(sfPtr11+64+819200*i47+49152*j40+256*k139+256*l58);
__m512 in2295 = _mm512_shuffle_f32x4(sf1201, sf1202, 68);
__m512 in2296 = _mm512_shuffle_f32x4(sf1201, sf1202, 238);
__m512 sf1203 = _mm512_loadu_ps(sfPtr11+128+819200*i47+49152*j40+256*k139+256*l58);
__m512 sf1204 = _mm512_loadu_ps(sfPtr11+192+819200*i47+49152*j40+256*k139+256*l58);
__m512 in2303 = _mm512_shuffle_f32x4(sf1203, sf1204, 68);
__m512 in2304 = _mm512_shuffle_f32x4(sf1203, sf1204, 238);
__m512 sf1205 = _mm512_loadu_ps(sfPtr11+204800+819200*i47+49152*j40+256*k139+256*l58);
__m512 sf1206 = _mm512_loadu_ps(sfPtr11+204864+819200*i47+49152*j40+256*k139+256*l58);
__m512 in2297 = _mm512_shuffle_f32x4(sf1205, sf1206, 68);
__m512 in2298 = _mm512_shuffle_f32x4(sf1205, sf1206, 238);
__m512 sf1207 = _mm512_loadu_ps(sfPtr11+204928+819200*i47+49152*j40+256*k139+256*l58);
__m512 sf1208 = _mm512_loadu_ps(sfPtr11+204992+819200*i47+49152*j40+256*k139+256*l58);
__m512 in2305 = _mm512_shuffle_f32x4(sf1207, sf1208, 68);
__m512 in2306 = _mm512_shuffle_f32x4(sf1207, sf1208, 238);
__m512 sf1209 = _mm512_loadu_ps(sfPtr11+409600+819200*i47+49152*j40+256*k139+256*l58);
__m512 sf1210 = _mm512_loadu_ps(sfPtr11+409664+819200*i47+49152*j40+256*k139+256*l58);
__m512 in2299 = _mm512_shuffle_f32x4(sf1209, sf1210, 68);
__m512 in2300 = _mm512_shuffle_f32x4(sf1209, sf1210, 238);
__m512 sf1211 = _mm512_loadu_ps(sfPtr11+409728+819200*i47+49152*j40+256*k139+256*l58);
__m512 sf1212 = _mm512_loadu_ps(sfPtr11+409792+819200*i47+49152*j40+256*k139+256*l58);
__m512 in2307 = _mm512_shuffle_f32x4(sf1211, sf1212, 68);
__m512 in2308 = _mm512_shuffle_f32x4(sf1211, sf1212, 238);
__m512 sf1213 = _mm512_loadu_ps(sfPtr11+614400+819200*i47+49152*j40+256*k139+256*l58);
__m512 sf1214 = _mm512_loadu_ps(sfPtr11+614464+819200*i47+49152*j40+256*k139+256*l58);
__m512 in2301 = _mm512_shuffle_f32x4(sf1213, sf1214, 68);
__m512 in2302 = _mm512_shuffle_f32x4(sf1213, sf1214, 238);
__m512 sf1215 = _mm512_loadu_ps(sfPtr11+614528+819200*i47+49152*j40+256*k139+256*l58);
__m512 sf1216 = _mm512_loadu_ps(sfPtr11+614592+819200*i47+49152*j40+256*k139+256*l58);
__m512 in2309 = _mm512_shuffle_f32x4(sf1215, sf1216, 68);
__m512 in2310 = _mm512_shuffle_f32x4(sf1215, sf1216, 238);
(void)in2302;
(void)in2310;
__m512 tmp16217 = _mm512_add_ps(in2296, in2297);
__m512 tmp16232 = _mm512_add_ps(in2304, in2305);
__m512 tmp16216 = _mm512_add_ps(in2298, in2299);
__m512 tmp16231 = _mm512_add_ps(in2306, in2307);
__m512 tmp16222 = _mm512_sub_ps(in2298, in2299);
__m512 tmp16237 = _mm512_sub_ps(in2306, in2307);
__m512 tmp16221 = _mm512_sub_ps(in2296, in2297);
__m512 tmp16236 = _mm512_sub_ps(in2304, in2305);
__m512 tmp16218 = _mm512_add_ps(in2300, in2301);
__m512 tmp16233 = _mm512_add_ps(in2308, in2309);
__m512 tmp16223 = _mm512_sub_ps(in2300, in2301);
__m512 tmp16238 = _mm512_sub_ps(in2308, in2309);
__m512 tmp16220 = _mm512_fmadd_ps(tmp16222, _mm512_set1_ps(2e+00f), tmp16221);
__m512 tmp16235 = _mm512_fmadd_ps(tmp16237, _mm512_set1_ps(2e+00f), tmp16236);
__m512 tmp16227 = _mm512_fmadd_ps(tmp16222, _mm512_set1_ps(8e+00f), tmp16221);
__m512 tmp16242 = _mm512_fmadd_ps(tmp16237, _mm512_set1_ps(8e+00f), tmp16236);
__m512 tmp16215 = _mm512_add_ps(tmp16216, tmp16217);
__m512 tmp16230 = _mm512_add_ps(tmp16231, tmp16232);
__m512 tmp16219 = _mm512_fmadd_ps(tmp16223, _mm512_set1_ps(1.6e+01f), tmp16220);
__m512 tmp16234 = _mm512_fmadd_ps(tmp16238, _mm512_set1_ps(1.6e+01f), tmp16235);
__m512 tmp16226 = _mm512_fmadd_ps(tmp16223, _mm512_set1_ps(4e+00f), tmp16227);
__m512 tmp16241 = _mm512_fmadd_ps(tmp16238, _mm512_set1_ps(4e+00f), tmp16242);
__m512 tmp16225 = _mm512_fmadd_ps(tmp16216, _mm512_set1_ps(4e+00f), tmp16217);
__m512 tmp16240 = _mm512_fmadd_ps(tmp16231, _mm512_set1_ps(4e+00f), tmp16232);
__m512 tmp16214 = _mm512_add_ps(tmp16215, in2295);
__m512 tmp16229 = _mm512_add_ps(tmp16230, in2303);
__m512 tmp16213 = _mm512_fmadd_ps(tmp16218, _mm512_set1_ps(3.2e+01f), tmp16214);
__m512 tmp16228 = _mm512_fmadd_ps(tmp16233, _mm512_set1_ps(3.2e+01f), tmp16229);
__m512 tmp16224 = _mm512_fmadd_ps(tmp16218, _mm512_set1_ps(8e+00f), tmp16225);
__m512 tmp16239 = _mm512_fmadd_ps(tmp16233, _mm512_set1_ps(8e+00f), tmp16240);
__m512 tmp16205 = tmp16213;
__m512 tmp16209 = tmp16228;
__m512 tmp16206 = tmp16219;
__m512 tmp16210 = tmp16234;
__m512 tmp16207 = tmp16224;
__m512 tmp16211 = tmp16239;
__m512 tmp16208 = tmp16226;
__m512 tmp16212 = tmp16241;
__m512 tmp16243 = _mm512_setzero_ps();
__m512 tmp16244 = _mm512_setzero_ps();
__m512 tmp16281 = _mm512_unpacklo_ps(tmp16205, tmp16206);
__m512 tmp16282 = _mm512_unpackhi_ps(tmp16205, tmp16206);
__m512 tmp16283 = _mm512_unpacklo_ps(tmp16207, tmp16208);
__m512 tmp16284 = _mm512_unpackhi_ps(tmp16207, tmp16208);
__m512 tmp16285 = _mm512_unpacklo_ps(tmp16243, tmp16244);
__m512 tmp16286 = _mm512_unpackhi_ps(tmp16243, tmp16244);
__m512 tmp16287 = _mm512_unpacklo_ps(tmp16209, tmp16210);
__m512 tmp16288 = _mm512_unpackhi_ps(tmp16209, tmp16210);
__m512 tmp16289 = _mm512_unpacklo_ps(tmp16211, tmp16212);
__m512 tmp16290 = _mm512_unpackhi_ps(tmp16211, tmp16212);
__m512 tmp16291 = _mm512_shuffle_ps(tmp16281, tmp16283, 68);
__m512 tmp16292 = _mm512_shuffle_ps(tmp16281, tmp16283, 238);
__m512 tmp16293 = _mm512_shuffle_ps(tmp16282, tmp16284, 68);
__m512 tmp16294 = _mm512_shuffle_ps(tmp16282, tmp16284, 238);
__m512 tmp16295 = _mm512_shuffle_ps(tmp16285, tmp16287, 68);
__m512 tmp16296 = _mm512_shuffle_ps(tmp16285, tmp16287, 238);
__m512 tmp16297 = _mm512_shuffle_ps(tmp16286, tmp16288, 68);
__m512 tmp16298 = _mm512_shuffle_ps(tmp16286, tmp16288, 238);
__m512 tmp16299 = _mm512_shuffle_ps(tmp16289, tmp16289, 238);
__m512 tmp16300 = _mm512_shuffle_ps(tmp16290, tmp16290, 238);
__m512 tmp16301 = _mm512_shuffle_f32x4(tmp16291, tmp16295, 136);
__m512 tmp16302 = _mm512_shuffle_f32x4(tmp16291, tmp16295, 221);
__m512 tmp16303 = _mm512_shuffle_f32x4(tmp16292, tmp16296, 136);
__m512 tmp16304 = _mm512_shuffle_f32x4(tmp16292, tmp16296, 221);
__m512 tmp16305 = _mm512_shuffle_f32x4(tmp16293, tmp16297, 136);
__m512 tmp16306 = _mm512_shuffle_f32x4(tmp16293, tmp16297, 221);
__m512 tmp16307 = _mm512_shuffle_f32x4(tmp16294, tmp16298, 136);
__m512 tmp16308 = _mm512_shuffle_f32x4(tmp16294, tmp16298, 221);
__m512 tmp16309 = _mm512_shuffle_f32x4(tmp16289, tmp16289, 136);
__m512 tmp16310 = _mm512_shuffle_f32x4(tmp16289, tmp16289, 221);
__m512 tmp16311 = _mm512_shuffle_f32x4(tmp16299, tmp16299, 136);
__m512 tmp16312 = _mm512_shuffle_f32x4(tmp16299, tmp16299, 221);
__m512 tmp16313 = _mm512_shuffle_f32x4(tmp16290, tmp16290, 136);
__m512 tmp16314 = _mm512_shuffle_f32x4(tmp16290, tmp16290, 221);
__m512 tmp16315 = _mm512_shuffle_f32x4(tmp16300, tmp16300, 136);
__m512 tmp16316 = _mm512_shuffle_f32x4(tmp16300, tmp16300, 221);
tmp16205 = _mm512_shuffle_f32x4(tmp16301, tmp16309, 136);
tmp16211 = _mm512_shuffle_f32x4(tmp16301, tmp16309, 221);
tmp16206 = _mm512_shuffle_f32x4(tmp16303, tmp16311, 136);
tmp16212 = _mm512_shuffle_f32x4(tmp16303, tmp16311, 221);
tmp16207 = _mm512_shuffle_f32x4(tmp16305, tmp16313, 136);
__m512 tmp16245 = _mm512_shuffle_f32x4(tmp16305, tmp16313, 221);
tmp16208 = _mm512_shuffle_f32x4(tmp16307, tmp16315, 136);
__m512 tmp16246 = _mm512_shuffle_f32x4(tmp16307, tmp16315, 221);
tmp16243 = _mm512_shuffle_f32x4(tmp16302, tmp16310, 136);
__m512 tmp16247 = _mm512_shuffle_f32x4(tmp16302, tmp16310, 221);
tmp16244 = _mm512_shuffle_f32x4(tmp16304, tmp16312, 136);
__m512 tmp16248 = _mm512_shuffle_f32x4(tmp16304, tmp16312, 221);
tmp16209 = _mm512_shuffle_f32x4(tmp16306, tmp16314, 136);
__m512 tmp16249 = _mm512_shuffle_f32x4(tmp16306, tmp16314, 221);
tmp16210 = _mm512_shuffle_f32x4(tmp16308, tmp16316, 136);
__m512 tmp16250 = _mm512_shuffle_f32x4(tmp16308, tmp16316, 221);
(void)tmp16210;
(void)tmp16250;
__m512 tmp16255 = _mm512_add_ps(tmp16206, tmp16207);
__m512 tmp16270 = _mm512_add_ps(tmp16212, tmp16245);
__m512 tmp16254 = _mm512_add_ps(tmp16208, tmp16243);
__m512 tmp16269 = _mm512_add_ps(tmp16246, tmp16247);
__m512 tmp16260 = _mm512_sub_ps(tmp16208, tmp16243);
__m512 tmp16275 = _mm512_sub_ps(tmp16246, tmp16247);
__m512 tmp16259 = _mm512_sub_ps(tmp16206, tmp16207);
__m512 tmp16274 = _mm512_sub_ps(tmp16212, tmp16245);
__m512 tmp16256 = _mm512_add_ps(tmp16244, tmp16209);
__m512 tmp16271 = _mm512_add_ps(tmp16248, tmp16249);
__m512 tmp16261 = _mm512_sub_ps(tmp16244, tmp16209);
__m512 tmp16276 = _mm512_sub_ps(tmp16248, tmp16249);
__m512 tmp16258 = _mm512_fmadd_ps(tmp16260, _mm512_set1_ps(2e+00f), tmp16259);
__m512 tmp16273 = _mm512_fmadd_ps(tmp16275, _mm512_set1_ps(2e+00f), tmp16274);
__m512 tmp16265 = _mm512_fmadd_ps(tmp16260, _mm512_set1_ps(8e+00f), tmp16259);
__m512 tmp16280 = _mm512_fmadd_ps(tmp16275, _mm512_set1_ps(8e+00f), tmp16274);
__m512 tmp16253 = _mm512_add_ps(tmp16254, tmp16255);
__m512 tmp16268 = _mm512_add_ps(tmp16269, tmp16270);
__m512 tmp16257 = _mm512_fmadd_ps(tmp16261, _mm512_set1_ps(1.6e+01f), tmp16258);
__m512 tmp16272 = _mm512_fmadd_ps(tmp16276, _mm512_set1_ps(1.6e+01f), tmp16273);
__m512 tmp16264 = _mm512_fmadd_ps(tmp16261, _mm512_set1_ps(4e+00f), tmp16265);
__m512 tmp16279 = _mm512_fmadd_ps(tmp16276, _mm512_set1_ps(4e+00f), tmp16280);
__m512 tmp16263 = _mm512_fmadd_ps(tmp16254, _mm512_set1_ps(4e+00f), tmp16255);
__m512 tmp16278 = _mm512_fmadd_ps(tmp16269, _mm512_set1_ps(4e+00f), tmp16270);
__m512 tmp16252 = _mm512_add_ps(tmp16253, tmp16205);
__m512 tmp16267 = _mm512_add_ps(tmp16268, tmp16211);
__m512 tmp16251 = _mm512_fmadd_ps(tmp16256, _mm512_set1_ps(3.2e+01f), tmp16252);
__m512 tmp16266 = _mm512_fmadd_ps(tmp16271, _mm512_set1_ps(3.2e+01f), tmp16267);
__m512 tmp16262 = _mm512_fmadd_ps(tmp16256, _mm512_set1_ps(8e+00f), tmp16263);
__m512 tmp16277 = _mm512_fmadd_ps(tmp16271, _mm512_set1_ps(8e+00f), tmp16278);
__m512 out2103 = tmp16251;
__m512 out2107 = tmp16266;
__m512 out2104 = tmp16257;
__m512 out2108 = tmp16272;
__m512 out2105 = tmp16262;
__m512 out2109 = tmp16277;
__m512 out2106 = tmp16264;
__m512 out2110 = tmp16279;
out2103 = _mm512_max_ps(_mm512_setzero_ps(), out2103);
out2107 = _mm512_max_ps(_mm512_setzero_ps(), out2107);
out2104 = _mm512_max_ps(_mm512_setzero_ps(), out2104);
out2108 = _mm512_max_ps(_mm512_setzero_ps(), out2108);
out2105 = _mm512_max_ps(_mm512_setzero_ps(), out2105);
out2109 = _mm512_max_ps(_mm512_setzero_ps(), out2109);
out2106 = _mm512_max_ps(_mm512_setzero_ps(), out2106);
out2110 = _mm512_max_ps(_mm512_setzero_ps(), out2110);
_mm512_mask_storeu_ps(datPtr24+0+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 15, out2103);
_mm512_mask_storeu_ps(datPtr24+6248+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 960, out2103);
_mm512_mask_storeu_ps(datPtr24+3136+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 15, out2107);
_mm512_mask_storeu_ps(datPtr24+9384+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 960, out2107);
_mm512_mask_storeu_ps(datPtr24+112+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 15, out2104);
_mm512_mask_storeu_ps(datPtr24+6360+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 960, out2104);
_mm512_mask_storeu_ps(datPtr24+3248+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 15, out2108);
_mm512_mask_storeu_ps(datPtr24+9496+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 960, out2108);
_mm512_mask_storeu_ps(datPtr24+224+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 15, out2105);
_mm512_mask_storeu_ps(datPtr24+6472+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 960, out2105);
_mm512_mask_storeu_ps(datPtr24+3360+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 15, out2109);
_mm512_mask_storeu_ps(datPtr24+9608+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 960, out2109);
_mm512_mask_storeu_ps(datPtr24+336+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 15, out2106);
_mm512_mask_storeu_ps(datPtr24+6584+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 960, out2106);
_mm512_mask_storeu_ps(datPtr24+3472+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 15, out2110);
_mm512_mask_storeu_ps(datPtr24+9720+401408*i47+112*toH47+4*toW47+12544*k139+12544*l58, 960, out2110);
}
}
if (j40 >= last10) return;
++j40;
}

/*
 * Dispatches the ThreeConsumeSums4 work onto the given thread team.
 *
 * team52    - team handed through to ResNet50ThreaderDo1.
 * tensors77 - tensor pointer table, passed to the callee via the task's any1.
 *
 * The task is described as a 3-dimensional hull with extents 1 x 5 x 1 and
 * ResNet50ThreeConsumeSums4Callee1 as the per-point worker.
 */
static void ResNet50ThreeConsumeSums4(ResNet50ThreaderTeam1* team52, char** tensors77) {
	static const int extents[3] = {1, 5, 1};
	ResNet50ThreaderTask1 job;
	int dim;
	job.any1 = tensors77;
	job.callee1 = ResNet50ThreeConsumeSums4Callee1;
	job.nd1 = 3;
	for (dim = 0; dim < 3; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	ResNet50ThreaderDo1(team52, &job);
}

// Worker for the ThreeArrangeFilts5 task. For the single index given by
// pt49[0] it:
//   1. reads four groups of 256 nine-float weight blocks from tensors86[0],
//   2. scales each group by a per-group multiplier taken from tensors86[2],
//   3. pushes the scaled weights through a fixed linear transform (two
//      arithmetic passes separated by a register transpose),
//   4. converts the result to half precision and stores it behind a
//      1024-byte header in tensors86[3],
//   5. writes four bias floats (bias*mul+add) into that header area.
// NOTE(review): the constant pattern looks like a Winograd-style filter
// transform for a 3x3 convolution -- confirm against the generator.
//
// task88 - task descriptor; only any1 (the char** tensor table) is read.
// pt49   - coordinates of this work item; only pt49[0] is read.
static void ResNet50ThreeArrangeFilts5Callee1(ResNet50ThreaderTask1* task88, int64_t* pt49) {
char** tensors86 = task88->any1;
ptrdiff_t b63 = pt49[0];
// g29 and e25 are fixed at zero here, so every term multiplied by them
// (and by i52 below) contributes nothing to the addresses.
ptrdiff_t g29 = 0;
ptrdiff_t e25 = 0;
// Output area: bias floats at the start of tensors86[3], packed
// half-precision weights starting 1024 bytes in.
char*restrict bfPtr12 = tensors86[3]+1024*e25;
char*restrict wfPtr12 = tensors86[3]+1024+12976128*e25;
// Inputs: raw weights, biases, and interleaved (multiplier, addend) pairs.
char*restrict wtPtr16 = tensors86[0]+14256*e25;
char*restrict biasPtr16 = tensors86[1];
char*restrict bnPtr16 = tensors86[2];
ptrdiff_t i52 = 1*g29;
ptrdiff_t j44 = 1*b63;
// jj44 records the starting index; the check at the bottom of the loop
// returns once j44 >= jj44, i.e. after the first iteration.
ptrdiff_t jj44 = j44+0;
if (j44 < 64) {
for (; j44 != 64; ++j44) {
ptrdiff_t k147 = 0+1*j44;
ptrdiff_t cut22 = 0;
// Broadcast the multiplier (first float of each two-float pair) for the
// four entries 4*j44 .. 4*j44+3.
__m512 postMul51 = _mm512_set1_ps(((float*)bnPtr16+(ptrdiff_t)2*(0+256*i52+4*j44))[0]);
__m512 postMul52 = _mm512_set1_ps(((float*)bnPtr16+(ptrdiff_t)2*(1+256*i52+4*j44))[0]);
__m512 postMul53 = _mm512_set1_ps(((float*)bnPtr16+(ptrdiff_t)2*(2+256*i52+4*j44))[0]);
__m512 postMul54 = _mm512_set1_ps(((float*)bnPtr16+(ptrdiff_t)2*(3+256*i52+4*j44))[0]);
ptrdiff_t s42 = 0;
// 256 iterations, each consuming one 36-byte (nine-float) block from each
// of the four groups, which lie 9216 bytes apart.
for (; s42 != 256; ++s42) {
// Mask 511 loads the low nine floats and zeroes the remaining lanes.
__m512 wt549 = _mm512_maskz_loadu_ps(511, wtPtr16+0+2359296*i52+36864*j44+36*s42);
__m512 wt550 = _mm512_maskz_loadu_ps(511, wtPtr16+9216+2359296*i52+36864*j44+36*s42);
__m512 wt551 = _mm512_maskz_loadu_ps(511, wtPtr16+18432+2359296*i52+36864*j44+36*s42);
__m512 wt552 = _mm512_maskz_loadu_ps(511, wtPtr16+27648+2359296*i52+36864*j44+36*s42);
// Fold the per-group multiplier into the weights.
wt549 = _mm512_mul_ps(wt549, postMul51);
wt550 = _mm512_mul_ps(wt550, postMul52);
wt551 = _mm512_mul_ps(wt551, postMul53);
wt552 = _mm512_mul_ps(wt552, postMul54);
// Two rounds of two-source permutes interleave three-float slices of the
// four weight blocks into in2311, in2312 and in2313.
__m512i pm219 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm220 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp16557 = _mm512_permutex2var_ps(wt549, pm219, wt551);
__m512 tmp16558 = _mm512_permutex2var_ps(wt550, pm219, wt552);
__m512 tmp16559 = _mm512_permutex2var_ps(wt549, pm220, wt551);
__m512 tmp16560 = _mm512_permutex2var_ps(wt550, pm220, wt552);
__m512 in2311 = _mm512_permutex2var_ps(tmp16557, pm219, tmp16558);
__m512 in2312 = _mm512_permutex2var_ps(tmp16557, pm220, tmp16558);
__m512 in2313 = _mm512_permutex2var_ps(tmp16559, pm219, tmp16560);
// First arithmetic pass: expands the three inputs into eight linear
// combinations using the factors 4 and 2.
__m512 tmp16561 = _mm512_fmadd_ps(in2311, _mm512_set1_ps(4e+00f), in2313);
__m512 tmp16562 = _mm512_add_ps(in2311, in2313);
__m512 tmp16563 = _mm512_fmadd_ps(in2313, _mm512_set1_ps(4e+00f), in2311);
__m512 tmp16564 = _mm512_add_ps(in2312, tmp16562);
__m512 tmp16565 = _mm512_fmadd_ps(in2312, _mm512_set1_ps(2e+00f), tmp16563);
tmp16563 = _mm512_fnmadd_ps(in2312, _mm512_set1_ps(2e+00f), tmp16563);
__m512 tmp16566 = _mm512_fnmadd_ps(in2312, _mm512_set1_ps(2e+00f), tmp16561);
tmp16561 = _mm512_fmadd_ps(in2312, _mm512_set1_ps(2e+00f), tmp16561);
tmp16562 = _mm512_sub_ps(tmp16562, in2312);
// Register transpose of the eight intermediate vectors: 32-bit unpacks,
// then in-lane shuffles, then 128-bit lane shuffles.
__m512 tmp16583 = _mm512_unpacklo_ps(in2311, tmp16564);
__m512 tmp16584 = _mm512_unpackhi_ps(in2311, tmp16564);
__m512 tmp16585 = _mm512_unpacklo_ps(tmp16562, tmp16565);
__m512 tmp16586 = _mm512_unpackhi_ps(tmp16562, tmp16565);
__m512 tmp16587 = _mm512_unpacklo_ps(tmp16563, tmp16561);
__m512 tmp16588 = _mm512_unpackhi_ps(tmp16563, tmp16561);
__m512 tmp16589 = _mm512_unpacklo_ps(tmp16566, in2313);
__m512 tmp16590 = _mm512_unpackhi_ps(tmp16566, in2313);
__m512 tmp16591 = _mm512_shuffle_ps(tmp16583, tmp16585, 68);
__m512 tmp16592 = _mm512_shuffle_ps(tmp16583, tmp16585, 238);
__m512 tmp16593 = _mm512_shuffle_ps(tmp16584, tmp16586, 68);
__m512 tmp16594 = _mm512_shuffle_ps(tmp16584, tmp16586, 238);
__m512 tmp16595 = _mm512_shuffle_ps(tmp16587, tmp16589, 68);
__m512 tmp16596 = _mm512_shuffle_ps(tmp16587, tmp16589, 238);
__m512 tmp16597 = _mm512_shuffle_ps(tmp16588, tmp16590, 68);
__m512 tmp16598 = _mm512_shuffle_ps(tmp16588, tmp16590, 238);
__m512 tmp16599 = _mm512_shuffle_f32x4(tmp16591, tmp16595, 136);
__m512 tmp16600 = _mm512_shuffle_f32x4(tmp16591, tmp16595, 221);
__m512 tmp16601 = _mm512_shuffle_f32x4(tmp16592, tmp16596, 136);
__m512 tmp16602 = _mm512_shuffle_f32x4(tmp16592, tmp16596, 221);
__m512 tmp16603 = _mm512_shuffle_f32x4(tmp16593, tmp16597, 136);
__m512 tmp16604 = _mm512_shuffle_f32x4(tmp16593, tmp16597, 221);
__m512 tmp16605 = _mm512_shuffle_f32x4(tmp16594, tmp16598, 136);
__m512 tmp16606 = _mm512_shuffle_f32x4(tmp16594, tmp16598, 221);
in2311 = _mm512_shuffle_f32x4(tmp16599, tmp16599, 136);
__m512 tmp16567 = _mm512_shuffle_f32x4(tmp16599, tmp16599, 221);
tmp16564 = _mm512_shuffle_f32x4(tmp16601, tmp16601, 136);
__m512 tmp16568 = _mm512_shuffle_f32x4(tmp16601, tmp16601, 221);
tmp16562 = _mm512_shuffle_f32x4(tmp16603, tmp16603, 136);
__m512 tmp16569 = _mm512_shuffle_f32x4(tmp16603, tmp16603, 221);
tmp16565 = _mm512_shuffle_f32x4(tmp16605, tmp16605, 136);
__m512 tmp16570 = _mm512_shuffle_f32x4(tmp16605, tmp16605, 221);
tmp16563 = _mm512_shuffle_f32x4(tmp16600, tmp16600, 136);
tmp16561 = _mm512_shuffle_f32x4(tmp16602, tmp16602, 136);
tmp16566 = _mm512_shuffle_f32x4(tmp16604, tmp16604, 136);
in2313 = _mm512_shuffle_f32x4(tmp16606, tmp16606, 136);
// Pair up the transposed halves so the second pass operates on two
// three-vector sets: (in2311, tmp16564, tmp16562) and
// (tmp16566, in2313, tmp16567).
in2311 = _mm512_shuffle_f32x4(in2311, tmp16565, 68);
tmp16564 = _mm512_shuffle_f32x4(tmp16564, tmp16563, 68);
tmp16562 = _mm512_shuffle_f32x4(tmp16562, tmp16561, 68);
tmp16566 = _mm512_shuffle_f32x4(tmp16566, tmp16568, 68);
in2313 = _mm512_shuffle_f32x4(in2313, tmp16569, 68);
tmp16567 = _mm512_shuffle_f32x4(tmp16567, tmp16570, 68);
// Second arithmetic pass: the same 4/2 combinations as the first pass,
// applied to both sets in an interleaved order.
__m512 tmp16571 = _mm512_fmadd_ps(in2311, _mm512_set1_ps(4e+00f), tmp16562);
__m512 tmp16577 = _mm512_fmadd_ps(tmp16566, _mm512_set1_ps(4e+00f), tmp16567);
__m512 tmp16572 = _mm512_add_ps(in2311, tmp16562);
__m512 tmp16578 = _mm512_add_ps(tmp16566, tmp16567);
__m512 tmp16573 = _mm512_fmadd_ps(tmp16562, _mm512_set1_ps(4e+00f), in2311);
__m512 tmp16579 = _mm512_fmadd_ps(tmp16567, _mm512_set1_ps(4e+00f), tmp16566);
__m512 tmp16574 = _mm512_add_ps(tmp16564, tmp16572);
__m512 tmp16580 = _mm512_add_ps(in2313, tmp16578);
__m512 tmp16575 = _mm512_fmadd_ps(tmp16564, _mm512_set1_ps(2e+00f), tmp16573);
__m512 tmp16581 = _mm512_fmadd_ps(in2313, _mm512_set1_ps(2e+00f), tmp16579);
tmp16573 = _mm512_fnmadd_ps(tmp16564, _mm512_set1_ps(2e+00f), tmp16573);
tmp16579 = _mm512_fnmadd_ps(in2313, _mm512_set1_ps(2e+00f), tmp16579);
__m512 tmp16576 = _mm512_fnmadd_ps(tmp16564, _mm512_set1_ps(2e+00f), tmp16571);
__m512 tmp16582 = _mm512_fnmadd_ps(in2313, _mm512_set1_ps(2e+00f), tmp16577);
tmp16571 = _mm512_fmadd_ps(tmp16564, _mm512_set1_ps(2e+00f), tmp16571);
tmp16577 = _mm512_fmadd_ps(in2313, _mm512_set1_ps(2e+00f), tmp16577);
tmp16572 = _mm512_sub_ps(tmp16572, tmp16564);
tmp16578 = _mm512_sub_ps(tmp16578, in2313);
// Per-lane scaling of all sixteen results by fixed constant vectors.
// NOTE(review): presumably these are the transform's normalisation
// factors for both passes combined -- verify against the generator.
in2311 = _mm512_mul_ps(in2311, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp16574 = _mm512_mul_ps(tmp16574, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp16572 = _mm512_mul_ps(tmp16572, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp16575 = _mm512_mul_ps(tmp16575, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp16573 = _mm512_mul_ps(tmp16573, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp16571 = _mm512_mul_ps(tmp16571, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp16576 = _mm512_mul_ps(tmp16576, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp16562 = _mm512_mul_ps(tmp16562, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp16566 = _mm512_mul_ps(tmp16566, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp16580 = _mm512_mul_ps(tmp16580, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp16578 = _mm512_mul_ps(tmp16578, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp16581 = _mm512_mul_ps(tmp16581, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp16579 = _mm512_mul_ps(tmp16579, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp16577 = _mm512_mul_ps(tmp16577, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp16582 = _mm512_mul_ps(tmp16582, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp16567 = _mm512_mul_ps(tmp16567, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
// Regroup the low 256-bit halves (imm 68) and high halves (imm 238) of
// neighbouring results into the sixteen output vectors.
__m512 out2111 = _mm512_shuffle_f32x4(in2311, tmp16574, 68);
__m512 out2115 = _mm512_shuffle_f32x4(in2311, tmp16574, 238);
__m512 out2112 = _mm512_shuffle_f32x4(tmp16572, tmp16575, 68);
__m512 out2116 = _mm512_shuffle_f32x4(tmp16572, tmp16575, 238);
__m512 out2113 = _mm512_shuffle_f32x4(tmp16573, tmp16571, 68);
__m512 out2117 = _mm512_shuffle_f32x4(tmp16573, tmp16571, 238);
__m512 out2114 = _mm512_shuffle_f32x4(tmp16576, tmp16562, 68);
__m512 out2118 = _mm512_shuffle_f32x4(tmp16576, tmp16562, 238);
__m512 out2119 = _mm512_shuffle_f32x4(tmp16566, tmp16580, 68);
__m512 out2123 = _mm512_shuffle_f32x4(tmp16566, tmp16580, 238);
__m512 out2120 = _mm512_shuffle_f32x4(tmp16578, tmp16581, 68);
__m512 out2124 = _mm512_shuffle_f32x4(tmp16578, tmp16581, 238);
__m512 out2121 = _mm512_shuffle_f32x4(tmp16579, tmp16577, 68);
__m512 out2125 = _mm512_shuffle_f32x4(tmp16579, tmp16577, 238);
__m512 out2122 = _mm512_shuffle_f32x4(tmp16582, tmp16567, 68);
__m512 out2126 = _mm512_shuffle_f32x4(tmp16582, tmp16567, 238);
// Byte offsets of the four 32-byte slots written per region; with cut22
// equal to 0 they evaluate to 0, 32, 64 and 96.
ptrdiff_t off17 = 32*cut22;
ptrdiff_t off18 = (size_t)(cut22+1)/4*32768+(size_t)(cut22+1)%4*32;
ptrdiff_t off19 = (size_t)(cut22+2)/4*32768+(size_t)(cut22+2)%4*32;
ptrdiff_t off20 = (size_t)(cut22+3)/4*32768+(size_t)(cut22+3)%4*32;
// Convert each output vector to sixteen half-precision values
// (round-to-nearest, exceptions suppressed); they occupy the low 256 bits.
__m512i wf129 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2111, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf130 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2115, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf131 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2119, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf132 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2123, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf133 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2112, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf134 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2116, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf135 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2120, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf136 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2124, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf137 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2113, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf138 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2117, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf139 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2121, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf140 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2125, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf141 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2114, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf142 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2118, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf143 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2122, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf144 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2126, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
// Mask 255 stores only the low eight 32-bit lanes (32 bytes), i.e. exactly
// the converted half-precision payload. Four regions 2097152 bytes apart
// each receive four slots; slots advance by 128 bytes per s42 and by
// 32768 bytes per k147.
_mm512_mask_storeu_epi32(wfPtr12+0+8388608*i52+32768*k147+off17+128*s42, 255, wf129);
_mm512_mask_storeu_epi32(wfPtr12+0+8388608*i52+32768*k147+off18+128*s42, 255, wf130);
_mm512_mask_storeu_epi32(wfPtr12+0+8388608*i52+32768*k147+off19+128*s42, 255, wf131);
_mm512_mask_storeu_epi32(wfPtr12+0+8388608*i52+32768*k147+off20+128*s42, 255, wf132);
_mm512_mask_storeu_epi32(wfPtr12+2097152+8388608*i52+32768*k147+off17+128*s42, 255, wf133);
_mm512_mask_storeu_epi32(wfPtr12+2097152+8388608*i52+32768*k147+off18+128*s42, 255, wf134);
_mm512_mask_storeu_epi32(wfPtr12+2097152+8388608*i52+32768*k147+off19+128*s42, 255, wf135);
_mm512_mask_storeu_epi32(wfPtr12+2097152+8388608*i52+32768*k147+off20+128*s42, 255, wf136);
_mm512_mask_storeu_epi32(wfPtr12+4194304+8388608*i52+32768*k147+off17+128*s42, 255, wf137);
_mm512_mask_storeu_epi32(wfPtr12+4194304+8388608*i52+32768*k147+off18+128*s42, 255, wf138);
_mm512_mask_storeu_epi32(wfPtr12+4194304+8388608*i52+32768*k147+off19+128*s42, 255, wf139);
_mm512_mask_storeu_epi32(wfPtr12+4194304+8388608*i52+32768*k147+off20+128*s42, 255, wf140);
_mm512_mask_storeu_epi32(wfPtr12+6291456+8388608*i52+32768*k147+off17+128*s42, 255, wf141);
_mm512_mask_storeu_epi32(wfPtr12+6291456+8388608*i52+32768*k147+off18+128*s42, 255, wf142);
_mm512_mask_storeu_epi32(wfPtr12+6291456+8388608*i52+32768*k147+off19+128*s42, 255, wf143);
_mm512_mask_storeu_epi32(wfPtr12+6291456+8388608*i52+32768*k147+off20+128*s42, 255, wf144);
}
// Bias handling for the same four entries. e25 is 0 in this function, so
// the branch below is always taken.
__m512 bias6 = _mm512_setzero_ps();
if (!e25) {
// Load four bias floats (mask 15) for entries 4*j44 .. 4*j44+3.
bias6 = _mm512_maskz_loadu_ps(15, biasPtr16-0+1024*i52+16*j44);
// Split the eight loaded floats into even lanes (multipliers) and odd
// lanes (addends), then compute bias*mul+add.
__m512i pmMul33 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd33 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas11 = _mm512_maskz_loadu_ps(255, bnPtr16+(ptrdiff_t)8*(0+256*i52+4*j44));
__m512 postMul55 = _mm512_permutexvar_ps(pmMul33, mas11);
__m512 postAdd33 = _mm512_permutexvar_ps(pmAdd33, mas11);
bias6 = _mm512_fmadd_ps(bias6, postMul55, postAdd33);
}
// Only the low four lanes (16 bytes) are written back.
_mm512_mask_storeu_ps(bfPtr12-0+1024*i52+16*j44, 15, bias6);
// j44 still equals jj44 here, so the function returns after handling
// exactly one index per call.
if (j44 >= jj44) return;
}
}
}

/*
 * Dispatches the ThreeArrangeFilts5 work onto the given thread team.
 *
 * team56    - team handed through to ResNet50ThreaderDo1.
 * tensors85 - tensor pointer table, passed to the callee via the task's any1.
 *
 * The task is described as a 3-dimensional hull with extents 64 x 1 x 1 and
 * ResNet50ThreeArrangeFilts5Callee1 as the per-point worker.
 */
static void ResNet50ThreeArrangeFilts5(ResNet50ThreaderTeam1* team56, char** tensors85) {
	static const int extents[3] = {64, 1, 1};
	ResNet50ThreaderTask1 job;
	int dim;
	job.any1 = tensors85;
	job.callee1 = ResNet50ThreeArrangeFilts5Callee1;
	job.nd1 = 3;
	for (dim = 0; dim < 3; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	ResNet50ThreaderDo1(team56, &job);
}

static void ResNet50ThreeArrangeDats5Callee1(ResNet50ThreaderTask1* task90, int64_t* pt50) {
char** tensors88 = task90->any1;
ptrdiff_t s43 = pt50[0];
ptrdiff_t c43 = pt50[1];
ptrdiff_t g30 = 0;
ptrdiff_t e26 = 0;
char*restrict datPtr27 = tensors88[0]-60+329472*e26;
char*restrict dfPtr12 = tensors88[1]+912384*e26;
ptrdiff_t i53 = 1*g30;
ptrdiff_t j45 = 1*c43;
ptrdiff_t last11 = j45+0;
ptrdiff_t rel23 = j45-0;
ptrdiff_t base23 = 0;
if (rel23 < 1) {
ptrdiff_t h53 = base23+0;
ptrdiff_t w65 = 0;
ptrdiff_t k148 = 0;
for (; k148 != 64; ++k148) {
__m512 dat2245 = _mm512_maskz_loadu_ps(127, datPtr27+340+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2245 = _mm512_max_ps(_mm512_setzero_ps(), dat2245);
__m512i pm221 = _mm512_set_epi32(6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in2321 = _mm512_permutexvar_ps(pm221, dat2245);
__m512 dat2246 = _mm512_maskz_loadu_ps(16383, datPtr27+60+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2246 = _mm512_max_ps(_mm512_setzero_ps(), dat2246);
__m512 dat2247 = _mm512_maskz_loadu_ps(127, datPtr27+396+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2247 = _mm512_max_ps(_mm512_setzero_ps(), dat2247);
__m512i pm222 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in2314 = _mm512_permutexvar_ps(pm222, dat2246);
__m512i pm223 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 15, 13, 12, 11);
__m512 in2322 = _mm512_permutex2var_ps(dat2246, pm223, dat2247);
__m512 dat2248 = _mm512_maskz_loadu_ps(16383, datPtr27+116+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2248 = _mm512_max_ps(_mm512_setzero_ps(), dat2248);
__m512 dat2249 = _mm512_maskz_loadu_ps(127, datPtr27+452+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2249 = _mm512_max_ps(_mm512_setzero_ps(), dat2249);
__m512 in2315 = _mm512_permutexvar_ps(pm222, dat2248);
__m512 in2323 = _mm512_permutex2var_ps(dat2248, pm223, dat2249);
__m512 dat2250 = _mm512_maskz_loadu_ps(16383, datPtr27+172+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2250 = _mm512_max_ps(_mm512_setzero_ps(), dat2250);
__m512 dat2251 = _mm512_maskz_loadu_ps(127, datPtr27+508+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2251 = _mm512_max_ps(_mm512_setzero_ps(), dat2251);
__m512 in2316 = _mm512_permutexvar_ps(pm222, dat2250);
__m512 in2324 = _mm512_permutex2var_ps(dat2250, pm223, dat2251);
__m512 dat2252 = _mm512_maskz_loadu_ps(16383, datPtr27+228+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2252 = _mm512_max_ps(_mm512_setzero_ps(), dat2252);
__m512 dat2253 = _mm512_maskz_loadu_ps(127, datPtr27+564+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2253 = _mm512_max_ps(_mm512_setzero_ps(), dat2253);
__m512 in2317 = _mm512_permutexvar_ps(pm222, dat2252);
__m512 in2325 = _mm512_permutex2var_ps(dat2252, pm223, dat2253);
__m512 dat2254 = _mm512_maskz_loadu_ps(16383, datPtr27+284+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2254 = _mm512_max_ps(_mm512_setzero_ps(), dat2254);
__m512 dat2255 = _mm512_maskz_loadu_ps(127, datPtr27+620+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2255 = _mm512_max_ps(_mm512_setzero_ps(), dat2255);
__m512 in2318 = _mm512_permutexvar_ps(pm222, dat2254);
__m512 in2326 = _mm512_permutex2var_ps(dat2254, pm223, dat2255);
__m512 dat2256 = _mm512_maskz_loadu_ps(16383, datPtr27+340+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2256 = _mm512_max_ps(_mm512_setzero_ps(), dat2256);
__m512 dat2257 = _mm512_maskz_loadu_ps(127, datPtr27+676+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2257 = _mm512_max_ps(_mm512_setzero_ps(), dat2257);
__m512 in2319 = _mm512_permutexvar_ps(pm222, dat2256);
__m512 in2327 = _mm512_permutex2var_ps(dat2256, pm223, dat2257);
__m512 dat2258 = _mm512_maskz_loadu_ps(16383, datPtr27+396+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2258 = _mm512_max_ps(_mm512_setzero_ps(), dat2258);
__m512 dat2259 = _mm512_maskz_loadu_ps(127, datPtr27+732+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2259 = _mm512_max_ps(_mm512_setzero_ps(), dat2259);
__m512 in2320 = _mm512_permutexvar_ps(pm222, dat2258);
__m512 in2328 = _mm512_permutex2var_ps(dat2258, pm223, dat2259);
__m512 tmp16607 = _mm512_add_ps(in2314, in2318);
__m512 tmp16612 = _mm512_add_ps(in2322, in2326);
__m512 tmp16608 = _mm512_sub_ps(in2317, in2315);
__m512 tmp16613 = _mm512_sub_ps(in2325, in2323);
__m512 tmp16609 = _mm512_add_ps(in2315, in2319);
__m512 tmp16614 = _mm512_add_ps(in2323, in2327);
__m512 tmp16610 = _mm512_sub_ps(_mm512_setzero_ps(), in2319);
in2321 = _mm512_sub_ps(in2321, in2327);
tmp16607 = _mm512_fmadd_ps(in2316, _mm512_set1_ps(-4.25e+00f), tmp16607);
tmp16612 = _mm512_fmadd_ps(in2324, _mm512_set1_ps(-4.25e+00f), tmp16612);
tmp16609 = _mm512_fmadd_ps(in2317, _mm512_set1_ps(-4.25e+00f), tmp16609);
tmp16614 = _mm512_fmadd_ps(in2325, _mm512_set1_ps(-4.25e+00f), tmp16614);
tmp16610 = _mm512_fmadd_ps(tmp16608, _mm512_set1_ps(5.25e+00f), tmp16610);
in2321 = _mm512_fmadd_ps(tmp16613, _mm512_set1_ps(5.25e+00f), in2321);
tmp16608 = _mm512_fmadd_ps(in2315, _mm512_set1_ps(2.5e-01f), in2319);
tmp16613 = _mm512_fmadd_ps(in2323, _mm512_set1_ps(2.5e-01f), in2327);
in2315 = _mm512_fmadd_ps(in2315, _mm512_set1_ps(4e+00f), in2319);
in2323 = _mm512_fmadd_ps(in2323, _mm512_set1_ps(4e+00f), in2327);
__m512 tmp16611 = _mm512_sub_ps(tmp16609, tmp16607);
__m512 tmp16615 = _mm512_sub_ps(tmp16614, tmp16612);
tmp16609 = _mm512_add_ps(tmp16607, tmp16609);
tmp16614 = _mm512_add_ps(tmp16612, tmp16614);
tmp16607 = _mm512_fmadd_ps(in2314, _mm512_set1_ps(2.5e-01f), in2318);
tmp16612 = _mm512_fmadd_ps(in2322, _mm512_set1_ps(2.5e-01f), in2326);
tmp16608 = _mm512_fmadd_ps(in2317, _mm512_set1_ps(-1.25e+00f), tmp16608);
tmp16613 = _mm512_fmadd_ps(in2325, _mm512_set1_ps(-1.25e+00f), tmp16613);
in2317 = _mm512_fmadd_ps(in2317, _mm512_set1_ps(-5e+00f), in2315);
in2325 = _mm512_fmadd_ps(in2325, _mm512_set1_ps(-5e+00f), in2323);
tmp16607 = _mm512_fmadd_ps(in2316, _mm512_set1_ps(-1.25e+00f), tmp16607);
tmp16612 = _mm512_fmadd_ps(in2324, _mm512_set1_ps(-1.25e+00f), tmp16612);
in2319 = _mm512_fmadd_ps(tmp16607, _mm512_set1_ps(2e+00f), tmp16608);
in2327 = _mm512_fmadd_ps(tmp16612, _mm512_set1_ps(2e+00f), tmp16613);
tmp16608 = _mm512_fnmadd_ps(tmp16607, _mm512_set1_ps(2e+00f), tmp16608);
tmp16613 = _mm512_fnmadd_ps(tmp16612, _mm512_set1_ps(2e+00f), tmp16613);
tmp16607 = _mm512_fmadd_ps(in2318, _mm512_set1_ps(2.5e-01f), in2314);
tmp16612 = _mm512_fmadd_ps(in2326, _mm512_set1_ps(2.5e-01f), in2322);
in2314 = _mm512_sub_ps(in2320, in2314);
in2322 = _mm512_sub_ps(in2328, in2322);
tmp16607 = _mm512_fmadd_ps(in2316, _mm512_set1_ps(-1.25e+00f), tmp16607);
tmp16612 = _mm512_fmadd_ps(in2324, _mm512_set1_ps(-1.25e+00f), tmp16612);
in2316 = _mm512_sub_ps(in2316, in2318);
in2324 = _mm512_sub_ps(in2324, in2326);
in2316 = _mm512_fmadd_ps(in2316, _mm512_set1_ps(5.25e+00f), in2314);
in2324 = _mm512_fmadd_ps(in2324, _mm512_set1_ps(5.25e+00f), in2322);
in2315 = _mm512_fmadd_ps(tmp16607, _mm512_set1_ps(2e+00f), in2317);
in2323 = _mm512_fmadd_ps(tmp16612, _mm512_set1_ps(2e+00f), in2325);
in2317 = _mm512_fnmadd_ps(tmp16607, _mm512_set1_ps(2e+00f), in2317);
in2325 = _mm512_fnmadd_ps(tmp16612, _mm512_set1_ps(2e+00f), in2325);
__m512 tmp16624 = _mm512_unpacklo_ps(tmp16610, tmp16609);
__m512 tmp16625 = _mm512_unpackhi_ps(tmp16610, tmp16609);
__m512 tmp16626 = _mm512_unpacklo_ps(tmp16611, in2319);
__m512 tmp16627 = _mm512_unpackhi_ps(tmp16611, in2319);
__m512 tmp16628 = _mm512_unpacklo_ps(tmp16608, in2315);
__m512 tmp16629 = _mm512_unpackhi_ps(tmp16608, in2315);
__m512 tmp16630 = _mm512_unpacklo_ps(in2317, in2316);
__m512 tmp16631 = _mm512_unpackhi_ps(in2317, in2316);
__m512 tmp16632 = _mm512_unpacklo_ps(in2321, tmp16614);
__m512 tmp16633 = _mm512_unpackhi_ps(in2321, tmp16614);
__m512 tmp16634 = _mm512_unpacklo_ps(tmp16615, in2327);
__m512 tmp16635 = _mm512_unpackhi_ps(tmp16615, in2327);
__m512 tmp16636 = _mm512_unpacklo_ps(tmp16613, in2323);
__m512 tmp16637 = _mm512_unpackhi_ps(tmp16613, in2323);
__m512 tmp16638 = _mm512_unpacklo_ps(in2325, in2324);
__m512 tmp16639 = _mm512_unpackhi_ps(in2325, in2324);
__m512 tmp16640 = _mm512_shuffle_ps(tmp16624, tmp16626, 68);
__m512 tmp16641 = _mm512_shuffle_ps(tmp16624, tmp16626, 238);
__m512 tmp16642 = _mm512_shuffle_ps(tmp16625, tmp16627, 68);
__m512 tmp16643 = _mm512_shuffle_ps(tmp16625, tmp16627, 238);
__m512 tmp16644 = _mm512_shuffle_ps(tmp16628, tmp16630, 68);
__m512 tmp16645 = _mm512_shuffle_ps(tmp16628, tmp16630, 238);
__m512 tmp16646 = _mm512_shuffle_ps(tmp16629, tmp16631, 68);
__m512 tmp16647 = _mm512_shuffle_ps(tmp16629, tmp16631, 238);
__m512 tmp16648 = _mm512_shuffle_ps(tmp16632, tmp16634, 68);
__m512 tmp16649 = _mm512_shuffle_ps(tmp16632, tmp16634, 238);
__m512 tmp16650 = _mm512_shuffle_ps(tmp16633, tmp16635, 68);
__m512 tmp16651 = _mm512_shuffle_ps(tmp16633, tmp16635, 238);
__m512 tmp16652 = _mm512_shuffle_ps(tmp16636, tmp16638, 68);
__m512 tmp16653 = _mm512_shuffle_ps(tmp16636, tmp16638, 238);
__m512 tmp16654 = _mm512_shuffle_ps(tmp16637, tmp16639, 68);
__m512 tmp16655 = _mm512_shuffle_ps(tmp16637, tmp16639, 238);
__m512 tmp16656 = _mm512_shuffle_f32x4(tmp16640, tmp16644, 136);
__m512 tmp16657 = _mm512_shuffle_f32x4(tmp16640, tmp16644, 221);
__m512 tmp16658 = _mm512_shuffle_f32x4(tmp16641, tmp16645, 136);
__m512 tmp16659 = _mm512_shuffle_f32x4(tmp16641, tmp16645, 221);
__m512 tmp16660 = _mm512_shuffle_f32x4(tmp16642, tmp16646, 136);
__m512 tmp16661 = _mm512_shuffle_f32x4(tmp16642, tmp16646, 221);
__m512 tmp16662 = _mm512_shuffle_f32x4(tmp16643, tmp16647, 136);
__m512 tmp16663 = _mm512_shuffle_f32x4(tmp16643, tmp16647, 221);
__m512 tmp16664 = _mm512_shuffle_f32x4(tmp16648, tmp16652, 136);
__m512 tmp16665 = _mm512_shuffle_f32x4(tmp16648, tmp16652, 221);
__m512 tmp16666 = _mm512_shuffle_f32x4(tmp16649, tmp16653, 136);
__m512 tmp16667 = _mm512_shuffle_f32x4(tmp16649, tmp16653, 221);
__m512 tmp16668 = _mm512_shuffle_f32x4(tmp16650, tmp16654, 136);
__m512 tmp16669 = _mm512_shuffle_f32x4(tmp16650, tmp16654, 221);
__m512 tmp16670 = _mm512_shuffle_f32x4(tmp16651, tmp16655, 136);
__m512 tmp16671 = _mm512_shuffle_f32x4(tmp16651, tmp16655, 221);
tmp16610 = _mm512_shuffle_f32x4(tmp16656, tmp16664, 136);
in2321 = _mm512_shuffle_f32x4(tmp16656, tmp16664, 221);
tmp16609 = _mm512_shuffle_f32x4(tmp16658, tmp16666, 136);
tmp16614 = _mm512_shuffle_f32x4(tmp16658, tmp16666, 221);
tmp16611 = _mm512_shuffle_f32x4(tmp16660, tmp16668, 136);
tmp16615 = _mm512_shuffle_f32x4(tmp16660, tmp16668, 221);
in2319 = _mm512_shuffle_f32x4(tmp16662, tmp16670, 136);
in2327 = _mm512_shuffle_f32x4(tmp16662, tmp16670, 221);
tmp16608 = _mm512_shuffle_f32x4(tmp16657, tmp16665, 136);
tmp16613 = _mm512_shuffle_f32x4(tmp16657, tmp16665, 221);
in2315 = _mm512_shuffle_f32x4(tmp16659, tmp16667, 136);
in2323 = _mm512_shuffle_f32x4(tmp16659, tmp16667, 221);
in2317 = _mm512_shuffle_f32x4(tmp16661, tmp16669, 136);
in2325 = _mm512_shuffle_f32x4(tmp16661, tmp16669, 221);
in2316 = _mm512_shuffle_f32x4(tmp16663, tmp16671, 136);
in2324 = _mm512_shuffle_f32x4(tmp16663, tmp16671, 221);
__m512 tmp16616 = _mm512_add_ps(tmp16609, in2315);
__m512 tmp16620 = _mm512_add_ps(tmp16614, in2323);
__m512 tmp16617 = _mm512_sub_ps(tmp16608, tmp16611);
__m512 tmp16621 = _mm512_sub_ps(tmp16613, tmp16615);
__m512 tmp16618 = _mm512_add_ps(tmp16611, in2317);
__m512 tmp16622 = _mm512_add_ps(tmp16615, in2325);
tmp16610 = _mm512_sub_ps(tmp16610, in2317);
in2321 = _mm512_sub_ps(in2321, in2325);
tmp16616 = _mm512_fmadd_ps(in2319, _mm512_set1_ps(-4.25e+00f), tmp16616);
tmp16620 = _mm512_fmadd_ps(in2327, _mm512_set1_ps(-4.25e+00f), tmp16620);
tmp16618 = _mm512_fmadd_ps(tmp16608, _mm512_set1_ps(-4.25e+00f), tmp16618);
tmp16622 = _mm512_fmadd_ps(tmp16613, _mm512_set1_ps(-4.25e+00f), tmp16622);
tmp16610 = _mm512_fmadd_ps(tmp16617, _mm512_set1_ps(5.25e+00f), tmp16610);
in2321 = _mm512_fmadd_ps(tmp16621, _mm512_set1_ps(5.25e+00f), in2321);
tmp16617 = _mm512_fmadd_ps(tmp16611, _mm512_set1_ps(2.5e-01f), in2317);
tmp16621 = _mm512_fmadd_ps(tmp16615, _mm512_set1_ps(2.5e-01f), in2325);
tmp16611 = _mm512_fmadd_ps(tmp16611, _mm512_set1_ps(4e+00f), in2317);
tmp16615 = _mm512_fmadd_ps(tmp16615, _mm512_set1_ps(4e+00f), in2325);
__m512 tmp16619 = _mm512_sub_ps(tmp16618, tmp16616);
__m512 tmp16623 = _mm512_sub_ps(tmp16622, tmp16620);
tmp16618 = _mm512_add_ps(tmp16616, tmp16618);
tmp16622 = _mm512_add_ps(tmp16620, tmp16622);
tmp16616 = _mm512_fmadd_ps(tmp16609, _mm512_set1_ps(2.5e-01f), in2315);
tmp16620 = _mm512_fmadd_ps(tmp16614, _mm512_set1_ps(2.5e-01f), in2323);
tmp16617 = _mm512_fmadd_ps(tmp16608, _mm512_set1_ps(-1.25e+00f), tmp16617);
tmp16621 = _mm512_fmadd_ps(tmp16613, _mm512_set1_ps(-1.25e+00f), tmp16621);
tmp16608 = _mm512_fmadd_ps(tmp16608, _mm512_set1_ps(-5e+00f), tmp16611);
tmp16613 = _mm512_fmadd_ps(tmp16613, _mm512_set1_ps(-5e+00f), tmp16615);
tmp16616 = _mm512_fmadd_ps(in2319, _mm512_set1_ps(-1.25e+00f), tmp16616);
tmp16620 = _mm512_fmadd_ps(in2327, _mm512_set1_ps(-1.25e+00f), tmp16620);
in2317 = _mm512_fmadd_ps(tmp16616, _mm512_set1_ps(2e+00f), tmp16617);
in2325 = _mm512_fmadd_ps(tmp16620, _mm512_set1_ps(2e+00f), tmp16621);
tmp16617 = _mm512_fnmadd_ps(tmp16616, _mm512_set1_ps(2e+00f), tmp16617);
tmp16621 = _mm512_fnmadd_ps(tmp16620, _mm512_set1_ps(2e+00f), tmp16621);
tmp16616 = _mm512_fmadd_ps(in2315, _mm512_set1_ps(2.5e-01f), tmp16609);
tmp16620 = _mm512_fmadd_ps(in2323, _mm512_set1_ps(2.5e-01f), tmp16614);
tmp16609 = _mm512_sub_ps(in2316, tmp16609);
tmp16614 = _mm512_sub_ps(in2324, tmp16614);
tmp16616 = _mm512_fmadd_ps(in2319, _mm512_set1_ps(-1.25e+00f), tmp16616);
tmp16620 = _mm512_fmadd_ps(in2327, _mm512_set1_ps(-1.25e+00f), tmp16620);
in2319 = _mm512_sub_ps(in2319, in2315);
in2327 = _mm512_sub_ps(in2327, in2323);
in2319 = _mm512_fmadd_ps(in2319, _mm512_set1_ps(5.25e+00f), tmp16609);
in2327 = _mm512_fmadd_ps(in2327, _mm512_set1_ps(5.25e+00f), tmp16614);
tmp16611 = _mm512_fmadd_ps(tmp16616, _mm512_set1_ps(2e+00f), tmp16608);
tmp16615 = _mm512_fmadd_ps(tmp16620, _mm512_set1_ps(2e+00f), tmp16613);
tmp16608 = _mm512_fnmadd_ps(tmp16616, _mm512_set1_ps(2e+00f), tmp16608);
tmp16613 = _mm512_fnmadd_ps(tmp16620, _mm512_set1_ps(2e+00f), tmp16613);
// Pack the 16 vectors produced by the transform pass above into store order.
// _mm512_shuffle_f32x4 with immediate 68 yields {a.q0, a.q1, b.q0, b.q1}
// (the low 256 bits of each operand); immediate 238 yields
// {a.q2, a.q3, b.q2, b.q3} (the high 256 bits of each operand).
__m512 out2127 = _mm512_shuffle_f32x4(tmp16610, tmp16618, 68);
__m512 out2135 = _mm512_shuffle_f32x4(tmp16610, tmp16618, 238);
__m512 out2128 = _mm512_shuffle_f32x4(tmp16619, in2317, 68);
__m512 out2136 = _mm512_shuffle_f32x4(tmp16619, in2317, 238);
__m512 out2129 = _mm512_shuffle_f32x4(tmp16617, tmp16611, 68);
__m512 out2137 = _mm512_shuffle_f32x4(tmp16617, tmp16611, 238);
__m512 out2130 = _mm512_shuffle_f32x4(tmp16608, in2319, 68);
__m512 out2138 = _mm512_shuffle_f32x4(tmp16608, in2319, 238);
__m512 out2131 = _mm512_shuffle_f32x4(in2321, tmp16622, 68);
__m512 out2139 = _mm512_shuffle_f32x4(in2321, tmp16622, 238);
__m512 out2132 = _mm512_shuffle_f32x4(tmp16623, in2325, 68);
__m512 out2140 = _mm512_shuffle_f32x4(tmp16623, in2325, 238);
__m512 out2133 = _mm512_shuffle_f32x4(tmp16621, tmp16615, 68);
__m512 out2141 = _mm512_shuffle_f32x4(tmp16621, tmp16615, 238);
__m512 out2134 = _mm512_shuffle_f32x4(tmp16613, in2327, 68);
__m512 out2142 = _mm512_shuffle_f32x4(tmp16613, in2327, 238);
// Write the 16 packed vectors to dfPtr12 with unaligned stores. The constant
// offsets form four groups 147456 apart (0, 147456, 294912, 442368); inside
// each group the four vectors land at +0, +64, +128 and +192.
// NOTE(review): offsets look like byte offsets (64 = one __m512) — confirm
// against the declaration of dfPtr12, which is outside this excerpt.
_mm512_storeu_ps(dfPtr12+0+589824*i53+98304*j45+49152*s43+768*k148, out2127);
_mm512_storeu_ps(dfPtr12+128+589824*i53+98304*j45+49152*s43+768*k148, out2135);
_mm512_storeu_ps(dfPtr12+64+589824*i53+98304*j45+49152*s43+768*k148, out2131);
_mm512_storeu_ps(dfPtr12+192+589824*i53+98304*j45+49152*s43+768*k148, out2139);
_mm512_storeu_ps(dfPtr12+147456+589824*i53+98304*j45+49152*s43+768*k148, out2128);
_mm512_storeu_ps(dfPtr12+147584+589824*i53+98304*j45+49152*s43+768*k148, out2136);
_mm512_storeu_ps(dfPtr12+147520+589824*i53+98304*j45+49152*s43+768*k148, out2132);
_mm512_storeu_ps(dfPtr12+147648+589824*i53+98304*j45+49152*s43+768*k148, out2140);
_mm512_storeu_ps(dfPtr12+294912+589824*i53+98304*j45+49152*s43+768*k148, out2129);
_mm512_storeu_ps(dfPtr12+295040+589824*i53+98304*j45+49152*s43+768*k148, out2137);
_mm512_storeu_ps(dfPtr12+294976+589824*i53+98304*j45+49152*s43+768*k148, out2133);
_mm512_storeu_ps(dfPtr12+295104+589824*i53+98304*j45+49152*s43+768*k148, out2141);
_mm512_storeu_ps(dfPtr12+442368+589824*i53+98304*j45+49152*s43+768*k148, out2130);
_mm512_storeu_ps(dfPtr12+442496+589824*i53+98304*j45+49152*s43+768*k148, out2138);
_mm512_storeu_ps(dfPtr12+442432+589824*i53+98304*j45+49152*s43+768*k148, out2134);
_mm512_storeu_ps(dfPtr12+442560+589824*i53+98304*j45+49152*s43+768*k148, out2142);
// Load phase for the next pair of input groups. Every vector is read with a
// zero-masking unaligned load from datPtr27, clamped with max(0, x) (a ReLU),
// and then rearranged by a lane permute.
//   Group one: in2329..in2336, eight loads at constant offsets 360..752
//              (step 56), mask 511 = lanes 0-8 enabled, permuted with pm224.
//   Group two: in2337..in2343, seven loads at constant offsets 892..1228
//              (step 56), mask 8191 = lanes 0-12 enabled, permuted with pm225.
// Lane 15 is disabled by both masks, so permute index 15 always selects 0.0f.
// NOTE(review): the step of 56 presumably is one image row — confirm.
__m512 dat2260 = _mm512_maskz_loadu_ps(511, datPtr27+360+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2260 = _mm512_max_ps(_mm512_setzero_ps(), dat2260);
// pm224 (arguments are listed lane 15 first): lanes 0-7 <- elements 0-7,
// lanes 8-10 <- elements 6-8, lanes 11-15 <- zero.
__m512i pm224 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in2329 = _mm512_permutexvar_ps(pm224, dat2260);
__m512 dat2261 = _mm512_maskz_loadu_ps(511, datPtr27+416+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2261 = _mm512_max_ps(_mm512_setzero_ps(), dat2261);
__m512 dat2262 = _mm512_maskz_loadu_ps(8191, datPtr27+892+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2262 = _mm512_max_ps(_mm512_setzero_ps(), dat2262);
__m512 in2330 = _mm512_permutexvar_ps(pm224, dat2261);
// pm225: lane 0 <- zero, lanes 1-7 <- elements 0-6, lanes 8-15 <- elements 5-12.
__m512i pm225 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in2337 = _mm512_permutexvar_ps(pm225, dat2262);
__m512 dat2263 = _mm512_maskz_loadu_ps(511, datPtr27+472+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2263 = _mm512_max_ps(_mm512_setzero_ps(), dat2263);
__m512 dat2264 = _mm512_maskz_loadu_ps(8191, datPtr27+948+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2264 = _mm512_max_ps(_mm512_setzero_ps(), dat2264);
__m512 in2331 = _mm512_permutexvar_ps(pm224, dat2263);
__m512 in2338 = _mm512_permutexvar_ps(pm225, dat2264);
__m512 dat2265 = _mm512_maskz_loadu_ps(511, datPtr27+528+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2265 = _mm512_max_ps(_mm512_setzero_ps(), dat2265);
__m512 dat2266 = _mm512_maskz_loadu_ps(8191, datPtr27+1004+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2266 = _mm512_max_ps(_mm512_setzero_ps(), dat2266);
__m512 in2332 = _mm512_permutexvar_ps(pm224, dat2265);
__m512 in2339 = _mm512_permutexvar_ps(pm225, dat2266);
__m512 dat2267 = _mm512_maskz_loadu_ps(511, datPtr27+584+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2267 = _mm512_max_ps(_mm512_setzero_ps(), dat2267);
__m512 dat2268 = _mm512_maskz_loadu_ps(8191, datPtr27+1060+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2268 = _mm512_max_ps(_mm512_setzero_ps(), dat2268);
__m512 in2333 = _mm512_permutexvar_ps(pm224, dat2267);
__m512 in2340 = _mm512_permutexvar_ps(pm225, dat2268);
__m512 dat2269 = _mm512_maskz_loadu_ps(511, datPtr27+640+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2269 = _mm512_max_ps(_mm512_setzero_ps(), dat2269);
__m512 dat2270 = _mm512_maskz_loadu_ps(8191, datPtr27+1116+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2270 = _mm512_max_ps(_mm512_setzero_ps(), dat2270);
__m512 in2334 = _mm512_permutexvar_ps(pm224, dat2269);
__m512 in2341 = _mm512_permutexvar_ps(pm225, dat2270);
__m512 dat2271 = _mm512_maskz_loadu_ps(511, datPtr27+696+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2271 = _mm512_max_ps(_mm512_setzero_ps(), dat2271);
__m512 dat2272 = _mm512_maskz_loadu_ps(8191, datPtr27+1172+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2272 = _mm512_max_ps(_mm512_setzero_ps(), dat2272);
__m512 in2335 = _mm512_permutexvar_ps(pm224, dat2271);
__m512 in2342 = _mm512_permutexvar_ps(pm225, dat2272);
__m512 dat2273 = _mm512_maskz_loadu_ps(511, datPtr27+752+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2273 = _mm512_max_ps(_mm512_setzero_ps(), dat2273);
__m512 dat2274 = _mm512_maskz_loadu_ps(8191, datPtr27+1228+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2274 = _mm512_max_ps(_mm512_setzero_ps(), dat2274);
__m512 in2336 = _mm512_permutexvar_ps(pm224, dat2273);
__m512 in2343 = _mm512_permutexvar_ps(pm225, dat2274);
// First 8-point transform pass, computed lane-wise across the loaded vectors.
// Two independent groups are interleaved statement by statement:
//   group one uses in2329..in2336 as points 0..7;
//   group two uses in2337..in2343 as points 1..7 and has no point 0, which is
//   treated as zero (see the setzero in the tmp16679 initialisation).
// Expanding the fused multiply-adds gives, for points x0..x7:
//   y0 = x0 - 5.25*x2 + 5.25*x4 - x6
//   y1 = x1 + x2 - 4.25*x3 - 4.25*x4 + x5 + x6
//   y2 = -x1 + x2 + 4.25*x3 - 4.25*x4 - x5 + x6
//   y3 = 0.5*x1 + 0.25*x2 - 2.5*x3 - 1.25*x4 + 2*x5 + x6
//   y4 = -0.5*x1 + 0.25*x2 + 2.5*x3 - 1.25*x4 - 2*x5 + x6
//   y5 = 2*x1 + 4*x2 - 2.5*x3 - 5*x4 + 0.5*x5 + x6
//   y6 = -2*x1 + 4*x2 + 2.5*x3 - 5*x4 - 0.5*x5 + x6
//   y7 = -x1 + 5.25*x3 - 5.25*x5 + x7
// NOTE(review): these coefficients match the Winograd F(6,3) input transform;
// confirm against the generator.
// Results y0..y7 end up in
//   group one: in2329, tmp16674, tmp16675, in2335, tmp16673, in2331, in2333, in2332
//   group two: tmp16679, tmp16678, tmp16680, in2342, tmp16677, in2338, in2340, in2339
// Inputs are overwritten in place, so the statement order must be preserved.
__m512 tmp16672 = _mm512_add_ps(in2330, in2334);
__m512 tmp16676 = _mm512_add_ps(in2337, in2341);
__m512 tmp16673 = _mm512_sub_ps(in2333, in2331);
__m512 tmp16677 = _mm512_sub_ps(in2340, in2338);
__m512 tmp16674 = _mm512_add_ps(in2331, in2335);
__m512 tmp16678 = _mm512_add_ps(in2338, in2342);
in2329 = _mm512_sub_ps(in2329, in2335);
// Group two has no loaded point 0, so its y0 accumulator starts from 0 - x6.
__m512 tmp16679 = _mm512_sub_ps(_mm512_setzero_ps(), in2342);
tmp16672 = _mm512_fmadd_ps(in2332, _mm512_set1_ps(-4.25e+00f), tmp16672);
tmp16676 = _mm512_fmadd_ps(in2339, _mm512_set1_ps(-4.25e+00f), tmp16676);
tmp16674 = _mm512_fmadd_ps(in2333, _mm512_set1_ps(-4.25e+00f), tmp16674);
tmp16678 = _mm512_fmadd_ps(in2340, _mm512_set1_ps(-4.25e+00f), tmp16678);
in2329 = _mm512_fmadd_ps(tmp16673, _mm512_set1_ps(5.25e+00f), in2329);
tmp16679 = _mm512_fmadd_ps(tmp16677, _mm512_set1_ps(5.25e+00f), tmp16679);
tmp16673 = _mm512_fmadd_ps(in2331, _mm512_set1_ps(2.5e-01f), in2335);
tmp16677 = _mm512_fmadd_ps(in2338, _mm512_set1_ps(2.5e-01f), in2342);
in2331 = _mm512_fmadd_ps(in2331, _mm512_set1_ps(4e+00f), in2335);
in2338 = _mm512_fmadd_ps(in2338, _mm512_set1_ps(4e+00f), in2342);
// y2 = even part - odd part, y1 = even part + odd part.
__m512 tmp16675 = _mm512_sub_ps(tmp16674, tmp16672);
__m512 tmp16680 = _mm512_sub_ps(tmp16678, tmp16676);
tmp16674 = _mm512_add_ps(tmp16672, tmp16674);
tmp16678 = _mm512_add_ps(tmp16676, tmp16678);
tmp16672 = _mm512_fmadd_ps(in2330, _mm512_set1_ps(2.5e-01f), in2334);
tmp16676 = _mm512_fmadd_ps(in2337, _mm512_set1_ps(2.5e-01f), in2341);
tmp16673 = _mm512_fmadd_ps(in2333, _mm512_set1_ps(-1.25e+00f), tmp16673);
tmp16677 = _mm512_fmadd_ps(in2340, _mm512_set1_ps(-1.25e+00f), tmp16677);
in2333 = _mm512_fmadd_ps(in2333, _mm512_set1_ps(-5e+00f), in2331);
in2340 = _mm512_fmadd_ps(in2340, _mm512_set1_ps(-5e+00f), in2338);
tmp16672 = _mm512_fmadd_ps(in2332, _mm512_set1_ps(-1.25e+00f), tmp16672);
tmp16676 = _mm512_fmadd_ps(in2339, _mm512_set1_ps(-1.25e+00f), tmp16676);
// y3 / y4: even part plus / minus twice the odd part.
in2335 = _mm512_fmadd_ps(tmp16672, _mm512_set1_ps(2e+00f), tmp16673);
in2342 = _mm512_fmadd_ps(tmp16676, _mm512_set1_ps(2e+00f), tmp16677);
tmp16673 = _mm512_fnmadd_ps(tmp16672, _mm512_set1_ps(2e+00f), tmp16673);
tmp16677 = _mm512_fnmadd_ps(tmp16676, _mm512_set1_ps(2e+00f), tmp16677);
tmp16672 = _mm512_fmadd_ps(in2334, _mm512_set1_ps(2.5e-01f), in2330);
tmp16676 = _mm512_fmadd_ps(in2341, _mm512_set1_ps(2.5e-01f), in2337);
in2330 = _mm512_sub_ps(in2336, in2330);
in2337 = _mm512_sub_ps(in2343, in2337);
tmp16672 = _mm512_fmadd_ps(in2332, _mm512_set1_ps(-1.25e+00f), tmp16672);
tmp16676 = _mm512_fmadd_ps(in2339, _mm512_set1_ps(-1.25e+00f), tmp16676);
in2332 = _mm512_sub_ps(in2332, in2334);
in2339 = _mm512_sub_ps(in2339, in2341);
// y7 = 5.25*(x3 - x5) + (x7 - x1).
in2332 = _mm512_fmadd_ps(in2332, _mm512_set1_ps(5.25e+00f), in2330);
in2339 = _mm512_fmadd_ps(in2339, _mm512_set1_ps(5.25e+00f), in2337);
// y5 / y6: even part plus / minus twice the odd part.
in2331 = _mm512_fmadd_ps(tmp16672, _mm512_set1_ps(2e+00f), in2333);
in2338 = _mm512_fmadd_ps(tmp16676, _mm512_set1_ps(2e+00f), in2340);
in2333 = _mm512_fnmadd_ps(tmp16672, _mm512_set1_ps(2e+00f), in2333);
in2340 = _mm512_fnmadd_ps(tmp16676, _mm512_set1_ps(2e+00f), in2340);
// Transpose the 16 first-pass results (8 per group) viewed as a 16x16 matrix
// of floats, so that the second pass can again work lane-wise.
// Stage 1: unpacklo/unpackhi interleave adjacent result pairs inside each
// 128-bit lane. Operand order is y0..y7 of group one, then y0..y7 of group two.
__m512 tmp16689 = _mm512_unpacklo_ps(in2329, tmp16674);
__m512 tmp16690 = _mm512_unpackhi_ps(in2329, tmp16674);
__m512 tmp16691 = _mm512_unpacklo_ps(tmp16675, in2335);
__m512 tmp16692 = _mm512_unpackhi_ps(tmp16675, in2335);
__m512 tmp16693 = _mm512_unpacklo_ps(tmp16673, in2331);
__m512 tmp16694 = _mm512_unpackhi_ps(tmp16673, in2331);
__m512 tmp16695 = _mm512_unpacklo_ps(in2333, in2332);
__m512 tmp16696 = _mm512_unpackhi_ps(in2333, in2332);
__m512 tmp16697 = _mm512_unpacklo_ps(tmp16679, tmp16678);
__m512 tmp16698 = _mm512_unpackhi_ps(tmp16679, tmp16678);
__m512 tmp16699 = _mm512_unpacklo_ps(tmp16680, in2342);
__m512 tmp16700 = _mm512_unpackhi_ps(tmp16680, in2342);
__m512 tmp16701 = _mm512_unpacklo_ps(tmp16677, in2338);
__m512 tmp16702 = _mm512_unpackhi_ps(tmp16677, in2338);
__m512 tmp16703 = _mm512_unpacklo_ps(in2340, in2339);
__m512 tmp16704 = _mm512_unpackhi_ps(in2340, in2339);
// Stage 2: shuffle_ps with 68 (low pairs) / 238 (high pairs) completes a 4x4
// transpose inside every 128-bit lane: each lane now holds one source column
// taken from four consecutive inputs.
__m512 tmp16705 = _mm512_shuffle_ps(tmp16689, tmp16691, 68);
__m512 tmp16706 = _mm512_shuffle_ps(tmp16689, tmp16691, 238);
__m512 tmp16707 = _mm512_shuffle_ps(tmp16690, tmp16692, 68);
__m512 tmp16708 = _mm512_shuffle_ps(tmp16690, tmp16692, 238);
__m512 tmp16709 = _mm512_shuffle_ps(tmp16693, tmp16695, 68);
__m512 tmp16710 = _mm512_shuffle_ps(tmp16693, tmp16695, 238);
__m512 tmp16711 = _mm512_shuffle_ps(tmp16694, tmp16696, 68);
__m512 tmp16712 = _mm512_shuffle_ps(tmp16694, tmp16696, 238);
__m512 tmp16713 = _mm512_shuffle_ps(tmp16697, tmp16699, 68);
__m512 tmp16714 = _mm512_shuffle_ps(tmp16697, tmp16699, 238);
__m512 tmp16715 = _mm512_shuffle_ps(tmp16698, tmp16700, 68);
__m512 tmp16716 = _mm512_shuffle_ps(tmp16698, tmp16700, 238);
__m512 tmp16717 = _mm512_shuffle_ps(tmp16701, tmp16703, 68);
__m512 tmp16718 = _mm512_shuffle_ps(tmp16701, tmp16703, 238);
__m512 tmp16719 = _mm512_shuffle_ps(tmp16702, tmp16704, 68);
__m512 tmp16720 = _mm512_shuffle_ps(tmp16702, tmp16704, 238);
// Stage 3: shuffle_f32x4 with 136 picks 128-bit lanes {a0, a2, b0, b2} and 221
// picks {a1, a3, b1, b3}, joining inputs 0-3 with inputs 4-7 of the same group.
__m512 tmp16721 = _mm512_shuffle_f32x4(tmp16705, tmp16709, 136);
__m512 tmp16722 = _mm512_shuffle_f32x4(tmp16705, tmp16709, 221);
__m512 tmp16723 = _mm512_shuffle_f32x4(tmp16706, tmp16710, 136);
__m512 tmp16724 = _mm512_shuffle_f32x4(tmp16706, tmp16710, 221);
__m512 tmp16725 = _mm512_shuffle_f32x4(tmp16707, tmp16711, 136);
__m512 tmp16726 = _mm512_shuffle_f32x4(tmp16707, tmp16711, 221);
__m512 tmp16727 = _mm512_shuffle_f32x4(tmp16708, tmp16712, 136);
__m512 tmp16728 = _mm512_shuffle_f32x4(tmp16708, tmp16712, 221);
__m512 tmp16729 = _mm512_shuffle_f32x4(tmp16713, tmp16717, 136);
__m512 tmp16730 = _mm512_shuffle_f32x4(tmp16713, tmp16717, 221);
__m512 tmp16731 = _mm512_shuffle_f32x4(tmp16714, tmp16718, 136);
__m512 tmp16732 = _mm512_shuffle_f32x4(tmp16714, tmp16718, 221);
__m512 tmp16733 = _mm512_shuffle_f32x4(tmp16715, tmp16719, 136);
__m512 tmp16734 = _mm512_shuffle_f32x4(tmp16715, tmp16719, 221);
__m512 tmp16735 = _mm512_shuffle_f32x4(tmp16716, tmp16720, 136);
__m512 tmp16736 = _mm512_shuffle_f32x4(tmp16716, tmp16720, 221);
// Stage 4: the same 136/221 selection joins group one (lanes 0-7 of the
// result) with group two (lanes 8-15). The existing registers are reused;
// source column c of the 16 inputs now lives in:
//   c0..c7  : in2329, tmp16674, tmp16675, in2335, tmp16673, in2331, in2333, in2332
//   c8..c15 : tmp16679, tmp16678, tmp16680, in2342, tmp16677, in2338, in2340, in2339
in2329 = _mm512_shuffle_f32x4(tmp16721, tmp16729, 136);
tmp16679 = _mm512_shuffle_f32x4(tmp16721, tmp16729, 221);
tmp16674 = _mm512_shuffle_f32x4(tmp16723, tmp16731, 136);
tmp16678 = _mm512_shuffle_f32x4(tmp16723, tmp16731, 221);
tmp16675 = _mm512_shuffle_f32x4(tmp16725, tmp16733, 136);
tmp16680 = _mm512_shuffle_f32x4(tmp16725, tmp16733, 221);
in2335 = _mm512_shuffle_f32x4(tmp16727, tmp16735, 136);
in2342 = _mm512_shuffle_f32x4(tmp16727, tmp16735, 221);
tmp16673 = _mm512_shuffle_f32x4(tmp16722, tmp16730, 136);
tmp16677 = _mm512_shuffle_f32x4(tmp16722, tmp16730, 221);
in2331 = _mm512_shuffle_f32x4(tmp16724, tmp16732, 136);
in2338 = _mm512_shuffle_f32x4(tmp16724, tmp16732, 221);
in2333 = _mm512_shuffle_f32x4(tmp16726, tmp16734, 136);
in2340 = _mm512_shuffle_f32x4(tmp16726, tmp16734, 221);
in2332 = _mm512_shuffle_f32x4(tmp16728, tmp16736, 136);
in2339 = _mm512_shuffle_f32x4(tmp16728, tmp16736, 221);
// Second 8-point transform pass, applied to the transposed vectors. It uses
// the same linear combinations as the first pass (coefficients 5.25, -4.25,
// 0.25, -1.25, 4, -5, 2), with two point sets interleaved statement by
// statement:
//   set one, points 0..7: in2329, tmp16674, tmp16675, in2335, tmp16673, in2331, in2333, in2332
//   set two, points 0..7: tmp16679, tmp16678, tmp16680, in2342, tmp16677, in2338, in2340, in2339
// Unlike the first pass, set two has a real point 0 here (tmp16679).
// Results 0..7 end up in
//   set one: in2329, tmp16683, tmp16684, in2333, tmp16682, tmp16675, tmp16673, in2335
//   set two: tmp16679, tmp16687, tmp16688, in2340, tmp16686, tmp16680, tmp16677, in2342
// Inputs are overwritten in place, so the statement order must be preserved.
__m512 tmp16681 = _mm512_add_ps(tmp16674, in2331);
__m512 tmp16685 = _mm512_add_ps(tmp16678, in2338);
__m512 tmp16682 = _mm512_sub_ps(tmp16673, tmp16675);
__m512 tmp16686 = _mm512_sub_ps(tmp16677, tmp16680);
__m512 tmp16683 = _mm512_add_ps(tmp16675, in2333);
__m512 tmp16687 = _mm512_add_ps(tmp16680, in2340);
in2329 = _mm512_sub_ps(in2329, in2333);
tmp16679 = _mm512_sub_ps(tmp16679, in2340);
tmp16681 = _mm512_fmadd_ps(in2335, _mm512_set1_ps(-4.25e+00f), tmp16681);
tmp16685 = _mm512_fmadd_ps(in2342, _mm512_set1_ps(-4.25e+00f), tmp16685);
tmp16683 = _mm512_fmadd_ps(tmp16673, _mm512_set1_ps(-4.25e+00f), tmp16683);
tmp16687 = _mm512_fmadd_ps(tmp16677, _mm512_set1_ps(-4.25e+00f), tmp16687);
in2329 = _mm512_fmadd_ps(tmp16682, _mm512_set1_ps(5.25e+00f), in2329);
tmp16679 = _mm512_fmadd_ps(tmp16686, _mm512_set1_ps(5.25e+00f), tmp16679);
tmp16682 = _mm512_fmadd_ps(tmp16675, _mm512_set1_ps(2.5e-01f), in2333);
tmp16686 = _mm512_fmadd_ps(tmp16680, _mm512_set1_ps(2.5e-01f), in2340);
tmp16675 = _mm512_fmadd_ps(tmp16675, _mm512_set1_ps(4e+00f), in2333);
tmp16680 = _mm512_fmadd_ps(tmp16680, _mm512_set1_ps(4e+00f), in2340);
// Result 2 = even part - odd part, result 1 = even part + odd part.
__m512 tmp16684 = _mm512_sub_ps(tmp16683, tmp16681);
__m512 tmp16688 = _mm512_sub_ps(tmp16687, tmp16685);
tmp16683 = _mm512_add_ps(tmp16681, tmp16683);
tmp16687 = _mm512_add_ps(tmp16685, tmp16687);
tmp16681 = _mm512_fmadd_ps(tmp16674, _mm512_set1_ps(2.5e-01f), in2331);
tmp16685 = _mm512_fmadd_ps(tmp16678, _mm512_set1_ps(2.5e-01f), in2338);
tmp16682 = _mm512_fmadd_ps(tmp16673, _mm512_set1_ps(-1.25e+00f), tmp16682);
tmp16686 = _mm512_fmadd_ps(tmp16677, _mm512_set1_ps(-1.25e+00f), tmp16686);
tmp16673 = _mm512_fmadd_ps(tmp16673, _mm512_set1_ps(-5e+00f), tmp16675);
tmp16677 = _mm512_fmadd_ps(tmp16677, _mm512_set1_ps(-5e+00f), tmp16680);
tmp16681 = _mm512_fmadd_ps(in2335, _mm512_set1_ps(-1.25e+00f), tmp16681);
tmp16685 = _mm512_fmadd_ps(in2342, _mm512_set1_ps(-1.25e+00f), tmp16685);
// Results 3 / 4: even part plus / minus twice the odd part.
in2333 = _mm512_fmadd_ps(tmp16681, _mm512_set1_ps(2e+00f), tmp16682);
in2340 = _mm512_fmadd_ps(tmp16685, _mm512_set1_ps(2e+00f), tmp16686);
tmp16682 = _mm512_fnmadd_ps(tmp16681, _mm512_set1_ps(2e+00f), tmp16682);
tmp16686 = _mm512_fnmadd_ps(tmp16685, _mm512_set1_ps(2e+00f), tmp16686);
tmp16681 = _mm512_fmadd_ps(in2331, _mm512_set1_ps(2.5e-01f), tmp16674);
tmp16685 = _mm512_fmadd_ps(in2338, _mm512_set1_ps(2.5e-01f), tmp16678);
tmp16674 = _mm512_sub_ps(in2332, tmp16674);
tmp16678 = _mm512_sub_ps(in2339, tmp16678);
tmp16681 = _mm512_fmadd_ps(in2335, _mm512_set1_ps(-1.25e+00f), tmp16681);
tmp16685 = _mm512_fmadd_ps(in2342, _mm512_set1_ps(-1.25e+00f), tmp16685);
in2335 = _mm512_sub_ps(in2335, in2331);
in2342 = _mm512_sub_ps(in2342, in2338);
// Result 7 = 5.25*(p3 - p5) + (p7 - p1).
in2335 = _mm512_fmadd_ps(in2335, _mm512_set1_ps(5.25e+00f), tmp16674);
in2342 = _mm512_fmadd_ps(in2342, _mm512_set1_ps(5.25e+00f), tmp16678);
// Results 5 / 6: even part plus / minus twice the odd part.
tmp16675 = _mm512_fmadd_ps(tmp16681, _mm512_set1_ps(2e+00f), tmp16673);
tmp16680 = _mm512_fmadd_ps(tmp16685, _mm512_set1_ps(2e+00f), tmp16677);
tmp16673 = _mm512_fnmadd_ps(tmp16681, _mm512_set1_ps(2e+00f), tmp16673);
tmp16677 = _mm512_fnmadd_ps(tmp16685, _mm512_set1_ps(2e+00f), tmp16677);
// Pack the 16 second-pass results into store order, pairing results (0,1),
// (2,3), (4,5) and (6,7) of each set. Immediate 68 yields
// {a.q0, a.q1, b.q0, b.q1} (the low 256 bits of each operand); immediate 238
// yields {a.q2, a.q3, b.q2, b.q3} (the high 256 bits of each operand).
__m512 out2143 = _mm512_shuffle_f32x4(in2329, tmp16683, 68);
__m512 out2151 = _mm512_shuffle_f32x4(in2329, tmp16683, 238);
__m512 out2144 = _mm512_shuffle_f32x4(tmp16684, in2333, 68);
__m512 out2152 = _mm512_shuffle_f32x4(tmp16684, in2333, 238);
__m512 out2145 = _mm512_shuffle_f32x4(tmp16682, tmp16675, 68);
__m512 out2153 = _mm512_shuffle_f32x4(tmp16682, tmp16675, 238);
__m512 out2146 = _mm512_shuffle_f32x4(tmp16673, in2335, 68);
__m512 out2154 = _mm512_shuffle_f32x4(tmp16673, in2335, 238);
__m512 out2147 = _mm512_shuffle_f32x4(tmp16679, tmp16687, 68);
__m512 out2155 = _mm512_shuffle_f32x4(tmp16679, tmp16687, 238);
__m512 out2148 = _mm512_shuffle_f32x4(tmp16688, in2340, 68);
__m512 out2156 = _mm512_shuffle_f32x4(tmp16688, in2340, 238);
__m512 out2149 = _mm512_shuffle_f32x4(tmp16686, tmp16680, 68);
__m512 out2157 = _mm512_shuffle_f32x4(tmp16686, tmp16680, 238);
__m512 out2150 = _mm512_shuffle_f32x4(tmp16677, in2342, 68);
__m512 out2158 = _mm512_shuffle_f32x4(tmp16677, in2342, 238);
// Write the 16 packed vectors to dfPtr12 with unaligned stores. The constant
// offsets form four groups 147456 apart (256, 147712, 295168, 442624); inside
// each group the four vectors land at +0, +64, +128 and +192.
// NOTE(review): offsets look like byte offsets (64 = one __m512) — confirm
// against the declaration of dfPtr12, which is outside this excerpt.
_mm512_storeu_ps(dfPtr12+256+589824*i53+98304*j45+49152*s43+768*k148, out2143);
_mm512_storeu_ps(dfPtr12+384+589824*i53+98304*j45+49152*s43+768*k148, out2151);
_mm512_storeu_ps(dfPtr12+320+589824*i53+98304*j45+49152*s43+768*k148, out2147);
_mm512_storeu_ps(dfPtr12+448+589824*i53+98304*j45+49152*s43+768*k148, out2155);
_mm512_storeu_ps(dfPtr12+147712+589824*i53+98304*j45+49152*s43+768*k148, out2144);
_mm512_storeu_ps(dfPtr12+147840+589824*i53+98304*j45+49152*s43+768*k148, out2152);
_mm512_storeu_ps(dfPtr12+147776+589824*i53+98304*j45+49152*s43+768*k148, out2148);
_mm512_storeu_ps(dfPtr12+147904+589824*i53+98304*j45+49152*s43+768*k148, out2156);
_mm512_storeu_ps(dfPtr12+295168+589824*i53+98304*j45+49152*s43+768*k148, out2145);
_mm512_storeu_ps(dfPtr12+295296+589824*i53+98304*j45+49152*s43+768*k148, out2153);
_mm512_storeu_ps(dfPtr12+295232+589824*i53+98304*j45+49152*s43+768*k148, out2149);
_mm512_storeu_ps(dfPtr12+295360+589824*i53+98304*j45+49152*s43+768*k148, out2157);
_mm512_storeu_ps(dfPtr12+442624+589824*i53+98304*j45+49152*s43+768*k148, out2146);
_mm512_storeu_ps(dfPtr12+442752+589824*i53+98304*j45+49152*s43+768*k148, out2154);
_mm512_storeu_ps(dfPtr12+442688+589824*i53+98304*j45+49152*s43+768*k148, out2150);
_mm512_storeu_ps(dfPtr12+442816+589824*i53+98304*j45+49152*s43+768*k148, out2158);
// Load phase for the next pair of input groups. Every vector is read with a
// zero-masking unaligned load from datPtr27 and clamped with max(0, x)
// (a ReLU) before its lanes are rearranged.
//   Group one: in2344..in2351. in2344 comes from dat2275 alone (pm226); the
//              other seven merge a 3-lane load (mask 7, constant offsets
//              936..1272, step 56) with a 14-lane load (mask 16383, constant
//              offsets 1228..1564, step 56) through pm228.
//   Group two: in2352..in2359, the 14-lane loads permuted with pm227.
// Lane 15 is disabled by every mask used here, so index 15 selects 0.0f.
__m512 dat2275 = _mm512_maskz_loadu_ps(16383, datPtr27+1172+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2275 = _mm512_max_ps(_mm512_setzero_ps(), dat2275);
// pm226 (arguments are listed lane 15 first): lanes 0-8 <- zero,
// lanes 9-15 <- elements 0-6.
__m512i pm226 = _mm512_set_epi32(6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in2344 = _mm512_permutexvar_ps(pm226, dat2275);
// pm227: lanes 0-7 <- elements 5-12, lanes 8-10 <- elements 11-13,
// lanes 11-15 <- zero.
__m512i pm227 = _mm512_set_epi32(15, 15, 15, 15, 15, 13, 12, 11, 12, 11, 10, 9, 8, 7, 6, 5);
__m512 in2352 = _mm512_permutexvar_ps(pm227, dat2275);
__m512 dat2276 = _mm512_maskz_loadu_ps(7, datPtr27+936+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2276 = _mm512_max_ps(_mm512_setzero_ps(), dat2276);
__m512 dat2277 = _mm512_maskz_loadu_ps(16383, datPtr27+1228+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2277 = _mm512_max_ps(_mm512_setzero_ps(), dat2277);
// pm228 is a two-source index: values 0-15 select from the first operand and
// 16-31 from the second. Lanes 0-2 <- first operand elements 0-2,
// lanes 3-8 <- zero (first operand lane 15), lanes 9-15 <- second operand
// elements 0-6.
__m512i pm228 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 15, 2, 1, 0);
__m512 in2345 = _mm512_permutex2var_ps(dat2276, pm228, dat2277);
__m512 in2353 = _mm512_permutexvar_ps(pm227, dat2277);
__m512 dat2278 = _mm512_maskz_loadu_ps(7, datPtr27+992+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2278 = _mm512_max_ps(_mm512_setzero_ps(), dat2278);
__m512 dat2279 = _mm512_maskz_loadu_ps(16383, datPtr27+1284+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2279 = _mm512_max_ps(_mm512_setzero_ps(), dat2279);
__m512 in2346 = _mm512_permutex2var_ps(dat2278, pm228, dat2279);
__m512 in2354 = _mm512_permutexvar_ps(pm227, dat2279);
__m512 dat2280 = _mm512_maskz_loadu_ps(7, datPtr27+1048+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2280 = _mm512_max_ps(_mm512_setzero_ps(), dat2280);
__m512 dat2281 = _mm512_maskz_loadu_ps(16383, datPtr27+1340+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2281 = _mm512_max_ps(_mm512_setzero_ps(), dat2281);
__m512 in2347 = _mm512_permutex2var_ps(dat2280, pm228, dat2281);
__m512 in2355 = _mm512_permutexvar_ps(pm227, dat2281);
__m512 dat2282 = _mm512_maskz_loadu_ps(7, datPtr27+1104+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2282 = _mm512_max_ps(_mm512_setzero_ps(), dat2282);
__m512 dat2283 = _mm512_maskz_loadu_ps(16383, datPtr27+1396+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2283 = _mm512_max_ps(_mm512_setzero_ps(), dat2283);
__m512 in2348 = _mm512_permutex2var_ps(dat2282, pm228, dat2283);
__m512 in2356 = _mm512_permutexvar_ps(pm227, dat2283);
__m512 dat2284 = _mm512_maskz_loadu_ps(7, datPtr27+1160+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2284 = _mm512_max_ps(_mm512_setzero_ps(), dat2284);
__m512 dat2285 = _mm512_maskz_loadu_ps(16383, datPtr27+1452+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2285 = _mm512_max_ps(_mm512_setzero_ps(), dat2285);
__m512 in2349 = _mm512_permutex2var_ps(dat2284, pm228, dat2285);
__m512 in2357 = _mm512_permutexvar_ps(pm227, dat2285);
__m512 dat2286 = _mm512_maskz_loadu_ps(7, datPtr27+1216+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2286 = _mm512_max_ps(_mm512_setzero_ps(), dat2286);
__m512 dat2287 = _mm512_maskz_loadu_ps(16383, datPtr27+1508+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2287 = _mm512_max_ps(_mm512_setzero_ps(), dat2287);
__m512 in2350 = _mm512_permutex2var_ps(dat2286, pm228, dat2287);
__m512 in2358 = _mm512_permutexvar_ps(pm227, dat2287);
__m512 dat2288 = _mm512_maskz_loadu_ps(7, datPtr27+1272+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2288 = _mm512_max_ps(_mm512_setzero_ps(), dat2288);
__m512 dat2289 = _mm512_maskz_loadu_ps(16383, datPtr27+1564+212992*i53+56*h53+4*w65+106496*s43+1664*k148);
dat2289 = _mm512_max_ps(_mm512_setzero_ps(), dat2289);
__m512 in2351 = _mm512_permutex2var_ps(dat2288, pm228, dat2289);
__m512 in2359 = _mm512_permutexvar_ps(pm227, dat2289);
// First 8-point transform pass, computed lane-wise across the loaded vectors.
// Two independent groups are interleaved statement by statement:
//   group one uses in2344..in2351 as points 0..7;
//   group two uses in2352..in2359 as points 0..7.
// Expanding the fused multiply-adds gives, for points x0..x7:
//   y0 = x0 - 5.25*x2 + 5.25*x4 - x6
//   y1 = x1 + x2 - 4.25*x3 - 4.25*x4 + x5 + x6
//   y2 = -x1 + x2 + 4.25*x3 - 4.25*x4 - x5 + x6
//   y3 = 0.5*x1 + 0.25*x2 - 2.5*x3 - 1.25*x4 + 2*x5 + x6
//   y4 = -0.5*x1 + 0.25*x2 + 2.5*x3 - 1.25*x4 - 2*x5 + x6
//   y5 = 2*x1 + 4*x2 - 2.5*x3 - 5*x4 + 0.5*x5 + x6
//   y6 = -2*x1 + 4*x2 + 2.5*x3 - 5*x4 - 0.5*x5 + x6
//   y7 = -x1 + 5.25*x3 - 5.25*x5 + x7
// NOTE(review): these coefficients match the Winograd F(6,3) input transform;
// confirm against the generator.
// Results y0..y7 end up in
//   group one: in2344, tmp16739, tmp16740, in2350, tmp16738, in2346, in2348, in2347
//   group two: in2352, tmp16743, tmp16744, in2358, tmp16742, in2354, in2356, in2355
// Inputs are overwritten in place, so the statement order must be preserved.
__m512 tmp16737 = _mm512_add_ps(in2345, in2349);
__m512 tmp16741 = _mm512_add_ps(in2353, in2357);
__m512 tmp16738 = _mm512_sub_ps(in2348, in2346);
__m512 tmp16742 = _mm512_sub_ps(in2356, in2354);
__m512 tmp16739 = _mm512_add_ps(in2346, in2350);
__m512 tmp16743 = _mm512_add_ps(in2354, in2358);
in2344 = _mm512_sub_ps(in2344, in2350);
in2352 = _mm512_sub_ps(in2352, in2358);
tmp16737 = _mm512_fmadd_ps(in2347, _mm512_set1_ps(-4.25e+00f), tmp16737);
tmp16741 = _mm512_fmadd_ps(in2355, _mm512_set1_ps(-4.25e+00f), tmp16741);
tmp16739 = _mm512_fmadd_ps(in2348, _mm512_set1_ps(-4.25e+00f), tmp16739);
tmp16743 = _mm512_fmadd_ps(in2356, _mm512_set1_ps(-4.25e+00f), tmp16743);
in2344 = _mm512_fmadd_ps(tmp16738, _mm512_set1_ps(5.25e+00f), in2344);
in2352 = _mm512_fmadd_ps(tmp16742, _mm512_set1_ps(5.25e+00f), in2352);
tmp16738 = _mm512_fmadd_ps(in2346, _mm512_set1_ps(2.5e-01f), in2350);
tmp16742 = _mm512_fmadd_ps(in2354, _mm512_set1_ps(2.5e-01f), in2358);
in2346 = _mm512_fmadd_ps(in2346, _mm512_set1_ps(4e+00f), in2350);
in2354 = _mm512_fmadd_ps(in2354, _mm512_set1_ps(4e+00f), in2358);
// y2 = even part - odd part, y1 = even part + odd part.
__m512 tmp16740 = _mm512_sub_ps(tmp16739, tmp16737);
__m512 tmp16744 = _mm512_sub_ps(tmp16743, tmp16741);
tmp16739 = _mm512_add_ps(tmp16737, tmp16739);
tmp16743 = _mm512_add_ps(tmp16741, tmp16743);
tmp16737 = _mm512_fmadd_ps(in2345, _mm512_set1_ps(2.5e-01f), in2349);
tmp16741 = _mm512_fmadd_ps(in2353, _mm512_set1_ps(2.5e-01f), in2357);
tmp16738 = _mm512_fmadd_ps(in2348, _mm512_set1_ps(-1.25e+00f), tmp16738);
tmp16742 = _mm512_fmadd_ps(in2356, _mm512_set1_ps(-1.25e+00f), tmp16742);
in2348 = _mm512_fmadd_ps(in2348, _mm512_set1_ps(-5e+00f), in2346);
in2356 = _mm512_fmadd_ps(in2356, _mm512_set1_ps(-5e+00f), in2354);
tmp16737 = _mm512_fmadd_ps(in2347, _mm512_set1_ps(-1.25e+00f), tmp16737);
tmp16741 = _mm512_fmadd_ps(in2355, _mm512_set1_ps(-1.25e+00f), tmp16741);
// y3 / y4: even part plus / minus twice the odd part.
in2350 = _mm512_fmadd_ps(tmp16737, _mm512_set1_ps(2e+00f), tmp16738);
in2358 = _mm512_fmadd_ps(tmp16741, _mm512_set1_ps(2e+00f), tmp16742);
tmp16738 = _mm512_fnmadd_ps(tmp16737, _mm512_set1_ps(2e+00f), tmp16738);
tmp16742 = _mm512_fnmadd_ps(tmp16741, _mm512_set1_ps(2e+00f), tmp16742);
tmp16737 = _mm512_fmadd_ps(in2349, _mm512_set1_ps(2.5e-01f), in2345);
tmp16741 = _mm512_fmadd_ps(in2357, _mm512_set1_ps(2.5e-01f), in2353);
in2345 = _mm512_sub_ps(in2351, in2345);
in2353 = _mm512_sub_ps(in2359, in2353);
tmp16737 = _mm512_fmadd_ps(in2347, _mm512_set1_ps(-1.25e+00f), tmp16737);
tmp16741 = _mm512_fmadd_ps(in2355, _mm512_set1_ps(-1.25e+00f), tmp16741);
in2347 = _mm512_sub_ps(in2347, in2349);
in2355 = _mm512_sub_ps(in2355, in2357);
// y7 = 5.25*(x3 - x5) + (x7 - x1).
in2347 = _mm512_fmadd_ps(in2347, _mm512_set1_ps(5.25e+00f), in2345);
in2355 = _mm512_fmadd_ps(in2355, _mm512_set1_ps(5.25e+00f), in2353);
// y5 / y6: even part plus / minus twice the odd part.
in2346 = _mm512_fmadd_ps(tmp16737, _mm512_set1_ps(2e+00f), in2348);
in2354 = _mm512_fmadd_ps(tmp16741, _mm512_set1_ps(2e+00f), in2356);
in2348 = _mm512_fnmadd_ps(tmp16737, _mm512_set1_ps(2e+00f), in2348);
in2356 = _mm512_fnmadd_ps(tmp16741, _mm512_set1_ps(2e+00f), in2356);
// Transpose the 16 first-pass results (8 per group) viewed as a 16x16 matrix
// of floats, so that the second pass can again work lane-wise.
// Stage 1: unpacklo/unpackhi interleave adjacent result pairs inside each
// 128-bit lane. Operand order is y0..y7 of group one, then y0..y7 of group two.
__m512 tmp16753 = _mm512_unpacklo_ps(in2344, tmp16739);
__m512 tmp16754 = _mm512_unpackhi_ps(in2344, tmp16739);
__m512 tmp16755 = _mm512_unpacklo_ps(tmp16740, in2350);
__m512 tmp16756 = _mm512_unpackhi_ps(tmp16740, in2350);
__m512 tmp16757 = _mm512_unpacklo_ps(tmp16738, in2346);
__m512 tmp16758 = _mm512_unpackhi_ps(tmp16738, in2346);
__m512 tmp16759 = _mm512_unpacklo_ps(in2348, in2347);
__m512 tmp16760 = _mm512_unpackhi_ps(in2348, in2347);
__m512 tmp16761 = _mm512_unpacklo_ps(in2352, tmp16743);
__m512 tmp16762 = _mm512_unpackhi_ps(in2352, tmp16743);
__m512 tmp16763 = _mm512_unpacklo_ps(tmp16744, in2358);
__m512 tmp16764 = _mm512_unpackhi_ps(tmp16744, in2358);
__m512 tmp16765 = _mm512_unpacklo_ps(tmp16742, in2354);
__m512 tmp16766 = _mm512_unpackhi_ps(tmp16742, in2354);
__m512 tmp16767 = _mm512_unpacklo_ps(in2356, in2355);
__m512 tmp16768 = _mm512_unpackhi_ps(in2356, in2355);
// Stage 2: shuffle_ps with 68 (low pairs) / 238 (high pairs) completes a 4x4
// transpose inside every 128-bit lane: each lane now holds one source column
// taken from four consecutive inputs.
__m512 tmp16769 = _mm512_shuffle_ps(tmp16753, tmp16755, 68);
__m512 tmp16770 = _mm512_shuffle_ps(tmp16753, tmp16755, 238);
__m512 tmp16771 = _mm512_shuffle_ps(tmp16754, tmp16756, 68);
__m512 tmp16772 = _mm512_shuffle_ps(tmp16754, tmp16756, 238);
__m512 tmp16773 = _mm512_shuffle_ps(tmp16757, tmp16759, 68);
__m512 tmp16774 = _mm512_shuffle_ps(tmp16757, tmp16759, 238);
__m512 tmp16775 = _mm512_shuffle_ps(tmp16758, tmp16760, 68);
__m512 tmp16776 = _mm512_shuffle_ps(tmp16758, tmp16760, 238);
__m512 tmp16777 = _mm512_shuffle_ps(tmp16761, tmp16763, 68);
__m512 tmp16778 = _mm512_shuffle_ps(tmp16761, tmp16763, 238);
__m512 tmp16779 = _mm512_shuffle_ps(tmp16762, tmp16764, 68);
__m512 tmp16780 = _mm512_shuffle_ps(tmp16762, tmp16764, 238);
__m512 tmp16781 = _mm512_shuffle_ps(tmp16765, tmp16767, 68);
__m512 tmp16782 = _mm512_shuffle_ps(tmp16765, tmp16767, 238);
__m512 tmp16783 = _mm512_shuffle_ps(tmp16766, tmp16768, 68);
__m512 tmp16784 = _mm512_shuffle_ps(tmp16766, tmp16768, 238);
// Stage 3: shuffle_f32x4 with 136 picks 128-bit lanes {a0, a2, b0, b2} and 221
// picks {a1, a3, b1, b3}, joining inputs 0-3 with inputs 4-7 of the same group.
__m512 tmp16785 = _mm512_shuffle_f32x4(tmp16769, tmp16773, 136);
__m512 tmp16786 = _mm512_shuffle_f32x4(tmp16769, tmp16773, 221);
__m512 tmp16787 = _mm512_shuffle_f32x4(tmp16770, tmp16774, 136);
__m512 tmp16788 = _mm512_shuffle_f32x4(tmp16770, tmp16774, 221);
__m512 tmp16789 = _mm512_shuffle_f32x4(tmp16771, tmp16775, 136);
__m512 tmp16790 = _mm512_shuffle_f32x4(tmp16771, tmp16775, 221);
__m512 tmp16791 = _mm512_shuffle_f32x4(tmp16772, tmp16776, 136);
__m512 tmp16792 = _mm512_shuffle_f32x4(tmp16772, tmp16776, 221);
__m512 tmp16793 = _mm512_shuffle_f32x4(tmp16777, tmp16781, 136);
__m512 tmp16794 = _mm512_shuffle_f32x4(tmp16777, tmp16781, 221);
__m512 tmp16795 = _mm512_shuffle_f32x4(tmp16778, tmp16782, 136);
__m512 tmp16796 = _mm512_shuffle_f32x4(tmp16778, tmp16782, 221);
__m512 tmp16797 = _mm512_shuffle_f32x4(tmp16779, tmp16783, 136);
__m512 tmp16798 = _mm512_shuffle_f32x4(tmp16779, tmp16783, 221);
__m512 tmp16799 = _mm512_shuffle_f32x4(tmp16780, tmp16784, 136);
__m512 tmp16800 = _mm512_shuffle_f32x4(tmp16780, tmp16784, 221);
// Stage 4: the same 136/221 selection joins group one (lanes 0-7 of the
// result) with group two (lanes 8-15). The existing registers are reused;
// source column c of the 16 inputs now lives in:
//   c0..c7  : in2344, tmp16739, tmp16740, in2350, tmp16738, in2346, in2348, in2347
//   c8..c15 : in2352, tmp16743, tmp16744, in2358, tmp16742, in2354, in2356, in2355
in2344 = _mm512_shuffle_f32x4(tmp16785, tmp16793, 136);
in2352 = _mm512_shuffle_f32x4(tmp16785, tmp16793, 221);
tmp16739 = _mm512_shuffle_f32x4(tmp16787, tmp16795, 136);
tmp16743 = _mm512_shuffle_f32x4(tmp16787, tmp16795, 221);
tmp16740 = _mm512_shuffle_f32x4(tmp16789, tmp16797, 136);
tmp16744 = _mm512_shuffle_f32x4(tmp16789, tmp16797, 221);
in2350 = _mm512_shuffle_f32x4(tmp16791, tmp16799, 136);
in2358 = _mm512_shuffle_f32x4(tmp16791, tmp16799, 221);
tmp16738 = _mm512_shuffle_f32x4(tmp16786, tmp16794, 136);
tmp16742 = _mm512_shuffle_f32x4(tmp16786, tmp16794, 221);
in2346 = _mm512_shuffle_f32x4(tmp16788, tmp16796, 136);
in2354 = _mm512_shuffle_f32x4(tmp16788, tmp16796, 221);
in2348 = _mm512_shuffle_f32x4(tmp16790, tmp16798, 136);
in2356 = _mm512_shuffle_f32x4(tmp16790, tmp16798, 221);
in2347 = _mm512_shuffle_f32x4(tmp16792, tmp16800, 136);
in2355 = _mm512_shuffle_f32x4(tmp16792, tmp16800, 221);
// Second 8-point transform pass, applied to the transposed vectors. It uses
// the same linear combinations as the first pass (coefficients 5.25, -4.25,
// 0.25, -1.25, 4, -5, 2), with two point sets interleaved statement by
// statement:
//   set one, points 0..7: in2344, tmp16739, tmp16740, in2350, tmp16738, in2346, in2348, in2347
//   set two, points 0..7: in2352, tmp16743, tmp16744, in2358, tmp16742, in2354, in2356, in2355
// Results 0..7 end up in
//   set one: in2344, tmp16747, tmp16748, in2348, tmp16746, tmp16740, tmp16738, in2350
//   set two: in2352, tmp16751, tmp16752, in2356, tmp16750, tmp16744, tmp16742, in2358
// Inputs are overwritten in place, so the statement order must be preserved.
__m512 tmp16745 = _mm512_add_ps(tmp16739, in2346);
__m512 tmp16749 = _mm512_add_ps(tmp16743, in2354);
__m512 tmp16746 = _mm512_sub_ps(tmp16738, tmp16740);
__m512 tmp16750 = _mm512_sub_ps(tmp16742, tmp16744);
__m512 tmp16747 = _mm512_add_ps(tmp16740, in2348);
__m512 tmp16751 = _mm512_add_ps(tmp16744, in2356);
in2344 = _mm512_sub_ps(in2344, in2348);
in2352 = _mm512_sub_ps(in2352, in2356);
tmp16745 = _mm512_fmadd_ps(in2350, _mm512_set1_ps(-4.25e+00f), tmp16745);
tmp16749 = _mm512_fmadd_ps(in2358, _mm512_set1_ps(-4.25e+00f), tmp16749);
tmp16747 = _mm512_fmadd_ps(tmp16738, _mm512_set1_ps(-4.25e+00f), tmp16747);
tmp16751 = _mm512_fmadd_ps(tmp16742, _mm512_set1_ps(-4.25e+00f), tmp16751);
in2344 = _mm512_fmadd_ps(tmp16746, _mm512_set1_ps(5.25e+00f), in2344);
in2352 = _mm512_fmadd_ps(tmp16750, _mm512_set1_ps(5.25e+00f), in2352);
tmp16746 = _mm512_fmadd_ps(tmp16740, _mm512_set1_ps(2.5e-01f), in2348);
tmp16750 = _mm512_fmadd_ps(tmp16744, _mm512_set1_ps(2.5e-01f), in2356);
tmp16740 = _mm512_fmadd_ps(tmp16740, _mm512_set1_ps(4e+00f), in2348);
tmp16744 = _mm512_fmadd_ps(tmp16744, _mm512_set1_ps(4e+00f), in2356);
// Result 2 = even part - odd part, result 1 = even part + odd part.
__m512 tmp16748 = _mm512_sub_ps(tmp16747, tmp16745);
__m512 tmp16752 = _mm512_sub_ps(tmp16751, tmp16749);
tmp16747 = _mm512_add_ps(tmp16745, tmp16747);
tmp16751 = _mm512_add_ps(tmp16749, tmp16751);
tmp16745 = _mm512_fmadd_ps(tmp16739, _mm512_set1_ps(2.5e-01f), in2346);
tmp16749 = _mm512_fmadd_ps(tmp16743, _mm512_set1_ps(2.5e-01f), in2354);
tmp16746 = _mm512_fmadd_ps(tmp16738, _mm512_set1_ps(-1.25e+00f), tmp16746);
tmp16750 = _mm512_fmadd_ps(tmp16742, _mm512_set1_ps(-1.25e+00f), tmp16750);
tmp16738 = _mm512_fmadd_ps(tmp16738, _mm512_set1_ps(-5e+00f), tmp16740);
tmp16742 = _mm512_fmadd_ps(tmp16742, _mm512_set1_ps(-5e+00f), tmp16744);
tmp16745 = _mm512_fmadd_ps(in2350, _mm512_set1_ps(-1.25e+00f), tmp16745);
tmp16749 = _mm512_fmadd_ps(in2358, _mm512_set1_ps(-1.25e+00f), tmp16749);
// Results 3 / 4: even part plus / minus twice the odd part.
in2348 = _mm512_fmadd_ps(tmp16745, _mm512_set1_ps(2e+00f), tmp16746);
in2356 = _mm512_fmadd_ps(tmp16749, _mm512_set1_ps(2e+00f), tmp16750);
tmp16746 = _mm512_fnmadd_ps(tmp16745, _mm512_set1_ps(2e+00f), tmp16746);
tmp16750 = _mm512_fnmadd_ps(tmp16749, _mm512_set1_ps(2e+00f), tmp16750);
tmp16745 = _mm512_fmadd_ps(in2346, _mm512_set1_ps(2.5e-01f), tmp16739);
tmp16749 = _mm512_fmadd_ps(in2354, _mm512_set1_ps(2.5e-01f), tmp16743);
tmp16739 = _mm512_sub_ps(in2347, tmp16739);
tmp16743 = _mm512_sub_ps(in2355, tmp16743);
tmp16745 = _mm512_fmadd_ps(in2350, _mm512_set1_ps(-1.25e+00f), tmp16745);
tmp16749 = _mm512_fmadd_ps(in2358, _mm512_set1_ps(-1.25e+00f), tmp16749);
in2350 = _mm512_sub_ps(in2350, in2346);
in2358 = _mm512_sub_ps(in2358, in2354);
// Result 7 = 5.25*(p3 - p5) + (p7 - p1).
in2350 = _mm512_fmadd_ps(in2350, _mm512_set1_ps(5.25e+00f), tmp16739);
in2358 = _mm512_fmadd_ps(in2358, _mm512_set1_ps(5.25e+00f), tmp16743);
// Results 5 / 6: even part plus / minus twice the odd part.
tmp16740 = _mm512_fmadd_ps(tmp16745, _mm512_set1_ps(2e+00f), tmp16738);
tmp16744 = _mm512_fmadd_ps(tmp16749, _mm512_set1_ps(2e+00f), tmp16742);
tmp16738 = _mm512_fnmadd_ps(tmp16745, _mm512_set1_ps(2e+00f), tmp16738);
tmp16742 = _mm512_fnmadd_ps(tmp16749, _mm512_set1_ps(2e+00f), tmp16742);
// Pack the 16 second-pass results into store order, pairing results (0,1),
// (2,3), (4,5) and (6,7) of each set. Immediate 68 yields
// {a.q0, a.q1, b.q0, b.q1} (the low 256 bits of each operand); immediate 238
// yields {a.q2, a.q3, b.q2, b.q3} (the high 256 bits of each operand).
__m512 out2159 = _mm512_shuffle_f32x4(in2344, tmp16747, 68);
__m512 out2167 = _mm512_shuffle_f32x4(in2344, tmp16747, 238);
__m512 out2160 = _mm512_shuffle_f32x4(tmp16748, in2348, 68);
__m512 out2168 = _mm512_shuffle_f32x4(tmp16748, in2348, 238);
__m512 out2161 = _mm512_shuffle_f32x4(tmp16746, tmp16740, 68);
__m512 out2169 = _mm512_shuffle_f32x4(tmp16746, tmp16740, 238);
__m512 out2162 = _mm512_shuffle_f32x4(tmp16738, in2350, 68);
__m512 out2170 = _mm512_shuffle_f32x4(tmp16738, in2350, 238);
__m512 out2163 = _mm512_shuffle_f32x4(in2352, tmp16751, 68);
__m512 out2171 = _mm512_shuffle_f32x4(in2352, tmp16751, 238);
__m512 out2164 = _mm512_shuffle_f32x4(tmp16752, in2356, 68);
__m512 out2172 = _mm512_shuffle_f32x4(tmp16752, in2356, 238);
__m512 out2165 = _mm512_shuffle_f32x4(tmp16750, tmp16744, 68);
__m512 out2173 = _mm512_shuffle_f32x4(tmp16750, tmp16744, 238);
__m512 out2166 = _mm512_shuffle_f32x4(tmp16742, in2358, 68);
__m512 out2174 = _mm512_shuffle_f32x4(tmp16742, in2358, 238);
// Write the 16 packed vectors to dfPtr12 with unaligned stores. The constant
// offsets form four groups 147456 apart (512, 147968, 295424, 442880); inside
// each group the four vectors land at +0, +64, +128 and +192.
// NOTE(review): offsets look like byte offsets (64 = one __m512) — confirm
// against the declaration of dfPtr12, which is outside this excerpt.
_mm512_storeu_ps(dfPtr12+512+589824*i53+98304*j45+49152*s43+768*k148, out2159);
_mm512_storeu_ps(dfPtr12+640+589824*i53+98304*j45+49152*s43+768*k148, out2167);
_mm512_storeu_ps(dfPtr12+576+589824*i53+98304*j45+49152*s43+768*k148, out2163);
_mm512_storeu_ps(dfPtr12+704+589824*i53+98304*j45+49152*s43+768*k148, out2171);
_mm512_storeu_ps(dfPtr12+147968+589824*i53+98304*j45+49152*s43+768*k148, out2160);
_mm512_storeu_ps(dfPtr12+148096+589824*i53+98304*j45+49152*s43+768*k148, out2168);
_mm512_storeu_ps(dfPtr12+148032+589824*i53+98304*j45+49152*s43+768*k148, out2164);
_mm512_storeu_ps(dfPtr12+148160+589824*i53+98304*j45+49152*s43+768*k148, out2172);
_mm512_storeu_ps(dfPtr12+295424+589824*i53+98304*j45+49152*s43+768*k148, out2161);
_mm512_storeu_ps(dfPtr12+295552+589824*i53+98304*j45+49152*s43+768*k148, out2169);
_mm512_storeu_ps(dfPtr12+295488+589824*i53+98304*j45+49152*s43+768*k148, out2165);
_mm512_storeu_ps(dfPtr12+295616+589824*i53+98304*j45+49152*s43+768*k148, out2173);
_mm512_storeu_ps(dfPtr12+442880+589824*i53+98304*j45+49152*s43+768*k148, out2162);
_mm512_storeu_ps(dfPtr12+443008+589824*i53+98304*j45+49152*s43+768*k148, out2170);
_mm512_storeu_ps(dfPtr12+442944+589824*i53+98304*j45+49152*s43+768*k148, out2166);
_mm512_storeu_ps(dfPtr12+443072+589824*i53+98304*j45+49152*s43+768*k148, out2174);
}
if (j45 >= last11) return;
++j45;
rel23 = 1;
}
// Index setup for the loop that follows: h54 (= base23+12) is scaled by 56 and
// w66 (fixed at 0) by 4 in the datPtr27 load offsets; k149 is the loop counter.
ptrdiff_t h54 = base23+12;
ptrdiff_t w66 = 0;
ptrdiff_t k149 = 0;
for (; k149 != 32; ++k149) {
// Load three rows from datPtr27 (constant offsets +4/+60/+116, i.e. 56 apart,
// the same factor that multiplies h54), each paired with a second load 832
// further on (+836/+892/+948). Mask 16383 keeps lanes 0..13 and mask 127 keeps
// lanes 0..6; all other lanes are zero-filled. Every loaded vector is then
// clamped with max(0, x) (ReLU).
__m512 dat2290 = _mm512_maskz_loadu_ps(16383, datPtr27+4+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2290 = _mm512_max_ps(_mm512_setzero_ps(), dat2290);
__m512 dat2291 = _mm512_maskz_loadu_ps(127, datPtr27+836+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2291 = _mm512_max_ps(_mm512_setzero_ps(), dat2291);
// pm229: lane 0 <- source lane 15 (zero, masked out by the load),
// lanes 1..7 <- source lanes 0..6, lanes 8..15 <- source lanes 5..12.
__m512i pm229 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in2360 = _mm512_permutexvar_ps(pm229, dat2290);
// pm230 (two-source permute): lanes 0..2 <- first source lanes 11..13,
// lanes 3..8 <- first source lane 15 (zero), lanes 9..15 <- second source
// lanes 0..6.
__m512i pm230 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 15, 13, 12, 11);
__m512 in2363 = _mm512_permutex2var_ps(dat2290, pm230, dat2291);
// Second row: same masks and permutes, offsets advanced by 56.
__m512 dat2292 = _mm512_maskz_loadu_ps(16383, datPtr27+60+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2292 = _mm512_max_ps(_mm512_setzero_ps(), dat2292);
__m512 dat2293 = _mm512_maskz_loadu_ps(127, datPtr27+892+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2293 = _mm512_max_ps(_mm512_setzero_ps(), dat2293);
__m512 in2361 = _mm512_permutexvar_ps(pm229, dat2292);
__m512 in2364 = _mm512_permutex2var_ps(dat2292, pm230, dat2293);
// Third row: offsets advanced by another 56.
__m512 dat2294 = _mm512_maskz_loadu_ps(16383, datPtr27+116+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2294 = _mm512_max_ps(_mm512_setzero_ps(), dat2294);
__m512 dat2295 = _mm512_maskz_loadu_ps(127, datPtr27+948+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2295 = _mm512_max_ps(_mm512_setzero_ps(), dat2295);
__m512 in2362 = _mm512_permutexvar_ps(pm229, dat2294);
__m512 in2365 = _mm512_permutex2var_ps(dat2294, pm230, dat2295);
// Combine the three input rows into eight linear combinations per register.
// Statements come in pairs: the first of each pair works on the group
// r0 = in2360, r1 = in2361, r2 = in2362, the second on the parallel group
// r0 = in2363, r1 = in2364, r2 = in2365 (values on entry to this span).
// Results on exit:
//   in2360   / in2363   = r0 - 5.25*r2
//   tmp16803 / tmp16810 = r1 + r2
//   tmp16804 / tmp16811 = r2 - r1
//   tmp16806 / tmp16813 = 0.5*r1 + 0.25*r2
//   tmp16802 / tmp16809 = 0.25*r2 - 0.5*r1
//   in2362   / in2365   = 2*r1 + 4*r2
//   tmp16805 / tmp16812 = 4*r2 - 2*r1
//   tmp16807 / tmp16814 = -r1
// NOTE(review): these coefficients look like a Winograd F(6,3)-style 8-point
// input transform with rows 3..7 taken as zero — confirm against the generator.
__m512 tmp16801 = in2361;
__m512 tmp16808 = in2364;
__m512 tmp16802 = _mm512_sub_ps(_mm512_setzero_ps(), in2362);
__m512 tmp16809 = _mm512_sub_ps(_mm512_setzero_ps(), in2365);
__m512 tmp16803 = in2362;
__m512 tmp16810 = in2365;
// The self-assignments below (and further down) have no effect.
in2360 = in2360;
in2363 = in2363;
tmp16801 = tmp16801;
tmp16808 = tmp16808;
tmp16803 = tmp16803;
tmp16810 = tmp16810;
// r0 + 5.25*(-r2)
in2360 = _mm512_fmadd_ps(tmp16802, _mm512_set1_ps(5.25e+00f), in2360);
in2363 = _mm512_fmadd_ps(tmp16809, _mm512_set1_ps(5.25e+00f), in2363);
tmp16802 = _mm512_mul_ps(in2362, _mm512_set1_ps(2.5e-01f));
tmp16809 = _mm512_mul_ps(in2365, _mm512_set1_ps(2.5e-01f));
in2362 = _mm512_mul_ps(in2362, _mm512_set1_ps(4e+00f));
in2365 = _mm512_mul_ps(in2365, _mm512_set1_ps(4e+00f));
__m512 tmp16804 = _mm512_sub_ps(tmp16803, tmp16801);
__m512 tmp16811 = _mm512_sub_ps(tmp16810, tmp16808);
tmp16803 = _mm512_add_ps(tmp16801, tmp16803);
tmp16810 = _mm512_add_ps(tmp16808, tmp16810);
tmp16801 = _mm512_mul_ps(in2361, _mm512_set1_ps(2.5e-01f));
tmp16808 = _mm512_mul_ps(in2364, _mm512_set1_ps(2.5e-01f));
tmp16802 = tmp16802;
tmp16809 = tmp16809;
__m512 tmp16805 = in2362;
__m512 tmp16812 = in2365;
tmp16801 = tmp16801;
tmp16808 = tmp16808;
// 0.25*r2 +/- 2*(0.25*r1)
__m512 tmp16806 = _mm512_fmadd_ps(tmp16801, _mm512_set1_ps(2e+00f), tmp16802);
__m512 tmp16813 = _mm512_fmadd_ps(tmp16808, _mm512_set1_ps(2e+00f), tmp16809);
tmp16802 = _mm512_fnmadd_ps(tmp16801, _mm512_set1_ps(2e+00f), tmp16802);
tmp16809 = _mm512_fnmadd_ps(tmp16808, _mm512_set1_ps(2e+00f), tmp16809);
// Save r1 before negating it in place.
tmp16801 = in2361;
tmp16808 = in2364;
in2361 = _mm512_sub_ps(_mm512_setzero_ps(), in2361);
in2364 = _mm512_sub_ps(_mm512_setzero_ps(), in2364);
tmp16801 = tmp16801;
tmp16808 = tmp16808;
__m512 tmp16807 = in2361;
__m512 tmp16814 = in2364;
// 4*r2 +/- 2*r1
in2362 = _mm512_fmadd_ps(tmp16801, _mm512_set1_ps(2e+00f), tmp16805);
in2365 = _mm512_fmadd_ps(tmp16808, _mm512_set1_ps(2e+00f), tmp16812);
tmp16805 = _mm512_fnmadd_ps(tmp16801, _mm512_set1_ps(2e+00f), tmp16805);
tmp16812 = _mm512_fnmadd_ps(tmp16808, _mm512_set1_ps(2e+00f), tmp16812);
// 16x16 transpose of the sixteen registers, taken in this order:
//   in2360, tmp16803, tmp16804, tmp16806, tmp16802, in2362, tmp16805, tmp16807,
//   in2363, tmp16810, tmp16811, tmp16813, tmp16809, in2365, tmp16812, tmp16814.
// On exit the k-th name in that list holds lane k of every input, with element
// i taken from the i-th input.
// Step 1: interleave register pairs inside each 128-bit lane.
__m512 tmp16823 = _mm512_unpacklo_ps(in2360, tmp16803);
__m512 tmp16824 = _mm512_unpackhi_ps(in2360, tmp16803);
__m512 tmp16825 = _mm512_unpacklo_ps(tmp16804, tmp16806);
__m512 tmp16826 = _mm512_unpackhi_ps(tmp16804, tmp16806);
__m512 tmp16827 = _mm512_unpacklo_ps(tmp16802, in2362);
__m512 tmp16828 = _mm512_unpackhi_ps(tmp16802, in2362);
__m512 tmp16829 = _mm512_unpacklo_ps(tmp16805, tmp16807);
__m512 tmp16830 = _mm512_unpackhi_ps(tmp16805, tmp16807);
__m512 tmp16831 = _mm512_unpacklo_ps(in2363, tmp16810);
__m512 tmp16832 = _mm512_unpackhi_ps(in2363, tmp16810);
__m512 tmp16833 = _mm512_unpacklo_ps(tmp16811, tmp16813);
__m512 tmp16834 = _mm512_unpackhi_ps(tmp16811, tmp16813);
__m512 tmp16835 = _mm512_unpacklo_ps(tmp16809, in2365);
__m512 tmp16836 = _mm512_unpackhi_ps(tmp16809, in2365);
__m512 tmp16837 = _mm512_unpacklo_ps(tmp16812, tmp16814);
__m512 tmp16838 = _mm512_unpackhi_ps(tmp16812, tmp16814);
// Step 2: join pairs (68 = low two elements of each operand, 238 = high two),
// so every 128-bit lane holds one source element from four registers.
__m512 tmp16839 = _mm512_shuffle_ps(tmp16823, tmp16825, 68);
__m512 tmp16840 = _mm512_shuffle_ps(tmp16823, tmp16825, 238);
__m512 tmp16841 = _mm512_shuffle_ps(tmp16824, tmp16826, 68);
__m512 tmp16842 = _mm512_shuffle_ps(tmp16824, tmp16826, 238);
__m512 tmp16843 = _mm512_shuffle_ps(tmp16827, tmp16829, 68);
__m512 tmp16844 = _mm512_shuffle_ps(tmp16827, tmp16829, 238);
__m512 tmp16845 = _mm512_shuffle_ps(tmp16828, tmp16830, 68);
__m512 tmp16846 = _mm512_shuffle_ps(tmp16828, tmp16830, 238);
__m512 tmp16847 = _mm512_shuffle_ps(tmp16831, tmp16833, 68);
__m512 tmp16848 = _mm512_shuffle_ps(tmp16831, tmp16833, 238);
__m512 tmp16849 = _mm512_shuffle_ps(tmp16832, tmp16834, 68);
__m512 tmp16850 = _mm512_shuffle_ps(tmp16832, tmp16834, 238);
__m512 tmp16851 = _mm512_shuffle_ps(tmp16835, tmp16837, 68);
__m512 tmp16852 = _mm512_shuffle_ps(tmp16835, tmp16837, 238);
__m512 tmp16853 = _mm512_shuffle_ps(tmp16836, tmp16838, 68);
__m512 tmp16854 = _mm512_shuffle_ps(tmp16836, tmp16838, 238);
// Step 3: regroup whole 128-bit lanes (136 = lanes 0 and 2 of each operand,
// 221 = lanes 1 and 3).
__m512 tmp16855 = _mm512_shuffle_f32x4(tmp16839, tmp16843, 136);
__m512 tmp16856 = _mm512_shuffle_f32x4(tmp16839, tmp16843, 221);
__m512 tmp16857 = _mm512_shuffle_f32x4(tmp16840, tmp16844, 136);
__m512 tmp16858 = _mm512_shuffle_f32x4(tmp16840, tmp16844, 221);
__m512 tmp16859 = _mm512_shuffle_f32x4(tmp16841, tmp16845, 136);
__m512 tmp16860 = _mm512_shuffle_f32x4(tmp16841, tmp16845, 221);
__m512 tmp16861 = _mm512_shuffle_f32x4(tmp16842, tmp16846, 136);
__m512 tmp16862 = _mm512_shuffle_f32x4(tmp16842, tmp16846, 221);
__m512 tmp16863 = _mm512_shuffle_f32x4(tmp16847, tmp16851, 136);
__m512 tmp16864 = _mm512_shuffle_f32x4(tmp16847, tmp16851, 221);
__m512 tmp16865 = _mm512_shuffle_f32x4(tmp16848, tmp16852, 136);
__m512 tmp16866 = _mm512_shuffle_f32x4(tmp16848, tmp16852, 221);
__m512 tmp16867 = _mm512_shuffle_f32x4(tmp16849, tmp16853, 136);
__m512 tmp16868 = _mm512_shuffle_f32x4(tmp16849, tmp16853, 221);
__m512 tmp16869 = _mm512_shuffle_f32x4(tmp16850, tmp16854, 136);
__m512 tmp16870 = _mm512_shuffle_f32x4(tmp16850, tmp16854, 221);
// Step 4: second lane regrouping, written back into the original names.
in2360 = _mm512_shuffle_f32x4(tmp16855, tmp16863, 136);
in2363 = _mm512_shuffle_f32x4(tmp16855, tmp16863, 221);
tmp16803 = _mm512_shuffle_f32x4(tmp16857, tmp16865, 136);
tmp16810 = _mm512_shuffle_f32x4(tmp16857, tmp16865, 221);
tmp16804 = _mm512_shuffle_f32x4(tmp16859, tmp16867, 136);
tmp16811 = _mm512_shuffle_f32x4(tmp16859, tmp16867, 221);
tmp16806 = _mm512_shuffle_f32x4(tmp16861, tmp16869, 136);
tmp16813 = _mm512_shuffle_f32x4(tmp16861, tmp16869, 221);
tmp16802 = _mm512_shuffle_f32x4(tmp16856, tmp16864, 136);
tmp16809 = _mm512_shuffle_f32x4(tmp16856, tmp16864, 221);
in2362 = _mm512_shuffle_f32x4(tmp16858, tmp16866, 136);
in2365 = _mm512_shuffle_f32x4(tmp16858, tmp16866, 221);
tmp16805 = _mm512_shuffle_f32x4(tmp16860, tmp16868, 136);
tmp16812 = _mm512_shuffle_f32x4(tmp16860, tmp16868, 221);
tmp16807 = _mm512_shuffle_f32x4(tmp16862, tmp16870, 136);
tmp16814 = _mm512_shuffle_f32x4(tmp16862, tmp16870, 221);
// Eight-point linear combination across the registers
//   c0..c7 = in2360, tmp16803, tmp16804, tmp16806, tmp16802, in2362, tmp16805,
//            tmp16807   (values on entry to this span).
// Every statement is immediately repeated for the parallel set
//   in2363, tmp16810, tmp16811, tmp16813, tmp16809, in2365, tmp16812, tmp16814.
// Results on exit (first set; the parallel set is analogous):
//   in2360   =  c0 - 5.25*c2 + 5.25*c4 - c6
//   tmp16817 =  c1 + c2 - 4.25*c3 - 4.25*c4 + c5 + c6
//   tmp16818 = -c1 + c2 + 4.25*c3 - 4.25*c4 - c5 + c6
//   tmp16805 =  0.5*c1 + 0.25*c2 - 2.5*c3 - 1.25*c4 + 2*c5 + c6
//   tmp16816 = -0.5*c1 + 0.25*c2 + 2.5*c3 - 1.25*c4 - 2*c5 + c6
//   tmp16804 =  2*c1 + 4*c2 - 2.5*c3 - 5*c4 + 0.5*c5 + c6
//   tmp16802 = -2*c1 + 4*c2 + 2.5*c3 - 5*c4 - 0.5*c5 + c6
//   tmp16806 = -c1 + 5.25*c3 - 5.25*c5 + c7
// NOTE(review): coefficients match a Winograd F(6,3)-style input transform —
// confirm against the generator.
__m512 tmp16815 = _mm512_add_ps(tmp16803, in2362);
__m512 tmp16819 = _mm512_add_ps(tmp16810, in2365);
__m512 tmp16816 = _mm512_sub_ps(tmp16802, tmp16804);
__m512 tmp16820 = _mm512_sub_ps(tmp16809, tmp16811);
__m512 tmp16817 = _mm512_add_ps(tmp16804, tmp16805);
__m512 tmp16821 = _mm512_add_ps(tmp16811, tmp16812);
in2360 = _mm512_sub_ps(in2360, tmp16805);
in2363 = _mm512_sub_ps(in2363, tmp16812);
tmp16815 = _mm512_fmadd_ps(tmp16806, _mm512_set1_ps(-4.25e+00f), tmp16815);
tmp16819 = _mm512_fmadd_ps(tmp16813, _mm512_set1_ps(-4.25e+00f), tmp16819);
tmp16817 = _mm512_fmadd_ps(tmp16802, _mm512_set1_ps(-4.25e+00f), tmp16817);
tmp16821 = _mm512_fmadd_ps(tmp16809, _mm512_set1_ps(-4.25e+00f), tmp16821);
in2360 = _mm512_fmadd_ps(tmp16816, _mm512_set1_ps(5.25e+00f), in2360);
in2363 = _mm512_fmadd_ps(tmp16820, _mm512_set1_ps(5.25e+00f), in2363);
// tmp16816/tmp16820 are reused from here on as new partial sums.
tmp16816 = _mm512_fmadd_ps(tmp16804, _mm512_set1_ps(2.5e-01f), tmp16805);
tmp16820 = _mm512_fmadd_ps(tmp16811, _mm512_set1_ps(2.5e-01f), tmp16812);
tmp16804 = _mm512_fmadd_ps(tmp16804, _mm512_set1_ps(4e+00f), tmp16805);
tmp16811 = _mm512_fmadd_ps(tmp16811, _mm512_set1_ps(4e+00f), tmp16812);
// Difference and sum of the two -4.25 partial sums.
__m512 tmp16818 = _mm512_sub_ps(tmp16817, tmp16815);
__m512 tmp16822 = _mm512_sub_ps(tmp16821, tmp16819);
tmp16817 = _mm512_add_ps(tmp16815, tmp16817);
tmp16821 = _mm512_add_ps(tmp16819, tmp16821);
tmp16815 = _mm512_fmadd_ps(tmp16803, _mm512_set1_ps(2.5e-01f), in2362);
tmp16819 = _mm512_fmadd_ps(tmp16810, _mm512_set1_ps(2.5e-01f), in2365);
tmp16816 = _mm512_fmadd_ps(tmp16802, _mm512_set1_ps(-1.25e+00f), tmp16816);
tmp16820 = _mm512_fmadd_ps(tmp16809, _mm512_set1_ps(-1.25e+00f), tmp16820);
tmp16802 = _mm512_fmadd_ps(tmp16802, _mm512_set1_ps(-5e+00f), tmp16804);
tmp16809 = _mm512_fmadd_ps(tmp16809, _mm512_set1_ps(-5e+00f), tmp16811);
tmp16815 = _mm512_fmadd_ps(tmp16806, _mm512_set1_ps(-1.25e+00f), tmp16815);
tmp16819 = _mm512_fmadd_ps(tmp16813, _mm512_set1_ps(-1.25e+00f), tmp16819);
tmp16805 = _mm512_fmadd_ps(tmp16815, _mm512_set1_ps(2e+00f), tmp16816);
tmp16812 = _mm512_fmadd_ps(tmp16819, _mm512_set1_ps(2e+00f), tmp16820);
tmp16816 = _mm512_fnmadd_ps(tmp16815, _mm512_set1_ps(2e+00f), tmp16816);
tmp16820 = _mm512_fnmadd_ps(tmp16819, _mm512_set1_ps(2e+00f), tmp16820);
tmp16815 = _mm512_fmadd_ps(in2362, _mm512_set1_ps(2.5e-01f), tmp16803);
tmp16819 = _mm512_fmadd_ps(in2365, _mm512_set1_ps(2.5e-01f), tmp16810);
tmp16803 = _mm512_sub_ps(tmp16807, tmp16803);
tmp16810 = _mm512_sub_ps(tmp16814, tmp16810);
tmp16815 = _mm512_fmadd_ps(tmp16806, _mm512_set1_ps(-1.25e+00f), tmp16815);
tmp16819 = _mm512_fmadd_ps(tmp16813, _mm512_set1_ps(-1.25e+00f), tmp16819);
tmp16806 = _mm512_sub_ps(tmp16806, in2362);
tmp16813 = _mm512_sub_ps(tmp16813, in2365);
tmp16806 = _mm512_fmadd_ps(tmp16806, _mm512_set1_ps(5.25e+00f), tmp16803);
tmp16813 = _mm512_fmadd_ps(tmp16813, _mm512_set1_ps(5.25e+00f), tmp16810);
tmp16804 = _mm512_fmadd_ps(tmp16815, _mm512_set1_ps(2e+00f), tmp16802);
tmp16811 = _mm512_fmadd_ps(tmp16819, _mm512_set1_ps(2e+00f), tmp16809);
tmp16802 = _mm512_fnmadd_ps(tmp16815, _mm512_set1_ps(2e+00f), tmp16802);
tmp16809 = _mm512_fnmadd_ps(tmp16819, _mm512_set1_ps(2e+00f), tmp16809);
// Pack the finished registers pairwise for storing. With immediate 68 the
// result is the low 256 bits of the first operand followed by the low 256 bits
// of the second; with 238 it is the two high 256-bit halves.
__m512 out2175 = _mm512_shuffle_f32x4(in2360, tmp16817, 68);
__m512 out2183 = _mm512_shuffle_f32x4(in2360, tmp16817, 238);
__m512 out2176 = _mm512_shuffle_f32x4(tmp16818, tmp16805, 68);
__m512 out2184 = _mm512_shuffle_f32x4(tmp16818, tmp16805, 238);
__m512 out2177 = _mm512_shuffle_f32x4(tmp16816, tmp16804, 68);
__m512 out2185 = _mm512_shuffle_f32x4(tmp16816, tmp16804, 238);
__m512 out2178 = _mm512_shuffle_f32x4(tmp16802, tmp16806, 68);
__m512 out2186 = _mm512_shuffle_f32x4(tmp16802, tmp16806, 238);
__m512 out2179 = _mm512_shuffle_f32x4(in2363, tmp16821, 68);
__m512 out2187 = _mm512_shuffle_f32x4(in2363, tmp16821, 238);
__m512 out2180 = _mm512_shuffle_f32x4(tmp16822, tmp16812, 68);
__m512 out2188 = _mm512_shuffle_f32x4(tmp16822, tmp16812, 238);
__m512 out2181 = _mm512_shuffle_f32x4(tmp16820, tmp16811, 68);
__m512 out2189 = _mm512_shuffle_f32x4(tmp16820, tmp16811, 238);
__m512 out2182 = _mm512_shuffle_f32x4(tmp16809, tmp16813, 68);
__m512 out2190 = _mm512_shuffle_f32x4(tmp16809, tmp16813, 238);
// Store the sixteen packed vectors to dfPtr12. The four groups start at the
// constant offsets 0, 147456, 294912 and 442368; inside a group the vectors
// are written at +0, +128, +64 and +192. The i53/j45/s43/k149 terms select the
// destination slot.
_mm512_storeu_ps(dfPtr12+0+589824*i53+98304*j45+24576*s43+768*k149, out2175);
_mm512_storeu_ps(dfPtr12+128+589824*i53+98304*j45+24576*s43+768*k149, out2183);
_mm512_storeu_ps(dfPtr12+64+589824*i53+98304*j45+24576*s43+768*k149, out2179);
_mm512_storeu_ps(dfPtr12+192+589824*i53+98304*j45+24576*s43+768*k149, out2187);
_mm512_storeu_ps(dfPtr12+147456+589824*i53+98304*j45+24576*s43+768*k149, out2176);
_mm512_storeu_ps(dfPtr12+147584+589824*i53+98304*j45+24576*s43+768*k149, out2184);
_mm512_storeu_ps(dfPtr12+147520+589824*i53+98304*j45+24576*s43+768*k149, out2180);
_mm512_storeu_ps(dfPtr12+147648+589824*i53+98304*j45+24576*s43+768*k149, out2188);
_mm512_storeu_ps(dfPtr12+294912+589824*i53+98304*j45+24576*s43+768*k149, out2177);
_mm512_storeu_ps(dfPtr12+295040+589824*i53+98304*j45+24576*s43+768*k149, out2185);
_mm512_storeu_ps(dfPtr12+294976+589824*i53+98304*j45+24576*s43+768*k149, out2181);
_mm512_storeu_ps(dfPtr12+295104+589824*i53+98304*j45+24576*s43+768*k149, out2189);
_mm512_storeu_ps(dfPtr12+442368+589824*i53+98304*j45+24576*s43+768*k149, out2178);
_mm512_storeu_ps(dfPtr12+442496+589824*i53+98304*j45+24576*s43+768*k149, out2186);
_mm512_storeu_ps(dfPtr12+442432+589824*i53+98304*j45+24576*s43+768*k149, out2182);
_mm512_storeu_ps(dfPtr12+442560+589824*i53+98304*j45+24576*s43+768*k149, out2190);
// Load three rows from datPtr27 (constant offsets +856/+912/+968, i.e. 56
// apart, the same factor that multiplies h54), each paired with a second load
// at +1668/+1724/+1780. Mask 511 keeps lanes 0..8 and mask 8191 keeps lanes
// 0..12; all other lanes are zero-filled. Every loaded vector is then clamped
// with max(0, x) (ReLU).
__m512 dat2296 = _mm512_maskz_loadu_ps(511, datPtr27+856+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2296 = _mm512_max_ps(_mm512_setzero_ps(), dat2296);
__m512 dat2297 = _mm512_maskz_loadu_ps(8191, datPtr27+1668+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2297 = _mm512_max_ps(_mm512_setzero_ps(), dat2297);
// pm231: lanes 0..7 <- source lanes 0..7, lanes 8..10 <- source lanes 6..8,
// lanes 11..15 <- source lane 15 (zero, masked out by the load).
__m512i pm231 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in2366 = _mm512_permutexvar_ps(pm231, dat2296);
// pm232: lane 0 <- source lane 15 (zero, masked out by the load),
// lanes 1..7 <- source lanes 0..6, lanes 8..15 <- source lanes 5..12.
__m512i pm232 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in2369 = _mm512_permutexvar_ps(pm232, dat2297);
// Second row: same masks and permutes, offsets advanced by 56.
__m512 dat2298 = _mm512_maskz_loadu_ps(511, datPtr27+912+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2298 = _mm512_max_ps(_mm512_setzero_ps(), dat2298);
__m512 dat2299 = _mm512_maskz_loadu_ps(8191, datPtr27+1724+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2299 = _mm512_max_ps(_mm512_setzero_ps(), dat2299);
__m512 in2367 = _mm512_permutexvar_ps(pm231, dat2298);
__m512 in2370 = _mm512_permutexvar_ps(pm232, dat2299);
// Third row: offsets advanced by another 56.
__m512 dat2300 = _mm512_maskz_loadu_ps(511, datPtr27+968+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2300 = _mm512_max_ps(_mm512_setzero_ps(), dat2300);
__m512 dat2301 = _mm512_maskz_loadu_ps(8191, datPtr27+1780+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2301 = _mm512_max_ps(_mm512_setzero_ps(), dat2301);
__m512 in2368 = _mm512_permutexvar_ps(pm231, dat2300);
__m512 in2371 = _mm512_permutexvar_ps(pm232, dat2301);
// Combine the three input rows into eight linear combinations per register.
// Statements come in pairs: the first of each pair works on the group
// r0 = in2366, r1 = in2367, r2 = in2368, the second on the parallel group
// r0 = in2369, r1 = in2370, r2 = in2371 (values on entry to this span).
// Results on exit:
//   in2366   / in2369   = r0 - 5.25*r2
//   tmp16873 / tmp16880 = r1 + r2
//   tmp16874 / tmp16881 = r2 - r1
//   tmp16876 / tmp16883 = 0.5*r1 + 0.25*r2
//   tmp16872 / tmp16879 = 0.25*r2 - 0.5*r1
//   in2368   / in2371   = 2*r1 + 4*r2
//   tmp16875 / tmp16882 = 4*r2 - 2*r1
//   tmp16877 / tmp16884 = -r1
// NOTE(review): these coefficients look like a Winograd F(6,3)-style 8-point
// input transform with rows 3..7 taken as zero — confirm against the generator.
__m512 tmp16871 = in2367;
__m512 tmp16878 = in2370;
__m512 tmp16872 = _mm512_sub_ps(_mm512_setzero_ps(), in2368);
__m512 tmp16879 = _mm512_sub_ps(_mm512_setzero_ps(), in2371);
__m512 tmp16873 = in2368;
__m512 tmp16880 = in2371;
// The self-assignments below (and further down) have no effect.
in2366 = in2366;
in2369 = in2369;
tmp16871 = tmp16871;
tmp16878 = tmp16878;
tmp16873 = tmp16873;
tmp16880 = tmp16880;
// r0 + 5.25*(-r2)
in2366 = _mm512_fmadd_ps(tmp16872, _mm512_set1_ps(5.25e+00f), in2366);
in2369 = _mm512_fmadd_ps(tmp16879, _mm512_set1_ps(5.25e+00f), in2369);
tmp16872 = _mm512_mul_ps(in2368, _mm512_set1_ps(2.5e-01f));
tmp16879 = _mm512_mul_ps(in2371, _mm512_set1_ps(2.5e-01f));
in2368 = _mm512_mul_ps(in2368, _mm512_set1_ps(4e+00f));
in2371 = _mm512_mul_ps(in2371, _mm512_set1_ps(4e+00f));
__m512 tmp16874 = _mm512_sub_ps(tmp16873, tmp16871);
__m512 tmp16881 = _mm512_sub_ps(tmp16880, tmp16878);
tmp16873 = _mm512_add_ps(tmp16871, tmp16873);
tmp16880 = _mm512_add_ps(tmp16878, tmp16880);
tmp16871 = _mm512_mul_ps(in2367, _mm512_set1_ps(2.5e-01f));
tmp16878 = _mm512_mul_ps(in2370, _mm512_set1_ps(2.5e-01f));
tmp16872 = tmp16872;
tmp16879 = tmp16879;
__m512 tmp16875 = in2368;
__m512 tmp16882 = in2371;
tmp16871 = tmp16871;
tmp16878 = tmp16878;
// 0.25*r2 +/- 2*(0.25*r1)
__m512 tmp16876 = _mm512_fmadd_ps(tmp16871, _mm512_set1_ps(2e+00f), tmp16872);
__m512 tmp16883 = _mm512_fmadd_ps(tmp16878, _mm512_set1_ps(2e+00f), tmp16879);
tmp16872 = _mm512_fnmadd_ps(tmp16871, _mm512_set1_ps(2e+00f), tmp16872);
tmp16879 = _mm512_fnmadd_ps(tmp16878, _mm512_set1_ps(2e+00f), tmp16879);
// Save r1 before negating it in place.
tmp16871 = in2367;
tmp16878 = in2370;
in2367 = _mm512_sub_ps(_mm512_setzero_ps(), in2367);
in2370 = _mm512_sub_ps(_mm512_setzero_ps(), in2370);
tmp16871 = tmp16871;
tmp16878 = tmp16878;
__m512 tmp16877 = in2367;
__m512 tmp16884 = in2370;
// 4*r2 +/- 2*r1
in2368 = _mm512_fmadd_ps(tmp16871, _mm512_set1_ps(2e+00f), tmp16875);
in2371 = _mm512_fmadd_ps(tmp16878, _mm512_set1_ps(2e+00f), tmp16882);
tmp16875 = _mm512_fnmadd_ps(tmp16871, _mm512_set1_ps(2e+00f), tmp16875);
tmp16882 = _mm512_fnmadd_ps(tmp16878, _mm512_set1_ps(2e+00f), tmp16882);
// 16x16 transpose of the sixteen registers, taken in this order:
//   in2366, tmp16873, tmp16874, tmp16876, tmp16872, in2368, tmp16875, tmp16877,
//   in2369, tmp16880, tmp16881, tmp16883, tmp16879, in2371, tmp16882, tmp16884.
// On exit the k-th name in that list holds lane k of every input, with element
// i taken from the i-th input.
// Step 1: interleave register pairs inside each 128-bit lane.
__m512 tmp16893 = _mm512_unpacklo_ps(in2366, tmp16873);
__m512 tmp16894 = _mm512_unpackhi_ps(in2366, tmp16873);
__m512 tmp16895 = _mm512_unpacklo_ps(tmp16874, tmp16876);
__m512 tmp16896 = _mm512_unpackhi_ps(tmp16874, tmp16876);
__m512 tmp16897 = _mm512_unpacklo_ps(tmp16872, in2368);
__m512 tmp16898 = _mm512_unpackhi_ps(tmp16872, in2368);
__m512 tmp16899 = _mm512_unpacklo_ps(tmp16875, tmp16877);
__m512 tmp16900 = _mm512_unpackhi_ps(tmp16875, tmp16877);
__m512 tmp16901 = _mm512_unpacklo_ps(in2369, tmp16880);
__m512 tmp16902 = _mm512_unpackhi_ps(in2369, tmp16880);
__m512 tmp16903 = _mm512_unpacklo_ps(tmp16881, tmp16883);
__m512 tmp16904 = _mm512_unpackhi_ps(tmp16881, tmp16883);
__m512 tmp16905 = _mm512_unpacklo_ps(tmp16879, in2371);
__m512 tmp16906 = _mm512_unpackhi_ps(tmp16879, in2371);
__m512 tmp16907 = _mm512_unpacklo_ps(tmp16882, tmp16884);
__m512 tmp16908 = _mm512_unpackhi_ps(tmp16882, tmp16884);
// Step 2: join pairs (68 = low two elements of each operand, 238 = high two),
// so every 128-bit lane holds one source element from four registers.
__m512 tmp16909 = _mm512_shuffle_ps(tmp16893, tmp16895, 68);
__m512 tmp16910 = _mm512_shuffle_ps(tmp16893, tmp16895, 238);
__m512 tmp16911 = _mm512_shuffle_ps(tmp16894, tmp16896, 68);
__m512 tmp16912 = _mm512_shuffle_ps(tmp16894, tmp16896, 238);
__m512 tmp16913 = _mm512_shuffle_ps(tmp16897, tmp16899, 68);
__m512 tmp16914 = _mm512_shuffle_ps(tmp16897, tmp16899, 238);
__m512 tmp16915 = _mm512_shuffle_ps(tmp16898, tmp16900, 68);
__m512 tmp16916 = _mm512_shuffle_ps(tmp16898, tmp16900, 238);
__m512 tmp16917 = _mm512_shuffle_ps(tmp16901, tmp16903, 68);
__m512 tmp16918 = _mm512_shuffle_ps(tmp16901, tmp16903, 238);
__m512 tmp16919 = _mm512_shuffle_ps(tmp16902, tmp16904, 68);
__m512 tmp16920 = _mm512_shuffle_ps(tmp16902, tmp16904, 238);
__m512 tmp16921 = _mm512_shuffle_ps(tmp16905, tmp16907, 68);
__m512 tmp16922 = _mm512_shuffle_ps(tmp16905, tmp16907, 238);
__m512 tmp16923 = _mm512_shuffle_ps(tmp16906, tmp16908, 68);
__m512 tmp16924 = _mm512_shuffle_ps(tmp16906, tmp16908, 238);
// Step 3: regroup whole 128-bit lanes (136 = lanes 0 and 2 of each operand,
// 221 = lanes 1 and 3).
__m512 tmp16925 = _mm512_shuffle_f32x4(tmp16909, tmp16913, 136);
__m512 tmp16926 = _mm512_shuffle_f32x4(tmp16909, tmp16913, 221);
__m512 tmp16927 = _mm512_shuffle_f32x4(tmp16910, tmp16914, 136);
__m512 tmp16928 = _mm512_shuffle_f32x4(tmp16910, tmp16914, 221);
__m512 tmp16929 = _mm512_shuffle_f32x4(tmp16911, tmp16915, 136);
__m512 tmp16930 = _mm512_shuffle_f32x4(tmp16911, tmp16915, 221);
__m512 tmp16931 = _mm512_shuffle_f32x4(tmp16912, tmp16916, 136);
__m512 tmp16932 = _mm512_shuffle_f32x4(tmp16912, tmp16916, 221);
__m512 tmp16933 = _mm512_shuffle_f32x4(tmp16917, tmp16921, 136);
__m512 tmp16934 = _mm512_shuffle_f32x4(tmp16917, tmp16921, 221);
__m512 tmp16935 = _mm512_shuffle_f32x4(tmp16918, tmp16922, 136);
__m512 tmp16936 = _mm512_shuffle_f32x4(tmp16918, tmp16922, 221);
__m512 tmp16937 = _mm512_shuffle_f32x4(tmp16919, tmp16923, 136);
__m512 tmp16938 = _mm512_shuffle_f32x4(tmp16919, tmp16923, 221);
__m512 tmp16939 = _mm512_shuffle_f32x4(tmp16920, tmp16924, 136);
__m512 tmp16940 = _mm512_shuffle_f32x4(tmp16920, tmp16924, 221);
// Step 4: second lane regrouping, written back into the original names.
in2366 = _mm512_shuffle_f32x4(tmp16925, tmp16933, 136);
in2369 = _mm512_shuffle_f32x4(tmp16925, tmp16933, 221);
tmp16873 = _mm512_shuffle_f32x4(tmp16927, tmp16935, 136);
tmp16880 = _mm512_shuffle_f32x4(tmp16927, tmp16935, 221);
tmp16874 = _mm512_shuffle_f32x4(tmp16929, tmp16937, 136);
tmp16881 = _mm512_shuffle_f32x4(tmp16929, tmp16937, 221);
tmp16876 = _mm512_shuffle_f32x4(tmp16931, tmp16939, 136);
tmp16883 = _mm512_shuffle_f32x4(tmp16931, tmp16939, 221);
tmp16872 = _mm512_shuffle_f32x4(tmp16926, tmp16934, 136);
tmp16879 = _mm512_shuffle_f32x4(tmp16926, tmp16934, 221);
in2368 = _mm512_shuffle_f32x4(tmp16928, tmp16936, 136);
in2371 = _mm512_shuffle_f32x4(tmp16928, tmp16936, 221);
tmp16875 = _mm512_shuffle_f32x4(tmp16930, tmp16938, 136);
tmp16882 = _mm512_shuffle_f32x4(tmp16930, tmp16938, 221);
tmp16877 = _mm512_shuffle_f32x4(tmp16932, tmp16940, 136);
tmp16884 = _mm512_shuffle_f32x4(tmp16932, tmp16940, 221);
// Eight-point linear combination across the registers
//   c0..c7 = in2366, tmp16873, tmp16874, tmp16876, tmp16872, in2368, tmp16875,
//            tmp16877   (values on entry to this span).
// Every statement is immediately repeated for the parallel set
//   in2369, tmp16880, tmp16881, tmp16883, tmp16879, in2371, tmp16882, tmp16884.
// Results on exit (first set; the parallel set is analogous):
//   in2366   =  c0 - 5.25*c2 + 5.25*c4 - c6
//   tmp16887 =  c1 + c2 - 4.25*c3 - 4.25*c4 + c5 + c6
//   tmp16888 = -c1 + c2 + 4.25*c3 - 4.25*c4 - c5 + c6
//   tmp16875 =  0.5*c1 + 0.25*c2 - 2.5*c3 - 1.25*c4 + 2*c5 + c6
//   tmp16886 = -0.5*c1 + 0.25*c2 + 2.5*c3 - 1.25*c4 - 2*c5 + c6
//   tmp16874 =  2*c1 + 4*c2 - 2.5*c3 - 5*c4 + 0.5*c5 + c6
//   tmp16872 = -2*c1 + 4*c2 + 2.5*c3 - 5*c4 - 0.5*c5 + c6
//   tmp16876 = -c1 + 5.25*c3 - 5.25*c5 + c7
// NOTE(review): coefficients match a Winograd F(6,3)-style input transform —
// confirm against the generator.
__m512 tmp16885 = _mm512_add_ps(tmp16873, in2368);
__m512 tmp16889 = _mm512_add_ps(tmp16880, in2371);
__m512 tmp16886 = _mm512_sub_ps(tmp16872, tmp16874);
__m512 tmp16890 = _mm512_sub_ps(tmp16879, tmp16881);
__m512 tmp16887 = _mm512_add_ps(tmp16874, tmp16875);
__m512 tmp16891 = _mm512_add_ps(tmp16881, tmp16882);
in2366 = _mm512_sub_ps(in2366, tmp16875);
in2369 = _mm512_sub_ps(in2369, tmp16882);
tmp16885 = _mm512_fmadd_ps(tmp16876, _mm512_set1_ps(-4.25e+00f), tmp16885);
tmp16889 = _mm512_fmadd_ps(tmp16883, _mm512_set1_ps(-4.25e+00f), tmp16889);
tmp16887 = _mm512_fmadd_ps(tmp16872, _mm512_set1_ps(-4.25e+00f), tmp16887);
tmp16891 = _mm512_fmadd_ps(tmp16879, _mm512_set1_ps(-4.25e+00f), tmp16891);
in2366 = _mm512_fmadd_ps(tmp16886, _mm512_set1_ps(5.25e+00f), in2366);
in2369 = _mm512_fmadd_ps(tmp16890, _mm512_set1_ps(5.25e+00f), in2369);
// tmp16886/tmp16890 are reused from here on as new partial sums.
tmp16886 = _mm512_fmadd_ps(tmp16874, _mm512_set1_ps(2.5e-01f), tmp16875);
tmp16890 = _mm512_fmadd_ps(tmp16881, _mm512_set1_ps(2.5e-01f), tmp16882);
tmp16874 = _mm512_fmadd_ps(tmp16874, _mm512_set1_ps(4e+00f), tmp16875);
tmp16881 = _mm512_fmadd_ps(tmp16881, _mm512_set1_ps(4e+00f), tmp16882);
// Difference and sum of the two -4.25 partial sums.
__m512 tmp16888 = _mm512_sub_ps(tmp16887, tmp16885);
__m512 tmp16892 = _mm512_sub_ps(tmp16891, tmp16889);
tmp16887 = _mm512_add_ps(tmp16885, tmp16887);
tmp16891 = _mm512_add_ps(tmp16889, tmp16891);
tmp16885 = _mm512_fmadd_ps(tmp16873, _mm512_set1_ps(2.5e-01f), in2368);
tmp16889 = _mm512_fmadd_ps(tmp16880, _mm512_set1_ps(2.5e-01f), in2371);
tmp16886 = _mm512_fmadd_ps(tmp16872, _mm512_set1_ps(-1.25e+00f), tmp16886);
tmp16890 = _mm512_fmadd_ps(tmp16879, _mm512_set1_ps(-1.25e+00f), tmp16890);
tmp16872 = _mm512_fmadd_ps(tmp16872, _mm512_set1_ps(-5e+00f), tmp16874);
tmp16879 = _mm512_fmadd_ps(tmp16879, _mm512_set1_ps(-5e+00f), tmp16881);
tmp16885 = _mm512_fmadd_ps(tmp16876, _mm512_set1_ps(-1.25e+00f), tmp16885);
tmp16889 = _mm512_fmadd_ps(tmp16883, _mm512_set1_ps(-1.25e+00f), tmp16889);
tmp16875 = _mm512_fmadd_ps(tmp16885, _mm512_set1_ps(2e+00f), tmp16886);
tmp16882 = _mm512_fmadd_ps(tmp16889, _mm512_set1_ps(2e+00f), tmp16890);
tmp16886 = _mm512_fnmadd_ps(tmp16885, _mm512_set1_ps(2e+00f), tmp16886);
tmp16890 = _mm512_fnmadd_ps(tmp16889, _mm512_set1_ps(2e+00f), tmp16890);
tmp16885 = _mm512_fmadd_ps(in2368, _mm512_set1_ps(2.5e-01f), tmp16873);
tmp16889 = _mm512_fmadd_ps(in2371, _mm512_set1_ps(2.5e-01f), tmp16880);
tmp16873 = _mm512_sub_ps(tmp16877, tmp16873);
tmp16880 = _mm512_sub_ps(tmp16884, tmp16880);
tmp16885 = _mm512_fmadd_ps(tmp16876, _mm512_set1_ps(-1.25e+00f), tmp16885);
tmp16889 = _mm512_fmadd_ps(tmp16883, _mm512_set1_ps(-1.25e+00f), tmp16889);
tmp16876 = _mm512_sub_ps(tmp16876, in2368);
tmp16883 = _mm512_sub_ps(tmp16883, in2371);
tmp16876 = _mm512_fmadd_ps(tmp16876, _mm512_set1_ps(5.25e+00f), tmp16873);
tmp16883 = _mm512_fmadd_ps(tmp16883, _mm512_set1_ps(5.25e+00f), tmp16880);
tmp16874 = _mm512_fmadd_ps(tmp16885, _mm512_set1_ps(2e+00f), tmp16872);
tmp16881 = _mm512_fmadd_ps(tmp16889, _mm512_set1_ps(2e+00f), tmp16879);
tmp16872 = _mm512_fnmadd_ps(tmp16885, _mm512_set1_ps(2e+00f), tmp16872);
tmp16879 = _mm512_fnmadd_ps(tmp16889, _mm512_set1_ps(2e+00f), tmp16879);
// Pack the finished registers pairwise for storing. With immediate 68 the
// result is the low 256 bits of the first operand followed by the low 256 bits
// of the second; with 238 it is the two high 256-bit halves.
__m512 out2191 = _mm512_shuffle_f32x4(in2366, tmp16887, 68);
__m512 out2199 = _mm512_shuffle_f32x4(in2366, tmp16887, 238);
__m512 out2192 = _mm512_shuffle_f32x4(tmp16888, tmp16875, 68);
__m512 out2200 = _mm512_shuffle_f32x4(tmp16888, tmp16875, 238);
__m512 out2193 = _mm512_shuffle_f32x4(tmp16886, tmp16874, 68);
__m512 out2201 = _mm512_shuffle_f32x4(tmp16886, tmp16874, 238);
__m512 out2194 = _mm512_shuffle_f32x4(tmp16872, tmp16876, 68);
__m512 out2202 = _mm512_shuffle_f32x4(tmp16872, tmp16876, 238);
__m512 out2195 = _mm512_shuffle_f32x4(in2369, tmp16891, 68);
__m512 out2203 = _mm512_shuffle_f32x4(in2369, tmp16891, 238);
__m512 out2196 = _mm512_shuffle_f32x4(tmp16892, tmp16882, 68);
__m512 out2204 = _mm512_shuffle_f32x4(tmp16892, tmp16882, 238);
__m512 out2197 = _mm512_shuffle_f32x4(tmp16890, tmp16881, 68);
__m512 out2205 = _mm512_shuffle_f32x4(tmp16890, tmp16881, 238);
__m512 out2198 = _mm512_shuffle_f32x4(tmp16879, tmp16883, 68);
__m512 out2206 = _mm512_shuffle_f32x4(tmp16879, tmp16883, 238);
// Store the sixteen packed vectors to dfPtr12. The four groups start at the
// constant offsets 256, 147712, 295168 and 442624; inside a group the vectors
// are written at +0, +128, +64 and +192. The i53/j45/s43/k149 terms select the
// destination slot.
_mm512_storeu_ps(dfPtr12+256+589824*i53+98304*j45+24576*s43+768*k149, out2191);
_mm512_storeu_ps(dfPtr12+384+589824*i53+98304*j45+24576*s43+768*k149, out2199);
_mm512_storeu_ps(dfPtr12+320+589824*i53+98304*j45+24576*s43+768*k149, out2195);
_mm512_storeu_ps(dfPtr12+448+589824*i53+98304*j45+24576*s43+768*k149, out2203);
_mm512_storeu_ps(dfPtr12+147712+589824*i53+98304*j45+24576*s43+768*k149, out2192);
_mm512_storeu_ps(dfPtr12+147840+589824*i53+98304*j45+24576*s43+768*k149, out2200);
_mm512_storeu_ps(dfPtr12+147776+589824*i53+98304*j45+24576*s43+768*k149, out2196);
_mm512_storeu_ps(dfPtr12+147904+589824*i53+98304*j45+24576*s43+768*k149, out2204);
_mm512_storeu_ps(dfPtr12+295168+589824*i53+98304*j45+24576*s43+768*k149, out2193);
_mm512_storeu_ps(dfPtr12+295296+589824*i53+98304*j45+24576*s43+768*k149, out2201);
_mm512_storeu_ps(dfPtr12+295232+589824*i53+98304*j45+24576*s43+768*k149, out2197);
_mm512_storeu_ps(dfPtr12+295360+589824*i53+98304*j45+24576*s43+768*k149, out2205);
_mm512_storeu_ps(dfPtr12+442624+589824*i53+98304*j45+24576*s43+768*k149, out2194);
_mm512_storeu_ps(dfPtr12+442752+589824*i53+98304*j45+24576*s43+768*k149, out2202);
_mm512_storeu_ps(dfPtr12+442688+589824*i53+98304*j45+24576*s43+768*k149, out2198);
_mm512_storeu_ps(dfPtr12+442816+589824*i53+98304*j45+24576*s43+768*k149, out2206);
// Load three rows from datPtr27 (constant offsets +1712/+1768/+1824, i.e. 56
// apart, the same factor that multiplies h54), each paired with a second load
// at +2500/+2556/+2612. Mask 7 keeps lanes 0..2 and mask 16383 keeps lanes
// 0..13; all other lanes are zero-filled. Every loaded vector is then clamped
// with max(0, x) (ReLU).
__m512 dat2302 = _mm512_maskz_loadu_ps(7, datPtr27+1712+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2302 = _mm512_max_ps(_mm512_setzero_ps(), dat2302);
__m512 dat2303 = _mm512_maskz_loadu_ps(16383, datPtr27+2500+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2303 = _mm512_max_ps(_mm512_setzero_ps(), dat2303);
// pm233 (two-source permute): lanes 0..2 <- first source lanes 0..2,
// lanes 3..8 <- first source lane 15 (zero, masked out by the load),
// lanes 9..15 <- second source lanes 0..6.
__m512i pm233 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 15, 2, 1, 0);
__m512 in2372 = _mm512_permutex2var_ps(dat2302, pm233, dat2303);
// pm234: lanes 0..7 <- source lanes 5..12, lanes 8..10 <- source lanes 11..13,
// lanes 11..15 <- source lane 15 (zero, masked out by the load).
__m512i pm234 = _mm512_set_epi32(15, 15, 15, 15, 15, 13, 12, 11, 12, 11, 10, 9, 8, 7, 6, 5);
__m512 in2375 = _mm512_permutexvar_ps(pm234, dat2303);
// Second row: same masks and permutes, offsets advanced by 56.
__m512 dat2304 = _mm512_maskz_loadu_ps(7, datPtr27+1768+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2304 = _mm512_max_ps(_mm512_setzero_ps(), dat2304);
__m512 dat2305 = _mm512_maskz_loadu_ps(16383, datPtr27+2556+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2305 = _mm512_max_ps(_mm512_setzero_ps(), dat2305);
__m512 in2373 = _mm512_permutex2var_ps(dat2304, pm233, dat2305);
__m512 in2376 = _mm512_permutexvar_ps(pm234, dat2305);
// Third row: offsets advanced by another 56.
__m512 dat2306 = _mm512_maskz_loadu_ps(7, datPtr27+1824+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2306 = _mm512_max_ps(_mm512_setzero_ps(), dat2306);
__m512 dat2307 = _mm512_maskz_loadu_ps(16383, datPtr27+2612+212992*i53+56*h54+4*w66+106496*s43+3328*k149);
dat2307 = _mm512_max_ps(_mm512_setzero_ps(), dat2307);
__m512 in2374 = _mm512_permutex2var_ps(dat2306, pm233, dat2307);
__m512 in2377 = _mm512_permutexvar_ps(pm234, dat2307);
// Combine the three input rows into eight linear combinations per register.
// Statements come in pairs: the first of each pair works on the group
// r0 = in2372, r1 = in2373, r2 = in2374, the second on the parallel group
// r0 = in2375, r1 = in2376, r2 = in2377 (values on entry to this span).
// Results on exit:
//   in2372   / in2375   = r0 - 5.25*r2
//   tmp16943 / tmp16950 = r1 + r2
//   tmp16944 / tmp16951 = r2 - r1
//   tmp16946 / tmp16953 = 0.5*r1 + 0.25*r2
//   tmp16942 / tmp16949 = 0.25*r2 - 0.5*r1
//   in2374   / in2377   = 2*r1 + 4*r2
//   tmp16945 / tmp16952 = 4*r2 - 2*r1
//   tmp16947 / tmp16954 = -r1
// NOTE(review): these coefficients look like a Winograd F(6,3)-style 8-point
// input transform with rows 3..7 taken as zero — confirm against the generator.
__m512 tmp16941 = in2373;
__m512 tmp16948 = in2376;
__m512 tmp16942 = _mm512_sub_ps(_mm512_setzero_ps(), in2374);
__m512 tmp16949 = _mm512_sub_ps(_mm512_setzero_ps(), in2377);
__m512 tmp16943 = in2374;
__m512 tmp16950 = in2377;
// The self-assignments below (and further down) have no effect.
in2372 = in2372;
in2375 = in2375;
tmp16941 = tmp16941;
tmp16948 = tmp16948;
tmp16943 = tmp16943;
tmp16950 = tmp16950;
// r0 + 5.25*(-r2)
in2372 = _mm512_fmadd_ps(tmp16942, _mm512_set1_ps(5.25e+00f), in2372);
in2375 = _mm512_fmadd_ps(tmp16949, _mm512_set1_ps(5.25e+00f), in2375);
tmp16942 = _mm512_mul_ps(in2374, _mm512_set1_ps(2.5e-01f));
tmp16949 = _mm512_mul_ps(in2377, _mm512_set1_ps(2.5e-01f));
in2374 = _mm512_mul_ps(in2374, _mm512_set1_ps(4e+00f));
in2377 = _mm512_mul_ps(in2377, _mm512_set1_ps(4e+00f));
__m512 tmp16944 = _mm512_sub_ps(tmp16943, tmp16941);
__m512 tmp16951 = _mm512_sub_ps(tmp16950, tmp16948);
tmp16943 = _mm512_add_ps(tmp16941, tmp16943);
tmp16950 = _mm512_add_ps(tmp16948, tmp16950);
tmp16941 = _mm512_mul_ps(in2373, _mm512_set1_ps(2.5e-01f));
tmp16948 = _mm512_mul_ps(in2376, _mm512_set1_ps(2.5e-01f));
tmp16942 = tmp16942;
tmp16949 = tmp16949;
__m512 tmp16945 = in2374;
__m512 tmp16952 = in2377;
tmp16941 = tmp16941;
tmp16948 = tmp16948;
// 0.25*r2 +/- 2*(0.25*r1)
__m512 tmp16946 = _mm512_fmadd_ps(tmp16941, _mm512_set1_ps(2e+00f), tmp16942);
__m512 tmp16953 = _mm512_fmadd_ps(tmp16948, _mm512_set1_ps(2e+00f), tmp16949);
tmp16942 = _mm512_fnmadd_ps(tmp16941, _mm512_set1_ps(2e+00f), tmp16942);
tmp16949 = _mm512_fnmadd_ps(tmp16948, _mm512_set1_ps(2e+00f), tmp16949);
// Save r1 before negating it in place.
tmp16941 = in2373;
tmp16948 = in2376;
in2373 = _mm512_sub_ps(_mm512_setzero_ps(), in2373);
in2376 = _mm512_sub_ps(_mm512_setzero_ps(), in2376);
tmp16941 = tmp16941;
tmp16948 = tmp16948;
__m512 tmp16947 = in2373;
__m512 tmp16954 = in2376;
// 4*r2 +/- 2*r1
in2374 = _mm512_fmadd_ps(tmp16941, _mm512_set1_ps(2e+00f), tmp16945);
in2377 = _mm512_fmadd_ps(tmp16948, _mm512_set1_ps(2e+00f), tmp16952);
tmp16945 = _mm512_fnmadd_ps(tmp16941, _mm512_set1_ps(2e+00f), tmp16945);
tmp16952 = _mm512_fnmadd_ps(tmp16948, _mm512_set1_ps(2e+00f), tmp16952);
// 16x16 transpose of the sixteen registers, taken in this order:
//   in2372, tmp16943, tmp16944, tmp16946, tmp16942, in2374, tmp16945, tmp16947,
//   in2375, tmp16950, tmp16951, tmp16953, tmp16949, in2377, tmp16952, tmp16954.
// On exit the k-th name in that list holds lane k of every input, with element
// i taken from the i-th input.
// Step 1: interleave register pairs inside each 128-bit lane.
__m512 tmp16963 = _mm512_unpacklo_ps(in2372, tmp16943);
__m512 tmp16964 = _mm512_unpackhi_ps(in2372, tmp16943);
__m512 tmp16965 = _mm512_unpacklo_ps(tmp16944, tmp16946);
__m512 tmp16966 = _mm512_unpackhi_ps(tmp16944, tmp16946);
__m512 tmp16967 = _mm512_unpacklo_ps(tmp16942, in2374);
__m512 tmp16968 = _mm512_unpackhi_ps(tmp16942, in2374);
__m512 tmp16969 = _mm512_unpacklo_ps(tmp16945, tmp16947);
__m512 tmp16970 = _mm512_unpackhi_ps(tmp16945, tmp16947);
__m512 tmp16971 = _mm512_unpacklo_ps(in2375, tmp16950);
__m512 tmp16972 = _mm512_unpackhi_ps(in2375, tmp16950);
__m512 tmp16973 = _mm512_unpacklo_ps(tmp16951, tmp16953);
__m512 tmp16974 = _mm512_unpackhi_ps(tmp16951, tmp16953);
__m512 tmp16975 = _mm512_unpacklo_ps(tmp16949, in2377);
__m512 tmp16976 = _mm512_unpackhi_ps(tmp16949, in2377);
__m512 tmp16977 = _mm512_unpacklo_ps(tmp16952, tmp16954);
__m512 tmp16978 = _mm512_unpackhi_ps(tmp16952, tmp16954);
// Step 2: join pairs (68 = low two elements of each operand, 238 = high two),
// so every 128-bit lane holds one source element from four registers.
__m512 tmp16979 = _mm512_shuffle_ps(tmp16963, tmp16965, 68);
__m512 tmp16980 = _mm512_shuffle_ps(tmp16963, tmp16965, 238);
__m512 tmp16981 = _mm512_shuffle_ps(tmp16964, tmp16966, 68);
__m512 tmp16982 = _mm512_shuffle_ps(tmp16964, tmp16966, 238);
__m512 tmp16983 = _mm512_shuffle_ps(tmp16967, tmp16969, 68);
__m512 tmp16984 = _mm512_shuffle_ps(tmp16967, tmp16969, 238);
__m512 tmp16985 = _mm512_shuffle_ps(tmp16968, tmp16970, 68);
__m512 tmp16986 = _mm512_shuffle_ps(tmp16968, tmp16970, 238);
__m512 tmp16987 = _mm512_shuffle_ps(tmp16971, tmp16973, 68);
__m512 tmp16988 = _mm512_shuffle_ps(tmp16971, tmp16973, 238);
__m512 tmp16989 = _mm512_shuffle_ps(tmp16972, tmp16974, 68);
__m512 tmp16990 = _mm512_shuffle_ps(tmp16972, tmp16974, 238);
__m512 tmp16991 = _mm512_shuffle_ps(tmp16975, tmp16977, 68);
__m512 tmp16992 = _mm512_shuffle_ps(tmp16975, tmp16977, 238);
__m512 tmp16993 = _mm512_shuffle_ps(tmp16976, tmp16978, 68);
__m512 tmp16994 = _mm512_shuffle_ps(tmp16976, tmp16978, 238);
// Step 3: regroup whole 128-bit lanes (136 = lanes 0 and 2 of each operand,
// 221 = lanes 1 and 3).
__m512 tmp16995 = _mm512_shuffle_f32x4(tmp16979, tmp16983, 136);
__m512 tmp16996 = _mm512_shuffle_f32x4(tmp16979, tmp16983, 221);
__m512 tmp16997 = _mm512_shuffle_f32x4(tmp16980, tmp16984, 136);
__m512 tmp16998 = _mm512_shuffle_f32x4(tmp16980, tmp16984, 221);
__m512 tmp16999 = _mm512_shuffle_f32x4(tmp16981, tmp16985, 136);
__m512 tmp17000 = _mm512_shuffle_f32x4(tmp16981, tmp16985, 221);
__m512 tmp17001 = _mm512_shuffle_f32x4(tmp16982, tmp16986, 136);
__m512 tmp17002 = _mm512_shuffle_f32x4(tmp16982, tmp16986, 221);
__m512 tmp17003 = _mm512_shuffle_f32x4(tmp16987, tmp16991, 136);
__m512 tmp17004 = _mm512_shuffle_f32x4(tmp16987, tmp16991, 221);
__m512 tmp17005 = _mm512_shuffle_f32x4(tmp16988, tmp16992, 136);
__m512 tmp17006 = _mm512_shuffle_f32x4(tmp16988, tmp16992, 221);
__m512 tmp17007 = _mm512_shuffle_f32x4(tmp16989, tmp16993, 136);
__m512 tmp17008 = _mm512_shuffle_f32x4(tmp16989, tmp16993, 221);
__m512 tmp17009 = _mm512_shuffle_f32x4(tmp16990, tmp16994, 136);
__m512 tmp17010 = _mm512_shuffle_f32x4(tmp16990, tmp16994, 221);
// Step 4: second lane regrouping, written back into the original names.
in2372 = _mm512_shuffle_f32x4(tmp16995, tmp17003, 136);
in2375 = _mm512_shuffle_f32x4(tmp16995, tmp17003, 221);
tmp16943 = _mm512_shuffle_f32x4(tmp16997, tmp17005, 136);
tmp16950 = _mm512_shuffle_f32x4(tmp16997, tmp17005, 221);
tmp16944 = _mm512_shuffle_f32x4(tmp16999, tmp17007, 136);
tmp16951 = _mm512_shuffle_f32x4(tmp16999, tmp17007, 221);
tmp16946 = _mm512_shuffle_f32x4(tmp17001, tmp17009, 136);
tmp16953 = _mm512_shuffle_f32x4(tmp17001, tmp17009, 221);
tmp16942 = _mm512_shuffle_f32x4(tmp16996, tmp17004, 136);
tmp16949 = _mm512_shuffle_f32x4(tmp16996, tmp17004, 221);
in2374 = _mm512_shuffle_f32x4(tmp16998, tmp17006, 136);
in2377 = _mm512_shuffle_f32x4(tmp16998, tmp17006, 221);
tmp16945 = _mm512_shuffle_f32x4(tmp17000, tmp17008, 136);
tmp16952 = _mm512_shuffle_f32x4(tmp17000, tmp17008, 221);
tmp16947 = _mm512_shuffle_f32x4(tmp17002, tmp17010, 136);
tmp16954 = _mm512_shuffle_f32x4(tmp17002, tmp17010, 221);
__m512 tmp16955 = _mm512_add_ps(tmp16943, in2374);
__m512 tmp16959 = _mm512_add_ps(tmp16950, in2377);
__m512 tmp16956 = _mm512_sub_ps(tmp16942, tmp16944);
__m512 tmp16960 = _mm512_sub_ps(tmp16949, tmp16951);
__m512 tmp16957 = _mm512_add_ps(tmp16944, tmp16945);
__m512 tmp16961 = _mm512_add_ps(tmp16951, tmp16952);
in2372 = _mm512_sub_ps(in2372, tmp16945);
in2375 = _mm512_sub_ps(in2375, tmp16952);
tmp16955 = _mm512_fmadd_ps(tmp16946, _mm512_set1_ps(-4.25e+00f), tmp16955);
tmp16959 = _mm512_fmadd_ps(tmp16953, _mm512_set1_ps(-4.25e+00f), tmp16959);
tmp16957 = _mm512_fmadd_ps(tmp16942, _mm512_set1_ps(-4.25e+00f), tmp16957);
tmp16961 = _mm512_fmadd_ps(tmp16949, _mm512_set1_ps(-4.25e+00f), tmp16961);
in2372 = _mm512_fmadd_ps(tmp16956, _mm512_set1_ps(5.25e+00f), in2372);
in2375 = _mm512_fmadd_ps(tmp16960, _mm512_set1_ps(5.25e+00f), in2375);
tmp16956 = _mm512_fmadd_ps(tmp16944, _mm512_set1_ps(2.5e-01f), tmp16945);
tmp16960 = _mm512_fmadd_ps(tmp16951, _mm512_set1_ps(2.5e-01f), tmp16952);
tmp16944 = _mm512_fmadd_ps(tmp16944, _mm512_set1_ps(4e+00f), tmp16945);
tmp16951 = _mm512_fmadd_ps(tmp16951, _mm512_set1_ps(4e+00f), tmp16952);
__m512 tmp16958 = _mm512_sub_ps(tmp16957, tmp16955);
__m512 tmp16962 = _mm512_sub_ps(tmp16961, tmp16959);
tmp16957 = _mm512_add_ps(tmp16955, tmp16957);
tmp16961 = _mm512_add_ps(tmp16959, tmp16961);
tmp16955 = _mm512_fmadd_ps(tmp16943, _mm512_set1_ps(2.5e-01f), in2374);
tmp16959 = _mm512_fmadd_ps(tmp16950, _mm512_set1_ps(2.5e-01f), in2377);
tmp16956 = _mm512_fmadd_ps(tmp16942, _mm512_set1_ps(-1.25e+00f), tmp16956);
tmp16960 = _mm512_fmadd_ps(tmp16949, _mm512_set1_ps(-1.25e+00f), tmp16960);
tmp16942 = _mm512_fmadd_ps(tmp16942, _mm512_set1_ps(-5e+00f), tmp16944);
tmp16949 = _mm512_fmadd_ps(tmp16949, _mm512_set1_ps(-5e+00f), tmp16951);
tmp16955 = _mm512_fmadd_ps(tmp16946, _mm512_set1_ps(-1.25e+00f), tmp16955);
tmp16959 = _mm512_fmadd_ps(tmp16953, _mm512_set1_ps(-1.25e+00f), tmp16959);
tmp16945 = _mm512_fmadd_ps(tmp16955, _mm512_set1_ps(2e+00f), tmp16956);
tmp16952 = _mm512_fmadd_ps(tmp16959, _mm512_set1_ps(2e+00f), tmp16960);
tmp16956 = _mm512_fnmadd_ps(tmp16955, _mm512_set1_ps(2e+00f), tmp16956);
tmp16960 = _mm512_fnmadd_ps(tmp16959, _mm512_set1_ps(2e+00f), tmp16960);
tmp16955 = _mm512_fmadd_ps(in2374, _mm512_set1_ps(2.5e-01f), tmp16943);
tmp16959 = _mm512_fmadd_ps(in2377, _mm512_set1_ps(2.5e-01f), tmp16950);
tmp16943 = _mm512_sub_ps(tmp16947, tmp16943);
tmp16950 = _mm512_sub_ps(tmp16954, tmp16950);
tmp16955 = _mm512_fmadd_ps(tmp16946, _mm512_set1_ps(-1.25e+00f), tmp16955);
tmp16959 = _mm512_fmadd_ps(tmp16953, _mm512_set1_ps(-1.25e+00f), tmp16959);
tmp16946 = _mm512_sub_ps(tmp16946, in2374);
tmp16953 = _mm512_sub_ps(tmp16953, in2377);
tmp16946 = _mm512_fmadd_ps(tmp16946, _mm512_set1_ps(5.25e+00f), tmp16943);
tmp16953 = _mm512_fmadd_ps(tmp16953, _mm512_set1_ps(5.25e+00f), tmp16950);
tmp16944 = _mm512_fmadd_ps(tmp16955, _mm512_set1_ps(2e+00f), tmp16942);
tmp16951 = _mm512_fmadd_ps(tmp16959, _mm512_set1_ps(2e+00f), tmp16949);
tmp16942 = _mm512_fnmadd_ps(tmp16955, _mm512_set1_ps(2e+00f), tmp16942);
tmp16949 = _mm512_fnmadd_ps(tmp16959, _mm512_set1_ps(2e+00f), tmp16949);
__m512 out2207 = _mm512_shuffle_f32x4(in2372, tmp16957, 68);
__m512 out2215 = _mm512_shuffle_f32x4(in2372, tmp16957, 238);
__m512 out2208 = _mm512_shuffle_f32x4(tmp16958, tmp16945, 68);
__m512 out2216 = _mm512_shuffle_f32x4(tmp16958, tmp16945, 238);
__m512 out2209 = _mm512_shuffle_f32x4(tmp16956, tmp16944, 68);
__m512 out2217 = _mm512_shuffle_f32x4(tmp16956, tmp16944, 238);
__m512 out2210 = _mm512_shuffle_f32x4(tmp16942, tmp16946, 68);
__m512 out2218 = _mm512_shuffle_f32x4(tmp16942, tmp16946, 238);
__m512 out2211 = _mm512_shuffle_f32x4(in2375, tmp16961, 68);
__m512 out2219 = _mm512_shuffle_f32x4(in2375, tmp16961, 238);
__m512 out2212 = _mm512_shuffle_f32x4(tmp16962, tmp16952, 68);
__m512 out2220 = _mm512_shuffle_f32x4(tmp16962, tmp16952, 238);
__m512 out2213 = _mm512_shuffle_f32x4(tmp16960, tmp16951, 68);
__m512 out2221 = _mm512_shuffle_f32x4(tmp16960, tmp16951, 238);
__m512 out2214 = _mm512_shuffle_f32x4(tmp16949, tmp16953, 68);
__m512 out2222 = _mm512_shuffle_f32x4(tmp16949, tmp16953, 238);
_mm512_storeu_ps(dfPtr12+512+589824*i53+98304*j45+24576*s43+768*k149, out2207);
_mm512_storeu_ps(dfPtr12+640+589824*i53+98304*j45+24576*s43+768*k149, out2215);
_mm512_storeu_ps(dfPtr12+576+589824*i53+98304*j45+24576*s43+768*k149, out2211);
_mm512_storeu_ps(dfPtr12+704+589824*i53+98304*j45+24576*s43+768*k149, out2219);
_mm512_storeu_ps(dfPtr12+147968+589824*i53+98304*j45+24576*s43+768*k149, out2208);
_mm512_storeu_ps(dfPtr12+148096+589824*i53+98304*j45+24576*s43+768*k149, out2216);
_mm512_storeu_ps(dfPtr12+148032+589824*i53+98304*j45+24576*s43+768*k149, out2212);
_mm512_storeu_ps(dfPtr12+148160+589824*i53+98304*j45+24576*s43+768*k149, out2220);
_mm512_storeu_ps(dfPtr12+295424+589824*i53+98304*j45+24576*s43+768*k149, out2209);
_mm512_storeu_ps(dfPtr12+295552+589824*i53+98304*j45+24576*s43+768*k149, out2217);
_mm512_storeu_ps(dfPtr12+295488+589824*i53+98304*j45+24576*s43+768*k149, out2213);
_mm512_storeu_ps(dfPtr12+295616+589824*i53+98304*j45+24576*s43+768*k149, out2221);
_mm512_storeu_ps(dfPtr12+442880+589824*i53+98304*j45+24576*s43+768*k149, out2210);
_mm512_storeu_ps(dfPtr12+443008+589824*i53+98304*j45+24576*s43+768*k149, out2218);
_mm512_storeu_ps(dfPtr12+442944+589824*i53+98304*j45+24576*s43+768*k149, out2214);
_mm512_storeu_ps(dfPtr12+443072+589824*i53+98304*j45+24576*s43+768*k149, out2222);
}
if (j45 >= last11) return;
++j45;
}

// Builds a task descriptor for ResNet50ThreeArrangeDats5Callee1 and hands it
// to ResNet50ThreaderDo1 together with the given team.
//
// team57:    forwarded unchanged to ResNet50ThreaderDo1.
// tensors87: stored in the descriptor's any1 field for the callee to read.
//
// The descriptor is set up with nd1 = 4 and hull1 = {2, 2, 1, 1}.
// NOTE(review): hull1 is presumably the per-dimension extent of the task
// grid — confirm against the threader implementation.
static void ResNet50ThreeArrangeDats5(ResNet50ThreaderTeam1* team57, char** tensors87) {
	static const int64_t extents[4] = {2, 2, 1, 1};
	ResNet50ThreaderTask1 job;
	job.nd1 = 4;
	for (ptrdiff_t axis = 0; axis != 4; ++axis) {
		job.hull1[axis] = extents[axis];
	}
	job.any1 = tensors87;
	job.callee1 = ResNet50ThreeArrangeDats5Callee1;
	ResNet50ThreaderDo1(team57, &job);
}

// Task body for the "produce sums" stage. For the tile selected by pt51 it
// accumulates, over 256 steps, elementwise products of weight vectors
// (stored as half precision, widened to float) and float data vectors, and
// stores the resulting float accumulators.
//
// task92: task descriptor; task92->any1 points at an array of pointers whose
//         first element is the char** tensor table (no other element is read here).
// pt51:   task coordinates; pt51[0], pt51[1] and pt51[2] are read as w67, d18
//         and f48 respectively.
//
// Buffers: tensors90[0] is read at offset 0 (bfPtr13, four floats per l) and
// from offset 1024 onward (wfPtr13, half-precision values); tensors90[1]
// (dfPtr13) is read as float vectors; tensors90[2] (sfPtr12) is written.
// NOTE(review): the bf/wf/df/sf prefixes suggest bias, weights, data and sums
// in a transformed domain — confirm against the generator.
static void ResNet50ThreeProduceSums5Callee1(ResNet50ThreaderTask1* task92, int64_t* pt51) {
void** pair24 = task92->any1;
char** tensors90 = pair24[0];
// e27 and g31 are fixed at 0 in this function, so every address term scaled by
// e27 or by i54 (= 1*g31) below evaluates to zero.
ptrdiff_t e27 = 0;
ptrdiff_t g31 = 0;
ptrdiff_t f48 = pt51[2];
ptrdiff_t d18 = pt51[1];
ptrdiff_t w67 = pt51[0];
char*restrict bfPtr13 = tensors90[0]+1024*e27;
char*restrict wfPtr13 = tensors90[0]+1024+12976128*e27;
char*restrict dfPtr13 = tensors90[1]+912384*e27;
char*restrict sfPtr12 = tensors90[2];
ptrdiff_t i54 = 1*g31;
ptrdiff_t j46 = 1*f48;
ptrdiff_t k150 = 1*d18;
ptrdiff_t kk48 = k150+0;
// Wide path, entered when d18 != 1: a 4 x 6 grid of (weight vector, data
// vector) pairs, each with its own accumulator (24 in total). Every way out of
// the first iteration is a `return` (the check inside the l62 loop, or the
// k150 >= kk48 check, which holds because kk48 is the starting k150), so this
// loop never runs a second iteration and the narrow path after it is reached
// only when d18 == 1.
for (; k150 != 1; ++k150) {
// l62 starts at 2*w67; the check at the bottom of the loop body returns once
// l62 has reached ll9 = 2*w67+1, so one call handles at most two consecutive
// l62 values.
ptrdiff_t l62 = 2*w67;
ptrdiff_t ll9 = l62+1;
for (; l62 != 64; ++l62) {
__m512 sum445;
__m512 sum451;
__m512 sum457;
__m512 sum463;
// Starting values, one per weight vector. When j46 == 0 a single float from
// bfPtr13 is broadcast and merged into a zero vector under mask 512 (bit 9
// only), so just lane 9 starts non-zero; for any other j46 the accumulators
// start at zero.
if (__builtin_expect(!j46, 0)) {
sum445 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr13+0+1024*i54+16*l62)));
sum451 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr13+4+1024*i54+16*l62)));
sum457 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr13+8+1024*i54+16*l62)));
sum463 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr13+12+1024*i54+16*l62)));
} else {
sum445 = _mm512_setzero_ps();
sum451 = _mm512_setzero_ps();
sum457 = _mm512_setzero_ps();
sum463 = _mm512_setzero_ps();
}
// The six accumulators that share a weight vector all start from the same value.
__m512 sum446 = sum445;
__m512 sum447 = sum445;
__m512 sum448 = sum445;
__m512 sum449 = sum445;
__m512 sum450 = sum445;
__m512 sum452 = sum451;
__m512 sum453 = sum451;
__m512 sum454 = sum451;
__m512 sum455 = sum451;
__m512 sum456 = sum451;
__m512 sum458 = sum457;
__m512 sum459 = sum457;
__m512 sum460 = sum457;
__m512 sum461 = sum457;
__m512 sum462 = sum457;
__m512 sum464 = sum463;
__m512 sum465 = sum463;
__m512 sum466 = sum463;
__m512 sum467 = sum463;
__m512 sum468 = sum463;
// 256 accumulation steps. Each step reads 128 bytes from wfPtr13 as two
// 64-byte loads (mask 65535 enables all 16 dword lanes); the low and high
// 256-bit halves of each load are widened from half precision to 16 floats by
// _mm512_cvtph_ps, giving four weight vectors. Six 64-byte float vectors are
// read from dfPtr13 (384 bytes per step) and reused for all four weight
// vectors, for 24 fused multiply-adds per step.
ptrdiff_t b64 = 0;
for (; b64 != 256; ++b64) {
__m512i wfs33 = _mm512_maskz_loadu_epi32(65535, wfPtr13+0+8388608*i54+2097152*j46+32768*l62+128*b64);
__m512 wf145 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs33));
__m512 df675 = _mm512_loadu_ps(dfPtr13+0+589824*i54+147456*j46+98304*k150+384*b64);
sum445 = _mm512_fmadd_ps(wf145, df675, sum445);
__m512 df676 = _mm512_loadu_ps(dfPtr13+64+589824*i54+147456*j46+98304*k150+384*b64);
sum446 = _mm512_fmadd_ps(wf145, df676, sum446);
__m512 df677 = _mm512_loadu_ps(dfPtr13+128+589824*i54+147456*j46+98304*k150+384*b64);
sum447 = _mm512_fmadd_ps(wf145, df677, sum447);
__m512 df678 = _mm512_loadu_ps(dfPtr13+192+589824*i54+147456*j46+98304*k150+384*b64);
sum448 = _mm512_fmadd_ps(wf145, df678, sum448);
__m512 df679 = _mm512_loadu_ps(dfPtr13+256+589824*i54+147456*j46+98304*k150+384*b64);
sum449 = _mm512_fmadd_ps(wf145, df679, sum449);
__m512 df680 = _mm512_loadu_ps(dfPtr13+320+589824*i54+147456*j46+98304*k150+384*b64);
sum450 = _mm512_fmadd_ps(wf145, df680, sum450);
__m512 wf146 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs33, 1));
sum451 = _mm512_fmadd_ps(wf146, df675, sum451);
sum452 = _mm512_fmadd_ps(wf146, df676, sum452);
sum453 = _mm512_fmadd_ps(wf146, df677, sum453);
sum454 = _mm512_fmadd_ps(wf146, df678, sum454);
sum455 = _mm512_fmadd_ps(wf146, df679, sum455);
sum456 = _mm512_fmadd_ps(wf146, df680, sum456);
__m512i wfs34 = _mm512_maskz_loadu_epi32(65535, wfPtr13+64+8388608*i54+2097152*j46+32768*l62+128*b64);
__m512 wf147 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs34));
sum457 = _mm512_fmadd_ps(wf147, df675, sum457);
sum458 = _mm512_fmadd_ps(wf147, df676, sum458);
sum459 = _mm512_fmadd_ps(wf147, df677, sum459);
sum460 = _mm512_fmadd_ps(wf147, df678, sum460);
sum461 = _mm512_fmadd_ps(wf147, df679, sum461);
sum462 = _mm512_fmadd_ps(wf147, df680, sum462);
__m512 wf148 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs34, 1));
sum463 = _mm512_fmadd_ps(wf148, df675, sum463);
sum464 = _mm512_fmadd_ps(wf148, df676, sum464);
sum465 = _mm512_fmadd_ps(wf148, df677, sum465);
sum466 = _mm512_fmadd_ps(wf148, df678, sum466);
sum467 = _mm512_fmadd_ps(wf148, df679, sum467);
sum468 = _mm512_fmadd_ps(wf148, df680, sum468);
}
// Write the 24 accumulators back to back (24*64 = 1536 bytes per l62), grouped
// by weight vector: sum445..450, then sum451..456, sum457..462, sum463..468.
_mm512_storeu_ps(sfPtr12+0+589824*i54+147456*j46+98304*k150+1536*l62, sum445);
_mm512_storeu_ps(sfPtr12+64+589824*i54+147456*j46+98304*k150+1536*l62, sum446);
_mm512_storeu_ps(sfPtr12+128+589824*i54+147456*j46+98304*k150+1536*l62, sum447);
_mm512_storeu_ps(sfPtr12+192+589824*i54+147456*j46+98304*k150+1536*l62, sum448);
_mm512_storeu_ps(sfPtr12+256+589824*i54+147456*j46+98304*k150+1536*l62, sum449);
_mm512_storeu_ps(sfPtr12+320+589824*i54+147456*j46+98304*k150+1536*l62, sum450);
_mm512_storeu_ps(sfPtr12+384+589824*i54+147456*j46+98304*k150+1536*l62, sum451);
_mm512_storeu_ps(sfPtr12+448+589824*i54+147456*j46+98304*k150+1536*l62, sum452);
_mm512_storeu_ps(sfPtr12+512+589824*i54+147456*j46+98304*k150+1536*l62, sum453);
_mm512_storeu_ps(sfPtr12+576+589824*i54+147456*j46+98304*k150+1536*l62, sum454);
_mm512_storeu_ps(sfPtr12+640+589824*i54+147456*j46+98304*k150+1536*l62, sum455);
_mm512_storeu_ps(sfPtr12+704+589824*i54+147456*j46+98304*k150+1536*l62, sum456);
_mm512_storeu_ps(sfPtr12+768+589824*i54+147456*j46+98304*k150+1536*l62, sum457);
_mm512_storeu_ps(sfPtr12+832+589824*i54+147456*j46+98304*k150+1536*l62, sum458);
_mm512_storeu_ps(sfPtr12+896+589824*i54+147456*j46+98304*k150+1536*l62, sum459);
_mm512_storeu_ps(sfPtr12+960+589824*i54+147456*j46+98304*k150+1536*l62, sum460);
_mm512_storeu_ps(sfPtr12+1024+589824*i54+147456*j46+98304*k150+1536*l62, sum461);
_mm512_storeu_ps(sfPtr12+1088+589824*i54+147456*j46+98304*k150+1536*l62, sum462);
_mm512_storeu_ps(sfPtr12+1152+589824*i54+147456*j46+98304*k150+1536*l62, sum463);
_mm512_storeu_ps(sfPtr12+1216+589824*i54+147456*j46+98304*k150+1536*l62, sum464);
_mm512_storeu_ps(sfPtr12+1280+589824*i54+147456*j46+98304*k150+1536*l62, sum465);
_mm512_storeu_ps(sfPtr12+1344+589824*i54+147456*j46+98304*k150+1536*l62, sum466);
_mm512_storeu_ps(sfPtr12+1408+589824*i54+147456*j46+98304*k150+1536*l62, sum467);
_mm512_storeu_ps(sfPtr12+1472+589824*i54+147456*j46+98304*k150+1536*l62, sum468);
// Done once the second l62 value of this task has been stored.
if (l62 >= ll9) return;
}
// Reached only if the l62 loop ran out at 64 first; k150 still equals kk48
// here, so this returns as well.
if (k150 >= kk48) return;
}
// Narrow path (k150 == 1 on entry): same scheme as above, but with three data
// vectors per step (192 bytes per step) and therefore 4 x 3 = 12 accumulators,
// stored as 768 bytes per l63.
ptrdiff_t l63 = 2*w67;
ptrdiff_t ll10 = l63+1;
for (; l63 != 64; ++l63) {
__m512 sum469;
__m512 sum472;
__m512 sum475;
__m512 sum478;
// Same initialisation rule as the wide path: lane 9 only, and only when j46 == 0.
if (__builtin_expect(!j46, 0)) {
sum469 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr13+0+1024*i54+16*l63)));
sum472 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr13+4+1024*i54+16*l63)));
sum475 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr13+8+1024*i54+16*l63)));
sum478 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr13+12+1024*i54+16*l63)));
} else {
sum469 = _mm512_setzero_ps();
sum472 = _mm512_setzero_ps();
sum475 = _mm512_setzero_ps();
sum478 = _mm512_setzero_ps();
}
__m512 sum470 = sum469;
__m512 sum471 = sum469;
__m512 sum473 = sum472;
__m512 sum474 = sum472;
__m512 sum476 = sum475;
__m512 sum477 = sum475;
__m512 sum479 = sum478;
__m512 sum480 = sum478;
// 256 accumulation steps: four widened weight vectors times three data vectors.
ptrdiff_t b65 = 0;
for (; b65 != 256; ++b65) {
__m512i wfs35 = _mm512_maskz_loadu_epi32(65535, wfPtr13+0+8388608*i54+2097152*j46+32768*l63+128*b65);
__m512 wf149 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs35));
__m512 df681 = _mm512_loadu_ps(dfPtr13+0+589824*i54+147456*j46+98304*k150+192*b65);
sum469 = _mm512_fmadd_ps(wf149, df681, sum469);
__m512 df682 = _mm512_loadu_ps(dfPtr13+64+589824*i54+147456*j46+98304*k150+192*b65);
sum470 = _mm512_fmadd_ps(wf149, df682, sum470);
__m512 df683 = _mm512_loadu_ps(dfPtr13+128+589824*i54+147456*j46+98304*k150+192*b65);
sum471 = _mm512_fmadd_ps(wf149, df683, sum471);
__m512 wf150 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs35, 1));
sum472 = _mm512_fmadd_ps(wf150, df681, sum472);
sum473 = _mm512_fmadd_ps(wf150, df682, sum473);
sum474 = _mm512_fmadd_ps(wf150, df683, sum474);
__m512i wfs36 = _mm512_maskz_loadu_epi32(65535, wfPtr13+64+8388608*i54+2097152*j46+32768*l63+128*b65);
__m512 wf151 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs36));
sum475 = _mm512_fmadd_ps(wf151, df681, sum475);
sum476 = _mm512_fmadd_ps(wf151, df682, sum476);
sum477 = _mm512_fmadd_ps(wf151, df683, sum477);
__m512 wf152 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs36, 1));
sum478 = _mm512_fmadd_ps(wf152, df681, sum478);
sum479 = _mm512_fmadd_ps(wf152, df682, sum479);
sum480 = _mm512_fmadd_ps(wf152, df683, sum480);
}
// Write the 12 accumulators back to back (12*64 = 768 bytes per l63).
_mm512_storeu_ps(sfPtr12+0+589824*i54+147456*j46+98304*k150+768*l63, sum469);
_mm512_storeu_ps(sfPtr12+64+589824*i54+147456*j46+98304*k150+768*l63, sum470);
_mm512_storeu_ps(sfPtr12+128+589824*i54+147456*j46+98304*k150+768*l63, sum471);
_mm512_storeu_ps(sfPtr12+192+589824*i54+147456*j46+98304*k150+768*l63, sum472);
_mm512_storeu_ps(sfPtr12+256+589824*i54+147456*j46+98304*k150+768*l63, sum473);
_mm512_storeu_ps(sfPtr12+320+589824*i54+147456*j46+98304*k150+768*l63, sum474);
_mm512_storeu_ps(sfPtr12+384+589824*i54+147456*j46+98304*k150+768*l63, sum475);
_mm512_storeu_ps(sfPtr12+448+589824*i54+147456*j46+98304*k150+768*l63, sum476);
_mm512_storeu_ps(sfPtr12+512+589824*i54+147456*j46+98304*k150+768*l63, sum477);
_mm512_storeu_ps(sfPtr12+576+589824*i54+147456*j46+98304*k150+768*l63, sum478);
_mm512_storeu_ps(sfPtr12+640+589824*i54+147456*j46+98304*k150+768*l63, sum479);
_mm512_storeu_ps(sfPtr12+704+589824*i54+147456*j46+98304*k150+768*l63, sum480);
// Done once the second l63 value of this task has been stored.
if (l63 >= ll10) return;
}
}

// Builds a task descriptor for ResNet50ThreeProduceSums5Callee1 and hands it
// to ResNet50ThreaderDo1 together with the given team.
//
// team58:    forwarded unchanged to ResNet50ThreaderDo1.
// tensors89: placed in slot 0 of a two-element pointer array (slot 1 is null)
//            whose address becomes the descriptor's any1 field.
//
// The descriptor is set up with nd1 = 4 and hull1 = {32, 2, 4, 1}.
// NOTE(review): the pointer array lives on this function's stack, so this
// relies on ResNet50ThreaderDo1 not using it after returning — presumably the
// call blocks until all tasks finish; confirm against the threader.
static void ResNet50ThreeProduceSums5(ResNet50ThreaderTeam1* team58, char** tensors89) {
	static const int64_t extents[4] = {32, 2, 4, 1};
	void* args[2];
	args[0] = tensors89;
	args[1] = 0;
	ResNet50ThreaderTask1 job;
	job.nd1 = 4;
	for (ptrdiff_t axis = 0; axis != 4; ++axis) {
		job.hull1[axis] = extents[axis];
	}
	job.any1 = args;
	job.callee1 = ResNet50ThreeProduceSums5Callee1;
	ResNet50ThreaderDo1(team58, &job);
}

static void ResNet50ThreeConsumeSums5Callee1(ResNet50ThreaderTask1* task94, int64_t* pt52) {
char** tensors92 = task94->any1;
ptrdiff_t w68 = pt52[0];
ptrdiff_t d19 = pt52[1];
ptrdiff_t g32 = 0;
char*restrict sfPtr13 = tensors92[0];
char*restrict datPtr28 = tensors92[1];
ptrdiff_t i55 = 1*g32;
ptrdiff_t j47 = 1*d19;
ptrdiff_t last12 = j47+0;
ptrdiff_t rel24 = j47-0;
ptrdiff_t base24 = 0;
if (rel24 < 1) {
ptrdiff_t toH48 = base24+0;
ptrdiff_t toW48 = 0;
ptrdiff_t k151 = 32*w68;
ptrdiff_t kk49 = k151+31;
for (; k151 != 64; ++k151) {
ptrdiff_t l64 = 0;
for (; l64 != 2; ++l64) {
__m512 sf1217 = _mm512_loadu_ps(sfPtr13+0+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1218 = _mm512_loadu_ps(sfPtr13+128+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2378 = _mm512_shuffle_f32x4(sf1217, sf1218, 68);
__m512 in2379 = _mm512_shuffle_f32x4(sf1217, sf1218, 238);
__m512 sf1219 = _mm512_loadu_ps(sfPtr13+64+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1220 = _mm512_loadu_ps(sfPtr13+192+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2386 = _mm512_shuffle_f32x4(sf1219, sf1220, 68);
__m512 in2387 = _mm512_shuffle_f32x4(sf1219, sf1220, 238);
__m512 sf1221 = _mm512_loadu_ps(sfPtr13+147456+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1222 = _mm512_loadu_ps(sfPtr13+147584+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2380 = _mm512_shuffle_f32x4(sf1221, sf1222, 68);
__m512 in2381 = _mm512_shuffle_f32x4(sf1221, sf1222, 238);
__m512 sf1223 = _mm512_loadu_ps(sfPtr13+147520+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1224 = _mm512_loadu_ps(sfPtr13+147648+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2388 = _mm512_shuffle_f32x4(sf1223, sf1224, 68);
__m512 in2389 = _mm512_shuffle_f32x4(sf1223, sf1224, 238);
__m512 sf1225 = _mm512_loadu_ps(sfPtr13+294912+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1226 = _mm512_loadu_ps(sfPtr13+295040+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2382 = _mm512_shuffle_f32x4(sf1225, sf1226, 68);
__m512 in2383 = _mm512_shuffle_f32x4(sf1225, sf1226, 238);
__m512 sf1227 = _mm512_loadu_ps(sfPtr13+294976+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1228 = _mm512_loadu_ps(sfPtr13+295104+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2390 = _mm512_shuffle_f32x4(sf1227, sf1228, 68);
__m512 in2391 = _mm512_shuffle_f32x4(sf1227, sf1228, 238);
__m512 sf1229 = _mm512_loadu_ps(sfPtr13+442368+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1230 = _mm512_loadu_ps(sfPtr13+442496+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2384 = _mm512_shuffle_f32x4(sf1229, sf1230, 68);
__m512 in2385 = _mm512_shuffle_f32x4(sf1229, sf1230, 238);
__m512 sf1231 = _mm512_loadu_ps(sfPtr13+442432+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1232 = _mm512_loadu_ps(sfPtr13+442560+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2392 = _mm512_shuffle_f32x4(sf1231, sf1232, 68);
__m512 in2393 = _mm512_shuffle_f32x4(sf1231, sf1232, 238);
__m512 tmp17027 = _mm512_add_ps(in2379, in2380);
__m512 tmp17047 = _mm512_add_ps(in2387, in2388);
__m512 tmp17026 = _mm512_add_ps(in2381, in2382);
__m512 tmp17046 = _mm512_add_ps(in2389, in2390);
__m512 tmp17032 = _mm512_sub_ps(in2381, in2382);
__m512 tmp17052 = _mm512_sub_ps(in2389, in2390);
__m512 tmp17031 = _mm512_sub_ps(in2379, in2380);
__m512 tmp17051 = _mm512_sub_ps(in2387, in2388);
__m512 tmp17028 = _mm512_add_ps(in2383, in2384);
__m512 tmp17048 = _mm512_add_ps(in2391, in2392);
__m512 tmp17033 = _mm512_sub_ps(in2383, in2384);
__m512 tmp17053 = _mm512_sub_ps(in2391, in2392);
__m512 tmp17030 = _mm512_fmadd_ps(tmp17032, _mm512_set1_ps(2e+00f), tmp17031);
__m512 tmp17050 = _mm512_fmadd_ps(tmp17052, _mm512_set1_ps(2e+00f), tmp17051);
__m512 tmp17037 = _mm512_fmadd_ps(tmp17032, _mm512_set1_ps(8e+00f), tmp17031);
__m512 tmp17057 = _mm512_fmadd_ps(tmp17052, _mm512_set1_ps(8e+00f), tmp17051);
__m512 tmp17025 = _mm512_add_ps(tmp17026, tmp17027);
__m512 tmp17045 = _mm512_add_ps(tmp17046, tmp17047);
__m512 tmp17029 = _mm512_fmadd_ps(tmp17033, _mm512_set1_ps(1.6e+01f), tmp17030);
__m512 tmp17049 = _mm512_fmadd_ps(tmp17053, _mm512_set1_ps(1.6e+01f), tmp17050);
__m512 tmp17036 = _mm512_fmadd_ps(tmp17033, _mm512_set1_ps(4e+00f), tmp17037);
__m512 tmp17056 = _mm512_fmadd_ps(tmp17053, _mm512_set1_ps(4e+00f), tmp17057);
__m512 tmp17042 = _mm512_add_ps(tmp17033, tmp17031);
__m512 tmp17062 = _mm512_add_ps(tmp17053, tmp17051);
__m512 tmp17035 = _mm512_fmadd_ps(tmp17026, _mm512_set1_ps(4e+00f), tmp17027);
__m512 tmp17055 = _mm512_fmadd_ps(tmp17046, _mm512_set1_ps(4e+00f), tmp17047);
__m512 tmp17039 = _mm512_fmadd_ps(tmp17026, _mm512_set1_ps(1.6e+01f), tmp17027);
__m512 tmp17059 = _mm512_fmadd_ps(tmp17046, _mm512_set1_ps(1.6e+01f), tmp17047);
__m512 tmp17024 = _mm512_add_ps(tmp17025, in2378);
__m512 tmp17044 = _mm512_add_ps(tmp17045, in2386);
__m512 tmp17041 = _mm512_add_ps(tmp17042, in2385);
__m512 tmp17061 = _mm512_add_ps(tmp17062, in2393);
__m512 tmp17023 = _mm512_fmadd_ps(tmp17028, _mm512_set1_ps(3.2e+01f), tmp17024);
__m512 tmp17043 = _mm512_fmadd_ps(tmp17048, _mm512_set1_ps(3.2e+01f), tmp17044);
__m512 tmp17034 = _mm512_fmadd_ps(tmp17028, _mm512_set1_ps(8e+00f), tmp17035);
__m512 tmp17054 = _mm512_fmadd_ps(tmp17048, _mm512_set1_ps(8e+00f), tmp17055);
__m512 tmp17040 = _mm512_fmadd_ps(tmp17032, _mm512_set1_ps(3.2e+01f), tmp17041);
__m512 tmp17060 = _mm512_fmadd_ps(tmp17052, _mm512_set1_ps(3.2e+01f), tmp17061);
__m512 tmp17038 = _mm512_fmadd_ps(tmp17028, _mm512_set1_ps(2e+00f), tmp17039);
__m512 tmp17058 = _mm512_fmadd_ps(tmp17048, _mm512_set1_ps(2e+00f), tmp17059);
__m512 tmp17011 = tmp17023;
__m512 tmp17017 = tmp17043;
__m512 tmp17012 = tmp17029;
__m512 tmp17018 = tmp17049;
__m512 tmp17013 = tmp17034;
__m512 tmp17019 = tmp17054;
__m512 tmp17014 = tmp17036;
__m512 tmp17020 = tmp17056;
__m512 tmp17015 = tmp17038;
__m512 tmp17021 = tmp17058;
__m512 tmp17016 = tmp17040;
__m512 tmp17022 = tmp17060;
__m512 tmp17107 = _mm512_unpacklo_ps(tmp17011, tmp17012);
__m512 tmp17108 = _mm512_unpackhi_ps(tmp17011, tmp17012);
__m512 tmp17109 = _mm512_unpacklo_ps(tmp17013, tmp17014);
__m512 tmp17110 = _mm512_unpackhi_ps(tmp17013, tmp17014);
__m512 tmp17111 = _mm512_unpacklo_ps(tmp17015, tmp17016);
__m512 tmp17112 = _mm512_unpackhi_ps(tmp17015, tmp17016);
__m512 tmp17113 = _mm512_unpacklo_ps(tmp17017, tmp17018);
__m512 tmp17114 = _mm512_unpackhi_ps(tmp17017, tmp17018);
__m512 tmp17115 = _mm512_unpacklo_ps(tmp17019, tmp17020);
__m512 tmp17116 = _mm512_unpackhi_ps(tmp17019, tmp17020);
__m512 tmp17117 = _mm512_unpacklo_ps(tmp17021, tmp17022);
__m512 tmp17118 = _mm512_unpackhi_ps(tmp17021, tmp17022);
__m512 tmp17119 = _mm512_shuffle_ps(tmp17107, tmp17109, 68);
__m512 tmp17120 = _mm512_shuffle_ps(tmp17107, tmp17109, 238);
__m512 tmp17121 = _mm512_shuffle_ps(tmp17108, tmp17110, 68);
__m512 tmp17122 = _mm512_shuffle_ps(tmp17108, tmp17110, 238);
__m512 tmp17123 = _mm512_shuffle_ps(tmp17111, tmp17113, 68);
__m512 tmp17124 = _mm512_shuffle_ps(tmp17111, tmp17113, 238);
__m512 tmp17125 = _mm512_shuffle_ps(tmp17112, tmp17114, 68);
__m512 tmp17126 = _mm512_shuffle_ps(tmp17112, tmp17114, 238);
__m512 tmp17127 = _mm512_shuffle_ps(tmp17115, tmp17117, 68);
__m512 tmp17128 = _mm512_shuffle_ps(tmp17115, tmp17117, 238);
__m512 tmp17129 = _mm512_shuffle_ps(tmp17116, tmp17118, 68);
__m512 tmp17130 = _mm512_shuffle_ps(tmp17116, tmp17118, 238);
__m512 tmp17131 = _mm512_shuffle_f32x4(tmp17119, tmp17123, 136);
__m512 tmp17132 = _mm512_shuffle_f32x4(tmp17119, tmp17123, 221);
__m512 tmp17133 = _mm512_shuffle_f32x4(tmp17120, tmp17124, 136);
__m512 tmp17134 = _mm512_shuffle_f32x4(tmp17120, tmp17124, 221);
__m512 tmp17135 = _mm512_shuffle_f32x4(tmp17121, tmp17125, 136);
__m512 tmp17136 = _mm512_shuffle_f32x4(tmp17121, tmp17125, 221);
__m512 tmp17137 = _mm512_shuffle_f32x4(tmp17122, tmp17126, 136);
__m512 tmp17138 = _mm512_shuffle_f32x4(tmp17122, tmp17126, 221);
__m512 tmp17139 = _mm512_shuffle_f32x4(tmp17127, tmp17127, 136);
__m512 tmp17140 = _mm512_shuffle_f32x4(tmp17127, tmp17127, 221);
__m512 tmp17141 = _mm512_shuffle_f32x4(tmp17128, tmp17128, 136);
__m512 tmp17142 = _mm512_shuffle_f32x4(tmp17128, tmp17128, 221);
__m512 tmp17143 = _mm512_shuffle_f32x4(tmp17129, tmp17129, 136);
__m512 tmp17144 = _mm512_shuffle_f32x4(tmp17129, tmp17129, 221);
__m512 tmp17145 = _mm512_shuffle_f32x4(tmp17130, tmp17130, 136);
__m512 tmp17146 = _mm512_shuffle_f32x4(tmp17130, tmp17130, 221);
tmp17011 = _mm512_shuffle_f32x4(tmp17131, tmp17139, 136);
tmp17019 = _mm512_shuffle_f32x4(tmp17131, tmp17139, 221);
tmp17012 = _mm512_shuffle_f32x4(tmp17133, tmp17141, 136);
tmp17020 = _mm512_shuffle_f32x4(tmp17133, tmp17141, 221);
tmp17013 = _mm512_shuffle_f32x4(tmp17135, tmp17143, 136);
tmp17021 = _mm512_shuffle_f32x4(tmp17135, tmp17143, 221);
tmp17014 = _mm512_shuffle_f32x4(tmp17137, tmp17145, 136);
tmp17022 = _mm512_shuffle_f32x4(tmp17137, tmp17145, 221);
tmp17015 = _mm512_shuffle_f32x4(tmp17132, tmp17140, 136);
__m512 tmp17063 = _mm512_shuffle_f32x4(tmp17132, tmp17140, 221);
tmp17016 = _mm512_shuffle_f32x4(tmp17134, tmp17142, 136);
__m512 tmp17064 = _mm512_shuffle_f32x4(tmp17134, tmp17142, 221);
tmp17017 = _mm512_shuffle_f32x4(tmp17136, tmp17144, 136);
__m512 tmp17065 = _mm512_shuffle_f32x4(tmp17136, tmp17144, 221);
tmp17018 = _mm512_shuffle_f32x4(tmp17138, tmp17146, 136);
__m512 tmp17066 = _mm512_shuffle_f32x4(tmp17138, tmp17146, 221);
__m512 tmp17071 = _mm512_add_ps(tmp17012, tmp17013);
__m512 tmp17091 = _mm512_add_ps(tmp17020, tmp17021);
__m512 tmp17070 = _mm512_add_ps(tmp17014, tmp17015);
__m512 tmp17090 = _mm512_add_ps(tmp17022, tmp17063);
__m512 tmp17076 = _mm512_sub_ps(tmp17014, tmp17015);
__m512 tmp17096 = _mm512_sub_ps(tmp17022, tmp17063);
__m512 tmp17075 = _mm512_sub_ps(tmp17012, tmp17013);
__m512 tmp17095 = _mm512_sub_ps(tmp17020, tmp17021);
__m512 tmp17072 = _mm512_add_ps(tmp17016, tmp17017);
__m512 tmp17092 = _mm512_add_ps(tmp17064, tmp17065);
__m512 tmp17077 = _mm512_sub_ps(tmp17016, tmp17017);
__m512 tmp17097 = _mm512_sub_ps(tmp17064, tmp17065);
__m512 tmp17074 = _mm512_fmadd_ps(tmp17076, _mm512_set1_ps(2e+00f), tmp17075);
__m512 tmp17094 = _mm512_fmadd_ps(tmp17096, _mm512_set1_ps(2e+00f), tmp17095);
__m512 tmp17081 = _mm512_fmadd_ps(tmp17076, _mm512_set1_ps(8e+00f), tmp17075);
__m512 tmp17101 = _mm512_fmadd_ps(tmp17096, _mm512_set1_ps(8e+00f), tmp17095);
__m512 tmp17069 = _mm512_add_ps(tmp17070, tmp17071);
__m512 tmp17089 = _mm512_add_ps(tmp17090, tmp17091);
__m512 tmp17073 = _mm512_fmadd_ps(tmp17077, _mm512_set1_ps(1.6e+01f), tmp17074);
__m512 tmp17093 = _mm512_fmadd_ps(tmp17097, _mm512_set1_ps(1.6e+01f), tmp17094);
__m512 tmp17080 = _mm512_fmadd_ps(tmp17077, _mm512_set1_ps(4e+00f), tmp17081);
__m512 tmp17100 = _mm512_fmadd_ps(tmp17097, _mm512_set1_ps(4e+00f), tmp17101);
__m512 tmp17086 = _mm512_add_ps(tmp17077, tmp17075);
__m512 tmp17106 = _mm512_add_ps(tmp17097, tmp17095);
__m512 tmp17079 = _mm512_fmadd_ps(tmp17070, _mm512_set1_ps(4e+00f), tmp17071);
__m512 tmp17099 = _mm512_fmadd_ps(tmp17090, _mm512_set1_ps(4e+00f), tmp17091);
__m512 tmp17083 = _mm512_fmadd_ps(tmp17070, _mm512_set1_ps(1.6e+01f), tmp17071);
__m512 tmp17103 = _mm512_fmadd_ps(tmp17090, _mm512_set1_ps(1.6e+01f), tmp17091);
__m512 tmp17068 = _mm512_add_ps(tmp17069, tmp17011);
__m512 tmp17088 = _mm512_add_ps(tmp17089, tmp17019);
__m512 tmp17085 = _mm512_add_ps(tmp17086, tmp17018);
__m512 tmp17105 = _mm512_add_ps(tmp17106, tmp17066);
__m512 tmp17067 = _mm512_fmadd_ps(tmp17072, _mm512_set1_ps(3.2e+01f), tmp17068);
__m512 tmp17087 = _mm512_fmadd_ps(tmp17092, _mm512_set1_ps(3.2e+01f), tmp17088);
__m512 tmp17078 = _mm512_fmadd_ps(tmp17072, _mm512_set1_ps(8e+00f), tmp17079);
__m512 tmp17098 = _mm512_fmadd_ps(tmp17092, _mm512_set1_ps(8e+00f), tmp17099);
__m512 tmp17084 = _mm512_fmadd_ps(tmp17076, _mm512_set1_ps(3.2e+01f), tmp17085);
__m512 tmp17104 = _mm512_fmadd_ps(tmp17096, _mm512_set1_ps(3.2e+01f), tmp17105);
__m512 tmp17082 = _mm512_fmadd_ps(tmp17072, _mm512_set1_ps(2e+00f), tmp17083);
__m512 tmp17102 = _mm512_fmadd_ps(tmp17092, _mm512_set1_ps(2e+00f), tmp17103);
__m512 out2223 = tmp17067;
__m512 out2229 = tmp17087;
__m512 out2224 = tmp17073;
__m512 out2230 = tmp17093;
__m512 out2225 = tmp17078;
__m512 out2231 = tmp17098;
__m512 out2226 = tmp17080;
__m512 out2232 = tmp17100;
__m512 out2227 = tmp17082;
__m512 out2233 = tmp17102;
__m512 out2228 = tmp17084;
__m512 out2234 = tmp17104;
out2223 = _mm512_max_ps(_mm512_setzero_ps(), out2223);
out2229 = _mm512_max_ps(_mm512_setzero_ps(), out2229);
out2224 = _mm512_max_ps(_mm512_setzero_ps(), out2224);
out2230 = _mm512_max_ps(_mm512_setzero_ps(), out2230);
out2225 = _mm512_max_ps(_mm512_setzero_ps(), out2225);
out2231 = _mm512_max_ps(_mm512_setzero_ps(), out2231);
out2226 = _mm512_max_ps(_mm512_setzero_ps(), out2226);
out2232 = _mm512_max_ps(_mm512_setzero_ps(), out2232);
out2227 = _mm512_max_ps(_mm512_setzero_ps(), out2227);
out2233 = _mm512_max_ps(_mm512_setzero_ps(), out2233);
out2228 = _mm512_max_ps(_mm512_setzero_ps(), out2228);
out2234 = _mm512_max_ps(_mm512_setzero_ps(), out2234);
_mm512_mask_storeu_ps(datPtr28+0+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2223);
_mm512_mask_storeu_ps(datPtr28+48+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 3, out2229);
_mm512_mask_storeu_ps(datPtr28+312+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4032, out2229);
_mm512_mask_storeu_ps(datPtr28+56+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2224);
_mm512_mask_storeu_ps(datPtr28+104+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 3, out2230);
_mm512_mask_storeu_ps(datPtr28+368+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4032, out2230);
_mm512_mask_storeu_ps(datPtr28+112+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2225);
_mm512_mask_storeu_ps(datPtr28+160+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 3, out2231);
_mm512_mask_storeu_ps(datPtr28+424+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4032, out2231);
_mm512_mask_storeu_ps(datPtr28+168+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2226);
_mm512_mask_storeu_ps(datPtr28+216+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 3, out2232);
_mm512_mask_storeu_ps(datPtr28+480+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4032, out2232);
_mm512_mask_storeu_ps(datPtr28+224+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2227);
_mm512_mask_storeu_ps(datPtr28+272+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 3, out2233);
_mm512_mask_storeu_ps(datPtr28+536+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4032, out2233);
_mm512_mask_storeu_ps(datPtr28+280+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2228);
_mm512_mask_storeu_ps(datPtr28+328+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 3, out2234);
_mm512_mask_storeu_ps(datPtr28+592+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4032, out2234);
__m512 sf1233 = _mm512_loadu_ps(sfPtr13+256+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1234 = _mm512_loadu_ps(sfPtr13+384+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2394 = _mm512_shuffle_f32x4(sf1233, sf1234, 68);
__m512 in2395 = _mm512_shuffle_f32x4(sf1233, sf1234, 238);
__m512 sf1235 = _mm512_loadu_ps(sfPtr13+320+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1236 = _mm512_loadu_ps(sfPtr13+448+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2402 = _mm512_shuffle_f32x4(sf1235, sf1236, 68);
__m512 in2403 = _mm512_shuffle_f32x4(sf1235, sf1236, 238);
__m512 sf1237 = _mm512_loadu_ps(sfPtr13+147712+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1238 = _mm512_loadu_ps(sfPtr13+147840+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2396 = _mm512_shuffle_f32x4(sf1237, sf1238, 68);
__m512 in2397 = _mm512_shuffle_f32x4(sf1237, sf1238, 238);
__m512 sf1239 = _mm512_loadu_ps(sfPtr13+147776+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1240 = _mm512_loadu_ps(sfPtr13+147904+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2404 = _mm512_shuffle_f32x4(sf1239, sf1240, 68);
__m512 in2405 = _mm512_shuffle_f32x4(sf1239, sf1240, 238);
__m512 sf1241 = _mm512_loadu_ps(sfPtr13+295168+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1242 = _mm512_loadu_ps(sfPtr13+295296+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2398 = _mm512_shuffle_f32x4(sf1241, sf1242, 68);
__m512 in2399 = _mm512_shuffle_f32x4(sf1241, sf1242, 238);
__m512 sf1243 = _mm512_loadu_ps(sfPtr13+295232+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1244 = _mm512_loadu_ps(sfPtr13+295360+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2406 = _mm512_shuffle_f32x4(sf1243, sf1244, 68);
__m512 in2407 = _mm512_shuffle_f32x4(sf1243, sf1244, 238);
__m512 sf1245 = _mm512_loadu_ps(sfPtr13+442624+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1246 = _mm512_loadu_ps(sfPtr13+442752+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2400 = _mm512_shuffle_f32x4(sf1245, sf1246, 68);
__m512 in2401 = _mm512_shuffle_f32x4(sf1245, sf1246, 238);
__m512 sf1247 = _mm512_loadu_ps(sfPtr13+442688+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1248 = _mm512_loadu_ps(sfPtr13+442816+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2408 = _mm512_shuffle_f32x4(sf1247, sf1248, 68);
__m512 in2409 = _mm512_shuffle_f32x4(sf1247, sf1248, 238);
__m512 tmp17163 = _mm512_add_ps(in2395, in2396);
__m512 tmp17183 = _mm512_add_ps(in2403, in2404);
__m512 tmp17162 = _mm512_add_ps(in2397, in2398);
__m512 tmp17182 = _mm512_add_ps(in2405, in2406);
__m512 tmp17168 = _mm512_sub_ps(in2397, in2398);
__m512 tmp17188 = _mm512_sub_ps(in2405, in2406);
__m512 tmp17167 = _mm512_sub_ps(in2395, in2396);
__m512 tmp17187 = _mm512_sub_ps(in2403, in2404);
__m512 tmp17164 = _mm512_add_ps(in2399, in2400);
__m512 tmp17184 = _mm512_add_ps(in2407, in2408);
__m512 tmp17169 = _mm512_sub_ps(in2399, in2400);
__m512 tmp17189 = _mm512_sub_ps(in2407, in2408);
__m512 tmp17166 = _mm512_fmadd_ps(tmp17168, _mm512_set1_ps(2e+00f), tmp17167);
__m512 tmp17186 = _mm512_fmadd_ps(tmp17188, _mm512_set1_ps(2e+00f), tmp17187);
__m512 tmp17173 = _mm512_fmadd_ps(tmp17168, _mm512_set1_ps(8e+00f), tmp17167);
__m512 tmp17193 = _mm512_fmadd_ps(tmp17188, _mm512_set1_ps(8e+00f), tmp17187);
__m512 tmp17161 = _mm512_add_ps(tmp17162, tmp17163);
__m512 tmp17181 = _mm512_add_ps(tmp17182, tmp17183);
__m512 tmp17165 = _mm512_fmadd_ps(tmp17169, _mm512_set1_ps(1.6e+01f), tmp17166);
__m512 tmp17185 = _mm512_fmadd_ps(tmp17189, _mm512_set1_ps(1.6e+01f), tmp17186);
__m512 tmp17172 = _mm512_fmadd_ps(tmp17169, _mm512_set1_ps(4e+00f), tmp17173);
__m512 tmp17192 = _mm512_fmadd_ps(tmp17189, _mm512_set1_ps(4e+00f), tmp17193);
__m512 tmp17178 = _mm512_add_ps(tmp17169, tmp17167);
__m512 tmp17198 = _mm512_add_ps(tmp17189, tmp17187);
__m512 tmp17171 = _mm512_fmadd_ps(tmp17162, _mm512_set1_ps(4e+00f), tmp17163);
__m512 tmp17191 = _mm512_fmadd_ps(tmp17182, _mm512_set1_ps(4e+00f), tmp17183);
__m512 tmp17175 = _mm512_fmadd_ps(tmp17162, _mm512_set1_ps(1.6e+01f), tmp17163);
__m512 tmp17195 = _mm512_fmadd_ps(tmp17182, _mm512_set1_ps(1.6e+01f), tmp17183);
__m512 tmp17160 = _mm512_add_ps(tmp17161, in2394);
__m512 tmp17180 = _mm512_add_ps(tmp17181, in2402);
__m512 tmp17177 = _mm512_add_ps(tmp17178, in2401);
__m512 tmp17197 = _mm512_add_ps(tmp17198, in2409);
__m512 tmp17159 = _mm512_fmadd_ps(tmp17164, _mm512_set1_ps(3.2e+01f), tmp17160);
__m512 tmp17179 = _mm512_fmadd_ps(tmp17184, _mm512_set1_ps(3.2e+01f), tmp17180);
__m512 tmp17170 = _mm512_fmadd_ps(tmp17164, _mm512_set1_ps(8e+00f), tmp17171);
__m512 tmp17190 = _mm512_fmadd_ps(tmp17184, _mm512_set1_ps(8e+00f), tmp17191);
__m512 tmp17176 = _mm512_fmadd_ps(tmp17168, _mm512_set1_ps(3.2e+01f), tmp17177);
__m512 tmp17196 = _mm512_fmadd_ps(tmp17188, _mm512_set1_ps(3.2e+01f), tmp17197);
__m512 tmp17174 = _mm512_fmadd_ps(tmp17164, _mm512_set1_ps(2e+00f), tmp17175);
__m512 tmp17194 = _mm512_fmadd_ps(tmp17184, _mm512_set1_ps(2e+00f), tmp17195);
__m512 tmp17147 = tmp17159;
__m512 tmp17153 = tmp17179;
__m512 tmp17148 = tmp17165;
__m512 tmp17154 = tmp17185;
__m512 tmp17149 = tmp17170;
__m512 tmp17155 = tmp17190;
__m512 tmp17150 = tmp17172;
__m512 tmp17156 = tmp17192;
__m512 tmp17151 = tmp17174;
__m512 tmp17157 = tmp17194;
__m512 tmp17152 = tmp17176;
__m512 tmp17158 = tmp17196;
__m512 tmp17243 = _mm512_unpacklo_ps(tmp17147, tmp17148);
__m512 tmp17244 = _mm512_unpackhi_ps(tmp17147, tmp17148);
__m512 tmp17245 = _mm512_unpacklo_ps(tmp17149, tmp17150);
__m512 tmp17246 = _mm512_unpackhi_ps(tmp17149, tmp17150);
__m512 tmp17247 = _mm512_unpacklo_ps(tmp17151, tmp17152);
__m512 tmp17248 = _mm512_unpackhi_ps(tmp17151, tmp17152);
__m512 tmp17249 = _mm512_unpacklo_ps(tmp17153, tmp17154);
__m512 tmp17250 = _mm512_unpackhi_ps(tmp17153, tmp17154);
__m512 tmp17251 = _mm512_unpacklo_ps(tmp17155, tmp17156);
__m512 tmp17252 = _mm512_unpackhi_ps(tmp17155, tmp17156);
__m512 tmp17253 = _mm512_unpacklo_ps(tmp17157, tmp17158);
__m512 tmp17254 = _mm512_unpackhi_ps(tmp17157, tmp17158);
__m512 tmp17255 = _mm512_shuffle_ps(tmp17243, tmp17245, 68);
__m512 tmp17256 = _mm512_shuffle_ps(tmp17243, tmp17245, 238);
__m512 tmp17257 = _mm512_shuffle_ps(tmp17244, tmp17246, 68);
__m512 tmp17258 = _mm512_shuffle_ps(tmp17244, tmp17246, 238);
__m512 tmp17259 = _mm512_shuffle_ps(tmp17247, tmp17249, 68);
__m512 tmp17260 = _mm512_shuffle_ps(tmp17247, tmp17249, 238);
__m512 tmp17261 = _mm512_shuffle_ps(tmp17248, tmp17250, 68);
__m512 tmp17262 = _mm512_shuffle_ps(tmp17248, tmp17250, 238);
__m512 tmp17263 = _mm512_shuffle_ps(tmp17251, tmp17253, 68);
__m512 tmp17264 = _mm512_shuffle_ps(tmp17251, tmp17253, 238);
__m512 tmp17265 = _mm512_shuffle_ps(tmp17252, tmp17254, 68);
__m512 tmp17266 = _mm512_shuffle_ps(tmp17252, tmp17254, 238);
__m512 tmp17267 = _mm512_shuffle_f32x4(tmp17255, tmp17259, 136);
__m512 tmp17268 = _mm512_shuffle_f32x4(tmp17255, tmp17259, 221);
__m512 tmp17269 = _mm512_shuffle_f32x4(tmp17256, tmp17260, 136);
__m512 tmp17270 = _mm512_shuffle_f32x4(tmp17256, tmp17260, 221);
__m512 tmp17271 = _mm512_shuffle_f32x4(tmp17257, tmp17261, 136);
__m512 tmp17272 = _mm512_shuffle_f32x4(tmp17257, tmp17261, 221);
__m512 tmp17273 = _mm512_shuffle_f32x4(tmp17258, tmp17262, 136);
__m512 tmp17274 = _mm512_shuffle_f32x4(tmp17258, tmp17262, 221);
__m512 tmp17275 = _mm512_shuffle_f32x4(tmp17263, tmp17263, 136);
__m512 tmp17276 = _mm512_shuffle_f32x4(tmp17263, tmp17263, 221);
__m512 tmp17277 = _mm512_shuffle_f32x4(tmp17264, tmp17264, 136);
__m512 tmp17278 = _mm512_shuffle_f32x4(tmp17264, tmp17264, 221);
__m512 tmp17279 = _mm512_shuffle_f32x4(tmp17265, tmp17265, 136);
__m512 tmp17280 = _mm512_shuffle_f32x4(tmp17265, tmp17265, 221);
__m512 tmp17281 = _mm512_shuffle_f32x4(tmp17266, tmp17266, 136);
__m512 tmp17282 = _mm512_shuffle_f32x4(tmp17266, tmp17266, 221);
tmp17147 = _mm512_shuffle_f32x4(tmp17267, tmp17275, 136);
tmp17155 = _mm512_shuffle_f32x4(tmp17267, tmp17275, 221);
tmp17148 = _mm512_shuffle_f32x4(tmp17269, tmp17277, 136);
tmp17156 = _mm512_shuffle_f32x4(tmp17269, tmp17277, 221);
tmp17149 = _mm512_shuffle_f32x4(tmp17271, tmp17279, 136);
tmp17157 = _mm512_shuffle_f32x4(tmp17271, tmp17279, 221);
tmp17150 = _mm512_shuffle_f32x4(tmp17273, tmp17281, 136);
tmp17158 = _mm512_shuffle_f32x4(tmp17273, tmp17281, 221);
tmp17151 = _mm512_shuffle_f32x4(tmp17268, tmp17276, 136);
__m512 tmp17199 = _mm512_shuffle_f32x4(tmp17268, tmp17276, 221);
tmp17152 = _mm512_shuffle_f32x4(tmp17270, tmp17278, 136);
__m512 tmp17200 = _mm512_shuffle_f32x4(tmp17270, tmp17278, 221);
tmp17153 = _mm512_shuffle_f32x4(tmp17272, tmp17280, 136);
__m512 tmp17201 = _mm512_shuffle_f32x4(tmp17272, tmp17280, 221);
tmp17154 = _mm512_shuffle_f32x4(tmp17274, tmp17282, 136);
__m512 tmp17202 = _mm512_shuffle_f32x4(tmp17274, tmp17282, 221);
__m512 tmp17207 = _mm512_add_ps(tmp17148, tmp17149);
__m512 tmp17227 = _mm512_add_ps(tmp17156, tmp17157);
__m512 tmp17206 = _mm512_add_ps(tmp17150, tmp17151);
__m512 tmp17226 = _mm512_add_ps(tmp17158, tmp17199);
__m512 tmp17212 = _mm512_sub_ps(tmp17150, tmp17151);
__m512 tmp17232 = _mm512_sub_ps(tmp17158, tmp17199);
__m512 tmp17211 = _mm512_sub_ps(tmp17148, tmp17149);
__m512 tmp17231 = _mm512_sub_ps(tmp17156, tmp17157);
__m512 tmp17208 = _mm512_add_ps(tmp17152, tmp17153);
__m512 tmp17228 = _mm512_add_ps(tmp17200, tmp17201);
__m512 tmp17213 = _mm512_sub_ps(tmp17152, tmp17153);
__m512 tmp17233 = _mm512_sub_ps(tmp17200, tmp17201);
__m512 tmp17210 = _mm512_fmadd_ps(tmp17212, _mm512_set1_ps(2e+00f), tmp17211);
__m512 tmp17230 = _mm512_fmadd_ps(tmp17232, _mm512_set1_ps(2e+00f), tmp17231);
__m512 tmp17217 = _mm512_fmadd_ps(tmp17212, _mm512_set1_ps(8e+00f), tmp17211);
__m512 tmp17237 = _mm512_fmadd_ps(tmp17232, _mm512_set1_ps(8e+00f), tmp17231);
__m512 tmp17205 = _mm512_add_ps(tmp17206, tmp17207);
__m512 tmp17225 = _mm512_add_ps(tmp17226, tmp17227);
__m512 tmp17209 = _mm512_fmadd_ps(tmp17213, _mm512_set1_ps(1.6e+01f), tmp17210);
__m512 tmp17229 = _mm512_fmadd_ps(tmp17233, _mm512_set1_ps(1.6e+01f), tmp17230);
__m512 tmp17216 = _mm512_fmadd_ps(tmp17213, _mm512_set1_ps(4e+00f), tmp17217);
__m512 tmp17236 = _mm512_fmadd_ps(tmp17233, _mm512_set1_ps(4e+00f), tmp17237);
__m512 tmp17222 = _mm512_add_ps(tmp17213, tmp17211);
__m512 tmp17242 = _mm512_add_ps(tmp17233, tmp17231);
__m512 tmp17215 = _mm512_fmadd_ps(tmp17206, _mm512_set1_ps(4e+00f), tmp17207);
__m512 tmp17235 = _mm512_fmadd_ps(tmp17226, _mm512_set1_ps(4e+00f), tmp17227);
__m512 tmp17219 = _mm512_fmadd_ps(tmp17206, _mm512_set1_ps(1.6e+01f), tmp17207);
__m512 tmp17239 = _mm512_fmadd_ps(tmp17226, _mm512_set1_ps(1.6e+01f), tmp17227);
__m512 tmp17204 = _mm512_add_ps(tmp17205, tmp17147);
__m512 tmp17224 = _mm512_add_ps(tmp17225, tmp17155);
__m512 tmp17221 = _mm512_add_ps(tmp17222, tmp17154);
__m512 tmp17241 = _mm512_add_ps(tmp17242, tmp17202);
__m512 tmp17203 = _mm512_fmadd_ps(tmp17208, _mm512_set1_ps(3.2e+01f), tmp17204);
__m512 tmp17223 = _mm512_fmadd_ps(tmp17228, _mm512_set1_ps(3.2e+01f), tmp17224);
__m512 tmp17214 = _mm512_fmadd_ps(tmp17208, _mm512_set1_ps(8e+00f), tmp17215);
__m512 tmp17234 = _mm512_fmadd_ps(tmp17228, _mm512_set1_ps(8e+00f), tmp17235);
__m512 tmp17220 = _mm512_fmadd_ps(tmp17212, _mm512_set1_ps(3.2e+01f), tmp17221);
__m512 tmp17240 = _mm512_fmadd_ps(tmp17232, _mm512_set1_ps(3.2e+01f), tmp17241);
__m512 tmp17218 = _mm512_fmadd_ps(tmp17208, _mm512_set1_ps(2e+00f), tmp17219);
__m512 tmp17238 = _mm512_fmadd_ps(tmp17228, _mm512_set1_ps(2e+00f), tmp17239);
__m512 out2235 = tmp17203;
__m512 out2241 = tmp17223;
__m512 out2236 = tmp17209;
__m512 out2242 = tmp17229;
__m512 out2237 = tmp17214;
__m512 out2243 = tmp17234;
__m512 out2238 = tmp17216;
__m512 out2244 = tmp17236;
__m512 out2239 = tmp17218;
__m512 out2245 = tmp17238;
__m512 out2240 = tmp17220;
__m512 out2246 = tmp17240;
out2235 = _mm512_max_ps(_mm512_setzero_ps(), out2235);
out2241 = _mm512_max_ps(_mm512_setzero_ps(), out2241);
out2236 = _mm512_max_ps(_mm512_setzero_ps(), out2236);
out2242 = _mm512_max_ps(_mm512_setzero_ps(), out2242);
out2237 = _mm512_max_ps(_mm512_setzero_ps(), out2237);
out2243 = _mm512_max_ps(_mm512_setzero_ps(), out2243);
out2238 = _mm512_max_ps(_mm512_setzero_ps(), out2238);
out2244 = _mm512_max_ps(_mm512_setzero_ps(), out2244);
out2239 = _mm512_max_ps(_mm512_setzero_ps(), out2239);
out2245 = _mm512_max_ps(_mm512_setzero_ps(), out2245);
out2240 = _mm512_max_ps(_mm512_setzero_ps(), out2240);
out2246 = _mm512_max_ps(_mm512_setzero_ps(), out2246);
_mm512_mask_storeu_ps(datPtr28+360+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 255, out2235);
_mm512_mask_storeu_ps(datPtr28+832+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2241);
_mm512_mask_storeu_ps(datPtr28+416+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 255, out2236);
_mm512_mask_storeu_ps(datPtr28+888+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2242);
_mm512_mask_storeu_ps(datPtr28+472+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 255, out2237);
_mm512_mask_storeu_ps(datPtr28+944+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2243);
_mm512_mask_storeu_ps(datPtr28+528+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 255, out2238);
_mm512_mask_storeu_ps(datPtr28+1000+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2244);
_mm512_mask_storeu_ps(datPtr28+584+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 255, out2239);
_mm512_mask_storeu_ps(datPtr28+1056+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2245);
_mm512_mask_storeu_ps(datPtr28+640+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 255, out2240);
_mm512_mask_storeu_ps(datPtr28+1112+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2246);
__m512 sf1249 = _mm512_loadu_ps(sfPtr13+512+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1250 = _mm512_loadu_ps(sfPtr13+576+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2410 = _mm512_shuffle_f32x4(sf1250, sf1249, 68);
__m512 in2411 = _mm512_shuffle_f32x4(sf1250, sf1249, 238);
__m512 sf1251 = _mm512_loadu_ps(sfPtr13+640+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1252 = _mm512_loadu_ps(sfPtr13+704+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2418 = _mm512_shuffle_f32x4(sf1251, sf1252, 68);
__m512 in2419 = _mm512_shuffle_f32x4(sf1251, sf1252, 238);
__m512 sf1253 = _mm512_loadu_ps(sfPtr13+147968+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1254 = _mm512_loadu_ps(sfPtr13+148032+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2412 = _mm512_shuffle_f32x4(sf1254, sf1253, 68);
__m512 in2413 = _mm512_shuffle_f32x4(sf1254, sf1253, 238);
__m512 sf1255 = _mm512_loadu_ps(sfPtr13+148096+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1256 = _mm512_loadu_ps(sfPtr13+148160+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2420 = _mm512_shuffle_f32x4(sf1255, sf1256, 68);
__m512 in2421 = _mm512_shuffle_f32x4(sf1255, sf1256, 238);
__m512 sf1257 = _mm512_loadu_ps(sfPtr13+295424+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1258 = _mm512_loadu_ps(sfPtr13+295488+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2414 = _mm512_shuffle_f32x4(sf1258, sf1257, 68);
__m512 in2415 = _mm512_shuffle_f32x4(sf1258, sf1257, 238);
__m512 sf1259 = _mm512_loadu_ps(sfPtr13+295552+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1260 = _mm512_loadu_ps(sfPtr13+295616+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2422 = _mm512_shuffle_f32x4(sf1259, sf1260, 68);
__m512 in2423 = _mm512_shuffle_f32x4(sf1259, sf1260, 238);
__m512 sf1261 = _mm512_loadu_ps(sfPtr13+442880+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1262 = _mm512_loadu_ps(sfPtr13+442944+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2416 = _mm512_shuffle_f32x4(sf1262, sf1261, 68);
__m512 in2417 = _mm512_shuffle_f32x4(sf1262, sf1261, 238);
__m512 sf1263 = _mm512_loadu_ps(sfPtr13+443008+589824*i55+98304*j47+1536*k151+768*l64);
__m512 sf1264 = _mm512_loadu_ps(sfPtr13+443072+589824*i55+98304*j47+1536*k151+768*l64);
__m512 in2424 = _mm512_shuffle_f32x4(sf1263, sf1264, 68);
__m512 in2425 = _mm512_shuffle_f32x4(sf1263, sf1264, 238);
__m512 tmp17299 = _mm512_add_ps(in2411, in2412);
__m512 tmp17319 = _mm512_add_ps(in2419, in2420);
__m512 tmp17298 = _mm512_add_ps(in2413, in2414);
__m512 tmp17318 = _mm512_add_ps(in2421, in2422);
__m512 tmp17304 = _mm512_sub_ps(in2413, in2414);
__m512 tmp17324 = _mm512_sub_ps(in2421, in2422);
__m512 tmp17303 = _mm512_sub_ps(in2411, in2412);
__m512 tmp17323 = _mm512_sub_ps(in2419, in2420);
__m512 tmp17300 = _mm512_add_ps(in2415, in2416);
__m512 tmp17320 = _mm512_add_ps(in2423, in2424);
__m512 tmp17305 = _mm512_sub_ps(in2415, in2416);
__m512 tmp17325 = _mm512_sub_ps(in2423, in2424);
__m512 tmp17302 = _mm512_fmadd_ps(tmp17304, _mm512_set1_ps(2e+00f), tmp17303);
__m512 tmp17322 = _mm512_fmadd_ps(tmp17324, _mm512_set1_ps(2e+00f), tmp17323);
__m512 tmp17309 = _mm512_fmadd_ps(tmp17304, _mm512_set1_ps(8e+00f), tmp17303);
__m512 tmp17329 = _mm512_fmadd_ps(tmp17324, _mm512_set1_ps(8e+00f), tmp17323);
__m512 tmp17297 = _mm512_add_ps(tmp17298, tmp17299);
__m512 tmp17317 = _mm512_add_ps(tmp17318, tmp17319);
__m512 tmp17301 = _mm512_fmadd_ps(tmp17305, _mm512_set1_ps(1.6e+01f), tmp17302);
__m512 tmp17321 = _mm512_fmadd_ps(tmp17325, _mm512_set1_ps(1.6e+01f), tmp17322);
__m512 tmp17308 = _mm512_fmadd_ps(tmp17305, _mm512_set1_ps(4e+00f), tmp17309);
__m512 tmp17328 = _mm512_fmadd_ps(tmp17325, _mm512_set1_ps(4e+00f), tmp17329);
__m512 tmp17314 = _mm512_add_ps(tmp17305, tmp17303);
__m512 tmp17334 = _mm512_add_ps(tmp17325, tmp17323);
__m512 tmp17307 = _mm512_fmadd_ps(tmp17298, _mm512_set1_ps(4e+00f), tmp17299);
__m512 tmp17327 = _mm512_fmadd_ps(tmp17318, _mm512_set1_ps(4e+00f), tmp17319);
__m512 tmp17311 = _mm512_fmadd_ps(tmp17298, _mm512_set1_ps(1.6e+01f), tmp17299);
__m512 tmp17331 = _mm512_fmadd_ps(tmp17318, _mm512_set1_ps(1.6e+01f), tmp17319);
__m512 tmp17296 = _mm512_add_ps(tmp17297, in2410);
__m512 tmp17316 = _mm512_add_ps(tmp17317, in2418);
__m512 tmp17313 = _mm512_add_ps(tmp17314, in2417);
__m512 tmp17333 = _mm512_add_ps(tmp17334, in2425);
__m512 tmp17295 = _mm512_fmadd_ps(tmp17300, _mm512_set1_ps(3.2e+01f), tmp17296);
__m512 tmp17315 = _mm512_fmadd_ps(tmp17320, _mm512_set1_ps(3.2e+01f), tmp17316);
__m512 tmp17306 = _mm512_fmadd_ps(tmp17300, _mm512_set1_ps(8e+00f), tmp17307);
__m512 tmp17326 = _mm512_fmadd_ps(tmp17320, _mm512_set1_ps(8e+00f), tmp17327);
__m512 tmp17312 = _mm512_fmadd_ps(tmp17304, _mm512_set1_ps(3.2e+01f), tmp17313);
__m512 tmp17332 = _mm512_fmadd_ps(tmp17324, _mm512_set1_ps(3.2e+01f), tmp17333);
__m512 tmp17310 = _mm512_fmadd_ps(tmp17300, _mm512_set1_ps(2e+00f), tmp17311);
__m512 tmp17330 = _mm512_fmadd_ps(tmp17320, _mm512_set1_ps(2e+00f), tmp17331);
__m512 tmp17283 = tmp17295;
__m512 tmp17289 = tmp17315;
__m512 tmp17284 = tmp17301;
__m512 tmp17290 = tmp17321;
__m512 tmp17285 = tmp17306;
__m512 tmp17291 = tmp17326;
__m512 tmp17286 = tmp17308;
__m512 tmp17292 = tmp17328;
__m512 tmp17287 = tmp17310;
__m512 tmp17293 = tmp17330;
__m512 tmp17288 = tmp17312;
__m512 tmp17294 = tmp17332;
__m512 tmp17379 = _mm512_unpacklo_ps(tmp17283, tmp17284);
__m512 tmp17380 = _mm512_unpackhi_ps(tmp17283, tmp17284);
__m512 tmp17381 = _mm512_unpacklo_ps(tmp17285, tmp17286);
__m512 tmp17382 = _mm512_unpackhi_ps(tmp17285, tmp17286);
__m512 tmp17383 = _mm512_unpacklo_ps(tmp17287, tmp17288);
__m512 tmp17384 = _mm512_unpackhi_ps(tmp17287, tmp17288);
__m512 tmp17385 = _mm512_unpacklo_ps(tmp17289, tmp17290);
__m512 tmp17386 = _mm512_unpackhi_ps(tmp17289, tmp17290);
__m512 tmp17387 = _mm512_unpacklo_ps(tmp17291, tmp17292);
__m512 tmp17388 = _mm512_unpackhi_ps(tmp17291, tmp17292);
__m512 tmp17389 = _mm512_unpacklo_ps(tmp17293, tmp17294);
__m512 tmp17390 = _mm512_unpackhi_ps(tmp17293, tmp17294);
__m512 tmp17391 = _mm512_shuffle_ps(tmp17379, tmp17381, 68);
__m512 tmp17392 = _mm512_shuffle_ps(tmp17379, tmp17381, 238);
__m512 tmp17393 = _mm512_shuffle_ps(tmp17380, tmp17382, 68);
__m512 tmp17394 = _mm512_shuffle_ps(tmp17380, tmp17382, 238);
__m512 tmp17395 = _mm512_shuffle_ps(tmp17383, tmp17385, 68);
__m512 tmp17396 = _mm512_shuffle_ps(tmp17383, tmp17385, 238);
__m512 tmp17397 = _mm512_shuffle_ps(tmp17384, tmp17386, 68);
__m512 tmp17398 = _mm512_shuffle_ps(tmp17384, tmp17386, 238);
__m512 tmp17399 = _mm512_shuffle_ps(tmp17387, tmp17389, 68);
__m512 tmp17400 = _mm512_shuffle_ps(tmp17387, tmp17389, 238);
__m512 tmp17401 = _mm512_shuffle_ps(tmp17388, tmp17390, 68);
__m512 tmp17402 = _mm512_shuffle_ps(tmp17388, tmp17390, 238);
__m512 tmp17403 = _mm512_shuffle_f32x4(tmp17391, tmp17395, 136);
__m512 tmp17404 = _mm512_shuffle_f32x4(tmp17391, tmp17395, 221);
__m512 tmp17405 = _mm512_shuffle_f32x4(tmp17392, tmp17396, 136);
__m512 tmp17406 = _mm512_shuffle_f32x4(tmp17392, tmp17396, 221);
__m512 tmp17407 = _mm512_shuffle_f32x4(tmp17393, tmp17397, 136);
__m512 tmp17408 = _mm512_shuffle_f32x4(tmp17393, tmp17397, 221);
__m512 tmp17409 = _mm512_shuffle_f32x4(tmp17394, tmp17398, 136);
__m512 tmp17410 = _mm512_shuffle_f32x4(tmp17394, tmp17398, 221);
__m512 tmp17411 = _mm512_shuffle_f32x4(tmp17399, tmp17399, 136);
__m512 tmp17412 = _mm512_shuffle_f32x4(tmp17399, tmp17399, 221);
__m512 tmp17413 = _mm512_shuffle_f32x4(tmp17400, tmp17400, 136);
__m512 tmp17414 = _mm512_shuffle_f32x4(tmp17400, tmp17400, 221);
__m512 tmp17415 = _mm512_shuffle_f32x4(tmp17401, tmp17401, 136);
__m512 tmp17416 = _mm512_shuffle_f32x4(tmp17401, tmp17401, 221);
__m512 tmp17417 = _mm512_shuffle_f32x4(tmp17402, tmp17402, 136);
__m512 tmp17418 = _mm512_shuffle_f32x4(tmp17402, tmp17402, 221);
tmp17283 = _mm512_shuffle_f32x4(tmp17403, tmp17411, 136);
tmp17291 = _mm512_shuffle_f32x4(tmp17403, tmp17411, 221);
tmp17284 = _mm512_shuffle_f32x4(tmp17405, tmp17413, 136);
tmp17292 = _mm512_shuffle_f32x4(tmp17405, tmp17413, 221);
tmp17285 = _mm512_shuffle_f32x4(tmp17407, tmp17415, 136);
tmp17293 = _mm512_shuffle_f32x4(tmp17407, tmp17415, 221);
tmp17286 = _mm512_shuffle_f32x4(tmp17409, tmp17417, 136);
tmp17294 = _mm512_shuffle_f32x4(tmp17409, tmp17417, 221);
tmp17287 = _mm512_shuffle_f32x4(tmp17404, tmp17412, 136);
__m512 tmp17335 = _mm512_shuffle_f32x4(tmp17404, tmp17412, 221);
tmp17288 = _mm512_shuffle_f32x4(tmp17406, tmp17414, 136);
__m512 tmp17336 = _mm512_shuffle_f32x4(tmp17406, tmp17414, 221);
tmp17289 = _mm512_shuffle_f32x4(tmp17408, tmp17416, 136);
__m512 tmp17337 = _mm512_shuffle_f32x4(tmp17408, tmp17416, 221);
tmp17290 = _mm512_shuffle_f32x4(tmp17410, tmp17418, 136);
__m512 tmp17338 = _mm512_shuffle_f32x4(tmp17410, tmp17418, 221);
__m512 tmp17343 = _mm512_add_ps(tmp17284, tmp17285);
__m512 tmp17363 = _mm512_add_ps(tmp17292, tmp17293);
__m512 tmp17342 = _mm512_add_ps(tmp17286, tmp17287);
__m512 tmp17362 = _mm512_add_ps(tmp17294, tmp17335);
__m512 tmp17348 = _mm512_sub_ps(tmp17286, tmp17287);
__m512 tmp17368 = _mm512_sub_ps(tmp17294, tmp17335);
__m512 tmp17347 = _mm512_sub_ps(tmp17284, tmp17285);
__m512 tmp17367 = _mm512_sub_ps(tmp17292, tmp17293);
__m512 tmp17344 = _mm512_add_ps(tmp17288, tmp17289);
__m512 tmp17364 = _mm512_add_ps(tmp17336, tmp17337);
__m512 tmp17349 = _mm512_sub_ps(tmp17288, tmp17289);
__m512 tmp17369 = _mm512_sub_ps(tmp17336, tmp17337);
__m512 tmp17346 = _mm512_fmadd_ps(tmp17348, _mm512_set1_ps(2e+00f), tmp17347);
__m512 tmp17366 = _mm512_fmadd_ps(tmp17368, _mm512_set1_ps(2e+00f), tmp17367);
__m512 tmp17353 = _mm512_fmadd_ps(tmp17348, _mm512_set1_ps(8e+00f), tmp17347);
__m512 tmp17373 = _mm512_fmadd_ps(tmp17368, _mm512_set1_ps(8e+00f), tmp17367);
__m512 tmp17341 = _mm512_add_ps(tmp17342, tmp17343);
__m512 tmp17361 = _mm512_add_ps(tmp17362, tmp17363);
__m512 tmp17345 = _mm512_fmadd_ps(tmp17349, _mm512_set1_ps(1.6e+01f), tmp17346);
__m512 tmp17365 = _mm512_fmadd_ps(tmp17369, _mm512_set1_ps(1.6e+01f), tmp17366);
__m512 tmp17352 = _mm512_fmadd_ps(tmp17349, _mm512_set1_ps(4e+00f), tmp17353);
__m512 tmp17372 = _mm512_fmadd_ps(tmp17369, _mm512_set1_ps(4e+00f), tmp17373);
__m512 tmp17358 = _mm512_add_ps(tmp17349, tmp17347);
__m512 tmp17378 = _mm512_add_ps(tmp17369, tmp17367);
__m512 tmp17351 = _mm512_fmadd_ps(tmp17342, _mm512_set1_ps(4e+00f), tmp17343);
__m512 tmp17371 = _mm512_fmadd_ps(tmp17362, _mm512_set1_ps(4e+00f), tmp17363);
__m512 tmp17355 = _mm512_fmadd_ps(tmp17342, _mm512_set1_ps(1.6e+01f), tmp17343);
__m512 tmp17375 = _mm512_fmadd_ps(tmp17362, _mm512_set1_ps(1.6e+01f), tmp17363);
__m512 tmp17340 = _mm512_add_ps(tmp17341, tmp17283);
__m512 tmp17360 = _mm512_add_ps(tmp17361, tmp17291);
__m512 tmp17357 = _mm512_add_ps(tmp17358, tmp17290);
__m512 tmp17377 = _mm512_add_ps(tmp17378, tmp17338);
__m512 tmp17339 = _mm512_fmadd_ps(tmp17344, _mm512_set1_ps(3.2e+01f), tmp17340);
__m512 tmp17359 = _mm512_fmadd_ps(tmp17364, _mm512_set1_ps(3.2e+01f), tmp17360);
__m512 tmp17350 = _mm512_fmadd_ps(tmp17344, _mm512_set1_ps(8e+00f), tmp17351);
__m512 tmp17370 = _mm512_fmadd_ps(tmp17364, _mm512_set1_ps(8e+00f), tmp17371);
__m512 tmp17356 = _mm512_fmadd_ps(tmp17348, _mm512_set1_ps(3.2e+01f), tmp17357);
__m512 tmp17376 = _mm512_fmadd_ps(tmp17368, _mm512_set1_ps(3.2e+01f), tmp17377);
__m512 tmp17354 = _mm512_fmadd_ps(tmp17344, _mm512_set1_ps(2e+00f), tmp17355);
__m512 tmp17374 = _mm512_fmadd_ps(tmp17364, _mm512_set1_ps(2e+00f), tmp17375);
__m512 out2253 = tmp17339;
__m512 out2247 = tmp17359;
__m512 out2254 = tmp17345;
__m512 out2248 = tmp17365;
__m512 out2255 = tmp17350;
__m512 out2249 = tmp17370;
__m512 out2256 = tmp17352;
__m512 out2250 = tmp17372;
__m512 out2257 = tmp17354;
__m512 out2251 = tmp17374;
__m512 out2258 = tmp17356;
__m512 out2252 = tmp17376;
out2253 = _mm512_max_ps(_mm512_setzero_ps(), out2253);
out2247 = _mm512_max_ps(_mm512_setzero_ps(), out2247);
out2254 = _mm512_max_ps(_mm512_setzero_ps(), out2254);
out2248 = _mm512_max_ps(_mm512_setzero_ps(), out2248);
out2255 = _mm512_max_ps(_mm512_setzero_ps(), out2255);
out2249 = _mm512_max_ps(_mm512_setzero_ps(), out2249);
out2256 = _mm512_max_ps(_mm512_setzero_ps(), out2256);
out2250 = _mm512_max_ps(_mm512_setzero_ps(), out2250);
out2257 = _mm512_max_ps(_mm512_setzero_ps(), out2257);
out2251 = _mm512_max_ps(_mm512_setzero_ps(), out2251);
out2258 = _mm512_max_ps(_mm512_setzero_ps(), out2258);
out2252 = _mm512_max_ps(_mm512_setzero_ps(), out2252);
_mm512_mask_storeu_ps(datPtr28+1168+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2253);
_mm512_mask_storeu_ps(datPtr28+880+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 3, out2247);
_mm512_mask_storeu_ps(datPtr28+1192+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 192, out2247);
_mm512_mask_storeu_ps(datPtr28+1224+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2254);
_mm512_mask_storeu_ps(datPtr28+936+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 3, out2248);
_mm512_mask_storeu_ps(datPtr28+1248+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 192, out2248);
_mm512_mask_storeu_ps(datPtr28+1280+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2255);
_mm512_mask_storeu_ps(datPtr28+992+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 3, out2249);
_mm512_mask_storeu_ps(datPtr28+1304+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 192, out2249);
_mm512_mask_storeu_ps(datPtr28+1336+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2256);
_mm512_mask_storeu_ps(datPtr28+1048+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 3, out2250);
_mm512_mask_storeu_ps(datPtr28+1360+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 192, out2250);
_mm512_mask_storeu_ps(datPtr28+1392+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2257);
_mm512_mask_storeu_ps(datPtr28+1104+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 3, out2251);
_mm512_mask_storeu_ps(datPtr28+1416+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 192, out2251);
_mm512_mask_storeu_ps(datPtr28+1448+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 4095, out2258);
_mm512_mask_storeu_ps(datPtr28+1160+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 3, out2252);
_mm512_mask_storeu_ps(datPtr28+1472+212992*i55+56*toH48+4*toW48+3328*k151+1664*l64, 192, out2252);
}
if (k151 >= kk49) return;
}
if (j47 >= last12) return;
++j47;
rel24 = 1;
}
ptrdiff_t toH49 = base24+12;
ptrdiff_t toW49 = 0;
ptrdiff_t k152 = 32*w68;
ptrdiff_t kk50 = k152+31;
for (; k152 != 64; ++k152) {
ptrdiff_t l65 = 0;
for (; l65 != 1; ++l65) {
__m512 sf1265 = _mm512_loadu_ps(sfPtr13+0+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1266 = _mm512_loadu_ps(sfPtr13+128+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2426 = _mm512_shuffle_f32x4(sf1265, sf1266, 68);
__m512 in2427 = _mm512_shuffle_f32x4(sf1265, sf1266, 238);
__m512 sf1267 = _mm512_loadu_ps(sfPtr13+64+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1268 = _mm512_loadu_ps(sfPtr13+192+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2434 = _mm512_shuffle_f32x4(sf1267, sf1268, 68);
__m512 in2435 = _mm512_shuffle_f32x4(sf1267, sf1268, 238);
__m512 sf1269 = _mm512_loadu_ps(sfPtr13+147456+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1270 = _mm512_loadu_ps(sfPtr13+147584+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2428 = _mm512_shuffle_f32x4(sf1269, sf1270, 68);
__m512 in2429 = _mm512_shuffle_f32x4(sf1269, sf1270, 238);
__m512 sf1271 = _mm512_loadu_ps(sfPtr13+147520+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1272 = _mm512_loadu_ps(sfPtr13+147648+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2436 = _mm512_shuffle_f32x4(sf1271, sf1272, 68);
__m512 in2437 = _mm512_shuffle_f32x4(sf1271, sf1272, 238);
__m512 sf1273 = _mm512_loadu_ps(sfPtr13+294912+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1274 = _mm512_loadu_ps(sfPtr13+295040+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2430 = _mm512_shuffle_f32x4(sf1273, sf1274, 68);
__m512 in2431 = _mm512_shuffle_f32x4(sf1273, sf1274, 238);
__m512 sf1275 = _mm512_loadu_ps(sfPtr13+294976+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1276 = _mm512_loadu_ps(sfPtr13+295104+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2438 = _mm512_shuffle_f32x4(sf1275, sf1276, 68);
__m512 in2439 = _mm512_shuffle_f32x4(sf1275, sf1276, 238);
__m512 sf1277 = _mm512_loadu_ps(sfPtr13+442368+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1278 = _mm512_loadu_ps(sfPtr13+442496+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2432 = _mm512_shuffle_f32x4(sf1277, sf1278, 68);
__m512 in2433 = _mm512_shuffle_f32x4(sf1277, sf1278, 238);
__m512 sf1279 = _mm512_loadu_ps(sfPtr13+442432+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1280 = _mm512_loadu_ps(sfPtr13+442560+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2440 = _mm512_shuffle_f32x4(sf1279, sf1280, 68);
__m512 in2441 = _mm512_shuffle_f32x4(sf1279, sf1280, 238);
__m512 tmp17435 = _mm512_add_ps(in2427, in2428);
__m512 tmp17455 = _mm512_add_ps(in2435, in2436);
__m512 tmp17434 = _mm512_add_ps(in2429, in2430);
__m512 tmp17454 = _mm512_add_ps(in2437, in2438);
__m512 tmp17440 = _mm512_sub_ps(in2429, in2430);
__m512 tmp17460 = _mm512_sub_ps(in2437, in2438);
__m512 tmp17439 = _mm512_sub_ps(in2427, in2428);
__m512 tmp17459 = _mm512_sub_ps(in2435, in2436);
__m512 tmp17436 = _mm512_add_ps(in2431, in2432);
__m512 tmp17456 = _mm512_add_ps(in2439, in2440);
__m512 tmp17441 = _mm512_sub_ps(in2431, in2432);
__m512 tmp17461 = _mm512_sub_ps(in2439, in2440);
__m512 tmp17438 = _mm512_fmadd_ps(tmp17440, _mm512_set1_ps(2e+00f), tmp17439);
__m512 tmp17458 = _mm512_fmadd_ps(tmp17460, _mm512_set1_ps(2e+00f), tmp17459);
__m512 tmp17445 = _mm512_fmadd_ps(tmp17440, _mm512_set1_ps(8e+00f), tmp17439);
__m512 tmp17465 = _mm512_fmadd_ps(tmp17460, _mm512_set1_ps(8e+00f), tmp17459);
__m512 tmp17433 = _mm512_add_ps(tmp17434, tmp17435);
__m512 tmp17453 = _mm512_add_ps(tmp17454, tmp17455);
__m512 tmp17437 = _mm512_fmadd_ps(tmp17441, _mm512_set1_ps(1.6e+01f), tmp17438);
__m512 tmp17457 = _mm512_fmadd_ps(tmp17461, _mm512_set1_ps(1.6e+01f), tmp17458);
__m512 tmp17444 = _mm512_fmadd_ps(tmp17441, _mm512_set1_ps(4e+00f), tmp17445);
__m512 tmp17464 = _mm512_fmadd_ps(tmp17461, _mm512_set1_ps(4e+00f), tmp17465);
__m512 tmp17450 = _mm512_add_ps(tmp17441, tmp17439);
__m512 tmp17470 = _mm512_add_ps(tmp17461, tmp17459);
__m512 tmp17443 = _mm512_fmadd_ps(tmp17434, _mm512_set1_ps(4e+00f), tmp17435);
__m512 tmp17463 = _mm512_fmadd_ps(tmp17454, _mm512_set1_ps(4e+00f), tmp17455);
__m512 tmp17447 = _mm512_fmadd_ps(tmp17434, _mm512_set1_ps(1.6e+01f), tmp17435);
__m512 tmp17467 = _mm512_fmadd_ps(tmp17454, _mm512_set1_ps(1.6e+01f), tmp17455);
__m512 tmp17432 = _mm512_add_ps(tmp17433, in2426);
__m512 tmp17452 = _mm512_add_ps(tmp17453, in2434);
__m512 tmp17449 = _mm512_add_ps(tmp17450, in2433);
__m512 tmp17469 = _mm512_add_ps(tmp17470, in2441);
__m512 tmp17431 = _mm512_fmadd_ps(tmp17436, _mm512_set1_ps(3.2e+01f), tmp17432);
__m512 tmp17451 = _mm512_fmadd_ps(tmp17456, _mm512_set1_ps(3.2e+01f), tmp17452);
__m512 tmp17442 = _mm512_fmadd_ps(tmp17436, _mm512_set1_ps(8e+00f), tmp17443);
__m512 tmp17462 = _mm512_fmadd_ps(tmp17456, _mm512_set1_ps(8e+00f), tmp17463);
__m512 tmp17448 = _mm512_fmadd_ps(tmp17440, _mm512_set1_ps(3.2e+01f), tmp17449);
__m512 tmp17468 = _mm512_fmadd_ps(tmp17460, _mm512_set1_ps(3.2e+01f), tmp17469);
__m512 tmp17446 = _mm512_fmadd_ps(tmp17436, _mm512_set1_ps(2e+00f), tmp17447);
__m512 tmp17466 = _mm512_fmadd_ps(tmp17456, _mm512_set1_ps(2e+00f), tmp17467);
__m512 tmp17419 = tmp17431;
__m512 tmp17425 = tmp17451;
__m512 tmp17420 = tmp17437;
__m512 tmp17426 = tmp17457;
__m512 tmp17421 = tmp17442;
__m512 tmp17427 = tmp17462;
__m512 tmp17422 = tmp17444;
__m512 tmp17428 = tmp17464;
__m512 tmp17423 = tmp17446;
__m512 tmp17429 = tmp17466;
__m512 tmp17424 = tmp17448;
__m512 tmp17430 = tmp17468;
__m512 tmp17497 = _mm512_unpacklo_ps(tmp17419, tmp17420);
__m512 tmp17498 = _mm512_unpackhi_ps(tmp17419, tmp17420);
__m512 tmp17499 = _mm512_unpacklo_ps(tmp17421, tmp17422);
__m512 tmp17500 = _mm512_unpackhi_ps(tmp17421, tmp17422);
__m512 tmp17501 = _mm512_unpacklo_ps(tmp17423, tmp17424);
__m512 tmp17502 = _mm512_unpackhi_ps(tmp17423, tmp17424);
__m512 tmp17503 = _mm512_unpacklo_ps(tmp17425, tmp17426);
__m512 tmp17504 = _mm512_unpackhi_ps(tmp17425, tmp17426);
__m512 tmp17505 = _mm512_unpacklo_ps(tmp17427, tmp17428);
__m512 tmp17506 = _mm512_unpackhi_ps(tmp17427, tmp17428);
__m512 tmp17507 = _mm512_unpacklo_ps(tmp17429, tmp17430);
__m512 tmp17508 = _mm512_unpackhi_ps(tmp17429, tmp17430);
__m512 tmp17509 = _mm512_shuffle_ps(tmp17497, tmp17499, 68);
__m512 tmp17510 = _mm512_shuffle_ps(tmp17497, tmp17499, 238);
__m512 tmp17511 = _mm512_shuffle_ps(tmp17498, tmp17500, 68);
__m512 tmp17512 = _mm512_shuffle_ps(tmp17498, tmp17500, 238);
__m512 tmp17513 = _mm512_shuffle_ps(tmp17501, tmp17503, 68);
__m512 tmp17514 = _mm512_shuffle_ps(tmp17501, tmp17503, 238);
__m512 tmp17515 = _mm512_shuffle_ps(tmp17502, tmp17504, 68);
__m512 tmp17516 = _mm512_shuffle_ps(tmp17502, tmp17504, 238);
__m512 tmp17517 = _mm512_shuffle_ps(tmp17505, tmp17507, 68);
__m512 tmp17518 = _mm512_shuffle_ps(tmp17505, tmp17507, 238);
__m512 tmp17519 = _mm512_shuffle_ps(tmp17506, tmp17508, 68);
__m512 tmp17520 = _mm512_shuffle_ps(tmp17506, tmp17508, 238);
__m512 tmp17521 = _mm512_shuffle_f32x4(tmp17509, tmp17513, 136);
__m512 tmp17522 = _mm512_shuffle_f32x4(tmp17509, tmp17513, 221);
__m512 tmp17523 = _mm512_shuffle_f32x4(tmp17510, tmp17514, 136);
__m512 tmp17524 = _mm512_shuffle_f32x4(tmp17510, tmp17514, 221);
__m512 tmp17525 = _mm512_shuffle_f32x4(tmp17511, tmp17515, 136);
__m512 tmp17526 = _mm512_shuffle_f32x4(tmp17511, tmp17515, 221);
__m512 tmp17527 = _mm512_shuffle_f32x4(tmp17512, tmp17516, 136);
__m512 tmp17528 = _mm512_shuffle_f32x4(tmp17512, tmp17516, 221);
__m512 tmp17529 = _mm512_shuffle_f32x4(tmp17517, tmp17517, 136);
__m512 tmp17530 = _mm512_shuffle_f32x4(tmp17517, tmp17517, 221);
__m512 tmp17531 = _mm512_shuffle_f32x4(tmp17518, tmp17518, 136);
__m512 tmp17532 = _mm512_shuffle_f32x4(tmp17518, tmp17518, 221);
__m512 tmp17533 = _mm512_shuffle_f32x4(tmp17519, tmp17519, 136);
__m512 tmp17534 = _mm512_shuffle_f32x4(tmp17519, tmp17519, 221);
__m512 tmp17535 = _mm512_shuffle_f32x4(tmp17520, tmp17520, 136);
__m512 tmp17536 = _mm512_shuffle_f32x4(tmp17520, tmp17520, 221);
tmp17419 = _mm512_shuffle_f32x4(tmp17521, tmp17529, 136);
tmp17427 = _mm512_shuffle_f32x4(tmp17521, tmp17529, 221);
tmp17420 = _mm512_shuffle_f32x4(tmp17523, tmp17531, 136);
tmp17428 = _mm512_shuffle_f32x4(tmp17523, tmp17531, 221);
tmp17421 = _mm512_shuffle_f32x4(tmp17525, tmp17533, 136);
tmp17429 = _mm512_shuffle_f32x4(tmp17525, tmp17533, 221);
tmp17422 = _mm512_shuffle_f32x4(tmp17527, tmp17535, 136);
tmp17430 = _mm512_shuffle_f32x4(tmp17527, tmp17535, 221);
tmp17423 = _mm512_shuffle_f32x4(tmp17522, tmp17530, 136);
__m512 tmp17471 = _mm512_shuffle_f32x4(tmp17522, tmp17530, 221);
tmp17424 = _mm512_shuffle_f32x4(tmp17524, tmp17532, 136);
__m512 tmp17472 = _mm512_shuffle_f32x4(tmp17524, tmp17532, 221);
tmp17425 = _mm512_shuffle_f32x4(tmp17526, tmp17534, 136);
__m512 tmp17473 = _mm512_shuffle_f32x4(tmp17526, tmp17534, 221);
tmp17426 = _mm512_shuffle_f32x4(tmp17528, tmp17536, 136);
__m512 tmp17474 = _mm512_shuffle_f32x4(tmp17528, tmp17536, 221);
(void)tmp17426;
(void)tmp17474;
__m512 tmp17479 = _mm512_add_ps(tmp17420, tmp17421);
__m512 tmp17490 = _mm512_add_ps(tmp17428, tmp17429);
__m512 tmp17478 = _mm512_add_ps(tmp17422, tmp17423);
__m512 tmp17489 = _mm512_add_ps(tmp17430, tmp17471);
__m512 tmp17484 = _mm512_sub_ps(tmp17422, tmp17423);
__m512 tmp17495 = _mm512_sub_ps(tmp17430, tmp17471);
__m512 tmp17483 = _mm512_sub_ps(tmp17420, tmp17421);
__m512 tmp17494 = _mm512_sub_ps(tmp17428, tmp17429);
__m512 tmp17480 = _mm512_add_ps(tmp17424, tmp17425);
__m512 tmp17491 = _mm512_add_ps(tmp17472, tmp17473);
__m512 tmp17485 = _mm512_sub_ps(tmp17424, tmp17425);
__m512 tmp17496 = _mm512_sub_ps(tmp17472, tmp17473);
__m512 tmp17482 = _mm512_fmadd_ps(tmp17484, _mm512_set1_ps(2e+00f), tmp17483);
__m512 tmp17493 = _mm512_fmadd_ps(tmp17495, _mm512_set1_ps(2e+00f), tmp17494);
__m512 tmp17477 = _mm512_add_ps(tmp17478, tmp17479);
__m512 tmp17488 = _mm512_add_ps(tmp17489, tmp17490);
__m512 tmp17481 = _mm512_fmadd_ps(tmp17485, _mm512_set1_ps(1.6e+01f), tmp17482);
__m512 tmp17492 = _mm512_fmadd_ps(tmp17496, _mm512_set1_ps(1.6e+01f), tmp17493);
__m512 tmp17476 = _mm512_add_ps(tmp17477, tmp17419);
__m512 tmp17487 = _mm512_add_ps(tmp17488, tmp17427);
__m512 tmp17475 = _mm512_fmadd_ps(tmp17480, _mm512_set1_ps(3.2e+01f), tmp17476);
__m512 tmp17486 = _mm512_fmadd_ps(tmp17491, _mm512_set1_ps(3.2e+01f), tmp17487);
__m512 out2259 = tmp17475;
__m512 out2261 = tmp17486;
__m512 out2260 = tmp17481;
__m512 out2262 = tmp17492;
out2259 = _mm512_max_ps(_mm512_setzero_ps(), out2259);
out2261 = _mm512_max_ps(_mm512_setzero_ps(), out2261);
out2260 = _mm512_max_ps(_mm512_setzero_ps(), out2260);
out2262 = _mm512_max_ps(_mm512_setzero_ps(), out2262);
_mm512_mask_storeu_ps(datPtr28+0+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 4095, out2259);
_mm512_mask_storeu_ps(datPtr28+48+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 3, out2261);
_mm512_mask_storeu_ps(datPtr28+808+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 4032, out2261);
_mm512_mask_storeu_ps(datPtr28+56+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 4095, out2260);
_mm512_mask_storeu_ps(datPtr28+104+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 3, out2262);
_mm512_mask_storeu_ps(datPtr28+864+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 4032, out2262);
__m512 sf1281 = _mm512_loadu_ps(sfPtr13+256+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1282 = _mm512_loadu_ps(sfPtr13+384+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2442 = _mm512_shuffle_f32x4(sf1281, sf1282, 68);
__m512 in2443 = _mm512_shuffle_f32x4(sf1281, sf1282, 238);
__m512 sf1283 = _mm512_loadu_ps(sfPtr13+320+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1284 = _mm512_loadu_ps(sfPtr13+448+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2450 = _mm512_shuffle_f32x4(sf1283, sf1284, 68);
__m512 in2451 = _mm512_shuffle_f32x4(sf1283, sf1284, 238);
__m512 sf1285 = _mm512_loadu_ps(sfPtr13+147712+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1286 = _mm512_loadu_ps(sfPtr13+147840+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2444 = _mm512_shuffle_f32x4(sf1285, sf1286, 68);
__m512 in2445 = _mm512_shuffle_f32x4(sf1285, sf1286, 238);
__m512 sf1287 = _mm512_loadu_ps(sfPtr13+147776+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1288 = _mm512_loadu_ps(sfPtr13+147904+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2452 = _mm512_shuffle_f32x4(sf1287, sf1288, 68);
__m512 in2453 = _mm512_shuffle_f32x4(sf1287, sf1288, 238);
__m512 sf1289 = _mm512_loadu_ps(sfPtr13+295168+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1290 = _mm512_loadu_ps(sfPtr13+295296+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2446 = _mm512_shuffle_f32x4(sf1289, sf1290, 68);
__m512 in2447 = _mm512_shuffle_f32x4(sf1289, sf1290, 238);
__m512 sf1291 = _mm512_loadu_ps(sfPtr13+295232+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1292 = _mm512_loadu_ps(sfPtr13+295360+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2454 = _mm512_shuffle_f32x4(sf1291, sf1292, 68);
__m512 in2455 = _mm512_shuffle_f32x4(sf1291, sf1292, 238);
__m512 sf1293 = _mm512_loadu_ps(sfPtr13+442624+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1294 = _mm512_loadu_ps(sfPtr13+442752+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2448 = _mm512_shuffle_f32x4(sf1293, sf1294, 68);
__m512 in2449 = _mm512_shuffle_f32x4(sf1293, sf1294, 238);
__m512 sf1295 = _mm512_loadu_ps(sfPtr13+442688+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1296 = _mm512_loadu_ps(sfPtr13+442816+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2456 = _mm512_shuffle_f32x4(sf1295, sf1296, 68);
__m512 in2457 = _mm512_shuffle_f32x4(sf1295, sf1296, 238);
__m512 tmp17553 = _mm512_add_ps(in2443, in2444);
__m512 tmp17573 = _mm512_add_ps(in2451, in2452);
__m512 tmp17552 = _mm512_add_ps(in2445, in2446);
__m512 tmp17572 = _mm512_add_ps(in2453, in2454);
__m512 tmp17558 = _mm512_sub_ps(in2445, in2446);
__m512 tmp17578 = _mm512_sub_ps(in2453, in2454);
__m512 tmp17557 = _mm512_sub_ps(in2443, in2444);
__m512 tmp17577 = _mm512_sub_ps(in2451, in2452);
__m512 tmp17554 = _mm512_add_ps(in2447, in2448);
__m512 tmp17574 = _mm512_add_ps(in2455, in2456);
__m512 tmp17559 = _mm512_sub_ps(in2447, in2448);
__m512 tmp17579 = _mm512_sub_ps(in2455, in2456);
__m512 tmp17556 = _mm512_fmadd_ps(tmp17558, _mm512_set1_ps(2e+00f), tmp17557);
__m512 tmp17576 = _mm512_fmadd_ps(tmp17578, _mm512_set1_ps(2e+00f), tmp17577);
__m512 tmp17563 = _mm512_fmadd_ps(tmp17558, _mm512_set1_ps(8e+00f), tmp17557);
__m512 tmp17583 = _mm512_fmadd_ps(tmp17578, _mm512_set1_ps(8e+00f), tmp17577);
__m512 tmp17551 = _mm512_add_ps(tmp17552, tmp17553);
__m512 tmp17571 = _mm512_add_ps(tmp17572, tmp17573);
__m512 tmp17555 = _mm512_fmadd_ps(tmp17559, _mm512_set1_ps(1.6e+01f), tmp17556);
__m512 tmp17575 = _mm512_fmadd_ps(tmp17579, _mm512_set1_ps(1.6e+01f), tmp17576);
__m512 tmp17562 = _mm512_fmadd_ps(tmp17559, _mm512_set1_ps(4e+00f), tmp17563);
__m512 tmp17582 = _mm512_fmadd_ps(tmp17579, _mm512_set1_ps(4e+00f), tmp17583);
__m512 tmp17568 = _mm512_add_ps(tmp17559, tmp17557);
__m512 tmp17588 = _mm512_add_ps(tmp17579, tmp17577);
__m512 tmp17561 = _mm512_fmadd_ps(tmp17552, _mm512_set1_ps(4e+00f), tmp17553);
__m512 tmp17581 = _mm512_fmadd_ps(tmp17572, _mm512_set1_ps(4e+00f), tmp17573);
__m512 tmp17565 = _mm512_fmadd_ps(tmp17552, _mm512_set1_ps(1.6e+01f), tmp17553);
__m512 tmp17585 = _mm512_fmadd_ps(tmp17572, _mm512_set1_ps(1.6e+01f), tmp17573);
__m512 tmp17550 = _mm512_add_ps(tmp17551, in2442);
__m512 tmp17570 = _mm512_add_ps(tmp17571, in2450);
__m512 tmp17567 = _mm512_add_ps(tmp17568, in2449);
__m512 tmp17587 = _mm512_add_ps(tmp17588, in2457);
__m512 tmp17549 = _mm512_fmadd_ps(tmp17554, _mm512_set1_ps(3.2e+01f), tmp17550);
__m512 tmp17569 = _mm512_fmadd_ps(tmp17574, _mm512_set1_ps(3.2e+01f), tmp17570);
__m512 tmp17560 = _mm512_fmadd_ps(tmp17554, _mm512_set1_ps(8e+00f), tmp17561);
__m512 tmp17580 = _mm512_fmadd_ps(tmp17574, _mm512_set1_ps(8e+00f), tmp17581);
__m512 tmp17566 = _mm512_fmadd_ps(tmp17558, _mm512_set1_ps(3.2e+01f), tmp17567);
__m512 tmp17586 = _mm512_fmadd_ps(tmp17578, _mm512_set1_ps(3.2e+01f), tmp17587);
__m512 tmp17564 = _mm512_fmadd_ps(tmp17554, _mm512_set1_ps(2e+00f), tmp17565);
__m512 tmp17584 = _mm512_fmadd_ps(tmp17574, _mm512_set1_ps(2e+00f), tmp17585);
__m512 tmp17537 = tmp17549;
__m512 tmp17543 = tmp17569;
__m512 tmp17538 = tmp17555;
__m512 tmp17544 = tmp17575;
__m512 tmp17539 = tmp17560;
__m512 tmp17545 = tmp17580;
__m512 tmp17540 = tmp17562;
__m512 tmp17546 = tmp17582;
__m512 tmp17541 = tmp17564;
__m512 tmp17547 = tmp17584;
__m512 tmp17542 = tmp17566;
__m512 tmp17548 = tmp17586;
__m512 tmp17615 = _mm512_unpacklo_ps(tmp17537, tmp17538);
__m512 tmp17616 = _mm512_unpackhi_ps(tmp17537, tmp17538);
__m512 tmp17617 = _mm512_unpacklo_ps(tmp17539, tmp17540);
__m512 tmp17618 = _mm512_unpackhi_ps(tmp17539, tmp17540);
__m512 tmp17619 = _mm512_unpacklo_ps(tmp17541, tmp17542);
__m512 tmp17620 = _mm512_unpackhi_ps(tmp17541, tmp17542);
__m512 tmp17621 = _mm512_unpacklo_ps(tmp17543, tmp17544);
__m512 tmp17622 = _mm512_unpackhi_ps(tmp17543, tmp17544);
__m512 tmp17623 = _mm512_unpacklo_ps(tmp17545, tmp17546);
__m512 tmp17624 = _mm512_unpackhi_ps(tmp17545, tmp17546);
__m512 tmp17625 = _mm512_unpacklo_ps(tmp17547, tmp17548);
__m512 tmp17626 = _mm512_unpackhi_ps(tmp17547, tmp17548);
__m512 tmp17627 = _mm512_shuffle_ps(tmp17615, tmp17617, 68);
__m512 tmp17628 = _mm512_shuffle_ps(tmp17615, tmp17617, 238);
__m512 tmp17629 = _mm512_shuffle_ps(tmp17616, tmp17618, 68);
__m512 tmp17630 = _mm512_shuffle_ps(tmp17616, tmp17618, 238);
__m512 tmp17631 = _mm512_shuffle_ps(tmp17619, tmp17621, 68);
__m512 tmp17632 = _mm512_shuffle_ps(tmp17619, tmp17621, 238);
__m512 tmp17633 = _mm512_shuffle_ps(tmp17620, tmp17622, 68);
__m512 tmp17634 = _mm512_shuffle_ps(tmp17620, tmp17622, 238);
__m512 tmp17635 = _mm512_shuffle_ps(tmp17623, tmp17625, 68);
__m512 tmp17636 = _mm512_shuffle_ps(tmp17623, tmp17625, 238);
__m512 tmp17637 = _mm512_shuffle_ps(tmp17624, tmp17626, 68);
__m512 tmp17638 = _mm512_shuffle_ps(tmp17624, tmp17626, 238);
__m512 tmp17639 = _mm512_shuffle_f32x4(tmp17627, tmp17631, 136);
__m512 tmp17640 = _mm512_shuffle_f32x4(tmp17627, tmp17631, 221);
__m512 tmp17641 = _mm512_shuffle_f32x4(tmp17628, tmp17632, 136);
__m512 tmp17642 = _mm512_shuffle_f32x4(tmp17628, tmp17632, 221);
__m512 tmp17643 = _mm512_shuffle_f32x4(tmp17629, tmp17633, 136);
__m512 tmp17644 = _mm512_shuffle_f32x4(tmp17629, tmp17633, 221);
__m512 tmp17645 = _mm512_shuffle_f32x4(tmp17630, tmp17634, 136);
__m512 tmp17646 = _mm512_shuffle_f32x4(tmp17630, tmp17634, 221);
__m512 tmp17647 = _mm512_shuffle_f32x4(tmp17635, tmp17635, 136);
__m512 tmp17648 = _mm512_shuffle_f32x4(tmp17635, tmp17635, 221);
__m512 tmp17649 = _mm512_shuffle_f32x4(tmp17636, tmp17636, 136);
__m512 tmp17650 = _mm512_shuffle_f32x4(tmp17636, tmp17636, 221);
__m512 tmp17651 = _mm512_shuffle_f32x4(tmp17637, tmp17637, 136);
__m512 tmp17652 = _mm512_shuffle_f32x4(tmp17637, tmp17637, 221);
__m512 tmp17653 = _mm512_shuffle_f32x4(tmp17638, tmp17638, 136);
__m512 tmp17654 = _mm512_shuffle_f32x4(tmp17638, tmp17638, 221);
tmp17537 = _mm512_shuffle_f32x4(tmp17639, tmp17647, 136);
tmp17545 = _mm512_shuffle_f32x4(tmp17639, tmp17647, 221);
tmp17538 = _mm512_shuffle_f32x4(tmp17641, tmp17649, 136);
tmp17546 = _mm512_shuffle_f32x4(tmp17641, tmp17649, 221);
tmp17539 = _mm512_shuffle_f32x4(tmp17643, tmp17651, 136);
tmp17547 = _mm512_shuffle_f32x4(tmp17643, tmp17651, 221);
tmp17540 = _mm512_shuffle_f32x4(tmp17645, tmp17653, 136);
tmp17548 = _mm512_shuffle_f32x4(tmp17645, tmp17653, 221);
tmp17541 = _mm512_shuffle_f32x4(tmp17640, tmp17648, 136);
__m512 tmp17589 = _mm512_shuffle_f32x4(tmp17640, tmp17648, 221);
tmp17542 = _mm512_shuffle_f32x4(tmp17642, tmp17650, 136);
__m512 tmp17590 = _mm512_shuffle_f32x4(tmp17642, tmp17650, 221);
tmp17543 = _mm512_shuffle_f32x4(tmp17644, tmp17652, 136);
__m512 tmp17591 = _mm512_shuffle_f32x4(tmp17644, tmp17652, 221);
tmp17544 = _mm512_shuffle_f32x4(tmp17646, tmp17654, 136);
__m512 tmp17592 = _mm512_shuffle_f32x4(tmp17646, tmp17654, 221);
(void)tmp17544;
(void)tmp17592;
__m512 tmp17597 = _mm512_add_ps(tmp17538, tmp17539);
__m512 tmp17608 = _mm512_add_ps(tmp17546, tmp17547);
__m512 tmp17596 = _mm512_add_ps(tmp17540, tmp17541);
__m512 tmp17607 = _mm512_add_ps(tmp17548, tmp17589);
__m512 tmp17602 = _mm512_sub_ps(tmp17540, tmp17541);
__m512 tmp17613 = _mm512_sub_ps(tmp17548, tmp17589);
__m512 tmp17601 = _mm512_sub_ps(tmp17538, tmp17539);
__m512 tmp17612 = _mm512_sub_ps(tmp17546, tmp17547);
__m512 tmp17598 = _mm512_add_ps(tmp17542, tmp17543);
__m512 tmp17609 = _mm512_add_ps(tmp17590, tmp17591);
__m512 tmp17603 = _mm512_sub_ps(tmp17542, tmp17543);
__m512 tmp17614 = _mm512_sub_ps(tmp17590, tmp17591);
__m512 tmp17600 = _mm512_fmadd_ps(tmp17602, _mm512_set1_ps(2e+00f), tmp17601);
__m512 tmp17611 = _mm512_fmadd_ps(tmp17613, _mm512_set1_ps(2e+00f), tmp17612);
__m512 tmp17595 = _mm512_add_ps(tmp17596, tmp17597);
__m512 tmp17606 = _mm512_add_ps(tmp17607, tmp17608);
__m512 tmp17599 = _mm512_fmadd_ps(tmp17603, _mm512_set1_ps(1.6e+01f), tmp17600);
__m512 tmp17610 = _mm512_fmadd_ps(tmp17614, _mm512_set1_ps(1.6e+01f), tmp17611);
__m512 tmp17594 = _mm512_add_ps(tmp17595, tmp17537);
__m512 tmp17605 = _mm512_add_ps(tmp17606, tmp17545);
__m512 tmp17593 = _mm512_fmadd_ps(tmp17598, _mm512_set1_ps(3.2e+01f), tmp17594);
__m512 tmp17604 = _mm512_fmadd_ps(tmp17609, _mm512_set1_ps(3.2e+01f), tmp17605);
__m512 out2263 = tmp17593;
__m512 out2265 = tmp17604;
__m512 out2264 = tmp17599;
__m512 out2266 = tmp17610;
out2263 = _mm512_max_ps(_mm512_setzero_ps(), out2263);
out2265 = _mm512_max_ps(_mm512_setzero_ps(), out2265);
out2264 = _mm512_max_ps(_mm512_setzero_ps(), out2264);
out2266 = _mm512_max_ps(_mm512_setzero_ps(), out2266);
_mm512_mask_storeu_ps(datPtr28+856+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 255, out2263);
_mm512_mask_storeu_ps(datPtr28+1664+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 4095, out2265);
_mm512_mask_storeu_ps(datPtr28+912+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 255, out2264);
_mm512_mask_storeu_ps(datPtr28+1720+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 4095, out2266);
__m512 sf1297 = _mm512_loadu_ps(sfPtr13+512+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1298 = _mm512_loadu_ps(sfPtr13+576+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2458 = _mm512_shuffle_f32x4(sf1298, sf1297, 68);
__m512 in2459 = _mm512_shuffle_f32x4(sf1298, sf1297, 238);
__m512 sf1299 = _mm512_loadu_ps(sfPtr13+640+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1300 = _mm512_loadu_ps(sfPtr13+704+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2466 = _mm512_shuffle_f32x4(sf1299, sf1300, 68);
__m512 in2467 = _mm512_shuffle_f32x4(sf1299, sf1300, 238);
__m512 sf1301 = _mm512_loadu_ps(sfPtr13+147968+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1302 = _mm512_loadu_ps(sfPtr13+148032+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2460 = _mm512_shuffle_f32x4(sf1302, sf1301, 68);
__m512 in2461 = _mm512_shuffle_f32x4(sf1302, sf1301, 238);
__m512 sf1303 = _mm512_loadu_ps(sfPtr13+148096+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1304 = _mm512_loadu_ps(sfPtr13+148160+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2468 = _mm512_shuffle_f32x4(sf1303, sf1304, 68);
__m512 in2469 = _mm512_shuffle_f32x4(sf1303, sf1304, 238);
__m512 sf1305 = _mm512_loadu_ps(sfPtr13+295424+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1306 = _mm512_loadu_ps(sfPtr13+295488+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2462 = _mm512_shuffle_f32x4(sf1306, sf1305, 68);
__m512 in2463 = _mm512_shuffle_f32x4(sf1306, sf1305, 238);
__m512 sf1307 = _mm512_loadu_ps(sfPtr13+295552+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1308 = _mm512_loadu_ps(sfPtr13+295616+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2470 = _mm512_shuffle_f32x4(sf1307, sf1308, 68);
__m512 in2471 = _mm512_shuffle_f32x4(sf1307, sf1308, 238);
__m512 sf1309 = _mm512_loadu_ps(sfPtr13+442880+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1310 = _mm512_loadu_ps(sfPtr13+442944+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2464 = _mm512_shuffle_f32x4(sf1310, sf1309, 68);
__m512 in2465 = _mm512_shuffle_f32x4(sf1310, sf1309, 238);
__m512 sf1311 = _mm512_loadu_ps(sfPtr13+443008+589824*i55+98304*j47+768*k152+768*l65);
__m512 sf1312 = _mm512_loadu_ps(sfPtr13+443072+589824*i55+98304*j47+768*k152+768*l65);
__m512 in2472 = _mm512_shuffle_f32x4(sf1311, sf1312, 68);
__m512 in2473 = _mm512_shuffle_f32x4(sf1311, sf1312, 238);
__m512 tmp17671 = _mm512_add_ps(in2459, in2460);
__m512 tmp17691 = _mm512_add_ps(in2467, in2468);
__m512 tmp17670 = _mm512_add_ps(in2461, in2462);
__m512 tmp17690 = _mm512_add_ps(in2469, in2470);
__m512 tmp17676 = _mm512_sub_ps(in2461, in2462);
__m512 tmp17696 = _mm512_sub_ps(in2469, in2470);
__m512 tmp17675 = _mm512_sub_ps(in2459, in2460);
__m512 tmp17695 = _mm512_sub_ps(in2467, in2468);
__m512 tmp17672 = _mm512_add_ps(in2463, in2464);
__m512 tmp17692 = _mm512_add_ps(in2471, in2472);
__m512 tmp17677 = _mm512_sub_ps(in2463, in2464);
__m512 tmp17697 = _mm512_sub_ps(in2471, in2472);
__m512 tmp17674 = _mm512_fmadd_ps(tmp17676, _mm512_set1_ps(2e+00f), tmp17675);
__m512 tmp17694 = _mm512_fmadd_ps(tmp17696, _mm512_set1_ps(2e+00f), tmp17695);
__m512 tmp17681 = _mm512_fmadd_ps(tmp17676, _mm512_set1_ps(8e+00f), tmp17675);
__m512 tmp17701 = _mm512_fmadd_ps(tmp17696, _mm512_set1_ps(8e+00f), tmp17695);
__m512 tmp17669 = _mm512_add_ps(tmp17670, tmp17671);
__m512 tmp17689 = _mm512_add_ps(tmp17690, tmp17691);
__m512 tmp17673 = _mm512_fmadd_ps(tmp17677, _mm512_set1_ps(1.6e+01f), tmp17674);
__m512 tmp17693 = _mm512_fmadd_ps(tmp17697, _mm512_set1_ps(1.6e+01f), tmp17694);
__m512 tmp17680 = _mm512_fmadd_ps(tmp17677, _mm512_set1_ps(4e+00f), tmp17681);
__m512 tmp17700 = _mm512_fmadd_ps(tmp17697, _mm512_set1_ps(4e+00f), tmp17701);
__m512 tmp17686 = _mm512_add_ps(tmp17677, tmp17675);
__m512 tmp17706 = _mm512_add_ps(tmp17697, tmp17695);
__m512 tmp17679 = _mm512_fmadd_ps(tmp17670, _mm512_set1_ps(4e+00f), tmp17671);
__m512 tmp17699 = _mm512_fmadd_ps(tmp17690, _mm512_set1_ps(4e+00f), tmp17691);
__m512 tmp17683 = _mm512_fmadd_ps(tmp17670, _mm512_set1_ps(1.6e+01f), tmp17671);
__m512 tmp17703 = _mm512_fmadd_ps(tmp17690, _mm512_set1_ps(1.6e+01f), tmp17691);
__m512 tmp17668 = _mm512_add_ps(tmp17669, in2458);
__m512 tmp17688 = _mm512_add_ps(tmp17689, in2466);
__m512 tmp17685 = _mm512_add_ps(tmp17686, in2465);
__m512 tmp17705 = _mm512_add_ps(tmp17706, in2473);
__m512 tmp17667 = _mm512_fmadd_ps(tmp17672, _mm512_set1_ps(3.2e+01f), tmp17668);
__m512 tmp17687 = _mm512_fmadd_ps(tmp17692, _mm512_set1_ps(3.2e+01f), tmp17688);
__m512 tmp17678 = _mm512_fmadd_ps(tmp17672, _mm512_set1_ps(8e+00f), tmp17679);
__m512 tmp17698 = _mm512_fmadd_ps(tmp17692, _mm512_set1_ps(8e+00f), tmp17699);
__m512 tmp17684 = _mm512_fmadd_ps(tmp17676, _mm512_set1_ps(3.2e+01f), tmp17685);
__m512 tmp17704 = _mm512_fmadd_ps(tmp17696, _mm512_set1_ps(3.2e+01f), tmp17705);
__m512 tmp17682 = _mm512_fmadd_ps(tmp17672, _mm512_set1_ps(2e+00f), tmp17683);
__m512 tmp17702 = _mm512_fmadd_ps(tmp17692, _mm512_set1_ps(2e+00f), tmp17703);
__m512 tmp17655 = tmp17667;
__m512 tmp17661 = tmp17687;
__m512 tmp17656 = tmp17673;
__m512 tmp17662 = tmp17693;
__m512 tmp17657 = tmp17678;
__m512 tmp17663 = tmp17698;
__m512 tmp17658 = tmp17680;
__m512 tmp17664 = tmp17700;
__m512 tmp17659 = tmp17682;
__m512 tmp17665 = tmp17702;
__m512 tmp17660 = tmp17684;
__m512 tmp17666 = tmp17704;
__m512 tmp17733 = _mm512_unpacklo_ps(tmp17655, tmp17656);
__m512 tmp17734 = _mm512_unpackhi_ps(tmp17655, tmp17656);
__m512 tmp17735 = _mm512_unpacklo_ps(tmp17657, tmp17658);
__m512 tmp17736 = _mm512_unpackhi_ps(tmp17657, tmp17658);
__m512 tmp17737 = _mm512_unpacklo_ps(tmp17659, tmp17660);
__m512 tmp17738 = _mm512_unpackhi_ps(tmp17659, tmp17660);
__m512 tmp17739 = _mm512_unpacklo_ps(tmp17661, tmp17662);
__m512 tmp17740 = _mm512_unpackhi_ps(tmp17661, tmp17662);
__m512 tmp17741 = _mm512_unpacklo_ps(tmp17663, tmp17664);
__m512 tmp17742 = _mm512_unpackhi_ps(tmp17663, tmp17664);
__m512 tmp17743 = _mm512_unpacklo_ps(tmp17665, tmp17666);
__m512 tmp17744 = _mm512_unpackhi_ps(tmp17665, tmp17666);
__m512 tmp17745 = _mm512_shuffle_ps(tmp17733, tmp17735, 68);
__m512 tmp17746 = _mm512_shuffle_ps(tmp17733, tmp17735, 238);
__m512 tmp17747 = _mm512_shuffle_ps(tmp17734, tmp17736, 68);
__m512 tmp17748 = _mm512_shuffle_ps(tmp17734, tmp17736, 238);
__m512 tmp17749 = _mm512_shuffle_ps(tmp17737, tmp17739, 68);
__m512 tmp17750 = _mm512_shuffle_ps(tmp17737, tmp17739, 238);
__m512 tmp17751 = _mm512_shuffle_ps(tmp17738, tmp17740, 68);
__m512 tmp17752 = _mm512_shuffle_ps(tmp17738, tmp17740, 238);
__m512 tmp17753 = _mm512_shuffle_ps(tmp17741, tmp17743, 68);
__m512 tmp17754 = _mm512_shuffle_ps(tmp17741, tmp17743, 238);
__m512 tmp17755 = _mm512_shuffle_ps(tmp17742, tmp17744, 68);
__m512 tmp17756 = _mm512_shuffle_ps(tmp17742, tmp17744, 238);
__m512 tmp17757 = _mm512_shuffle_f32x4(tmp17745, tmp17749, 136);
__m512 tmp17758 = _mm512_shuffle_f32x4(tmp17745, tmp17749, 221);
__m512 tmp17759 = _mm512_shuffle_f32x4(tmp17746, tmp17750, 136);
__m512 tmp17760 = _mm512_shuffle_f32x4(tmp17746, tmp17750, 221);
__m512 tmp17761 = _mm512_shuffle_f32x4(tmp17747, tmp17751, 136);
__m512 tmp17762 = _mm512_shuffle_f32x4(tmp17747, tmp17751, 221);
__m512 tmp17763 = _mm512_shuffle_f32x4(tmp17748, tmp17752, 136);
__m512 tmp17764 = _mm512_shuffle_f32x4(tmp17748, tmp17752, 221);
__m512 tmp17765 = _mm512_shuffle_f32x4(tmp17753, tmp17753, 136);
__m512 tmp17766 = _mm512_shuffle_f32x4(tmp17753, tmp17753, 221);
__m512 tmp17767 = _mm512_shuffle_f32x4(tmp17754, tmp17754, 136);
__m512 tmp17768 = _mm512_shuffle_f32x4(tmp17754, tmp17754, 221);
__m512 tmp17769 = _mm512_shuffle_f32x4(tmp17755, tmp17755, 136);
__m512 tmp17770 = _mm512_shuffle_f32x4(tmp17755, tmp17755, 221);
__m512 tmp17771 = _mm512_shuffle_f32x4(tmp17756, tmp17756, 136);
__m512 tmp17772 = _mm512_shuffle_f32x4(tmp17756, tmp17756, 221);
tmp17655 = _mm512_shuffle_f32x4(tmp17757, tmp17765, 136);
tmp17663 = _mm512_shuffle_f32x4(tmp17757, tmp17765, 221);
tmp17656 = _mm512_shuffle_f32x4(tmp17759, tmp17767, 136);
tmp17664 = _mm512_shuffle_f32x4(tmp17759, tmp17767, 221);
tmp17657 = _mm512_shuffle_f32x4(tmp17761, tmp17769, 136);
tmp17665 = _mm512_shuffle_f32x4(tmp17761, tmp17769, 221);
tmp17658 = _mm512_shuffle_f32x4(tmp17763, tmp17771, 136);
tmp17666 = _mm512_shuffle_f32x4(tmp17763, tmp17771, 221);
tmp17659 = _mm512_shuffle_f32x4(tmp17758, tmp17766, 136);
__m512 tmp17707 = _mm512_shuffle_f32x4(tmp17758, tmp17766, 221);
tmp17660 = _mm512_shuffle_f32x4(tmp17760, tmp17768, 136);
__m512 tmp17708 = _mm512_shuffle_f32x4(tmp17760, tmp17768, 221);
tmp17661 = _mm512_shuffle_f32x4(tmp17762, tmp17770, 136);
__m512 tmp17709 = _mm512_shuffle_f32x4(tmp17762, tmp17770, 221);
tmp17662 = _mm512_shuffle_f32x4(tmp17764, tmp17772, 136);
__m512 tmp17710 = _mm512_shuffle_f32x4(tmp17764, tmp17772, 221);
(void)tmp17662;
(void)tmp17710;
__m512 tmp17715 = _mm512_add_ps(tmp17656, tmp17657);
__m512 tmp17726 = _mm512_add_ps(tmp17664, tmp17665);
__m512 tmp17714 = _mm512_add_ps(tmp17658, tmp17659);
__m512 tmp17725 = _mm512_add_ps(tmp17666, tmp17707);
__m512 tmp17720 = _mm512_sub_ps(tmp17658, tmp17659);
__m512 tmp17731 = _mm512_sub_ps(tmp17666, tmp17707);
__m512 tmp17719 = _mm512_sub_ps(tmp17656, tmp17657);
__m512 tmp17730 = _mm512_sub_ps(tmp17664, tmp17665);
__m512 tmp17716 = _mm512_add_ps(tmp17660, tmp17661);
__m512 tmp17727 = _mm512_add_ps(tmp17708, tmp17709);
__m512 tmp17721 = _mm512_sub_ps(tmp17660, tmp17661);
__m512 tmp17732 = _mm512_sub_ps(tmp17708, tmp17709);
__m512 tmp17718 = _mm512_fmadd_ps(tmp17720, _mm512_set1_ps(2e+00f), tmp17719);
__m512 tmp17729 = _mm512_fmadd_ps(tmp17731, _mm512_set1_ps(2e+00f), tmp17730);
__m512 tmp17713 = _mm512_add_ps(tmp17714, tmp17715);
__m512 tmp17724 = _mm512_add_ps(tmp17725, tmp17726);
__m512 tmp17717 = _mm512_fmadd_ps(tmp17721, _mm512_set1_ps(1.6e+01f), tmp17718);
__m512 tmp17728 = _mm512_fmadd_ps(tmp17732, _mm512_set1_ps(1.6e+01f), tmp17729);
__m512 tmp17712 = _mm512_add_ps(tmp17713, tmp17655);
__m512 tmp17723 = _mm512_add_ps(tmp17724, tmp17663);
__m512 tmp17711 = _mm512_fmadd_ps(tmp17716, _mm512_set1_ps(3.2e+01f), tmp17712);
__m512 tmp17722 = _mm512_fmadd_ps(tmp17727, _mm512_set1_ps(3.2e+01f), tmp17723);
__m512 out2269 = tmp17711;
__m512 out2267 = tmp17722;
__m512 out2270 = tmp17717;
__m512 out2268 = tmp17728;
out2269 = _mm512_max_ps(_mm512_setzero_ps(), out2269);
out2267 = _mm512_max_ps(_mm512_setzero_ps(), out2267);
out2270 = _mm512_max_ps(_mm512_setzero_ps(), out2270);
out2268 = _mm512_max_ps(_mm512_setzero_ps(), out2268);
_mm512_mask_storeu_ps(datPtr28+2496+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 4095, out2269);
_mm512_mask_storeu_ps(datPtr28+1712+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 3, out2267);
_mm512_mask_storeu_ps(datPtr28+2520+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 192, out2267);
_mm512_mask_storeu_ps(datPtr28+2552+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 4095, out2270);
_mm512_mask_storeu_ps(datPtr28+1768+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 3, out2268);
_mm512_mask_storeu_ps(datPtr28+2576+212992*i55+56*toH49+4*toW49+3328*k152+3328*l65, 192, out2268);
}
if (k152 >= kk50) return;
}
if (j47 >= last12) return;
++j47;
}

// Builds a threader task for ResNet50ThreeConsumeSums5Callee1 and submits it
// to ResNet50ThreaderDo1 on the supplied team.
//
// team59:    threader team, forwarded unchanged to ResNet50ThreaderDo1.
// tensors91: table of tensor pointers, stored in the task's any1 field so
//            that the callee can retrieve it.
//
// The task is declared with nd1 = 3 and hull1 = {2, 2, 1}.
// NOTE(review): presumably hull1 gives the extent of each of the nd1
// dimensions of the work grid -- confirm against ResNet50ThreaderDo1.
static void ResNet50ThreeConsumeSums5(ResNet50ThreaderTeam1* team59, char** tensors91) {
ResNet50ThreaderTask1 job;
// Dimension count and per-dimension values.
job.nd1 = 3;
job.hull1[0] = 2;
job.hull1[1] = 2;
job.hull1[2] = 1;
// Payload and entry point for the task.
job.any1 = tensors91;
job.callee1 = ResNet50ThreeConsumeSums5Callee1;
ResNet50ThreaderDo1(team59, &job);
}

// Worker for ResNet50ThreeArrangeFilts6: rearranges one group of four filters.
//
// task108->any1 is an array of buffers:
//   [0] float32 filter weights (read)
//   [1] float32 biases (read)
//   [2] float32 batch-norm parameters, read as interleaved (mul, add) pairs
//   [3] output: transformed biases at offset 0, transformed filters from
//       byte offset 1024
// pt59[0] selects the group index j54; each group covers four consecutive
// filters (4*j54 .. 4*j54+3).
//
// For each of 256 nine-float filter slices per filter, the code scales the
// weights by the batch-norm multiplier, runs a two-pass linear transform on
// them, converts the result to half precision and scatters it into the output
// buffer. It then writes bias*mul+add for the four filters.
// NOTE(review): the transform constants (-2/9, 1/90, 1/180 and their
// pairwise products) look like a Winograd-style 3x3 filter transform --
// confirm against the generator.
static void ResNet50ThreeArrangeFilts6Callee1(ResNet50ThreaderTask1* task108, int64_t* pt59) {
char** tensors106 = task108->any1;
ptrdiff_t b68 = pt59[0];
// g35 and e30 are fixed at zero here, so every term multiplied by them
// (and by i62 below) vanishes.
ptrdiff_t g35 = 0;
ptrdiff_t e30 = 0;
// Both output pointers alias tensors106[3]: biases first, filters after 1024 bytes.
char*restrict bfPtr14 = tensors106[3]+1024*e30;
char*restrict wfPtr14 = tensors106[3]+1024+12976128*e30;
char*restrict wtPtr19 = tensors106[0]+14256*e30;
char*restrict biasPtr19 = tensors106[1];
char*restrict bnPtr19 = tensors106[2];
ptrdiff_t i62 = 1*g35;
ptrdiff_t j54 = 1*b68;
// jj51 equals the starting j54, so the check at the bottom of the loop
// returns after the first iteration: one call handles exactly one j54.
ptrdiff_t jj51 = j54+0;
if (j54 < 64) {
for (; j54 != 64; ++j54) {
ptrdiff_t k165 = 0+1*j54;
ptrdiff_t cut27 = 0;
// Broadcast the batch-norm multiplier of each of the four filters in this
// group (float index 2*channel, i.e. the first element of each pair).
__m512 postMul62 = _mm512_set1_ps(((float*)bnPtr19+(ptrdiff_t)2*(0+256*i62+4*j54))[0]);
__m512 postMul63 = _mm512_set1_ps(((float*)bnPtr19+(ptrdiff_t)2*(1+256*i62+4*j54))[0]);
__m512 postMul64 = _mm512_set1_ps(((float*)bnPtr19+(ptrdiff_t)2*(2+256*i62+4*j54))[0]);
__m512 postMul65 = _mm512_set1_ps(((float*)bnPtr19+(ptrdiff_t)2*(3+256*i62+4*j54))[0]);
// One iteration per 36-byte (nine-float) slice of each filter.
// NOTE(review): presumably s54 is the input-channel index and the nine
// floats are one 3x3 kernel -- confirm.
ptrdiff_t s54 = 0;
for (; s54 != 256; ++s54) {
// Mask 511 loads the low nine lanes; the four loads are 9216 bytes apart,
// one per filter of the group.
__m512 wt689 = _mm512_maskz_loadu_ps(511, wtPtr19+0+2359296*i62+36864*j54+36*s54);
__m512 wt690 = _mm512_maskz_loadu_ps(511, wtPtr19+9216+2359296*i62+36864*j54+36*s54);
__m512 wt691 = _mm512_maskz_loadu_ps(511, wtPtr19+18432+2359296*i62+36864*j54+36*s54);
__m512 wt692 = _mm512_maskz_loadu_ps(511, wtPtr19+27648+2359296*i62+36864*j54+36*s54);
// Fold the batch-norm scale into the weights.
wt689 = _mm512_mul_ps(wt689, postMul62);
wt690 = _mm512_mul_ps(wt690, postMul63);
wt691 = _mm512_mul_ps(wt691, postMul64);
wt692 = _mm512_mul_ps(wt692, postMul65);
// Two rounds of two-source permutes interleave lanes from the four filters
// into the three vectors in2474, in2475, in2476.
__m512i pm235 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm236 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp18061 = _mm512_permutex2var_ps(wt689, pm235, wt691);
__m512 tmp18062 = _mm512_permutex2var_ps(wt690, pm235, wt692);
__m512 tmp18063 = _mm512_permutex2var_ps(wt689, pm236, wt691);
__m512 tmp18064 = _mm512_permutex2var_ps(wt690, pm236, wt692);
__m512 in2474 = _mm512_permutex2var_ps(tmp18061, pm235, tmp18062);
__m512 in2475 = _mm512_permutex2var_ps(tmp18061, pm236, tmp18062);
__m512 in2476 = _mm512_permutex2var_ps(tmp18063, pm235, tmp18064);
// First transform pass: expands the three inputs into eight linear
// combinations (in2474, tmp18068, tmp18066, tmp18069, tmp18067, tmp18065,
// tmp18070, in2476) using coefficients 1, 2 and 4. Several temporaries are
// overwritten in place, so statement order matters.
__m512 tmp18065 = _mm512_fmadd_ps(in2474, _mm512_set1_ps(4e+00f), in2476);
__m512 tmp18066 = _mm512_add_ps(in2474, in2476);
__m512 tmp18067 = _mm512_fmadd_ps(in2476, _mm512_set1_ps(4e+00f), in2474);
__m512 tmp18068 = _mm512_add_ps(in2475, tmp18066);
__m512 tmp18069 = _mm512_fmadd_ps(in2475, _mm512_set1_ps(2e+00f), tmp18067);
tmp18067 = _mm512_fnmadd_ps(in2475, _mm512_set1_ps(2e+00f), tmp18067);
__m512 tmp18070 = _mm512_fnmadd_ps(in2475, _mm512_set1_ps(2e+00f), tmp18065);
tmp18065 = _mm512_fmadd_ps(in2475, _mm512_set1_ps(2e+00f), tmp18065);
tmp18066 = _mm512_sub_ps(tmp18066, in2475);
// Lane shuffle network (unpack -> shuffle_ps -> shuffle_f32x4) that
// regroups the eight vectors before the second pass.
__m512 tmp18087 = _mm512_unpacklo_ps(in2474, tmp18068);
__m512 tmp18088 = _mm512_unpackhi_ps(in2474, tmp18068);
__m512 tmp18089 = _mm512_unpacklo_ps(tmp18066, tmp18069);
__m512 tmp18090 = _mm512_unpackhi_ps(tmp18066, tmp18069);
__m512 tmp18091 = _mm512_unpacklo_ps(tmp18067, tmp18065);
__m512 tmp18092 = _mm512_unpackhi_ps(tmp18067, tmp18065);
__m512 tmp18093 = _mm512_unpacklo_ps(tmp18070, in2476);
__m512 tmp18094 = _mm512_unpackhi_ps(tmp18070, in2476);
__m512 tmp18095 = _mm512_shuffle_ps(tmp18087, tmp18089, 68);
__m512 tmp18096 = _mm512_shuffle_ps(tmp18087, tmp18089, 238);
__m512 tmp18097 = _mm512_shuffle_ps(tmp18088, tmp18090, 68);
__m512 tmp18098 = _mm512_shuffle_ps(tmp18088, tmp18090, 238);
__m512 tmp18099 = _mm512_shuffle_ps(tmp18091, tmp18093, 68);
__m512 tmp18100 = _mm512_shuffle_ps(tmp18091, tmp18093, 238);
__m512 tmp18101 = _mm512_shuffle_ps(tmp18092, tmp18094, 68);
__m512 tmp18102 = _mm512_shuffle_ps(tmp18092, tmp18094, 238);
__m512 tmp18103 = _mm512_shuffle_f32x4(tmp18095, tmp18099, 136);
__m512 tmp18104 = _mm512_shuffle_f32x4(tmp18095, tmp18099, 221);
__m512 tmp18105 = _mm512_shuffle_f32x4(tmp18096, tmp18100, 136);
__m512 tmp18106 = _mm512_shuffle_f32x4(tmp18096, tmp18100, 221);
__m512 tmp18107 = _mm512_shuffle_f32x4(tmp18097, tmp18101, 136);
__m512 tmp18108 = _mm512_shuffle_f32x4(tmp18097, tmp18101, 221);
__m512 tmp18109 = _mm512_shuffle_f32x4(tmp18098, tmp18102, 136);
__m512 tmp18110 = _mm512_shuffle_f32x4(tmp18098, tmp18102, 221);
in2474 = _mm512_shuffle_f32x4(tmp18103, tmp18103, 136);
__m512 tmp18071 = _mm512_shuffle_f32x4(tmp18103, tmp18103, 221);
tmp18068 = _mm512_shuffle_f32x4(tmp18105, tmp18105, 136);
__m512 tmp18072 = _mm512_shuffle_f32x4(tmp18105, tmp18105, 221);
tmp18066 = _mm512_shuffle_f32x4(tmp18107, tmp18107, 136);
__m512 tmp18073 = _mm512_shuffle_f32x4(tmp18107, tmp18107, 221);
tmp18069 = _mm512_shuffle_f32x4(tmp18109, tmp18109, 136);
__m512 tmp18074 = _mm512_shuffle_f32x4(tmp18109, tmp18109, 221);
tmp18067 = _mm512_shuffle_f32x4(tmp18104, tmp18104, 136);
tmp18065 = _mm512_shuffle_f32x4(tmp18106, tmp18106, 136);
tmp18070 = _mm512_shuffle_f32x4(tmp18108, tmp18108, 136);
in2476 = _mm512_shuffle_f32x4(tmp18110, tmp18110, 136);
// Pack pairs of regrouped vectors together (low 256 bits of each source),
// leaving six vectors as input to the second pass.
in2474 = _mm512_shuffle_f32x4(in2474, tmp18069, 68);
tmp18068 = _mm512_shuffle_f32x4(tmp18068, tmp18067, 68);
tmp18066 = _mm512_shuffle_f32x4(tmp18066, tmp18065, 68);
tmp18070 = _mm512_shuffle_f32x4(tmp18070, tmp18072, 68);
in2476 = _mm512_shuffle_f32x4(in2476, tmp18073, 68);
tmp18071 = _mm512_shuffle_f32x4(tmp18071, tmp18074, 68);
// Second transform pass: the same 1/2/4 combinations as the first pass,
// applied to two independent triples, (in2474, tmp18068, tmp18066) and
// (tmp18070, in2476, tmp18071), with their statements interleaved.
__m512 tmp18075 = _mm512_fmadd_ps(in2474, _mm512_set1_ps(4e+00f), tmp18066);
__m512 tmp18081 = _mm512_fmadd_ps(tmp18070, _mm512_set1_ps(4e+00f), tmp18071);
__m512 tmp18076 = _mm512_add_ps(in2474, tmp18066);
__m512 tmp18082 = _mm512_add_ps(tmp18070, tmp18071);
__m512 tmp18077 = _mm512_fmadd_ps(tmp18066, _mm512_set1_ps(4e+00f), in2474);
__m512 tmp18083 = _mm512_fmadd_ps(tmp18071, _mm512_set1_ps(4e+00f), tmp18070);
__m512 tmp18078 = _mm512_add_ps(tmp18068, tmp18076);
__m512 tmp18084 = _mm512_add_ps(in2476, tmp18082);
__m512 tmp18079 = _mm512_fmadd_ps(tmp18068, _mm512_set1_ps(2e+00f), tmp18077);
__m512 tmp18085 = _mm512_fmadd_ps(in2476, _mm512_set1_ps(2e+00f), tmp18083);
tmp18077 = _mm512_fnmadd_ps(tmp18068, _mm512_set1_ps(2e+00f), tmp18077);
tmp18083 = _mm512_fnmadd_ps(in2476, _mm512_set1_ps(2e+00f), tmp18083);
__m512 tmp18080 = _mm512_fnmadd_ps(tmp18068, _mm512_set1_ps(2e+00f), tmp18075);
__m512 tmp18086 = _mm512_fnmadd_ps(in2476, _mm512_set1_ps(2e+00f), tmp18081);
tmp18075 = _mm512_fmadd_ps(tmp18068, _mm512_set1_ps(2e+00f), tmp18075);
tmp18081 = _mm512_fmadd_ps(in2476, _mm512_set1_ps(2e+00f), tmp18081);
tmp18076 = _mm512_sub_ps(tmp18076, tmp18068);
tmp18082 = _mm512_sub_ps(tmp18082, in2476);
// Per-lane normalisation: each of the sixteen results is multiplied by a
// constant vector whose pattern repeats every eight lanes.
in2474 = _mm512_mul_ps(in2474, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp18078 = _mm512_mul_ps(tmp18078, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp18076 = _mm512_mul_ps(tmp18076, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp18079 = _mm512_mul_ps(tmp18079, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp18077 = _mm512_mul_ps(tmp18077, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp18075 = _mm512_mul_ps(tmp18075, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp18080 = _mm512_mul_ps(tmp18080, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp18066 = _mm512_mul_ps(tmp18066, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp18070 = _mm512_mul_ps(tmp18070, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp18084 = _mm512_mul_ps(tmp18084, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp18082 = _mm512_mul_ps(tmp18082, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp18085 = _mm512_mul_ps(tmp18085, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp18083 = _mm512_mul_ps(tmp18083, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp18081 = _mm512_mul_ps(tmp18081, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp18086 = _mm512_mul_ps(tmp18086, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp18071 = _mm512_mul_ps(tmp18071, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
// Pair up the scaled vectors: immediate 68 joins the low 256-bit halves of
// the two sources, 238 joins their high halves.
__m512 out2271 = _mm512_shuffle_f32x4(in2474, tmp18078, 68);
__m512 out2275 = _mm512_shuffle_f32x4(in2474, tmp18078, 238);
__m512 out2272 = _mm512_shuffle_f32x4(tmp18076, tmp18079, 68);
__m512 out2276 = _mm512_shuffle_f32x4(tmp18076, tmp18079, 238);
__m512 out2273 = _mm512_shuffle_f32x4(tmp18077, tmp18075, 68);
__m512 out2277 = _mm512_shuffle_f32x4(tmp18077, tmp18075, 238);
__m512 out2274 = _mm512_shuffle_f32x4(tmp18080, tmp18066, 68);
__m512 out2278 = _mm512_shuffle_f32x4(tmp18080, tmp18066, 238);
__m512 out2279 = _mm512_shuffle_f32x4(tmp18070, tmp18084, 68);
__m512 out2283 = _mm512_shuffle_f32x4(tmp18070, tmp18084, 238);
__m512 out2280 = _mm512_shuffle_f32x4(tmp18082, tmp18085, 68);
__m512 out2284 = _mm512_shuffle_f32x4(tmp18082, tmp18085, 238);
__m512 out2281 = _mm512_shuffle_f32x4(tmp18083, tmp18081, 68);
__m512 out2285 = _mm512_shuffle_f32x4(tmp18083, tmp18081, 238);
__m512 out2282 = _mm512_shuffle_f32x4(tmp18086, tmp18071, 68);
__m512 out2286 = _mm512_shuffle_f32x4(tmp18086, tmp18071, 238);
// Byte offsets of the four 32-byte slots written per region. With cut27 == 0
// these evaluate to 0, 32, 64 and 96.
ptrdiff_t off21 = 32*cut27;
ptrdiff_t off22 = (size_t)(cut27+1)/4*32768+(size_t)(cut27+1)%4*32;
ptrdiff_t off23 = (size_t)(cut27+2)/4*32768+(size_t)(cut27+2)%4*32;
ptrdiff_t off24 = (size_t)(cut27+3)/4*32768+(size_t)(cut27+3)%4*32;
// Convert each vector of 16 floats to 16 half-precision values (round to
// nearest, exceptions suppressed); the result sits in the low 256 bits.
__m512i wf153 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2271, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf154 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2275, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf155 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2279, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf156 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2283, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf157 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2272, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf158 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2276, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf159 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2280, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf160 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2284, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf161 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2273, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf162 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2277, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf163 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2281, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf164 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2285, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf165 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2274, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf166 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2278, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf167 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2282, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf168 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2286, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
// Mask 255 on 32-bit lanes writes exactly the low 32 bytes (the 16 halves).
// Output is split into four regions 2097152 bytes apart; within a region the
// position is 32768*k165 (group) + 128*s54 (slice) + one of the four slots.
_mm512_mask_storeu_epi32(wfPtr14+0+8388608*i62+32768*k165+off21+128*s54, 255, wf153);
_mm512_mask_storeu_epi32(wfPtr14+0+8388608*i62+32768*k165+off22+128*s54, 255, wf154);
_mm512_mask_storeu_epi32(wfPtr14+0+8388608*i62+32768*k165+off23+128*s54, 255, wf155);
_mm512_mask_storeu_epi32(wfPtr14+0+8388608*i62+32768*k165+off24+128*s54, 255, wf156);
_mm512_mask_storeu_epi32(wfPtr14+2097152+8388608*i62+32768*k165+off21+128*s54, 255, wf157);
_mm512_mask_storeu_epi32(wfPtr14+2097152+8388608*i62+32768*k165+off22+128*s54, 255, wf158);
_mm512_mask_storeu_epi32(wfPtr14+2097152+8388608*i62+32768*k165+off23+128*s54, 255, wf159);
_mm512_mask_storeu_epi32(wfPtr14+2097152+8388608*i62+32768*k165+off24+128*s54, 255, wf160);
_mm512_mask_storeu_epi32(wfPtr14+4194304+8388608*i62+32768*k165+off21+128*s54, 255, wf161);
_mm512_mask_storeu_epi32(wfPtr14+4194304+8388608*i62+32768*k165+off22+128*s54, 255, wf162);
_mm512_mask_storeu_epi32(wfPtr14+4194304+8388608*i62+32768*k165+off23+128*s54, 255, wf163);
_mm512_mask_storeu_epi32(wfPtr14+4194304+8388608*i62+32768*k165+off24+128*s54, 255, wf164);
_mm512_mask_storeu_epi32(wfPtr14+6291456+8388608*i62+32768*k165+off21+128*s54, 255, wf165);
_mm512_mask_storeu_epi32(wfPtr14+6291456+8388608*i62+32768*k165+off22+128*s54, 255, wf166);
_mm512_mask_storeu_epi32(wfPtr14+6291456+8388608*i62+32768*k165+off23+128*s54, 255, wf167);
_mm512_mask_storeu_epi32(wfPtr14+6291456+8388608*i62+32768*k165+off24+128*s54, 255, wf168);
}
// Bias for the four filters of this group. e30 is always 0 in this function,
// so the branch below is always taken.
__m512 bias7 = _mm512_setzero_ps();
if (!e30) {
// Load four biases (mask 15) and the matching eight batch-norm floats
// (mask 255), then de-interleave: even lanes -> multiplier, odd lanes -> addend.
bias7 = _mm512_maskz_loadu_ps(15, biasPtr19-0+1024*i62+16*j54);
__m512i pmMul40 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd40 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas12 = _mm512_maskz_loadu_ps(255, bnPtr19+(ptrdiff_t)8*(0+256*i62+4*j54));
__m512 postMul66 = _mm512_permutexvar_ps(pmMul40, mas12);
__m512 postAdd40 = _mm512_permutexvar_ps(pmAdd40, mas12);
// bias = bias * mul + add
bias7 = _mm512_fmadd_ps(bias7, postMul66, postAdd40);
}
// Only the low four lanes are written (16 bytes per group).
_mm512_mask_storeu_ps(bfPtr14-0+1024*i62+16*j54, 15, bias7);
// Always true on the first pass (jj51 == starting j54): stop after one group.
if (j54 >= jj51) return;
}
}
}

// Launcher: describes a 3-dimensional task with extents {64, 1, 1} whose
// worker is ResNet50ThreeArrangeFilts6Callee1 and hands it to the threader
// team. `tensors105` is forwarded untouched through the task's `any1` slot.
static void ResNet50ThreeArrangeFilts6(ResNet50ThreaderTeam1* team66, char** tensors105) {
    // Extent of each task dimension, in hull order.
    static const int extents[3] = {64, 1, 1};
    ResNet50ThreaderTask1 job;
    int dim;
    job.callee1 = ResNet50ThreeArrangeFilts6Callee1;
    job.any1 = tensors105;
    job.nd1 = 3;
    for (dim = 0; dim < 3; ++dim) {
        job.hull1[dim] = extents[dim];
    }
    ResNet50ThreaderDo1(team66, &job);
}

static void ResNet50ThreeArrangeDats6Callee1(ResNet50ThreaderTask1* task110, int64_t* pt60) {
char** tensors108 = task110->any1;
ptrdiff_t s55 = pt60[0];
ptrdiff_t c52 = pt60[1];
ptrdiff_t g36 = 0;
ptrdiff_t e31 = 0;
char*restrict datPtr34 = tensors108[0]-60+329472*e31;
char*restrict dfPtr14 = tensors108[1]+912384*e31;
ptrdiff_t i63 = 1*g36;
ptrdiff_t j55 = 1*c52;
ptrdiff_t last13 = j55+0;
ptrdiff_t rel25 = j55-0;
ptrdiff_t base25 = 0;
if (rel25 < 1) {
ptrdiff_t h55 = base25+0;
ptrdiff_t w71 = 0;
ptrdiff_t k166 = 0;
for (; k166 != 64; ++k166) {
__m512 dat2338 = _mm512_maskz_loadu_ps(127, datPtr34+340+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512i pm237 = _mm512_set_epi32(6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in2484 = _mm512_permutexvar_ps(pm237, dat2338);
__m512 dat2339 = _mm512_maskz_loadu_ps(16383, datPtr34+60+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2340 = _mm512_maskz_loadu_ps(127, datPtr34+396+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512i pm238 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in2477 = _mm512_permutexvar_ps(pm238, dat2339);
__m512i pm239 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 15, 13, 12, 11);
__m512 in2485 = _mm512_permutex2var_ps(dat2339, pm239, dat2340);
__m512 dat2341 = _mm512_maskz_loadu_ps(16383, datPtr34+116+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2342 = _mm512_maskz_loadu_ps(127, datPtr34+452+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2478 = _mm512_permutexvar_ps(pm238, dat2341);
__m512 in2486 = _mm512_permutex2var_ps(dat2341, pm239, dat2342);
__m512 dat2343 = _mm512_maskz_loadu_ps(16383, datPtr34+172+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2344 = _mm512_maskz_loadu_ps(127, datPtr34+508+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2479 = _mm512_permutexvar_ps(pm238, dat2343);
__m512 in2487 = _mm512_permutex2var_ps(dat2343, pm239, dat2344);
__m512 dat2345 = _mm512_maskz_loadu_ps(16383, datPtr34+228+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2346 = _mm512_maskz_loadu_ps(127, datPtr34+564+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2480 = _mm512_permutexvar_ps(pm238, dat2345);
__m512 in2488 = _mm512_permutex2var_ps(dat2345, pm239, dat2346);
__m512 dat2347 = _mm512_maskz_loadu_ps(16383, datPtr34+284+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2348 = _mm512_maskz_loadu_ps(127, datPtr34+620+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2481 = _mm512_permutexvar_ps(pm238, dat2347);
__m512 in2489 = _mm512_permutex2var_ps(dat2347, pm239, dat2348);
__m512 dat2349 = _mm512_maskz_loadu_ps(16383, datPtr34+340+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2350 = _mm512_maskz_loadu_ps(127, datPtr34+676+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2482 = _mm512_permutexvar_ps(pm238, dat2349);
__m512 in2490 = _mm512_permutex2var_ps(dat2349, pm239, dat2350);
__m512 dat2351 = _mm512_maskz_loadu_ps(16383, datPtr34+396+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2352 = _mm512_maskz_loadu_ps(127, datPtr34+732+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2483 = _mm512_permutexvar_ps(pm238, dat2351);
__m512 in2491 = _mm512_permutex2var_ps(dat2351, pm239, dat2352);
__m512 tmp18111 = _mm512_add_ps(in2477, in2481);
__m512 tmp18116 = _mm512_add_ps(in2485, in2489);
__m512 tmp18112 = _mm512_sub_ps(in2480, in2478);
__m512 tmp18117 = _mm512_sub_ps(in2488, in2486);
__m512 tmp18113 = _mm512_add_ps(in2478, in2482);
__m512 tmp18118 = _mm512_add_ps(in2486, in2490);
__m512 tmp18114 = _mm512_sub_ps(_mm512_setzero_ps(), in2482);
in2484 = _mm512_sub_ps(in2484, in2490);
tmp18111 = _mm512_fmadd_ps(in2479, _mm512_set1_ps(-4.25e+00f), tmp18111);
tmp18116 = _mm512_fmadd_ps(in2487, _mm512_set1_ps(-4.25e+00f), tmp18116);
tmp18113 = _mm512_fmadd_ps(in2480, _mm512_set1_ps(-4.25e+00f), tmp18113);
tmp18118 = _mm512_fmadd_ps(in2488, _mm512_set1_ps(-4.25e+00f), tmp18118);
tmp18114 = _mm512_fmadd_ps(tmp18112, _mm512_set1_ps(5.25e+00f), tmp18114);
in2484 = _mm512_fmadd_ps(tmp18117, _mm512_set1_ps(5.25e+00f), in2484);
tmp18112 = _mm512_fmadd_ps(in2478, _mm512_set1_ps(2.5e-01f), in2482);
tmp18117 = _mm512_fmadd_ps(in2486, _mm512_set1_ps(2.5e-01f), in2490);
in2478 = _mm512_fmadd_ps(in2478, _mm512_set1_ps(4e+00f), in2482);
in2486 = _mm512_fmadd_ps(in2486, _mm512_set1_ps(4e+00f), in2490);
__m512 tmp18115 = _mm512_sub_ps(tmp18113, tmp18111);
__m512 tmp18119 = _mm512_sub_ps(tmp18118, tmp18116);
tmp18113 = _mm512_add_ps(tmp18111, tmp18113);
tmp18118 = _mm512_add_ps(tmp18116, tmp18118);
tmp18111 = _mm512_fmadd_ps(in2477, _mm512_set1_ps(2.5e-01f), in2481);
tmp18116 = _mm512_fmadd_ps(in2485, _mm512_set1_ps(2.5e-01f), in2489);
tmp18112 = _mm512_fmadd_ps(in2480, _mm512_set1_ps(-1.25e+00f), tmp18112);
tmp18117 = _mm512_fmadd_ps(in2488, _mm512_set1_ps(-1.25e+00f), tmp18117);
in2480 = _mm512_fmadd_ps(in2480, _mm512_set1_ps(-5e+00f), in2478);
in2488 = _mm512_fmadd_ps(in2488, _mm512_set1_ps(-5e+00f), in2486);
tmp18111 = _mm512_fmadd_ps(in2479, _mm512_set1_ps(-1.25e+00f), tmp18111);
tmp18116 = _mm512_fmadd_ps(in2487, _mm512_set1_ps(-1.25e+00f), tmp18116);
in2482 = _mm512_fmadd_ps(tmp18111, _mm512_set1_ps(2e+00f), tmp18112);
in2490 = _mm512_fmadd_ps(tmp18116, _mm512_set1_ps(2e+00f), tmp18117);
tmp18112 = _mm512_fnmadd_ps(tmp18111, _mm512_set1_ps(2e+00f), tmp18112);
tmp18117 = _mm512_fnmadd_ps(tmp18116, _mm512_set1_ps(2e+00f), tmp18117);
tmp18111 = _mm512_fmadd_ps(in2481, _mm512_set1_ps(2.5e-01f), in2477);
tmp18116 = _mm512_fmadd_ps(in2489, _mm512_set1_ps(2.5e-01f), in2485);
in2477 = _mm512_sub_ps(in2483, in2477);
in2485 = _mm512_sub_ps(in2491, in2485);
tmp18111 = _mm512_fmadd_ps(in2479, _mm512_set1_ps(-1.25e+00f), tmp18111);
tmp18116 = _mm512_fmadd_ps(in2487, _mm512_set1_ps(-1.25e+00f), tmp18116);
in2479 = _mm512_sub_ps(in2479, in2481);
in2487 = _mm512_sub_ps(in2487, in2489);
in2479 = _mm512_fmadd_ps(in2479, _mm512_set1_ps(5.25e+00f), in2477);
in2487 = _mm512_fmadd_ps(in2487, _mm512_set1_ps(5.25e+00f), in2485);
in2478 = _mm512_fmadd_ps(tmp18111, _mm512_set1_ps(2e+00f), in2480);
in2486 = _mm512_fmadd_ps(tmp18116, _mm512_set1_ps(2e+00f), in2488);
in2480 = _mm512_fnmadd_ps(tmp18111, _mm512_set1_ps(2e+00f), in2480);
in2488 = _mm512_fnmadd_ps(tmp18116, _mm512_set1_ps(2e+00f), in2488);
__m512 tmp18128 = _mm512_unpacklo_ps(tmp18114, tmp18113);
__m512 tmp18129 = _mm512_unpackhi_ps(tmp18114, tmp18113);
__m512 tmp18130 = _mm512_unpacklo_ps(tmp18115, in2482);
__m512 tmp18131 = _mm512_unpackhi_ps(tmp18115, in2482);
__m512 tmp18132 = _mm512_unpacklo_ps(tmp18112, in2478);
__m512 tmp18133 = _mm512_unpackhi_ps(tmp18112, in2478);
__m512 tmp18134 = _mm512_unpacklo_ps(in2480, in2479);
__m512 tmp18135 = _mm512_unpackhi_ps(in2480, in2479);
__m512 tmp18136 = _mm512_unpacklo_ps(in2484, tmp18118);
__m512 tmp18137 = _mm512_unpackhi_ps(in2484, tmp18118);
__m512 tmp18138 = _mm512_unpacklo_ps(tmp18119, in2490);
__m512 tmp18139 = _mm512_unpackhi_ps(tmp18119, in2490);
__m512 tmp18140 = _mm512_unpacklo_ps(tmp18117, in2486);
__m512 tmp18141 = _mm512_unpackhi_ps(tmp18117, in2486);
__m512 tmp18142 = _mm512_unpacklo_ps(in2488, in2487);
__m512 tmp18143 = _mm512_unpackhi_ps(in2488, in2487);
__m512 tmp18144 = _mm512_shuffle_ps(tmp18128, tmp18130, 68);
__m512 tmp18145 = _mm512_shuffle_ps(tmp18128, tmp18130, 238);
__m512 tmp18146 = _mm512_shuffle_ps(tmp18129, tmp18131, 68);
__m512 tmp18147 = _mm512_shuffle_ps(tmp18129, tmp18131, 238);
__m512 tmp18148 = _mm512_shuffle_ps(tmp18132, tmp18134, 68);
__m512 tmp18149 = _mm512_shuffle_ps(tmp18132, tmp18134, 238);
__m512 tmp18150 = _mm512_shuffle_ps(tmp18133, tmp18135, 68);
__m512 tmp18151 = _mm512_shuffle_ps(tmp18133, tmp18135, 238);
__m512 tmp18152 = _mm512_shuffle_ps(tmp18136, tmp18138, 68);
__m512 tmp18153 = _mm512_shuffle_ps(tmp18136, tmp18138, 238);
__m512 tmp18154 = _mm512_shuffle_ps(tmp18137, tmp18139, 68);
__m512 tmp18155 = _mm512_shuffle_ps(tmp18137, tmp18139, 238);
__m512 tmp18156 = _mm512_shuffle_ps(tmp18140, tmp18142, 68);
__m512 tmp18157 = _mm512_shuffle_ps(tmp18140, tmp18142, 238);
__m512 tmp18158 = _mm512_shuffle_ps(tmp18141, tmp18143, 68);
__m512 tmp18159 = _mm512_shuffle_ps(tmp18141, tmp18143, 238);
__m512 tmp18160 = _mm512_shuffle_f32x4(tmp18144, tmp18148, 136);
__m512 tmp18161 = _mm512_shuffle_f32x4(tmp18144, tmp18148, 221);
__m512 tmp18162 = _mm512_shuffle_f32x4(tmp18145, tmp18149, 136);
__m512 tmp18163 = _mm512_shuffle_f32x4(tmp18145, tmp18149, 221);
__m512 tmp18164 = _mm512_shuffle_f32x4(tmp18146, tmp18150, 136);
__m512 tmp18165 = _mm512_shuffle_f32x4(tmp18146, tmp18150, 221);
__m512 tmp18166 = _mm512_shuffle_f32x4(tmp18147, tmp18151, 136);
__m512 tmp18167 = _mm512_shuffle_f32x4(tmp18147, tmp18151, 221);
__m512 tmp18168 = _mm512_shuffle_f32x4(tmp18152, tmp18156, 136);
__m512 tmp18169 = _mm512_shuffle_f32x4(tmp18152, tmp18156, 221);
__m512 tmp18170 = _mm512_shuffle_f32x4(tmp18153, tmp18157, 136);
__m512 tmp18171 = _mm512_shuffle_f32x4(tmp18153, tmp18157, 221);
__m512 tmp18172 = _mm512_shuffle_f32x4(tmp18154, tmp18158, 136);
__m512 tmp18173 = _mm512_shuffle_f32x4(tmp18154, tmp18158, 221);
__m512 tmp18174 = _mm512_shuffle_f32x4(tmp18155, tmp18159, 136);
__m512 tmp18175 = _mm512_shuffle_f32x4(tmp18155, tmp18159, 221);
tmp18114 = _mm512_shuffle_f32x4(tmp18160, tmp18168, 136);
in2484 = _mm512_shuffle_f32x4(tmp18160, tmp18168, 221);
tmp18113 = _mm512_shuffle_f32x4(tmp18162, tmp18170, 136);
tmp18118 = _mm512_shuffle_f32x4(tmp18162, tmp18170, 221);
tmp18115 = _mm512_shuffle_f32x4(tmp18164, tmp18172, 136);
tmp18119 = _mm512_shuffle_f32x4(tmp18164, tmp18172, 221);
in2482 = _mm512_shuffle_f32x4(tmp18166, tmp18174, 136);
in2490 = _mm512_shuffle_f32x4(tmp18166, tmp18174, 221);
tmp18112 = _mm512_shuffle_f32x4(tmp18161, tmp18169, 136);
tmp18117 = _mm512_shuffle_f32x4(tmp18161, tmp18169, 221);
in2478 = _mm512_shuffle_f32x4(tmp18163, tmp18171, 136);
in2486 = _mm512_shuffle_f32x4(tmp18163, tmp18171, 221);
in2480 = _mm512_shuffle_f32x4(tmp18165, tmp18173, 136);
in2488 = _mm512_shuffle_f32x4(tmp18165, tmp18173, 221);
in2479 = _mm512_shuffle_f32x4(tmp18167, tmp18175, 136);
in2487 = _mm512_shuffle_f32x4(tmp18167, tmp18175, 221);
__m512 tmp18120 = _mm512_add_ps(tmp18113, in2478);
__m512 tmp18124 = _mm512_add_ps(tmp18118, in2486);
__m512 tmp18121 = _mm512_sub_ps(tmp18112, tmp18115);
__m512 tmp18125 = _mm512_sub_ps(tmp18117, tmp18119);
__m512 tmp18122 = _mm512_add_ps(tmp18115, in2480);
__m512 tmp18126 = _mm512_add_ps(tmp18119, in2488);
tmp18114 = _mm512_sub_ps(tmp18114, in2480);
in2484 = _mm512_sub_ps(in2484, in2488);
tmp18120 = _mm512_fmadd_ps(in2482, _mm512_set1_ps(-4.25e+00f), tmp18120);
tmp18124 = _mm512_fmadd_ps(in2490, _mm512_set1_ps(-4.25e+00f), tmp18124);
tmp18122 = _mm512_fmadd_ps(tmp18112, _mm512_set1_ps(-4.25e+00f), tmp18122);
tmp18126 = _mm512_fmadd_ps(tmp18117, _mm512_set1_ps(-4.25e+00f), tmp18126);
tmp18114 = _mm512_fmadd_ps(tmp18121, _mm512_set1_ps(5.25e+00f), tmp18114);
in2484 = _mm512_fmadd_ps(tmp18125, _mm512_set1_ps(5.25e+00f), in2484);
tmp18121 = _mm512_fmadd_ps(tmp18115, _mm512_set1_ps(2.5e-01f), in2480);
tmp18125 = _mm512_fmadd_ps(tmp18119, _mm512_set1_ps(2.5e-01f), in2488);
tmp18115 = _mm512_fmadd_ps(tmp18115, _mm512_set1_ps(4e+00f), in2480);
tmp18119 = _mm512_fmadd_ps(tmp18119, _mm512_set1_ps(4e+00f), in2488);
__m512 tmp18123 = _mm512_sub_ps(tmp18122, tmp18120);
__m512 tmp18127 = _mm512_sub_ps(tmp18126, tmp18124);
tmp18122 = _mm512_add_ps(tmp18120, tmp18122);
tmp18126 = _mm512_add_ps(tmp18124, tmp18126);
tmp18120 = _mm512_fmadd_ps(tmp18113, _mm512_set1_ps(2.5e-01f), in2478);
tmp18124 = _mm512_fmadd_ps(tmp18118, _mm512_set1_ps(2.5e-01f), in2486);
tmp18121 = _mm512_fmadd_ps(tmp18112, _mm512_set1_ps(-1.25e+00f), tmp18121);
tmp18125 = _mm512_fmadd_ps(tmp18117, _mm512_set1_ps(-1.25e+00f), tmp18125);
tmp18112 = _mm512_fmadd_ps(tmp18112, _mm512_set1_ps(-5e+00f), tmp18115);
tmp18117 = _mm512_fmadd_ps(tmp18117, _mm512_set1_ps(-5e+00f), tmp18119);
tmp18120 = _mm512_fmadd_ps(in2482, _mm512_set1_ps(-1.25e+00f), tmp18120);
tmp18124 = _mm512_fmadd_ps(in2490, _mm512_set1_ps(-1.25e+00f), tmp18124);
in2480 = _mm512_fmadd_ps(tmp18120, _mm512_set1_ps(2e+00f), tmp18121);
in2488 = _mm512_fmadd_ps(tmp18124, _mm512_set1_ps(2e+00f), tmp18125);
tmp18121 = _mm512_fnmadd_ps(tmp18120, _mm512_set1_ps(2e+00f), tmp18121);
tmp18125 = _mm512_fnmadd_ps(tmp18124, _mm512_set1_ps(2e+00f), tmp18125);
tmp18120 = _mm512_fmadd_ps(in2478, _mm512_set1_ps(2.5e-01f), tmp18113);
tmp18124 = _mm512_fmadd_ps(in2486, _mm512_set1_ps(2.5e-01f), tmp18118);
tmp18113 = _mm512_sub_ps(in2479, tmp18113);
tmp18118 = _mm512_sub_ps(in2487, tmp18118);
tmp18120 = _mm512_fmadd_ps(in2482, _mm512_set1_ps(-1.25e+00f), tmp18120);
tmp18124 = _mm512_fmadd_ps(in2490, _mm512_set1_ps(-1.25e+00f), tmp18124);
in2482 = _mm512_sub_ps(in2482, in2478);
in2490 = _mm512_sub_ps(in2490, in2486);
in2482 = _mm512_fmadd_ps(in2482, _mm512_set1_ps(5.25e+00f), tmp18113);
in2490 = _mm512_fmadd_ps(in2490, _mm512_set1_ps(5.25e+00f), tmp18118);
tmp18115 = _mm512_fmadd_ps(tmp18120, _mm512_set1_ps(2e+00f), tmp18112);
tmp18119 = _mm512_fmadd_ps(tmp18124, _mm512_set1_ps(2e+00f), tmp18117);
tmp18112 = _mm512_fnmadd_ps(tmp18120, _mm512_set1_ps(2e+00f), tmp18112);
tmp18117 = _mm512_fnmadd_ps(tmp18124, _mm512_set1_ps(2e+00f), tmp18117);
// Pack pairs of result vectors into the output vectors for this tile:
// immediate 68 keeps 128-bit lanes 0-1 of each operand, immediate 238 keeps
// lanes 2-3 of each operand.
__m512 out2287 = _mm512_shuffle_f32x4(tmp18114, tmp18122, 68);
__m512 out2295 = _mm512_shuffle_f32x4(tmp18114, tmp18122, 238);
__m512 out2288 = _mm512_shuffle_f32x4(tmp18123, in2480, 68);
__m512 out2296 = _mm512_shuffle_f32x4(tmp18123, in2480, 238);
__m512 out2289 = _mm512_shuffle_f32x4(tmp18121, tmp18115, 68);
__m512 out2297 = _mm512_shuffle_f32x4(tmp18121, tmp18115, 238);
__m512 out2290 = _mm512_shuffle_f32x4(tmp18112, in2482, 68);
__m512 out2298 = _mm512_shuffle_f32x4(tmp18112, in2482, 238);
__m512 out2291 = _mm512_shuffle_f32x4(in2484, tmp18126, 68);
__m512 out2299 = _mm512_shuffle_f32x4(in2484, tmp18126, 238);
__m512 out2292 = _mm512_shuffle_f32x4(tmp18127, in2488, 68);
__m512 out2300 = _mm512_shuffle_f32x4(tmp18127, in2488, 238);
__m512 out2293 = _mm512_shuffle_f32x4(tmp18125, tmp18119, 68);
__m512 out2301 = _mm512_shuffle_f32x4(tmp18125, tmp18119, 238);
__m512 out2294 = _mm512_shuffle_f32x4(tmp18117, in2490, 68);
__m512 out2302 = _mm512_shuffle_f32x4(tmp18117, in2490, 238);
// Scatter the sixteen vectors into dfPtr14. Four regions spaced 147456 apart
// each receive four vectors, at offsets +0, +64, +128 and +192 inside the
// region; the i63/j55/s55/k166 terms select the slice being written.
// NOTE(review): offsets look like byte offsets (64 per __m512) - confirm
// against the declaration of dfPtr14.
_mm512_storeu_ps(dfPtr14+0+589824*i63+98304*j55+49152*s55+768*k166, out2287);
_mm512_storeu_ps(dfPtr14+128+589824*i63+98304*j55+49152*s55+768*k166, out2295);
_mm512_storeu_ps(dfPtr14+64+589824*i63+98304*j55+49152*s55+768*k166, out2291);
_mm512_storeu_ps(dfPtr14+192+589824*i63+98304*j55+49152*s55+768*k166, out2299);
_mm512_storeu_ps(dfPtr14+147456+589824*i63+98304*j55+49152*s55+768*k166, out2288);
_mm512_storeu_ps(dfPtr14+147584+589824*i63+98304*j55+49152*s55+768*k166, out2296);
_mm512_storeu_ps(dfPtr14+147520+589824*i63+98304*j55+49152*s55+768*k166, out2292);
_mm512_storeu_ps(dfPtr14+147648+589824*i63+98304*j55+49152*s55+768*k166, out2300);
_mm512_storeu_ps(dfPtr14+294912+589824*i63+98304*j55+49152*s55+768*k166, out2289);
_mm512_storeu_ps(dfPtr14+295040+589824*i63+98304*j55+49152*s55+768*k166, out2297);
_mm512_storeu_ps(dfPtr14+294976+589824*i63+98304*j55+49152*s55+768*k166, out2293);
_mm512_storeu_ps(dfPtr14+295104+589824*i63+98304*j55+49152*s55+768*k166, out2301);
_mm512_storeu_ps(dfPtr14+442368+589824*i63+98304*j55+49152*s55+768*k166, out2290);
_mm512_storeu_ps(dfPtr14+442496+589824*i63+98304*j55+49152*s55+768*k166, out2298);
_mm512_storeu_ps(dfPtr14+442432+589824*i63+98304*j55+49152*s55+768*k166, out2294);
_mm512_storeu_ps(dfPtr14+442560+589824*i63+98304*j55+49152*s55+768*k166, out2302);
// Gather the input rows for the next pair of row groups.
// Group one (in2492..in2499): eight rows starting at offset 360, 56 apart,
// keeping elements 0-8 of each row (mask 511).
// Group two (in2500..in2506): seven rows starting at offset 892, 56 apart,
// keeping elements 0-12 of each row (mask 8191); it has no row-0 vector.
// Elements outside the load mask read as 0.0f, so permute index 15 always
// selects zero in the permutes below.
__m512 dat2353 = _mm512_maskz_loadu_ps(511, datPtr34+360+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
// pm240: lanes 0-7 <- elements 0-7, lanes 8-10 <- elements 6-8, lanes 11-15 <- zero.
__m512i pm240 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in2492 = _mm512_permutexvar_ps(pm240, dat2353);
__m512 dat2354 = _mm512_maskz_loadu_ps(511, datPtr34+416+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2355 = _mm512_maskz_loadu_ps(8191, datPtr34+892+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2493 = _mm512_permutexvar_ps(pm240, dat2354);
// pm241: lane 0 <- zero, lanes 1-7 <- elements 0-6, lanes 8-15 <- elements 5-12.
__m512i pm241 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in2500 = _mm512_permutexvar_ps(pm241, dat2355);
__m512 dat2356 = _mm512_maskz_loadu_ps(511, datPtr34+472+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2357 = _mm512_maskz_loadu_ps(8191, datPtr34+948+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2494 = _mm512_permutexvar_ps(pm240, dat2356);
__m512 in2501 = _mm512_permutexvar_ps(pm241, dat2357);
__m512 dat2358 = _mm512_maskz_loadu_ps(511, datPtr34+528+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2359 = _mm512_maskz_loadu_ps(8191, datPtr34+1004+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2495 = _mm512_permutexvar_ps(pm240, dat2358);
__m512 in2502 = _mm512_permutexvar_ps(pm241, dat2359);
__m512 dat2360 = _mm512_maskz_loadu_ps(511, datPtr34+584+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2361 = _mm512_maskz_loadu_ps(8191, datPtr34+1060+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2496 = _mm512_permutexvar_ps(pm240, dat2360);
__m512 in2503 = _mm512_permutexvar_ps(pm241, dat2361);
__m512 dat2362 = _mm512_maskz_loadu_ps(511, datPtr34+640+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2363 = _mm512_maskz_loadu_ps(8191, datPtr34+1116+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2497 = _mm512_permutexvar_ps(pm240, dat2362);
__m512 in2504 = _mm512_permutexvar_ps(pm241, dat2363);
__m512 dat2364 = _mm512_maskz_loadu_ps(511, datPtr34+696+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2365 = _mm512_maskz_loadu_ps(8191, datPtr34+1172+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2498 = _mm512_permutexvar_ps(pm240, dat2364);
__m512 in2505 = _mm512_permutexvar_ps(pm241, dat2365);
__m512 dat2366 = _mm512_maskz_loadu_ps(511, datPtr34+752+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2367 = _mm512_maskz_loadu_ps(8191, datPtr34+1228+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2499 = _mm512_permutexvar_ps(pm240, dat2366);
__m512 in2506 = _mm512_permutexvar_ps(pm241, dat2367);
// First transform pass: linear combination across the rows of each group.
// Statements for group one (in2492..in2499) and group two (in2500..in2506)
// alternate line by line. Writing d0..d7 for the eight input rows, the eight
// results end up, in row order, in:
//   in2492, tmp18178, tmp18179, in2498, tmp18177, in2494, in2496, in2495
// (group two: tmp18183, tmp18182, tmp18184, in2505, tmp18181, in2501, in2503,
// in2502). For example the first result is d0 - d6 + 5.25*(d4 - d2) and the
// last is d7 - d1 + 5.25*(d3 - d5).
// NOTE(review): the coefficient set (5.25, -4.25, 0.25, -1.25, 4, -5, 2)
// matches the 8-point input transform of Winograd F(6,3) - confirm.
__m512 tmp18176 = _mm512_add_ps(in2493, in2497);
__m512 tmp18180 = _mm512_add_ps(in2500, in2504);
__m512 tmp18177 = _mm512_sub_ps(in2496, in2494);
__m512 tmp18181 = _mm512_sub_ps(in2503, in2501);
__m512 tmp18178 = _mm512_add_ps(in2494, in2498);
__m512 tmp18182 = _mm512_add_ps(in2501, in2505);
in2492 = _mm512_sub_ps(in2492, in2498);
// Group two has no row-0 vector, so its first result starts from zero.
__m512 tmp18183 = _mm512_sub_ps(_mm512_setzero_ps(), in2505);
tmp18176 = _mm512_fmadd_ps(in2495, _mm512_set1_ps(-4.25e+00f), tmp18176);
tmp18180 = _mm512_fmadd_ps(in2502, _mm512_set1_ps(-4.25e+00f), tmp18180);
tmp18178 = _mm512_fmadd_ps(in2496, _mm512_set1_ps(-4.25e+00f), tmp18178);
tmp18182 = _mm512_fmadd_ps(in2503, _mm512_set1_ps(-4.25e+00f), tmp18182);
in2492 = _mm512_fmadd_ps(tmp18177, _mm512_set1_ps(5.25e+00f), in2492);
tmp18183 = _mm512_fmadd_ps(tmp18181, _mm512_set1_ps(5.25e+00f), tmp18183);
tmp18177 = _mm512_fmadd_ps(in2494, _mm512_set1_ps(2.5e-01f), in2498);
tmp18181 = _mm512_fmadd_ps(in2501, _mm512_set1_ps(2.5e-01f), in2505);
in2494 = _mm512_fmadd_ps(in2494, _mm512_set1_ps(4e+00f), in2498);
in2501 = _mm512_fmadd_ps(in2501, _mm512_set1_ps(4e+00f), in2505);
__m512 tmp18179 = _mm512_sub_ps(tmp18178, tmp18176);
__m512 tmp18184 = _mm512_sub_ps(tmp18182, tmp18180);
tmp18178 = _mm512_add_ps(tmp18176, tmp18178);
tmp18182 = _mm512_add_ps(tmp18180, tmp18182);
tmp18176 = _mm512_fmadd_ps(in2493, _mm512_set1_ps(2.5e-01f), in2497);
tmp18180 = _mm512_fmadd_ps(in2500, _mm512_set1_ps(2.5e-01f), in2504);
tmp18177 = _mm512_fmadd_ps(in2496, _mm512_set1_ps(-1.25e+00f), tmp18177);
tmp18181 = _mm512_fmadd_ps(in2503, _mm512_set1_ps(-1.25e+00f), tmp18181);
in2496 = _mm512_fmadd_ps(in2496, _mm512_set1_ps(-5e+00f), in2494);
in2503 = _mm512_fmadd_ps(in2503, _mm512_set1_ps(-5e+00f), in2501);
tmp18176 = _mm512_fmadd_ps(in2495, _mm512_set1_ps(-1.25e+00f), tmp18176);
tmp18180 = _mm512_fmadd_ps(in2502, _mm512_set1_ps(-1.25e+00f), tmp18180);
in2498 = _mm512_fmadd_ps(tmp18176, _mm512_set1_ps(2e+00f), tmp18177);
in2505 = _mm512_fmadd_ps(tmp18180, _mm512_set1_ps(2e+00f), tmp18181);
tmp18177 = _mm512_fnmadd_ps(tmp18176, _mm512_set1_ps(2e+00f), tmp18177);
tmp18181 = _mm512_fnmadd_ps(tmp18180, _mm512_set1_ps(2e+00f), tmp18181);
tmp18176 = _mm512_fmadd_ps(in2497, _mm512_set1_ps(2.5e-01f), in2493);
tmp18180 = _mm512_fmadd_ps(in2504, _mm512_set1_ps(2.5e-01f), in2500);
in2493 = _mm512_sub_ps(in2499, in2493);
in2500 = _mm512_sub_ps(in2506, in2500);
tmp18176 = _mm512_fmadd_ps(in2495, _mm512_set1_ps(-1.25e+00f), tmp18176);
tmp18180 = _mm512_fmadd_ps(in2502, _mm512_set1_ps(-1.25e+00f), tmp18180);
in2495 = _mm512_sub_ps(in2495, in2497);
in2502 = _mm512_sub_ps(in2502, in2504);
in2495 = _mm512_fmadd_ps(in2495, _mm512_set1_ps(5.25e+00f), in2493);
in2502 = _mm512_fmadd_ps(in2502, _mm512_set1_ps(5.25e+00f), in2500);
in2494 = _mm512_fmadd_ps(tmp18176, _mm512_set1_ps(2e+00f), in2496);
in2501 = _mm512_fmadd_ps(tmp18180, _mm512_set1_ps(2e+00f), in2503);
in2496 = _mm512_fnmadd_ps(tmp18176, _mm512_set1_ps(2e+00f), in2496);
in2503 = _mm512_fnmadd_ps(tmp18180, _mm512_set1_ps(2e+00f), in2503);
// Regroup the sixteen first-pass vectors before the second pass.
// NOTE(review): the net effect looks like a transpose of 8x8 sub-blocks, so
// that the second pass works along the other tile axis - confirm.
// Stage 1: interleave 32-bit elements of consecutive result rows.
__m512 tmp18193 = _mm512_unpacklo_ps(in2492, tmp18178);
__m512 tmp18194 = _mm512_unpackhi_ps(in2492, tmp18178);
__m512 tmp18195 = _mm512_unpacklo_ps(tmp18179, in2498);
__m512 tmp18196 = _mm512_unpackhi_ps(tmp18179, in2498);
__m512 tmp18197 = _mm512_unpacklo_ps(tmp18177, in2494);
__m512 tmp18198 = _mm512_unpackhi_ps(tmp18177, in2494);
__m512 tmp18199 = _mm512_unpacklo_ps(in2496, in2495);
__m512 tmp18200 = _mm512_unpackhi_ps(in2496, in2495);
__m512 tmp18201 = _mm512_unpacklo_ps(tmp18183, tmp18182);
__m512 tmp18202 = _mm512_unpackhi_ps(tmp18183, tmp18182);
__m512 tmp18203 = _mm512_unpacklo_ps(tmp18184, in2505);
__m512 tmp18204 = _mm512_unpackhi_ps(tmp18184, in2505);
__m512 tmp18205 = _mm512_unpacklo_ps(tmp18181, in2501);
__m512 tmp18206 = _mm512_unpackhi_ps(tmp18181, in2501);
__m512 tmp18207 = _mm512_unpacklo_ps(in2503, in2502);
__m512 tmp18208 = _mm512_unpackhi_ps(in2503, in2502);
// Stage 2: within every 128-bit lane, immediate 68 joins the low 64-bit
// halves of both operands and immediate 238 joins the high halves.
__m512 tmp18209 = _mm512_shuffle_ps(tmp18193, tmp18195, 68);
__m512 tmp18210 = _mm512_shuffle_ps(tmp18193, tmp18195, 238);
__m512 tmp18211 = _mm512_shuffle_ps(tmp18194, tmp18196, 68);
__m512 tmp18212 = _mm512_shuffle_ps(tmp18194, tmp18196, 238);
__m512 tmp18213 = _mm512_shuffle_ps(tmp18197, tmp18199, 68);
__m512 tmp18214 = _mm512_shuffle_ps(tmp18197, tmp18199, 238);
__m512 tmp18215 = _mm512_shuffle_ps(tmp18198, tmp18200, 68);
__m512 tmp18216 = _mm512_shuffle_ps(tmp18198, tmp18200, 238);
__m512 tmp18217 = _mm512_shuffle_ps(tmp18201, tmp18203, 68);
__m512 tmp18218 = _mm512_shuffle_ps(tmp18201, tmp18203, 238);
__m512 tmp18219 = _mm512_shuffle_ps(tmp18202, tmp18204, 68);
__m512 tmp18220 = _mm512_shuffle_ps(tmp18202, tmp18204, 238);
__m512 tmp18221 = _mm512_shuffle_ps(tmp18205, tmp18207, 68);
__m512 tmp18222 = _mm512_shuffle_ps(tmp18205, tmp18207, 238);
__m512 tmp18223 = _mm512_shuffle_ps(tmp18206, tmp18208, 68);
__m512 tmp18224 = _mm512_shuffle_ps(tmp18206, tmp18208, 238);
// Stage 3: immediate 136 keeps the even 128-bit lanes (0 and 2) of both
// operands, immediate 221 keeps the odd lanes (1 and 3).
__m512 tmp18225 = _mm512_shuffle_f32x4(tmp18209, tmp18213, 136);
__m512 tmp18226 = _mm512_shuffle_f32x4(tmp18209, tmp18213, 221);
__m512 tmp18227 = _mm512_shuffle_f32x4(tmp18210, tmp18214, 136);
__m512 tmp18228 = _mm512_shuffle_f32x4(tmp18210, tmp18214, 221);
__m512 tmp18229 = _mm512_shuffle_f32x4(tmp18211, tmp18215, 136);
__m512 tmp18230 = _mm512_shuffle_f32x4(tmp18211, tmp18215, 221);
__m512 tmp18231 = _mm512_shuffle_f32x4(tmp18212, tmp18216, 136);
__m512 tmp18232 = _mm512_shuffle_f32x4(tmp18212, tmp18216, 221);
__m512 tmp18233 = _mm512_shuffle_f32x4(tmp18217, tmp18221, 136);
__m512 tmp18234 = _mm512_shuffle_f32x4(tmp18217, tmp18221, 221);
__m512 tmp18235 = _mm512_shuffle_f32x4(tmp18218, tmp18222, 136);
__m512 tmp18236 = _mm512_shuffle_f32x4(tmp18218, tmp18222, 221);
__m512 tmp18237 = _mm512_shuffle_f32x4(tmp18219, tmp18223, 136);
__m512 tmp18238 = _mm512_shuffle_f32x4(tmp18219, tmp18223, 221);
__m512 tmp18239 = _mm512_shuffle_f32x4(tmp18220, tmp18224, 136);
__m512 tmp18240 = _mm512_shuffle_f32x4(tmp18220, tmp18224, 221);
// Stage 4: the same even/odd lane selection, now combining vectors derived
// from group one with those derived from group two. The results overwrite the
// first-pass variables so the second pass can reuse the same names.
in2492 = _mm512_shuffle_f32x4(tmp18225, tmp18233, 136);
tmp18183 = _mm512_shuffle_f32x4(tmp18225, tmp18233, 221);
tmp18178 = _mm512_shuffle_f32x4(tmp18227, tmp18235, 136);
tmp18182 = _mm512_shuffle_f32x4(tmp18227, tmp18235, 221);
tmp18179 = _mm512_shuffle_f32x4(tmp18229, tmp18237, 136);
tmp18184 = _mm512_shuffle_f32x4(tmp18229, tmp18237, 221);
in2498 = _mm512_shuffle_f32x4(tmp18231, tmp18239, 136);
in2505 = _mm512_shuffle_f32x4(tmp18231, tmp18239, 221);
tmp18177 = _mm512_shuffle_f32x4(tmp18226, tmp18234, 136);
tmp18181 = _mm512_shuffle_f32x4(tmp18226, tmp18234, 221);
in2494 = _mm512_shuffle_f32x4(tmp18228, tmp18236, 136);
in2501 = _mm512_shuffle_f32x4(tmp18228, tmp18236, 221);
in2496 = _mm512_shuffle_f32x4(tmp18230, tmp18238, 136);
in2503 = _mm512_shuffle_f32x4(tmp18230, tmp18238, 221);
in2495 = _mm512_shuffle_f32x4(tmp18232, tmp18240, 136);
in2502 = _mm512_shuffle_f32x4(tmp18232, tmp18240, 221);
// Second transform pass: the same linear combination as the first pass,
// applied to the regrouped vectors. Inputs d0..d7 are, in order,
//   in2492, tmp18178, tmp18179, in2498, tmp18177, in2494, in2496, in2495
// (second set: tmp18183, tmp18182, tmp18184, in2505, tmp18181, in2501,
// in2503, in2502). Results, in row order, end up in
//   in2492, tmp18187, tmp18188, in2496, tmp18186, tmp18179, tmp18177, in2498
// (second set: tmp18183, tmp18191, tmp18192, in2503, tmp18190, tmp18184,
// tmp18181, in2505). Both sets have a full d0 here, so there is no
// zero-initialised special case.
__m512 tmp18185 = _mm512_add_ps(tmp18178, in2494);
__m512 tmp18189 = _mm512_add_ps(tmp18182, in2501);
__m512 tmp18186 = _mm512_sub_ps(tmp18177, tmp18179);
__m512 tmp18190 = _mm512_sub_ps(tmp18181, tmp18184);
__m512 tmp18187 = _mm512_add_ps(tmp18179, in2496);
__m512 tmp18191 = _mm512_add_ps(tmp18184, in2503);
in2492 = _mm512_sub_ps(in2492, in2496);
tmp18183 = _mm512_sub_ps(tmp18183, in2503);
tmp18185 = _mm512_fmadd_ps(in2498, _mm512_set1_ps(-4.25e+00f), tmp18185);
tmp18189 = _mm512_fmadd_ps(in2505, _mm512_set1_ps(-4.25e+00f), tmp18189);
tmp18187 = _mm512_fmadd_ps(tmp18177, _mm512_set1_ps(-4.25e+00f), tmp18187);
tmp18191 = _mm512_fmadd_ps(tmp18181, _mm512_set1_ps(-4.25e+00f), tmp18191);
in2492 = _mm512_fmadd_ps(tmp18186, _mm512_set1_ps(5.25e+00f), in2492);
tmp18183 = _mm512_fmadd_ps(tmp18190, _mm512_set1_ps(5.25e+00f), tmp18183);
tmp18186 = _mm512_fmadd_ps(tmp18179, _mm512_set1_ps(2.5e-01f), in2496);
tmp18190 = _mm512_fmadd_ps(tmp18184, _mm512_set1_ps(2.5e-01f), in2503);
tmp18179 = _mm512_fmadd_ps(tmp18179, _mm512_set1_ps(4e+00f), in2496);
tmp18184 = _mm512_fmadd_ps(tmp18184, _mm512_set1_ps(4e+00f), in2503);
__m512 tmp18188 = _mm512_sub_ps(tmp18187, tmp18185);
__m512 tmp18192 = _mm512_sub_ps(tmp18191, tmp18189);
tmp18187 = _mm512_add_ps(tmp18185, tmp18187);
tmp18191 = _mm512_add_ps(tmp18189, tmp18191);
tmp18185 = _mm512_fmadd_ps(tmp18178, _mm512_set1_ps(2.5e-01f), in2494);
tmp18189 = _mm512_fmadd_ps(tmp18182, _mm512_set1_ps(2.5e-01f), in2501);
tmp18186 = _mm512_fmadd_ps(tmp18177, _mm512_set1_ps(-1.25e+00f), tmp18186);
tmp18190 = _mm512_fmadd_ps(tmp18181, _mm512_set1_ps(-1.25e+00f), tmp18190);
tmp18177 = _mm512_fmadd_ps(tmp18177, _mm512_set1_ps(-5e+00f), tmp18179);
tmp18181 = _mm512_fmadd_ps(tmp18181, _mm512_set1_ps(-5e+00f), tmp18184);
tmp18185 = _mm512_fmadd_ps(in2498, _mm512_set1_ps(-1.25e+00f), tmp18185);
tmp18189 = _mm512_fmadd_ps(in2505, _mm512_set1_ps(-1.25e+00f), tmp18189);
in2496 = _mm512_fmadd_ps(tmp18185, _mm512_set1_ps(2e+00f), tmp18186);
in2503 = _mm512_fmadd_ps(tmp18189, _mm512_set1_ps(2e+00f), tmp18190);
tmp18186 = _mm512_fnmadd_ps(tmp18185, _mm512_set1_ps(2e+00f), tmp18186);
tmp18190 = _mm512_fnmadd_ps(tmp18189, _mm512_set1_ps(2e+00f), tmp18190);
tmp18185 = _mm512_fmadd_ps(in2494, _mm512_set1_ps(2.5e-01f), tmp18178);
tmp18189 = _mm512_fmadd_ps(in2501, _mm512_set1_ps(2.5e-01f), tmp18182);
tmp18178 = _mm512_sub_ps(in2495, tmp18178);
tmp18182 = _mm512_sub_ps(in2502, tmp18182);
tmp18185 = _mm512_fmadd_ps(in2498, _mm512_set1_ps(-1.25e+00f), tmp18185);
tmp18189 = _mm512_fmadd_ps(in2505, _mm512_set1_ps(-1.25e+00f), tmp18189);
in2498 = _mm512_sub_ps(in2498, in2494);
in2505 = _mm512_sub_ps(in2505, in2501);
in2498 = _mm512_fmadd_ps(in2498, _mm512_set1_ps(5.25e+00f), tmp18178);
in2505 = _mm512_fmadd_ps(in2505, _mm512_set1_ps(5.25e+00f), tmp18182);
tmp18179 = _mm512_fmadd_ps(tmp18185, _mm512_set1_ps(2e+00f), tmp18177);
tmp18184 = _mm512_fmadd_ps(tmp18189, _mm512_set1_ps(2e+00f), tmp18181);
tmp18177 = _mm512_fnmadd_ps(tmp18185, _mm512_set1_ps(2e+00f), tmp18177);
tmp18181 = _mm512_fnmadd_ps(tmp18189, _mm512_set1_ps(2e+00f), tmp18181);
// Pack consecutive second-pass results pairwise into output vectors:
// immediate 68 keeps 128-bit lanes 0-1 of each operand, immediate 238 keeps
// lanes 2-3 of each operand.
__m512 out2303 = _mm512_shuffle_f32x4(in2492, tmp18187, 68);
__m512 out2311 = _mm512_shuffle_f32x4(in2492, tmp18187, 238);
__m512 out2304 = _mm512_shuffle_f32x4(tmp18188, in2496, 68);
__m512 out2312 = _mm512_shuffle_f32x4(tmp18188, in2496, 238);
__m512 out2305 = _mm512_shuffle_f32x4(tmp18186, tmp18179, 68);
__m512 out2313 = _mm512_shuffle_f32x4(tmp18186, tmp18179, 238);
__m512 out2306 = _mm512_shuffle_f32x4(tmp18177, in2498, 68);
__m512 out2314 = _mm512_shuffle_f32x4(tmp18177, in2498, 238);
__m512 out2307 = _mm512_shuffle_f32x4(tmp18183, tmp18191, 68);
__m512 out2315 = _mm512_shuffle_f32x4(tmp18183, tmp18191, 238);
__m512 out2308 = _mm512_shuffle_f32x4(tmp18192, in2503, 68);
__m512 out2316 = _mm512_shuffle_f32x4(tmp18192, in2503, 238);
__m512 out2309 = _mm512_shuffle_f32x4(tmp18190, tmp18184, 68);
__m512 out2317 = _mm512_shuffle_f32x4(tmp18190, tmp18184, 238);
__m512 out2310 = _mm512_shuffle_f32x4(tmp18181, in2505, 68);
__m512 out2318 = _mm512_shuffle_f32x4(tmp18181, in2505, 238);
// Scatter into dfPtr14: the same four regions spaced 147456 apart, this time
// at offsets +256, +320, +384 and +448 inside each region.
_mm512_storeu_ps(dfPtr14+256+589824*i63+98304*j55+49152*s55+768*k166, out2303);
_mm512_storeu_ps(dfPtr14+384+589824*i63+98304*j55+49152*s55+768*k166, out2311);
_mm512_storeu_ps(dfPtr14+320+589824*i63+98304*j55+49152*s55+768*k166, out2307);
_mm512_storeu_ps(dfPtr14+448+589824*i63+98304*j55+49152*s55+768*k166, out2315);
_mm512_storeu_ps(dfPtr14+147712+589824*i63+98304*j55+49152*s55+768*k166, out2304);
_mm512_storeu_ps(dfPtr14+147840+589824*i63+98304*j55+49152*s55+768*k166, out2312);
_mm512_storeu_ps(dfPtr14+147776+589824*i63+98304*j55+49152*s55+768*k166, out2308);
_mm512_storeu_ps(dfPtr14+147904+589824*i63+98304*j55+49152*s55+768*k166, out2316);
_mm512_storeu_ps(dfPtr14+295168+589824*i63+98304*j55+49152*s55+768*k166, out2305);
_mm512_storeu_ps(dfPtr14+295296+589824*i63+98304*j55+49152*s55+768*k166, out2313);
_mm512_storeu_ps(dfPtr14+295232+589824*i63+98304*j55+49152*s55+768*k166, out2309);
_mm512_storeu_ps(dfPtr14+295360+589824*i63+98304*j55+49152*s55+768*k166, out2317);
_mm512_storeu_ps(dfPtr14+442624+589824*i63+98304*j55+49152*s55+768*k166, out2306);
_mm512_storeu_ps(dfPtr14+442752+589824*i63+98304*j55+49152*s55+768*k166, out2314);
_mm512_storeu_ps(dfPtr14+442688+589824*i63+98304*j55+49152*s55+768*k166, out2310);
_mm512_storeu_ps(dfPtr14+442816+589824*i63+98304*j55+49152*s55+768*k166, out2318);
// Gather the input rows for the third pair of row groups (in2507..in2514 and
// in2515..in2522, eight rows each). Elements outside a load mask read as
// 0.0f, so permute index 15 always selects zero below.
// dat2368 (offset 1172, elements 0-13) supplies the row-0 vector of both groups.
__m512 dat2368 = _mm512_maskz_loadu_ps(16383, datPtr34+1172+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
// pm242: lanes 0-8 <- zero, lanes 9-15 <- elements 0-6.
__m512i pm242 = _mm512_set_epi32(6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in2507 = _mm512_permutexvar_ps(pm242, dat2368);
// pm243: lanes 0-7 <- elements 5-12, lanes 8-10 <- elements 11-13, lanes 11-15 <- zero.
__m512i pm243 = _mm512_set_epi32(15, 15, 15, 15, 15, 13, 12, 11, 12, 11, 10, 9, 8, 7, 6, 5);
__m512 in2515 = _mm512_permutexvar_ps(pm243, dat2368);
// Rows 1-7: a 3-element load (mask 7) starting at offset 936 and a 14-element
// load (mask 16383) starting at offset 1228, both advancing by 56 per row.
__m512 dat2369 = _mm512_maskz_loadu_ps(7, datPtr34+936+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2370 = _mm512_maskz_loadu_ps(16383, datPtr34+1228+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
// pm244 (two-source permute): lanes 0-2 <- first source elements 0-2,
// lanes 3-8 <- zero, lanes 9-15 <- second source elements 0-6.
__m512i pm244 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 15, 2, 1, 0);
__m512 in2508 = _mm512_permutex2var_ps(dat2369, pm244, dat2370);
__m512 in2516 = _mm512_permutexvar_ps(pm243, dat2370);
__m512 dat2371 = _mm512_maskz_loadu_ps(7, datPtr34+992+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2372 = _mm512_maskz_loadu_ps(16383, datPtr34+1284+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2509 = _mm512_permutex2var_ps(dat2371, pm244, dat2372);
__m512 in2517 = _mm512_permutexvar_ps(pm243, dat2372);
__m512 dat2373 = _mm512_maskz_loadu_ps(7, datPtr34+1048+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2374 = _mm512_maskz_loadu_ps(16383, datPtr34+1340+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2510 = _mm512_permutex2var_ps(dat2373, pm244, dat2374);
__m512 in2518 = _mm512_permutexvar_ps(pm243, dat2374);
__m512 dat2375 = _mm512_maskz_loadu_ps(7, datPtr34+1104+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2376 = _mm512_maskz_loadu_ps(16383, datPtr34+1396+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2511 = _mm512_permutex2var_ps(dat2375, pm244, dat2376);
__m512 in2519 = _mm512_permutexvar_ps(pm243, dat2376);
__m512 dat2377 = _mm512_maskz_loadu_ps(7, datPtr34+1160+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2378 = _mm512_maskz_loadu_ps(16383, datPtr34+1452+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2512 = _mm512_permutex2var_ps(dat2377, pm244, dat2378);
__m512 in2520 = _mm512_permutexvar_ps(pm243, dat2378);
__m512 dat2379 = _mm512_maskz_loadu_ps(7, datPtr34+1216+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2380 = _mm512_maskz_loadu_ps(16383, datPtr34+1508+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2513 = _mm512_permutex2var_ps(dat2379, pm244, dat2380);
__m512 in2521 = _mm512_permutexvar_ps(pm243, dat2380);
__m512 dat2381 = _mm512_maskz_loadu_ps(7, datPtr34+1272+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 dat2382 = _mm512_maskz_loadu_ps(16383, datPtr34+1564+212992*i63+56*h55+4*w71+106496*s55+1664*k166);
__m512 in2514 = _mm512_permutex2var_ps(dat2381, pm244, dat2382);
__m512 in2522 = _mm512_permutexvar_ps(pm243, dat2382);
// First transform pass: linear combination across the eight rows of each
// group, with statements for group one (in2507..in2514) and group two
// (in2515..in2522) alternating line by line. Writing d0..d7 for the input
// rows, the results end up, in row order, in:
//   in2507, tmp18243, tmp18244, in2513, tmp18242, in2509, in2511, in2510
// (group two: in2515, tmp18247, tmp18248, in2521, tmp18246, in2517, in2519,
// in2518). For example the first result is d0 - d6 + 5.25*(d4 - d2) and the
// last is d7 - d1 + 5.25*(d3 - d5).
// NOTE(review): the coefficient set matches the 8-point input transform of
// Winograd F(6,3) - confirm.
__m512 tmp18241 = _mm512_add_ps(in2508, in2512);
__m512 tmp18245 = _mm512_add_ps(in2516, in2520);
__m512 tmp18242 = _mm512_sub_ps(in2511, in2509);
__m512 tmp18246 = _mm512_sub_ps(in2519, in2517);
__m512 tmp18243 = _mm512_add_ps(in2509, in2513);
__m512 tmp18247 = _mm512_add_ps(in2517, in2521);
in2507 = _mm512_sub_ps(in2507, in2513);
in2515 = _mm512_sub_ps(in2515, in2521);
tmp18241 = _mm512_fmadd_ps(in2510, _mm512_set1_ps(-4.25e+00f), tmp18241);
tmp18245 = _mm512_fmadd_ps(in2518, _mm512_set1_ps(-4.25e+00f), tmp18245);
tmp18243 = _mm512_fmadd_ps(in2511, _mm512_set1_ps(-4.25e+00f), tmp18243);
tmp18247 = _mm512_fmadd_ps(in2519, _mm512_set1_ps(-4.25e+00f), tmp18247);
in2507 = _mm512_fmadd_ps(tmp18242, _mm512_set1_ps(5.25e+00f), in2507);
in2515 = _mm512_fmadd_ps(tmp18246, _mm512_set1_ps(5.25e+00f), in2515);
tmp18242 = _mm512_fmadd_ps(in2509, _mm512_set1_ps(2.5e-01f), in2513);
tmp18246 = _mm512_fmadd_ps(in2517, _mm512_set1_ps(2.5e-01f), in2521);
in2509 = _mm512_fmadd_ps(in2509, _mm512_set1_ps(4e+00f), in2513);
in2517 = _mm512_fmadd_ps(in2517, _mm512_set1_ps(4e+00f), in2521);
__m512 tmp18244 = _mm512_sub_ps(tmp18243, tmp18241);
__m512 tmp18248 = _mm512_sub_ps(tmp18247, tmp18245);
tmp18243 = _mm512_add_ps(tmp18241, tmp18243);
tmp18247 = _mm512_add_ps(tmp18245, tmp18247);
tmp18241 = _mm512_fmadd_ps(in2508, _mm512_set1_ps(2.5e-01f), in2512);
tmp18245 = _mm512_fmadd_ps(in2516, _mm512_set1_ps(2.5e-01f), in2520);
tmp18242 = _mm512_fmadd_ps(in2511, _mm512_set1_ps(-1.25e+00f), tmp18242);
tmp18246 = _mm512_fmadd_ps(in2519, _mm512_set1_ps(-1.25e+00f), tmp18246);
in2511 = _mm512_fmadd_ps(in2511, _mm512_set1_ps(-5e+00f), in2509);
in2519 = _mm512_fmadd_ps(in2519, _mm512_set1_ps(-5e+00f), in2517);
tmp18241 = _mm512_fmadd_ps(in2510, _mm512_set1_ps(-1.25e+00f), tmp18241);
tmp18245 = _mm512_fmadd_ps(in2518, _mm512_set1_ps(-1.25e+00f), tmp18245);
in2513 = _mm512_fmadd_ps(tmp18241, _mm512_set1_ps(2e+00f), tmp18242);
in2521 = _mm512_fmadd_ps(tmp18245, _mm512_set1_ps(2e+00f), tmp18246);
tmp18242 = _mm512_fnmadd_ps(tmp18241, _mm512_set1_ps(2e+00f), tmp18242);
tmp18246 = _mm512_fnmadd_ps(tmp18245, _mm512_set1_ps(2e+00f), tmp18246);
tmp18241 = _mm512_fmadd_ps(in2512, _mm512_set1_ps(2.5e-01f), in2508);
tmp18245 = _mm512_fmadd_ps(in2520, _mm512_set1_ps(2.5e-01f), in2516);
in2508 = _mm512_sub_ps(in2514, in2508);
in2516 = _mm512_sub_ps(in2522, in2516);
tmp18241 = _mm512_fmadd_ps(in2510, _mm512_set1_ps(-1.25e+00f), tmp18241);
tmp18245 = _mm512_fmadd_ps(in2518, _mm512_set1_ps(-1.25e+00f), tmp18245);
in2510 = _mm512_sub_ps(in2510, in2512);
in2518 = _mm512_sub_ps(in2518, in2520);
in2510 = _mm512_fmadd_ps(in2510, _mm512_set1_ps(5.25e+00f), in2508);
in2518 = _mm512_fmadd_ps(in2518, _mm512_set1_ps(5.25e+00f), in2516);
in2509 = _mm512_fmadd_ps(tmp18241, _mm512_set1_ps(2e+00f), in2511);
in2517 = _mm512_fmadd_ps(tmp18245, _mm512_set1_ps(2e+00f), in2519);
in2511 = _mm512_fnmadd_ps(tmp18241, _mm512_set1_ps(2e+00f), in2511);
in2519 = _mm512_fnmadd_ps(tmp18245, _mm512_set1_ps(2e+00f), in2519);
// Regroup the sixteen first-pass vectors before the second pass.
// NOTE(review): the net effect looks like a transpose of 8x8 sub-blocks, so
// that the second pass works along the other tile axis - confirm.
// Stage 1: interleave 32-bit elements of consecutive result rows.
__m512 tmp18257 = _mm512_unpacklo_ps(in2507, tmp18243);
__m512 tmp18258 = _mm512_unpackhi_ps(in2507, tmp18243);
__m512 tmp18259 = _mm512_unpacklo_ps(tmp18244, in2513);
__m512 tmp18260 = _mm512_unpackhi_ps(tmp18244, in2513);
__m512 tmp18261 = _mm512_unpacklo_ps(tmp18242, in2509);
__m512 tmp18262 = _mm512_unpackhi_ps(tmp18242, in2509);
__m512 tmp18263 = _mm512_unpacklo_ps(in2511, in2510);
__m512 tmp18264 = _mm512_unpackhi_ps(in2511, in2510);
__m512 tmp18265 = _mm512_unpacklo_ps(in2515, tmp18247);
__m512 tmp18266 = _mm512_unpackhi_ps(in2515, tmp18247);
__m512 tmp18267 = _mm512_unpacklo_ps(tmp18248, in2521);
__m512 tmp18268 = _mm512_unpackhi_ps(tmp18248, in2521);
__m512 tmp18269 = _mm512_unpacklo_ps(tmp18246, in2517);
__m512 tmp18270 = _mm512_unpackhi_ps(tmp18246, in2517);
__m512 tmp18271 = _mm512_unpacklo_ps(in2519, in2518);
__m512 tmp18272 = _mm512_unpackhi_ps(in2519, in2518);
// Stage 2: within every 128-bit lane, immediate 68 joins the low 64-bit
// halves of both operands and immediate 238 joins the high halves.
__m512 tmp18273 = _mm512_shuffle_ps(tmp18257, tmp18259, 68);
__m512 tmp18274 = _mm512_shuffle_ps(tmp18257, tmp18259, 238);
__m512 tmp18275 = _mm512_shuffle_ps(tmp18258, tmp18260, 68);
__m512 tmp18276 = _mm512_shuffle_ps(tmp18258, tmp18260, 238);
__m512 tmp18277 = _mm512_shuffle_ps(tmp18261, tmp18263, 68);
__m512 tmp18278 = _mm512_shuffle_ps(tmp18261, tmp18263, 238);
__m512 tmp18279 = _mm512_shuffle_ps(tmp18262, tmp18264, 68);
__m512 tmp18280 = _mm512_shuffle_ps(tmp18262, tmp18264, 238);
__m512 tmp18281 = _mm512_shuffle_ps(tmp18265, tmp18267, 68);
__m512 tmp18282 = _mm512_shuffle_ps(tmp18265, tmp18267, 238);
__m512 tmp18283 = _mm512_shuffle_ps(tmp18266, tmp18268, 68);
__m512 tmp18284 = _mm512_shuffle_ps(tmp18266, tmp18268, 238);
__m512 tmp18285 = _mm512_shuffle_ps(tmp18269, tmp18271, 68);
__m512 tmp18286 = _mm512_shuffle_ps(tmp18269, tmp18271, 238);
__m512 tmp18287 = _mm512_shuffle_ps(tmp18270, tmp18272, 68);
__m512 tmp18288 = _mm512_shuffle_ps(tmp18270, tmp18272, 238);
// Stage 3: immediate 136 keeps the even 128-bit lanes (0 and 2) of both
// operands, immediate 221 keeps the odd lanes (1 and 3).
__m512 tmp18289 = _mm512_shuffle_f32x4(tmp18273, tmp18277, 136);
__m512 tmp18290 = _mm512_shuffle_f32x4(tmp18273, tmp18277, 221);
__m512 tmp18291 = _mm512_shuffle_f32x4(tmp18274, tmp18278, 136);
__m512 tmp18292 = _mm512_shuffle_f32x4(tmp18274, tmp18278, 221);
__m512 tmp18293 = _mm512_shuffle_f32x4(tmp18275, tmp18279, 136);
__m512 tmp18294 = _mm512_shuffle_f32x4(tmp18275, tmp18279, 221);
__m512 tmp18295 = _mm512_shuffle_f32x4(tmp18276, tmp18280, 136);
__m512 tmp18296 = _mm512_shuffle_f32x4(tmp18276, tmp18280, 221);
__m512 tmp18297 = _mm512_shuffle_f32x4(tmp18281, tmp18285, 136);
__m512 tmp18298 = _mm512_shuffle_f32x4(tmp18281, tmp18285, 221);
__m512 tmp18299 = _mm512_shuffle_f32x4(tmp18282, tmp18286, 136);
__m512 tmp18300 = _mm512_shuffle_f32x4(tmp18282, tmp18286, 221);
__m512 tmp18301 = _mm512_shuffle_f32x4(tmp18283, tmp18287, 136);
__m512 tmp18302 = _mm512_shuffle_f32x4(tmp18283, tmp18287, 221);
__m512 tmp18303 = _mm512_shuffle_f32x4(tmp18284, tmp18288, 136);
__m512 tmp18304 = _mm512_shuffle_f32x4(tmp18284, tmp18288, 221);
// Stage 4: the same even/odd lane selection, now combining vectors derived
// from group one with those derived from group two. The results overwrite the
// first-pass variables so the second pass can reuse the same names.
in2507 = _mm512_shuffle_f32x4(tmp18289, tmp18297, 136);
in2515 = _mm512_shuffle_f32x4(tmp18289, tmp18297, 221);
tmp18243 = _mm512_shuffle_f32x4(tmp18291, tmp18299, 136);
tmp18247 = _mm512_shuffle_f32x4(tmp18291, tmp18299, 221);
tmp18244 = _mm512_shuffle_f32x4(tmp18293, tmp18301, 136);
tmp18248 = _mm512_shuffle_f32x4(tmp18293, tmp18301, 221);
in2513 = _mm512_shuffle_f32x4(tmp18295, tmp18303, 136);
in2521 = _mm512_shuffle_f32x4(tmp18295, tmp18303, 221);
tmp18242 = _mm512_shuffle_f32x4(tmp18290, tmp18298, 136);
tmp18246 = _mm512_shuffle_f32x4(tmp18290, tmp18298, 221);
in2509 = _mm512_shuffle_f32x4(tmp18292, tmp18300, 136);
in2517 = _mm512_shuffle_f32x4(tmp18292, tmp18300, 221);
in2511 = _mm512_shuffle_f32x4(tmp18294, tmp18302, 136);
in2519 = _mm512_shuffle_f32x4(tmp18294, tmp18302, 221);
in2510 = _mm512_shuffle_f32x4(tmp18296, tmp18304, 136);
in2518 = _mm512_shuffle_f32x4(tmp18296, tmp18304, 221);
// Second transform pass: the same linear combination as the first pass,
// applied to the regrouped vectors. Inputs d0..d7 are, in order,
//   in2507, tmp18243, tmp18244, in2513, tmp18242, in2509, in2511, in2510
// (second set: in2515, tmp18247, tmp18248, in2521, tmp18246, in2517, in2519,
// in2518). Results, in row order, end up in
//   in2507, tmp18251, tmp18252, in2511, tmp18250, tmp18244, tmp18242, in2513
// (second set: in2515, tmp18255, tmp18256, in2519, tmp18254, tmp18248,
// tmp18246, in2521).
__m512 tmp18249 = _mm512_add_ps(tmp18243, in2509);
__m512 tmp18253 = _mm512_add_ps(tmp18247, in2517);
__m512 tmp18250 = _mm512_sub_ps(tmp18242, tmp18244);
__m512 tmp18254 = _mm512_sub_ps(tmp18246, tmp18248);
__m512 tmp18251 = _mm512_add_ps(tmp18244, in2511);
__m512 tmp18255 = _mm512_add_ps(tmp18248, in2519);
in2507 = _mm512_sub_ps(in2507, in2511);
in2515 = _mm512_sub_ps(in2515, in2519);
tmp18249 = _mm512_fmadd_ps(in2513, _mm512_set1_ps(-4.25e+00f), tmp18249);
tmp18253 = _mm512_fmadd_ps(in2521, _mm512_set1_ps(-4.25e+00f), tmp18253);
tmp18251 = _mm512_fmadd_ps(tmp18242, _mm512_set1_ps(-4.25e+00f), tmp18251);
tmp18255 = _mm512_fmadd_ps(tmp18246, _mm512_set1_ps(-4.25e+00f), tmp18255);
in2507 = _mm512_fmadd_ps(tmp18250, _mm512_set1_ps(5.25e+00f), in2507);
in2515 = _mm512_fmadd_ps(tmp18254, _mm512_set1_ps(5.25e+00f), in2515);
tmp18250 = _mm512_fmadd_ps(tmp18244, _mm512_set1_ps(2.5e-01f), in2511);
tmp18254 = _mm512_fmadd_ps(tmp18248, _mm512_set1_ps(2.5e-01f), in2519);
tmp18244 = _mm512_fmadd_ps(tmp18244, _mm512_set1_ps(4e+00f), in2511);
tmp18248 = _mm512_fmadd_ps(tmp18248, _mm512_set1_ps(4e+00f), in2519);
__m512 tmp18252 = _mm512_sub_ps(tmp18251, tmp18249);
__m512 tmp18256 = _mm512_sub_ps(tmp18255, tmp18253);
tmp18251 = _mm512_add_ps(tmp18249, tmp18251);
tmp18255 = _mm512_add_ps(tmp18253, tmp18255);
tmp18249 = _mm512_fmadd_ps(tmp18243, _mm512_set1_ps(2.5e-01f), in2509);
tmp18253 = _mm512_fmadd_ps(tmp18247, _mm512_set1_ps(2.5e-01f), in2517);
tmp18250 = _mm512_fmadd_ps(tmp18242, _mm512_set1_ps(-1.25e+00f), tmp18250);
tmp18254 = _mm512_fmadd_ps(tmp18246, _mm512_set1_ps(-1.25e+00f), tmp18254);
tmp18242 = _mm512_fmadd_ps(tmp18242, _mm512_set1_ps(-5e+00f), tmp18244);
tmp18246 = _mm512_fmadd_ps(tmp18246, _mm512_set1_ps(-5e+00f), tmp18248);
tmp18249 = _mm512_fmadd_ps(in2513, _mm512_set1_ps(-1.25e+00f), tmp18249);
tmp18253 = _mm512_fmadd_ps(in2521, _mm512_set1_ps(-1.25e+00f), tmp18253);
in2511 = _mm512_fmadd_ps(tmp18249, _mm512_set1_ps(2e+00f), tmp18250);
in2519 = _mm512_fmadd_ps(tmp18253, _mm512_set1_ps(2e+00f), tmp18254);
tmp18250 = _mm512_fnmadd_ps(tmp18249, _mm512_set1_ps(2e+00f), tmp18250);
tmp18254 = _mm512_fnmadd_ps(tmp18253, _mm512_set1_ps(2e+00f), tmp18254);
tmp18249 = _mm512_fmadd_ps(in2509, _mm512_set1_ps(2.5e-01f), tmp18243);
tmp18253 = _mm512_fmadd_ps(in2517, _mm512_set1_ps(2.5e-01f), tmp18247);
tmp18243 = _mm512_sub_ps(in2510, tmp18243);
tmp18247 = _mm512_sub_ps(in2518, tmp18247);
tmp18249 = _mm512_fmadd_ps(in2513, _mm512_set1_ps(-1.25e+00f), tmp18249);
tmp18253 = _mm512_fmadd_ps(in2521, _mm512_set1_ps(-1.25e+00f), tmp18253);
in2513 = _mm512_sub_ps(in2513, in2509);
in2521 = _mm512_sub_ps(in2521, in2517);
in2513 = _mm512_fmadd_ps(in2513, _mm512_set1_ps(5.25e+00f), tmp18243);
in2521 = _mm512_fmadd_ps(in2521, _mm512_set1_ps(5.25e+00f), tmp18247);
tmp18244 = _mm512_fmadd_ps(tmp18249, _mm512_set1_ps(2e+00f), tmp18242);
tmp18248 = _mm512_fmadd_ps(tmp18253, _mm512_set1_ps(2e+00f), tmp18246);
tmp18242 = _mm512_fnmadd_ps(tmp18249, _mm512_set1_ps(2e+00f), tmp18242);
tmp18246 = _mm512_fnmadd_ps(tmp18253, _mm512_set1_ps(2e+00f), tmp18246);
// Pack consecutive second-pass results pairwise into output vectors:
// immediate 68 keeps 128-bit lanes 0-1 of each operand, immediate 238 keeps
// lanes 2-3 of each operand.
__m512 out2319 = _mm512_shuffle_f32x4(in2507, tmp18251, 68);
__m512 out2327 = _mm512_shuffle_f32x4(in2507, tmp18251, 238);
__m512 out2320 = _mm512_shuffle_f32x4(tmp18252, in2511, 68);
__m512 out2328 = _mm512_shuffle_f32x4(tmp18252, in2511, 238);
__m512 out2321 = _mm512_shuffle_f32x4(tmp18250, tmp18244, 68);
__m512 out2329 = _mm512_shuffle_f32x4(tmp18250, tmp18244, 238);
__m512 out2322 = _mm512_shuffle_f32x4(tmp18242, in2513, 68);
__m512 out2330 = _mm512_shuffle_f32x4(tmp18242, in2513, 238);
__m512 out2323 = _mm512_shuffle_f32x4(in2515, tmp18255, 68);
__m512 out2331 = _mm512_shuffle_f32x4(in2515, tmp18255, 238);
__m512 out2324 = _mm512_shuffle_f32x4(tmp18256, in2519, 68);
__m512 out2332 = _mm512_shuffle_f32x4(tmp18256, in2519, 238);
__m512 out2325 = _mm512_shuffle_f32x4(tmp18254, tmp18248, 68);
__m512 out2333 = _mm512_shuffle_f32x4(tmp18254, tmp18248, 238);
__m512 out2326 = _mm512_shuffle_f32x4(tmp18246, in2521, 68);
__m512 out2334 = _mm512_shuffle_f32x4(tmp18246, in2521, 238);
// Scatter into dfPtr14: the same four regions spaced 147456 apart, this time
// at offsets +512, +576, +640 and +704 inside each region. These are the last
// stores of the iteration; the 768*k166 term moves on to the next slot.
_mm512_storeu_ps(dfPtr14+512+589824*i63+98304*j55+49152*s55+768*k166, out2319);
_mm512_storeu_ps(dfPtr14+640+589824*i63+98304*j55+49152*s55+768*k166, out2327);
_mm512_storeu_ps(dfPtr14+576+589824*i63+98304*j55+49152*s55+768*k166, out2323);
_mm512_storeu_ps(dfPtr14+704+589824*i63+98304*j55+49152*s55+768*k166, out2331);
_mm512_storeu_ps(dfPtr14+147968+589824*i63+98304*j55+49152*s55+768*k166, out2320);
_mm512_storeu_ps(dfPtr14+148096+589824*i63+98304*j55+49152*s55+768*k166, out2328);
_mm512_storeu_ps(dfPtr14+148032+589824*i63+98304*j55+49152*s55+768*k166, out2324);
_mm512_storeu_ps(dfPtr14+148160+589824*i63+98304*j55+49152*s55+768*k166, out2332);
_mm512_storeu_ps(dfPtr14+295424+589824*i63+98304*j55+49152*s55+768*k166, out2321);
_mm512_storeu_ps(dfPtr14+295552+589824*i63+98304*j55+49152*s55+768*k166, out2329);
_mm512_storeu_ps(dfPtr14+295488+589824*i63+98304*j55+49152*s55+768*k166, out2325);
_mm512_storeu_ps(dfPtr14+295616+589824*i63+98304*j55+49152*s55+768*k166, out2333);
_mm512_storeu_ps(dfPtr14+442880+589824*i63+98304*j55+49152*s55+768*k166, out2322);
_mm512_storeu_ps(dfPtr14+443008+589824*i63+98304*j55+49152*s55+768*k166, out2330);
_mm512_storeu_ps(dfPtr14+442944+589824*i63+98304*j55+49152*s55+768*k166, out2326);
_mm512_storeu_ps(dfPtr14+443072+589824*i63+98304*j55+49152*s55+768*k166, out2334);
}
// Leave the function once j55 has reached last13; otherwise move on to the
// next j55 and set rel25 to 1 for the code that follows.
if (j55 >= last13) return;
++j55;
rel25 = 1;
}
// Next section: rows starting at h56 = base25+12, column w72 = 0. The k167
// loop runs 32 times and advances the input offset by 3328 per iteration.
ptrdiff_t h56 = base25+12;
ptrdiff_t w72 = 0;
ptrdiff_t k167 = 0;
for (; k167 != 32; ++k167) {
// Only three rows are loaded per group here. Each row comes from two masked
// loads: elements 0-13 (mask 16383) at offsets 4/60/116 and elements 0-6
// (mask 127) at offsets 836/892/948. Elements outside a load mask read as
// 0.0f, so permute index 15 selects zero.
__m512 dat2383 = _mm512_maskz_loadu_ps(16383, datPtr34+4+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 dat2384 = _mm512_maskz_loadu_ps(127, datPtr34+836+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
// pm245: lane 0 <- zero, lanes 1-7 <- elements 0-6, lanes 8-15 <- elements 5-12.
__m512i pm245 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in2523 = _mm512_permutexvar_ps(pm245, dat2383);
// pm246 (two-source permute): lanes 0-2 <- first source elements 11-13,
// lanes 3-8 <- zero, lanes 9-15 <- second source elements 0-6.
__m512i pm246 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 15, 13, 12, 11);
__m512 in2526 = _mm512_permutex2var_ps(dat2383, pm246, dat2384);
__m512 dat2385 = _mm512_maskz_loadu_ps(16383, datPtr34+60+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 dat2386 = _mm512_maskz_loadu_ps(127, datPtr34+892+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 in2524 = _mm512_permutexvar_ps(pm245, dat2385);
__m512 in2527 = _mm512_permutex2var_ps(dat2385, pm246, dat2386);
__m512 dat2387 = _mm512_maskz_loadu_ps(16383, datPtr34+116+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 dat2388 = _mm512_maskz_loadu_ps(127, datPtr34+948+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 in2525 = _mm512_permutexvar_ps(pm245, dat2387);
__m512 in2528 = _mm512_permutex2var_ps(dat2387, pm246, dat2388);
// First transform pass in reduced form: only three rows exist per group
// (in2523..in2525 and in2526..in2528). The arithmetic equals the full
// eight-row combination with rows 3-7 taken as zero; for example the first
// result is in2523 - 5.25*in2525 and the last is the negated in2524.
// Results, in row order, end up in:
//   in2523, tmp18307, tmp18308, tmp18310, tmp18306, in2525, tmp18309, tmp18311
// (group two: in2526, tmp18314, tmp18315, tmp18317, tmp18313, in2528,
// tmp18316, tmp18318).
// Self-assignments such as `in2523 = in2523;` have no effect; presumably they
// are placeholders left where the generator dropped all-zero terms.
__m512 tmp18305 = in2524;
__m512 tmp18312 = in2527;
__m512 tmp18306 = _mm512_sub_ps(_mm512_setzero_ps(), in2525);
__m512 tmp18313 = _mm512_sub_ps(_mm512_setzero_ps(), in2528);
__m512 tmp18307 = in2525;
__m512 tmp18314 = in2528;
in2523 = in2523;
in2526 = in2526;
tmp18305 = tmp18305;
tmp18312 = tmp18312;
tmp18307 = tmp18307;
tmp18314 = tmp18314;
in2523 = _mm512_fmadd_ps(tmp18306, _mm512_set1_ps(5.25e+00f), in2523);
in2526 = _mm512_fmadd_ps(tmp18313, _mm512_set1_ps(5.25e+00f), in2526);
// tmp18306/tmp18313 must be computed before in2525/in2528 are scaled by 4 below.
tmp18306 = _mm512_mul_ps(in2525, _mm512_set1_ps(2.5e-01f));
tmp18313 = _mm512_mul_ps(in2528, _mm512_set1_ps(2.5e-01f));
in2525 = _mm512_mul_ps(in2525, _mm512_set1_ps(4e+00f));
in2528 = _mm512_mul_ps(in2528, _mm512_set1_ps(4e+00f));
__m512 tmp18308 = _mm512_sub_ps(tmp18307, tmp18305);
__m512 tmp18315 = _mm512_sub_ps(tmp18314, tmp18312);
tmp18307 = _mm512_add_ps(tmp18305, tmp18307);
tmp18314 = _mm512_add_ps(tmp18312, tmp18314);
tmp18305 = _mm512_mul_ps(in2524, _mm512_set1_ps(2.5e-01f));
tmp18312 = _mm512_mul_ps(in2527, _mm512_set1_ps(2.5e-01f));
tmp18306 = tmp18306;
tmp18313 = tmp18313;
__m512 tmp18309 = in2525;
__m512 tmp18316 = in2528;
tmp18305 = tmp18305;
tmp18312 = tmp18312;
__m512 tmp18310 = _mm512_fmadd_ps(tmp18305, _mm512_set1_ps(2e+00f), tmp18306);
__m512 tmp18317 = _mm512_fmadd_ps(tmp18312, _mm512_set1_ps(2e+00f), tmp18313);
tmp18306 = _mm512_fnmadd_ps(tmp18305, _mm512_set1_ps(2e+00f), tmp18306);
tmp18313 = _mm512_fnmadd_ps(tmp18312, _mm512_set1_ps(2e+00f), tmp18313);
// Keep the unnegated row in tmp18305/tmp18312 before in2524/in2527 are negated.
tmp18305 = in2524;
tmp18312 = in2527;
in2524 = _mm512_sub_ps(_mm512_setzero_ps(), in2524);
in2527 = _mm512_sub_ps(_mm512_setzero_ps(), in2527);
tmp18305 = tmp18305;
tmp18312 = tmp18312;
__m512 tmp18311 = in2524;
__m512 tmp18318 = in2527;
in2525 = _mm512_fmadd_ps(tmp18305, _mm512_set1_ps(2e+00f), tmp18309);
in2528 = _mm512_fmadd_ps(tmp18312, _mm512_set1_ps(2e+00f), tmp18316);
tmp18309 = _mm512_fnmadd_ps(tmp18305, _mm512_set1_ps(2e+00f), tmp18309);
tmp18316 = _mm512_fnmadd_ps(tmp18312, _mm512_set1_ps(2e+00f), tmp18316);
__m512 tmp18327 = _mm512_unpacklo_ps(in2523, tmp18307);
__m512 tmp18328 = _mm512_unpackhi_ps(in2523, tmp18307);
__m512 tmp18329 = _mm512_unpacklo_ps(tmp18308, tmp18310);
__m512 tmp18330 = _mm512_unpackhi_ps(tmp18308, tmp18310);
__m512 tmp18331 = _mm512_unpacklo_ps(tmp18306, in2525);
__m512 tmp18332 = _mm512_unpackhi_ps(tmp18306, in2525);
__m512 tmp18333 = _mm512_unpacklo_ps(tmp18309, tmp18311);
__m512 tmp18334 = _mm512_unpackhi_ps(tmp18309, tmp18311);
__m512 tmp18335 = _mm512_unpacklo_ps(in2526, tmp18314);
__m512 tmp18336 = _mm512_unpackhi_ps(in2526, tmp18314);
__m512 tmp18337 = _mm512_unpacklo_ps(tmp18315, tmp18317);
__m512 tmp18338 = _mm512_unpackhi_ps(tmp18315, tmp18317);
__m512 tmp18339 = _mm512_unpacklo_ps(tmp18313, in2528);
__m512 tmp18340 = _mm512_unpackhi_ps(tmp18313, in2528);
__m512 tmp18341 = _mm512_unpacklo_ps(tmp18316, tmp18318);
__m512 tmp18342 = _mm512_unpackhi_ps(tmp18316, tmp18318);
__m512 tmp18343 = _mm512_shuffle_ps(tmp18327, tmp18329, 68);
__m512 tmp18344 = _mm512_shuffle_ps(tmp18327, tmp18329, 238);
__m512 tmp18345 = _mm512_shuffle_ps(tmp18328, tmp18330, 68);
__m512 tmp18346 = _mm512_shuffle_ps(tmp18328, tmp18330, 238);
__m512 tmp18347 = _mm512_shuffle_ps(tmp18331, tmp18333, 68);
__m512 tmp18348 = _mm512_shuffle_ps(tmp18331, tmp18333, 238);
__m512 tmp18349 = _mm512_shuffle_ps(tmp18332, tmp18334, 68);
__m512 tmp18350 = _mm512_shuffle_ps(tmp18332, tmp18334, 238);
__m512 tmp18351 = _mm512_shuffle_ps(tmp18335, tmp18337, 68);
__m512 tmp18352 = _mm512_shuffle_ps(tmp18335, tmp18337, 238);
__m512 tmp18353 = _mm512_shuffle_ps(tmp18336, tmp18338, 68);
__m512 tmp18354 = _mm512_shuffle_ps(tmp18336, tmp18338, 238);
__m512 tmp18355 = _mm512_shuffle_ps(tmp18339, tmp18341, 68);
__m512 tmp18356 = _mm512_shuffle_ps(tmp18339, tmp18341, 238);
__m512 tmp18357 = _mm512_shuffle_ps(tmp18340, tmp18342, 68);
__m512 tmp18358 = _mm512_shuffle_ps(tmp18340, tmp18342, 238);
__m512 tmp18359 = _mm512_shuffle_f32x4(tmp18343, tmp18347, 136);
__m512 tmp18360 = _mm512_shuffle_f32x4(tmp18343, tmp18347, 221);
__m512 tmp18361 = _mm512_shuffle_f32x4(tmp18344, tmp18348, 136);
__m512 tmp18362 = _mm512_shuffle_f32x4(tmp18344, tmp18348, 221);
__m512 tmp18363 = _mm512_shuffle_f32x4(tmp18345, tmp18349, 136);
__m512 tmp18364 = _mm512_shuffle_f32x4(tmp18345, tmp18349, 221);
__m512 tmp18365 = _mm512_shuffle_f32x4(tmp18346, tmp18350, 136);
__m512 tmp18366 = _mm512_shuffle_f32x4(tmp18346, tmp18350, 221);
__m512 tmp18367 = _mm512_shuffle_f32x4(tmp18351, tmp18355, 136);
__m512 tmp18368 = _mm512_shuffle_f32x4(tmp18351, tmp18355, 221);
__m512 tmp18369 = _mm512_shuffle_f32x4(tmp18352, tmp18356, 136);
__m512 tmp18370 = _mm512_shuffle_f32x4(tmp18352, tmp18356, 221);
__m512 tmp18371 = _mm512_shuffle_f32x4(tmp18353, tmp18357, 136);
__m512 tmp18372 = _mm512_shuffle_f32x4(tmp18353, tmp18357, 221);
__m512 tmp18373 = _mm512_shuffle_f32x4(tmp18354, tmp18358, 136);
__m512 tmp18374 = _mm512_shuffle_f32x4(tmp18354, tmp18358, 221);
in2523 = _mm512_shuffle_f32x4(tmp18359, tmp18367, 136);
in2526 = _mm512_shuffle_f32x4(tmp18359, tmp18367, 221);
tmp18307 = _mm512_shuffle_f32x4(tmp18361, tmp18369, 136);
tmp18314 = _mm512_shuffle_f32x4(tmp18361, tmp18369, 221);
tmp18308 = _mm512_shuffle_f32x4(tmp18363, tmp18371, 136);
tmp18315 = _mm512_shuffle_f32x4(tmp18363, tmp18371, 221);
tmp18310 = _mm512_shuffle_f32x4(tmp18365, tmp18373, 136);
tmp18317 = _mm512_shuffle_f32x4(tmp18365, tmp18373, 221);
tmp18306 = _mm512_shuffle_f32x4(tmp18360, tmp18368, 136);
tmp18313 = _mm512_shuffle_f32x4(tmp18360, tmp18368, 221);
in2525 = _mm512_shuffle_f32x4(tmp18362, tmp18370, 136);
in2528 = _mm512_shuffle_f32x4(tmp18362, tmp18370, 221);
tmp18309 = _mm512_shuffle_f32x4(tmp18364, tmp18372, 136);
tmp18316 = _mm512_shuffle_f32x4(tmp18364, tmp18372, 221);
tmp18311 = _mm512_shuffle_f32x4(tmp18366, tmp18374, 136);
tmp18318 = _mm512_shuffle_f32x4(tmp18366, tmp18374, 221);
__m512 tmp18319 = _mm512_add_ps(tmp18307, in2525);
__m512 tmp18323 = _mm512_add_ps(tmp18314, in2528);
__m512 tmp18320 = _mm512_sub_ps(tmp18306, tmp18308);
__m512 tmp18324 = _mm512_sub_ps(tmp18313, tmp18315);
__m512 tmp18321 = _mm512_add_ps(tmp18308, tmp18309);
__m512 tmp18325 = _mm512_add_ps(tmp18315, tmp18316);
in2523 = _mm512_sub_ps(in2523, tmp18309);
in2526 = _mm512_sub_ps(in2526, tmp18316);
tmp18319 = _mm512_fmadd_ps(tmp18310, _mm512_set1_ps(-4.25e+00f), tmp18319);
tmp18323 = _mm512_fmadd_ps(tmp18317, _mm512_set1_ps(-4.25e+00f), tmp18323);
tmp18321 = _mm512_fmadd_ps(tmp18306, _mm512_set1_ps(-4.25e+00f), tmp18321);
tmp18325 = _mm512_fmadd_ps(tmp18313, _mm512_set1_ps(-4.25e+00f), tmp18325);
in2523 = _mm512_fmadd_ps(tmp18320, _mm512_set1_ps(5.25e+00f), in2523);
in2526 = _mm512_fmadd_ps(tmp18324, _mm512_set1_ps(5.25e+00f), in2526);
tmp18320 = _mm512_fmadd_ps(tmp18308, _mm512_set1_ps(2.5e-01f), tmp18309);
tmp18324 = _mm512_fmadd_ps(tmp18315, _mm512_set1_ps(2.5e-01f), tmp18316);
tmp18308 = _mm512_fmadd_ps(tmp18308, _mm512_set1_ps(4e+00f), tmp18309);
tmp18315 = _mm512_fmadd_ps(tmp18315, _mm512_set1_ps(4e+00f), tmp18316);
__m512 tmp18322 = _mm512_sub_ps(tmp18321, tmp18319);
__m512 tmp18326 = _mm512_sub_ps(tmp18325, tmp18323);
tmp18321 = _mm512_add_ps(tmp18319, tmp18321);
tmp18325 = _mm512_add_ps(tmp18323, tmp18325);
tmp18319 = _mm512_fmadd_ps(tmp18307, _mm512_set1_ps(2.5e-01f), in2525);
tmp18323 = _mm512_fmadd_ps(tmp18314, _mm512_set1_ps(2.5e-01f), in2528);
tmp18320 = _mm512_fmadd_ps(tmp18306, _mm512_set1_ps(-1.25e+00f), tmp18320);
tmp18324 = _mm512_fmadd_ps(tmp18313, _mm512_set1_ps(-1.25e+00f), tmp18324);
tmp18306 = _mm512_fmadd_ps(tmp18306, _mm512_set1_ps(-5e+00f), tmp18308);
tmp18313 = _mm512_fmadd_ps(tmp18313, _mm512_set1_ps(-5e+00f), tmp18315);
tmp18319 = _mm512_fmadd_ps(tmp18310, _mm512_set1_ps(-1.25e+00f), tmp18319);
tmp18323 = _mm512_fmadd_ps(tmp18317, _mm512_set1_ps(-1.25e+00f), tmp18323);
tmp18309 = _mm512_fmadd_ps(tmp18319, _mm512_set1_ps(2e+00f), tmp18320);
tmp18316 = _mm512_fmadd_ps(tmp18323, _mm512_set1_ps(2e+00f), tmp18324);
tmp18320 = _mm512_fnmadd_ps(tmp18319, _mm512_set1_ps(2e+00f), tmp18320);
tmp18324 = _mm512_fnmadd_ps(tmp18323, _mm512_set1_ps(2e+00f), tmp18324);
tmp18319 = _mm512_fmadd_ps(in2525, _mm512_set1_ps(2.5e-01f), tmp18307);
tmp18323 = _mm512_fmadd_ps(in2528, _mm512_set1_ps(2.5e-01f), tmp18314);
tmp18307 = _mm512_sub_ps(tmp18311, tmp18307);
tmp18314 = _mm512_sub_ps(tmp18318, tmp18314);
tmp18319 = _mm512_fmadd_ps(tmp18310, _mm512_set1_ps(-1.25e+00f), tmp18319);
tmp18323 = _mm512_fmadd_ps(tmp18317, _mm512_set1_ps(-1.25e+00f), tmp18323);
tmp18310 = _mm512_sub_ps(tmp18310, in2525);
tmp18317 = _mm512_sub_ps(tmp18317, in2528);
tmp18310 = _mm512_fmadd_ps(tmp18310, _mm512_set1_ps(5.25e+00f), tmp18307);
tmp18317 = _mm512_fmadd_ps(tmp18317, _mm512_set1_ps(5.25e+00f), tmp18314);
tmp18308 = _mm512_fmadd_ps(tmp18319, _mm512_set1_ps(2e+00f), tmp18306);
tmp18315 = _mm512_fmadd_ps(tmp18323, _mm512_set1_ps(2e+00f), tmp18313);
tmp18306 = _mm512_fnmadd_ps(tmp18319, _mm512_set1_ps(2e+00f), tmp18306);
tmp18313 = _mm512_fnmadd_ps(tmp18323, _mm512_set1_ps(2e+00f), tmp18313);
__m512 out2335 = _mm512_shuffle_f32x4(in2523, tmp18321, 68);
__m512 out2343 = _mm512_shuffle_f32x4(in2523, tmp18321, 238);
__m512 out2336 = _mm512_shuffle_f32x4(tmp18322, tmp18309, 68);
__m512 out2344 = _mm512_shuffle_f32x4(tmp18322, tmp18309, 238);
__m512 out2337 = _mm512_shuffle_f32x4(tmp18320, tmp18308, 68);
__m512 out2345 = _mm512_shuffle_f32x4(tmp18320, tmp18308, 238);
__m512 out2338 = _mm512_shuffle_f32x4(tmp18306, tmp18310, 68);
__m512 out2346 = _mm512_shuffle_f32x4(tmp18306, tmp18310, 238);
__m512 out2339 = _mm512_shuffle_f32x4(in2526, tmp18325, 68);
__m512 out2347 = _mm512_shuffle_f32x4(in2526, tmp18325, 238);
__m512 out2340 = _mm512_shuffle_f32x4(tmp18326, tmp18316, 68);
__m512 out2348 = _mm512_shuffle_f32x4(tmp18326, tmp18316, 238);
__m512 out2341 = _mm512_shuffle_f32x4(tmp18324, tmp18315, 68);
__m512 out2349 = _mm512_shuffle_f32x4(tmp18324, tmp18315, 238);
__m512 out2342 = _mm512_shuffle_f32x4(tmp18313, tmp18317, 68);
__m512 out2350 = _mm512_shuffle_f32x4(tmp18313, tmp18317, 238);
_mm512_storeu_ps(dfPtr14+0+589824*i63+98304*j55+24576*s55+768*k167, out2335);
_mm512_storeu_ps(dfPtr14+128+589824*i63+98304*j55+24576*s55+768*k167, out2343);
_mm512_storeu_ps(dfPtr14+64+589824*i63+98304*j55+24576*s55+768*k167, out2339);
_mm512_storeu_ps(dfPtr14+192+589824*i63+98304*j55+24576*s55+768*k167, out2347);
_mm512_storeu_ps(dfPtr14+147456+589824*i63+98304*j55+24576*s55+768*k167, out2336);
_mm512_storeu_ps(dfPtr14+147584+589824*i63+98304*j55+24576*s55+768*k167, out2344);
_mm512_storeu_ps(dfPtr14+147520+589824*i63+98304*j55+24576*s55+768*k167, out2340);
_mm512_storeu_ps(dfPtr14+147648+589824*i63+98304*j55+24576*s55+768*k167, out2348);
_mm512_storeu_ps(dfPtr14+294912+589824*i63+98304*j55+24576*s55+768*k167, out2337);
_mm512_storeu_ps(dfPtr14+295040+589824*i63+98304*j55+24576*s55+768*k167, out2345);
_mm512_storeu_ps(dfPtr14+294976+589824*i63+98304*j55+24576*s55+768*k167, out2341);
_mm512_storeu_ps(dfPtr14+295104+589824*i63+98304*j55+24576*s55+768*k167, out2349);
_mm512_storeu_ps(dfPtr14+442368+589824*i63+98304*j55+24576*s55+768*k167, out2338);
_mm512_storeu_ps(dfPtr14+442496+589824*i63+98304*j55+24576*s55+768*k167, out2346);
_mm512_storeu_ps(dfPtr14+442432+589824*i63+98304*j55+24576*s55+768*k167, out2342);
_mm512_storeu_ps(dfPtr14+442560+589824*i63+98304*j55+24576*s55+768*k167, out2350);
__m512 dat2389 = _mm512_maskz_loadu_ps(511, datPtr34+856+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 dat2390 = _mm512_maskz_loadu_ps(8191, datPtr34+1668+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512i pm247 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in2529 = _mm512_permutexvar_ps(pm247, dat2389);
__m512i pm248 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in2532 = _mm512_permutexvar_ps(pm248, dat2390);
__m512 dat2391 = _mm512_maskz_loadu_ps(511, datPtr34+912+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 dat2392 = _mm512_maskz_loadu_ps(8191, datPtr34+1724+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 in2530 = _mm512_permutexvar_ps(pm247, dat2391);
__m512 in2533 = _mm512_permutexvar_ps(pm248, dat2392);
__m512 dat2393 = _mm512_maskz_loadu_ps(511, datPtr34+968+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 dat2394 = _mm512_maskz_loadu_ps(8191, datPtr34+1780+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 in2531 = _mm512_permutexvar_ps(pm247, dat2393);
__m512 in2534 = _mm512_permutexvar_ps(pm248, dat2394);
__m512 tmp18375 = in2530;
__m512 tmp18382 = in2533;
__m512 tmp18376 = _mm512_sub_ps(_mm512_setzero_ps(), in2531);
__m512 tmp18383 = _mm512_sub_ps(_mm512_setzero_ps(), in2534);
__m512 tmp18377 = in2531;
__m512 tmp18384 = in2534;
in2529 = in2529;
in2532 = in2532;
tmp18375 = tmp18375;
tmp18382 = tmp18382;
tmp18377 = tmp18377;
tmp18384 = tmp18384;
in2529 = _mm512_fmadd_ps(tmp18376, _mm512_set1_ps(5.25e+00f), in2529);
in2532 = _mm512_fmadd_ps(tmp18383, _mm512_set1_ps(5.25e+00f), in2532);
tmp18376 = _mm512_mul_ps(in2531, _mm512_set1_ps(2.5e-01f));
tmp18383 = _mm512_mul_ps(in2534, _mm512_set1_ps(2.5e-01f));
in2531 = _mm512_mul_ps(in2531, _mm512_set1_ps(4e+00f));
in2534 = _mm512_mul_ps(in2534, _mm512_set1_ps(4e+00f));
__m512 tmp18378 = _mm512_sub_ps(tmp18377, tmp18375);
__m512 tmp18385 = _mm512_sub_ps(tmp18384, tmp18382);
tmp18377 = _mm512_add_ps(tmp18375, tmp18377);
tmp18384 = _mm512_add_ps(tmp18382, tmp18384);
tmp18375 = _mm512_mul_ps(in2530, _mm512_set1_ps(2.5e-01f));
tmp18382 = _mm512_mul_ps(in2533, _mm512_set1_ps(2.5e-01f));
tmp18376 = tmp18376;
tmp18383 = tmp18383;
__m512 tmp18379 = in2531;
__m512 tmp18386 = in2534;
tmp18375 = tmp18375;
tmp18382 = tmp18382;
__m512 tmp18380 = _mm512_fmadd_ps(tmp18375, _mm512_set1_ps(2e+00f), tmp18376);
__m512 tmp18387 = _mm512_fmadd_ps(tmp18382, _mm512_set1_ps(2e+00f), tmp18383);
tmp18376 = _mm512_fnmadd_ps(tmp18375, _mm512_set1_ps(2e+00f), tmp18376);
tmp18383 = _mm512_fnmadd_ps(tmp18382, _mm512_set1_ps(2e+00f), tmp18383);
tmp18375 = in2530;
tmp18382 = in2533;
in2530 = _mm512_sub_ps(_mm512_setzero_ps(), in2530);
in2533 = _mm512_sub_ps(_mm512_setzero_ps(), in2533);
tmp18375 = tmp18375;
tmp18382 = tmp18382;
__m512 tmp18381 = in2530;
__m512 tmp18388 = in2533;
in2531 = _mm512_fmadd_ps(tmp18375, _mm512_set1_ps(2e+00f), tmp18379);
in2534 = _mm512_fmadd_ps(tmp18382, _mm512_set1_ps(2e+00f), tmp18386);
tmp18379 = _mm512_fnmadd_ps(tmp18375, _mm512_set1_ps(2e+00f), tmp18379);
tmp18386 = _mm512_fnmadd_ps(tmp18382, _mm512_set1_ps(2e+00f), tmp18386);
__m512 tmp18397 = _mm512_unpacklo_ps(in2529, tmp18377);
__m512 tmp18398 = _mm512_unpackhi_ps(in2529, tmp18377);
__m512 tmp18399 = _mm512_unpacklo_ps(tmp18378, tmp18380);
__m512 tmp18400 = _mm512_unpackhi_ps(tmp18378, tmp18380);
__m512 tmp18401 = _mm512_unpacklo_ps(tmp18376, in2531);
__m512 tmp18402 = _mm512_unpackhi_ps(tmp18376, in2531);
__m512 tmp18403 = _mm512_unpacklo_ps(tmp18379, tmp18381);
__m512 tmp18404 = _mm512_unpackhi_ps(tmp18379, tmp18381);
__m512 tmp18405 = _mm512_unpacklo_ps(in2532, tmp18384);
__m512 tmp18406 = _mm512_unpackhi_ps(in2532, tmp18384);
__m512 tmp18407 = _mm512_unpacklo_ps(tmp18385, tmp18387);
__m512 tmp18408 = _mm512_unpackhi_ps(tmp18385, tmp18387);
__m512 tmp18409 = _mm512_unpacklo_ps(tmp18383, in2534);
__m512 tmp18410 = _mm512_unpackhi_ps(tmp18383, in2534);
__m512 tmp18411 = _mm512_unpacklo_ps(tmp18386, tmp18388);
__m512 tmp18412 = _mm512_unpackhi_ps(tmp18386, tmp18388);
__m512 tmp18413 = _mm512_shuffle_ps(tmp18397, tmp18399, 68);
__m512 tmp18414 = _mm512_shuffle_ps(tmp18397, tmp18399, 238);
__m512 tmp18415 = _mm512_shuffle_ps(tmp18398, tmp18400, 68);
__m512 tmp18416 = _mm512_shuffle_ps(tmp18398, tmp18400, 238);
__m512 tmp18417 = _mm512_shuffle_ps(tmp18401, tmp18403, 68);
__m512 tmp18418 = _mm512_shuffle_ps(tmp18401, tmp18403, 238);
__m512 tmp18419 = _mm512_shuffle_ps(tmp18402, tmp18404, 68);
__m512 tmp18420 = _mm512_shuffle_ps(tmp18402, tmp18404, 238);
__m512 tmp18421 = _mm512_shuffle_ps(tmp18405, tmp18407, 68);
__m512 tmp18422 = _mm512_shuffle_ps(tmp18405, tmp18407, 238);
__m512 tmp18423 = _mm512_shuffle_ps(tmp18406, tmp18408, 68);
__m512 tmp18424 = _mm512_shuffle_ps(tmp18406, tmp18408, 238);
__m512 tmp18425 = _mm512_shuffle_ps(tmp18409, tmp18411, 68);
__m512 tmp18426 = _mm512_shuffle_ps(tmp18409, tmp18411, 238);
__m512 tmp18427 = _mm512_shuffle_ps(tmp18410, tmp18412, 68);
__m512 tmp18428 = _mm512_shuffle_ps(tmp18410, tmp18412, 238);
__m512 tmp18429 = _mm512_shuffle_f32x4(tmp18413, tmp18417, 136);
__m512 tmp18430 = _mm512_shuffle_f32x4(tmp18413, tmp18417, 221);
__m512 tmp18431 = _mm512_shuffle_f32x4(tmp18414, tmp18418, 136);
__m512 tmp18432 = _mm512_shuffle_f32x4(tmp18414, tmp18418, 221);
__m512 tmp18433 = _mm512_shuffle_f32x4(tmp18415, tmp18419, 136);
__m512 tmp18434 = _mm512_shuffle_f32x4(tmp18415, tmp18419, 221);
__m512 tmp18435 = _mm512_shuffle_f32x4(tmp18416, tmp18420, 136);
__m512 tmp18436 = _mm512_shuffle_f32x4(tmp18416, tmp18420, 221);
__m512 tmp18437 = _mm512_shuffle_f32x4(tmp18421, tmp18425, 136);
__m512 tmp18438 = _mm512_shuffle_f32x4(tmp18421, tmp18425, 221);
__m512 tmp18439 = _mm512_shuffle_f32x4(tmp18422, tmp18426, 136);
__m512 tmp18440 = _mm512_shuffle_f32x4(tmp18422, tmp18426, 221);
__m512 tmp18441 = _mm512_shuffle_f32x4(tmp18423, tmp18427, 136);
__m512 tmp18442 = _mm512_shuffle_f32x4(tmp18423, tmp18427, 221);
__m512 tmp18443 = _mm512_shuffle_f32x4(tmp18424, tmp18428, 136);
__m512 tmp18444 = _mm512_shuffle_f32x4(tmp18424, tmp18428, 221);
in2529 = _mm512_shuffle_f32x4(tmp18429, tmp18437, 136);
in2532 = _mm512_shuffle_f32x4(tmp18429, tmp18437, 221);
tmp18377 = _mm512_shuffle_f32x4(tmp18431, tmp18439, 136);
tmp18384 = _mm512_shuffle_f32x4(tmp18431, tmp18439, 221);
tmp18378 = _mm512_shuffle_f32x4(tmp18433, tmp18441, 136);
tmp18385 = _mm512_shuffle_f32x4(tmp18433, tmp18441, 221);
tmp18380 = _mm512_shuffle_f32x4(tmp18435, tmp18443, 136);
tmp18387 = _mm512_shuffle_f32x4(tmp18435, tmp18443, 221);
tmp18376 = _mm512_shuffle_f32x4(tmp18430, tmp18438, 136);
tmp18383 = _mm512_shuffle_f32x4(tmp18430, tmp18438, 221);
in2531 = _mm512_shuffle_f32x4(tmp18432, tmp18440, 136);
in2534 = _mm512_shuffle_f32x4(tmp18432, tmp18440, 221);
tmp18379 = _mm512_shuffle_f32x4(tmp18434, tmp18442, 136);
tmp18386 = _mm512_shuffle_f32x4(tmp18434, tmp18442, 221);
tmp18381 = _mm512_shuffle_f32x4(tmp18436, tmp18444, 136);
tmp18388 = _mm512_shuffle_f32x4(tmp18436, tmp18444, 221);
__m512 tmp18389 = _mm512_add_ps(tmp18377, in2531);
__m512 tmp18393 = _mm512_add_ps(tmp18384, in2534);
__m512 tmp18390 = _mm512_sub_ps(tmp18376, tmp18378);
__m512 tmp18394 = _mm512_sub_ps(tmp18383, tmp18385);
__m512 tmp18391 = _mm512_add_ps(tmp18378, tmp18379);
__m512 tmp18395 = _mm512_add_ps(tmp18385, tmp18386);
in2529 = _mm512_sub_ps(in2529, tmp18379);
in2532 = _mm512_sub_ps(in2532, tmp18386);
tmp18389 = _mm512_fmadd_ps(tmp18380, _mm512_set1_ps(-4.25e+00f), tmp18389);
tmp18393 = _mm512_fmadd_ps(tmp18387, _mm512_set1_ps(-4.25e+00f), tmp18393);
tmp18391 = _mm512_fmadd_ps(tmp18376, _mm512_set1_ps(-4.25e+00f), tmp18391);
tmp18395 = _mm512_fmadd_ps(tmp18383, _mm512_set1_ps(-4.25e+00f), tmp18395);
in2529 = _mm512_fmadd_ps(tmp18390, _mm512_set1_ps(5.25e+00f), in2529);
in2532 = _mm512_fmadd_ps(tmp18394, _mm512_set1_ps(5.25e+00f), in2532);
tmp18390 = _mm512_fmadd_ps(tmp18378, _mm512_set1_ps(2.5e-01f), tmp18379);
tmp18394 = _mm512_fmadd_ps(tmp18385, _mm512_set1_ps(2.5e-01f), tmp18386);
tmp18378 = _mm512_fmadd_ps(tmp18378, _mm512_set1_ps(4e+00f), tmp18379);
tmp18385 = _mm512_fmadd_ps(tmp18385, _mm512_set1_ps(4e+00f), tmp18386);
__m512 tmp18392 = _mm512_sub_ps(tmp18391, tmp18389);
__m512 tmp18396 = _mm512_sub_ps(tmp18395, tmp18393);
tmp18391 = _mm512_add_ps(tmp18389, tmp18391);
tmp18395 = _mm512_add_ps(tmp18393, tmp18395);
tmp18389 = _mm512_fmadd_ps(tmp18377, _mm512_set1_ps(2.5e-01f), in2531);
tmp18393 = _mm512_fmadd_ps(tmp18384, _mm512_set1_ps(2.5e-01f), in2534);
tmp18390 = _mm512_fmadd_ps(tmp18376, _mm512_set1_ps(-1.25e+00f), tmp18390);
tmp18394 = _mm512_fmadd_ps(tmp18383, _mm512_set1_ps(-1.25e+00f), tmp18394);
tmp18376 = _mm512_fmadd_ps(tmp18376, _mm512_set1_ps(-5e+00f), tmp18378);
tmp18383 = _mm512_fmadd_ps(tmp18383, _mm512_set1_ps(-5e+00f), tmp18385);
tmp18389 = _mm512_fmadd_ps(tmp18380, _mm512_set1_ps(-1.25e+00f), tmp18389);
tmp18393 = _mm512_fmadd_ps(tmp18387, _mm512_set1_ps(-1.25e+00f), tmp18393);
tmp18379 = _mm512_fmadd_ps(tmp18389, _mm512_set1_ps(2e+00f), tmp18390);
tmp18386 = _mm512_fmadd_ps(tmp18393, _mm512_set1_ps(2e+00f), tmp18394);
tmp18390 = _mm512_fnmadd_ps(tmp18389, _mm512_set1_ps(2e+00f), tmp18390);
tmp18394 = _mm512_fnmadd_ps(tmp18393, _mm512_set1_ps(2e+00f), tmp18394);
tmp18389 = _mm512_fmadd_ps(in2531, _mm512_set1_ps(2.5e-01f), tmp18377);
tmp18393 = _mm512_fmadd_ps(in2534, _mm512_set1_ps(2.5e-01f), tmp18384);
tmp18377 = _mm512_sub_ps(tmp18381, tmp18377);
tmp18384 = _mm512_sub_ps(tmp18388, tmp18384);
tmp18389 = _mm512_fmadd_ps(tmp18380, _mm512_set1_ps(-1.25e+00f), tmp18389);
tmp18393 = _mm512_fmadd_ps(tmp18387, _mm512_set1_ps(-1.25e+00f), tmp18393);
tmp18380 = _mm512_sub_ps(tmp18380, in2531);
tmp18387 = _mm512_sub_ps(tmp18387, in2534);
tmp18380 = _mm512_fmadd_ps(tmp18380, _mm512_set1_ps(5.25e+00f), tmp18377);
tmp18387 = _mm512_fmadd_ps(tmp18387, _mm512_set1_ps(5.25e+00f), tmp18384);
tmp18378 = _mm512_fmadd_ps(tmp18389, _mm512_set1_ps(2e+00f), tmp18376);
tmp18385 = _mm512_fmadd_ps(tmp18393, _mm512_set1_ps(2e+00f), tmp18383);
tmp18376 = _mm512_fnmadd_ps(tmp18389, _mm512_set1_ps(2e+00f), tmp18376);
tmp18383 = _mm512_fnmadd_ps(tmp18393, _mm512_set1_ps(2e+00f), tmp18383);
__m512 out2351 = _mm512_shuffle_f32x4(in2529, tmp18391, 68);
__m512 out2359 = _mm512_shuffle_f32x4(in2529, tmp18391, 238);
__m512 out2352 = _mm512_shuffle_f32x4(tmp18392, tmp18379, 68);
__m512 out2360 = _mm512_shuffle_f32x4(tmp18392, tmp18379, 238);
__m512 out2353 = _mm512_shuffle_f32x4(tmp18390, tmp18378, 68);
__m512 out2361 = _mm512_shuffle_f32x4(tmp18390, tmp18378, 238);
__m512 out2354 = _mm512_shuffle_f32x4(tmp18376, tmp18380, 68);
__m512 out2362 = _mm512_shuffle_f32x4(tmp18376, tmp18380, 238);
__m512 out2355 = _mm512_shuffle_f32x4(in2532, tmp18395, 68);
__m512 out2363 = _mm512_shuffle_f32x4(in2532, tmp18395, 238);
__m512 out2356 = _mm512_shuffle_f32x4(tmp18396, tmp18386, 68);
__m512 out2364 = _mm512_shuffle_f32x4(tmp18396, tmp18386, 238);
__m512 out2357 = _mm512_shuffle_f32x4(tmp18394, tmp18385, 68);
__m512 out2365 = _mm512_shuffle_f32x4(tmp18394, tmp18385, 238);
__m512 out2358 = _mm512_shuffle_f32x4(tmp18383, tmp18387, 68);
__m512 out2366 = _mm512_shuffle_f32x4(tmp18383, tmp18387, 238);
_mm512_storeu_ps(dfPtr14+256+589824*i63+98304*j55+24576*s55+768*k167, out2351);
_mm512_storeu_ps(dfPtr14+384+589824*i63+98304*j55+24576*s55+768*k167, out2359);
_mm512_storeu_ps(dfPtr14+320+589824*i63+98304*j55+24576*s55+768*k167, out2355);
_mm512_storeu_ps(dfPtr14+448+589824*i63+98304*j55+24576*s55+768*k167, out2363);
_mm512_storeu_ps(dfPtr14+147712+589824*i63+98304*j55+24576*s55+768*k167, out2352);
_mm512_storeu_ps(dfPtr14+147840+589824*i63+98304*j55+24576*s55+768*k167, out2360);
_mm512_storeu_ps(dfPtr14+147776+589824*i63+98304*j55+24576*s55+768*k167, out2356);
_mm512_storeu_ps(dfPtr14+147904+589824*i63+98304*j55+24576*s55+768*k167, out2364);
_mm512_storeu_ps(dfPtr14+295168+589824*i63+98304*j55+24576*s55+768*k167, out2353);
_mm512_storeu_ps(dfPtr14+295296+589824*i63+98304*j55+24576*s55+768*k167, out2361);
_mm512_storeu_ps(dfPtr14+295232+589824*i63+98304*j55+24576*s55+768*k167, out2357);
_mm512_storeu_ps(dfPtr14+295360+589824*i63+98304*j55+24576*s55+768*k167, out2365);
_mm512_storeu_ps(dfPtr14+442624+589824*i63+98304*j55+24576*s55+768*k167, out2354);
_mm512_storeu_ps(dfPtr14+442752+589824*i63+98304*j55+24576*s55+768*k167, out2362);
_mm512_storeu_ps(dfPtr14+442688+589824*i63+98304*j55+24576*s55+768*k167, out2358);
_mm512_storeu_ps(dfPtr14+442816+589824*i63+98304*j55+24576*s55+768*k167, out2366);
__m512 dat2395 = _mm512_maskz_loadu_ps(7, datPtr34+1712+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 dat2396 = _mm512_maskz_loadu_ps(16383, datPtr34+2500+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512i pm249 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 15, 2, 1, 0);
__m512 in2535 = _mm512_permutex2var_ps(dat2395, pm249, dat2396);
__m512i pm250 = _mm512_set_epi32(15, 15, 15, 15, 15, 13, 12, 11, 12, 11, 10, 9, 8, 7, 6, 5);
__m512 in2538 = _mm512_permutexvar_ps(pm250, dat2396);
__m512 dat2397 = _mm512_maskz_loadu_ps(7, datPtr34+1768+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 dat2398 = _mm512_maskz_loadu_ps(16383, datPtr34+2556+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 in2536 = _mm512_permutex2var_ps(dat2397, pm249, dat2398);
__m512 in2539 = _mm512_permutexvar_ps(pm250, dat2398);
__m512 dat2399 = _mm512_maskz_loadu_ps(7, datPtr34+1824+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 dat2400 = _mm512_maskz_loadu_ps(16383, datPtr34+2612+212992*i63+56*h56+4*w72+106496*s55+3328*k167);
__m512 in2537 = _mm512_permutex2var_ps(dat2399, pm249, dat2400);
__m512 in2540 = _mm512_permutexvar_ps(pm250, dat2400);
__m512 tmp18445 = in2536;
__m512 tmp18452 = in2539;
__m512 tmp18446 = _mm512_sub_ps(_mm512_setzero_ps(), in2537);
__m512 tmp18453 = _mm512_sub_ps(_mm512_setzero_ps(), in2540);
__m512 tmp18447 = in2537;
__m512 tmp18454 = in2540;
in2535 = in2535;
in2538 = in2538;
tmp18445 = tmp18445;
tmp18452 = tmp18452;
tmp18447 = tmp18447;
tmp18454 = tmp18454;
in2535 = _mm512_fmadd_ps(tmp18446, _mm512_set1_ps(5.25e+00f), in2535);
in2538 = _mm512_fmadd_ps(tmp18453, _mm512_set1_ps(5.25e+00f), in2538);
tmp18446 = _mm512_mul_ps(in2537, _mm512_set1_ps(2.5e-01f));
tmp18453 = _mm512_mul_ps(in2540, _mm512_set1_ps(2.5e-01f));
in2537 = _mm512_mul_ps(in2537, _mm512_set1_ps(4e+00f));
in2540 = _mm512_mul_ps(in2540, _mm512_set1_ps(4e+00f));
__m512 tmp18448 = _mm512_sub_ps(tmp18447, tmp18445);
__m512 tmp18455 = _mm512_sub_ps(tmp18454, tmp18452);
tmp18447 = _mm512_add_ps(tmp18445, tmp18447);
tmp18454 = _mm512_add_ps(tmp18452, tmp18454);
tmp18445 = _mm512_mul_ps(in2536, _mm512_set1_ps(2.5e-01f));
tmp18452 = _mm512_mul_ps(in2539, _mm512_set1_ps(2.5e-01f));
tmp18446 = tmp18446;
tmp18453 = tmp18453;
__m512 tmp18449 = in2537;
__m512 tmp18456 = in2540;
tmp18445 = tmp18445;
tmp18452 = tmp18452;
__m512 tmp18450 = _mm512_fmadd_ps(tmp18445, _mm512_set1_ps(2e+00f), tmp18446);
__m512 tmp18457 = _mm512_fmadd_ps(tmp18452, _mm512_set1_ps(2e+00f), tmp18453);
tmp18446 = _mm512_fnmadd_ps(tmp18445, _mm512_set1_ps(2e+00f), tmp18446);
tmp18453 = _mm512_fnmadd_ps(tmp18452, _mm512_set1_ps(2e+00f), tmp18453);
tmp18445 = in2536;
tmp18452 = in2539;
in2536 = _mm512_sub_ps(_mm512_setzero_ps(), in2536);
in2539 = _mm512_sub_ps(_mm512_setzero_ps(), in2539);
tmp18445 = tmp18445;
tmp18452 = tmp18452;
__m512 tmp18451 = in2536;
__m512 tmp18458 = in2539;
in2537 = _mm512_fmadd_ps(tmp18445, _mm512_set1_ps(2e+00f), tmp18449);
in2540 = _mm512_fmadd_ps(tmp18452, _mm512_set1_ps(2e+00f), tmp18456);
tmp18449 = _mm512_fnmadd_ps(tmp18445, _mm512_set1_ps(2e+00f), tmp18449);
tmp18456 = _mm512_fnmadd_ps(tmp18452, _mm512_set1_ps(2e+00f), tmp18456);
__m512 tmp18467 = _mm512_unpacklo_ps(in2535, tmp18447);
__m512 tmp18468 = _mm512_unpackhi_ps(in2535, tmp18447);
__m512 tmp18469 = _mm512_unpacklo_ps(tmp18448, tmp18450);
__m512 tmp18470 = _mm512_unpackhi_ps(tmp18448, tmp18450);
__m512 tmp18471 = _mm512_unpacklo_ps(tmp18446, in2537);
__m512 tmp18472 = _mm512_unpackhi_ps(tmp18446, in2537);
__m512 tmp18473 = _mm512_unpacklo_ps(tmp18449, tmp18451);
__m512 tmp18474 = _mm512_unpackhi_ps(tmp18449, tmp18451);
__m512 tmp18475 = _mm512_unpacklo_ps(in2538, tmp18454);
__m512 tmp18476 = _mm512_unpackhi_ps(in2538, tmp18454);
__m512 tmp18477 = _mm512_unpacklo_ps(tmp18455, tmp18457);
__m512 tmp18478 = _mm512_unpackhi_ps(tmp18455, tmp18457);
__m512 tmp18479 = _mm512_unpacklo_ps(tmp18453, in2540);
__m512 tmp18480 = _mm512_unpackhi_ps(tmp18453, in2540);
__m512 tmp18481 = _mm512_unpacklo_ps(tmp18456, tmp18458);
__m512 tmp18482 = _mm512_unpackhi_ps(tmp18456, tmp18458);
__m512 tmp18483 = _mm512_shuffle_ps(tmp18467, tmp18469, 68);
__m512 tmp18484 = _mm512_shuffle_ps(tmp18467, tmp18469, 238);
__m512 tmp18485 = _mm512_shuffle_ps(tmp18468, tmp18470, 68);
__m512 tmp18486 = _mm512_shuffle_ps(tmp18468, tmp18470, 238);
__m512 tmp18487 = _mm512_shuffle_ps(tmp18471, tmp18473, 68);
__m512 tmp18488 = _mm512_shuffle_ps(tmp18471, tmp18473, 238);
__m512 tmp18489 = _mm512_shuffle_ps(tmp18472, tmp18474, 68);
__m512 tmp18490 = _mm512_shuffle_ps(tmp18472, tmp18474, 238);
__m512 tmp18491 = _mm512_shuffle_ps(tmp18475, tmp18477, 68);
__m512 tmp18492 = _mm512_shuffle_ps(tmp18475, tmp18477, 238);
__m512 tmp18493 = _mm512_shuffle_ps(tmp18476, tmp18478, 68);
__m512 tmp18494 = _mm512_shuffle_ps(tmp18476, tmp18478, 238);
__m512 tmp18495 = _mm512_shuffle_ps(tmp18479, tmp18481, 68);
__m512 tmp18496 = _mm512_shuffle_ps(tmp18479, tmp18481, 238);
__m512 tmp18497 = _mm512_shuffle_ps(tmp18480, tmp18482, 68);
__m512 tmp18498 = _mm512_shuffle_ps(tmp18480, tmp18482, 238);
__m512 tmp18499 = _mm512_shuffle_f32x4(tmp18483, tmp18487, 136);
__m512 tmp18500 = _mm512_shuffle_f32x4(tmp18483, tmp18487, 221);
__m512 tmp18501 = _mm512_shuffle_f32x4(tmp18484, tmp18488, 136);
__m512 tmp18502 = _mm512_shuffle_f32x4(tmp18484, tmp18488, 221);
__m512 tmp18503 = _mm512_shuffle_f32x4(tmp18485, tmp18489, 136);
__m512 tmp18504 = _mm512_shuffle_f32x4(tmp18485, tmp18489, 221);
__m512 tmp18505 = _mm512_shuffle_f32x4(tmp18486, tmp18490, 136);
__m512 tmp18506 = _mm512_shuffle_f32x4(tmp18486, tmp18490, 221);
__m512 tmp18507 = _mm512_shuffle_f32x4(tmp18491, tmp18495, 136);
__m512 tmp18508 = _mm512_shuffle_f32x4(tmp18491, tmp18495, 221);
__m512 tmp18509 = _mm512_shuffle_f32x4(tmp18492, tmp18496, 136);
__m512 tmp18510 = _mm512_shuffle_f32x4(tmp18492, tmp18496, 221);
__m512 tmp18511 = _mm512_shuffle_f32x4(tmp18493, tmp18497, 136);
__m512 tmp18512 = _mm512_shuffle_f32x4(tmp18493, tmp18497, 221);
__m512 tmp18513 = _mm512_shuffle_f32x4(tmp18494, tmp18498, 136);
__m512 tmp18514 = _mm512_shuffle_f32x4(tmp18494, tmp18498, 221);
in2535 = _mm512_shuffle_f32x4(tmp18499, tmp18507, 136);
in2538 = _mm512_shuffle_f32x4(tmp18499, tmp18507, 221);
tmp18447 = _mm512_shuffle_f32x4(tmp18501, tmp18509, 136);
tmp18454 = _mm512_shuffle_f32x4(tmp18501, tmp18509, 221);
tmp18448 = _mm512_shuffle_f32x4(tmp18503, tmp18511, 136);
tmp18455 = _mm512_shuffle_f32x4(tmp18503, tmp18511, 221);
tmp18450 = _mm512_shuffle_f32x4(tmp18505, tmp18513, 136);
tmp18457 = _mm512_shuffle_f32x4(tmp18505, tmp18513, 221);
tmp18446 = _mm512_shuffle_f32x4(tmp18500, tmp18508, 136);
tmp18453 = _mm512_shuffle_f32x4(tmp18500, tmp18508, 221);
in2537 = _mm512_shuffle_f32x4(tmp18502, tmp18510, 136);
in2540 = _mm512_shuffle_f32x4(tmp18502, tmp18510, 221);
tmp18449 = _mm512_shuffle_f32x4(tmp18504, tmp18512, 136);
tmp18456 = _mm512_shuffle_f32x4(tmp18504, tmp18512, 221);
tmp18451 = _mm512_shuffle_f32x4(tmp18506, tmp18514, 136);
tmp18458 = _mm512_shuffle_f32x4(tmp18506, tmp18514, 221);
__m512 tmp18459 = _mm512_add_ps(tmp18447, in2537);
__m512 tmp18463 = _mm512_add_ps(tmp18454, in2540);
__m512 tmp18460 = _mm512_sub_ps(tmp18446, tmp18448);
__m512 tmp18464 = _mm512_sub_ps(tmp18453, tmp18455);
__m512 tmp18461 = _mm512_add_ps(tmp18448, tmp18449);
__m512 tmp18465 = _mm512_add_ps(tmp18455, tmp18456);
in2535 = _mm512_sub_ps(in2535, tmp18449);
in2538 = _mm512_sub_ps(in2538, tmp18456);
tmp18459 = _mm512_fmadd_ps(tmp18450, _mm512_set1_ps(-4.25e+00f), tmp18459);
tmp18463 = _mm512_fmadd_ps(tmp18457, _mm512_set1_ps(-4.25e+00f), tmp18463);
tmp18461 = _mm512_fmadd_ps(tmp18446, _mm512_set1_ps(-4.25e+00f), tmp18461);
tmp18465 = _mm512_fmadd_ps(tmp18453, _mm512_set1_ps(-4.25e+00f), tmp18465);
in2535 = _mm512_fmadd_ps(tmp18460, _mm512_set1_ps(5.25e+00f), in2535);
in2538 = _mm512_fmadd_ps(tmp18464, _mm512_set1_ps(5.25e+00f), in2538);
tmp18460 = _mm512_fmadd_ps(tmp18448, _mm512_set1_ps(2.5e-01f), tmp18449);
tmp18464 = _mm512_fmadd_ps(tmp18455, _mm512_set1_ps(2.5e-01f), tmp18456);
tmp18448 = _mm512_fmadd_ps(tmp18448, _mm512_set1_ps(4e+00f), tmp18449);
tmp18455 = _mm512_fmadd_ps(tmp18455, _mm512_set1_ps(4e+00f), tmp18456);
__m512 tmp18462 = _mm512_sub_ps(tmp18461, tmp18459);
__m512 tmp18466 = _mm512_sub_ps(tmp18465, tmp18463);
tmp18461 = _mm512_add_ps(tmp18459, tmp18461);
tmp18465 = _mm512_add_ps(tmp18463, tmp18465);
tmp18459 = _mm512_fmadd_ps(tmp18447, _mm512_set1_ps(2.5e-01f), in2537);
tmp18463 = _mm512_fmadd_ps(tmp18454, _mm512_set1_ps(2.5e-01f), in2540);
tmp18460 = _mm512_fmadd_ps(tmp18446, _mm512_set1_ps(-1.25e+00f), tmp18460);
tmp18464 = _mm512_fmadd_ps(tmp18453, _mm512_set1_ps(-1.25e+00f), tmp18464);
tmp18446 = _mm512_fmadd_ps(tmp18446, _mm512_set1_ps(-5e+00f), tmp18448);
tmp18453 = _mm512_fmadd_ps(tmp18453, _mm512_set1_ps(-5e+00f), tmp18455);
tmp18459 = _mm512_fmadd_ps(tmp18450, _mm512_set1_ps(-1.25e+00f), tmp18459);
tmp18463 = _mm512_fmadd_ps(tmp18457, _mm512_set1_ps(-1.25e+00f), tmp18463);
tmp18449 = _mm512_fmadd_ps(tmp18459, _mm512_set1_ps(2e+00f), tmp18460);
tmp18456 = _mm512_fmadd_ps(tmp18463, _mm512_set1_ps(2e+00f), tmp18464);
tmp18460 = _mm512_fnmadd_ps(tmp18459, _mm512_set1_ps(2e+00f), tmp18460);
tmp18464 = _mm512_fnmadd_ps(tmp18463, _mm512_set1_ps(2e+00f), tmp18464);
tmp18459 = _mm512_fmadd_ps(in2537, _mm512_set1_ps(2.5e-01f), tmp18447);
tmp18463 = _mm512_fmadd_ps(in2540, _mm512_set1_ps(2.5e-01f), tmp18454);
tmp18447 = _mm512_sub_ps(tmp18451, tmp18447);
tmp18454 = _mm512_sub_ps(tmp18458, tmp18454);
tmp18459 = _mm512_fmadd_ps(tmp18450, _mm512_set1_ps(-1.25e+00f), tmp18459);
tmp18463 = _mm512_fmadd_ps(tmp18457, _mm512_set1_ps(-1.25e+00f), tmp18463);
tmp18450 = _mm512_sub_ps(tmp18450, in2537);
tmp18457 = _mm512_sub_ps(tmp18457, in2540);
tmp18450 = _mm512_fmadd_ps(tmp18450, _mm512_set1_ps(5.25e+00f), tmp18447);
tmp18457 = _mm512_fmadd_ps(tmp18457, _mm512_set1_ps(5.25e+00f), tmp18454);
tmp18448 = _mm512_fmadd_ps(tmp18459, _mm512_set1_ps(2e+00f), tmp18446);
tmp18455 = _mm512_fmadd_ps(tmp18463, _mm512_set1_ps(2e+00f), tmp18453);
tmp18446 = _mm512_fnmadd_ps(tmp18459, _mm512_set1_ps(2e+00f), tmp18446);
tmp18453 = _mm512_fnmadd_ps(tmp18463, _mm512_set1_ps(2e+00f), tmp18453);
__m512 out2367 = _mm512_shuffle_f32x4(in2535, tmp18461, 68);
__m512 out2375 = _mm512_shuffle_f32x4(in2535, tmp18461, 238);
__m512 out2368 = _mm512_shuffle_f32x4(tmp18462, tmp18449, 68);
__m512 out2376 = _mm512_shuffle_f32x4(tmp18462, tmp18449, 238);
__m512 out2369 = _mm512_shuffle_f32x4(tmp18460, tmp18448, 68);
__m512 out2377 = _mm512_shuffle_f32x4(tmp18460, tmp18448, 238);
__m512 out2370 = _mm512_shuffle_f32x4(tmp18446, tmp18450, 68);
__m512 out2378 = _mm512_shuffle_f32x4(tmp18446, tmp18450, 238);
__m512 out2371 = _mm512_shuffle_f32x4(in2538, tmp18465, 68);
__m512 out2379 = _mm512_shuffle_f32x4(in2538, tmp18465, 238);
__m512 out2372 = _mm512_shuffle_f32x4(tmp18466, tmp18456, 68);
__m512 out2380 = _mm512_shuffle_f32x4(tmp18466, tmp18456, 238);
__m512 out2373 = _mm512_shuffle_f32x4(tmp18464, tmp18455, 68);
__m512 out2381 = _mm512_shuffle_f32x4(tmp18464, tmp18455, 238);
__m512 out2374 = _mm512_shuffle_f32x4(tmp18453, tmp18457, 68);
__m512 out2382 = _mm512_shuffle_f32x4(tmp18453, tmp18457, 238);
_mm512_storeu_ps(dfPtr14+512+589824*i63+98304*j55+24576*s55+768*k167, out2367);
_mm512_storeu_ps(dfPtr14+640+589824*i63+98304*j55+24576*s55+768*k167, out2375);
_mm512_storeu_ps(dfPtr14+576+589824*i63+98304*j55+24576*s55+768*k167, out2371);
_mm512_storeu_ps(dfPtr14+704+589824*i63+98304*j55+24576*s55+768*k167, out2379);
_mm512_storeu_ps(dfPtr14+147968+589824*i63+98304*j55+24576*s55+768*k167, out2368);
_mm512_storeu_ps(dfPtr14+148096+589824*i63+98304*j55+24576*s55+768*k167, out2376);
_mm512_storeu_ps(dfPtr14+148032+589824*i63+98304*j55+24576*s55+768*k167, out2372);
_mm512_storeu_ps(dfPtr14+148160+589824*i63+98304*j55+24576*s55+768*k167, out2380);
_mm512_storeu_ps(dfPtr14+295424+589824*i63+98304*j55+24576*s55+768*k167, out2369);
_mm512_storeu_ps(dfPtr14+295552+589824*i63+98304*j55+24576*s55+768*k167, out2377);
_mm512_storeu_ps(dfPtr14+295488+589824*i63+98304*j55+24576*s55+768*k167, out2373);
_mm512_storeu_ps(dfPtr14+295616+589824*i63+98304*j55+24576*s55+768*k167, out2381);
_mm512_storeu_ps(dfPtr14+442880+589824*i63+98304*j55+24576*s55+768*k167, out2370);
_mm512_storeu_ps(dfPtr14+443008+589824*i63+98304*j55+24576*s55+768*k167, out2378);
_mm512_storeu_ps(dfPtr14+442944+589824*i63+98304*j55+24576*s55+768*k167, out2374);
_mm512_storeu_ps(dfPtr14+443072+589824*i63+98304*j55+24576*s55+768*k167, out2382);
}
if (j55 >= last13) return;
++j55;
}

// Submits ResNet50ThreeArrangeDats6Callee1 to the thread team.
//
// team67     - team handle forwarded unchanged to ResNet50ThreaderDo1.
// tensors107 - tensor pointer table; handed to the callee through any1.
//
// The task describes a 4-dimensional iteration space with extents 2 x 2 x 1 x 1.
static void ResNet50ThreeArrangeDats6(ResNet50ThreaderTeam1* team67, char** tensors107) {
    ResNet50ThreaderTask1 job;

    // Iteration space: four dimensions, extents {2, 2, 1, 1}.
    job.nd1 = 4;
    job.hull1[3] = 1;
    job.hull1[2] = 1;
    job.hull1[1] = 2;
    job.hull1[0] = 2;

    // Work function and its opaque argument.
    job.any1 = tensors107;
    job.callee1 = ResNet50ThreeArrangeDats6Callee1;

    ResNet50ThreaderDo1(team67, &job);
}

// Worker callee installed by ResNet50ThreeProduceSums6. For one work item it
// multiply-accumulates half-precision "wf" values (widened to float) against
// float "df" vectors over 256 reduction steps and stores the float sums.
//
// task112 - task descriptor; any1 is read as a void* array whose slot 0 is the
//           tensor pointer table (char**):
//             tensors110[0]  floats read via bfPtr15 at offset 0, followed by
//                            the half-precision wf data starting at byte 1024
//             tensors110[1]  df data, loaded as 16-float vectors
//             tensors110[2]  destination of the sums (sfPtr14)
// pt61    - coordinates of this work item:
//             pt61[0] (w73)  picks the two consecutive l indices 2*w73, 2*w73+1
//             pt61[1] (d22)  picks the kernel shape: 1 -> three df vectors per
//                            step (second section); any other value -> six df
//                            vectors per step (first section)
//             pt61[2] (f49)  becomes j56, which scales the wf/df/sf offsets;
//                            the bfPtr15 values are only injected when it is 0
//
// NOTE(review): bf/wf/df/sf presumably stand for bias, weights, data and sums
// in a transformed domain of the 3x3 convolution - confirm against the
// generator; the code itself only shows the loads, FMAs and stores below.
static void ResNet50ThreeProduceSums6Callee1(ResNet50ThreaderTask1* task112, int64_t* pt61) {
// any1 is a pointer array; slot 0 holds the tensor pointer table.
void** pair30 = task112->any1;
char** tensors110 = pair30[0];
// e32 and g37 are fixed at 0 in this instance, so every term they scale vanishes.
ptrdiff_t e32 = 0;
ptrdiff_t g37 = 0;
ptrdiff_t f49 = pt61[2];
ptrdiff_t d22 = pt61[1];
ptrdiff_t w73 = pt61[0];
// bfPtr15 and wfPtr15 both point into tensors110[0]; the wf region starts
// 1024 bytes after the bf region.
char*restrict bfPtr15 = tensors110[0]+1024*e32;
char*restrict wfPtr15 = tensors110[0]+1024+12976128*e32;
char*restrict dfPtr15 = tensors110[1]+912384*e32;
char*restrict sfPtr14 = tensors110[2];
ptrdiff_t i64 = 1*g37;
ptrdiff_t j56 = 1*f49;
ptrdiff_t k168 = 1*d22;
ptrdiff_t kk59 = k168+0;
// First section: entered whenever k168 (== d22) is not 1. Every path through
// it ends in a return, so the second section below only runs when d22 == 1.
for (; k168 != 1; ++k168) {
// Two consecutive l values per work item: l70 and ll11 == l70+1.
ptrdiff_t l70 = 2*w73;
ptrdiff_t ll11 = l70+1;
for (; l70 != 64; ++l70) {
__m512 sum587;
__m512 sum593;
__m512 sum599;
__m512 sum605;
// Starting values for the four accumulator rows. When j56 == 0, each row gets
// one float from bfPtr15 (four consecutive floats per l70) placed in lane 9
// only (mask 512 == 1<<9) with all other lanes zero; otherwise all zero.
if (__builtin_expect(!j56, 0)) {
sum587 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr15+0+1024*i64+16*l70)));
sum593 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr15+4+1024*i64+16*l70)));
sum599 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr15+8+1024*i64+16*l70)));
sum605 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr15+12+1024*i64+16*l70)));
} else {
sum587 = _mm512_setzero_ps();
sum593 = _mm512_setzero_ps();
sum599 = _mm512_setzero_ps();
sum605 = _mm512_setzero_ps();
}
// Replicate each row's starting value across the six df columns:
// 4 rows x 6 columns = 24 accumulators.
__m512 sum588 = sum587;
__m512 sum589 = sum587;
__m512 sum590 = sum587;
__m512 sum591 = sum587;
__m512 sum592 = sum587;
__m512 sum594 = sum593;
__m512 sum595 = sum593;
__m512 sum596 = sum593;
__m512 sum597 = sum593;
__m512 sum598 = sum593;
__m512 sum600 = sum599;
__m512 sum601 = sum599;
__m512 sum602 = sum599;
__m512 sum603 = sum599;
__m512 sum604 = sum599;
__m512 sum606 = sum605;
__m512 sum607 = sum605;
__m512 sum608 = sum605;
__m512 sum609 = sum605;
__m512 sum610 = sum605;
// 256 reduction steps. Each step consumes 128 bytes of wf data (two 64-byte
// loads) and 384 bytes of df data (six 64-byte vectors).
ptrdiff_t b69 = 0;
for (; b69 != 256; ++b69) {
// Mask 65535 enables all 16 dword lanes, i.e. a full 64-byte load that holds
// 32 half-precision values.
__m512i wfs37 = _mm512_maskz_loadu_epi32(65535, wfPtr15+0+8388608*i64+2097152*j56+32768*l70+128*b69);
// Row 0: low 256 bits widened from half to single precision.
__m512 wf169 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs37));
// The six df vectors are loaded once here and reused by all four rows.
__m512 df684 = _mm512_loadu_ps(dfPtr15+0+589824*i64+147456*j56+98304*k168+384*b69);
sum587 = _mm512_fmadd_ps(wf169, df684, sum587);
__m512 df685 = _mm512_loadu_ps(dfPtr15+64+589824*i64+147456*j56+98304*k168+384*b69);
sum588 = _mm512_fmadd_ps(wf169, df685, sum588);
__m512 df686 = _mm512_loadu_ps(dfPtr15+128+589824*i64+147456*j56+98304*k168+384*b69);
sum589 = _mm512_fmadd_ps(wf169, df686, sum589);
__m512 df687 = _mm512_loadu_ps(dfPtr15+192+589824*i64+147456*j56+98304*k168+384*b69);
sum590 = _mm512_fmadd_ps(wf169, df687, sum590);
__m512 df688 = _mm512_loadu_ps(dfPtr15+256+589824*i64+147456*j56+98304*k168+384*b69);
sum591 = _mm512_fmadd_ps(wf169, df688, sum591);
__m512 df689 = _mm512_loadu_ps(dfPtr15+320+589824*i64+147456*j56+98304*k168+384*b69);
sum592 = _mm512_fmadd_ps(wf169, df689, sum592);
// Row 1: high 256 bits of the same load.
__m512 wf170 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs37, 1));
sum593 = _mm512_fmadd_ps(wf170, df684, sum593);
sum594 = _mm512_fmadd_ps(wf170, df685, sum594);
sum595 = _mm512_fmadd_ps(wf170, df686, sum595);
sum596 = _mm512_fmadd_ps(wf170, df687, sum596);
sum597 = _mm512_fmadd_ps(wf170, df688, sum597);
sum598 = _mm512_fmadd_ps(wf170, df689, sum598);
// Rows 2 and 3: the next 64 bytes of wf data, split the same way.
__m512i wfs38 = _mm512_maskz_loadu_epi32(65535, wfPtr15+64+8388608*i64+2097152*j56+32768*l70+128*b69);
__m512 wf171 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs38));
sum599 = _mm512_fmadd_ps(wf171, df684, sum599);
sum600 = _mm512_fmadd_ps(wf171, df685, sum600);
sum601 = _mm512_fmadd_ps(wf171, df686, sum601);
sum602 = _mm512_fmadd_ps(wf171, df687, sum602);
sum603 = _mm512_fmadd_ps(wf171, df688, sum603);
sum604 = _mm512_fmadd_ps(wf171, df689, sum604);
__m512 wf172 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs38, 1));
sum605 = _mm512_fmadd_ps(wf172, df684, sum605);
sum606 = _mm512_fmadd_ps(wf172, df685, sum606);
sum607 = _mm512_fmadd_ps(wf172, df686, sum607);
sum608 = _mm512_fmadd_ps(wf172, df687, sum608);
sum609 = _mm512_fmadd_ps(wf172, df688, sum609);
sum610 = _mm512_fmadd_ps(wf172, df689, sum610);
}
// Store the 24 accumulators back to back, row by row: 24 * 64 = 1536 bytes
// per l70.
_mm512_storeu_ps(sfPtr14+0+589824*i64+147456*j56+98304*k168+1536*l70, sum587);
_mm512_storeu_ps(sfPtr14+64+589824*i64+147456*j56+98304*k168+1536*l70, sum588);
_mm512_storeu_ps(sfPtr14+128+589824*i64+147456*j56+98304*k168+1536*l70, sum589);
_mm512_storeu_ps(sfPtr14+192+589824*i64+147456*j56+98304*k168+1536*l70, sum590);
_mm512_storeu_ps(sfPtr14+256+589824*i64+147456*j56+98304*k168+1536*l70, sum591);
_mm512_storeu_ps(sfPtr14+320+589824*i64+147456*j56+98304*k168+1536*l70, sum592);
_mm512_storeu_ps(sfPtr14+384+589824*i64+147456*j56+98304*k168+1536*l70, sum593);
_mm512_storeu_ps(sfPtr14+448+589824*i64+147456*j56+98304*k168+1536*l70, sum594);
_mm512_storeu_ps(sfPtr14+512+589824*i64+147456*j56+98304*k168+1536*l70, sum595);
_mm512_storeu_ps(sfPtr14+576+589824*i64+147456*j56+98304*k168+1536*l70, sum596);
_mm512_storeu_ps(sfPtr14+640+589824*i64+147456*j56+98304*k168+1536*l70, sum597);
_mm512_storeu_ps(sfPtr14+704+589824*i64+147456*j56+98304*k168+1536*l70, sum598);
_mm512_storeu_ps(sfPtr14+768+589824*i64+147456*j56+98304*k168+1536*l70, sum599);
_mm512_storeu_ps(sfPtr14+832+589824*i64+147456*j56+98304*k168+1536*l70, sum600);
_mm512_storeu_ps(sfPtr14+896+589824*i64+147456*j56+98304*k168+1536*l70, sum601);
_mm512_storeu_ps(sfPtr14+960+589824*i64+147456*j56+98304*k168+1536*l70, sum602);
_mm512_storeu_ps(sfPtr14+1024+589824*i64+147456*j56+98304*k168+1536*l70, sum603);
_mm512_storeu_ps(sfPtr14+1088+589824*i64+147456*j56+98304*k168+1536*l70, sum604);
_mm512_storeu_ps(sfPtr14+1152+589824*i64+147456*j56+98304*k168+1536*l70, sum605);
_mm512_storeu_ps(sfPtr14+1216+589824*i64+147456*j56+98304*k168+1536*l70, sum606);
_mm512_storeu_ps(sfPtr14+1280+589824*i64+147456*j56+98304*k168+1536*l70, sum607);
_mm512_storeu_ps(sfPtr14+1344+589824*i64+147456*j56+98304*k168+1536*l70, sum608);
_mm512_storeu_ps(sfPtr14+1408+589824*i64+147456*j56+98304*k168+1536*l70, sum609);
_mm512_storeu_ps(sfPtr14+1472+589824*i64+147456*j56+98304*k168+1536*l70, sum610);
// The work item ends after its second l value; the loop bound of 64 is not
// what normally terminates this loop.
if (l70 >= ll11) return;
}
// Only reached if the l loop ran to its bound without returning; kk59 equals
// the starting k168, so this returns on the first pass.
if (k168 >= kk59) return;
}
// Second section: reached only when the loop above was skipped, i.e. k168 == 1.
// Same computation with three df vectors per step (4 rows x 3 columns = 12
// accumulators), a df stride of 192 bytes per step and 768 bytes of output
// per l71.
ptrdiff_t l71 = 2*w73;
ptrdiff_t ll12 = l71+1;
for (; l71 != 64; ++l71) {
__m512 sum611;
__m512 sum614;
__m512 sum617;
__m512 sum620;
// Same starting-value rule as the first section: lane 9 seeded from bfPtr15
// when j56 == 0, zeros otherwise.
if (__builtin_expect(!j56, 0)) {
sum611 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr15+0+1024*i64+16*l71)));
sum614 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr15+4+1024*i64+16*l71)));
sum617 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr15+8+1024*i64+16*l71)));
sum620 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr15+12+1024*i64+16*l71)));
} else {
sum611 = _mm512_setzero_ps();
sum614 = _mm512_setzero_ps();
sum617 = _mm512_setzero_ps();
sum620 = _mm512_setzero_ps();
}
// Replicate each row's starting value across the three df columns.
__m512 sum612 = sum611;
__m512 sum613 = sum611;
__m512 sum615 = sum614;
__m512 sum616 = sum614;
__m512 sum618 = sum617;
__m512 sum619 = sum617;
__m512 sum621 = sum620;
__m512 sum622 = sum620;
// 256 reduction steps; the wf layout matches the first section (128 bytes
// per step), the df stride is halved to 192 bytes.
ptrdiff_t b70 = 0;
for (; b70 != 256; ++b70) {
__m512i wfs39 = _mm512_maskz_loadu_epi32(65535, wfPtr15+0+8388608*i64+2097152*j56+32768*l71+128*b70);
__m512 wf173 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs39));
__m512 df690 = _mm512_loadu_ps(dfPtr15+0+589824*i64+147456*j56+98304*k168+192*b70);
sum611 = _mm512_fmadd_ps(wf173, df690, sum611);
__m512 df691 = _mm512_loadu_ps(dfPtr15+64+589824*i64+147456*j56+98304*k168+192*b70);
sum612 = _mm512_fmadd_ps(wf173, df691, sum612);
__m512 df692 = _mm512_loadu_ps(dfPtr15+128+589824*i64+147456*j56+98304*k168+192*b70);
sum613 = _mm512_fmadd_ps(wf173, df692, sum613);
__m512 wf174 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs39, 1));
sum614 = _mm512_fmadd_ps(wf174, df690, sum614);
sum615 = _mm512_fmadd_ps(wf174, df691, sum615);
sum616 = _mm512_fmadd_ps(wf174, df692, sum616);
__m512i wfs40 = _mm512_maskz_loadu_epi32(65535, wfPtr15+64+8388608*i64+2097152*j56+32768*l71+128*b70);
__m512 wf175 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs40));
sum617 = _mm512_fmadd_ps(wf175, df690, sum617);
sum618 = _mm512_fmadd_ps(wf175, df691, sum618);
sum619 = _mm512_fmadd_ps(wf175, df692, sum619);
__m512 wf176 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs40, 1));
sum620 = _mm512_fmadd_ps(wf176, df690, sum620);
sum621 = _mm512_fmadd_ps(wf176, df691, sum621);
sum622 = _mm512_fmadd_ps(wf176, df692, sum622);
}
// Store the 12 accumulators back to back: 12 * 64 = 768 bytes per l71.
_mm512_storeu_ps(sfPtr14+0+589824*i64+147456*j56+98304*k168+768*l71, sum611);
_mm512_storeu_ps(sfPtr14+64+589824*i64+147456*j56+98304*k168+768*l71, sum612);
_mm512_storeu_ps(sfPtr14+128+589824*i64+147456*j56+98304*k168+768*l71, sum613);
_mm512_storeu_ps(sfPtr14+192+589824*i64+147456*j56+98304*k168+768*l71, sum614);
_mm512_storeu_ps(sfPtr14+256+589824*i64+147456*j56+98304*k168+768*l71, sum615);
_mm512_storeu_ps(sfPtr14+320+589824*i64+147456*j56+98304*k168+768*l71, sum616);
_mm512_storeu_ps(sfPtr14+384+589824*i64+147456*j56+98304*k168+768*l71, sum617);
_mm512_storeu_ps(sfPtr14+448+589824*i64+147456*j56+98304*k168+768*l71, sum618);
_mm512_storeu_ps(sfPtr14+512+589824*i64+147456*j56+98304*k168+768*l71, sum619);
_mm512_storeu_ps(sfPtr14+576+589824*i64+147456*j56+98304*k168+768*l71, sum620);
_mm512_storeu_ps(sfPtr14+640+589824*i64+147456*j56+98304*k168+768*l71, sum621);
_mm512_storeu_ps(sfPtr14+704+589824*i64+147456*j56+98304*k168+768*l71, sum622);
// As above: stop after the second l value.
if (l71 >= ll12) return;
}
}

// Submits ResNet50ThreeProduceSums6Callee1 to the thread team.
//
// team68     - team handle forwarded unchanged to ResNet50ThreaderDo1.
// tensors109 - tensor pointer table; wrapped in a two-slot pointer array
//              (slot 0 = the table, slot 1 = null) that the callee receives
//              through any1.
//
// The task describes a 4-dimensional iteration space with extents 32 x 2 x 4 x 1.
static void ResNet50ThreeProduceSums6(ResNet50ThreaderTeam1* team68, char** tensors109) {
    // Argument bundle handed to the callee.
    void* args[2];
    args[0] = tensors109;
    args[1] = 0;

    ResNet50ThreaderTask1 job;

    // Iteration space: four dimensions, extents {32, 2, 4, 1}.
    job.nd1 = 4;
    job.hull1[3] = 1;
    job.hull1[2] = 4;
    job.hull1[1] = 2;
    job.hull1[0] = 32;

    // Work function and its opaque argument.
    job.any1 = args;
    job.callee1 = ResNet50ThreeProduceSums6Callee1;

    ResNet50ThreaderDo1(team68, &job);
}

static void ResNet50ThreeConsumeSums6Callee1(ResNet50ThreaderTask1* task114, int64_t* pt62) {
char** tensors112 = task114->any1;
ptrdiff_t w74 = pt62[0];
ptrdiff_t d23 = pt62[1];
ptrdiff_t g38 = 0;
char*restrict sfPtr15 = tensors112[0];
char*restrict datPtr35 = tensors112[1];
ptrdiff_t i65 = 1*g38;
ptrdiff_t j57 = 1*d23;
ptrdiff_t last14 = j57+0;
ptrdiff_t rel26 = j57-0;
ptrdiff_t base26 = 0;
if (rel26 < 1) {
ptrdiff_t toH50 = base26+0;
ptrdiff_t toW50 = 0;
ptrdiff_t k169 = 32*w74;
ptrdiff_t kk60 = k169+31;
for (; k169 != 64; ++k169) {
ptrdiff_t l72 = 0;
for (; l72 != 2; ++l72) {
__m512 sf1313 = _mm512_loadu_ps(sfPtr15+0+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1314 = _mm512_loadu_ps(sfPtr15+128+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2541 = _mm512_shuffle_f32x4(sf1313, sf1314, 68);
__m512 in2542 = _mm512_shuffle_f32x4(sf1313, sf1314, 238);
__m512 sf1315 = _mm512_loadu_ps(sfPtr15+64+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1316 = _mm512_loadu_ps(sfPtr15+192+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2549 = _mm512_shuffle_f32x4(sf1315, sf1316, 68);
__m512 in2550 = _mm512_shuffle_f32x4(sf1315, sf1316, 238);
__m512 sf1317 = _mm512_loadu_ps(sfPtr15+147456+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1318 = _mm512_loadu_ps(sfPtr15+147584+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2543 = _mm512_shuffle_f32x4(sf1317, sf1318, 68);
__m512 in2544 = _mm512_shuffle_f32x4(sf1317, sf1318, 238);
__m512 sf1319 = _mm512_loadu_ps(sfPtr15+147520+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1320 = _mm512_loadu_ps(sfPtr15+147648+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2551 = _mm512_shuffle_f32x4(sf1319, sf1320, 68);
__m512 in2552 = _mm512_shuffle_f32x4(sf1319, sf1320, 238);
__m512 sf1321 = _mm512_loadu_ps(sfPtr15+294912+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1322 = _mm512_loadu_ps(sfPtr15+295040+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2545 = _mm512_shuffle_f32x4(sf1321, sf1322, 68);
__m512 in2546 = _mm512_shuffle_f32x4(sf1321, sf1322, 238);
__m512 sf1323 = _mm512_loadu_ps(sfPtr15+294976+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1324 = _mm512_loadu_ps(sfPtr15+295104+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2553 = _mm512_shuffle_f32x4(sf1323, sf1324, 68);
__m512 in2554 = _mm512_shuffle_f32x4(sf1323, sf1324, 238);
__m512 sf1325 = _mm512_loadu_ps(sfPtr15+442368+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1326 = _mm512_loadu_ps(sfPtr15+442496+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2547 = _mm512_shuffle_f32x4(sf1325, sf1326, 68);
__m512 in2548 = _mm512_shuffle_f32x4(sf1325, sf1326, 238);
__m512 sf1327 = _mm512_loadu_ps(sfPtr15+442432+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1328 = _mm512_loadu_ps(sfPtr15+442560+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2555 = _mm512_shuffle_f32x4(sf1327, sf1328, 68);
__m512 in2556 = _mm512_shuffle_f32x4(sf1327, sf1328, 238);
__m512 tmp18531 = _mm512_add_ps(in2542, in2543);
__m512 tmp18551 = _mm512_add_ps(in2550, in2551);
__m512 tmp18530 = _mm512_add_ps(in2544, in2545);
__m512 tmp18550 = _mm512_add_ps(in2552, in2553);
__m512 tmp18536 = _mm512_sub_ps(in2544, in2545);
__m512 tmp18556 = _mm512_sub_ps(in2552, in2553);
__m512 tmp18535 = _mm512_sub_ps(in2542, in2543);
__m512 tmp18555 = _mm512_sub_ps(in2550, in2551);
__m512 tmp18532 = _mm512_add_ps(in2546, in2547);
__m512 tmp18552 = _mm512_add_ps(in2554, in2555);
__m512 tmp18537 = _mm512_sub_ps(in2546, in2547);
__m512 tmp18557 = _mm512_sub_ps(in2554, in2555);
__m512 tmp18534 = _mm512_fmadd_ps(tmp18536, _mm512_set1_ps(2e+00f), tmp18535);
__m512 tmp18554 = _mm512_fmadd_ps(tmp18556, _mm512_set1_ps(2e+00f), tmp18555);
__m512 tmp18541 = _mm512_fmadd_ps(tmp18536, _mm512_set1_ps(8e+00f), tmp18535);
__m512 tmp18561 = _mm512_fmadd_ps(tmp18556, _mm512_set1_ps(8e+00f), tmp18555);
__m512 tmp18529 = _mm512_add_ps(tmp18530, tmp18531);
__m512 tmp18549 = _mm512_add_ps(tmp18550, tmp18551);
__m512 tmp18533 = _mm512_fmadd_ps(tmp18537, _mm512_set1_ps(1.6e+01f), tmp18534);
__m512 tmp18553 = _mm512_fmadd_ps(tmp18557, _mm512_set1_ps(1.6e+01f), tmp18554);
__m512 tmp18540 = _mm512_fmadd_ps(tmp18537, _mm512_set1_ps(4e+00f), tmp18541);
__m512 tmp18560 = _mm512_fmadd_ps(tmp18557, _mm512_set1_ps(4e+00f), tmp18561);
__m512 tmp18546 = _mm512_add_ps(tmp18537, tmp18535);
__m512 tmp18566 = _mm512_add_ps(tmp18557, tmp18555);
__m512 tmp18539 = _mm512_fmadd_ps(tmp18530, _mm512_set1_ps(4e+00f), tmp18531);
__m512 tmp18559 = _mm512_fmadd_ps(tmp18550, _mm512_set1_ps(4e+00f), tmp18551);
__m512 tmp18543 = _mm512_fmadd_ps(tmp18530, _mm512_set1_ps(1.6e+01f), tmp18531);
__m512 tmp18563 = _mm512_fmadd_ps(tmp18550, _mm512_set1_ps(1.6e+01f), tmp18551);
__m512 tmp18528 = _mm512_add_ps(tmp18529, in2541);
__m512 tmp18548 = _mm512_add_ps(tmp18549, in2549);
__m512 tmp18545 = _mm512_add_ps(tmp18546, in2548);
__m512 tmp18565 = _mm512_add_ps(tmp18566, in2556);
__m512 tmp18527 = _mm512_fmadd_ps(tmp18532, _mm512_set1_ps(3.2e+01f), tmp18528);
__m512 tmp18547 = _mm512_fmadd_ps(tmp18552, _mm512_set1_ps(3.2e+01f), tmp18548);
__m512 tmp18538 = _mm512_fmadd_ps(tmp18532, _mm512_set1_ps(8e+00f), tmp18539);
__m512 tmp18558 = _mm512_fmadd_ps(tmp18552, _mm512_set1_ps(8e+00f), tmp18559);
__m512 tmp18544 = _mm512_fmadd_ps(tmp18536, _mm512_set1_ps(3.2e+01f), tmp18545);
__m512 tmp18564 = _mm512_fmadd_ps(tmp18556, _mm512_set1_ps(3.2e+01f), tmp18565);
__m512 tmp18542 = _mm512_fmadd_ps(tmp18532, _mm512_set1_ps(2e+00f), tmp18543);
__m512 tmp18562 = _mm512_fmadd_ps(tmp18552, _mm512_set1_ps(2e+00f), tmp18563);
__m512 tmp18515 = tmp18527;
__m512 tmp18521 = tmp18547;
__m512 tmp18516 = tmp18533;
__m512 tmp18522 = tmp18553;
__m512 tmp18517 = tmp18538;
__m512 tmp18523 = tmp18558;
__m512 tmp18518 = tmp18540;
__m512 tmp18524 = tmp18560;
__m512 tmp18519 = tmp18542;
__m512 tmp18525 = tmp18562;
__m512 tmp18520 = tmp18544;
__m512 tmp18526 = tmp18564;
__m512 tmp18611 = _mm512_unpacklo_ps(tmp18515, tmp18516);
__m512 tmp18612 = _mm512_unpackhi_ps(tmp18515, tmp18516);
__m512 tmp18613 = _mm512_unpacklo_ps(tmp18517, tmp18518);
__m512 tmp18614 = _mm512_unpackhi_ps(tmp18517, tmp18518);
__m512 tmp18615 = _mm512_unpacklo_ps(tmp18519, tmp18520);
__m512 tmp18616 = _mm512_unpackhi_ps(tmp18519, tmp18520);
__m512 tmp18617 = _mm512_unpacklo_ps(tmp18521, tmp18522);
__m512 tmp18618 = _mm512_unpackhi_ps(tmp18521, tmp18522);
__m512 tmp18619 = _mm512_unpacklo_ps(tmp18523, tmp18524);
__m512 tmp18620 = _mm512_unpackhi_ps(tmp18523, tmp18524);
__m512 tmp18621 = _mm512_unpacklo_ps(tmp18525, tmp18526);
__m512 tmp18622 = _mm512_unpackhi_ps(tmp18525, tmp18526);
__m512 tmp18623 = _mm512_shuffle_ps(tmp18611, tmp18613, 68);
__m512 tmp18624 = _mm512_shuffle_ps(tmp18611, tmp18613, 238);
__m512 tmp18625 = _mm512_shuffle_ps(tmp18612, tmp18614, 68);
__m512 tmp18626 = _mm512_shuffle_ps(tmp18612, tmp18614, 238);
__m512 tmp18627 = _mm512_shuffle_ps(tmp18615, tmp18617, 68);
__m512 tmp18628 = _mm512_shuffle_ps(tmp18615, tmp18617, 238);
__m512 tmp18629 = _mm512_shuffle_ps(tmp18616, tmp18618, 68);
__m512 tmp18630 = _mm512_shuffle_ps(tmp18616, tmp18618, 238);
__m512 tmp18631 = _mm512_shuffle_ps(tmp18619, tmp18621, 68);
__m512 tmp18632 = _mm512_shuffle_ps(tmp18619, tmp18621, 238);
__m512 tmp18633 = _mm512_shuffle_ps(tmp18620, tmp18622, 68);
__m512 tmp18634 = _mm512_shuffle_ps(tmp18620, tmp18622, 238);
__m512 tmp18635 = _mm512_shuffle_f32x4(tmp18623, tmp18627, 136);
__m512 tmp18636 = _mm512_shuffle_f32x4(tmp18623, tmp18627, 221);
__m512 tmp18637 = _mm512_shuffle_f32x4(tmp18624, tmp18628, 136);
__m512 tmp18638 = _mm512_shuffle_f32x4(tmp18624, tmp18628, 221);
__m512 tmp18639 = _mm512_shuffle_f32x4(tmp18625, tmp18629, 136);
__m512 tmp18640 = _mm512_shuffle_f32x4(tmp18625, tmp18629, 221);
__m512 tmp18641 = _mm512_shuffle_f32x4(tmp18626, tmp18630, 136);
__m512 tmp18642 = _mm512_shuffle_f32x4(tmp18626, tmp18630, 221);
__m512 tmp18643 = _mm512_shuffle_f32x4(tmp18631, tmp18631, 136);
__m512 tmp18644 = _mm512_shuffle_f32x4(tmp18631, tmp18631, 221);
__m512 tmp18645 = _mm512_shuffle_f32x4(tmp18632, tmp18632, 136);
__m512 tmp18646 = _mm512_shuffle_f32x4(tmp18632, tmp18632, 221);
__m512 tmp18647 = _mm512_shuffle_f32x4(tmp18633, tmp18633, 136);
__m512 tmp18648 = _mm512_shuffle_f32x4(tmp18633, tmp18633, 221);
__m512 tmp18649 = _mm512_shuffle_f32x4(tmp18634, tmp18634, 136);
__m512 tmp18650 = _mm512_shuffle_f32x4(tmp18634, tmp18634, 221);
tmp18515 = _mm512_shuffle_f32x4(tmp18635, tmp18643, 136);
tmp18523 = _mm512_shuffle_f32x4(tmp18635, tmp18643, 221);
tmp18516 = _mm512_shuffle_f32x4(tmp18637, tmp18645, 136);
tmp18524 = _mm512_shuffle_f32x4(tmp18637, tmp18645, 221);
tmp18517 = _mm512_shuffle_f32x4(tmp18639, tmp18647, 136);
tmp18525 = _mm512_shuffle_f32x4(tmp18639, tmp18647, 221);
tmp18518 = _mm512_shuffle_f32x4(tmp18641, tmp18649, 136);
tmp18526 = _mm512_shuffle_f32x4(tmp18641, tmp18649, 221);
tmp18519 = _mm512_shuffle_f32x4(tmp18636, tmp18644, 136);
__m512 tmp18567 = _mm512_shuffle_f32x4(tmp18636, tmp18644, 221);
tmp18520 = _mm512_shuffle_f32x4(tmp18638, tmp18646, 136);
__m512 tmp18568 = _mm512_shuffle_f32x4(tmp18638, tmp18646, 221);
tmp18521 = _mm512_shuffle_f32x4(tmp18640, tmp18648, 136);
__m512 tmp18569 = _mm512_shuffle_f32x4(tmp18640, tmp18648, 221);
tmp18522 = _mm512_shuffle_f32x4(tmp18642, tmp18650, 136);
__m512 tmp18570 = _mm512_shuffle_f32x4(tmp18642, tmp18650, 221);
__m512 tmp18575 = _mm512_add_ps(tmp18516, tmp18517);
__m512 tmp18595 = _mm512_add_ps(tmp18524, tmp18525);
__m512 tmp18574 = _mm512_add_ps(tmp18518, tmp18519);
__m512 tmp18594 = _mm512_add_ps(tmp18526, tmp18567);
__m512 tmp18580 = _mm512_sub_ps(tmp18518, tmp18519);
__m512 tmp18600 = _mm512_sub_ps(tmp18526, tmp18567);
__m512 tmp18579 = _mm512_sub_ps(tmp18516, tmp18517);
__m512 tmp18599 = _mm512_sub_ps(tmp18524, tmp18525);
__m512 tmp18576 = _mm512_add_ps(tmp18520, tmp18521);
__m512 tmp18596 = _mm512_add_ps(tmp18568, tmp18569);
__m512 tmp18581 = _mm512_sub_ps(tmp18520, tmp18521);
__m512 tmp18601 = _mm512_sub_ps(tmp18568, tmp18569);
__m512 tmp18578 = _mm512_fmadd_ps(tmp18580, _mm512_set1_ps(2e+00f), tmp18579);
__m512 tmp18598 = _mm512_fmadd_ps(tmp18600, _mm512_set1_ps(2e+00f), tmp18599);
__m512 tmp18585 = _mm512_fmadd_ps(tmp18580, _mm512_set1_ps(8e+00f), tmp18579);
__m512 tmp18605 = _mm512_fmadd_ps(tmp18600, _mm512_set1_ps(8e+00f), tmp18599);
__m512 tmp18573 = _mm512_add_ps(tmp18574, tmp18575);
__m512 tmp18593 = _mm512_add_ps(tmp18594, tmp18595);
__m512 tmp18577 = _mm512_fmadd_ps(tmp18581, _mm512_set1_ps(1.6e+01f), tmp18578);
__m512 tmp18597 = _mm512_fmadd_ps(tmp18601, _mm512_set1_ps(1.6e+01f), tmp18598);
__m512 tmp18584 = _mm512_fmadd_ps(tmp18581, _mm512_set1_ps(4e+00f), tmp18585);
__m512 tmp18604 = _mm512_fmadd_ps(tmp18601, _mm512_set1_ps(4e+00f), tmp18605);
__m512 tmp18590 = _mm512_add_ps(tmp18581, tmp18579);
__m512 tmp18610 = _mm512_add_ps(tmp18601, tmp18599);
__m512 tmp18583 = _mm512_fmadd_ps(tmp18574, _mm512_set1_ps(4e+00f), tmp18575);
__m512 tmp18603 = _mm512_fmadd_ps(tmp18594, _mm512_set1_ps(4e+00f), tmp18595);
__m512 tmp18587 = _mm512_fmadd_ps(tmp18574, _mm512_set1_ps(1.6e+01f), tmp18575);
__m512 tmp18607 = _mm512_fmadd_ps(tmp18594, _mm512_set1_ps(1.6e+01f), tmp18595);
__m512 tmp18572 = _mm512_add_ps(tmp18573, tmp18515);
__m512 tmp18592 = _mm512_add_ps(tmp18593, tmp18523);
__m512 tmp18589 = _mm512_add_ps(tmp18590, tmp18522);
__m512 tmp18609 = _mm512_add_ps(tmp18610, tmp18570);
__m512 tmp18571 = _mm512_fmadd_ps(tmp18576, _mm512_set1_ps(3.2e+01f), tmp18572);
__m512 tmp18591 = _mm512_fmadd_ps(tmp18596, _mm512_set1_ps(3.2e+01f), tmp18592);
__m512 tmp18582 = _mm512_fmadd_ps(tmp18576, _mm512_set1_ps(8e+00f), tmp18583);
__m512 tmp18602 = _mm512_fmadd_ps(tmp18596, _mm512_set1_ps(8e+00f), tmp18603);
__m512 tmp18588 = _mm512_fmadd_ps(tmp18580, _mm512_set1_ps(3.2e+01f), tmp18589);
__m512 tmp18608 = _mm512_fmadd_ps(tmp18600, _mm512_set1_ps(3.2e+01f), tmp18609);
__m512 tmp18586 = _mm512_fmadd_ps(tmp18576, _mm512_set1_ps(2e+00f), tmp18587);
__m512 tmp18606 = _mm512_fmadd_ps(tmp18596, _mm512_set1_ps(2e+00f), tmp18607);
__m512 out2383 = tmp18571;
__m512 out2389 = tmp18591;
__m512 out2384 = tmp18577;
__m512 out2390 = tmp18597;
__m512 out2385 = tmp18582;
__m512 out2391 = tmp18602;
__m512 out2386 = tmp18584;
__m512 out2392 = tmp18604;
__m512 out2387 = tmp18586;
__m512 out2393 = tmp18606;
__m512 out2388 = tmp18588;
__m512 out2394 = tmp18608;
out2383 = _mm512_max_ps(_mm512_setzero_ps(), out2383);
out2389 = _mm512_max_ps(_mm512_setzero_ps(), out2389);
out2384 = _mm512_max_ps(_mm512_setzero_ps(), out2384);
out2390 = _mm512_max_ps(_mm512_setzero_ps(), out2390);
out2385 = _mm512_max_ps(_mm512_setzero_ps(), out2385);
out2391 = _mm512_max_ps(_mm512_setzero_ps(), out2391);
out2386 = _mm512_max_ps(_mm512_setzero_ps(), out2386);
out2392 = _mm512_max_ps(_mm512_setzero_ps(), out2392);
out2387 = _mm512_max_ps(_mm512_setzero_ps(), out2387);
out2393 = _mm512_max_ps(_mm512_setzero_ps(), out2393);
out2388 = _mm512_max_ps(_mm512_setzero_ps(), out2388);
out2394 = _mm512_max_ps(_mm512_setzero_ps(), out2394);
_mm512_mask_storeu_ps(datPtr35+0+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2383);
_mm512_mask_storeu_ps(datPtr35+48+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 3, out2389);
_mm512_mask_storeu_ps(datPtr35+312+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4032, out2389);
_mm512_mask_storeu_ps(datPtr35+56+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2384);
_mm512_mask_storeu_ps(datPtr35+104+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 3, out2390);
_mm512_mask_storeu_ps(datPtr35+368+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4032, out2390);
_mm512_mask_storeu_ps(datPtr35+112+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2385);
_mm512_mask_storeu_ps(datPtr35+160+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 3, out2391);
_mm512_mask_storeu_ps(datPtr35+424+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4032, out2391);
_mm512_mask_storeu_ps(datPtr35+168+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2386);
_mm512_mask_storeu_ps(datPtr35+216+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 3, out2392);
_mm512_mask_storeu_ps(datPtr35+480+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4032, out2392);
_mm512_mask_storeu_ps(datPtr35+224+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2387);
_mm512_mask_storeu_ps(datPtr35+272+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 3, out2393);
_mm512_mask_storeu_ps(datPtr35+536+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4032, out2393);
_mm512_mask_storeu_ps(datPtr35+280+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2388);
_mm512_mask_storeu_ps(datPtr35+328+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 3, out2394);
_mm512_mask_storeu_ps(datPtr35+592+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4032, out2394);
__m512 sf1329 = _mm512_loadu_ps(sfPtr15+256+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1330 = _mm512_loadu_ps(sfPtr15+384+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2557 = _mm512_shuffle_f32x4(sf1329, sf1330, 68);
__m512 in2558 = _mm512_shuffle_f32x4(sf1329, sf1330, 238);
__m512 sf1331 = _mm512_loadu_ps(sfPtr15+320+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1332 = _mm512_loadu_ps(sfPtr15+448+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2565 = _mm512_shuffle_f32x4(sf1331, sf1332, 68);
__m512 in2566 = _mm512_shuffle_f32x4(sf1331, sf1332, 238);
__m512 sf1333 = _mm512_loadu_ps(sfPtr15+147712+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1334 = _mm512_loadu_ps(sfPtr15+147840+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2559 = _mm512_shuffle_f32x4(sf1333, sf1334, 68);
__m512 in2560 = _mm512_shuffle_f32x4(sf1333, sf1334, 238);
__m512 sf1335 = _mm512_loadu_ps(sfPtr15+147776+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1336 = _mm512_loadu_ps(sfPtr15+147904+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2567 = _mm512_shuffle_f32x4(sf1335, sf1336, 68);
__m512 in2568 = _mm512_shuffle_f32x4(sf1335, sf1336, 238);
__m512 sf1337 = _mm512_loadu_ps(sfPtr15+295168+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1338 = _mm512_loadu_ps(sfPtr15+295296+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2561 = _mm512_shuffle_f32x4(sf1337, sf1338, 68);
__m512 in2562 = _mm512_shuffle_f32x4(sf1337, sf1338, 238);
__m512 sf1339 = _mm512_loadu_ps(sfPtr15+295232+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1340 = _mm512_loadu_ps(sfPtr15+295360+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2569 = _mm512_shuffle_f32x4(sf1339, sf1340, 68);
__m512 in2570 = _mm512_shuffle_f32x4(sf1339, sf1340, 238);
__m512 sf1341 = _mm512_loadu_ps(sfPtr15+442624+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1342 = _mm512_loadu_ps(sfPtr15+442752+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2563 = _mm512_shuffle_f32x4(sf1341, sf1342, 68);
__m512 in2564 = _mm512_shuffle_f32x4(sf1341, sf1342, 238);
__m512 sf1343 = _mm512_loadu_ps(sfPtr15+442688+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1344 = _mm512_loadu_ps(sfPtr15+442816+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2571 = _mm512_shuffle_f32x4(sf1343, sf1344, 68);
__m512 in2572 = _mm512_shuffle_f32x4(sf1343, sf1344, 238);
__m512 tmp18667 = _mm512_add_ps(in2558, in2559);
__m512 tmp18687 = _mm512_add_ps(in2566, in2567);
__m512 tmp18666 = _mm512_add_ps(in2560, in2561);
__m512 tmp18686 = _mm512_add_ps(in2568, in2569);
__m512 tmp18672 = _mm512_sub_ps(in2560, in2561);
__m512 tmp18692 = _mm512_sub_ps(in2568, in2569);
__m512 tmp18671 = _mm512_sub_ps(in2558, in2559);
__m512 tmp18691 = _mm512_sub_ps(in2566, in2567);
__m512 tmp18668 = _mm512_add_ps(in2562, in2563);
__m512 tmp18688 = _mm512_add_ps(in2570, in2571);
__m512 tmp18673 = _mm512_sub_ps(in2562, in2563);
__m512 tmp18693 = _mm512_sub_ps(in2570, in2571);
__m512 tmp18670 = _mm512_fmadd_ps(tmp18672, _mm512_set1_ps(2e+00f), tmp18671);
__m512 tmp18690 = _mm512_fmadd_ps(tmp18692, _mm512_set1_ps(2e+00f), tmp18691);
__m512 tmp18677 = _mm512_fmadd_ps(tmp18672, _mm512_set1_ps(8e+00f), tmp18671);
__m512 tmp18697 = _mm512_fmadd_ps(tmp18692, _mm512_set1_ps(8e+00f), tmp18691);
__m512 tmp18665 = _mm512_add_ps(tmp18666, tmp18667);
__m512 tmp18685 = _mm512_add_ps(tmp18686, tmp18687);
__m512 tmp18669 = _mm512_fmadd_ps(tmp18673, _mm512_set1_ps(1.6e+01f), tmp18670);
__m512 tmp18689 = _mm512_fmadd_ps(tmp18693, _mm512_set1_ps(1.6e+01f), tmp18690);
__m512 tmp18676 = _mm512_fmadd_ps(tmp18673, _mm512_set1_ps(4e+00f), tmp18677);
__m512 tmp18696 = _mm512_fmadd_ps(tmp18693, _mm512_set1_ps(4e+00f), tmp18697);
__m512 tmp18682 = _mm512_add_ps(tmp18673, tmp18671);
__m512 tmp18702 = _mm512_add_ps(tmp18693, tmp18691);
__m512 tmp18675 = _mm512_fmadd_ps(tmp18666, _mm512_set1_ps(4e+00f), tmp18667);
__m512 tmp18695 = _mm512_fmadd_ps(tmp18686, _mm512_set1_ps(4e+00f), tmp18687);
__m512 tmp18679 = _mm512_fmadd_ps(tmp18666, _mm512_set1_ps(1.6e+01f), tmp18667);
__m512 tmp18699 = _mm512_fmadd_ps(tmp18686, _mm512_set1_ps(1.6e+01f), tmp18687);
__m512 tmp18664 = _mm512_add_ps(tmp18665, in2557);
__m512 tmp18684 = _mm512_add_ps(tmp18685, in2565);
__m512 tmp18681 = _mm512_add_ps(tmp18682, in2564);
__m512 tmp18701 = _mm512_add_ps(tmp18702, in2572);
__m512 tmp18663 = _mm512_fmadd_ps(tmp18668, _mm512_set1_ps(3.2e+01f), tmp18664);
__m512 tmp18683 = _mm512_fmadd_ps(tmp18688, _mm512_set1_ps(3.2e+01f), tmp18684);
__m512 tmp18674 = _mm512_fmadd_ps(tmp18668, _mm512_set1_ps(8e+00f), tmp18675);
__m512 tmp18694 = _mm512_fmadd_ps(tmp18688, _mm512_set1_ps(8e+00f), tmp18695);
__m512 tmp18680 = _mm512_fmadd_ps(tmp18672, _mm512_set1_ps(3.2e+01f), tmp18681);
__m512 tmp18700 = _mm512_fmadd_ps(tmp18692, _mm512_set1_ps(3.2e+01f), tmp18701);
__m512 tmp18678 = _mm512_fmadd_ps(tmp18668, _mm512_set1_ps(2e+00f), tmp18679);
__m512 tmp18698 = _mm512_fmadd_ps(tmp18688, _mm512_set1_ps(2e+00f), tmp18699);
__m512 tmp18651 = tmp18663;
__m512 tmp18657 = tmp18683;
__m512 tmp18652 = tmp18669;
__m512 tmp18658 = tmp18689;
__m512 tmp18653 = tmp18674;
__m512 tmp18659 = tmp18694;
__m512 tmp18654 = tmp18676;
__m512 tmp18660 = tmp18696;
__m512 tmp18655 = tmp18678;
__m512 tmp18661 = tmp18698;
__m512 tmp18656 = tmp18680;
__m512 tmp18662 = tmp18700;
__m512 tmp18747 = _mm512_unpacklo_ps(tmp18651, tmp18652);
__m512 tmp18748 = _mm512_unpackhi_ps(tmp18651, tmp18652);
__m512 tmp18749 = _mm512_unpacklo_ps(tmp18653, tmp18654);
__m512 tmp18750 = _mm512_unpackhi_ps(tmp18653, tmp18654);
__m512 tmp18751 = _mm512_unpacklo_ps(tmp18655, tmp18656);
__m512 tmp18752 = _mm512_unpackhi_ps(tmp18655, tmp18656);
__m512 tmp18753 = _mm512_unpacklo_ps(tmp18657, tmp18658);
__m512 tmp18754 = _mm512_unpackhi_ps(tmp18657, tmp18658);
__m512 tmp18755 = _mm512_unpacklo_ps(tmp18659, tmp18660);
__m512 tmp18756 = _mm512_unpackhi_ps(tmp18659, tmp18660);
__m512 tmp18757 = _mm512_unpacklo_ps(tmp18661, tmp18662);
__m512 tmp18758 = _mm512_unpackhi_ps(tmp18661, tmp18662);
__m512 tmp18759 = _mm512_shuffle_ps(tmp18747, tmp18749, 68);
__m512 tmp18760 = _mm512_shuffle_ps(tmp18747, tmp18749, 238);
__m512 tmp18761 = _mm512_shuffle_ps(tmp18748, tmp18750, 68);
__m512 tmp18762 = _mm512_shuffle_ps(tmp18748, tmp18750, 238);
__m512 tmp18763 = _mm512_shuffle_ps(tmp18751, tmp18753, 68);
__m512 tmp18764 = _mm512_shuffle_ps(tmp18751, tmp18753, 238);
__m512 tmp18765 = _mm512_shuffle_ps(tmp18752, tmp18754, 68);
__m512 tmp18766 = _mm512_shuffle_ps(tmp18752, tmp18754, 238);
__m512 tmp18767 = _mm512_shuffle_ps(tmp18755, tmp18757, 68);
__m512 tmp18768 = _mm512_shuffle_ps(tmp18755, tmp18757, 238);
__m512 tmp18769 = _mm512_shuffle_ps(tmp18756, tmp18758, 68);
__m512 tmp18770 = _mm512_shuffle_ps(tmp18756, tmp18758, 238);
__m512 tmp18771 = _mm512_shuffle_f32x4(tmp18759, tmp18763, 136);
__m512 tmp18772 = _mm512_shuffle_f32x4(tmp18759, tmp18763, 221);
__m512 tmp18773 = _mm512_shuffle_f32x4(tmp18760, tmp18764, 136);
__m512 tmp18774 = _mm512_shuffle_f32x4(tmp18760, tmp18764, 221);
__m512 tmp18775 = _mm512_shuffle_f32x4(tmp18761, tmp18765, 136);
__m512 tmp18776 = _mm512_shuffle_f32x4(tmp18761, tmp18765, 221);
__m512 tmp18777 = _mm512_shuffle_f32x4(tmp18762, tmp18766, 136);
__m512 tmp18778 = _mm512_shuffle_f32x4(tmp18762, tmp18766, 221);
__m512 tmp18779 = _mm512_shuffle_f32x4(tmp18767, tmp18767, 136);
__m512 tmp18780 = _mm512_shuffle_f32x4(tmp18767, tmp18767, 221);
__m512 tmp18781 = _mm512_shuffle_f32x4(tmp18768, tmp18768, 136);
__m512 tmp18782 = _mm512_shuffle_f32x4(tmp18768, tmp18768, 221);
__m512 tmp18783 = _mm512_shuffle_f32x4(tmp18769, tmp18769, 136);
__m512 tmp18784 = _mm512_shuffle_f32x4(tmp18769, tmp18769, 221);
__m512 tmp18785 = _mm512_shuffle_f32x4(tmp18770, tmp18770, 136);
__m512 tmp18786 = _mm512_shuffle_f32x4(tmp18770, tmp18770, 221);
tmp18651 = _mm512_shuffle_f32x4(tmp18771, tmp18779, 136);
tmp18659 = _mm512_shuffle_f32x4(tmp18771, tmp18779, 221);
tmp18652 = _mm512_shuffle_f32x4(tmp18773, tmp18781, 136);
tmp18660 = _mm512_shuffle_f32x4(tmp18773, tmp18781, 221);
tmp18653 = _mm512_shuffle_f32x4(tmp18775, tmp18783, 136);
tmp18661 = _mm512_shuffle_f32x4(tmp18775, tmp18783, 221);
tmp18654 = _mm512_shuffle_f32x4(tmp18777, tmp18785, 136);
tmp18662 = _mm512_shuffle_f32x4(tmp18777, tmp18785, 221);
tmp18655 = _mm512_shuffle_f32x4(tmp18772, tmp18780, 136);
__m512 tmp18703 = _mm512_shuffle_f32x4(tmp18772, tmp18780, 221);
tmp18656 = _mm512_shuffle_f32x4(tmp18774, tmp18782, 136);
__m512 tmp18704 = _mm512_shuffle_f32x4(tmp18774, tmp18782, 221);
tmp18657 = _mm512_shuffle_f32x4(tmp18776, tmp18784, 136);
__m512 tmp18705 = _mm512_shuffle_f32x4(tmp18776, tmp18784, 221);
tmp18658 = _mm512_shuffle_f32x4(tmp18778, tmp18786, 136);
__m512 tmp18706 = _mm512_shuffle_f32x4(tmp18778, tmp18786, 221);
// Second 1-D pass of an 8-input -> 6-output linear map. Every statement operates
// on whole __m512 vectors, so the map is applied independently in each lane.
// Two input groups are processed in lock-step (statements alternate between them):
//   group 1: x0..x7 = tmp18651, tmp18652, tmp18653, tmp18654, tmp18655, tmp18656, tmp18657, tmp18658
//   group 2: x0..x7 = tmp18659, tmp18660, tmp18661, tmp18662, tmp18703, tmp18704, tmp18705, tmp18706
// With s1=x1+x2, s2=x3+x4, s3=x5+x6 and d1=x1-x2, d2=x3-x4, d3=x5-x6 the results are:
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 =      d1 + 32*d2 +     d3 + x7
// NOTE(review): these coefficients look like a scaled Winograd-style output
// transform; confirm against the generator before relying on that name.
// s1 (group 1, then group 2)
__m512 tmp18711 = _mm512_add_ps(tmp18652, tmp18653);
__m512 tmp18731 = _mm512_add_ps(tmp18660, tmp18661);
// s2
__m512 tmp18710 = _mm512_add_ps(tmp18654, tmp18655);
__m512 tmp18730 = _mm512_add_ps(tmp18662, tmp18703);
// d2
__m512 tmp18716 = _mm512_sub_ps(tmp18654, tmp18655);
__m512 tmp18736 = _mm512_sub_ps(tmp18662, tmp18703);
// d1
__m512 tmp18715 = _mm512_sub_ps(tmp18652, tmp18653);
__m512 tmp18735 = _mm512_sub_ps(tmp18660, tmp18661);
// s3
__m512 tmp18712 = _mm512_add_ps(tmp18656, tmp18657);
__m512 tmp18732 = _mm512_add_ps(tmp18704, tmp18705);
// d3
__m512 tmp18717 = _mm512_sub_ps(tmp18656, tmp18657);
__m512 tmp18737 = _mm512_sub_ps(tmp18704, tmp18705);
// Partial sums: d1 + 2*d2 and d1 + 8*d2.
__m512 tmp18714 = _mm512_fmadd_ps(tmp18716, _mm512_set1_ps(2e+00f), tmp18715);
__m512 tmp18734 = _mm512_fmadd_ps(tmp18736, _mm512_set1_ps(2e+00f), tmp18735);
__m512 tmp18721 = _mm512_fmadd_ps(tmp18716, _mm512_set1_ps(8e+00f), tmp18715);
__m512 tmp18741 = _mm512_fmadd_ps(tmp18736, _mm512_set1_ps(8e+00f), tmp18735);
// Partial sum: s1 + s2.
__m512 tmp18709 = _mm512_add_ps(tmp18710, tmp18711);
__m512 tmp18729 = _mm512_add_ps(tmp18730, tmp18731);
// y1 = (d1 + 2*d2) + 16*d3
__m512 tmp18713 = _mm512_fmadd_ps(tmp18717, _mm512_set1_ps(1.6e+01f), tmp18714);
__m512 tmp18733 = _mm512_fmadd_ps(tmp18737, _mm512_set1_ps(1.6e+01f), tmp18734);
// y3 = (d1 + 8*d2) + 4*d3
__m512 tmp18720 = _mm512_fmadd_ps(tmp18717, _mm512_set1_ps(4e+00f), tmp18721);
__m512 tmp18740 = _mm512_fmadd_ps(tmp18737, _mm512_set1_ps(4e+00f), tmp18741);
// Partial sum: d1 + d3.
__m512 tmp18726 = _mm512_add_ps(tmp18717, tmp18715);
__m512 tmp18746 = _mm512_add_ps(tmp18737, tmp18735);
// Partial sums: s1 + 4*s2 and s1 + 16*s2.
__m512 tmp18719 = _mm512_fmadd_ps(tmp18710, _mm512_set1_ps(4e+00f), tmp18711);
__m512 tmp18739 = _mm512_fmadd_ps(tmp18730, _mm512_set1_ps(4e+00f), tmp18731);
__m512 tmp18723 = _mm512_fmadd_ps(tmp18710, _mm512_set1_ps(1.6e+01f), tmp18711);
__m512 tmp18743 = _mm512_fmadd_ps(tmp18730, _mm512_set1_ps(1.6e+01f), tmp18731);
// Fold in x0 (only y0 uses it) and x7 (only y5 uses it).
__m512 tmp18708 = _mm512_add_ps(tmp18709, tmp18651);
__m512 tmp18728 = _mm512_add_ps(tmp18729, tmp18659);
__m512 tmp18725 = _mm512_add_ps(tmp18726, tmp18658);
__m512 tmp18745 = _mm512_add_ps(tmp18746, tmp18706);
// y0 = (x0 + s1 + s2) + 32*s3
__m512 tmp18707 = _mm512_fmadd_ps(tmp18712, _mm512_set1_ps(3.2e+01f), tmp18708);
__m512 tmp18727 = _mm512_fmadd_ps(tmp18732, _mm512_set1_ps(3.2e+01f), tmp18728);
// y2 = (s1 + 4*s2) + 8*s3
__m512 tmp18718 = _mm512_fmadd_ps(tmp18712, _mm512_set1_ps(8e+00f), tmp18719);
__m512 tmp18738 = _mm512_fmadd_ps(tmp18732, _mm512_set1_ps(8e+00f), tmp18739);
// y5 = (d1 + d3 + x7) + 32*d2
__m512 tmp18724 = _mm512_fmadd_ps(tmp18716, _mm512_set1_ps(3.2e+01f), tmp18725);
__m512 tmp18744 = _mm512_fmadd_ps(tmp18736, _mm512_set1_ps(3.2e+01f), tmp18745);
// y4 = (s1 + 16*s2) + 2*s3
__m512 tmp18722 = _mm512_fmadd_ps(tmp18712, _mm512_set1_ps(2e+00f), tmp18723);
__m512 tmp18742 = _mm512_fmadd_ps(tmp18732, _mm512_set1_ps(2e+00f), tmp18743);
// Name the results: out2395..out2400 = y0..y5 of group 1,
// out2401..out2406 = y0..y5 of group 2.
__m512 out2395 = tmp18707;
__m512 out2401 = tmp18727;
__m512 out2396 = tmp18713;
__m512 out2402 = tmp18733;
__m512 out2397 = tmp18718;
__m512 out2403 = tmp18738;
__m512 out2398 = tmp18720;
__m512 out2404 = tmp18740;
__m512 out2399 = tmp18722;
__m512 out2405 = tmp18742;
__m512 out2400 = tmp18724;
__m512 out2406 = tmp18744;
// Clamp every result at zero from below (elementwise max(0, x)).
out2395 = _mm512_max_ps(_mm512_setzero_ps(), out2395);
out2401 = _mm512_max_ps(_mm512_setzero_ps(), out2401);
out2396 = _mm512_max_ps(_mm512_setzero_ps(), out2396);
out2402 = _mm512_max_ps(_mm512_setzero_ps(), out2402);
out2397 = _mm512_max_ps(_mm512_setzero_ps(), out2397);
out2403 = _mm512_max_ps(_mm512_setzero_ps(), out2403);
out2398 = _mm512_max_ps(_mm512_setzero_ps(), out2398);
out2404 = _mm512_max_ps(_mm512_setzero_ps(), out2404);
out2399 = _mm512_max_ps(_mm512_setzero_ps(), out2399);
out2405 = _mm512_max_ps(_mm512_setzero_ps(), out2405);
out2400 = _mm512_max_ps(_mm512_setzero_ps(), out2400);
out2406 = _mm512_max_ps(_mm512_setzero_ps(), out2406);
// Masked unaligned stores into datPtr35; lanes whose mask bit is clear are not written.
//   group 1 (out2395..out2400): mask 255  = lanes 0..7,  constant offsets 360 + 56*N
//   group 2 (out2401..out2406): mask 4095 = lanes 0..11, constant offsets 832 + 56*N
// Consecutive results are 56 apart, the same factor that multiplies toH50.
// NOTE(review): the offset unit depends on datPtr35's declared type, which is
// outside this chunk; the 4*toW50 term suggests bytes. Confirm.
_mm512_mask_storeu_ps(datPtr35+360+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 255, out2395);
_mm512_mask_storeu_ps(datPtr35+832+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2401);
_mm512_mask_storeu_ps(datPtr35+416+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 255, out2396);
_mm512_mask_storeu_ps(datPtr35+888+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2402);
_mm512_mask_storeu_ps(datPtr35+472+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 255, out2397);
_mm512_mask_storeu_ps(datPtr35+944+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2403);
_mm512_mask_storeu_ps(datPtr35+528+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 255, out2398);
_mm512_mask_storeu_ps(datPtr35+1000+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2404);
_mm512_mask_storeu_ps(datPtr35+584+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 255, out2399);
_mm512_mask_storeu_ps(datPtr35+1056+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2405);
_mm512_mask_storeu_ps(datPtr35+640+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 255, out2400);
_mm512_mask_storeu_ps(datPtr35+1112+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2406);
// ---- One tile: load 16 vectors, apply an 8->6 linear map along one axis,
// ---- permute lanes, apply the same map again, clamp at zero, store.
//
// Loads: four groups of four consecutive 16-float vectors from sfPtr15. The
// groups start at constant offsets 512, 147968, 295424 and 442880 (147456
// apart); inside a group the loads are 64 apart. The rest of the address is
// 589824*i65 + 98304*j57 + 1536*k169 + 768*l72.
// Each pair of loads is recombined with _mm512_shuffle_f32x4:
//   imm 68  -> { a.lane0, a.lane1, b.lane0, b.lane1 }  (low 256 bits of a, then of b)
//   imm 238 -> { a.lane2, a.lane3, b.lane2, b.lane3 }  (high 256 bits of a, then of b)
// Note that the first pair of every group passes its operands in swapped order
// (e.g. sf1346 before sf1345) while the second pair does not.
__m512 sf1345 = _mm512_loadu_ps(sfPtr15+512+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1346 = _mm512_loadu_ps(sfPtr15+576+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2573 = _mm512_shuffle_f32x4(sf1346, sf1345, 68);
__m512 in2574 = _mm512_shuffle_f32x4(sf1346, sf1345, 238);
__m512 sf1347 = _mm512_loadu_ps(sfPtr15+640+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1348 = _mm512_loadu_ps(sfPtr15+704+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2581 = _mm512_shuffle_f32x4(sf1347, sf1348, 68);
__m512 in2582 = _mm512_shuffle_f32x4(sf1347, sf1348, 238);
__m512 sf1349 = _mm512_loadu_ps(sfPtr15+147968+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1350 = _mm512_loadu_ps(sfPtr15+148032+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2575 = _mm512_shuffle_f32x4(sf1350, sf1349, 68);
__m512 in2576 = _mm512_shuffle_f32x4(sf1350, sf1349, 238);
__m512 sf1351 = _mm512_loadu_ps(sfPtr15+148096+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1352 = _mm512_loadu_ps(sfPtr15+148160+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2583 = _mm512_shuffle_f32x4(sf1351, sf1352, 68);
__m512 in2584 = _mm512_shuffle_f32x4(sf1351, sf1352, 238);
__m512 sf1353 = _mm512_loadu_ps(sfPtr15+295424+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1354 = _mm512_loadu_ps(sfPtr15+295488+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2577 = _mm512_shuffle_f32x4(sf1354, sf1353, 68);
__m512 in2578 = _mm512_shuffle_f32x4(sf1354, sf1353, 238);
__m512 sf1355 = _mm512_loadu_ps(sfPtr15+295552+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1356 = _mm512_loadu_ps(sfPtr15+295616+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2585 = _mm512_shuffle_f32x4(sf1355, sf1356, 68);
__m512 in2586 = _mm512_shuffle_f32x4(sf1355, sf1356, 238);
__m512 sf1357 = _mm512_loadu_ps(sfPtr15+442880+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1358 = _mm512_loadu_ps(sfPtr15+442944+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2579 = _mm512_shuffle_f32x4(sf1358, sf1357, 68);
__m512 in2580 = _mm512_shuffle_f32x4(sf1358, sf1357, 238);
__m512 sf1359 = _mm512_loadu_ps(sfPtr15+443008+589824*i65+98304*j57+1536*k169+768*l72);
__m512 sf1360 = _mm512_loadu_ps(sfPtr15+443072+589824*i65+98304*j57+1536*k169+768*l72);
__m512 in2587 = _mm512_shuffle_f32x4(sf1359, sf1360, 68);
__m512 in2588 = _mm512_shuffle_f32x4(sf1359, sf1360, 238);
// First 1-D pass. Two groups are handled in lock-step:
//   group 1: x0..x7 = in2573..in2580     group 2: x0..x7 = in2581..in2588
// With s1=x1+x2, s2=x3+x4, s3=x5+x6 and d1=x1-x2, d2=x3-x4, d3=x5-x6:
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 =      d1 + 32*d2 +     d3 + x7
// NOTE(review): these coefficients look like a scaled Winograd-style output
// transform; confirm against the generator before relying on that name.
// s1, s2, d2, d1, s3, d3 (group 1 then group 2 for each)
__m512 tmp18803 = _mm512_add_ps(in2574, in2575);
__m512 tmp18823 = _mm512_add_ps(in2582, in2583);
__m512 tmp18802 = _mm512_add_ps(in2576, in2577);
__m512 tmp18822 = _mm512_add_ps(in2584, in2585);
__m512 tmp18808 = _mm512_sub_ps(in2576, in2577);
__m512 tmp18828 = _mm512_sub_ps(in2584, in2585);
__m512 tmp18807 = _mm512_sub_ps(in2574, in2575);
__m512 tmp18827 = _mm512_sub_ps(in2582, in2583);
__m512 tmp18804 = _mm512_add_ps(in2578, in2579);
__m512 tmp18824 = _mm512_add_ps(in2586, in2587);
__m512 tmp18809 = _mm512_sub_ps(in2578, in2579);
__m512 tmp18829 = _mm512_sub_ps(in2586, in2587);
// Partial sums d1 + 2*d2 and d1 + 8*d2.
__m512 tmp18806 = _mm512_fmadd_ps(tmp18808, _mm512_set1_ps(2e+00f), tmp18807);
__m512 tmp18826 = _mm512_fmadd_ps(tmp18828, _mm512_set1_ps(2e+00f), tmp18827);
__m512 tmp18813 = _mm512_fmadd_ps(tmp18808, _mm512_set1_ps(8e+00f), tmp18807);
__m512 tmp18833 = _mm512_fmadd_ps(tmp18828, _mm512_set1_ps(8e+00f), tmp18827);
// Partial sum s1 + s2.
__m512 tmp18801 = _mm512_add_ps(tmp18802, tmp18803);
__m512 tmp18821 = _mm512_add_ps(tmp18822, tmp18823);
// y1
__m512 tmp18805 = _mm512_fmadd_ps(tmp18809, _mm512_set1_ps(1.6e+01f), tmp18806);
__m512 tmp18825 = _mm512_fmadd_ps(tmp18829, _mm512_set1_ps(1.6e+01f), tmp18826);
// y3
__m512 tmp18812 = _mm512_fmadd_ps(tmp18809, _mm512_set1_ps(4e+00f), tmp18813);
__m512 tmp18832 = _mm512_fmadd_ps(tmp18829, _mm512_set1_ps(4e+00f), tmp18833);
// Partial sum d1 + d3.
__m512 tmp18818 = _mm512_add_ps(tmp18809, tmp18807);
__m512 tmp18838 = _mm512_add_ps(tmp18829, tmp18827);
// Partial sums s1 + 4*s2 and s1 + 16*s2.
__m512 tmp18811 = _mm512_fmadd_ps(tmp18802, _mm512_set1_ps(4e+00f), tmp18803);
__m512 tmp18831 = _mm512_fmadd_ps(tmp18822, _mm512_set1_ps(4e+00f), tmp18823);
__m512 tmp18815 = _mm512_fmadd_ps(tmp18802, _mm512_set1_ps(1.6e+01f), tmp18803);
__m512 tmp18835 = _mm512_fmadd_ps(tmp18822, _mm512_set1_ps(1.6e+01f), tmp18823);
// Fold in x0 (for y0) and x7 (for y5).
__m512 tmp18800 = _mm512_add_ps(tmp18801, in2573);
__m512 tmp18820 = _mm512_add_ps(tmp18821, in2581);
__m512 tmp18817 = _mm512_add_ps(tmp18818, in2580);
__m512 tmp18837 = _mm512_add_ps(tmp18838, in2588);
// y0, y2, y5, y4
__m512 tmp18799 = _mm512_fmadd_ps(tmp18804, _mm512_set1_ps(3.2e+01f), tmp18800);
__m512 tmp18819 = _mm512_fmadd_ps(tmp18824, _mm512_set1_ps(3.2e+01f), tmp18820);
__m512 tmp18810 = _mm512_fmadd_ps(tmp18804, _mm512_set1_ps(8e+00f), tmp18811);
__m512 tmp18830 = _mm512_fmadd_ps(tmp18824, _mm512_set1_ps(8e+00f), tmp18831);
__m512 tmp18816 = _mm512_fmadd_ps(tmp18808, _mm512_set1_ps(3.2e+01f), tmp18817);
__m512 tmp18836 = _mm512_fmadd_ps(tmp18828, _mm512_set1_ps(3.2e+01f), tmp18837);
__m512 tmp18814 = _mm512_fmadd_ps(tmp18804, _mm512_set1_ps(2e+00f), tmp18815);
__m512 tmp18834 = _mm512_fmadd_ps(tmp18824, _mm512_set1_ps(2e+00f), tmp18835);
// Collect the 12 first-pass results under consecutive names:
// tmp18787..tmp18792 = y0..y5 of group 1, tmp18793..tmp18798 = y0..y5 of group 2.
__m512 tmp18787 = tmp18799;
__m512 tmp18793 = tmp18819;
__m512 tmp18788 = tmp18805;
__m512 tmp18794 = tmp18825;
__m512 tmp18789 = tmp18810;
__m512 tmp18795 = tmp18830;
__m512 tmp18790 = tmp18812;
__m512 tmp18796 = tmp18832;
__m512 tmp18791 = tmp18814;
__m512 tmp18797 = tmp18834;
__m512 tmp18792 = tmp18816;
__m512 tmp18798 = tmp18836;
// Lane-permutation network: 12 vectors in, 16 vectors out.
// NOTE(review): the unpack / shuffle_ps / shuffle_f32x4 sequence has the shape
// of an in-register transpose between the two passes; confirm.
// Stage 1: interleave 32-bit elements of neighbouring vectors.
__m512 tmp18883 = _mm512_unpacklo_ps(tmp18787, tmp18788);
__m512 tmp18884 = _mm512_unpackhi_ps(tmp18787, tmp18788);
__m512 tmp18885 = _mm512_unpacklo_ps(tmp18789, tmp18790);
__m512 tmp18886 = _mm512_unpackhi_ps(tmp18789, tmp18790);
__m512 tmp18887 = _mm512_unpacklo_ps(tmp18791, tmp18792);
__m512 tmp18888 = _mm512_unpackhi_ps(tmp18791, tmp18792);
__m512 tmp18889 = _mm512_unpacklo_ps(tmp18793, tmp18794);
__m512 tmp18890 = _mm512_unpackhi_ps(tmp18793, tmp18794);
__m512 tmp18891 = _mm512_unpacklo_ps(tmp18795, tmp18796);
__m512 tmp18892 = _mm512_unpackhi_ps(tmp18795, tmp18796);
__m512 tmp18893 = _mm512_unpacklo_ps(tmp18797, tmp18798);
__m512 tmp18894 = _mm512_unpackhi_ps(tmp18797, tmp18798);
// Stage 2: within each 128-bit lane, imm 68 takes the low element pair of each
// operand and imm 238 takes the high element pair of each operand.
__m512 tmp18895 = _mm512_shuffle_ps(tmp18883, tmp18885, 68);
__m512 tmp18896 = _mm512_shuffle_ps(tmp18883, tmp18885, 238);
__m512 tmp18897 = _mm512_shuffle_ps(tmp18884, tmp18886, 68);
__m512 tmp18898 = _mm512_shuffle_ps(tmp18884, tmp18886, 238);
__m512 tmp18899 = _mm512_shuffle_ps(tmp18887, tmp18889, 68);
__m512 tmp18900 = _mm512_shuffle_ps(tmp18887, tmp18889, 238);
__m512 tmp18901 = _mm512_shuffle_ps(tmp18888, tmp18890, 68);
__m512 tmp18902 = _mm512_shuffle_ps(tmp18888, tmp18890, 238);
__m512 tmp18903 = _mm512_shuffle_ps(tmp18891, tmp18893, 68);
__m512 tmp18904 = _mm512_shuffle_ps(tmp18891, tmp18893, 238);
__m512 tmp18905 = _mm512_shuffle_ps(tmp18892, tmp18894, 68);
__m512 tmp18906 = _mm512_shuffle_ps(tmp18892, tmp18894, 238);
// Stage 3: regroup 128-bit lanes. imm 136 picks lanes {0,2} of each operand,
// imm 221 picks lanes {1,3}. The last eight statements pass the same vector as
// both operands.
__m512 tmp18907 = _mm512_shuffle_f32x4(tmp18895, tmp18899, 136);
__m512 tmp18908 = _mm512_shuffle_f32x4(tmp18895, tmp18899, 221);
__m512 tmp18909 = _mm512_shuffle_f32x4(tmp18896, tmp18900, 136);
__m512 tmp18910 = _mm512_shuffle_f32x4(tmp18896, tmp18900, 221);
__m512 tmp18911 = _mm512_shuffle_f32x4(tmp18897, tmp18901, 136);
__m512 tmp18912 = _mm512_shuffle_f32x4(tmp18897, tmp18901, 221);
__m512 tmp18913 = _mm512_shuffle_f32x4(tmp18898, tmp18902, 136);
__m512 tmp18914 = _mm512_shuffle_f32x4(tmp18898, tmp18902, 221);
__m512 tmp18915 = _mm512_shuffle_f32x4(tmp18903, tmp18903, 136);
__m512 tmp18916 = _mm512_shuffle_f32x4(tmp18903, tmp18903, 221);
__m512 tmp18917 = _mm512_shuffle_f32x4(tmp18904, tmp18904, 136);
__m512 tmp18918 = _mm512_shuffle_f32x4(tmp18904, tmp18904, 221);
__m512 tmp18919 = _mm512_shuffle_f32x4(tmp18905, tmp18905, 136);
__m512 tmp18920 = _mm512_shuffle_f32x4(tmp18905, tmp18905, 221);
__m512 tmp18921 = _mm512_shuffle_f32x4(tmp18906, tmp18906, 136);
__m512 tmp18922 = _mm512_shuffle_f32x4(tmp18906, tmp18906, 221);
// Stage 4: final 128-bit regrouping. The 12 existing names tmp18787..tmp18798
// are overwritten and four new vectors tmp18839..tmp18842 are introduced.
tmp18787 = _mm512_shuffle_f32x4(tmp18907, tmp18915, 136);
tmp18795 = _mm512_shuffle_f32x4(tmp18907, tmp18915, 221);
tmp18788 = _mm512_shuffle_f32x4(tmp18909, tmp18917, 136);
tmp18796 = _mm512_shuffle_f32x4(tmp18909, tmp18917, 221);
tmp18789 = _mm512_shuffle_f32x4(tmp18911, tmp18919, 136);
tmp18797 = _mm512_shuffle_f32x4(tmp18911, tmp18919, 221);
tmp18790 = _mm512_shuffle_f32x4(tmp18913, tmp18921, 136);
tmp18798 = _mm512_shuffle_f32x4(tmp18913, tmp18921, 221);
tmp18791 = _mm512_shuffle_f32x4(tmp18908, tmp18916, 136);
__m512 tmp18839 = _mm512_shuffle_f32x4(tmp18908, tmp18916, 221);
tmp18792 = _mm512_shuffle_f32x4(tmp18910, tmp18918, 136);
__m512 tmp18840 = _mm512_shuffle_f32x4(tmp18910, tmp18918, 221);
tmp18793 = _mm512_shuffle_f32x4(tmp18912, tmp18920, 136);
__m512 tmp18841 = _mm512_shuffle_f32x4(tmp18912, tmp18920, 221);
tmp18794 = _mm512_shuffle_f32x4(tmp18914, tmp18922, 136);
__m512 tmp18842 = _mm512_shuffle_f32x4(tmp18914, tmp18922, 221);
// Second 1-D pass: the same y0..y5 formulas as the first pass, now with
//   group 1: x0..x7 = tmp18787, tmp18788, tmp18789, tmp18790, tmp18791, tmp18792, tmp18793, tmp18794
//   group 2: x0..x7 = tmp18795, tmp18796, tmp18797, tmp18798, tmp18839, tmp18840, tmp18841, tmp18842
// s1, s2, d2, d1, s3, d3
__m512 tmp18847 = _mm512_add_ps(tmp18788, tmp18789);
__m512 tmp18867 = _mm512_add_ps(tmp18796, tmp18797);
__m512 tmp18846 = _mm512_add_ps(tmp18790, tmp18791);
__m512 tmp18866 = _mm512_add_ps(tmp18798, tmp18839);
__m512 tmp18852 = _mm512_sub_ps(tmp18790, tmp18791);
__m512 tmp18872 = _mm512_sub_ps(tmp18798, tmp18839);
__m512 tmp18851 = _mm512_sub_ps(tmp18788, tmp18789);
__m512 tmp18871 = _mm512_sub_ps(tmp18796, tmp18797);
__m512 tmp18848 = _mm512_add_ps(tmp18792, tmp18793);
__m512 tmp18868 = _mm512_add_ps(tmp18840, tmp18841);
__m512 tmp18853 = _mm512_sub_ps(tmp18792, tmp18793);
__m512 tmp18873 = _mm512_sub_ps(tmp18840, tmp18841);
// Partial sums d1 + 2*d2 and d1 + 8*d2.
__m512 tmp18850 = _mm512_fmadd_ps(tmp18852, _mm512_set1_ps(2e+00f), tmp18851);
__m512 tmp18870 = _mm512_fmadd_ps(tmp18872, _mm512_set1_ps(2e+00f), tmp18871);
__m512 tmp18857 = _mm512_fmadd_ps(tmp18852, _mm512_set1_ps(8e+00f), tmp18851);
__m512 tmp18877 = _mm512_fmadd_ps(tmp18872, _mm512_set1_ps(8e+00f), tmp18871);
// Partial sum s1 + s2.
__m512 tmp18845 = _mm512_add_ps(tmp18846, tmp18847);
__m512 tmp18865 = _mm512_add_ps(tmp18866, tmp18867);
// y1
__m512 tmp18849 = _mm512_fmadd_ps(tmp18853, _mm512_set1_ps(1.6e+01f), tmp18850);
__m512 tmp18869 = _mm512_fmadd_ps(tmp18873, _mm512_set1_ps(1.6e+01f), tmp18870);
// y3
__m512 tmp18856 = _mm512_fmadd_ps(tmp18853, _mm512_set1_ps(4e+00f), tmp18857);
__m512 tmp18876 = _mm512_fmadd_ps(tmp18873, _mm512_set1_ps(4e+00f), tmp18877);
// Partial sum d1 + d3.
__m512 tmp18862 = _mm512_add_ps(tmp18853, tmp18851);
__m512 tmp18882 = _mm512_add_ps(tmp18873, tmp18871);
// Partial sums s1 + 4*s2 and s1 + 16*s2.
__m512 tmp18855 = _mm512_fmadd_ps(tmp18846, _mm512_set1_ps(4e+00f), tmp18847);
__m512 tmp18875 = _mm512_fmadd_ps(tmp18866, _mm512_set1_ps(4e+00f), tmp18867);
__m512 tmp18859 = _mm512_fmadd_ps(tmp18846, _mm512_set1_ps(1.6e+01f), tmp18847);
__m512 tmp18879 = _mm512_fmadd_ps(tmp18866, _mm512_set1_ps(1.6e+01f), tmp18867);
// Fold in x0 (for y0) and x7 (for y5).
__m512 tmp18844 = _mm512_add_ps(tmp18845, tmp18787);
__m512 tmp18864 = _mm512_add_ps(tmp18865, tmp18795);
__m512 tmp18861 = _mm512_add_ps(tmp18862, tmp18794);
__m512 tmp18881 = _mm512_add_ps(tmp18882, tmp18842);
// y0, y2, y5, y4
__m512 tmp18843 = _mm512_fmadd_ps(tmp18848, _mm512_set1_ps(3.2e+01f), tmp18844);
__m512 tmp18863 = _mm512_fmadd_ps(tmp18868, _mm512_set1_ps(3.2e+01f), tmp18864);
__m512 tmp18854 = _mm512_fmadd_ps(tmp18848, _mm512_set1_ps(8e+00f), tmp18855);
__m512 tmp18874 = _mm512_fmadd_ps(tmp18868, _mm512_set1_ps(8e+00f), tmp18875);
__m512 tmp18860 = _mm512_fmadd_ps(tmp18852, _mm512_set1_ps(3.2e+01f), tmp18861);
__m512 tmp18880 = _mm512_fmadd_ps(tmp18872, _mm512_set1_ps(3.2e+01f), tmp18881);
__m512 tmp18858 = _mm512_fmadd_ps(tmp18848, _mm512_set1_ps(2e+00f), tmp18859);
__m512 tmp18878 = _mm512_fmadd_ps(tmp18868, _mm512_set1_ps(2e+00f), tmp18879);
// Name the results: out2413..out2418 = y0..y5 of group 1,
// out2407..out2412 = y0..y5 of group 2.
__m512 out2413 = tmp18843;
__m512 out2407 = tmp18863;
__m512 out2414 = tmp18849;
__m512 out2408 = tmp18869;
__m512 out2415 = tmp18854;
__m512 out2409 = tmp18874;
__m512 out2416 = tmp18856;
__m512 out2410 = tmp18876;
__m512 out2417 = tmp18858;
__m512 out2411 = tmp18878;
__m512 out2418 = tmp18860;
__m512 out2412 = tmp18880;
// Clamp every result at zero from below (elementwise max(0, x)).
out2413 = _mm512_max_ps(_mm512_setzero_ps(), out2413);
out2407 = _mm512_max_ps(_mm512_setzero_ps(), out2407);
out2414 = _mm512_max_ps(_mm512_setzero_ps(), out2414);
out2408 = _mm512_max_ps(_mm512_setzero_ps(), out2408);
out2415 = _mm512_max_ps(_mm512_setzero_ps(), out2415);
out2409 = _mm512_max_ps(_mm512_setzero_ps(), out2409);
out2416 = _mm512_max_ps(_mm512_setzero_ps(), out2416);
out2410 = _mm512_max_ps(_mm512_setzero_ps(), out2410);
out2417 = _mm512_max_ps(_mm512_setzero_ps(), out2417);
out2411 = _mm512_max_ps(_mm512_setzero_ps(), out2411);
out2418 = _mm512_max_ps(_mm512_setzero_ps(), out2418);
out2412 = _mm512_max_ps(_mm512_setzero_ps(), out2412);
// Masked unaligned stores into datPtr35; lanes whose mask bit is clear are not written.
//   group 1 (out2413..out2418): mask 4095 = lanes 0..11, constant offsets 1168 + 56*N
//   group 2 (out2407..out2412): each vector is stored twice,
//       mask 3   = lanes 0..1 at constant offsets  880 + 56*N
//       mask 192 = lanes 6..7 at constant offsets 1192 + 56*N
// Consecutive results are 56 apart, the same factor that multiplies toH50.
// NOTE(review): the offset unit depends on datPtr35's declared type, which is
// outside this chunk; the 4*toW50 term suggests bytes. Confirm.
_mm512_mask_storeu_ps(datPtr35+1168+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2413);
_mm512_mask_storeu_ps(datPtr35+880+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 3, out2407);
_mm512_mask_storeu_ps(datPtr35+1192+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 192, out2407);
_mm512_mask_storeu_ps(datPtr35+1224+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2414);
_mm512_mask_storeu_ps(datPtr35+936+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 3, out2408);
_mm512_mask_storeu_ps(datPtr35+1248+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 192, out2408);
_mm512_mask_storeu_ps(datPtr35+1280+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2415);
_mm512_mask_storeu_ps(datPtr35+992+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 3, out2409);
_mm512_mask_storeu_ps(datPtr35+1304+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 192, out2409);
_mm512_mask_storeu_ps(datPtr35+1336+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2416);
_mm512_mask_storeu_ps(datPtr35+1048+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 3, out2410);
_mm512_mask_storeu_ps(datPtr35+1360+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 192, out2410);
_mm512_mask_storeu_ps(datPtr35+1392+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2417);
_mm512_mask_storeu_ps(datPtr35+1104+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 3, out2411);
_mm512_mask_storeu_ps(datPtr35+1416+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 192, out2411);
_mm512_mask_storeu_ps(datPtr35+1448+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 4095, out2418);
_mm512_mask_storeu_ps(datPtr35+1160+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 3, out2412);
_mm512_mask_storeu_ps(datPtr35+1472+212992*i65+56*toH50+4*toW50+3328*k169+1664*l72, 192, out2412);
}
// Return from the enclosing function once k169 has reached (or passed) kk60.
// kk60 is assigned above this chunk; presumably it is the last k169 value this
// caller is responsible for. Verify against its definition.
if (k169 >= kk60) return;
// Closes a scope opened above this chunk (presumably the k169 loop).
}
// Same pattern one level out: return once j57 has reached last14, otherwise
// advance j57 and set rel26 to 1 before continuing.
// NOTE(review): the meaning of rel26 is not visible in this chunk.
if (j57 >= last14) return;
++j57;
rel26 = 1;
// Closes a scope opened above this chunk.
}
// Index setup for the code that follows: toH51 is base26 plus 12 and toW51 is 0.
ptrdiff_t toH51 = base26+12;
ptrdiff_t toW51 = 0;
// k170 starts at 32*w74 and kk61 is 31 past that start; presumably kk61 plays
// the same "last index" role for k170 that kk60 plays for k169 above.
ptrdiff_t k170 = 32*w74;
ptrdiff_t kk61 = k170+31;
for (; k170 != 64; ++k170) {
ptrdiff_t l73 = 0;
for (; l73 != 1; ++l73) {
__m512 sf1361 = _mm512_loadu_ps(sfPtr15+0+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1362 = _mm512_loadu_ps(sfPtr15+128+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2589 = _mm512_shuffle_f32x4(sf1361, sf1362, 68);
__m512 in2590 = _mm512_shuffle_f32x4(sf1361, sf1362, 238);
__m512 sf1363 = _mm512_loadu_ps(sfPtr15+64+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1364 = _mm512_loadu_ps(sfPtr15+192+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2597 = _mm512_shuffle_f32x4(sf1363, sf1364, 68);
__m512 in2598 = _mm512_shuffle_f32x4(sf1363, sf1364, 238);
__m512 sf1365 = _mm512_loadu_ps(sfPtr15+147456+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1366 = _mm512_loadu_ps(sfPtr15+147584+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2591 = _mm512_shuffle_f32x4(sf1365, sf1366, 68);
__m512 in2592 = _mm512_shuffle_f32x4(sf1365, sf1366, 238);
__m512 sf1367 = _mm512_loadu_ps(sfPtr15+147520+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1368 = _mm512_loadu_ps(sfPtr15+147648+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2599 = _mm512_shuffle_f32x4(sf1367, sf1368, 68);
__m512 in2600 = _mm512_shuffle_f32x4(sf1367, sf1368, 238);
__m512 sf1369 = _mm512_loadu_ps(sfPtr15+294912+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1370 = _mm512_loadu_ps(sfPtr15+295040+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2593 = _mm512_shuffle_f32x4(sf1369, sf1370, 68);
__m512 in2594 = _mm512_shuffle_f32x4(sf1369, sf1370, 238);
__m512 sf1371 = _mm512_loadu_ps(sfPtr15+294976+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1372 = _mm512_loadu_ps(sfPtr15+295104+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2601 = _mm512_shuffle_f32x4(sf1371, sf1372, 68);
__m512 in2602 = _mm512_shuffle_f32x4(sf1371, sf1372, 238);
__m512 sf1373 = _mm512_loadu_ps(sfPtr15+442368+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1374 = _mm512_loadu_ps(sfPtr15+442496+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2595 = _mm512_shuffle_f32x4(sf1373, sf1374, 68);
__m512 in2596 = _mm512_shuffle_f32x4(sf1373, sf1374, 238);
__m512 sf1375 = _mm512_loadu_ps(sfPtr15+442432+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1376 = _mm512_loadu_ps(sfPtr15+442560+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2603 = _mm512_shuffle_f32x4(sf1375, sf1376, 68);
__m512 in2604 = _mm512_shuffle_f32x4(sf1375, sf1376, 238);
__m512 tmp18939 = _mm512_add_ps(in2590, in2591);
__m512 tmp18959 = _mm512_add_ps(in2598, in2599);
__m512 tmp18938 = _mm512_add_ps(in2592, in2593);
__m512 tmp18958 = _mm512_add_ps(in2600, in2601);
__m512 tmp18944 = _mm512_sub_ps(in2592, in2593);
__m512 tmp18964 = _mm512_sub_ps(in2600, in2601);
__m512 tmp18943 = _mm512_sub_ps(in2590, in2591);
__m512 tmp18963 = _mm512_sub_ps(in2598, in2599);
__m512 tmp18940 = _mm512_add_ps(in2594, in2595);
__m512 tmp18960 = _mm512_add_ps(in2602, in2603);
__m512 tmp18945 = _mm512_sub_ps(in2594, in2595);
__m512 tmp18965 = _mm512_sub_ps(in2602, in2603);
__m512 tmp18942 = _mm512_fmadd_ps(tmp18944, _mm512_set1_ps(2e+00f), tmp18943);
__m512 tmp18962 = _mm512_fmadd_ps(tmp18964, _mm512_set1_ps(2e+00f), tmp18963);
__m512 tmp18949 = _mm512_fmadd_ps(tmp18944, _mm512_set1_ps(8e+00f), tmp18943);
__m512 tmp18969 = _mm512_fmadd_ps(tmp18964, _mm512_set1_ps(8e+00f), tmp18963);
__m512 tmp18937 = _mm512_add_ps(tmp18938, tmp18939);
__m512 tmp18957 = _mm512_add_ps(tmp18958, tmp18959);
__m512 tmp18941 = _mm512_fmadd_ps(tmp18945, _mm512_set1_ps(1.6e+01f), tmp18942);
__m512 tmp18961 = _mm512_fmadd_ps(tmp18965, _mm512_set1_ps(1.6e+01f), tmp18962);
__m512 tmp18948 = _mm512_fmadd_ps(tmp18945, _mm512_set1_ps(4e+00f), tmp18949);
__m512 tmp18968 = _mm512_fmadd_ps(tmp18965, _mm512_set1_ps(4e+00f), tmp18969);
__m512 tmp18954 = _mm512_add_ps(tmp18945, tmp18943);
__m512 tmp18974 = _mm512_add_ps(tmp18965, tmp18963);
__m512 tmp18947 = _mm512_fmadd_ps(tmp18938, _mm512_set1_ps(4e+00f), tmp18939);
__m512 tmp18967 = _mm512_fmadd_ps(tmp18958, _mm512_set1_ps(4e+00f), tmp18959);
__m512 tmp18951 = _mm512_fmadd_ps(tmp18938, _mm512_set1_ps(1.6e+01f), tmp18939);
__m512 tmp18971 = _mm512_fmadd_ps(tmp18958, _mm512_set1_ps(1.6e+01f), tmp18959);
__m512 tmp18936 = _mm512_add_ps(tmp18937, in2589);
__m512 tmp18956 = _mm512_add_ps(tmp18957, in2597);
__m512 tmp18953 = _mm512_add_ps(tmp18954, in2596);
__m512 tmp18973 = _mm512_add_ps(tmp18974, in2604);
__m512 tmp18935 = _mm512_fmadd_ps(tmp18940, _mm512_set1_ps(3.2e+01f), tmp18936);
__m512 tmp18955 = _mm512_fmadd_ps(tmp18960, _mm512_set1_ps(3.2e+01f), tmp18956);
__m512 tmp18946 = _mm512_fmadd_ps(tmp18940, _mm512_set1_ps(8e+00f), tmp18947);
__m512 tmp18966 = _mm512_fmadd_ps(tmp18960, _mm512_set1_ps(8e+00f), tmp18967);
__m512 tmp18952 = _mm512_fmadd_ps(tmp18944, _mm512_set1_ps(3.2e+01f), tmp18953);
__m512 tmp18972 = _mm512_fmadd_ps(tmp18964, _mm512_set1_ps(3.2e+01f), tmp18973);
__m512 tmp18950 = _mm512_fmadd_ps(tmp18940, _mm512_set1_ps(2e+00f), tmp18951);
__m512 tmp18970 = _mm512_fmadd_ps(tmp18960, _mm512_set1_ps(2e+00f), tmp18971);
__m512 tmp18923 = tmp18935;
__m512 tmp18929 = tmp18955;
__m512 tmp18924 = tmp18941;
__m512 tmp18930 = tmp18961;
__m512 tmp18925 = tmp18946;
__m512 tmp18931 = tmp18966;
__m512 tmp18926 = tmp18948;
__m512 tmp18932 = tmp18968;
__m512 tmp18927 = tmp18950;
__m512 tmp18933 = tmp18970;
__m512 tmp18928 = tmp18952;
__m512 tmp18934 = tmp18972;
__m512 tmp19001 = _mm512_unpacklo_ps(tmp18923, tmp18924);
__m512 tmp19002 = _mm512_unpackhi_ps(tmp18923, tmp18924);
__m512 tmp19003 = _mm512_unpacklo_ps(tmp18925, tmp18926);
__m512 tmp19004 = _mm512_unpackhi_ps(tmp18925, tmp18926);
__m512 tmp19005 = _mm512_unpacklo_ps(tmp18927, tmp18928);
__m512 tmp19006 = _mm512_unpackhi_ps(tmp18927, tmp18928);
__m512 tmp19007 = _mm512_unpacklo_ps(tmp18929, tmp18930);
__m512 tmp19008 = _mm512_unpackhi_ps(tmp18929, tmp18930);
__m512 tmp19009 = _mm512_unpacklo_ps(tmp18931, tmp18932);
__m512 tmp19010 = _mm512_unpackhi_ps(tmp18931, tmp18932);
__m512 tmp19011 = _mm512_unpacklo_ps(tmp18933, tmp18934);
__m512 tmp19012 = _mm512_unpackhi_ps(tmp18933, tmp18934);
__m512 tmp19013 = _mm512_shuffle_ps(tmp19001, tmp19003, 68);
__m512 tmp19014 = _mm512_shuffle_ps(tmp19001, tmp19003, 238);
__m512 tmp19015 = _mm512_shuffle_ps(tmp19002, tmp19004, 68);
__m512 tmp19016 = _mm512_shuffle_ps(tmp19002, tmp19004, 238);
__m512 tmp19017 = _mm512_shuffle_ps(tmp19005, tmp19007, 68);
__m512 tmp19018 = _mm512_shuffle_ps(tmp19005, tmp19007, 238);
__m512 tmp19019 = _mm512_shuffle_ps(tmp19006, tmp19008, 68);
__m512 tmp19020 = _mm512_shuffle_ps(tmp19006, tmp19008, 238);
__m512 tmp19021 = _mm512_shuffle_ps(tmp19009, tmp19011, 68);
__m512 tmp19022 = _mm512_shuffle_ps(tmp19009, tmp19011, 238);
__m512 tmp19023 = _mm512_shuffle_ps(tmp19010, tmp19012, 68);
__m512 tmp19024 = _mm512_shuffle_ps(tmp19010, tmp19012, 238);
__m512 tmp19025 = _mm512_shuffle_f32x4(tmp19013, tmp19017, 136);
__m512 tmp19026 = _mm512_shuffle_f32x4(tmp19013, tmp19017, 221);
__m512 tmp19027 = _mm512_shuffle_f32x4(tmp19014, tmp19018, 136);
__m512 tmp19028 = _mm512_shuffle_f32x4(tmp19014, tmp19018, 221);
__m512 tmp19029 = _mm512_shuffle_f32x4(tmp19015, tmp19019, 136);
__m512 tmp19030 = _mm512_shuffle_f32x4(tmp19015, tmp19019, 221);
__m512 tmp19031 = _mm512_shuffle_f32x4(tmp19016, tmp19020, 136);
__m512 tmp19032 = _mm512_shuffle_f32x4(tmp19016, tmp19020, 221);
__m512 tmp19033 = _mm512_shuffle_f32x4(tmp19021, tmp19021, 136);
__m512 tmp19034 = _mm512_shuffle_f32x4(tmp19021, tmp19021, 221);
__m512 tmp19035 = _mm512_shuffle_f32x4(tmp19022, tmp19022, 136);
__m512 tmp19036 = _mm512_shuffle_f32x4(tmp19022, tmp19022, 221);
__m512 tmp19037 = _mm512_shuffle_f32x4(tmp19023, tmp19023, 136);
__m512 tmp19038 = _mm512_shuffle_f32x4(tmp19023, tmp19023, 221);
__m512 tmp19039 = _mm512_shuffle_f32x4(tmp19024, tmp19024, 136);
__m512 tmp19040 = _mm512_shuffle_f32x4(tmp19024, tmp19024, 221);
tmp18923 = _mm512_shuffle_f32x4(tmp19025, tmp19033, 136);
tmp18931 = _mm512_shuffle_f32x4(tmp19025, tmp19033, 221);
tmp18924 = _mm512_shuffle_f32x4(tmp19027, tmp19035, 136);
tmp18932 = _mm512_shuffle_f32x4(tmp19027, tmp19035, 221);
tmp18925 = _mm512_shuffle_f32x4(tmp19029, tmp19037, 136);
tmp18933 = _mm512_shuffle_f32x4(tmp19029, tmp19037, 221);
tmp18926 = _mm512_shuffle_f32x4(tmp19031, tmp19039, 136);
tmp18934 = _mm512_shuffle_f32x4(tmp19031, tmp19039, 221);
tmp18927 = _mm512_shuffle_f32x4(tmp19026, tmp19034, 136);
__m512 tmp18975 = _mm512_shuffle_f32x4(tmp19026, tmp19034, 221);
tmp18928 = _mm512_shuffle_f32x4(tmp19028, tmp19036, 136);
__m512 tmp18976 = _mm512_shuffle_f32x4(tmp19028, tmp19036, 221);
tmp18929 = _mm512_shuffle_f32x4(tmp19030, tmp19038, 136);
__m512 tmp18977 = _mm512_shuffle_f32x4(tmp19030, tmp19038, 221);
tmp18930 = _mm512_shuffle_f32x4(tmp19032, tmp19040, 136);
__m512 tmp18978 = _mm512_shuffle_f32x4(tmp19032, tmp19040, 221);
// tmp18930 and tmp18978 are produced by the permutation network just above but
// are not read by the reduced second pass below; the (void) casts mark them as
// deliberately unused.
(void)tmp18930;
(void)tmp18978;
// Reduced second 1-D pass: only two results per group are computed here.
//   group 1: x0..x6 = tmp18923, tmp18924, tmp18925, tmp18926, tmp18927, tmp18928, tmp18929
//   group 2: x0..x6 = tmp18931, tmp18932, tmp18933, tmp18934, tmp18975, tmp18976, tmp18977
// With s1=x1+x2, s2=x3+x4, s3=x5+x6 and d1=x1-x2, d2=x3-x4, d3=x5-x6:
//   y0 = x0 + s1 + s2 + 32*s3
//   y1 = d1 + 2*d2 + 16*d3
// s1, s2, d2, d1, s3, d3 (group 1 then group 2 for each)
__m512 tmp18983 = _mm512_add_ps(tmp18924, tmp18925);
__m512 tmp18994 = _mm512_add_ps(tmp18932, tmp18933);
__m512 tmp18982 = _mm512_add_ps(tmp18926, tmp18927);
__m512 tmp18993 = _mm512_add_ps(tmp18934, tmp18975);
__m512 tmp18988 = _mm512_sub_ps(tmp18926, tmp18927);
__m512 tmp18999 = _mm512_sub_ps(tmp18934, tmp18975);
__m512 tmp18987 = _mm512_sub_ps(tmp18924, tmp18925);
__m512 tmp18998 = _mm512_sub_ps(tmp18932, tmp18933);
__m512 tmp18984 = _mm512_add_ps(tmp18928, tmp18929);
__m512 tmp18995 = _mm512_add_ps(tmp18976, tmp18977);
__m512 tmp18989 = _mm512_sub_ps(tmp18928, tmp18929);
__m512 tmp19000 = _mm512_sub_ps(tmp18976, tmp18977);
// Partial sum d1 + 2*d2.
__m512 tmp18986 = _mm512_fmadd_ps(tmp18988, _mm512_set1_ps(2e+00f), tmp18987);
__m512 tmp18997 = _mm512_fmadd_ps(tmp18999, _mm512_set1_ps(2e+00f), tmp18998);
// Partial sum s1 + s2.
__m512 tmp18981 = _mm512_add_ps(tmp18982, tmp18983);
__m512 tmp18992 = _mm512_add_ps(tmp18993, tmp18994);
// y1 = (d1 + 2*d2) + 16*d3
__m512 tmp18985 = _mm512_fmadd_ps(tmp18989, _mm512_set1_ps(1.6e+01f), tmp18986);
__m512 tmp18996 = _mm512_fmadd_ps(tmp19000, _mm512_set1_ps(1.6e+01f), tmp18997);
// Fold in x0, then y0 = (x0 + s1 + s2) + 32*s3.
__m512 tmp18980 = _mm512_add_ps(tmp18981, tmp18923);
__m512 tmp18991 = _mm512_add_ps(tmp18992, tmp18931);
__m512 tmp18979 = _mm512_fmadd_ps(tmp18984, _mm512_set1_ps(3.2e+01f), tmp18980);
__m512 tmp18990 = _mm512_fmadd_ps(tmp18995, _mm512_set1_ps(3.2e+01f), tmp18991);
// Name the results: out2419/out2420 = y0/y1 of group 1,
// out2421/out2422 = y0/y1 of group 2.
__m512 out2419 = tmp18979;
__m512 out2421 = tmp18990;
__m512 out2420 = tmp18985;
__m512 out2422 = tmp18996;
// Clamp every result at zero from below (elementwise max(0, x)).
out2419 = _mm512_max_ps(_mm512_setzero_ps(), out2419);
out2421 = _mm512_max_ps(_mm512_setzero_ps(), out2421);
out2420 = _mm512_max_ps(_mm512_setzero_ps(), out2420);
out2422 = _mm512_max_ps(_mm512_setzero_ps(), out2422);
// Masked unaligned stores into datPtr35; lanes whose mask bit is clear are not written.
//   group 1: mask 4095 = lanes 0..11 at constant offsets 0 (y0) and 56 (y1)
//   group 2: each vector is stored twice,
//       mask 3    = lanes 0..1  at constant offsets  48 (y0) and 104 (y1)
//       mask 4032 = lanes 6..11 at constant offsets 808 (y0) and 864 (y1)
// y1 is 56 past y0 in every case, the same factor that multiplies toH51.
// NOTE(review): the offset unit depends on datPtr35's declared type, which is
// outside this chunk; the 4*toW51 term suggests bytes. Confirm.
_mm512_mask_storeu_ps(datPtr35+0+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 4095, out2419);
_mm512_mask_storeu_ps(datPtr35+48+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 3, out2421);
_mm512_mask_storeu_ps(datPtr35+808+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 4032, out2421);
_mm512_mask_storeu_ps(datPtr35+56+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 4095, out2420);
_mm512_mask_storeu_ps(datPtr35+104+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 3, out2422);
_mm512_mask_storeu_ps(datPtr35+864+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 4032, out2422);
__m512 sf1377 = _mm512_loadu_ps(sfPtr15+256+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1378 = _mm512_loadu_ps(sfPtr15+384+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2605 = _mm512_shuffle_f32x4(sf1377, sf1378, 68);
__m512 in2606 = _mm512_shuffle_f32x4(sf1377, sf1378, 238);
__m512 sf1379 = _mm512_loadu_ps(sfPtr15+320+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1380 = _mm512_loadu_ps(sfPtr15+448+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2613 = _mm512_shuffle_f32x4(sf1379, sf1380, 68);
__m512 in2614 = _mm512_shuffle_f32x4(sf1379, sf1380, 238);
__m512 sf1381 = _mm512_loadu_ps(sfPtr15+147712+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1382 = _mm512_loadu_ps(sfPtr15+147840+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2607 = _mm512_shuffle_f32x4(sf1381, sf1382, 68);
__m512 in2608 = _mm512_shuffle_f32x4(sf1381, sf1382, 238);
__m512 sf1383 = _mm512_loadu_ps(sfPtr15+147776+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1384 = _mm512_loadu_ps(sfPtr15+147904+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2615 = _mm512_shuffle_f32x4(sf1383, sf1384, 68);
__m512 in2616 = _mm512_shuffle_f32x4(sf1383, sf1384, 238);
__m512 sf1385 = _mm512_loadu_ps(sfPtr15+295168+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1386 = _mm512_loadu_ps(sfPtr15+295296+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2609 = _mm512_shuffle_f32x4(sf1385, sf1386, 68);
__m512 in2610 = _mm512_shuffle_f32x4(sf1385, sf1386, 238);
__m512 sf1387 = _mm512_loadu_ps(sfPtr15+295232+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1388 = _mm512_loadu_ps(sfPtr15+295360+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2617 = _mm512_shuffle_f32x4(sf1387, sf1388, 68);
__m512 in2618 = _mm512_shuffle_f32x4(sf1387, sf1388, 238);
__m512 sf1389 = _mm512_loadu_ps(sfPtr15+442624+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1390 = _mm512_loadu_ps(sfPtr15+442752+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2611 = _mm512_shuffle_f32x4(sf1389, sf1390, 68);
__m512 in2612 = _mm512_shuffle_f32x4(sf1389, sf1390, 238);
__m512 sf1391 = _mm512_loadu_ps(sfPtr15+442688+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1392 = _mm512_loadu_ps(sfPtr15+442816+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2619 = _mm512_shuffle_f32x4(sf1391, sf1392, 68);
__m512 in2620 = _mm512_shuffle_f32x4(sf1391, sf1392, 238);
__m512 tmp19057 = _mm512_add_ps(in2606, in2607);
__m512 tmp19077 = _mm512_add_ps(in2614, in2615);
__m512 tmp19056 = _mm512_add_ps(in2608, in2609);
__m512 tmp19076 = _mm512_add_ps(in2616, in2617);
__m512 tmp19062 = _mm512_sub_ps(in2608, in2609);
__m512 tmp19082 = _mm512_sub_ps(in2616, in2617);
__m512 tmp19061 = _mm512_sub_ps(in2606, in2607);
__m512 tmp19081 = _mm512_sub_ps(in2614, in2615);
__m512 tmp19058 = _mm512_add_ps(in2610, in2611);
__m512 tmp19078 = _mm512_add_ps(in2618, in2619);
__m512 tmp19063 = _mm512_sub_ps(in2610, in2611);
__m512 tmp19083 = _mm512_sub_ps(in2618, in2619);
__m512 tmp19060 = _mm512_fmadd_ps(tmp19062, _mm512_set1_ps(2e+00f), tmp19061);
__m512 tmp19080 = _mm512_fmadd_ps(tmp19082, _mm512_set1_ps(2e+00f), tmp19081);
__m512 tmp19067 = _mm512_fmadd_ps(tmp19062, _mm512_set1_ps(8e+00f), tmp19061);
__m512 tmp19087 = _mm512_fmadd_ps(tmp19082, _mm512_set1_ps(8e+00f), tmp19081);
__m512 tmp19055 = _mm512_add_ps(tmp19056, tmp19057);
__m512 tmp19075 = _mm512_add_ps(tmp19076, tmp19077);
__m512 tmp19059 = _mm512_fmadd_ps(tmp19063, _mm512_set1_ps(1.6e+01f), tmp19060);
__m512 tmp19079 = _mm512_fmadd_ps(tmp19083, _mm512_set1_ps(1.6e+01f), tmp19080);
__m512 tmp19066 = _mm512_fmadd_ps(tmp19063, _mm512_set1_ps(4e+00f), tmp19067);
__m512 tmp19086 = _mm512_fmadd_ps(tmp19083, _mm512_set1_ps(4e+00f), tmp19087);
__m512 tmp19072 = _mm512_add_ps(tmp19063, tmp19061);
__m512 tmp19092 = _mm512_add_ps(tmp19083, tmp19081);
__m512 tmp19065 = _mm512_fmadd_ps(tmp19056, _mm512_set1_ps(4e+00f), tmp19057);
__m512 tmp19085 = _mm512_fmadd_ps(tmp19076, _mm512_set1_ps(4e+00f), tmp19077);
__m512 tmp19069 = _mm512_fmadd_ps(tmp19056, _mm512_set1_ps(1.6e+01f), tmp19057);
__m512 tmp19089 = _mm512_fmadd_ps(tmp19076, _mm512_set1_ps(1.6e+01f), tmp19077);
__m512 tmp19054 = _mm512_add_ps(tmp19055, in2605);
__m512 tmp19074 = _mm512_add_ps(tmp19075, in2613);
__m512 tmp19071 = _mm512_add_ps(tmp19072, in2612);
__m512 tmp19091 = _mm512_add_ps(tmp19092, in2620);
__m512 tmp19053 = _mm512_fmadd_ps(tmp19058, _mm512_set1_ps(3.2e+01f), tmp19054);
__m512 tmp19073 = _mm512_fmadd_ps(tmp19078, _mm512_set1_ps(3.2e+01f), tmp19074);
__m512 tmp19064 = _mm512_fmadd_ps(tmp19058, _mm512_set1_ps(8e+00f), tmp19065);
__m512 tmp19084 = _mm512_fmadd_ps(tmp19078, _mm512_set1_ps(8e+00f), tmp19085);
__m512 tmp19070 = _mm512_fmadd_ps(tmp19062, _mm512_set1_ps(3.2e+01f), tmp19071);
__m512 tmp19090 = _mm512_fmadd_ps(tmp19082, _mm512_set1_ps(3.2e+01f), tmp19091);
__m512 tmp19068 = _mm512_fmadd_ps(tmp19058, _mm512_set1_ps(2e+00f), tmp19069);
__m512 tmp19088 = _mm512_fmadd_ps(tmp19078, _mm512_set1_ps(2e+00f), tmp19089);
__m512 tmp19041 = tmp19053;
__m512 tmp19047 = tmp19073;
__m512 tmp19042 = tmp19059;
__m512 tmp19048 = tmp19079;
__m512 tmp19043 = tmp19064;
__m512 tmp19049 = tmp19084;
__m512 tmp19044 = tmp19066;
__m512 tmp19050 = tmp19086;
__m512 tmp19045 = tmp19068;
__m512 tmp19051 = tmp19088;
__m512 tmp19046 = tmp19070;
__m512 tmp19052 = tmp19090;
__m512 tmp19119 = _mm512_unpacklo_ps(tmp19041, tmp19042);
__m512 tmp19120 = _mm512_unpackhi_ps(tmp19041, tmp19042);
__m512 tmp19121 = _mm512_unpacklo_ps(tmp19043, tmp19044);
__m512 tmp19122 = _mm512_unpackhi_ps(tmp19043, tmp19044);
__m512 tmp19123 = _mm512_unpacklo_ps(tmp19045, tmp19046);
__m512 tmp19124 = _mm512_unpackhi_ps(tmp19045, tmp19046);
__m512 tmp19125 = _mm512_unpacklo_ps(tmp19047, tmp19048);
__m512 tmp19126 = _mm512_unpackhi_ps(tmp19047, tmp19048);
__m512 tmp19127 = _mm512_unpacklo_ps(tmp19049, tmp19050);
__m512 tmp19128 = _mm512_unpackhi_ps(tmp19049, tmp19050);
__m512 tmp19129 = _mm512_unpacklo_ps(tmp19051, tmp19052);
__m512 tmp19130 = _mm512_unpackhi_ps(tmp19051, tmp19052);
__m512 tmp19131 = _mm512_shuffle_ps(tmp19119, tmp19121, 68);
__m512 tmp19132 = _mm512_shuffle_ps(tmp19119, tmp19121, 238);
__m512 tmp19133 = _mm512_shuffle_ps(tmp19120, tmp19122, 68);
__m512 tmp19134 = _mm512_shuffle_ps(tmp19120, tmp19122, 238);
__m512 tmp19135 = _mm512_shuffle_ps(tmp19123, tmp19125, 68);
__m512 tmp19136 = _mm512_shuffle_ps(tmp19123, tmp19125, 238);
__m512 tmp19137 = _mm512_shuffle_ps(tmp19124, tmp19126, 68);
__m512 tmp19138 = _mm512_shuffle_ps(tmp19124, tmp19126, 238);
__m512 tmp19139 = _mm512_shuffle_ps(tmp19127, tmp19129, 68);
__m512 tmp19140 = _mm512_shuffle_ps(tmp19127, tmp19129, 238);
__m512 tmp19141 = _mm512_shuffle_ps(tmp19128, tmp19130, 68);
__m512 tmp19142 = _mm512_shuffle_ps(tmp19128, tmp19130, 238);
__m512 tmp19143 = _mm512_shuffle_f32x4(tmp19131, tmp19135, 136);
__m512 tmp19144 = _mm512_shuffle_f32x4(tmp19131, tmp19135, 221);
__m512 tmp19145 = _mm512_shuffle_f32x4(tmp19132, tmp19136, 136);
__m512 tmp19146 = _mm512_shuffle_f32x4(tmp19132, tmp19136, 221);
__m512 tmp19147 = _mm512_shuffle_f32x4(tmp19133, tmp19137, 136);
__m512 tmp19148 = _mm512_shuffle_f32x4(tmp19133, tmp19137, 221);
__m512 tmp19149 = _mm512_shuffle_f32x4(tmp19134, tmp19138, 136);
__m512 tmp19150 = _mm512_shuffle_f32x4(tmp19134, tmp19138, 221);
__m512 tmp19151 = _mm512_shuffle_f32x4(tmp19139, tmp19139, 136);
__m512 tmp19152 = _mm512_shuffle_f32x4(tmp19139, tmp19139, 221);
__m512 tmp19153 = _mm512_shuffle_f32x4(tmp19140, tmp19140, 136);
__m512 tmp19154 = _mm512_shuffle_f32x4(tmp19140, tmp19140, 221);
__m512 tmp19155 = _mm512_shuffle_f32x4(tmp19141, tmp19141, 136);
__m512 tmp19156 = _mm512_shuffle_f32x4(tmp19141, tmp19141, 221);
__m512 tmp19157 = _mm512_shuffle_f32x4(tmp19142, tmp19142, 136);
__m512 tmp19158 = _mm512_shuffle_f32x4(tmp19142, tmp19142, 221);
tmp19041 = _mm512_shuffle_f32x4(tmp19143, tmp19151, 136);
tmp19049 = _mm512_shuffle_f32x4(tmp19143, tmp19151, 221);
tmp19042 = _mm512_shuffle_f32x4(tmp19145, tmp19153, 136);
tmp19050 = _mm512_shuffle_f32x4(tmp19145, tmp19153, 221);
tmp19043 = _mm512_shuffle_f32x4(tmp19147, tmp19155, 136);
tmp19051 = _mm512_shuffle_f32x4(tmp19147, tmp19155, 221);
tmp19044 = _mm512_shuffle_f32x4(tmp19149, tmp19157, 136);
tmp19052 = _mm512_shuffle_f32x4(tmp19149, tmp19157, 221);
tmp19045 = _mm512_shuffle_f32x4(tmp19144, tmp19152, 136);
__m512 tmp19093 = _mm512_shuffle_f32x4(tmp19144, tmp19152, 221);
tmp19046 = _mm512_shuffle_f32x4(tmp19146, tmp19154, 136);
__m512 tmp19094 = _mm512_shuffle_f32x4(tmp19146, tmp19154, 221);
tmp19047 = _mm512_shuffle_f32x4(tmp19148, tmp19156, 136);
__m512 tmp19095 = _mm512_shuffle_f32x4(tmp19148, tmp19156, 221);
tmp19048 = _mm512_shuffle_f32x4(tmp19150, tmp19158, 136);
__m512 tmp19096 = _mm512_shuffle_f32x4(tmp19150, tmp19158, 221);
(void)tmp19048;
(void)tmp19096;
__m512 tmp19101 = _mm512_add_ps(tmp19042, tmp19043);
__m512 tmp19112 = _mm512_add_ps(tmp19050, tmp19051);
__m512 tmp19100 = _mm512_add_ps(tmp19044, tmp19045);
__m512 tmp19111 = _mm512_add_ps(tmp19052, tmp19093);
__m512 tmp19106 = _mm512_sub_ps(tmp19044, tmp19045);
__m512 tmp19117 = _mm512_sub_ps(tmp19052, tmp19093);
__m512 tmp19105 = _mm512_sub_ps(tmp19042, tmp19043);
__m512 tmp19116 = _mm512_sub_ps(tmp19050, tmp19051);
__m512 tmp19102 = _mm512_add_ps(tmp19046, tmp19047);
__m512 tmp19113 = _mm512_add_ps(tmp19094, tmp19095);
__m512 tmp19107 = _mm512_sub_ps(tmp19046, tmp19047);
__m512 tmp19118 = _mm512_sub_ps(tmp19094, tmp19095);
__m512 tmp19104 = _mm512_fmadd_ps(tmp19106, _mm512_set1_ps(2e+00f), tmp19105);
__m512 tmp19115 = _mm512_fmadd_ps(tmp19117, _mm512_set1_ps(2e+00f), tmp19116);
__m512 tmp19099 = _mm512_add_ps(tmp19100, tmp19101);
__m512 tmp19110 = _mm512_add_ps(tmp19111, tmp19112);
__m512 tmp19103 = _mm512_fmadd_ps(tmp19107, _mm512_set1_ps(1.6e+01f), tmp19104);
__m512 tmp19114 = _mm512_fmadd_ps(tmp19118, _mm512_set1_ps(1.6e+01f), tmp19115);
__m512 tmp19098 = _mm512_add_ps(tmp19099, tmp19041);
__m512 tmp19109 = _mm512_add_ps(tmp19110, tmp19049);
__m512 tmp19097 = _mm512_fmadd_ps(tmp19102, _mm512_set1_ps(3.2e+01f), tmp19098);
__m512 tmp19108 = _mm512_fmadd_ps(tmp19113, _mm512_set1_ps(3.2e+01f), tmp19109);
__m512 out2423 = tmp19097;
__m512 out2425 = tmp19108;
__m512 out2424 = tmp19103;
__m512 out2426 = tmp19114;
out2423 = _mm512_max_ps(_mm512_setzero_ps(), out2423);
out2425 = _mm512_max_ps(_mm512_setzero_ps(), out2425);
out2424 = _mm512_max_ps(_mm512_setzero_ps(), out2424);
out2426 = _mm512_max_ps(_mm512_setzero_ps(), out2426);
_mm512_mask_storeu_ps(datPtr35+856+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 255, out2423);
_mm512_mask_storeu_ps(datPtr35+1664+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 4095, out2425);
_mm512_mask_storeu_ps(datPtr35+912+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 255, out2424);
_mm512_mask_storeu_ps(datPtr35+1720+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 4095, out2426);
__m512 sf1393 = _mm512_loadu_ps(sfPtr15+512+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1394 = _mm512_loadu_ps(sfPtr15+576+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2621 = _mm512_shuffle_f32x4(sf1394, sf1393, 68);
__m512 in2622 = _mm512_shuffle_f32x4(sf1394, sf1393, 238);
__m512 sf1395 = _mm512_loadu_ps(sfPtr15+640+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1396 = _mm512_loadu_ps(sfPtr15+704+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2629 = _mm512_shuffle_f32x4(sf1395, sf1396, 68);
__m512 in2630 = _mm512_shuffle_f32x4(sf1395, sf1396, 238);
__m512 sf1397 = _mm512_loadu_ps(sfPtr15+147968+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1398 = _mm512_loadu_ps(sfPtr15+148032+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2623 = _mm512_shuffle_f32x4(sf1398, sf1397, 68);
__m512 in2624 = _mm512_shuffle_f32x4(sf1398, sf1397, 238);
__m512 sf1399 = _mm512_loadu_ps(sfPtr15+148096+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1400 = _mm512_loadu_ps(sfPtr15+148160+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2631 = _mm512_shuffle_f32x4(sf1399, sf1400, 68);
__m512 in2632 = _mm512_shuffle_f32x4(sf1399, sf1400, 238);
__m512 sf1401 = _mm512_loadu_ps(sfPtr15+295424+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1402 = _mm512_loadu_ps(sfPtr15+295488+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2625 = _mm512_shuffle_f32x4(sf1402, sf1401, 68);
__m512 in2626 = _mm512_shuffle_f32x4(sf1402, sf1401, 238);
__m512 sf1403 = _mm512_loadu_ps(sfPtr15+295552+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1404 = _mm512_loadu_ps(sfPtr15+295616+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2633 = _mm512_shuffle_f32x4(sf1403, sf1404, 68);
__m512 in2634 = _mm512_shuffle_f32x4(sf1403, sf1404, 238);
__m512 sf1405 = _mm512_loadu_ps(sfPtr15+442880+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1406 = _mm512_loadu_ps(sfPtr15+442944+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2627 = _mm512_shuffle_f32x4(sf1406, sf1405, 68);
__m512 in2628 = _mm512_shuffle_f32x4(sf1406, sf1405, 238);
__m512 sf1407 = _mm512_loadu_ps(sfPtr15+443008+589824*i65+98304*j57+768*k170+768*l73);
__m512 sf1408 = _mm512_loadu_ps(sfPtr15+443072+589824*i65+98304*j57+768*k170+768*l73);
__m512 in2635 = _mm512_shuffle_f32x4(sf1407, sf1408, 68);
__m512 in2636 = _mm512_shuffle_f32x4(sf1407, sf1408, 238);
__m512 tmp19175 = _mm512_add_ps(in2622, in2623);
__m512 tmp19195 = _mm512_add_ps(in2630, in2631);
__m512 tmp19174 = _mm512_add_ps(in2624, in2625);
__m512 tmp19194 = _mm512_add_ps(in2632, in2633);
__m512 tmp19180 = _mm512_sub_ps(in2624, in2625);
__m512 tmp19200 = _mm512_sub_ps(in2632, in2633);
__m512 tmp19179 = _mm512_sub_ps(in2622, in2623);
__m512 tmp19199 = _mm512_sub_ps(in2630, in2631);
__m512 tmp19176 = _mm512_add_ps(in2626, in2627);
__m512 tmp19196 = _mm512_add_ps(in2634, in2635);
__m512 tmp19181 = _mm512_sub_ps(in2626, in2627);
__m512 tmp19201 = _mm512_sub_ps(in2634, in2635);
__m512 tmp19178 = _mm512_fmadd_ps(tmp19180, _mm512_set1_ps(2e+00f), tmp19179);
__m512 tmp19198 = _mm512_fmadd_ps(tmp19200, _mm512_set1_ps(2e+00f), tmp19199);
__m512 tmp19185 = _mm512_fmadd_ps(tmp19180, _mm512_set1_ps(8e+00f), tmp19179);
__m512 tmp19205 = _mm512_fmadd_ps(tmp19200, _mm512_set1_ps(8e+00f), tmp19199);
__m512 tmp19173 = _mm512_add_ps(tmp19174, tmp19175);
__m512 tmp19193 = _mm512_add_ps(tmp19194, tmp19195);
__m512 tmp19177 = _mm512_fmadd_ps(tmp19181, _mm512_set1_ps(1.6e+01f), tmp19178);
__m512 tmp19197 = _mm512_fmadd_ps(tmp19201, _mm512_set1_ps(1.6e+01f), tmp19198);
__m512 tmp19184 = _mm512_fmadd_ps(tmp19181, _mm512_set1_ps(4e+00f), tmp19185);
__m512 tmp19204 = _mm512_fmadd_ps(tmp19201, _mm512_set1_ps(4e+00f), tmp19205);
__m512 tmp19190 = _mm512_add_ps(tmp19181, tmp19179);
__m512 tmp19210 = _mm512_add_ps(tmp19201, tmp19199);
__m512 tmp19183 = _mm512_fmadd_ps(tmp19174, _mm512_set1_ps(4e+00f), tmp19175);
__m512 tmp19203 = _mm512_fmadd_ps(tmp19194, _mm512_set1_ps(4e+00f), tmp19195);
__m512 tmp19187 = _mm512_fmadd_ps(tmp19174, _mm512_set1_ps(1.6e+01f), tmp19175);
__m512 tmp19207 = _mm512_fmadd_ps(tmp19194, _mm512_set1_ps(1.6e+01f), tmp19195);
__m512 tmp19172 = _mm512_add_ps(tmp19173, in2621);
__m512 tmp19192 = _mm512_add_ps(tmp19193, in2629);
__m512 tmp19189 = _mm512_add_ps(tmp19190, in2628);
__m512 tmp19209 = _mm512_add_ps(tmp19210, in2636);
__m512 tmp19171 = _mm512_fmadd_ps(tmp19176, _mm512_set1_ps(3.2e+01f), tmp19172);
__m512 tmp19191 = _mm512_fmadd_ps(tmp19196, _mm512_set1_ps(3.2e+01f), tmp19192);
__m512 tmp19182 = _mm512_fmadd_ps(tmp19176, _mm512_set1_ps(8e+00f), tmp19183);
__m512 tmp19202 = _mm512_fmadd_ps(tmp19196, _mm512_set1_ps(8e+00f), tmp19203);
__m512 tmp19188 = _mm512_fmadd_ps(tmp19180, _mm512_set1_ps(3.2e+01f), tmp19189);
__m512 tmp19208 = _mm512_fmadd_ps(tmp19200, _mm512_set1_ps(3.2e+01f), tmp19209);
__m512 tmp19186 = _mm512_fmadd_ps(tmp19176, _mm512_set1_ps(2e+00f), tmp19187);
__m512 tmp19206 = _mm512_fmadd_ps(tmp19196, _mm512_set1_ps(2e+00f), tmp19207);
__m512 tmp19159 = tmp19171;
__m512 tmp19165 = tmp19191;
__m512 tmp19160 = tmp19177;
__m512 tmp19166 = tmp19197;
__m512 tmp19161 = tmp19182;
__m512 tmp19167 = tmp19202;
__m512 tmp19162 = tmp19184;
__m512 tmp19168 = tmp19204;
__m512 tmp19163 = tmp19186;
__m512 tmp19169 = tmp19206;
__m512 tmp19164 = tmp19188;
__m512 tmp19170 = tmp19208;
__m512 tmp19237 = _mm512_unpacklo_ps(tmp19159, tmp19160);
__m512 tmp19238 = _mm512_unpackhi_ps(tmp19159, tmp19160);
__m512 tmp19239 = _mm512_unpacklo_ps(tmp19161, tmp19162);
__m512 tmp19240 = _mm512_unpackhi_ps(tmp19161, tmp19162);
__m512 tmp19241 = _mm512_unpacklo_ps(tmp19163, tmp19164);
__m512 tmp19242 = _mm512_unpackhi_ps(tmp19163, tmp19164);
__m512 tmp19243 = _mm512_unpacklo_ps(tmp19165, tmp19166);
__m512 tmp19244 = _mm512_unpackhi_ps(tmp19165, tmp19166);
__m512 tmp19245 = _mm512_unpacklo_ps(tmp19167, tmp19168);
__m512 tmp19246 = _mm512_unpackhi_ps(tmp19167, tmp19168);
__m512 tmp19247 = _mm512_unpacklo_ps(tmp19169, tmp19170);
__m512 tmp19248 = _mm512_unpackhi_ps(tmp19169, tmp19170);
__m512 tmp19249 = _mm512_shuffle_ps(tmp19237, tmp19239, 68);
__m512 tmp19250 = _mm512_shuffle_ps(tmp19237, tmp19239, 238);
__m512 tmp19251 = _mm512_shuffle_ps(tmp19238, tmp19240, 68);
__m512 tmp19252 = _mm512_shuffle_ps(tmp19238, tmp19240, 238);
__m512 tmp19253 = _mm512_shuffle_ps(tmp19241, tmp19243, 68);
__m512 tmp19254 = _mm512_shuffle_ps(tmp19241, tmp19243, 238);
__m512 tmp19255 = _mm512_shuffle_ps(tmp19242, tmp19244, 68);
__m512 tmp19256 = _mm512_shuffle_ps(tmp19242, tmp19244, 238);
__m512 tmp19257 = _mm512_shuffle_ps(tmp19245, tmp19247, 68);
__m512 tmp19258 = _mm512_shuffle_ps(tmp19245, tmp19247, 238);
__m512 tmp19259 = _mm512_shuffle_ps(tmp19246, tmp19248, 68);
__m512 tmp19260 = _mm512_shuffle_ps(tmp19246, tmp19248, 238);
__m512 tmp19261 = _mm512_shuffle_f32x4(tmp19249, tmp19253, 136);
__m512 tmp19262 = _mm512_shuffle_f32x4(tmp19249, tmp19253, 221);
__m512 tmp19263 = _mm512_shuffle_f32x4(tmp19250, tmp19254, 136);
__m512 tmp19264 = _mm512_shuffle_f32x4(tmp19250, tmp19254, 221);
__m512 tmp19265 = _mm512_shuffle_f32x4(tmp19251, tmp19255, 136);
__m512 tmp19266 = _mm512_shuffle_f32x4(tmp19251, tmp19255, 221);
__m512 tmp19267 = _mm512_shuffle_f32x4(tmp19252, tmp19256, 136);
__m512 tmp19268 = _mm512_shuffle_f32x4(tmp19252, tmp19256, 221);
__m512 tmp19269 = _mm512_shuffle_f32x4(tmp19257, tmp19257, 136);
__m512 tmp19270 = _mm512_shuffle_f32x4(tmp19257, tmp19257, 221);
__m512 tmp19271 = _mm512_shuffle_f32x4(tmp19258, tmp19258, 136);
__m512 tmp19272 = _mm512_shuffle_f32x4(tmp19258, tmp19258, 221);
__m512 tmp19273 = _mm512_shuffle_f32x4(tmp19259, tmp19259, 136);
__m512 tmp19274 = _mm512_shuffle_f32x4(tmp19259, tmp19259, 221);
__m512 tmp19275 = _mm512_shuffle_f32x4(tmp19260, tmp19260, 136);
__m512 tmp19276 = _mm512_shuffle_f32x4(tmp19260, tmp19260, 221);
tmp19159 = _mm512_shuffle_f32x4(tmp19261, tmp19269, 136);
tmp19167 = _mm512_shuffle_f32x4(tmp19261, tmp19269, 221);
tmp19160 = _mm512_shuffle_f32x4(tmp19263, tmp19271, 136);
tmp19168 = _mm512_shuffle_f32x4(tmp19263, tmp19271, 221);
tmp19161 = _mm512_shuffle_f32x4(tmp19265, tmp19273, 136);
tmp19169 = _mm512_shuffle_f32x4(tmp19265, tmp19273, 221);
tmp19162 = _mm512_shuffle_f32x4(tmp19267, tmp19275, 136);
tmp19170 = _mm512_shuffle_f32x4(tmp19267, tmp19275, 221);
tmp19163 = _mm512_shuffle_f32x4(tmp19262, tmp19270, 136);
__m512 tmp19211 = _mm512_shuffle_f32x4(tmp19262, tmp19270, 221);
tmp19164 = _mm512_shuffle_f32x4(tmp19264, tmp19272, 136);
__m512 tmp19212 = _mm512_shuffle_f32x4(tmp19264, tmp19272, 221);
tmp19165 = _mm512_shuffle_f32x4(tmp19266, tmp19274, 136);
__m512 tmp19213 = _mm512_shuffle_f32x4(tmp19266, tmp19274, 221);
tmp19166 = _mm512_shuffle_f32x4(tmp19268, tmp19276, 136);
__m512 tmp19214 = _mm512_shuffle_f32x4(tmp19268, tmp19276, 221);
(void)tmp19166;
(void)tmp19214;
__m512 tmp19219 = _mm512_add_ps(tmp19160, tmp19161);
__m512 tmp19230 = _mm512_add_ps(tmp19168, tmp19169);
__m512 tmp19218 = _mm512_add_ps(tmp19162, tmp19163);
__m512 tmp19229 = _mm512_add_ps(tmp19170, tmp19211);
__m512 tmp19224 = _mm512_sub_ps(tmp19162, tmp19163);
__m512 tmp19235 = _mm512_sub_ps(tmp19170, tmp19211);
__m512 tmp19223 = _mm512_sub_ps(tmp19160, tmp19161);
__m512 tmp19234 = _mm512_sub_ps(tmp19168, tmp19169);
__m512 tmp19220 = _mm512_add_ps(tmp19164, tmp19165);
__m512 tmp19231 = _mm512_add_ps(tmp19212, tmp19213);
__m512 tmp19225 = _mm512_sub_ps(tmp19164, tmp19165);
__m512 tmp19236 = _mm512_sub_ps(tmp19212, tmp19213);
__m512 tmp19222 = _mm512_fmadd_ps(tmp19224, _mm512_set1_ps(2e+00f), tmp19223);
__m512 tmp19233 = _mm512_fmadd_ps(tmp19235, _mm512_set1_ps(2e+00f), tmp19234);
__m512 tmp19217 = _mm512_add_ps(tmp19218, tmp19219);
__m512 tmp19228 = _mm512_add_ps(tmp19229, tmp19230);
__m512 tmp19221 = _mm512_fmadd_ps(tmp19225, _mm512_set1_ps(1.6e+01f), tmp19222);
__m512 tmp19232 = _mm512_fmadd_ps(tmp19236, _mm512_set1_ps(1.6e+01f), tmp19233);
__m512 tmp19216 = _mm512_add_ps(tmp19217, tmp19159);
__m512 tmp19227 = _mm512_add_ps(tmp19228, tmp19167);
__m512 tmp19215 = _mm512_fmadd_ps(tmp19220, _mm512_set1_ps(3.2e+01f), tmp19216);
__m512 tmp19226 = _mm512_fmadd_ps(tmp19231, _mm512_set1_ps(3.2e+01f), tmp19227);
__m512 out2429 = tmp19215;
__m512 out2427 = tmp19226;
__m512 out2430 = tmp19221;
__m512 out2428 = tmp19232;
out2429 = _mm512_max_ps(_mm512_setzero_ps(), out2429);
out2427 = _mm512_max_ps(_mm512_setzero_ps(), out2427);
out2430 = _mm512_max_ps(_mm512_setzero_ps(), out2430);
out2428 = _mm512_max_ps(_mm512_setzero_ps(), out2428);
_mm512_mask_storeu_ps(datPtr35+2496+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 4095, out2429);
_mm512_mask_storeu_ps(datPtr35+1712+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 3, out2427);
_mm512_mask_storeu_ps(datPtr35+2520+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 192, out2427);
_mm512_mask_storeu_ps(datPtr35+2552+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 4095, out2430);
_mm512_mask_storeu_ps(datPtr35+1768+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 3, out2428);
_mm512_mask_storeu_ps(datPtr35+2576+212992*i65+56*toH51+4*toW51+3328*k170+3328*l73, 192, out2428);
}
if (k170 >= kk61) return;
}
if (j57 >= last14) return;
++j57;
}

// Dispatches ResNet50ThreeConsumeSums6Callee1 onto the given threader team.
//
// team69:     threader team handed unchanged to ResNet50ThreaderDo1.
// tensors111: array of tensor base pointers, passed to the callee through
//             the task's `any1` field.
//
// The task descriptor lives on this function's stack and is passed by
// address to ResNet50ThreaderDo1.
// NOTE(review): presumably ResNet50ThreaderDo1 does not return until the
// task has finished, since the descriptor dies when this function returns
// -- confirm against the threader implementation.
static void ResNet50ThreeConsumeSums6(ResNet50ThreaderTeam1* team69, char** tensors111) {
ResNet50ThreaderTask1 job;
// NOTE(review): nd1/hull1 look like the rank and per-dimension extents of
// the index space the callee is invoked over (here 2 x 2 x 1) -- confirm
// against ResNet50ThreaderDo1.
job.nd1 = 3;
job.hull1[0] = 2;
job.hull1[1] = 2;
job.hull1[2] = 1;
// Opaque payload and the entry point that consumes it.
job.any1 = tensors111;
job.callee1 = ResNet50ThreeConsumeSums6Callee1;
ResNet50ThreaderDo1(team69, &job);
}

static void ResNet50ThreeArrangeFilts7Callee1(ResNet50ThreaderTask1* task122, int64_t* pt66) {
char** tensors120 = task122->any1;
ptrdiff_t b72 = pt66[0];
ptrdiff_t g40 = 0;
ptrdiff_t e34 = 0;
char*restrict bfPtr16 = tensors120[3]+2048*e34;
char*restrict wfPtr16 = tensors120[3]+2048+32505856*e34;
char*restrict wtPtr22 = tensors120[0]+17856*e34;
char*restrict biasPtr22 = tensors120[1];
char*restrict bnPtr22 = tensors120[2];
ptrdiff_t i70 = 1*g40;
ptrdiff_t j61 = 1*b72;
ptrdiff_t jj53 = j61+0;
if (j61 < 128) {
for (; j61 != 128; ++j61) {
ptrdiff_t k176 = 0+1*j61;
ptrdiff_t cut31 = 0;
__m512 postMul72 = _mm512_set1_ps(((float*)bnPtr22+(ptrdiff_t)2*(0+512*i70+4*j61))[0]);
__m512 postMul73 = _mm512_set1_ps(((float*)bnPtr22+(ptrdiff_t)2*(1+512*i70+4*j61))[0]);
__m512 postMul74 = _mm512_set1_ps(((float*)bnPtr22+(ptrdiff_t)2*(2+512*i70+4*j61))[0]);
__m512 postMul75 = _mm512_set1_ps(((float*)bnPtr22+(ptrdiff_t)2*(3+512*i70+4*j61))[0]);
ptrdiff_t s59 = 0;
for (; s59 != 512; ++s59) {
__m512 wt783 = _mm512_maskz_loadu_ps(511, wtPtr22+0+9437184*i70+73728*j61+36*s59);
__m512 wt784 = _mm512_maskz_loadu_ps(511, wtPtr22+18432+9437184*i70+73728*j61+36*s59);
__m512 wt785 = _mm512_maskz_loadu_ps(511, wtPtr22+36864+9437184*i70+73728*j61+36*s59);
__m512 wt786 = _mm512_maskz_loadu_ps(511, wtPtr22+55296+9437184*i70+73728*j61+36*s59);
wt783 = _mm512_mul_ps(wt783, postMul72);
wt784 = _mm512_mul_ps(wt784, postMul73);
wt785 = _mm512_mul_ps(wt785, postMul74);
wt786 = _mm512_mul_ps(wt786, postMul75);
__m512i pm258 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm259 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp19517 = _mm512_permutex2var_ps(wt783, pm258, wt785);
__m512 tmp19518 = _mm512_permutex2var_ps(wt784, pm258, wt786);
__m512 tmp19519 = _mm512_permutex2var_ps(wt783, pm259, wt785);
__m512 tmp19520 = _mm512_permutex2var_ps(wt784, pm259, wt786);
__m512 in2637 = _mm512_permutex2var_ps(tmp19517, pm258, tmp19518);
__m512 in2638 = _mm512_permutex2var_ps(tmp19517, pm259, tmp19518);
__m512 in2639 = _mm512_permutex2var_ps(tmp19519, pm258, tmp19520);
__m512 tmp19521 = _mm512_fmadd_ps(in2637, _mm512_set1_ps(4e+00f), in2639);
__m512 tmp19522 = _mm512_add_ps(in2637, in2639);
__m512 tmp19523 = _mm512_fmadd_ps(in2639, _mm512_set1_ps(4e+00f), in2637);
__m512 tmp19524 = _mm512_add_ps(in2638, tmp19522);
__m512 tmp19525 = _mm512_fmadd_ps(in2638, _mm512_set1_ps(2e+00f), tmp19523);
tmp19523 = _mm512_fnmadd_ps(in2638, _mm512_set1_ps(2e+00f), tmp19523);
__m512 tmp19526 = _mm512_fnmadd_ps(in2638, _mm512_set1_ps(2e+00f), tmp19521);
tmp19521 = _mm512_fmadd_ps(in2638, _mm512_set1_ps(2e+00f), tmp19521);
tmp19522 = _mm512_sub_ps(tmp19522, in2638);
__m512 tmp19543 = _mm512_unpacklo_ps(in2637, tmp19524);
__m512 tmp19544 = _mm512_unpackhi_ps(in2637, tmp19524);
__m512 tmp19545 = _mm512_unpacklo_ps(tmp19522, tmp19525);
__m512 tmp19546 = _mm512_unpackhi_ps(tmp19522, tmp19525);
__m512 tmp19547 = _mm512_unpacklo_ps(tmp19523, tmp19521);
__m512 tmp19548 = _mm512_unpackhi_ps(tmp19523, tmp19521);
__m512 tmp19549 = _mm512_unpacklo_ps(tmp19526, in2639);
__m512 tmp19550 = _mm512_unpackhi_ps(tmp19526, in2639);
__m512 tmp19551 = _mm512_shuffle_ps(tmp19543, tmp19545, 68);
__m512 tmp19552 = _mm512_shuffle_ps(tmp19543, tmp19545, 238);
__m512 tmp19553 = _mm512_shuffle_ps(tmp19544, tmp19546, 68);
__m512 tmp19554 = _mm512_shuffle_ps(tmp19544, tmp19546, 238);
__m512 tmp19555 = _mm512_shuffle_ps(tmp19547, tmp19549, 68);
__m512 tmp19556 = _mm512_shuffle_ps(tmp19547, tmp19549, 238);
__m512 tmp19557 = _mm512_shuffle_ps(tmp19548, tmp19550, 68);
__m512 tmp19558 = _mm512_shuffle_ps(tmp19548, tmp19550, 238);
__m512 tmp19559 = _mm512_shuffle_f32x4(tmp19551, tmp19555, 136);
__m512 tmp19560 = _mm512_shuffle_f32x4(tmp19551, tmp19555, 221);
__m512 tmp19561 = _mm512_shuffle_f32x4(tmp19552, tmp19556, 136);
__m512 tmp19562 = _mm512_shuffle_f32x4(tmp19552, tmp19556, 221);
__m512 tmp19563 = _mm512_shuffle_f32x4(tmp19553, tmp19557, 136);
__m512 tmp19564 = _mm512_shuffle_f32x4(tmp19553, tmp19557, 221);
__m512 tmp19565 = _mm512_shuffle_f32x4(tmp19554, tmp19558, 136);
__m512 tmp19566 = _mm512_shuffle_f32x4(tmp19554, tmp19558, 221);
in2637 = _mm512_shuffle_f32x4(tmp19559, tmp19559, 136);
__m512 tmp19527 = _mm512_shuffle_f32x4(tmp19559, tmp19559, 221);
tmp19524 = _mm512_shuffle_f32x4(tmp19561, tmp19561, 136);
__m512 tmp19528 = _mm512_shuffle_f32x4(tmp19561, tmp19561, 221);
tmp19522 = _mm512_shuffle_f32x4(tmp19563, tmp19563, 136);
__m512 tmp19529 = _mm512_shuffle_f32x4(tmp19563, tmp19563, 221);
tmp19525 = _mm512_shuffle_f32x4(tmp19565, tmp19565, 136);
__m512 tmp19530 = _mm512_shuffle_f32x4(tmp19565, tmp19565, 221);
tmp19523 = _mm512_shuffle_f32x4(tmp19560, tmp19560, 136);
tmp19521 = _mm512_shuffle_f32x4(tmp19562, tmp19562, 136);
tmp19526 = _mm512_shuffle_f32x4(tmp19564, tmp19564, 136);
in2639 = _mm512_shuffle_f32x4(tmp19566, tmp19566, 136);
in2637 = _mm512_shuffle_f32x4(in2637, tmp19525, 68);
tmp19524 = _mm512_shuffle_f32x4(tmp19524, tmp19523, 68);
tmp19522 = _mm512_shuffle_f32x4(tmp19522, tmp19521, 68);
tmp19526 = _mm512_shuffle_f32x4(tmp19526, tmp19528, 68);
in2639 = _mm512_shuffle_f32x4(in2639, tmp19529, 68);
tmp19527 = _mm512_shuffle_f32x4(tmp19527, tmp19530, 68);
__m512 tmp19531 = _mm512_fmadd_ps(in2637, _mm512_set1_ps(4e+00f), tmp19522);
__m512 tmp19537 = _mm512_fmadd_ps(tmp19526, _mm512_set1_ps(4e+00f), tmp19527);
__m512 tmp19532 = _mm512_add_ps(in2637, tmp19522);
__m512 tmp19538 = _mm512_add_ps(tmp19526, tmp19527);
__m512 tmp19533 = _mm512_fmadd_ps(tmp19522, _mm512_set1_ps(4e+00f), in2637);
__m512 tmp19539 = _mm512_fmadd_ps(tmp19527, _mm512_set1_ps(4e+00f), tmp19526);
__m512 tmp19534 = _mm512_add_ps(tmp19524, tmp19532);
__m512 tmp19540 = _mm512_add_ps(in2639, tmp19538);
__m512 tmp19535 = _mm512_fmadd_ps(tmp19524, _mm512_set1_ps(2e+00f), tmp19533);
__m512 tmp19541 = _mm512_fmadd_ps(in2639, _mm512_set1_ps(2e+00f), tmp19539);
tmp19533 = _mm512_fnmadd_ps(tmp19524, _mm512_set1_ps(2e+00f), tmp19533);
tmp19539 = _mm512_fnmadd_ps(in2639, _mm512_set1_ps(2e+00f), tmp19539);
__m512 tmp19536 = _mm512_fnmadd_ps(tmp19524, _mm512_set1_ps(2e+00f), tmp19531);
__m512 tmp19542 = _mm512_fnmadd_ps(in2639, _mm512_set1_ps(2e+00f), tmp19537);
tmp19531 = _mm512_fmadd_ps(tmp19524, _mm512_set1_ps(2e+00f), tmp19531);
tmp19537 = _mm512_fmadd_ps(in2639, _mm512_set1_ps(2e+00f), tmp19537);
tmp19532 = _mm512_sub_ps(tmp19532, tmp19524);
tmp19538 = _mm512_sub_ps(tmp19538, in2639);
in2637 = _mm512_mul_ps(in2637, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp19534 = _mm512_mul_ps(tmp19534, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp19532 = _mm512_mul_ps(tmp19532, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp19535 = _mm512_mul_ps(tmp19535, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp19533 = _mm512_mul_ps(tmp19533, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp19531 = _mm512_mul_ps(tmp19531, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp19536 = _mm512_mul_ps(tmp19536, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp19522 = _mm512_mul_ps(tmp19522, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp19526 = _mm512_mul_ps(tmp19526, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp19540 = _mm512_mul_ps(tmp19540, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp19538 = _mm512_mul_ps(tmp19538, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp19541 = _mm512_mul_ps(tmp19541, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp19539 = _mm512_mul_ps(tmp19539, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp19537 = _mm512_mul_ps(tmp19537, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp19542 = _mm512_mul_ps(tmp19542, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp19527 = _mm512_mul_ps(tmp19527, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
__m512 out2431 = _mm512_shuffle_f32x4(in2637, tmp19534, 68);
__m512 out2435 = _mm512_shuffle_f32x4(in2637, tmp19534, 238);
__m512 out2432 = _mm512_shuffle_f32x4(tmp19532, tmp19535, 68);
__m512 out2436 = _mm512_shuffle_f32x4(tmp19532, tmp19535, 238);
__m512 out2433 = _mm512_shuffle_f32x4(tmp19533, tmp19531, 68);
__m512 out2437 = _mm512_shuffle_f32x4(tmp19533, tmp19531, 238);
__m512 out2434 = _mm512_shuffle_f32x4(tmp19536, tmp19522, 68);
__m512 out2438 = _mm512_shuffle_f32x4(tmp19536, tmp19522, 238);
__m512 out2439 = _mm512_shuffle_f32x4(tmp19526, tmp19540, 68);
__m512 out2443 = _mm512_shuffle_f32x4(tmp19526, tmp19540, 238);
__m512 out2440 = _mm512_shuffle_f32x4(tmp19538, tmp19541, 68);
__m512 out2444 = _mm512_shuffle_f32x4(tmp19538, tmp19541, 238);
__m512 out2441 = _mm512_shuffle_f32x4(tmp19539, tmp19537, 68);
__m512 out2445 = _mm512_shuffle_f32x4(tmp19539, tmp19537, 238);
__m512 out2442 = _mm512_shuffle_f32x4(tmp19542, tmp19527, 68);
__m512 out2446 = _mm512_shuffle_f32x4(tmp19542, tmp19527, 238);
ptrdiff_t off25 = 32*cut31;
ptrdiff_t off26 = (size_t)(cut31+1)/4*65536+(size_t)(cut31+1)%4*32;
ptrdiff_t off27 = (size_t)(cut31+2)/4*65536+(size_t)(cut31+2)%4*32;
ptrdiff_t off28 = (size_t)(cut31+3)/4*65536+(size_t)(cut31+3)%4*32;
__m512i wf177 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2431, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf178 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2435, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf179 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2439, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf180 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2443, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf181 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2432, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf182 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2436, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf183 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2440, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf184 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2444, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf185 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2433, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf186 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2437, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf187 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2441, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf188 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2445, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf189 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2434, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf190 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2438, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf191 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2442, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf192 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2446, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
_mm512_mask_storeu_epi32(wfPtr16+0+33554432*i70+65536*k176+off25+128*s59, 255, wf177);
_mm512_mask_storeu_epi32(wfPtr16+0+33554432*i70+65536*k176+off26+128*s59, 255, wf178);
_mm512_mask_storeu_epi32(wfPtr16+0+33554432*i70+65536*k176+off27+128*s59, 255, wf179);
_mm512_mask_storeu_epi32(wfPtr16+0+33554432*i70+65536*k176+off28+128*s59, 255, wf180);
_mm512_mask_storeu_epi32(wfPtr16+8388608+33554432*i70+65536*k176+off25+128*s59, 255, wf181);
_mm512_mask_storeu_epi32(wfPtr16+8388608+33554432*i70+65536*k176+off26+128*s59, 255, wf182);
_mm512_mask_storeu_epi32(wfPtr16+8388608+33554432*i70+65536*k176+off27+128*s59, 255, wf183);
_mm512_mask_storeu_epi32(wfPtr16+8388608+33554432*i70+65536*k176+off28+128*s59, 255, wf184);
_mm512_mask_storeu_epi32(wfPtr16+16777216+33554432*i70+65536*k176+off25+128*s59, 255, wf185);
_mm512_mask_storeu_epi32(wfPtr16+16777216+33554432*i70+65536*k176+off26+128*s59, 255, wf186);
_mm512_mask_storeu_epi32(wfPtr16+16777216+33554432*i70+65536*k176+off27+128*s59, 255, wf187);
_mm512_mask_storeu_epi32(wfPtr16+16777216+33554432*i70+65536*k176+off28+128*s59, 255, wf188);
_mm512_mask_storeu_epi32(wfPtr16+25165824+33554432*i70+65536*k176+off25+128*s59, 255, wf189);
_mm512_mask_storeu_epi32(wfPtr16+25165824+33554432*i70+65536*k176+off26+128*s59, 255, wf190);
_mm512_mask_storeu_epi32(wfPtr16+25165824+33554432*i70+65536*k176+off27+128*s59, 255, wf191);
_mm512_mask_storeu_epi32(wfPtr16+25165824+33554432*i70+65536*k176+off28+128*s59, 255, wf192);
}
__m512 bias8 = _mm512_setzero_ps();
if (!e34) {
bias8 = _mm512_maskz_loadu_ps(15, biasPtr22-0+2048*i70+16*j61);
__m512i pmMul46 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd46 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas14 = _mm512_maskz_loadu_ps(255, bnPtr22+(ptrdiff_t)8*(0+512*i70+4*j61));
__m512 postMul76 = _mm512_permutexvar_ps(pmMul46, mas14);
__m512 postAdd46 = _mm512_permutexvar_ps(pmAdd46, mas14);
bias8 = _mm512_fmadd_ps(bias8, postMul76, postAdd46);
}
_mm512_mask_storeu_ps(bfPtr16-0+2048*i70+16*j61, 15, bias8);
if (j61 >= jj53) return;
}
}
}

// Fills in a task descriptor (callee, payload pointer, 3 dimensions with
// extents 128 x 1 x 1) and passes it to ResNet50ThreaderDo1 together with
// the given team.
//   team73     - forwarded unchanged to ResNet50ThreaderDo1.
//   tensors119 - stored in the descriptor's any1 field; the callee reads it
//                back as its tensor pointer array.
static void ResNet50ThreeArrangeFilts7(ResNet50ThreaderTeam1* team73, char** tensors119) {
    ResNet50ThreaderTask1 job;

    // Payload and entry point.
    job.any1 = tensors119;
    job.callee1 = ResNet50ThreeArrangeFilts7Callee1;

    // Index space: only the first dimension has more than one point.
    job.nd1 = 3;
    job.hull1[2] = 1;
    job.hull1[1] = 1;
    job.hull1[0] = 128;

    ResNet50ThreaderDo1(team73, &job);
}

// Task callee: for each of 128 k-iterations, reads seven 7-float rows from
// tensors[0], clamps negatives to zero, applies a fixed linear transform
// twice (once before and once after a lane transpose), and writes sixteen
// 64-byte vectors into tensors[1].
//   task124->any1 - array of tensor base pointers (index 0 is read, index 1
//                   is written).
//   pt67[0]       - this task's coordinate; selects a 40960-byte input slice
//                   and a 32768-byte output slice.
// NOTE(review): the constants (5.25, -4.25, 0.25, -1.25, 4, -5, 2) look like
// a Winograd-style data transform - confirm against the generator.
static void ResNet50ThreeArrangeDats7Callee1(ResNet50ThreaderTask1* task124, int64_t* pt67) {
char** tensors122 = task124->any1;
ptrdiff_t s60 = pt67[0];
// The remaining indices are fixed at zero in this instantiation.
ptrdiff_t c58 = 0;
ptrdiff_t g41 = 0;
ptrdiff_t e35 = 0;
// The input pointer is biased by -32 bytes, so the load at offset +32 below
// reads from the very start of tensors122[0].
char*restrict datPtr38 = tensors122[0]-32+158720*e35;
char*restrict dfPtr16 = tensors122[1]+507904*e35;
ptrdiff_t i71 = 1*g41;
ptrdiff_t j62 = 1*c58;
// rel27 is not read anywhere below.
ptrdiff_t rel27 = j62-0;
ptrdiff_t base27 = 0;
ptrdiff_t h58 = base27+0;
ptrdiff_t w76 = 0;
ptrdiff_t k177 = 0;
// Each k-iteration advances 320 bytes in the input and 256 bytes in the output.
for (; k177 != 128; ++k177) {
// Mask 127 loads the low 7 lanes and zeroes lanes 7..15. Rows sit 28 bytes
// (7 floats) apart at offsets 32, 60, ..., 200. Every row is clamped with
// max(0, x) immediately after loading.
__m512 dat2493 = _mm512_maskz_loadu_ps(127, datPtr38+172+163840*i71+28*h58+4*w76+40960*s60+320*k177);
dat2493 = _mm512_max_ps(_mm512_setzero_ps(), dat2493);
// Permutation: lane 0 <- source lane 15 (zero after the masked load),
// lanes 1..7 <- source lanes 0..6, lanes 8..9 <- source lanes 5..6,
// lanes 10..15 <- source lane 15 (zero).
__m512i pm260 = _mm512_set_epi32(15, 15, 15, 15, 15, 15, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
// in2647 is a second copy of the row at offset 172 (same address as dat2500).
__m512 in2647 = _mm512_permutexvar_ps(pm260, dat2493);
__m512 dat2494 = _mm512_maskz_loadu_ps(127, datPtr38+32+163840*i71+28*h58+4*w76+40960*s60+320*k177);
dat2494 = _mm512_max_ps(_mm512_setzero_ps(), dat2494);
__m512 dat2495 = _mm512_maskz_loadu_ps(127, datPtr38+200+163840*i71+28*h58+4*w76+40960*s60+320*k177);
dat2495 = _mm512_max_ps(_mm512_setzero_ps(), dat2495);
__m512 in2640 = _mm512_permutexvar_ps(pm260, dat2494);
// in2648 is a second copy of the row at offset 200 (same address as dat2501).
__m512 in2648 = _mm512_permutexvar_ps(pm260, dat2495);
__m512 dat2496 = _mm512_maskz_loadu_ps(127, datPtr38+60+163840*i71+28*h58+4*w76+40960*s60+320*k177);
dat2496 = _mm512_max_ps(_mm512_setzero_ps(), dat2496);
__m512 in2641 = _mm512_permutexvar_ps(pm260, dat2496);
__m512 dat2497 = _mm512_maskz_loadu_ps(127, datPtr38+88+163840*i71+28*h58+4*w76+40960*s60+320*k177);
dat2497 = _mm512_max_ps(_mm512_setzero_ps(), dat2497);
__m512 in2642 = _mm512_permutexvar_ps(pm260, dat2497);
__m512 dat2498 = _mm512_maskz_loadu_ps(127, datPtr38+116+163840*i71+28*h58+4*w76+40960*s60+320*k177);
dat2498 = _mm512_max_ps(_mm512_setzero_ps(), dat2498);
__m512 in2643 = _mm512_permutexvar_ps(pm260, dat2498);
__m512 dat2499 = _mm512_maskz_loadu_ps(127, datPtr38+144+163840*i71+28*h58+4*w76+40960*s60+320*k177);
dat2499 = _mm512_max_ps(_mm512_setzero_ps(), dat2499);
__m512 in2644 = _mm512_permutexvar_ps(pm260, dat2499);
__m512 dat2500 = _mm512_maskz_loadu_ps(127, datPtr38+172+163840*i71+28*h58+4*w76+40960*s60+320*k177);
dat2500 = _mm512_max_ps(_mm512_setzero_ps(), dat2500);
__m512 in2645 = _mm512_permutexvar_ps(pm260, dat2500);
__m512 dat2501 = _mm512_maskz_loadu_ps(127, datPtr38+200+163840*i71+28*h58+4*w76+40960*s60+320*k177);
dat2501 = _mm512_max_ps(_mm512_setzero_ps(), dat2501);
__m512 in2646 = _mm512_permutexvar_ps(pm260, dat2501);
// First transform pass: linear combinations across the row vectors.
// in2640..in2646 feed one set of outputs; in2647/in2648 feed a second set.
// Variables are reused as scratch, so statement order matters.
// Self-assignments of the form "x = x;" are no-ops.
__m512 tmp19567 = _mm512_add_ps(in2640, in2644);
__m512 tmp19572 = in2648;
__m512 tmp19568 = _mm512_sub_ps(in2643, in2641);
__m512 tmp19569 = _mm512_add_ps(in2641, in2645);
__m512 tmp19570 = _mm512_sub_ps(_mm512_setzero_ps(), in2645);
in2647 = in2647;
tmp19567 = _mm512_fmadd_ps(in2642, _mm512_set1_ps(-4.25e+00f), tmp19567);
tmp19572 = tmp19572;
tmp19569 = _mm512_fmadd_ps(in2643, _mm512_set1_ps(-4.25e+00f), tmp19569);
tmp19570 = _mm512_fmadd_ps(tmp19568, _mm512_set1_ps(5.25e+00f), tmp19570);
in2647 = in2647;
tmp19568 = _mm512_fmadd_ps(in2641, _mm512_set1_ps(2.5e-01f), in2645);
in2641 = _mm512_fmadd_ps(in2641, _mm512_set1_ps(4e+00f), in2645);
__m512 tmp19571 = _mm512_sub_ps(tmp19569, tmp19567);
__m512 tmp19573 = _mm512_sub_ps(_mm512_setzero_ps(), tmp19572);
tmp19569 = _mm512_add_ps(tmp19567, tmp19569);
__m512 tmp19574 = tmp19572;
tmp19567 = _mm512_fmadd_ps(in2640, _mm512_set1_ps(2.5e-01f), in2644);
tmp19572 = _mm512_mul_ps(in2648, _mm512_set1_ps(2.5e-01f));
tmp19568 = _mm512_fmadd_ps(in2643, _mm512_set1_ps(-1.25e+00f), tmp19568);
in2643 = _mm512_fmadd_ps(in2643, _mm512_set1_ps(-5e+00f), in2641);
tmp19567 = _mm512_fmadd_ps(in2642, _mm512_set1_ps(-1.25e+00f), tmp19567);
tmp19572 = tmp19572;
in2645 = _mm512_fmadd_ps(tmp19567, _mm512_set1_ps(2e+00f), tmp19568);
__m512 tmp19575 = _mm512_mul_ps(tmp19572, _mm512_set1_ps(2e+00f));
tmp19568 = _mm512_fnmadd_ps(tmp19567, _mm512_set1_ps(2e+00f), tmp19568);
__m512 tmp19576 = _mm512_fnmadd_ps(tmp19572, _mm512_set1_ps(2e+00f), _mm512_setzero_ps());
tmp19567 = _mm512_fmadd_ps(in2644, _mm512_set1_ps(2.5e-01f), in2640);
tmp19572 = in2648;
in2640 = _mm512_sub_ps(in2646, in2640);
in2648 = _mm512_sub_ps(_mm512_setzero_ps(), in2648);
tmp19567 = _mm512_fmadd_ps(in2642, _mm512_set1_ps(-1.25e+00f), tmp19567);
tmp19572 = tmp19572;
in2642 = _mm512_sub_ps(in2642, in2644);
in2642 = _mm512_fmadd_ps(in2642, _mm512_set1_ps(5.25e+00f), in2640);
__m512 tmp19577 = in2648;
in2641 = _mm512_fmadd_ps(tmp19567, _mm512_set1_ps(2e+00f), in2643);
__m512 tmp19578 = _mm512_mul_ps(tmp19572, _mm512_set1_ps(2e+00f));
in2643 = _mm512_fnmadd_ps(tmp19567, _mm512_set1_ps(2e+00f), in2643);
__m512 tmp19579 = _mm512_fnmadd_ps(tmp19572, _mm512_set1_ps(2e+00f), _mm512_setzero_ps());
// Lane transpose network over the 16 first-pass results:
// stage 1 interleaves 32-bit elements of vector pairs (unpacklo/unpackhi).
__m512 tmp19593 = _mm512_unpacklo_ps(tmp19570, tmp19569);
__m512 tmp19594 = _mm512_unpackhi_ps(tmp19570, tmp19569);
__m512 tmp19595 = _mm512_unpacklo_ps(tmp19571, in2645);
__m512 tmp19596 = _mm512_unpackhi_ps(tmp19571, in2645);
__m512 tmp19597 = _mm512_unpacklo_ps(tmp19568, in2641);
__m512 tmp19598 = _mm512_unpackhi_ps(tmp19568, in2641);
__m512 tmp19599 = _mm512_unpacklo_ps(in2643, in2642);
__m512 tmp19600 = _mm512_unpackhi_ps(in2643, in2642);
__m512 tmp19601 = _mm512_unpacklo_ps(in2647, tmp19574);
__m512 tmp19602 = _mm512_unpackhi_ps(in2647, tmp19574);
__m512 tmp19603 = _mm512_unpacklo_ps(tmp19573, tmp19575);
__m512 tmp19604 = _mm512_unpackhi_ps(tmp19573, tmp19575);
__m512 tmp19605 = _mm512_unpacklo_ps(tmp19576, tmp19578);
__m512 tmp19606 = _mm512_unpackhi_ps(tmp19576, tmp19578);
__m512 tmp19607 = _mm512_unpacklo_ps(tmp19579, tmp19577);
__m512 tmp19608 = _mm512_unpackhi_ps(tmp19579, tmp19577);
// Stage 2 combines 64-bit pairs within each 128-bit lane
// (68 = low pairs of both operands, 238 = high pairs).
__m512 tmp19609 = _mm512_shuffle_ps(tmp19593, tmp19595, 68);
__m512 tmp19610 = _mm512_shuffle_ps(tmp19593, tmp19595, 238);
__m512 tmp19611 = _mm512_shuffle_ps(tmp19594, tmp19596, 68);
__m512 tmp19612 = _mm512_shuffle_ps(tmp19594, tmp19596, 238);
__m512 tmp19613 = _mm512_shuffle_ps(tmp19597, tmp19599, 68);
__m512 tmp19614 = _mm512_shuffle_ps(tmp19597, tmp19599, 238);
__m512 tmp19615 = _mm512_shuffle_ps(tmp19598, tmp19600, 68);
__m512 tmp19616 = _mm512_shuffle_ps(tmp19598, tmp19600, 238);
__m512 tmp19617 = _mm512_shuffle_ps(tmp19601, tmp19603, 68);
__m512 tmp19618 = _mm512_shuffle_ps(tmp19601, tmp19603, 238);
__m512 tmp19619 = _mm512_shuffle_ps(tmp19602, tmp19604, 68);
__m512 tmp19620 = _mm512_shuffle_ps(tmp19602, tmp19604, 238);
__m512 tmp19621 = _mm512_shuffle_ps(tmp19605, tmp19607, 68);
__m512 tmp19622 = _mm512_shuffle_ps(tmp19605, tmp19607, 238);
__m512 tmp19623 = _mm512_shuffle_ps(tmp19606, tmp19608, 68);
__m512 tmp19624 = _mm512_shuffle_ps(tmp19606, tmp19608, 238);
// Stage 3 regroups whole 128-bit lanes
// (136 = even lanes of both operands, 221 = odd lanes).
__m512 tmp19625 = _mm512_shuffle_f32x4(tmp19609, tmp19613, 136);
__m512 tmp19626 = _mm512_shuffle_f32x4(tmp19609, tmp19613, 221);
__m512 tmp19627 = _mm512_shuffle_f32x4(tmp19610, tmp19614, 136);
__m512 tmp19628 = _mm512_shuffle_f32x4(tmp19610, tmp19614, 221);
__m512 tmp19629 = _mm512_shuffle_f32x4(tmp19611, tmp19615, 136);
__m512 tmp19630 = _mm512_shuffle_f32x4(tmp19611, tmp19615, 221);
__m512 tmp19631 = _mm512_shuffle_f32x4(tmp19612, tmp19616, 136);
__m512 tmp19632 = _mm512_shuffle_f32x4(tmp19612, tmp19616, 221);
__m512 tmp19633 = _mm512_shuffle_f32x4(tmp19617, tmp19621, 136);
__m512 tmp19634 = _mm512_shuffle_f32x4(tmp19617, tmp19621, 221);
__m512 tmp19635 = _mm512_shuffle_f32x4(tmp19618, tmp19622, 136);
__m512 tmp19636 = _mm512_shuffle_f32x4(tmp19618, tmp19622, 221);
__m512 tmp19637 = _mm512_shuffle_f32x4(tmp19619, tmp19623, 136);
__m512 tmp19638 = _mm512_shuffle_f32x4(tmp19619, tmp19623, 221);
__m512 tmp19639 = _mm512_shuffle_f32x4(tmp19620, tmp19624, 136);
__m512 tmp19640 = _mm512_shuffle_f32x4(tmp19620, tmp19624, 221);
// Stage 4: only ten of the sixteen possible transposed vectors are formed.
tmp19570 = _mm512_shuffle_f32x4(tmp19625, tmp19633, 136);
in2647 = _mm512_shuffle_f32x4(tmp19625, tmp19633, 221);
tmp19569 = _mm512_shuffle_f32x4(tmp19627, tmp19635, 136);
tmp19574 = _mm512_shuffle_f32x4(tmp19627, tmp19635, 221);
tmp19571 = _mm512_shuffle_f32x4(tmp19629, tmp19637, 136);
in2645 = _mm512_shuffle_f32x4(tmp19631, tmp19639, 136);
tmp19568 = _mm512_shuffle_f32x4(tmp19626, tmp19634, 136);
in2641 = _mm512_shuffle_f32x4(tmp19628, tmp19636, 136);
in2643 = _mm512_shuffle_f32x4(tmp19630, tmp19638, 136);
in2642 = _mm512_shuffle_f32x4(tmp19632, tmp19640, 136);
// tmp19570 is not consumed by the second pass; the cast marks it as
// intentionally unused.
(void)tmp19570;
// Second transform pass: the same arithmetic pattern and constants as the
// first pass, applied to the transposed vectors.
__m512 tmp19580 = _mm512_add_ps(tmp19569, in2641);
__m512 tmp19585 = tmp19574;
__m512 tmp19581 = _mm512_sub_ps(tmp19568, tmp19571);
__m512 tmp19582 = _mm512_add_ps(tmp19571, in2643);
__m512 tmp19583 = _mm512_sub_ps(_mm512_setzero_ps(), in2643);
in2647 = in2647;
tmp19580 = _mm512_fmadd_ps(in2645, _mm512_set1_ps(-4.25e+00f), tmp19580);
tmp19585 = tmp19585;
tmp19582 = _mm512_fmadd_ps(tmp19568, _mm512_set1_ps(-4.25e+00f), tmp19582);
tmp19583 = _mm512_fmadd_ps(tmp19581, _mm512_set1_ps(5.25e+00f), tmp19583);
in2647 = in2647;
tmp19581 = _mm512_fmadd_ps(tmp19571, _mm512_set1_ps(2.5e-01f), in2643);
tmp19571 = _mm512_fmadd_ps(tmp19571, _mm512_set1_ps(4e+00f), in2643);
__m512 tmp19584 = _mm512_sub_ps(tmp19582, tmp19580);
__m512 tmp19586 = _mm512_sub_ps(_mm512_setzero_ps(), tmp19585);
tmp19582 = _mm512_add_ps(tmp19580, tmp19582);
__m512 tmp19587 = tmp19585;
tmp19580 = _mm512_fmadd_ps(tmp19569, _mm512_set1_ps(2.5e-01f), in2641);
tmp19585 = _mm512_mul_ps(tmp19574, _mm512_set1_ps(2.5e-01f));
tmp19581 = _mm512_fmadd_ps(tmp19568, _mm512_set1_ps(-1.25e+00f), tmp19581);
tmp19568 = _mm512_fmadd_ps(tmp19568, _mm512_set1_ps(-5e+00f), tmp19571);
tmp19580 = _mm512_fmadd_ps(in2645, _mm512_set1_ps(-1.25e+00f), tmp19580);
tmp19585 = tmp19585;
in2643 = _mm512_fmadd_ps(tmp19580, _mm512_set1_ps(2e+00f), tmp19581);
__m512 tmp19588 = _mm512_mul_ps(tmp19585, _mm512_set1_ps(2e+00f));
tmp19581 = _mm512_fnmadd_ps(tmp19580, _mm512_set1_ps(2e+00f), tmp19581);
__m512 tmp19589 = _mm512_fnmadd_ps(tmp19585, _mm512_set1_ps(2e+00f), _mm512_setzero_ps());
tmp19580 = _mm512_fmadd_ps(in2641, _mm512_set1_ps(2.5e-01f), tmp19569);
tmp19585 = tmp19574;
tmp19569 = _mm512_sub_ps(in2642, tmp19569);
tmp19574 = _mm512_sub_ps(_mm512_setzero_ps(), tmp19574);
tmp19580 = _mm512_fmadd_ps(in2645, _mm512_set1_ps(-1.25e+00f), tmp19580);
tmp19585 = tmp19585;
in2645 = _mm512_sub_ps(in2645, in2641);
in2645 = _mm512_fmadd_ps(in2645, _mm512_set1_ps(5.25e+00f), tmp19569);
__m512 tmp19590 = tmp19574;
tmp19571 = _mm512_fmadd_ps(tmp19580, _mm512_set1_ps(2e+00f), tmp19568);
__m512 tmp19591 = _mm512_mul_ps(tmp19585, _mm512_set1_ps(2e+00f));
tmp19568 = _mm512_fnmadd_ps(tmp19580, _mm512_set1_ps(2e+00f), tmp19568);
__m512 tmp19592 = _mm512_fnmadd_ps(tmp19585, _mm512_set1_ps(2e+00f), _mm512_setzero_ps());
// Pair up results: immediate 68 concatenates the low 256-bit halves of the
// two operands, 238 concatenates their high 256-bit halves.
__m512 out2447 = _mm512_shuffle_f32x4(tmp19583, tmp19582, 68);
__m512 out2455 = _mm512_shuffle_f32x4(tmp19583, tmp19582, 238);
__m512 out2448 = _mm512_shuffle_f32x4(tmp19584, in2643, 68);
__m512 out2456 = _mm512_shuffle_f32x4(tmp19584, in2643, 238);
__m512 out2449 = _mm512_shuffle_f32x4(tmp19581, tmp19571, 68);
__m512 out2457 = _mm512_shuffle_f32x4(tmp19581, tmp19571, 238);
__m512 out2450 = _mm512_shuffle_f32x4(tmp19568, in2645, 68);
__m512 out2458 = _mm512_shuffle_f32x4(tmp19568, in2645, 238);
__m512 out2451 = _mm512_shuffle_f32x4(in2647, tmp19587, 68);
__m512 out2459 = _mm512_shuffle_f32x4(in2647, tmp19587, 238);
__m512 out2452 = _mm512_shuffle_f32x4(tmp19586, tmp19588, 68);
__m512 out2460 = _mm512_shuffle_f32x4(tmp19586, tmp19588, 238);
__m512 out2453 = _mm512_shuffle_f32x4(tmp19589, tmp19591, 68);
__m512 out2461 = _mm512_shuffle_f32x4(tmp19589, tmp19591, 238);
__m512 out2454 = _mm512_shuffle_f32x4(tmp19592, tmp19590, 68);
__m512 out2462 = _mm512_shuffle_f32x4(tmp19592, tmp19590, 238);
// Sixteen unaligned 64-byte stores: four groups 131072 bytes apart, each
// holding four vectors at +0, +64, +128 and +192.
_mm512_storeu_ps(dfPtr16+0+524288*i71+196608*j62+32768*s60+256*k177, out2447);
_mm512_storeu_ps(dfPtr16+128+524288*i71+196608*j62+32768*s60+256*k177, out2455);
_mm512_storeu_ps(dfPtr16+64+524288*i71+196608*j62+32768*s60+256*k177, out2451);
_mm512_storeu_ps(dfPtr16+192+524288*i71+196608*j62+32768*s60+256*k177, out2459);
_mm512_storeu_ps(dfPtr16+131072+524288*i71+196608*j62+32768*s60+256*k177, out2448);
_mm512_storeu_ps(dfPtr16+131200+524288*i71+196608*j62+32768*s60+256*k177, out2456);
_mm512_storeu_ps(dfPtr16+131136+524288*i71+196608*j62+32768*s60+256*k177, out2452);
_mm512_storeu_ps(dfPtr16+131264+524288*i71+196608*j62+32768*s60+256*k177, out2460);
_mm512_storeu_ps(dfPtr16+262144+524288*i71+196608*j62+32768*s60+256*k177, out2449);
_mm512_storeu_ps(dfPtr16+262272+524288*i71+196608*j62+32768*s60+256*k177, out2457);
_mm512_storeu_ps(dfPtr16+262208+524288*i71+196608*j62+32768*s60+256*k177, out2453);
_mm512_storeu_ps(dfPtr16+262336+524288*i71+196608*j62+32768*s60+256*k177, out2461);
_mm512_storeu_ps(dfPtr16+393216+524288*i71+196608*j62+32768*s60+256*k177, out2450);
_mm512_storeu_ps(dfPtr16+393344+524288*i71+196608*j62+32768*s60+256*k177, out2458);
_mm512_storeu_ps(dfPtr16+393280+524288*i71+196608*j62+32768*s60+256*k177, out2454);
_mm512_storeu_ps(dfPtr16+393408+524288*i71+196608*j62+32768*s60+256*k177, out2462);
}
// j62 is not read after this increment.
++j62;
}

// Fills in a task descriptor (callee, payload pointer, 4 dimensions with
// extents 4 x 1 x 1 x 1) and passes it to ResNet50ThreaderDo1 together with
// the given team.
//   team74     - forwarded unchanged to ResNet50ThreaderDo1.
//   tensors121 - stored in the descriptor's any1 field; the callee reads it
//                back as its tensor pointer array.
static void ResNet50ThreeArrangeDats7(ResNet50ThreaderTeam1* team74, char** tensors121) {
    ResNet50ThreaderTask1 job;

    // Payload and entry point.
    job.any1 = tensors121;
    job.callee1 = ResNet50ThreeArrangeDats7Callee1;

    // Index space: only the first dimension has more than one point.
    job.nd1 = 4;
    job.hull1[3] = 1;
    job.hull1[2] = 1;
    job.hull1[1] = 1;
    job.hull1[0] = 4;

    ResNet50ThreaderDo1(team74, &job);
}

// Task callee: for one (l77, j63) coordinate, accumulates 16 vector sums over
// 512 inner steps, multiplying half-precision filter values (widened to
// float) by float data vectors, and stores the 16 sums.
//   task126->any1 - pointer to a pair; element 0 is the tensor pointer array
//                   (element 1 is not read here).
//   pt68[0]       - starting value of l77.
//   pt68[2]       - value of j63; a j63 of zero additionally seeds the sums
//                   from the bias area.
static void ResNet50ThreeProduceSums7Callee1(ResNet50ThreaderTask1* task126, int64_t* pt68) {
void** pair34 = task126->any1;
char** tensors124 = pair34[0];
// Indices that are fixed at zero in this instantiation.
ptrdiff_t e36 = 0;
ptrdiff_t g42 = 0;
ptrdiff_t f50 = pt68[2];
ptrdiff_t d25 = 0;
ptrdiff_t w77 = pt68[0];
// tensors124[0] holds a 2048-byte region read as floats (bfPtr17) followed by
// the half-precision filter data (wfPtr17).
char*restrict bfPtr17 = tensors124[0]+2048*e36;
char*restrict wfPtr17 = tensors124[0]+2048+32505856*e36;
char*restrict dfPtr17 = tensors124[1]+507904*e36;
char*restrict sfPtr16 = tensors124[2];
ptrdiff_t i72 = 1*g42;
ptrdiff_t j63 = 1*f50;
ptrdiff_t k178 = 1*d25;
ptrdiff_t l77 = 1*w77;
// ll13 equals the starting l77, so the check at the bottom of the loop
// returns after the first iteration.
ptrdiff_t ll13 = l77+0;
for (; l77 != 128; ++l77) {
__m512 sum668;
__m512 sum672;
__m512 sum676;
__m512 sum680;
// For j63 == 0 (hinted as the unlikely case) each of the four base
// accumulators starts with one float from bfPtr17 placed in lane 9 only
// (mask 512 == 1<<9) and zeros elsewhere; otherwise they start at zero.
if (__builtin_expect(!j63, 0)) {
sum668 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr17+0+2048*i72+16*l77)));
sum672 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr17+4+2048*i72+16*l77)));
sum676 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr17+8+2048*i72+16*l77)));
sum680 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr17+12+2048*i72+16*l77)));
} else {
sum668 = _mm512_setzero_ps();
sum672 = _mm512_setzero_ps();
sum676 = _mm512_setzero_ps();
sum680 = _mm512_setzero_ps();
}
// Each base accumulator is replicated three more times, giving a 4x4 grid:
// four filter vectors (wf193..wf196) by four data vectors (df693..df696).
__m512 sum669 = sum668;
__m512 sum670 = sum668;
__m512 sum671 = sum668;
__m512 sum673 = sum672;
__m512 sum674 = sum672;
__m512 sum675 = sum672;
__m512 sum677 = sum676;
__m512 sum678 = sum676;
__m512 sum679 = sum676;
__m512 sum681 = sum680;
__m512 sum682 = sum680;
__m512 sum683 = sum680;
ptrdiff_t b73 = 0;
// Reduction over 512 steps; each step reads 128 bytes of filter data and
// 256 bytes of data.
for (; b73 != 512; ++b73) {
// A full 64-byte load (mask 65535 = all 16 dword lanes) holds 32 half-floats;
// the low and high 256-bit halves are each widened to 16 floats.
__m512i wfs41 = _mm512_maskz_loadu_epi32(65535, wfPtr17+0+33554432*i72+8388608*j63+65536*l77+128*b73);
__m512 wf193 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs41));
__m512 df693 = _mm512_loadu_ps(dfPtr17+0+524288*i72+131072*j63+196608*k178+256*b73);
sum668 = _mm512_fmadd_ps(wf193, df693, sum668);
__m512 df694 = _mm512_loadu_ps(dfPtr17+64+524288*i72+131072*j63+196608*k178+256*b73);
sum669 = _mm512_fmadd_ps(wf193, df694, sum669);
__m512 df695 = _mm512_loadu_ps(dfPtr17+128+524288*i72+131072*j63+196608*k178+256*b73);
sum670 = _mm512_fmadd_ps(wf193, df695, sum670);
__m512 df696 = _mm512_loadu_ps(dfPtr17+192+524288*i72+131072*j63+196608*k178+256*b73);
sum671 = _mm512_fmadd_ps(wf193, df696, sum671);
__m512 wf194 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs41, 1));
sum672 = _mm512_fmadd_ps(wf194, df693, sum672);
sum673 = _mm512_fmadd_ps(wf194, df694, sum673);
sum674 = _mm512_fmadd_ps(wf194, df695, sum674);
sum675 = _mm512_fmadd_ps(wf194, df696, sum675);
// Second 64-byte filter load of this step, handled the same way.
__m512i wfs42 = _mm512_maskz_loadu_epi32(65535, wfPtr17+64+33554432*i72+8388608*j63+65536*l77+128*b73);
__m512 wf195 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs42));
sum676 = _mm512_fmadd_ps(wf195, df693, sum676);
sum677 = _mm512_fmadd_ps(wf195, df694, sum677);
sum678 = _mm512_fmadd_ps(wf195, df695, sum678);
sum679 = _mm512_fmadd_ps(wf195, df696, sum679);
__m512 wf196 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs42, 1));
sum680 = _mm512_fmadd_ps(wf196, df693, sum680);
sum681 = _mm512_fmadd_ps(wf196, df694, sum681);
sum682 = _mm512_fmadd_ps(wf196, df695, sum682);
sum683 = _mm512_fmadd_ps(wf196, df696, sum683);
}
// Write the 16 accumulators as one contiguous 1024-byte record for this l77.
_mm512_storeu_ps(sfPtr16+0+524288*i72+131072*j63+196608*k178+1024*l77, sum668);
_mm512_storeu_ps(sfPtr16+64+524288*i72+131072*j63+196608*k178+1024*l77, sum669);
_mm512_storeu_ps(sfPtr16+128+524288*i72+131072*j63+196608*k178+1024*l77, sum670);
_mm512_storeu_ps(sfPtr16+192+524288*i72+131072*j63+196608*k178+1024*l77, sum671);
_mm512_storeu_ps(sfPtr16+256+524288*i72+131072*j63+196608*k178+1024*l77, sum672);
_mm512_storeu_ps(sfPtr16+320+524288*i72+131072*j63+196608*k178+1024*l77, sum673);
_mm512_storeu_ps(sfPtr16+384+524288*i72+131072*j63+196608*k178+1024*l77, sum674);
_mm512_storeu_ps(sfPtr16+448+524288*i72+131072*j63+196608*k178+1024*l77, sum675);
_mm512_storeu_ps(sfPtr16+512+524288*i72+131072*j63+196608*k178+1024*l77, sum676);
_mm512_storeu_ps(sfPtr16+576+524288*i72+131072*j63+196608*k178+1024*l77, sum677);
_mm512_storeu_ps(sfPtr16+640+524288*i72+131072*j63+196608*k178+1024*l77, sum678);
_mm512_storeu_ps(sfPtr16+704+524288*i72+131072*j63+196608*k178+1024*l77, sum679);
_mm512_storeu_ps(sfPtr16+768+524288*i72+131072*j63+196608*k178+1024*l77, sum680);
_mm512_storeu_ps(sfPtr16+832+524288*i72+131072*j63+196608*k178+1024*l77, sum681);
_mm512_storeu_ps(sfPtr16+896+524288*i72+131072*j63+196608*k178+1024*l77, sum682);
_mm512_storeu_ps(sfPtr16+960+524288*i72+131072*j63+196608*k178+1024*l77, sum683);
// Always true on the first pass (see ll13 above): one l77 per invocation.
if (l77 >= ll13) return;
}
}

// Fills in a task descriptor (callee, payload pointer, 4 dimensions with
// extents 128 x 1 x 4 x 1) and passes it to ResNet50ThreaderDo1 together
// with the given team. The payload is a two-slot array: slot 0 carries the
// tensor pointer array, slot 1 is null.
//   team75     - forwarded unchanged to ResNet50ThreaderDo1.
//   tensors123 - placed in slot 0 of the payload.
static void ResNet50ThreeProduceSums7(ResNet50ThreaderTeam1* team75, char** tensors123) {
    void* payload[2];
    payload[0] = tensors123;
    payload[1] = 0;

    ResNet50ThreaderTask1 job;

    // Payload and entry point.
    job.any1 = payload;
    job.callee1 = ResNet50ThreeProduceSums7Callee1;

    // Index space: dimensions 0 and 2 have more than one point.
    job.nd1 = 4;
    job.hull1[3] = 1;
    job.hull1[2] = 4;
    job.hull1[1] = 1;
    job.hull1[0] = 128;

    ResNet50ThreaderDo1(team75, &job);
}

// Task callee: for 32 consecutive k values and 4 l values each, reads 16 sum
// vectors from tensors[0], applies a fixed linear transform twice (with a
// lane transpose in between), clamps negatives to zero, and writes seven
// 7-float rows into tensors[1].
//   task128->any1 - array of tensor base pointers (index 0 is read, index 1
//                   is written).
//   pt69[0]       - this task's coordinate; k runs from 32*pt69[0] through
//                   32*pt69[0]+31.
// NOTE(review): the constants (2, 4, 8, 16, 32) look like a Winograd-style
// output transform - confirm against the generator.
static void ResNet50ThreeConsumeSums7Callee1(ResNet50ThreaderTask1* task128, int64_t* pt69) {
char** tensors126 = task128->any1;
ptrdiff_t w78 = pt69[0];
// Indices that are fixed at zero in this instantiation.
ptrdiff_t d26 = 0;
ptrdiff_t g43 = 0;
char*restrict sfPtr17 = tensors126[0];
char*restrict datPtr39 = tensors126[1];
ptrdiff_t i73 = 1*g43;
ptrdiff_t j64 = 1*d26;
// rel28 is not read anywhere below.
ptrdiff_t rel28 = j64-0;
ptrdiff_t base28 = 0;
ptrdiff_t toH52 = base28+0;
ptrdiff_t toW52 = 0;
ptrdiff_t k179 = 32*w78;
// Last k handled by this invocation (inclusive).
ptrdiff_t kk64 = k179+31;
for (; k179 != 128; ++k179) {
ptrdiff_t l78 = 0;
for (; l78 != 4; ++l78) {
// Load 16 vectors: four groups 131072 bytes apart, four vectors per group.
// Each pair is split with shuffle_f32x4: immediate 68 concatenates the low
// 256-bit halves of the two operands, 238 concatenates the high halves.
__m512 sf1409 = _mm512_loadu_ps(sfPtr17+0+524288*i73+196608*j64+1024*k179+256*l78);
__m512 sf1410 = _mm512_loadu_ps(sfPtr17+128+524288*i73+196608*j64+1024*k179+256*l78);
__m512 in2649 = _mm512_shuffle_f32x4(sf1409, sf1410, 68);
__m512 in2650 = _mm512_shuffle_f32x4(sf1409, sf1410, 238);
__m512 sf1411 = _mm512_loadu_ps(sfPtr17+64+524288*i73+196608*j64+1024*k179+256*l78);
__m512 sf1412 = _mm512_loadu_ps(sfPtr17+192+524288*i73+196608*j64+1024*k179+256*l78);
__m512 in2657 = _mm512_shuffle_f32x4(sf1411, sf1412, 68);
__m512 in2658 = _mm512_shuffle_f32x4(sf1411, sf1412, 238);
__m512 sf1413 = _mm512_loadu_ps(sfPtr17+131072+524288*i73+196608*j64+1024*k179+256*l78);
__m512 sf1414 = _mm512_loadu_ps(sfPtr17+131200+524288*i73+196608*j64+1024*k179+256*l78);
__m512 in2651 = _mm512_shuffle_f32x4(sf1413, sf1414, 68);
__m512 in2652 = _mm512_shuffle_f32x4(sf1413, sf1414, 238);
__m512 sf1415 = _mm512_loadu_ps(sfPtr17+131136+524288*i73+196608*j64+1024*k179+256*l78);
__m512 sf1416 = _mm512_loadu_ps(sfPtr17+131264+524288*i73+196608*j64+1024*k179+256*l78);
__m512 in2659 = _mm512_shuffle_f32x4(sf1415, sf1416, 68);
__m512 in2660 = _mm512_shuffle_f32x4(sf1415, sf1416, 238);
__m512 sf1417 = _mm512_loadu_ps(sfPtr17+262144+524288*i73+196608*j64+1024*k179+256*l78);
__m512 sf1418 = _mm512_loadu_ps(sfPtr17+262272+524288*i73+196608*j64+1024*k179+256*l78);
__m512 in2653 = _mm512_shuffle_f32x4(sf1417, sf1418, 68);
__m512 in2654 = _mm512_shuffle_f32x4(sf1417, sf1418, 238);
__m512 sf1419 = _mm512_loadu_ps(sfPtr17+262208+524288*i73+196608*j64+1024*k179+256*l78);
__m512 sf1420 = _mm512_loadu_ps(sfPtr17+262336+524288*i73+196608*j64+1024*k179+256*l78);
__m512 in2661 = _mm512_shuffle_f32x4(sf1419, sf1420, 68);
__m512 in2662 = _mm512_shuffle_f32x4(sf1419, sf1420, 238);
__m512 sf1421 = _mm512_loadu_ps(sfPtr17+393216+524288*i73+196608*j64+1024*k179+256*l78);
__m512 sf1422 = _mm512_loadu_ps(sfPtr17+393344+524288*i73+196608*j64+1024*k179+256*l78);
__m512 in2655 = _mm512_shuffle_f32x4(sf1421, sf1422, 68);
__m512 in2656 = _mm512_shuffle_f32x4(sf1421, sf1422, 238);
__m512 sf1423 = _mm512_loadu_ps(sfPtr17+393280+524288*i73+196608*j64+1024*k179+256*l78);
__m512 sf1424 = _mm512_loadu_ps(sfPtr17+393408+524288*i73+196608*j64+1024*k179+256*l78);
__m512 in2663 = _mm512_shuffle_f32x4(sf1423, sf1424, 68);
__m512 in2664 = _mm512_shuffle_f32x4(sf1423, sf1424, 238);
// in2664 does not contribute to any output; the cast marks it as
// intentionally unused.
(void)in2664;
// First transform pass: sums and differences of neighbouring inputs combined
// with power-of-two weights. in2649..in2656 yield six results and
// in2657..in2663 yield a seventh (tmp19668).
__m512 tmp19652 = _mm512_add_ps(in2650, in2651);
__m512 tmp19672 = _mm512_add_ps(in2658, in2659);
__m512 tmp19651 = _mm512_add_ps(in2652, in2653);
__m512 tmp19671 = _mm512_add_ps(in2660, in2661);
__m512 tmp19657 = _mm512_sub_ps(in2652, in2653);
__m512 tmp19656 = _mm512_sub_ps(in2650, in2651);
__m512 tmp19653 = _mm512_add_ps(in2654, in2655);
__m512 tmp19673 = _mm512_add_ps(in2662, in2663);
__m512 tmp19658 = _mm512_sub_ps(in2654, in2655);
__m512 tmp19655 = _mm512_fmadd_ps(tmp19657, _mm512_set1_ps(2e+00f), tmp19656);
__m512 tmp19662 = _mm512_fmadd_ps(tmp19657, _mm512_set1_ps(8e+00f), tmp19656);
__m512 tmp19650 = _mm512_add_ps(tmp19651, tmp19652);
__m512 tmp19670 = _mm512_add_ps(tmp19671, tmp19672);
__m512 tmp19654 = _mm512_fmadd_ps(tmp19658, _mm512_set1_ps(1.6e+01f), tmp19655);
__m512 tmp19661 = _mm512_fmadd_ps(tmp19658, _mm512_set1_ps(4e+00f), tmp19662);
__m512 tmp19667 = _mm512_add_ps(tmp19658, tmp19656);
__m512 tmp19660 = _mm512_fmadd_ps(tmp19651, _mm512_set1_ps(4e+00f), tmp19652);
__m512 tmp19664 = _mm512_fmadd_ps(tmp19651, _mm512_set1_ps(1.6e+01f), tmp19652);
__m512 tmp19649 = _mm512_add_ps(tmp19650, in2649);
__m512 tmp19669 = _mm512_add_ps(tmp19670, in2657);
__m512 tmp19666 = _mm512_add_ps(tmp19667, in2656);
__m512 tmp19648 = _mm512_fmadd_ps(tmp19653, _mm512_set1_ps(3.2e+01f), tmp19649);
__m512 tmp19668 = _mm512_fmadd_ps(tmp19673, _mm512_set1_ps(3.2e+01f), tmp19669);
__m512 tmp19659 = _mm512_fmadd_ps(tmp19653, _mm512_set1_ps(8e+00f), tmp19660);
__m512 tmp19665 = _mm512_fmadd_ps(tmp19657, _mm512_set1_ps(3.2e+01f), tmp19666);
__m512 tmp19663 = _mm512_fmadd_ps(tmp19653, _mm512_set1_ps(2e+00f), tmp19664);
// The seven first-pass results, renamed for the transpose below.
__m512 tmp19641 = tmp19648;
__m512 tmp19647 = tmp19668;
__m512 tmp19642 = tmp19654;
__m512 tmp19643 = tmp19659;
__m512 tmp19644 = tmp19661;
__m512 tmp19645 = tmp19663;
__m512 tmp19646 = tmp19665;
// Lane transpose network (unpack -> shuffle_ps -> shuffle_f32x4). With only
// seven inputs, tmp19647 is paired with itself in the first stage.
__m512 tmp19709 = _mm512_unpacklo_ps(tmp19641, tmp19642);
__m512 tmp19710 = _mm512_unpackhi_ps(tmp19641, tmp19642);
__m512 tmp19711 = _mm512_unpacklo_ps(tmp19643, tmp19644);
__m512 tmp19712 = _mm512_unpackhi_ps(tmp19643, tmp19644);
__m512 tmp19713 = _mm512_unpacklo_ps(tmp19645, tmp19646);
__m512 tmp19714 = _mm512_unpackhi_ps(tmp19645, tmp19646);
__m512 tmp19715 = _mm512_unpacklo_ps(tmp19647, tmp19647);
__m512 tmp19716 = _mm512_unpackhi_ps(tmp19647, tmp19647);
__m512 tmp19717 = _mm512_shuffle_ps(tmp19709, tmp19711, 68);
__m512 tmp19718 = _mm512_shuffle_ps(tmp19709, tmp19711, 238);
__m512 tmp19719 = _mm512_shuffle_ps(tmp19710, tmp19712, 68);
__m512 tmp19720 = _mm512_shuffle_ps(tmp19710, tmp19712, 238);
__m512 tmp19721 = _mm512_shuffle_ps(tmp19713, tmp19715, 68);
__m512 tmp19722 = _mm512_shuffle_ps(tmp19713, tmp19715, 238);
__m512 tmp19723 = _mm512_shuffle_ps(tmp19714, tmp19716, 68);
__m512 tmp19724 = _mm512_shuffle_ps(tmp19714, tmp19716, 238);
__m512 tmp19725 = _mm512_shuffle_f32x4(tmp19717, tmp19721, 136);
__m512 tmp19726 = _mm512_shuffle_f32x4(tmp19717, tmp19721, 221);
__m512 tmp19727 = _mm512_shuffle_f32x4(tmp19718, tmp19722, 136);
__m512 tmp19728 = _mm512_shuffle_f32x4(tmp19718, tmp19722, 221);
__m512 tmp19729 = _mm512_shuffle_f32x4(tmp19719, tmp19723, 136);
__m512 tmp19730 = _mm512_shuffle_f32x4(tmp19719, tmp19723, 221);
__m512 tmp19731 = _mm512_shuffle_f32x4(tmp19720, tmp19724, 136);
__m512 tmp19732 = _mm512_shuffle_f32x4(tmp19720, tmp19724, 221);
// Final stage: each vector is shuffled with itself; immediate 136 gathers its
// even 128-bit lanes and 221 its odd 128-bit lanes.
tmp19641 = _mm512_shuffle_f32x4(tmp19725, tmp19725, 136);
__m512 tmp19675 = _mm512_shuffle_f32x4(tmp19725, tmp19725, 221);
tmp19642 = _mm512_shuffle_f32x4(tmp19727, tmp19727, 136);
__m512 tmp19676 = _mm512_shuffle_f32x4(tmp19727, tmp19727, 221);
tmp19643 = _mm512_shuffle_f32x4(tmp19729, tmp19729, 136);
__m512 tmp19677 = _mm512_shuffle_f32x4(tmp19729, tmp19729, 221);
tmp19644 = _mm512_shuffle_f32x4(tmp19731, tmp19731, 136);
__m512 tmp19678 = _mm512_shuffle_f32x4(tmp19731, tmp19731, 221);
tmp19645 = _mm512_shuffle_f32x4(tmp19726, tmp19726, 136);
__m512 tmp19679 = _mm512_shuffle_f32x4(tmp19726, tmp19726, 221);
tmp19646 = _mm512_shuffle_f32x4(tmp19728, tmp19728, 136);
__m512 tmp19680 = _mm512_shuffle_f32x4(tmp19728, tmp19728, 221);
tmp19647 = _mm512_shuffle_f32x4(tmp19730, tmp19730, 136);
__m512 tmp19681 = _mm512_shuffle_f32x4(tmp19730, tmp19730, 221);
__m512 tmp19674 = _mm512_shuffle_f32x4(tmp19732, tmp19732, 136);
__m512 tmp19682 = _mm512_shuffle_f32x4(tmp19732, tmp19732, 221);
// tmp19682 does not contribute to any output.
(void)tmp19682;
// Second transform pass: the same arithmetic pattern and constants as the
// first pass, applied to the transposed vectors.
__m512 tmp19687 = _mm512_add_ps(tmp19642, tmp19643);
__m512 tmp19707 = _mm512_add_ps(tmp19676, tmp19677);
__m512 tmp19686 = _mm512_add_ps(tmp19644, tmp19645);
__m512 tmp19706 = _mm512_add_ps(tmp19678, tmp19679);
__m512 tmp19692 = _mm512_sub_ps(tmp19644, tmp19645);
__m512 tmp19691 = _mm512_sub_ps(tmp19642, tmp19643);
__m512 tmp19688 = _mm512_add_ps(tmp19646, tmp19647);
__m512 tmp19708 = _mm512_add_ps(tmp19680, tmp19681);
__m512 tmp19693 = _mm512_sub_ps(tmp19646, tmp19647);
__m512 tmp19690 = _mm512_fmadd_ps(tmp19692, _mm512_set1_ps(2e+00f), tmp19691);
__m512 tmp19697 = _mm512_fmadd_ps(tmp19692, _mm512_set1_ps(8e+00f), tmp19691);
__m512 tmp19685 = _mm512_add_ps(tmp19686, tmp19687);
__m512 tmp19705 = _mm512_add_ps(tmp19706, tmp19707);
__m512 tmp19689 = _mm512_fmadd_ps(tmp19693, _mm512_set1_ps(1.6e+01f), tmp19690);
__m512 tmp19696 = _mm512_fmadd_ps(tmp19693, _mm512_set1_ps(4e+00f), tmp19697);
__m512 tmp19702 = _mm512_add_ps(tmp19693, tmp19691);
__m512 tmp19695 = _mm512_fmadd_ps(tmp19686, _mm512_set1_ps(4e+00f), tmp19687);
__m512 tmp19699 = _mm512_fmadd_ps(tmp19686, _mm512_set1_ps(1.6e+01f), tmp19687);
__m512 tmp19684 = _mm512_add_ps(tmp19685, tmp19641);
__m512 tmp19704 = _mm512_add_ps(tmp19705, tmp19675);
__m512 tmp19701 = _mm512_add_ps(tmp19702, tmp19674);
__m512 tmp19683 = _mm512_fmadd_ps(tmp19688, _mm512_set1_ps(3.2e+01f), tmp19684);
__m512 tmp19703 = _mm512_fmadd_ps(tmp19708, _mm512_set1_ps(3.2e+01f), tmp19704);
__m512 tmp19694 = _mm512_fmadd_ps(tmp19688, _mm512_set1_ps(8e+00f), tmp19695);
__m512 tmp19700 = _mm512_fmadd_ps(tmp19692, _mm512_set1_ps(3.2e+01f), tmp19701);
__m512 tmp19698 = _mm512_fmadd_ps(tmp19688, _mm512_set1_ps(2e+00f), tmp19699);
__m512 out2463 = tmp19683;
__m512 out2469 = tmp19703;
__m512 out2464 = tmp19689;
__m512 out2465 = tmp19694;
__m512 out2466 = tmp19696;
__m512 out2467 = tmp19698;
__m512 out2468 = tmp19700;
// Clamp every output with max(0, x).
out2463 = _mm512_max_ps(_mm512_setzero_ps(), out2463);
out2469 = _mm512_max_ps(_mm512_setzero_ps(), out2469);
out2464 = _mm512_max_ps(_mm512_setzero_ps(), out2464);
out2465 = _mm512_max_ps(_mm512_setzero_ps(), out2465);
out2466 = _mm512_max_ps(_mm512_setzero_ps(), out2466);
out2467 = _mm512_max_ps(_mm512_setzero_ps(), out2467);
out2468 = _mm512_max_ps(_mm512_setzero_ps(), out2468);
// Mask 127 stores only the low 7 lanes. The seven rows sit 28 bytes
// (7 floats) apart at offsets 0, 28, ..., 168; out2469 is the row at 168.
_mm512_mask_storeu_ps(datPtr39+0+163840*i73+28*toH52+4*toW52+1280*k179+320*l78, 127, out2463);
_mm512_mask_storeu_ps(datPtr39+168+163840*i73+28*toH52+4*toW52+1280*k179+320*l78, 127, out2469);
_mm512_mask_storeu_ps(datPtr39+28+163840*i73+28*toH52+4*toW52+1280*k179+320*l78, 127, out2464);
_mm512_mask_storeu_ps(datPtr39+56+163840*i73+28*toH52+4*toW52+1280*k179+320*l78, 127, out2465);
_mm512_mask_storeu_ps(datPtr39+84+163840*i73+28*toH52+4*toW52+1280*k179+320*l78, 127, out2466);
_mm512_mask_storeu_ps(datPtr39+112+163840*i73+28*toH52+4*toW52+1280*k179+320*l78, 127, out2467);
_mm512_mask_storeu_ps(datPtr39+140+163840*i73+28*toH52+4*toW52+1280*k179+320*l78, 127, out2468);
}
// Stop once this invocation's 32 k values have been processed.
if (k179 >= kk64) return;
}
// Only reached if the loop ends through its own k179 != 128 condition;
// j64 is not read afterwards.
++j64;
}

// Launches ResNet50ThreeConsumeSums7Callee1 on the given team.
// The task is described as a 3-dimensional grid with extents 4 x 1 x 1, and
// the tensor pointer table is passed to the callee through the any1 field.
static void ResNet50ThreeConsumeSums7(ResNet50ThreaderTeam1* team76, char** tensors125) {
    ResNet50ThreaderTask1 job = {
        .callee1 = ResNet50ThreeConsumeSums7Callee1,
        .any1 = tensors125,
        .nd1 = 3,
        .hull1 = {4, 1, 1},
    };
    ResNet50ThreaderDo1(team76, &job);
}

// Task callee that prepares the filter-side buffer for one task coordinate.
//
// task146->any1 is a table of byte pointers:
//   [0] source filter weights (read as 9 floats per filter),
//   [1] source biases,
//   [2] interleaved float pairs (multiplier, addend) - see the even/odd
//       permutes near the end of the function,
//   [3] destination: 2048 bytes that receive the folded biases, followed by
//       the transformed filters stored as 16-bit floats.
// pt78[0] is the task coordinate; each call processes exactly one value of it.
//
// NOTE(review): the 3x3 input, the 4/2 coefficients and the 1/9, 1/90, 1/180
// style scale constants look like a Winograd-type filter transform - confirm
// against the generator before relying on that description.
static void ResNet50ThreeArrangeFilts8Callee1(ResNet50ThreaderTask1* task146, int64_t* pt78) {
char** tensors142 = task146->any1;
ptrdiff_t b76 = pt78[0];
// g48 and e44 are fixed at zero here, so every term multiplied by i84 or e44
// below contributes nothing to the addresses.
ptrdiff_t g48 = 0;
ptrdiff_t e44 = 0;
char*restrict bfPtr18 = tensors142[3]+2048*e44;
char*restrict wfPtr18 = tensors142[3]+2048+32505856*e44;
char*restrict wtPtr26 = tensors142[0]+17856*e44;
char*restrict biasPtr25 = tensors142[1];
char*restrict bnPtr26 = tensors142[2];
ptrdiff_t i84 = 1*g48;
ptrdiff_t j75 = 1*b76;
ptrdiff_t jj57 = j75+0;
if (j75 < 128) {
// jj57 equals the starting j75, so the "return" at the bottom of the loop body
// fires at the end of the first pass: one j75 per call.
for (; j75 != 128; ++j75) {
ptrdiff_t k193 = 0+1*j75;
ptrdiff_t cut38 = 0;
// Broadcast the first float of pair number 4*j75+c (c = 0..3) from bnPtr26;
// these scale the four filter groups handled in this pass.
__m512 postMul86 = _mm512_set1_ps(((float*)bnPtr26+(ptrdiff_t)2*(0+512*i84+4*j75))[0]);
__m512 postMul87 = _mm512_set1_ps(((float*)bnPtr26+(ptrdiff_t)2*(1+512*i84+4*j75))[0]);
__m512 postMul88 = _mm512_set1_ps(((float*)bnPtr26+(ptrdiff_t)2*(2+512*i84+4*j75))[0]);
__m512 postMul89 = _mm512_set1_ps(((float*)bnPtr26+(ptrdiff_t)2*(3+512*i84+4*j75))[0]);
ptrdiff_t s71 = 0;
// 512 passes; each one reads four 9-float filters (36 bytes apart per s71,
// 18432 bytes apart between the four groups).
for (; s71 != 512; ++s71) {
// Mask 511 selects lanes 0..8; the remaining lanes are zeroed by the load.
__m512 wt1059 = _mm512_maskz_loadu_ps(511, wtPtr26+0+9437184*i84+73728*j75+36*s71);
__m512 wt1060 = _mm512_maskz_loadu_ps(511, wtPtr26+18432+9437184*i84+73728*j75+36*s71);
__m512 wt1061 = _mm512_maskz_loadu_ps(511, wtPtr26+36864+9437184*i84+73728*j75+36*s71);
__m512 wt1062 = _mm512_maskz_loadu_ps(511, wtPtr26+55296+9437184*i84+73728*j75+36*s71);
wt1059 = _mm512_mul_ps(wt1059, postMul86);
wt1060 = _mm512_mul_ps(wt1060, postMul87);
wt1061 = _mm512_mul_ps(wt1061, postMul88);
wt1062 = _mm512_mul_ps(wt1062, postMul89);
// Two-source permutes: indices 0..15 pick from the first operand, 16..31 from
// the second. They gather three-float runs from the four filters into the
// three vectors in2665/in2666/in2667.
__m512i pm261 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm262 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp20411 = _mm512_permutex2var_ps(wt1059, pm261, wt1061);
__m512 tmp20412 = _mm512_permutex2var_ps(wt1060, pm261, wt1062);
__m512 tmp20413 = _mm512_permutex2var_ps(wt1059, pm262, wt1061);
__m512 tmp20414 = _mm512_permutex2var_ps(wt1060, pm262, wt1062);
__m512 in2665 = _mm512_permutex2var_ps(tmp20411, pm261, tmp20412);
__m512 in2666 = _mm512_permutex2var_ps(tmp20411, pm262, tmp20412);
__m512 in2667 = _mm512_permutex2var_ps(tmp20413, pm261, tmp20414);
// First pass: expand the three input vectors into eight linear combinations
// (coefficients 1, 2 and 4 with mixed signs).
__m512 tmp20415 = _mm512_fmadd_ps(in2665, _mm512_set1_ps(4e+00f), in2667);
__m512 tmp20416 = _mm512_add_ps(in2665, in2667);
__m512 tmp20417 = _mm512_fmadd_ps(in2667, _mm512_set1_ps(4e+00f), in2665);
__m512 tmp20418 = _mm512_add_ps(in2666, tmp20416);
__m512 tmp20419 = _mm512_fmadd_ps(in2666, _mm512_set1_ps(2e+00f), tmp20417);
tmp20417 = _mm512_fnmadd_ps(in2666, _mm512_set1_ps(2e+00f), tmp20417);
__m512 tmp20420 = _mm512_fnmadd_ps(in2666, _mm512_set1_ps(2e+00f), tmp20415);
tmp20415 = _mm512_fmadd_ps(in2666, _mm512_set1_ps(2e+00f), tmp20415);
tmp20416 = _mm512_sub_ps(tmp20416, in2666);
// Rearrangement of the eight vectors using unpack, in-lane shuffle and
// 128-bit-lane shuffle steps, so the second pass can run along the other axis.
__m512 tmp20437 = _mm512_unpacklo_ps(in2665, tmp20418);
__m512 tmp20438 = _mm512_unpackhi_ps(in2665, tmp20418);
__m512 tmp20439 = _mm512_unpacklo_ps(tmp20416, tmp20419);
__m512 tmp20440 = _mm512_unpackhi_ps(tmp20416, tmp20419);
__m512 tmp20441 = _mm512_unpacklo_ps(tmp20417, tmp20415);
__m512 tmp20442 = _mm512_unpackhi_ps(tmp20417, tmp20415);
__m512 tmp20443 = _mm512_unpacklo_ps(tmp20420, in2667);
__m512 tmp20444 = _mm512_unpackhi_ps(tmp20420, in2667);
__m512 tmp20445 = _mm512_shuffle_ps(tmp20437, tmp20439, 68);
__m512 tmp20446 = _mm512_shuffle_ps(tmp20437, tmp20439, 238);
__m512 tmp20447 = _mm512_shuffle_ps(tmp20438, tmp20440, 68);
__m512 tmp20448 = _mm512_shuffle_ps(tmp20438, tmp20440, 238);
__m512 tmp20449 = _mm512_shuffle_ps(tmp20441, tmp20443, 68);
__m512 tmp20450 = _mm512_shuffle_ps(tmp20441, tmp20443, 238);
__m512 tmp20451 = _mm512_shuffle_ps(tmp20442, tmp20444, 68);
__m512 tmp20452 = _mm512_shuffle_ps(tmp20442, tmp20444, 238);
__m512 tmp20453 = _mm512_shuffle_f32x4(tmp20445, tmp20449, 136);
__m512 tmp20454 = _mm512_shuffle_f32x4(tmp20445, tmp20449, 221);
__m512 tmp20455 = _mm512_shuffle_f32x4(tmp20446, tmp20450, 136);
__m512 tmp20456 = _mm512_shuffle_f32x4(tmp20446, tmp20450, 221);
__m512 tmp20457 = _mm512_shuffle_f32x4(tmp20447, tmp20451, 136);
__m512 tmp20458 = _mm512_shuffle_f32x4(tmp20447, tmp20451, 221);
__m512 tmp20459 = _mm512_shuffle_f32x4(tmp20448, tmp20452, 136);
__m512 tmp20460 = _mm512_shuffle_f32x4(tmp20448, tmp20452, 221);
// The earlier temporaries are reused as destinations from here on; their
// first-pass values are no longer needed.
in2665 = _mm512_shuffle_f32x4(tmp20453, tmp20453, 136);
__m512 tmp20421 = _mm512_shuffle_f32x4(tmp20453, tmp20453, 221);
tmp20418 = _mm512_shuffle_f32x4(tmp20455, tmp20455, 136);
__m512 tmp20422 = _mm512_shuffle_f32x4(tmp20455, tmp20455, 221);
tmp20416 = _mm512_shuffle_f32x4(tmp20457, tmp20457, 136);
__m512 tmp20423 = _mm512_shuffle_f32x4(tmp20457, tmp20457, 221);
tmp20419 = _mm512_shuffle_f32x4(tmp20459, tmp20459, 136);
__m512 tmp20424 = _mm512_shuffle_f32x4(tmp20459, tmp20459, 221);
tmp20417 = _mm512_shuffle_f32x4(tmp20454, tmp20454, 136);
tmp20415 = _mm512_shuffle_f32x4(tmp20456, tmp20456, 136);
tmp20420 = _mm512_shuffle_f32x4(tmp20458, tmp20458, 136);
in2667 = _mm512_shuffle_f32x4(tmp20460, tmp20460, 136);
// Pair up 256-bit halves so that six vectors carry the inputs of the second pass.
in2665 = _mm512_shuffle_f32x4(in2665, tmp20419, 68);
tmp20418 = _mm512_shuffle_f32x4(tmp20418, tmp20417, 68);
tmp20416 = _mm512_shuffle_f32x4(tmp20416, tmp20415, 68);
tmp20420 = _mm512_shuffle_f32x4(tmp20420, tmp20422, 68);
in2667 = _mm512_shuffle_f32x4(in2667, tmp20423, 68);
tmp20421 = _mm512_shuffle_f32x4(tmp20421, tmp20424, 68);
// Second pass: the same 1/2/4 combination pattern as the first pass, applied
// to two independent triples (in2665, tmp20418, tmp20416) and
// (tmp20420, in2667, tmp20421), interleaved statement by statement.
__m512 tmp20425 = _mm512_fmadd_ps(in2665, _mm512_set1_ps(4e+00f), tmp20416);
__m512 tmp20431 = _mm512_fmadd_ps(tmp20420, _mm512_set1_ps(4e+00f), tmp20421);
__m512 tmp20426 = _mm512_add_ps(in2665, tmp20416);
__m512 tmp20432 = _mm512_add_ps(tmp20420, tmp20421);
__m512 tmp20427 = _mm512_fmadd_ps(tmp20416, _mm512_set1_ps(4e+00f), in2665);
__m512 tmp20433 = _mm512_fmadd_ps(tmp20421, _mm512_set1_ps(4e+00f), tmp20420);
__m512 tmp20428 = _mm512_add_ps(tmp20418, tmp20426);
__m512 tmp20434 = _mm512_add_ps(in2667, tmp20432);
__m512 tmp20429 = _mm512_fmadd_ps(tmp20418, _mm512_set1_ps(2e+00f), tmp20427);
__m512 tmp20435 = _mm512_fmadd_ps(in2667, _mm512_set1_ps(2e+00f), tmp20433);
tmp20427 = _mm512_fnmadd_ps(tmp20418, _mm512_set1_ps(2e+00f), tmp20427);
tmp20433 = _mm512_fnmadd_ps(in2667, _mm512_set1_ps(2e+00f), tmp20433);
__m512 tmp20430 = _mm512_fnmadd_ps(tmp20418, _mm512_set1_ps(2e+00f), tmp20425);
__m512 tmp20436 = _mm512_fnmadd_ps(in2667, _mm512_set1_ps(2e+00f), tmp20431);
tmp20425 = _mm512_fmadd_ps(tmp20418, _mm512_set1_ps(2e+00f), tmp20425);
tmp20431 = _mm512_fmadd_ps(in2667, _mm512_set1_ps(2e+00f), tmp20431);
tmp20426 = _mm512_sub_ps(tmp20426, tmp20418);
tmp20432 = _mm512_sub_ps(tmp20432, in2667);
// Per-lane scaling of all sixteen results by constant tables. Each table
// repeats the same eight values in its lower and upper halves.
in2665 = _mm512_mul_ps(in2665, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp20428 = _mm512_mul_ps(tmp20428, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp20426 = _mm512_mul_ps(tmp20426, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp20429 = _mm512_mul_ps(tmp20429, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp20427 = _mm512_mul_ps(tmp20427, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp20425 = _mm512_mul_ps(tmp20425, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp20430 = _mm512_mul_ps(tmp20430, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp20416 = _mm512_mul_ps(tmp20416, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp20420 = _mm512_mul_ps(tmp20420, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp20434 = _mm512_mul_ps(tmp20434, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp20432 = _mm512_mul_ps(tmp20432, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp20435 = _mm512_mul_ps(tmp20435, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp20433 = _mm512_mul_ps(tmp20433, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp20431 = _mm512_mul_ps(tmp20431, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp20436 = _mm512_mul_ps(tmp20436, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp20421 = _mm512_mul_ps(tmp20421, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
// Immediate 68 joins the lower 256-bit halves of the two operands, 238 joins
// the upper halves.
__m512 out2470 = _mm512_shuffle_f32x4(in2665, tmp20428, 68);
__m512 out2474 = _mm512_shuffle_f32x4(in2665, tmp20428, 238);
__m512 out2471 = _mm512_shuffle_f32x4(tmp20426, tmp20429, 68);
__m512 out2475 = _mm512_shuffle_f32x4(tmp20426, tmp20429, 238);
__m512 out2472 = _mm512_shuffle_f32x4(tmp20427, tmp20425, 68);
__m512 out2476 = _mm512_shuffle_f32x4(tmp20427, tmp20425, 238);
__m512 out2473 = _mm512_shuffle_f32x4(tmp20430, tmp20416, 68);
__m512 out2477 = _mm512_shuffle_f32x4(tmp20430, tmp20416, 238);
__m512 out2478 = _mm512_shuffle_f32x4(tmp20420, tmp20434, 68);
__m512 out2482 = _mm512_shuffle_f32x4(tmp20420, tmp20434, 238);
__m512 out2479 = _mm512_shuffle_f32x4(tmp20432, tmp20435, 68);
__m512 out2483 = _mm512_shuffle_f32x4(tmp20432, tmp20435, 238);
__m512 out2480 = _mm512_shuffle_f32x4(tmp20433, tmp20431, 68);
__m512 out2484 = _mm512_shuffle_f32x4(tmp20433, tmp20431, 238);
__m512 out2481 = _mm512_shuffle_f32x4(tmp20436, tmp20421, 68);
__m512 out2485 = _mm512_shuffle_f32x4(tmp20436, tmp20421, 238);
// With cut38 == 0 these evaluate to byte offsets 0, 32, 64 and 96.
ptrdiff_t off29 = 32*cut38;
ptrdiff_t off30 = (size_t)(cut38+1)/4*65536+(size_t)(cut38+1)%4*32;
ptrdiff_t off31 = (size_t)(cut38+2)/4*65536+(size_t)(cut38+2)%4*32;
ptrdiff_t off32 = (size_t)(cut38+3)/4*65536+(size_t)(cut38+3)%4*32;
// Convert each 16-float result to sixteen 16-bit floats (round to nearest,
// exceptions suppressed); the 256-bit result sits in the low half of the
// 512-bit register.
__m512i wf197 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2470, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf198 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2474, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf199 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2478, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf200 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2482, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf201 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2471, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf202 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2475, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf203 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2479, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf204 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2483, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf205 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2472, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf206 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2476, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf207 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2480, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf208 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2484, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf209 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2473, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf210 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2477, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf211 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2481, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf212 = _mm512_castsi256_si512(_mm512_cvtps_ph(out2485, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
// Mask 255 stores only the low eight 32-bit lanes (32 bytes) of each register.
// The sixteen stores go to four regions 8388608 bytes apart, four 32-byte
// slots in each, at 65536*k193 + 128*s71.
_mm512_mask_storeu_epi32(wfPtr18+0+33554432*i84+65536*k193+off29+128*s71, 255, wf197);
_mm512_mask_storeu_epi32(wfPtr18+0+33554432*i84+65536*k193+off30+128*s71, 255, wf198);
_mm512_mask_storeu_epi32(wfPtr18+0+33554432*i84+65536*k193+off31+128*s71, 255, wf199);
_mm512_mask_storeu_epi32(wfPtr18+0+33554432*i84+65536*k193+off32+128*s71, 255, wf200);
_mm512_mask_storeu_epi32(wfPtr18+8388608+33554432*i84+65536*k193+off29+128*s71, 255, wf201);
_mm512_mask_storeu_epi32(wfPtr18+8388608+33554432*i84+65536*k193+off30+128*s71, 255, wf202);
_mm512_mask_storeu_epi32(wfPtr18+8388608+33554432*i84+65536*k193+off31+128*s71, 255, wf203);
_mm512_mask_storeu_epi32(wfPtr18+8388608+33554432*i84+65536*k193+off32+128*s71, 255, wf204);
_mm512_mask_storeu_epi32(wfPtr18+16777216+33554432*i84+65536*k193+off29+128*s71, 255, wf205);
_mm512_mask_storeu_epi32(wfPtr18+16777216+33554432*i84+65536*k193+off30+128*s71, 255, wf206);
_mm512_mask_storeu_epi32(wfPtr18+16777216+33554432*i84+65536*k193+off31+128*s71, 255, wf207);
_mm512_mask_storeu_epi32(wfPtr18+16777216+33554432*i84+65536*k193+off32+128*s71, 255, wf208);
_mm512_mask_storeu_epi32(wfPtr18+25165824+33554432*i84+65536*k193+off29+128*s71, 255, wf209);
_mm512_mask_storeu_epi32(wfPtr18+25165824+33554432*i84+65536*k193+off30+128*s71, 255, wf210);
_mm512_mask_storeu_epi32(wfPtr18+25165824+33554432*i84+65536*k193+off31+128*s71, 255, wf211);
_mm512_mask_storeu_epi32(wfPtr18+25165824+33554432*i84+65536*k193+off32+128*s71, 255, wf212);
}
// Bias handling for the same four entries: load four biases, then apply
// bias * multiplier + addend using the pairs stored in bnPtr26.
__m512 bias9 = _mm512_setzero_ps();
// e44 is always zero in this function, so this branch is always taken.
if (!e44) {
bias9 = _mm512_maskz_loadu_ps(15, biasPtr25-0+2048*i84+16*j75);
// Even source lanes hold the multipliers, odd source lanes hold the addends.
__m512i pmMul56 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd56 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas15 = _mm512_maskz_loadu_ps(255, bnPtr26+(ptrdiff_t)8*(0+512*i84+4*j75));
__m512 postMul90 = _mm512_permutexvar_ps(pmMul56, mas15);
__m512 postAdd56 = _mm512_permutexvar_ps(pmAdd56, mas15);
bias9 = _mm512_fmadd_ps(bias9, postMul90, postAdd56);
}
// Only the low four floats are written (mask 15).
_mm512_mask_storeu_ps(bfPtr18-0+2048*i84+16*j75, 15, bias9);
if (j75 >= jj57) return;
}
}
}

// Launches ResNet50ThreeArrangeFilts8Callee1 on the given team.
// The task is described as a 3-dimensional grid with extents 128 x 1 x 1, and
// the tensor pointer table is passed to the callee through the any1 field.
static void ResNet50ThreeArrangeFilts8(ResNet50ThreaderTeam1* team83, char** tensors141) {
    ResNet50ThreaderTask1 job = {
        .callee1 = ResNet50ThreeArrangeFilts8Callee1,
        .any1 = tensors141,
        .nd1 = 3,
        .hull1 = {128, 1, 1},
    };
    ResNet50ThreaderDo1(team83, &job);
}

// Task callee that prepares the data-side buffer for one task coordinate.
//
// task148->any1 is a table of byte pointers: [0] is the source activation
// data, [1] is the destination buffer. pt79[0] (s72) selects a 40960-byte
// slice of the source and a 32768-byte slice of the destination; within it
// the function walks 128 entries (k194), each 320 source bytes apart.
//
// For every entry it reads rows of seven floats, runs a two-pass linear
// transform with a rearrangement between the passes, and writes sixteen full
// 64-byte vectors.
//
// NOTE(review): the 5.25 / 4.25 / 1.25 / 0.25 coefficients look like a
// Winograd-type input transform - confirm against the generator.
static void ResNet50ThreeArrangeDats8Callee1(ResNet50ThreaderTask1* task148, int64_t* pt79) {
char** tensors144 = task148->any1;
ptrdiff_t s72 = pt79[0];
// c68, g49 and e45 are fixed at zero, so i85, j76, h59 and w83 are all zero
// and the terms they multiply drop out of the addresses below.
ptrdiff_t c68 = 0;
ptrdiff_t g49 = 0;
ptrdiff_t e45 = 0;
// The source pointer is biased by -32 bytes; the smallest load offset used
// below is +32, which lands exactly on the start of the slice.
char*restrict datPtr48 = tensors144[0]-32+158720*e45;
char*restrict dfPtr18 = tensors144[1]+507904*e45;
ptrdiff_t i85 = 1*g49;
ptrdiff_t j76 = 1*c68;
ptrdiff_t rel29 = j76-0;
ptrdiff_t base29 = 0;
ptrdiff_t h59 = base29+0;
ptrdiff_t w83 = 0;
ptrdiff_t k194 = 0;
for (; k194 != 128; ++k194) {
// Each load takes seven floats (mask 127); lanes 7..15 come back as zero.
// Row offsets run from 32 to 200 in steps of 28 bytes. Offsets 172 and 200
// are loaded twice: once for in2675/in2676 and once for in2673/in2674.
__m512 dat2546 = _mm512_maskz_loadu_ps(127, datPtr48+172+163840*i85+28*h59+4*w83+40960*s72+320*k194);
// Lane layout after the permute: lane 0 <- source lane 15 (zero),
// lanes 1..7 <- source lanes 0..6, lanes 8..9 <- source lanes 5..6,
// lanes 10..15 <- source lane 15 (zero).
__m512i pm263 = _mm512_set_epi32(15, 15, 15, 15, 15, 15, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in2675 = _mm512_permutexvar_ps(pm263, dat2546);
__m512 dat2547 = _mm512_maskz_loadu_ps(127, datPtr48+32+163840*i85+28*h59+4*w83+40960*s72+320*k194);
__m512 dat2548 = _mm512_maskz_loadu_ps(127, datPtr48+200+163840*i85+28*h59+4*w83+40960*s72+320*k194);
__m512 in2668 = _mm512_permutexvar_ps(pm263, dat2547);
__m512 in2676 = _mm512_permutexvar_ps(pm263, dat2548);
__m512 dat2549 = _mm512_maskz_loadu_ps(127, datPtr48+60+163840*i85+28*h59+4*w83+40960*s72+320*k194);
__m512 in2669 = _mm512_permutexvar_ps(pm263, dat2549);
__m512 dat2550 = _mm512_maskz_loadu_ps(127, datPtr48+88+163840*i85+28*h59+4*w83+40960*s72+320*k194);
__m512 in2670 = _mm512_permutexvar_ps(pm263, dat2550);
__m512 dat2551 = _mm512_maskz_loadu_ps(127, datPtr48+116+163840*i85+28*h59+4*w83+40960*s72+320*k194);
__m512 in2671 = _mm512_permutexvar_ps(pm263, dat2551);
__m512 dat2552 = _mm512_maskz_loadu_ps(127, datPtr48+144+163840*i85+28*h59+4*w83+40960*s72+320*k194);
__m512 in2672 = _mm512_permutexvar_ps(pm263, dat2552);
__m512 dat2553 = _mm512_maskz_loadu_ps(127, datPtr48+172+163840*i85+28*h59+4*w83+40960*s72+320*k194);
__m512 in2673 = _mm512_permutexvar_ps(pm263, dat2553);
__m512 dat2554 = _mm512_maskz_loadu_ps(127, datPtr48+200+163840*i85+28*h59+4*w83+40960*s72+320*k194);
__m512 in2674 = _mm512_permutexvar_ps(pm263, dat2554);
// First pass. Two independent computations are interleaved: one over
// in2668..in2674 and one over in2675/in2676. Self-assignments such as
// "in2675 = in2675" have no effect and are kept as emitted.
// Input variables are overwritten with intermediate results as the pass
// proceeds, so statement order matters.
__m512 tmp20461 = _mm512_add_ps(in2668, in2672);
__m512 tmp20466 = in2676;
__m512 tmp20462 = _mm512_sub_ps(in2671, in2669);
__m512 tmp20463 = _mm512_add_ps(in2669, in2673);
__m512 tmp20464 = _mm512_sub_ps(_mm512_setzero_ps(), in2673);
in2675 = in2675;
tmp20461 = _mm512_fmadd_ps(in2670, _mm512_set1_ps(-4.25e+00f), tmp20461);
tmp20466 = tmp20466;
tmp20463 = _mm512_fmadd_ps(in2671, _mm512_set1_ps(-4.25e+00f), tmp20463);
tmp20464 = _mm512_fmadd_ps(tmp20462, _mm512_set1_ps(5.25e+00f), tmp20464);
in2675 = in2675;
tmp20462 = _mm512_fmadd_ps(in2669, _mm512_set1_ps(2.5e-01f), in2673);
in2669 = _mm512_fmadd_ps(in2669, _mm512_set1_ps(4e+00f), in2673);
__m512 tmp20465 = _mm512_sub_ps(tmp20463, tmp20461);
__m512 tmp20467 = _mm512_sub_ps(_mm512_setzero_ps(), tmp20466);
tmp20463 = _mm512_add_ps(tmp20461, tmp20463);
__m512 tmp20468 = tmp20466;
tmp20461 = _mm512_fmadd_ps(in2668, _mm512_set1_ps(2.5e-01f), in2672);
tmp20466 = _mm512_mul_ps(in2676, _mm512_set1_ps(2.5e-01f));
tmp20462 = _mm512_fmadd_ps(in2671, _mm512_set1_ps(-1.25e+00f), tmp20462);
in2671 = _mm512_fmadd_ps(in2671, _mm512_set1_ps(-5e+00f), in2669);
tmp20461 = _mm512_fmadd_ps(in2670, _mm512_set1_ps(-1.25e+00f), tmp20461);
tmp20466 = tmp20466;
in2673 = _mm512_fmadd_ps(tmp20461, _mm512_set1_ps(2e+00f), tmp20462);
__m512 tmp20469 = _mm512_mul_ps(tmp20466, _mm512_set1_ps(2e+00f));
tmp20462 = _mm512_fnmadd_ps(tmp20461, _mm512_set1_ps(2e+00f), tmp20462);
__m512 tmp20470 = _mm512_fnmadd_ps(tmp20466, _mm512_set1_ps(2e+00f), _mm512_setzero_ps());
tmp20461 = _mm512_fmadd_ps(in2672, _mm512_set1_ps(2.5e-01f), in2668);
tmp20466 = in2676;
in2668 = _mm512_sub_ps(in2674, in2668);
in2676 = _mm512_sub_ps(_mm512_setzero_ps(), in2676);
tmp20461 = _mm512_fmadd_ps(in2670, _mm512_set1_ps(-1.25e+00f), tmp20461);
tmp20466 = tmp20466;
in2670 = _mm512_sub_ps(in2670, in2672);
in2670 = _mm512_fmadd_ps(in2670, _mm512_set1_ps(5.25e+00f), in2668);
__m512 tmp20471 = in2676;
in2669 = _mm512_fmadd_ps(tmp20461, _mm512_set1_ps(2e+00f), in2671);
__m512 tmp20472 = _mm512_mul_ps(tmp20466, _mm512_set1_ps(2e+00f));
in2671 = _mm512_fnmadd_ps(tmp20461, _mm512_set1_ps(2e+00f), in2671);
__m512 tmp20473 = _mm512_fnmadd_ps(tmp20466, _mm512_set1_ps(2e+00f), _mm512_setzero_ps());
// Rearrangement of the sixteen first-pass results in three stages: unpack
// lo/hi, in-lane shuffle (immediates 68/238), then 128-bit lane shuffle
// (immediates 136/221).
__m512 tmp20487 = _mm512_unpacklo_ps(tmp20464, tmp20463);
__m512 tmp20488 = _mm512_unpackhi_ps(tmp20464, tmp20463);
__m512 tmp20489 = _mm512_unpacklo_ps(tmp20465, in2673);
__m512 tmp20490 = _mm512_unpackhi_ps(tmp20465, in2673);
__m512 tmp20491 = _mm512_unpacklo_ps(tmp20462, in2669);
__m512 tmp20492 = _mm512_unpackhi_ps(tmp20462, in2669);
__m512 tmp20493 = _mm512_unpacklo_ps(in2671, in2670);
__m512 tmp20494 = _mm512_unpackhi_ps(in2671, in2670);
__m512 tmp20495 = _mm512_unpacklo_ps(in2675, tmp20468);
__m512 tmp20496 = _mm512_unpackhi_ps(in2675, tmp20468);
__m512 tmp20497 = _mm512_unpacklo_ps(tmp20467, tmp20469);
__m512 tmp20498 = _mm512_unpackhi_ps(tmp20467, tmp20469);
__m512 tmp20499 = _mm512_unpacklo_ps(tmp20470, tmp20472);
__m512 tmp20500 = _mm512_unpackhi_ps(tmp20470, tmp20472);
__m512 tmp20501 = _mm512_unpacklo_ps(tmp20473, tmp20471);
__m512 tmp20502 = _mm512_unpackhi_ps(tmp20473, tmp20471);
__m512 tmp20503 = _mm512_shuffle_ps(tmp20487, tmp20489, 68);
__m512 tmp20504 = _mm512_shuffle_ps(tmp20487, tmp20489, 238);
__m512 tmp20505 = _mm512_shuffle_ps(tmp20488, tmp20490, 68);
__m512 tmp20506 = _mm512_shuffle_ps(tmp20488, tmp20490, 238);
__m512 tmp20507 = _mm512_shuffle_ps(tmp20491, tmp20493, 68);
__m512 tmp20508 = _mm512_shuffle_ps(tmp20491, tmp20493, 238);
__m512 tmp20509 = _mm512_shuffle_ps(tmp20492, tmp20494, 68);
__m512 tmp20510 = _mm512_shuffle_ps(tmp20492, tmp20494, 238);
__m512 tmp20511 = _mm512_shuffle_ps(tmp20495, tmp20497, 68);
__m512 tmp20512 = _mm512_shuffle_ps(tmp20495, tmp20497, 238);
__m512 tmp20513 = _mm512_shuffle_ps(tmp20496, tmp20498, 68);
__m512 tmp20514 = _mm512_shuffle_ps(tmp20496, tmp20498, 238);
__m512 tmp20515 = _mm512_shuffle_ps(tmp20499, tmp20501, 68);
__m512 tmp20516 = _mm512_shuffle_ps(tmp20499, tmp20501, 238);
__m512 tmp20517 = _mm512_shuffle_ps(tmp20500, tmp20502, 68);
__m512 tmp20518 = _mm512_shuffle_ps(tmp20500, tmp20502, 238);
__m512 tmp20519 = _mm512_shuffle_f32x4(tmp20503, tmp20507, 136);
__m512 tmp20520 = _mm512_shuffle_f32x4(tmp20503, tmp20507, 221);
__m512 tmp20521 = _mm512_shuffle_f32x4(tmp20504, tmp20508, 136);
__m512 tmp20522 = _mm512_shuffle_f32x4(tmp20504, tmp20508, 221);
__m512 tmp20523 = _mm512_shuffle_f32x4(tmp20505, tmp20509, 136);
__m512 tmp20524 = _mm512_shuffle_f32x4(tmp20505, tmp20509, 221);
__m512 tmp20525 = _mm512_shuffle_f32x4(tmp20506, tmp20510, 136);
__m512 tmp20526 = _mm512_shuffle_f32x4(tmp20506, tmp20510, 221);
__m512 tmp20527 = _mm512_shuffle_f32x4(tmp20511, tmp20515, 136);
__m512 tmp20528 = _mm512_shuffle_f32x4(tmp20511, tmp20515, 221);
__m512 tmp20529 = _mm512_shuffle_f32x4(tmp20512, tmp20516, 136);
__m512 tmp20530 = _mm512_shuffle_f32x4(tmp20512, tmp20516, 221);
__m512 tmp20531 = _mm512_shuffle_f32x4(tmp20513, tmp20517, 136);
__m512 tmp20532 = _mm512_shuffle_f32x4(tmp20513, tmp20517, 221);
__m512 tmp20533 = _mm512_shuffle_f32x4(tmp20514, tmp20518, 136);
__m512 tmp20534 = _mm512_shuffle_f32x4(tmp20514, tmp20518, 221);
// The first-pass variables are reused to hold the ten vectors that feed the
// second pass.
tmp20464 = _mm512_shuffle_f32x4(tmp20519, tmp20527, 136);
in2675 = _mm512_shuffle_f32x4(tmp20519, tmp20527, 221);
tmp20463 = _mm512_shuffle_f32x4(tmp20521, tmp20529, 136);
tmp20468 = _mm512_shuffle_f32x4(tmp20521, tmp20529, 221);
tmp20465 = _mm512_shuffle_f32x4(tmp20523, tmp20531, 136);
in2673 = _mm512_shuffle_f32x4(tmp20525, tmp20533, 136);
tmp20462 = _mm512_shuffle_f32x4(tmp20520, tmp20528, 136);
in2669 = _mm512_shuffle_f32x4(tmp20522, tmp20530, 136);
in2671 = _mm512_shuffle_f32x4(tmp20524, tmp20532, 136);
in2670 = _mm512_shuffle_f32x4(tmp20526, tmp20534, 136);
// tmp20464 is not read by the second pass; the cast presumably exists to
// suppress an unused-value diagnostic.
(void)tmp20464;
// Second pass: same coefficient pattern as the first pass, applied to the
// rearranged vectors.
__m512 tmp20474 = _mm512_add_ps(tmp20463, in2669);
__m512 tmp20479 = tmp20468;
__m512 tmp20475 = _mm512_sub_ps(tmp20462, tmp20465);
__m512 tmp20476 = _mm512_add_ps(tmp20465, in2671);
__m512 tmp20477 = _mm512_sub_ps(_mm512_setzero_ps(), in2671);
in2675 = in2675;
tmp20474 = _mm512_fmadd_ps(in2673, _mm512_set1_ps(-4.25e+00f), tmp20474);
tmp20479 = tmp20479;
tmp20476 = _mm512_fmadd_ps(tmp20462, _mm512_set1_ps(-4.25e+00f), tmp20476);
tmp20477 = _mm512_fmadd_ps(tmp20475, _mm512_set1_ps(5.25e+00f), tmp20477);
in2675 = in2675;
tmp20475 = _mm512_fmadd_ps(tmp20465, _mm512_set1_ps(2.5e-01f), in2671);
tmp20465 = _mm512_fmadd_ps(tmp20465, _mm512_set1_ps(4e+00f), in2671);
__m512 tmp20478 = _mm512_sub_ps(tmp20476, tmp20474);
__m512 tmp20480 = _mm512_sub_ps(_mm512_setzero_ps(), tmp20479);
tmp20476 = _mm512_add_ps(tmp20474, tmp20476);
__m512 tmp20481 = tmp20479;
tmp20474 = _mm512_fmadd_ps(tmp20463, _mm512_set1_ps(2.5e-01f), in2669);
tmp20479 = _mm512_mul_ps(tmp20468, _mm512_set1_ps(2.5e-01f));
tmp20475 = _mm512_fmadd_ps(tmp20462, _mm512_set1_ps(-1.25e+00f), tmp20475);
tmp20462 = _mm512_fmadd_ps(tmp20462, _mm512_set1_ps(-5e+00f), tmp20465);
tmp20474 = _mm512_fmadd_ps(in2673, _mm512_set1_ps(-1.25e+00f), tmp20474);
tmp20479 = tmp20479;
in2671 = _mm512_fmadd_ps(tmp20474, _mm512_set1_ps(2e+00f), tmp20475);
__m512 tmp20482 = _mm512_mul_ps(tmp20479, _mm512_set1_ps(2e+00f));
tmp20475 = _mm512_fnmadd_ps(tmp20474, _mm512_set1_ps(2e+00f), tmp20475);
__m512 tmp20483 = _mm512_fnmadd_ps(tmp20479, _mm512_set1_ps(2e+00f), _mm512_setzero_ps());
tmp20474 = _mm512_fmadd_ps(in2669, _mm512_set1_ps(2.5e-01f), tmp20463);
tmp20479 = tmp20468;
tmp20463 = _mm512_sub_ps(in2670, tmp20463);
tmp20468 = _mm512_sub_ps(_mm512_setzero_ps(), tmp20468);
tmp20474 = _mm512_fmadd_ps(in2673, _mm512_set1_ps(-1.25e+00f), tmp20474);
tmp20479 = tmp20479;
in2673 = _mm512_sub_ps(in2673, in2669);
in2673 = _mm512_fmadd_ps(in2673, _mm512_set1_ps(5.25e+00f), tmp20463);
__m512 tmp20484 = tmp20468;
tmp20465 = _mm512_fmadd_ps(tmp20474, _mm512_set1_ps(2e+00f), tmp20462);
__m512 tmp20485 = _mm512_mul_ps(tmp20479, _mm512_set1_ps(2e+00f));
tmp20462 = _mm512_fnmadd_ps(tmp20474, _mm512_set1_ps(2e+00f), tmp20462);
__m512 tmp20486 = _mm512_fnmadd_ps(tmp20479, _mm512_set1_ps(2e+00f), _mm512_setzero_ps());
// Immediate 68 joins the lower 256-bit halves of the two operands, 238 joins
// the upper halves.
__m512 out2486 = _mm512_shuffle_f32x4(tmp20477, tmp20476, 68);
__m512 out2494 = _mm512_shuffle_f32x4(tmp20477, tmp20476, 238);
__m512 out2487 = _mm512_shuffle_f32x4(tmp20478, in2671, 68);
__m512 out2495 = _mm512_shuffle_f32x4(tmp20478, in2671, 238);
__m512 out2488 = _mm512_shuffle_f32x4(tmp20475, tmp20465, 68);
__m512 out2496 = _mm512_shuffle_f32x4(tmp20475, tmp20465, 238);
__m512 out2489 = _mm512_shuffle_f32x4(tmp20462, in2673, 68);
__m512 out2497 = _mm512_shuffle_f32x4(tmp20462, in2673, 238);
__m512 out2490 = _mm512_shuffle_f32x4(in2675, tmp20481, 68);
__m512 out2498 = _mm512_shuffle_f32x4(in2675, tmp20481, 238);
__m512 out2491 = _mm512_shuffle_f32x4(tmp20480, tmp20482, 68);
__m512 out2499 = _mm512_shuffle_f32x4(tmp20480, tmp20482, 238);
__m512 out2492 = _mm512_shuffle_f32x4(tmp20483, tmp20485, 68);
__m512 out2500 = _mm512_shuffle_f32x4(tmp20483, tmp20485, 238);
__m512 out2493 = _mm512_shuffle_f32x4(tmp20486, tmp20484, 68);
__m512 out2501 = _mm512_shuffle_f32x4(tmp20486, tmp20484, 238);
// Sixteen unmasked 64-byte stores: four regions 131072 bytes apart, each
// receiving four consecutive vectors (offsets 0, 64, 128, 192) at
// 32768*s72 + 256*k194.
_mm512_storeu_ps(dfPtr18+0+524288*i85+196608*j76+32768*s72+256*k194, out2486);
_mm512_storeu_ps(dfPtr18+128+524288*i85+196608*j76+32768*s72+256*k194, out2494);
_mm512_storeu_ps(dfPtr18+64+524288*i85+196608*j76+32768*s72+256*k194, out2490);
_mm512_storeu_ps(dfPtr18+192+524288*i85+196608*j76+32768*s72+256*k194, out2498);
_mm512_storeu_ps(dfPtr18+131072+524288*i85+196608*j76+32768*s72+256*k194, out2487);
_mm512_storeu_ps(dfPtr18+131200+524288*i85+196608*j76+32768*s72+256*k194, out2495);
_mm512_storeu_ps(dfPtr18+131136+524288*i85+196608*j76+32768*s72+256*k194, out2491);
_mm512_storeu_ps(dfPtr18+131264+524288*i85+196608*j76+32768*s72+256*k194, out2499);
_mm512_storeu_ps(dfPtr18+262144+524288*i85+196608*j76+32768*s72+256*k194, out2488);
_mm512_storeu_ps(dfPtr18+262272+524288*i85+196608*j76+32768*s72+256*k194, out2496);
_mm512_storeu_ps(dfPtr18+262208+524288*i85+196608*j76+32768*s72+256*k194, out2492);
_mm512_storeu_ps(dfPtr18+262336+524288*i85+196608*j76+32768*s72+256*k194, out2500);
_mm512_storeu_ps(dfPtr18+393216+524288*i85+196608*j76+32768*s72+256*k194, out2489);
_mm512_storeu_ps(dfPtr18+393344+524288*i85+196608*j76+32768*s72+256*k194, out2497);
_mm512_storeu_ps(dfPtr18+393280+524288*i85+196608*j76+32768*s72+256*k194, out2493);
_mm512_storeu_ps(dfPtr18+393408+524288*i85+196608*j76+32768*s72+256*k194, out2501);
}
// j76 is a local that is not read after this point; the increment has no
// observable effect.
++j76;
}

// Launches ResNet50ThreeArrangeDats8Callee1 on the given team.
// The task is described as a 4-dimensional grid with extents 4 x 1 x 1 x 1,
// and the tensor pointer table is passed to the callee through the any1 field.
static void ResNet50ThreeArrangeDats8(ResNet50ThreaderTeam1* team84, char** tensors143) {
    ResNet50ThreaderTask1 job = {
        .callee1 = ResNet50ThreeArrangeDats8Callee1,
        .any1 = tensors143,
        .nd1 = 4,
        .hull1 = {4, 1, 1, 1},
    };
    ResNet50ThreaderDo1(team84, &job);
}

// Task callee that accumulates lane-wise products of converted filter values
// and data vectors for one (pt80[0], pt80[2]) task coordinate.
//
// task150->any1 points to a pair; element [0] of the pair is a table of byte
// pointers: [0] holds 2048 bytes of biases followed by 16-bit-float filter
// data, [1] holds the float data vectors, [2] receives the sums.
// pt80[0] (w84 / l85) selects the filter block and the output slot;
// pt80[2] (f51 / j77) selects one of the 8388608-byte filter regions and the
// matching 131072-byte data/sum regions.
static void ResNet50ThreeProduceSums8Callee1(ResNet50ThreaderTask1* task150, int64_t* pt80) {
void** pair42 = task150->any1;
char** tensors146 = pair42[0];
// e46, g50 and d31 are fixed at zero, so i86 and k195 are zero and their
// terms drop out of every address below.
ptrdiff_t e46 = 0;
ptrdiff_t g50 = 0;
ptrdiff_t f51 = pt80[2];
ptrdiff_t d31 = 0;
ptrdiff_t w84 = pt80[0];
char*restrict bfPtr19 = tensors146[0]+2048*e46;
char*restrict wfPtr19 = tensors146[0]+2048+32505856*e46;
char*restrict dfPtr19 = tensors146[1]+507904*e46;
char*restrict sfPtr18 = tensors146[2];
ptrdiff_t i86 = 1*g50;
ptrdiff_t j77 = 1*f51;
ptrdiff_t k195 = 1*d31;
ptrdiff_t l85 = 1*w84;
ptrdiff_t ll14 = l85+0;
// ll14 equals the starting l85, so the "return" at the bottom of the loop body
// fires at the end of the first pass: one l85 per call.
for (; l85 != 128; ++l85) {
__m512 sum821;
__m512 sum825;
__m512 sum829;
__m512 sum833;
// Only when j77 == 0 (hinted as the unlikely case) are the accumulators
// seeded with a bias: mask 512 has only bit 9 set, so the bias value lands in
// lane 9 and every other lane starts at zero.
if (__builtin_expect(!j77, 0)) {
sum821 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr19+0+2048*i86+16*l85)));
sum825 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr19+4+2048*i86+16*l85)));
sum829 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr19+8+2048*i86+16*l85)));
sum833 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr19+12+2048*i86+16*l85)));
} else {
sum821 = _mm512_setzero_ps();
sum825 = _mm512_setzero_ps();
sum829 = _mm512_setzero_ps();
sum833 = _mm512_setzero_ps();
}
// Four accumulators per bias (one per data vector), sixteen in total, each
// starting from the same seed as the first of its group.
__m512 sum822 = sum821;
__m512 sum823 = sum821;
__m512 sum824 = sum821;
__m512 sum826 = sum825;
__m512 sum827 = sum825;
__m512 sum828 = sum825;
__m512 sum830 = sum829;
__m512 sum831 = sum829;
__m512 sum832 = sum829;
__m512 sum834 = sum833;
__m512 sum835 = sum833;
__m512 sum836 = sum833;
ptrdiff_t b77 = 0;
// 512 passes. Each pass reads 128 bytes of 16-bit filter values (widened to
// four float vectors) and four 64-byte data vectors, then performs the 4 x 4
// grid of lane-wise fused multiply-adds.
for (; b77 != 512; ++b77) {
// Mask 65535 loads all sixteen 32-bit lanes; the low and high 256-bit halves
// are widened separately to sixteen floats each.
__m512i wfs43 = _mm512_maskz_loadu_epi32(65535, wfPtr19+0+33554432*i86+8388608*j77+65536*l85+128*b77);
__m512 wf213 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs43));
__m512 df697 = _mm512_loadu_ps(dfPtr19+0+524288*i86+131072*j77+196608*k195+256*b77);
sum821 = _mm512_fmadd_ps(wf213, df697, sum821);
__m512 df698 = _mm512_loadu_ps(dfPtr19+64+524288*i86+131072*j77+196608*k195+256*b77);
sum822 = _mm512_fmadd_ps(wf213, df698, sum822);
__m512 df699 = _mm512_loadu_ps(dfPtr19+128+524288*i86+131072*j77+196608*k195+256*b77);
sum823 = _mm512_fmadd_ps(wf213, df699, sum823);
__m512 df700 = _mm512_loadu_ps(dfPtr19+192+524288*i86+131072*j77+196608*k195+256*b77);
sum824 = _mm512_fmadd_ps(wf213, df700, sum824);
__m512 wf214 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs43, 1));
sum825 = _mm512_fmadd_ps(wf214, df697, sum825);
sum826 = _mm512_fmadd_ps(wf214, df698, sum826);
sum827 = _mm512_fmadd_ps(wf214, df699, sum827);
sum828 = _mm512_fmadd_ps(wf214, df700, sum828);
__m512i wfs44 = _mm512_maskz_loadu_epi32(65535, wfPtr19+64+33554432*i86+8388608*j77+65536*l85+128*b77);
__m512 wf215 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs44));
sum829 = _mm512_fmadd_ps(wf215, df697, sum829);
sum830 = _mm512_fmadd_ps(wf215, df698, sum830);
sum831 = _mm512_fmadd_ps(wf215, df699, sum831);
sum832 = _mm512_fmadd_ps(wf215, df700, sum832);
__m512 wf216 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs44, 1));
sum833 = _mm512_fmadd_ps(wf216, df697, sum833);
sum834 = _mm512_fmadd_ps(wf216, df698, sum834);
sum835 = _mm512_fmadd_ps(wf216, df699, sum835);
sum836 = _mm512_fmadd_ps(wf216, df700, sum836);
}
// Write the sixteen accumulators back to back (64 bytes each, 1024 bytes per
// l85) into the region selected by j77.
_mm512_storeu_ps(sfPtr18+0+524288*i86+131072*j77+196608*k195+1024*l85, sum821);
_mm512_storeu_ps(sfPtr18+64+524288*i86+131072*j77+196608*k195+1024*l85, sum822);
_mm512_storeu_ps(sfPtr18+128+524288*i86+131072*j77+196608*k195+1024*l85, sum823);
_mm512_storeu_ps(sfPtr18+192+524288*i86+131072*j77+196608*k195+1024*l85, sum824);
_mm512_storeu_ps(sfPtr18+256+524288*i86+131072*j77+196608*k195+1024*l85, sum825);
_mm512_storeu_ps(sfPtr18+320+524288*i86+131072*j77+196608*k195+1024*l85, sum826);
_mm512_storeu_ps(sfPtr18+384+524288*i86+131072*j77+196608*k195+1024*l85, sum827);
_mm512_storeu_ps(sfPtr18+448+524288*i86+131072*j77+196608*k195+1024*l85, sum828);
_mm512_storeu_ps(sfPtr18+512+524288*i86+131072*j77+196608*k195+1024*l85, sum829);
_mm512_storeu_ps(sfPtr18+576+524288*i86+131072*j77+196608*k195+1024*l85, sum830);
_mm512_storeu_ps(sfPtr18+640+524288*i86+131072*j77+196608*k195+1024*l85, sum831);
_mm512_storeu_ps(sfPtr18+704+524288*i86+131072*j77+196608*k195+1024*l85, sum832);
_mm512_storeu_ps(sfPtr18+768+524288*i86+131072*j77+196608*k195+1024*l85, sum833);
_mm512_storeu_ps(sfPtr18+832+524288*i86+131072*j77+196608*k195+1024*l85, sum834);
_mm512_storeu_ps(sfPtr18+896+524288*i86+131072*j77+196608*k195+1024*l85, sum835);
_mm512_storeu_ps(sfPtr18+960+524288*i86+131072*j77+196608*k195+1024*l85, sum836);
if (l85 >= ll14) return;
}
}

// Launches ResNet50ThreeProduceSums8Callee1 over a 4-dimensional
// iteration space with extents 128 x 1 x 4 x 1.
//
// team85:     threader team passed straight through to ResNet50ThreaderDo1.
// tensors145: tensor pointer array; it is wrapped, together with a null
//             second slot, in a two-element array that the callee receives
//             through the task's any1 field.
//
// The argument bundle and the task both live on this function's stack, so
// this relies on ResNet50ThreaderDo1 not using them after it returns
// (NOTE(review): presumably the call is synchronous -- confirm in the
// threader implementation).
static void ResNet50ThreeProduceSums8(ResNet50ThreaderTeam1* team85, char** tensors145) {
    void* calleeArgs[] = {tensors145, 0};
    ResNet50ThreaderTask1 job;

    // Iteration-space description: rank first, then extents.
    job.nd1 = 4;
    job.hull1[3] = 1;
    job.hull1[2] = 4;
    job.hull1[1] = 1;
    job.hull1[0] = 128;

    // Payload and entry point.
    job.any1 = calleeArgs;
    job.callee1 = ResNet50ThreeProduceSums8Callee1;

    ResNet50ThreaderDo1(team85, &job);
}

// Thread callee for the "consume sums" stage.
//
// task152->any1 is interpreted as a char** tensor array:
//   tensors148[0] (sfPtr19)  - read only in this function (accumulated sums).
//   tensors148[1] (datPtr49) - written only in this function (output data).
// pt81[0] (w85) selects a slice of 32 consecutive k196 values,
// [32*w85, 32*w85+31]; for every k196 the inner loop visits l86 = 0..3.
//
// For each (k196, l86) the function:
//   1. loads 16 vectors from sfPtr19 (four 64-byte vectors at each of the
//      four base offsets 0, 131072, 262144, 393216) and regroups their
//      256-bit halves into in2677..in2692;
//   2. runs an add/sub/fmadd network with coefficients 2, 4, 8, 16, 32;
//   3. transposes the seven resulting vectors in registers;
//   4. runs the same network shape a second time;
//   5. clamps the seven results at zero from below and stores the low
//      7 floats of each (mask 127) at 28-byte steps into datPtr49.
// NOTE(review): the coefficient pattern looks like a Winograd-style output
// transform followed by ReLU -- confirm against the generator.
static void ResNet50ThreeConsumeSums8Callee1(ResNet50ThreaderTask1* task152, int64_t* pt81) {
char** tensors148 = task152->any1;
ptrdiff_t w85 = pt81[0];
// d32 and g51 are fixed at zero here, so i87, j78, base30, toH53 and toW53
// below all evaluate to 0 and contribute nothing to the addresses.
ptrdiff_t d32 = 0;
ptrdiff_t g51 = 0;
char*restrict sfPtr19 = tensors148[0];
char*restrict datPtr49 = tensors148[1];
ptrdiff_t i87 = 1*g51;
ptrdiff_t j78 = 1*d32;
// rel30 is not referenced again in this function.
ptrdiff_t rel30 = j78-0;
ptrdiff_t base30 = 0;
ptrdiff_t toH53 = base30+0;
ptrdiff_t toW53 = 0;
// First and last k196 handled by this call.
ptrdiff_t k196 = 32*w85;
ptrdiff_t kk72 = k196+31;
for (; k196 != 128; ++k196) {
ptrdiff_t l86 = 0;
for (; l86 != 4; ++l86) {
// Load phase. Each pair of loads is 128 bytes apart; shuffle_f32x4 with
// imm 68 keeps the low 256 bits of both operands, imm 238 the high 256 bits.
__m512 sf1425 = _mm512_loadu_ps(sfPtr19+0+524288*i87+196608*j78+1024*k196+256*l86);
__m512 sf1426 = _mm512_loadu_ps(sfPtr19+128+524288*i87+196608*j78+1024*k196+256*l86);
__m512 in2677 = _mm512_shuffle_f32x4(sf1425, sf1426, 68);
__m512 in2678 = _mm512_shuffle_f32x4(sf1425, sf1426, 238);
__m512 sf1427 = _mm512_loadu_ps(sfPtr19+64+524288*i87+196608*j78+1024*k196+256*l86);
__m512 sf1428 = _mm512_loadu_ps(sfPtr19+192+524288*i87+196608*j78+1024*k196+256*l86);
__m512 in2685 = _mm512_shuffle_f32x4(sf1427, sf1428, 68);
__m512 in2686 = _mm512_shuffle_f32x4(sf1427, sf1428, 238);
__m512 sf1429 = _mm512_loadu_ps(sfPtr19+131072+524288*i87+196608*j78+1024*k196+256*l86);
__m512 sf1430 = _mm512_loadu_ps(sfPtr19+131200+524288*i87+196608*j78+1024*k196+256*l86);
__m512 in2679 = _mm512_shuffle_f32x4(sf1429, sf1430, 68);
__m512 in2680 = _mm512_shuffle_f32x4(sf1429, sf1430, 238);
__m512 sf1431 = _mm512_loadu_ps(sfPtr19+131136+524288*i87+196608*j78+1024*k196+256*l86);
__m512 sf1432 = _mm512_loadu_ps(sfPtr19+131264+524288*i87+196608*j78+1024*k196+256*l86);
__m512 in2687 = _mm512_shuffle_f32x4(sf1431, sf1432, 68);
__m512 in2688 = _mm512_shuffle_f32x4(sf1431, sf1432, 238);
__m512 sf1433 = _mm512_loadu_ps(sfPtr19+262144+524288*i87+196608*j78+1024*k196+256*l86);
__m512 sf1434 = _mm512_loadu_ps(sfPtr19+262272+524288*i87+196608*j78+1024*k196+256*l86);
__m512 in2681 = _mm512_shuffle_f32x4(sf1433, sf1434, 68);
__m512 in2682 = _mm512_shuffle_f32x4(sf1433, sf1434, 238);
__m512 sf1435 = _mm512_loadu_ps(sfPtr19+262208+524288*i87+196608*j78+1024*k196+256*l86);
__m512 sf1436 = _mm512_loadu_ps(sfPtr19+262336+524288*i87+196608*j78+1024*k196+256*l86);
__m512 in2689 = _mm512_shuffle_f32x4(sf1435, sf1436, 68);
__m512 in2690 = _mm512_shuffle_f32x4(sf1435, sf1436, 238);
__m512 sf1437 = _mm512_loadu_ps(sfPtr19+393216+524288*i87+196608*j78+1024*k196+256*l86);
__m512 sf1438 = _mm512_loadu_ps(sfPtr19+393344+524288*i87+196608*j78+1024*k196+256*l86);
__m512 in2683 = _mm512_shuffle_f32x4(sf1437, sf1438, 68);
__m512 in2684 = _mm512_shuffle_f32x4(sf1437, sf1438, 238);
__m512 sf1439 = _mm512_loadu_ps(sfPtr19+393280+524288*i87+196608*j78+1024*k196+256*l86);
__m512 sf1440 = _mm512_loadu_ps(sfPtr19+393408+524288*i87+196608*j78+1024*k196+256*l86);
__m512 in2691 = _mm512_shuffle_f32x4(sf1439, sf1440, 68);
__m512 in2692 = _mm512_shuffle_f32x4(sf1439, sf1440, 238);
// in2692 is never used; the cast only silences the unused-variable warning.
(void)in2692;
// First pass of the linear combination network.
// From in2677..in2684, six results are formed (tmp20542, tmp20548,
// tmp20553, tmp20555, tmp20557, tmp20559). From in2685..in2691 only one
// result is formed (tmp20562), using the same formula as tmp20542.
__m512 tmp20546 = _mm512_add_ps(in2678, in2679);
__m512 tmp20566 = _mm512_add_ps(in2686, in2687);
__m512 tmp20545 = _mm512_add_ps(in2680, in2681);
__m512 tmp20565 = _mm512_add_ps(in2688, in2689);
__m512 tmp20551 = _mm512_sub_ps(in2680, in2681);
__m512 tmp20550 = _mm512_sub_ps(in2678, in2679);
__m512 tmp20547 = _mm512_add_ps(in2682, in2683);
__m512 tmp20567 = _mm512_add_ps(in2690, in2691);
__m512 tmp20552 = _mm512_sub_ps(in2682, in2683);
__m512 tmp20549 = _mm512_fmadd_ps(tmp20551, _mm512_set1_ps(2e+00f), tmp20550);
__m512 tmp20556 = _mm512_fmadd_ps(tmp20551, _mm512_set1_ps(8e+00f), tmp20550);
__m512 tmp20544 = _mm512_add_ps(tmp20545, tmp20546);
__m512 tmp20564 = _mm512_add_ps(tmp20565, tmp20566);
__m512 tmp20548 = _mm512_fmadd_ps(tmp20552, _mm512_set1_ps(1.6e+01f), tmp20549);
__m512 tmp20555 = _mm512_fmadd_ps(tmp20552, _mm512_set1_ps(4e+00f), tmp20556);
__m512 tmp20561 = _mm512_add_ps(tmp20552, tmp20550);
__m512 tmp20554 = _mm512_fmadd_ps(tmp20545, _mm512_set1_ps(4e+00f), tmp20546);
__m512 tmp20558 = _mm512_fmadd_ps(tmp20545, _mm512_set1_ps(1.6e+01f), tmp20546);
__m512 tmp20543 = _mm512_add_ps(tmp20544, in2677);
__m512 tmp20563 = _mm512_add_ps(tmp20564, in2685);
__m512 tmp20560 = _mm512_add_ps(tmp20561, in2684);
__m512 tmp20542 = _mm512_fmadd_ps(tmp20547, _mm512_set1_ps(3.2e+01f), tmp20543);
__m512 tmp20562 = _mm512_fmadd_ps(tmp20567, _mm512_set1_ps(3.2e+01f), tmp20563);
__m512 tmp20553 = _mm512_fmadd_ps(tmp20547, _mm512_set1_ps(8e+00f), tmp20554);
__m512 tmp20559 = _mm512_fmadd_ps(tmp20551, _mm512_set1_ps(3.2e+01f), tmp20560);
__m512 tmp20557 = _mm512_fmadd_ps(tmp20547, _mm512_set1_ps(2e+00f), tmp20558);
// Collect the seven first-pass results under consecutive names.
__m512 tmp20535 = tmp20542;
__m512 tmp20541 = tmp20562;
__m512 tmp20536 = tmp20548;
__m512 tmp20537 = tmp20553;
__m512 tmp20538 = tmp20555;
__m512 tmp20539 = tmp20557;
__m512 tmp20540 = tmp20559;
// In-register transpose of the seven vectors: 32-bit unpack, then 64-bit
// shuffle_ps within 128-bit lanes, then 128-bit shuffle_f32x4 across lanes.
// tmp20541 is paired with itself because there is no eighth input vector.
__m512 tmp20603 = _mm512_unpacklo_ps(tmp20535, tmp20536);
__m512 tmp20604 = _mm512_unpackhi_ps(tmp20535, tmp20536);
__m512 tmp20605 = _mm512_unpacklo_ps(tmp20537, tmp20538);
__m512 tmp20606 = _mm512_unpackhi_ps(tmp20537, tmp20538);
__m512 tmp20607 = _mm512_unpacklo_ps(tmp20539, tmp20540);
__m512 tmp20608 = _mm512_unpackhi_ps(tmp20539, tmp20540);
__m512 tmp20609 = _mm512_unpacklo_ps(tmp20541, tmp20541);
__m512 tmp20610 = _mm512_unpackhi_ps(tmp20541, tmp20541);
__m512 tmp20611 = _mm512_shuffle_ps(tmp20603, tmp20605, 68);
__m512 tmp20612 = _mm512_shuffle_ps(tmp20603, tmp20605, 238);
__m512 tmp20613 = _mm512_shuffle_ps(tmp20604, tmp20606, 68);
__m512 tmp20614 = _mm512_shuffle_ps(tmp20604, tmp20606, 238);
__m512 tmp20615 = _mm512_shuffle_ps(tmp20607, tmp20609, 68);
__m512 tmp20616 = _mm512_shuffle_ps(tmp20607, tmp20609, 238);
__m512 tmp20617 = _mm512_shuffle_ps(tmp20608, tmp20610, 68);
__m512 tmp20618 = _mm512_shuffle_ps(tmp20608, tmp20610, 238);
// imm 136 selects 128-bit lanes 0 and 2 of each operand, imm 221 lanes 1 and 3.
__m512 tmp20619 = _mm512_shuffle_f32x4(tmp20611, tmp20615, 136);
__m512 tmp20620 = _mm512_shuffle_f32x4(tmp20611, tmp20615, 221);
__m512 tmp20621 = _mm512_shuffle_f32x4(tmp20612, tmp20616, 136);
__m512 tmp20622 = _mm512_shuffle_f32x4(tmp20612, tmp20616, 221);
__m512 tmp20623 = _mm512_shuffle_f32x4(tmp20613, tmp20617, 136);
__m512 tmp20624 = _mm512_shuffle_f32x4(tmp20613, tmp20617, 221);
__m512 tmp20625 = _mm512_shuffle_f32x4(tmp20614, tmp20618, 136);
__m512 tmp20626 = _mm512_shuffle_f32x4(tmp20614, tmp20618, 221);
// Split each transposed vector into two: the imm-136 results reuse the
// names tmp20535..tmp20541 (plus tmp20568), the imm-221 results become
// tmp20569..tmp20576.
tmp20535 = _mm512_shuffle_f32x4(tmp20619, tmp20619, 136);
__m512 tmp20569 = _mm512_shuffle_f32x4(tmp20619, tmp20619, 221);
tmp20536 = _mm512_shuffle_f32x4(tmp20621, tmp20621, 136);
__m512 tmp20570 = _mm512_shuffle_f32x4(tmp20621, tmp20621, 221);
tmp20537 = _mm512_shuffle_f32x4(tmp20623, tmp20623, 136);
__m512 tmp20571 = _mm512_shuffle_f32x4(tmp20623, tmp20623, 221);
tmp20538 = _mm512_shuffle_f32x4(tmp20625, tmp20625, 136);
__m512 tmp20572 = _mm512_shuffle_f32x4(tmp20625, tmp20625, 221);
tmp20539 = _mm512_shuffle_f32x4(tmp20620, tmp20620, 136);
__m512 tmp20573 = _mm512_shuffle_f32x4(tmp20620, tmp20620, 221);
tmp20540 = _mm512_shuffle_f32x4(tmp20622, tmp20622, 136);
__m512 tmp20574 = _mm512_shuffle_f32x4(tmp20622, tmp20622, 221);
tmp20541 = _mm512_shuffle_f32x4(tmp20624, tmp20624, 136);
__m512 tmp20575 = _mm512_shuffle_f32x4(tmp20624, tmp20624, 221);
__m512 tmp20568 = _mm512_shuffle_f32x4(tmp20626, tmp20626, 136);
__m512 tmp20576 = _mm512_shuffle_f32x4(tmp20626, tmp20626, 221);
// tmp20576 is never used; the cast only silences the unused-variable warning.
(void)tmp20576;
// Second pass: same network shape as the first pass, now applied to the
// transposed vectors. Again six results come from the first group
// (tmp20535..tmp20541, tmp20568) and one (tmp20597) from the second group
// (tmp20569..tmp20575).
__m512 tmp20581 = _mm512_add_ps(tmp20536, tmp20537);
__m512 tmp20601 = _mm512_add_ps(tmp20570, tmp20571);
__m512 tmp20580 = _mm512_add_ps(tmp20538, tmp20539);
__m512 tmp20600 = _mm512_add_ps(tmp20572, tmp20573);
__m512 tmp20586 = _mm512_sub_ps(tmp20538, tmp20539);
__m512 tmp20585 = _mm512_sub_ps(tmp20536, tmp20537);
__m512 tmp20582 = _mm512_add_ps(tmp20540, tmp20541);
__m512 tmp20602 = _mm512_add_ps(tmp20574, tmp20575);
__m512 tmp20587 = _mm512_sub_ps(tmp20540, tmp20541);
__m512 tmp20584 = _mm512_fmadd_ps(tmp20586, _mm512_set1_ps(2e+00f), tmp20585);
__m512 tmp20591 = _mm512_fmadd_ps(tmp20586, _mm512_set1_ps(8e+00f), tmp20585);
__m512 tmp20579 = _mm512_add_ps(tmp20580, tmp20581);
__m512 tmp20599 = _mm512_add_ps(tmp20600, tmp20601);
__m512 tmp20583 = _mm512_fmadd_ps(tmp20587, _mm512_set1_ps(1.6e+01f), tmp20584);
__m512 tmp20590 = _mm512_fmadd_ps(tmp20587, _mm512_set1_ps(4e+00f), tmp20591);
__m512 tmp20596 = _mm512_add_ps(tmp20587, tmp20585);
__m512 tmp20589 = _mm512_fmadd_ps(tmp20580, _mm512_set1_ps(4e+00f), tmp20581);
__m512 tmp20593 = _mm512_fmadd_ps(tmp20580, _mm512_set1_ps(1.6e+01f), tmp20581);
__m512 tmp20578 = _mm512_add_ps(tmp20579, tmp20535);
__m512 tmp20598 = _mm512_add_ps(tmp20599, tmp20569);
__m512 tmp20595 = _mm512_add_ps(tmp20596, tmp20568);
__m512 tmp20577 = _mm512_fmadd_ps(tmp20582, _mm512_set1_ps(3.2e+01f), tmp20578);
__m512 tmp20597 = _mm512_fmadd_ps(tmp20602, _mm512_set1_ps(3.2e+01f), tmp20598);
__m512 tmp20588 = _mm512_fmadd_ps(tmp20582, _mm512_set1_ps(8e+00f), tmp20589);
__m512 tmp20594 = _mm512_fmadd_ps(tmp20586, _mm512_set1_ps(3.2e+01f), tmp20595);
__m512 tmp20592 = _mm512_fmadd_ps(tmp20582, _mm512_set1_ps(2e+00f), tmp20593);
__m512 out2502 = tmp20577;
__m512 out2508 = tmp20597;
__m512 out2503 = tmp20583;
__m512 out2504 = tmp20588;
__m512 out2505 = tmp20590;
__m512 out2506 = tmp20592;
__m512 out2507 = tmp20594;
// Clamp every output at zero from below (element-wise max with 0).
out2502 = _mm512_max_ps(_mm512_setzero_ps(), out2502);
out2508 = _mm512_max_ps(_mm512_setzero_ps(), out2508);
out2503 = _mm512_max_ps(_mm512_setzero_ps(), out2503);
out2504 = _mm512_max_ps(_mm512_setzero_ps(), out2504);
out2505 = _mm512_max_ps(_mm512_setzero_ps(), out2505);
out2506 = _mm512_max_ps(_mm512_setzero_ps(), out2506);
out2507 = _mm512_max_ps(_mm512_setzero_ps(), out2507);
// Store the low 7 floats (mask 127) of each output. The seven stores land
// at byte offsets 0, 28, ..., 168 from the (k196, l86) base, i.e. 196
// contiguous bytes; out2508 takes the last slot (offset 168).
_mm512_mask_storeu_ps(datPtr49+0+163840*i87+28*toH53+4*toW53+1280*k196+320*l86, 127, out2502);
_mm512_mask_storeu_ps(datPtr49+168+163840*i87+28*toH53+4*toW53+1280*k196+320*l86, 127, out2508);
_mm512_mask_storeu_ps(datPtr49+28+163840*i87+28*toH53+4*toW53+1280*k196+320*l86, 127, out2503);
_mm512_mask_storeu_ps(datPtr49+56+163840*i87+28*toH53+4*toW53+1280*k196+320*l86, 127, out2504);
_mm512_mask_storeu_ps(datPtr49+84+163840*i87+28*toH53+4*toW53+1280*k196+320*l86, 127, out2505);
_mm512_mask_storeu_ps(datPtr49+112+163840*i87+28*toH53+4*toW53+1280*k196+320*l86, 127, out2506);
_mm512_mask_storeu_ps(datPtr49+140+163840*i87+28*toH53+4*toW53+1280*k196+320*l86, 127, out2507);
}
// Stop once the last k196 of this slice has been processed.
if (k196 >= kk72) return;
}
// Reached only if the loop above runs to k196 == 128 without taking the
// return; j78 is not read afterwards, so this increment has no visible effect.
++j78;
}

// Launches ResNet50ThreeConsumeSums8Callee1 over a 3-dimensional
// iteration space with extents 4 x 1 x 1.
//
// team86:     threader team passed straight through to ResNet50ThreaderDo1.
// tensors147: tensor pointer array, handed to the callee unchanged via the
//             task's any1 field (the callee reads entries 0 and 1).
//
// The callee only looks at the first coordinate of each point and turns it
// into a block of 32 consecutive indices, so the four points of the first
// dimension together cover indices 0..127.
static void ResNet50ThreeConsumeSums8(ResNet50ThreaderTeam1* team86, char** tensors147) {
    ResNet50ThreaderTask1 job;

    // Iteration-space description: rank first, then extents.
    job.nd1 = 3;
    job.hull1[2] = 1;
    job.hull1[1] = 1;
    job.hull1[0] = 4;

    // Payload and entry point.
    job.any1 = tensors147;
    job.callee1 = ResNet50ThreeConsumeSums8Callee1;

    ResNet50ThreaderDo1(team86, &job);
}

static void ResNet50StriderArrangeFilts1Callee1(ResNet50ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = 0;
ptrdiff_t g2 = 0;
ptrdiff_t e1 = 0;
(void)pt7;
char*restrict bfPtr1 = tensors2[3]+256*e1;
char*restrict wfPtr1 = tensors2[3]+256+12976128*e1;
char*restrict wtPtr1 = tensors2[0]+77616*e1;
char*restrict biasPtr1 = tensors2[1];
char*restrict bnPtr1 = tensors2[2];
ptrdiff_t i5 = 1*g2;
ptrdiff_t j1 = 32*b2;
if (j1 < 32) {
for (; j1 != 32; ++j1) {
__m512 postMul1 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+64*i5+2*j1))[0]);
__m512 postMul2 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(1+64*i5+2*j1))[0]);
for (ptrdiff_t k1 = 0; k1 < 3; ++k1) {
__m512 wt1 = _mm512_maskz_loadu_ps(127, wtPtr1+0+37632*i5+1176*j1+196*k1);
__m512 wt2 = _mm512_maskz_loadu_ps(127, wtPtr1+28+37632*i5+1176*j1+196*k1);
__m512 wt3 = _mm512_maskz_loadu_ps(127, wtPtr1+56+37632*i5+1176*j1+196*k1);
__m512 wt4 = _mm512_maskz_loadu_ps(127, wtPtr1+84+37632*i5+1176*j1+196*k1);
__m512 wt5 = _mm512_maskz_loadu_ps(127, wtPtr1+112+37632*i5+1176*j1+196*k1);
__m512 wt6 = _mm512_maskz_loadu_ps(127, wtPtr1+140+37632*i5+1176*j1+196*k1);
__m512 wt7 = _mm512_maskz_loadu_ps(127, wtPtr1+168+37632*i5+1176*j1+196*k1);
wt1 = _mm512_mul_ps(postMul1, wt1);
wt2 = _mm512_mul_ps(postMul1, wt2);
wt3 = _mm512_mul_ps(postMul1, wt3);
wt4 = _mm512_mul_ps(postMul1, wt4);
wt5 = _mm512_mul_ps(postMul1, wt5);
wt6 = _mm512_mul_ps(postMul1, wt6);
wt7 = _mm512_mul_ps(postMul1, wt7);
__m512 fft1 = _mm512_add_ps(wt1, _mm512_setzero_ps());
__m512 fft89 = _mm512_add_ps(wt2, _mm512_setzero_ps());
__m512 fft2 = _mm512_sub_ps(wt1, _mm512_setzero_ps());
__m512 fft90 = _mm512_sub_ps(wt2, _mm512_setzero_ps());
__m512 fft3 = _mm512_add_ps(wt3, _mm512_setzero_ps());
__m512 fft91 = _mm512_add_ps(wt4, _mm512_setzero_ps());
__m512 fft4 = _mm512_sub_ps(wt3, _mm512_setzero_ps());
__m512 fft92 = _mm512_sub_ps(wt4, _mm512_setzero_ps());
__m512 fft5 = _mm512_add_ps(wt5, _mm512_setzero_ps());
__m512 fft93 = _mm512_add_ps(wt6, _mm512_setzero_ps());
__m512 fft6 = _mm512_sub_ps(wt5, _mm512_setzero_ps());
__m512 fft94 = _mm512_sub_ps(wt6, _mm512_setzero_ps());
__m512 fft7 = _mm512_add_ps(wt7, _mm512_setzero_ps());
__m512 fft95 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft8 = _mm512_sub_ps(wt7, _mm512_setzero_ps());
__m512 fft96 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9 = _mm512_add_ps(fft1, fft5);
__m512 fft97 = _mm512_add_ps(fft89, fft93);
__m512 fft10 = _mm512_sub_ps(fft1, fft5);
__m512 fft98 = _mm512_sub_ps(fft89, fft93);
__m512 fft11 = _mm512_add_ps(fft3, fft7);
__m512 fft99 = _mm512_add_ps(fft91, fft95);
__m512 fft12 = _mm512_sub_ps(fft7, fft3);
__m512 fft100 = _mm512_sub_ps(fft95, fft91);
__m512 fft13 = _mm512_sub_ps(fft4, fft8);
__m512 fft101 = _mm512_sub_ps(fft92, fft96);
__m512 fft14 = _mm512_add_ps(fft4, fft8);
__m512 fft102 = _mm512_add_ps(fft92, fft96);
__m512 fft15 = _mm512_add_ps(fft9, fft11);
__m512 fft103 = _mm512_add_ps(fft97, fft99);
__m512 fft16 = _mm512_sub_ps(fft9, fft11);
__m512 fft104 = _mm512_sub_ps(fft97, fft99);
__m512 fft17 = _mm512_fmadd_ps(fft13, _mm512_set1_ps(7.0710677e-01f), fft2);
__m512 fft105 = _mm512_fmadd_ps(fft101, _mm512_set1_ps(7.0710677e-01f), fft90);
__m512 fft18 = _mm512_fnmsub_ps(fft14, _mm512_set1_ps(7.0710677e-01f), fft6);
__m512 fft106 = _mm512_fnmsub_ps(fft102, _mm512_set1_ps(7.0710677e-01f), fft94);
__m512 fft19 = _mm512_fnmadd_ps(fft13, _mm512_set1_ps(7.0710677e-01f), fft2);
__m512 fft107 = _mm512_fnmadd_ps(fft101, _mm512_set1_ps(7.0710677e-01f), fft90);
__m512 fft20 = _mm512_fnmadd_ps(fft14, _mm512_set1_ps(7.0710677e-01f), fft6);
__m512 fft108 = _mm512_fnmadd_ps(fft102, _mm512_set1_ps(7.0710677e-01f), fft94);
__m512 fft21 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft22 = _mm512_fmadd_ps(fft15, fft21, _mm512_shuffle_f32x4(fft15, fft15, 78));
__m512 fft109 = _mm512_fmadd_ps(fft103, fft21, _mm512_shuffle_f32x4(fft103, fft103, 78));
__m512 fft23 = _mm512_fmadd_ps(fft16, fft21, _mm512_shuffle_f32x4(fft16, fft16, 78));
__m512 fft110 = _mm512_fmadd_ps(fft104, fft21, _mm512_shuffle_f32x4(fft104, fft104, 78));
__m512 fft24 = _mm512_fmadd_ps(fft17, fft21, _mm512_shuffle_f32x4(fft17, fft17, 78));
__m512 fft111 = _mm512_fmadd_ps(fft105, fft21, _mm512_shuffle_f32x4(fft105, fft105, 78));
__m512 fft25 = _mm512_fmadd_ps(fft18, fft21, _mm512_shuffle_f32x4(fft18, fft18, 78));
__m512 fft112 = _mm512_fmadd_ps(fft106, fft21, _mm512_shuffle_f32x4(fft106, fft106, 78));
__m512 fft26 = _mm512_fmadd_ps(fft10, fft21, _mm512_shuffle_f32x4(fft10, fft10, 78));
__m512 fft113 = _mm512_fmadd_ps(fft98, fft21, _mm512_shuffle_f32x4(fft98, fft98, 78));
__m512 fft27 = _mm512_fmadd_ps(fft12, fft21, _mm512_shuffle_f32x4(fft12, fft12, 78));
__m512 fft114 = _mm512_fmadd_ps(fft100, fft21, _mm512_shuffle_f32x4(fft100, fft100, 78));
__m512 fft28 = _mm512_fmadd_ps(fft19, fft21, _mm512_shuffle_f32x4(fft19, fft19, 78));
__m512 fft115 = _mm512_fmadd_ps(fft107, fft21, _mm512_shuffle_f32x4(fft107, fft107, 78));
__m512 fft29 = _mm512_fmadd_ps(fft20, fft21, _mm512_shuffle_f32x4(fft20, fft20, 78));
__m512 fft116 = _mm512_fmadd_ps(fft108, fft21, _mm512_shuffle_f32x4(fft108, fft108, 78));
__m512 fft30 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft31 = _mm512_mul_ps(fft22, fft30);
__m512 fft117 = _mm512_mul_ps(fft109, fft30);
__m512 fft32 = _mm512_mul_ps(fft23, fft30);
__m512 fft118 = _mm512_mul_ps(fft110, fft30);
__m512 fft33 = _mm512_mul_ps(fft24, fft30);
__m512 fft119 = _mm512_mul_ps(fft111, fft30);
__m512 fft34 = _mm512_mul_ps(fft25, fft30);
__m512 fft120 = _mm512_mul_ps(fft112, fft30);
__m512 fft35 = _mm512_mul_ps(fft26, fft30);
__m512 fft121 = _mm512_mul_ps(fft113, fft30);
__m512 fft36 = _mm512_mul_ps(fft27, fft30);
__m512 fft122 = _mm512_mul_ps(fft114, fft30);
__m512 fft37 = _mm512_mul_ps(fft28, fft30);
__m512 fft123 = _mm512_mul_ps(fft115, fft30);
__m512 fft38 = _mm512_mul_ps(fft29, fft30);
__m512 fft124 = _mm512_mul_ps(fft116, fft30);
__m512 fft39 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft40 = _mm512_fmadd_ps(fft23, fft39, fft31);
__m512 fft125 = _mm512_fmadd_ps(fft110, fft39, fft117);
__m512 fft41 = _mm512_fnmadd_ps(fft22, fft39, fft32);
__m512 fft126 = _mm512_fnmadd_ps(fft109, fft39, fft118);
__m512 fft42 = _mm512_fmadd_ps(fft25, fft39, fft33);
__m512 fft127 = _mm512_fmadd_ps(fft112, fft39, fft119);
__m512 fft43 = _mm512_fnmadd_ps(fft24, fft39, fft34);
__m512 fft128 = _mm512_fnmadd_ps(fft111, fft39, fft120);
__m512 fft44 = _mm512_fmadd_ps(fft27, fft39, fft35);
__m512 fft129 = _mm512_fmadd_ps(fft114, fft39, fft121);
__m512 fft45 = _mm512_fnmadd_ps(fft26, fft39, fft36);
__m512 fft130 = _mm512_fnmadd_ps(fft113, fft39, fft122);
__m512 fft46 = _mm512_fmadd_ps(fft29, fft39, fft37);
__m512 fft131 = _mm512_fmadd_ps(fft116, fft39, fft123);
__m512 fft47 = _mm512_fnmadd_ps(fft28, fft39, fft38);
__m512 fft132 = _mm512_fnmadd_ps(fft115, fft39, fft124);
__m512 fft48 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft49 = _mm512_fmadd_ps(fft40, fft48, _mm512_shuffle_f32x4(fft40, fft40, 177));
__m512 fft133 = _mm512_fmadd_ps(fft125, fft48, _mm512_shuffle_f32x4(fft125, fft125, 177));
__m512 fft50 = _mm512_fmadd_ps(fft41, fft48, _mm512_shuffle_f32x4(fft41, fft41, 177));
__m512 fft134 = _mm512_fmadd_ps(fft126, fft48, _mm512_shuffle_f32x4(fft126, fft126, 177));
__m512 fft51 = _mm512_fmadd_ps(fft42, fft48, _mm512_shuffle_f32x4(fft42, fft42, 177));
__m512 fft135 = _mm512_fmadd_ps(fft127, fft48, _mm512_shuffle_f32x4(fft127, fft127, 177));
__m512 fft52 = _mm512_fmadd_ps(fft43, fft48, _mm512_shuffle_f32x4(fft43, fft43, 177));
__m512 fft136 = _mm512_fmadd_ps(fft128, fft48, _mm512_shuffle_f32x4(fft128, fft128, 177));
__m512 fft53 = _mm512_fmadd_ps(fft44, fft48, _mm512_shuffle_f32x4(fft44, fft44, 177));
__m512 fft137 = _mm512_fmadd_ps(fft129, fft48, _mm512_shuffle_f32x4(fft129, fft129, 177));
__m512 fft54 = _mm512_fmadd_ps(fft45, fft48, _mm512_shuffle_f32x4(fft45, fft45, 177));
__m512 fft138 = _mm512_fmadd_ps(fft130, fft48, _mm512_shuffle_f32x4(fft130, fft130, 177));
__m512 fft55 = _mm512_fmadd_ps(fft46, fft48, _mm512_shuffle_f32x4(fft46, fft46, 177));
__m512 fft139 = _mm512_fmadd_ps(fft131, fft48, _mm512_shuffle_f32x4(fft131, fft131, 177));
__m512 fft56 = _mm512_fmadd_ps(fft47, fft48, _mm512_shuffle_f32x4(fft47, fft47, 177));
__m512 fft140 = _mm512_fmadd_ps(fft132, fft48, _mm512_shuffle_f32x4(fft132, fft132, 177));
__m512 fft57 = _mm512_mask_mov_ps(fft49, 49344, fft50);
__m512 fft141 = _mm512_mask_mov_ps(fft133, 49344, fft134);
__m512 fft58 = _mm512_mask_sub_ps(fft50, 49344, _mm512_setzero_ps(), fft49);
__m512 fft142 = _mm512_mask_sub_ps(fft134, 49344, _mm512_setzero_ps(), fft133);
__m512 fft59 = _mm512_mask_mov_ps(fft51, 49344, fft52);
__m512 fft143 = _mm512_mask_mov_ps(fft135, 49344, fft136);
__m512 fft60 = _mm512_mask_sub_ps(fft52, 49344, _mm512_setzero_ps(), fft51);
__m512 fft144 = _mm512_mask_sub_ps(fft136, 49344, _mm512_setzero_ps(), fft135);
__m512 fft61 = _mm512_mask_mov_ps(fft53, 49344, fft54);
__m512 fft145 = _mm512_mask_mov_ps(fft137, 49344, fft138);
__m512 fft62 = _mm512_mask_sub_ps(fft54, 49344, _mm512_setzero_ps(), fft53);
__m512 fft146 = _mm512_mask_sub_ps(fft138, 49344, _mm512_setzero_ps(), fft137);
__m512 fft63 = _mm512_mask_mov_ps(fft55, 49344, fft56);
__m512 fft147 = _mm512_mask_mov_ps(fft139, 49344, fft140);
__m512 fft64 = _mm512_mask_sub_ps(fft56, 49344, _mm512_setzero_ps(), fft55);
__m512 fft148 = _mm512_mask_sub_ps(fft140, 49344, _mm512_setzero_ps(), fft139);
__m512 fft65 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft66 = _mm512_fmadd_ps(fft57, fft65, _mm512_shuffle_ps(fft57, fft57, 78));
__m512 fft149 = _mm512_fmadd_ps(fft141, fft65, _mm512_shuffle_ps(fft141, fft141, 78));
__m512 fft67 = _mm512_fmadd_ps(fft58, fft65, _mm512_shuffle_ps(fft58, fft58, 78));
__m512 fft150 = _mm512_fmadd_ps(fft142, fft65, _mm512_shuffle_ps(fft142, fft142, 78));
__m512 fft68 = _mm512_fmadd_ps(fft59, fft65, _mm512_shuffle_ps(fft59, fft59, 78));
__m512 fft151 = _mm512_fmadd_ps(fft143, fft65, _mm512_shuffle_ps(fft143, fft143, 78));
__m512 fft69 = _mm512_fmadd_ps(fft60, fft65, _mm512_shuffle_ps(fft60, fft60, 78));
__m512 fft152 = _mm512_fmadd_ps(fft144, fft65, _mm512_shuffle_ps(fft144, fft144, 78));
__m512 fft70 = _mm512_fmadd_ps(fft61, fft65, _mm512_shuffle_ps(fft61, fft61, 78));
__m512 fft153 = _mm512_fmadd_ps(fft145, fft65, _mm512_shuffle_ps(fft145, fft145, 78));
__m512 fft71 = _mm512_fmadd_ps(fft62, fft65, _mm512_shuffle_ps(fft62, fft62, 78));
__m512 fft154 = _mm512_fmadd_ps(fft146, fft65, _mm512_shuffle_ps(fft146, fft146, 78));
__m512 fft72 = _mm512_fmadd_ps(fft63, fft65, _mm512_shuffle_ps(fft63, fft63, 78));
__m512 fft155 = _mm512_fmadd_ps(fft147, fft65, _mm512_shuffle_ps(fft147, fft147, 78));
__m512 fft73 = _mm512_fmadd_ps(fft64, fft65, _mm512_shuffle_ps(fft64, fft64, 78));
__m512 fft156 = _mm512_fmadd_ps(fft148, fft65, _mm512_shuffle_ps(fft148, fft148, 78));
__m512i fft74 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft75 = _mm512_permutexvar_ps(fft74, fft66);
__m512 fft157 = _mm512_permutexvar_ps(fft74, fft149);
__m512i fft76 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft77 = _mm512_permutexvar_ps(fft76, fft66);
__m512 fft158 = _mm512_permutexvar_ps(fft76, fft149);
__m512 fft78 = _mm512_permutexvar_ps(fft74, fft67);
__m512 fft159 = _mm512_permutexvar_ps(fft74, fft150);
__m512 fft79 = _mm512_permutexvar_ps(fft76, fft67);
__m512 fft160 = _mm512_permutexvar_ps(fft76, fft150);
__m512 fft80 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft81 = _mm512_fmadd_ps(fft75, fft80, fft77);
__m512 fft161 = _mm512_fmadd_ps(fft157, fft80, fft158);
__m512 fft82 = _mm512_fnmadd_ps(fft79, fft80, fft78);
__m512 fft162 = _mm512_fnmadd_ps(fft160, fft80, fft159);
__m512 fft83 = _mm512_mask_mov_ps(fft79, 21845, fft81);
__m512 fft163 = _mm512_mask_mov_ps(fft160, 21845, fft161);
__m512 fft84 = _mm512_mask_mov_ps(fft75, 43176, fft81);
__m512 fft164 = _mm512_mask_mov_ps(fft157, 43176, fft161);
__m512 fft85 = _mm512_mask_mov_ps(fft83, 43176, fft82);
__m512 fft165 = _mm512_mask_mov_ps(fft163, 43176, fft162);
__m512 fft86 = _mm512_mask_mov_ps(fft84, 22102, fft82);
__m512 fft166 = _mm512_mask_mov_ps(fft164, 22102, fft162);
__m512 fft87 = _mm512_mask_mul_ps(fft85, 64764, fft85, _mm512_set1_ps(5e-01f));
__m512 fft167 = _mm512_mask_mul_ps(fft165, 64764, fft165, _mm512_set1_ps(5e-01f));
__m512 fft88 = _mm512_mask_mul_ps(fft86, 64764, fft86, _mm512_set1_ps(5e-01f));
__m512 fft168 = _mm512_mask_mul_ps(fft166, 64764, fft166, _mm512_set1_ps(5e-01f));
__m512 wf1 = fft87;
__m512 wf9 = fft167;
__m512 wf2 = fft88;
__m512 wf10 = fft168;
__m512 wf3 = fft68;
__m512 wf11 = fft151;
__m512 wf4 = fft69;
__m512 wf12 = fft152;
__m512 wf5 = fft70;
__m512 wf13 = fft153;
__m512 wf6 = fft71;
__m512 wf14 = fft154;
__m512 wf7 = fft72;
__m512 wf15 = fft155;
__m512 wf8 = fft73;
__m512 wf16 = fft156;
ptrdiff_t c1 = (size_t)(0+2*j1)/4;
ptrdiff_t m1 = (size_t)(0+2*j1)%4/2;
ptrdiff_t f2 = (size_t)(0+2*j1)%2;
__m512i eo1 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf3 = _mm512_permutexvar_ps(eo1, wf3);
wf4 = _mm512_permutexvar_ps(eo1, wf4);
__m512i wfs1 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf3, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs1 = _mm512_inserti64x4(wfs1, _mm512_cvtps_ph(wf4, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+6144+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs1);
_mm512_mask_storeu_epi32(wfPtr1+30704+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs1);
wf11 = _mm512_permutexvar_ps(eo1, wf11);
wf12 = _mm512_permutexvar_ps(eo1, wf12);
__m512i wfs2 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs2 = _mm512_inserti64x4(wfs2, _mm512_cvtps_ph(wf12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+55296+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs2);
_mm512_mask_storeu_epi32(wfPtr1+79856+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs2);
wf5 = _mm512_permutexvar_ps(eo1, wf5);
wf6 = _mm512_permutexvar_ps(eo1, wf6);
__m512i wfs3 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf5, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs3 = _mm512_inserti64x4(wfs3, _mm512_cvtps_ph(wf6, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+12288+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs3);
_mm512_mask_storeu_epi32(wfPtr1+36848+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs3);
wf13 = _mm512_permutexvar_ps(eo1, wf13);
wf14 = _mm512_permutexvar_ps(eo1, wf14);
__m512i wfs4 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs4 = _mm512_inserti64x4(wfs4, _mm512_cvtps_ph(wf14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+61440+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs4);
_mm512_mask_storeu_epi32(wfPtr1+86000+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs4);
wf7 = _mm512_permutexvar_ps(eo1, wf7);
wf8 = _mm512_permutexvar_ps(eo1, wf8);
__m512i wfs5 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs5 = _mm512_inserti64x4(wfs5, _mm512_cvtps_ph(wf8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+18432+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs5);
_mm512_mask_storeu_epi32(wfPtr1+42992+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs5);
wf15 = _mm512_permutexvar_ps(eo1, wf15);
wf16 = _mm512_permutexvar_ps(eo1, wf16);
__m512i wfs6 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs6 = _mm512_inserti64x4(wfs6, _mm512_cvtps_ph(wf16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+67584+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs6);
_mm512_mask_storeu_epi32(wfPtr1+92144+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs6);
__m512i wfs7 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs7 = _mm512_inserti64x4(wfs7, _mm512_cvtps_ph(wf2, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs7);
_mm512_mask_storeu_epi32(wfPtr1+24560+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs7);
__m512i wfs8 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs8 = _mm512_inserti64x4(wfs8, _mm512_cvtps_ph(wf10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+49152+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs8);
_mm512_mask_storeu_epi32(wfPtr1+73712+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs8);
__m512 wt8 = _mm512_maskz_loadu_ps(127, wtPtr1+588+37632*i5+1176*j1+196*k1);
__m512 wt9 = _mm512_maskz_loadu_ps(127, wtPtr1+616+37632*i5+1176*j1+196*k1);
__m512 wt10 = _mm512_maskz_loadu_ps(127, wtPtr1+644+37632*i5+1176*j1+196*k1);
__m512 wt11 = _mm512_maskz_loadu_ps(127, wtPtr1+672+37632*i5+1176*j1+196*k1);
__m512 wt12 = _mm512_maskz_loadu_ps(127, wtPtr1+700+37632*i5+1176*j1+196*k1);
__m512 wt13 = _mm512_maskz_loadu_ps(127, wtPtr1+728+37632*i5+1176*j1+196*k1);
__m512 wt14 = _mm512_maskz_loadu_ps(127, wtPtr1+756+37632*i5+1176*j1+196*k1);
wt8 = _mm512_mul_ps(postMul2, wt8);
wt9 = _mm512_mul_ps(postMul2, wt9);
wt10 = _mm512_mul_ps(postMul2, wt10);
wt11 = _mm512_mul_ps(postMul2, wt11);
wt12 = _mm512_mul_ps(postMul2, wt12);
wt13 = _mm512_mul_ps(postMul2, wt13);
wt14 = _mm512_mul_ps(postMul2, wt14);
__m512 fft169 = _mm512_add_ps(wt8, _mm512_setzero_ps());
__m512 fft257 = _mm512_add_ps(wt9, _mm512_setzero_ps());
__m512 fft170 = _mm512_sub_ps(wt8, _mm512_setzero_ps());
__m512 fft258 = _mm512_sub_ps(wt9, _mm512_setzero_ps());
__m512 fft171 = _mm512_add_ps(wt10, _mm512_setzero_ps());
__m512 fft259 = _mm512_add_ps(wt11, _mm512_setzero_ps());
__m512 fft172 = _mm512_sub_ps(wt10, _mm512_setzero_ps());
__m512 fft260 = _mm512_sub_ps(wt11, _mm512_setzero_ps());
__m512 fft173 = _mm512_add_ps(wt12, _mm512_setzero_ps());
__m512 fft261 = _mm512_add_ps(wt13, _mm512_setzero_ps());
__m512 fft174 = _mm512_sub_ps(wt12, _mm512_setzero_ps());
__m512 fft262 = _mm512_sub_ps(wt13, _mm512_setzero_ps());
__m512 fft175 = _mm512_add_ps(wt14, _mm512_setzero_ps());
__m512 fft263 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft176 = _mm512_sub_ps(wt14, _mm512_setzero_ps());
__m512 fft264 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft177 = _mm512_add_ps(fft169, fft173);
__m512 fft265 = _mm512_add_ps(fft257, fft261);
__m512 fft178 = _mm512_sub_ps(fft169, fft173);
__m512 fft266 = _mm512_sub_ps(fft257, fft261);
__m512 fft179 = _mm512_add_ps(fft171, fft175);
__m512 fft267 = _mm512_add_ps(fft259, fft263);
__m512 fft180 = _mm512_sub_ps(fft175, fft171);
__m512 fft268 = _mm512_sub_ps(fft263, fft259);
__m512 fft181 = _mm512_sub_ps(fft172, fft176);
__m512 fft269 = _mm512_sub_ps(fft260, fft264);
__m512 fft182 = _mm512_add_ps(fft172, fft176);
__m512 fft270 = _mm512_add_ps(fft260, fft264);
__m512 fft183 = _mm512_add_ps(fft177, fft179);
__m512 fft271 = _mm512_add_ps(fft265, fft267);
__m512 fft184 = _mm512_sub_ps(fft177, fft179);
__m512 fft272 = _mm512_sub_ps(fft265, fft267);
__m512 fft185 = _mm512_fmadd_ps(fft181, _mm512_set1_ps(7.0710677e-01f), fft170);
__m512 fft273 = _mm512_fmadd_ps(fft269, _mm512_set1_ps(7.0710677e-01f), fft258);
__m512 fft186 = _mm512_fnmsub_ps(fft182, _mm512_set1_ps(7.0710677e-01f), fft174);
__m512 fft274 = _mm512_fnmsub_ps(fft270, _mm512_set1_ps(7.0710677e-01f), fft262);
__m512 fft187 = _mm512_fnmadd_ps(fft181, _mm512_set1_ps(7.0710677e-01f), fft170);
__m512 fft275 = _mm512_fnmadd_ps(fft269, _mm512_set1_ps(7.0710677e-01f), fft258);
__m512 fft188 = _mm512_fnmadd_ps(fft182, _mm512_set1_ps(7.0710677e-01f), fft174);
__m512 fft276 = _mm512_fnmadd_ps(fft270, _mm512_set1_ps(7.0710677e-01f), fft262);
__m512 fft189 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft190 = _mm512_fmadd_ps(fft183, fft189, _mm512_shuffle_f32x4(fft183, fft183, 78));
__m512 fft277 = _mm512_fmadd_ps(fft271, fft189, _mm512_shuffle_f32x4(fft271, fft271, 78));
__m512 fft191 = _mm512_fmadd_ps(fft184, fft189, _mm512_shuffle_f32x4(fft184, fft184, 78));
__m512 fft278 = _mm512_fmadd_ps(fft272, fft189, _mm512_shuffle_f32x4(fft272, fft272, 78));
__m512 fft192 = _mm512_fmadd_ps(fft185, fft189, _mm512_shuffle_f32x4(fft185, fft185, 78));
__m512 fft279 = _mm512_fmadd_ps(fft273, fft189, _mm512_shuffle_f32x4(fft273, fft273, 78));
__m512 fft193 = _mm512_fmadd_ps(fft186, fft189, _mm512_shuffle_f32x4(fft186, fft186, 78));
__m512 fft280 = _mm512_fmadd_ps(fft274, fft189, _mm512_shuffle_f32x4(fft274, fft274, 78));
__m512 fft194 = _mm512_fmadd_ps(fft178, fft189, _mm512_shuffle_f32x4(fft178, fft178, 78));
__m512 fft281 = _mm512_fmadd_ps(fft266, fft189, _mm512_shuffle_f32x4(fft266, fft266, 78));
__m512 fft195 = _mm512_fmadd_ps(fft180, fft189, _mm512_shuffle_f32x4(fft180, fft180, 78));
__m512 fft282 = _mm512_fmadd_ps(fft268, fft189, _mm512_shuffle_f32x4(fft268, fft268, 78));
__m512 fft196 = _mm512_fmadd_ps(fft187, fft189, _mm512_shuffle_f32x4(fft187, fft187, 78));
__m512 fft283 = _mm512_fmadd_ps(fft275, fft189, _mm512_shuffle_f32x4(fft275, fft275, 78));
__m512 fft197 = _mm512_fmadd_ps(fft188, fft189, _mm512_shuffle_f32x4(fft188, fft188, 78));
__m512 fft284 = _mm512_fmadd_ps(fft276, fft189, _mm512_shuffle_f32x4(fft276, fft276, 78));
__m512 fft198 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft199 = _mm512_mul_ps(fft190, fft198);
__m512 fft285 = _mm512_mul_ps(fft277, fft198);
__m512 fft200 = _mm512_mul_ps(fft191, fft198);
__m512 fft286 = _mm512_mul_ps(fft278, fft198);
__m512 fft201 = _mm512_mul_ps(fft192, fft198);
__m512 fft287 = _mm512_mul_ps(fft279, fft198);
__m512 fft202 = _mm512_mul_ps(fft193, fft198);
__m512 fft288 = _mm512_mul_ps(fft280, fft198);
__m512 fft203 = _mm512_mul_ps(fft194, fft198);
__m512 fft289 = _mm512_mul_ps(fft281, fft198);
__m512 fft204 = _mm512_mul_ps(fft195, fft198);
__m512 fft290 = _mm512_mul_ps(fft282, fft198);
__m512 fft205 = _mm512_mul_ps(fft196, fft198);
__m512 fft291 = _mm512_mul_ps(fft283, fft198);
__m512 fft206 = _mm512_mul_ps(fft197, fft198);
__m512 fft292 = _mm512_mul_ps(fft284, fft198);
__m512 fft207 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft208 = _mm512_fmadd_ps(fft191, fft207, fft199);
__m512 fft293 = _mm512_fmadd_ps(fft278, fft207, fft285);
__m512 fft209 = _mm512_fnmadd_ps(fft190, fft207, fft200);
__m512 fft294 = _mm512_fnmadd_ps(fft277, fft207, fft286);
__m512 fft210 = _mm512_fmadd_ps(fft193, fft207, fft201);
__m512 fft295 = _mm512_fmadd_ps(fft280, fft207, fft287);
__m512 fft211 = _mm512_fnmadd_ps(fft192, fft207, fft202);
__m512 fft296 = _mm512_fnmadd_ps(fft279, fft207, fft288);
__m512 fft212 = _mm512_fmadd_ps(fft195, fft207, fft203);
__m512 fft297 = _mm512_fmadd_ps(fft282, fft207, fft289);
__m512 fft213 = _mm512_fnmadd_ps(fft194, fft207, fft204);
__m512 fft298 = _mm512_fnmadd_ps(fft281, fft207, fft290);
__m512 fft214 = _mm512_fmadd_ps(fft197, fft207, fft205);
__m512 fft299 = _mm512_fmadd_ps(fft284, fft207, fft291);
__m512 fft215 = _mm512_fnmadd_ps(fft196, fft207, fft206);
__m512 fft300 = _mm512_fnmadd_ps(fft283, fft207, fft292);
__m512 fft216 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft217 = _mm512_fmadd_ps(fft208, fft216, _mm512_shuffle_f32x4(fft208, fft208, 177));
__m512 fft301 = _mm512_fmadd_ps(fft293, fft216, _mm512_shuffle_f32x4(fft293, fft293, 177));
__m512 fft218 = _mm512_fmadd_ps(fft209, fft216, _mm512_shuffle_f32x4(fft209, fft209, 177));
__m512 fft302 = _mm512_fmadd_ps(fft294, fft216, _mm512_shuffle_f32x4(fft294, fft294, 177));
__m512 fft219 = _mm512_fmadd_ps(fft210, fft216, _mm512_shuffle_f32x4(fft210, fft210, 177));
__m512 fft303 = _mm512_fmadd_ps(fft295, fft216, _mm512_shuffle_f32x4(fft295, fft295, 177));
__m512 fft220 = _mm512_fmadd_ps(fft211, fft216, _mm512_shuffle_f32x4(fft211, fft211, 177));
__m512 fft304 = _mm512_fmadd_ps(fft296, fft216, _mm512_shuffle_f32x4(fft296, fft296, 177));
__m512 fft221 = _mm512_fmadd_ps(fft212, fft216, _mm512_shuffle_f32x4(fft212, fft212, 177));
__m512 fft305 = _mm512_fmadd_ps(fft297, fft216, _mm512_shuffle_f32x4(fft297, fft297, 177));
__m512 fft222 = _mm512_fmadd_ps(fft213, fft216, _mm512_shuffle_f32x4(fft213, fft213, 177));
__m512 fft306 = _mm512_fmadd_ps(fft298, fft216, _mm512_shuffle_f32x4(fft298, fft298, 177));
__m512 fft223 = _mm512_fmadd_ps(fft214, fft216, _mm512_shuffle_f32x4(fft214, fft214, 177));
__m512 fft307 = _mm512_fmadd_ps(fft299, fft216, _mm512_shuffle_f32x4(fft299, fft299, 177));
__m512 fft224 = _mm512_fmadd_ps(fft215, fft216, _mm512_shuffle_f32x4(fft215, fft215, 177));
__m512 fft308 = _mm512_fmadd_ps(fft300, fft216, _mm512_shuffle_f32x4(fft300, fft300, 177));
__m512 fft225 = _mm512_mask_mov_ps(fft217, 49344, fft218);
__m512 fft309 = _mm512_mask_mov_ps(fft301, 49344, fft302);
__m512 fft226 = _mm512_mask_sub_ps(fft218, 49344, _mm512_setzero_ps(), fft217);
__m512 fft310 = _mm512_mask_sub_ps(fft302, 49344, _mm512_setzero_ps(), fft301);
__m512 fft227 = _mm512_mask_mov_ps(fft219, 49344, fft220);
__m512 fft311 = _mm512_mask_mov_ps(fft303, 49344, fft304);
__m512 fft228 = _mm512_mask_sub_ps(fft220, 49344, _mm512_setzero_ps(), fft219);
__m512 fft312 = _mm512_mask_sub_ps(fft304, 49344, _mm512_setzero_ps(), fft303);
__m512 fft229 = _mm512_mask_mov_ps(fft221, 49344, fft222);
__m512 fft313 = _mm512_mask_mov_ps(fft305, 49344, fft306);
__m512 fft230 = _mm512_mask_sub_ps(fft222, 49344, _mm512_setzero_ps(), fft221);
__m512 fft314 = _mm512_mask_sub_ps(fft306, 49344, _mm512_setzero_ps(), fft305);
__m512 fft231 = _mm512_mask_mov_ps(fft223, 49344, fft224);
__m512 fft315 = _mm512_mask_mov_ps(fft307, 49344, fft308);
__m512 fft232 = _mm512_mask_sub_ps(fft224, 49344, _mm512_setzero_ps(), fft223);
__m512 fft316 = _mm512_mask_sub_ps(fft308, 49344, _mm512_setzero_ps(), fft307);
__m512 fft233 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft234 = _mm512_fmadd_ps(fft225, fft233, _mm512_shuffle_ps(fft225, fft225, 78));
__m512 fft317 = _mm512_fmadd_ps(fft309, fft233, _mm512_shuffle_ps(fft309, fft309, 78));
__m512 fft235 = _mm512_fmadd_ps(fft226, fft233, _mm512_shuffle_ps(fft226, fft226, 78));
__m512 fft318 = _mm512_fmadd_ps(fft310, fft233, _mm512_shuffle_ps(fft310, fft310, 78));
__m512 fft236 = _mm512_fmadd_ps(fft227, fft233, _mm512_shuffle_ps(fft227, fft227, 78));
__m512 fft319 = _mm512_fmadd_ps(fft311, fft233, _mm512_shuffle_ps(fft311, fft311, 78));
__m512 fft237 = _mm512_fmadd_ps(fft228, fft233, _mm512_shuffle_ps(fft228, fft228, 78));
__m512 fft320 = _mm512_fmadd_ps(fft312, fft233, _mm512_shuffle_ps(fft312, fft312, 78));
__m512 fft238 = _mm512_fmadd_ps(fft229, fft233, _mm512_shuffle_ps(fft229, fft229, 78));
__m512 fft321 = _mm512_fmadd_ps(fft313, fft233, _mm512_shuffle_ps(fft313, fft313, 78));
__m512 fft239 = _mm512_fmadd_ps(fft230, fft233, _mm512_shuffle_ps(fft230, fft230, 78));
__m512 fft322 = _mm512_fmadd_ps(fft314, fft233, _mm512_shuffle_ps(fft314, fft314, 78));
__m512 fft240 = _mm512_fmadd_ps(fft231, fft233, _mm512_shuffle_ps(fft231, fft231, 78));
__m512 fft323 = _mm512_fmadd_ps(fft315, fft233, _mm512_shuffle_ps(fft315, fft315, 78));
__m512 fft241 = _mm512_fmadd_ps(fft232, fft233, _mm512_shuffle_ps(fft232, fft232, 78));
__m512 fft324 = _mm512_fmadd_ps(fft316, fft233, _mm512_shuffle_ps(fft316, fft316, 78));
__m512i fft242 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft243 = _mm512_permutexvar_ps(fft242, fft234);
__m512 fft325 = _mm512_permutexvar_ps(fft242, fft317);
__m512i fft244 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft245 = _mm512_permutexvar_ps(fft244, fft234);
__m512 fft326 = _mm512_permutexvar_ps(fft244, fft317);
__m512 fft246 = _mm512_permutexvar_ps(fft242, fft235);
__m512 fft327 = _mm512_permutexvar_ps(fft242, fft318);
__m512 fft247 = _mm512_permutexvar_ps(fft244, fft235);
__m512 fft328 = _mm512_permutexvar_ps(fft244, fft318);
__m512 fft248 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft249 = _mm512_fmadd_ps(fft243, fft248, fft245);
__m512 fft329 = _mm512_fmadd_ps(fft325, fft248, fft326);
__m512 fft250 = _mm512_fnmadd_ps(fft247, fft248, fft246);
__m512 fft330 = _mm512_fnmadd_ps(fft328, fft248, fft327);
__m512 fft251 = _mm512_mask_mov_ps(fft247, 21845, fft249);
__m512 fft331 = _mm512_mask_mov_ps(fft328, 21845, fft329);
__m512 fft252 = _mm512_mask_mov_ps(fft243, 43176, fft249);
__m512 fft332 = _mm512_mask_mov_ps(fft325, 43176, fft329);
__m512 fft253 = _mm512_mask_mov_ps(fft251, 43176, fft250);
__m512 fft333 = _mm512_mask_mov_ps(fft331, 43176, fft330);
__m512 fft254 = _mm512_mask_mov_ps(fft252, 22102, fft250);
__m512 fft334 = _mm512_mask_mov_ps(fft332, 22102, fft330);
__m512 fft255 = _mm512_mask_mul_ps(fft253, 64764, fft253, _mm512_set1_ps(5e-01f));
__m512 fft335 = _mm512_mask_mul_ps(fft333, 64764, fft333, _mm512_set1_ps(5e-01f));
__m512 fft256 = _mm512_mask_mul_ps(fft254, 64764, fft254, _mm512_set1_ps(5e-01f));
__m512 fft336 = _mm512_mask_mul_ps(fft334, 64764, fft334, _mm512_set1_ps(5e-01f));
__m512 wf17 = fft255;
__m512 wf25 = fft335;
__m512 wf18 = fft256;
__m512 wf26 = fft336;
__m512 wf19 = fft236;
__m512 wf27 = fft319;
__m512 wf20 = fft237;
__m512 wf28 = fft320;
__m512 wf21 = fft238;
__m512 wf29 = fft321;
__m512 wf22 = fft239;
__m512 wf30 = fft322;
__m512 wf23 = fft240;
__m512 wf31 = fft323;
__m512 wf24 = fft241;
__m512 wf32 = fft324;
ptrdiff_t c2 = (size_t)(1+2*j1)/4;
ptrdiff_t m2 = (size_t)(1+2*j1)%4/2;
ptrdiff_t f3 = (size_t)(1+2*j1)%2;
__m512i eo2 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf19 = _mm512_permutexvar_ps(eo2, wf19);
wf20 = _mm512_permutexvar_ps(eo2, wf20);
__m512i wfs9 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf19, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs9 = _mm512_inserti64x4(wfs9, _mm512_cvtps_ph(wf20, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+6144+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs9);
_mm512_mask_storeu_epi32(wfPtr1+30704+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs9);
wf27 = _mm512_permutexvar_ps(eo2, wf27);
wf28 = _mm512_permutexvar_ps(eo2, wf28);
__m512i wfs10 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf27, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs10 = _mm512_inserti64x4(wfs10, _mm512_cvtps_ph(wf28, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+55296+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs10);
_mm512_mask_storeu_epi32(wfPtr1+79856+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs10);
wf21 = _mm512_permutexvar_ps(eo2, wf21);
wf22 = _mm512_permutexvar_ps(eo2, wf22);
__m512i wfs11 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf21, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs11 = _mm512_inserti64x4(wfs11, _mm512_cvtps_ph(wf22, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+12288+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs11);
_mm512_mask_storeu_epi32(wfPtr1+36848+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs11);
wf29 = _mm512_permutexvar_ps(eo2, wf29);
wf30 = _mm512_permutexvar_ps(eo2, wf30);
__m512i wfs12 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf29, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs12 = _mm512_inserti64x4(wfs12, _mm512_cvtps_ph(wf30, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+61440+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs12);
_mm512_mask_storeu_epi32(wfPtr1+86000+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs12);
wf23 = _mm512_permutexvar_ps(eo2, wf23);
wf24 = _mm512_permutexvar_ps(eo2, wf24);
__m512i wfs13 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf23, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs13 = _mm512_inserti64x4(wfs13, _mm512_cvtps_ph(wf24, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+18432+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs13);
_mm512_mask_storeu_epi32(wfPtr1+42992+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs13);
wf31 = _mm512_permutexvar_ps(eo2, wf31);
wf32 = _mm512_permutexvar_ps(eo2, wf32);
__m512i wfs14 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf31, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs14 = _mm512_inserti64x4(wfs14, _mm512_cvtps_ph(wf32, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+67584+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs14);
_mm512_mask_storeu_epi32(wfPtr1+92144+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs14);
__m512i wfs15 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf17, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs15 = _mm512_inserti64x4(wfs15, _mm512_cvtps_ph(wf18, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs15);
_mm512_mask_storeu_epi32(wfPtr1+24560+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs15);
__m512i wfs16 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf25, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs16 = _mm512_inserti64x4(wfs16, _mm512_cvtps_ph(wf26, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+49152+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs16);
_mm512_mask_storeu_epi32(wfPtr1+73712+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs16);
}
__m512 bias1 = _mm512_setzero_ps();
if (!e1) {
bias1 = _mm512_maskz_loadu_ps(3, biasPtr1-0+256*i5+8*j1);
__m512i pmMul1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd1 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas2 = _mm512_maskz_loadu_ps(15, bnPtr1+(ptrdiff_t)8*(0+64*i5+2*j1));
__m512 postMul3 = _mm512_permutexvar_ps(pmMul1, mas2);
__m512 postAdd1 = _mm512_permutexvar_ps(pmAdd1, mas2);
bias1 = _mm512_fmadd_ps(bias1, postMul3, postAdd1);
bias1 = _mm512_mul_ps(bias1, _mm512_set1_ps(6.4e+01f));
}
_mm512_mask_storeu_ps(bfPtr1-0+256*i5+8*j1, 3, bias1);
}
}
}

// Packages the filter-arrangement callee into a threader task and submits it
// to the given team via ResNet50ThreaderDo1.
//
// team13:   thread team handle, forwarded unchanged to ResNet50ThreaderDo1.
// tensors1: array of tensor base pointers, stored in the task's `any1` field
//           so the callee can retrieve it.
static void ResNet50StriderArrangeFilts1(ResNet50ThreaderTeam1* team13, char** tensors1) {
	ResNet50ThreaderTask1 job;
	// Three hull entries, each set to 1.
	// NOTE(review): presumably nd1 is the number of iteration dimensions and
	// hull1[] their extents (i.e. a single work item) -- confirm against
	// ResNet50ThreaderDo1.
	job.nd1 = 3;
	for (ptrdiff_t axis = 0; axis < 3; ++axis) {
		job.hull1[axis] = 1;
	}
	job.any1 = tensors1;
	job.callee1 = ResNet50StriderArrangeFilts1Callee1;
	ResNet50ThreaderDo1(team13, &job);
}

static void ResNet50StriderArrangeDats1Callee1(ResNet50ThreaderTask1* task6, int64_t* pt8) {
char** tensors4 = task6->any1;
ptrdiff_t s1 = 0;
ptrdiff_t c3 = pt8[1];
ptrdiff_t g3 = 0;
ptrdiff_t e2 = 0;
char*restrict datPtr1 = tensors4[0]-2700+79478784*e2;
char*restrict dfPtr1 = tensors4[1]+214917120*e2;
ptrdiff_t i6 = 1*g3;
ptrdiff_t j2 = 8*c3;
ptrdiff_t last1 = j2+(c3 < 10 ? 7 : 8);
if (j2 < 4) {
ptrdiff_t rel1 = j2-0;
ptrdiff_t base1 = 0;
if (rel1 < 1) {
ptrdiff_t h1 = base1+0;
ptrdiff_t w1 = 0;
ptrdiff_t k2 = 3*s1;
ptrdiff_t kk1 = k2+2;
for (; k2 <= kk1; ++k2) {
ptrdiff_t b3 = 0;
ptrdiff_t m3 = (size_t)b3/2;
ptrdiff_t f4 = (size_t)b3%2;
__m512 dat1 = _mm512_maskz_loadu_ps(65528, datPtr1+2688+602112*i6+200704*k2+896*h1+4*w1+0*b3);
__m512 dat2 = _mm512_maskz_loadu_ps(65528, datPtr1+3584+602112*i6+200704*k2+896*h1+4*w1+0*b3);
__m512 dat3 = _mm512_maskz_loadu_ps(65528, datPtr1+4480+602112*i6+200704*k2+896*h1+4*w1+0*b3);
__m512 dat4 = _mm512_maskz_loadu_ps(65528, datPtr1+5376+602112*i6+200704*k2+896*h1+4*w1+0*b3);
__m512 dat5 = _mm512_maskz_loadu_ps(65528, datPtr1+6272+602112*i6+200704*k2+896*h1+4*w1+0*b3);
__m512 dat6 = _mm512_maskz_loadu_ps(65528, datPtr1+7168+602112*i6+200704*k2+896*h1+4*w1+0*b3);
__m512 dat7 = _mm512_maskz_loadu_ps(65528, datPtr1+8064+602112*i6+200704*k2+896*h1+4*w1+0*b3);
__m512 dat8 = _mm512_maskz_loadu_ps(65528, datPtr1+8960+602112*i6+200704*k2+896*h1+4*w1+0*b3);
__m512 dat9 = _mm512_maskz_loadu_ps(65528, datPtr1+9856+602112*i6+200704*k2+896*h1+4*w1+0*b3);
__m512 dat10 = _mm512_maskz_loadu_ps(65528, datPtr1+10752+602112*i6+200704*k2+896*h1+4*w1+0*b3);
__m512 dat11 = _mm512_maskz_loadu_ps(65528, datPtr1+11648+602112*i6+200704*k2+896*h1+4*w1+0*b3);
__m512 dat12 = _mm512_maskz_loadu_ps(65528, datPtr1+12544+602112*i6+200704*k2+896*h1+4*w1+0*b3);
__m512 dat13 = _mm512_maskz_loadu_ps(65528, datPtr1+13440+602112*i6+200704*k2+896*h1+4*w1+0*b3);
__m512 fft337 = _mm512_add_ps(_mm512_setzero_ps(), dat6);
__m512 fft425 = _mm512_add_ps(_mm512_setzero_ps(), dat7);
__m512 fft338 = _mm512_sub_ps(_mm512_setzero_ps(), dat6);
__m512 fft426 = _mm512_sub_ps(_mm512_setzero_ps(), dat7);
__m512 fft339 = _mm512_add_ps(_mm512_setzero_ps(), dat8);
__m512 fft427 = _mm512_add_ps(dat1, dat9);
__m512 fft340 = _mm512_sub_ps(_mm512_setzero_ps(), dat8);
__m512 fft428 = _mm512_sub_ps(dat1, dat9);
__m512 fft341 = _mm512_add_ps(dat2, dat10);
__m512 fft429 = _mm512_add_ps(dat3, dat11);
__m512 fft342 = _mm512_sub_ps(dat2, dat10);
__m512 fft430 = _mm512_sub_ps(dat3, dat11);
__m512 fft343 = _mm512_add_ps(dat4, dat12);
__m512 fft431 = _mm512_add_ps(dat5, dat13);
__m512 fft344 = _mm512_sub_ps(dat4, dat12);
__m512 fft432 = _mm512_sub_ps(dat5, dat13);
__m512 fft345 = _mm512_add_ps(fft337, fft341);
__m512 fft433 = _mm512_add_ps(fft425, fft429);
__m512 fft346 = _mm512_sub_ps(fft337, fft341);
__m512 fft434 = _mm512_sub_ps(fft425, fft429);
__m512 fft347 = _mm512_add_ps(fft339, fft343);
__m512 fft435 = _mm512_add_ps(fft427, fft431);
__m512 fft348 = _mm512_sub_ps(fft343, fft339);
__m512 fft436 = _mm512_sub_ps(fft431, fft427);
__m512 fft349 = _mm512_sub_ps(fft340, fft344);
__m512 fft437 = _mm512_sub_ps(fft428, fft432);
__m512 fft350 = _mm512_add_ps(fft340, fft344);
__m512 fft438 = _mm512_add_ps(fft428, fft432);
__m512 fft351 = _mm512_add_ps(fft345, fft347);
__m512 fft439 = _mm512_add_ps(fft433, fft435);
__m512 fft352 = _mm512_sub_ps(fft345, fft347);
__m512 fft440 = _mm512_sub_ps(fft433, fft435);
__m512 fft353 = _mm512_fmadd_ps(fft349, _mm512_set1_ps(7.0710677e-01f), fft338);
__m512 fft441 = _mm512_fmadd_ps(fft437, _mm512_set1_ps(7.0710677e-01f), fft426);
__m512 fft354 = _mm512_fnmsub_ps(fft350, _mm512_set1_ps(7.0710677e-01f), fft342);
__m512 fft442 = _mm512_fnmsub_ps(fft438, _mm512_set1_ps(7.0710677e-01f), fft430);
__m512 fft355 = _mm512_fnmadd_ps(fft349, _mm512_set1_ps(7.0710677e-01f), fft338);
__m512 fft443 = _mm512_fnmadd_ps(fft437, _mm512_set1_ps(7.0710677e-01f), fft426);
__m512 fft356 = _mm512_fnmadd_ps(fft350, _mm512_set1_ps(7.0710677e-01f), fft342);
__m512 fft444 = _mm512_fnmadd_ps(fft438, _mm512_set1_ps(7.0710677e-01f), fft430);
__m512 fft357 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft358 = _mm512_fmadd_ps(fft351, fft357, _mm512_shuffle_f32x4(fft351, fft351, 78));
__m512 fft445 = _mm512_fmadd_ps(fft439, fft357, _mm512_shuffle_f32x4(fft439, fft439, 78));
__m512 fft359 = _mm512_fmadd_ps(fft352, fft357, _mm512_shuffle_f32x4(fft352, fft352, 78));
__m512 fft446 = _mm512_fmadd_ps(fft440, fft357, _mm512_shuffle_f32x4(fft440, fft440, 78));
__m512 fft360 = _mm512_fmadd_ps(fft353, fft357, _mm512_shuffle_f32x4(fft353, fft353, 78));
__m512 fft447 = _mm512_fmadd_ps(fft441, fft357, _mm512_shuffle_f32x4(fft441, fft441, 78));
__m512 fft361 = _mm512_fmadd_ps(fft354, fft357, _mm512_shuffle_f32x4(fft354, fft354, 78));
__m512 fft448 = _mm512_fmadd_ps(fft442, fft357, _mm512_shuffle_f32x4(fft442, fft442, 78));
__m512 fft362 = _mm512_fmadd_ps(fft346, fft357, _mm512_shuffle_f32x4(fft346, fft346, 78));
__m512 fft449 = _mm512_fmadd_ps(fft434, fft357, _mm512_shuffle_f32x4(fft434, fft434, 78));
__m512 fft363 = _mm512_fmadd_ps(fft348, fft357, _mm512_shuffle_f32x4(fft348, fft348, 78));
__m512 fft450 = _mm512_fmadd_ps(fft436, fft357, _mm512_shuffle_f32x4(fft436, fft436, 78));
__m512 fft364 = _mm512_fmadd_ps(fft355, fft357, _mm512_shuffle_f32x4(fft355, fft355, 78));
__m512 fft451 = _mm512_fmadd_ps(fft443, fft357, _mm512_shuffle_f32x4(fft443, fft443, 78));
__m512 fft365 = _mm512_fmadd_ps(fft356, fft357, _mm512_shuffle_f32x4(fft356, fft356, 78));
__m512 fft452 = _mm512_fmadd_ps(fft444, fft357, _mm512_shuffle_f32x4(fft444, fft444, 78));
__m512 fft366 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft367 = _mm512_mul_ps(fft358, fft366);
__m512 fft453 = _mm512_mul_ps(fft445, fft366);
__m512 fft368 = _mm512_mul_ps(fft359, fft366);
__m512 fft454 = _mm512_mul_ps(fft446, fft366);
__m512 fft369 = _mm512_mul_ps(fft360, fft366);
__m512 fft455 = _mm512_mul_ps(fft447, fft366);
__m512 fft370 = _mm512_mul_ps(fft361, fft366);
__m512 fft456 = _mm512_mul_ps(fft448, fft366);
__m512 fft371 = _mm512_mul_ps(fft362, fft366);
__m512 fft457 = _mm512_mul_ps(fft449, fft366);
__m512 fft372 = _mm512_mul_ps(fft363, fft366);
__m512 fft458 = _mm512_mul_ps(fft450, fft366);
__m512 fft373 = _mm512_mul_ps(fft364, fft366);
__m512 fft459 = _mm512_mul_ps(fft451, fft366);
__m512 fft374 = _mm512_mul_ps(fft365, fft366);
__m512 fft460 = _mm512_mul_ps(fft452, fft366);
__m512 fft375 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft376 = _mm512_fmadd_ps(fft359, fft375, fft367);
__m512 fft461 = _mm512_fmadd_ps(fft446, fft375, fft453);
__m512 fft377 = _mm512_fnmadd_ps(fft358, fft375, fft368);
__m512 fft462 = _mm512_fnmadd_ps(fft445, fft375, fft454);
__m512 fft378 = _mm512_fmadd_ps(fft361, fft375, fft369);
__m512 fft463 = _mm512_fmadd_ps(fft448, fft375, fft455);
__m512 fft379 = _mm512_fnmadd_ps(fft360, fft375, fft370);
__m512 fft464 = _mm512_fnmadd_ps(fft447, fft375, fft456);
__m512 fft380 = _mm512_fmadd_ps(fft363, fft375, fft371);
__m512 fft465 = _mm512_fmadd_ps(fft450, fft375, fft457);
__m512 fft381 = _mm512_fnmadd_ps(fft362, fft375, fft372);
__m512 fft466 = _mm512_fnmadd_ps(fft449, fft375, fft458);
__m512 fft382 = _mm512_fmadd_ps(fft365, fft375, fft373);
__m512 fft467 = _mm512_fmadd_ps(fft452, fft375, fft459);
__m512 fft383 = _mm512_fnmadd_ps(fft364, fft375, fft374);
__m512 fft468 = _mm512_fnmadd_ps(fft451, fft375, fft460);
__m512 fft384 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft385 = _mm512_fmadd_ps(fft376, fft384, _mm512_shuffle_f32x4(fft376, fft376, 177));
__m512 fft469 = _mm512_fmadd_ps(fft461, fft384, _mm512_shuffle_f32x4(fft461, fft461, 177));
__m512 fft386 = _mm512_fmadd_ps(fft377, fft384, _mm512_shuffle_f32x4(fft377, fft377, 177));
__m512 fft470 = _mm512_fmadd_ps(fft462, fft384, _mm512_shuffle_f32x4(fft462, fft462, 177));
__m512 fft387 = _mm512_fmadd_ps(fft378, fft384, _mm512_shuffle_f32x4(fft378, fft378, 177));
__m512 fft471 = _mm512_fmadd_ps(fft463, fft384, _mm512_shuffle_f32x4(fft463, fft463, 177));
__m512 fft388 = _mm512_fmadd_ps(fft379, fft384, _mm512_shuffle_f32x4(fft379, fft379, 177));
__m512 fft472 = _mm512_fmadd_ps(fft464, fft384, _mm512_shuffle_f32x4(fft464, fft464, 177));
__m512 fft389 = _mm512_fmadd_ps(fft380, fft384, _mm512_shuffle_f32x4(fft380, fft380, 177));
__m512 fft473 = _mm512_fmadd_ps(fft465, fft384, _mm512_shuffle_f32x4(fft465, fft465, 177));
__m512 fft390 = _mm512_fmadd_ps(fft381, fft384, _mm512_shuffle_f32x4(fft381, fft381, 177));
__m512 fft474 = _mm512_fmadd_ps(fft466, fft384, _mm512_shuffle_f32x4(fft466, fft466, 177));
__m512 fft391 = _mm512_fmadd_ps(fft382, fft384, _mm512_shuffle_f32x4(fft382, fft382, 177));
__m512 fft475 = _mm512_fmadd_ps(fft467, fft384, _mm512_shuffle_f32x4(fft467, fft467, 177));
__m512 fft392 = _mm512_fmadd_ps(fft383, fft384, _mm512_shuffle_f32x4(fft383, fft383, 177));
__m512 fft476 = _mm512_fmadd_ps(fft468, fft384, _mm512_shuffle_f32x4(fft468, fft468, 177));
__m512 fft393 = _mm512_mask_mov_ps(fft385, 49344, fft386);
__m512 fft477 = _mm512_mask_mov_ps(fft469, 49344, fft470);
__m512 fft394 = _mm512_mask_sub_ps(fft386, 49344, _mm512_setzero_ps(), fft385);
__m512 fft478 = _mm512_mask_sub_ps(fft470, 49344, _mm512_setzero_ps(), fft469);
__m512 fft395 = _mm512_mask_mov_ps(fft387, 49344, fft388);
__m512 fft479 = _mm512_mask_mov_ps(fft471, 49344, fft472);
__m512 fft396 = _mm512_mask_sub_ps(fft388, 49344, _mm512_setzero_ps(), fft387);
__m512 fft480 = _mm512_mask_sub_ps(fft472, 49344, _mm512_setzero_ps(), fft471);
__m512 fft397 = _mm512_mask_mov_ps(fft389, 49344, fft390);
__m512 fft481 = _mm512_mask_mov_ps(fft473, 49344, fft474);
__m512 fft398 = _mm512_mask_sub_ps(fft390, 49344, _mm512_setzero_ps(), fft389);
__m512 fft482 = _mm512_mask_sub_ps(fft474, 49344, _mm512_setzero_ps(), fft473);
__m512 fft399 = _mm512_mask_mov_ps(fft391, 49344, fft392);
__m512 fft483 = _mm512_mask_mov_ps(fft475, 49344, fft476);
__m512 fft400 = _mm512_mask_sub_ps(fft392, 49344, _mm512_setzero_ps(), fft391);
__m512 fft484 = _mm512_mask_sub_ps(fft476, 49344, _mm512_setzero_ps(), fft475);
__m512 fft401 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft402 = _mm512_fmadd_ps(fft393, fft401, _mm512_shuffle_ps(fft393, fft393, 78));
__m512 fft485 = _mm512_fmadd_ps(fft477, fft401, _mm512_shuffle_ps(fft477, fft477, 78));
__m512 fft403 = _mm512_fmadd_ps(fft394, fft401, _mm512_shuffle_ps(fft394, fft394, 78));
__m512 fft486 = _mm512_fmadd_ps(fft478, fft401, _mm512_shuffle_ps(fft478, fft478, 78));
__m512 fft404 = _mm512_fmadd_ps(fft395, fft401, _mm512_shuffle_ps(fft395, fft395, 78));
__m512 fft487 = _mm512_fmadd_ps(fft479, fft401, _mm512_shuffle_ps(fft479, fft479, 78));
__m512 fft405 = _mm512_fmadd_ps(fft396, fft401, _mm512_shuffle_ps(fft396, fft396, 78));
__m512 fft488 = _mm512_fmadd_ps(fft480, fft401, _mm512_shuffle_ps(fft480, fft480, 78));
__m512 fft406 = _mm512_fmadd_ps(fft397, fft401, _mm512_shuffle_ps(fft397, fft397, 78));
__m512 fft489 = _mm512_fmadd_ps(fft481, fft401, _mm512_shuffle_ps(fft481, fft481, 78));
__m512 fft407 = _mm512_fmadd_ps(fft398, fft401, _mm512_shuffle_ps(fft398, fft398, 78));
__m512 fft490 = _mm512_fmadd_ps(fft482, fft401, _mm512_shuffle_ps(fft482, fft482, 78));
__m512 fft408 = _mm512_fmadd_ps(fft399, fft401, _mm512_shuffle_ps(fft399, fft399, 78));
__m512 fft491 = _mm512_fmadd_ps(fft483, fft401, _mm512_shuffle_ps(fft483, fft483, 78));
__m512 fft409 = _mm512_fmadd_ps(fft400, fft401, _mm512_shuffle_ps(fft400, fft400, 78));
__m512 fft492 = _mm512_fmadd_ps(fft484, fft401, _mm512_shuffle_ps(fft484, fft484, 78));
__m512i fft410 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft411 = _mm512_permutexvar_ps(fft410, fft402);
__m512 fft493 = _mm512_permutexvar_ps(fft410, fft485);
__m512i fft412 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft413 = _mm512_permutexvar_ps(fft412, fft402);
__m512 fft494 = _mm512_permutexvar_ps(fft412, fft485);
__m512 fft414 = _mm512_permutexvar_ps(fft410, fft403);
__m512 fft495 = _mm512_permutexvar_ps(fft410, fft486);
__m512 fft415 = _mm512_permutexvar_ps(fft412, fft403);
__m512 fft496 = _mm512_permutexvar_ps(fft412, fft486);
__m512 fft416 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft417 = _mm512_fmadd_ps(fft411, fft416, fft413);
__m512 fft497 = _mm512_fmadd_ps(fft493, fft416, fft494);
__m512 fft418 = _mm512_fnmadd_ps(fft415, fft416, fft414);
__m512 fft498 = _mm512_fnmadd_ps(fft496, fft416, fft495);
__m512 fft419 = _mm512_mask_mov_ps(fft415, 21845, fft417);
__m512 fft499 = _mm512_mask_mov_ps(fft496, 21845, fft497);
__m512 fft420 = _mm512_mask_mov_ps(fft411, 43176, fft417);
__m512 fft500 = _mm512_mask_mov_ps(fft493, 43176, fft497);
__m512 fft421 = _mm512_mask_mov_ps(fft419, 43176, fft418);
__m512 fft501 = _mm512_mask_mov_ps(fft499, 43176, fft498);
__m512 fft422 = _mm512_mask_mov_ps(fft420, 22102, fft418);
__m512 fft502 = _mm512_mask_mov_ps(fft500, 22102, fft498);
__m512 fft423 = _mm512_mask_mul_ps(fft421, 64764, fft421, _mm512_set1_ps(5e-01f));
__m512 fft503 = _mm512_mask_mul_ps(fft501, 64764, fft501, _mm512_set1_ps(5e-01f));
__m512 fft424 = _mm512_mask_mul_ps(fft422, 64764, fft422, _mm512_set1_ps(5e-01f));
__m512 fft504 = _mm512_mask_mul_ps(fft502, 64764, fft502, _mm512_set1_ps(5e-01f));
__m512 df1 = fft423;
__m512 df9 = fft503;
__m512 df2 = fft424;
__m512 df10 = fft504;
__m512 df3 = fft404;
__m512 df11 = fft487;
__m512 df4 = fft405;
__m512 df12 = fft488;
__m512 df5 = fft406;
__m512 df13 = fft489;
__m512 df6 = fft407;
__m512 df14 = fft490;
__m512 df7 = fft408;
__m512 df15 = fft491;
__m512 df8 = fft409;
__m512 df16 = fft492;
__m512i eo3 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df3 = _mm512_permutexvar_ps(eo3, df3);
df4 = _mm512_permutexvar_ps(eo3, df4);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df3);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df4);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df3);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df4);
df11 = _mm512_permutexvar_ps(eo3, df11);
df12 = _mm512_permutexvar_ps(eo3, df12);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df11);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df12);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df11);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df12);
df5 = _mm512_permutexvar_ps(eo3, df5);
df6 = _mm512_permutexvar_ps(eo3, df6);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df5);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df6);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df5);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df6);
df13 = _mm512_permutexvar_ps(eo3, df13);
df14 = _mm512_permutexvar_ps(eo3, df14);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df13);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df14);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df13);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df14);
df7 = _mm512_permutexvar_ps(eo3, df7);
df8 = _mm512_permutexvar_ps(eo3, df8);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df7);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df8);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df7);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df8);
df15 = _mm512_permutexvar_ps(eo3, df15);
df16 = _mm512_permutexvar_ps(eo3, df16);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df15);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df16);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df15);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df16);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df1);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df2);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df1);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df2);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df9);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df10);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df9);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df10);
// Iterations b4 = 1..5 of the per-block transform: each pass loads 13 vectors
// of 16 floats from datPtr1, pushes them through the add/sub + twiddle network
// below, and scatters the results into dfPtr1 with half-vector masked stores.
// NOTE(review): the fft*/df* names suggest a 16x16 real FFT of an input tile;
// that is inferred from naming and structure only -- confirm with the generator.
// NOTE(review): b4 == 0 is not handled here; presumably it is peeled off
// before this loop -- confirm against the preceding code.
for (ptrdiff_t b4 = 1; b4 < 6; ++b4) {
// Split the iteration index into a pair index (m4) and a parity (f5); both are
// only used in the store addresses (128*m4 + 32*f5).
ptrdiff_t m4 = (size_t)b4/2;
ptrdiff_t f5 = (size_t)b4%2;
// 13 full-width loads (mask 65535 = all 16 lanes). Consecutive loads are 896
// apart, starting at 2688 = 3*896; each b4 step advances the source by 40.
// NOTE(review): 896 = 224*4 and 200704 = 224*224*4 would be one row / one
// channel of the 224x224 float32 input if datPtr1 is a byte pointer -- confirm
// at its declaration.
__m512 dat14 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k2+896*h1+4*w1+40*b4);
__m512 dat15 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k2+896*h1+4*w1+40*b4);
__m512 dat16 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k2+896*h1+4*w1+40*b4);
__m512 dat17 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k2+896*h1+4*w1+40*b4);
__m512 dat18 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k2+896*h1+4*w1+40*b4);
__m512 dat19 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k2+896*h1+4*w1+40*b4);
__m512 dat20 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k2+896*h1+4*w1+40*b4);
__m512 dat21 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k2+896*h1+4*w1+40*b4);
__m512 dat22 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k2+896*h1+4*w1+40*b4);
__m512 dat23 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k2+896*h1+4*w1+40*b4);
__m512 dat24 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k2+896*h1+4*w1+40*b4);
__m512 dat25 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k2+896*h1+4*w1+40*b4);
__m512 dat26 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k2+896*h1+4*w1+40*b4);
// Stage 1: sum/difference of inputs eight positions apart in a 16-long
// sequence whose first three entries are zero (the setzero operands stand in
// for them; only 13 vectors were loaded). Two interleaved chains are built:
// fft505.. from the even positions and fft593.. from the odd positions.
__m512 fft505 = _mm512_add_ps(_mm512_setzero_ps(), dat19);
__m512 fft593 = _mm512_add_ps(_mm512_setzero_ps(), dat20);
__m512 fft506 = _mm512_sub_ps(_mm512_setzero_ps(), dat19);
__m512 fft594 = _mm512_sub_ps(_mm512_setzero_ps(), dat20);
__m512 fft507 = _mm512_add_ps(_mm512_setzero_ps(), dat21);
__m512 fft595 = _mm512_add_ps(dat14, dat22);
__m512 fft508 = _mm512_sub_ps(_mm512_setzero_ps(), dat21);
__m512 fft596 = _mm512_sub_ps(dat14, dat22);
__m512 fft509 = _mm512_add_ps(dat15, dat23);
__m512 fft597 = _mm512_add_ps(dat16, dat24);
__m512 fft510 = _mm512_sub_ps(dat15, dat23);
__m512 fft598 = _mm512_sub_ps(dat16, dat24);
__m512 fft511 = _mm512_add_ps(dat17, dat25);
__m512 fft599 = _mm512_add_ps(dat18, dat26);
__m512 fft512 = _mm512_sub_ps(dat17, dat25);
__m512 fft600 = _mm512_sub_ps(dat18, dat26);
// Stage 2: second level of sums/differences within each chain.
__m512 fft513 = _mm512_add_ps(fft505, fft509);
__m512 fft601 = _mm512_add_ps(fft593, fft597);
__m512 fft514 = _mm512_sub_ps(fft505, fft509);
__m512 fft602 = _mm512_sub_ps(fft593, fft597);
__m512 fft515 = _mm512_add_ps(fft507, fft511);
__m512 fft603 = _mm512_add_ps(fft595, fft599);
__m512 fft516 = _mm512_sub_ps(fft511, fft507);
__m512 fft604 = _mm512_sub_ps(fft599, fft595);
__m512 fft517 = _mm512_sub_ps(fft508, fft512);
__m512 fft605 = _mm512_sub_ps(fft596, fft600);
__m512 fft518 = _mm512_add_ps(fft508, fft512);
__m512 fft606 = _mm512_add_ps(fft596, fft600);
// Stage 3: final sums/differences plus the terms scaled by 0.70710677
// (float approximation of sqrt(2)/2).
__m512 fft519 = _mm512_add_ps(fft513, fft515);
__m512 fft607 = _mm512_add_ps(fft601, fft603);
__m512 fft520 = _mm512_sub_ps(fft513, fft515);
__m512 fft608 = _mm512_sub_ps(fft601, fft603);
__m512 fft521 = _mm512_fmadd_ps(fft517, _mm512_set1_ps(7.0710677e-01f), fft506);
__m512 fft609 = _mm512_fmadd_ps(fft605, _mm512_set1_ps(7.0710677e-01f), fft594);
__m512 fft522 = _mm512_fnmsub_ps(fft518, _mm512_set1_ps(7.0710677e-01f), fft510);
__m512 fft610 = _mm512_fnmsub_ps(fft606, _mm512_set1_ps(7.0710677e-01f), fft598);
__m512 fft523 = _mm512_fnmadd_ps(fft517, _mm512_set1_ps(7.0710677e-01f), fft506);
__m512 fft611 = _mm512_fnmadd_ps(fft605, _mm512_set1_ps(7.0710677e-01f), fft594);
__m512 fft524 = _mm512_fnmadd_ps(fft518, _mm512_set1_ps(7.0710677e-01f), fft510);
__m512 fft612 = _mm512_fnmadd_ps(fft606, _mm512_set1_ps(7.0710677e-01f), fft598);
// In-register butterfly across the two 256-bit halves: shuffle_f32x4 with
// imm 78 swaps the halves, and the sign vector (+1 in lanes 0-7, -1 in lanes
// 8-15) yields lo+hi in the low half and lo-hi in the high half.
__m512 fft525 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft526 = _mm512_fmadd_ps(fft519, fft525, _mm512_shuffle_f32x4(fft519, fft519, 78));
__m512 fft613 = _mm512_fmadd_ps(fft607, fft525, _mm512_shuffle_f32x4(fft607, fft607, 78));
__m512 fft527 = _mm512_fmadd_ps(fft520, fft525, _mm512_shuffle_f32x4(fft520, fft520, 78));
__m512 fft614 = _mm512_fmadd_ps(fft608, fft525, _mm512_shuffle_f32x4(fft608, fft608, 78));
__m512 fft528 = _mm512_fmadd_ps(fft521, fft525, _mm512_shuffle_f32x4(fft521, fft521, 78));
__m512 fft615 = _mm512_fmadd_ps(fft609, fft525, _mm512_shuffle_f32x4(fft609, fft609, 78));
__m512 fft529 = _mm512_fmadd_ps(fft522, fft525, _mm512_shuffle_f32x4(fft522, fft522, 78));
__m512 fft616 = _mm512_fmadd_ps(fft610, fft525, _mm512_shuffle_f32x4(fft610, fft610, 78));
__m512 fft530 = _mm512_fmadd_ps(fft514, fft525, _mm512_shuffle_f32x4(fft514, fft514, 78));
__m512 fft617 = _mm512_fmadd_ps(fft602, fft525, _mm512_shuffle_f32x4(fft602, fft602, 78));
__m512 fft531 = _mm512_fmadd_ps(fft516, fft525, _mm512_shuffle_f32x4(fft516, fft516, 78));
__m512 fft618 = _mm512_fmadd_ps(fft604, fft525, _mm512_shuffle_f32x4(fft604, fft604, 78));
__m512 fft532 = _mm512_fmadd_ps(fft523, fft525, _mm512_shuffle_f32x4(fft523, fft523, 78));
__m512 fft619 = _mm512_fmadd_ps(fft611, fft525, _mm512_shuffle_f32x4(fft611, fft611, 78));
__m512 fft533 = _mm512_fmadd_ps(fft524, fft525, _mm512_shuffle_f32x4(fft524, fft524, 78));
__m512 fft620 = _mm512_fmadd_ps(fft612, fft525, _mm512_shuffle_f32x4(fft612, fft612, 78));
// Per-lane rotation of each vector pair (a, b) by the coefficient vectors
// c = fft534 and s = fft543: a' = a*c + b*s, b' = b*c - a*s. Lanes 0-7 have
// c = 1, s = 0 and pass through unchanged.
// NOTE(review): this has the form of a complex twiddle multiply with (a, b)
// as (real, imaginary) parts -- confirm.
__m512 fft534 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft535 = _mm512_mul_ps(fft526, fft534);
__m512 fft621 = _mm512_mul_ps(fft613, fft534);
__m512 fft536 = _mm512_mul_ps(fft527, fft534);
__m512 fft622 = _mm512_mul_ps(fft614, fft534);
__m512 fft537 = _mm512_mul_ps(fft528, fft534);
__m512 fft623 = _mm512_mul_ps(fft615, fft534);
__m512 fft538 = _mm512_mul_ps(fft529, fft534);
__m512 fft624 = _mm512_mul_ps(fft616, fft534);
__m512 fft539 = _mm512_mul_ps(fft530, fft534);
__m512 fft625 = _mm512_mul_ps(fft617, fft534);
__m512 fft540 = _mm512_mul_ps(fft531, fft534);
__m512 fft626 = _mm512_mul_ps(fft618, fft534);
__m512 fft541 = _mm512_mul_ps(fft532, fft534);
__m512 fft627 = _mm512_mul_ps(fft619, fft534);
__m512 fft542 = _mm512_mul_ps(fft533, fft534);
__m512 fft628 = _mm512_mul_ps(fft620, fft534);
__m512 fft543 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft544 = _mm512_fmadd_ps(fft527, fft543, fft535);
__m512 fft629 = _mm512_fmadd_ps(fft614, fft543, fft621);
__m512 fft545 = _mm512_fnmadd_ps(fft526, fft543, fft536);
__m512 fft630 = _mm512_fnmadd_ps(fft613, fft543, fft622);
__m512 fft546 = _mm512_fmadd_ps(fft529, fft543, fft537);
__m512 fft631 = _mm512_fmadd_ps(fft616, fft543, fft623);
__m512 fft547 = _mm512_fnmadd_ps(fft528, fft543, fft538);
__m512 fft632 = _mm512_fnmadd_ps(fft615, fft543, fft624);
__m512 fft548 = _mm512_fmadd_ps(fft531, fft543, fft539);
__m512 fft633 = _mm512_fmadd_ps(fft618, fft543, fft625);
__m512 fft549 = _mm512_fnmadd_ps(fft530, fft543, fft540);
__m512 fft634 = _mm512_fnmadd_ps(fft617, fft543, fft626);
__m512 fft550 = _mm512_fmadd_ps(fft533, fft543, fft541);
__m512 fft635 = _mm512_fmadd_ps(fft620, fft543, fft627);
__m512 fft551 = _mm512_fnmadd_ps(fft532, fft543, fft542);
__m512 fft636 = _mm512_fnmadd_ps(fft619, fft543, fft628);
// Butterfly between adjacent 128-bit lanes: imm 177 swaps lanes 0<->1 and
// 2<->3; signs are +1 for lanes 0-3 and 8-11, -1 for lanes 4-7 and 12-15.
__m512 fft552 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft553 = _mm512_fmadd_ps(fft544, fft552, _mm512_shuffle_f32x4(fft544, fft544, 177));
__m512 fft637 = _mm512_fmadd_ps(fft629, fft552, _mm512_shuffle_f32x4(fft629, fft629, 177));
__m512 fft554 = _mm512_fmadd_ps(fft545, fft552, _mm512_shuffle_f32x4(fft545, fft545, 177));
__m512 fft638 = _mm512_fmadd_ps(fft630, fft552, _mm512_shuffle_f32x4(fft630, fft630, 177));
__m512 fft555 = _mm512_fmadd_ps(fft546, fft552, _mm512_shuffle_f32x4(fft546, fft546, 177));
__m512 fft639 = _mm512_fmadd_ps(fft631, fft552, _mm512_shuffle_f32x4(fft631, fft631, 177));
__m512 fft556 = _mm512_fmadd_ps(fft547, fft552, _mm512_shuffle_f32x4(fft547, fft547, 177));
__m512 fft640 = _mm512_fmadd_ps(fft632, fft552, _mm512_shuffle_f32x4(fft632, fft632, 177));
__m512 fft557 = _mm512_fmadd_ps(fft548, fft552, _mm512_shuffle_f32x4(fft548, fft548, 177));
__m512 fft641 = _mm512_fmadd_ps(fft633, fft552, _mm512_shuffle_f32x4(fft633, fft633, 177));
__m512 fft558 = _mm512_fmadd_ps(fft549, fft552, _mm512_shuffle_f32x4(fft549, fft549, 177));
__m512 fft642 = _mm512_fmadd_ps(fft634, fft552, _mm512_shuffle_f32x4(fft634, fft634, 177));
__m512 fft559 = _mm512_fmadd_ps(fft550, fft552, _mm512_shuffle_f32x4(fft550, fft550, 177));
__m512 fft643 = _mm512_fmadd_ps(fft635, fft552, _mm512_shuffle_f32x4(fft635, fft635, 177));
__m512 fft560 = _mm512_fmadd_ps(fft551, fft552, _mm512_shuffle_f32x4(fft551, fft551, 177));
__m512 fft644 = _mm512_fmadd_ps(fft636, fft552, _mm512_shuffle_f32x4(fft636, fft636, 177));
// Mask 49344 = 0xC0C0 selects lanes 6, 7, 14, 15. In those lanes the pair
// (a, b) becomes (b, -a); all other lanes keep their values.
__m512 fft561 = _mm512_mask_mov_ps(fft553, 49344, fft554);
__m512 fft645 = _mm512_mask_mov_ps(fft637, 49344, fft638);
__m512 fft562 = _mm512_mask_sub_ps(fft554, 49344, _mm512_setzero_ps(), fft553);
__m512 fft646 = _mm512_mask_sub_ps(fft638, 49344, _mm512_setzero_ps(), fft637);
__m512 fft563 = _mm512_mask_mov_ps(fft555, 49344, fft556);
__m512 fft647 = _mm512_mask_mov_ps(fft639, 49344, fft640);
__m512 fft564 = _mm512_mask_sub_ps(fft556, 49344, _mm512_setzero_ps(), fft555);
__m512 fft648 = _mm512_mask_sub_ps(fft640, 49344, _mm512_setzero_ps(), fft639);
__m512 fft565 = _mm512_mask_mov_ps(fft557, 49344, fft558);
__m512 fft649 = _mm512_mask_mov_ps(fft641, 49344, fft642);
__m512 fft566 = _mm512_mask_sub_ps(fft558, 49344, _mm512_setzero_ps(), fft557);
__m512 fft650 = _mm512_mask_sub_ps(fft642, 49344, _mm512_setzero_ps(), fft641);
__m512 fft567 = _mm512_mask_mov_ps(fft559, 49344, fft560);
__m512 fft651 = _mm512_mask_mov_ps(fft643, 49344, fft644);
__m512 fft568 = _mm512_mask_sub_ps(fft560, 49344, _mm512_setzero_ps(), fft559);
__m512 fft652 = _mm512_mask_sub_ps(fft644, 49344, _mm512_setzero_ps(), fft643);
// Butterfly inside every 128-bit lane: shuffle_ps imm 78 swaps the two
// 64-bit halves of each lane; signs are +1, +1, -1, -1 per group of four.
__m512 fft569 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft570 = _mm512_fmadd_ps(fft561, fft569, _mm512_shuffle_ps(fft561, fft561, 78));
__m512 fft653 = _mm512_fmadd_ps(fft645, fft569, _mm512_shuffle_ps(fft645, fft645, 78));
__m512 fft571 = _mm512_fmadd_ps(fft562, fft569, _mm512_shuffle_ps(fft562, fft562, 78));
__m512 fft654 = _mm512_fmadd_ps(fft646, fft569, _mm512_shuffle_ps(fft646, fft646, 78));
__m512 fft572 = _mm512_fmadd_ps(fft563, fft569, _mm512_shuffle_ps(fft563, fft563, 78));
__m512 fft655 = _mm512_fmadd_ps(fft647, fft569, _mm512_shuffle_ps(fft647, fft647, 78));
__m512 fft573 = _mm512_fmadd_ps(fft564, fft569, _mm512_shuffle_ps(fft564, fft564, 78));
__m512 fft656 = _mm512_fmadd_ps(fft648, fft569, _mm512_shuffle_ps(fft648, fft648, 78));
__m512 fft574 = _mm512_fmadd_ps(fft565, fft569, _mm512_shuffle_ps(fft565, fft565, 78));
__m512 fft657 = _mm512_fmadd_ps(fft649, fft569, _mm512_shuffle_ps(fft649, fft649, 78));
__m512 fft575 = _mm512_fmadd_ps(fft566, fft569, _mm512_shuffle_ps(fft566, fft566, 78));
__m512 fft658 = _mm512_fmadd_ps(fft650, fft569, _mm512_shuffle_ps(fft650, fft650, 78));
__m512 fft576 = _mm512_fmadd_ps(fft567, fft569, _mm512_shuffle_ps(fft567, fft567, 78));
__m512 fft659 = _mm512_fmadd_ps(fft651, fft569, _mm512_shuffle_ps(fft651, fft651, 78));
__m512 fft577 = _mm512_fmadd_ps(fft568, fft569, _mm512_shuffle_ps(fft568, fft568, 78));
__m512 fft660 = _mm512_fmadd_ps(fft652, fft569, _mm512_shuffle_ps(fft652, fft652, 78));
// Extra post-processing applied only to the first vector pair of each chain
// (fft570/fft571 and fft653/fft654): gather with duplicated indices, combine
// with a +1/-1/0 pattern, merge lanes under masks 0x5555, 0xA8A8 and 0x5656,
// then halve lanes 2-7 and 10-15 (mask 64764 = 0xFCFC).
// NOTE(review): presumably this untangles the spectra that share this vector
// pair in a real-input FFT -- confirm with the generator.
__m512i fft578 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft579 = _mm512_permutexvar_ps(fft578, fft570);
__m512 fft661 = _mm512_permutexvar_ps(fft578, fft653);
__m512i fft580 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft581 = _mm512_permutexvar_ps(fft580, fft570);
__m512 fft662 = _mm512_permutexvar_ps(fft580, fft653);
__m512 fft582 = _mm512_permutexvar_ps(fft578, fft571);
__m512 fft663 = _mm512_permutexvar_ps(fft578, fft654);
__m512 fft583 = _mm512_permutexvar_ps(fft580, fft571);
__m512 fft664 = _mm512_permutexvar_ps(fft580, fft654);
__m512 fft584 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft585 = _mm512_fmadd_ps(fft579, fft584, fft581);
__m512 fft665 = _mm512_fmadd_ps(fft661, fft584, fft662);
__m512 fft586 = _mm512_fnmadd_ps(fft583, fft584, fft582);
__m512 fft666 = _mm512_fnmadd_ps(fft664, fft584, fft663);
__m512 fft587 = _mm512_mask_mov_ps(fft583, 21845, fft585);
__m512 fft667 = _mm512_mask_mov_ps(fft664, 21845, fft665);
__m512 fft588 = _mm512_mask_mov_ps(fft579, 43176, fft585);
__m512 fft668 = _mm512_mask_mov_ps(fft661, 43176, fft665);
__m512 fft589 = _mm512_mask_mov_ps(fft587, 43176, fft586);
__m512 fft669 = _mm512_mask_mov_ps(fft667, 43176, fft666);
__m512 fft590 = _mm512_mask_mov_ps(fft588, 22102, fft586);
__m512 fft670 = _mm512_mask_mov_ps(fft668, 22102, fft666);
__m512 fft591 = _mm512_mask_mul_ps(fft589, 64764, fft589, _mm512_set1_ps(5e-01f));
__m512 fft671 = _mm512_mask_mul_ps(fft669, 64764, fft669, _mm512_set1_ps(5e-01f));
__m512 fft592 = _mm512_mask_mul_ps(fft590, 64764, fft590, _mm512_set1_ps(5e-01f));
__m512 fft672 = _mm512_mask_mul_ps(fft670, 64764, fft670, _mm512_set1_ps(5e-01f));
// Name the 16 result vectors: df17..df24 come from the first chain,
// df25..df32 from the second.
__m512 df17 = fft591;
__m512 df25 = fft671;
__m512 df18 = fft592;
__m512 df26 = fft672;
__m512 df19 = fft572;
__m512 df27 = fft655;
__m512 df20 = fft573;
__m512 df28 = fft656;
__m512 df21 = fft574;
__m512 df29 = fft657;
__m512 df22 = fft575;
__m512 df30 = fft658;
__m512 df23 = fft576;
__m512 df31 = fft659;
__m512 df24 = fft577;
__m512 df32 = fft660;
// eo4 moves even-indexed elements to lanes 0-7 and odd-indexed elements to
// lanes 8-15. It is applied to every result except df17/df18/df25/df26.
__m512i eo4 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Each result is written with two masked stores: mask 255 writes lanes 0-7,
// mask 65280 writes lanes 8-15 at a base 407008 higher.
// NOTE(review): if dfPtr1 is a byte pointer, lanes 8-15 start 32 bytes into
// the store, so the high half lands exactly 407040 bytes after the low half --
// confirm the pointer type at its declaration.
df19 = _mm512_permutexvar_ps(eo4, df19);
df20 = _mm512_permutexvar_ps(eo4, df20);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df19);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df20);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df19);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df20);
df27 = _mm512_permutexvar_ps(eo4, df27);
df28 = _mm512_permutexvar_ps(eo4, df28);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df27);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df28);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df27);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df28);
df21 = _mm512_permutexvar_ps(eo4, df21);
df22 = _mm512_permutexvar_ps(eo4, df22);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df21);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df22);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df21);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df22);
df29 = _mm512_permutexvar_ps(eo4, df29);
df30 = _mm512_permutexvar_ps(eo4, df30);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df29);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df30);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df29);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df30);
df23 = _mm512_permutexvar_ps(eo4, df23);
df24 = _mm512_permutexvar_ps(eo4, df24);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df23);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df24);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df23);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df24);
df31 = _mm512_permutexvar_ps(eo4, df31);
df32 = _mm512_permutexvar_ps(eo4, df32);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df31);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df32);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df31);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df32);
// df17/df18/df25/df26 already have their final lane order and are stored
// without the eo4 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df17);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df18);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df17);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df18);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df25);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df26);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df25);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df26);
}
}
if (j2 >= last1) return;
++j2;
rel1 = 1;
}
// Handles the column groups rel1 = 0..2 of the current row (h2 = base1).
// Each group starts 60 columns after the previous one and is processed as six
// blocks (b5) for each of three k3 values (3*s1 .. 3*s1+2). After each group
// the function returns if j2 has reached last1; otherwise j2 advances. On
// falling out of the loop rel1 is set to 3 for the code that follows.
// NOTE(review): the fft*/df* names suggest a 16x16 real FFT of an input tile;
// that is inferred from naming and structure only -- confirm with the generator.
if (rel1 < 3) {
ptrdiff_t h2 = base1+0;
ptrdiff_t w2 = 0+60*rel1;
// jj1 is the j2 value of the last group handled here (group rel1 == 2).
ptrdiff_t jj1 = 2-rel1+j2;
for (; j2 <= jj1; w2 += 60) {
ptrdiff_t k3 = 3*s1;
ptrdiff_t kk2 = k3+2;
for (; k3 <= kk2; ++k3) {
for (ptrdiff_t b5 = 0; b5 < 6; ++b5) {
// Split the block index into a pair index (m5) and a parity (f6); both are
// only used in the store addresses (128*m5 + 32*f6).
ptrdiff_t m5 = (size_t)b5/2;
ptrdiff_t f6 = (size_t)b5%2;
// 13 full-width loads (mask 65535 = all 16 lanes). Consecutive loads are 896
// apart, starting at 2688 = 3*896; each b5 step advances the source by 40.
// NOTE(review): 896 = 224*4 and 200704 = 224*224*4 would be one row / one
// channel of the 224x224 float32 input if datPtr1 is a byte pointer -- confirm
// at its declaration.
__m512 dat27 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k3+896*h2+4*w2+40*b5);
__m512 dat28 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k3+896*h2+4*w2+40*b5);
__m512 dat29 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k3+896*h2+4*w2+40*b5);
__m512 dat30 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k3+896*h2+4*w2+40*b5);
__m512 dat31 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k3+896*h2+4*w2+40*b5);
__m512 dat32 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k3+896*h2+4*w2+40*b5);
__m512 dat33 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k3+896*h2+4*w2+40*b5);
__m512 dat34 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k3+896*h2+4*w2+40*b5);
__m512 dat35 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k3+896*h2+4*w2+40*b5);
__m512 dat36 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k3+896*h2+4*w2+40*b5);
__m512 dat37 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k3+896*h2+4*w2+40*b5);
__m512 dat38 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k3+896*h2+4*w2+40*b5);
__m512 dat39 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k3+896*h2+4*w2+40*b5);
// Stage 1: sum/difference of inputs eight positions apart in a 16-long
// sequence whose first three entries are zero (the setzero operands stand in
// for them; only 13 vectors were loaded). Two interleaved chains are built:
// fft673.. from the even positions and fft761.. from the odd positions.
__m512 fft673 = _mm512_add_ps(_mm512_setzero_ps(), dat32);
__m512 fft761 = _mm512_add_ps(_mm512_setzero_ps(), dat33);
__m512 fft674 = _mm512_sub_ps(_mm512_setzero_ps(), dat32);
__m512 fft762 = _mm512_sub_ps(_mm512_setzero_ps(), dat33);
__m512 fft675 = _mm512_add_ps(_mm512_setzero_ps(), dat34);
__m512 fft763 = _mm512_add_ps(dat27, dat35);
__m512 fft676 = _mm512_sub_ps(_mm512_setzero_ps(), dat34);
__m512 fft764 = _mm512_sub_ps(dat27, dat35);
__m512 fft677 = _mm512_add_ps(dat28, dat36);
__m512 fft765 = _mm512_add_ps(dat29, dat37);
__m512 fft678 = _mm512_sub_ps(dat28, dat36);
__m512 fft766 = _mm512_sub_ps(dat29, dat37);
__m512 fft679 = _mm512_add_ps(dat30, dat38);
__m512 fft767 = _mm512_add_ps(dat31, dat39);
__m512 fft680 = _mm512_sub_ps(dat30, dat38);
__m512 fft768 = _mm512_sub_ps(dat31, dat39);
// Stage 2: second level of sums/differences within each chain.
__m512 fft681 = _mm512_add_ps(fft673, fft677);
__m512 fft769 = _mm512_add_ps(fft761, fft765);
__m512 fft682 = _mm512_sub_ps(fft673, fft677);
__m512 fft770 = _mm512_sub_ps(fft761, fft765);
__m512 fft683 = _mm512_add_ps(fft675, fft679);
__m512 fft771 = _mm512_add_ps(fft763, fft767);
__m512 fft684 = _mm512_sub_ps(fft679, fft675);
__m512 fft772 = _mm512_sub_ps(fft767, fft763);
__m512 fft685 = _mm512_sub_ps(fft676, fft680);
__m512 fft773 = _mm512_sub_ps(fft764, fft768);
__m512 fft686 = _mm512_add_ps(fft676, fft680);
__m512 fft774 = _mm512_add_ps(fft764, fft768);
// Stage 3: final sums/differences plus the terms scaled by 0.70710677
// (float approximation of sqrt(2)/2).
__m512 fft687 = _mm512_add_ps(fft681, fft683);
__m512 fft775 = _mm512_add_ps(fft769, fft771);
__m512 fft688 = _mm512_sub_ps(fft681, fft683);
__m512 fft776 = _mm512_sub_ps(fft769, fft771);
__m512 fft689 = _mm512_fmadd_ps(fft685, _mm512_set1_ps(7.0710677e-01f), fft674);
__m512 fft777 = _mm512_fmadd_ps(fft773, _mm512_set1_ps(7.0710677e-01f), fft762);
__m512 fft690 = _mm512_fnmsub_ps(fft686, _mm512_set1_ps(7.0710677e-01f), fft678);
__m512 fft778 = _mm512_fnmsub_ps(fft774, _mm512_set1_ps(7.0710677e-01f), fft766);
__m512 fft691 = _mm512_fnmadd_ps(fft685, _mm512_set1_ps(7.0710677e-01f), fft674);
__m512 fft779 = _mm512_fnmadd_ps(fft773, _mm512_set1_ps(7.0710677e-01f), fft762);
__m512 fft692 = _mm512_fnmadd_ps(fft686, _mm512_set1_ps(7.0710677e-01f), fft678);
__m512 fft780 = _mm512_fnmadd_ps(fft774, _mm512_set1_ps(7.0710677e-01f), fft766);
// In-register butterfly across the two 256-bit halves: shuffle_f32x4 with
// imm 78 swaps the halves, and the sign vector (+1 in lanes 0-7, -1 in lanes
// 8-15) yields lo+hi in the low half and lo-hi in the high half.
__m512 fft693 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft694 = _mm512_fmadd_ps(fft687, fft693, _mm512_shuffle_f32x4(fft687, fft687, 78));
__m512 fft781 = _mm512_fmadd_ps(fft775, fft693, _mm512_shuffle_f32x4(fft775, fft775, 78));
__m512 fft695 = _mm512_fmadd_ps(fft688, fft693, _mm512_shuffle_f32x4(fft688, fft688, 78));
__m512 fft782 = _mm512_fmadd_ps(fft776, fft693, _mm512_shuffle_f32x4(fft776, fft776, 78));
__m512 fft696 = _mm512_fmadd_ps(fft689, fft693, _mm512_shuffle_f32x4(fft689, fft689, 78));
__m512 fft783 = _mm512_fmadd_ps(fft777, fft693, _mm512_shuffle_f32x4(fft777, fft777, 78));
__m512 fft697 = _mm512_fmadd_ps(fft690, fft693, _mm512_shuffle_f32x4(fft690, fft690, 78));
__m512 fft784 = _mm512_fmadd_ps(fft778, fft693, _mm512_shuffle_f32x4(fft778, fft778, 78));
__m512 fft698 = _mm512_fmadd_ps(fft682, fft693, _mm512_shuffle_f32x4(fft682, fft682, 78));
__m512 fft785 = _mm512_fmadd_ps(fft770, fft693, _mm512_shuffle_f32x4(fft770, fft770, 78));
__m512 fft699 = _mm512_fmadd_ps(fft684, fft693, _mm512_shuffle_f32x4(fft684, fft684, 78));
__m512 fft786 = _mm512_fmadd_ps(fft772, fft693, _mm512_shuffle_f32x4(fft772, fft772, 78));
__m512 fft700 = _mm512_fmadd_ps(fft691, fft693, _mm512_shuffle_f32x4(fft691, fft691, 78));
__m512 fft787 = _mm512_fmadd_ps(fft779, fft693, _mm512_shuffle_f32x4(fft779, fft779, 78));
__m512 fft701 = _mm512_fmadd_ps(fft692, fft693, _mm512_shuffle_f32x4(fft692, fft692, 78));
__m512 fft788 = _mm512_fmadd_ps(fft780, fft693, _mm512_shuffle_f32x4(fft780, fft780, 78));
// Per-lane rotation of each vector pair (a, b) by the coefficient vectors
// c = fft702 and s = fft711: a' = a*c + b*s, b' = b*c - a*s. Lanes 0-7 have
// c = 1, s = 0 and pass through unchanged.
// NOTE(review): this has the form of a complex twiddle multiply with (a, b)
// as (real, imaginary) parts -- confirm.
__m512 fft702 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft703 = _mm512_mul_ps(fft694, fft702);
__m512 fft789 = _mm512_mul_ps(fft781, fft702);
__m512 fft704 = _mm512_mul_ps(fft695, fft702);
__m512 fft790 = _mm512_mul_ps(fft782, fft702);
__m512 fft705 = _mm512_mul_ps(fft696, fft702);
__m512 fft791 = _mm512_mul_ps(fft783, fft702);
__m512 fft706 = _mm512_mul_ps(fft697, fft702);
__m512 fft792 = _mm512_mul_ps(fft784, fft702);
__m512 fft707 = _mm512_mul_ps(fft698, fft702);
__m512 fft793 = _mm512_mul_ps(fft785, fft702);
__m512 fft708 = _mm512_mul_ps(fft699, fft702);
__m512 fft794 = _mm512_mul_ps(fft786, fft702);
__m512 fft709 = _mm512_mul_ps(fft700, fft702);
__m512 fft795 = _mm512_mul_ps(fft787, fft702);
__m512 fft710 = _mm512_mul_ps(fft701, fft702);
__m512 fft796 = _mm512_mul_ps(fft788, fft702);
__m512 fft711 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft712 = _mm512_fmadd_ps(fft695, fft711, fft703);
__m512 fft797 = _mm512_fmadd_ps(fft782, fft711, fft789);
__m512 fft713 = _mm512_fnmadd_ps(fft694, fft711, fft704);
__m512 fft798 = _mm512_fnmadd_ps(fft781, fft711, fft790);
__m512 fft714 = _mm512_fmadd_ps(fft697, fft711, fft705);
__m512 fft799 = _mm512_fmadd_ps(fft784, fft711, fft791);
__m512 fft715 = _mm512_fnmadd_ps(fft696, fft711, fft706);
__m512 fft800 = _mm512_fnmadd_ps(fft783, fft711, fft792);
__m512 fft716 = _mm512_fmadd_ps(fft699, fft711, fft707);
__m512 fft801 = _mm512_fmadd_ps(fft786, fft711, fft793);
__m512 fft717 = _mm512_fnmadd_ps(fft698, fft711, fft708);
__m512 fft802 = _mm512_fnmadd_ps(fft785, fft711, fft794);
__m512 fft718 = _mm512_fmadd_ps(fft701, fft711, fft709);
__m512 fft803 = _mm512_fmadd_ps(fft788, fft711, fft795);
__m512 fft719 = _mm512_fnmadd_ps(fft700, fft711, fft710);
__m512 fft804 = _mm512_fnmadd_ps(fft787, fft711, fft796);
// Butterfly between adjacent 128-bit lanes: imm 177 swaps lanes 0<->1 and
// 2<->3; signs are +1 for lanes 0-3 and 8-11, -1 for lanes 4-7 and 12-15.
__m512 fft720 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft721 = _mm512_fmadd_ps(fft712, fft720, _mm512_shuffle_f32x4(fft712, fft712, 177));
__m512 fft805 = _mm512_fmadd_ps(fft797, fft720, _mm512_shuffle_f32x4(fft797, fft797, 177));
__m512 fft722 = _mm512_fmadd_ps(fft713, fft720, _mm512_shuffle_f32x4(fft713, fft713, 177));
__m512 fft806 = _mm512_fmadd_ps(fft798, fft720, _mm512_shuffle_f32x4(fft798, fft798, 177));
__m512 fft723 = _mm512_fmadd_ps(fft714, fft720, _mm512_shuffle_f32x4(fft714, fft714, 177));
__m512 fft807 = _mm512_fmadd_ps(fft799, fft720, _mm512_shuffle_f32x4(fft799, fft799, 177));
__m512 fft724 = _mm512_fmadd_ps(fft715, fft720, _mm512_shuffle_f32x4(fft715, fft715, 177));
__m512 fft808 = _mm512_fmadd_ps(fft800, fft720, _mm512_shuffle_f32x4(fft800, fft800, 177));
__m512 fft725 = _mm512_fmadd_ps(fft716, fft720, _mm512_shuffle_f32x4(fft716, fft716, 177));
__m512 fft809 = _mm512_fmadd_ps(fft801, fft720, _mm512_shuffle_f32x4(fft801, fft801, 177));
__m512 fft726 = _mm512_fmadd_ps(fft717, fft720, _mm512_shuffle_f32x4(fft717, fft717, 177));
__m512 fft810 = _mm512_fmadd_ps(fft802, fft720, _mm512_shuffle_f32x4(fft802, fft802, 177));
__m512 fft727 = _mm512_fmadd_ps(fft718, fft720, _mm512_shuffle_f32x4(fft718, fft718, 177));
__m512 fft811 = _mm512_fmadd_ps(fft803, fft720, _mm512_shuffle_f32x4(fft803, fft803, 177));
__m512 fft728 = _mm512_fmadd_ps(fft719, fft720, _mm512_shuffle_f32x4(fft719, fft719, 177));
__m512 fft812 = _mm512_fmadd_ps(fft804, fft720, _mm512_shuffle_f32x4(fft804, fft804, 177));
// Mask 49344 = 0xC0C0 selects lanes 6, 7, 14, 15. In those lanes the pair
// (a, b) becomes (b, -a); all other lanes keep their values.
__m512 fft729 = _mm512_mask_mov_ps(fft721, 49344, fft722);
__m512 fft813 = _mm512_mask_mov_ps(fft805, 49344, fft806);
__m512 fft730 = _mm512_mask_sub_ps(fft722, 49344, _mm512_setzero_ps(), fft721);
__m512 fft814 = _mm512_mask_sub_ps(fft806, 49344, _mm512_setzero_ps(), fft805);
__m512 fft731 = _mm512_mask_mov_ps(fft723, 49344, fft724);
__m512 fft815 = _mm512_mask_mov_ps(fft807, 49344, fft808);
__m512 fft732 = _mm512_mask_sub_ps(fft724, 49344, _mm512_setzero_ps(), fft723);
__m512 fft816 = _mm512_mask_sub_ps(fft808, 49344, _mm512_setzero_ps(), fft807);
__m512 fft733 = _mm512_mask_mov_ps(fft725, 49344, fft726);
__m512 fft817 = _mm512_mask_mov_ps(fft809, 49344, fft810);
__m512 fft734 = _mm512_mask_sub_ps(fft726, 49344, _mm512_setzero_ps(), fft725);
__m512 fft818 = _mm512_mask_sub_ps(fft810, 49344, _mm512_setzero_ps(), fft809);
__m512 fft735 = _mm512_mask_mov_ps(fft727, 49344, fft728);
__m512 fft819 = _mm512_mask_mov_ps(fft811, 49344, fft812);
__m512 fft736 = _mm512_mask_sub_ps(fft728, 49344, _mm512_setzero_ps(), fft727);
__m512 fft820 = _mm512_mask_sub_ps(fft812, 49344, _mm512_setzero_ps(), fft811);
// Butterfly inside every 128-bit lane: shuffle_ps imm 78 swaps the two
// 64-bit halves of each lane; signs are +1, +1, -1, -1 per group of four.
__m512 fft737 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft738 = _mm512_fmadd_ps(fft729, fft737, _mm512_shuffle_ps(fft729, fft729, 78));
__m512 fft821 = _mm512_fmadd_ps(fft813, fft737, _mm512_shuffle_ps(fft813, fft813, 78));
__m512 fft739 = _mm512_fmadd_ps(fft730, fft737, _mm512_shuffle_ps(fft730, fft730, 78));
__m512 fft822 = _mm512_fmadd_ps(fft814, fft737, _mm512_shuffle_ps(fft814, fft814, 78));
__m512 fft740 = _mm512_fmadd_ps(fft731, fft737, _mm512_shuffle_ps(fft731, fft731, 78));
__m512 fft823 = _mm512_fmadd_ps(fft815, fft737, _mm512_shuffle_ps(fft815, fft815, 78));
__m512 fft741 = _mm512_fmadd_ps(fft732, fft737, _mm512_shuffle_ps(fft732, fft732, 78));
__m512 fft824 = _mm512_fmadd_ps(fft816, fft737, _mm512_shuffle_ps(fft816, fft816, 78));
__m512 fft742 = _mm512_fmadd_ps(fft733, fft737, _mm512_shuffle_ps(fft733, fft733, 78));
__m512 fft825 = _mm512_fmadd_ps(fft817, fft737, _mm512_shuffle_ps(fft817, fft817, 78));
__m512 fft743 = _mm512_fmadd_ps(fft734, fft737, _mm512_shuffle_ps(fft734, fft734, 78));
__m512 fft826 = _mm512_fmadd_ps(fft818, fft737, _mm512_shuffle_ps(fft818, fft818, 78));
__m512 fft744 = _mm512_fmadd_ps(fft735, fft737, _mm512_shuffle_ps(fft735, fft735, 78));
__m512 fft827 = _mm512_fmadd_ps(fft819, fft737, _mm512_shuffle_ps(fft819, fft819, 78));
__m512 fft745 = _mm512_fmadd_ps(fft736, fft737, _mm512_shuffle_ps(fft736, fft736, 78));
__m512 fft828 = _mm512_fmadd_ps(fft820, fft737, _mm512_shuffle_ps(fft820, fft820, 78));
// Extra post-processing applied only to the first vector pair of each chain
// (fft738/fft739 and fft821/fft822): gather with duplicated indices, combine
// with a +1/-1/0 pattern, merge lanes under masks 0x5555, 0xA8A8 and 0x5656,
// then halve lanes 2-7 and 10-15 (mask 64764 = 0xFCFC).
// NOTE(review): presumably this untangles the spectra that share this vector
// pair in a real-input FFT -- confirm with the generator.
__m512i fft746 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft747 = _mm512_permutexvar_ps(fft746, fft738);
__m512 fft829 = _mm512_permutexvar_ps(fft746, fft821);
__m512i fft748 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft749 = _mm512_permutexvar_ps(fft748, fft738);
__m512 fft830 = _mm512_permutexvar_ps(fft748, fft821);
__m512 fft750 = _mm512_permutexvar_ps(fft746, fft739);
__m512 fft831 = _mm512_permutexvar_ps(fft746, fft822);
__m512 fft751 = _mm512_permutexvar_ps(fft748, fft739);
__m512 fft832 = _mm512_permutexvar_ps(fft748, fft822);
__m512 fft752 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft753 = _mm512_fmadd_ps(fft747, fft752, fft749);
__m512 fft833 = _mm512_fmadd_ps(fft829, fft752, fft830);
__m512 fft754 = _mm512_fnmadd_ps(fft751, fft752, fft750);
__m512 fft834 = _mm512_fnmadd_ps(fft832, fft752, fft831);
__m512 fft755 = _mm512_mask_mov_ps(fft751, 21845, fft753);
__m512 fft835 = _mm512_mask_mov_ps(fft832, 21845, fft833);
__m512 fft756 = _mm512_mask_mov_ps(fft747, 43176, fft753);
__m512 fft836 = _mm512_mask_mov_ps(fft829, 43176, fft833);
__m512 fft757 = _mm512_mask_mov_ps(fft755, 43176, fft754);
__m512 fft837 = _mm512_mask_mov_ps(fft835, 43176, fft834);
__m512 fft758 = _mm512_mask_mov_ps(fft756, 22102, fft754);
__m512 fft838 = _mm512_mask_mov_ps(fft836, 22102, fft834);
__m512 fft759 = _mm512_mask_mul_ps(fft757, 64764, fft757, _mm512_set1_ps(5e-01f));
__m512 fft839 = _mm512_mask_mul_ps(fft837, 64764, fft837, _mm512_set1_ps(5e-01f));
__m512 fft760 = _mm512_mask_mul_ps(fft758, 64764, fft758, _mm512_set1_ps(5e-01f));
__m512 fft840 = _mm512_mask_mul_ps(fft838, 64764, fft838, _mm512_set1_ps(5e-01f));
// Name the 16 result vectors: df33..df40 come from the first chain,
// df41..df48 from the second.
__m512 df33 = fft759;
__m512 df41 = fft839;
__m512 df34 = fft760;
__m512 df42 = fft840;
__m512 df35 = fft740;
__m512 df43 = fft823;
__m512 df36 = fft741;
__m512 df44 = fft824;
__m512 df37 = fft742;
__m512 df45 = fft825;
__m512 df38 = fft743;
__m512 df46 = fft826;
__m512 df39 = fft744;
__m512 df47 = fft827;
__m512 df40 = fft745;
__m512 df48 = fft828;
// eo5 moves even-indexed elements to lanes 0-7 and odd-indexed elements to
// lanes 8-15. It is applied to every result except df33/df34/df41/df42.
__m512i eo5 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Each result is written with two masked stores: mask 255 writes lanes 0-7,
// mask 65280 writes lanes 8-15 at a base 407008 higher.
// NOTE(review): if dfPtr1 is a byte pointer, lanes 8-15 start 32 bytes into
// the store, so the high half lands exactly 407040 bytes after the low half --
// confirm the pointer type at its declaration.
df35 = _mm512_permutexvar_ps(eo5, df35);
df36 = _mm512_permutexvar_ps(eo5, df36);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df35);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df36);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df35);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df36);
df43 = _mm512_permutexvar_ps(eo5, df43);
df44 = _mm512_permutexvar_ps(eo5, df44);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df43);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df44);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df43);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df44);
df37 = _mm512_permutexvar_ps(eo5, df37);
df38 = _mm512_permutexvar_ps(eo5, df38);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df37);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df38);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df37);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df38);
df45 = _mm512_permutexvar_ps(eo5, df45);
df46 = _mm512_permutexvar_ps(eo5, df46);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df45);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df46);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df45);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df46);
df39 = _mm512_permutexvar_ps(eo5, df39);
df40 = _mm512_permutexvar_ps(eo5, df40);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df39);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df40);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df39);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df40);
df47 = _mm512_permutexvar_ps(eo5, df47);
df48 = _mm512_permutexvar_ps(eo5, df48);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df47);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df48);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df47);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df48);
// df33/df34/df41/df42 already have their final lane order and are stored
// without the eo5 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df33);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df34);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df33);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df34);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df41);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df42);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df41);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df42);
}
}
// This group is done: stop if it was the last one assigned (j2 == last1),
// otherwise move on to the next group.
if (j2 >= last1) return;
++j2;
}
// All three 60-column groups are done; mark the position for the code below.
rel1 = 3;
}
// Set up the tile origin and the k4 range for this pass.
// h3 (row base, taken from base1) and w3 (fixed at 180) enter the load
// addresses below as 896*h3 and 4*w3.
ptrdiff_t h3 = base1+0;
ptrdiff_t w3 = 180;
// k4 covers the three consecutive indices 3*s1 .. 3*s1+2 (kk3 is inclusive).
// NOTE(review): k4 is scaled by 200704 (= 224*224*4) in the load offsets, which
// matches one float32 plane of the 224x224 input declared in the graph --
// presumably k4 is the input-channel index; confirm against the enclosing code.
ptrdiff_t k4 = 3*s1;
ptrdiff_t kk3 = k4+2;
for (; k4 <= kk3; ++k4) {
// Transform four full-width tiles (b6 = 0..3) for the current k4 and write the
// results through dfPtr1. Each iteration loads 13 rows of 16 floats, runs them
// through the add/sub + constant-multiply network below (two interleaved
// chains, fft841.. and fft929..), and stores the 16 result vectors as 8-lane
// halves.
// NOTE(review): the fft*/df* names suggest a forward Fourier transform of the
// data tile -- confirm against the enclosing function.
for (ptrdiff_t b6 = 0; b6 < 4; ++b6) {
// Split the tile index into m6 = b6/2 and f7 = b6%2; both are used only in the
// store addresses (128*m6 + 32*f7).
ptrdiff_t m6 = (size_t)b6/2;
ptrdiff_t f7 = (size_t)b6%2;
// Load 13 consecutive rows (offsets step by 896 per row) with all 16 lanes
// enabled (mask 65535). Successive tiles start 40 further along (40*b6).
// NOTE(review): the constants look like byte offsets (896 = 224*4, 40 = 10
// floats), i.e. datPtr1 would be a byte pointer -- confirm at its declaration.
__m512 dat40 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k4+896*h3+4*w3+40*b6);
__m512 dat41 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k4+896*h3+4*w3+40*b6);
__m512 dat42 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k4+896*h3+4*w3+40*b6);
__m512 dat43 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k4+896*h3+4*w3+40*b6);
__m512 dat44 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k4+896*h3+4*w3+40*b6);
__m512 dat45 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k4+896*h3+4*w3+40*b6);
__m512 dat46 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k4+896*h3+4*w3+40*b6);
__m512 dat47 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k4+896*h3+4*w3+40*b6);
__m512 dat48 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k4+896*h3+4*w3+40*b6);
__m512 dat49 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k4+896*h3+4*w3+40*b6);
__m512 dat50 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k4+896*h3+4*w3+40*b6);
__m512 dat51 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k4+896*h3+4*w3+40*b6);
__m512 dat52 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k4+896*h3+4*w3+40*b6);
// Stage 1: sum and difference of row pairs. dat40..dat44 are paired with the
// row loaded 8 rows later (dat48..dat52); dat45..dat47 are paired with an
// all-zero vector in the "earlier row" position.
// NOTE(review): this is equivalent to three implicit zero rows ahead of dat40
// (presumably top padding of the tile) -- confirm.
__m512 fft841 = _mm512_add_ps(_mm512_setzero_ps(), dat45);
__m512 fft929 = _mm512_add_ps(_mm512_setzero_ps(), dat46);
__m512 fft842 = _mm512_sub_ps(_mm512_setzero_ps(), dat45);
__m512 fft930 = _mm512_sub_ps(_mm512_setzero_ps(), dat46);
__m512 fft843 = _mm512_add_ps(_mm512_setzero_ps(), dat47);
__m512 fft931 = _mm512_add_ps(dat40, dat48);
__m512 fft844 = _mm512_sub_ps(_mm512_setzero_ps(), dat47);
__m512 fft932 = _mm512_sub_ps(dat40, dat48);
__m512 fft845 = _mm512_add_ps(dat41, dat49);
__m512 fft933 = _mm512_add_ps(dat42, dat50);
__m512 fft846 = _mm512_sub_ps(dat41, dat49);
__m512 fft934 = _mm512_sub_ps(dat42, dat50);
__m512 fft847 = _mm512_add_ps(dat43, dat51);
__m512 fft935 = _mm512_add_ps(dat44, dat52);
__m512 fft848 = _mm512_sub_ps(dat43, dat51);
__m512 fft936 = _mm512_sub_ps(dat44, dat52);
// Stage 2: second level of sums/differences on the stage-1 results. Note the
// reversed operand order in fft852/fft940 (later term minus earlier term).
__m512 fft849 = _mm512_add_ps(fft841, fft845);
__m512 fft937 = _mm512_add_ps(fft929, fft933);
__m512 fft850 = _mm512_sub_ps(fft841, fft845);
__m512 fft938 = _mm512_sub_ps(fft929, fft933);
__m512 fft851 = _mm512_add_ps(fft843, fft847);
__m512 fft939 = _mm512_add_ps(fft931, fft935);
__m512 fft852 = _mm512_sub_ps(fft847, fft843);
__m512 fft940 = _mm512_sub_ps(fft935, fft931);
__m512 fft853 = _mm512_sub_ps(fft844, fft848);
__m512 fft941 = _mm512_sub_ps(fft932, fft936);
__m512 fft854 = _mm512_add_ps(fft844, fft848);
__m512 fft942 = _mm512_add_ps(fft932, fft936);
// Stage 3: last vertical level. The fused forms scale one operand by
// 0.70710677 (about 1/sqrt(2)): fmadd = a*b+c, fnmadd = -(a*b)+c,
// fnmsub = -(a*b)-c.
__m512 fft855 = _mm512_add_ps(fft849, fft851);
__m512 fft943 = _mm512_add_ps(fft937, fft939);
__m512 fft856 = _mm512_sub_ps(fft849, fft851);
__m512 fft944 = _mm512_sub_ps(fft937, fft939);
__m512 fft857 = _mm512_fmadd_ps(fft853, _mm512_set1_ps(7.0710677e-01f), fft842);
__m512 fft945 = _mm512_fmadd_ps(fft941, _mm512_set1_ps(7.0710677e-01f), fft930);
__m512 fft858 = _mm512_fnmsub_ps(fft854, _mm512_set1_ps(7.0710677e-01f), fft846);
__m512 fft946 = _mm512_fnmsub_ps(fft942, _mm512_set1_ps(7.0710677e-01f), fft934);
__m512 fft859 = _mm512_fnmadd_ps(fft853, _mm512_set1_ps(7.0710677e-01f), fft842);
__m512 fft947 = _mm512_fnmadd_ps(fft941, _mm512_set1_ps(7.0710677e-01f), fft930);
__m512 fft860 = _mm512_fnmadd_ps(fft854, _mm512_set1_ps(7.0710677e-01f), fft846);
__m512 fft948 = _mm512_fnmadd_ps(fft942, _mm512_set1_ps(7.0710677e-01f), fft934);
// Horizontal level 1 (within each vector): shuffle_f32x4 with imm 78 swaps the
// lower and upper 256-bit halves; fft861 is +1 in lanes 0..7 and -1 in lanes
// 8..15. Result: lanes 0..7 = lo + hi, lanes 8..15 = lo - hi.
__m512 fft861 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft862 = _mm512_fmadd_ps(fft855, fft861, _mm512_shuffle_f32x4(fft855, fft855, 78));
__m512 fft949 = _mm512_fmadd_ps(fft943, fft861, _mm512_shuffle_f32x4(fft943, fft943, 78));
__m512 fft863 = _mm512_fmadd_ps(fft856, fft861, _mm512_shuffle_f32x4(fft856, fft856, 78));
__m512 fft950 = _mm512_fmadd_ps(fft944, fft861, _mm512_shuffle_f32x4(fft944, fft944, 78));
__m512 fft864 = _mm512_fmadd_ps(fft857, fft861, _mm512_shuffle_f32x4(fft857, fft857, 78));
__m512 fft951 = _mm512_fmadd_ps(fft945, fft861, _mm512_shuffle_f32x4(fft945, fft945, 78));
__m512 fft865 = _mm512_fmadd_ps(fft858, fft861, _mm512_shuffle_f32x4(fft858, fft858, 78));
__m512 fft952 = _mm512_fmadd_ps(fft946, fft861, _mm512_shuffle_f32x4(fft946, fft946, 78));
__m512 fft866 = _mm512_fmadd_ps(fft850, fft861, _mm512_shuffle_f32x4(fft850, fft850, 78));
__m512 fft953 = _mm512_fmadd_ps(fft938, fft861, _mm512_shuffle_f32x4(fft938, fft938, 78));
__m512 fft867 = _mm512_fmadd_ps(fft852, fft861, _mm512_shuffle_f32x4(fft852, fft852, 78));
__m512 fft954 = _mm512_fmadd_ps(fft940, fft861, _mm512_shuffle_f32x4(fft940, fft940, 78));
__m512 fft868 = _mm512_fmadd_ps(fft859, fft861, _mm512_shuffle_f32x4(fft859, fft859, 78));
__m512 fft955 = _mm512_fmadd_ps(fft947, fft861, _mm512_shuffle_f32x4(fft947, fft947, 78));
__m512 fft869 = _mm512_fmadd_ps(fft860, fft861, _mm512_shuffle_f32x4(fft860, fft860, 78));
__m512 fft956 = _mm512_fmadd_ps(fft948, fft861, _mm512_shuffle_f32x4(fft948, fft948, 78));
// Per-lane coefficient pass on consecutive vector pairs (x, y):
//   x' = x*fft870 + y*fft879,   y' = y*fft870 - x*fft879.
// In lanes 0..9 the coefficients are (1, 0), so those lanes keep x and y;
// only lanes 10..15 are actually mixed.
__m512 fft870 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft871 = _mm512_mul_ps(fft862, fft870);
__m512 fft957 = _mm512_mul_ps(fft949, fft870);
__m512 fft872 = _mm512_mul_ps(fft863, fft870);
__m512 fft958 = _mm512_mul_ps(fft950, fft870);
__m512 fft873 = _mm512_mul_ps(fft864, fft870);
__m512 fft959 = _mm512_mul_ps(fft951, fft870);
__m512 fft874 = _mm512_mul_ps(fft865, fft870);
__m512 fft960 = _mm512_mul_ps(fft952, fft870);
__m512 fft875 = _mm512_mul_ps(fft866, fft870);
__m512 fft961 = _mm512_mul_ps(fft953, fft870);
__m512 fft876 = _mm512_mul_ps(fft867, fft870);
__m512 fft962 = _mm512_mul_ps(fft954, fft870);
__m512 fft877 = _mm512_mul_ps(fft868, fft870);
__m512 fft963 = _mm512_mul_ps(fft955, fft870);
__m512 fft878 = _mm512_mul_ps(fft869, fft870);
__m512 fft964 = _mm512_mul_ps(fft956, fft870);
__m512 fft879 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft880 = _mm512_fmadd_ps(fft863, fft879, fft871);
__m512 fft965 = _mm512_fmadd_ps(fft950, fft879, fft957);
__m512 fft881 = _mm512_fnmadd_ps(fft862, fft879, fft872);
__m512 fft966 = _mm512_fnmadd_ps(fft949, fft879, fft958);
__m512 fft882 = _mm512_fmadd_ps(fft865, fft879, fft873);
__m512 fft967 = _mm512_fmadd_ps(fft952, fft879, fft959);
__m512 fft883 = _mm512_fnmadd_ps(fft864, fft879, fft874);
__m512 fft968 = _mm512_fnmadd_ps(fft951, fft879, fft960);
__m512 fft884 = _mm512_fmadd_ps(fft867, fft879, fft875);
__m512 fft969 = _mm512_fmadd_ps(fft954, fft879, fft961);
__m512 fft885 = _mm512_fnmadd_ps(fft866, fft879, fft876);
__m512 fft970 = _mm512_fnmadd_ps(fft953, fft879, fft962);
__m512 fft886 = _mm512_fmadd_ps(fft869, fft879, fft877);
__m512 fft971 = _mm512_fmadd_ps(fft956, fft879, fft963);
__m512 fft887 = _mm512_fnmadd_ps(fft868, fft879, fft878);
__m512 fft972 = _mm512_fnmadd_ps(fft955, fft879, fft964);
// Horizontal level 2: shuffle_f32x4 with imm 177 swaps the two 128-bit lanes
// inside each 256-bit half; fft888 is +1 on the first lane of each pair and -1
// on the second, giving sum / difference of adjacent 128-bit lanes.
__m512 fft888 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft889 = _mm512_fmadd_ps(fft880, fft888, _mm512_shuffle_f32x4(fft880, fft880, 177));
__m512 fft973 = _mm512_fmadd_ps(fft965, fft888, _mm512_shuffle_f32x4(fft965, fft965, 177));
__m512 fft890 = _mm512_fmadd_ps(fft881, fft888, _mm512_shuffle_f32x4(fft881, fft881, 177));
__m512 fft974 = _mm512_fmadd_ps(fft966, fft888, _mm512_shuffle_f32x4(fft966, fft966, 177));
__m512 fft891 = _mm512_fmadd_ps(fft882, fft888, _mm512_shuffle_f32x4(fft882, fft882, 177));
__m512 fft975 = _mm512_fmadd_ps(fft967, fft888, _mm512_shuffle_f32x4(fft967, fft967, 177));
__m512 fft892 = _mm512_fmadd_ps(fft883, fft888, _mm512_shuffle_f32x4(fft883, fft883, 177));
__m512 fft976 = _mm512_fmadd_ps(fft968, fft888, _mm512_shuffle_f32x4(fft968, fft968, 177));
__m512 fft893 = _mm512_fmadd_ps(fft884, fft888, _mm512_shuffle_f32x4(fft884, fft884, 177));
__m512 fft977 = _mm512_fmadd_ps(fft969, fft888, _mm512_shuffle_f32x4(fft969, fft969, 177));
__m512 fft894 = _mm512_fmadd_ps(fft885, fft888, _mm512_shuffle_f32x4(fft885, fft885, 177));
__m512 fft978 = _mm512_fmadd_ps(fft970, fft888, _mm512_shuffle_f32x4(fft970, fft970, 177));
__m512 fft895 = _mm512_fmadd_ps(fft886, fft888, _mm512_shuffle_f32x4(fft886, fft886, 177));
__m512 fft979 = _mm512_fmadd_ps(fft971, fft888, _mm512_shuffle_f32x4(fft971, fft971, 177));
__m512 fft896 = _mm512_fmadd_ps(fft887, fft888, _mm512_shuffle_f32x4(fft887, fft887, 177));
__m512 fft980 = _mm512_fmadd_ps(fft972, fft888, _mm512_shuffle_f32x4(fft972, fft972, 177));
// In lanes 6, 7, 14 and 15 (mask 49344 = 0xC0C0) replace each pair (x, y) by
// (y, -x); every other lane keeps (x, y).
__m512 fft897 = _mm512_mask_mov_ps(fft889, 49344, fft890);
__m512 fft981 = _mm512_mask_mov_ps(fft973, 49344, fft974);
__m512 fft898 = _mm512_mask_sub_ps(fft890, 49344, _mm512_setzero_ps(), fft889);
__m512 fft982 = _mm512_mask_sub_ps(fft974, 49344, _mm512_setzero_ps(), fft973);
__m512 fft899 = _mm512_mask_mov_ps(fft891, 49344, fft892);
__m512 fft983 = _mm512_mask_mov_ps(fft975, 49344, fft976);
__m512 fft900 = _mm512_mask_sub_ps(fft892, 49344, _mm512_setzero_ps(), fft891);
__m512 fft984 = _mm512_mask_sub_ps(fft976, 49344, _mm512_setzero_ps(), fft975);
__m512 fft901 = _mm512_mask_mov_ps(fft893, 49344, fft894);
__m512 fft985 = _mm512_mask_mov_ps(fft977, 49344, fft978);
__m512 fft902 = _mm512_mask_sub_ps(fft894, 49344, _mm512_setzero_ps(), fft893);
__m512 fft986 = _mm512_mask_sub_ps(fft978, 49344, _mm512_setzero_ps(), fft977);
__m512 fft903 = _mm512_mask_mov_ps(fft895, 49344, fft896);
__m512 fft987 = _mm512_mask_mov_ps(fft979, 49344, fft980);
__m512 fft904 = _mm512_mask_sub_ps(fft896, 49344, _mm512_setzero_ps(), fft895);
__m512 fft988 = _mm512_mask_sub_ps(fft980, 49344, _mm512_setzero_ps(), fft979);
// Horizontal level 3: shuffle_ps with imm 78 swaps the two 2-float pairs inside
// each 128-bit lane; fft905 is +1,+1,-1,-1 per lane, giving sum / difference of
// adjacent float pairs.
__m512 fft905 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft906 = _mm512_fmadd_ps(fft897, fft905, _mm512_shuffle_ps(fft897, fft897, 78));
__m512 fft989 = _mm512_fmadd_ps(fft981, fft905, _mm512_shuffle_ps(fft981, fft981, 78));
__m512 fft907 = _mm512_fmadd_ps(fft898, fft905, _mm512_shuffle_ps(fft898, fft898, 78));
__m512 fft990 = _mm512_fmadd_ps(fft982, fft905, _mm512_shuffle_ps(fft982, fft982, 78));
__m512 fft908 = _mm512_fmadd_ps(fft899, fft905, _mm512_shuffle_ps(fft899, fft899, 78));
__m512 fft991 = _mm512_fmadd_ps(fft983, fft905, _mm512_shuffle_ps(fft983, fft983, 78));
__m512 fft909 = _mm512_fmadd_ps(fft900, fft905, _mm512_shuffle_ps(fft900, fft900, 78));
__m512 fft992 = _mm512_fmadd_ps(fft984, fft905, _mm512_shuffle_ps(fft984, fft984, 78));
__m512 fft910 = _mm512_fmadd_ps(fft901, fft905, _mm512_shuffle_ps(fft901, fft901, 78));
__m512 fft993 = _mm512_fmadd_ps(fft985, fft905, _mm512_shuffle_ps(fft985, fft985, 78));
__m512 fft911 = _mm512_fmadd_ps(fft902, fft905, _mm512_shuffle_ps(fft902, fft902, 78));
__m512 fft994 = _mm512_fmadd_ps(fft986, fft905, _mm512_shuffle_ps(fft986, fft986, 78));
__m512 fft912 = _mm512_fmadd_ps(fft903, fft905, _mm512_shuffle_ps(fft903, fft903, 78));
__m512 fft995 = _mm512_fmadd_ps(fft987, fft905, _mm512_shuffle_ps(fft987, fft987, 78));
__m512 fft913 = _mm512_fmadd_ps(fft904, fft905, _mm512_shuffle_ps(fft904, fft904, 78));
__m512 fft996 = _mm512_fmadd_ps(fft988, fft905, _mm512_shuffle_ps(fft988, fft988, 78));
// Extra fix-up applied only to the first vector pair of each chain
// (fft906/fft907 and fft989/fft990): gather lanes with two index tables (each
// source lane is duplicated into an adjacent lane pair), combine them with the
// 0/+1/-1 pattern fft920, and blend the pieces with fixed lane masks.
// NOTE(review): presumably this separates results that were packed together in
// the first pair of vectors -- confirm against the generator.
__m512i fft914 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft915 = _mm512_permutexvar_ps(fft914, fft906);
__m512 fft997 = _mm512_permutexvar_ps(fft914, fft989);
__m512i fft916 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft917 = _mm512_permutexvar_ps(fft916, fft906);
__m512 fft998 = _mm512_permutexvar_ps(fft916, fft989);
__m512 fft918 = _mm512_permutexvar_ps(fft914, fft907);
__m512 fft999 = _mm512_permutexvar_ps(fft914, fft990);
__m512 fft919 = _mm512_permutexvar_ps(fft916, fft907);
__m512 fft1000 = _mm512_permutexvar_ps(fft916, fft990);
__m512 fft920 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft921 = _mm512_fmadd_ps(fft915, fft920, fft917);
__m512 fft1001 = _mm512_fmadd_ps(fft997, fft920, fft998);
__m512 fft922 = _mm512_fnmadd_ps(fft919, fft920, fft918);
__m512 fft1002 = _mm512_fnmadd_ps(fft1000, fft920, fft999);
// Blend masks: 21845 = 0x5555 (even lanes), 43176 = 0xA8A8, 22102 = 0x5656.
__m512 fft923 = _mm512_mask_mov_ps(fft919, 21845, fft921);
__m512 fft1003 = _mm512_mask_mov_ps(fft1000, 21845, fft1001);
__m512 fft924 = _mm512_mask_mov_ps(fft915, 43176, fft921);
__m512 fft1004 = _mm512_mask_mov_ps(fft997, 43176, fft1001);
__m512 fft925 = _mm512_mask_mov_ps(fft923, 43176, fft922);
__m512 fft1005 = _mm512_mask_mov_ps(fft1003, 43176, fft1002);
__m512 fft926 = _mm512_mask_mov_ps(fft924, 22102, fft922);
__m512 fft1006 = _mm512_mask_mov_ps(fft1004, 22102, fft1002);
// Halve lanes 2..7 and 10..15 (mask 64764 = 0xFCFC); lanes 0, 1, 8, 9 are kept.
__m512 fft927 = _mm512_mask_mul_ps(fft925, 64764, fft925, _mm512_set1_ps(5e-01f));
__m512 fft1007 = _mm512_mask_mul_ps(fft1005, 64764, fft1005, _mm512_set1_ps(5e-01f));
__m512 fft928 = _mm512_mask_mul_ps(fft926, 64764, fft926, _mm512_set1_ps(5e-01f));
__m512 fft1008 = _mm512_mask_mul_ps(fft1006, 64764, fft1006, _mm512_set1_ps(5e-01f));
// Name the 16 outputs: df49..df56 come from the first chain, df57..df64 from
// the second. df49/df50/df57/df58 are the fixed-up first pairs.
__m512 df49 = fft927;
__m512 df57 = fft1007;
__m512 df50 = fft928;
__m512 df58 = fft1008;
__m512 df51 = fft908;
__m512 df59 = fft991;
__m512 df52 = fft909;
__m512 df60 = fft992;
__m512 df53 = fft910;
__m512 df61 = fft993;
__m512 df54 = fft911;
__m512 df62 = fft994;
__m512 df55 = fft912;
__m512 df63 = fft995;
__m512 df56 = fft913;
__m512 df64 = fft996;
// eo6 moves the even-indexed lanes into lanes 0..7 and the odd-indexed lanes
// into lanes 8..15.
__m512i eo6 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Store pattern for each vector pair: mask 255 writes lanes 0..7 at offset X
// (and X+64 for the second vector); mask 65280 writes lanes 8..15 through an
// address of X+407008.
// NOTE(review): if dfPtr1 is a byte pointer, lanes 8..15 sit 32 bytes past the
// store address, so the upper half lands exactly 407040 after the lower half
// -- confirm dfPtr1's type at its declaration.
df51 = _mm512_permutexvar_ps(eo6, df51);
df52 = _mm512_permutexvar_ps(eo6, df52);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df51);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df52);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df51);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df52);
df59 = _mm512_permutexvar_ps(eo6, df59);
df60 = _mm512_permutexvar_ps(eo6, df60);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df59);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df60);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df59);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df60);
df53 = _mm512_permutexvar_ps(eo6, df53);
df54 = _mm512_permutexvar_ps(eo6, df54);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df53);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df54);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df53);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df54);
df61 = _mm512_permutexvar_ps(eo6, df61);
df62 = _mm512_permutexvar_ps(eo6, df62);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df61);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df62);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df61);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df62);
df55 = _mm512_permutexvar_ps(eo6, df55);
df56 = _mm512_permutexvar_ps(eo6, df56);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df55);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df56);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df55);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df56);
df63 = _mm512_permutexvar_ps(eo6, df63);
df64 = _mm512_permutexvar_ps(eo6, df64);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df63);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df64);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df63);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df64);
// The fixed-up first pairs (df49/df50, df57/df58) are stored without the eo6
// permutation; their lane order was already set by the fix-up step above.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df49);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df50);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df49);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df50);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df57);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df58);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df57);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df58);
}
// Fifth tile for the current k4, handled outside the b6 loop with a fixed
// index of 4, so m7 = 2 and f8 = 0 in the store addresses that follow.
ptrdiff_t b7 = 4;
ptrdiff_t m7 = (size_t)b7/2;
ptrdiff_t f8 = (size_t)b7%2;
// Load the same 13 rows (row step 896), but with mask 127: only lanes 0..6 are
// read and the maskz form zero-fills lanes 7..15. The base constants are 160
// larger than in the b6 loop (2848 vs 2688), i.e. the 40-per-tile step for
// tile 4 is folded into the constant; the `0*b7` term contributes nothing.
// NOTE(review): presumably the tile is clipped to 7 columns at the right edge
// of the 224-wide row -- confirm against datPtr1's base offset.
__m512 dat53 = _mm512_maskz_loadu_ps(127, datPtr1+2848+602112*i6+200704*k4+896*h3+4*w3+0*b7);
__m512 dat54 = _mm512_maskz_loadu_ps(127, datPtr1+3744+602112*i6+200704*k4+896*h3+4*w3+0*b7);
__m512 dat55 = _mm512_maskz_loadu_ps(127, datPtr1+4640+602112*i6+200704*k4+896*h3+4*w3+0*b7);
__m512 dat56 = _mm512_maskz_loadu_ps(127, datPtr1+5536+602112*i6+200704*k4+896*h3+4*w3+0*b7);
__m512 dat57 = _mm512_maskz_loadu_ps(127, datPtr1+6432+602112*i6+200704*k4+896*h3+4*w3+0*b7);
__m512 dat58 = _mm512_maskz_loadu_ps(127, datPtr1+7328+602112*i6+200704*k4+896*h3+4*w3+0*b7);
__m512 dat59 = _mm512_maskz_loadu_ps(127, datPtr1+8224+602112*i6+200704*k4+896*h3+4*w3+0*b7);
__m512 dat60 = _mm512_maskz_loadu_ps(127, datPtr1+9120+602112*i6+200704*k4+896*h3+4*w3+0*b7);
__m512 dat61 = _mm512_maskz_loadu_ps(127, datPtr1+10016+602112*i6+200704*k4+896*h3+4*w3+0*b7);
__m512 dat62 = _mm512_maskz_loadu_ps(127, datPtr1+10912+602112*i6+200704*k4+896*h3+4*w3+0*b7);
__m512 dat63 = _mm512_maskz_loadu_ps(127, datPtr1+11808+602112*i6+200704*k4+896*h3+4*w3+0*b7);
__m512 dat64 = _mm512_maskz_loadu_ps(127, datPtr1+12704+602112*i6+200704*k4+896*h3+4*w3+0*b7);
__m512 dat65 = _mm512_maskz_loadu_ps(127, datPtr1+13600+602112*i6+200704*k4+896*h3+4*w3+0*b7);
__m512 fft1009 = _mm512_add_ps(_mm512_setzero_ps(), dat58);
__m512 fft1097 = _mm512_add_ps(_mm512_setzero_ps(), dat59);
__m512 fft1010 = _mm512_sub_ps(_mm512_setzero_ps(), dat58);
__m512 fft1098 = _mm512_sub_ps(_mm512_setzero_ps(), dat59);
__m512 fft1011 = _mm512_add_ps(_mm512_setzero_ps(), dat60);
__m512 fft1099 = _mm512_add_ps(dat53, dat61);
__m512 fft1012 = _mm512_sub_ps(_mm512_setzero_ps(), dat60);
__m512 fft1100 = _mm512_sub_ps(dat53, dat61);
__m512 fft1013 = _mm512_add_ps(dat54, dat62);
__m512 fft1101 = _mm512_add_ps(dat55, dat63);
__m512 fft1014 = _mm512_sub_ps(dat54, dat62);
__m512 fft1102 = _mm512_sub_ps(dat55, dat63);
__m512 fft1015 = _mm512_add_ps(dat56, dat64);
__m512 fft1103 = _mm512_add_ps(dat57, dat65);
__m512 fft1016 = _mm512_sub_ps(dat56, dat64);
__m512 fft1104 = _mm512_sub_ps(dat57, dat65);
__m512 fft1017 = _mm512_add_ps(fft1009, fft1013);
__m512 fft1105 = _mm512_add_ps(fft1097, fft1101);
__m512 fft1018 = _mm512_sub_ps(fft1009, fft1013);
__m512 fft1106 = _mm512_sub_ps(fft1097, fft1101);
__m512 fft1019 = _mm512_add_ps(fft1011, fft1015);
__m512 fft1107 = _mm512_add_ps(fft1099, fft1103);
__m512 fft1020 = _mm512_sub_ps(fft1015, fft1011);
__m512 fft1108 = _mm512_sub_ps(fft1103, fft1099);
__m512 fft1021 = _mm512_sub_ps(fft1012, fft1016);
__m512 fft1109 = _mm512_sub_ps(fft1100, fft1104);
__m512 fft1022 = _mm512_add_ps(fft1012, fft1016);
__m512 fft1110 = _mm512_add_ps(fft1100, fft1104);
__m512 fft1023 = _mm512_add_ps(fft1017, fft1019);
__m512 fft1111 = _mm512_add_ps(fft1105, fft1107);
__m512 fft1024 = _mm512_sub_ps(fft1017, fft1019);
__m512 fft1112 = _mm512_sub_ps(fft1105, fft1107);
__m512 fft1025 = _mm512_fmadd_ps(fft1021, _mm512_set1_ps(7.0710677e-01f), fft1010);
__m512 fft1113 = _mm512_fmadd_ps(fft1109, _mm512_set1_ps(7.0710677e-01f), fft1098);
__m512 fft1026 = _mm512_fnmsub_ps(fft1022, _mm512_set1_ps(7.0710677e-01f), fft1014);
__m512 fft1114 = _mm512_fnmsub_ps(fft1110, _mm512_set1_ps(7.0710677e-01f), fft1102);
__m512 fft1027 = _mm512_fnmadd_ps(fft1021, _mm512_set1_ps(7.0710677e-01f), fft1010);
__m512 fft1115 = _mm512_fnmadd_ps(fft1109, _mm512_set1_ps(7.0710677e-01f), fft1098);
__m512 fft1028 = _mm512_fnmadd_ps(fft1022, _mm512_set1_ps(7.0710677e-01f), fft1014);
__m512 fft1116 = _mm512_fnmadd_ps(fft1110, _mm512_set1_ps(7.0710677e-01f), fft1102);
__m512 fft1029 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1030 = _mm512_fmadd_ps(fft1023, fft1029, _mm512_shuffle_f32x4(fft1023, fft1023, 78));
__m512 fft1117 = _mm512_fmadd_ps(fft1111, fft1029, _mm512_shuffle_f32x4(fft1111, fft1111, 78));
__m512 fft1031 = _mm512_fmadd_ps(fft1024, fft1029, _mm512_shuffle_f32x4(fft1024, fft1024, 78));
__m512 fft1118 = _mm512_fmadd_ps(fft1112, fft1029, _mm512_shuffle_f32x4(fft1112, fft1112, 78));
__m512 fft1032 = _mm512_fmadd_ps(fft1025, fft1029, _mm512_shuffle_f32x4(fft1025, fft1025, 78));
__m512 fft1119 = _mm512_fmadd_ps(fft1113, fft1029, _mm512_shuffle_f32x4(fft1113, fft1113, 78));
__m512 fft1033 = _mm512_fmadd_ps(fft1026, fft1029, _mm512_shuffle_f32x4(fft1026, fft1026, 78));
__m512 fft1120 = _mm512_fmadd_ps(fft1114, fft1029, _mm512_shuffle_f32x4(fft1114, fft1114, 78));
__m512 fft1034 = _mm512_fmadd_ps(fft1018, fft1029, _mm512_shuffle_f32x4(fft1018, fft1018, 78));
__m512 fft1121 = _mm512_fmadd_ps(fft1106, fft1029, _mm512_shuffle_f32x4(fft1106, fft1106, 78));
__m512 fft1035 = _mm512_fmadd_ps(fft1020, fft1029, _mm512_shuffle_f32x4(fft1020, fft1020, 78));
__m512 fft1122 = _mm512_fmadd_ps(fft1108, fft1029, _mm512_shuffle_f32x4(fft1108, fft1108, 78));
__m512 fft1036 = _mm512_fmadd_ps(fft1027, fft1029, _mm512_shuffle_f32x4(fft1027, fft1027, 78));
__m512 fft1123 = _mm512_fmadd_ps(fft1115, fft1029, _mm512_shuffle_f32x4(fft1115, fft1115, 78));
__m512 fft1037 = _mm512_fmadd_ps(fft1028, fft1029, _mm512_shuffle_f32x4(fft1028, fft1028, 78));
__m512 fft1124 = _mm512_fmadd_ps(fft1116, fft1029, _mm512_shuffle_f32x4(fft1116, fft1116, 78));
__m512 fft1038 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1039 = _mm512_mul_ps(fft1030, fft1038);
__m512 fft1125 = _mm512_mul_ps(fft1117, fft1038);
__m512 fft1040 = _mm512_mul_ps(fft1031, fft1038);
__m512 fft1126 = _mm512_mul_ps(fft1118, fft1038);
__m512 fft1041 = _mm512_mul_ps(fft1032, fft1038);
__m512 fft1127 = _mm512_mul_ps(fft1119, fft1038);
__m512 fft1042 = _mm512_mul_ps(fft1033, fft1038);
__m512 fft1128 = _mm512_mul_ps(fft1120, fft1038);
__m512 fft1043 = _mm512_mul_ps(fft1034, fft1038);
__m512 fft1129 = _mm512_mul_ps(fft1121, fft1038);
__m512 fft1044 = _mm512_mul_ps(fft1035, fft1038);
__m512 fft1130 = _mm512_mul_ps(fft1122, fft1038);
__m512 fft1045 = _mm512_mul_ps(fft1036, fft1038);
__m512 fft1131 = _mm512_mul_ps(fft1123, fft1038);
__m512 fft1046 = _mm512_mul_ps(fft1037, fft1038);
__m512 fft1132 = _mm512_mul_ps(fft1124, fft1038);
__m512 fft1047 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1048 = _mm512_fmadd_ps(fft1031, fft1047, fft1039);
__m512 fft1133 = _mm512_fmadd_ps(fft1118, fft1047, fft1125);
__m512 fft1049 = _mm512_fnmadd_ps(fft1030, fft1047, fft1040);
__m512 fft1134 = _mm512_fnmadd_ps(fft1117, fft1047, fft1126);
__m512 fft1050 = _mm512_fmadd_ps(fft1033, fft1047, fft1041);
__m512 fft1135 = _mm512_fmadd_ps(fft1120, fft1047, fft1127);
__m512 fft1051 = _mm512_fnmadd_ps(fft1032, fft1047, fft1042);
__m512 fft1136 = _mm512_fnmadd_ps(fft1119, fft1047, fft1128);
__m512 fft1052 = _mm512_fmadd_ps(fft1035, fft1047, fft1043);
__m512 fft1137 = _mm512_fmadd_ps(fft1122, fft1047, fft1129);
__m512 fft1053 = _mm512_fnmadd_ps(fft1034, fft1047, fft1044);
__m512 fft1138 = _mm512_fnmadd_ps(fft1121, fft1047, fft1130);
__m512 fft1054 = _mm512_fmadd_ps(fft1037, fft1047, fft1045);
__m512 fft1139 = _mm512_fmadd_ps(fft1124, fft1047, fft1131);
__m512 fft1055 = _mm512_fnmadd_ps(fft1036, fft1047, fft1046);
__m512 fft1140 = _mm512_fnmadd_ps(fft1123, fft1047, fft1132);
__m512 fft1056 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1057 = _mm512_fmadd_ps(fft1048, fft1056, _mm512_shuffle_f32x4(fft1048, fft1048, 177));
__m512 fft1141 = _mm512_fmadd_ps(fft1133, fft1056, _mm512_shuffle_f32x4(fft1133, fft1133, 177));
__m512 fft1058 = _mm512_fmadd_ps(fft1049, fft1056, _mm512_shuffle_f32x4(fft1049, fft1049, 177));
__m512 fft1142 = _mm512_fmadd_ps(fft1134, fft1056, _mm512_shuffle_f32x4(fft1134, fft1134, 177));
__m512 fft1059 = _mm512_fmadd_ps(fft1050, fft1056, _mm512_shuffle_f32x4(fft1050, fft1050, 177));
__m512 fft1143 = _mm512_fmadd_ps(fft1135, fft1056, _mm512_shuffle_f32x4(fft1135, fft1135, 177));
__m512 fft1060 = _mm512_fmadd_ps(fft1051, fft1056, _mm512_shuffle_f32x4(fft1051, fft1051, 177));
__m512 fft1144 = _mm512_fmadd_ps(fft1136, fft1056, _mm512_shuffle_f32x4(fft1136, fft1136, 177));
__m512 fft1061 = _mm512_fmadd_ps(fft1052, fft1056, _mm512_shuffle_f32x4(fft1052, fft1052, 177));
__m512 fft1145 = _mm512_fmadd_ps(fft1137, fft1056, _mm512_shuffle_f32x4(fft1137, fft1137, 177));
__m512 fft1062 = _mm512_fmadd_ps(fft1053, fft1056, _mm512_shuffle_f32x4(fft1053, fft1053, 177));
__m512 fft1146 = _mm512_fmadd_ps(fft1138, fft1056, _mm512_shuffle_f32x4(fft1138, fft1138, 177));
__m512 fft1063 = _mm512_fmadd_ps(fft1054, fft1056, _mm512_shuffle_f32x4(fft1054, fft1054, 177));
__m512 fft1147 = _mm512_fmadd_ps(fft1139, fft1056, _mm512_shuffle_f32x4(fft1139, fft1139, 177));
__m512 fft1064 = _mm512_fmadd_ps(fft1055, fft1056, _mm512_shuffle_f32x4(fft1055, fft1055, 177));
__m512 fft1148 = _mm512_fmadd_ps(fft1140, fft1056, _mm512_shuffle_f32x4(fft1140, fft1140, 177));
__m512 fft1065 = _mm512_mask_mov_ps(fft1057, 49344, fft1058);
__m512 fft1149 = _mm512_mask_mov_ps(fft1141, 49344, fft1142);
__m512 fft1066 = _mm512_mask_sub_ps(fft1058, 49344, _mm512_setzero_ps(), fft1057);
__m512 fft1150 = _mm512_mask_sub_ps(fft1142, 49344, _mm512_setzero_ps(), fft1141);
__m512 fft1067 = _mm512_mask_mov_ps(fft1059, 49344, fft1060);
__m512 fft1151 = _mm512_mask_mov_ps(fft1143, 49344, fft1144);
__m512 fft1068 = _mm512_mask_sub_ps(fft1060, 49344, _mm512_setzero_ps(), fft1059);
__m512 fft1152 = _mm512_mask_sub_ps(fft1144, 49344, _mm512_setzero_ps(), fft1143);
__m512 fft1069 = _mm512_mask_mov_ps(fft1061, 49344, fft1062);
__m512 fft1153 = _mm512_mask_mov_ps(fft1145, 49344, fft1146);
__m512 fft1070 = _mm512_mask_sub_ps(fft1062, 49344, _mm512_setzero_ps(), fft1061);
__m512 fft1154 = _mm512_mask_sub_ps(fft1146, 49344, _mm512_setzero_ps(), fft1145);
__m512 fft1071 = _mm512_mask_mov_ps(fft1063, 49344, fft1064);
__m512 fft1155 = _mm512_mask_mov_ps(fft1147, 49344, fft1148);
__m512 fft1072 = _mm512_mask_sub_ps(fft1064, 49344, _mm512_setzero_ps(), fft1063);
__m512 fft1156 = _mm512_mask_sub_ps(fft1148, 49344, _mm512_setzero_ps(), fft1147);
__m512 fft1073 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1074 = _mm512_fmadd_ps(fft1065, fft1073, _mm512_shuffle_ps(fft1065, fft1065, 78));
__m512 fft1157 = _mm512_fmadd_ps(fft1149, fft1073, _mm512_shuffle_ps(fft1149, fft1149, 78));
__m512 fft1075 = _mm512_fmadd_ps(fft1066, fft1073, _mm512_shuffle_ps(fft1066, fft1066, 78));
__m512 fft1158 = _mm512_fmadd_ps(fft1150, fft1073, _mm512_shuffle_ps(fft1150, fft1150, 78));
__m512 fft1076 = _mm512_fmadd_ps(fft1067, fft1073, _mm512_shuffle_ps(fft1067, fft1067, 78));
__m512 fft1159 = _mm512_fmadd_ps(fft1151, fft1073, _mm512_shuffle_ps(fft1151, fft1151, 78));
__m512 fft1077 = _mm512_fmadd_ps(fft1068, fft1073, _mm512_shuffle_ps(fft1068, fft1068, 78));
__m512 fft1160 = _mm512_fmadd_ps(fft1152, fft1073, _mm512_shuffle_ps(fft1152, fft1152, 78));
__m512 fft1078 = _mm512_fmadd_ps(fft1069, fft1073, _mm512_shuffle_ps(fft1069, fft1069, 78));
__m512 fft1161 = _mm512_fmadd_ps(fft1153, fft1073, _mm512_shuffle_ps(fft1153, fft1153, 78));
__m512 fft1079 = _mm512_fmadd_ps(fft1070, fft1073, _mm512_shuffle_ps(fft1070, fft1070, 78));
__m512 fft1162 = _mm512_fmadd_ps(fft1154, fft1073, _mm512_shuffle_ps(fft1154, fft1154, 78));
__m512 fft1080 = _mm512_fmadd_ps(fft1071, fft1073, _mm512_shuffle_ps(fft1071, fft1071, 78));
__m512 fft1163 = _mm512_fmadd_ps(fft1155, fft1073, _mm512_shuffle_ps(fft1155, fft1155, 78));
__m512 fft1081 = _mm512_fmadd_ps(fft1072, fft1073, _mm512_shuffle_ps(fft1072, fft1072, 78));
__m512 fft1164 = _mm512_fmadd_ps(fft1156, fft1073, _mm512_shuffle_ps(fft1156, fft1156, 78));
__m512i fft1082 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1083 = _mm512_permutexvar_ps(fft1082, fft1074);
__m512 fft1165 = _mm512_permutexvar_ps(fft1082, fft1157);
__m512i fft1084 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1085 = _mm512_permutexvar_ps(fft1084, fft1074);
__m512 fft1166 = _mm512_permutexvar_ps(fft1084, fft1157);
__m512 fft1086 = _mm512_permutexvar_ps(fft1082, fft1075);
__m512 fft1167 = _mm512_permutexvar_ps(fft1082, fft1158);
__m512 fft1087 = _mm512_permutexvar_ps(fft1084, fft1075);
__m512 fft1168 = _mm512_permutexvar_ps(fft1084, fft1158);
__m512 fft1088 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1089 = _mm512_fmadd_ps(fft1083, fft1088, fft1085);
__m512 fft1169 = _mm512_fmadd_ps(fft1165, fft1088, fft1166);
__m512 fft1090 = _mm512_fnmadd_ps(fft1087, fft1088, fft1086);
__m512 fft1170 = _mm512_fnmadd_ps(fft1168, fft1088, fft1167);
__m512 fft1091 = _mm512_mask_mov_ps(fft1087, 21845, fft1089);
__m512 fft1171 = _mm512_mask_mov_ps(fft1168, 21845, fft1169);
__m512 fft1092 = _mm512_mask_mov_ps(fft1083, 43176, fft1089);
__m512 fft1172 = _mm512_mask_mov_ps(fft1165, 43176, fft1169);
__m512 fft1093 = _mm512_mask_mov_ps(fft1091, 43176, fft1090);
__m512 fft1173 = _mm512_mask_mov_ps(fft1171, 43176, fft1170);
__m512 fft1094 = _mm512_mask_mov_ps(fft1092, 22102, fft1090);
__m512 fft1174 = _mm512_mask_mov_ps(fft1172, 22102, fft1170);
__m512 fft1095 = _mm512_mask_mul_ps(fft1093, 64764, fft1093, _mm512_set1_ps(5e-01f));
__m512 fft1175 = _mm512_mask_mul_ps(fft1173, 64764, fft1173, _mm512_set1_ps(5e-01f));
__m512 fft1096 = _mm512_mask_mul_ps(fft1094, 64764, fft1094, _mm512_set1_ps(5e-01f));
__m512 fft1176 = _mm512_mask_mul_ps(fft1174, 64764, fft1174, _mm512_set1_ps(5e-01f));
__m512 df65 = fft1095;
__m512 df73 = fft1175;
__m512 df66 = fft1096;
__m512 df74 = fft1176;
__m512 df67 = fft1076;
__m512 df75 = fft1159;
__m512 df68 = fft1077;
__m512 df76 = fft1160;
__m512 df69 = fft1078;
__m512 df77 = fft1161;
__m512 df70 = fft1079;
__m512 df78 = fft1162;
__m512 df71 = fft1080;
__m512 df79 = fft1163;
__m512 df72 = fft1081;
__m512 df80 = fft1164;
__m512i eo7 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df67 = _mm512_permutexvar_ps(eo7, df67);
df68 = _mm512_permutexvar_ps(eo7, df68);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df67);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df68);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df67);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df68);
df75 = _mm512_permutexvar_ps(eo7, df75);
df76 = _mm512_permutexvar_ps(eo7, df76);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df75);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df76);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df75);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df76);
df69 = _mm512_permutexvar_ps(eo7, df69);
df70 = _mm512_permutexvar_ps(eo7, df70);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df69);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df70);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df69);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df70);
df77 = _mm512_permutexvar_ps(eo7, df77);
df78 = _mm512_permutexvar_ps(eo7, df78);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df77);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df78);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df77);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df78);
df71 = _mm512_permutexvar_ps(eo7, df71);
df72 = _mm512_permutexvar_ps(eo7, df72);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df71);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df72);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df71);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df72);
df79 = _mm512_permutexvar_ps(eo7, df79);
df80 = _mm512_permutexvar_ps(eo7, df80);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df79);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df80);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df79);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df80);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df65);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df66);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df65);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df66);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df73);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df74);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df73);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df74);
// Tile with b8 fixed at 5, which makes m8 == 2 and f9 == 1 in the store
// addresses at the end of this run.
// NOTE(review): judging by the fft* names this is presumably a 2-D real FFT of
// one input tile - not verified from this part of the file.
ptrdiff_t b8 = 5;
ptrdiff_t m8 = (size_t)b8/2;
ptrdiff_t f9 = (size_t)b8%2;
// Sixteen zero-masked loads whose constant offsets step by 896 (8240, 9136, ...).
// Mask 65528 (0xFFF8) reads lanes 3..15 and zeroes lanes 0..2.
// The 0*b8 term contributes nothing to the address.
__m512 dat66 = _mm512_maskz_loadu_ps(65528, datPtr1+8240+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat67 = _mm512_maskz_loadu_ps(65528, datPtr1+9136+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat68 = _mm512_maskz_loadu_ps(65528, datPtr1+10032+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat69 = _mm512_maskz_loadu_ps(65528, datPtr1+10928+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat70 = _mm512_maskz_loadu_ps(65528, datPtr1+11824+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat71 = _mm512_maskz_loadu_ps(65528, datPtr1+12720+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat72 = _mm512_maskz_loadu_ps(65528, datPtr1+13616+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat73 = _mm512_maskz_loadu_ps(65528, datPtr1+14512+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat74 = _mm512_maskz_loadu_ps(65528, datPtr1+15408+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat75 = _mm512_maskz_loadu_ps(65528, datPtr1+16304+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat76 = _mm512_maskz_loadu_ps(65528, datPtr1+17200+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat77 = _mm512_maskz_loadu_ps(65528, datPtr1+18096+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat78 = _mm512_maskz_loadu_ps(65528, datPtr1+18992+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat79 = _mm512_maskz_loadu_ps(65528, datPtr1+19888+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat80 = _mm512_maskz_loadu_ps(65528, datPtr1+20784+602112*i6+200704*k4+896*h3+4*w3+0*b8);
__m512 dat81 = _mm512_maskz_loadu_ps(65528, datPtr1+21680+602112*i6+200704*k4+896*h3+4*w3+0*b8);
// First add/sub layer: load n is paired with load n+8.
// Even-numbered loads (dat66, dat68, ...) feed the chain starting at fft1177;
// odd-numbered loads (dat67, dat69, ...) feed the chain starting at fft1265.
// From here on the two chains are interleaved line by line and apply the
// same operation on each pair of lines.
__m512 fft1177 = _mm512_add_ps(dat66, dat74);
__m512 fft1265 = _mm512_add_ps(dat67, dat75);
__m512 fft1178 = _mm512_sub_ps(dat66, dat74);
__m512 fft1266 = _mm512_sub_ps(dat67, dat75);
__m512 fft1179 = _mm512_add_ps(dat68, dat76);
__m512 fft1267 = _mm512_add_ps(dat69, dat77);
__m512 fft1180 = _mm512_sub_ps(dat68, dat76);
__m512 fft1268 = _mm512_sub_ps(dat69, dat77);
__m512 fft1181 = _mm512_add_ps(dat70, dat78);
__m512 fft1269 = _mm512_add_ps(dat71, dat79);
__m512 fft1182 = _mm512_sub_ps(dat70, dat78);
__m512 fft1270 = _mm512_sub_ps(dat71, dat79);
__m512 fft1183 = _mm512_add_ps(dat72, dat80);
__m512 fft1271 = _mm512_add_ps(dat73, dat81);
__m512 fft1184 = _mm512_sub_ps(dat72, dat80);
__m512 fft1272 = _mm512_sub_ps(dat73, dat81);
// Second add/sub layer on the results above (note the swapped operand order
// in fft1188 / fft1276).
__m512 fft1185 = _mm512_add_ps(fft1177, fft1181);
__m512 fft1273 = _mm512_add_ps(fft1265, fft1269);
__m512 fft1186 = _mm512_sub_ps(fft1177, fft1181);
__m512 fft1274 = _mm512_sub_ps(fft1265, fft1269);
__m512 fft1187 = _mm512_add_ps(fft1179, fft1183);
__m512 fft1275 = _mm512_add_ps(fft1267, fft1271);
__m512 fft1188 = _mm512_sub_ps(fft1183, fft1179);
__m512 fft1276 = _mm512_sub_ps(fft1271, fft1267);
__m512 fft1189 = _mm512_sub_ps(fft1180, fft1184);
__m512 fft1277 = _mm512_sub_ps(fft1268, fft1272);
__m512 fft1190 = _mm512_add_ps(fft1180, fft1184);
__m512 fft1278 = _mm512_add_ps(fft1268, fft1272);
// Third layer. 7.0710677e-01f is the float nearest 1/sqrt(2).
// fmadd(a,b,c) = a*b+c, fnmadd(a,b,c) = -(a*b)+c, fnmsub(a,b,c) = -(a*b)-c.
__m512 fft1191 = _mm512_add_ps(fft1185, fft1187);
__m512 fft1279 = _mm512_add_ps(fft1273, fft1275);
__m512 fft1192 = _mm512_sub_ps(fft1185, fft1187);
__m512 fft1280 = _mm512_sub_ps(fft1273, fft1275);
__m512 fft1193 = _mm512_fmadd_ps(fft1189, _mm512_set1_ps(7.0710677e-01f), fft1178);
__m512 fft1281 = _mm512_fmadd_ps(fft1277, _mm512_set1_ps(7.0710677e-01f), fft1266);
__m512 fft1194 = _mm512_fnmsub_ps(fft1190, _mm512_set1_ps(7.0710677e-01f), fft1182);
__m512 fft1282 = _mm512_fnmsub_ps(fft1278, _mm512_set1_ps(7.0710677e-01f), fft1270);
__m512 fft1195 = _mm512_fnmadd_ps(fft1189, _mm512_set1_ps(7.0710677e-01f), fft1178);
__m512 fft1283 = _mm512_fnmadd_ps(fft1277, _mm512_set1_ps(7.0710677e-01f), fft1266);
__m512 fft1196 = _mm512_fnmadd_ps(fft1190, _mm512_set1_ps(7.0710677e-01f), fft1182);
__m512 fft1284 = _mm512_fnmadd_ps(fft1278, _mm512_set1_ps(7.0710677e-01f), fft1270);
// Lane-wise step at 8-lane distance. _mm512_set_ps lists lane 15 first, so
// fft1197 is +1 in lanes 0..7 and -1 in lanes 8..15. Immediate 78 of
// _mm512_shuffle_f32x4 swaps the two 256-bit halves, so every result holds
// (low half + high half) in lanes 0..7 and (low half - high half) in lanes 8..15.
__m512 fft1197 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1198 = _mm512_fmadd_ps(fft1191, fft1197, _mm512_shuffle_f32x4(fft1191, fft1191, 78));
__m512 fft1285 = _mm512_fmadd_ps(fft1279, fft1197, _mm512_shuffle_f32x4(fft1279, fft1279, 78));
__m512 fft1199 = _mm512_fmadd_ps(fft1192, fft1197, _mm512_shuffle_f32x4(fft1192, fft1192, 78));
__m512 fft1286 = _mm512_fmadd_ps(fft1280, fft1197, _mm512_shuffle_f32x4(fft1280, fft1280, 78));
__m512 fft1200 = _mm512_fmadd_ps(fft1193, fft1197, _mm512_shuffle_f32x4(fft1193, fft1193, 78));
__m512 fft1287 = _mm512_fmadd_ps(fft1281, fft1197, _mm512_shuffle_f32x4(fft1281, fft1281, 78));
__m512 fft1201 = _mm512_fmadd_ps(fft1194, fft1197, _mm512_shuffle_f32x4(fft1194, fft1194, 78));
__m512 fft1288 = _mm512_fmadd_ps(fft1282, fft1197, _mm512_shuffle_f32x4(fft1282, fft1282, 78));
__m512 fft1202 = _mm512_fmadd_ps(fft1186, fft1197, _mm512_shuffle_f32x4(fft1186, fft1186, 78));
__m512 fft1289 = _mm512_fmadd_ps(fft1274, fft1197, _mm512_shuffle_f32x4(fft1274, fft1274, 78));
__m512 fft1203 = _mm512_fmadd_ps(fft1188, fft1197, _mm512_shuffle_f32x4(fft1188, fft1188, 78));
__m512 fft1290 = _mm512_fmadd_ps(fft1276, fft1197, _mm512_shuffle_f32x4(fft1276, fft1276, 78));
__m512 fft1204 = _mm512_fmadd_ps(fft1195, fft1197, _mm512_shuffle_f32x4(fft1195, fft1195, 78));
__m512 fft1291 = _mm512_fmadd_ps(fft1283, fft1197, _mm512_shuffle_f32x4(fft1283, fft1283, 78));
__m512 fft1205 = _mm512_fmadd_ps(fft1196, fft1197, _mm512_shuffle_f32x4(fft1196, fft1196, 78));
__m512 fft1292 = _mm512_fmadd_ps(fft1284, fft1197, _mm512_shuffle_f32x4(fft1284, fft1284, 78));
// Per-lane weighting of consecutive register pairs (x, y), e.g.
// (fft1198, fft1199): the products below and the fmadd/fnmadd group after
// them yield x*fft1206 + y*fft1215 and y*fft1206 - x*fft1215.
// fft1206 is 1 and fft1215 is 0 in lanes 0..9; only lanes 10..15 carry other
// weights.
__m512 fft1206 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1207 = _mm512_mul_ps(fft1198, fft1206);
__m512 fft1293 = _mm512_mul_ps(fft1285, fft1206);
__m512 fft1208 = _mm512_mul_ps(fft1199, fft1206);
__m512 fft1294 = _mm512_mul_ps(fft1286, fft1206);
__m512 fft1209 = _mm512_mul_ps(fft1200, fft1206);
__m512 fft1295 = _mm512_mul_ps(fft1287, fft1206);
__m512 fft1210 = _mm512_mul_ps(fft1201, fft1206);
__m512 fft1296 = _mm512_mul_ps(fft1288, fft1206);
__m512 fft1211 = _mm512_mul_ps(fft1202, fft1206);
__m512 fft1297 = _mm512_mul_ps(fft1289, fft1206);
__m512 fft1212 = _mm512_mul_ps(fft1203, fft1206);
__m512 fft1298 = _mm512_mul_ps(fft1290, fft1206);
__m512 fft1213 = _mm512_mul_ps(fft1204, fft1206);
__m512 fft1299 = _mm512_mul_ps(fft1291, fft1206);
__m512 fft1214 = _mm512_mul_ps(fft1205, fft1206);
__m512 fft1300 = _mm512_mul_ps(fft1292, fft1206);
__m512 fft1215 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1216 = _mm512_fmadd_ps(fft1199, fft1215, fft1207);
__m512 fft1301 = _mm512_fmadd_ps(fft1286, fft1215, fft1293);
__m512 fft1217 = _mm512_fnmadd_ps(fft1198, fft1215, fft1208);
__m512 fft1302 = _mm512_fnmadd_ps(fft1285, fft1215, fft1294);
__m512 fft1218 = _mm512_fmadd_ps(fft1201, fft1215, fft1209);
__m512 fft1303 = _mm512_fmadd_ps(fft1288, fft1215, fft1295);
__m512 fft1219 = _mm512_fnmadd_ps(fft1200, fft1215, fft1210);
__m512 fft1304 = _mm512_fnmadd_ps(fft1287, fft1215, fft1296);
__m512 fft1220 = _mm512_fmadd_ps(fft1203, fft1215, fft1211);
__m512 fft1305 = _mm512_fmadd_ps(fft1290, fft1215, fft1297);
__m512 fft1221 = _mm512_fnmadd_ps(fft1202, fft1215, fft1212);
__m512 fft1306 = _mm512_fnmadd_ps(fft1289, fft1215, fft1298);
__m512 fft1222 = _mm512_fmadd_ps(fft1205, fft1215, fft1213);
__m512 fft1307 = _mm512_fmadd_ps(fft1292, fft1215, fft1299);
__m512 fft1223 = _mm512_fnmadd_ps(fft1204, fft1215, fft1214);
__m512 fft1308 = _mm512_fnmadd_ps(fft1291, fft1215, fft1300);
// Same sum/difference step at 4-lane distance: fft1224 is +1 in lanes 0..3
// and 8..11, -1 in lanes 4..7 and 12..15, and immediate 177 swaps the two
// 128-bit blocks inside each 256-bit half.
__m512 fft1224 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1225 = _mm512_fmadd_ps(fft1216, fft1224, _mm512_shuffle_f32x4(fft1216, fft1216, 177));
__m512 fft1309 = _mm512_fmadd_ps(fft1301, fft1224, _mm512_shuffle_f32x4(fft1301, fft1301, 177));
__m512 fft1226 = _mm512_fmadd_ps(fft1217, fft1224, _mm512_shuffle_f32x4(fft1217, fft1217, 177));
__m512 fft1310 = _mm512_fmadd_ps(fft1302, fft1224, _mm512_shuffle_f32x4(fft1302, fft1302, 177));
__m512 fft1227 = _mm512_fmadd_ps(fft1218, fft1224, _mm512_shuffle_f32x4(fft1218, fft1218, 177));
__m512 fft1311 = _mm512_fmadd_ps(fft1303, fft1224, _mm512_shuffle_f32x4(fft1303, fft1303, 177));
__m512 fft1228 = _mm512_fmadd_ps(fft1219, fft1224, _mm512_shuffle_f32x4(fft1219, fft1219, 177));
__m512 fft1312 = _mm512_fmadd_ps(fft1304, fft1224, _mm512_shuffle_f32x4(fft1304, fft1304, 177));
__m512 fft1229 = _mm512_fmadd_ps(fft1220, fft1224, _mm512_shuffle_f32x4(fft1220, fft1220, 177));
__m512 fft1313 = _mm512_fmadd_ps(fft1305, fft1224, _mm512_shuffle_f32x4(fft1305, fft1305, 177));
__m512 fft1230 = _mm512_fmadd_ps(fft1221, fft1224, _mm512_shuffle_f32x4(fft1221, fft1221, 177));
__m512 fft1314 = _mm512_fmadd_ps(fft1306, fft1224, _mm512_shuffle_f32x4(fft1306, fft1306, 177));
__m512 fft1231 = _mm512_fmadd_ps(fft1222, fft1224, _mm512_shuffle_f32x4(fft1222, fft1222, 177));
__m512 fft1315 = _mm512_fmadd_ps(fft1307, fft1224, _mm512_shuffle_f32x4(fft1307, fft1307, 177));
__m512 fft1232 = _mm512_fmadd_ps(fft1223, fft1224, _mm512_shuffle_f32x4(fft1223, fft1223, 177));
__m512 fft1316 = _mm512_fmadd_ps(fft1308, fft1224, _mm512_shuffle_f32x4(fft1308, fft1308, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes each
// register pair (a, b) becomes (b, -a); all other lanes keep (a, b).
__m512 fft1233 = _mm512_mask_mov_ps(fft1225, 49344, fft1226);
__m512 fft1317 = _mm512_mask_mov_ps(fft1309, 49344, fft1310);
__m512 fft1234 = _mm512_mask_sub_ps(fft1226, 49344, _mm512_setzero_ps(), fft1225);
__m512 fft1318 = _mm512_mask_sub_ps(fft1310, 49344, _mm512_setzero_ps(), fft1309);
__m512 fft1235 = _mm512_mask_mov_ps(fft1227, 49344, fft1228);
__m512 fft1319 = _mm512_mask_mov_ps(fft1311, 49344, fft1312);
__m512 fft1236 = _mm512_mask_sub_ps(fft1228, 49344, _mm512_setzero_ps(), fft1227);
__m512 fft1320 = _mm512_mask_sub_ps(fft1312, 49344, _mm512_setzero_ps(), fft1311);
__m512 fft1237 = _mm512_mask_mov_ps(fft1229, 49344, fft1230);
__m512 fft1321 = _mm512_mask_mov_ps(fft1313, 49344, fft1314);
__m512 fft1238 = _mm512_mask_sub_ps(fft1230, 49344, _mm512_setzero_ps(), fft1229);
__m512 fft1322 = _mm512_mask_sub_ps(fft1314, 49344, _mm512_setzero_ps(), fft1313);
__m512 fft1239 = _mm512_mask_mov_ps(fft1231, 49344, fft1232);
__m512 fft1323 = _mm512_mask_mov_ps(fft1315, 49344, fft1316);
__m512 fft1240 = _mm512_mask_sub_ps(fft1232, 49344, _mm512_setzero_ps(), fft1231);
__m512 fft1324 = _mm512_mask_sub_ps(fft1316, 49344, _mm512_setzero_ps(), fft1315);
// Sum/difference step at 2-lane distance: fft1241 is +1 in lanes 4n, 4n+1 and
// -1 in lanes 4n+2, 4n+3; _mm512_shuffle_ps(x, x, 78) swaps the two 64-bit
// halves of every 128-bit block.
__m512 fft1241 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1242 = _mm512_fmadd_ps(fft1233, fft1241, _mm512_shuffle_ps(fft1233, fft1233, 78));
__m512 fft1325 = _mm512_fmadd_ps(fft1317, fft1241, _mm512_shuffle_ps(fft1317, fft1317, 78));
__m512 fft1243 = _mm512_fmadd_ps(fft1234, fft1241, _mm512_shuffle_ps(fft1234, fft1234, 78));
__m512 fft1326 = _mm512_fmadd_ps(fft1318, fft1241, _mm512_shuffle_ps(fft1318, fft1318, 78));
__m512 fft1244 = _mm512_fmadd_ps(fft1235, fft1241, _mm512_shuffle_ps(fft1235, fft1235, 78));
__m512 fft1327 = _mm512_fmadd_ps(fft1319, fft1241, _mm512_shuffle_ps(fft1319, fft1319, 78));
__m512 fft1245 = _mm512_fmadd_ps(fft1236, fft1241, _mm512_shuffle_ps(fft1236, fft1236, 78));
__m512 fft1328 = _mm512_fmadd_ps(fft1320, fft1241, _mm512_shuffle_ps(fft1320, fft1320, 78));
__m512 fft1246 = _mm512_fmadd_ps(fft1237, fft1241, _mm512_shuffle_ps(fft1237, fft1237, 78));
__m512 fft1329 = _mm512_fmadd_ps(fft1321, fft1241, _mm512_shuffle_ps(fft1321, fft1321, 78));
__m512 fft1247 = _mm512_fmadd_ps(fft1238, fft1241, _mm512_shuffle_ps(fft1238, fft1238, 78));
__m512 fft1330 = _mm512_fmadd_ps(fft1322, fft1241, _mm512_shuffle_ps(fft1322, fft1322, 78));
__m512 fft1248 = _mm512_fmadd_ps(fft1239, fft1241, _mm512_shuffle_ps(fft1239, fft1239, 78));
__m512 fft1331 = _mm512_fmadd_ps(fft1323, fft1241, _mm512_shuffle_ps(fft1323, fft1323, 78));
__m512 fft1249 = _mm512_fmadd_ps(fft1240, fft1241, _mm512_shuffle_ps(fft1240, fft1240, 78));
__m512 fft1332 = _mm512_fmadd_ps(fft1324, fft1241, _mm512_shuffle_ps(fft1324, fft1324, 78));
// Only the first register pair of each chain (fft1242/fft1243 and
// fft1325/fft1326) goes through the permute, recombine and blend steps below;
// the remaining six registers per chain are stored as computed above.
__m512i fft1250 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1251 = _mm512_permutexvar_ps(fft1250, fft1242);
__m512 fft1333 = _mm512_permutexvar_ps(fft1250, fft1325);
__m512i fft1252 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1253 = _mm512_permutexvar_ps(fft1252, fft1242);
__m512 fft1334 = _mm512_permutexvar_ps(fft1252, fft1325);
__m512 fft1254 = _mm512_permutexvar_ps(fft1250, fft1243);
__m512 fft1335 = _mm512_permutexvar_ps(fft1250, fft1326);
__m512 fft1255 = _mm512_permutexvar_ps(fft1252, fft1243);
__m512 fft1336 = _mm512_permutexvar_ps(fft1252, fft1326);
// fft1256 is 0 in lanes 0, 1, 8, 9; elsewhere +1 in even lanes and -1 in odd
// lanes.
__m512 fft1256 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1257 = _mm512_fmadd_ps(fft1251, fft1256, fft1253);
__m512 fft1337 = _mm512_fmadd_ps(fft1333, fft1256, fft1334);
__m512 fft1258 = _mm512_fnmadd_ps(fft1255, fft1256, fft1254);
__m512 fft1338 = _mm512_fnmadd_ps(fft1336, fft1256, fft1335);
// Blend masks: 21845 (0x5555) = even lanes; 43176 (0xA8A8) = lanes 3, 5, 7,
// 11, 13, 15; 22102 (0x5656) = lanes 1, 2, 4, 6, 9, 10, 12, 14.
__m512 fft1259 = _mm512_mask_mov_ps(fft1255, 21845, fft1257);
__m512 fft1339 = _mm512_mask_mov_ps(fft1336, 21845, fft1337);
__m512 fft1260 = _mm512_mask_mov_ps(fft1251, 43176, fft1257);
__m512 fft1340 = _mm512_mask_mov_ps(fft1333, 43176, fft1337);
__m512 fft1261 = _mm512_mask_mov_ps(fft1259, 43176, fft1258);
__m512 fft1341 = _mm512_mask_mov_ps(fft1339, 43176, fft1338);
__m512 fft1262 = _mm512_mask_mov_ps(fft1260, 22102, fft1258);
__m512 fft1342 = _mm512_mask_mov_ps(fft1340, 22102, fft1338);
// Mask 64764 (0xFCFC): lanes 2..7 and 10..15 are multiplied by 0.5; lanes 0,
// 1, 8 and 9 are copied through unchanged.
__m512 fft1263 = _mm512_mask_mul_ps(fft1261, 64764, fft1261, _mm512_set1_ps(5e-01f));
__m512 fft1343 = _mm512_mask_mul_ps(fft1341, 64764, fft1341, _mm512_set1_ps(5e-01f));
__m512 fft1264 = _mm512_mask_mul_ps(fft1262, 64764, fft1262, _mm512_set1_ps(5e-01f));
__m512 fft1344 = _mm512_mask_mul_ps(fft1342, 64764, fft1342, _mm512_set1_ps(5e-01f));
// Rename the results for the store phase: df81..df88 come from the first
// chain, df89..df96 from the second.
__m512 df81 = fft1263;
__m512 df89 = fft1343;
__m512 df82 = fft1264;
__m512 df90 = fft1344;
__m512 df83 = fft1244;
__m512 df91 = fft1327;
__m512 df84 = fft1245;
__m512 df92 = fft1328;
__m512 df85 = fft1246;
__m512 df93 = fft1329;
__m512 df86 = fft1247;
__m512 df94 = fft1330;
__m512 df87 = fft1248;
__m512 df95 = fft1331;
__m512 df88 = fft1249;
__m512 df96 = fft1332;
// eo8 gathers the even-numbered lanes into lanes 0..7 and the odd-numbered
// lanes into lanes 8..15. Each permuted register is then written by two masked
// stores: mask 255 writes lanes 0..7, mask 65280 (0xFF00) writes lanes 8..15.
// NOTE(review): the 65280-store constants equal the matching 255-store
// constant plus 407008; if dfPtr1 is a byte pointer, lane 8 then lands exactly
// 407040 past the 255-store - confirm against dfPtr1's declaration.
__m512i eo8 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df83 = _mm512_permutexvar_ps(eo8, df83);
df84 = _mm512_permutexvar_ps(eo8, df84);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df83);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df84);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df83);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df84);
df91 = _mm512_permutexvar_ps(eo8, df91);
df92 = _mm512_permutexvar_ps(eo8, df92);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df91);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df92);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df91);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df92);
df85 = _mm512_permutexvar_ps(eo8, df85);
df86 = _mm512_permutexvar_ps(eo8, df86);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df85);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df86);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df85);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df86);
df93 = _mm512_permutexvar_ps(eo8, df93);
df94 = _mm512_permutexvar_ps(eo8, df94);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df93);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df94);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df93);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df94);
df87 = _mm512_permutexvar_ps(eo8, df87);
df88 = _mm512_permutexvar_ps(eo8, df88);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df87);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df88);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df87);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df88);
df95 = _mm512_permutexvar_ps(eo8, df95);
df96 = _mm512_permutexvar_ps(eo8, df96);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df95);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df96);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df95);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df96);
// df81, df82, df89 and df90 are stored without the eo8 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df81);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df82);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df81);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df82);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df89);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df90);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df89);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df90);
}
// Return once the j2 value just processed has reached last1.
if (j2 >= last1) return;
// NOTE(review): this increment is overwritten by the assignment on the next
// statement, so j2 is 4 from here on whatever its previous value was.
// Presumably j2 was already 3 at this point (so both statements agree) -
// confirm against the code that opens this scope.
++j2;
j2 = 4;
}
// Handling for j2 values below 84 (j2 is at least 4 when arriving from the
// scope closed just above).
if (j2 < 84) {
// Split (j2-4) into a position inside a group of 23 (rel2) and a per-group
// offset that starts at 10 and grows by 60 per group (base2). The size_t
// casts make the modulo and the division unsigned.
ptrdiff_t rel2 = (size_t)(j2-4)%23;
ptrdiff_t base2 = 10+(size_t)(j2-4)/23*60;
// No loop condition: every pass after the first restarts rel2 at 0 and moves
// base2 forward by 60. How the loop is left is not visible in this span.
for (; ; rel2 = 0, base2 += 60) {
// Nested range tests on rel2 (below 11, below 4, below 3) pick the code path
// for the current position inside the group.
if (rel2 < 11) {
if (rel2 < 4) {
if (rel2 < 3) {
// rel2 is 0, 1 or 2 here (it is a remainder modulo 23, or 0 after a restart).
ptrdiff_t h4 = base2+0;
ptrdiff_t w4 = 10+60*rel2;
// jj2 = j2 + (2 - rel2): the last j2 value the loop below accepts.
ptrdiff_t jj2 = 2-rel2+j2;
// The loop header only advances w4 (by 60 per pass); j2 is not modified in
// the header, so it has to change inside the body for the loop to end.
for (; j2 <= jj2; w4 += 60) {
// k5 runs over the three consecutive values 3*s1, 3*s1+1 and 3*s1+2.
ptrdiff_t k5 = 3*s1;
ptrdiff_t kk4 = k5+2;
for (; k5 <= kk4; ++k5) {
// Six passes; b9 is split into m9 = b9/2 (0..2) and f10 = b9%2 (0..1).
for (ptrdiff_t b9 = 0; b9 < 6; ++b9) {
ptrdiff_t m9 = (size_t)b9/2;
ptrdiff_t f10 = (size_t)b9%2;
__m512 dat82 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat83 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat84 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat85 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat86 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat87 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat88 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat89 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat90 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat91 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat92 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat93 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat94 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat95 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat96 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 dat97 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k5+896*h4+4*w4+40*b9);
__m512 fft1345 = _mm512_add_ps(dat82, dat90);
__m512 fft1433 = _mm512_add_ps(dat83, dat91);
__m512 fft1346 = _mm512_sub_ps(dat82, dat90);
__m512 fft1434 = _mm512_sub_ps(dat83, dat91);
__m512 fft1347 = _mm512_add_ps(dat84, dat92);
__m512 fft1435 = _mm512_add_ps(dat85, dat93);
__m512 fft1348 = _mm512_sub_ps(dat84, dat92);
__m512 fft1436 = _mm512_sub_ps(dat85, dat93);
__m512 fft1349 = _mm512_add_ps(dat86, dat94);
__m512 fft1437 = _mm512_add_ps(dat87, dat95);
__m512 fft1350 = _mm512_sub_ps(dat86, dat94);
__m512 fft1438 = _mm512_sub_ps(dat87, dat95);
__m512 fft1351 = _mm512_add_ps(dat88, dat96);
__m512 fft1439 = _mm512_add_ps(dat89, dat97);
__m512 fft1352 = _mm512_sub_ps(dat88, dat96);
__m512 fft1440 = _mm512_sub_ps(dat89, dat97);
__m512 fft1353 = _mm512_add_ps(fft1345, fft1349);
__m512 fft1441 = _mm512_add_ps(fft1433, fft1437);
__m512 fft1354 = _mm512_sub_ps(fft1345, fft1349);
__m512 fft1442 = _mm512_sub_ps(fft1433, fft1437);
__m512 fft1355 = _mm512_add_ps(fft1347, fft1351);
__m512 fft1443 = _mm512_add_ps(fft1435, fft1439);
__m512 fft1356 = _mm512_sub_ps(fft1351, fft1347);
__m512 fft1444 = _mm512_sub_ps(fft1439, fft1435);
__m512 fft1357 = _mm512_sub_ps(fft1348, fft1352);
__m512 fft1445 = _mm512_sub_ps(fft1436, fft1440);
__m512 fft1358 = _mm512_add_ps(fft1348, fft1352);
__m512 fft1446 = _mm512_add_ps(fft1436, fft1440);
__m512 fft1359 = _mm512_add_ps(fft1353, fft1355);
__m512 fft1447 = _mm512_add_ps(fft1441, fft1443);
__m512 fft1360 = _mm512_sub_ps(fft1353, fft1355);
__m512 fft1448 = _mm512_sub_ps(fft1441, fft1443);
__m512 fft1361 = _mm512_fmadd_ps(fft1357, _mm512_set1_ps(7.0710677e-01f), fft1346);
__m512 fft1449 = _mm512_fmadd_ps(fft1445, _mm512_set1_ps(7.0710677e-01f), fft1434);
__m512 fft1362 = _mm512_fnmsub_ps(fft1358, _mm512_set1_ps(7.0710677e-01f), fft1350);
__m512 fft1450 = _mm512_fnmsub_ps(fft1446, _mm512_set1_ps(7.0710677e-01f), fft1438);
__m512 fft1363 = _mm512_fnmadd_ps(fft1357, _mm512_set1_ps(7.0710677e-01f), fft1346);
__m512 fft1451 = _mm512_fnmadd_ps(fft1445, _mm512_set1_ps(7.0710677e-01f), fft1434);
__m512 fft1364 = _mm512_fnmadd_ps(fft1358, _mm512_set1_ps(7.0710677e-01f), fft1350);
__m512 fft1452 = _mm512_fnmadd_ps(fft1446, _mm512_set1_ps(7.0710677e-01f), fft1438);
__m512 fft1365 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1366 = _mm512_fmadd_ps(fft1359, fft1365, _mm512_shuffle_f32x4(fft1359, fft1359, 78));
__m512 fft1453 = _mm512_fmadd_ps(fft1447, fft1365, _mm512_shuffle_f32x4(fft1447, fft1447, 78));
__m512 fft1367 = _mm512_fmadd_ps(fft1360, fft1365, _mm512_shuffle_f32x4(fft1360, fft1360, 78));
__m512 fft1454 = _mm512_fmadd_ps(fft1448, fft1365, _mm512_shuffle_f32x4(fft1448, fft1448, 78));
__m512 fft1368 = _mm512_fmadd_ps(fft1361, fft1365, _mm512_shuffle_f32x4(fft1361, fft1361, 78));
__m512 fft1455 = _mm512_fmadd_ps(fft1449, fft1365, _mm512_shuffle_f32x4(fft1449, fft1449, 78));
__m512 fft1369 = _mm512_fmadd_ps(fft1362, fft1365, _mm512_shuffle_f32x4(fft1362, fft1362, 78));
__m512 fft1456 = _mm512_fmadd_ps(fft1450, fft1365, _mm512_shuffle_f32x4(fft1450, fft1450, 78));
__m512 fft1370 = _mm512_fmadd_ps(fft1354, fft1365, _mm512_shuffle_f32x4(fft1354, fft1354, 78));
__m512 fft1457 = _mm512_fmadd_ps(fft1442, fft1365, _mm512_shuffle_f32x4(fft1442, fft1442, 78));
__m512 fft1371 = _mm512_fmadd_ps(fft1356, fft1365, _mm512_shuffle_f32x4(fft1356, fft1356, 78));
__m512 fft1458 = _mm512_fmadd_ps(fft1444, fft1365, _mm512_shuffle_f32x4(fft1444, fft1444, 78));
__m512 fft1372 = _mm512_fmadd_ps(fft1363, fft1365, _mm512_shuffle_f32x4(fft1363, fft1363, 78));
__m512 fft1459 = _mm512_fmadd_ps(fft1451, fft1365, _mm512_shuffle_f32x4(fft1451, fft1451, 78));
__m512 fft1373 = _mm512_fmadd_ps(fft1364, fft1365, _mm512_shuffle_f32x4(fft1364, fft1364, 78));
__m512 fft1460 = _mm512_fmadd_ps(fft1452, fft1365, _mm512_shuffle_f32x4(fft1452, fft1452, 78));
__m512 fft1374 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1375 = _mm512_mul_ps(fft1366, fft1374);
__m512 fft1461 = _mm512_mul_ps(fft1453, fft1374);
__m512 fft1376 = _mm512_mul_ps(fft1367, fft1374);
__m512 fft1462 = _mm512_mul_ps(fft1454, fft1374);
__m512 fft1377 = _mm512_mul_ps(fft1368, fft1374);
__m512 fft1463 = _mm512_mul_ps(fft1455, fft1374);
__m512 fft1378 = _mm512_mul_ps(fft1369, fft1374);
__m512 fft1464 = _mm512_mul_ps(fft1456, fft1374);
__m512 fft1379 = _mm512_mul_ps(fft1370, fft1374);
__m512 fft1465 = _mm512_mul_ps(fft1457, fft1374);
__m512 fft1380 = _mm512_mul_ps(fft1371, fft1374);
__m512 fft1466 = _mm512_mul_ps(fft1458, fft1374);
__m512 fft1381 = _mm512_mul_ps(fft1372, fft1374);
__m512 fft1467 = _mm512_mul_ps(fft1459, fft1374);
__m512 fft1382 = _mm512_mul_ps(fft1373, fft1374);
__m512 fft1468 = _mm512_mul_ps(fft1460, fft1374);
__m512 fft1383 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1384 = _mm512_fmadd_ps(fft1367, fft1383, fft1375);
__m512 fft1469 = _mm512_fmadd_ps(fft1454, fft1383, fft1461);
__m512 fft1385 = _mm512_fnmadd_ps(fft1366, fft1383, fft1376);
__m512 fft1470 = _mm512_fnmadd_ps(fft1453, fft1383, fft1462);
__m512 fft1386 = _mm512_fmadd_ps(fft1369, fft1383, fft1377);
__m512 fft1471 = _mm512_fmadd_ps(fft1456, fft1383, fft1463);
__m512 fft1387 = _mm512_fnmadd_ps(fft1368, fft1383, fft1378);
__m512 fft1472 = _mm512_fnmadd_ps(fft1455, fft1383, fft1464);
__m512 fft1388 = _mm512_fmadd_ps(fft1371, fft1383, fft1379);
__m512 fft1473 = _mm512_fmadd_ps(fft1458, fft1383, fft1465);
__m512 fft1389 = _mm512_fnmadd_ps(fft1370, fft1383, fft1380);
__m512 fft1474 = _mm512_fnmadd_ps(fft1457, fft1383, fft1466);
__m512 fft1390 = _mm512_fmadd_ps(fft1373, fft1383, fft1381);
__m512 fft1475 = _mm512_fmadd_ps(fft1460, fft1383, fft1467);
__m512 fft1391 = _mm512_fnmadd_ps(fft1372, fft1383, fft1382);
__m512 fft1476 = _mm512_fnmadd_ps(fft1459, fft1383, fft1468);
__m512 fft1392 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1393 = _mm512_fmadd_ps(fft1384, fft1392, _mm512_shuffle_f32x4(fft1384, fft1384, 177));
__m512 fft1477 = _mm512_fmadd_ps(fft1469, fft1392, _mm512_shuffle_f32x4(fft1469, fft1469, 177));
__m512 fft1394 = _mm512_fmadd_ps(fft1385, fft1392, _mm512_shuffle_f32x4(fft1385, fft1385, 177));
__m512 fft1478 = _mm512_fmadd_ps(fft1470, fft1392, _mm512_shuffle_f32x4(fft1470, fft1470, 177));
__m512 fft1395 = _mm512_fmadd_ps(fft1386, fft1392, _mm512_shuffle_f32x4(fft1386, fft1386, 177));
__m512 fft1479 = _mm512_fmadd_ps(fft1471, fft1392, _mm512_shuffle_f32x4(fft1471, fft1471, 177));
__m512 fft1396 = _mm512_fmadd_ps(fft1387, fft1392, _mm512_shuffle_f32x4(fft1387, fft1387, 177));
__m512 fft1480 = _mm512_fmadd_ps(fft1472, fft1392, _mm512_shuffle_f32x4(fft1472, fft1472, 177));
__m512 fft1397 = _mm512_fmadd_ps(fft1388, fft1392, _mm512_shuffle_f32x4(fft1388, fft1388, 177));
__m512 fft1481 = _mm512_fmadd_ps(fft1473, fft1392, _mm512_shuffle_f32x4(fft1473, fft1473, 177));
__m512 fft1398 = _mm512_fmadd_ps(fft1389, fft1392, _mm512_shuffle_f32x4(fft1389, fft1389, 177));
__m512 fft1482 = _mm512_fmadd_ps(fft1474, fft1392, _mm512_shuffle_f32x4(fft1474, fft1474, 177));
__m512 fft1399 = _mm512_fmadd_ps(fft1390, fft1392, _mm512_shuffle_f32x4(fft1390, fft1390, 177));
__m512 fft1483 = _mm512_fmadd_ps(fft1475, fft1392, _mm512_shuffle_f32x4(fft1475, fft1475, 177));
__m512 fft1400 = _mm512_fmadd_ps(fft1391, fft1392, _mm512_shuffle_f32x4(fft1391, fft1391, 177));
__m512 fft1484 = _mm512_fmadd_ps(fft1476, fft1392, _mm512_shuffle_f32x4(fft1476, fft1476, 177));
__m512 fft1401 = _mm512_mask_mov_ps(fft1393, 49344, fft1394);
__m512 fft1485 = _mm512_mask_mov_ps(fft1477, 49344, fft1478);
__m512 fft1402 = _mm512_mask_sub_ps(fft1394, 49344, _mm512_setzero_ps(), fft1393);
__m512 fft1486 = _mm512_mask_sub_ps(fft1478, 49344, _mm512_setzero_ps(), fft1477);
__m512 fft1403 = _mm512_mask_mov_ps(fft1395, 49344, fft1396);
__m512 fft1487 = _mm512_mask_mov_ps(fft1479, 49344, fft1480);
__m512 fft1404 = _mm512_mask_sub_ps(fft1396, 49344, _mm512_setzero_ps(), fft1395);
__m512 fft1488 = _mm512_mask_sub_ps(fft1480, 49344, _mm512_setzero_ps(), fft1479);
__m512 fft1405 = _mm512_mask_mov_ps(fft1397, 49344, fft1398);
__m512 fft1489 = _mm512_mask_mov_ps(fft1481, 49344, fft1482);
__m512 fft1406 = _mm512_mask_sub_ps(fft1398, 49344, _mm512_setzero_ps(), fft1397);
__m512 fft1490 = _mm512_mask_sub_ps(fft1482, 49344, _mm512_setzero_ps(), fft1481);
__m512 fft1407 = _mm512_mask_mov_ps(fft1399, 49344, fft1400);
__m512 fft1491 = _mm512_mask_mov_ps(fft1483, 49344, fft1484);
__m512 fft1408 = _mm512_mask_sub_ps(fft1400, 49344, _mm512_setzero_ps(), fft1399);
__m512 fft1492 = _mm512_mask_sub_ps(fft1484, 49344, _mm512_setzero_ps(), fft1483);
__m512 fft1409 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1410 = _mm512_fmadd_ps(fft1401, fft1409, _mm512_shuffle_ps(fft1401, fft1401, 78));
__m512 fft1493 = _mm512_fmadd_ps(fft1485, fft1409, _mm512_shuffle_ps(fft1485, fft1485, 78));
__m512 fft1411 = _mm512_fmadd_ps(fft1402, fft1409, _mm512_shuffle_ps(fft1402, fft1402, 78));
__m512 fft1494 = _mm512_fmadd_ps(fft1486, fft1409, _mm512_shuffle_ps(fft1486, fft1486, 78));
__m512 fft1412 = _mm512_fmadd_ps(fft1403, fft1409, _mm512_shuffle_ps(fft1403, fft1403, 78));
__m512 fft1495 = _mm512_fmadd_ps(fft1487, fft1409, _mm512_shuffle_ps(fft1487, fft1487, 78));
__m512 fft1413 = _mm512_fmadd_ps(fft1404, fft1409, _mm512_shuffle_ps(fft1404, fft1404, 78));
__m512 fft1496 = _mm512_fmadd_ps(fft1488, fft1409, _mm512_shuffle_ps(fft1488, fft1488, 78));
__m512 fft1414 = _mm512_fmadd_ps(fft1405, fft1409, _mm512_shuffle_ps(fft1405, fft1405, 78));
__m512 fft1497 = _mm512_fmadd_ps(fft1489, fft1409, _mm512_shuffle_ps(fft1489, fft1489, 78));
__m512 fft1415 = _mm512_fmadd_ps(fft1406, fft1409, _mm512_shuffle_ps(fft1406, fft1406, 78));
__m512 fft1498 = _mm512_fmadd_ps(fft1490, fft1409, _mm512_shuffle_ps(fft1490, fft1490, 78));
__m512 fft1416 = _mm512_fmadd_ps(fft1407, fft1409, _mm512_shuffle_ps(fft1407, fft1407, 78));
__m512 fft1499 = _mm512_fmadd_ps(fft1491, fft1409, _mm512_shuffle_ps(fft1491, fft1491, 78));
__m512 fft1417 = _mm512_fmadd_ps(fft1408, fft1409, _mm512_shuffle_ps(fft1408, fft1408, 78));
__m512 fft1500 = _mm512_fmadd_ps(fft1492, fft1409, _mm512_shuffle_ps(fft1492, fft1492, 78));
__m512i fft1418 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1419 = _mm512_permutexvar_ps(fft1418, fft1410);
__m512 fft1501 = _mm512_permutexvar_ps(fft1418, fft1493);
__m512i fft1420 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1421 = _mm512_permutexvar_ps(fft1420, fft1410);
__m512 fft1502 = _mm512_permutexvar_ps(fft1420, fft1493);
__m512 fft1422 = _mm512_permutexvar_ps(fft1418, fft1411);
__m512 fft1503 = _mm512_permutexvar_ps(fft1418, fft1494);
__m512 fft1423 = _mm512_permutexvar_ps(fft1420, fft1411);
__m512 fft1504 = _mm512_permutexvar_ps(fft1420, fft1494);
__m512 fft1424 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1425 = _mm512_fmadd_ps(fft1419, fft1424, fft1421);
__m512 fft1505 = _mm512_fmadd_ps(fft1501, fft1424, fft1502);
__m512 fft1426 = _mm512_fnmadd_ps(fft1423, fft1424, fft1422);
__m512 fft1506 = _mm512_fnmadd_ps(fft1504, fft1424, fft1503);
__m512 fft1427 = _mm512_mask_mov_ps(fft1423, 21845, fft1425);
__m512 fft1507 = _mm512_mask_mov_ps(fft1504, 21845, fft1505);
__m512 fft1428 = _mm512_mask_mov_ps(fft1419, 43176, fft1425);
__m512 fft1508 = _mm512_mask_mov_ps(fft1501, 43176, fft1505);
__m512 fft1429 = _mm512_mask_mov_ps(fft1427, 43176, fft1426);
__m512 fft1509 = _mm512_mask_mov_ps(fft1507, 43176, fft1506);
__m512 fft1430 = _mm512_mask_mov_ps(fft1428, 22102, fft1426);
__m512 fft1510 = _mm512_mask_mov_ps(fft1508, 22102, fft1506);
__m512 fft1431 = _mm512_mask_mul_ps(fft1429, 64764, fft1429, _mm512_set1_ps(5e-01f));
__m512 fft1511 = _mm512_mask_mul_ps(fft1509, 64764, fft1509, _mm512_set1_ps(5e-01f));
__m512 fft1432 = _mm512_mask_mul_ps(fft1430, 64764, fft1430, _mm512_set1_ps(5e-01f));
__m512 fft1512 = _mm512_mask_mul_ps(fft1510, 64764, fft1510, _mm512_set1_ps(5e-01f));
__m512 df97 = fft1431;
__m512 df105 = fft1511;
__m512 df98 = fft1432;
__m512 df106 = fft1512;
__m512 df99 = fft1412;
__m512 df107 = fft1495;
__m512 df100 = fft1413;
__m512 df108 = fft1496;
__m512 df101 = fft1414;
__m512 df109 = fft1497;
__m512 df102 = fft1415;
__m512 df110 = fft1498;
__m512 df103 = fft1416;
__m512 df111 = fft1499;
__m512 df104 = fft1417;
__m512 df112 = fft1500;
__m512i eo9 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df99 = _mm512_permutexvar_ps(eo9, df99);
df100 = _mm512_permutexvar_ps(eo9, df100);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df99);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df100);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df99);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df100);
df107 = _mm512_permutexvar_ps(eo9, df107);
df108 = _mm512_permutexvar_ps(eo9, df108);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df107);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df108);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df107);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df108);
df101 = _mm512_permutexvar_ps(eo9, df101);
df102 = _mm512_permutexvar_ps(eo9, df102);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df101);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df102);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df101);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df102);
df109 = _mm512_permutexvar_ps(eo9, df109);
df110 = _mm512_permutexvar_ps(eo9, df110);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df109);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df110);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df109);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df110);
df103 = _mm512_permutexvar_ps(eo9, df103);
df104 = _mm512_permutexvar_ps(eo9, df104);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df103);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df104);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df103);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df104);
df111 = _mm512_permutexvar_ps(eo9, df111);
df112 = _mm512_permutexvar_ps(eo9, df112);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df111);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df112);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df111);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df112);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df97);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df98);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df97);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df98);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df105);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df106);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df105);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df106);
}
}
if (j2 >= last1) return;
++j2;
}
rel2 = 3;
}
// Starting position for the loads that follow: h5 and w5 enter every load
// offset below as 896*h5 and 4*w5.
// NOTE(review): with the 224x224 float32 input declared in the graph
// (896 = 224*4), h5/w5 look like a row/column start -- confirm against the
// datPtr1 declaration, which is outside this chunk.
ptrdiff_t h5 = base2+0;
ptrdiff_t w5 = 190;
// k6 starts at 3*s1; kk5 is the inclusive upper bound used by the loop that
// follows, so that loop visits three consecutive k6 values.
ptrdiff_t k6 = 3*s1;
ptrdiff_t kk5 = k6+2;
for (; k6 <= kk5; ++k6) {
for (ptrdiff_t b10 = 0; b10 < 3; ++b10) {
ptrdiff_t m10 = (size_t)b10/2;
ptrdiff_t f11 = (size_t)b10%2;
// Load sixteen 16-lane vectors for step b10. Mask 65535 enables all 16 lanes,
// so the zeroing (maskz) form leaves no lane cleared here.
// Consecutive loads are 896 apart in datPtr1 offset units, and each b10 step
// advances the start by 40.
// NOTE(review): presumably datPtr1 is a byte pointer, which would make 896 one
// 224-float row and 40 a 10-float step -- confirm at its declaration.
__m512 dat98 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat99 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat100 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat101 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat102 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat103 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat104 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat105 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat106 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat107 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat108 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat109 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat110 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat111 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat112 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 dat113 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k6+896*h5+4*w5+40*b10);
__m512 fft1513 = _mm512_add_ps(dat98, dat106);
__m512 fft1601 = _mm512_add_ps(dat99, dat107);
__m512 fft1514 = _mm512_sub_ps(dat98, dat106);
__m512 fft1602 = _mm512_sub_ps(dat99, dat107);
__m512 fft1515 = _mm512_add_ps(dat100, dat108);
__m512 fft1603 = _mm512_add_ps(dat101, dat109);
__m512 fft1516 = _mm512_sub_ps(dat100, dat108);
__m512 fft1604 = _mm512_sub_ps(dat101, dat109);
__m512 fft1517 = _mm512_add_ps(dat102, dat110);
__m512 fft1605 = _mm512_add_ps(dat103, dat111);
__m512 fft1518 = _mm512_sub_ps(dat102, dat110);
__m512 fft1606 = _mm512_sub_ps(dat103, dat111);
__m512 fft1519 = _mm512_add_ps(dat104, dat112);
__m512 fft1607 = _mm512_add_ps(dat105, dat113);
__m512 fft1520 = _mm512_sub_ps(dat104, dat112);
__m512 fft1608 = _mm512_sub_ps(dat105, dat113);
__m512 fft1521 = _mm512_add_ps(fft1513, fft1517);
__m512 fft1609 = _mm512_add_ps(fft1601, fft1605);
__m512 fft1522 = _mm512_sub_ps(fft1513, fft1517);
__m512 fft1610 = _mm512_sub_ps(fft1601, fft1605);
__m512 fft1523 = _mm512_add_ps(fft1515, fft1519);
__m512 fft1611 = _mm512_add_ps(fft1603, fft1607);
__m512 fft1524 = _mm512_sub_ps(fft1519, fft1515);
__m512 fft1612 = _mm512_sub_ps(fft1607, fft1603);
__m512 fft1525 = _mm512_sub_ps(fft1516, fft1520);
__m512 fft1613 = _mm512_sub_ps(fft1604, fft1608);
__m512 fft1526 = _mm512_add_ps(fft1516, fft1520);
__m512 fft1614 = _mm512_add_ps(fft1604, fft1608);
__m512 fft1527 = _mm512_add_ps(fft1521, fft1523);
__m512 fft1615 = _mm512_add_ps(fft1609, fft1611);
__m512 fft1528 = _mm512_sub_ps(fft1521, fft1523);
__m512 fft1616 = _mm512_sub_ps(fft1609, fft1611);
__m512 fft1529 = _mm512_fmadd_ps(fft1525, _mm512_set1_ps(7.0710677e-01f), fft1514);
__m512 fft1617 = _mm512_fmadd_ps(fft1613, _mm512_set1_ps(7.0710677e-01f), fft1602);
__m512 fft1530 = _mm512_fnmsub_ps(fft1526, _mm512_set1_ps(7.0710677e-01f), fft1518);
__m512 fft1618 = _mm512_fnmsub_ps(fft1614, _mm512_set1_ps(7.0710677e-01f), fft1606);
__m512 fft1531 = _mm512_fnmadd_ps(fft1525, _mm512_set1_ps(7.0710677e-01f), fft1514);
__m512 fft1619 = _mm512_fnmadd_ps(fft1613, _mm512_set1_ps(7.0710677e-01f), fft1602);
__m512 fft1532 = _mm512_fnmadd_ps(fft1526, _mm512_set1_ps(7.0710677e-01f), fft1518);
__m512 fft1620 = _mm512_fnmadd_ps(fft1614, _mm512_set1_ps(7.0710677e-01f), fft1606);
__m512 fft1533 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1534 = _mm512_fmadd_ps(fft1527, fft1533, _mm512_shuffle_f32x4(fft1527, fft1527, 78));
__m512 fft1621 = _mm512_fmadd_ps(fft1615, fft1533, _mm512_shuffle_f32x4(fft1615, fft1615, 78));
__m512 fft1535 = _mm512_fmadd_ps(fft1528, fft1533, _mm512_shuffle_f32x4(fft1528, fft1528, 78));
__m512 fft1622 = _mm512_fmadd_ps(fft1616, fft1533, _mm512_shuffle_f32x4(fft1616, fft1616, 78));
__m512 fft1536 = _mm512_fmadd_ps(fft1529, fft1533, _mm512_shuffle_f32x4(fft1529, fft1529, 78));
__m512 fft1623 = _mm512_fmadd_ps(fft1617, fft1533, _mm512_shuffle_f32x4(fft1617, fft1617, 78));
__m512 fft1537 = _mm512_fmadd_ps(fft1530, fft1533, _mm512_shuffle_f32x4(fft1530, fft1530, 78));
__m512 fft1624 = _mm512_fmadd_ps(fft1618, fft1533, _mm512_shuffle_f32x4(fft1618, fft1618, 78));
__m512 fft1538 = _mm512_fmadd_ps(fft1522, fft1533, _mm512_shuffle_f32x4(fft1522, fft1522, 78));
__m512 fft1625 = _mm512_fmadd_ps(fft1610, fft1533, _mm512_shuffle_f32x4(fft1610, fft1610, 78));
__m512 fft1539 = _mm512_fmadd_ps(fft1524, fft1533, _mm512_shuffle_f32x4(fft1524, fft1524, 78));
__m512 fft1626 = _mm512_fmadd_ps(fft1612, fft1533, _mm512_shuffle_f32x4(fft1612, fft1612, 78));
__m512 fft1540 = _mm512_fmadd_ps(fft1531, fft1533, _mm512_shuffle_f32x4(fft1531, fft1531, 78));
__m512 fft1627 = _mm512_fmadd_ps(fft1619, fft1533, _mm512_shuffle_f32x4(fft1619, fft1619, 78));
__m512 fft1541 = _mm512_fmadd_ps(fft1532, fft1533, _mm512_shuffle_f32x4(fft1532, fft1532, 78));
__m512 fft1628 = _mm512_fmadd_ps(fft1620, fft1533, _mm512_shuffle_f32x4(fft1620, fft1620, 78));
__m512 fft1542 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1543 = _mm512_mul_ps(fft1534, fft1542);
__m512 fft1629 = _mm512_mul_ps(fft1621, fft1542);
__m512 fft1544 = _mm512_mul_ps(fft1535, fft1542);
__m512 fft1630 = _mm512_mul_ps(fft1622, fft1542);
__m512 fft1545 = _mm512_mul_ps(fft1536, fft1542);
__m512 fft1631 = _mm512_mul_ps(fft1623, fft1542);
__m512 fft1546 = _mm512_mul_ps(fft1537, fft1542);
__m512 fft1632 = _mm512_mul_ps(fft1624, fft1542);
__m512 fft1547 = _mm512_mul_ps(fft1538, fft1542);
__m512 fft1633 = _mm512_mul_ps(fft1625, fft1542);
__m512 fft1548 = _mm512_mul_ps(fft1539, fft1542);
__m512 fft1634 = _mm512_mul_ps(fft1626, fft1542);
__m512 fft1549 = _mm512_mul_ps(fft1540, fft1542);
__m512 fft1635 = _mm512_mul_ps(fft1627, fft1542);
__m512 fft1550 = _mm512_mul_ps(fft1541, fft1542);
__m512 fft1636 = _mm512_mul_ps(fft1628, fft1542);
__m512 fft1551 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1552 = _mm512_fmadd_ps(fft1535, fft1551, fft1543);
__m512 fft1637 = _mm512_fmadd_ps(fft1622, fft1551, fft1629);
__m512 fft1553 = _mm512_fnmadd_ps(fft1534, fft1551, fft1544);
__m512 fft1638 = _mm512_fnmadd_ps(fft1621, fft1551, fft1630);
__m512 fft1554 = _mm512_fmadd_ps(fft1537, fft1551, fft1545);
__m512 fft1639 = _mm512_fmadd_ps(fft1624, fft1551, fft1631);
__m512 fft1555 = _mm512_fnmadd_ps(fft1536, fft1551, fft1546);
__m512 fft1640 = _mm512_fnmadd_ps(fft1623, fft1551, fft1632);
__m512 fft1556 = _mm512_fmadd_ps(fft1539, fft1551, fft1547);
__m512 fft1641 = _mm512_fmadd_ps(fft1626, fft1551, fft1633);
__m512 fft1557 = _mm512_fnmadd_ps(fft1538, fft1551, fft1548);
__m512 fft1642 = _mm512_fnmadd_ps(fft1625, fft1551, fft1634);
__m512 fft1558 = _mm512_fmadd_ps(fft1541, fft1551, fft1549);
__m512 fft1643 = _mm512_fmadd_ps(fft1628, fft1551, fft1635);
__m512 fft1559 = _mm512_fnmadd_ps(fft1540, fft1551, fft1550);
__m512 fft1644 = _mm512_fnmadd_ps(fft1627, fft1551, fft1636);
__m512 fft1560 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1561 = _mm512_fmadd_ps(fft1552, fft1560, _mm512_shuffle_f32x4(fft1552, fft1552, 177));
__m512 fft1645 = _mm512_fmadd_ps(fft1637, fft1560, _mm512_shuffle_f32x4(fft1637, fft1637, 177));
__m512 fft1562 = _mm512_fmadd_ps(fft1553, fft1560, _mm512_shuffle_f32x4(fft1553, fft1553, 177));
__m512 fft1646 = _mm512_fmadd_ps(fft1638, fft1560, _mm512_shuffle_f32x4(fft1638, fft1638, 177));
__m512 fft1563 = _mm512_fmadd_ps(fft1554, fft1560, _mm512_shuffle_f32x4(fft1554, fft1554, 177));
__m512 fft1647 = _mm512_fmadd_ps(fft1639, fft1560, _mm512_shuffle_f32x4(fft1639, fft1639, 177));
__m512 fft1564 = _mm512_fmadd_ps(fft1555, fft1560, _mm512_shuffle_f32x4(fft1555, fft1555, 177));
__m512 fft1648 = _mm512_fmadd_ps(fft1640, fft1560, _mm512_shuffle_f32x4(fft1640, fft1640, 177));
__m512 fft1565 = _mm512_fmadd_ps(fft1556, fft1560, _mm512_shuffle_f32x4(fft1556, fft1556, 177));
__m512 fft1649 = _mm512_fmadd_ps(fft1641, fft1560, _mm512_shuffle_f32x4(fft1641, fft1641, 177));
__m512 fft1566 = _mm512_fmadd_ps(fft1557, fft1560, _mm512_shuffle_f32x4(fft1557, fft1557, 177));
__m512 fft1650 = _mm512_fmadd_ps(fft1642, fft1560, _mm512_shuffle_f32x4(fft1642, fft1642, 177));
__m512 fft1567 = _mm512_fmadd_ps(fft1558, fft1560, _mm512_shuffle_f32x4(fft1558, fft1558, 177));
__m512 fft1651 = _mm512_fmadd_ps(fft1643, fft1560, _mm512_shuffle_f32x4(fft1643, fft1643, 177));
__m512 fft1568 = _mm512_fmadd_ps(fft1559, fft1560, _mm512_shuffle_f32x4(fft1559, fft1559, 177));
__m512 fft1652 = _mm512_fmadd_ps(fft1644, fft1560, _mm512_shuffle_f32x4(fft1644, fft1644, 177));
__m512 fft1569 = _mm512_mask_mov_ps(fft1561, 49344, fft1562);
__m512 fft1653 = _mm512_mask_mov_ps(fft1645, 49344, fft1646);
__m512 fft1570 = _mm512_mask_sub_ps(fft1562, 49344, _mm512_setzero_ps(), fft1561);
__m512 fft1654 = _mm512_mask_sub_ps(fft1646, 49344, _mm512_setzero_ps(), fft1645);
__m512 fft1571 = _mm512_mask_mov_ps(fft1563, 49344, fft1564);
__m512 fft1655 = _mm512_mask_mov_ps(fft1647, 49344, fft1648);
__m512 fft1572 = _mm512_mask_sub_ps(fft1564, 49344, _mm512_setzero_ps(), fft1563);
__m512 fft1656 = _mm512_mask_sub_ps(fft1648, 49344, _mm512_setzero_ps(), fft1647);
__m512 fft1573 = _mm512_mask_mov_ps(fft1565, 49344, fft1566);
__m512 fft1657 = _mm512_mask_mov_ps(fft1649, 49344, fft1650);
__m512 fft1574 = _mm512_mask_sub_ps(fft1566, 49344, _mm512_setzero_ps(), fft1565);
__m512 fft1658 = _mm512_mask_sub_ps(fft1650, 49344, _mm512_setzero_ps(), fft1649);
__m512 fft1575 = _mm512_mask_mov_ps(fft1567, 49344, fft1568);
__m512 fft1659 = _mm512_mask_mov_ps(fft1651, 49344, fft1652);
__m512 fft1576 = _mm512_mask_sub_ps(fft1568, 49344, _mm512_setzero_ps(), fft1567);
__m512 fft1660 = _mm512_mask_sub_ps(fft1652, 49344, _mm512_setzero_ps(), fft1651);
__m512 fft1577 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1578 = _mm512_fmadd_ps(fft1569, fft1577, _mm512_shuffle_ps(fft1569, fft1569, 78));
__m512 fft1661 = _mm512_fmadd_ps(fft1653, fft1577, _mm512_shuffle_ps(fft1653, fft1653, 78));
__m512 fft1579 = _mm512_fmadd_ps(fft1570, fft1577, _mm512_shuffle_ps(fft1570, fft1570, 78));
__m512 fft1662 = _mm512_fmadd_ps(fft1654, fft1577, _mm512_shuffle_ps(fft1654, fft1654, 78));
__m512 fft1580 = _mm512_fmadd_ps(fft1571, fft1577, _mm512_shuffle_ps(fft1571, fft1571, 78));
__m512 fft1663 = _mm512_fmadd_ps(fft1655, fft1577, _mm512_shuffle_ps(fft1655, fft1655, 78));
__m512 fft1581 = _mm512_fmadd_ps(fft1572, fft1577, _mm512_shuffle_ps(fft1572, fft1572, 78));
__m512 fft1664 = _mm512_fmadd_ps(fft1656, fft1577, _mm512_shuffle_ps(fft1656, fft1656, 78));
__m512 fft1582 = _mm512_fmadd_ps(fft1573, fft1577, _mm512_shuffle_ps(fft1573, fft1573, 78));
__m512 fft1665 = _mm512_fmadd_ps(fft1657, fft1577, _mm512_shuffle_ps(fft1657, fft1657, 78));
__m512 fft1583 = _mm512_fmadd_ps(fft1574, fft1577, _mm512_shuffle_ps(fft1574, fft1574, 78));
__m512 fft1666 = _mm512_fmadd_ps(fft1658, fft1577, _mm512_shuffle_ps(fft1658, fft1658, 78));
__m512 fft1584 = _mm512_fmadd_ps(fft1575, fft1577, _mm512_shuffle_ps(fft1575, fft1575, 78));
__m512 fft1667 = _mm512_fmadd_ps(fft1659, fft1577, _mm512_shuffle_ps(fft1659, fft1659, 78));
__m512 fft1585 = _mm512_fmadd_ps(fft1576, fft1577, _mm512_shuffle_ps(fft1576, fft1576, 78));
__m512 fft1668 = _mm512_fmadd_ps(fft1660, fft1577, _mm512_shuffle_ps(fft1660, fft1660, 78));
__m512i fft1586 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1587 = _mm512_permutexvar_ps(fft1586, fft1578);
__m512 fft1669 = _mm512_permutexvar_ps(fft1586, fft1661);
__m512i fft1588 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1589 = _mm512_permutexvar_ps(fft1588, fft1578);
__m512 fft1670 = _mm512_permutexvar_ps(fft1588, fft1661);
__m512 fft1590 = _mm512_permutexvar_ps(fft1586, fft1579);
__m512 fft1671 = _mm512_permutexvar_ps(fft1586, fft1662);
__m512 fft1591 = _mm512_permutexvar_ps(fft1588, fft1579);
__m512 fft1672 = _mm512_permutexvar_ps(fft1588, fft1662);
__m512 fft1592 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1593 = _mm512_fmadd_ps(fft1587, fft1592, fft1589);
__m512 fft1673 = _mm512_fmadd_ps(fft1669, fft1592, fft1670);
__m512 fft1594 = _mm512_fnmadd_ps(fft1591, fft1592, fft1590);
__m512 fft1674 = _mm512_fnmadd_ps(fft1672, fft1592, fft1671);
__m512 fft1595 = _mm512_mask_mov_ps(fft1591, 21845, fft1593);
__m512 fft1675 = _mm512_mask_mov_ps(fft1672, 21845, fft1673);
__m512 fft1596 = _mm512_mask_mov_ps(fft1587, 43176, fft1593);
__m512 fft1676 = _mm512_mask_mov_ps(fft1669, 43176, fft1673);
__m512 fft1597 = _mm512_mask_mov_ps(fft1595, 43176, fft1594);
__m512 fft1677 = _mm512_mask_mov_ps(fft1675, 43176, fft1674);
__m512 fft1598 = _mm512_mask_mov_ps(fft1596, 22102, fft1594);
__m512 fft1678 = _mm512_mask_mov_ps(fft1676, 22102, fft1674);
__m512 fft1599 = _mm512_mask_mul_ps(fft1597, 64764, fft1597, _mm512_set1_ps(5e-01f));
__m512 fft1679 = _mm512_mask_mul_ps(fft1677, 64764, fft1677, _mm512_set1_ps(5e-01f));
__m512 fft1600 = _mm512_mask_mul_ps(fft1598, 64764, fft1598, _mm512_set1_ps(5e-01f));
__m512 fft1680 = _mm512_mask_mul_ps(fft1678, 64764, fft1678, _mm512_set1_ps(5e-01f));
// Rename the transform results to df113..df128 for the stores that follow.
// df113/df114 and df121/df122 take the values that went through the extra
// permute/blend/scale steps (fft1599, fft1600, fft1679, fft1680); the other
// twelve are forwarded unchanged from fft1580..fft1585 and fft1663..fft1668.
__m512 df113 = fft1599;
__m512 df121 = fft1679;
__m512 df114 = fft1600;
__m512 df122 = fft1680;
__m512 df115 = fft1580;
__m512 df123 = fft1663;
__m512 df116 = fft1581;
__m512 df124 = fft1664;
__m512 df117 = fft1582;
__m512 df125 = fft1665;
__m512 df118 = fft1583;
__m512 df126 = fft1666;
__m512 df119 = fft1584;
__m512 df127 = fft1667;
__m512 df120 = fft1585;
__m512 df128 = fft1668;
// Lane permutation index: used with _mm512_permutexvar_ps it moves source
// lanes 0,2,...,14 into lanes 0-7 and source lanes 1,3,...,15 into lanes 8-15.
__m512i eo10 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df115 = _mm512_permutexvar_ps(eo10, df115);
df116 = _mm512_permutexvar_ps(eo10, df116);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df115);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df116);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df115);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df116);
df123 = _mm512_permutexvar_ps(eo10, df123);
df124 = _mm512_permutexvar_ps(eo10, df124);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df123);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df124);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df123);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df124);
df117 = _mm512_permutexvar_ps(eo10, df117);
df118 = _mm512_permutexvar_ps(eo10, df118);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df117);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df118);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df117);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df118);
df125 = _mm512_permutexvar_ps(eo10, df125);
df126 = _mm512_permutexvar_ps(eo10, df126);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df125);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df126);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df125);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df126);
df119 = _mm512_permutexvar_ps(eo10, df119);
df120 = _mm512_permutexvar_ps(eo10, df120);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df119);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df120);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df119);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df120);
df127 = _mm512_permutexvar_ps(eo10, df127);
df128 = _mm512_permutexvar_ps(eo10, df128);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df127);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df128);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df127);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df128);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df113);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df114);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df113);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df114);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df121);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df122);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df121);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df122);
}
// Step b11 = 3 is emitted separately from the b10 = 0..2 loop above and uses a
// narrower lane mask for its loads.
// m11 and f12 split b11 the same way m10/f11 split b10: quotient and remainder
// of division by 2.
ptrdiff_t b11 = 3;
ptrdiff_t m11 = (size_t)b11/2;
ptrdiff_t f12 = (size_t)b11%2;
// Mask 127 loads only lanes 0-6; lanes 7-15 are zeroed by the maskz form.
// The b11 term is multiplied by 0, so it does not move the address; instead
// each leading constant is 120 larger than in the matching b10-loop load
// (120 = 3*40, the b10 step times b11).
__m512 dat114 = _mm512_maskz_loadu_ps(127, datPtr1+120+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat115 = _mm512_maskz_loadu_ps(127, datPtr1+1016+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat116 = _mm512_maskz_loadu_ps(127, datPtr1+1912+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat117 = _mm512_maskz_loadu_ps(127, datPtr1+2808+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat118 = _mm512_maskz_loadu_ps(127, datPtr1+3704+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat119 = _mm512_maskz_loadu_ps(127, datPtr1+4600+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat120 = _mm512_maskz_loadu_ps(127, datPtr1+5496+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat121 = _mm512_maskz_loadu_ps(127, datPtr1+6392+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat122 = _mm512_maskz_loadu_ps(127, datPtr1+7288+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat123 = _mm512_maskz_loadu_ps(127, datPtr1+8184+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat124 = _mm512_maskz_loadu_ps(127, datPtr1+9080+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat125 = _mm512_maskz_loadu_ps(127, datPtr1+9976+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat126 = _mm512_maskz_loadu_ps(127, datPtr1+10872+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat127 = _mm512_maskz_loadu_ps(127, datPtr1+11768+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat128 = _mm512_maskz_loadu_ps(127, datPtr1+12664+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 dat129 = _mm512_maskz_loadu_ps(127, datPtr1+13560+602112*i6+200704*k6+896*h5+4*w5+0*b11);
__m512 fft1681 = _mm512_add_ps(dat114, dat122);
__m512 fft1769 = _mm512_add_ps(dat115, dat123);
__m512 fft1682 = _mm512_sub_ps(dat114, dat122);
__m512 fft1770 = _mm512_sub_ps(dat115, dat123);
__m512 fft1683 = _mm512_add_ps(dat116, dat124);
__m512 fft1771 = _mm512_add_ps(dat117, dat125);
__m512 fft1684 = _mm512_sub_ps(dat116, dat124);
__m512 fft1772 = _mm512_sub_ps(dat117, dat125);
__m512 fft1685 = _mm512_add_ps(dat118, dat126);
__m512 fft1773 = _mm512_add_ps(dat119, dat127);
__m512 fft1686 = _mm512_sub_ps(dat118, dat126);
__m512 fft1774 = _mm512_sub_ps(dat119, dat127);
__m512 fft1687 = _mm512_add_ps(dat120, dat128);
__m512 fft1775 = _mm512_add_ps(dat121, dat129);
__m512 fft1688 = _mm512_sub_ps(dat120, dat128);
__m512 fft1776 = _mm512_sub_ps(dat121, dat129);
__m512 fft1689 = _mm512_add_ps(fft1681, fft1685);
__m512 fft1777 = _mm512_add_ps(fft1769, fft1773);
__m512 fft1690 = _mm512_sub_ps(fft1681, fft1685);
__m512 fft1778 = _mm512_sub_ps(fft1769, fft1773);
__m512 fft1691 = _mm512_add_ps(fft1683, fft1687);
__m512 fft1779 = _mm512_add_ps(fft1771, fft1775);
__m512 fft1692 = _mm512_sub_ps(fft1687, fft1683);
__m512 fft1780 = _mm512_sub_ps(fft1775, fft1771);
__m512 fft1693 = _mm512_sub_ps(fft1684, fft1688);
__m512 fft1781 = _mm512_sub_ps(fft1772, fft1776);
__m512 fft1694 = _mm512_add_ps(fft1684, fft1688);
__m512 fft1782 = _mm512_add_ps(fft1772, fft1776);
__m512 fft1695 = _mm512_add_ps(fft1689, fft1691);
__m512 fft1783 = _mm512_add_ps(fft1777, fft1779);
__m512 fft1696 = _mm512_sub_ps(fft1689, fft1691);
__m512 fft1784 = _mm512_sub_ps(fft1777, fft1779);
__m512 fft1697 = _mm512_fmadd_ps(fft1693, _mm512_set1_ps(7.0710677e-01f), fft1682);
__m512 fft1785 = _mm512_fmadd_ps(fft1781, _mm512_set1_ps(7.0710677e-01f), fft1770);
__m512 fft1698 = _mm512_fnmsub_ps(fft1694, _mm512_set1_ps(7.0710677e-01f), fft1686);
__m512 fft1786 = _mm512_fnmsub_ps(fft1782, _mm512_set1_ps(7.0710677e-01f), fft1774);
__m512 fft1699 = _mm512_fnmadd_ps(fft1693, _mm512_set1_ps(7.0710677e-01f), fft1682);
__m512 fft1787 = _mm512_fnmadd_ps(fft1781, _mm512_set1_ps(7.0710677e-01f), fft1770);
__m512 fft1700 = _mm512_fnmadd_ps(fft1694, _mm512_set1_ps(7.0710677e-01f), fft1686);
__m512 fft1788 = _mm512_fnmadd_ps(fft1782, _mm512_set1_ps(7.0710677e-01f), fft1774);
__m512 fft1701 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1702 = _mm512_fmadd_ps(fft1695, fft1701, _mm512_shuffle_f32x4(fft1695, fft1695, 78));
__m512 fft1789 = _mm512_fmadd_ps(fft1783, fft1701, _mm512_shuffle_f32x4(fft1783, fft1783, 78));
__m512 fft1703 = _mm512_fmadd_ps(fft1696, fft1701, _mm512_shuffle_f32x4(fft1696, fft1696, 78));
__m512 fft1790 = _mm512_fmadd_ps(fft1784, fft1701, _mm512_shuffle_f32x4(fft1784, fft1784, 78));
__m512 fft1704 = _mm512_fmadd_ps(fft1697, fft1701, _mm512_shuffle_f32x4(fft1697, fft1697, 78));
__m512 fft1791 = _mm512_fmadd_ps(fft1785, fft1701, _mm512_shuffle_f32x4(fft1785, fft1785, 78));
__m512 fft1705 = _mm512_fmadd_ps(fft1698, fft1701, _mm512_shuffle_f32x4(fft1698, fft1698, 78));
__m512 fft1792 = _mm512_fmadd_ps(fft1786, fft1701, _mm512_shuffle_f32x4(fft1786, fft1786, 78));
__m512 fft1706 = _mm512_fmadd_ps(fft1690, fft1701, _mm512_shuffle_f32x4(fft1690, fft1690, 78));
__m512 fft1793 = _mm512_fmadd_ps(fft1778, fft1701, _mm512_shuffle_f32x4(fft1778, fft1778, 78));
__m512 fft1707 = _mm512_fmadd_ps(fft1692, fft1701, _mm512_shuffle_f32x4(fft1692, fft1692, 78));
__m512 fft1794 = _mm512_fmadd_ps(fft1780, fft1701, _mm512_shuffle_f32x4(fft1780, fft1780, 78));
__m512 fft1708 = _mm512_fmadd_ps(fft1699, fft1701, _mm512_shuffle_f32x4(fft1699, fft1699, 78));
__m512 fft1795 = _mm512_fmadd_ps(fft1787, fft1701, _mm512_shuffle_f32x4(fft1787, fft1787, 78));
__m512 fft1709 = _mm512_fmadd_ps(fft1700, fft1701, _mm512_shuffle_f32x4(fft1700, fft1700, 78));
__m512 fft1796 = _mm512_fmadd_ps(fft1788, fft1701, _mm512_shuffle_f32x4(fft1788, fft1788, 78));
__m512 fft1710 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1711 = _mm512_mul_ps(fft1702, fft1710);
__m512 fft1797 = _mm512_mul_ps(fft1789, fft1710);
__m512 fft1712 = _mm512_mul_ps(fft1703, fft1710);
__m512 fft1798 = _mm512_mul_ps(fft1790, fft1710);
__m512 fft1713 = _mm512_mul_ps(fft1704, fft1710);
__m512 fft1799 = _mm512_mul_ps(fft1791, fft1710);
__m512 fft1714 = _mm512_mul_ps(fft1705, fft1710);
__m512 fft1800 = _mm512_mul_ps(fft1792, fft1710);
__m512 fft1715 = _mm512_mul_ps(fft1706, fft1710);
__m512 fft1801 = _mm512_mul_ps(fft1793, fft1710);
__m512 fft1716 = _mm512_mul_ps(fft1707, fft1710);
__m512 fft1802 = _mm512_mul_ps(fft1794, fft1710);
__m512 fft1717 = _mm512_mul_ps(fft1708, fft1710);
__m512 fft1803 = _mm512_mul_ps(fft1795, fft1710);
__m512 fft1718 = _mm512_mul_ps(fft1709, fft1710);
__m512 fft1804 = _mm512_mul_ps(fft1796, fft1710);
__m512 fft1719 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1720 = _mm512_fmadd_ps(fft1703, fft1719, fft1711);
__m512 fft1805 = _mm512_fmadd_ps(fft1790, fft1719, fft1797);
__m512 fft1721 = _mm512_fnmadd_ps(fft1702, fft1719, fft1712);
__m512 fft1806 = _mm512_fnmadd_ps(fft1789, fft1719, fft1798);
__m512 fft1722 = _mm512_fmadd_ps(fft1705, fft1719, fft1713);
__m512 fft1807 = _mm512_fmadd_ps(fft1792, fft1719, fft1799);
__m512 fft1723 = _mm512_fnmadd_ps(fft1704, fft1719, fft1714);
__m512 fft1808 = _mm512_fnmadd_ps(fft1791, fft1719, fft1800);
__m512 fft1724 = _mm512_fmadd_ps(fft1707, fft1719, fft1715);
__m512 fft1809 = _mm512_fmadd_ps(fft1794, fft1719, fft1801);
__m512 fft1725 = _mm512_fnmadd_ps(fft1706, fft1719, fft1716);
__m512 fft1810 = _mm512_fnmadd_ps(fft1793, fft1719, fft1802);
__m512 fft1726 = _mm512_fmadd_ps(fft1709, fft1719, fft1717);
__m512 fft1811 = _mm512_fmadd_ps(fft1796, fft1719, fft1803);
__m512 fft1727 = _mm512_fnmadd_ps(fft1708, fft1719, fft1718);
__m512 fft1812 = _mm512_fnmadd_ps(fft1795, fft1719, fft1804);
__m512 fft1728 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1729 = _mm512_fmadd_ps(fft1720, fft1728, _mm512_shuffle_f32x4(fft1720, fft1720, 177));
__m512 fft1813 = _mm512_fmadd_ps(fft1805, fft1728, _mm512_shuffle_f32x4(fft1805, fft1805, 177));
__m512 fft1730 = _mm512_fmadd_ps(fft1721, fft1728, _mm512_shuffle_f32x4(fft1721, fft1721, 177));
__m512 fft1814 = _mm512_fmadd_ps(fft1806, fft1728, _mm512_shuffle_f32x4(fft1806, fft1806, 177));
__m512 fft1731 = _mm512_fmadd_ps(fft1722, fft1728, _mm512_shuffle_f32x4(fft1722, fft1722, 177));
__m512 fft1815 = _mm512_fmadd_ps(fft1807, fft1728, _mm512_shuffle_f32x4(fft1807, fft1807, 177));
__m512 fft1732 = _mm512_fmadd_ps(fft1723, fft1728, _mm512_shuffle_f32x4(fft1723, fft1723, 177));
__m512 fft1816 = _mm512_fmadd_ps(fft1808, fft1728, _mm512_shuffle_f32x4(fft1808, fft1808, 177));
__m512 fft1733 = _mm512_fmadd_ps(fft1724, fft1728, _mm512_shuffle_f32x4(fft1724, fft1724, 177));
__m512 fft1817 = _mm512_fmadd_ps(fft1809, fft1728, _mm512_shuffle_f32x4(fft1809, fft1809, 177));
__m512 fft1734 = _mm512_fmadd_ps(fft1725, fft1728, _mm512_shuffle_f32x4(fft1725, fft1725, 177));
__m512 fft1818 = _mm512_fmadd_ps(fft1810, fft1728, _mm512_shuffle_f32x4(fft1810, fft1810, 177));
__m512 fft1735 = _mm512_fmadd_ps(fft1726, fft1728, _mm512_shuffle_f32x4(fft1726, fft1726, 177));
__m512 fft1819 = _mm512_fmadd_ps(fft1811, fft1728, _mm512_shuffle_f32x4(fft1811, fft1811, 177));
__m512 fft1736 = _mm512_fmadd_ps(fft1727, fft1728, _mm512_shuffle_f32x4(fft1727, fft1727, 177));
__m512 fft1820 = _mm512_fmadd_ps(fft1812, fft1728, _mm512_shuffle_f32x4(fft1812, fft1812, 177));
__m512 fft1737 = _mm512_mask_mov_ps(fft1729, 49344, fft1730);
__m512 fft1821 = _mm512_mask_mov_ps(fft1813, 49344, fft1814);
__m512 fft1738 = _mm512_mask_sub_ps(fft1730, 49344, _mm512_setzero_ps(), fft1729);
__m512 fft1822 = _mm512_mask_sub_ps(fft1814, 49344, _mm512_setzero_ps(), fft1813);
__m512 fft1739 = _mm512_mask_mov_ps(fft1731, 49344, fft1732);
__m512 fft1823 = _mm512_mask_mov_ps(fft1815, 49344, fft1816);
__m512 fft1740 = _mm512_mask_sub_ps(fft1732, 49344, _mm512_setzero_ps(), fft1731);
__m512 fft1824 = _mm512_mask_sub_ps(fft1816, 49344, _mm512_setzero_ps(), fft1815);
__m512 fft1741 = _mm512_mask_mov_ps(fft1733, 49344, fft1734);
__m512 fft1825 = _mm512_mask_mov_ps(fft1817, 49344, fft1818);
__m512 fft1742 = _mm512_mask_sub_ps(fft1734, 49344, _mm512_setzero_ps(), fft1733);
__m512 fft1826 = _mm512_mask_sub_ps(fft1818, 49344, _mm512_setzero_ps(), fft1817);
__m512 fft1743 = _mm512_mask_mov_ps(fft1735, 49344, fft1736);
__m512 fft1827 = _mm512_mask_mov_ps(fft1819, 49344, fft1820);
__m512 fft1744 = _mm512_mask_sub_ps(fft1736, 49344, _mm512_setzero_ps(), fft1735);
__m512 fft1828 = _mm512_mask_sub_ps(fft1820, 49344, _mm512_setzero_ps(), fft1819);
__m512 fft1745 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1746 = _mm512_fmadd_ps(fft1737, fft1745, _mm512_shuffle_ps(fft1737, fft1737, 78));
__m512 fft1829 = _mm512_fmadd_ps(fft1821, fft1745, _mm512_shuffle_ps(fft1821, fft1821, 78));
__m512 fft1747 = _mm512_fmadd_ps(fft1738, fft1745, _mm512_shuffle_ps(fft1738, fft1738, 78));
__m512 fft1830 = _mm512_fmadd_ps(fft1822, fft1745, _mm512_shuffle_ps(fft1822, fft1822, 78));
__m512 fft1748 = _mm512_fmadd_ps(fft1739, fft1745, _mm512_shuffle_ps(fft1739, fft1739, 78));
__m512 fft1831 = _mm512_fmadd_ps(fft1823, fft1745, _mm512_shuffle_ps(fft1823, fft1823, 78));
__m512 fft1749 = _mm512_fmadd_ps(fft1740, fft1745, _mm512_shuffle_ps(fft1740, fft1740, 78));
__m512 fft1832 = _mm512_fmadd_ps(fft1824, fft1745, _mm512_shuffle_ps(fft1824, fft1824, 78));
__m512 fft1750 = _mm512_fmadd_ps(fft1741, fft1745, _mm512_shuffle_ps(fft1741, fft1741, 78));
__m512 fft1833 = _mm512_fmadd_ps(fft1825, fft1745, _mm512_shuffle_ps(fft1825, fft1825, 78));
__m512 fft1751 = _mm512_fmadd_ps(fft1742, fft1745, _mm512_shuffle_ps(fft1742, fft1742, 78));
__m512 fft1834 = _mm512_fmadd_ps(fft1826, fft1745, _mm512_shuffle_ps(fft1826, fft1826, 78));
__m512 fft1752 = _mm512_fmadd_ps(fft1743, fft1745, _mm512_shuffle_ps(fft1743, fft1743, 78));
__m512 fft1835 = _mm512_fmadd_ps(fft1827, fft1745, _mm512_shuffle_ps(fft1827, fft1827, 78));
__m512 fft1753 = _mm512_fmadd_ps(fft1744, fft1745, _mm512_shuffle_ps(fft1744, fft1744, 78));
__m512 fft1836 = _mm512_fmadd_ps(fft1828, fft1745, _mm512_shuffle_ps(fft1828, fft1828, 78));
__m512i fft1754 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1755 = _mm512_permutexvar_ps(fft1754, fft1746);
__m512 fft1837 = _mm512_permutexvar_ps(fft1754, fft1829);
__m512i fft1756 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1757 = _mm512_permutexvar_ps(fft1756, fft1746);
__m512 fft1838 = _mm512_permutexvar_ps(fft1756, fft1829);
__m512 fft1758 = _mm512_permutexvar_ps(fft1754, fft1747);
__m512 fft1839 = _mm512_permutexvar_ps(fft1754, fft1830);
__m512 fft1759 = _mm512_permutexvar_ps(fft1756, fft1747);
__m512 fft1840 = _mm512_permutexvar_ps(fft1756, fft1830);
__m512 fft1760 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1761 = _mm512_fmadd_ps(fft1755, fft1760, fft1757);
__m512 fft1841 = _mm512_fmadd_ps(fft1837, fft1760, fft1838);
__m512 fft1762 = _mm512_fnmadd_ps(fft1759, fft1760, fft1758);
__m512 fft1842 = _mm512_fnmadd_ps(fft1840, fft1760, fft1839);
__m512 fft1763 = _mm512_mask_mov_ps(fft1759, 21845, fft1761);
__m512 fft1843 = _mm512_mask_mov_ps(fft1840, 21845, fft1841);
__m512 fft1764 = _mm512_mask_mov_ps(fft1755, 43176, fft1761);
__m512 fft1844 = _mm512_mask_mov_ps(fft1837, 43176, fft1841);
__m512 fft1765 = _mm512_mask_mov_ps(fft1763, 43176, fft1762);
__m512 fft1845 = _mm512_mask_mov_ps(fft1843, 43176, fft1842);
__m512 fft1766 = _mm512_mask_mov_ps(fft1764, 22102, fft1762);
__m512 fft1846 = _mm512_mask_mov_ps(fft1844, 22102, fft1842);
__m512 fft1767 = _mm512_mask_mul_ps(fft1765, 64764, fft1765, _mm512_set1_ps(5e-01f));
__m512 fft1847 = _mm512_mask_mul_ps(fft1845, 64764, fft1845, _mm512_set1_ps(5e-01f));
__m512 fft1768 = _mm512_mask_mul_ps(fft1766, 64764, fft1766, _mm512_set1_ps(5e-01f));
__m512 fft1848 = _mm512_mask_mul_ps(fft1846, 64764, fft1846, _mm512_set1_ps(5e-01f));
__m512 df129 = fft1767;
__m512 df137 = fft1847;
__m512 df130 = fft1768;
__m512 df138 = fft1848;
__m512 df131 = fft1748;
__m512 df139 = fft1831;
__m512 df132 = fft1749;
__m512 df140 = fft1832;
__m512 df133 = fft1750;
__m512 df141 = fft1833;
__m512 df134 = fft1751;
__m512 df142 = fft1834;
__m512 df135 = fft1752;
__m512 df143 = fft1835;
__m512 df136 = fft1753;
__m512 df144 = fft1836;
__m512i eo11 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df131 = _mm512_permutexvar_ps(eo11, df131);
df132 = _mm512_permutexvar_ps(eo11, df132);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df131);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df132);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df131);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df132);
df139 = _mm512_permutexvar_ps(eo11, df139);
df140 = _mm512_permutexvar_ps(eo11, df140);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df139);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df140);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df139);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df140);
df133 = _mm512_permutexvar_ps(eo11, df133);
df134 = _mm512_permutexvar_ps(eo11, df134);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df133);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df134);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df133);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df134);
df141 = _mm512_permutexvar_ps(eo11, df141);
df142 = _mm512_permutexvar_ps(eo11, df142);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df141);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df142);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df141);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df142);
df135 = _mm512_permutexvar_ps(eo11, df135);
df136 = _mm512_permutexvar_ps(eo11, df136);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df135);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df136);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df135);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df136);
df143 = _mm512_permutexvar_ps(eo11, df143);
df144 = _mm512_permutexvar_ps(eo11, df144);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df143);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df144);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df143);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df144);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df129);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df130);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df129);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df130);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df137);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df138);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df137);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df138);
// Index constants for this unrolled step: b12 = 4, hence m12 = 2 and f13 = 0.
// m12 and f13 are only used in the dfPtr1 store offsets (128*m12 + 32*f13).
ptrdiff_t b12 = 4;
ptrdiff_t m12 = (size_t)b12/2;
ptrdiff_t f13 = (size_t)b12%2;
// Sixteen zero-masked unaligned loads from datPtr1. The constant offsets run
// from 8200 to 21640 in steps of 896. Mask 65528 (0xFFF8) loads lanes 3..15
// and zeroes lanes 0..2. The b12 term is multiplied by 0, so it does not
// change the address.
// NOTE(review): the offsets look like byte offsets with 896 = one row of
// 224 floats -- confirm against the declaration of datPtr1.
__m512 dat130 = _mm512_maskz_loadu_ps(65528, datPtr1+8200+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat131 = _mm512_maskz_loadu_ps(65528, datPtr1+9096+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat132 = _mm512_maskz_loadu_ps(65528, datPtr1+9992+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat133 = _mm512_maskz_loadu_ps(65528, datPtr1+10888+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat134 = _mm512_maskz_loadu_ps(65528, datPtr1+11784+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat135 = _mm512_maskz_loadu_ps(65528, datPtr1+12680+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat136 = _mm512_maskz_loadu_ps(65528, datPtr1+13576+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat137 = _mm512_maskz_loadu_ps(65528, datPtr1+14472+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat138 = _mm512_maskz_loadu_ps(65528, datPtr1+15368+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat139 = _mm512_maskz_loadu_ps(65528, datPtr1+16264+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat140 = _mm512_maskz_loadu_ps(65528, datPtr1+17160+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat141 = _mm512_maskz_loadu_ps(65528, datPtr1+18056+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat142 = _mm512_maskz_loadu_ps(65528, datPtr1+18952+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat143 = _mm512_maskz_loadu_ps(65528, datPtr1+19848+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat144 = _mm512_maskz_loadu_ps(65528, datPtr1+20744+602112*i6+200704*k6+896*h5+4*w5+0*b12);
__m512 dat145 = _mm512_maskz_loadu_ps(65528, datPtr1+21640+602112*i6+200704*k6+896*h5+4*w5+0*b12);
// Three levels of whole-vector add/sub on the sixteen loaded vectors.
// Two independent streams are interleaved statement by statement:
//   stream one: dat130, dat132, ..., dat144 -> fft1849..fft1868
//   stream two: dat131, dat133, ..., dat145 -> fft1937..fft1956
// NOTE(review): this looks like the butterfly network of a radix-2 FFT
// taken across the loaded vectors -- confirm against the generator.
//
// Level 1: sum and difference of each vector with the one loaded eight
// loads later (dat130 with dat138, dat131 with dat139, and so on).
__m512 fft1849 = _mm512_add_ps(dat130, dat138);
__m512 fft1937 = _mm512_add_ps(dat131, dat139);
__m512 fft1850 = _mm512_sub_ps(dat130, dat138);
__m512 fft1938 = _mm512_sub_ps(dat131, dat139);
__m512 fft1851 = _mm512_add_ps(dat132, dat140);
__m512 fft1939 = _mm512_add_ps(dat133, dat141);
__m512 fft1852 = _mm512_sub_ps(dat132, dat140);
__m512 fft1940 = _mm512_sub_ps(dat133, dat141);
__m512 fft1853 = _mm512_add_ps(dat134, dat142);
__m512 fft1941 = _mm512_add_ps(dat135, dat143);
__m512 fft1854 = _mm512_sub_ps(dat134, dat142);
__m512 fft1942 = _mm512_sub_ps(dat135, dat143);
__m512 fft1855 = _mm512_add_ps(dat136, dat144);
__m512 fft1943 = _mm512_add_ps(dat137, dat145);
__m512 fft1856 = _mm512_sub_ps(dat136, dat144);
__m512 fft1944 = _mm512_sub_ps(dat137, dat145);
// Level 2: combine the level-1 sums with each other and the level-1
// differences with each other.
__m512 fft1857 = _mm512_add_ps(fft1849, fft1853);
__m512 fft1945 = _mm512_add_ps(fft1937, fft1941);
__m512 fft1858 = _mm512_sub_ps(fft1849, fft1853);
__m512 fft1946 = _mm512_sub_ps(fft1937, fft1941);
__m512 fft1859 = _mm512_add_ps(fft1851, fft1855);
__m512 fft1947 = _mm512_add_ps(fft1939, fft1943);
// Operand order is reversed here (later term minus earlier term), unlike the
// other differences in this level.
__m512 fft1860 = _mm512_sub_ps(fft1855, fft1851);
__m512 fft1948 = _mm512_sub_ps(fft1943, fft1939);
__m512 fft1861 = _mm512_sub_ps(fft1852, fft1856);
__m512 fft1949 = _mm512_sub_ps(fft1940, fft1944);
__m512 fft1862 = _mm512_add_ps(fft1852, fft1856);
__m512 fft1950 = _mm512_add_ps(fft1940, fft1944);
// Level 3: plain sum/difference of the two level-2 sums ...
__m512 fft1863 = _mm512_add_ps(fft1857, fft1859);
__m512 fft1951 = _mm512_add_ps(fft1945, fft1947);
__m512 fft1864 = _mm512_sub_ps(fft1857, fft1859);
__m512 fft1952 = _mm512_sub_ps(fft1945, fft1947);
// ... and fused multiply-adds with c = 0.70710677 (about 1/sqrt(2)):
//   fmadd(a, c, b)  =  a*c + b
//   fnmsub(a, c, b) = -(a*c) - b
//   fnmadd(a, c, b) = -(a*c) + b
__m512 fft1865 = _mm512_fmadd_ps(fft1861, _mm512_set1_ps(7.0710677e-01f), fft1850);
__m512 fft1953 = _mm512_fmadd_ps(fft1949, _mm512_set1_ps(7.0710677e-01f), fft1938);
__m512 fft1866 = _mm512_fnmsub_ps(fft1862, _mm512_set1_ps(7.0710677e-01f), fft1854);
__m512 fft1954 = _mm512_fnmsub_ps(fft1950, _mm512_set1_ps(7.0710677e-01f), fft1942);
__m512 fft1867 = _mm512_fnmadd_ps(fft1861, _mm512_set1_ps(7.0710677e-01f), fft1850);
__m512 fft1955 = _mm512_fnmadd_ps(fft1949, _mm512_set1_ps(7.0710677e-01f), fft1938);
__m512 fft1868 = _mm512_fnmadd_ps(fft1862, _mm512_set1_ps(7.0710677e-01f), fft1854);
__m512 fft1956 = _mm512_fnmadd_ps(fft1950, _mm512_set1_ps(7.0710677e-01f), fft1942);
// In-register stages: from here on each step combines lanes inside a single
// vector. _mm512_set_ps lists lane 15 first and lane 0 last.
//
// Sign vector: +1 in lanes 0..7, -1 in lanes 8..15.
__m512 fft1869 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
// x*sign + swap256(x), where shuffle_f32x4 with immediate 78 swaps the two
// 256-bit halves of x. Lanes 0..7 receive low+high, lanes 8..15 receive
// low-high.
__m512 fft1870 = _mm512_fmadd_ps(fft1863, fft1869, _mm512_shuffle_f32x4(fft1863, fft1863, 78));
__m512 fft1957 = _mm512_fmadd_ps(fft1951, fft1869, _mm512_shuffle_f32x4(fft1951, fft1951, 78));
__m512 fft1871 = _mm512_fmadd_ps(fft1864, fft1869, _mm512_shuffle_f32x4(fft1864, fft1864, 78));
__m512 fft1958 = _mm512_fmadd_ps(fft1952, fft1869, _mm512_shuffle_f32x4(fft1952, fft1952, 78));
__m512 fft1872 = _mm512_fmadd_ps(fft1865, fft1869, _mm512_shuffle_f32x4(fft1865, fft1865, 78));
__m512 fft1959 = _mm512_fmadd_ps(fft1953, fft1869, _mm512_shuffle_f32x4(fft1953, fft1953, 78));
__m512 fft1873 = _mm512_fmadd_ps(fft1866, fft1869, _mm512_shuffle_f32x4(fft1866, fft1866, 78));
__m512 fft1960 = _mm512_fmadd_ps(fft1954, fft1869, _mm512_shuffle_f32x4(fft1954, fft1954, 78));
__m512 fft1874 = _mm512_fmadd_ps(fft1858, fft1869, _mm512_shuffle_f32x4(fft1858, fft1858, 78));
__m512 fft1961 = _mm512_fmadd_ps(fft1946, fft1869, _mm512_shuffle_f32x4(fft1946, fft1946, 78));
__m512 fft1875 = _mm512_fmadd_ps(fft1860, fft1869, _mm512_shuffle_f32x4(fft1860, fft1860, 78));
__m512 fft1962 = _mm512_fmadd_ps(fft1948, fft1869, _mm512_shuffle_f32x4(fft1948, fft1948, 78));
__m512 fft1876 = _mm512_fmadd_ps(fft1867, fft1869, _mm512_shuffle_f32x4(fft1867, fft1867, 78));
__m512 fft1963 = _mm512_fmadd_ps(fft1955, fft1869, _mm512_shuffle_f32x4(fft1955, fft1955, 78));
__m512 fft1877 = _mm512_fmadd_ps(fft1868, fft1869, _mm512_shuffle_f32x4(fft1868, fft1868, 78));
__m512 fft1964 = _mm512_fmadd_ps(fft1956, fft1869, _mm512_shuffle_f32x4(fft1956, fft1956, 78));
// Per-lane rotation of consecutive vector pairs (p, q), done in two halves.
// First factor (fft1878): 1 in lanes 0..9, 0.70710677 in lanes 10..11,
// 0 in lanes 12..13, -0.70710677 in lanes 14..15.
// This half forms the products p*fft1878 and q*fft1878.
// NOTE(review): presumably a complex twiddle-factor multiply with p and q
// holding real and imaginary parts -- confirm against the generator.
__m512 fft1878 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1879 = _mm512_mul_ps(fft1870, fft1878);
__m512 fft1965 = _mm512_mul_ps(fft1957, fft1878);
__m512 fft1880 = _mm512_mul_ps(fft1871, fft1878);
__m512 fft1966 = _mm512_mul_ps(fft1958, fft1878);
__m512 fft1881 = _mm512_mul_ps(fft1872, fft1878);
__m512 fft1967 = _mm512_mul_ps(fft1959, fft1878);
__m512 fft1882 = _mm512_mul_ps(fft1873, fft1878);
__m512 fft1968 = _mm512_mul_ps(fft1960, fft1878);
__m512 fft1883 = _mm512_mul_ps(fft1874, fft1878);
__m512 fft1969 = _mm512_mul_ps(fft1961, fft1878);
__m512 fft1884 = _mm512_mul_ps(fft1875, fft1878);
__m512 fft1970 = _mm512_mul_ps(fft1962, fft1878);
__m512 fft1885 = _mm512_mul_ps(fft1876, fft1878);
__m512 fft1971 = _mm512_mul_ps(fft1963, fft1878);
__m512 fft1886 = _mm512_mul_ps(fft1877, fft1878);
__m512 fft1972 = _mm512_mul_ps(fft1964, fft1878);
// Second factor (fft1887): 0 in lanes 0..9, 0.70710677 in lanes 10..11,
// 1 in lanes 12..13, 0.70710677 in lanes 14..15.
// Completes the rotation for each pair:
//   new_p = q*fft1887 + p*fft1878   (fmadd)
//   new_q = q*fft1878 - p*fft1887   (fnmadd)
// Lanes 0..9 pass through unchanged (factors 1 and 0).
__m512 fft1887 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1888 = _mm512_fmadd_ps(fft1871, fft1887, fft1879);
__m512 fft1973 = _mm512_fmadd_ps(fft1958, fft1887, fft1965);
__m512 fft1889 = _mm512_fnmadd_ps(fft1870, fft1887, fft1880);
__m512 fft1974 = _mm512_fnmadd_ps(fft1957, fft1887, fft1966);
__m512 fft1890 = _mm512_fmadd_ps(fft1873, fft1887, fft1881);
__m512 fft1975 = _mm512_fmadd_ps(fft1960, fft1887, fft1967);
__m512 fft1891 = _mm512_fnmadd_ps(fft1872, fft1887, fft1882);
__m512 fft1976 = _mm512_fnmadd_ps(fft1959, fft1887, fft1968);
__m512 fft1892 = _mm512_fmadd_ps(fft1875, fft1887, fft1883);
__m512 fft1977 = _mm512_fmadd_ps(fft1962, fft1887, fft1969);
__m512 fft1893 = _mm512_fnmadd_ps(fft1874, fft1887, fft1884);
__m512 fft1978 = _mm512_fnmadd_ps(fft1961, fft1887, fft1970);
__m512 fft1894 = _mm512_fmadd_ps(fft1877, fft1887, fft1885);
__m512 fft1979 = _mm512_fmadd_ps(fft1964, fft1887, fft1971);
__m512 fft1895 = _mm512_fnmadd_ps(fft1876, fft1887, fft1886);
__m512 fft1980 = _mm512_fnmadd_ps(fft1963, fft1887, fft1972);
// Sign vector: +1 in lanes 0..3 and 8..11, -1 in lanes 4..7 and 12..15.
__m512 fft1896 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
// x*sign + swap128(x), where shuffle_f32x4 with immediate 177 swaps adjacent
// 128-bit lanes (0<->1, 2<->3). The even 128-bit lane of each pair receives
// the sum, the odd one receives even-minus-odd.
__m512 fft1897 = _mm512_fmadd_ps(fft1888, fft1896, _mm512_shuffle_f32x4(fft1888, fft1888, 177));
__m512 fft1981 = _mm512_fmadd_ps(fft1973, fft1896, _mm512_shuffle_f32x4(fft1973, fft1973, 177));
__m512 fft1898 = _mm512_fmadd_ps(fft1889, fft1896, _mm512_shuffle_f32x4(fft1889, fft1889, 177));
__m512 fft1982 = _mm512_fmadd_ps(fft1974, fft1896, _mm512_shuffle_f32x4(fft1974, fft1974, 177));
__m512 fft1899 = _mm512_fmadd_ps(fft1890, fft1896, _mm512_shuffle_f32x4(fft1890, fft1890, 177));
__m512 fft1983 = _mm512_fmadd_ps(fft1975, fft1896, _mm512_shuffle_f32x4(fft1975, fft1975, 177));
__m512 fft1900 = _mm512_fmadd_ps(fft1891, fft1896, _mm512_shuffle_f32x4(fft1891, fft1891, 177));
__m512 fft1984 = _mm512_fmadd_ps(fft1976, fft1896, _mm512_shuffle_f32x4(fft1976, fft1976, 177));
__m512 fft1901 = _mm512_fmadd_ps(fft1892, fft1896, _mm512_shuffle_f32x4(fft1892, fft1892, 177));
__m512 fft1985 = _mm512_fmadd_ps(fft1977, fft1896, _mm512_shuffle_f32x4(fft1977, fft1977, 177));
__m512 fft1902 = _mm512_fmadd_ps(fft1893, fft1896, _mm512_shuffle_f32x4(fft1893, fft1893, 177));
__m512 fft1986 = _mm512_fmadd_ps(fft1978, fft1896, _mm512_shuffle_f32x4(fft1978, fft1978, 177));
__m512 fft1903 = _mm512_fmadd_ps(fft1894, fft1896, _mm512_shuffle_f32x4(fft1894, fft1894, 177));
__m512 fft1987 = _mm512_fmadd_ps(fft1979, fft1896, _mm512_shuffle_f32x4(fft1979, fft1979, 177));
__m512 fft1904 = _mm512_fmadd_ps(fft1895, fft1896, _mm512_shuffle_f32x4(fft1895, fft1895, 177));
__m512 fft1988 = _mm512_fmadd_ps(fft1980, fft1896, _mm512_shuffle_f32x4(fft1980, fft1980, 177));
// Masked swap-with-negate on each pair (p, q). Mask 49344 (0xC0C0) selects
// lanes 6, 7, 14 and 15. In those lanes new_p = q and new_q = 0 - p; all
// other lanes keep p and q unchanged.
__m512 fft1905 = _mm512_mask_mov_ps(fft1897, 49344, fft1898);
__m512 fft1989 = _mm512_mask_mov_ps(fft1981, 49344, fft1982);
__m512 fft1906 = _mm512_mask_sub_ps(fft1898, 49344, _mm512_setzero_ps(), fft1897);
__m512 fft1990 = _mm512_mask_sub_ps(fft1982, 49344, _mm512_setzero_ps(), fft1981);
__m512 fft1907 = _mm512_mask_mov_ps(fft1899, 49344, fft1900);
__m512 fft1991 = _mm512_mask_mov_ps(fft1983, 49344, fft1984);
__m512 fft1908 = _mm512_mask_sub_ps(fft1900, 49344, _mm512_setzero_ps(), fft1899);
__m512 fft1992 = _mm512_mask_sub_ps(fft1984, 49344, _mm512_setzero_ps(), fft1983);
__m512 fft1909 = _mm512_mask_mov_ps(fft1901, 49344, fft1902);
__m512 fft1993 = _mm512_mask_mov_ps(fft1985, 49344, fft1986);
__m512 fft1910 = _mm512_mask_sub_ps(fft1902, 49344, _mm512_setzero_ps(), fft1901);
__m512 fft1994 = _mm512_mask_sub_ps(fft1986, 49344, _mm512_setzero_ps(), fft1985);
__m512 fft1911 = _mm512_mask_mov_ps(fft1903, 49344, fft1904);
__m512 fft1995 = _mm512_mask_mov_ps(fft1987, 49344, fft1988);
__m512 fft1912 = _mm512_mask_sub_ps(fft1904, 49344, _mm512_setzero_ps(), fft1903);
__m512 fft1996 = _mm512_mask_sub_ps(fft1988, 49344, _mm512_setzero_ps(), fft1987);
// Sign vector: within every 128-bit lane, +1 in elements 0..1 and -1 in
// elements 2..3 (_mm512_set_ps lists lane 15 first).
__m512 fft1913 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
// x*sign + swap64(x), where shuffle_ps with immediate 78 swaps the two 64-bit
// halves inside each 128-bit lane. Elements 0..1 receive the sum of the two
// halves, elements 2..3 receive first-half minus second-half.
__m512 fft1914 = _mm512_fmadd_ps(fft1905, fft1913, _mm512_shuffle_ps(fft1905, fft1905, 78));
__m512 fft1997 = _mm512_fmadd_ps(fft1989, fft1913, _mm512_shuffle_ps(fft1989, fft1989, 78));
__m512 fft1915 = _mm512_fmadd_ps(fft1906, fft1913, _mm512_shuffle_ps(fft1906, fft1906, 78));
__m512 fft1998 = _mm512_fmadd_ps(fft1990, fft1913, _mm512_shuffle_ps(fft1990, fft1990, 78));
__m512 fft1916 = _mm512_fmadd_ps(fft1907, fft1913, _mm512_shuffle_ps(fft1907, fft1907, 78));
__m512 fft1999 = _mm512_fmadd_ps(fft1991, fft1913, _mm512_shuffle_ps(fft1991, fft1991, 78));
__m512 fft1917 = _mm512_fmadd_ps(fft1908, fft1913, _mm512_shuffle_ps(fft1908, fft1908, 78));
__m512 fft2000 = _mm512_fmadd_ps(fft1992, fft1913, _mm512_shuffle_ps(fft1992, fft1992, 78));
__m512 fft1918 = _mm512_fmadd_ps(fft1909, fft1913, _mm512_shuffle_ps(fft1909, fft1909, 78));
__m512 fft2001 = _mm512_fmadd_ps(fft1993, fft1913, _mm512_shuffle_ps(fft1993, fft1993, 78));
__m512 fft1919 = _mm512_fmadd_ps(fft1910, fft1913, _mm512_shuffle_ps(fft1910, fft1910, 78));
__m512 fft2002 = _mm512_fmadd_ps(fft1994, fft1913, _mm512_shuffle_ps(fft1994, fft1994, 78));
__m512 fft1920 = _mm512_fmadd_ps(fft1911, fft1913, _mm512_shuffle_ps(fft1911, fft1911, 78));
__m512 fft2003 = _mm512_fmadd_ps(fft1995, fft1913, _mm512_shuffle_ps(fft1995, fft1995, 78));
__m512 fft1921 = _mm512_fmadd_ps(fft1912, fft1913, _mm512_shuffle_ps(fft1912, fft1912, 78));
__m512 fft2004 = _mm512_fmadd_ps(fft1996, fft1913, _mm512_shuffle_ps(fft1996, fft1996, 78));
// Extra processing applied only to the first vector pair of each stream
// (fft1914/fft1915 and fft1997/fft1998). The other six vectors per stream
// are left as computed above.
// NOTE(review): presumably the real-input post-processing that separates
// the packed spectrum -- confirm against the generator.
//
// Two index vectors gather source elements, each one duplicated into an
// adjacent lane pair.
__m512i fft1922 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1923 = _mm512_permutexvar_ps(fft1922, fft1914);
__m512 fft2005 = _mm512_permutexvar_ps(fft1922, fft1997);
__m512i fft1924 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1925 = _mm512_permutexvar_ps(fft1924, fft1914);
__m512 fft2006 = _mm512_permutexvar_ps(fft1924, fft1997);
__m512 fft1926 = _mm512_permutexvar_ps(fft1922, fft1915);
__m512 fft2007 = _mm512_permutexvar_ps(fft1922, fft1998);
__m512 fft1927 = _mm512_permutexvar_ps(fft1924, fft1915);
__m512 fft2008 = _mm512_permutexvar_ps(fft1924, fft1998);
// Sign vector: 0 in lanes 0, 1, 8 and 9; elsewhere +1 in even lanes and -1 in
// odd lanes. Used to form a*sign + b (fmadd) and -(a*sign) + b (fnmadd).
__m512 fft1928 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1929 = _mm512_fmadd_ps(fft1923, fft1928, fft1925);
__m512 fft2009 = _mm512_fmadd_ps(fft2005, fft1928, fft2006);
__m512 fft1930 = _mm512_fnmadd_ps(fft1927, fft1928, fft1926);
__m512 fft2010 = _mm512_fnmadd_ps(fft2008, fft1928, fft2007);
// Lane-wise blends that assemble the two output vectors:
//   21845 (0x5555) selects the even lanes
//   43176 (0xA8A8) selects lanes 3, 5, 7, 11, 13, 15
//   22102 (0x5656) selects lanes 1, 2, 4, 6, 9, 10, 12, 14
// mask_mov(src, k, a) takes a where the mask bit is set and src elsewhere.
__m512 fft1931 = _mm512_mask_mov_ps(fft1927, 21845, fft1929);
__m512 fft2011 = _mm512_mask_mov_ps(fft2008, 21845, fft2009);
__m512 fft1932 = _mm512_mask_mov_ps(fft1923, 43176, fft1929);
__m512 fft2012 = _mm512_mask_mov_ps(fft2005, 43176, fft2009);
__m512 fft1933 = _mm512_mask_mov_ps(fft1931, 43176, fft1930);
__m512 fft2013 = _mm512_mask_mov_ps(fft2011, 43176, fft2010);
__m512 fft1934 = _mm512_mask_mov_ps(fft1932, 22102, fft1930);
__m512 fft2014 = _mm512_mask_mov_ps(fft2012, 22102, fft2010);
// Halve lanes 2..7 and 10..15 (mask 64764 = 0xFCFC). Lanes 0, 1, 8 and 9
// keep their value because the pass-through source is the same vector.
__m512 fft1935 = _mm512_mask_mul_ps(fft1933, 64764, fft1933, _mm512_set1_ps(5e-01f));
__m512 fft2015 = _mm512_mask_mul_ps(fft2013, 64764, fft2013, _mm512_set1_ps(5e-01f));
__m512 fft1936 = _mm512_mask_mul_ps(fft1934, 64764, fft1934, _mm512_set1_ps(5e-01f));
__m512 fft2016 = _mm512_mask_mul_ps(fft2014, 64764, fft2014, _mm512_set1_ps(5e-01f));
// Alias the sixteen result vectors under df names for the store phase.
// df145/df146 (stream one) and df153/df154 (stream two) are the specially
// processed first pairs. df147..df152 and df155..df160 are the remaining
// pairs of stream one and stream two respectively.
__m512 df145 = fft1935;
__m512 df153 = fft2015;
__m512 df146 = fft1936;
__m512 df154 = fft2016;
__m512 df147 = fft1916;
__m512 df155 = fft1999;
__m512 df148 = fft1917;
__m512 df156 = fft2000;
__m512 df149 = fft1918;
__m512 df157 = fft2001;
__m512 df150 = fft1919;
__m512 df158 = fft2002;
__m512 df151 = fft1920;
__m512 df159 = fft2003;
__m512 df152 = fft1921;
__m512 df160 = fft2004;
// Even/odd de-interleave index: lanes 0..7 pick source elements 0, 2, ..., 14
// and lanes 8..15 pick source elements 1, 3, ..., 15.
__m512i eo12 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Each pair is de-interleaved and then written with four masked stores.
// Mask 255 writes lanes 0..7 and mask 65280 writes lanes 8..15. The second
// vector of a pair goes 64 past the first. The constant offset of every
// 65280 store is 407008 above its matching 255 store.
// NOTE(review): if dfPtr1 is a byte pointer, lanes 8..15 start 32 bytes into
// the stored vector and so land exactly 407040 (the i6 stride) after the
// lanes 0..7 destination -- confirm against the declaration of dfPtr1.
df147 = _mm512_permutexvar_ps(eo12, df147);
df148 = _mm512_permutexvar_ps(eo12, df148);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df147);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df148);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df147);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df148);
df155 = _mm512_permutexvar_ps(eo12, df155);
df156 = _mm512_permutexvar_ps(eo12, df156);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df155);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df156);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df155);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df156);
df149 = _mm512_permutexvar_ps(eo12, df149);
df150 = _mm512_permutexvar_ps(eo12, df150);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df149);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df150);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df149);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df150);
df157 = _mm512_permutexvar_ps(eo12, df157);
df158 = _mm512_permutexvar_ps(eo12, df158);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df157);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df158);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df157);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df158);
df151 = _mm512_permutexvar_ps(eo12, df151);
df152 = _mm512_permutexvar_ps(eo12, df152);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df151);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df152);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df151);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df152);
df159 = _mm512_permutexvar_ps(eo12, df159);
df160 = _mm512_permutexvar_ps(eo12, df160);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df159);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df160);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df159);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df160);
// The first pair of each stream is stored WITHOUT the eo12 permutation; its
// lanes were already arranged by the blend step that produced it.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df145);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df146);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df145);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df146);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df153);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df154);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df153);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df154);
ptrdiff_t b13 = 5;
ptrdiff_t m13 = (size_t)b13/2;
ptrdiff_t f14 = (size_t)b13%2;
__m512 dat146 = _mm512_maskz_loadu_ps(65535, datPtr1+8240+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat147 = _mm512_maskz_loadu_ps(65535, datPtr1+9136+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat148 = _mm512_maskz_loadu_ps(65535, datPtr1+10032+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat149 = _mm512_maskz_loadu_ps(65535, datPtr1+10928+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat150 = _mm512_maskz_loadu_ps(65535, datPtr1+11824+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat151 = _mm512_maskz_loadu_ps(65535, datPtr1+12720+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat152 = _mm512_maskz_loadu_ps(65535, datPtr1+13616+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat153 = _mm512_maskz_loadu_ps(65535, datPtr1+14512+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat154 = _mm512_maskz_loadu_ps(65535, datPtr1+15408+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat155 = _mm512_maskz_loadu_ps(65535, datPtr1+16304+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat156 = _mm512_maskz_loadu_ps(65535, datPtr1+17200+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat157 = _mm512_maskz_loadu_ps(65535, datPtr1+18096+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat158 = _mm512_maskz_loadu_ps(65535, datPtr1+18992+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat159 = _mm512_maskz_loadu_ps(65535, datPtr1+19888+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat160 = _mm512_maskz_loadu_ps(65535, datPtr1+20784+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 dat161 = _mm512_maskz_loadu_ps(65535, datPtr1+21680+602112*i6+200704*k6+896*h5+4*w5+0*b13);
__m512 fft2017 = _mm512_add_ps(dat146, dat154);
__m512 fft2105 = _mm512_add_ps(dat147, dat155);
__m512 fft2018 = _mm512_sub_ps(dat146, dat154);
__m512 fft2106 = _mm512_sub_ps(dat147, dat155);
__m512 fft2019 = _mm512_add_ps(dat148, dat156);
__m512 fft2107 = _mm512_add_ps(dat149, dat157);
__m512 fft2020 = _mm512_sub_ps(dat148, dat156);
__m512 fft2108 = _mm512_sub_ps(dat149, dat157);
__m512 fft2021 = _mm512_add_ps(dat150, dat158);
__m512 fft2109 = _mm512_add_ps(dat151, dat159);
__m512 fft2022 = _mm512_sub_ps(dat150, dat158);
__m512 fft2110 = _mm512_sub_ps(dat151, dat159);
__m512 fft2023 = _mm512_add_ps(dat152, dat160);
__m512 fft2111 = _mm512_add_ps(dat153, dat161);
__m512 fft2024 = _mm512_sub_ps(dat152, dat160);
__m512 fft2112 = _mm512_sub_ps(dat153, dat161);
__m512 fft2025 = _mm512_add_ps(fft2017, fft2021);
__m512 fft2113 = _mm512_add_ps(fft2105, fft2109);
__m512 fft2026 = _mm512_sub_ps(fft2017, fft2021);
__m512 fft2114 = _mm512_sub_ps(fft2105, fft2109);
__m512 fft2027 = _mm512_add_ps(fft2019, fft2023);
__m512 fft2115 = _mm512_add_ps(fft2107, fft2111);
__m512 fft2028 = _mm512_sub_ps(fft2023, fft2019);
__m512 fft2116 = _mm512_sub_ps(fft2111, fft2107);
__m512 fft2029 = _mm512_sub_ps(fft2020, fft2024);
__m512 fft2117 = _mm512_sub_ps(fft2108, fft2112);
__m512 fft2030 = _mm512_add_ps(fft2020, fft2024);
__m512 fft2118 = _mm512_add_ps(fft2108, fft2112);
__m512 fft2031 = _mm512_add_ps(fft2025, fft2027);
__m512 fft2119 = _mm512_add_ps(fft2113, fft2115);
__m512 fft2032 = _mm512_sub_ps(fft2025, fft2027);
__m512 fft2120 = _mm512_sub_ps(fft2113, fft2115);
__m512 fft2033 = _mm512_fmadd_ps(fft2029, _mm512_set1_ps(7.0710677e-01f), fft2018);
__m512 fft2121 = _mm512_fmadd_ps(fft2117, _mm512_set1_ps(7.0710677e-01f), fft2106);
__m512 fft2034 = _mm512_fnmsub_ps(fft2030, _mm512_set1_ps(7.0710677e-01f), fft2022);
__m512 fft2122 = _mm512_fnmsub_ps(fft2118, _mm512_set1_ps(7.0710677e-01f), fft2110);
__m512 fft2035 = _mm512_fnmadd_ps(fft2029, _mm512_set1_ps(7.0710677e-01f), fft2018);
__m512 fft2123 = _mm512_fnmadd_ps(fft2117, _mm512_set1_ps(7.0710677e-01f), fft2106);
__m512 fft2036 = _mm512_fnmadd_ps(fft2030, _mm512_set1_ps(7.0710677e-01f), fft2022);
__m512 fft2124 = _mm512_fnmadd_ps(fft2118, _mm512_set1_ps(7.0710677e-01f), fft2110);
__m512 fft2037 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2038 = _mm512_fmadd_ps(fft2031, fft2037, _mm512_shuffle_f32x4(fft2031, fft2031, 78));
__m512 fft2125 = _mm512_fmadd_ps(fft2119, fft2037, _mm512_shuffle_f32x4(fft2119, fft2119, 78));
__m512 fft2039 = _mm512_fmadd_ps(fft2032, fft2037, _mm512_shuffle_f32x4(fft2032, fft2032, 78));
__m512 fft2126 = _mm512_fmadd_ps(fft2120, fft2037, _mm512_shuffle_f32x4(fft2120, fft2120, 78));
__m512 fft2040 = _mm512_fmadd_ps(fft2033, fft2037, _mm512_shuffle_f32x4(fft2033, fft2033, 78));
__m512 fft2127 = _mm512_fmadd_ps(fft2121, fft2037, _mm512_shuffle_f32x4(fft2121, fft2121, 78));
__m512 fft2041 = _mm512_fmadd_ps(fft2034, fft2037, _mm512_shuffle_f32x4(fft2034, fft2034, 78));
__m512 fft2128 = _mm512_fmadd_ps(fft2122, fft2037, _mm512_shuffle_f32x4(fft2122, fft2122, 78));
__m512 fft2042 = _mm512_fmadd_ps(fft2026, fft2037, _mm512_shuffle_f32x4(fft2026, fft2026, 78));
__m512 fft2129 = _mm512_fmadd_ps(fft2114, fft2037, _mm512_shuffle_f32x4(fft2114, fft2114, 78));
__m512 fft2043 = _mm512_fmadd_ps(fft2028, fft2037, _mm512_shuffle_f32x4(fft2028, fft2028, 78));
__m512 fft2130 = _mm512_fmadd_ps(fft2116, fft2037, _mm512_shuffle_f32x4(fft2116, fft2116, 78));
__m512 fft2044 = _mm512_fmadd_ps(fft2035, fft2037, _mm512_shuffle_f32x4(fft2035, fft2035, 78));
__m512 fft2131 = _mm512_fmadd_ps(fft2123, fft2037, _mm512_shuffle_f32x4(fft2123, fft2123, 78));
__m512 fft2045 = _mm512_fmadd_ps(fft2036, fft2037, _mm512_shuffle_f32x4(fft2036, fft2036, 78));
__m512 fft2132 = _mm512_fmadd_ps(fft2124, fft2037, _mm512_shuffle_f32x4(fft2124, fft2124, 78));
__m512 fft2046 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2047 = _mm512_mul_ps(fft2038, fft2046);
__m512 fft2133 = _mm512_mul_ps(fft2125, fft2046);
__m512 fft2048 = _mm512_mul_ps(fft2039, fft2046);
__m512 fft2134 = _mm512_mul_ps(fft2126, fft2046);
__m512 fft2049 = _mm512_mul_ps(fft2040, fft2046);
__m512 fft2135 = _mm512_mul_ps(fft2127, fft2046);
__m512 fft2050 = _mm512_mul_ps(fft2041, fft2046);
__m512 fft2136 = _mm512_mul_ps(fft2128, fft2046);
__m512 fft2051 = _mm512_mul_ps(fft2042, fft2046);
__m512 fft2137 = _mm512_mul_ps(fft2129, fft2046);
__m512 fft2052 = _mm512_mul_ps(fft2043, fft2046);
__m512 fft2138 = _mm512_mul_ps(fft2130, fft2046);
__m512 fft2053 = _mm512_mul_ps(fft2044, fft2046);
__m512 fft2139 = _mm512_mul_ps(fft2131, fft2046);
__m512 fft2054 = _mm512_mul_ps(fft2045, fft2046);
__m512 fft2140 = _mm512_mul_ps(fft2132, fft2046);
__m512 fft2055 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft2056 = _mm512_fmadd_ps(fft2039, fft2055, fft2047);
__m512 fft2141 = _mm512_fmadd_ps(fft2126, fft2055, fft2133);
__m512 fft2057 = _mm512_fnmadd_ps(fft2038, fft2055, fft2048);
__m512 fft2142 = _mm512_fnmadd_ps(fft2125, fft2055, fft2134);
__m512 fft2058 = _mm512_fmadd_ps(fft2041, fft2055, fft2049);
__m512 fft2143 = _mm512_fmadd_ps(fft2128, fft2055, fft2135);
__m512 fft2059 = _mm512_fnmadd_ps(fft2040, fft2055, fft2050);
__m512 fft2144 = _mm512_fnmadd_ps(fft2127, fft2055, fft2136);
__m512 fft2060 = _mm512_fmadd_ps(fft2043, fft2055, fft2051);
__m512 fft2145 = _mm512_fmadd_ps(fft2130, fft2055, fft2137);
__m512 fft2061 = _mm512_fnmadd_ps(fft2042, fft2055, fft2052);
__m512 fft2146 = _mm512_fnmadd_ps(fft2129, fft2055, fft2138);
__m512 fft2062 = _mm512_fmadd_ps(fft2045, fft2055, fft2053);
__m512 fft2147 = _mm512_fmadd_ps(fft2132, fft2055, fft2139);
__m512 fft2063 = _mm512_fnmadd_ps(fft2044, fft2055, fft2054);
__m512 fft2148 = _mm512_fnmadd_ps(fft2131, fft2055, fft2140);
__m512 fft2064 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft2065 = _mm512_fmadd_ps(fft2056, fft2064, _mm512_shuffle_f32x4(fft2056, fft2056, 177));
__m512 fft2149 = _mm512_fmadd_ps(fft2141, fft2064, _mm512_shuffle_f32x4(fft2141, fft2141, 177));
__m512 fft2066 = _mm512_fmadd_ps(fft2057, fft2064, _mm512_shuffle_f32x4(fft2057, fft2057, 177));
__m512 fft2150 = _mm512_fmadd_ps(fft2142, fft2064, _mm512_shuffle_f32x4(fft2142, fft2142, 177));
__m512 fft2067 = _mm512_fmadd_ps(fft2058, fft2064, _mm512_shuffle_f32x4(fft2058, fft2058, 177));
__m512 fft2151 = _mm512_fmadd_ps(fft2143, fft2064, _mm512_shuffle_f32x4(fft2143, fft2143, 177));
__m512 fft2068 = _mm512_fmadd_ps(fft2059, fft2064, _mm512_shuffle_f32x4(fft2059, fft2059, 177));
__m512 fft2152 = _mm512_fmadd_ps(fft2144, fft2064, _mm512_shuffle_f32x4(fft2144, fft2144, 177));
__m512 fft2069 = _mm512_fmadd_ps(fft2060, fft2064, _mm512_shuffle_f32x4(fft2060, fft2060, 177));
__m512 fft2153 = _mm512_fmadd_ps(fft2145, fft2064, _mm512_shuffle_f32x4(fft2145, fft2145, 177));
__m512 fft2070 = _mm512_fmadd_ps(fft2061, fft2064, _mm512_shuffle_f32x4(fft2061, fft2061, 177));
__m512 fft2154 = _mm512_fmadd_ps(fft2146, fft2064, _mm512_shuffle_f32x4(fft2146, fft2146, 177));
__m512 fft2071 = _mm512_fmadd_ps(fft2062, fft2064, _mm512_shuffle_f32x4(fft2062, fft2062, 177));
__m512 fft2155 = _mm512_fmadd_ps(fft2147, fft2064, _mm512_shuffle_f32x4(fft2147, fft2147, 177));
__m512 fft2072 = _mm512_fmadd_ps(fft2063, fft2064, _mm512_shuffle_f32x4(fft2063, fft2063, 177));
__m512 fft2156 = _mm512_fmadd_ps(fft2148, fft2064, _mm512_shuffle_f32x4(fft2148, fft2148, 177));
__m512 fft2073 = _mm512_mask_mov_ps(fft2065, 49344, fft2066);
__m512 fft2157 = _mm512_mask_mov_ps(fft2149, 49344, fft2150);
__m512 fft2074 = _mm512_mask_sub_ps(fft2066, 49344, _mm512_setzero_ps(), fft2065);
__m512 fft2158 = _mm512_mask_sub_ps(fft2150, 49344, _mm512_setzero_ps(), fft2149);
__m512 fft2075 = _mm512_mask_mov_ps(fft2067, 49344, fft2068);
__m512 fft2159 = _mm512_mask_mov_ps(fft2151, 49344, fft2152);
__m512 fft2076 = _mm512_mask_sub_ps(fft2068, 49344, _mm512_setzero_ps(), fft2067);
__m512 fft2160 = _mm512_mask_sub_ps(fft2152, 49344, _mm512_setzero_ps(), fft2151);
__m512 fft2077 = _mm512_mask_mov_ps(fft2069, 49344, fft2070);
__m512 fft2161 = _mm512_mask_mov_ps(fft2153, 49344, fft2154);
__m512 fft2078 = _mm512_mask_sub_ps(fft2070, 49344, _mm512_setzero_ps(), fft2069);
__m512 fft2162 = _mm512_mask_sub_ps(fft2154, 49344, _mm512_setzero_ps(), fft2153);
__m512 fft2079 = _mm512_mask_mov_ps(fft2071, 49344, fft2072);
__m512 fft2163 = _mm512_mask_mov_ps(fft2155, 49344, fft2156);
__m512 fft2080 = _mm512_mask_sub_ps(fft2072, 49344, _mm512_setzero_ps(), fft2071);
__m512 fft2164 = _mm512_mask_sub_ps(fft2156, 49344, _mm512_setzero_ps(), fft2155);
__m512 fft2081 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft2082 = _mm512_fmadd_ps(fft2073, fft2081, _mm512_shuffle_ps(fft2073, fft2073, 78));
__m512 fft2165 = _mm512_fmadd_ps(fft2157, fft2081, _mm512_shuffle_ps(fft2157, fft2157, 78));
__m512 fft2083 = _mm512_fmadd_ps(fft2074, fft2081, _mm512_shuffle_ps(fft2074, fft2074, 78));
__m512 fft2166 = _mm512_fmadd_ps(fft2158, fft2081, _mm512_shuffle_ps(fft2158, fft2158, 78));
__m512 fft2084 = _mm512_fmadd_ps(fft2075, fft2081, _mm512_shuffle_ps(fft2075, fft2075, 78));
__m512 fft2167 = _mm512_fmadd_ps(fft2159, fft2081, _mm512_shuffle_ps(fft2159, fft2159, 78));
__m512 fft2085 = _mm512_fmadd_ps(fft2076, fft2081, _mm512_shuffle_ps(fft2076, fft2076, 78));
__m512 fft2168 = _mm512_fmadd_ps(fft2160, fft2081, _mm512_shuffle_ps(fft2160, fft2160, 78));
__m512 fft2086 = _mm512_fmadd_ps(fft2077, fft2081, _mm512_shuffle_ps(fft2077, fft2077, 78));
__m512 fft2169 = _mm512_fmadd_ps(fft2161, fft2081, _mm512_shuffle_ps(fft2161, fft2161, 78));
__m512 fft2087 = _mm512_fmadd_ps(fft2078, fft2081, _mm512_shuffle_ps(fft2078, fft2078, 78));
__m512 fft2170 = _mm512_fmadd_ps(fft2162, fft2081, _mm512_shuffle_ps(fft2162, fft2162, 78));
__m512 fft2088 = _mm512_fmadd_ps(fft2079, fft2081, _mm512_shuffle_ps(fft2079, fft2079, 78));
__m512 fft2171 = _mm512_fmadd_ps(fft2163, fft2081, _mm512_shuffle_ps(fft2163, fft2163, 78));
__m512 fft2089 = _mm512_fmadd_ps(fft2080, fft2081, _mm512_shuffle_ps(fft2080, fft2080, 78));
__m512 fft2172 = _mm512_fmadd_ps(fft2164, fft2081, _mm512_shuffle_ps(fft2164, fft2164, 78));
__m512i fft2090 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft2091 = _mm512_permutexvar_ps(fft2090, fft2082);
__m512 fft2173 = _mm512_permutexvar_ps(fft2090, fft2165);
__m512i fft2092 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft2093 = _mm512_permutexvar_ps(fft2092, fft2082);
__m512 fft2174 = _mm512_permutexvar_ps(fft2092, fft2165);
__m512 fft2094 = _mm512_permutexvar_ps(fft2090, fft2083);
__m512 fft2175 = _mm512_permutexvar_ps(fft2090, fft2166);
__m512 fft2095 = _mm512_permutexvar_ps(fft2092, fft2083);
__m512 fft2176 = _mm512_permutexvar_ps(fft2092, fft2166);
__m512 fft2096 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft2097 = _mm512_fmadd_ps(fft2091, fft2096, fft2093);
__m512 fft2177 = _mm512_fmadd_ps(fft2173, fft2096, fft2174);
__m512 fft2098 = _mm512_fnmadd_ps(fft2095, fft2096, fft2094);
__m512 fft2178 = _mm512_fnmadd_ps(fft2176, fft2096, fft2175);
__m512 fft2099 = _mm512_mask_mov_ps(fft2095, 21845, fft2097);
__m512 fft2179 = _mm512_mask_mov_ps(fft2176, 21845, fft2177);
__m512 fft2100 = _mm512_mask_mov_ps(fft2091, 43176, fft2097);
__m512 fft2180 = _mm512_mask_mov_ps(fft2173, 43176, fft2177);
__m512 fft2101 = _mm512_mask_mov_ps(fft2099, 43176, fft2098);
__m512 fft2181 = _mm512_mask_mov_ps(fft2179, 43176, fft2178);
__m512 fft2102 = _mm512_mask_mov_ps(fft2100, 22102, fft2098);
__m512 fft2182 = _mm512_mask_mov_ps(fft2180, 22102, fft2178);
__m512 fft2103 = _mm512_mask_mul_ps(fft2101, 64764, fft2101, _mm512_set1_ps(5e-01f));
__m512 fft2183 = _mm512_mask_mul_ps(fft2181, 64764, fft2181, _mm512_set1_ps(5e-01f));
__m512 fft2104 = _mm512_mask_mul_ps(fft2102, 64764, fft2102, _mm512_set1_ps(5e-01f));
__m512 fft2184 = _mm512_mask_mul_ps(fft2182, 64764, fft2182, _mm512_set1_ps(5e-01f));
__m512 df161 = fft2103;
__m512 df169 = fft2183;
__m512 df162 = fft2104;
__m512 df170 = fft2184;
__m512 df163 = fft2084;
__m512 df171 = fft2167;
__m512 df164 = fft2085;
__m512 df172 = fft2168;
__m512 df165 = fft2086;
__m512 df173 = fft2169;
__m512 df166 = fft2087;
__m512 df174 = fft2170;
__m512 df167 = fft2088;
__m512 df175 = fft2171;
__m512 df168 = fft2089;
__m512 df176 = fft2172;
__m512i eo13 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df163 = _mm512_permutexvar_ps(eo13, df163);
df164 = _mm512_permutexvar_ps(eo13, df164);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df163);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df164);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df163);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df164);
df171 = _mm512_permutexvar_ps(eo13, df171);
df172 = _mm512_permutexvar_ps(eo13, df172);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df171);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df172);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df171);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df172);
df165 = _mm512_permutexvar_ps(eo13, df165);
df166 = _mm512_permutexvar_ps(eo13, df166);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df165);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df166);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df165);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df166);
df173 = _mm512_permutexvar_ps(eo13, df173);
df174 = _mm512_permutexvar_ps(eo13, df174);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df173);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df174);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df173);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df174);
df167 = _mm512_permutexvar_ps(eo13, df167);
df168 = _mm512_permutexvar_ps(eo13, df168);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df167);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df168);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df167);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df168);
df175 = _mm512_permutexvar_ps(eo13, df175);
df176 = _mm512_permutexvar_ps(eo13, df176);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df175);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df176);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df175);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df176);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df161);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df162);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df161);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df162);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df169);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df170);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df169);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df170);
}
if (j2 >= last1) return;
++j2;
rel2 = 4;
}
// Tile pass for row offset base2+10 at horizontal positions w6 = -220+60*rel2, stepping by 60
// per j2. Runs at most 7-rel2 iterations of j2 (j2..jj3) and returns from the enclosing function
// as soon as j2 has reached last1. For each j2, each of the three k7 values (3*s1 .. 3*s1+2) and
// each of the six b14 sub-tiles, it loads 16 rows of 16 floats from datPtr1, runs the butterfly
// network below (fft* names; presumably a 2-D real FFT of the tile -- confirm against the
// generator) and scatters the results to dfPtr1.
// NOTE(review): the offsets look like byte offsets (896 = 224*4, 200704 = 224*224*4), which
// would make datPtr1/dfPtr1 byte pointers -- their declarations are outside this view; confirm.
if (rel2 < 7) {
ptrdiff_t h6 = base2+10;
ptrdiff_t w6 = -220+60*rel2;
ptrdiff_t jj3 = 6-rel2+j2;
// j2 is advanced at the bottom of the body (after the last1 check); only w6 steps here.
for (; j2 <= jj3; w6 += 60) {
ptrdiff_t k7 = 3*s1;
ptrdiff_t kk6 = k7+2;
for (; k7 <= kk6; ++k7) {
for (ptrdiff_t b14 = 0; b14 < 6; ++b14) {
// m14 in 0..2 and f15 in 0..1 split b14 for the destination offset (128*m14 + 32*f15).
ptrdiff_t m14 = (size_t)b14/2;
ptrdiff_t f15 = (size_t)b14%2;
// Load 16 consecutive rows (row pitch 896) of 16 floats each; mask 65535 enables all 16 lanes.
// Successive b14 sub-tiles are shifted by 40, so neighbouring 16-wide loads overlap.
__m512 dat162 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat163 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat164 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat165 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat166 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat167 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat168 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat169 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat170 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat171 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat172 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat173 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat174 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat175 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat176 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k7+896*h6+4*w6+40*b14);
__m512 dat177 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k7+896*h6+4*w6+40*b14);
// Row-direction stage 1: sum/difference of row r and row r+8. Two independent chains are
// interleaved line by line: even rows (dat162, dat164, ...) feed fft2185.., odd rows
// (dat163, dat165, ...) feed fft2273...
__m512 fft2185 = _mm512_add_ps(dat162, dat170);
__m512 fft2273 = _mm512_add_ps(dat163, dat171);
__m512 fft2186 = _mm512_sub_ps(dat162, dat170);
__m512 fft2274 = _mm512_sub_ps(dat163, dat171);
__m512 fft2187 = _mm512_add_ps(dat164, dat172);
__m512 fft2275 = _mm512_add_ps(dat165, dat173);
__m512 fft2188 = _mm512_sub_ps(dat164, dat172);
__m512 fft2276 = _mm512_sub_ps(dat165, dat173);
__m512 fft2189 = _mm512_add_ps(dat166, dat174);
__m512 fft2277 = _mm512_add_ps(dat167, dat175);
__m512 fft2190 = _mm512_sub_ps(dat166, dat174);
__m512 fft2278 = _mm512_sub_ps(dat167, dat175);
__m512 fft2191 = _mm512_add_ps(dat168, dat176);
__m512 fft2279 = _mm512_add_ps(dat169, dat177);
__m512 fft2192 = _mm512_sub_ps(dat168, dat176);
__m512 fft2280 = _mm512_sub_ps(dat169, dat177);
// Row-direction stage 2: combine the stage-1 sums/differences pairwise.
__m512 fft2193 = _mm512_add_ps(fft2185, fft2189);
__m512 fft2281 = _mm512_add_ps(fft2273, fft2277);
__m512 fft2194 = _mm512_sub_ps(fft2185, fft2189);
__m512 fft2282 = _mm512_sub_ps(fft2273, fft2277);
__m512 fft2195 = _mm512_add_ps(fft2187, fft2191);
__m512 fft2283 = _mm512_add_ps(fft2275, fft2279);
__m512 fft2196 = _mm512_sub_ps(fft2191, fft2187);
__m512 fft2284 = _mm512_sub_ps(fft2279, fft2275);
__m512 fft2197 = _mm512_sub_ps(fft2188, fft2192);
__m512 fft2285 = _mm512_sub_ps(fft2276, fft2280);
__m512 fft2198 = _mm512_add_ps(fft2188, fft2192);
__m512 fft2286 = _mm512_add_ps(fft2276, fft2280);
// Row-direction stage 3: final sums/differences; 7.0710677e-01f is approximately 1/sqrt(2).
__m512 fft2199 = _mm512_add_ps(fft2193, fft2195);
__m512 fft2287 = _mm512_add_ps(fft2281, fft2283);
__m512 fft2200 = _mm512_sub_ps(fft2193, fft2195);
__m512 fft2288 = _mm512_sub_ps(fft2281, fft2283);
__m512 fft2201 = _mm512_fmadd_ps(fft2197, _mm512_set1_ps(7.0710677e-01f), fft2186);
__m512 fft2289 = _mm512_fmadd_ps(fft2285, _mm512_set1_ps(7.0710677e-01f), fft2274);
__m512 fft2202 = _mm512_fnmsub_ps(fft2198, _mm512_set1_ps(7.0710677e-01f), fft2190);
__m512 fft2290 = _mm512_fnmsub_ps(fft2286, _mm512_set1_ps(7.0710677e-01f), fft2278);
__m512 fft2203 = _mm512_fnmadd_ps(fft2197, _mm512_set1_ps(7.0710677e-01f), fft2186);
__m512 fft2291 = _mm512_fnmadd_ps(fft2285, _mm512_set1_ps(7.0710677e-01f), fft2274);
__m512 fft2204 = _mm512_fnmadd_ps(fft2198, _mm512_set1_ps(7.0710677e-01f), fft2190);
__m512 fft2292 = _mm512_fnmadd_ps(fft2286, _mm512_set1_ps(7.0710677e-01f), fft2278);
// In-register butterfly across the two 256-bit halves: shuffle_f32x4 with imm 78 swaps the
// halves, and the sign vector (+1 in lanes 0-7, -1 in lanes 8-15) makes lanes 0-7 hold
// (low + high) and lanes 8-15 hold (low - high).
__m512 fft2205 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2206 = _mm512_fmadd_ps(fft2199, fft2205, _mm512_shuffle_f32x4(fft2199, fft2199, 78));
__m512 fft2293 = _mm512_fmadd_ps(fft2287, fft2205, _mm512_shuffle_f32x4(fft2287, fft2287, 78));
__m512 fft2207 = _mm512_fmadd_ps(fft2200, fft2205, _mm512_shuffle_f32x4(fft2200, fft2200, 78));
__m512 fft2294 = _mm512_fmadd_ps(fft2288, fft2205, _mm512_shuffle_f32x4(fft2288, fft2288, 78));
__m512 fft2208 = _mm512_fmadd_ps(fft2201, fft2205, _mm512_shuffle_f32x4(fft2201, fft2201, 78));
__m512 fft2295 = _mm512_fmadd_ps(fft2289, fft2205, _mm512_shuffle_f32x4(fft2289, fft2289, 78));
__m512 fft2209 = _mm512_fmadd_ps(fft2202, fft2205, _mm512_shuffle_f32x4(fft2202, fft2202, 78));
__m512 fft2296 = _mm512_fmadd_ps(fft2290, fft2205, _mm512_shuffle_f32x4(fft2290, fft2290, 78));
__m512 fft2210 = _mm512_fmadd_ps(fft2194, fft2205, _mm512_shuffle_f32x4(fft2194, fft2194, 78));
__m512 fft2297 = _mm512_fmadd_ps(fft2282, fft2205, _mm512_shuffle_f32x4(fft2282, fft2282, 78));
__m512 fft2211 = _mm512_fmadd_ps(fft2196, fft2205, _mm512_shuffle_f32x4(fft2196, fft2196, 78));
__m512 fft2298 = _mm512_fmadd_ps(fft2284, fft2205, _mm512_shuffle_f32x4(fft2284, fft2284, 78));
__m512 fft2212 = _mm512_fmadd_ps(fft2203, fft2205, _mm512_shuffle_f32x4(fft2203, fft2203, 78));
__m512 fft2299 = _mm512_fmadd_ps(fft2291, fft2205, _mm512_shuffle_f32x4(fft2291, fft2291, 78));
__m512 fft2213 = _mm512_fmadd_ps(fft2204, fft2205, _mm512_shuffle_f32x4(fft2204, fft2204, 78));
__m512 fft2300 = _mm512_fmadd_ps(fft2292, fft2205, _mm512_shuffle_f32x4(fft2292, fft2292, 78));
// Per-lane rotation of each vector pair (a, b) -> (a*c + b*s, b*c - a*s) with c = fft2214 and
// s = fft2223 (defined below). Lanes 0-9 have c = 1, s = 0 and pass through unchanged; only
// lanes 10-15 are modified. The products with c are formed first, the s terms are fused in after.
__m512 fft2214 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2215 = _mm512_mul_ps(fft2206, fft2214);
__m512 fft2301 = _mm512_mul_ps(fft2293, fft2214);
__m512 fft2216 = _mm512_mul_ps(fft2207, fft2214);
__m512 fft2302 = _mm512_mul_ps(fft2294, fft2214);
__m512 fft2217 = _mm512_mul_ps(fft2208, fft2214);
__m512 fft2303 = _mm512_mul_ps(fft2295, fft2214);
__m512 fft2218 = _mm512_mul_ps(fft2209, fft2214);
__m512 fft2304 = _mm512_mul_ps(fft2296, fft2214);
__m512 fft2219 = _mm512_mul_ps(fft2210, fft2214);
__m512 fft2305 = _mm512_mul_ps(fft2297, fft2214);
__m512 fft2220 = _mm512_mul_ps(fft2211, fft2214);
__m512 fft2306 = _mm512_mul_ps(fft2298, fft2214);
__m512 fft2221 = _mm512_mul_ps(fft2212, fft2214);
__m512 fft2307 = _mm512_mul_ps(fft2299, fft2214);
__m512 fft2222 = _mm512_mul_ps(fft2213, fft2214);
__m512 fft2308 = _mm512_mul_ps(fft2300, fft2214);
__m512 fft2223 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft2224 = _mm512_fmadd_ps(fft2207, fft2223, fft2215);
__m512 fft2309 = _mm512_fmadd_ps(fft2294, fft2223, fft2301);
__m512 fft2225 = _mm512_fnmadd_ps(fft2206, fft2223, fft2216);
__m512 fft2310 = _mm512_fnmadd_ps(fft2293, fft2223, fft2302);
__m512 fft2226 = _mm512_fmadd_ps(fft2209, fft2223, fft2217);
__m512 fft2311 = _mm512_fmadd_ps(fft2296, fft2223, fft2303);
__m512 fft2227 = _mm512_fnmadd_ps(fft2208, fft2223, fft2218);
__m512 fft2312 = _mm512_fnmadd_ps(fft2295, fft2223, fft2304);
__m512 fft2228 = _mm512_fmadd_ps(fft2211, fft2223, fft2219);
__m512 fft2313 = _mm512_fmadd_ps(fft2298, fft2223, fft2305);
__m512 fft2229 = _mm512_fnmadd_ps(fft2210, fft2223, fft2220);
__m512 fft2314 = _mm512_fnmadd_ps(fft2297, fft2223, fft2306);
__m512 fft2230 = _mm512_fmadd_ps(fft2213, fft2223, fft2221);
__m512 fft2315 = _mm512_fmadd_ps(fft2300, fft2223, fft2307);
__m512 fft2231 = _mm512_fnmadd_ps(fft2212, fft2223, fft2222);
__m512 fft2316 = _mm512_fnmadd_ps(fft2299, fft2223, fft2308);
// Butterfly between adjacent 128-bit groups: shuffle_f32x4 with imm 177 swaps group 0 with 1
// and group 2 with 3; the sign vector negates the original value in the odd groups.
__m512 fft2232 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft2233 = _mm512_fmadd_ps(fft2224, fft2232, _mm512_shuffle_f32x4(fft2224, fft2224, 177));
__m512 fft2317 = _mm512_fmadd_ps(fft2309, fft2232, _mm512_shuffle_f32x4(fft2309, fft2309, 177));
__m512 fft2234 = _mm512_fmadd_ps(fft2225, fft2232, _mm512_shuffle_f32x4(fft2225, fft2225, 177));
__m512 fft2318 = _mm512_fmadd_ps(fft2310, fft2232, _mm512_shuffle_f32x4(fft2310, fft2310, 177));
__m512 fft2235 = _mm512_fmadd_ps(fft2226, fft2232, _mm512_shuffle_f32x4(fft2226, fft2226, 177));
__m512 fft2319 = _mm512_fmadd_ps(fft2311, fft2232, _mm512_shuffle_f32x4(fft2311, fft2311, 177));
__m512 fft2236 = _mm512_fmadd_ps(fft2227, fft2232, _mm512_shuffle_f32x4(fft2227, fft2227, 177));
__m512 fft2320 = _mm512_fmadd_ps(fft2312, fft2232, _mm512_shuffle_f32x4(fft2312, fft2312, 177));
__m512 fft2237 = _mm512_fmadd_ps(fft2228, fft2232, _mm512_shuffle_f32x4(fft2228, fft2228, 177));
__m512 fft2321 = _mm512_fmadd_ps(fft2313, fft2232, _mm512_shuffle_f32x4(fft2313, fft2313, 177));
__m512 fft2238 = _mm512_fmadd_ps(fft2229, fft2232, _mm512_shuffle_f32x4(fft2229, fft2229, 177));
__m512 fft2322 = _mm512_fmadd_ps(fft2314, fft2232, _mm512_shuffle_f32x4(fft2314, fft2314, 177));
__m512 fft2239 = _mm512_fmadd_ps(fft2230, fft2232, _mm512_shuffle_f32x4(fft2230, fft2230, 177));
__m512 fft2323 = _mm512_fmadd_ps(fft2315, fft2232, _mm512_shuffle_f32x4(fft2315, fft2315, 177));
__m512 fft2240 = _mm512_fmadd_ps(fft2231, fft2232, _mm512_shuffle_f32x4(fft2231, fft2231, 177));
__m512 fft2324 = _mm512_fmadd_ps(fft2316, fft2232, _mm512_shuffle_f32x4(fft2316, fft2316, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes each vector pair (a, b)
// becomes (b, -a) (the mask_sub computes 0 - a); all other lanes are left as they were.
__m512 fft2241 = _mm512_mask_mov_ps(fft2233, 49344, fft2234);
__m512 fft2325 = _mm512_mask_mov_ps(fft2317, 49344, fft2318);
__m512 fft2242 = _mm512_mask_sub_ps(fft2234, 49344, _mm512_setzero_ps(), fft2233);
__m512 fft2326 = _mm512_mask_sub_ps(fft2318, 49344, _mm512_setzero_ps(), fft2317);
__m512 fft2243 = _mm512_mask_mov_ps(fft2235, 49344, fft2236);
__m512 fft2327 = _mm512_mask_mov_ps(fft2319, 49344, fft2320);
__m512 fft2244 = _mm512_mask_sub_ps(fft2236, 49344, _mm512_setzero_ps(), fft2235);
__m512 fft2328 = _mm512_mask_sub_ps(fft2320, 49344, _mm512_setzero_ps(), fft2319);
__m512 fft2245 = _mm512_mask_mov_ps(fft2237, 49344, fft2238);
__m512 fft2329 = _mm512_mask_mov_ps(fft2321, 49344, fft2322);
__m512 fft2246 = _mm512_mask_sub_ps(fft2238, 49344, _mm512_setzero_ps(), fft2237);
__m512 fft2330 = _mm512_mask_sub_ps(fft2322, 49344, _mm512_setzero_ps(), fft2321);
__m512 fft2247 = _mm512_mask_mov_ps(fft2239, 49344, fft2240);
__m512 fft2331 = _mm512_mask_mov_ps(fft2323, 49344, fft2324);
__m512 fft2248 = _mm512_mask_sub_ps(fft2240, 49344, _mm512_setzero_ps(), fft2239);
__m512 fft2332 = _mm512_mask_sub_ps(fft2324, 49344, _mm512_setzero_ps(), fft2323);
// Butterfly inside each 128-bit group: shuffle_ps with imm 78 swaps the two 64-bit pairs of
// the group; the sign vector negates the original value in the upper pair.
__m512 fft2249 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft2250 = _mm512_fmadd_ps(fft2241, fft2249, _mm512_shuffle_ps(fft2241, fft2241, 78));
__m512 fft2333 = _mm512_fmadd_ps(fft2325, fft2249, _mm512_shuffle_ps(fft2325, fft2325, 78));
__m512 fft2251 = _mm512_fmadd_ps(fft2242, fft2249, _mm512_shuffle_ps(fft2242, fft2242, 78));
__m512 fft2334 = _mm512_fmadd_ps(fft2326, fft2249, _mm512_shuffle_ps(fft2326, fft2326, 78));
__m512 fft2252 = _mm512_fmadd_ps(fft2243, fft2249, _mm512_shuffle_ps(fft2243, fft2243, 78));
__m512 fft2335 = _mm512_fmadd_ps(fft2327, fft2249, _mm512_shuffle_ps(fft2327, fft2327, 78));
__m512 fft2253 = _mm512_fmadd_ps(fft2244, fft2249, _mm512_shuffle_ps(fft2244, fft2244, 78));
__m512 fft2336 = _mm512_fmadd_ps(fft2328, fft2249, _mm512_shuffle_ps(fft2328, fft2328, 78));
__m512 fft2254 = _mm512_fmadd_ps(fft2245, fft2249, _mm512_shuffle_ps(fft2245, fft2245, 78));
__m512 fft2337 = _mm512_fmadd_ps(fft2329, fft2249, _mm512_shuffle_ps(fft2329, fft2329, 78));
__m512 fft2255 = _mm512_fmadd_ps(fft2246, fft2249, _mm512_shuffle_ps(fft2246, fft2246, 78));
__m512 fft2338 = _mm512_fmadd_ps(fft2330, fft2249, _mm512_shuffle_ps(fft2330, fft2330, 78));
__m512 fft2256 = _mm512_fmadd_ps(fft2247, fft2249, _mm512_shuffle_ps(fft2247, fft2247, 78));
__m512 fft2339 = _mm512_fmadd_ps(fft2331, fft2249, _mm512_shuffle_ps(fft2331, fft2331, 78));
__m512 fft2257 = _mm512_fmadd_ps(fft2248, fft2249, _mm512_shuffle_ps(fft2248, fft2248, 78));
__m512 fft2340 = _mm512_fmadd_ps(fft2332, fft2249, _mm512_shuffle_ps(fft2332, fft2332, 78));
// Extra fix-up applied only to the first vector pair of each chain (fft2250/fft2251 and
// fft2333/fft2334): two lane gathers per vector, a +/- combination, mask blends, and finally
// a 0.5 scale on lanes 2-7 and 10-15 (mask 64764 = 0xFCFC). The remaining pairs skip this step.
__m512i fft2258 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft2259 = _mm512_permutexvar_ps(fft2258, fft2250);
__m512 fft2341 = _mm512_permutexvar_ps(fft2258, fft2333);
__m512i fft2260 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft2261 = _mm512_permutexvar_ps(fft2260, fft2250);
__m512 fft2342 = _mm512_permutexvar_ps(fft2260, fft2333);
__m512 fft2262 = _mm512_permutexvar_ps(fft2258, fft2251);
__m512 fft2343 = _mm512_permutexvar_ps(fft2258, fft2334);
__m512 fft2263 = _mm512_permutexvar_ps(fft2260, fft2251);
__m512 fft2344 = _mm512_permutexvar_ps(fft2260, fft2334);
__m512 fft2264 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft2265 = _mm512_fmadd_ps(fft2259, fft2264, fft2261);
__m512 fft2345 = _mm512_fmadd_ps(fft2341, fft2264, fft2342);
__m512 fft2266 = _mm512_fnmadd_ps(fft2263, fft2264, fft2262);
__m512 fft2346 = _mm512_fnmadd_ps(fft2344, fft2264, fft2343);
// Blend masks: 21845 = 0x5555 (even lanes), 43176 = 0xA8A8, 22102 = 0x5656.
__m512 fft2267 = _mm512_mask_mov_ps(fft2263, 21845, fft2265);
__m512 fft2347 = _mm512_mask_mov_ps(fft2344, 21845, fft2345);
__m512 fft2268 = _mm512_mask_mov_ps(fft2259, 43176, fft2265);
__m512 fft2348 = _mm512_mask_mov_ps(fft2341, 43176, fft2345);
__m512 fft2269 = _mm512_mask_mov_ps(fft2267, 43176, fft2266);
__m512 fft2349 = _mm512_mask_mov_ps(fft2347, 43176, fft2346);
__m512 fft2270 = _mm512_mask_mov_ps(fft2268, 22102, fft2266);
__m512 fft2350 = _mm512_mask_mov_ps(fft2348, 22102, fft2346);
__m512 fft2271 = _mm512_mask_mul_ps(fft2269, 64764, fft2269, _mm512_set1_ps(5e-01f));
__m512 fft2351 = _mm512_mask_mul_ps(fft2349, 64764, fft2349, _mm512_set1_ps(5e-01f));
__m512 fft2272 = _mm512_mask_mul_ps(fft2270, 64764, fft2270, _mm512_set1_ps(5e-01f));
__m512 fft2352 = _mm512_mask_mul_ps(fft2350, 64764, fft2350, _mm512_set1_ps(5e-01f));
// Rename the 16 result vectors for the store phase (df177-df184 from the even-row chain,
// df185-df192 from the odd-row chain).
__m512 df177 = fft2271;
__m512 df185 = fft2351;
__m512 df178 = fft2272;
__m512 df186 = fft2352;
__m512 df179 = fft2252;
__m512 df187 = fft2335;
__m512 df180 = fft2253;
__m512 df188 = fft2336;
__m512 df181 = fft2254;
__m512 df189 = fft2337;
__m512 df182 = fft2255;
__m512 df190 = fft2338;
__m512 df183 = fft2256;
__m512 df191 = fft2339;
__m512 df184 = fft2257;
__m512 df192 = fft2340;
// Store phase. eo14 gathers the even-indexed lanes into lanes 0-7 and the odd-indexed lanes
// into lanes 8-15. Mask 255 stores lanes 0-7 at the given address; mask 65280 stores lanes
// 8-15, and its address constant is biased by -32 so that (if dfPtr1 is a byte pointer --
// confirm) lane 8 lands exactly 407040 past the start of the matching mask-255 store.
__m512i eo14 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df179 = _mm512_permutexvar_ps(eo14, df179);
df180 = _mm512_permutexvar_ps(eo14, df180);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df179);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df180);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df179);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df180);
df187 = _mm512_permutexvar_ps(eo14, df187);
df188 = _mm512_permutexvar_ps(eo14, df188);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df187);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df188);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df187);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df188);
df181 = _mm512_permutexvar_ps(eo14, df181);
df182 = _mm512_permutexvar_ps(eo14, df182);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df181);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df182);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df181);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df182);
df189 = _mm512_permutexvar_ps(eo14, df189);
df190 = _mm512_permutexvar_ps(eo14, df190);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df189);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df190);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df189);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df190);
df183 = _mm512_permutexvar_ps(eo14, df183);
df184 = _mm512_permutexvar_ps(eo14, df184);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df183);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df184);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df183);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df184);
df191 = _mm512_permutexvar_ps(eo14, df191);
df192 = _mm512_permutexvar_ps(eo14, df192);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df191);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df192);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df191);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df192);
// df177/df178 and df185/df186 (the vectors that went through the extra fix-up) are stored
// without the eo14 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df177);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df178);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df177);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df178);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df185);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df186);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df185);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df186);
}
}
// Return from the enclosing function once j2 has reached last1; otherwise advance j2
// (w6 is advanced by the for-increment).
if (j2 >= last1) return;
++j2;
}
// All positions handled here are done; continue with the rel2 == 7 case that follows.
rel2 = 7;
}
if (rel2 < 8) {
ptrdiff_t h7 = base2+10;
ptrdiff_t w7 = 200;
ptrdiff_t k8 = 3*s1;
ptrdiff_t kk7 = k8+2;
for (; k8 <= kk7; ++k8) {
// Processes two tiles (b15 = 0 and b15 = 1). For each tile: load 16 full
// vectors from datPtr1, run them through a fixed sequence of add/sub and
// lane-shuffle stages (the generator names these values fft*), and scatter
// the 16 result vectors into dfPtr1 with half-vector masked stores.
// m15 is always 0 here (b15/2 with b15 < 2); f16 equals b15.
// NOTE(review): the offsets look like byte offsets (896 = 224*4, matching
// Width=224 in the graph header) -- confirm against the declarations of
// datPtr1/dfPtr1, which are outside this span.
for (ptrdiff_t b15 = 0; b15 < 2; ++b15) {
ptrdiff_t m15 = (size_t)b15/2;
ptrdiff_t f16 = (size_t)b15%2;
// Load 16 vectors, all 16 lanes each (mask 65535 = 0xFFFF). Consecutive
// loads are 896 apart; each b15 step shifts the start by 40.
__m512 dat178 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat179 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat180 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat181 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat182 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat183 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat184 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat185 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat186 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat187 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat188 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat189 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat190 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat191 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat192 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k8+896*h7+4*w7+40*b15);
__m512 dat193 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k8+896*h7+4*w7+40*b15);
// Cross-vector stage 1: sum and difference of loads that are 8 apart.
// Two independent chains are interleaved line by line: the even-numbered
// loads (dat178, dat180, ...) feed fft2353.., the odd-numbered loads
// (dat179, dat181, ...) feed fft2441...
__m512 fft2353 = _mm512_add_ps(dat178, dat186);
__m512 fft2441 = _mm512_add_ps(dat179, dat187);
__m512 fft2354 = _mm512_sub_ps(dat178, dat186);
__m512 fft2442 = _mm512_sub_ps(dat179, dat187);
__m512 fft2355 = _mm512_add_ps(dat180, dat188);
__m512 fft2443 = _mm512_add_ps(dat181, dat189);
__m512 fft2356 = _mm512_sub_ps(dat180, dat188);
__m512 fft2444 = _mm512_sub_ps(dat181, dat189);
__m512 fft2357 = _mm512_add_ps(dat182, dat190);
__m512 fft2445 = _mm512_add_ps(dat183, dat191);
__m512 fft2358 = _mm512_sub_ps(dat182, dat190);
__m512 fft2446 = _mm512_sub_ps(dat183, dat191);
__m512 fft2359 = _mm512_add_ps(dat184, dat192);
__m512 fft2447 = _mm512_add_ps(dat185, dat193);
__m512 fft2360 = _mm512_sub_ps(dat184, dat192);
__m512 fft2448 = _mm512_sub_ps(dat185, dat193);
// Cross-vector stage 2: combine the stage-1 results pairwise.
__m512 fft2361 = _mm512_add_ps(fft2353, fft2357);
__m512 fft2449 = _mm512_add_ps(fft2441, fft2445);
__m512 fft2362 = _mm512_sub_ps(fft2353, fft2357);
__m512 fft2450 = _mm512_sub_ps(fft2441, fft2445);
__m512 fft2363 = _mm512_add_ps(fft2355, fft2359);
__m512 fft2451 = _mm512_add_ps(fft2443, fft2447);
__m512 fft2364 = _mm512_sub_ps(fft2359, fft2355);
__m512 fft2452 = _mm512_sub_ps(fft2447, fft2443);
__m512 fft2365 = _mm512_sub_ps(fft2356, fft2360);
__m512 fft2453 = _mm512_sub_ps(fft2444, fft2448);
__m512 fft2366 = _mm512_add_ps(fft2356, fft2360);
__m512 fft2454 = _mm512_add_ps(fft2444, fft2448);
// Cross-vector stage 3: final sums/differences, plus fused multiply-adds
// that scale by 7.0710677e-01f (approximately 1/sqrt(2)).
// fnmsub computes -(a*b)-c; fnmadd computes -(a*b)+c.
__m512 fft2367 = _mm512_add_ps(fft2361, fft2363);
__m512 fft2455 = _mm512_add_ps(fft2449, fft2451);
__m512 fft2368 = _mm512_sub_ps(fft2361, fft2363);
__m512 fft2456 = _mm512_sub_ps(fft2449, fft2451);
__m512 fft2369 = _mm512_fmadd_ps(fft2365, _mm512_set1_ps(7.0710677e-01f), fft2354);
__m512 fft2457 = _mm512_fmadd_ps(fft2453, _mm512_set1_ps(7.0710677e-01f), fft2442);
__m512 fft2370 = _mm512_fnmsub_ps(fft2366, _mm512_set1_ps(7.0710677e-01f), fft2358);
__m512 fft2458 = _mm512_fnmsub_ps(fft2454, _mm512_set1_ps(7.0710677e-01f), fft2446);
__m512 fft2371 = _mm512_fnmadd_ps(fft2365, _mm512_set1_ps(7.0710677e-01f), fft2354);
__m512 fft2459 = _mm512_fnmadd_ps(fft2453, _mm512_set1_ps(7.0710677e-01f), fft2442);
__m512 fft2372 = _mm512_fnmadd_ps(fft2366, _mm512_set1_ps(7.0710677e-01f), fft2358);
__m512 fft2460 = _mm512_fnmadd_ps(fft2454, _mm512_set1_ps(7.0710677e-01f), fft2446);
// In-register stage A: shuffle_f32x4 with imm 78 swaps the two 256-bit
// halves; with the sign vector (+1 in lanes 0..7, -1 in lanes 8..15) each
// result holds low+high in lanes 0..7 and low-high in lanes 8..15.
__m512 fft2373 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2374 = _mm512_fmadd_ps(fft2367, fft2373, _mm512_shuffle_f32x4(fft2367, fft2367, 78));
__m512 fft2461 = _mm512_fmadd_ps(fft2455, fft2373, _mm512_shuffle_f32x4(fft2455, fft2455, 78));
__m512 fft2375 = _mm512_fmadd_ps(fft2368, fft2373, _mm512_shuffle_f32x4(fft2368, fft2368, 78));
__m512 fft2462 = _mm512_fmadd_ps(fft2456, fft2373, _mm512_shuffle_f32x4(fft2456, fft2456, 78));
__m512 fft2376 = _mm512_fmadd_ps(fft2369, fft2373, _mm512_shuffle_f32x4(fft2369, fft2369, 78));
__m512 fft2463 = _mm512_fmadd_ps(fft2457, fft2373, _mm512_shuffle_f32x4(fft2457, fft2457, 78));
__m512 fft2377 = _mm512_fmadd_ps(fft2370, fft2373, _mm512_shuffle_f32x4(fft2370, fft2370, 78));
__m512 fft2464 = _mm512_fmadd_ps(fft2458, fft2373, _mm512_shuffle_f32x4(fft2458, fft2458, 78));
__m512 fft2378 = _mm512_fmadd_ps(fft2362, fft2373, _mm512_shuffle_f32x4(fft2362, fft2362, 78));
__m512 fft2465 = _mm512_fmadd_ps(fft2450, fft2373, _mm512_shuffle_f32x4(fft2450, fft2450, 78));
__m512 fft2379 = _mm512_fmadd_ps(fft2364, fft2373, _mm512_shuffle_f32x4(fft2364, fft2364, 78));
__m512 fft2466 = _mm512_fmadd_ps(fft2452, fft2373, _mm512_shuffle_f32x4(fft2452, fft2452, 78));
__m512 fft2380 = _mm512_fmadd_ps(fft2371, fft2373, _mm512_shuffle_f32x4(fft2371, fft2371, 78));
__m512 fft2467 = _mm512_fmadd_ps(fft2459, fft2373, _mm512_shuffle_f32x4(fft2459, fft2459, 78));
__m512 fft2381 = _mm512_fmadd_ps(fft2372, fft2373, _mm512_shuffle_f32x4(fft2372, fft2372, 78));
__m512 fft2468 = _mm512_fmadd_ps(fft2460, fft2373, _mm512_shuffle_f32x4(fft2460, fft2460, 78));
// Per-lane constant multiply, part 1: scale every stage-A result by fft2382.
// fft2382 is 1 in lanes 0..9, so those lanes are unchanged by this product.
__m512 fft2382 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2383 = _mm512_mul_ps(fft2374, fft2382);
__m512 fft2469 = _mm512_mul_ps(fft2461, fft2382);
__m512 fft2384 = _mm512_mul_ps(fft2375, fft2382);
__m512 fft2470 = _mm512_mul_ps(fft2462, fft2382);
__m512 fft2385 = _mm512_mul_ps(fft2376, fft2382);
__m512 fft2471 = _mm512_mul_ps(fft2463, fft2382);
__m512 fft2386 = _mm512_mul_ps(fft2377, fft2382);
__m512 fft2472 = _mm512_mul_ps(fft2464, fft2382);
__m512 fft2387 = _mm512_mul_ps(fft2378, fft2382);
__m512 fft2473 = _mm512_mul_ps(fft2465, fft2382);
__m512 fft2388 = _mm512_mul_ps(fft2379, fft2382);
__m512 fft2474 = _mm512_mul_ps(fft2466, fft2382);
__m512 fft2389 = _mm512_mul_ps(fft2380, fft2382);
__m512 fft2475 = _mm512_mul_ps(fft2467, fft2382);
__m512 fft2390 = _mm512_mul_ps(fft2381, fft2382);
__m512 fft2476 = _mm512_mul_ps(fft2468, fft2382);
// Per-lane constant multiply, part 2: each vector pair (e.g. fft2374/fft2375)
// is cross-combined using fft2391, which is 0 in lanes 0..9, so only
// lanes 10..15 receive a contribution from the partner vector.
__m512 fft2391 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft2392 = _mm512_fmadd_ps(fft2375, fft2391, fft2383);
__m512 fft2477 = _mm512_fmadd_ps(fft2462, fft2391, fft2469);
__m512 fft2393 = _mm512_fnmadd_ps(fft2374, fft2391, fft2384);
__m512 fft2478 = _mm512_fnmadd_ps(fft2461, fft2391, fft2470);
__m512 fft2394 = _mm512_fmadd_ps(fft2377, fft2391, fft2385);
__m512 fft2479 = _mm512_fmadd_ps(fft2464, fft2391, fft2471);
__m512 fft2395 = _mm512_fnmadd_ps(fft2376, fft2391, fft2386);
__m512 fft2480 = _mm512_fnmadd_ps(fft2463, fft2391, fft2472);
__m512 fft2396 = _mm512_fmadd_ps(fft2379, fft2391, fft2387);
__m512 fft2481 = _mm512_fmadd_ps(fft2466, fft2391, fft2473);
__m512 fft2397 = _mm512_fnmadd_ps(fft2378, fft2391, fft2388);
__m512 fft2482 = _mm512_fnmadd_ps(fft2465, fft2391, fft2474);
__m512 fft2398 = _mm512_fmadd_ps(fft2381, fft2391, fft2389);
__m512 fft2483 = _mm512_fmadd_ps(fft2468, fft2391, fft2475);
__m512 fft2399 = _mm512_fnmadd_ps(fft2380, fft2391, fft2390);
__m512 fft2484 = _mm512_fnmadd_ps(fft2467, fft2391, fft2476);
// In-register stage B: shuffle_f32x4 with imm 177 swaps adjacent 128-bit
// lanes; the sign vector is +1 in lanes 0..3 and 8..11, -1 in lanes 4..7
// and 12..15.
__m512 fft2400 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft2401 = _mm512_fmadd_ps(fft2392, fft2400, _mm512_shuffle_f32x4(fft2392, fft2392, 177));
__m512 fft2485 = _mm512_fmadd_ps(fft2477, fft2400, _mm512_shuffle_f32x4(fft2477, fft2477, 177));
__m512 fft2402 = _mm512_fmadd_ps(fft2393, fft2400, _mm512_shuffle_f32x4(fft2393, fft2393, 177));
__m512 fft2486 = _mm512_fmadd_ps(fft2478, fft2400, _mm512_shuffle_f32x4(fft2478, fft2478, 177));
__m512 fft2403 = _mm512_fmadd_ps(fft2394, fft2400, _mm512_shuffle_f32x4(fft2394, fft2394, 177));
__m512 fft2487 = _mm512_fmadd_ps(fft2479, fft2400, _mm512_shuffle_f32x4(fft2479, fft2479, 177));
__m512 fft2404 = _mm512_fmadd_ps(fft2395, fft2400, _mm512_shuffle_f32x4(fft2395, fft2395, 177));
__m512 fft2488 = _mm512_fmadd_ps(fft2480, fft2400, _mm512_shuffle_f32x4(fft2480, fft2480, 177));
__m512 fft2405 = _mm512_fmadd_ps(fft2396, fft2400, _mm512_shuffle_f32x4(fft2396, fft2396, 177));
__m512 fft2489 = _mm512_fmadd_ps(fft2481, fft2400, _mm512_shuffle_f32x4(fft2481, fft2481, 177));
__m512 fft2406 = _mm512_fmadd_ps(fft2397, fft2400, _mm512_shuffle_f32x4(fft2397, fft2397, 177));
__m512 fft2490 = _mm512_fmadd_ps(fft2482, fft2400, _mm512_shuffle_f32x4(fft2482, fft2482, 177));
__m512 fft2407 = _mm512_fmadd_ps(fft2398, fft2400, _mm512_shuffle_f32x4(fft2398, fft2398, 177));
__m512 fft2491 = _mm512_fmadd_ps(fft2483, fft2400, _mm512_shuffle_f32x4(fft2483, fft2483, 177));
__m512 fft2408 = _mm512_fmadd_ps(fft2399, fft2400, _mm512_shuffle_f32x4(fft2399, fft2399, 177));
__m512 fft2492 = _mm512_fmadd_ps(fft2484, fft2400, _mm512_shuffle_f32x4(fft2484, fft2484, 177));
// Pairwise exchange on mask 49344 (0xC0C0 = lanes 6, 7, 14, 15): the first
// vector of each pair takes those lanes from its partner, and the partner
// takes the negated lanes of the first (0 - x); other lanes are unchanged.
__m512 fft2409 = _mm512_mask_mov_ps(fft2401, 49344, fft2402);
__m512 fft2493 = _mm512_mask_mov_ps(fft2485, 49344, fft2486);
__m512 fft2410 = _mm512_mask_sub_ps(fft2402, 49344, _mm512_setzero_ps(), fft2401);
__m512 fft2494 = _mm512_mask_sub_ps(fft2486, 49344, _mm512_setzero_ps(), fft2485);
__m512 fft2411 = _mm512_mask_mov_ps(fft2403, 49344, fft2404);
__m512 fft2495 = _mm512_mask_mov_ps(fft2487, 49344, fft2488);
__m512 fft2412 = _mm512_mask_sub_ps(fft2404, 49344, _mm512_setzero_ps(), fft2403);
__m512 fft2496 = _mm512_mask_sub_ps(fft2488, 49344, _mm512_setzero_ps(), fft2487);
__m512 fft2413 = _mm512_mask_mov_ps(fft2405, 49344, fft2406);
__m512 fft2497 = _mm512_mask_mov_ps(fft2489, 49344, fft2490);
__m512 fft2414 = _mm512_mask_sub_ps(fft2406, 49344, _mm512_setzero_ps(), fft2405);
__m512 fft2498 = _mm512_mask_sub_ps(fft2490, 49344, _mm512_setzero_ps(), fft2489);
__m512 fft2415 = _mm512_mask_mov_ps(fft2407, 49344, fft2408);
__m512 fft2499 = _mm512_mask_mov_ps(fft2491, 49344, fft2492);
__m512 fft2416 = _mm512_mask_sub_ps(fft2408, 49344, _mm512_setzero_ps(), fft2407);
__m512 fft2500 = _mm512_mask_sub_ps(fft2492, 49344, _mm512_setzero_ps(), fft2491);
// In-register stage C: shuffle_ps with imm 78 swaps the two 64-bit halves
// inside every 128-bit lane; the sign vector is +1 on elements 0,1 and -1
// on elements 2,3 of each 128-bit lane.
__m512 fft2417 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft2418 = _mm512_fmadd_ps(fft2409, fft2417, _mm512_shuffle_ps(fft2409, fft2409, 78));
__m512 fft2501 = _mm512_fmadd_ps(fft2493, fft2417, _mm512_shuffle_ps(fft2493, fft2493, 78));
__m512 fft2419 = _mm512_fmadd_ps(fft2410, fft2417, _mm512_shuffle_ps(fft2410, fft2410, 78));
__m512 fft2502 = _mm512_fmadd_ps(fft2494, fft2417, _mm512_shuffle_ps(fft2494, fft2494, 78));
__m512 fft2420 = _mm512_fmadd_ps(fft2411, fft2417, _mm512_shuffle_ps(fft2411, fft2411, 78));
__m512 fft2503 = _mm512_fmadd_ps(fft2495, fft2417, _mm512_shuffle_ps(fft2495, fft2495, 78));
__m512 fft2421 = _mm512_fmadd_ps(fft2412, fft2417, _mm512_shuffle_ps(fft2412, fft2412, 78));
__m512 fft2504 = _mm512_fmadd_ps(fft2496, fft2417, _mm512_shuffle_ps(fft2496, fft2496, 78));
__m512 fft2422 = _mm512_fmadd_ps(fft2413, fft2417, _mm512_shuffle_ps(fft2413, fft2413, 78));
__m512 fft2505 = _mm512_fmadd_ps(fft2497, fft2417, _mm512_shuffle_ps(fft2497, fft2497, 78));
__m512 fft2423 = _mm512_fmadd_ps(fft2414, fft2417, _mm512_shuffle_ps(fft2414, fft2414, 78));
__m512 fft2506 = _mm512_fmadd_ps(fft2498, fft2417, _mm512_shuffle_ps(fft2498, fft2498, 78));
__m512 fft2424 = _mm512_fmadd_ps(fft2415, fft2417, _mm512_shuffle_ps(fft2415, fft2415, 78));
__m512 fft2507 = _mm512_fmadd_ps(fft2499, fft2417, _mm512_shuffle_ps(fft2499, fft2499, 78));
__m512 fft2425 = _mm512_fmadd_ps(fft2416, fft2417, _mm512_shuffle_ps(fft2416, fft2416, 78));
__m512 fft2508 = _mm512_fmadd_ps(fft2500, fft2417, _mm512_shuffle_ps(fft2500, fft2500, 78));
// Extra post-processing applied only to the first vector pair of each chain
// (fft2418/fft2419 and fft2501/fft2502): two index-table permutes, a
// +/-1/0 weighted combine, mask blends, and a final masked multiply by
// 5e-01f on mask 64764 (0xFCFC = lanes 2..7 and 10..15).
// The remaining stage-C vectors go to the stores without this step.
__m512i fft2426 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft2427 = _mm512_permutexvar_ps(fft2426, fft2418);
__m512 fft2509 = _mm512_permutexvar_ps(fft2426, fft2501);
__m512i fft2428 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft2429 = _mm512_permutexvar_ps(fft2428, fft2418);
__m512 fft2510 = _mm512_permutexvar_ps(fft2428, fft2501);
__m512 fft2430 = _mm512_permutexvar_ps(fft2426, fft2419);
__m512 fft2511 = _mm512_permutexvar_ps(fft2426, fft2502);
__m512 fft2431 = _mm512_permutexvar_ps(fft2428, fft2419);
__m512 fft2512 = _mm512_permutexvar_ps(fft2428, fft2502);
__m512 fft2432 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft2433 = _mm512_fmadd_ps(fft2427, fft2432, fft2429);
__m512 fft2513 = _mm512_fmadd_ps(fft2509, fft2432, fft2510);
__m512 fft2434 = _mm512_fnmadd_ps(fft2431, fft2432, fft2430);
__m512 fft2514 = _mm512_fnmadd_ps(fft2512, fft2432, fft2511);
__m512 fft2435 = _mm512_mask_mov_ps(fft2431, 21845, fft2433);
__m512 fft2515 = _mm512_mask_mov_ps(fft2512, 21845, fft2513);
__m512 fft2436 = _mm512_mask_mov_ps(fft2427, 43176, fft2433);
__m512 fft2516 = _mm512_mask_mov_ps(fft2509, 43176, fft2513);
__m512 fft2437 = _mm512_mask_mov_ps(fft2435, 43176, fft2434);
__m512 fft2517 = _mm512_mask_mov_ps(fft2515, 43176, fft2514);
__m512 fft2438 = _mm512_mask_mov_ps(fft2436, 22102, fft2434);
__m512 fft2518 = _mm512_mask_mov_ps(fft2516, 22102, fft2514);
__m512 fft2439 = _mm512_mask_mul_ps(fft2437, 64764, fft2437, _mm512_set1_ps(5e-01f));
__m512 fft2519 = _mm512_mask_mul_ps(fft2517, 64764, fft2517, _mm512_set1_ps(5e-01f));
__m512 fft2440 = _mm512_mask_mul_ps(fft2438, 64764, fft2438, _mm512_set1_ps(5e-01f));
__m512 fft2520 = _mm512_mask_mul_ps(fft2518, 64764, fft2518, _mm512_set1_ps(5e-01f));
// Rename results to df193..df208: df193..df200 come from the first chain,
// df201..df208 from the second chain.
__m512 df193 = fft2439;
__m512 df201 = fft2519;
__m512 df194 = fft2440;
__m512 df202 = fft2520;
__m512 df195 = fft2420;
__m512 df203 = fft2503;
__m512 df196 = fft2421;
__m512 df204 = fft2504;
__m512 df197 = fft2422;
__m512 df205 = fft2505;
__m512 df198 = fft2423;
__m512 df206 = fft2506;
__m512 df199 = fft2424;
__m512 df207 = fft2507;
__m512 df200 = fft2425;
__m512 df208 = fft2508;
// eo15 moves even-indexed lanes into lanes 0..7 and odd-indexed lanes into
// lanes 8..15. It is applied to every df vector except df193, df194, df201
// and df202, which are stored as-is at the end.
// Each vector is then written with two masked stores: mask 255 (lanes 0..7)
// and mask 65280 (lanes 8..15) to a second destination.
// NOTE(review): if dfPtr1 is a byte pointer, the "-32" folded into the
// second offsets (e.g. 508768 = 101760 + 407040 - 32) cancels the 32-byte
// position of lanes 8..15, placing them 407040 past the first store --
// confirm against dfPtr1's declaration.
__m512i eo15 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df195 = _mm512_permutexvar_ps(eo15, df195);
df196 = _mm512_permutexvar_ps(eo15, df196);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df195);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df196);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df195);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df196);
df203 = _mm512_permutexvar_ps(eo15, df203);
df204 = _mm512_permutexvar_ps(eo15, df204);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df203);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df204);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df203);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df204);
df197 = _mm512_permutexvar_ps(eo15, df197);
df198 = _mm512_permutexvar_ps(eo15, df198);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df197);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df198);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df197);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df198);
df205 = _mm512_permutexvar_ps(eo15, df205);
df206 = _mm512_permutexvar_ps(eo15, df206);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df205);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df206);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df205);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df206);
df199 = _mm512_permutexvar_ps(eo15, df199);
df200 = _mm512_permutexvar_ps(eo15, df200);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df199);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df200);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df199);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df200);
df207 = _mm512_permutexvar_ps(eo15, df207);
df208 = _mm512_permutexvar_ps(eo15, df208);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df207);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df208);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df207);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df208);
// The specially post-processed vectors are stored without the eo15 permute.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df193);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df194);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df193);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df194);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df201);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df202);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df201);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df202);
}
// Third tile of this group (b16 = 2, so m16 = 1 and f17 = 0). It runs the
// same load / transform / store sequence as the preceding b15 loop, but the
// loads are partial: mask 127 (0x7F) reads only lanes 0..6 and the maskz
// load zeroes lanes 7..15.
// NOTE(review): presumably this is the tile that overlaps the right edge of
// the input, with the zeroed lanes acting as padding -- confirm against the
// generator / enclosing code, which is outside this span.
ptrdiff_t b16 = 2;
ptrdiff_t m16 = (size_t)b16/2;
ptrdiff_t f17 = (size_t)b16%2;
// 16 partial loads, 896 apart, starting 80 past the tile-0 start.
// The trailing 0*b16 term contributes nothing to the address.
__m512 dat194 = _mm512_maskz_loadu_ps(127, datPtr1+80+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat195 = _mm512_maskz_loadu_ps(127, datPtr1+976+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat196 = _mm512_maskz_loadu_ps(127, datPtr1+1872+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat197 = _mm512_maskz_loadu_ps(127, datPtr1+2768+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat198 = _mm512_maskz_loadu_ps(127, datPtr1+3664+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat199 = _mm512_maskz_loadu_ps(127, datPtr1+4560+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat200 = _mm512_maskz_loadu_ps(127, datPtr1+5456+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat201 = _mm512_maskz_loadu_ps(127, datPtr1+6352+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat202 = _mm512_maskz_loadu_ps(127, datPtr1+7248+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat203 = _mm512_maskz_loadu_ps(127, datPtr1+8144+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat204 = _mm512_maskz_loadu_ps(127, datPtr1+9040+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat205 = _mm512_maskz_loadu_ps(127, datPtr1+9936+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat206 = _mm512_maskz_loadu_ps(127, datPtr1+10832+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat207 = _mm512_maskz_loadu_ps(127, datPtr1+11728+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat208 = _mm512_maskz_loadu_ps(127, datPtr1+12624+602112*i6+200704*k8+896*h7+4*w7+0*b16);
__m512 dat209 = _mm512_maskz_loadu_ps(127, datPtr1+13520+602112*i6+200704*k8+896*h7+4*w7+0*b16);
// Cross-vector stage 1: sum and difference of loads that are 8 apart.
// Even-numbered loads (dat194, dat196, ...) feed the fft2521.. chain and
// odd-numbered loads (dat195, dat197, ...) feed the fft2609.. chain; the two
// chains are interleaved line by line from here on.
__m512 fft2521 = _mm512_add_ps(dat194, dat202);
__m512 fft2609 = _mm512_add_ps(dat195, dat203);
__m512 fft2522 = _mm512_sub_ps(dat194, dat202);
__m512 fft2610 = _mm512_sub_ps(dat195, dat203);
__m512 fft2523 = _mm512_add_ps(dat196, dat204);
__m512 fft2611 = _mm512_add_ps(dat197, dat205);
__m512 fft2524 = _mm512_sub_ps(dat196, dat204);
__m512 fft2612 = _mm512_sub_ps(dat197, dat205);
__m512 fft2525 = _mm512_add_ps(dat198, dat206);
__m512 fft2613 = _mm512_add_ps(dat199, dat207);
__m512 fft2526 = _mm512_sub_ps(dat198, dat206);
__m512 fft2614 = _mm512_sub_ps(dat199, dat207);
__m512 fft2527 = _mm512_add_ps(dat200, dat208);
__m512 fft2615 = _mm512_add_ps(dat201, dat209);
__m512 fft2528 = _mm512_sub_ps(dat200, dat208);
__m512 fft2616 = _mm512_sub_ps(dat201, dat209);
// Cross-vector stage 2: combine the stage-1 results pairwise.
__m512 fft2529 = _mm512_add_ps(fft2521, fft2525);
__m512 fft2617 = _mm512_add_ps(fft2609, fft2613);
__m512 fft2530 = _mm512_sub_ps(fft2521, fft2525);
__m512 fft2618 = _mm512_sub_ps(fft2609, fft2613);
__m512 fft2531 = _mm512_add_ps(fft2523, fft2527);
__m512 fft2619 = _mm512_add_ps(fft2611, fft2615);
__m512 fft2532 = _mm512_sub_ps(fft2527, fft2523);
__m512 fft2620 = _mm512_sub_ps(fft2615, fft2611);
__m512 fft2533 = _mm512_sub_ps(fft2524, fft2528);
__m512 fft2621 = _mm512_sub_ps(fft2612, fft2616);
__m512 fft2534 = _mm512_add_ps(fft2524, fft2528);
__m512 fft2622 = _mm512_add_ps(fft2612, fft2616);
// Cross-vector stage 3: final sums/differences, plus fused multiply-adds
// that scale by 7.0710677e-01f (approximately 1/sqrt(2)).
// fnmsub computes -(a*b)-c; fnmadd computes -(a*b)+c.
__m512 fft2535 = _mm512_add_ps(fft2529, fft2531);
__m512 fft2623 = _mm512_add_ps(fft2617, fft2619);
__m512 fft2536 = _mm512_sub_ps(fft2529, fft2531);
__m512 fft2624 = _mm512_sub_ps(fft2617, fft2619);
__m512 fft2537 = _mm512_fmadd_ps(fft2533, _mm512_set1_ps(7.0710677e-01f), fft2522);
__m512 fft2625 = _mm512_fmadd_ps(fft2621, _mm512_set1_ps(7.0710677e-01f), fft2610);
__m512 fft2538 = _mm512_fnmsub_ps(fft2534, _mm512_set1_ps(7.0710677e-01f), fft2526);
__m512 fft2626 = _mm512_fnmsub_ps(fft2622, _mm512_set1_ps(7.0710677e-01f), fft2614);
__m512 fft2539 = _mm512_fnmadd_ps(fft2533, _mm512_set1_ps(7.0710677e-01f), fft2522);
__m512 fft2627 = _mm512_fnmadd_ps(fft2621, _mm512_set1_ps(7.0710677e-01f), fft2610);
__m512 fft2540 = _mm512_fnmadd_ps(fft2534, _mm512_set1_ps(7.0710677e-01f), fft2526);
__m512 fft2628 = _mm512_fnmadd_ps(fft2622, _mm512_set1_ps(7.0710677e-01f), fft2614);
// In-register stage A: shuffle_f32x4 with imm 78 swaps the two 256-bit
// halves; with the sign vector (+1 in lanes 0..7, -1 in lanes 8..15) each
// result holds low+high in lanes 0..7 and low-high in lanes 8..15.
__m512 fft2541 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2542 = _mm512_fmadd_ps(fft2535, fft2541, _mm512_shuffle_f32x4(fft2535, fft2535, 78));
__m512 fft2629 = _mm512_fmadd_ps(fft2623, fft2541, _mm512_shuffle_f32x4(fft2623, fft2623, 78));
__m512 fft2543 = _mm512_fmadd_ps(fft2536, fft2541, _mm512_shuffle_f32x4(fft2536, fft2536, 78));
__m512 fft2630 = _mm512_fmadd_ps(fft2624, fft2541, _mm512_shuffle_f32x4(fft2624, fft2624, 78));
__m512 fft2544 = _mm512_fmadd_ps(fft2537, fft2541, _mm512_shuffle_f32x4(fft2537, fft2537, 78));
__m512 fft2631 = _mm512_fmadd_ps(fft2625, fft2541, _mm512_shuffle_f32x4(fft2625, fft2625, 78));
__m512 fft2545 = _mm512_fmadd_ps(fft2538, fft2541, _mm512_shuffle_f32x4(fft2538, fft2538, 78));
__m512 fft2632 = _mm512_fmadd_ps(fft2626, fft2541, _mm512_shuffle_f32x4(fft2626, fft2626, 78));
__m512 fft2546 = _mm512_fmadd_ps(fft2530, fft2541, _mm512_shuffle_f32x4(fft2530, fft2530, 78));
__m512 fft2633 = _mm512_fmadd_ps(fft2618, fft2541, _mm512_shuffle_f32x4(fft2618, fft2618, 78));
__m512 fft2547 = _mm512_fmadd_ps(fft2532, fft2541, _mm512_shuffle_f32x4(fft2532, fft2532, 78));
__m512 fft2634 = _mm512_fmadd_ps(fft2620, fft2541, _mm512_shuffle_f32x4(fft2620, fft2620, 78));
__m512 fft2548 = _mm512_fmadd_ps(fft2539, fft2541, _mm512_shuffle_f32x4(fft2539, fft2539, 78));
__m512 fft2635 = _mm512_fmadd_ps(fft2627, fft2541, _mm512_shuffle_f32x4(fft2627, fft2627, 78));
__m512 fft2549 = _mm512_fmadd_ps(fft2540, fft2541, _mm512_shuffle_f32x4(fft2540, fft2540, 78));
__m512 fft2636 = _mm512_fmadd_ps(fft2628, fft2541, _mm512_shuffle_f32x4(fft2628, fft2628, 78));
// Per-lane constant multiply, part 1: scale every stage-A result by fft2550.
// fft2550 is 1 in lanes 0..9, so those lanes are unchanged by this product.
__m512 fft2550 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2551 = _mm512_mul_ps(fft2542, fft2550);
__m512 fft2637 = _mm512_mul_ps(fft2629, fft2550);
__m512 fft2552 = _mm512_mul_ps(fft2543, fft2550);
__m512 fft2638 = _mm512_mul_ps(fft2630, fft2550);
__m512 fft2553 = _mm512_mul_ps(fft2544, fft2550);
__m512 fft2639 = _mm512_mul_ps(fft2631, fft2550);
__m512 fft2554 = _mm512_mul_ps(fft2545, fft2550);
__m512 fft2640 = _mm512_mul_ps(fft2632, fft2550);
__m512 fft2555 = _mm512_mul_ps(fft2546, fft2550);
__m512 fft2641 = _mm512_mul_ps(fft2633, fft2550);
__m512 fft2556 = _mm512_mul_ps(fft2547, fft2550);
__m512 fft2642 = _mm512_mul_ps(fft2634, fft2550);
__m512 fft2557 = _mm512_mul_ps(fft2548, fft2550);
__m512 fft2643 = _mm512_mul_ps(fft2635, fft2550);
__m512 fft2558 = _mm512_mul_ps(fft2549, fft2550);
__m512 fft2644 = _mm512_mul_ps(fft2636, fft2550);
// Per-lane constant multiply, part 2: each vector pair (e.g. fft2542/fft2543)
// is cross-combined using fft2559, which is 0 in lanes 0..9, so only
// lanes 10..15 receive a contribution from the partner vector.
__m512 fft2559 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft2560 = _mm512_fmadd_ps(fft2543, fft2559, fft2551);
__m512 fft2645 = _mm512_fmadd_ps(fft2630, fft2559, fft2637);
__m512 fft2561 = _mm512_fnmadd_ps(fft2542, fft2559, fft2552);
__m512 fft2646 = _mm512_fnmadd_ps(fft2629, fft2559, fft2638);
__m512 fft2562 = _mm512_fmadd_ps(fft2545, fft2559, fft2553);
__m512 fft2647 = _mm512_fmadd_ps(fft2632, fft2559, fft2639);
__m512 fft2563 = _mm512_fnmadd_ps(fft2544, fft2559, fft2554);
__m512 fft2648 = _mm512_fnmadd_ps(fft2631, fft2559, fft2640);
__m512 fft2564 = _mm512_fmadd_ps(fft2547, fft2559, fft2555);
__m512 fft2649 = _mm512_fmadd_ps(fft2634, fft2559, fft2641);
__m512 fft2565 = _mm512_fnmadd_ps(fft2546, fft2559, fft2556);
__m512 fft2650 = _mm512_fnmadd_ps(fft2633, fft2559, fft2642);
__m512 fft2566 = _mm512_fmadd_ps(fft2549, fft2559, fft2557);
__m512 fft2651 = _mm512_fmadd_ps(fft2636, fft2559, fft2643);
__m512 fft2567 = _mm512_fnmadd_ps(fft2548, fft2559, fft2558);
__m512 fft2652 = _mm512_fnmadd_ps(fft2635, fft2559, fft2644);
// In-register stage B: shuffle_f32x4 with imm 177 swaps adjacent 128-bit
// lanes; the sign vector is +1 in lanes 0..3 and 8..11, -1 in lanes 4..7
// and 12..15.
__m512 fft2568 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft2569 = _mm512_fmadd_ps(fft2560, fft2568, _mm512_shuffle_f32x4(fft2560, fft2560, 177));
__m512 fft2653 = _mm512_fmadd_ps(fft2645, fft2568, _mm512_shuffle_f32x4(fft2645, fft2645, 177));
__m512 fft2570 = _mm512_fmadd_ps(fft2561, fft2568, _mm512_shuffle_f32x4(fft2561, fft2561, 177));
__m512 fft2654 = _mm512_fmadd_ps(fft2646, fft2568, _mm512_shuffle_f32x4(fft2646, fft2646, 177));
__m512 fft2571 = _mm512_fmadd_ps(fft2562, fft2568, _mm512_shuffle_f32x4(fft2562, fft2562, 177));
__m512 fft2655 = _mm512_fmadd_ps(fft2647, fft2568, _mm512_shuffle_f32x4(fft2647, fft2647, 177));
__m512 fft2572 = _mm512_fmadd_ps(fft2563, fft2568, _mm512_shuffle_f32x4(fft2563, fft2563, 177));
__m512 fft2656 = _mm512_fmadd_ps(fft2648, fft2568, _mm512_shuffle_f32x4(fft2648, fft2648, 177));
__m512 fft2573 = _mm512_fmadd_ps(fft2564, fft2568, _mm512_shuffle_f32x4(fft2564, fft2564, 177));
__m512 fft2657 = _mm512_fmadd_ps(fft2649, fft2568, _mm512_shuffle_f32x4(fft2649, fft2649, 177));
__m512 fft2574 = _mm512_fmadd_ps(fft2565, fft2568, _mm512_shuffle_f32x4(fft2565, fft2565, 177));
__m512 fft2658 = _mm512_fmadd_ps(fft2650, fft2568, _mm512_shuffle_f32x4(fft2650, fft2650, 177));
__m512 fft2575 = _mm512_fmadd_ps(fft2566, fft2568, _mm512_shuffle_f32x4(fft2566, fft2566, 177));
__m512 fft2659 = _mm512_fmadd_ps(fft2651, fft2568, _mm512_shuffle_f32x4(fft2651, fft2651, 177));
__m512 fft2576 = _mm512_fmadd_ps(fft2567, fft2568, _mm512_shuffle_f32x4(fft2567, fft2567, 177));
__m512 fft2660 = _mm512_fmadd_ps(fft2652, fft2568, _mm512_shuffle_f32x4(fft2652, fft2652, 177));
// Pairwise exchange on mask 49344 (0xC0C0 = lanes 6, 7, 14, 15): the first
// vector of each pair takes those lanes from its partner, and the partner
// takes the negated lanes of the first (0 - x); other lanes are unchanged.
__m512 fft2577 = _mm512_mask_mov_ps(fft2569, 49344, fft2570);
__m512 fft2661 = _mm512_mask_mov_ps(fft2653, 49344, fft2654);
__m512 fft2578 = _mm512_mask_sub_ps(fft2570, 49344, _mm512_setzero_ps(), fft2569);
__m512 fft2662 = _mm512_mask_sub_ps(fft2654, 49344, _mm512_setzero_ps(), fft2653);
__m512 fft2579 = _mm512_mask_mov_ps(fft2571, 49344, fft2572);
__m512 fft2663 = _mm512_mask_mov_ps(fft2655, 49344, fft2656);
__m512 fft2580 = _mm512_mask_sub_ps(fft2572, 49344, _mm512_setzero_ps(), fft2571);
__m512 fft2664 = _mm512_mask_sub_ps(fft2656, 49344, _mm512_setzero_ps(), fft2655);
__m512 fft2581 = _mm512_mask_mov_ps(fft2573, 49344, fft2574);
__m512 fft2665 = _mm512_mask_mov_ps(fft2657, 49344, fft2658);
__m512 fft2582 = _mm512_mask_sub_ps(fft2574, 49344, _mm512_setzero_ps(), fft2573);
__m512 fft2666 = _mm512_mask_sub_ps(fft2658, 49344, _mm512_setzero_ps(), fft2657);
__m512 fft2583 = _mm512_mask_mov_ps(fft2575, 49344, fft2576);
__m512 fft2667 = _mm512_mask_mov_ps(fft2659, 49344, fft2660);
__m512 fft2584 = _mm512_mask_sub_ps(fft2576, 49344, _mm512_setzero_ps(), fft2575);
__m512 fft2668 = _mm512_mask_sub_ps(fft2660, 49344, _mm512_setzero_ps(), fft2659);
// In-register stage C: shuffle_ps with imm 78 swaps the two 64-bit halves
// inside every 128-bit lane; the sign vector is +1 on elements 0,1 and -1
// on elements 2,3 of each 128-bit lane.
__m512 fft2585 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft2586 = _mm512_fmadd_ps(fft2577, fft2585, _mm512_shuffle_ps(fft2577, fft2577, 78));
__m512 fft2669 = _mm512_fmadd_ps(fft2661, fft2585, _mm512_shuffle_ps(fft2661, fft2661, 78));
__m512 fft2587 = _mm512_fmadd_ps(fft2578, fft2585, _mm512_shuffle_ps(fft2578, fft2578, 78));
__m512 fft2670 = _mm512_fmadd_ps(fft2662, fft2585, _mm512_shuffle_ps(fft2662, fft2662, 78));
__m512 fft2588 = _mm512_fmadd_ps(fft2579, fft2585, _mm512_shuffle_ps(fft2579, fft2579, 78));
__m512 fft2671 = _mm512_fmadd_ps(fft2663, fft2585, _mm512_shuffle_ps(fft2663, fft2663, 78));
__m512 fft2589 = _mm512_fmadd_ps(fft2580, fft2585, _mm512_shuffle_ps(fft2580, fft2580, 78));
__m512 fft2672 = _mm512_fmadd_ps(fft2664, fft2585, _mm512_shuffle_ps(fft2664, fft2664, 78));
__m512 fft2590 = _mm512_fmadd_ps(fft2581, fft2585, _mm512_shuffle_ps(fft2581, fft2581, 78));
__m512 fft2673 = _mm512_fmadd_ps(fft2665, fft2585, _mm512_shuffle_ps(fft2665, fft2665, 78));
__m512 fft2591 = _mm512_fmadd_ps(fft2582, fft2585, _mm512_shuffle_ps(fft2582, fft2582, 78));
__m512 fft2674 = _mm512_fmadd_ps(fft2666, fft2585, _mm512_shuffle_ps(fft2666, fft2666, 78));
__m512 fft2592 = _mm512_fmadd_ps(fft2583, fft2585, _mm512_shuffle_ps(fft2583, fft2583, 78));
__m512 fft2675 = _mm512_fmadd_ps(fft2667, fft2585, _mm512_shuffle_ps(fft2667, fft2667, 78));
__m512 fft2593 = _mm512_fmadd_ps(fft2584, fft2585, _mm512_shuffle_ps(fft2584, fft2584, 78));
__m512 fft2676 = _mm512_fmadd_ps(fft2668, fft2585, _mm512_shuffle_ps(fft2668, fft2668, 78));
// Extra post-processing applied only to the first vector pair of each chain
// (fft2586/fft2587 and fft2669/fft2670): two index-table permutes, a
// +/-1/0 weighted combine, mask blends, and a final masked multiply by
// 5e-01f on mask 64764 (0xFCFC = lanes 2..7 and 10..15).
// The remaining stage-C vectors go to the stores without this step.
__m512i fft2594 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft2595 = _mm512_permutexvar_ps(fft2594, fft2586);
__m512 fft2677 = _mm512_permutexvar_ps(fft2594, fft2669);
__m512i fft2596 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft2597 = _mm512_permutexvar_ps(fft2596, fft2586);
__m512 fft2678 = _mm512_permutexvar_ps(fft2596, fft2669);
__m512 fft2598 = _mm512_permutexvar_ps(fft2594, fft2587);
__m512 fft2679 = _mm512_permutexvar_ps(fft2594, fft2670);
__m512 fft2599 = _mm512_permutexvar_ps(fft2596, fft2587);
__m512 fft2680 = _mm512_permutexvar_ps(fft2596, fft2670);
__m512 fft2600 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft2601 = _mm512_fmadd_ps(fft2595, fft2600, fft2597);
__m512 fft2681 = _mm512_fmadd_ps(fft2677, fft2600, fft2678);
__m512 fft2602 = _mm512_fnmadd_ps(fft2599, fft2600, fft2598);
__m512 fft2682 = _mm512_fnmadd_ps(fft2680, fft2600, fft2679);
__m512 fft2603 = _mm512_mask_mov_ps(fft2599, 21845, fft2601);
__m512 fft2683 = _mm512_mask_mov_ps(fft2680, 21845, fft2681);
__m512 fft2604 = _mm512_mask_mov_ps(fft2595, 43176, fft2601);
__m512 fft2684 = _mm512_mask_mov_ps(fft2677, 43176, fft2681);
__m512 fft2605 = _mm512_mask_mov_ps(fft2603, 43176, fft2602);
__m512 fft2685 = _mm512_mask_mov_ps(fft2683, 43176, fft2682);
__m512 fft2606 = _mm512_mask_mov_ps(fft2604, 22102, fft2602);
__m512 fft2686 = _mm512_mask_mov_ps(fft2684, 22102, fft2682);
__m512 fft2607 = _mm512_mask_mul_ps(fft2605, 64764, fft2605, _mm512_set1_ps(5e-01f));
__m512 fft2687 = _mm512_mask_mul_ps(fft2685, 64764, fft2685, _mm512_set1_ps(5e-01f));
__m512 fft2608 = _mm512_mask_mul_ps(fft2606, 64764, fft2606, _mm512_set1_ps(5e-01f));
__m512 fft2688 = _mm512_mask_mul_ps(fft2686, 64764, fft2686, _mm512_set1_ps(5e-01f));
// Rename results to df209..df224: df209..df216 come from the first chain,
// df217..df224 from the second chain.
__m512 df209 = fft2607;
__m512 df217 = fft2687;
__m512 df210 = fft2608;
__m512 df218 = fft2688;
__m512 df211 = fft2588;
__m512 df219 = fft2671;
__m512 df212 = fft2589;
__m512 df220 = fft2672;
__m512 df213 = fft2590;
__m512 df221 = fft2673;
__m512 df214 = fft2591;
__m512 df222 = fft2674;
__m512 df215 = fft2592;
__m512 df223 = fft2675;
__m512 df216 = fft2593;
__m512 df224 = fft2676;
// eo16 moves even-indexed lanes into lanes 0..7 and odd-indexed lanes into
// lanes 8..15. It is applied to every df vector except df209, df210, df217
// and df218, which are stored as-is at the end.
// Each vector is then written with two masked stores: mask 255 (lanes 0..7)
// and mask 65280 (lanes 8..15) to a second destination.
// NOTE(review): if dfPtr1 is a byte pointer, the "-32" folded into the
// second offsets (e.g. 508768 = 101760 + 407040 - 32) cancels the 32-byte
// position of lanes 8..15, placing them 407040 past the first store --
// confirm against dfPtr1's declaration.
__m512i eo16 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df211 = _mm512_permutexvar_ps(eo16, df211);
df212 = _mm512_permutexvar_ps(eo16, df212);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df211);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df212);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df211);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df212);
df219 = _mm512_permutexvar_ps(eo16, df219);
df220 = _mm512_permutexvar_ps(eo16, df220);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df219);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df220);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df219);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df220);
df213 = _mm512_permutexvar_ps(eo16, df213);
df214 = _mm512_permutexvar_ps(eo16, df214);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df213);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df214);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df213);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df214);
df221 = _mm512_permutexvar_ps(eo16, df221);
df222 = _mm512_permutexvar_ps(eo16, df222);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df221);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df222);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df221);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df222);
df215 = _mm512_permutexvar_ps(eo16, df215);
df216 = _mm512_permutexvar_ps(eo16, df216);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df215);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df216);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df215);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df216);
df223 = _mm512_permutexvar_ps(eo16, df223);
df224 = _mm512_permutexvar_ps(eo16, df224);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df223);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df224);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df223);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df224);
// The specially post-processed vectors are stored without the eo16 permute.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df209);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df210);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df209);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df210);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df217);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df218);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df217);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df218);
// ---------------------------------------------------------------------------
// Unrolled step for the fixed index b17 = 3.
// Loads 16 masked vectors from datPtr1, pushes them through a sequence of
// add/sub and multiply-add stages (the fft* temporaries), and scatters the
// results into dfPtr1 with half-vector masked stores.
// NOTE(review): the fft* names suggest a Fourier transform of an input tile;
// the types and layouts of datPtr1/dfPtr1 and the meaning of i6, j2, k8, h7,
// w7 are defined outside this view -- confirm against the generator.
// ---------------------------------------------------------------------------
ptrdiff_t b17 = 3;
// With b17 == 3 these evaluate to m17 == 1 and f18 == 1; they are only used
// in the dfPtr1 store offsets below (128*m17 + 32*f18).
ptrdiff_t m17 = (size_t)b17/2;
ptrdiff_t f18 = (size_t)b17%2;
// Sixteen masked loads whose constant offsets are 896 apart. Mask 65528
// (0xFFF8) enables lanes 3..15; lanes 0..2 are zeroed rather than loaded.
// The b17 term is multiplied by 0, so it does not affect the address.
// NOTE(review): the zeroed leading lanes look like edge padding -- confirm.
__m512 dat210 = _mm512_maskz_loadu_ps(65528, datPtr1+8160+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat211 = _mm512_maskz_loadu_ps(65528, datPtr1+9056+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat212 = _mm512_maskz_loadu_ps(65528, datPtr1+9952+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat213 = _mm512_maskz_loadu_ps(65528, datPtr1+10848+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat214 = _mm512_maskz_loadu_ps(65528, datPtr1+11744+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat215 = _mm512_maskz_loadu_ps(65528, datPtr1+12640+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat216 = _mm512_maskz_loadu_ps(65528, datPtr1+13536+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat217 = _mm512_maskz_loadu_ps(65528, datPtr1+14432+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat218 = _mm512_maskz_loadu_ps(65528, datPtr1+15328+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat219 = _mm512_maskz_loadu_ps(65528, datPtr1+16224+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat220 = _mm512_maskz_loadu_ps(65528, datPtr1+17120+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat221 = _mm512_maskz_loadu_ps(65528, datPtr1+18016+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat222 = _mm512_maskz_loadu_ps(65528, datPtr1+18912+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat223 = _mm512_maskz_loadu_ps(65528, datPtr1+19808+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat224 = _mm512_maskz_loadu_ps(65528, datPtr1+20704+602112*i6+200704*k8+896*h7+4*w7+0*b17);
__m512 dat225 = _mm512_maskz_loadu_ps(65528, datPtr1+21600+602112*i6+200704*k8+896*h7+4*w7+0*b17);
// Cross-vector stage 1: sum and difference of load k and load k+8.
// Two independent chains run interleaved line by line with identical
// operations: the even-numbered loads (dat210, dat212, ...) feed
// fft2689..fft2776, the odd-numbered loads (dat211, dat213, ...) feed
// fft2777..fft2856. Both chains share the constant vectors declared below.
__m512 fft2689 = _mm512_add_ps(dat210, dat218);
__m512 fft2777 = _mm512_add_ps(dat211, dat219);
__m512 fft2690 = _mm512_sub_ps(dat210, dat218);
__m512 fft2778 = _mm512_sub_ps(dat211, dat219);
__m512 fft2691 = _mm512_add_ps(dat212, dat220);
__m512 fft2779 = _mm512_add_ps(dat213, dat221);
__m512 fft2692 = _mm512_sub_ps(dat212, dat220);
__m512 fft2780 = _mm512_sub_ps(dat213, dat221);
__m512 fft2693 = _mm512_add_ps(dat214, dat222);
__m512 fft2781 = _mm512_add_ps(dat215, dat223);
__m512 fft2694 = _mm512_sub_ps(dat214, dat222);
__m512 fft2782 = _mm512_sub_ps(dat215, dat223);
__m512 fft2695 = _mm512_add_ps(dat216, dat224);
__m512 fft2783 = _mm512_add_ps(dat217, dat225);
__m512 fft2696 = _mm512_sub_ps(dat216, dat224);
__m512 fft2784 = _mm512_sub_ps(dat217, dat225);
// Cross-vector stage 2: sums/differences of the stage-1 results.
// Note the reversed operand order in fft2700/fft2788 (second minus first).
__m512 fft2697 = _mm512_add_ps(fft2689, fft2693);
__m512 fft2785 = _mm512_add_ps(fft2777, fft2781);
__m512 fft2698 = _mm512_sub_ps(fft2689, fft2693);
__m512 fft2786 = _mm512_sub_ps(fft2777, fft2781);
__m512 fft2699 = _mm512_add_ps(fft2691, fft2695);
__m512 fft2787 = _mm512_add_ps(fft2779, fft2783);
__m512 fft2700 = _mm512_sub_ps(fft2695, fft2691);
__m512 fft2788 = _mm512_sub_ps(fft2783, fft2779);
__m512 fft2701 = _mm512_sub_ps(fft2692, fft2696);
__m512 fft2789 = _mm512_sub_ps(fft2780, fft2784);
__m512 fft2702 = _mm512_add_ps(fft2692, fft2696);
__m512 fft2790 = _mm512_add_ps(fft2780, fft2784);
// Cross-vector stage 3. The constant 7.0710677e-01f is approximately
// 1/sqrt(2). Intrinsic forms used here: fmadd(a,b,c) = a*b + c,
// fnmadd(a,b,c) = -(a*b) + c, fnmsub(a,b,c) = -(a*b) - c.
__m512 fft2703 = _mm512_add_ps(fft2697, fft2699);
__m512 fft2791 = _mm512_add_ps(fft2785, fft2787);
__m512 fft2704 = _mm512_sub_ps(fft2697, fft2699);
__m512 fft2792 = _mm512_sub_ps(fft2785, fft2787);
__m512 fft2705 = _mm512_fmadd_ps(fft2701, _mm512_set1_ps(7.0710677e-01f), fft2690);
__m512 fft2793 = _mm512_fmadd_ps(fft2789, _mm512_set1_ps(7.0710677e-01f), fft2778);
__m512 fft2706 = _mm512_fnmsub_ps(fft2702, _mm512_set1_ps(7.0710677e-01f), fft2694);
__m512 fft2794 = _mm512_fnmsub_ps(fft2790, _mm512_set1_ps(7.0710677e-01f), fft2782);
__m512 fft2707 = _mm512_fnmadd_ps(fft2701, _mm512_set1_ps(7.0710677e-01f), fft2690);
__m512 fft2795 = _mm512_fnmadd_ps(fft2789, _mm512_set1_ps(7.0710677e-01f), fft2778);
__m512 fft2708 = _mm512_fnmadd_ps(fft2702, _mm512_set1_ps(7.0710677e-01f), fft2694);
__m512 fft2796 = _mm512_fnmadd_ps(fft2790, _mm512_set1_ps(7.0710677e-01f), fft2782);
// In-register stage A. fft2709 is +1 in lanes 0..7 and -1 in lanes 8..15
// (_mm512_set_ps lists lane 15 first). _mm512_shuffle_f32x4 with imm 78
// swaps the two 256-bit halves, so each fmadd yields (low + high) in
// lanes 0..7 and (low - high) in lanes 8..15.
__m512 fft2709 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2710 = _mm512_fmadd_ps(fft2703, fft2709, _mm512_shuffle_f32x4(fft2703, fft2703, 78));
__m512 fft2797 = _mm512_fmadd_ps(fft2791, fft2709, _mm512_shuffle_f32x4(fft2791, fft2791, 78));
__m512 fft2711 = _mm512_fmadd_ps(fft2704, fft2709, _mm512_shuffle_f32x4(fft2704, fft2704, 78));
__m512 fft2798 = _mm512_fmadd_ps(fft2792, fft2709, _mm512_shuffle_f32x4(fft2792, fft2792, 78));
__m512 fft2712 = _mm512_fmadd_ps(fft2705, fft2709, _mm512_shuffle_f32x4(fft2705, fft2705, 78));
__m512 fft2799 = _mm512_fmadd_ps(fft2793, fft2709, _mm512_shuffle_f32x4(fft2793, fft2793, 78));
__m512 fft2713 = _mm512_fmadd_ps(fft2706, fft2709, _mm512_shuffle_f32x4(fft2706, fft2706, 78));
__m512 fft2800 = _mm512_fmadd_ps(fft2794, fft2709, _mm512_shuffle_f32x4(fft2794, fft2794, 78));
__m512 fft2714 = _mm512_fmadd_ps(fft2698, fft2709, _mm512_shuffle_f32x4(fft2698, fft2698, 78));
__m512 fft2801 = _mm512_fmadd_ps(fft2786, fft2709, _mm512_shuffle_f32x4(fft2786, fft2786, 78));
__m512 fft2715 = _mm512_fmadd_ps(fft2700, fft2709, _mm512_shuffle_f32x4(fft2700, fft2700, 78));
__m512 fft2802 = _mm512_fmadd_ps(fft2788, fft2709, _mm512_shuffle_f32x4(fft2788, fft2788, 78));
__m512 fft2716 = _mm512_fmadd_ps(fft2707, fft2709, _mm512_shuffle_f32x4(fft2707, fft2707, 78));
__m512 fft2803 = _mm512_fmadd_ps(fft2795, fft2709, _mm512_shuffle_f32x4(fft2795, fft2795, 78));
__m512 fft2717 = _mm512_fmadd_ps(fft2708, fft2709, _mm512_shuffle_f32x4(fft2708, fft2708, 78));
__m512 fft2804 = _mm512_fmadd_ps(fft2796, fft2709, _mm512_shuffle_f32x4(fft2796, fft2796, 78));
// Per-lane factor c (fft2718): 1 in lanes 0..9, 0.70710677 in lanes 10..11,
// 0 in lanes 12..13, -0.70710677 in lanes 14..15. First half of the
// pairwise rotation below: every stage-A result is multiplied by c.
__m512 fft2718 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2719 = _mm512_mul_ps(fft2710, fft2718);
__m512 fft2805 = _mm512_mul_ps(fft2797, fft2718);
__m512 fft2720 = _mm512_mul_ps(fft2711, fft2718);
__m512 fft2806 = _mm512_mul_ps(fft2798, fft2718);
__m512 fft2721 = _mm512_mul_ps(fft2712, fft2718);
__m512 fft2807 = _mm512_mul_ps(fft2799, fft2718);
__m512 fft2722 = _mm512_mul_ps(fft2713, fft2718);
__m512 fft2808 = _mm512_mul_ps(fft2800, fft2718);
__m512 fft2723 = _mm512_mul_ps(fft2714, fft2718);
__m512 fft2809 = _mm512_mul_ps(fft2801, fft2718);
__m512 fft2724 = _mm512_mul_ps(fft2715, fft2718);
__m512 fft2810 = _mm512_mul_ps(fft2802, fft2718);
__m512 fft2725 = _mm512_mul_ps(fft2716, fft2718);
__m512 fft2811 = _mm512_mul_ps(fft2803, fft2718);
__m512 fft2726 = _mm512_mul_ps(fft2717, fft2718);
__m512 fft2812 = _mm512_mul_ps(fft2804, fft2718);
// Per-lane factor s (fft2727): 0 in lanes 0..9, 0.70710677 in lanes 10..11,
// 1 in lanes 12..13, 0.70710677 in lanes 14..15. Each adjacent pair (a, b)
// of stage-A results becomes (a*c + b*s, b*c - a*s), which has the form of
// the complex product (a + i*b) * (c - i*s).
// NOTE(review): reading (a, b) as real/imaginary parts is an inference.
__m512 fft2727 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft2728 = _mm512_fmadd_ps(fft2711, fft2727, fft2719);
__m512 fft2813 = _mm512_fmadd_ps(fft2798, fft2727, fft2805);
__m512 fft2729 = _mm512_fnmadd_ps(fft2710, fft2727, fft2720);
__m512 fft2814 = _mm512_fnmadd_ps(fft2797, fft2727, fft2806);
__m512 fft2730 = _mm512_fmadd_ps(fft2713, fft2727, fft2721);
__m512 fft2815 = _mm512_fmadd_ps(fft2800, fft2727, fft2807);
__m512 fft2731 = _mm512_fnmadd_ps(fft2712, fft2727, fft2722);
__m512 fft2816 = _mm512_fnmadd_ps(fft2799, fft2727, fft2808);
__m512 fft2732 = _mm512_fmadd_ps(fft2715, fft2727, fft2723);
__m512 fft2817 = _mm512_fmadd_ps(fft2802, fft2727, fft2809);
__m512 fft2733 = _mm512_fnmadd_ps(fft2714, fft2727, fft2724);
__m512 fft2818 = _mm512_fnmadd_ps(fft2801, fft2727, fft2810);
__m512 fft2734 = _mm512_fmadd_ps(fft2717, fft2727, fft2725);
__m512 fft2819 = _mm512_fmadd_ps(fft2804, fft2727, fft2811);
__m512 fft2735 = _mm512_fnmadd_ps(fft2716, fft2727, fft2726);
__m512 fft2820 = _mm512_fnmadd_ps(fft2803, fft2727, fft2812);
// In-register stage B. fft2736 is +1 in lanes 0..3 and 8..11, -1 in lanes
// 4..7 and 12..15. _mm512_shuffle_f32x4 with imm 177 swaps the two 128-bit
// blocks inside each 256-bit half, so each fmadd yields (block0 + block1)
// in the even 128-bit blocks and (block0 - block1) in the odd ones.
__m512 fft2736 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft2737 = _mm512_fmadd_ps(fft2728, fft2736, _mm512_shuffle_f32x4(fft2728, fft2728, 177));
__m512 fft2821 = _mm512_fmadd_ps(fft2813, fft2736, _mm512_shuffle_f32x4(fft2813, fft2813, 177));
__m512 fft2738 = _mm512_fmadd_ps(fft2729, fft2736, _mm512_shuffle_f32x4(fft2729, fft2729, 177));
__m512 fft2822 = _mm512_fmadd_ps(fft2814, fft2736, _mm512_shuffle_f32x4(fft2814, fft2814, 177));
__m512 fft2739 = _mm512_fmadd_ps(fft2730, fft2736, _mm512_shuffle_f32x4(fft2730, fft2730, 177));
__m512 fft2823 = _mm512_fmadd_ps(fft2815, fft2736, _mm512_shuffle_f32x4(fft2815, fft2815, 177));
__m512 fft2740 = _mm512_fmadd_ps(fft2731, fft2736, _mm512_shuffle_f32x4(fft2731, fft2731, 177));
__m512 fft2824 = _mm512_fmadd_ps(fft2816, fft2736, _mm512_shuffle_f32x4(fft2816, fft2816, 177));
__m512 fft2741 = _mm512_fmadd_ps(fft2732, fft2736, _mm512_shuffle_f32x4(fft2732, fft2732, 177));
__m512 fft2825 = _mm512_fmadd_ps(fft2817, fft2736, _mm512_shuffle_f32x4(fft2817, fft2817, 177));
__m512 fft2742 = _mm512_fmadd_ps(fft2733, fft2736, _mm512_shuffle_f32x4(fft2733, fft2733, 177));
__m512 fft2826 = _mm512_fmadd_ps(fft2818, fft2736, _mm512_shuffle_f32x4(fft2818, fft2818, 177));
__m512 fft2743 = _mm512_fmadd_ps(fft2734, fft2736, _mm512_shuffle_f32x4(fft2734, fft2734, 177));
__m512 fft2827 = _mm512_fmadd_ps(fft2819, fft2736, _mm512_shuffle_f32x4(fft2819, fft2819, 177));
__m512 fft2744 = _mm512_fmadd_ps(fft2735, fft2736, _mm512_shuffle_f32x4(fft2735, fft2735, 177));
__m512 fft2828 = _mm512_fmadd_ps(fft2820, fft2736, _mm512_shuffle_f32x4(fft2820, fft2820, 177));
// Pairwise lane exchange under mask 49344 (0xC0C0 = lanes 6, 7, 14, 15):
// the mask_mov result is the first vector of the pair with those lanes
// taken from the second; the mask_sub result is the second vector with
// those lanes replaced by the negated first vector (0 - x).
__m512 fft2745 = _mm512_mask_mov_ps(fft2737, 49344, fft2738);
__m512 fft2829 = _mm512_mask_mov_ps(fft2821, 49344, fft2822);
__m512 fft2746 = _mm512_mask_sub_ps(fft2738, 49344, _mm512_setzero_ps(), fft2737);
__m512 fft2830 = _mm512_mask_sub_ps(fft2822, 49344, _mm512_setzero_ps(), fft2821);
__m512 fft2747 = _mm512_mask_mov_ps(fft2739, 49344, fft2740);
__m512 fft2831 = _mm512_mask_mov_ps(fft2823, 49344, fft2824);
__m512 fft2748 = _mm512_mask_sub_ps(fft2740, 49344, _mm512_setzero_ps(), fft2739);
__m512 fft2832 = _mm512_mask_sub_ps(fft2824, 49344, _mm512_setzero_ps(), fft2823);
__m512 fft2749 = _mm512_mask_mov_ps(fft2741, 49344, fft2742);
__m512 fft2833 = _mm512_mask_mov_ps(fft2825, 49344, fft2826);
__m512 fft2750 = _mm512_mask_sub_ps(fft2742, 49344, _mm512_setzero_ps(), fft2741);
__m512 fft2834 = _mm512_mask_sub_ps(fft2826, 49344, _mm512_setzero_ps(), fft2825);
__m512 fft2751 = _mm512_mask_mov_ps(fft2743, 49344, fft2744);
__m512 fft2835 = _mm512_mask_mov_ps(fft2827, 49344, fft2828);
__m512 fft2752 = _mm512_mask_sub_ps(fft2744, 49344, _mm512_setzero_ps(), fft2743);
__m512 fft2836 = _mm512_mask_sub_ps(fft2828, 49344, _mm512_setzero_ps(), fft2827);
// In-register stage C. fft2753 is +1 in the low two lanes and -1 in the
// high two lanes of every 128-bit block. _mm512_shuffle_ps with imm 78
// swaps the two 64-bit halves of each 128-bit block, so each fmadd yields
// (low pair + high pair) in the low two lanes and (low pair - high pair)
// in the high two lanes of each block.
__m512 fft2753 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft2754 = _mm512_fmadd_ps(fft2745, fft2753, _mm512_shuffle_ps(fft2745, fft2745, 78));
__m512 fft2837 = _mm512_fmadd_ps(fft2829, fft2753, _mm512_shuffle_ps(fft2829, fft2829, 78));
__m512 fft2755 = _mm512_fmadd_ps(fft2746, fft2753, _mm512_shuffle_ps(fft2746, fft2746, 78));
__m512 fft2838 = _mm512_fmadd_ps(fft2830, fft2753, _mm512_shuffle_ps(fft2830, fft2830, 78));
__m512 fft2756 = _mm512_fmadd_ps(fft2747, fft2753, _mm512_shuffle_ps(fft2747, fft2747, 78));
__m512 fft2839 = _mm512_fmadd_ps(fft2831, fft2753, _mm512_shuffle_ps(fft2831, fft2831, 78));
__m512 fft2757 = _mm512_fmadd_ps(fft2748, fft2753, _mm512_shuffle_ps(fft2748, fft2748, 78));
__m512 fft2840 = _mm512_fmadd_ps(fft2832, fft2753, _mm512_shuffle_ps(fft2832, fft2832, 78));
__m512 fft2758 = _mm512_fmadd_ps(fft2749, fft2753, _mm512_shuffle_ps(fft2749, fft2749, 78));
__m512 fft2841 = _mm512_fmadd_ps(fft2833, fft2753, _mm512_shuffle_ps(fft2833, fft2833, 78));
__m512 fft2759 = _mm512_fmadd_ps(fft2750, fft2753, _mm512_shuffle_ps(fft2750, fft2750, 78));
__m512 fft2842 = _mm512_fmadd_ps(fft2834, fft2753, _mm512_shuffle_ps(fft2834, fft2834, 78));
__m512 fft2760 = _mm512_fmadd_ps(fft2751, fft2753, _mm512_shuffle_ps(fft2751, fft2751, 78));
__m512 fft2843 = _mm512_fmadd_ps(fft2835, fft2753, _mm512_shuffle_ps(fft2835, fft2835, 78));
__m512 fft2761 = _mm512_fmadd_ps(fft2752, fft2753, _mm512_shuffle_ps(fft2752, fft2752, 78));
__m512 fft2844 = _mm512_fmadd_ps(fft2836, fft2753, _mm512_shuffle_ps(fft2836, fft2836, 78));
// Extra post-processing applied only to the first pair of each chain
// (fft2754/fft2755 and fft2837/fft2838): two lane permutations of each
// vector, a multiply-add against fft2768, masked blends, then a scale by
// 0.5. The remaining stage-C outputs are used as they are.
// NOTE(review): presumably the special handling of the first/last frequency
// bins of a real-input transform -- confirm against the generator.
__m512i fft2762 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft2763 = _mm512_permutexvar_ps(fft2762, fft2754);
__m512 fft2845 = _mm512_permutexvar_ps(fft2762, fft2837);
__m512i fft2764 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft2765 = _mm512_permutexvar_ps(fft2764, fft2754);
__m512 fft2846 = _mm512_permutexvar_ps(fft2764, fft2837);
__m512 fft2766 = _mm512_permutexvar_ps(fft2762, fft2755);
__m512 fft2847 = _mm512_permutexvar_ps(fft2762, fft2838);
__m512 fft2767 = _mm512_permutexvar_ps(fft2764, fft2755);
__m512 fft2848 = _mm512_permutexvar_ps(fft2764, fft2838);
// fft2768 is 0 in lanes 0..1 and 8..9; elsewhere it alternates +1 (even
// lanes) and -1 (odd lanes).
__m512 fft2768 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft2769 = _mm512_fmadd_ps(fft2763, fft2768, fft2765);
__m512 fft2849 = _mm512_fmadd_ps(fft2845, fft2768, fft2846);
__m512 fft2770 = _mm512_fnmadd_ps(fft2767, fft2768, fft2766);
__m512 fft2850 = _mm512_fnmadd_ps(fft2848, fft2768, fft2847);
// Blend masks: 21845 = 0x5555 (even lanes); 43176 = 0xA8A8 (lanes 3, 5, 7,
// 11, 13, 15); 22102 = 0x5656 (lanes 1, 2, 4, 6, 9, 10, 12, 14).
__m512 fft2771 = _mm512_mask_mov_ps(fft2767, 21845, fft2769);
__m512 fft2851 = _mm512_mask_mov_ps(fft2848, 21845, fft2849);
__m512 fft2772 = _mm512_mask_mov_ps(fft2763, 43176, fft2769);
__m512 fft2852 = _mm512_mask_mov_ps(fft2845, 43176, fft2849);
__m512 fft2773 = _mm512_mask_mov_ps(fft2771, 43176, fft2770);
__m512 fft2853 = _mm512_mask_mov_ps(fft2851, 43176, fft2850);
__m512 fft2774 = _mm512_mask_mov_ps(fft2772, 22102, fft2770);
__m512 fft2854 = _mm512_mask_mov_ps(fft2852, 22102, fft2850);
// Mask 64764 = 0xFCFC: halve lanes 2..7 and 10..15; lanes 0, 1, 8 and 9
// keep their value.
__m512 fft2775 = _mm512_mask_mul_ps(fft2773, 64764, fft2773, _mm512_set1_ps(5e-01f));
__m512 fft2855 = _mm512_mask_mul_ps(fft2853, 64764, fft2853, _mm512_set1_ps(5e-01f));
__m512 fft2776 = _mm512_mask_mul_ps(fft2774, 64764, fft2774, _mm512_set1_ps(5e-01f));
__m512 fft2856 = _mm512_mask_mul_ps(fft2854, 64764, fft2854, _mm512_set1_ps(5e-01f));
// Rename the 16 results for the store phase: df225..df232 come from the
// even-load chain, df233..df240 from the odd-load chain. df225/df226 and
// df233/df234 are the post-processed first pairs.
__m512 df225 = fft2775;
__m512 df233 = fft2855;
__m512 df226 = fft2776;
__m512 df234 = fft2856;
__m512 df227 = fft2756;
__m512 df235 = fft2839;
__m512 df228 = fft2757;
__m512 df236 = fft2840;
__m512 df229 = fft2758;
__m512 df237 = fft2841;
__m512 df230 = fft2759;
__m512 df238 = fft2842;
__m512 df231 = fft2760;
__m512 df239 = fft2843;
__m512 df232 = fft2761;
__m512 df240 = fft2844;
// eo17 gathers the even-numbered source lanes into lanes 0..7 and the
// odd-numbered source lanes into lanes 8..15. Each permuted vector is then
// written with two masked stores: mask 255 writes lanes 0..7 at the given
// address; mask 65280 writes lanes 8..15, which land 8 floats past the
// given address.
// NOTE(review): the meaning of the constant offsets and of the i6/j2/k8
// strides in dfPtr1 is defined outside this view.
__m512i eo17 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df227 = _mm512_permutexvar_ps(eo17, df227);
df228 = _mm512_permutexvar_ps(eo17, df228);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df227);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df228);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df227);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df228);
df235 = _mm512_permutexvar_ps(eo17, df235);
df236 = _mm512_permutexvar_ps(eo17, df236);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df235);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df236);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df235);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df236);
df229 = _mm512_permutexvar_ps(eo17, df229);
df230 = _mm512_permutexvar_ps(eo17, df230);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df229);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df230);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df229);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df230);
df237 = _mm512_permutexvar_ps(eo17, df237);
df238 = _mm512_permutexvar_ps(eo17, df238);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df237);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df238);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df237);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df238);
df231 = _mm512_permutexvar_ps(eo17, df231);
df232 = _mm512_permutexvar_ps(eo17, df232);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df231);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df232);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df231);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df232);
df239 = _mm512_permutexvar_ps(eo17, df239);
df240 = _mm512_permutexvar_ps(eo17, df240);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df239);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df240);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df239);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df240);
// The post-processed first pairs (df225/df226, df233/df234) are stored
// without the eo17 permutation, using the same low-half/high-half split.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df225);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df226);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df225);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df226);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df233);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df234);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df233);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df234);
for (ptrdiff_t b18 = 4; b18 < 6; ++b18) {
ptrdiff_t m18 = (size_t)b18/2;
ptrdiff_t f19 = (size_t)b18%2;
__m512 dat226 = _mm512_maskz_loadu_ps(65535, datPtr1+8040+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat227 = _mm512_maskz_loadu_ps(65535, datPtr1+8936+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat228 = _mm512_maskz_loadu_ps(65535, datPtr1+9832+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat229 = _mm512_maskz_loadu_ps(65535, datPtr1+10728+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat230 = _mm512_maskz_loadu_ps(65535, datPtr1+11624+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat231 = _mm512_maskz_loadu_ps(65535, datPtr1+12520+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat232 = _mm512_maskz_loadu_ps(65535, datPtr1+13416+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat233 = _mm512_maskz_loadu_ps(65535, datPtr1+14312+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat234 = _mm512_maskz_loadu_ps(65535, datPtr1+15208+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat235 = _mm512_maskz_loadu_ps(65535, datPtr1+16104+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat236 = _mm512_maskz_loadu_ps(65535, datPtr1+17000+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat237 = _mm512_maskz_loadu_ps(65535, datPtr1+17896+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat238 = _mm512_maskz_loadu_ps(65535, datPtr1+18792+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat239 = _mm512_maskz_loadu_ps(65535, datPtr1+19688+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat240 = _mm512_maskz_loadu_ps(65535, datPtr1+20584+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 dat241 = _mm512_maskz_loadu_ps(65535, datPtr1+21480+602112*i6+200704*k8+896*h7+4*w7+40*b18);
__m512 fft2857 = _mm512_add_ps(dat226, dat234);
__m512 fft2945 = _mm512_add_ps(dat227, dat235);
__m512 fft2858 = _mm512_sub_ps(dat226, dat234);
__m512 fft2946 = _mm512_sub_ps(dat227, dat235);
__m512 fft2859 = _mm512_add_ps(dat228, dat236);
__m512 fft2947 = _mm512_add_ps(dat229, dat237);
__m512 fft2860 = _mm512_sub_ps(dat228, dat236);
__m512 fft2948 = _mm512_sub_ps(dat229, dat237);
__m512 fft2861 = _mm512_add_ps(dat230, dat238);
__m512 fft2949 = _mm512_add_ps(dat231, dat239);
__m512 fft2862 = _mm512_sub_ps(dat230, dat238);
__m512 fft2950 = _mm512_sub_ps(dat231, dat239);
__m512 fft2863 = _mm512_add_ps(dat232, dat240);
__m512 fft2951 = _mm512_add_ps(dat233, dat241);
__m512 fft2864 = _mm512_sub_ps(dat232, dat240);
__m512 fft2952 = _mm512_sub_ps(dat233, dat241);
__m512 fft2865 = _mm512_add_ps(fft2857, fft2861);
__m512 fft2953 = _mm512_add_ps(fft2945, fft2949);
__m512 fft2866 = _mm512_sub_ps(fft2857, fft2861);
__m512 fft2954 = _mm512_sub_ps(fft2945, fft2949);
__m512 fft2867 = _mm512_add_ps(fft2859, fft2863);
__m512 fft2955 = _mm512_add_ps(fft2947, fft2951);
__m512 fft2868 = _mm512_sub_ps(fft2863, fft2859);
__m512 fft2956 = _mm512_sub_ps(fft2951, fft2947);
__m512 fft2869 = _mm512_sub_ps(fft2860, fft2864);
__m512 fft2957 = _mm512_sub_ps(fft2948, fft2952);
__m512 fft2870 = _mm512_add_ps(fft2860, fft2864);
__m512 fft2958 = _mm512_add_ps(fft2948, fft2952);
__m512 fft2871 = _mm512_add_ps(fft2865, fft2867);
__m512 fft2959 = _mm512_add_ps(fft2953, fft2955);
__m512 fft2872 = _mm512_sub_ps(fft2865, fft2867);
__m512 fft2960 = _mm512_sub_ps(fft2953, fft2955);
__m512 fft2873 = _mm512_fmadd_ps(fft2869, _mm512_set1_ps(7.0710677e-01f), fft2858);
__m512 fft2961 = _mm512_fmadd_ps(fft2957, _mm512_set1_ps(7.0710677e-01f), fft2946);
__m512 fft2874 = _mm512_fnmsub_ps(fft2870, _mm512_set1_ps(7.0710677e-01f), fft2862);
__m512 fft2962 = _mm512_fnmsub_ps(fft2958, _mm512_set1_ps(7.0710677e-01f), fft2950);
__m512 fft2875 = _mm512_fnmadd_ps(fft2869, _mm512_set1_ps(7.0710677e-01f), fft2858);
__m512 fft2963 = _mm512_fnmadd_ps(fft2957, _mm512_set1_ps(7.0710677e-01f), fft2946);
__m512 fft2876 = _mm512_fnmadd_ps(fft2870, _mm512_set1_ps(7.0710677e-01f), fft2862);
__m512 fft2964 = _mm512_fnmadd_ps(fft2958, _mm512_set1_ps(7.0710677e-01f), fft2950);
__m512 fft2877 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2878 = _mm512_fmadd_ps(fft2871, fft2877, _mm512_shuffle_f32x4(fft2871, fft2871, 78));
__m512 fft2965 = _mm512_fmadd_ps(fft2959, fft2877, _mm512_shuffle_f32x4(fft2959, fft2959, 78));
__m512 fft2879 = _mm512_fmadd_ps(fft2872, fft2877, _mm512_shuffle_f32x4(fft2872, fft2872, 78));
__m512 fft2966 = _mm512_fmadd_ps(fft2960, fft2877, _mm512_shuffle_f32x4(fft2960, fft2960, 78));
__m512 fft2880 = _mm512_fmadd_ps(fft2873, fft2877, _mm512_shuffle_f32x4(fft2873, fft2873, 78));
__m512 fft2967 = _mm512_fmadd_ps(fft2961, fft2877, _mm512_shuffle_f32x4(fft2961, fft2961, 78));
__m512 fft2881 = _mm512_fmadd_ps(fft2874, fft2877, _mm512_shuffle_f32x4(fft2874, fft2874, 78));
__m512 fft2968 = _mm512_fmadd_ps(fft2962, fft2877, _mm512_shuffle_f32x4(fft2962, fft2962, 78));
__m512 fft2882 = _mm512_fmadd_ps(fft2866, fft2877, _mm512_shuffle_f32x4(fft2866, fft2866, 78));
__m512 fft2969 = _mm512_fmadd_ps(fft2954, fft2877, _mm512_shuffle_f32x4(fft2954, fft2954, 78));
__m512 fft2883 = _mm512_fmadd_ps(fft2868, fft2877, _mm512_shuffle_f32x4(fft2868, fft2868, 78));
__m512 fft2970 = _mm512_fmadd_ps(fft2956, fft2877, _mm512_shuffle_f32x4(fft2956, fft2956, 78));
__m512 fft2884 = _mm512_fmadd_ps(fft2875, fft2877, _mm512_shuffle_f32x4(fft2875, fft2875, 78));
__m512 fft2971 = _mm512_fmadd_ps(fft2963, fft2877, _mm512_shuffle_f32x4(fft2963, fft2963, 78));
__m512 fft2885 = _mm512_fmadd_ps(fft2876, fft2877, _mm512_shuffle_f32x4(fft2876, fft2876, 78));
__m512 fft2972 = _mm512_fmadd_ps(fft2964, fft2877, _mm512_shuffle_f32x4(fft2964, fft2964, 78));
__m512 fft2886 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2887 = _mm512_mul_ps(fft2878, fft2886);
__m512 fft2973 = _mm512_mul_ps(fft2965, fft2886);
__m512 fft2888 = _mm512_mul_ps(fft2879, fft2886);
__m512 fft2974 = _mm512_mul_ps(fft2966, fft2886);
__m512 fft2889 = _mm512_mul_ps(fft2880, fft2886);
__m512 fft2975 = _mm512_mul_ps(fft2967, fft2886);
__m512 fft2890 = _mm512_mul_ps(fft2881, fft2886);
__m512 fft2976 = _mm512_mul_ps(fft2968, fft2886);
__m512 fft2891 = _mm512_mul_ps(fft2882, fft2886);
__m512 fft2977 = _mm512_mul_ps(fft2969, fft2886);
__m512 fft2892 = _mm512_mul_ps(fft2883, fft2886);
__m512 fft2978 = _mm512_mul_ps(fft2970, fft2886);
__m512 fft2893 = _mm512_mul_ps(fft2884, fft2886);
__m512 fft2979 = _mm512_mul_ps(fft2971, fft2886);
__m512 fft2894 = _mm512_mul_ps(fft2885, fft2886);
__m512 fft2980 = _mm512_mul_ps(fft2972, fft2886);
__m512 fft2895 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft2896 = _mm512_fmadd_ps(fft2879, fft2895, fft2887);
__m512 fft2981 = _mm512_fmadd_ps(fft2966, fft2895, fft2973);
__m512 fft2897 = _mm512_fnmadd_ps(fft2878, fft2895, fft2888);
__m512 fft2982 = _mm512_fnmadd_ps(fft2965, fft2895, fft2974);
__m512 fft2898 = _mm512_fmadd_ps(fft2881, fft2895, fft2889);
__m512 fft2983 = _mm512_fmadd_ps(fft2968, fft2895, fft2975);
__m512 fft2899 = _mm512_fnmadd_ps(fft2880, fft2895, fft2890);
__m512 fft2984 = _mm512_fnmadd_ps(fft2967, fft2895, fft2976);
__m512 fft2900 = _mm512_fmadd_ps(fft2883, fft2895, fft2891);
__m512 fft2985 = _mm512_fmadd_ps(fft2970, fft2895, fft2977);
__m512 fft2901 = _mm512_fnmadd_ps(fft2882, fft2895, fft2892);
__m512 fft2986 = _mm512_fnmadd_ps(fft2969, fft2895, fft2978);
__m512 fft2902 = _mm512_fmadd_ps(fft2885, fft2895, fft2893);
__m512 fft2987 = _mm512_fmadd_ps(fft2972, fft2895, fft2979);
__m512 fft2903 = _mm512_fnmadd_ps(fft2884, fft2895, fft2894);
__m512 fft2988 = _mm512_fnmadd_ps(fft2971, fft2895, fft2980);
__m512 fft2904 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft2905 = _mm512_fmadd_ps(fft2896, fft2904, _mm512_shuffle_f32x4(fft2896, fft2896, 177));
__m512 fft2989 = _mm512_fmadd_ps(fft2981, fft2904, _mm512_shuffle_f32x4(fft2981, fft2981, 177));
__m512 fft2906 = _mm512_fmadd_ps(fft2897, fft2904, _mm512_shuffle_f32x4(fft2897, fft2897, 177));
__m512 fft2990 = _mm512_fmadd_ps(fft2982, fft2904, _mm512_shuffle_f32x4(fft2982, fft2982, 177));
__m512 fft2907 = _mm512_fmadd_ps(fft2898, fft2904, _mm512_shuffle_f32x4(fft2898, fft2898, 177));
__m512 fft2991 = _mm512_fmadd_ps(fft2983, fft2904, _mm512_shuffle_f32x4(fft2983, fft2983, 177));
__m512 fft2908 = _mm512_fmadd_ps(fft2899, fft2904, _mm512_shuffle_f32x4(fft2899, fft2899, 177));
__m512 fft2992 = _mm512_fmadd_ps(fft2984, fft2904, _mm512_shuffle_f32x4(fft2984, fft2984, 177));
__m512 fft2909 = _mm512_fmadd_ps(fft2900, fft2904, _mm512_shuffle_f32x4(fft2900, fft2900, 177));
__m512 fft2993 = _mm512_fmadd_ps(fft2985, fft2904, _mm512_shuffle_f32x4(fft2985, fft2985, 177));
__m512 fft2910 = _mm512_fmadd_ps(fft2901, fft2904, _mm512_shuffle_f32x4(fft2901, fft2901, 177));
__m512 fft2994 = _mm512_fmadd_ps(fft2986, fft2904, _mm512_shuffle_f32x4(fft2986, fft2986, 177));
__m512 fft2911 = _mm512_fmadd_ps(fft2902, fft2904, _mm512_shuffle_f32x4(fft2902, fft2902, 177));
__m512 fft2995 = _mm512_fmadd_ps(fft2987, fft2904, _mm512_shuffle_f32x4(fft2987, fft2987, 177));
__m512 fft2912 = _mm512_fmadd_ps(fft2903, fft2904, _mm512_shuffle_f32x4(fft2903, fft2903, 177));
__m512 fft2996 = _mm512_fmadd_ps(fft2988, fft2904, _mm512_shuffle_f32x4(fft2988, fft2988, 177));
__m512 fft2913 = _mm512_mask_mov_ps(fft2905, 49344, fft2906);
__m512 fft2997 = _mm512_mask_mov_ps(fft2989, 49344, fft2990);
__m512 fft2914 = _mm512_mask_sub_ps(fft2906, 49344, _mm512_setzero_ps(), fft2905);
__m512 fft2998 = _mm512_mask_sub_ps(fft2990, 49344, _mm512_setzero_ps(), fft2989);
__m512 fft2915 = _mm512_mask_mov_ps(fft2907, 49344, fft2908);
__m512 fft2999 = _mm512_mask_mov_ps(fft2991, 49344, fft2992);
__m512 fft2916 = _mm512_mask_sub_ps(fft2908, 49344, _mm512_setzero_ps(), fft2907);
__m512 fft3000 = _mm512_mask_sub_ps(fft2992, 49344, _mm512_setzero_ps(), fft2991);
__m512 fft2917 = _mm512_mask_mov_ps(fft2909, 49344, fft2910);
__m512 fft3001 = _mm512_mask_mov_ps(fft2993, 49344, fft2994);
__m512 fft2918 = _mm512_mask_sub_ps(fft2910, 49344, _mm512_setzero_ps(), fft2909);
__m512 fft3002 = _mm512_mask_sub_ps(fft2994, 49344, _mm512_setzero_ps(), fft2993);
__m512 fft2919 = _mm512_mask_mov_ps(fft2911, 49344, fft2912);
__m512 fft3003 = _mm512_mask_mov_ps(fft2995, 49344, fft2996);
__m512 fft2920 = _mm512_mask_sub_ps(fft2912, 49344, _mm512_setzero_ps(), fft2911);
__m512 fft3004 = _mm512_mask_sub_ps(fft2996, 49344, _mm512_setzero_ps(), fft2995);
__m512 fft2921 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft2922 = _mm512_fmadd_ps(fft2913, fft2921, _mm512_shuffle_ps(fft2913, fft2913, 78));
__m512 fft3005 = _mm512_fmadd_ps(fft2997, fft2921, _mm512_shuffle_ps(fft2997, fft2997, 78));
__m512 fft2923 = _mm512_fmadd_ps(fft2914, fft2921, _mm512_shuffle_ps(fft2914, fft2914, 78));
__m512 fft3006 = _mm512_fmadd_ps(fft2998, fft2921, _mm512_shuffle_ps(fft2998, fft2998, 78));
__m512 fft2924 = _mm512_fmadd_ps(fft2915, fft2921, _mm512_shuffle_ps(fft2915, fft2915, 78));
__m512 fft3007 = _mm512_fmadd_ps(fft2999, fft2921, _mm512_shuffle_ps(fft2999, fft2999, 78));
__m512 fft2925 = _mm512_fmadd_ps(fft2916, fft2921, _mm512_shuffle_ps(fft2916, fft2916, 78));
__m512 fft3008 = _mm512_fmadd_ps(fft3000, fft2921, _mm512_shuffle_ps(fft3000, fft3000, 78));
__m512 fft2926 = _mm512_fmadd_ps(fft2917, fft2921, _mm512_shuffle_ps(fft2917, fft2917, 78));
__m512 fft3009 = _mm512_fmadd_ps(fft3001, fft2921, _mm512_shuffle_ps(fft3001, fft3001, 78));
__m512 fft2927 = _mm512_fmadd_ps(fft2918, fft2921, _mm512_shuffle_ps(fft2918, fft2918, 78));
__m512 fft3010 = _mm512_fmadd_ps(fft3002, fft2921, _mm512_shuffle_ps(fft3002, fft3002, 78));
__m512 fft2928 = _mm512_fmadd_ps(fft2919, fft2921, _mm512_shuffle_ps(fft2919, fft2919, 78));
__m512 fft3011 = _mm512_fmadd_ps(fft3003, fft2921, _mm512_shuffle_ps(fft3003, fft3003, 78));
__m512 fft2929 = _mm512_fmadd_ps(fft2920, fft2921, _mm512_shuffle_ps(fft2920, fft2920, 78));
__m512 fft3012 = _mm512_fmadd_ps(fft3004, fft2921, _mm512_shuffle_ps(fft3004, fft3004, 78));
__m512i fft2930 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft2931 = _mm512_permutexvar_ps(fft2930, fft2922);
__m512 fft3013 = _mm512_permutexvar_ps(fft2930, fft3005);
__m512i fft2932 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft2933 = _mm512_permutexvar_ps(fft2932, fft2922);
__m512 fft3014 = _mm512_permutexvar_ps(fft2932, fft3005);
__m512 fft2934 = _mm512_permutexvar_ps(fft2930, fft2923);
__m512 fft3015 = _mm512_permutexvar_ps(fft2930, fft3006);
__m512 fft2935 = _mm512_permutexvar_ps(fft2932, fft2923);
__m512 fft3016 = _mm512_permutexvar_ps(fft2932, fft3006);
__m512 fft2936 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft2937 = _mm512_fmadd_ps(fft2931, fft2936, fft2933);
__m512 fft3017 = _mm512_fmadd_ps(fft3013, fft2936, fft3014);
__m512 fft2938 = _mm512_fnmadd_ps(fft2935, fft2936, fft2934);
__m512 fft3018 = _mm512_fnmadd_ps(fft3016, fft2936, fft3015);
__m512 fft2939 = _mm512_mask_mov_ps(fft2935, 21845, fft2937);
__m512 fft3019 = _mm512_mask_mov_ps(fft3016, 21845, fft3017);
__m512 fft2940 = _mm512_mask_mov_ps(fft2931, 43176, fft2937);
__m512 fft3020 = _mm512_mask_mov_ps(fft3013, 43176, fft3017);
__m512 fft2941 = _mm512_mask_mov_ps(fft2939, 43176, fft2938);
__m512 fft3021 = _mm512_mask_mov_ps(fft3019, 43176, fft3018);
__m512 fft2942 = _mm512_mask_mov_ps(fft2940, 22102, fft2938);
__m512 fft3022 = _mm512_mask_mov_ps(fft3020, 22102, fft3018);
__m512 fft2943 = _mm512_mask_mul_ps(fft2941, 64764, fft2941, _mm512_set1_ps(5e-01f));
__m512 fft3023 = _mm512_mask_mul_ps(fft3021, 64764, fft3021, _mm512_set1_ps(5e-01f));
__m512 fft2944 = _mm512_mask_mul_ps(fft2942, 64764, fft2942, _mm512_set1_ps(5e-01f));
__m512 fft3024 = _mm512_mask_mul_ps(fft3022, 64764, fft3022, _mm512_set1_ps(5e-01f));
__m512 df241 = fft2943;
__m512 df249 = fft3023;
__m512 df242 = fft2944;
__m512 df250 = fft3024;
__m512 df243 = fft2924;
__m512 df251 = fft3007;
__m512 df244 = fft2925;
__m512 df252 = fft3008;
__m512 df245 = fft2926;
__m512 df253 = fft3009;
__m512 df246 = fft2927;
__m512 df254 = fft3010;
__m512 df247 = fft2928;
__m512 df255 = fft3011;
__m512 df248 = fft2929;
__m512 df256 = fft3012;
__m512i eo18 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df243 = _mm512_permutexvar_ps(eo18, df243);
df244 = _mm512_permutexvar_ps(eo18, df244);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df243);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df244);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df243);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df244);
df251 = _mm512_permutexvar_ps(eo18, df251);
df252 = _mm512_permutexvar_ps(eo18, df252);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df251);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df252);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df251);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df252);
df245 = _mm512_permutexvar_ps(eo18, df245);
df246 = _mm512_permutexvar_ps(eo18, df246);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df245);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df246);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df245);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df246);
df253 = _mm512_permutexvar_ps(eo18, df253);
df254 = _mm512_permutexvar_ps(eo18, df254);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df253);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df254);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df253);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df254);
df247 = _mm512_permutexvar_ps(eo18, df247);
df248 = _mm512_permutexvar_ps(eo18, df248);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df247);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df248);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df247);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df248);
df255 = _mm512_permutexvar_ps(eo18, df255);
df256 = _mm512_permutexvar_ps(eo18, df256);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df255);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df256);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df255);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df256);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df241);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df242);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df241);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df242);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df249);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df250);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df249);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df250);
}
}
if (j2 >= last1) return;
++j2;
rel2 = 8;
}
// Loop nest over j2 (at most 11-rel2 passes, bounded by jj4), k9 (three
// consecutive values starting at 3*s1) and b19 (six values). Every innermost
// pass loads 16 vectors from datPtr1, pushes them through the fft* arithmetic
// below, and writes the results to dfPtr1 as masked half-vector stores.
// NOTE(review): the fft* names and the 0.70710677 (~sqrt(1/2)) constants
// suggest a forward FFT over a 16x16 tile of the input -- confirm against the
// code generator before relying on that reading.
//
// h8 and w8 are used only in the load address, as 896*h8 + 4*w8.
// NOTE(review): 896 = 4*224 and 200704 = 4*224*224, which would match byte
// offsets into a 224x224 float plane if datPtr1 is a byte pointer -- TODO confirm.
ptrdiff_t h8 = base2+20;
ptrdiff_t w8 = -450+60*rel2;
ptrdiff_t jj4 = 10-rel2+j2;
// w8 advances by 60 per pass; j2 itself is incremented at the bottom of the body.
for (; j2 <= jj4; w8 += 60) {
ptrdiff_t k9 = 3*s1;
ptrdiff_t kk8 = k9+2;
for (; k9 <= kk8; ++k9) {
for (ptrdiff_t b19 = 0; b19 < 6; ++b19) {
// b19 is split into m19 = b19/2 (0..2) and f20 = b19%2 (0..1); both are used
// only in the store address (128*m19 + 32*f20).
ptrdiff_t m19 = (size_t)b19/2;
ptrdiff_t f20 = (size_t)b19%2;
// Sixteen full-width loads (mask 65535 = all 16 lanes) at offsets
// 0, 896, ..., 13440 from a common base; each b19 step shifts the base by 40.
__m512 dat242 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat243 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat244 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat245 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat246 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat247 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat248 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat249 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat250 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat251 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat252 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat253 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat254 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat255 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat256 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k9+896*h8+4*w8+40*b19);
__m512 dat257 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k9+896*h8+4*w8+40*b19);
// Two independent, interleaved chains run from here to the stores:
//   fft3025..fft3112 consume the even-numbered loads (dat242, dat244, ...),
//   fft3113..fft3192 consume the odd-numbered loads  (dat243, dat245, ...).
// First level: sum and difference of loads that are eight apart in that
// sequence (e.g. dat242 with dat250).
__m512 fft3025 = _mm512_add_ps(dat242, dat250);
__m512 fft3113 = _mm512_add_ps(dat243, dat251);
__m512 fft3026 = _mm512_sub_ps(dat242, dat250);
__m512 fft3114 = _mm512_sub_ps(dat243, dat251);
__m512 fft3027 = _mm512_add_ps(dat244, dat252);
__m512 fft3115 = _mm512_add_ps(dat245, dat253);
__m512 fft3028 = _mm512_sub_ps(dat244, dat252);
__m512 fft3116 = _mm512_sub_ps(dat245, dat253);
__m512 fft3029 = _mm512_add_ps(dat246, dat254);
__m512 fft3117 = _mm512_add_ps(dat247, dat255);
__m512 fft3030 = _mm512_sub_ps(dat246, dat254);
__m512 fft3118 = _mm512_sub_ps(dat247, dat255);
__m512 fft3031 = _mm512_add_ps(dat248, dat256);
__m512 fft3119 = _mm512_add_ps(dat249, dat257);
__m512 fft3032 = _mm512_sub_ps(dat248, dat256);
__m512 fft3120 = _mm512_sub_ps(dat249, dat257);
// Second level: sums and differences of the first-level results.
// Note the reversed operand order in fft3036/fft3124 (later term minus earlier).
__m512 fft3033 = _mm512_add_ps(fft3025, fft3029);
__m512 fft3121 = _mm512_add_ps(fft3113, fft3117);
__m512 fft3034 = _mm512_sub_ps(fft3025, fft3029);
__m512 fft3122 = _mm512_sub_ps(fft3113, fft3117);
__m512 fft3035 = _mm512_add_ps(fft3027, fft3031);
__m512 fft3123 = _mm512_add_ps(fft3115, fft3119);
__m512 fft3036 = _mm512_sub_ps(fft3031, fft3027);
__m512 fft3124 = _mm512_sub_ps(fft3119, fft3115);
__m512 fft3037 = _mm512_sub_ps(fft3028, fft3032);
__m512 fft3125 = _mm512_sub_ps(fft3116, fft3120);
__m512 fft3038 = _mm512_add_ps(fft3028, fft3032);
__m512 fft3126 = _mm512_add_ps(fft3116, fft3120);
__m512 fft3039 = _mm512_add_ps(fft3033, fft3035);
__m512 fft3127 = _mm512_add_ps(fft3121, fft3123);
__m512 fft3040 = _mm512_sub_ps(fft3033, fft3035);
__m512 fft3128 = _mm512_sub_ps(fft3121, fft3123);
// Third level: combine with the 0.70710677 (~sqrt(1/2)) factor.
//   fmadd(a,c,b)  =  a*c + b      fnmadd(a,c,b) = -(a*c) + b
//   fnmsub(a,c,b) = -(a*c) - b
__m512 fft3041 = _mm512_fmadd_ps(fft3037, _mm512_set1_ps(7.0710677e-01f), fft3026);
__m512 fft3129 = _mm512_fmadd_ps(fft3125, _mm512_set1_ps(7.0710677e-01f), fft3114);
__m512 fft3042 = _mm512_fnmsub_ps(fft3038, _mm512_set1_ps(7.0710677e-01f), fft3030);
__m512 fft3130 = _mm512_fnmsub_ps(fft3126, _mm512_set1_ps(7.0710677e-01f), fft3118);
__m512 fft3043 = _mm512_fnmadd_ps(fft3037, _mm512_set1_ps(7.0710677e-01f), fft3026);
__m512 fft3131 = _mm512_fnmadd_ps(fft3125, _mm512_set1_ps(7.0710677e-01f), fft3114);
__m512 fft3044 = _mm512_fnmadd_ps(fft3038, _mm512_set1_ps(7.0710677e-01f), fft3030);
__m512 fft3132 = _mm512_fnmadd_ps(fft3126, _mm512_set1_ps(7.0710677e-01f), fft3118);
// Within-vector step 1: shuffle_f32x4 with imm 78 swaps the two 256-bit halves;
// the +1/-1 vector (lanes 0-7 = +1, lanes 8-15 = -1) makes the low half
// hold lo+hi and the high half hold lo-hi.
__m512 fft3045 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3046 = _mm512_fmadd_ps(fft3039, fft3045, _mm512_shuffle_f32x4(fft3039, fft3039, 78));
__m512 fft3133 = _mm512_fmadd_ps(fft3127, fft3045, _mm512_shuffle_f32x4(fft3127, fft3127, 78));
__m512 fft3047 = _mm512_fmadd_ps(fft3040, fft3045, _mm512_shuffle_f32x4(fft3040, fft3040, 78));
__m512 fft3134 = _mm512_fmadd_ps(fft3128, fft3045, _mm512_shuffle_f32x4(fft3128, fft3128, 78));
__m512 fft3048 = _mm512_fmadd_ps(fft3041, fft3045, _mm512_shuffle_f32x4(fft3041, fft3041, 78));
__m512 fft3135 = _mm512_fmadd_ps(fft3129, fft3045, _mm512_shuffle_f32x4(fft3129, fft3129, 78));
__m512 fft3049 = _mm512_fmadd_ps(fft3042, fft3045, _mm512_shuffle_f32x4(fft3042, fft3042, 78));
__m512 fft3136 = _mm512_fmadd_ps(fft3130, fft3045, _mm512_shuffle_f32x4(fft3130, fft3130, 78));
__m512 fft3050 = _mm512_fmadd_ps(fft3034, fft3045, _mm512_shuffle_f32x4(fft3034, fft3034, 78));
__m512 fft3137 = _mm512_fmadd_ps(fft3122, fft3045, _mm512_shuffle_f32x4(fft3122, fft3122, 78));
__m512 fft3051 = _mm512_fmadd_ps(fft3036, fft3045, _mm512_shuffle_f32x4(fft3036, fft3036, 78));
__m512 fft3138 = _mm512_fmadd_ps(fft3124, fft3045, _mm512_shuffle_f32x4(fft3124, fft3124, 78));
__m512 fft3052 = _mm512_fmadd_ps(fft3043, fft3045, _mm512_shuffle_f32x4(fft3043, fft3043, 78));
__m512 fft3139 = _mm512_fmadd_ps(fft3131, fft3045, _mm512_shuffle_f32x4(fft3131, fft3131, 78));
__m512 fft3053 = _mm512_fmadd_ps(fft3044, fft3045, _mm512_shuffle_f32x4(fft3044, fft3044, 78));
__m512 fft3140 = _mm512_fmadd_ps(fft3132, fft3045, _mm512_shuffle_f32x4(fft3132, fft3132, 78));
// Per-lane factors applied in two parts: fft3054 scales each vector itself,
// fft3063 (non-zero only in lanes 10-15) scales its partner vector, which is then
// added (fmadd) or subtracted (fnmadd). Lanes 0-9 pass through unchanged.
// NOTE(review): this looks like a complex multiply by per-lane constants with
// the two parts held in paired vectors -- confirm against the generator.
__m512 fft3054 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3055 = _mm512_mul_ps(fft3046, fft3054);
__m512 fft3141 = _mm512_mul_ps(fft3133, fft3054);
__m512 fft3056 = _mm512_mul_ps(fft3047, fft3054);
__m512 fft3142 = _mm512_mul_ps(fft3134, fft3054);
__m512 fft3057 = _mm512_mul_ps(fft3048, fft3054);
__m512 fft3143 = _mm512_mul_ps(fft3135, fft3054);
__m512 fft3058 = _mm512_mul_ps(fft3049, fft3054);
__m512 fft3144 = _mm512_mul_ps(fft3136, fft3054);
__m512 fft3059 = _mm512_mul_ps(fft3050, fft3054);
__m512 fft3145 = _mm512_mul_ps(fft3137, fft3054);
__m512 fft3060 = _mm512_mul_ps(fft3051, fft3054);
__m512 fft3146 = _mm512_mul_ps(fft3138, fft3054);
__m512 fft3061 = _mm512_mul_ps(fft3052, fft3054);
__m512 fft3147 = _mm512_mul_ps(fft3139, fft3054);
__m512 fft3062 = _mm512_mul_ps(fft3053, fft3054);
__m512 fft3148 = _mm512_mul_ps(fft3140, fft3054);
__m512 fft3063 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft3064 = _mm512_fmadd_ps(fft3047, fft3063, fft3055);
__m512 fft3149 = _mm512_fmadd_ps(fft3134, fft3063, fft3141);
__m512 fft3065 = _mm512_fnmadd_ps(fft3046, fft3063, fft3056);
__m512 fft3150 = _mm512_fnmadd_ps(fft3133, fft3063, fft3142);
__m512 fft3066 = _mm512_fmadd_ps(fft3049, fft3063, fft3057);
__m512 fft3151 = _mm512_fmadd_ps(fft3136, fft3063, fft3143);
__m512 fft3067 = _mm512_fnmadd_ps(fft3048, fft3063, fft3058);
__m512 fft3152 = _mm512_fnmadd_ps(fft3135, fft3063, fft3144);
__m512 fft3068 = _mm512_fmadd_ps(fft3051, fft3063, fft3059);
__m512 fft3153 = _mm512_fmadd_ps(fft3138, fft3063, fft3145);
__m512 fft3069 = _mm512_fnmadd_ps(fft3050, fft3063, fft3060);
__m512 fft3154 = _mm512_fnmadd_ps(fft3137, fft3063, fft3146);
__m512 fft3070 = _mm512_fmadd_ps(fft3053, fft3063, fft3061);
__m512 fft3155 = _mm512_fmadd_ps(fft3140, fft3063, fft3147);
__m512 fft3071 = _mm512_fnmadd_ps(fft3052, fft3063, fft3062);
__m512 fft3156 = _mm512_fnmadd_ps(fft3139, fft3063, fft3148);
// Within-vector step 2: shuffle_f32x4 with imm 177 swaps adjacent 128-bit
// lanes; the sign vector is +1 in lanes 0-3 and 8-11, -1 in lanes 4-7 and 12-15.
__m512 fft3072 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft3073 = _mm512_fmadd_ps(fft3064, fft3072, _mm512_shuffle_f32x4(fft3064, fft3064, 177));
__m512 fft3157 = _mm512_fmadd_ps(fft3149, fft3072, _mm512_shuffle_f32x4(fft3149, fft3149, 177));
__m512 fft3074 = _mm512_fmadd_ps(fft3065, fft3072, _mm512_shuffle_f32x4(fft3065, fft3065, 177));
__m512 fft3158 = _mm512_fmadd_ps(fft3150, fft3072, _mm512_shuffle_f32x4(fft3150, fft3150, 177));
__m512 fft3075 = _mm512_fmadd_ps(fft3066, fft3072, _mm512_shuffle_f32x4(fft3066, fft3066, 177));
__m512 fft3159 = _mm512_fmadd_ps(fft3151, fft3072, _mm512_shuffle_f32x4(fft3151, fft3151, 177));
__m512 fft3076 = _mm512_fmadd_ps(fft3067, fft3072, _mm512_shuffle_f32x4(fft3067, fft3067, 177));
__m512 fft3160 = _mm512_fmadd_ps(fft3152, fft3072, _mm512_shuffle_f32x4(fft3152, fft3152, 177));
__m512 fft3077 = _mm512_fmadd_ps(fft3068, fft3072, _mm512_shuffle_f32x4(fft3068, fft3068, 177));
__m512 fft3161 = _mm512_fmadd_ps(fft3153, fft3072, _mm512_shuffle_f32x4(fft3153, fft3153, 177));
__m512 fft3078 = _mm512_fmadd_ps(fft3069, fft3072, _mm512_shuffle_f32x4(fft3069, fft3069, 177));
__m512 fft3162 = _mm512_fmadd_ps(fft3154, fft3072, _mm512_shuffle_f32x4(fft3154, fft3154, 177));
__m512 fft3079 = _mm512_fmadd_ps(fft3070, fft3072, _mm512_shuffle_f32x4(fft3070, fft3070, 177));
__m512 fft3163 = _mm512_fmadd_ps(fft3155, fft3072, _mm512_shuffle_f32x4(fft3155, fft3155, 177));
__m512 fft3080 = _mm512_fmadd_ps(fft3071, fft3072, _mm512_shuffle_f32x4(fft3071, fft3071, 177));
__m512 fft3164 = _mm512_fmadd_ps(fft3156, fft3072, _mm512_shuffle_f32x4(fft3156, fft3156, 177));
// Mask 49344 = 0xC0C0 selects lanes 6, 7, 14, 15. In those lanes each pair of
// vectors is exchanged, with one side negated (0 - x); other lanes are untouched.
__m512 fft3081 = _mm512_mask_mov_ps(fft3073, 49344, fft3074);
__m512 fft3165 = _mm512_mask_mov_ps(fft3157, 49344, fft3158);
__m512 fft3082 = _mm512_mask_sub_ps(fft3074, 49344, _mm512_setzero_ps(), fft3073);
__m512 fft3166 = _mm512_mask_sub_ps(fft3158, 49344, _mm512_setzero_ps(), fft3157);
__m512 fft3083 = _mm512_mask_mov_ps(fft3075, 49344, fft3076);
__m512 fft3167 = _mm512_mask_mov_ps(fft3159, 49344, fft3160);
__m512 fft3084 = _mm512_mask_sub_ps(fft3076, 49344, _mm512_setzero_ps(), fft3075);
__m512 fft3168 = _mm512_mask_sub_ps(fft3160, 49344, _mm512_setzero_ps(), fft3159);
__m512 fft3085 = _mm512_mask_mov_ps(fft3077, 49344, fft3078);
__m512 fft3169 = _mm512_mask_mov_ps(fft3161, 49344, fft3162);
__m512 fft3086 = _mm512_mask_sub_ps(fft3078, 49344, _mm512_setzero_ps(), fft3077);
__m512 fft3170 = _mm512_mask_sub_ps(fft3162, 49344, _mm512_setzero_ps(), fft3161);
__m512 fft3087 = _mm512_mask_mov_ps(fft3079, 49344, fft3080);
__m512 fft3171 = _mm512_mask_mov_ps(fft3163, 49344, fft3164);
__m512 fft3088 = _mm512_mask_sub_ps(fft3080, 49344, _mm512_setzero_ps(), fft3079);
__m512 fft3172 = _mm512_mask_sub_ps(fft3164, 49344, _mm512_setzero_ps(), fft3163);
// Within-vector step 3: shuffle_ps with imm 78 swaps the two 64-bit pairs
// inside every 128-bit lane; signs are +1 on the lower pair, -1 on the upper.
__m512 fft3089 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft3090 = _mm512_fmadd_ps(fft3081, fft3089, _mm512_shuffle_ps(fft3081, fft3081, 78));
__m512 fft3173 = _mm512_fmadd_ps(fft3165, fft3089, _mm512_shuffle_ps(fft3165, fft3165, 78));
__m512 fft3091 = _mm512_fmadd_ps(fft3082, fft3089, _mm512_shuffle_ps(fft3082, fft3082, 78));
__m512 fft3174 = _mm512_fmadd_ps(fft3166, fft3089, _mm512_shuffle_ps(fft3166, fft3166, 78));
__m512 fft3092 = _mm512_fmadd_ps(fft3083, fft3089, _mm512_shuffle_ps(fft3083, fft3083, 78));
__m512 fft3175 = _mm512_fmadd_ps(fft3167, fft3089, _mm512_shuffle_ps(fft3167, fft3167, 78));
__m512 fft3093 = _mm512_fmadd_ps(fft3084, fft3089, _mm512_shuffle_ps(fft3084, fft3084, 78));
__m512 fft3176 = _mm512_fmadd_ps(fft3168, fft3089, _mm512_shuffle_ps(fft3168, fft3168, 78));
__m512 fft3094 = _mm512_fmadd_ps(fft3085, fft3089, _mm512_shuffle_ps(fft3085, fft3085, 78));
__m512 fft3177 = _mm512_fmadd_ps(fft3169, fft3089, _mm512_shuffle_ps(fft3169, fft3169, 78));
__m512 fft3095 = _mm512_fmadd_ps(fft3086, fft3089, _mm512_shuffle_ps(fft3086, fft3086, 78));
__m512 fft3178 = _mm512_fmadd_ps(fft3170, fft3089, _mm512_shuffle_ps(fft3170, fft3170, 78));
__m512 fft3096 = _mm512_fmadd_ps(fft3087, fft3089, _mm512_shuffle_ps(fft3087, fft3087, 78));
__m512 fft3179 = _mm512_fmadd_ps(fft3171, fft3089, _mm512_shuffle_ps(fft3171, fft3171, 78));
__m512 fft3097 = _mm512_fmadd_ps(fft3088, fft3089, _mm512_shuffle_ps(fft3088, fft3088, 78));
__m512 fft3180 = _mm512_fmadd_ps(fft3172, fft3089, _mm512_shuffle_ps(fft3172, fft3172, 78));
// Extra post-processing applied only to the first vector pair of each chain
// (fft3090/fft3091 and fft3173/fft3174): two lane permutations, a signed
// combine, masked merges, then a 0.5 scale on lanes 2-7 and 10-15
// (mask 64764 = 0xFCFC). The other six pairs skip this step.
// NOTE(review): presumably the real-input FFT untangling of the first
// pair -- confirm against the generator.
__m512i fft3098 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft3099 = _mm512_permutexvar_ps(fft3098, fft3090);
__m512 fft3181 = _mm512_permutexvar_ps(fft3098, fft3173);
__m512i fft3100 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft3101 = _mm512_permutexvar_ps(fft3100, fft3090);
__m512 fft3182 = _mm512_permutexvar_ps(fft3100, fft3173);
__m512 fft3102 = _mm512_permutexvar_ps(fft3098, fft3091);
__m512 fft3183 = _mm512_permutexvar_ps(fft3098, fft3174);
__m512 fft3103 = _mm512_permutexvar_ps(fft3100, fft3091);
__m512 fft3184 = _mm512_permutexvar_ps(fft3100, fft3174);
__m512 fft3104 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft3105 = _mm512_fmadd_ps(fft3099, fft3104, fft3101);
__m512 fft3185 = _mm512_fmadd_ps(fft3181, fft3104, fft3182);
__m512 fft3106 = _mm512_fnmadd_ps(fft3103, fft3104, fft3102);
__m512 fft3186 = _mm512_fnmadd_ps(fft3184, fft3104, fft3183);
// Merge masks: 21845 = 0x5555 (even lanes), 43176 = 0xA8A8
// (lanes 3,5,7,11,13,15), 22102 = 0x5656 (lanes 1,2,4,6,9,10,12,14).
__m512 fft3107 = _mm512_mask_mov_ps(fft3103, 21845, fft3105);
__m512 fft3187 = _mm512_mask_mov_ps(fft3184, 21845, fft3185);
__m512 fft3108 = _mm512_mask_mov_ps(fft3099, 43176, fft3105);
__m512 fft3188 = _mm512_mask_mov_ps(fft3181, 43176, fft3185);
__m512 fft3109 = _mm512_mask_mov_ps(fft3107, 43176, fft3106);
__m512 fft3189 = _mm512_mask_mov_ps(fft3187, 43176, fft3186);
__m512 fft3110 = _mm512_mask_mov_ps(fft3108, 22102, fft3106);
__m512 fft3190 = _mm512_mask_mov_ps(fft3188, 22102, fft3186);
__m512 fft3111 = _mm512_mask_mul_ps(fft3109, 64764, fft3109, _mm512_set1_ps(5e-01f));
__m512 fft3191 = _mm512_mask_mul_ps(fft3189, 64764, fft3189, _mm512_set1_ps(5e-01f));
__m512 fft3112 = _mm512_mask_mul_ps(fft3110, 64764, fft3110, _mm512_set1_ps(5e-01f));
__m512 fft3192 = _mm512_mask_mul_ps(fft3190, 64764, fft3190, _mm512_set1_ps(5e-01f));
// Rename the 16 result vectors for the store phase: df257..df264 come from the
// first chain, df265..df272 from the second.
__m512 df257 = fft3111;
__m512 df265 = fft3191;
__m512 df258 = fft3112;
__m512 df266 = fft3192;
__m512 df259 = fft3092;
__m512 df267 = fft3175;
__m512 df260 = fft3093;
__m512 df268 = fft3176;
__m512 df261 = fft3094;
__m512 df269 = fft3177;
__m512 df262 = fft3095;
__m512 df270 = fft3178;
__m512 df263 = fft3096;
__m512 df271 = fft3179;
__m512 df264 = fft3097;
__m512 df272 = fft3180;
// eo19 gathers even-indexed elements into lanes 0-7 and odd-indexed elements
// into lanes 8-15. Each vector is then written with two masked stores:
// mask 255 writes lanes 0-7, mask 65280 writes lanes 8-15. The second store's
// offset is chosen so that lanes 8-15 land 407040 past the first store's target
// (e.g. 508768 + 32 = 101760 + 407040).
// NOTE(review): that arithmetic assumes dfPtr1 is a byte pointer -- TODO confirm.
__m512i eo19 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df259 = _mm512_permutexvar_ps(eo19, df259);
df260 = _mm512_permutexvar_ps(eo19, df260);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df259);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df260);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df259);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df260);
df267 = _mm512_permutexvar_ps(eo19, df267);
df268 = _mm512_permutexvar_ps(eo19, df268);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df267);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df268);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df267);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df268);
df261 = _mm512_permutexvar_ps(eo19, df261);
df262 = _mm512_permutexvar_ps(eo19, df262);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df261);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df262);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df261);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df262);
df269 = _mm512_permutexvar_ps(eo19, df269);
df270 = _mm512_permutexvar_ps(eo19, df270);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df269);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df270);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df269);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df270);
df263 = _mm512_permutexvar_ps(eo19, df263);
df264 = _mm512_permutexvar_ps(eo19, df264);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df263);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df264);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df263);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df264);
df271 = _mm512_permutexvar_ps(eo19, df271);
df272 = _mm512_permutexvar_ps(eo19, df272);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df271);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df272);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df271);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df272);
// The post-processed first pair of each chain (df257/df258, df265/df266) is
// stored without the eo19 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df257);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df258);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df257);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df258);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df265);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df266);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df265);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df266);
}
}
// Return from the enclosing function once j2 has reached last1; otherwise
// advance j2 (the for-header then advances w8 by 60).
if (j2 >= last1) return;
++j2;
}
if (j2 >= 84) break;
rel2 = 11;
}
if (rel2 < 16) {
if (rel2 < 12) {
ptrdiff_t h9 = base2+20;
ptrdiff_t w9 = 210;
ptrdiff_t k10 = 3*s1;
ptrdiff_t kk9 = k10+2;
for (; k10 <= kk9; ++k10) {
ptrdiff_t b20 = 0;
ptrdiff_t m20 = (size_t)b20/2;
ptrdiff_t f21 = (size_t)b20%2;
__m512 dat258 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat259 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat260 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat261 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat262 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat263 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat264 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat265 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat266 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat267 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat268 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat269 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat270 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat271 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat272 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 dat273 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k10+896*h9+4*w9+0*b20);
__m512 fft3193 = _mm512_add_ps(dat258, dat266);
__m512 fft3281 = _mm512_add_ps(dat259, dat267);
__m512 fft3194 = _mm512_sub_ps(dat258, dat266);
__m512 fft3282 = _mm512_sub_ps(dat259, dat267);
__m512 fft3195 = _mm512_add_ps(dat260, dat268);
__m512 fft3283 = _mm512_add_ps(dat261, dat269);
__m512 fft3196 = _mm512_sub_ps(dat260, dat268);
__m512 fft3284 = _mm512_sub_ps(dat261, dat269);
__m512 fft3197 = _mm512_add_ps(dat262, dat270);
__m512 fft3285 = _mm512_add_ps(dat263, dat271);
__m512 fft3198 = _mm512_sub_ps(dat262, dat270);
__m512 fft3286 = _mm512_sub_ps(dat263, dat271);
__m512 fft3199 = _mm512_add_ps(dat264, dat272);
__m512 fft3287 = _mm512_add_ps(dat265, dat273);
__m512 fft3200 = _mm512_sub_ps(dat264, dat272);
__m512 fft3288 = _mm512_sub_ps(dat265, dat273);
__m512 fft3201 = _mm512_add_ps(fft3193, fft3197);
__m512 fft3289 = _mm512_add_ps(fft3281, fft3285);
__m512 fft3202 = _mm512_sub_ps(fft3193, fft3197);
__m512 fft3290 = _mm512_sub_ps(fft3281, fft3285);
__m512 fft3203 = _mm512_add_ps(fft3195, fft3199);
__m512 fft3291 = _mm512_add_ps(fft3283, fft3287);
__m512 fft3204 = _mm512_sub_ps(fft3199, fft3195);
__m512 fft3292 = _mm512_sub_ps(fft3287, fft3283);
__m512 fft3205 = _mm512_sub_ps(fft3196, fft3200);
__m512 fft3293 = _mm512_sub_ps(fft3284, fft3288);
__m512 fft3206 = _mm512_add_ps(fft3196, fft3200);
__m512 fft3294 = _mm512_add_ps(fft3284, fft3288);
__m512 fft3207 = _mm512_add_ps(fft3201, fft3203);
__m512 fft3295 = _mm512_add_ps(fft3289, fft3291);
__m512 fft3208 = _mm512_sub_ps(fft3201, fft3203);
__m512 fft3296 = _mm512_sub_ps(fft3289, fft3291);
__m512 fft3209 = _mm512_fmadd_ps(fft3205, _mm512_set1_ps(7.0710677e-01f), fft3194);
__m512 fft3297 = _mm512_fmadd_ps(fft3293, _mm512_set1_ps(7.0710677e-01f), fft3282);
__m512 fft3210 = _mm512_fnmsub_ps(fft3206, _mm512_set1_ps(7.0710677e-01f), fft3198);
__m512 fft3298 = _mm512_fnmsub_ps(fft3294, _mm512_set1_ps(7.0710677e-01f), fft3286);
__m512 fft3211 = _mm512_fnmadd_ps(fft3205, _mm512_set1_ps(7.0710677e-01f), fft3194);
__m512 fft3299 = _mm512_fnmadd_ps(fft3293, _mm512_set1_ps(7.0710677e-01f), fft3282);
__m512 fft3212 = _mm512_fnmadd_ps(fft3206, _mm512_set1_ps(7.0710677e-01f), fft3198);
__m512 fft3300 = _mm512_fnmadd_ps(fft3294, _mm512_set1_ps(7.0710677e-01f), fft3286);
__m512 fft3213 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3214 = _mm512_fmadd_ps(fft3207, fft3213, _mm512_shuffle_f32x4(fft3207, fft3207, 78));
__m512 fft3301 = _mm512_fmadd_ps(fft3295, fft3213, _mm512_shuffle_f32x4(fft3295, fft3295, 78));
__m512 fft3215 = _mm512_fmadd_ps(fft3208, fft3213, _mm512_shuffle_f32x4(fft3208, fft3208, 78));
__m512 fft3302 = _mm512_fmadd_ps(fft3296, fft3213, _mm512_shuffle_f32x4(fft3296, fft3296, 78));
__m512 fft3216 = _mm512_fmadd_ps(fft3209, fft3213, _mm512_shuffle_f32x4(fft3209, fft3209, 78));
__m512 fft3303 = _mm512_fmadd_ps(fft3297, fft3213, _mm512_shuffle_f32x4(fft3297, fft3297, 78));
__m512 fft3217 = _mm512_fmadd_ps(fft3210, fft3213, _mm512_shuffle_f32x4(fft3210, fft3210, 78));
__m512 fft3304 = _mm512_fmadd_ps(fft3298, fft3213, _mm512_shuffle_f32x4(fft3298, fft3298, 78));
__m512 fft3218 = _mm512_fmadd_ps(fft3202, fft3213, _mm512_shuffle_f32x4(fft3202, fft3202, 78));
__m512 fft3305 = _mm512_fmadd_ps(fft3290, fft3213, _mm512_shuffle_f32x4(fft3290, fft3290, 78));
__m512 fft3219 = _mm512_fmadd_ps(fft3204, fft3213, _mm512_shuffle_f32x4(fft3204, fft3204, 78));
__m512 fft3306 = _mm512_fmadd_ps(fft3292, fft3213, _mm512_shuffle_f32x4(fft3292, fft3292, 78));
__m512 fft3220 = _mm512_fmadd_ps(fft3211, fft3213, _mm512_shuffle_f32x4(fft3211, fft3211, 78));
__m512 fft3307 = _mm512_fmadd_ps(fft3299, fft3213, _mm512_shuffle_f32x4(fft3299, fft3299, 78));
__m512 fft3221 = _mm512_fmadd_ps(fft3212, fft3213, _mm512_shuffle_f32x4(fft3212, fft3212, 78));
__m512 fft3308 = _mm512_fmadd_ps(fft3300, fft3213, _mm512_shuffle_f32x4(fft3300, fft3300, 78));
__m512 fft3222 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3223 = _mm512_mul_ps(fft3214, fft3222);
__m512 fft3309 = _mm512_mul_ps(fft3301, fft3222);
__m512 fft3224 = _mm512_mul_ps(fft3215, fft3222);
__m512 fft3310 = _mm512_mul_ps(fft3302, fft3222);
__m512 fft3225 = _mm512_mul_ps(fft3216, fft3222);
__m512 fft3311 = _mm512_mul_ps(fft3303, fft3222);
__m512 fft3226 = _mm512_mul_ps(fft3217, fft3222);
__m512 fft3312 = _mm512_mul_ps(fft3304, fft3222);
__m512 fft3227 = _mm512_mul_ps(fft3218, fft3222);
__m512 fft3313 = _mm512_mul_ps(fft3305, fft3222);
__m512 fft3228 = _mm512_mul_ps(fft3219, fft3222);
__m512 fft3314 = _mm512_mul_ps(fft3306, fft3222);
__m512 fft3229 = _mm512_mul_ps(fft3220, fft3222);
__m512 fft3315 = _mm512_mul_ps(fft3307, fft3222);
__m512 fft3230 = _mm512_mul_ps(fft3221, fft3222);
__m512 fft3316 = _mm512_mul_ps(fft3308, fft3222);
__m512 fft3231 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft3232 = _mm512_fmadd_ps(fft3215, fft3231, fft3223);
__m512 fft3317 = _mm512_fmadd_ps(fft3302, fft3231, fft3309);
__m512 fft3233 = _mm512_fnmadd_ps(fft3214, fft3231, fft3224);
__m512 fft3318 = _mm512_fnmadd_ps(fft3301, fft3231, fft3310);
__m512 fft3234 = _mm512_fmadd_ps(fft3217, fft3231, fft3225);
__m512 fft3319 = _mm512_fmadd_ps(fft3304, fft3231, fft3311);
__m512 fft3235 = _mm512_fnmadd_ps(fft3216, fft3231, fft3226);
__m512 fft3320 = _mm512_fnmadd_ps(fft3303, fft3231, fft3312);
__m512 fft3236 = _mm512_fmadd_ps(fft3219, fft3231, fft3227);
__m512 fft3321 = _mm512_fmadd_ps(fft3306, fft3231, fft3313);
__m512 fft3237 = _mm512_fnmadd_ps(fft3218, fft3231, fft3228);
__m512 fft3322 = _mm512_fnmadd_ps(fft3305, fft3231, fft3314);
__m512 fft3238 = _mm512_fmadd_ps(fft3221, fft3231, fft3229);
__m512 fft3323 = _mm512_fmadd_ps(fft3308, fft3231, fft3315);
__m512 fft3239 = _mm512_fnmadd_ps(fft3220, fft3231, fft3230);
__m512 fft3324 = _mm512_fnmadd_ps(fft3307, fft3231, fft3316);
// Two in-register butterfly levels applied to fft3232..fft3239 and, interleaved line by line,
// to the second chain fft3317..fft3324.
//
// Level A: butterfly between adjacent 128-bit lanes.
// _mm512_shuffle_f32x4(x, x, 177) swaps 128-bit lanes 0<->1 and 2<->3. fft3240 is +1 in
// 128-bit lanes 0 and 2 and -1 in lanes 1 and 3, so x*sign + swap(x) leaves (lane0 + lane1)
// in lanes 0/2 and (lane0 - lane1) in lanes 1/3.
__m512 fft3240 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft3241 = _mm512_fmadd_ps(fft3232, fft3240, _mm512_shuffle_f32x4(fft3232, fft3232, 177));
__m512 fft3325 = _mm512_fmadd_ps(fft3317, fft3240, _mm512_shuffle_f32x4(fft3317, fft3317, 177));
__m512 fft3242 = _mm512_fmadd_ps(fft3233, fft3240, _mm512_shuffle_f32x4(fft3233, fft3233, 177));
__m512 fft3326 = _mm512_fmadd_ps(fft3318, fft3240, _mm512_shuffle_f32x4(fft3318, fft3318, 177));
__m512 fft3243 = _mm512_fmadd_ps(fft3234, fft3240, _mm512_shuffle_f32x4(fft3234, fft3234, 177));
__m512 fft3327 = _mm512_fmadd_ps(fft3319, fft3240, _mm512_shuffle_f32x4(fft3319, fft3319, 177));
__m512 fft3244 = _mm512_fmadd_ps(fft3235, fft3240, _mm512_shuffle_f32x4(fft3235, fft3235, 177));
__m512 fft3328 = _mm512_fmadd_ps(fft3320, fft3240, _mm512_shuffle_f32x4(fft3320, fft3320, 177));
__m512 fft3245 = _mm512_fmadd_ps(fft3236, fft3240, _mm512_shuffle_f32x4(fft3236, fft3236, 177));
__m512 fft3329 = _mm512_fmadd_ps(fft3321, fft3240, _mm512_shuffle_f32x4(fft3321, fft3321, 177));
__m512 fft3246 = _mm512_fmadd_ps(fft3237, fft3240, _mm512_shuffle_f32x4(fft3237, fft3237, 177));
__m512 fft3330 = _mm512_fmadd_ps(fft3322, fft3240, _mm512_shuffle_f32x4(fft3322, fft3322, 177));
__m512 fft3247 = _mm512_fmadd_ps(fft3238, fft3240, _mm512_shuffle_f32x4(fft3238, fft3238, 177));
__m512 fft3331 = _mm512_fmadd_ps(fft3323, fft3240, _mm512_shuffle_f32x4(fft3323, fft3323, 177));
__m512 fft3248 = _mm512_fmadd_ps(fft3239, fft3240, _mm512_shuffle_f32x4(fft3239, fft3239, 177));
__m512 fft3332 = _mm512_fmadd_ps(fft3324, fft3240, _mm512_shuffle_f32x4(fft3324, fft3324, 177));
// Masked swap/negate on consecutive pairs (p, q), e.g. (fft3241, fft3242).
// Mask 49344 = 0xC0C0 selects lanes 6, 7, 14, 15. In those lanes only:
//   new p = q   and   new q = 0 - p;   all other lanes pass through unchanged.
// NOTE(review): if (p, q) hold (real, imaginary) parts this is a multiply by -i — confirm.
__m512 fft3249 = _mm512_mask_mov_ps(fft3241, 49344, fft3242);
__m512 fft3333 = _mm512_mask_mov_ps(fft3325, 49344, fft3326);
__m512 fft3250 = _mm512_mask_sub_ps(fft3242, 49344, _mm512_setzero_ps(), fft3241);
__m512 fft3334 = _mm512_mask_sub_ps(fft3326, 49344, _mm512_setzero_ps(), fft3325);
__m512 fft3251 = _mm512_mask_mov_ps(fft3243, 49344, fft3244);
__m512 fft3335 = _mm512_mask_mov_ps(fft3327, 49344, fft3328);
__m512 fft3252 = _mm512_mask_sub_ps(fft3244, 49344, _mm512_setzero_ps(), fft3243);
__m512 fft3336 = _mm512_mask_sub_ps(fft3328, 49344, _mm512_setzero_ps(), fft3327);
__m512 fft3253 = _mm512_mask_mov_ps(fft3245, 49344, fft3246);
__m512 fft3337 = _mm512_mask_mov_ps(fft3329, 49344, fft3330);
__m512 fft3254 = _mm512_mask_sub_ps(fft3246, 49344, _mm512_setzero_ps(), fft3245);
__m512 fft3338 = _mm512_mask_sub_ps(fft3330, 49344, _mm512_setzero_ps(), fft3329);
__m512 fft3255 = _mm512_mask_mov_ps(fft3247, 49344, fft3248);
__m512 fft3339 = _mm512_mask_mov_ps(fft3331, 49344, fft3332);
__m512 fft3256 = _mm512_mask_sub_ps(fft3248, 49344, _mm512_setzero_ps(), fft3247);
__m512 fft3340 = _mm512_mask_sub_ps(fft3332, 49344, _mm512_setzero_ps(), fft3331);
// Level B: butterfly between the two 64-bit halves of every 128-bit lane.
// _mm512_shuffle_ps(x, x, 78) swaps elements {0,1} with {2,3} inside each 128-bit lane.
// fft3257 is +1 on elements 0,1 and -1 on elements 2,3, so elements 0,1 receive
// (lo pair + hi pair) and elements 2,3 receive (lo pair - hi pair).
__m512 fft3257 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft3258 = _mm512_fmadd_ps(fft3249, fft3257, _mm512_shuffle_ps(fft3249, fft3249, 78));
__m512 fft3341 = _mm512_fmadd_ps(fft3333, fft3257, _mm512_shuffle_ps(fft3333, fft3333, 78));
__m512 fft3259 = _mm512_fmadd_ps(fft3250, fft3257, _mm512_shuffle_ps(fft3250, fft3250, 78));
__m512 fft3342 = _mm512_fmadd_ps(fft3334, fft3257, _mm512_shuffle_ps(fft3334, fft3334, 78));
__m512 fft3260 = _mm512_fmadd_ps(fft3251, fft3257, _mm512_shuffle_ps(fft3251, fft3251, 78));
__m512 fft3343 = _mm512_fmadd_ps(fft3335, fft3257, _mm512_shuffle_ps(fft3335, fft3335, 78));
__m512 fft3261 = _mm512_fmadd_ps(fft3252, fft3257, _mm512_shuffle_ps(fft3252, fft3252, 78));
__m512 fft3344 = _mm512_fmadd_ps(fft3336, fft3257, _mm512_shuffle_ps(fft3336, fft3336, 78));
__m512 fft3262 = _mm512_fmadd_ps(fft3253, fft3257, _mm512_shuffle_ps(fft3253, fft3253, 78));
__m512 fft3345 = _mm512_fmadd_ps(fft3337, fft3257, _mm512_shuffle_ps(fft3337, fft3337, 78));
__m512 fft3263 = _mm512_fmadd_ps(fft3254, fft3257, _mm512_shuffle_ps(fft3254, fft3254, 78));
__m512 fft3346 = _mm512_fmadd_ps(fft3338, fft3257, _mm512_shuffle_ps(fft3338, fft3338, 78));
__m512 fft3264 = _mm512_fmadd_ps(fft3255, fft3257, _mm512_shuffle_ps(fft3255, fft3255, 78));
__m512 fft3347 = _mm512_fmadd_ps(fft3339, fft3257, _mm512_shuffle_ps(fft3339, fft3339, 78));
__m512 fft3265 = _mm512_fmadd_ps(fft3256, fft3257, _mm512_shuffle_ps(fft3256, fft3256, 78));
__m512 fft3348 = _mm512_fmadd_ps(fft3340, fft3257, _mm512_shuffle_ps(fft3340, fft3340, 78));
// Post-processing applied only to the first vector pair of each chain
// (fft3258/fft3259 and fft3341/fft3342); the remaining vectors fft3260..fft3265 and
// fft3343..fft3348 are not touched here.
// NOTE(review): this looks like the step that untangles a packed real-input spectrum
// (sum/difference of mirrored bins, then halving) — confirm against the generator.
//
// Two gathers per input vector. Source lane for destination lanes 0..15:
//   fft3266: 2,2,4,4,8,8,12,12, 3,3,5,5,9,9,13,13
//   fft3268: 0,0,6,6,14,14,10,10, 1,1,7,7,15,15,11,11
__m512i fft3266 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft3267 = _mm512_permutexvar_ps(fft3266, fft3258);
__m512 fft3349 = _mm512_permutexvar_ps(fft3266, fft3341);
__m512i fft3268 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft3269 = _mm512_permutexvar_ps(fft3268, fft3258);
__m512 fft3350 = _mm512_permutexvar_ps(fft3268, fft3341);
__m512 fft3270 = _mm512_permutexvar_ps(fft3266, fft3259);
__m512 fft3351 = _mm512_permutexvar_ps(fft3266, fft3342);
__m512 fft3271 = _mm512_permutexvar_ps(fft3268, fft3259);
__m512 fft3352 = _mm512_permutexvar_ps(fft3268, fft3342);
// fft3272: 0 in lanes 0,1,8,9; elsewhere +1 in even lanes and -1 in odd lanes.
//   fft3273 = fft3267 * fft3272 + fft3269
//   fft3274 = fft3270 - fft3271 * fft3272
__m512 fft3272 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft3273 = _mm512_fmadd_ps(fft3267, fft3272, fft3269);
__m512 fft3353 = _mm512_fmadd_ps(fft3349, fft3272, fft3350);
__m512 fft3274 = _mm512_fnmadd_ps(fft3271, fft3272, fft3270);
__m512 fft3354 = _mm512_fnmadd_ps(fft3352, fft3272, fft3351);
// Lane-wise blends (mask bit set => take the last argument, else keep the first):
//   21845 = 0x5555 (even lanes), 43176 = 0xA8A8 (lanes 3,5,7,11,13,15),
//   22102 = 0x5656 (lanes 1,2,4,6,9,10,12,14).
__m512 fft3275 = _mm512_mask_mov_ps(fft3271, 21845, fft3273);
__m512 fft3355 = _mm512_mask_mov_ps(fft3352, 21845, fft3353);
__m512 fft3276 = _mm512_mask_mov_ps(fft3267, 43176, fft3273);
__m512 fft3356 = _mm512_mask_mov_ps(fft3349, 43176, fft3353);
__m512 fft3277 = _mm512_mask_mov_ps(fft3275, 43176, fft3274);
__m512 fft3357 = _mm512_mask_mov_ps(fft3355, 43176, fft3354);
__m512 fft3278 = _mm512_mask_mov_ps(fft3276, 22102, fft3274);
__m512 fft3358 = _mm512_mask_mov_ps(fft3356, 22102, fft3354);
// Multiply by 0.5 in every lane except 0, 1, 8, 9 (mask 64764 = 0xFCFC); those four lanes
// keep their value unscaled.
__m512 fft3279 = _mm512_mask_mul_ps(fft3277, 64764, fft3277, _mm512_set1_ps(5e-01f));
__m512 fft3359 = _mm512_mask_mul_ps(fft3357, 64764, fft3357, _mm512_set1_ps(5e-01f));
__m512 fft3280 = _mm512_mask_mul_ps(fft3278, 64764, fft3278, _mm512_set1_ps(5e-01f));
__m512 fft3360 = _mm512_mask_mul_ps(fft3358, 64764, fft3358, _mm512_set1_ps(5e-01f));
// Write the 16 result vectors of this iteration to dfPtr1.
// df273..df280 alias the first chain's results, df281..df288 the second chain's.
__m512 df273 = fft3279;
__m512 df281 = fft3359;
__m512 df274 = fft3280;
__m512 df282 = fft3360;
__m512 df275 = fft3260;
__m512 df283 = fft3343;
__m512 df276 = fft3261;
__m512 df284 = fft3344;
__m512 df277 = fft3262;
__m512 df285 = fft3345;
__m512 df278 = fft3263;
__m512 df286 = fft3346;
__m512 df279 = fft3264;
__m512 df287 = fft3347;
__m512 df280 = fft3265;
__m512 df288 = fft3348;
// eo20 de-interleaves a vector: even source lanes 0,2,..,14 go to lanes 0..7 and odd source
// lanes 1,3,..,15 go to lanes 8..15. It is applied to every df vector except
// df273/df274/df281/df282, which are stored as-is at the end.
//
// Each vector is then written with two masked stores: mask 255 (0x00FF) writes lanes 0..7
// at the given address, mask 65280 (0xFF00) writes lanes 8..15. Lane 8 sits 32 bytes past
// the store base, so the high half lands at (low-half offset + 407040) when the offsets
// are in bytes, e.g. 508768 + 32 == 101760 + 407040.
// (assumes dfPtr1 is a byte pointer; its declaration is outside this view — TODO confirm)
// i6, j2, k10, m20 and f21 are defined before this view.
__m512i eo20 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df275 = _mm512_permutexvar_ps(eo20, df275);
df276 = _mm512_permutexvar_ps(eo20, df276);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df275);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df276);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df275);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df276);
df283 = _mm512_permutexvar_ps(eo20, df283);
df284 = _mm512_permutexvar_ps(eo20, df284);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df283);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df284);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df283);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df284);
df277 = _mm512_permutexvar_ps(eo20, df277);
df278 = _mm512_permutexvar_ps(eo20, df278);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df277);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df278);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df277);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df278);
df285 = _mm512_permutexvar_ps(eo20, df285);
df286 = _mm512_permutexvar_ps(eo20, df286);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df285);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df286);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df285);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df286);
df279 = _mm512_permutexvar_ps(eo20, df279);
df280 = _mm512_permutexvar_ps(eo20, df280);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df279);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df280);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df279);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df280);
df287 = _mm512_permutexvar_ps(eo20, df287);
df288 = _mm512_permutexvar_ps(eo20, df288);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df287);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df288);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df287);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df288);
// The post-processed vectors are stored without the eo20 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df273);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df274);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df273);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df274);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df281);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df282);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df281);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df282);
// Unrolled iteration b21 = 1: derive the two store sub-indices and load a 16-vector tile.
// m21 = b21 / 2 = 0 and f22 = b21 % 2 = 1; both are used only in the dfPtr1 store offsets
// (128*m21 + 32*f22) further down.
ptrdiff_t b21 = 1;
ptrdiff_t m21 = (size_t)b21/2;
ptrdiff_t f22 = (size_t)b21%2;
// Sixteen zero-masked loads from datPtr1. Mask 127 = 0x7F reads lanes 0..6 from memory and
// zeroes lanes 7..15. Consecutive loads are 896 apart in offset; the trailing `0*b21` term
// contributes nothing.
// NOTE(review): 896 == 224 * 4, which matches one row of 224 float32 values for the
// Width=224 input if datPtr1 is a byte pointer — its declaration is outside this view.
// i6, k10, h9 and w9 come from the enclosing scope.
__m512 dat274 = _mm512_maskz_loadu_ps(127, datPtr1+40+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat275 = _mm512_maskz_loadu_ps(127, datPtr1+936+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat276 = _mm512_maskz_loadu_ps(127, datPtr1+1832+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat277 = _mm512_maskz_loadu_ps(127, datPtr1+2728+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat278 = _mm512_maskz_loadu_ps(127, datPtr1+3624+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat279 = _mm512_maskz_loadu_ps(127, datPtr1+4520+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat280 = _mm512_maskz_loadu_ps(127, datPtr1+5416+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat281 = _mm512_maskz_loadu_ps(127, datPtr1+6312+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat282 = _mm512_maskz_loadu_ps(127, datPtr1+7208+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat283 = _mm512_maskz_loadu_ps(127, datPtr1+8104+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat284 = _mm512_maskz_loadu_ps(127, datPtr1+9000+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat285 = _mm512_maskz_loadu_ps(127, datPtr1+9896+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat286 = _mm512_maskz_loadu_ps(127, datPtr1+10792+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat287 = _mm512_maskz_loadu_ps(127, datPtr1+11688+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat288 = _mm512_maskz_loadu_ps(127, datPtr1+12584+602112*i6+200704*k10+896*h9+4*w9+0*b21);
__m512 dat289 = _mm512_maskz_loadu_ps(127, datPtr1+13480+602112*i6+200704*k10+896*h9+4*w9+0*b21);
// Three levels of element-wise add/sub across the 16 loaded vectors. Two independent
// chains are interleaved line by line:
//   chain 1: dat274, dat276, ..., dat288  -> fft3361..fft3380
//   chain 2: dat275, dat277, ..., dat289  -> fft3449..fft3468
// NOTE(review): the structure looks like an 8-point real-input DFT per chain, taken across
// vectors (every lane processed independently) — confirm against the generator.
//
// Level 1: sum and difference of vectors that are 8 loads apart (dat[k] +/- dat[k+8]).
__m512 fft3361 = _mm512_add_ps(dat274, dat282);
__m512 fft3449 = _mm512_add_ps(dat275, dat283);
__m512 fft3362 = _mm512_sub_ps(dat274, dat282);
__m512 fft3450 = _mm512_sub_ps(dat275, dat283);
__m512 fft3363 = _mm512_add_ps(dat276, dat284);
__m512 fft3451 = _mm512_add_ps(dat277, dat285);
__m512 fft3364 = _mm512_sub_ps(dat276, dat284);
__m512 fft3452 = _mm512_sub_ps(dat277, dat285);
__m512 fft3365 = _mm512_add_ps(dat278, dat286);
__m512 fft3453 = _mm512_add_ps(dat279, dat287);
__m512 fft3366 = _mm512_sub_ps(dat278, dat286);
__m512 fft3454 = _mm512_sub_ps(dat279, dat287);
__m512 fft3367 = _mm512_add_ps(dat280, dat288);
__m512 fft3455 = _mm512_add_ps(dat281, dat289);
__m512 fft3368 = _mm512_sub_ps(dat280, dat288);
__m512 fft3456 = _mm512_sub_ps(dat281, dat289);
// Level 2: combine the level-1 sums with each other and the level-1 differences with each
// other. Note the reversed operand order in fft3372/fft3460 (second sum minus first).
__m512 fft3369 = _mm512_add_ps(fft3361, fft3365);
__m512 fft3457 = _mm512_add_ps(fft3449, fft3453);
__m512 fft3370 = _mm512_sub_ps(fft3361, fft3365);
__m512 fft3458 = _mm512_sub_ps(fft3449, fft3453);
__m512 fft3371 = _mm512_add_ps(fft3363, fft3367);
__m512 fft3459 = _mm512_add_ps(fft3451, fft3455);
__m512 fft3372 = _mm512_sub_ps(fft3367, fft3363);
__m512 fft3460 = _mm512_sub_ps(fft3455, fft3451);
__m512 fft3373 = _mm512_sub_ps(fft3364, fft3368);
__m512 fft3461 = _mm512_sub_ps(fft3452, fft3456);
__m512 fft3374 = _mm512_add_ps(fft3364, fft3368);
__m512 fft3462 = _mm512_add_ps(fft3452, fft3456);
// Level 3: final sum/difference plus the terms scaled by 0.70710677 (~ 1/sqrt(2)):
//   fft3377 =  fft3362 + c*fft3373      fft3378 = -(c*fft3374) - fft3366
//   fft3379 =  fft3362 - c*fft3373      fft3380 =  fft3366 - c*fft3374
// (and the same for chain 2).
__m512 fft3375 = _mm512_add_ps(fft3369, fft3371);
__m512 fft3463 = _mm512_add_ps(fft3457, fft3459);
__m512 fft3376 = _mm512_sub_ps(fft3369, fft3371);
__m512 fft3464 = _mm512_sub_ps(fft3457, fft3459);
__m512 fft3377 = _mm512_fmadd_ps(fft3373, _mm512_set1_ps(7.0710677e-01f), fft3362);
__m512 fft3465 = _mm512_fmadd_ps(fft3461, _mm512_set1_ps(7.0710677e-01f), fft3450);
__m512 fft3378 = _mm512_fnmsub_ps(fft3374, _mm512_set1_ps(7.0710677e-01f), fft3366);
__m512 fft3466 = _mm512_fnmsub_ps(fft3462, _mm512_set1_ps(7.0710677e-01f), fft3454);
__m512 fft3379 = _mm512_fnmadd_ps(fft3373, _mm512_set1_ps(7.0710677e-01f), fft3362);
__m512 fft3467 = _mm512_fnmadd_ps(fft3461, _mm512_set1_ps(7.0710677e-01f), fft3450);
__m512 fft3380 = _mm512_fnmadd_ps(fft3374, _mm512_set1_ps(7.0710677e-01f), fft3366);
__m512 fft3468 = _mm512_fnmadd_ps(fft3462, _mm512_set1_ps(7.0710677e-01f), fft3454);
// Butterfly between the two 256-bit halves of each vector.
// _mm512_shuffle_f32x4(x, x, 78) swaps the low and high 256-bit halves. fft3381 is +1 in
// lanes 0..7 and -1 in lanes 8..15, so lanes 0..7 receive (low + high) and lanes 8..15
// receive (low - high).
__m512 fft3381 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3382 = _mm512_fmadd_ps(fft3375, fft3381, _mm512_shuffle_f32x4(fft3375, fft3375, 78));
__m512 fft3469 = _mm512_fmadd_ps(fft3463, fft3381, _mm512_shuffle_f32x4(fft3463, fft3463, 78));
__m512 fft3383 = _mm512_fmadd_ps(fft3376, fft3381, _mm512_shuffle_f32x4(fft3376, fft3376, 78));
__m512 fft3470 = _mm512_fmadd_ps(fft3464, fft3381, _mm512_shuffle_f32x4(fft3464, fft3464, 78));
__m512 fft3384 = _mm512_fmadd_ps(fft3377, fft3381, _mm512_shuffle_f32x4(fft3377, fft3377, 78));
__m512 fft3471 = _mm512_fmadd_ps(fft3465, fft3381, _mm512_shuffle_f32x4(fft3465, fft3465, 78));
__m512 fft3385 = _mm512_fmadd_ps(fft3378, fft3381, _mm512_shuffle_f32x4(fft3378, fft3378, 78));
__m512 fft3472 = _mm512_fmadd_ps(fft3466, fft3381, _mm512_shuffle_f32x4(fft3466, fft3466, 78));
__m512 fft3386 = _mm512_fmadd_ps(fft3370, fft3381, _mm512_shuffle_f32x4(fft3370, fft3370, 78));
__m512 fft3473 = _mm512_fmadd_ps(fft3458, fft3381, _mm512_shuffle_f32x4(fft3458, fft3458, 78));
__m512 fft3387 = _mm512_fmadd_ps(fft3372, fft3381, _mm512_shuffle_f32x4(fft3372, fft3372, 78));
__m512 fft3474 = _mm512_fmadd_ps(fft3460, fft3381, _mm512_shuffle_f32x4(fft3460, fft3460, 78));
__m512 fft3388 = _mm512_fmadd_ps(fft3379, fft3381, _mm512_shuffle_f32x4(fft3379, fft3379, 78));
__m512 fft3475 = _mm512_fmadd_ps(fft3467, fft3381, _mm512_shuffle_f32x4(fft3467, fft3467, 78));
__m512 fft3389 = _mm512_fmadd_ps(fft3380, fft3381, _mm512_shuffle_f32x4(fft3380, fft3380, 78));
__m512 fft3476 = _mm512_fmadd_ps(fft3468, fft3381, _mm512_shuffle_f32x4(fft3468, fft3468, 78));
// Per-lane rotation of consecutive pairs (a, b) = (fft3382, fft3383), (fft3384, fft3385), ...
// With c = fft3390 and s = fft3399 the two steps below produce
//   a' = a*c + b*s      b' = b*c - a*s
// Lane values (lanes 0..7 | 8,9 | 10,11 | 12,13 | 14,15):
//   c = 1 | 1 | 0.7071 | 0 | -0.7071        s = 0 | 0 | 0.7071 | 1 | 0.7071
// so lanes 0..9 pass through unchanged.
// NOTE(review): if (a, b) are (real, imaginary) parts this is a multiply by (c - i*s),
// i.e. the twiddle factors of the next stage — confirm.
//
// Step 1: the products with c.
__m512 fft3390 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3391 = _mm512_mul_ps(fft3382, fft3390);
__m512 fft3477 = _mm512_mul_ps(fft3469, fft3390);
__m512 fft3392 = _mm512_mul_ps(fft3383, fft3390);
__m512 fft3478 = _mm512_mul_ps(fft3470, fft3390);
__m512 fft3393 = _mm512_mul_ps(fft3384, fft3390);
__m512 fft3479 = _mm512_mul_ps(fft3471, fft3390);
__m512 fft3394 = _mm512_mul_ps(fft3385, fft3390);
__m512 fft3480 = _mm512_mul_ps(fft3472, fft3390);
__m512 fft3395 = _mm512_mul_ps(fft3386, fft3390);
__m512 fft3481 = _mm512_mul_ps(fft3473, fft3390);
__m512 fft3396 = _mm512_mul_ps(fft3387, fft3390);
__m512 fft3482 = _mm512_mul_ps(fft3474, fft3390);
__m512 fft3397 = _mm512_mul_ps(fft3388, fft3390);
__m512 fft3483 = _mm512_mul_ps(fft3475, fft3390);
__m512 fft3398 = _mm512_mul_ps(fft3389, fft3390);
__m512 fft3484 = _mm512_mul_ps(fft3476, fft3390);
// Step 2: add (fmadd) or subtract (fnmadd) the cross products with s.
__m512 fft3399 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft3400 = _mm512_fmadd_ps(fft3383, fft3399, fft3391);
__m512 fft3485 = _mm512_fmadd_ps(fft3470, fft3399, fft3477);
__m512 fft3401 = _mm512_fnmadd_ps(fft3382, fft3399, fft3392);
__m512 fft3486 = _mm512_fnmadd_ps(fft3469, fft3399, fft3478);
__m512 fft3402 = _mm512_fmadd_ps(fft3385, fft3399, fft3393);
__m512 fft3487 = _mm512_fmadd_ps(fft3472, fft3399, fft3479);
__m512 fft3403 = _mm512_fnmadd_ps(fft3384, fft3399, fft3394);
__m512 fft3488 = _mm512_fnmadd_ps(fft3471, fft3399, fft3480);
__m512 fft3404 = _mm512_fmadd_ps(fft3387, fft3399, fft3395);
__m512 fft3489 = _mm512_fmadd_ps(fft3474, fft3399, fft3481);
__m512 fft3405 = _mm512_fnmadd_ps(fft3386, fft3399, fft3396);
__m512 fft3490 = _mm512_fnmadd_ps(fft3473, fft3399, fft3482);
__m512 fft3406 = _mm512_fmadd_ps(fft3389, fft3399, fft3397);
__m512 fft3491 = _mm512_fmadd_ps(fft3476, fft3399, fft3483);
__m512 fft3407 = _mm512_fnmadd_ps(fft3388, fft3399, fft3398);
__m512 fft3492 = _mm512_fnmadd_ps(fft3475, fft3399, fft3484);
// Two further in-register butterfly levels on fft3400..fft3407 (chain 1) and
// fft3485..fft3492 (chain 2).
//
// Level A: butterfly between adjacent 128-bit lanes.
// _mm512_shuffle_f32x4(x, x, 177) swaps 128-bit lanes 0<->1 and 2<->3. fft3408 is +1 in
// 128-bit lanes 0 and 2 and -1 in lanes 1 and 3, so lanes 0/2 receive (lane0 + lane1) and
// lanes 1/3 receive (lane0 - lane1) of each lane pair.
__m512 fft3408 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft3409 = _mm512_fmadd_ps(fft3400, fft3408, _mm512_shuffle_f32x4(fft3400, fft3400, 177));
__m512 fft3493 = _mm512_fmadd_ps(fft3485, fft3408, _mm512_shuffle_f32x4(fft3485, fft3485, 177));
__m512 fft3410 = _mm512_fmadd_ps(fft3401, fft3408, _mm512_shuffle_f32x4(fft3401, fft3401, 177));
__m512 fft3494 = _mm512_fmadd_ps(fft3486, fft3408, _mm512_shuffle_f32x4(fft3486, fft3486, 177));
__m512 fft3411 = _mm512_fmadd_ps(fft3402, fft3408, _mm512_shuffle_f32x4(fft3402, fft3402, 177));
__m512 fft3495 = _mm512_fmadd_ps(fft3487, fft3408, _mm512_shuffle_f32x4(fft3487, fft3487, 177));
__m512 fft3412 = _mm512_fmadd_ps(fft3403, fft3408, _mm512_shuffle_f32x4(fft3403, fft3403, 177));
__m512 fft3496 = _mm512_fmadd_ps(fft3488, fft3408, _mm512_shuffle_f32x4(fft3488, fft3488, 177));
__m512 fft3413 = _mm512_fmadd_ps(fft3404, fft3408, _mm512_shuffle_f32x4(fft3404, fft3404, 177));
__m512 fft3497 = _mm512_fmadd_ps(fft3489, fft3408, _mm512_shuffle_f32x4(fft3489, fft3489, 177));
__m512 fft3414 = _mm512_fmadd_ps(fft3405, fft3408, _mm512_shuffle_f32x4(fft3405, fft3405, 177));
__m512 fft3498 = _mm512_fmadd_ps(fft3490, fft3408, _mm512_shuffle_f32x4(fft3490, fft3490, 177));
__m512 fft3415 = _mm512_fmadd_ps(fft3406, fft3408, _mm512_shuffle_f32x4(fft3406, fft3406, 177));
__m512 fft3499 = _mm512_fmadd_ps(fft3491, fft3408, _mm512_shuffle_f32x4(fft3491, fft3491, 177));
__m512 fft3416 = _mm512_fmadd_ps(fft3407, fft3408, _mm512_shuffle_f32x4(fft3407, fft3407, 177));
__m512 fft3500 = _mm512_fmadd_ps(fft3492, fft3408, _mm512_shuffle_f32x4(fft3492, fft3492, 177));
// Masked swap/negate on consecutive pairs (p, q), e.g. (fft3409, fft3410).
// Mask 49344 = 0xC0C0 selects lanes 6, 7, 14, 15. In those lanes only:
//   new p = q   and   new q = 0 - p;   all other lanes pass through unchanged.
// NOTE(review): if (p, q) hold (real, imaginary) parts this is a multiply by -i — confirm.
__m512 fft3417 = _mm512_mask_mov_ps(fft3409, 49344, fft3410);
__m512 fft3501 = _mm512_mask_mov_ps(fft3493, 49344, fft3494);
__m512 fft3418 = _mm512_mask_sub_ps(fft3410, 49344, _mm512_setzero_ps(), fft3409);
__m512 fft3502 = _mm512_mask_sub_ps(fft3494, 49344, _mm512_setzero_ps(), fft3493);
__m512 fft3419 = _mm512_mask_mov_ps(fft3411, 49344, fft3412);
__m512 fft3503 = _mm512_mask_mov_ps(fft3495, 49344, fft3496);
__m512 fft3420 = _mm512_mask_sub_ps(fft3412, 49344, _mm512_setzero_ps(), fft3411);
__m512 fft3504 = _mm512_mask_sub_ps(fft3496, 49344, _mm512_setzero_ps(), fft3495);
__m512 fft3421 = _mm512_mask_mov_ps(fft3413, 49344, fft3414);
__m512 fft3505 = _mm512_mask_mov_ps(fft3497, 49344, fft3498);
__m512 fft3422 = _mm512_mask_sub_ps(fft3414, 49344, _mm512_setzero_ps(), fft3413);
__m512 fft3506 = _mm512_mask_sub_ps(fft3498, 49344, _mm512_setzero_ps(), fft3497);
__m512 fft3423 = _mm512_mask_mov_ps(fft3415, 49344, fft3416);
__m512 fft3507 = _mm512_mask_mov_ps(fft3499, 49344, fft3500);
__m512 fft3424 = _mm512_mask_sub_ps(fft3416, 49344, _mm512_setzero_ps(), fft3415);
__m512 fft3508 = _mm512_mask_sub_ps(fft3500, 49344, _mm512_setzero_ps(), fft3499);
// Level B: butterfly between the two 64-bit halves of every 128-bit lane.
// _mm512_shuffle_ps(x, x, 78) swaps elements {0,1} with {2,3} inside each 128-bit lane.
// fft3425 is +1 on elements 0,1 and -1 on elements 2,3, so elements 0,1 receive
// (lo pair + hi pair) and elements 2,3 receive (lo pair - hi pair).
__m512 fft3425 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft3426 = _mm512_fmadd_ps(fft3417, fft3425, _mm512_shuffle_ps(fft3417, fft3417, 78));
__m512 fft3509 = _mm512_fmadd_ps(fft3501, fft3425, _mm512_shuffle_ps(fft3501, fft3501, 78));
__m512 fft3427 = _mm512_fmadd_ps(fft3418, fft3425, _mm512_shuffle_ps(fft3418, fft3418, 78));
__m512 fft3510 = _mm512_fmadd_ps(fft3502, fft3425, _mm512_shuffle_ps(fft3502, fft3502, 78));
__m512 fft3428 = _mm512_fmadd_ps(fft3419, fft3425, _mm512_shuffle_ps(fft3419, fft3419, 78));
__m512 fft3511 = _mm512_fmadd_ps(fft3503, fft3425, _mm512_shuffle_ps(fft3503, fft3503, 78));
__m512 fft3429 = _mm512_fmadd_ps(fft3420, fft3425, _mm512_shuffle_ps(fft3420, fft3420, 78));
__m512 fft3512 = _mm512_fmadd_ps(fft3504, fft3425, _mm512_shuffle_ps(fft3504, fft3504, 78));
__m512 fft3430 = _mm512_fmadd_ps(fft3421, fft3425, _mm512_shuffle_ps(fft3421, fft3421, 78));
__m512 fft3513 = _mm512_fmadd_ps(fft3505, fft3425, _mm512_shuffle_ps(fft3505, fft3505, 78));
__m512 fft3431 = _mm512_fmadd_ps(fft3422, fft3425, _mm512_shuffle_ps(fft3422, fft3422, 78));
__m512 fft3514 = _mm512_fmadd_ps(fft3506, fft3425, _mm512_shuffle_ps(fft3506, fft3506, 78));
__m512 fft3432 = _mm512_fmadd_ps(fft3423, fft3425, _mm512_shuffle_ps(fft3423, fft3423, 78));
__m512 fft3515 = _mm512_fmadd_ps(fft3507, fft3425, _mm512_shuffle_ps(fft3507, fft3507, 78));
__m512 fft3433 = _mm512_fmadd_ps(fft3424, fft3425, _mm512_shuffle_ps(fft3424, fft3424, 78));
__m512 fft3516 = _mm512_fmadd_ps(fft3508, fft3425, _mm512_shuffle_ps(fft3508, fft3508, 78));
// Post-processing applied only to the first vector pair of each chain
// (fft3426/fft3427 and fft3509/fft3510); fft3428..fft3433 and fft3511..fft3516 are not
// touched here.
// NOTE(review): this looks like the step that untangles a packed real-input spectrum
// (sum/difference of mirrored bins, then halving) — confirm against the generator.
//
// Two gathers per input vector. Source lane for destination lanes 0..15:
//   fft3434: 2,2,4,4,8,8,12,12, 3,3,5,5,9,9,13,13
//   fft3436: 0,0,6,6,14,14,10,10, 1,1,7,7,15,15,11,11
__m512i fft3434 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft3435 = _mm512_permutexvar_ps(fft3434, fft3426);
__m512 fft3517 = _mm512_permutexvar_ps(fft3434, fft3509);
__m512i fft3436 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft3437 = _mm512_permutexvar_ps(fft3436, fft3426);
__m512 fft3518 = _mm512_permutexvar_ps(fft3436, fft3509);
__m512 fft3438 = _mm512_permutexvar_ps(fft3434, fft3427);
__m512 fft3519 = _mm512_permutexvar_ps(fft3434, fft3510);
__m512 fft3439 = _mm512_permutexvar_ps(fft3436, fft3427);
__m512 fft3520 = _mm512_permutexvar_ps(fft3436, fft3510);
// fft3440: 0 in lanes 0,1,8,9; elsewhere +1 in even lanes and -1 in odd lanes.
//   fft3441 = fft3435 * fft3440 + fft3437
//   fft3442 = fft3438 - fft3439 * fft3440
__m512 fft3440 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft3441 = _mm512_fmadd_ps(fft3435, fft3440, fft3437);
__m512 fft3521 = _mm512_fmadd_ps(fft3517, fft3440, fft3518);
__m512 fft3442 = _mm512_fnmadd_ps(fft3439, fft3440, fft3438);
__m512 fft3522 = _mm512_fnmadd_ps(fft3520, fft3440, fft3519);
// Lane-wise blends (mask bit set => take the last argument, else keep the first):
//   21845 = 0x5555 (even lanes), 43176 = 0xA8A8 (lanes 3,5,7,11,13,15),
//   22102 = 0x5656 (lanes 1,2,4,6,9,10,12,14).
__m512 fft3443 = _mm512_mask_mov_ps(fft3439, 21845, fft3441);
__m512 fft3523 = _mm512_mask_mov_ps(fft3520, 21845, fft3521);
__m512 fft3444 = _mm512_mask_mov_ps(fft3435, 43176, fft3441);
__m512 fft3524 = _mm512_mask_mov_ps(fft3517, 43176, fft3521);
__m512 fft3445 = _mm512_mask_mov_ps(fft3443, 43176, fft3442);
__m512 fft3525 = _mm512_mask_mov_ps(fft3523, 43176, fft3522);
__m512 fft3446 = _mm512_mask_mov_ps(fft3444, 22102, fft3442);
__m512 fft3526 = _mm512_mask_mov_ps(fft3524, 22102, fft3522);
// Multiply by 0.5 in every lane except 0, 1, 8, 9 (mask 64764 = 0xFCFC); those four lanes
// keep their value unscaled.
__m512 fft3447 = _mm512_mask_mul_ps(fft3445, 64764, fft3445, _mm512_set1_ps(5e-01f));
__m512 fft3527 = _mm512_mask_mul_ps(fft3525, 64764, fft3525, _mm512_set1_ps(5e-01f));
__m512 fft3448 = _mm512_mask_mul_ps(fft3446, 64764, fft3446, _mm512_set1_ps(5e-01f));
__m512 fft3528 = _mm512_mask_mul_ps(fft3526, 64764, fft3526, _mm512_set1_ps(5e-01f));
// Write the 16 result vectors of iteration b21 to dfPtr1.
// df289..df296 alias chain 1's results, df297..df304 chain 2's.
__m512 df289 = fft3447;
__m512 df297 = fft3527;
__m512 df290 = fft3448;
__m512 df298 = fft3528;
__m512 df291 = fft3428;
__m512 df299 = fft3511;
__m512 df292 = fft3429;
__m512 df300 = fft3512;
__m512 df293 = fft3430;
__m512 df301 = fft3513;
__m512 df294 = fft3431;
__m512 df302 = fft3514;
__m512 df295 = fft3432;
__m512 df303 = fft3515;
__m512 df296 = fft3433;
__m512 df304 = fft3516;
// eo21 de-interleaves a vector: even source lanes 0,2,..,14 go to lanes 0..7 and odd source
// lanes 1,3,..,15 go to lanes 8..15. It is applied to every df vector except
// df289/df290/df297/df298, which are stored as-is at the end.
//
// Each vector is then written with two masked stores: mask 255 (0x00FF) writes lanes 0..7
// at the given address, mask 65280 (0xFF00) writes lanes 8..15. Lane 8 sits 32 bytes past
// the store base, so the high half lands at (low-half offset + 407040) when the offsets
// are in bytes, e.g. 508768 + 32 == 101760 + 407040.
// (assumes dfPtr1 is a byte pointer; its declaration is outside this view — TODO confirm)
// Here m21 == 0 and f22 == 1, so the per-iteration term 128*m21 + 32*f22 equals 32.
__m512i eo21 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df291 = _mm512_permutexvar_ps(eo21, df291);
df292 = _mm512_permutexvar_ps(eo21, df292);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df291);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df292);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df291);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df292);
df299 = _mm512_permutexvar_ps(eo21, df299);
df300 = _mm512_permutexvar_ps(eo21, df300);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df299);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df300);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df299);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df300);
df293 = _mm512_permutexvar_ps(eo21, df293);
df294 = _mm512_permutexvar_ps(eo21, df294);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df293);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df294);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df293);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df294);
df301 = _mm512_permutexvar_ps(eo21, df301);
df302 = _mm512_permutexvar_ps(eo21, df302);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df301);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df302);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df301);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df302);
df295 = _mm512_permutexvar_ps(eo21, df295);
df296 = _mm512_permutexvar_ps(eo21, df296);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df295);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df296);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df295);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df296);
df303 = _mm512_permutexvar_ps(eo21, df303);
df304 = _mm512_permutexvar_ps(eo21, df304);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df303);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df304);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df303);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df304);
// The post-processed vectors are stored without the eo21 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df289);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df290);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df289);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df290);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df297);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df298);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df297);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df298);
// Unrolled iteration b22 = 2: derive the two sub-indices and load a 16-vector tile.
// m22 = b22 / 2 = 1 and f23 = b22 % 2 = 0; neither is used by the loads below.
// (presumably they feed this iteration's dfPtr1 stores, which lie past this view)
ptrdiff_t b22 = 2;
ptrdiff_t m22 = (size_t)b22/2;
ptrdiff_t f23 = (size_t)b22%2;
// Sixteen zero-masked loads from datPtr1. Mask 65528 = 0xFFF8 reads lanes 3..15 from
// memory and zeroes lanes 0..2. Consecutive loads are 896 apart in offset; the trailing
// `0*b22` term contributes nothing.
// NOTE(review): 896 == 224 * 4, which matches one row of 224 float32 values for the
// Width=224 input if datPtr1 is a byte pointer — its declaration is outside this view.
// i6, k10, h9 and w9 come from the enclosing scope.
__m512 dat290 = _mm512_maskz_loadu_ps(65528, datPtr1+8120+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat291 = _mm512_maskz_loadu_ps(65528, datPtr1+9016+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat292 = _mm512_maskz_loadu_ps(65528, datPtr1+9912+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat293 = _mm512_maskz_loadu_ps(65528, datPtr1+10808+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat294 = _mm512_maskz_loadu_ps(65528, datPtr1+11704+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat295 = _mm512_maskz_loadu_ps(65528, datPtr1+12600+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat296 = _mm512_maskz_loadu_ps(65528, datPtr1+13496+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat297 = _mm512_maskz_loadu_ps(65528, datPtr1+14392+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat298 = _mm512_maskz_loadu_ps(65528, datPtr1+15288+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat299 = _mm512_maskz_loadu_ps(65528, datPtr1+16184+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat300 = _mm512_maskz_loadu_ps(65528, datPtr1+17080+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat301 = _mm512_maskz_loadu_ps(65528, datPtr1+17976+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat302 = _mm512_maskz_loadu_ps(65528, datPtr1+18872+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat303 = _mm512_maskz_loadu_ps(65528, datPtr1+19768+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat304 = _mm512_maskz_loadu_ps(65528, datPtr1+20664+602112*i6+200704*k10+896*h9+4*w9+0*b22);
__m512 dat305 = _mm512_maskz_loadu_ps(65528, datPtr1+21560+602112*i6+200704*k10+896*h9+4*w9+0*b22);
// Three levels of element-wise add/sub across the 16 loaded vectors. Two independent
// chains are interleaved line by line:
//   chain 1: dat290, dat292, ..., dat304  -> fft3529..fft3548
//   chain 2: dat291, dat293, ..., dat305  -> fft3617..fft3636
// NOTE(review): the structure looks like an 8-point real-input DFT per chain, taken across
// vectors (every lane processed independently) — confirm against the generator.
//
// Level 1: sum and difference of vectors that are 8 loads apart (dat[k] +/- dat[k+8]).
__m512 fft3529 = _mm512_add_ps(dat290, dat298);
__m512 fft3617 = _mm512_add_ps(dat291, dat299);
__m512 fft3530 = _mm512_sub_ps(dat290, dat298);
__m512 fft3618 = _mm512_sub_ps(dat291, dat299);
__m512 fft3531 = _mm512_add_ps(dat292, dat300);
__m512 fft3619 = _mm512_add_ps(dat293, dat301);
__m512 fft3532 = _mm512_sub_ps(dat292, dat300);
__m512 fft3620 = _mm512_sub_ps(dat293, dat301);
__m512 fft3533 = _mm512_add_ps(dat294, dat302);
__m512 fft3621 = _mm512_add_ps(dat295, dat303);
__m512 fft3534 = _mm512_sub_ps(dat294, dat302);
__m512 fft3622 = _mm512_sub_ps(dat295, dat303);
__m512 fft3535 = _mm512_add_ps(dat296, dat304);
__m512 fft3623 = _mm512_add_ps(dat297, dat305);
__m512 fft3536 = _mm512_sub_ps(dat296, dat304);
__m512 fft3624 = _mm512_sub_ps(dat297, dat305);
// Level 2: combine the level-1 sums with each other and the level-1 differences with each
// other. Note the reversed operand order in fft3540/fft3628 (second sum minus first).
__m512 fft3537 = _mm512_add_ps(fft3529, fft3533);
__m512 fft3625 = _mm512_add_ps(fft3617, fft3621);
__m512 fft3538 = _mm512_sub_ps(fft3529, fft3533);
__m512 fft3626 = _mm512_sub_ps(fft3617, fft3621);
__m512 fft3539 = _mm512_add_ps(fft3531, fft3535);
__m512 fft3627 = _mm512_add_ps(fft3619, fft3623);
__m512 fft3540 = _mm512_sub_ps(fft3535, fft3531);
__m512 fft3628 = _mm512_sub_ps(fft3623, fft3619);
__m512 fft3541 = _mm512_sub_ps(fft3532, fft3536);
__m512 fft3629 = _mm512_sub_ps(fft3620, fft3624);
__m512 fft3542 = _mm512_add_ps(fft3532, fft3536);
__m512 fft3630 = _mm512_add_ps(fft3620, fft3624);
// Level 3: final sum/difference plus the terms scaled by 0.70710677 (~ 1/sqrt(2)):
//   fft3545 =  fft3530 + c*fft3541      fft3546 = -(c*fft3542) - fft3534
//   fft3547 =  fft3530 - c*fft3541      fft3548 =  fft3534 - c*fft3542
// (and the same for chain 2).
__m512 fft3543 = _mm512_add_ps(fft3537, fft3539);
__m512 fft3631 = _mm512_add_ps(fft3625, fft3627);
__m512 fft3544 = _mm512_sub_ps(fft3537, fft3539);
__m512 fft3632 = _mm512_sub_ps(fft3625, fft3627);
__m512 fft3545 = _mm512_fmadd_ps(fft3541, _mm512_set1_ps(7.0710677e-01f), fft3530);
__m512 fft3633 = _mm512_fmadd_ps(fft3629, _mm512_set1_ps(7.0710677e-01f), fft3618);
__m512 fft3546 = _mm512_fnmsub_ps(fft3542, _mm512_set1_ps(7.0710677e-01f), fft3534);
__m512 fft3634 = _mm512_fnmsub_ps(fft3630, _mm512_set1_ps(7.0710677e-01f), fft3622);
__m512 fft3547 = _mm512_fnmadd_ps(fft3541, _mm512_set1_ps(7.0710677e-01f), fft3530);
__m512 fft3635 = _mm512_fnmadd_ps(fft3629, _mm512_set1_ps(7.0710677e-01f), fft3618);
__m512 fft3548 = _mm512_fnmadd_ps(fft3542, _mm512_set1_ps(7.0710677e-01f), fft3534);
__m512 fft3636 = _mm512_fnmadd_ps(fft3630, _mm512_set1_ps(7.0710677e-01f), fft3622);
// Butterfly between the two 256-bit halves of each vector.
// _mm512_shuffle_f32x4(x, x, 78) swaps the low and high 256-bit halves. fft3549 is +1 in
// lanes 0..7 and -1 in lanes 8..15, so lanes 0..7 receive (low + high) and lanes 8..15
// receive (low - high).
__m512 fft3549 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3550 = _mm512_fmadd_ps(fft3543, fft3549, _mm512_shuffle_f32x4(fft3543, fft3543, 78));
__m512 fft3637 = _mm512_fmadd_ps(fft3631, fft3549, _mm512_shuffle_f32x4(fft3631, fft3631, 78));
__m512 fft3551 = _mm512_fmadd_ps(fft3544, fft3549, _mm512_shuffle_f32x4(fft3544, fft3544, 78));
__m512 fft3638 = _mm512_fmadd_ps(fft3632, fft3549, _mm512_shuffle_f32x4(fft3632, fft3632, 78));
__m512 fft3552 = _mm512_fmadd_ps(fft3545, fft3549, _mm512_shuffle_f32x4(fft3545, fft3545, 78));
__m512 fft3639 = _mm512_fmadd_ps(fft3633, fft3549, _mm512_shuffle_f32x4(fft3633, fft3633, 78));
__m512 fft3553 = _mm512_fmadd_ps(fft3546, fft3549, _mm512_shuffle_f32x4(fft3546, fft3546, 78));
__m512 fft3640 = _mm512_fmadd_ps(fft3634, fft3549, _mm512_shuffle_f32x4(fft3634, fft3634, 78));
__m512 fft3554 = _mm512_fmadd_ps(fft3538, fft3549, _mm512_shuffle_f32x4(fft3538, fft3538, 78));
__m512 fft3641 = _mm512_fmadd_ps(fft3626, fft3549, _mm512_shuffle_f32x4(fft3626, fft3626, 78));
__m512 fft3555 = _mm512_fmadd_ps(fft3540, fft3549, _mm512_shuffle_f32x4(fft3540, fft3540, 78));
__m512 fft3642 = _mm512_fmadd_ps(fft3628, fft3549, _mm512_shuffle_f32x4(fft3628, fft3628, 78));
__m512 fft3556 = _mm512_fmadd_ps(fft3547, fft3549, _mm512_shuffle_f32x4(fft3547, fft3547, 78));
__m512 fft3643 = _mm512_fmadd_ps(fft3635, fft3549, _mm512_shuffle_f32x4(fft3635, fft3635, 78));
__m512 fft3557 = _mm512_fmadd_ps(fft3548, fft3549, _mm512_shuffle_f32x4(fft3548, fft3548, 78));
__m512 fft3644 = _mm512_fmadd_ps(fft3636, fft3549, _mm512_shuffle_f32x4(fft3636, fft3636, 78));
// First half of a per-lane rotation: multiply every vector by the factor vector fft3558.
// Lane values of fft3558 (lanes 0..9 | 10,11 | 12,13 | 14,15): 1 | 0.7071 | 0 | -0.7071.
// The products are completed by the fmadd/fnmadd statements that follow this block.
__m512 fft3558 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3559 = _mm512_mul_ps(fft3550, fft3558);
__m512 fft3645 = _mm512_mul_ps(fft3637, fft3558);
__m512 fft3560 = _mm512_mul_ps(fft3551, fft3558);
__m512 fft3646 = _mm512_mul_ps(fft3638, fft3558);
__m512 fft3561 = _mm512_mul_ps(fft3552, fft3558);
__m512 fft3647 = _mm512_mul_ps(fft3639, fft3558);
__m512 fft3562 = _mm512_mul_ps(fft3553, fft3558);
__m512 fft3648 = _mm512_mul_ps(fft3640, fft3558);
__m512 fft3563 = _mm512_mul_ps(fft3554, fft3558);
__m512 fft3649 = _mm512_mul_ps(fft3641, fft3558);
__m512 fft3564 = _mm512_mul_ps(fft3555, fft3558);
__m512 fft3650 = _mm512_mul_ps(fft3642, fft3558);
__m512 fft3565 = _mm512_mul_ps(fft3556, fft3558);
__m512 fft3651 = _mm512_mul_ps(fft3643, fft3558);
__m512 fft3566 = _mm512_mul_ps(fft3557, fft3558);
__m512 fft3652 = _mm512_mul_ps(fft3644, fft3558);
__m512 fft3567 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft3568 = _mm512_fmadd_ps(fft3551, fft3567, fft3559);
__m512 fft3653 = _mm512_fmadd_ps(fft3638, fft3567, fft3645);
__m512 fft3569 = _mm512_fnmadd_ps(fft3550, fft3567, fft3560);
__m512 fft3654 = _mm512_fnmadd_ps(fft3637, fft3567, fft3646);
__m512 fft3570 = _mm512_fmadd_ps(fft3553, fft3567, fft3561);
__m512 fft3655 = _mm512_fmadd_ps(fft3640, fft3567, fft3647);
__m512 fft3571 = _mm512_fnmadd_ps(fft3552, fft3567, fft3562);
__m512 fft3656 = _mm512_fnmadd_ps(fft3639, fft3567, fft3648);
__m512 fft3572 = _mm512_fmadd_ps(fft3555, fft3567, fft3563);
__m512 fft3657 = _mm512_fmadd_ps(fft3642, fft3567, fft3649);
__m512 fft3573 = _mm512_fnmadd_ps(fft3554, fft3567, fft3564);
__m512 fft3658 = _mm512_fnmadd_ps(fft3641, fft3567, fft3650);
__m512 fft3574 = _mm512_fmadd_ps(fft3557, fft3567, fft3565);
__m512 fft3659 = _mm512_fmadd_ps(fft3644, fft3567, fft3651);
__m512 fft3575 = _mm512_fnmadd_ps(fft3556, fft3567, fft3566);
__m512 fft3660 = _mm512_fnmadd_ps(fft3643, fft3567, fft3652);
__m512 fft3576 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft3577 = _mm512_fmadd_ps(fft3568, fft3576, _mm512_shuffle_f32x4(fft3568, fft3568, 177));
__m512 fft3661 = _mm512_fmadd_ps(fft3653, fft3576, _mm512_shuffle_f32x4(fft3653, fft3653, 177));
__m512 fft3578 = _mm512_fmadd_ps(fft3569, fft3576, _mm512_shuffle_f32x4(fft3569, fft3569, 177));
__m512 fft3662 = _mm512_fmadd_ps(fft3654, fft3576, _mm512_shuffle_f32x4(fft3654, fft3654, 177));
__m512 fft3579 = _mm512_fmadd_ps(fft3570, fft3576, _mm512_shuffle_f32x4(fft3570, fft3570, 177));
__m512 fft3663 = _mm512_fmadd_ps(fft3655, fft3576, _mm512_shuffle_f32x4(fft3655, fft3655, 177));
__m512 fft3580 = _mm512_fmadd_ps(fft3571, fft3576, _mm512_shuffle_f32x4(fft3571, fft3571, 177));
__m512 fft3664 = _mm512_fmadd_ps(fft3656, fft3576, _mm512_shuffle_f32x4(fft3656, fft3656, 177));
__m512 fft3581 = _mm512_fmadd_ps(fft3572, fft3576, _mm512_shuffle_f32x4(fft3572, fft3572, 177));
__m512 fft3665 = _mm512_fmadd_ps(fft3657, fft3576, _mm512_shuffle_f32x4(fft3657, fft3657, 177));
__m512 fft3582 = _mm512_fmadd_ps(fft3573, fft3576, _mm512_shuffle_f32x4(fft3573, fft3573, 177));
__m512 fft3666 = _mm512_fmadd_ps(fft3658, fft3576, _mm512_shuffle_f32x4(fft3658, fft3658, 177));
__m512 fft3583 = _mm512_fmadd_ps(fft3574, fft3576, _mm512_shuffle_f32x4(fft3574, fft3574, 177));
__m512 fft3667 = _mm512_fmadd_ps(fft3659, fft3576, _mm512_shuffle_f32x4(fft3659, fft3659, 177));
__m512 fft3584 = _mm512_fmadd_ps(fft3575, fft3576, _mm512_shuffle_f32x4(fft3575, fft3575, 177));
__m512 fft3668 = _mm512_fmadd_ps(fft3660, fft3576, _mm512_shuffle_f32x4(fft3660, fft3660, 177));
__m512 fft3585 = _mm512_mask_mov_ps(fft3577, 49344, fft3578);
__m512 fft3669 = _mm512_mask_mov_ps(fft3661, 49344, fft3662);
__m512 fft3586 = _mm512_mask_sub_ps(fft3578, 49344, _mm512_setzero_ps(), fft3577);
__m512 fft3670 = _mm512_mask_sub_ps(fft3662, 49344, _mm512_setzero_ps(), fft3661);
__m512 fft3587 = _mm512_mask_mov_ps(fft3579, 49344, fft3580);
__m512 fft3671 = _mm512_mask_mov_ps(fft3663, 49344, fft3664);
__m512 fft3588 = _mm512_mask_sub_ps(fft3580, 49344, _mm512_setzero_ps(), fft3579);
__m512 fft3672 = _mm512_mask_sub_ps(fft3664, 49344, _mm512_setzero_ps(), fft3663);
__m512 fft3589 = _mm512_mask_mov_ps(fft3581, 49344, fft3582);
__m512 fft3673 = _mm512_mask_mov_ps(fft3665, 49344, fft3666);
__m512 fft3590 = _mm512_mask_sub_ps(fft3582, 49344, _mm512_setzero_ps(), fft3581);
__m512 fft3674 = _mm512_mask_sub_ps(fft3666, 49344, _mm512_setzero_ps(), fft3665);
__m512 fft3591 = _mm512_mask_mov_ps(fft3583, 49344, fft3584);
__m512 fft3675 = _mm512_mask_mov_ps(fft3667, 49344, fft3668);
__m512 fft3592 = _mm512_mask_sub_ps(fft3584, 49344, _mm512_setzero_ps(), fft3583);
__m512 fft3676 = _mm512_mask_sub_ps(fft3668, 49344, _mm512_setzero_ps(), fft3667);
__m512 fft3593 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft3594 = _mm512_fmadd_ps(fft3585, fft3593, _mm512_shuffle_ps(fft3585, fft3585, 78));
__m512 fft3677 = _mm512_fmadd_ps(fft3669, fft3593, _mm512_shuffle_ps(fft3669, fft3669, 78));
__m512 fft3595 = _mm512_fmadd_ps(fft3586, fft3593, _mm512_shuffle_ps(fft3586, fft3586, 78));
__m512 fft3678 = _mm512_fmadd_ps(fft3670, fft3593, _mm512_shuffle_ps(fft3670, fft3670, 78));
__m512 fft3596 = _mm512_fmadd_ps(fft3587, fft3593, _mm512_shuffle_ps(fft3587, fft3587, 78));
__m512 fft3679 = _mm512_fmadd_ps(fft3671, fft3593, _mm512_shuffle_ps(fft3671, fft3671, 78));
__m512 fft3597 = _mm512_fmadd_ps(fft3588, fft3593, _mm512_shuffle_ps(fft3588, fft3588, 78));
__m512 fft3680 = _mm512_fmadd_ps(fft3672, fft3593, _mm512_shuffle_ps(fft3672, fft3672, 78));
__m512 fft3598 = _mm512_fmadd_ps(fft3589, fft3593, _mm512_shuffle_ps(fft3589, fft3589, 78));
__m512 fft3681 = _mm512_fmadd_ps(fft3673, fft3593, _mm512_shuffle_ps(fft3673, fft3673, 78));
__m512 fft3599 = _mm512_fmadd_ps(fft3590, fft3593, _mm512_shuffle_ps(fft3590, fft3590, 78));
__m512 fft3682 = _mm512_fmadd_ps(fft3674, fft3593, _mm512_shuffle_ps(fft3674, fft3674, 78));
__m512 fft3600 = _mm512_fmadd_ps(fft3591, fft3593, _mm512_shuffle_ps(fft3591, fft3591, 78));
__m512 fft3683 = _mm512_fmadd_ps(fft3675, fft3593, _mm512_shuffle_ps(fft3675, fft3675, 78));
__m512 fft3601 = _mm512_fmadd_ps(fft3592, fft3593, _mm512_shuffle_ps(fft3592, fft3592, 78));
__m512 fft3684 = _mm512_fmadd_ps(fft3676, fft3593, _mm512_shuffle_ps(fft3676, fft3676, 78));
__m512i fft3602 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft3603 = _mm512_permutexvar_ps(fft3602, fft3594);
__m512 fft3685 = _mm512_permutexvar_ps(fft3602, fft3677);
__m512i fft3604 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft3605 = _mm512_permutexvar_ps(fft3604, fft3594);
__m512 fft3686 = _mm512_permutexvar_ps(fft3604, fft3677);
__m512 fft3606 = _mm512_permutexvar_ps(fft3602, fft3595);
__m512 fft3687 = _mm512_permutexvar_ps(fft3602, fft3678);
__m512 fft3607 = _mm512_permutexvar_ps(fft3604, fft3595);
__m512 fft3688 = _mm512_permutexvar_ps(fft3604, fft3678);
__m512 fft3608 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft3609 = _mm512_fmadd_ps(fft3603, fft3608, fft3605);
__m512 fft3689 = _mm512_fmadd_ps(fft3685, fft3608, fft3686);
__m512 fft3610 = _mm512_fnmadd_ps(fft3607, fft3608, fft3606);
__m512 fft3690 = _mm512_fnmadd_ps(fft3688, fft3608, fft3687);
__m512 fft3611 = _mm512_mask_mov_ps(fft3607, 21845, fft3609);
__m512 fft3691 = _mm512_mask_mov_ps(fft3688, 21845, fft3689);
__m512 fft3612 = _mm512_mask_mov_ps(fft3603, 43176, fft3609);
__m512 fft3692 = _mm512_mask_mov_ps(fft3685, 43176, fft3689);
__m512 fft3613 = _mm512_mask_mov_ps(fft3611, 43176, fft3610);
__m512 fft3693 = _mm512_mask_mov_ps(fft3691, 43176, fft3690);
__m512 fft3614 = _mm512_mask_mov_ps(fft3612, 22102, fft3610);
__m512 fft3694 = _mm512_mask_mov_ps(fft3692, 22102, fft3690);
__m512 fft3615 = _mm512_mask_mul_ps(fft3613, 64764, fft3613, _mm512_set1_ps(5e-01f));
__m512 fft3695 = _mm512_mask_mul_ps(fft3693, 64764, fft3693, _mm512_set1_ps(5e-01f));
__m512 fft3616 = _mm512_mask_mul_ps(fft3614, 64764, fft3614, _mm512_set1_ps(5e-01f));
__m512 fft3696 = _mm512_mask_mul_ps(fft3694, 64764, fft3694, _mm512_set1_ps(5e-01f));
__m512 df305 = fft3615;
__m512 df313 = fft3695;
__m512 df306 = fft3616;
__m512 df314 = fft3696;
__m512 df307 = fft3596;
__m512 df315 = fft3679;
__m512 df308 = fft3597;
__m512 df316 = fft3680;
__m512 df309 = fft3598;
__m512 df317 = fft3681;
__m512 df310 = fft3599;
__m512 df318 = fft3682;
__m512 df311 = fft3600;
__m512 df319 = fft3683;
__m512 df312 = fft3601;
__m512 df320 = fft3684;
__m512i eo22 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df307 = _mm512_permutexvar_ps(eo22, df307);
df308 = _mm512_permutexvar_ps(eo22, df308);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df307);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df308);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df307);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df308);
df315 = _mm512_permutexvar_ps(eo22, df315);
df316 = _mm512_permutexvar_ps(eo22, df316);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df315);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df316);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df315);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df316);
df309 = _mm512_permutexvar_ps(eo22, df309);
df310 = _mm512_permutexvar_ps(eo22, df310);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df309);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df310);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df309);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df310);
df317 = _mm512_permutexvar_ps(eo22, df317);
df318 = _mm512_permutexvar_ps(eo22, df318);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df317);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df318);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df317);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df318);
df311 = _mm512_permutexvar_ps(eo22, df311);
df312 = _mm512_permutexvar_ps(eo22, df312);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df311);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df312);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df311);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df312);
df319 = _mm512_permutexvar_ps(eo22, df319);
df320 = _mm512_permutexvar_ps(eo22, df320);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df319);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df320);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df319);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df320);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df305);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df306);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df305);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df306);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df313);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df314);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df313);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df314);
// Runs three passes (b23 = 3, 4, 5). Each pass loads 16 full vectors from
// datPtr1, pushes them through the fft* arithmetic below as two interleaved
// chains of 8 vectors, and writes the 16 resulting df* vectors to dfPtr1 with
// masked half-vector stores.
// m23 (b23/2) and f24 (b23%2) are used only in the dfPtr1 store offsets
// (128*m23+32*f24); b23 itself shifts the datPtr1 load offset by 40*b23.
// NOTE(review): the "fft" names and the constants used suggest a 16-point
// real-input FFT over the 16 loaded vectors -- confirm against the generator.
for (ptrdiff_t b23 = 3; b23 < 6; ++b23) {
ptrdiff_t m23 = (size_t)b23/2;
ptrdiff_t f24 = (size_t)b23%2;
// 16 loads with an all-ones mask (65535), so all 16 lanes are read each time.
// Consecutive loads are 896 apart (8040, 8936, ..., 21480).
// NOTE(review): 896 == 224*4, so this presumably steps one 224-float row per
// load with datPtr1 being a byte pointer -- confirm against its declaration.
__m512 dat306 = _mm512_maskz_loadu_ps(65535, datPtr1+8040+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat307 = _mm512_maskz_loadu_ps(65535, datPtr1+8936+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat308 = _mm512_maskz_loadu_ps(65535, datPtr1+9832+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat309 = _mm512_maskz_loadu_ps(65535, datPtr1+10728+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat310 = _mm512_maskz_loadu_ps(65535, datPtr1+11624+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat311 = _mm512_maskz_loadu_ps(65535, datPtr1+12520+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat312 = _mm512_maskz_loadu_ps(65535, datPtr1+13416+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat313 = _mm512_maskz_loadu_ps(65535, datPtr1+14312+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat314 = _mm512_maskz_loadu_ps(65535, datPtr1+15208+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat315 = _mm512_maskz_loadu_ps(65535, datPtr1+16104+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat316 = _mm512_maskz_loadu_ps(65535, datPtr1+17000+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat317 = _mm512_maskz_loadu_ps(65535, datPtr1+17896+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat318 = _mm512_maskz_loadu_ps(65535, datPtr1+18792+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat319 = _mm512_maskz_loadu_ps(65535, datPtr1+19688+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat320 = _mm512_maskz_loadu_ps(65535, datPtr1+20584+602112*i6+200704*k10+896*h9+4*w9+40*b23);
__m512 dat321 = _mm512_maskz_loadu_ps(65535, datPtr1+21480+602112*i6+200704*k10+896*h9+4*w9+40*b23);
// Whole-vector add/sub stages. Each loaded vector is paired with the one
// loaded eight positions later (dat306 with dat314, dat307 with dat315, ...).
// dat306/308/310/312 (with 314/316/318/320) feed the chain that starts at
// fft3697; dat307/309/311/313 (with 315/317/319/321) feed the chain that
// starts at fft3785. From here on the two chains alternate line by line and
// apply the same operation to their own operands.
__m512 fft3697 = _mm512_add_ps(dat306, dat314);
__m512 fft3785 = _mm512_add_ps(dat307, dat315);
__m512 fft3698 = _mm512_sub_ps(dat306, dat314);
__m512 fft3786 = _mm512_sub_ps(dat307, dat315);
__m512 fft3699 = _mm512_add_ps(dat308, dat316);
__m512 fft3787 = _mm512_add_ps(dat309, dat317);
__m512 fft3700 = _mm512_sub_ps(dat308, dat316);
__m512 fft3788 = _mm512_sub_ps(dat309, dat317);
__m512 fft3701 = _mm512_add_ps(dat310, dat318);
__m512 fft3789 = _mm512_add_ps(dat311, dat319);
__m512 fft3702 = _mm512_sub_ps(dat310, dat318);
__m512 fft3790 = _mm512_sub_ps(dat311, dat319);
__m512 fft3703 = _mm512_add_ps(dat312, dat320);
__m512 fft3791 = _mm512_add_ps(dat313, dat321);
__m512 fft3704 = _mm512_sub_ps(dat312, dat320);
__m512 fft3792 = _mm512_sub_ps(dat313, dat321);
__m512 fft3705 = _mm512_add_ps(fft3697, fft3701);
__m512 fft3793 = _mm512_add_ps(fft3785, fft3789);
__m512 fft3706 = _mm512_sub_ps(fft3697, fft3701);
__m512 fft3794 = _mm512_sub_ps(fft3785, fft3789);
__m512 fft3707 = _mm512_add_ps(fft3699, fft3703);
__m512 fft3795 = _mm512_add_ps(fft3787, fft3791);
// Note the reversed operand order here (fft3703 - fft3699, fft3791 - fft3787).
__m512 fft3708 = _mm512_sub_ps(fft3703, fft3699);
__m512 fft3796 = _mm512_sub_ps(fft3791, fft3787);
__m512 fft3709 = _mm512_sub_ps(fft3700, fft3704);
__m512 fft3797 = _mm512_sub_ps(fft3788, fft3792);
__m512 fft3710 = _mm512_add_ps(fft3700, fft3704);
__m512 fft3798 = _mm512_add_ps(fft3788, fft3792);
__m512 fft3711 = _mm512_add_ps(fft3705, fft3707);
__m512 fft3799 = _mm512_add_ps(fft3793, fft3795);
__m512 fft3712 = _mm512_sub_ps(fft3705, fft3707);
__m512 fft3800 = _mm512_sub_ps(fft3793, fft3795);
// 7.0710677e-01f is approximately sqrt(1/2). fmadd gives a*c+b, fnmadd gives
// -(a*c)+b and fnmsub gives -(a*c)-b.
__m512 fft3713 = _mm512_fmadd_ps(fft3709, _mm512_set1_ps(7.0710677e-01f), fft3698);
__m512 fft3801 = _mm512_fmadd_ps(fft3797, _mm512_set1_ps(7.0710677e-01f), fft3786);
__m512 fft3714 = _mm512_fnmsub_ps(fft3710, _mm512_set1_ps(7.0710677e-01f), fft3702);
__m512 fft3802 = _mm512_fnmsub_ps(fft3798, _mm512_set1_ps(7.0710677e-01f), fft3790);
__m512 fft3715 = _mm512_fnmadd_ps(fft3709, _mm512_set1_ps(7.0710677e-01f), fft3698);
__m512 fft3803 = _mm512_fnmadd_ps(fft3797, _mm512_set1_ps(7.0710677e-01f), fft3786);
__m512 fft3716 = _mm512_fnmadd_ps(fft3710, _mm512_set1_ps(7.0710677e-01f), fft3702);
__m512 fft3804 = _mm512_fnmadd_ps(fft3798, _mm512_set1_ps(7.0710677e-01f), fft3790);
// In-register butterfly across the two 256-bit halves. fft3717 is +1 in
// elements 0-7 and -1 in elements 8-15 (_mm512_set_ps lists element 15 first).
// _mm512_shuffle_f32x4(x, x, 78) swaps the halves of x, so each fmadd yields
// lo+hi in elements 0-7 and lo-hi in elements 8-15.
__m512 fft3717 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3718 = _mm512_fmadd_ps(fft3711, fft3717, _mm512_shuffle_f32x4(fft3711, fft3711, 78));
__m512 fft3805 = _mm512_fmadd_ps(fft3799, fft3717, _mm512_shuffle_f32x4(fft3799, fft3799, 78));
__m512 fft3719 = _mm512_fmadd_ps(fft3712, fft3717, _mm512_shuffle_f32x4(fft3712, fft3712, 78));
__m512 fft3806 = _mm512_fmadd_ps(fft3800, fft3717, _mm512_shuffle_f32x4(fft3800, fft3800, 78));
__m512 fft3720 = _mm512_fmadd_ps(fft3713, fft3717, _mm512_shuffle_f32x4(fft3713, fft3713, 78));
__m512 fft3807 = _mm512_fmadd_ps(fft3801, fft3717, _mm512_shuffle_f32x4(fft3801, fft3801, 78));
__m512 fft3721 = _mm512_fmadd_ps(fft3714, fft3717, _mm512_shuffle_f32x4(fft3714, fft3714, 78));
__m512 fft3808 = _mm512_fmadd_ps(fft3802, fft3717, _mm512_shuffle_f32x4(fft3802, fft3802, 78));
__m512 fft3722 = _mm512_fmadd_ps(fft3706, fft3717, _mm512_shuffle_f32x4(fft3706, fft3706, 78));
__m512 fft3809 = _mm512_fmadd_ps(fft3794, fft3717, _mm512_shuffle_f32x4(fft3794, fft3794, 78));
__m512 fft3723 = _mm512_fmadd_ps(fft3708, fft3717, _mm512_shuffle_f32x4(fft3708, fft3708, 78));
__m512 fft3810 = _mm512_fmadd_ps(fft3796, fft3717, _mm512_shuffle_f32x4(fft3796, fft3796, 78));
__m512 fft3724 = _mm512_fmadd_ps(fft3715, fft3717, _mm512_shuffle_f32x4(fft3715, fft3715, 78));
__m512 fft3811 = _mm512_fmadd_ps(fft3803, fft3717, _mm512_shuffle_f32x4(fft3803, fft3803, 78));
__m512 fft3725 = _mm512_fmadd_ps(fft3716, fft3717, _mm512_shuffle_f32x4(fft3716, fft3716, 78));
__m512 fft3812 = _mm512_fmadd_ps(fft3804, fft3717, _mm512_shuffle_f32x4(fft3804, fft3804, 78));
// Per-lane constant multiply on consecutive vector pairs (a, b), e.g.
// (fft3718, fft3719): first result = a*fft3726 + b*fft3735, second result =
// b*fft3726 - a*fft3735. Elements 0-9 use the constant pair (1, 0); elements
// 10-15 use (0.7071, 0.7071), (0, 1) and (-0.7071, 0.7071), two lanes each.
// NOTE(review): presumably a complex twiddle multiply with a/b as the real and
// imaginary parts -- confirm.
__m512 fft3726 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3727 = _mm512_mul_ps(fft3718, fft3726);
__m512 fft3813 = _mm512_mul_ps(fft3805, fft3726);
__m512 fft3728 = _mm512_mul_ps(fft3719, fft3726);
__m512 fft3814 = _mm512_mul_ps(fft3806, fft3726);
__m512 fft3729 = _mm512_mul_ps(fft3720, fft3726);
__m512 fft3815 = _mm512_mul_ps(fft3807, fft3726);
__m512 fft3730 = _mm512_mul_ps(fft3721, fft3726);
__m512 fft3816 = _mm512_mul_ps(fft3808, fft3726);
__m512 fft3731 = _mm512_mul_ps(fft3722, fft3726);
__m512 fft3817 = _mm512_mul_ps(fft3809, fft3726);
__m512 fft3732 = _mm512_mul_ps(fft3723, fft3726);
__m512 fft3818 = _mm512_mul_ps(fft3810, fft3726);
__m512 fft3733 = _mm512_mul_ps(fft3724, fft3726);
__m512 fft3819 = _mm512_mul_ps(fft3811, fft3726);
__m512 fft3734 = _mm512_mul_ps(fft3725, fft3726);
__m512 fft3820 = _mm512_mul_ps(fft3812, fft3726);
__m512 fft3735 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft3736 = _mm512_fmadd_ps(fft3719, fft3735, fft3727);
__m512 fft3821 = _mm512_fmadd_ps(fft3806, fft3735, fft3813);
__m512 fft3737 = _mm512_fnmadd_ps(fft3718, fft3735, fft3728);
__m512 fft3822 = _mm512_fnmadd_ps(fft3805, fft3735, fft3814);
__m512 fft3738 = _mm512_fmadd_ps(fft3721, fft3735, fft3729);
__m512 fft3823 = _mm512_fmadd_ps(fft3808, fft3735, fft3815);
__m512 fft3739 = _mm512_fnmadd_ps(fft3720, fft3735, fft3730);
__m512 fft3824 = _mm512_fnmadd_ps(fft3807, fft3735, fft3816);
__m512 fft3740 = _mm512_fmadd_ps(fft3723, fft3735, fft3731);
__m512 fft3825 = _mm512_fmadd_ps(fft3810, fft3735, fft3817);
__m512 fft3741 = _mm512_fnmadd_ps(fft3722, fft3735, fft3732);
__m512 fft3826 = _mm512_fnmadd_ps(fft3809, fft3735, fft3818);
__m512 fft3742 = _mm512_fmadd_ps(fft3725, fft3735, fft3733);
__m512 fft3827 = _mm512_fmadd_ps(fft3812, fft3735, fft3819);
__m512 fft3743 = _mm512_fnmadd_ps(fft3724, fft3735, fft3734);
__m512 fft3828 = _mm512_fnmadd_ps(fft3811, fft3735, fft3820);
// Butterfly between adjacent 128-bit lanes. _mm512_shuffle_f32x4(x, x, 177)
// swaps lane 0 with lane 1 and lane 2 with lane 3; fft3744 is +1 in elements
// 0-3 and 8-11 and -1 in elements 4-7 and 12-15.
__m512 fft3744 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft3745 = _mm512_fmadd_ps(fft3736, fft3744, _mm512_shuffle_f32x4(fft3736, fft3736, 177));
__m512 fft3829 = _mm512_fmadd_ps(fft3821, fft3744, _mm512_shuffle_f32x4(fft3821, fft3821, 177));
__m512 fft3746 = _mm512_fmadd_ps(fft3737, fft3744, _mm512_shuffle_f32x4(fft3737, fft3737, 177));
__m512 fft3830 = _mm512_fmadd_ps(fft3822, fft3744, _mm512_shuffle_f32x4(fft3822, fft3822, 177));
__m512 fft3747 = _mm512_fmadd_ps(fft3738, fft3744, _mm512_shuffle_f32x4(fft3738, fft3738, 177));
__m512 fft3831 = _mm512_fmadd_ps(fft3823, fft3744, _mm512_shuffle_f32x4(fft3823, fft3823, 177));
__m512 fft3748 = _mm512_fmadd_ps(fft3739, fft3744, _mm512_shuffle_f32x4(fft3739, fft3739, 177));
__m512 fft3832 = _mm512_fmadd_ps(fft3824, fft3744, _mm512_shuffle_f32x4(fft3824, fft3824, 177));
__m512 fft3749 = _mm512_fmadd_ps(fft3740, fft3744, _mm512_shuffle_f32x4(fft3740, fft3740, 177));
__m512 fft3833 = _mm512_fmadd_ps(fft3825, fft3744, _mm512_shuffle_f32x4(fft3825, fft3825, 177));
__m512 fft3750 = _mm512_fmadd_ps(fft3741, fft3744, _mm512_shuffle_f32x4(fft3741, fft3741, 177));
__m512 fft3834 = _mm512_fmadd_ps(fft3826, fft3744, _mm512_shuffle_f32x4(fft3826, fft3826, 177));
__m512 fft3751 = _mm512_fmadd_ps(fft3742, fft3744, _mm512_shuffle_f32x4(fft3742, fft3742, 177));
__m512 fft3835 = _mm512_fmadd_ps(fft3827, fft3744, _mm512_shuffle_f32x4(fft3827, fft3827, 177));
__m512 fft3752 = _mm512_fmadd_ps(fft3743, fft3744, _mm512_shuffle_f32x4(fft3743, fft3743, 177));
__m512 fft3836 = _mm512_fmadd_ps(fft3828, fft3744, _mm512_shuffle_f32x4(fft3828, fft3828, 177));
// Mask 49344 (0xC0C0) selects elements 6, 7, 14 and 15. In those elements each
// pair (a, b) becomes (b, 0 - a); all other elements are carried over unchanged.
__m512 fft3753 = _mm512_mask_mov_ps(fft3745, 49344, fft3746);
__m512 fft3837 = _mm512_mask_mov_ps(fft3829, 49344, fft3830);
__m512 fft3754 = _mm512_mask_sub_ps(fft3746, 49344, _mm512_setzero_ps(), fft3745);
__m512 fft3838 = _mm512_mask_sub_ps(fft3830, 49344, _mm512_setzero_ps(), fft3829);
__m512 fft3755 = _mm512_mask_mov_ps(fft3747, 49344, fft3748);
__m512 fft3839 = _mm512_mask_mov_ps(fft3831, 49344, fft3832);
__m512 fft3756 = _mm512_mask_sub_ps(fft3748, 49344, _mm512_setzero_ps(), fft3747);
__m512 fft3840 = _mm512_mask_sub_ps(fft3832, 49344, _mm512_setzero_ps(), fft3831);
__m512 fft3757 = _mm512_mask_mov_ps(fft3749, 49344, fft3750);
__m512 fft3841 = _mm512_mask_mov_ps(fft3833, 49344, fft3834);
__m512 fft3758 = _mm512_mask_sub_ps(fft3750, 49344, _mm512_setzero_ps(), fft3749);
__m512 fft3842 = _mm512_mask_sub_ps(fft3834, 49344, _mm512_setzero_ps(), fft3833);
__m512 fft3759 = _mm512_mask_mov_ps(fft3751, 49344, fft3752);
__m512 fft3843 = _mm512_mask_mov_ps(fft3835, 49344, fft3836);
__m512 fft3760 = _mm512_mask_sub_ps(fft3752, 49344, _mm512_setzero_ps(), fft3751);
__m512 fft3844 = _mm512_mask_sub_ps(fft3836, 49344, _mm512_setzero_ps(), fft3835);
// Butterfly inside each 128-bit lane. _mm512_shuffle_ps(x, x, 78) swaps the
// lower and upper element pairs of every lane; fft3761 is +1 on the lower pair
// and -1 on the upper pair.
__m512 fft3761 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft3762 = _mm512_fmadd_ps(fft3753, fft3761, _mm512_shuffle_ps(fft3753, fft3753, 78));
__m512 fft3845 = _mm512_fmadd_ps(fft3837, fft3761, _mm512_shuffle_ps(fft3837, fft3837, 78));
__m512 fft3763 = _mm512_fmadd_ps(fft3754, fft3761, _mm512_shuffle_ps(fft3754, fft3754, 78));
__m512 fft3846 = _mm512_fmadd_ps(fft3838, fft3761, _mm512_shuffle_ps(fft3838, fft3838, 78));
__m512 fft3764 = _mm512_fmadd_ps(fft3755, fft3761, _mm512_shuffle_ps(fft3755, fft3755, 78));
__m512 fft3847 = _mm512_fmadd_ps(fft3839, fft3761, _mm512_shuffle_ps(fft3839, fft3839, 78));
__m512 fft3765 = _mm512_fmadd_ps(fft3756, fft3761, _mm512_shuffle_ps(fft3756, fft3756, 78));
__m512 fft3848 = _mm512_fmadd_ps(fft3840, fft3761, _mm512_shuffle_ps(fft3840, fft3840, 78));
__m512 fft3766 = _mm512_fmadd_ps(fft3757, fft3761, _mm512_shuffle_ps(fft3757, fft3757, 78));
__m512 fft3849 = _mm512_fmadd_ps(fft3841, fft3761, _mm512_shuffle_ps(fft3841, fft3841, 78));
__m512 fft3767 = _mm512_fmadd_ps(fft3758, fft3761, _mm512_shuffle_ps(fft3758, fft3758, 78));
__m512 fft3850 = _mm512_fmadd_ps(fft3842, fft3761, _mm512_shuffle_ps(fft3842, fft3842, 78));
__m512 fft3768 = _mm512_fmadd_ps(fft3759, fft3761, _mm512_shuffle_ps(fft3759, fft3759, 78));
__m512 fft3851 = _mm512_fmadd_ps(fft3843, fft3761, _mm512_shuffle_ps(fft3843, fft3843, 78));
__m512 fft3769 = _mm512_fmadd_ps(fft3760, fft3761, _mm512_shuffle_ps(fft3760, fft3760, 78));
__m512 fft3852 = _mm512_fmadd_ps(fft3844, fft3761, _mm512_shuffle_ps(fft3844, fft3844, 78));
// Extra step applied only to the first vector pair of each chain
// (fft3762/fft3763 and fft3845/fft3846): two index permutations, one
// fmadd/fnmadd with fft3776, a series of masked merges, then a masked multiply
// by 0.5. Mask 64764 (0xFCFC) covers every element except 0, 1, 8 and 9, so
// those four elements are not halved.
// NOTE(review): presumably the step that separates/recombines spectra of the
// packed real transform -- confirm against the generator.
__m512i fft3770 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft3771 = _mm512_permutexvar_ps(fft3770, fft3762);
__m512 fft3853 = _mm512_permutexvar_ps(fft3770, fft3845);
__m512i fft3772 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft3773 = _mm512_permutexvar_ps(fft3772, fft3762);
__m512 fft3854 = _mm512_permutexvar_ps(fft3772, fft3845);
__m512 fft3774 = _mm512_permutexvar_ps(fft3770, fft3763);
__m512 fft3855 = _mm512_permutexvar_ps(fft3770, fft3846);
__m512 fft3775 = _mm512_permutexvar_ps(fft3772, fft3763);
__m512 fft3856 = _mm512_permutexvar_ps(fft3772, fft3846);
__m512 fft3776 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft3777 = _mm512_fmadd_ps(fft3771, fft3776, fft3773);
__m512 fft3857 = _mm512_fmadd_ps(fft3853, fft3776, fft3854);
__m512 fft3778 = _mm512_fnmadd_ps(fft3775, fft3776, fft3774);
__m512 fft3858 = _mm512_fnmadd_ps(fft3856, fft3776, fft3855);
// Merge masks: 21845 = 0x5555 (even elements), 43176 = 0xA8A8, 22102 = 0x5656.
__m512 fft3779 = _mm512_mask_mov_ps(fft3775, 21845, fft3777);
__m512 fft3859 = _mm512_mask_mov_ps(fft3856, 21845, fft3857);
__m512 fft3780 = _mm512_mask_mov_ps(fft3771, 43176, fft3777);
__m512 fft3860 = _mm512_mask_mov_ps(fft3853, 43176, fft3857);
__m512 fft3781 = _mm512_mask_mov_ps(fft3779, 43176, fft3778);
__m512 fft3861 = _mm512_mask_mov_ps(fft3859, 43176, fft3858);
__m512 fft3782 = _mm512_mask_mov_ps(fft3780, 22102, fft3778);
__m512 fft3862 = _mm512_mask_mov_ps(fft3860, 22102, fft3858);
__m512 fft3783 = _mm512_mask_mul_ps(fft3781, 64764, fft3781, _mm512_set1_ps(5e-01f));
__m512 fft3863 = _mm512_mask_mul_ps(fft3861, 64764, fft3861, _mm512_set1_ps(5e-01f));
__m512 fft3784 = _mm512_mask_mul_ps(fft3782, 64764, fft3782, _mm512_set1_ps(5e-01f));
__m512 fft3864 = _mm512_mask_mul_ps(fft3862, 64764, fft3862, _mm512_set1_ps(5e-01f));
// Name the 16 outputs: df321-df328 come from the first chain and df329-df336
// from the second. df321/df322 and df329/df330 are the specially processed
// pairs from the step above.
__m512 df321 = fft3783;
__m512 df329 = fft3863;
__m512 df322 = fft3784;
__m512 df330 = fft3864;
__m512 df323 = fft3764;
__m512 df331 = fft3847;
__m512 df324 = fft3765;
__m512 df332 = fft3848;
__m512 df325 = fft3766;
__m512 df333 = fft3849;
__m512 df326 = fft3767;
__m512 df334 = fft3850;
__m512 df327 = fft3768;
__m512 df335 = fft3851;
__m512 df328 = fft3769;
__m512 df336 = fft3852;
// eo23 moves the even-indexed elements into elements 0-7 and the odd-indexed
// elements into elements 8-15. It is applied to df323-df328 and df331-df336,
// but not to df321/df322/df329/df330.
// Each vector is then written with two masked stores: mask 255 writes elements
// 0-7 and mask 65280 writes elements 8-15. The mask-65280 address is always
// 407008 above the matching mask-255 address.
// NOTE(review): if dfPtr1 is a byte pointer, elements 8-15 start 32 bytes past
// the given address, so the upper half lands exactly 407040 beyond the lower
// half's destination -- confirm the pointer type.
__m512i eo23 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df323 = _mm512_permutexvar_ps(eo23, df323);
df324 = _mm512_permutexvar_ps(eo23, df324);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df323);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df324);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df323);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df324);
// Second-chain outputs go to offsets 814080 above their first-chain counterparts.
df331 = _mm512_permutexvar_ps(eo23, df331);
df332 = _mm512_permutexvar_ps(eo23, df332);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df331);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df332);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df331);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df332);
df325 = _mm512_permutexvar_ps(eo23, df325);
df326 = _mm512_permutexvar_ps(eo23, df326);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df325);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df326);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df325);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df326);
df333 = _mm512_permutexvar_ps(eo23, df333);
df334 = _mm512_permutexvar_ps(eo23, df334);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df333);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df334);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df333);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df334);
df327 = _mm512_permutexvar_ps(eo23, df327);
df328 = _mm512_permutexvar_ps(eo23, df328);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df327);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df328);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df327);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df328);
df335 = _mm512_permutexvar_ps(eo23, df335);
df336 = _mm512_permutexvar_ps(eo23, df336);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df335);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df336);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df335);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df336);
// The specially processed pairs are stored last, without the eo23 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df321);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df322);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df321);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df322);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df329);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df330);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df329);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df330);
}
}
if (j2 >= last1) return;
++j2;
rel2 = 12;
}
// Tile run executed while rel2 < 15: for every j2 step and each of three k11
// values, six tiles of 16 rows x 16 floats are loaded from datPtr1, pushed
// through the fft* butterfly network below, and scattered into dfPtr1.
// NOTE(review): base2, s1, i6, j2, last1, rel2, datPtr1 and dfPtr1 are defined
// outside this excerpt. The offsets read like byte offsets (4*w10, 896 per row),
// which presumably means both pointers are char* -- confirm at their declarations.
// The "fft" naming suggests a 2-D real FFT of each tile -- TODO confirm.
if (rel2 < 15) {
// Row index of the first of the 16 rows loaded (multiplied by 896 in every address).
ptrdiff_t h10 = base2+30;
// Column offset (multiplied by 4 in every address); advances by 60 per j2 step.
ptrdiff_t w10 = -680+60*rel2;
// Last j2 value this loop may process, so at most 15-rel2 iterations run.
ptrdiff_t jj5 = 14-rel2+j2;
// j2 itself is incremented at the bottom of the body, after the last1 check.
for (; j2 <= jj5; w10 += 60) {
// Three consecutive k11 values starting at 3*s1 (stride 200704 on the input
// side, 384 on the output side).
ptrdiff_t k11 = 3*s1;
ptrdiff_t kk10 = k11+2;
for (; k11 <= kk10; ++k11) {
// Six tiles per (j2, k11). Consecutive tiles start 40 address units apart in
// the input; (m24, f25) = (b24/2, b24%2) select the output slot.
for (ptrdiff_t b24 = 0; b24 < 6; ++b24) {
ptrdiff_t m24 = (size_t)b24/2;
ptrdiff_t f25 = (size_t)b24%2;
// Load 16 rows, 896 apart, with all 16 lanes enabled (mask 65535).
__m512 dat322 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat323 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat324 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat325 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat326 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat327 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat328 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat329 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat330 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat331 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat332 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat333 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat334 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat335 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat336 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k11+896*h10+4*w10+40*b24);
__m512 dat337 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k11+896*h10+4*w10+40*b24);
// First butterfly level down the rows: row r is paired with row r+8.
// Even rows feed the fft3865.. chain, odd rows feed the fft3953.. chain;
// the two chains stay independent through the rest of the network.
__m512 fft3865 = _mm512_add_ps(dat322, dat330);
__m512 fft3953 = _mm512_add_ps(dat323, dat331);
__m512 fft3866 = _mm512_sub_ps(dat322, dat330);
__m512 fft3954 = _mm512_sub_ps(dat323, dat331);
__m512 fft3867 = _mm512_add_ps(dat324, dat332);
__m512 fft3955 = _mm512_add_ps(dat325, dat333);
__m512 fft3868 = _mm512_sub_ps(dat324, dat332);
__m512 fft3956 = _mm512_sub_ps(dat325, dat333);
__m512 fft3869 = _mm512_add_ps(dat326, dat334);
__m512 fft3957 = _mm512_add_ps(dat327, dat335);
__m512 fft3870 = _mm512_sub_ps(dat326, dat334);
__m512 fft3958 = _mm512_sub_ps(dat327, dat335);
__m512 fft3871 = _mm512_add_ps(dat328, dat336);
__m512 fft3959 = _mm512_add_ps(dat329, dat337);
__m512 fft3872 = _mm512_sub_ps(dat328, dat336);
__m512 fft3960 = _mm512_sub_ps(dat329, dat337);
// Second butterfly level within each chain.
__m512 fft3873 = _mm512_add_ps(fft3865, fft3869);
__m512 fft3961 = _mm512_add_ps(fft3953, fft3957);
__m512 fft3874 = _mm512_sub_ps(fft3865, fft3869);
__m512 fft3962 = _mm512_sub_ps(fft3953, fft3957);
__m512 fft3875 = _mm512_add_ps(fft3867, fft3871);
__m512 fft3963 = _mm512_add_ps(fft3955, fft3959);
__m512 fft3876 = _mm512_sub_ps(fft3871, fft3867);
__m512 fft3964 = _mm512_sub_ps(fft3959, fft3955);
__m512 fft3877 = _mm512_sub_ps(fft3868, fft3872);
__m512 fft3965 = _mm512_sub_ps(fft3956, fft3960);
__m512 fft3878 = _mm512_add_ps(fft3868, fft3872);
__m512 fft3966 = _mm512_add_ps(fft3956, fft3960);
// Third level; 7.0710677e-01f is approximately 1/sqrt(2).
__m512 fft3879 = _mm512_add_ps(fft3873, fft3875);
__m512 fft3967 = _mm512_add_ps(fft3961, fft3963);
__m512 fft3880 = _mm512_sub_ps(fft3873, fft3875);
__m512 fft3968 = _mm512_sub_ps(fft3961, fft3963);
__m512 fft3881 = _mm512_fmadd_ps(fft3877, _mm512_set1_ps(7.0710677e-01f), fft3866);
__m512 fft3969 = _mm512_fmadd_ps(fft3965, _mm512_set1_ps(7.0710677e-01f), fft3954);
__m512 fft3882 = _mm512_fnmsub_ps(fft3878, _mm512_set1_ps(7.0710677e-01f), fft3870);
__m512 fft3970 = _mm512_fnmsub_ps(fft3966, _mm512_set1_ps(7.0710677e-01f), fft3958);
__m512 fft3883 = _mm512_fnmadd_ps(fft3877, _mm512_set1_ps(7.0710677e-01f), fft3866);
__m512 fft3971 = _mm512_fnmadd_ps(fft3965, _mm512_set1_ps(7.0710677e-01f), fft3954);
__m512 fft3884 = _mm512_fnmadd_ps(fft3878, _mm512_set1_ps(7.0710677e-01f), fft3870);
__m512 fft3972 = _mm512_fnmadd_ps(fft3966, _mm512_set1_ps(7.0710677e-01f), fft3958);
// Butterfly between the two 256-bit halves of each register:
// shuffle_f32x4(x, x, 78) swaps the halves, and the +1/-1 vector makes
// lanes 0-7 hold lo+hi and lanes 8-15 hold lo-hi.
__m512 fft3885 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3886 = _mm512_fmadd_ps(fft3879, fft3885, _mm512_shuffle_f32x4(fft3879, fft3879, 78));
__m512 fft3973 = _mm512_fmadd_ps(fft3967, fft3885, _mm512_shuffle_f32x4(fft3967, fft3967, 78));
__m512 fft3887 = _mm512_fmadd_ps(fft3880, fft3885, _mm512_shuffle_f32x4(fft3880, fft3880, 78));
__m512 fft3974 = _mm512_fmadd_ps(fft3968, fft3885, _mm512_shuffle_f32x4(fft3968, fft3968, 78));
__m512 fft3888 = _mm512_fmadd_ps(fft3881, fft3885, _mm512_shuffle_f32x4(fft3881, fft3881, 78));
__m512 fft3975 = _mm512_fmadd_ps(fft3969, fft3885, _mm512_shuffle_f32x4(fft3969, fft3969, 78));
__m512 fft3889 = _mm512_fmadd_ps(fft3882, fft3885, _mm512_shuffle_f32x4(fft3882, fft3882, 78));
__m512 fft3976 = _mm512_fmadd_ps(fft3970, fft3885, _mm512_shuffle_f32x4(fft3970, fft3970, 78));
__m512 fft3890 = _mm512_fmadd_ps(fft3874, fft3885, _mm512_shuffle_f32x4(fft3874, fft3874, 78));
__m512 fft3977 = _mm512_fmadd_ps(fft3962, fft3885, _mm512_shuffle_f32x4(fft3962, fft3962, 78));
__m512 fft3891 = _mm512_fmadd_ps(fft3876, fft3885, _mm512_shuffle_f32x4(fft3876, fft3876, 78));
__m512 fft3978 = _mm512_fmadd_ps(fft3964, fft3885, _mm512_shuffle_f32x4(fft3964, fft3964, 78));
__m512 fft3892 = _mm512_fmadd_ps(fft3883, fft3885, _mm512_shuffle_f32x4(fft3883, fft3883, 78));
__m512 fft3979 = _mm512_fmadd_ps(fft3971, fft3885, _mm512_shuffle_f32x4(fft3971, fft3971, 78));
__m512 fft3893 = _mm512_fmadd_ps(fft3884, fft3885, _mm512_shuffle_f32x4(fft3884, fft3884, 78));
__m512 fft3980 = _mm512_fmadd_ps(fft3972, fft3885, _mm512_shuffle_f32x4(fft3972, fft3972, 78));
// Per-lane weights for the next pass. Lanes 0-9 use weight 1 here and
// cross-weight 0 in fft3903 below; only lanes 10-15 are mixed between
// the paired registers.
__m512 fft3894 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3895 = _mm512_mul_ps(fft3886, fft3894);
__m512 fft3981 = _mm512_mul_ps(fft3973, fft3894);
__m512 fft3896 = _mm512_mul_ps(fft3887, fft3894);
__m512 fft3982 = _mm512_mul_ps(fft3974, fft3894);
__m512 fft3897 = _mm512_mul_ps(fft3888, fft3894);
__m512 fft3983 = _mm512_mul_ps(fft3975, fft3894);
__m512 fft3898 = _mm512_mul_ps(fft3889, fft3894);
__m512 fft3984 = _mm512_mul_ps(fft3976, fft3894);
__m512 fft3899 = _mm512_mul_ps(fft3890, fft3894);
__m512 fft3985 = _mm512_mul_ps(fft3977, fft3894);
__m512 fft3900 = _mm512_mul_ps(fft3891, fft3894);
__m512 fft3986 = _mm512_mul_ps(fft3978, fft3894);
__m512 fft3901 = _mm512_mul_ps(fft3892, fft3894);
__m512 fft3987 = _mm512_mul_ps(fft3979, fft3894);
__m512 fft3902 = _mm512_mul_ps(fft3893, fft3894);
__m512 fft3988 = _mm512_mul_ps(fft3980, fft3894);
// Cross terms: each register of a pair receives +/- its partner times fft3903.
__m512 fft3903 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft3904 = _mm512_fmadd_ps(fft3887, fft3903, fft3895);
__m512 fft3989 = _mm512_fmadd_ps(fft3974, fft3903, fft3981);
__m512 fft3905 = _mm512_fnmadd_ps(fft3886, fft3903, fft3896);
__m512 fft3990 = _mm512_fnmadd_ps(fft3973, fft3903, fft3982);
__m512 fft3906 = _mm512_fmadd_ps(fft3889, fft3903, fft3897);
__m512 fft3991 = _mm512_fmadd_ps(fft3976, fft3903, fft3983);
__m512 fft3907 = _mm512_fnmadd_ps(fft3888, fft3903, fft3898);
__m512 fft3992 = _mm512_fnmadd_ps(fft3975, fft3903, fft3984);
__m512 fft3908 = _mm512_fmadd_ps(fft3891, fft3903, fft3899);
__m512 fft3993 = _mm512_fmadd_ps(fft3978, fft3903, fft3985);
__m512 fft3909 = _mm512_fnmadd_ps(fft3890, fft3903, fft3900);
__m512 fft3994 = _mm512_fnmadd_ps(fft3977, fft3903, fft3986);
__m512 fft3910 = _mm512_fmadd_ps(fft3893, fft3903, fft3901);
__m512 fft3995 = _mm512_fmadd_ps(fft3980, fft3903, fft3987);
__m512 fft3911 = _mm512_fnmadd_ps(fft3892, fft3903, fft3902);
__m512 fft3996 = _mm512_fnmadd_ps(fft3979, fft3903, fft3988);
// Butterfly between adjacent 128-bit lanes: shuffle_f32x4(x, x, 177)
// swaps lane 0 with 1 and lane 2 with 3.
__m512 fft3912 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft3913 = _mm512_fmadd_ps(fft3904, fft3912, _mm512_shuffle_f32x4(fft3904, fft3904, 177));
__m512 fft3997 = _mm512_fmadd_ps(fft3989, fft3912, _mm512_shuffle_f32x4(fft3989, fft3989, 177));
__m512 fft3914 = _mm512_fmadd_ps(fft3905, fft3912, _mm512_shuffle_f32x4(fft3905, fft3905, 177));
__m512 fft3998 = _mm512_fmadd_ps(fft3990, fft3912, _mm512_shuffle_f32x4(fft3990, fft3990, 177));
__m512 fft3915 = _mm512_fmadd_ps(fft3906, fft3912, _mm512_shuffle_f32x4(fft3906, fft3906, 177));
__m512 fft3999 = _mm512_fmadd_ps(fft3991, fft3912, _mm512_shuffle_f32x4(fft3991, fft3991, 177));
__m512 fft3916 = _mm512_fmadd_ps(fft3907, fft3912, _mm512_shuffle_f32x4(fft3907, fft3907, 177));
__m512 fft4000 = _mm512_fmadd_ps(fft3992, fft3912, _mm512_shuffle_f32x4(fft3992, fft3992, 177));
__m512 fft3917 = _mm512_fmadd_ps(fft3908, fft3912, _mm512_shuffle_f32x4(fft3908, fft3908, 177));
__m512 fft4001 = _mm512_fmadd_ps(fft3993, fft3912, _mm512_shuffle_f32x4(fft3993, fft3993, 177));
__m512 fft3918 = _mm512_fmadd_ps(fft3909, fft3912, _mm512_shuffle_f32x4(fft3909, fft3909, 177));
__m512 fft4002 = _mm512_fmadd_ps(fft3994, fft3912, _mm512_shuffle_f32x4(fft3994, fft3994, 177));
__m512 fft3919 = _mm512_fmadd_ps(fft3910, fft3912, _mm512_shuffle_f32x4(fft3910, fft3910, 177));
__m512 fft4003 = _mm512_fmadd_ps(fft3995, fft3912, _mm512_shuffle_f32x4(fft3995, fft3995, 177));
__m512 fft3920 = _mm512_fmadd_ps(fft3911, fft3912, _mm512_shuffle_f32x4(fft3911, fft3911, 177));
__m512 fft4004 = _mm512_fmadd_ps(fft3996, fft3912, _mm512_shuffle_f32x4(fft3996, fft3996, 177));
// Mask 49344 (0xC0C0) selects elements 6, 7, 14 and 15 only. Those four
// elements are exchanged between the paired registers; the value moved
// into the second register of each pair is negated (0 - x).
__m512 fft3921 = _mm512_mask_mov_ps(fft3913, 49344, fft3914);
__m512 fft4005 = _mm512_mask_mov_ps(fft3997, 49344, fft3998);
__m512 fft3922 = _mm512_mask_sub_ps(fft3914, 49344, _mm512_setzero_ps(), fft3913);
__m512 fft4006 = _mm512_mask_sub_ps(fft3998, 49344, _mm512_setzero_ps(), fft3997);
__m512 fft3923 = _mm512_mask_mov_ps(fft3915, 49344, fft3916);
__m512 fft4007 = _mm512_mask_mov_ps(fft3999, 49344, fft4000);
__m512 fft3924 = _mm512_mask_sub_ps(fft3916, 49344, _mm512_setzero_ps(), fft3915);
__m512 fft4008 = _mm512_mask_sub_ps(fft4000, 49344, _mm512_setzero_ps(), fft3999);
__m512 fft3925 = _mm512_mask_mov_ps(fft3917, 49344, fft3918);
__m512 fft4009 = _mm512_mask_mov_ps(fft4001, 49344, fft4002);
__m512 fft3926 = _mm512_mask_sub_ps(fft3918, 49344, _mm512_setzero_ps(), fft3917);
__m512 fft4010 = _mm512_mask_sub_ps(fft4002, 49344, _mm512_setzero_ps(), fft4001);
__m512 fft3927 = _mm512_mask_mov_ps(fft3919, 49344, fft3920);
__m512 fft4011 = _mm512_mask_mov_ps(fft4003, 49344, fft4004);
__m512 fft3928 = _mm512_mask_sub_ps(fft3920, 49344, _mm512_setzero_ps(), fft3919);
__m512 fft4012 = _mm512_mask_sub_ps(fft4004, 49344, _mm512_setzero_ps(), fft4003);
// Butterfly between the two element pairs inside every 128-bit lane:
// shuffle_ps(x, x, 78) swaps elements 0,1 with elements 2,3.
__m512 fft3929 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft3930 = _mm512_fmadd_ps(fft3921, fft3929, _mm512_shuffle_ps(fft3921, fft3921, 78));
__m512 fft4013 = _mm512_fmadd_ps(fft4005, fft3929, _mm512_shuffle_ps(fft4005, fft4005, 78));
__m512 fft3931 = _mm512_fmadd_ps(fft3922, fft3929, _mm512_shuffle_ps(fft3922, fft3922, 78));
__m512 fft4014 = _mm512_fmadd_ps(fft4006, fft3929, _mm512_shuffle_ps(fft4006, fft4006, 78));
__m512 fft3932 = _mm512_fmadd_ps(fft3923, fft3929, _mm512_shuffle_ps(fft3923, fft3923, 78));
__m512 fft4015 = _mm512_fmadd_ps(fft4007, fft3929, _mm512_shuffle_ps(fft4007, fft4007, 78));
__m512 fft3933 = _mm512_fmadd_ps(fft3924, fft3929, _mm512_shuffle_ps(fft3924, fft3924, 78));
__m512 fft4016 = _mm512_fmadd_ps(fft4008, fft3929, _mm512_shuffle_ps(fft4008, fft4008, 78));
__m512 fft3934 = _mm512_fmadd_ps(fft3925, fft3929, _mm512_shuffle_ps(fft3925, fft3925, 78));
__m512 fft4017 = _mm512_fmadd_ps(fft4009, fft3929, _mm512_shuffle_ps(fft4009, fft4009, 78));
__m512 fft3935 = _mm512_fmadd_ps(fft3926, fft3929, _mm512_shuffle_ps(fft3926, fft3926, 78));
__m512 fft4018 = _mm512_fmadd_ps(fft4010, fft3929, _mm512_shuffle_ps(fft4010, fft4010, 78));
__m512 fft3936 = _mm512_fmadd_ps(fft3927, fft3929, _mm512_shuffle_ps(fft3927, fft3927, 78));
__m512 fft4019 = _mm512_fmadd_ps(fft4011, fft3929, _mm512_shuffle_ps(fft4011, fft4011, 78));
__m512 fft3937 = _mm512_fmadd_ps(fft3928, fft3929, _mm512_shuffle_ps(fft3928, fft3928, 78));
__m512 fft4020 = _mm512_fmadd_ps(fft4012, fft3929, _mm512_shuffle_ps(fft4012, fft4012, 78));
// Extra permute/blend/halve step applied only to the first register pair
// of each chain (fft3930/fft3931 and fft4013/fft4014). The other registers
// go to the stores unchanged from the previous stage.
__m512i fft3938 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft3939 = _mm512_permutexvar_ps(fft3938, fft3930);
__m512 fft4021 = _mm512_permutexvar_ps(fft3938, fft4013);
__m512i fft3940 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft3941 = _mm512_permutexvar_ps(fft3940, fft3930);
__m512 fft4022 = _mm512_permutexvar_ps(fft3940, fft4013);
__m512 fft3942 = _mm512_permutexvar_ps(fft3938, fft3931);
__m512 fft4023 = _mm512_permutexvar_ps(fft3938, fft4014);
__m512 fft3943 = _mm512_permutexvar_ps(fft3940, fft3931);
__m512 fft4024 = _mm512_permutexvar_ps(fft3940, fft4014);
__m512 fft3944 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft3945 = _mm512_fmadd_ps(fft3939, fft3944, fft3941);
__m512 fft4025 = _mm512_fmadd_ps(fft4021, fft3944, fft4022);
__m512 fft3946 = _mm512_fnmadd_ps(fft3943, fft3944, fft3942);
__m512 fft4026 = _mm512_fnmadd_ps(fft4024, fft3944, fft4023);
// Blend masks: 21845 = 0x5555, 43176 = 0xA8A8, 22102 = 0x5656.
__m512 fft3947 = _mm512_mask_mov_ps(fft3943, 21845, fft3945);
__m512 fft4027 = _mm512_mask_mov_ps(fft4024, 21845, fft4025);
__m512 fft3948 = _mm512_mask_mov_ps(fft3939, 43176, fft3945);
__m512 fft4028 = _mm512_mask_mov_ps(fft4021, 43176, fft4025);
__m512 fft3949 = _mm512_mask_mov_ps(fft3947, 43176, fft3946);
__m512 fft4029 = _mm512_mask_mov_ps(fft4027, 43176, fft4026);
__m512 fft3950 = _mm512_mask_mov_ps(fft3948, 22102, fft3946);
__m512 fft4030 = _mm512_mask_mov_ps(fft4028, 22102, fft4026);
// Mask 64764 (0xFCFC): every element except 0, 1, 8 and 9 is multiplied by 0.5.
__m512 fft3951 = _mm512_mask_mul_ps(fft3949, 64764, fft3949, _mm512_set1_ps(5e-01f));
__m512 fft4031 = _mm512_mask_mul_ps(fft4029, 64764, fft4029, _mm512_set1_ps(5e-01f));
__m512 fft3952 = _mm512_mask_mul_ps(fft3950, 64764, fft3950, _mm512_set1_ps(5e-01f));
__m512 fft4032 = _mm512_mask_mul_ps(fft4030, 64764, fft4030, _mm512_set1_ps(5e-01f));
// Rename the 16 results to df337..df352 for the store phase
// (df337..df344 from the first chain, df345..df352 from the second).
__m512 df337 = fft3951;
__m512 df345 = fft4031;
__m512 df338 = fft3952;
__m512 df346 = fft4032;
__m512 df339 = fft3932;
__m512 df347 = fft4015;
__m512 df340 = fft3933;
__m512 df348 = fft4016;
__m512 df341 = fft3934;
__m512 df349 = fft4017;
__m512 df342 = fft3935;
__m512 df350 = fft4018;
__m512 df343 = fft3936;
__m512 df351 = fft4019;
__m512 df344 = fft3937;
__m512 df352 = fft4020;
// eo24 gathers even-indexed elements into lanes 0-7 and odd-indexed
// elements into lanes 8-15.
__m512i eo24 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Every register is written as two 8-lane halves: mask 255 stores lanes 0-7,
// mask 65280 stores lanes 8-15. If dfPtr1 is byte-addressed (TODO confirm),
// the 65280 bases sit 32 lower so lanes 8-15 land exactly 407040 past the
// matching lanes 0-7.
df339 = _mm512_permutexvar_ps(eo24, df339);
df340 = _mm512_permutexvar_ps(eo24, df340);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df339);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df340);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df339);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df340);
df347 = _mm512_permutexvar_ps(eo24, df347);
df348 = _mm512_permutexvar_ps(eo24, df348);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df347);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df348);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df347);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df348);
df341 = _mm512_permutexvar_ps(eo24, df341);
df342 = _mm512_permutexvar_ps(eo24, df342);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df341);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df342);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df341);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df342);
df349 = _mm512_permutexvar_ps(eo24, df349);
df350 = _mm512_permutexvar_ps(eo24, df350);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df349);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df350);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df349);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df350);
df343 = _mm512_permutexvar_ps(eo24, df343);
df344 = _mm512_permutexvar_ps(eo24, df344);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df343);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df344);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df343);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df344);
df351 = _mm512_permutexvar_ps(eo24, df351);
df352 = _mm512_permutexvar_ps(eo24, df352);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df351);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df352);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df351);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df352);
// df337/df338 and df345/df346 already went through the fft3938/fft3940
// permutes above, so they are stored without the eo24 reordering.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df337);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df338);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df337);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df338);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df345);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df346);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df345);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df346);
}
}
// Leave the enclosing function once j2 has reached last1; otherwise advance
// to the next j2 (the for-header only advances w10).
if (j2 >= last1) return;
++j2;
}
rel2 = 15;
}
ptrdiff_t h11 = base2+30;
ptrdiff_t w11 = 220;
ptrdiff_t k12 = 3*s1;
ptrdiff_t kk11 = k12+2;
for (; k12 <= kk11; ++k12) {
ptrdiff_t b25 = 0;
ptrdiff_t m25 = (size_t)b25/2;
ptrdiff_t f26 = (size_t)b25%2;
__m512 dat338 = _mm512_maskz_loadu_ps(127, datPtr1+0+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat339 = _mm512_maskz_loadu_ps(127, datPtr1+896+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat340 = _mm512_maskz_loadu_ps(127, datPtr1+1792+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat341 = _mm512_maskz_loadu_ps(127, datPtr1+2688+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat342 = _mm512_maskz_loadu_ps(127, datPtr1+3584+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat343 = _mm512_maskz_loadu_ps(127, datPtr1+4480+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat344 = _mm512_maskz_loadu_ps(127, datPtr1+5376+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat345 = _mm512_maskz_loadu_ps(127, datPtr1+6272+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat346 = _mm512_maskz_loadu_ps(127, datPtr1+7168+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat347 = _mm512_maskz_loadu_ps(127, datPtr1+8064+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat348 = _mm512_maskz_loadu_ps(127, datPtr1+8960+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat349 = _mm512_maskz_loadu_ps(127, datPtr1+9856+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat350 = _mm512_maskz_loadu_ps(127, datPtr1+10752+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat351 = _mm512_maskz_loadu_ps(127, datPtr1+11648+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat352 = _mm512_maskz_loadu_ps(127, datPtr1+12544+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 dat353 = _mm512_maskz_loadu_ps(127, datPtr1+13440+602112*i6+200704*k12+896*h11+4*w11+0*b25);
__m512 fft4033 = _mm512_add_ps(dat338, dat346);
__m512 fft4121 = _mm512_add_ps(dat339, dat347);
__m512 fft4034 = _mm512_sub_ps(dat338, dat346);
__m512 fft4122 = _mm512_sub_ps(dat339, dat347);
__m512 fft4035 = _mm512_add_ps(dat340, dat348);
__m512 fft4123 = _mm512_add_ps(dat341, dat349);
__m512 fft4036 = _mm512_sub_ps(dat340, dat348);
__m512 fft4124 = _mm512_sub_ps(dat341, dat349);
__m512 fft4037 = _mm512_add_ps(dat342, dat350);
__m512 fft4125 = _mm512_add_ps(dat343, dat351);
__m512 fft4038 = _mm512_sub_ps(dat342, dat350);
__m512 fft4126 = _mm512_sub_ps(dat343, dat351);
__m512 fft4039 = _mm512_add_ps(dat344, dat352);
__m512 fft4127 = _mm512_add_ps(dat345, dat353);
__m512 fft4040 = _mm512_sub_ps(dat344, dat352);
__m512 fft4128 = _mm512_sub_ps(dat345, dat353);
__m512 fft4041 = _mm512_add_ps(fft4033, fft4037);
__m512 fft4129 = _mm512_add_ps(fft4121, fft4125);
__m512 fft4042 = _mm512_sub_ps(fft4033, fft4037);
__m512 fft4130 = _mm512_sub_ps(fft4121, fft4125);
__m512 fft4043 = _mm512_add_ps(fft4035, fft4039);
__m512 fft4131 = _mm512_add_ps(fft4123, fft4127);
__m512 fft4044 = _mm512_sub_ps(fft4039, fft4035);
__m512 fft4132 = _mm512_sub_ps(fft4127, fft4123);
__m512 fft4045 = _mm512_sub_ps(fft4036, fft4040);
__m512 fft4133 = _mm512_sub_ps(fft4124, fft4128);
__m512 fft4046 = _mm512_add_ps(fft4036, fft4040);
__m512 fft4134 = _mm512_add_ps(fft4124, fft4128);
__m512 fft4047 = _mm512_add_ps(fft4041, fft4043);
__m512 fft4135 = _mm512_add_ps(fft4129, fft4131);
__m512 fft4048 = _mm512_sub_ps(fft4041, fft4043);
__m512 fft4136 = _mm512_sub_ps(fft4129, fft4131);
__m512 fft4049 = _mm512_fmadd_ps(fft4045, _mm512_set1_ps(7.0710677e-01f), fft4034);
__m512 fft4137 = _mm512_fmadd_ps(fft4133, _mm512_set1_ps(7.0710677e-01f), fft4122);
__m512 fft4050 = _mm512_fnmsub_ps(fft4046, _mm512_set1_ps(7.0710677e-01f), fft4038);
__m512 fft4138 = _mm512_fnmsub_ps(fft4134, _mm512_set1_ps(7.0710677e-01f), fft4126);
__m512 fft4051 = _mm512_fnmadd_ps(fft4045, _mm512_set1_ps(7.0710677e-01f), fft4034);
__m512 fft4139 = _mm512_fnmadd_ps(fft4133, _mm512_set1_ps(7.0710677e-01f), fft4122);
__m512 fft4052 = _mm512_fnmadd_ps(fft4046, _mm512_set1_ps(7.0710677e-01f), fft4038);
__m512 fft4140 = _mm512_fnmadd_ps(fft4134, _mm512_set1_ps(7.0710677e-01f), fft4126);
__m512 fft4053 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4054 = _mm512_fmadd_ps(fft4047, fft4053, _mm512_shuffle_f32x4(fft4047, fft4047, 78));
__m512 fft4141 = _mm512_fmadd_ps(fft4135, fft4053, _mm512_shuffle_f32x4(fft4135, fft4135, 78));
__m512 fft4055 = _mm512_fmadd_ps(fft4048, fft4053, _mm512_shuffle_f32x4(fft4048, fft4048, 78));
__m512 fft4142 = _mm512_fmadd_ps(fft4136, fft4053, _mm512_shuffle_f32x4(fft4136, fft4136, 78));
__m512 fft4056 = _mm512_fmadd_ps(fft4049, fft4053, _mm512_shuffle_f32x4(fft4049, fft4049, 78));
__m512 fft4143 = _mm512_fmadd_ps(fft4137, fft4053, _mm512_shuffle_f32x4(fft4137, fft4137, 78));
__m512 fft4057 = _mm512_fmadd_ps(fft4050, fft4053, _mm512_shuffle_f32x4(fft4050, fft4050, 78));
__m512 fft4144 = _mm512_fmadd_ps(fft4138, fft4053, _mm512_shuffle_f32x4(fft4138, fft4138, 78));
__m512 fft4058 = _mm512_fmadd_ps(fft4042, fft4053, _mm512_shuffle_f32x4(fft4042, fft4042, 78));
__m512 fft4145 = _mm512_fmadd_ps(fft4130, fft4053, _mm512_shuffle_f32x4(fft4130, fft4130, 78));
__m512 fft4059 = _mm512_fmadd_ps(fft4044, fft4053, _mm512_shuffle_f32x4(fft4044, fft4044, 78));
__m512 fft4146 = _mm512_fmadd_ps(fft4132, fft4053, _mm512_shuffle_f32x4(fft4132, fft4132, 78));
__m512 fft4060 = _mm512_fmadd_ps(fft4051, fft4053, _mm512_shuffle_f32x4(fft4051, fft4051, 78));
__m512 fft4147 = _mm512_fmadd_ps(fft4139, fft4053, _mm512_shuffle_f32x4(fft4139, fft4139, 78));
__m512 fft4061 = _mm512_fmadd_ps(fft4052, fft4053, _mm512_shuffle_f32x4(fft4052, fft4052, 78));
__m512 fft4148 = _mm512_fmadd_ps(fft4140, fft4053, _mm512_shuffle_f32x4(fft4140, fft4140, 78));
__m512 fft4062 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4063 = _mm512_mul_ps(fft4054, fft4062);
__m512 fft4149 = _mm512_mul_ps(fft4141, fft4062);
__m512 fft4064 = _mm512_mul_ps(fft4055, fft4062);
__m512 fft4150 = _mm512_mul_ps(fft4142, fft4062);
__m512 fft4065 = _mm512_mul_ps(fft4056, fft4062);
__m512 fft4151 = _mm512_mul_ps(fft4143, fft4062);
__m512 fft4066 = _mm512_mul_ps(fft4057, fft4062);
__m512 fft4152 = _mm512_mul_ps(fft4144, fft4062);
__m512 fft4067 = _mm512_mul_ps(fft4058, fft4062);
__m512 fft4153 = _mm512_mul_ps(fft4145, fft4062);
__m512 fft4068 = _mm512_mul_ps(fft4059, fft4062);
__m512 fft4154 = _mm512_mul_ps(fft4146, fft4062);
__m512 fft4069 = _mm512_mul_ps(fft4060, fft4062);
__m512 fft4155 = _mm512_mul_ps(fft4147, fft4062);
__m512 fft4070 = _mm512_mul_ps(fft4061, fft4062);
__m512 fft4156 = _mm512_mul_ps(fft4148, fft4062);
__m512 fft4071 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft4072 = _mm512_fmadd_ps(fft4055, fft4071, fft4063);
__m512 fft4157 = _mm512_fmadd_ps(fft4142, fft4071, fft4149);
__m512 fft4073 = _mm512_fnmadd_ps(fft4054, fft4071, fft4064);
__m512 fft4158 = _mm512_fnmadd_ps(fft4141, fft4071, fft4150);
__m512 fft4074 = _mm512_fmadd_ps(fft4057, fft4071, fft4065);
__m512 fft4159 = _mm512_fmadd_ps(fft4144, fft4071, fft4151);
__m512 fft4075 = _mm512_fnmadd_ps(fft4056, fft4071, fft4066);
__m512 fft4160 = _mm512_fnmadd_ps(fft4143, fft4071, fft4152);
__m512 fft4076 = _mm512_fmadd_ps(fft4059, fft4071, fft4067);
__m512 fft4161 = _mm512_fmadd_ps(fft4146, fft4071, fft4153);
__m512 fft4077 = _mm512_fnmadd_ps(fft4058, fft4071, fft4068);
__m512 fft4162 = _mm512_fnmadd_ps(fft4145, fft4071, fft4154);
__m512 fft4078 = _mm512_fmadd_ps(fft4061, fft4071, fft4069);
__m512 fft4163 = _mm512_fmadd_ps(fft4148, fft4071, fft4155);
__m512 fft4079 = _mm512_fnmadd_ps(fft4060, fft4071, fft4070);
__m512 fft4164 = _mm512_fnmadd_ps(fft4147, fft4071, fft4156);
__m512 fft4080 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft4081 = _mm512_fmadd_ps(fft4072, fft4080, _mm512_shuffle_f32x4(fft4072, fft4072, 177));
__m512 fft4165 = _mm512_fmadd_ps(fft4157, fft4080, _mm512_shuffle_f32x4(fft4157, fft4157, 177));
__m512 fft4082 = _mm512_fmadd_ps(fft4073, fft4080, _mm512_shuffle_f32x4(fft4073, fft4073, 177));
__m512 fft4166 = _mm512_fmadd_ps(fft4158, fft4080, _mm512_shuffle_f32x4(fft4158, fft4158, 177));
__m512 fft4083 = _mm512_fmadd_ps(fft4074, fft4080, _mm512_shuffle_f32x4(fft4074, fft4074, 177));
__m512 fft4167 = _mm512_fmadd_ps(fft4159, fft4080, _mm512_shuffle_f32x4(fft4159, fft4159, 177));
__m512 fft4084 = _mm512_fmadd_ps(fft4075, fft4080, _mm512_shuffle_f32x4(fft4075, fft4075, 177));
__m512 fft4168 = _mm512_fmadd_ps(fft4160, fft4080, _mm512_shuffle_f32x4(fft4160, fft4160, 177));
__m512 fft4085 = _mm512_fmadd_ps(fft4076, fft4080, _mm512_shuffle_f32x4(fft4076, fft4076, 177));
__m512 fft4169 = _mm512_fmadd_ps(fft4161, fft4080, _mm512_shuffle_f32x4(fft4161, fft4161, 177));
__m512 fft4086 = _mm512_fmadd_ps(fft4077, fft4080, _mm512_shuffle_f32x4(fft4077, fft4077, 177));
__m512 fft4170 = _mm512_fmadd_ps(fft4162, fft4080, _mm512_shuffle_f32x4(fft4162, fft4162, 177));
__m512 fft4087 = _mm512_fmadd_ps(fft4078, fft4080, _mm512_shuffle_f32x4(fft4078, fft4078, 177));
__m512 fft4171 = _mm512_fmadd_ps(fft4163, fft4080, _mm512_shuffle_f32x4(fft4163, fft4163, 177));
__m512 fft4088 = _mm512_fmadd_ps(fft4079, fft4080, _mm512_shuffle_f32x4(fft4079, fft4079, 177));
__m512 fft4172 = _mm512_fmadd_ps(fft4164, fft4080, _mm512_shuffle_f32x4(fft4164, fft4164, 177));
__m512 fft4089 = _mm512_mask_mov_ps(fft4081, 49344, fft4082);
__m512 fft4173 = _mm512_mask_mov_ps(fft4165, 49344, fft4166);
__m512 fft4090 = _mm512_mask_sub_ps(fft4082, 49344, _mm512_setzero_ps(), fft4081);
__m512 fft4174 = _mm512_mask_sub_ps(fft4166, 49344, _mm512_setzero_ps(), fft4165);
__m512 fft4091 = _mm512_mask_mov_ps(fft4083, 49344, fft4084);
__m512 fft4175 = _mm512_mask_mov_ps(fft4167, 49344, fft4168);
__m512 fft4092 = _mm512_mask_sub_ps(fft4084, 49344, _mm512_setzero_ps(), fft4083);
__m512 fft4176 = _mm512_mask_sub_ps(fft4168, 49344, _mm512_setzero_ps(), fft4167);
__m512 fft4093 = _mm512_mask_mov_ps(fft4085, 49344, fft4086);
__m512 fft4177 = _mm512_mask_mov_ps(fft4169, 49344, fft4170);
__m512 fft4094 = _mm512_mask_sub_ps(fft4086, 49344, _mm512_setzero_ps(), fft4085);
__m512 fft4178 = _mm512_mask_sub_ps(fft4170, 49344, _mm512_setzero_ps(), fft4169);
__m512 fft4095 = _mm512_mask_mov_ps(fft4087, 49344, fft4088);
__m512 fft4179 = _mm512_mask_mov_ps(fft4171, 49344, fft4172);
__m512 fft4096 = _mm512_mask_sub_ps(fft4088, 49344, _mm512_setzero_ps(), fft4087);
__m512 fft4180 = _mm512_mask_sub_ps(fft4172, 49344, _mm512_setzero_ps(), fft4171);
__m512 fft4097 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft4098 = _mm512_fmadd_ps(fft4089, fft4097, _mm512_shuffle_ps(fft4089, fft4089, 78));
__m512 fft4181 = _mm512_fmadd_ps(fft4173, fft4097, _mm512_shuffle_ps(fft4173, fft4173, 78));
__m512 fft4099 = _mm512_fmadd_ps(fft4090, fft4097, _mm512_shuffle_ps(fft4090, fft4090, 78));
__m512 fft4182 = _mm512_fmadd_ps(fft4174, fft4097, _mm512_shuffle_ps(fft4174, fft4174, 78));
__m512 fft4100 = _mm512_fmadd_ps(fft4091, fft4097, _mm512_shuffle_ps(fft4091, fft4091, 78));
__m512 fft4183 = _mm512_fmadd_ps(fft4175, fft4097, _mm512_shuffle_ps(fft4175, fft4175, 78));
__m512 fft4101 = _mm512_fmadd_ps(fft4092, fft4097, _mm512_shuffle_ps(fft4092, fft4092, 78));
__m512 fft4184 = _mm512_fmadd_ps(fft4176, fft4097, _mm512_shuffle_ps(fft4176, fft4176, 78));
__m512 fft4102 = _mm512_fmadd_ps(fft4093, fft4097, _mm512_shuffle_ps(fft4093, fft4093, 78));
__m512 fft4185 = _mm512_fmadd_ps(fft4177, fft4097, _mm512_shuffle_ps(fft4177, fft4177, 78));
__m512 fft4103 = _mm512_fmadd_ps(fft4094, fft4097, _mm512_shuffle_ps(fft4094, fft4094, 78));
__m512 fft4186 = _mm512_fmadd_ps(fft4178, fft4097, _mm512_shuffle_ps(fft4178, fft4178, 78));
__m512 fft4104 = _mm512_fmadd_ps(fft4095, fft4097, _mm512_shuffle_ps(fft4095, fft4095, 78));
__m512 fft4187 = _mm512_fmadd_ps(fft4179, fft4097, _mm512_shuffle_ps(fft4179, fft4179, 78));
__m512 fft4105 = _mm512_fmadd_ps(fft4096, fft4097, _mm512_shuffle_ps(fft4096, fft4096, 78));
__m512 fft4188 = _mm512_fmadd_ps(fft4180, fft4097, _mm512_shuffle_ps(fft4180, fft4180, 78));
__m512i fft4106 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft4107 = _mm512_permutexvar_ps(fft4106, fft4098);
__m512 fft4189 = _mm512_permutexvar_ps(fft4106, fft4181);
__m512i fft4108 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft4109 = _mm512_permutexvar_ps(fft4108, fft4098);
__m512 fft4190 = _mm512_permutexvar_ps(fft4108, fft4181);
__m512 fft4110 = _mm512_permutexvar_ps(fft4106, fft4099);
__m512 fft4191 = _mm512_permutexvar_ps(fft4106, fft4182);
__m512 fft4111 = _mm512_permutexvar_ps(fft4108, fft4099);
__m512 fft4192 = _mm512_permutexvar_ps(fft4108, fft4182);
__m512 fft4112 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft4113 = _mm512_fmadd_ps(fft4107, fft4112, fft4109);
__m512 fft4193 = _mm512_fmadd_ps(fft4189, fft4112, fft4190);
__m512 fft4114 = _mm512_fnmadd_ps(fft4111, fft4112, fft4110);
__m512 fft4194 = _mm512_fnmadd_ps(fft4192, fft4112, fft4191);
__m512 fft4115 = _mm512_mask_mov_ps(fft4111, 21845, fft4113);
__m512 fft4195 = _mm512_mask_mov_ps(fft4192, 21845, fft4193);
__m512 fft4116 = _mm512_mask_mov_ps(fft4107, 43176, fft4113);
__m512 fft4196 = _mm512_mask_mov_ps(fft4189, 43176, fft4193);
__m512 fft4117 = _mm512_mask_mov_ps(fft4115, 43176, fft4114);
__m512 fft4197 = _mm512_mask_mov_ps(fft4195, 43176, fft4194);
__m512 fft4118 = _mm512_mask_mov_ps(fft4116, 22102, fft4114);
__m512 fft4198 = _mm512_mask_mov_ps(fft4196, 22102, fft4194);
__m512 fft4119 = _mm512_mask_mul_ps(fft4117, 64764, fft4117, _mm512_set1_ps(5e-01f));
__m512 fft4199 = _mm512_mask_mul_ps(fft4197, 64764, fft4197, _mm512_set1_ps(5e-01f));
__m512 fft4120 = _mm512_mask_mul_ps(fft4118, 64764, fft4118, _mm512_set1_ps(5e-01f));
__m512 fft4200 = _mm512_mask_mul_ps(fft4198, 64764, fft4198, _mm512_set1_ps(5e-01f));
__m512 df353 = fft4119;
__m512 df361 = fft4199;
__m512 df354 = fft4120;
__m512 df362 = fft4200;
__m512 df355 = fft4100;
__m512 df363 = fft4183;
__m512 df356 = fft4101;
__m512 df364 = fft4184;
__m512 df357 = fft4102;
__m512 df365 = fft4185;
__m512 df358 = fft4103;
__m512 df366 = fft4186;
__m512 df359 = fft4104;
__m512 df367 = fft4187;
__m512 df360 = fft4105;
__m512 df368 = fft4188;
__m512i eo25 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df355 = _mm512_permutexvar_ps(eo25, df355);
df356 = _mm512_permutexvar_ps(eo25, df356);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df355);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df356);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df355);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df356);
df363 = _mm512_permutexvar_ps(eo25, df363);
df364 = _mm512_permutexvar_ps(eo25, df364);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df363);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df364);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df363);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df364);
df357 = _mm512_permutexvar_ps(eo25, df357);
df358 = _mm512_permutexvar_ps(eo25, df358);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df357);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df358);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df357);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df358);
df365 = _mm512_permutexvar_ps(eo25, df365);
df366 = _mm512_permutexvar_ps(eo25, df366);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df365);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df366);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df365);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df366);
df359 = _mm512_permutexvar_ps(eo25, df359);
df360 = _mm512_permutexvar_ps(eo25, df360);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df359);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df360);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df359);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df360);
df367 = _mm512_permutexvar_ps(eo25, df367);
df368 = _mm512_permutexvar_ps(eo25, df368);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df367);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df368);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df367);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df368);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df353);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df354);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df353);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df354);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df361);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df362);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df361);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df362);
// Unrolled pass with the block index b26 fixed at 1, so m26 = b26/2 = 0 and
// f27 = b26%2 = 1. Those two values feed the 128*m26 and 32*f27 terms of the
// dfPtr1 store offsets further down.
ptrdiff_t b26 = 1;
ptrdiff_t m26 = (size_t)b26/2;
ptrdiff_t f27 = (size_t)b26%2;
// Sixteen zero-masked loads from datPtr1; each base offset is 896 past the
// previous one. Mask 65528 (0xFFF8) forces lanes 0-2 to zero and reads only
// lanes 3-15 from memory. The trailing 0*b26 term adds nothing to the address.
// NOTE(review): the offsets look like byte counts (896 = 224*4) -- confirm
// against the declaration of datPtr1, which is outside this excerpt.
__m512 dat354 = _mm512_maskz_loadu_ps(65528, datPtr1+8080+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat355 = _mm512_maskz_loadu_ps(65528, datPtr1+8976+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat356 = _mm512_maskz_loadu_ps(65528, datPtr1+9872+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat357 = _mm512_maskz_loadu_ps(65528, datPtr1+10768+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat358 = _mm512_maskz_loadu_ps(65528, datPtr1+11664+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat359 = _mm512_maskz_loadu_ps(65528, datPtr1+12560+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat360 = _mm512_maskz_loadu_ps(65528, datPtr1+13456+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat361 = _mm512_maskz_loadu_ps(65528, datPtr1+14352+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat362 = _mm512_maskz_loadu_ps(65528, datPtr1+15248+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat363 = _mm512_maskz_loadu_ps(65528, datPtr1+16144+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat364 = _mm512_maskz_loadu_ps(65528, datPtr1+17040+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat365 = _mm512_maskz_loadu_ps(65528, datPtr1+17936+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat366 = _mm512_maskz_loadu_ps(65528, datPtr1+18832+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat367 = _mm512_maskz_loadu_ps(65528, datPtr1+19728+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat368 = _mm512_maskz_loadu_ps(65528, datPtr1+20624+602112*i6+200704*k12+896*h11+4*w11+0*b26);
__m512 dat369 = _mm512_maskz_loadu_ps(65528, datPtr1+21520+602112*i6+200704*k12+896*h11+4*w11+0*b26);
// Butterflies across whole registers (every lane is treated identically).
// Two independent chains are interleaved line by line: fft4201..fft4220 use the
// even-numbered loads (dat354, dat356, ...), fft4289..fft4308 the odd-numbered
// ones (dat355, dat357, ...).
// NOTE(review): the structure is consistent with an 8-point real-input DFT taken
// across the eight registers of each chain -- confirm against the generator.
//
// Stage 1: sum and difference of loads that are eight positions apart.
__m512 fft4201 = _mm512_add_ps(dat354, dat362);
__m512 fft4289 = _mm512_add_ps(dat355, dat363);
__m512 fft4202 = _mm512_sub_ps(dat354, dat362);
__m512 fft4290 = _mm512_sub_ps(dat355, dat363);
__m512 fft4203 = _mm512_add_ps(dat356, dat364);
__m512 fft4291 = _mm512_add_ps(dat357, dat365);
__m512 fft4204 = _mm512_sub_ps(dat356, dat364);
__m512 fft4292 = _mm512_sub_ps(dat357, dat365);
__m512 fft4205 = _mm512_add_ps(dat358, dat366);
__m512 fft4293 = _mm512_add_ps(dat359, dat367);
__m512 fft4206 = _mm512_sub_ps(dat358, dat366);
__m512 fft4294 = _mm512_sub_ps(dat359, dat367);
__m512 fft4207 = _mm512_add_ps(dat360, dat368);
__m512 fft4295 = _mm512_add_ps(dat361, dat369);
__m512 fft4208 = _mm512_sub_ps(dat360, dat368);
__m512 fft4296 = _mm512_sub_ps(dat361, dat369);
// Stage 2: combine the stage-1 results pairwise. Note the reversed operand
// order in fft4212/fft4300 (second sum minus first sum).
__m512 fft4209 = _mm512_add_ps(fft4201, fft4205);
__m512 fft4297 = _mm512_add_ps(fft4289, fft4293);
__m512 fft4210 = _mm512_sub_ps(fft4201, fft4205);
__m512 fft4298 = _mm512_sub_ps(fft4289, fft4293);
__m512 fft4211 = _mm512_add_ps(fft4203, fft4207);
__m512 fft4299 = _mm512_add_ps(fft4291, fft4295);
__m512 fft4212 = _mm512_sub_ps(fft4207, fft4203);
__m512 fft4300 = _mm512_sub_ps(fft4295, fft4291);
__m512 fft4213 = _mm512_sub_ps(fft4204, fft4208);
__m512 fft4301 = _mm512_sub_ps(fft4292, fft4296);
__m512 fft4214 = _mm512_add_ps(fft4204, fft4208);
__m512 fft4302 = _mm512_add_ps(fft4292, fft4296);
// Stage 3: final sums/differences, plus four fused multiply-adds that scale by
// 7.0710677e-01f (approximately 1/sqrt(2)):
//   fmadd(a, c, b)  =  a*c + b
//   fnmsub(a, c, b) = -(a*c) - b
//   fnmadd(a, c, b) = -(a*c) + b
__m512 fft4215 = _mm512_add_ps(fft4209, fft4211);
__m512 fft4303 = _mm512_add_ps(fft4297, fft4299);
__m512 fft4216 = _mm512_sub_ps(fft4209, fft4211);
__m512 fft4304 = _mm512_sub_ps(fft4297, fft4299);
__m512 fft4217 = _mm512_fmadd_ps(fft4213, _mm512_set1_ps(7.0710677e-01f), fft4202);
__m512 fft4305 = _mm512_fmadd_ps(fft4301, _mm512_set1_ps(7.0710677e-01f), fft4290);
__m512 fft4218 = _mm512_fnmsub_ps(fft4214, _mm512_set1_ps(7.0710677e-01f), fft4206);
__m512 fft4306 = _mm512_fnmsub_ps(fft4302, _mm512_set1_ps(7.0710677e-01f), fft4294);
__m512 fft4219 = _mm512_fnmadd_ps(fft4213, _mm512_set1_ps(7.0710677e-01f), fft4202);
__m512 fft4307 = _mm512_fnmadd_ps(fft4301, _mm512_set1_ps(7.0710677e-01f), fft4290);
__m512 fft4220 = _mm512_fnmadd_ps(fft4214, _mm512_set1_ps(7.0710677e-01f), fft4206);
__m512 fft4308 = _mm512_fnmadd_ps(fft4302, _mm512_set1_ps(7.0710677e-01f), fft4294);
// Butterfly between the two 256-bit halves of each register.
// fft4221 is +1 in lanes 0-7 and -1 in lanes 8-15 (_mm512_set_ps lists lane 15
// first). _mm512_shuffle_f32x4(x, x, 78) swaps the 256-bit halves of x, so each
// fmadd leaves (low half + high half) in lanes 0-7 and (low half - high half)
// in lanes 8-15.
__m512 fft4221 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4222 = _mm512_fmadd_ps(fft4215, fft4221, _mm512_shuffle_f32x4(fft4215, fft4215, 78));
__m512 fft4309 = _mm512_fmadd_ps(fft4303, fft4221, _mm512_shuffle_f32x4(fft4303, fft4303, 78));
__m512 fft4223 = _mm512_fmadd_ps(fft4216, fft4221, _mm512_shuffle_f32x4(fft4216, fft4216, 78));
__m512 fft4310 = _mm512_fmadd_ps(fft4304, fft4221, _mm512_shuffle_f32x4(fft4304, fft4304, 78));
__m512 fft4224 = _mm512_fmadd_ps(fft4217, fft4221, _mm512_shuffle_f32x4(fft4217, fft4217, 78));
__m512 fft4311 = _mm512_fmadd_ps(fft4305, fft4221, _mm512_shuffle_f32x4(fft4305, fft4305, 78));
__m512 fft4225 = _mm512_fmadd_ps(fft4218, fft4221, _mm512_shuffle_f32x4(fft4218, fft4218, 78));
__m512 fft4312 = _mm512_fmadd_ps(fft4306, fft4221, _mm512_shuffle_f32x4(fft4306, fft4306, 78));
__m512 fft4226 = _mm512_fmadd_ps(fft4210, fft4221, _mm512_shuffle_f32x4(fft4210, fft4210, 78));
__m512 fft4313 = _mm512_fmadd_ps(fft4298, fft4221, _mm512_shuffle_f32x4(fft4298, fft4298, 78));
__m512 fft4227 = _mm512_fmadd_ps(fft4212, fft4221, _mm512_shuffle_f32x4(fft4212, fft4212, 78));
__m512 fft4314 = _mm512_fmadd_ps(fft4300, fft4221, _mm512_shuffle_f32x4(fft4300, fft4300, 78));
__m512 fft4228 = _mm512_fmadd_ps(fft4219, fft4221, _mm512_shuffle_f32x4(fft4219, fft4219, 78));
__m512 fft4315 = _mm512_fmadd_ps(fft4307, fft4221, _mm512_shuffle_f32x4(fft4307, fft4307, 78));
__m512 fft4229 = _mm512_fmadd_ps(fft4220, fft4221, _mm512_shuffle_f32x4(fft4220, fft4220, 78));
__m512 fft4316 = _mm512_fmadd_ps(fft4308, fft4221, _mm512_shuffle_f32x4(fft4308, fft4308, 78));
// Per-lane multipliers c = fft4230 (here) and s = fft4239 (below). As (c, s):
//   lanes 0-9   : ( 1,         0        )
//   lanes 10-11 : ( 0.7071068, 0.7071068)
//   lanes 12-13 : ( 0,         1        )
//   lanes 14-15 : (-0.7071068, 0.7071068)
// First every register is multiplied by c ...
__m512 fft4230 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4231 = _mm512_mul_ps(fft4222, fft4230);
__m512 fft4317 = _mm512_mul_ps(fft4309, fft4230);
__m512 fft4232 = _mm512_mul_ps(fft4223, fft4230);
__m512 fft4318 = _mm512_mul_ps(fft4310, fft4230);
__m512 fft4233 = _mm512_mul_ps(fft4224, fft4230);
__m512 fft4319 = _mm512_mul_ps(fft4311, fft4230);
__m512 fft4234 = _mm512_mul_ps(fft4225, fft4230);
__m512 fft4320 = _mm512_mul_ps(fft4312, fft4230);
__m512 fft4235 = _mm512_mul_ps(fft4226, fft4230);
__m512 fft4321 = _mm512_mul_ps(fft4313, fft4230);
__m512 fft4236 = _mm512_mul_ps(fft4227, fft4230);
__m512 fft4322 = _mm512_mul_ps(fft4314, fft4230);
__m512 fft4237 = _mm512_mul_ps(fft4228, fft4230);
__m512 fft4323 = _mm512_mul_ps(fft4315, fft4230);
__m512 fft4238 = _mm512_mul_ps(fft4229, fft4230);
__m512 fft4324 = _mm512_mul_ps(fft4316, fft4230);
// ... then consecutive registers are combined as pairs (p, q), e.g.
// (fft4222, fft4223):  p' = p*c + q*s   and   q' = q*c - p*s.
// Lanes 0-9 pass through unchanged because c = 1 and s = 0 there.
// NOTE(review): this has the form of a complex multiply with p as the real part
// and q as the imaginary part -- confirm against the generator.
__m512 fft4239 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft4240 = _mm512_fmadd_ps(fft4223, fft4239, fft4231);
__m512 fft4325 = _mm512_fmadd_ps(fft4310, fft4239, fft4317);
__m512 fft4241 = _mm512_fnmadd_ps(fft4222, fft4239, fft4232);
__m512 fft4326 = _mm512_fnmadd_ps(fft4309, fft4239, fft4318);
__m512 fft4242 = _mm512_fmadd_ps(fft4225, fft4239, fft4233);
__m512 fft4327 = _mm512_fmadd_ps(fft4312, fft4239, fft4319);
__m512 fft4243 = _mm512_fnmadd_ps(fft4224, fft4239, fft4234);
__m512 fft4328 = _mm512_fnmadd_ps(fft4311, fft4239, fft4320);
__m512 fft4244 = _mm512_fmadd_ps(fft4227, fft4239, fft4235);
__m512 fft4329 = _mm512_fmadd_ps(fft4314, fft4239, fft4321);
__m512 fft4245 = _mm512_fnmadd_ps(fft4226, fft4239, fft4236);
__m512 fft4330 = _mm512_fnmadd_ps(fft4313, fft4239, fft4322);
__m512 fft4246 = _mm512_fmadd_ps(fft4229, fft4239, fft4237);
__m512 fft4331 = _mm512_fmadd_ps(fft4316, fft4239, fft4323);
__m512 fft4247 = _mm512_fnmadd_ps(fft4228, fft4239, fft4238);
__m512 fft4332 = _mm512_fnmadd_ps(fft4315, fft4239, fft4324);
// Butterfly between the two 128-bit lanes inside each 256-bit half.
// fft4248 is +1 in lanes 0-3 and 8-11, -1 in lanes 4-7 and 12-15.
// _mm512_shuffle_f32x4(x, x, 177) swaps neighbouring 128-bit lanes, so each
// fmadd leaves (first lane + second lane) in the first 128-bit lane of a half
// and (first lane - second lane) in the second.
__m512 fft4248 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft4249 = _mm512_fmadd_ps(fft4240, fft4248, _mm512_shuffle_f32x4(fft4240, fft4240, 177));
__m512 fft4333 = _mm512_fmadd_ps(fft4325, fft4248, _mm512_shuffle_f32x4(fft4325, fft4325, 177));
__m512 fft4250 = _mm512_fmadd_ps(fft4241, fft4248, _mm512_shuffle_f32x4(fft4241, fft4241, 177));
__m512 fft4334 = _mm512_fmadd_ps(fft4326, fft4248, _mm512_shuffle_f32x4(fft4326, fft4326, 177));
__m512 fft4251 = _mm512_fmadd_ps(fft4242, fft4248, _mm512_shuffle_f32x4(fft4242, fft4242, 177));
__m512 fft4335 = _mm512_fmadd_ps(fft4327, fft4248, _mm512_shuffle_f32x4(fft4327, fft4327, 177));
__m512 fft4252 = _mm512_fmadd_ps(fft4243, fft4248, _mm512_shuffle_f32x4(fft4243, fft4243, 177));
__m512 fft4336 = _mm512_fmadd_ps(fft4328, fft4248, _mm512_shuffle_f32x4(fft4328, fft4328, 177));
__m512 fft4253 = _mm512_fmadd_ps(fft4244, fft4248, _mm512_shuffle_f32x4(fft4244, fft4244, 177));
__m512 fft4337 = _mm512_fmadd_ps(fft4329, fft4248, _mm512_shuffle_f32x4(fft4329, fft4329, 177));
__m512 fft4254 = _mm512_fmadd_ps(fft4245, fft4248, _mm512_shuffle_f32x4(fft4245, fft4245, 177));
__m512 fft4338 = _mm512_fmadd_ps(fft4330, fft4248, _mm512_shuffle_f32x4(fft4330, fft4330, 177));
__m512 fft4255 = _mm512_fmadd_ps(fft4246, fft4248, _mm512_shuffle_f32x4(fft4246, fft4246, 177));
__m512 fft4339 = _mm512_fmadd_ps(fft4331, fft4248, _mm512_shuffle_f32x4(fft4331, fft4331, 177));
__m512 fft4256 = _mm512_fmadd_ps(fft4247, fft4248, _mm512_shuffle_f32x4(fft4247, fft4247, 177));
__m512 fft4340 = _mm512_fmadd_ps(fft4332, fft4248, _mm512_shuffle_f32x4(fft4332, fft4332, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes each
// register pair (p, q) becomes (q, -p): mask_mov copies q into p, and mask_sub
// writes 0 - p into q. All other lanes pass through unchanged.
__m512 fft4257 = _mm512_mask_mov_ps(fft4249, 49344, fft4250);
__m512 fft4341 = _mm512_mask_mov_ps(fft4333, 49344, fft4334);
__m512 fft4258 = _mm512_mask_sub_ps(fft4250, 49344, _mm512_setzero_ps(), fft4249);
__m512 fft4342 = _mm512_mask_sub_ps(fft4334, 49344, _mm512_setzero_ps(), fft4333);
__m512 fft4259 = _mm512_mask_mov_ps(fft4251, 49344, fft4252);
__m512 fft4343 = _mm512_mask_mov_ps(fft4335, 49344, fft4336);
__m512 fft4260 = _mm512_mask_sub_ps(fft4252, 49344, _mm512_setzero_ps(), fft4251);
__m512 fft4344 = _mm512_mask_sub_ps(fft4336, 49344, _mm512_setzero_ps(), fft4335);
__m512 fft4261 = _mm512_mask_mov_ps(fft4253, 49344, fft4254);
__m512 fft4345 = _mm512_mask_mov_ps(fft4337, 49344, fft4338);
__m512 fft4262 = _mm512_mask_sub_ps(fft4254, 49344, _mm512_setzero_ps(), fft4253);
__m512 fft4346 = _mm512_mask_sub_ps(fft4338, 49344, _mm512_setzero_ps(), fft4337);
__m512 fft4263 = _mm512_mask_mov_ps(fft4255, 49344, fft4256);
__m512 fft4347 = _mm512_mask_mov_ps(fft4339, 49344, fft4340);
__m512 fft4264 = _mm512_mask_sub_ps(fft4256, 49344, _mm512_setzero_ps(), fft4255);
__m512 fft4348 = _mm512_mask_sub_ps(fft4340, 49344, _mm512_setzero_ps(), fft4339);
// Butterfly between the two 64-bit halves of every 128-bit lane.
// fft4265 is +1 in lanes 4k and 4k+1, -1 in lanes 4k+2 and 4k+3.
// _mm512_shuffle_ps(x, x, 78) swaps the lower and upper element pairs inside
// each 128-bit lane, so each fmadd leaves (lower pair + upper pair) in the lower
// two elements and (lower pair - upper pair) in the upper two.
__m512 fft4265 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft4266 = _mm512_fmadd_ps(fft4257, fft4265, _mm512_shuffle_ps(fft4257, fft4257, 78));
__m512 fft4349 = _mm512_fmadd_ps(fft4341, fft4265, _mm512_shuffle_ps(fft4341, fft4341, 78));
__m512 fft4267 = _mm512_fmadd_ps(fft4258, fft4265, _mm512_shuffle_ps(fft4258, fft4258, 78));
__m512 fft4350 = _mm512_fmadd_ps(fft4342, fft4265, _mm512_shuffle_ps(fft4342, fft4342, 78));
__m512 fft4268 = _mm512_fmadd_ps(fft4259, fft4265, _mm512_shuffle_ps(fft4259, fft4259, 78));
__m512 fft4351 = _mm512_fmadd_ps(fft4343, fft4265, _mm512_shuffle_ps(fft4343, fft4343, 78));
__m512 fft4269 = _mm512_fmadd_ps(fft4260, fft4265, _mm512_shuffle_ps(fft4260, fft4260, 78));
__m512 fft4352 = _mm512_fmadd_ps(fft4344, fft4265, _mm512_shuffle_ps(fft4344, fft4344, 78));
__m512 fft4270 = _mm512_fmadd_ps(fft4261, fft4265, _mm512_shuffle_ps(fft4261, fft4261, 78));
__m512 fft4353 = _mm512_fmadd_ps(fft4345, fft4265, _mm512_shuffle_ps(fft4345, fft4345, 78));
__m512 fft4271 = _mm512_fmadd_ps(fft4262, fft4265, _mm512_shuffle_ps(fft4262, fft4262, 78));
__m512 fft4354 = _mm512_fmadd_ps(fft4346, fft4265, _mm512_shuffle_ps(fft4346, fft4346, 78));
__m512 fft4272 = _mm512_fmadd_ps(fft4263, fft4265, _mm512_shuffle_ps(fft4263, fft4263, 78));
__m512 fft4355 = _mm512_fmadd_ps(fft4347, fft4265, _mm512_shuffle_ps(fft4347, fft4347, 78));
__m512 fft4273 = _mm512_fmadd_ps(fft4264, fft4265, _mm512_shuffle_ps(fft4264, fft4264, 78));
__m512 fft4356 = _mm512_fmadd_ps(fft4348, fft4265, _mm512_shuffle_ps(fft4348, fft4348, 78));
// Extra recombination applied only to the first register pair of each chain
// (fft4266/fft4267 and fft4349/fft4350); the other pairs are not referenced here.
// NOTE(review): presumably the step that untangles a real-input transform from
// the packed complex result -- confirm against the generator.
//
// fft4274 and fft4276 are index vectors for _mm512_permutexvar_ps (result lane i
// takes source lane idx[i]; _mm512_set_epi32 lists lane 15 first). Every index
// appears twice, so each gathered value is duplicated into two adjacent lanes.
__m512i fft4274 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft4275 = _mm512_permutexvar_ps(fft4274, fft4266);
__m512 fft4357 = _mm512_permutexvar_ps(fft4274, fft4349);
__m512i fft4276 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft4277 = _mm512_permutexvar_ps(fft4276, fft4266);
__m512 fft4358 = _mm512_permutexvar_ps(fft4276, fft4349);
__m512 fft4278 = _mm512_permutexvar_ps(fft4274, fft4267);
__m512 fft4359 = _mm512_permutexvar_ps(fft4274, fft4350);
__m512 fft4279 = _mm512_permutexvar_ps(fft4276, fft4267);
__m512 fft4360 = _mm512_permutexvar_ps(fft4276, fft4350);
// fft4280 is 0 in lanes 0, 1, 8, 9; +1 in the remaining even lanes; -1 in the
// remaining odd lanes. The fmadd/fnmadd below therefore form alternating
// sums and differences of the two gathered views, leaving lanes 0, 1, 8, 9 as a
// plain copy of the addend.
__m512 fft4280 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft4281 = _mm512_fmadd_ps(fft4275, fft4280, fft4277);
__m512 fft4361 = _mm512_fmadd_ps(fft4357, fft4280, fft4358);
__m512 fft4282 = _mm512_fnmadd_ps(fft4279, fft4280, fft4278);
__m512 fft4362 = _mm512_fnmadd_ps(fft4360, fft4280, fft4359);
// Lane-wise blends that assemble the two output registers of each chain:
//   21845 = 0x5555 -> even lanes
//   43176 = 0xA8A8 -> lanes 3, 5, 7, 11, 13, 15
//   22102 = 0x5656 -> lanes 1, 2, 4, 6, 9, 10, 12, 14
// In _mm512_mask_mov_ps(src, k, a) the lanes selected by k come from a, the
// rest from src.
__m512 fft4283 = _mm512_mask_mov_ps(fft4279, 21845, fft4281);
__m512 fft4363 = _mm512_mask_mov_ps(fft4360, 21845, fft4361);
__m512 fft4284 = _mm512_mask_mov_ps(fft4275, 43176, fft4281);
__m512 fft4364 = _mm512_mask_mov_ps(fft4357, 43176, fft4361);
__m512 fft4285 = _mm512_mask_mov_ps(fft4283, 43176, fft4282);
__m512 fft4365 = _mm512_mask_mov_ps(fft4363, 43176, fft4362);
__m512 fft4286 = _mm512_mask_mov_ps(fft4284, 22102, fft4282);
__m512 fft4366 = _mm512_mask_mov_ps(fft4364, 22102, fft4362);
// Halve every lane except 0, 1, 8 and 9 (mask 64764 = 0xFCFC); the unselected
// lanes keep their value because the source operand is the same register.
__m512 fft4287 = _mm512_mask_mul_ps(fft4285, 64764, fft4285, _mm512_set1_ps(5e-01f));
__m512 fft4367 = _mm512_mask_mul_ps(fft4365, 64764, fft4365, _mm512_set1_ps(5e-01f));
__m512 fft4288 = _mm512_mask_mul_ps(fft4286, 64764, fft4286, _mm512_set1_ps(5e-01f));
__m512 fft4368 = _mm512_mask_mul_ps(fft4366, 64764, fft4366, _mm512_set1_ps(5e-01f));
// Rename the results to the df registers that get written out:
//   df369/df370 and df377/df378 are the recombined first pair of each chain,
//   df371..df376 and df379..df384 are the remaining three pairs of each chain.
__m512 df369 = fft4287;
__m512 df377 = fft4367;
__m512 df370 = fft4288;
__m512 df378 = fft4368;
__m512 df371 = fft4268;
__m512 df379 = fft4351;
__m512 df372 = fft4269;
__m512 df380 = fft4352;
__m512 df373 = fft4270;
__m512 df381 = fft4353;
__m512 df374 = fft4271;
__m512 df382 = fft4354;
__m512 df375 = fft4272;
__m512 df383 = fft4355;
__m512 df376 = fft4273;
__m512 df384 = fft4356;
// eo26 gathers the even source lanes (0, 2, ..., 14) into result lanes 0-7 and
// the odd source lanes (1, 3, ..., 15) into result lanes 8-15.
// Each permuted register is then written with two masked stores: mask 255
// writes lanes 0-7, mask 65280 writes lanes 8-15.
// NOTE(review): if dfPtr1 is a byte pointer, lanes 8-15 land 32 bytes past the
// given address, i.e. exactly 407040 past the matching mask-255 store
// (508768 + 32 = 101760 + 407040) -- confirm the pointer type at its declaration.
__m512i eo26 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df371 = _mm512_permutexvar_ps(eo26, df371);
df372 = _mm512_permutexvar_ps(eo26, df372);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df371);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df372);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df371);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df372);
df379 = _mm512_permutexvar_ps(eo26, df379);
df380 = _mm512_permutexvar_ps(eo26, df380);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df379);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df380);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df379);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df380);
df373 = _mm512_permutexvar_ps(eo26, df373);
df374 = _mm512_permutexvar_ps(eo26, df374);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df373);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df374);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df373);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df374);
df381 = _mm512_permutexvar_ps(eo26, df381);
df382 = _mm512_permutexvar_ps(eo26, df382);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df381);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df382);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df381);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df382);
df375 = _mm512_permutexvar_ps(eo26, df375);
df376 = _mm512_permutexvar_ps(eo26, df376);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df375);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df376);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df375);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df376);
df383 = _mm512_permutexvar_ps(eo26, df383);
df384 = _mm512_permutexvar_ps(eo26, df384);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df383);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df384);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df383);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df384);
// The recombined first pairs (df369/df370, df377/df378) are stored without the
// even/odd permutation, using the same low-half / high-half masked-store split.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df369);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df370);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df369);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df370);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df377);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df378);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df377);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df378);
for (ptrdiff_t b27 = 2; b27 < 6; ++b27) {
ptrdiff_t m27 = (size_t)b27/2;
ptrdiff_t f28 = (size_t)b27%2;
__m512 dat370 = _mm512_maskz_loadu_ps(65535, datPtr1+8040+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat371 = _mm512_maskz_loadu_ps(65535, datPtr1+8936+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat372 = _mm512_maskz_loadu_ps(65535, datPtr1+9832+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat373 = _mm512_maskz_loadu_ps(65535, datPtr1+10728+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat374 = _mm512_maskz_loadu_ps(65535, datPtr1+11624+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat375 = _mm512_maskz_loadu_ps(65535, datPtr1+12520+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat376 = _mm512_maskz_loadu_ps(65535, datPtr1+13416+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat377 = _mm512_maskz_loadu_ps(65535, datPtr1+14312+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat378 = _mm512_maskz_loadu_ps(65535, datPtr1+15208+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat379 = _mm512_maskz_loadu_ps(65535, datPtr1+16104+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat380 = _mm512_maskz_loadu_ps(65535, datPtr1+17000+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat381 = _mm512_maskz_loadu_ps(65535, datPtr1+17896+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat382 = _mm512_maskz_loadu_ps(65535, datPtr1+18792+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat383 = _mm512_maskz_loadu_ps(65535, datPtr1+19688+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat384 = _mm512_maskz_loadu_ps(65535, datPtr1+20584+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 dat385 = _mm512_maskz_loadu_ps(65535, datPtr1+21480+602112*i6+200704*k12+896*h11+4*w11+40*b27);
__m512 fft4369 = _mm512_add_ps(dat370, dat378);
__m512 fft4457 = _mm512_add_ps(dat371, dat379);
__m512 fft4370 = _mm512_sub_ps(dat370, dat378);
__m512 fft4458 = _mm512_sub_ps(dat371, dat379);
__m512 fft4371 = _mm512_add_ps(dat372, dat380);
__m512 fft4459 = _mm512_add_ps(dat373, dat381);
__m512 fft4372 = _mm512_sub_ps(dat372, dat380);
__m512 fft4460 = _mm512_sub_ps(dat373, dat381);
__m512 fft4373 = _mm512_add_ps(dat374, dat382);
__m512 fft4461 = _mm512_add_ps(dat375, dat383);
__m512 fft4374 = _mm512_sub_ps(dat374, dat382);
__m512 fft4462 = _mm512_sub_ps(dat375, dat383);
__m512 fft4375 = _mm512_add_ps(dat376, dat384);
__m512 fft4463 = _mm512_add_ps(dat377, dat385);
__m512 fft4376 = _mm512_sub_ps(dat376, dat384);
__m512 fft4464 = _mm512_sub_ps(dat377, dat385);
__m512 fft4377 = _mm512_add_ps(fft4369, fft4373);
__m512 fft4465 = _mm512_add_ps(fft4457, fft4461);
__m512 fft4378 = _mm512_sub_ps(fft4369, fft4373);
__m512 fft4466 = _mm512_sub_ps(fft4457, fft4461);
__m512 fft4379 = _mm512_add_ps(fft4371, fft4375);
__m512 fft4467 = _mm512_add_ps(fft4459, fft4463);
__m512 fft4380 = _mm512_sub_ps(fft4375, fft4371);
__m512 fft4468 = _mm512_sub_ps(fft4463, fft4459);
__m512 fft4381 = _mm512_sub_ps(fft4372, fft4376);
__m512 fft4469 = _mm512_sub_ps(fft4460, fft4464);
__m512 fft4382 = _mm512_add_ps(fft4372, fft4376);
__m512 fft4470 = _mm512_add_ps(fft4460, fft4464);
__m512 fft4383 = _mm512_add_ps(fft4377, fft4379);
__m512 fft4471 = _mm512_add_ps(fft4465, fft4467);
__m512 fft4384 = _mm512_sub_ps(fft4377, fft4379);
__m512 fft4472 = _mm512_sub_ps(fft4465, fft4467);
__m512 fft4385 = _mm512_fmadd_ps(fft4381, _mm512_set1_ps(7.0710677e-01f), fft4370);
__m512 fft4473 = _mm512_fmadd_ps(fft4469, _mm512_set1_ps(7.0710677e-01f), fft4458);
__m512 fft4386 = _mm512_fnmsub_ps(fft4382, _mm512_set1_ps(7.0710677e-01f), fft4374);
__m512 fft4474 = _mm512_fnmsub_ps(fft4470, _mm512_set1_ps(7.0710677e-01f), fft4462);
__m512 fft4387 = _mm512_fnmadd_ps(fft4381, _mm512_set1_ps(7.0710677e-01f), fft4370);
__m512 fft4475 = _mm512_fnmadd_ps(fft4469, _mm512_set1_ps(7.0710677e-01f), fft4458);
__m512 fft4388 = _mm512_fnmadd_ps(fft4382, _mm512_set1_ps(7.0710677e-01f), fft4374);
__m512 fft4476 = _mm512_fnmadd_ps(fft4470, _mm512_set1_ps(7.0710677e-01f), fft4462);
__m512 fft4389 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4390 = _mm512_fmadd_ps(fft4383, fft4389, _mm512_shuffle_f32x4(fft4383, fft4383, 78));
__m512 fft4477 = _mm512_fmadd_ps(fft4471, fft4389, _mm512_shuffle_f32x4(fft4471, fft4471, 78));
__m512 fft4391 = _mm512_fmadd_ps(fft4384, fft4389, _mm512_shuffle_f32x4(fft4384, fft4384, 78));
__m512 fft4478 = _mm512_fmadd_ps(fft4472, fft4389, _mm512_shuffle_f32x4(fft4472, fft4472, 78));
__m512 fft4392 = _mm512_fmadd_ps(fft4385, fft4389, _mm512_shuffle_f32x4(fft4385, fft4385, 78));
__m512 fft4479 = _mm512_fmadd_ps(fft4473, fft4389, _mm512_shuffle_f32x4(fft4473, fft4473, 78));
__m512 fft4393 = _mm512_fmadd_ps(fft4386, fft4389, _mm512_shuffle_f32x4(fft4386, fft4386, 78));
__m512 fft4480 = _mm512_fmadd_ps(fft4474, fft4389, _mm512_shuffle_f32x4(fft4474, fft4474, 78));
__m512 fft4394 = _mm512_fmadd_ps(fft4378, fft4389, _mm512_shuffle_f32x4(fft4378, fft4378, 78));
__m512 fft4481 = _mm512_fmadd_ps(fft4466, fft4389, _mm512_shuffle_f32x4(fft4466, fft4466, 78));
__m512 fft4395 = _mm512_fmadd_ps(fft4380, fft4389, _mm512_shuffle_f32x4(fft4380, fft4380, 78));
__m512 fft4482 = _mm512_fmadd_ps(fft4468, fft4389, _mm512_shuffle_f32x4(fft4468, fft4468, 78));
__m512 fft4396 = _mm512_fmadd_ps(fft4387, fft4389, _mm512_shuffle_f32x4(fft4387, fft4387, 78));
__m512 fft4483 = _mm512_fmadd_ps(fft4475, fft4389, _mm512_shuffle_f32x4(fft4475, fft4475, 78));
__m512 fft4397 = _mm512_fmadd_ps(fft4388, fft4389, _mm512_shuffle_f32x4(fft4388, fft4388, 78));
__m512 fft4484 = _mm512_fmadd_ps(fft4476, fft4389, _mm512_shuffle_f32x4(fft4476, fft4476, 78));
__m512 fft4398 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4399 = _mm512_mul_ps(fft4390, fft4398);
__m512 fft4485 = _mm512_mul_ps(fft4477, fft4398);
__m512 fft4400 = _mm512_mul_ps(fft4391, fft4398);
__m512 fft4486 = _mm512_mul_ps(fft4478, fft4398);
__m512 fft4401 = _mm512_mul_ps(fft4392, fft4398);
__m512 fft4487 = _mm512_mul_ps(fft4479, fft4398);
__m512 fft4402 = _mm512_mul_ps(fft4393, fft4398);
__m512 fft4488 = _mm512_mul_ps(fft4480, fft4398);
__m512 fft4403 = _mm512_mul_ps(fft4394, fft4398);
__m512 fft4489 = _mm512_mul_ps(fft4481, fft4398);
__m512 fft4404 = _mm512_mul_ps(fft4395, fft4398);
__m512 fft4490 = _mm512_mul_ps(fft4482, fft4398);
__m512 fft4405 = _mm512_mul_ps(fft4396, fft4398);
__m512 fft4491 = _mm512_mul_ps(fft4483, fft4398);
__m512 fft4406 = _mm512_mul_ps(fft4397, fft4398);
__m512 fft4492 = _mm512_mul_ps(fft4484, fft4398);
__m512 fft4407 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft4408 = _mm512_fmadd_ps(fft4391, fft4407, fft4399);
__m512 fft4493 = _mm512_fmadd_ps(fft4478, fft4407, fft4485);
__m512 fft4409 = _mm512_fnmadd_ps(fft4390, fft4407, fft4400);
__m512 fft4494 = _mm512_fnmadd_ps(fft4477, fft4407, fft4486);
__m512 fft4410 = _mm512_fmadd_ps(fft4393, fft4407, fft4401);
__m512 fft4495 = _mm512_fmadd_ps(fft4480, fft4407, fft4487);
__m512 fft4411 = _mm512_fnmadd_ps(fft4392, fft4407, fft4402);
__m512 fft4496 = _mm512_fnmadd_ps(fft4479, fft4407, fft4488);
__m512 fft4412 = _mm512_fmadd_ps(fft4395, fft4407, fft4403);
__m512 fft4497 = _mm512_fmadd_ps(fft4482, fft4407, fft4489);
__m512 fft4413 = _mm512_fnmadd_ps(fft4394, fft4407, fft4404);
__m512 fft4498 = _mm512_fnmadd_ps(fft4481, fft4407, fft4490);
__m512 fft4414 = _mm512_fmadd_ps(fft4397, fft4407, fft4405);
__m512 fft4499 = _mm512_fmadd_ps(fft4484, fft4407, fft4491);
__m512 fft4415 = _mm512_fnmadd_ps(fft4396, fft4407, fft4406);
__m512 fft4500 = _mm512_fnmadd_ps(fft4483, fft4407, fft4492);
__m512 fft4416 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft4417 = _mm512_fmadd_ps(fft4408, fft4416, _mm512_shuffle_f32x4(fft4408, fft4408, 177));
__m512 fft4501 = _mm512_fmadd_ps(fft4493, fft4416, _mm512_shuffle_f32x4(fft4493, fft4493, 177));
__m512 fft4418 = _mm512_fmadd_ps(fft4409, fft4416, _mm512_shuffle_f32x4(fft4409, fft4409, 177));
__m512 fft4502 = _mm512_fmadd_ps(fft4494, fft4416, _mm512_shuffle_f32x4(fft4494, fft4494, 177));
__m512 fft4419 = _mm512_fmadd_ps(fft4410, fft4416, _mm512_shuffle_f32x4(fft4410, fft4410, 177));
__m512 fft4503 = _mm512_fmadd_ps(fft4495, fft4416, _mm512_shuffle_f32x4(fft4495, fft4495, 177));
__m512 fft4420 = _mm512_fmadd_ps(fft4411, fft4416, _mm512_shuffle_f32x4(fft4411, fft4411, 177));
__m512 fft4504 = _mm512_fmadd_ps(fft4496, fft4416, _mm512_shuffle_f32x4(fft4496, fft4496, 177));
__m512 fft4421 = _mm512_fmadd_ps(fft4412, fft4416, _mm512_shuffle_f32x4(fft4412, fft4412, 177));
__m512 fft4505 = _mm512_fmadd_ps(fft4497, fft4416, _mm512_shuffle_f32x4(fft4497, fft4497, 177));
__m512 fft4422 = _mm512_fmadd_ps(fft4413, fft4416, _mm512_shuffle_f32x4(fft4413, fft4413, 177));
__m512 fft4506 = _mm512_fmadd_ps(fft4498, fft4416, _mm512_shuffle_f32x4(fft4498, fft4498, 177));
__m512 fft4423 = _mm512_fmadd_ps(fft4414, fft4416, _mm512_shuffle_f32x4(fft4414, fft4414, 177));
__m512 fft4507 = _mm512_fmadd_ps(fft4499, fft4416, _mm512_shuffle_f32x4(fft4499, fft4499, 177));
__m512 fft4424 = _mm512_fmadd_ps(fft4415, fft4416, _mm512_shuffle_f32x4(fft4415, fft4415, 177));
__m512 fft4508 = _mm512_fmadd_ps(fft4500, fft4416, _mm512_shuffle_f32x4(fft4500, fft4500, 177));
__m512 fft4425 = _mm512_mask_mov_ps(fft4417, 49344, fft4418);
__m512 fft4509 = _mm512_mask_mov_ps(fft4501, 49344, fft4502);
__m512 fft4426 = _mm512_mask_sub_ps(fft4418, 49344, _mm512_setzero_ps(), fft4417);
__m512 fft4510 = _mm512_mask_sub_ps(fft4502, 49344, _mm512_setzero_ps(), fft4501);
__m512 fft4427 = _mm512_mask_mov_ps(fft4419, 49344, fft4420);
__m512 fft4511 = _mm512_mask_mov_ps(fft4503, 49344, fft4504);
__m512 fft4428 = _mm512_mask_sub_ps(fft4420, 49344, _mm512_setzero_ps(), fft4419);
__m512 fft4512 = _mm512_mask_sub_ps(fft4504, 49344, _mm512_setzero_ps(), fft4503);
__m512 fft4429 = _mm512_mask_mov_ps(fft4421, 49344, fft4422);
__m512 fft4513 = _mm512_mask_mov_ps(fft4505, 49344, fft4506);
__m512 fft4430 = _mm512_mask_sub_ps(fft4422, 49344, _mm512_setzero_ps(), fft4421);
__m512 fft4514 = _mm512_mask_sub_ps(fft4506, 49344, _mm512_setzero_ps(), fft4505);
__m512 fft4431 = _mm512_mask_mov_ps(fft4423, 49344, fft4424);
__m512 fft4515 = _mm512_mask_mov_ps(fft4507, 49344, fft4508);
__m512 fft4432 = _mm512_mask_sub_ps(fft4424, 49344, _mm512_setzero_ps(), fft4423);
__m512 fft4516 = _mm512_mask_sub_ps(fft4508, 49344, _mm512_setzero_ps(), fft4507);
__m512 fft4433 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft4434 = _mm512_fmadd_ps(fft4425, fft4433, _mm512_shuffle_ps(fft4425, fft4425, 78));
__m512 fft4517 = _mm512_fmadd_ps(fft4509, fft4433, _mm512_shuffle_ps(fft4509, fft4509, 78));
__m512 fft4435 = _mm512_fmadd_ps(fft4426, fft4433, _mm512_shuffle_ps(fft4426, fft4426, 78));
__m512 fft4518 = _mm512_fmadd_ps(fft4510, fft4433, _mm512_shuffle_ps(fft4510, fft4510, 78));
__m512 fft4436 = _mm512_fmadd_ps(fft4427, fft4433, _mm512_shuffle_ps(fft4427, fft4427, 78));
__m512 fft4519 = _mm512_fmadd_ps(fft4511, fft4433, _mm512_shuffle_ps(fft4511, fft4511, 78));
__m512 fft4437 = _mm512_fmadd_ps(fft4428, fft4433, _mm512_shuffle_ps(fft4428, fft4428, 78));
__m512 fft4520 = _mm512_fmadd_ps(fft4512, fft4433, _mm512_shuffle_ps(fft4512, fft4512, 78));
__m512 fft4438 = _mm512_fmadd_ps(fft4429, fft4433, _mm512_shuffle_ps(fft4429, fft4429, 78));
__m512 fft4521 = _mm512_fmadd_ps(fft4513, fft4433, _mm512_shuffle_ps(fft4513, fft4513, 78));
__m512 fft4439 = _mm512_fmadd_ps(fft4430, fft4433, _mm512_shuffle_ps(fft4430, fft4430, 78));
__m512 fft4522 = _mm512_fmadd_ps(fft4514, fft4433, _mm512_shuffle_ps(fft4514, fft4514, 78));
__m512 fft4440 = _mm512_fmadd_ps(fft4431, fft4433, _mm512_shuffle_ps(fft4431, fft4431, 78));
__m512 fft4523 = _mm512_fmadd_ps(fft4515, fft4433, _mm512_shuffle_ps(fft4515, fft4515, 78));
__m512 fft4441 = _mm512_fmadd_ps(fft4432, fft4433, _mm512_shuffle_ps(fft4432, fft4432, 78));
__m512 fft4524 = _mm512_fmadd_ps(fft4516, fft4433, _mm512_shuffle_ps(fft4516, fft4516, 78));
__m512i fft4442 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft4443 = _mm512_permutexvar_ps(fft4442, fft4434);
__m512 fft4525 = _mm512_permutexvar_ps(fft4442, fft4517);
__m512i fft4444 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft4445 = _mm512_permutexvar_ps(fft4444, fft4434);
__m512 fft4526 = _mm512_permutexvar_ps(fft4444, fft4517);
__m512 fft4446 = _mm512_permutexvar_ps(fft4442, fft4435);
__m512 fft4527 = _mm512_permutexvar_ps(fft4442, fft4518);
__m512 fft4447 = _mm512_permutexvar_ps(fft4444, fft4435);
__m512 fft4528 = _mm512_permutexvar_ps(fft4444, fft4518);
__m512 fft4448 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft4449 = _mm512_fmadd_ps(fft4443, fft4448, fft4445);
__m512 fft4529 = _mm512_fmadd_ps(fft4525, fft4448, fft4526);
__m512 fft4450 = _mm512_fnmadd_ps(fft4447, fft4448, fft4446);
__m512 fft4530 = _mm512_fnmadd_ps(fft4528, fft4448, fft4527);
__m512 fft4451 = _mm512_mask_mov_ps(fft4447, 21845, fft4449);
__m512 fft4531 = _mm512_mask_mov_ps(fft4528, 21845, fft4529);
__m512 fft4452 = _mm512_mask_mov_ps(fft4443, 43176, fft4449);
__m512 fft4532 = _mm512_mask_mov_ps(fft4525, 43176, fft4529);
__m512 fft4453 = _mm512_mask_mov_ps(fft4451, 43176, fft4450);
__m512 fft4533 = _mm512_mask_mov_ps(fft4531, 43176, fft4530);
__m512 fft4454 = _mm512_mask_mov_ps(fft4452, 22102, fft4450);
__m512 fft4534 = _mm512_mask_mov_ps(fft4532, 22102, fft4530);
__m512 fft4455 = _mm512_mask_mul_ps(fft4453, 64764, fft4453, _mm512_set1_ps(5e-01f));
__m512 fft4535 = _mm512_mask_mul_ps(fft4533, 64764, fft4533, _mm512_set1_ps(5e-01f));
__m512 fft4456 = _mm512_mask_mul_ps(fft4454, 64764, fft4454, _mm512_set1_ps(5e-01f));
__m512 fft4536 = _mm512_mask_mul_ps(fft4534, 64764, fft4534, _mm512_set1_ps(5e-01f));
__m512 df385 = fft4455;
__m512 df393 = fft4535;
__m512 df386 = fft4456;
__m512 df394 = fft4536;
__m512 df387 = fft4436;
__m512 df395 = fft4519;
__m512 df388 = fft4437;
__m512 df396 = fft4520;
__m512 df389 = fft4438;
__m512 df397 = fft4521;
__m512 df390 = fft4439;
__m512 df398 = fft4522;
__m512 df391 = fft4440;
__m512 df399 = fft4523;
__m512 df392 = fft4441;
__m512 df400 = fft4524;
__m512i eo27 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df387 = _mm512_permutexvar_ps(eo27, df387);
df388 = _mm512_permutexvar_ps(eo27, df388);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df387);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df388);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df387);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df388);
df395 = _mm512_permutexvar_ps(eo27, df395);
df396 = _mm512_permutexvar_ps(eo27, df396);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df395);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df396);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df395);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df396);
df389 = _mm512_permutexvar_ps(eo27, df389);
df390 = _mm512_permutexvar_ps(eo27, df390);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df389);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df390);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df389);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df390);
df397 = _mm512_permutexvar_ps(eo27, df397);
df398 = _mm512_permutexvar_ps(eo27, df398);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df397);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df398);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df397);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df398);
df391 = _mm512_permutexvar_ps(eo27, df391);
df392 = _mm512_permutexvar_ps(eo27, df392);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df391);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df392);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df391);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df392);
df399 = _mm512_permutexvar_ps(eo27, df399);
df400 = _mm512_permutexvar_ps(eo27, df400);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df399);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df400);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df399);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df400);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df385);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df386);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df385);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df386);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df393);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df394);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df393);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df394);
}
}
if (j2 >= last1) return;
++j2;
rel2 = 16;
}
if (rel2 < 19) {
// Handles the positions rel2 .. 17: one pass of the j2 loop per position,
// i.e. 18-rel2 passes unless the early return below fires first.
// On normal completion rel2 is set to 18 for the code that follows.
// NOTE(review): base2, s1, i6, j2, last1, datPtr1 and dfPtr1 are declared
// outside this view; the pointer element types are not visible here.
if (rel2 < 18) {
// Fixed row term for every load in this section.
ptrdiff_t h12 = base2+40;
// Starting column term; advances by 60 on every j2 pass.
ptrdiff_t w12 = -910+60*rel2;
// Last j2 value handled here (inclusive).
ptrdiff_t jj6 = 17-rel2+j2;
for (; j2 <= jj6; w12 += 60) {
// k13 takes the three consecutive values 3*s1, 3*s1+1, 3*s1+2.
ptrdiff_t k13 = 3*s1;
ptrdiff_t kk12 = k13+2;
for (; k13 <= kk12; ++k13) {
// Six sub-blocks per (j2, k13); b28 is split into m28 = b28/2 and f29 = b28%2
// for the store addressing, while the loads use b28 directly (stride 40).
for (ptrdiff_t b28 = 0; b28 < 6; ++b28) {
ptrdiff_t m28 = (size_t)b28/2;
ptrdiff_t f29 = (size_t)b28%2;
// Load 16 vectors of 16 floats each (mask 65535 = all lanes enabled).
// Consecutive loads are 896 apart in datPtr1 offset.
// NOTE(review): if datPtr1 is a byte pointer, 896 bytes = 224 floats, which
// would match the Width=224 input of the graph, i.e. 16 consecutive image
// rows -- confirm against the declaration.
__m512 dat386 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat387 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat388 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat389 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat390 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat391 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat392 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat393 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat394 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat395 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat396 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat397 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat398 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat399 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat400 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k13+896*h12+4*w12+40*b28);
__m512 dat401 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k13+896*h12+4*w12+40*b28);
// Add/subtract butterflies across whole vectors. Two independent streams are
// interleaved line by line: fft4537.. consume the even-numbered loads
// (dat386, dat388, ...), fft4625.. consume the odd-numbered loads
// (dat387, dat389, ...). Each stream pairs load n with load n+8.
__m512 fft4537 = _mm512_add_ps(dat386, dat394);
__m512 fft4625 = _mm512_add_ps(dat387, dat395);
__m512 fft4538 = _mm512_sub_ps(dat386, dat394);
__m512 fft4626 = _mm512_sub_ps(dat387, dat395);
__m512 fft4539 = _mm512_add_ps(dat388, dat396);
__m512 fft4627 = _mm512_add_ps(dat389, dat397);
__m512 fft4540 = _mm512_sub_ps(dat388, dat396);
__m512 fft4628 = _mm512_sub_ps(dat389, dat397);
__m512 fft4541 = _mm512_add_ps(dat390, dat398);
__m512 fft4629 = _mm512_add_ps(dat391, dat399);
__m512 fft4542 = _mm512_sub_ps(dat390, dat398);
__m512 fft4630 = _mm512_sub_ps(dat391, dat399);
__m512 fft4543 = _mm512_add_ps(dat392, dat400);
__m512 fft4631 = _mm512_add_ps(dat393, dat401);
__m512 fft4544 = _mm512_sub_ps(dat392, dat400);
__m512 fft4632 = _mm512_sub_ps(dat393, dat401);
__m512 fft4545 = _mm512_add_ps(fft4537, fft4541);
__m512 fft4633 = _mm512_add_ps(fft4625, fft4629);
__m512 fft4546 = _mm512_sub_ps(fft4537, fft4541);
__m512 fft4634 = _mm512_sub_ps(fft4625, fft4629);
__m512 fft4547 = _mm512_add_ps(fft4539, fft4543);
__m512 fft4635 = _mm512_add_ps(fft4627, fft4631);
__m512 fft4548 = _mm512_sub_ps(fft4543, fft4539);
__m512 fft4636 = _mm512_sub_ps(fft4631, fft4627);
__m512 fft4549 = _mm512_sub_ps(fft4540, fft4544);
__m512 fft4637 = _mm512_sub_ps(fft4628, fft4632);
__m512 fft4550 = _mm512_add_ps(fft4540, fft4544);
__m512 fft4638 = _mm512_add_ps(fft4628, fft4632);
__m512 fft4551 = _mm512_add_ps(fft4545, fft4547);
__m512 fft4639 = _mm512_add_ps(fft4633, fft4635);
__m512 fft4552 = _mm512_sub_ps(fft4545, fft4547);
__m512 fft4640 = _mm512_sub_ps(fft4633, fft4635);
// Fused multiply-add forms with the constant 7.0710677e-01f (about 1/sqrt(2)):
// fmadd = a*c+b, fnmadd = -(a*c)+b, fnmsub = -(a*c)-b.
__m512 fft4553 = _mm512_fmadd_ps(fft4549, _mm512_set1_ps(7.0710677e-01f), fft4538);
__m512 fft4641 = _mm512_fmadd_ps(fft4637, _mm512_set1_ps(7.0710677e-01f), fft4626);
__m512 fft4554 = _mm512_fnmsub_ps(fft4550, _mm512_set1_ps(7.0710677e-01f), fft4542);
__m512 fft4642 = _mm512_fnmsub_ps(fft4638, _mm512_set1_ps(7.0710677e-01f), fft4630);
__m512 fft4555 = _mm512_fnmadd_ps(fft4549, _mm512_set1_ps(7.0710677e-01f), fft4538);
__m512 fft4643 = _mm512_fnmadd_ps(fft4637, _mm512_set1_ps(7.0710677e-01f), fft4626);
__m512 fft4556 = _mm512_fnmadd_ps(fft4550, _mm512_set1_ps(7.0710677e-01f), fft4542);
__m512 fft4644 = _mm512_fnmadd_ps(fft4638, _mm512_set1_ps(7.0710677e-01f), fft4630);
// Within-vector stage at 256-bit granularity: shuffle_f32x4 with imm 78 swaps
// the two 256-bit halves, and the sign vector is +1 in lanes 0-7 and -1 in
// lanes 8-15, so lanes 0-7 become lo+hi and lanes 8-15 become lo-hi.
__m512 fft4557 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4558 = _mm512_fmadd_ps(fft4551, fft4557, _mm512_shuffle_f32x4(fft4551, fft4551, 78));
__m512 fft4645 = _mm512_fmadd_ps(fft4639, fft4557, _mm512_shuffle_f32x4(fft4639, fft4639, 78));
__m512 fft4559 = _mm512_fmadd_ps(fft4552, fft4557, _mm512_shuffle_f32x4(fft4552, fft4552, 78));
__m512 fft4646 = _mm512_fmadd_ps(fft4640, fft4557, _mm512_shuffle_f32x4(fft4640, fft4640, 78));
__m512 fft4560 = _mm512_fmadd_ps(fft4553, fft4557, _mm512_shuffle_f32x4(fft4553, fft4553, 78));
__m512 fft4647 = _mm512_fmadd_ps(fft4641, fft4557, _mm512_shuffle_f32x4(fft4641, fft4641, 78));
__m512 fft4561 = _mm512_fmadd_ps(fft4554, fft4557, _mm512_shuffle_f32x4(fft4554, fft4554, 78));
__m512 fft4648 = _mm512_fmadd_ps(fft4642, fft4557, _mm512_shuffle_f32x4(fft4642, fft4642, 78));
__m512 fft4562 = _mm512_fmadd_ps(fft4546, fft4557, _mm512_shuffle_f32x4(fft4546, fft4546, 78));
__m512 fft4649 = _mm512_fmadd_ps(fft4634, fft4557, _mm512_shuffle_f32x4(fft4634, fft4634, 78));
__m512 fft4563 = _mm512_fmadd_ps(fft4548, fft4557, _mm512_shuffle_f32x4(fft4548, fft4548, 78));
__m512 fft4650 = _mm512_fmadd_ps(fft4636, fft4557, _mm512_shuffle_f32x4(fft4636, fft4636, 78));
__m512 fft4564 = _mm512_fmadd_ps(fft4555, fft4557, _mm512_shuffle_f32x4(fft4555, fft4555, 78));
__m512 fft4651 = _mm512_fmadd_ps(fft4643, fft4557, _mm512_shuffle_f32x4(fft4643, fft4643, 78));
__m512 fft4565 = _mm512_fmadd_ps(fft4556, fft4557, _mm512_shuffle_f32x4(fft4556, fft4556, 78));
__m512 fft4652 = _mm512_fmadd_ps(fft4644, fft4557, _mm512_shuffle_f32x4(fft4644, fft4644, 78));
// Per-lane weighting: each vector is scaled by fft4566 and then combined with
// its partner vector scaled by fft4575 (added for the first of each pair,
// subtracted for the second). Lanes 0-7 pass through unchanged (weights 1, 0).
__m512 fft4566 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4567 = _mm512_mul_ps(fft4558, fft4566);
__m512 fft4653 = _mm512_mul_ps(fft4645, fft4566);
__m512 fft4568 = _mm512_mul_ps(fft4559, fft4566);
__m512 fft4654 = _mm512_mul_ps(fft4646, fft4566);
__m512 fft4569 = _mm512_mul_ps(fft4560, fft4566);
__m512 fft4655 = _mm512_mul_ps(fft4647, fft4566);
__m512 fft4570 = _mm512_mul_ps(fft4561, fft4566);
__m512 fft4656 = _mm512_mul_ps(fft4648, fft4566);
__m512 fft4571 = _mm512_mul_ps(fft4562, fft4566);
__m512 fft4657 = _mm512_mul_ps(fft4649, fft4566);
__m512 fft4572 = _mm512_mul_ps(fft4563, fft4566);
__m512 fft4658 = _mm512_mul_ps(fft4650, fft4566);
__m512 fft4573 = _mm512_mul_ps(fft4564, fft4566);
__m512 fft4659 = _mm512_mul_ps(fft4651, fft4566);
__m512 fft4574 = _mm512_mul_ps(fft4565, fft4566);
__m512 fft4660 = _mm512_mul_ps(fft4652, fft4566);
__m512 fft4575 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft4576 = _mm512_fmadd_ps(fft4559, fft4575, fft4567);
__m512 fft4661 = _mm512_fmadd_ps(fft4646, fft4575, fft4653);
__m512 fft4577 = _mm512_fnmadd_ps(fft4558, fft4575, fft4568);
__m512 fft4662 = _mm512_fnmadd_ps(fft4645, fft4575, fft4654);
__m512 fft4578 = _mm512_fmadd_ps(fft4561, fft4575, fft4569);
__m512 fft4663 = _mm512_fmadd_ps(fft4648, fft4575, fft4655);
__m512 fft4579 = _mm512_fnmadd_ps(fft4560, fft4575, fft4570);
__m512 fft4664 = _mm512_fnmadd_ps(fft4647, fft4575, fft4656);
__m512 fft4580 = _mm512_fmadd_ps(fft4563, fft4575, fft4571);
__m512 fft4665 = _mm512_fmadd_ps(fft4650, fft4575, fft4657);
__m512 fft4581 = _mm512_fnmadd_ps(fft4562, fft4575, fft4572);
__m512 fft4666 = _mm512_fnmadd_ps(fft4649, fft4575, fft4658);
__m512 fft4582 = _mm512_fmadd_ps(fft4565, fft4575, fft4573);
__m512 fft4667 = _mm512_fmadd_ps(fft4652, fft4575, fft4659);
__m512 fft4583 = _mm512_fnmadd_ps(fft4564, fft4575, fft4574);
__m512 fft4668 = _mm512_fnmadd_ps(fft4651, fft4575, fft4660);
// Within-vector stage at 128-bit granularity: shuffle_f32x4 with imm 177 swaps
// neighbouring 128-bit lanes; the sign vector gives sum in the even 128-bit
// lanes and difference in the odd ones.
__m512 fft4584 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft4585 = _mm512_fmadd_ps(fft4576, fft4584, _mm512_shuffle_f32x4(fft4576, fft4576, 177));
__m512 fft4669 = _mm512_fmadd_ps(fft4661, fft4584, _mm512_shuffle_f32x4(fft4661, fft4661, 177));
__m512 fft4586 = _mm512_fmadd_ps(fft4577, fft4584, _mm512_shuffle_f32x4(fft4577, fft4577, 177));
__m512 fft4670 = _mm512_fmadd_ps(fft4662, fft4584, _mm512_shuffle_f32x4(fft4662, fft4662, 177));
__m512 fft4587 = _mm512_fmadd_ps(fft4578, fft4584, _mm512_shuffle_f32x4(fft4578, fft4578, 177));
__m512 fft4671 = _mm512_fmadd_ps(fft4663, fft4584, _mm512_shuffle_f32x4(fft4663, fft4663, 177));
__m512 fft4588 = _mm512_fmadd_ps(fft4579, fft4584, _mm512_shuffle_f32x4(fft4579, fft4579, 177));
__m512 fft4672 = _mm512_fmadd_ps(fft4664, fft4584, _mm512_shuffle_f32x4(fft4664, fft4664, 177));
__m512 fft4589 = _mm512_fmadd_ps(fft4580, fft4584, _mm512_shuffle_f32x4(fft4580, fft4580, 177));
__m512 fft4673 = _mm512_fmadd_ps(fft4665, fft4584, _mm512_shuffle_f32x4(fft4665, fft4665, 177));
__m512 fft4590 = _mm512_fmadd_ps(fft4581, fft4584, _mm512_shuffle_f32x4(fft4581, fft4581, 177));
__m512 fft4674 = _mm512_fmadd_ps(fft4666, fft4584, _mm512_shuffle_f32x4(fft4666, fft4666, 177));
__m512 fft4591 = _mm512_fmadd_ps(fft4582, fft4584, _mm512_shuffle_f32x4(fft4582, fft4582, 177));
__m512 fft4675 = _mm512_fmadd_ps(fft4667, fft4584, _mm512_shuffle_f32x4(fft4667, fft4667, 177));
__m512 fft4592 = _mm512_fmadd_ps(fft4583, fft4584, _mm512_shuffle_f32x4(fft4583, fft4583, 177));
__m512 fft4676 = _mm512_fmadd_ps(fft4668, fft4584, _mm512_shuffle_f32x4(fft4668, fft4668, 177));
// Mask 49344 = 0xC0C0 selects lanes 6, 7, 14, 15. In those lanes only, each
// vector pair (a, b) becomes (b, -a); all other lanes are left as they were.
__m512 fft4593 = _mm512_mask_mov_ps(fft4585, 49344, fft4586);
__m512 fft4677 = _mm512_mask_mov_ps(fft4669, 49344, fft4670);
__m512 fft4594 = _mm512_mask_sub_ps(fft4586, 49344, _mm512_setzero_ps(), fft4585);
__m512 fft4678 = _mm512_mask_sub_ps(fft4670, 49344, _mm512_setzero_ps(), fft4669);
__m512 fft4595 = _mm512_mask_mov_ps(fft4587, 49344, fft4588);
__m512 fft4679 = _mm512_mask_mov_ps(fft4671, 49344, fft4672);
__m512 fft4596 = _mm512_mask_sub_ps(fft4588, 49344, _mm512_setzero_ps(), fft4587);
__m512 fft4680 = _mm512_mask_sub_ps(fft4672, 49344, _mm512_setzero_ps(), fft4671);
__m512 fft4597 = _mm512_mask_mov_ps(fft4589, 49344, fft4590);
__m512 fft4681 = _mm512_mask_mov_ps(fft4673, 49344, fft4674);
__m512 fft4598 = _mm512_mask_sub_ps(fft4590, 49344, _mm512_setzero_ps(), fft4589);
__m512 fft4682 = _mm512_mask_sub_ps(fft4674, 49344, _mm512_setzero_ps(), fft4673);
__m512 fft4599 = _mm512_mask_mov_ps(fft4591, 49344, fft4592);
__m512 fft4683 = _mm512_mask_mov_ps(fft4675, 49344, fft4676);
__m512 fft4600 = _mm512_mask_sub_ps(fft4592, 49344, _mm512_setzero_ps(), fft4591);
__m512 fft4684 = _mm512_mask_sub_ps(fft4676, 49344, _mm512_setzero_ps(), fft4675);
// Within-vector stage at 64-bit granularity: shuffle_ps with imm 78 swaps the
// two float pairs inside every 128-bit lane; the sign vector gives sum in the
// lower pair and difference in the upper pair.
__m512 fft4601 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft4602 = _mm512_fmadd_ps(fft4593, fft4601, _mm512_shuffle_ps(fft4593, fft4593, 78));
__m512 fft4685 = _mm512_fmadd_ps(fft4677, fft4601, _mm512_shuffle_ps(fft4677, fft4677, 78));
__m512 fft4603 = _mm512_fmadd_ps(fft4594, fft4601, _mm512_shuffle_ps(fft4594, fft4594, 78));
__m512 fft4686 = _mm512_fmadd_ps(fft4678, fft4601, _mm512_shuffle_ps(fft4678, fft4678, 78));
__m512 fft4604 = _mm512_fmadd_ps(fft4595, fft4601, _mm512_shuffle_ps(fft4595, fft4595, 78));
__m512 fft4687 = _mm512_fmadd_ps(fft4679, fft4601, _mm512_shuffle_ps(fft4679, fft4679, 78));
__m512 fft4605 = _mm512_fmadd_ps(fft4596, fft4601, _mm512_shuffle_ps(fft4596, fft4596, 78));
__m512 fft4688 = _mm512_fmadd_ps(fft4680, fft4601, _mm512_shuffle_ps(fft4680, fft4680, 78));
__m512 fft4606 = _mm512_fmadd_ps(fft4597, fft4601, _mm512_shuffle_ps(fft4597, fft4597, 78));
__m512 fft4689 = _mm512_fmadd_ps(fft4681, fft4601, _mm512_shuffle_ps(fft4681, fft4681, 78));
__m512 fft4607 = _mm512_fmadd_ps(fft4598, fft4601, _mm512_shuffle_ps(fft4598, fft4598, 78));
__m512 fft4690 = _mm512_fmadd_ps(fft4682, fft4601, _mm512_shuffle_ps(fft4682, fft4682, 78));
__m512 fft4608 = _mm512_fmadd_ps(fft4599, fft4601, _mm512_shuffle_ps(fft4599, fft4599, 78));
__m512 fft4691 = _mm512_fmadd_ps(fft4683, fft4601, _mm512_shuffle_ps(fft4683, fft4683, 78));
__m512 fft4609 = _mm512_fmadd_ps(fft4600, fft4601, _mm512_shuffle_ps(fft4600, fft4600, 78));
__m512 fft4692 = _mm512_fmadd_ps(fft4684, fft4601, _mm512_shuffle_ps(fft4684, fft4684, 78));
// Extra recombination applied only to the first vector pair of each stream
// (fft4602/fft4603 and fft4685/fft4686): two lane permutations of each input
// are blended with masks 21845 (0x5555), 43176 (0xA8A8) and 22102 (0x5656).
__m512i fft4610 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft4611 = _mm512_permutexvar_ps(fft4610, fft4602);
__m512 fft4693 = _mm512_permutexvar_ps(fft4610, fft4685);
__m512i fft4612 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft4613 = _mm512_permutexvar_ps(fft4612, fft4602);
__m512 fft4694 = _mm512_permutexvar_ps(fft4612, fft4685);
__m512 fft4614 = _mm512_permutexvar_ps(fft4610, fft4603);
__m512 fft4695 = _mm512_permutexvar_ps(fft4610, fft4686);
__m512 fft4615 = _mm512_permutexvar_ps(fft4612, fft4603);
__m512 fft4696 = _mm512_permutexvar_ps(fft4612, fft4686);
__m512 fft4616 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft4617 = _mm512_fmadd_ps(fft4611, fft4616, fft4613);
__m512 fft4697 = _mm512_fmadd_ps(fft4693, fft4616, fft4694);
__m512 fft4618 = _mm512_fnmadd_ps(fft4615, fft4616, fft4614);
__m512 fft4698 = _mm512_fnmadd_ps(fft4696, fft4616, fft4695);
__m512 fft4619 = _mm512_mask_mov_ps(fft4615, 21845, fft4617);
__m512 fft4699 = _mm512_mask_mov_ps(fft4696, 21845, fft4697);
__m512 fft4620 = _mm512_mask_mov_ps(fft4611, 43176, fft4617);
__m512 fft4700 = _mm512_mask_mov_ps(fft4693, 43176, fft4697);
__m512 fft4621 = _mm512_mask_mov_ps(fft4619, 43176, fft4618);
__m512 fft4701 = _mm512_mask_mov_ps(fft4699, 43176, fft4698);
__m512 fft4622 = _mm512_mask_mov_ps(fft4620, 22102, fft4618);
__m512 fft4702 = _mm512_mask_mov_ps(fft4700, 22102, fft4698);
// Halve lanes 2-7 and 10-15 (mask 64764 = 0xFCFC); lanes 0, 1, 8, 9 keep
// their value.
__m512 fft4623 = _mm512_mask_mul_ps(fft4621, 64764, fft4621, _mm512_set1_ps(5e-01f));
__m512 fft4703 = _mm512_mask_mul_ps(fft4701, 64764, fft4701, _mm512_set1_ps(5e-01f));
__m512 fft4624 = _mm512_mask_mul_ps(fft4622, 64764, fft4622, _mm512_set1_ps(5e-01f));
__m512 fft4704 = _mm512_mask_mul_ps(fft4702, 64764, fft4702, _mm512_set1_ps(5e-01f));
// Rename the 16 results for the store phase: df401-df408 come from the first
// stream, df409-df416 from the second.
__m512 df401 = fft4623;
__m512 df409 = fft4703;
__m512 df402 = fft4624;
__m512 df410 = fft4704;
__m512 df403 = fft4604;
__m512 df411 = fft4687;
__m512 df404 = fft4605;
__m512 df412 = fft4688;
__m512 df405 = fft4606;
__m512 df413 = fft4689;
__m512 df406 = fft4607;
__m512 df414 = fft4690;
__m512 df407 = fft4608;
__m512 df415 = fft4691;
__m512 df408 = fft4609;
__m512 df416 = fft4692;
// eo28 gathers the even-indexed source lanes into lanes 0-7 and the
// odd-indexed ones into lanes 8-15. It is applied to every vector except
// df401, df402, df409 and df410, which are stored as they are.
// Each vector is written with two masked stores: mask 255 writes lanes 0-7,
// mask 65280 writes lanes 8-15, each to a different dfPtr1 offset.
__m512i eo28 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df403 = _mm512_permutexvar_ps(eo28, df403);
df404 = _mm512_permutexvar_ps(eo28, df404);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df403);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df404);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df403);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df404);
df411 = _mm512_permutexvar_ps(eo28, df411);
df412 = _mm512_permutexvar_ps(eo28, df412);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df411);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df412);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df411);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df412);
df405 = _mm512_permutexvar_ps(eo28, df405);
df406 = _mm512_permutexvar_ps(eo28, df406);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df405);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df406);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df405);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df406);
df413 = _mm512_permutexvar_ps(eo28, df413);
df414 = _mm512_permutexvar_ps(eo28, df414);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df413);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df414);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df413);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df414);
df407 = _mm512_permutexvar_ps(eo28, df407);
df408 = _mm512_permutexvar_ps(eo28, df408);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df407);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df408);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df407);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df408);
df415 = _mm512_permutexvar_ps(eo28, df415);
df416 = _mm512_permutexvar_ps(eo28, df416);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df415);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df416);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df415);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df416);
// The specially recombined vectors go out last, without the eo28 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df401);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df402);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df401);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df402);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df409);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df410);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df409);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df410);
}
}
// Stop as soon as the caller-supplied upper bound last1 has been processed;
// otherwise advance to the next j2 (w12 is bumped by the loop header).
if (j2 >= last1) return;
++j2;
}
// All positions below 18 are done; continue with position 18.
rel2 = 18;
}
ptrdiff_t h13 = base2+40;
ptrdiff_t w13 = 170;
ptrdiff_t k14 = 3*s1;
ptrdiff_t kk13 = k14+2;
for (; k14 <= kk13; ++k14) {
for (ptrdiff_t b29 = 0; b29 < 5; ++b29) {
ptrdiff_t m29 = (size_t)b29/2;
ptrdiff_t f30 = (size_t)b29%2;
__m512 dat402 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat403 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat404 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat405 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat406 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat407 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat408 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat409 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat410 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat411 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat412 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat413 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat414 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat415 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat416 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 dat417 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k14+896*h13+4*w13+40*b29);
__m512 fft4705 = _mm512_add_ps(dat402, dat410);
__m512 fft4793 = _mm512_add_ps(dat403, dat411);
__m512 fft4706 = _mm512_sub_ps(dat402, dat410);
__m512 fft4794 = _mm512_sub_ps(dat403, dat411);
__m512 fft4707 = _mm512_add_ps(dat404, dat412);
__m512 fft4795 = _mm512_add_ps(dat405, dat413);
__m512 fft4708 = _mm512_sub_ps(dat404, dat412);
__m512 fft4796 = _mm512_sub_ps(dat405, dat413);
__m512 fft4709 = _mm512_add_ps(dat406, dat414);
__m512 fft4797 = _mm512_add_ps(dat407, dat415);
__m512 fft4710 = _mm512_sub_ps(dat406, dat414);
__m512 fft4798 = _mm512_sub_ps(dat407, dat415);
__m512 fft4711 = _mm512_add_ps(dat408, dat416);
__m512 fft4799 = _mm512_add_ps(dat409, dat417);
__m512 fft4712 = _mm512_sub_ps(dat408, dat416);
__m512 fft4800 = _mm512_sub_ps(dat409, dat417);
__m512 fft4713 = _mm512_add_ps(fft4705, fft4709);
__m512 fft4801 = _mm512_add_ps(fft4793, fft4797);
__m512 fft4714 = _mm512_sub_ps(fft4705, fft4709);
__m512 fft4802 = _mm512_sub_ps(fft4793, fft4797);
__m512 fft4715 = _mm512_add_ps(fft4707, fft4711);
__m512 fft4803 = _mm512_add_ps(fft4795, fft4799);
__m512 fft4716 = _mm512_sub_ps(fft4711, fft4707);
__m512 fft4804 = _mm512_sub_ps(fft4799, fft4795);
__m512 fft4717 = _mm512_sub_ps(fft4708, fft4712);
__m512 fft4805 = _mm512_sub_ps(fft4796, fft4800);
__m512 fft4718 = _mm512_add_ps(fft4708, fft4712);
__m512 fft4806 = _mm512_add_ps(fft4796, fft4800);
__m512 fft4719 = _mm512_add_ps(fft4713, fft4715);
__m512 fft4807 = _mm512_add_ps(fft4801, fft4803);
__m512 fft4720 = _mm512_sub_ps(fft4713, fft4715);
__m512 fft4808 = _mm512_sub_ps(fft4801, fft4803);
__m512 fft4721 = _mm512_fmadd_ps(fft4717, _mm512_set1_ps(7.0710677e-01f), fft4706);
__m512 fft4809 = _mm512_fmadd_ps(fft4805, _mm512_set1_ps(7.0710677e-01f), fft4794);
__m512 fft4722 = _mm512_fnmsub_ps(fft4718, _mm512_set1_ps(7.0710677e-01f), fft4710);
__m512 fft4810 = _mm512_fnmsub_ps(fft4806, _mm512_set1_ps(7.0710677e-01f), fft4798);
__m512 fft4723 = _mm512_fnmadd_ps(fft4717, _mm512_set1_ps(7.0710677e-01f), fft4706);
__m512 fft4811 = _mm512_fnmadd_ps(fft4805, _mm512_set1_ps(7.0710677e-01f), fft4794);
__m512 fft4724 = _mm512_fnmadd_ps(fft4718, _mm512_set1_ps(7.0710677e-01f), fft4710);
__m512 fft4812 = _mm512_fnmadd_ps(fft4806, _mm512_set1_ps(7.0710677e-01f), fft4798);
__m512 fft4725 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4726 = _mm512_fmadd_ps(fft4719, fft4725, _mm512_shuffle_f32x4(fft4719, fft4719, 78));
__m512 fft4813 = _mm512_fmadd_ps(fft4807, fft4725, _mm512_shuffle_f32x4(fft4807, fft4807, 78));
__m512 fft4727 = _mm512_fmadd_ps(fft4720, fft4725, _mm512_shuffle_f32x4(fft4720, fft4720, 78));
__m512 fft4814 = _mm512_fmadd_ps(fft4808, fft4725, _mm512_shuffle_f32x4(fft4808, fft4808, 78));
__m512 fft4728 = _mm512_fmadd_ps(fft4721, fft4725, _mm512_shuffle_f32x4(fft4721, fft4721, 78));
__m512 fft4815 = _mm512_fmadd_ps(fft4809, fft4725, _mm512_shuffle_f32x4(fft4809, fft4809, 78));
__m512 fft4729 = _mm512_fmadd_ps(fft4722, fft4725, _mm512_shuffle_f32x4(fft4722, fft4722, 78));
__m512 fft4816 = _mm512_fmadd_ps(fft4810, fft4725, _mm512_shuffle_f32x4(fft4810, fft4810, 78));
__m512 fft4730 = _mm512_fmadd_ps(fft4714, fft4725, _mm512_shuffle_f32x4(fft4714, fft4714, 78));
__m512 fft4817 = _mm512_fmadd_ps(fft4802, fft4725, _mm512_shuffle_f32x4(fft4802, fft4802, 78));
__m512 fft4731 = _mm512_fmadd_ps(fft4716, fft4725, _mm512_shuffle_f32x4(fft4716, fft4716, 78));
__m512 fft4818 = _mm512_fmadd_ps(fft4804, fft4725, _mm512_shuffle_f32x4(fft4804, fft4804, 78));
__m512 fft4732 = _mm512_fmadd_ps(fft4723, fft4725, _mm512_shuffle_f32x4(fft4723, fft4723, 78));
__m512 fft4819 = _mm512_fmadd_ps(fft4811, fft4725, _mm512_shuffle_f32x4(fft4811, fft4811, 78));
__m512 fft4733 = _mm512_fmadd_ps(fft4724, fft4725, _mm512_shuffle_f32x4(fft4724, fft4724, 78));
__m512 fft4820 = _mm512_fmadd_ps(fft4812, fft4725, _mm512_shuffle_f32x4(fft4812, fft4812, 78));
__m512 fft4734 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4735 = _mm512_mul_ps(fft4726, fft4734);
__m512 fft4821 = _mm512_mul_ps(fft4813, fft4734);
__m512 fft4736 = _mm512_mul_ps(fft4727, fft4734);
__m512 fft4822 = _mm512_mul_ps(fft4814, fft4734);
__m512 fft4737 = _mm512_mul_ps(fft4728, fft4734);
__m512 fft4823 = _mm512_mul_ps(fft4815, fft4734);
__m512 fft4738 = _mm512_mul_ps(fft4729, fft4734);
__m512 fft4824 = _mm512_mul_ps(fft4816, fft4734);
__m512 fft4739 = _mm512_mul_ps(fft4730, fft4734);
__m512 fft4825 = _mm512_mul_ps(fft4817, fft4734);
__m512 fft4740 = _mm512_mul_ps(fft4731, fft4734);
__m512 fft4826 = _mm512_mul_ps(fft4818, fft4734);
__m512 fft4741 = _mm512_mul_ps(fft4732, fft4734);
__m512 fft4827 = _mm512_mul_ps(fft4819, fft4734);
__m512 fft4742 = _mm512_mul_ps(fft4733, fft4734);
__m512 fft4828 = _mm512_mul_ps(fft4820, fft4734);
__m512 fft4743 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft4744 = _mm512_fmadd_ps(fft4727, fft4743, fft4735);
__m512 fft4829 = _mm512_fmadd_ps(fft4814, fft4743, fft4821);
__m512 fft4745 = _mm512_fnmadd_ps(fft4726, fft4743, fft4736);
__m512 fft4830 = _mm512_fnmadd_ps(fft4813, fft4743, fft4822);
__m512 fft4746 = _mm512_fmadd_ps(fft4729, fft4743, fft4737);
__m512 fft4831 = _mm512_fmadd_ps(fft4816, fft4743, fft4823);
__m512 fft4747 = _mm512_fnmadd_ps(fft4728, fft4743, fft4738);
__m512 fft4832 = _mm512_fnmadd_ps(fft4815, fft4743, fft4824);
__m512 fft4748 = _mm512_fmadd_ps(fft4731, fft4743, fft4739);
__m512 fft4833 = _mm512_fmadd_ps(fft4818, fft4743, fft4825);
__m512 fft4749 = _mm512_fnmadd_ps(fft4730, fft4743, fft4740);
__m512 fft4834 = _mm512_fnmadd_ps(fft4817, fft4743, fft4826);
__m512 fft4750 = _mm512_fmadd_ps(fft4733, fft4743, fft4741);
__m512 fft4835 = _mm512_fmadd_ps(fft4820, fft4743, fft4827);
__m512 fft4751 = _mm512_fnmadd_ps(fft4732, fft4743, fft4742);
__m512 fft4836 = _mm512_fnmadd_ps(fft4819, fft4743, fft4828);
__m512 fft4752 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft4753 = _mm512_fmadd_ps(fft4744, fft4752, _mm512_shuffle_f32x4(fft4744, fft4744, 177));
__m512 fft4837 = _mm512_fmadd_ps(fft4829, fft4752, _mm512_shuffle_f32x4(fft4829, fft4829, 177));
__m512 fft4754 = _mm512_fmadd_ps(fft4745, fft4752, _mm512_shuffle_f32x4(fft4745, fft4745, 177));
__m512 fft4838 = _mm512_fmadd_ps(fft4830, fft4752, _mm512_shuffle_f32x4(fft4830, fft4830, 177));
__m512 fft4755 = _mm512_fmadd_ps(fft4746, fft4752, _mm512_shuffle_f32x4(fft4746, fft4746, 177));
__m512 fft4839 = _mm512_fmadd_ps(fft4831, fft4752, _mm512_shuffle_f32x4(fft4831, fft4831, 177));
__m512 fft4756 = _mm512_fmadd_ps(fft4747, fft4752, _mm512_shuffle_f32x4(fft4747, fft4747, 177));
__m512 fft4840 = _mm512_fmadd_ps(fft4832, fft4752, _mm512_shuffle_f32x4(fft4832, fft4832, 177));
__m512 fft4757 = _mm512_fmadd_ps(fft4748, fft4752, _mm512_shuffle_f32x4(fft4748, fft4748, 177));
__m512 fft4841 = _mm512_fmadd_ps(fft4833, fft4752, _mm512_shuffle_f32x4(fft4833, fft4833, 177));
__m512 fft4758 = _mm512_fmadd_ps(fft4749, fft4752, _mm512_shuffle_f32x4(fft4749, fft4749, 177));
__m512 fft4842 = _mm512_fmadd_ps(fft4834, fft4752, _mm512_shuffle_f32x4(fft4834, fft4834, 177));
__m512 fft4759 = _mm512_fmadd_ps(fft4750, fft4752, _mm512_shuffle_f32x4(fft4750, fft4750, 177));
__m512 fft4843 = _mm512_fmadd_ps(fft4835, fft4752, _mm512_shuffle_f32x4(fft4835, fft4835, 177));
__m512 fft4760 = _mm512_fmadd_ps(fft4751, fft4752, _mm512_shuffle_f32x4(fft4751, fft4751, 177));
__m512 fft4844 = _mm512_fmadd_ps(fft4836, fft4752, _mm512_shuffle_f32x4(fft4836, fft4836, 177));
__m512 fft4761 = _mm512_mask_mov_ps(fft4753, 49344, fft4754);
__m512 fft4845 = _mm512_mask_mov_ps(fft4837, 49344, fft4838);
__m512 fft4762 = _mm512_mask_sub_ps(fft4754, 49344, _mm512_setzero_ps(), fft4753);
__m512 fft4846 = _mm512_mask_sub_ps(fft4838, 49344, _mm512_setzero_ps(), fft4837);
__m512 fft4763 = _mm512_mask_mov_ps(fft4755, 49344, fft4756);
__m512 fft4847 = _mm512_mask_mov_ps(fft4839, 49344, fft4840);
__m512 fft4764 = _mm512_mask_sub_ps(fft4756, 49344, _mm512_setzero_ps(), fft4755);
__m512 fft4848 = _mm512_mask_sub_ps(fft4840, 49344, _mm512_setzero_ps(), fft4839);
__m512 fft4765 = _mm512_mask_mov_ps(fft4757, 49344, fft4758);
__m512 fft4849 = _mm512_mask_mov_ps(fft4841, 49344, fft4842);
__m512 fft4766 = _mm512_mask_sub_ps(fft4758, 49344, _mm512_setzero_ps(), fft4757);
__m512 fft4850 = _mm512_mask_sub_ps(fft4842, 49344, _mm512_setzero_ps(), fft4841);
__m512 fft4767 = _mm512_mask_mov_ps(fft4759, 49344, fft4760);
__m512 fft4851 = _mm512_mask_mov_ps(fft4843, 49344, fft4844);
__m512 fft4768 = _mm512_mask_sub_ps(fft4760, 49344, _mm512_setzero_ps(), fft4759);
__m512 fft4852 = _mm512_mask_sub_ps(fft4844, 49344, _mm512_setzero_ps(), fft4843);
__m512 fft4769 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft4770 = _mm512_fmadd_ps(fft4761, fft4769, _mm512_shuffle_ps(fft4761, fft4761, 78));
__m512 fft4853 = _mm512_fmadd_ps(fft4845, fft4769, _mm512_shuffle_ps(fft4845, fft4845, 78));
__m512 fft4771 = _mm512_fmadd_ps(fft4762, fft4769, _mm512_shuffle_ps(fft4762, fft4762, 78));
__m512 fft4854 = _mm512_fmadd_ps(fft4846, fft4769, _mm512_shuffle_ps(fft4846, fft4846, 78));
__m512 fft4772 = _mm512_fmadd_ps(fft4763, fft4769, _mm512_shuffle_ps(fft4763, fft4763, 78));
__m512 fft4855 = _mm512_fmadd_ps(fft4847, fft4769, _mm512_shuffle_ps(fft4847, fft4847, 78));
__m512 fft4773 = _mm512_fmadd_ps(fft4764, fft4769, _mm512_shuffle_ps(fft4764, fft4764, 78));
__m512 fft4856 = _mm512_fmadd_ps(fft4848, fft4769, _mm512_shuffle_ps(fft4848, fft4848, 78));
__m512 fft4774 = _mm512_fmadd_ps(fft4765, fft4769, _mm512_shuffle_ps(fft4765, fft4765, 78));
__m512 fft4857 = _mm512_fmadd_ps(fft4849, fft4769, _mm512_shuffle_ps(fft4849, fft4849, 78));
__m512 fft4775 = _mm512_fmadd_ps(fft4766, fft4769, _mm512_shuffle_ps(fft4766, fft4766, 78));
__m512 fft4858 = _mm512_fmadd_ps(fft4850, fft4769, _mm512_shuffle_ps(fft4850, fft4850, 78));
__m512 fft4776 = _mm512_fmadd_ps(fft4767, fft4769, _mm512_shuffle_ps(fft4767, fft4767, 78));
__m512 fft4859 = _mm512_fmadd_ps(fft4851, fft4769, _mm512_shuffle_ps(fft4851, fft4851, 78));
__m512 fft4777 = _mm512_fmadd_ps(fft4768, fft4769, _mm512_shuffle_ps(fft4768, fft4768, 78));
__m512 fft4860 = _mm512_fmadd_ps(fft4852, fft4769, _mm512_shuffle_ps(fft4852, fft4852, 78));
__m512i fft4778 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft4779 = _mm512_permutexvar_ps(fft4778, fft4770);
__m512 fft4861 = _mm512_permutexvar_ps(fft4778, fft4853);
__m512i fft4780 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft4781 = _mm512_permutexvar_ps(fft4780, fft4770);
__m512 fft4862 = _mm512_permutexvar_ps(fft4780, fft4853);
__m512 fft4782 = _mm512_permutexvar_ps(fft4778, fft4771);
__m512 fft4863 = _mm512_permutexvar_ps(fft4778, fft4854);
__m512 fft4783 = _mm512_permutexvar_ps(fft4780, fft4771);
__m512 fft4864 = _mm512_permutexvar_ps(fft4780, fft4854);
__m512 fft4784 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft4785 = _mm512_fmadd_ps(fft4779, fft4784, fft4781);
__m512 fft4865 = _mm512_fmadd_ps(fft4861, fft4784, fft4862);
__m512 fft4786 = _mm512_fnmadd_ps(fft4783, fft4784, fft4782);
__m512 fft4866 = _mm512_fnmadd_ps(fft4864, fft4784, fft4863);
__m512 fft4787 = _mm512_mask_mov_ps(fft4783, 21845, fft4785);
__m512 fft4867 = _mm512_mask_mov_ps(fft4864, 21845, fft4865);
__m512 fft4788 = _mm512_mask_mov_ps(fft4779, 43176, fft4785);
__m512 fft4868 = _mm512_mask_mov_ps(fft4861, 43176, fft4865);
__m512 fft4789 = _mm512_mask_mov_ps(fft4787, 43176, fft4786);
__m512 fft4869 = _mm512_mask_mov_ps(fft4867, 43176, fft4866);
__m512 fft4790 = _mm512_mask_mov_ps(fft4788, 22102, fft4786);
__m512 fft4870 = _mm512_mask_mov_ps(fft4868, 22102, fft4866);
__m512 fft4791 = _mm512_mask_mul_ps(fft4789, 64764, fft4789, _mm512_set1_ps(5e-01f));
__m512 fft4871 = _mm512_mask_mul_ps(fft4869, 64764, fft4869, _mm512_set1_ps(5e-01f));
__m512 fft4792 = _mm512_mask_mul_ps(fft4790, 64764, fft4790, _mm512_set1_ps(5e-01f));
__m512 fft4872 = _mm512_mask_mul_ps(fft4870, 64764, fft4870, _mm512_set1_ps(5e-01f));
__m512 df417 = fft4791;
__m512 df425 = fft4871;
__m512 df418 = fft4792;
__m512 df426 = fft4872;
__m512 df419 = fft4772;
__m512 df427 = fft4855;
__m512 df420 = fft4773;
__m512 df428 = fft4856;
__m512 df421 = fft4774;
__m512 df429 = fft4857;
__m512 df422 = fft4775;
__m512 df430 = fft4858;
__m512 df423 = fft4776;
__m512 df431 = fft4859;
__m512 df424 = fft4777;
__m512 df432 = fft4860;
__m512i eo29 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df419 = _mm512_permutexvar_ps(eo29, df419);
df420 = _mm512_permutexvar_ps(eo29, df420);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df419);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df420);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df419);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df420);
df427 = _mm512_permutexvar_ps(eo29, df427);
df428 = _mm512_permutexvar_ps(eo29, df428);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df427);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df428);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df427);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df428);
df421 = _mm512_permutexvar_ps(eo29, df421);
df422 = _mm512_permutexvar_ps(eo29, df422);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df421);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df422);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df421);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df422);
df429 = _mm512_permutexvar_ps(eo29, df429);
df430 = _mm512_permutexvar_ps(eo29, df430);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df429);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df430);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df429);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df430);
df423 = _mm512_permutexvar_ps(eo29, df423);
df424 = _mm512_permutexvar_ps(eo29, df424);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df423);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df424);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df423);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df424);
df431 = _mm512_permutexvar_ps(eo29, df431);
df432 = _mm512_permutexvar_ps(eo29, df432);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df431);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df432);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df431);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df432);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df417);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df418);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df417);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df418);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df425);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df426);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df425);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df426);
}
// Tile with constant index b30 = 5, so m30 = 5/2 = 2 and f31 = 5%2 = 1.
// m30 and f31 appear below only inside the dfPtr1 store offsets
// (128*m30 + 32*f31).
ptrdiff_t b30 = 5;
ptrdiff_t m30 = (size_t)b30/2;
ptrdiff_t f31 = (size_t)b30%2;
// Load 16 vectors from datPtr1 at constant offsets 200, 1096, ..., 13640.
// Consecutive offsets differ by 896, the same coefficient that multiplies h13.
// Mask 127 (0x007F) reads lanes 0-6 only; lanes 7-15 are zero-filled by the
// maskz form. The trailing 0*b30 term contributes nothing to the address.
// NOTE(review): presumably one vector per input row (896 = 224 floats * 4
// bytes if datPtr1 is a byte pointer) -- confirm against datPtr1's declaration.
__m512 dat418 = _mm512_maskz_loadu_ps(127, datPtr1+200+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat419 = _mm512_maskz_loadu_ps(127, datPtr1+1096+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat420 = _mm512_maskz_loadu_ps(127, datPtr1+1992+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat421 = _mm512_maskz_loadu_ps(127, datPtr1+2888+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat422 = _mm512_maskz_loadu_ps(127, datPtr1+3784+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat423 = _mm512_maskz_loadu_ps(127, datPtr1+4680+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat424 = _mm512_maskz_loadu_ps(127, datPtr1+5576+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat425 = _mm512_maskz_loadu_ps(127, datPtr1+6472+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat426 = _mm512_maskz_loadu_ps(127, datPtr1+7368+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat427 = _mm512_maskz_loadu_ps(127, datPtr1+8264+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat428 = _mm512_maskz_loadu_ps(127, datPtr1+9160+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat429 = _mm512_maskz_loadu_ps(127, datPtr1+10056+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat430 = _mm512_maskz_loadu_ps(127, datPtr1+10952+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat431 = _mm512_maskz_loadu_ps(127, datPtr1+11848+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat432 = _mm512_maskz_loadu_ps(127, datPtr1+12744+602112*i6+200704*k14+896*h13+4*w13+0*b30);
__m512 dat433 = _mm512_maskz_loadu_ps(127, datPtr1+13640+602112*i6+200704*k14+896*h13+4*w13+0*b30);
// Cross-register butterfly stages: every operation here combines whole
// vectors lane-by-lane, no lane is moved. Two independent chains are
// interleaved line by line: fft4873..fft4892 consumes the even-numbered loads
// (dat418, dat420, ..., dat432) and fft4961..fft4980 consumes the odd-numbered
// loads (dat419, dat421, ..., dat433) with identical arithmetic.
// NOTE(review): presumably the column pass of a real-input FFT over the 16
// loaded vectors -- inferred from the fft* names, confirm against the generator.
//
// Stage 1: sum and difference of each load with the load eight positions later.
__m512 fft4873 = _mm512_add_ps(dat418, dat426);
__m512 fft4961 = _mm512_add_ps(dat419, dat427);
__m512 fft4874 = _mm512_sub_ps(dat418, dat426);
__m512 fft4962 = _mm512_sub_ps(dat419, dat427);
__m512 fft4875 = _mm512_add_ps(dat420, dat428);
__m512 fft4963 = _mm512_add_ps(dat421, dat429);
__m512 fft4876 = _mm512_sub_ps(dat420, dat428);
__m512 fft4964 = _mm512_sub_ps(dat421, dat429);
__m512 fft4877 = _mm512_add_ps(dat422, dat430);
__m512 fft4965 = _mm512_add_ps(dat423, dat431);
__m512 fft4878 = _mm512_sub_ps(dat422, dat430);
__m512 fft4966 = _mm512_sub_ps(dat423, dat431);
__m512 fft4879 = _mm512_add_ps(dat424, dat432);
__m512 fft4967 = _mm512_add_ps(dat425, dat433);
__m512 fft4880 = _mm512_sub_ps(dat424, dat432);
__m512 fft4968 = _mm512_sub_ps(dat425, dat433);
// Stage 2: combine the stage-1 sums pairwise.
__m512 fft4881 = _mm512_add_ps(fft4873, fft4877);
__m512 fft4969 = _mm512_add_ps(fft4961, fft4965);
__m512 fft4882 = _mm512_sub_ps(fft4873, fft4877);
__m512 fft4970 = _mm512_sub_ps(fft4961, fft4965);
__m512 fft4883 = _mm512_add_ps(fft4875, fft4879);
__m512 fft4971 = _mm512_add_ps(fft4963, fft4967);
// Operand order is reversed here (fft4879 - fft4875), unlike fft4882 above.
__m512 fft4884 = _mm512_sub_ps(fft4879, fft4875);
__m512 fft4972 = _mm512_sub_ps(fft4967, fft4963);
// Difference and sum of two of the stage-1 differences; both results are
// scaled by 7.0710677e-01f in the fused multiply-adds below.
__m512 fft4885 = _mm512_sub_ps(fft4876, fft4880);
__m512 fft4973 = _mm512_sub_ps(fft4964, fft4968);
__m512 fft4886 = _mm512_add_ps(fft4876, fft4880);
__m512 fft4974 = _mm512_add_ps(fft4964, fft4968);
// Stage 3: final sum/difference of the stage-2 sums.
__m512 fft4887 = _mm512_add_ps(fft4881, fft4883);
__m512 fft4975 = _mm512_add_ps(fft4969, fft4971);
__m512 fft4888 = _mm512_sub_ps(fft4881, fft4883);
__m512 fft4976 = _mm512_sub_ps(fft4969, fft4971);
// Fused forms used below, with c = 7.0710677e-01f (approximately 1/sqrt(2)):
//   fmadd(a, c, b)  =  a*c + b
//   fnmsub(a, c, b) = -(a*c) - b
//   fnmadd(a, c, b) = -(a*c) + b
__m512 fft4889 = _mm512_fmadd_ps(fft4885, _mm512_set1_ps(7.0710677e-01f), fft4874);
__m512 fft4977 = _mm512_fmadd_ps(fft4973, _mm512_set1_ps(7.0710677e-01f), fft4962);
__m512 fft4890 = _mm512_fnmsub_ps(fft4886, _mm512_set1_ps(7.0710677e-01f), fft4878);
__m512 fft4978 = _mm512_fnmsub_ps(fft4974, _mm512_set1_ps(7.0710677e-01f), fft4966);
__m512 fft4891 = _mm512_fnmadd_ps(fft4885, _mm512_set1_ps(7.0710677e-01f), fft4874);
__m512 fft4979 = _mm512_fnmadd_ps(fft4973, _mm512_set1_ps(7.0710677e-01f), fft4962);
__m512 fft4892 = _mm512_fnmadd_ps(fft4886, _mm512_set1_ps(7.0710677e-01f), fft4878);
__m512 fft4980 = _mm512_fnmadd_ps(fft4974, _mm512_set1_ps(7.0710677e-01f), fft4966);
// In-register stages: from here on, lanes inside each vector are combined with
// each other. As before, two chains alternate line by line (results
// fft4894..fft4960 and fft4981..fft5040).
// Reminder: _mm512_set_ps / _mm512_set_epi32 list lane 15 first and lane 0 last.
//
// Butterfly across the two 256-bit halves. fft4893 is +1 in lanes 0-7 and -1
// in lanes 8-15; shuffle_f32x4 with immediate 78 swaps the 256-bit halves.
// Result: lanes 0-7 = low + high, lanes 8-15 = low - high.
__m512 fft4893 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4894 = _mm512_fmadd_ps(fft4887, fft4893, _mm512_shuffle_f32x4(fft4887, fft4887, 78));
__m512 fft4981 = _mm512_fmadd_ps(fft4975, fft4893, _mm512_shuffle_f32x4(fft4975, fft4975, 78));
__m512 fft4895 = _mm512_fmadd_ps(fft4888, fft4893, _mm512_shuffle_f32x4(fft4888, fft4888, 78));
__m512 fft4982 = _mm512_fmadd_ps(fft4976, fft4893, _mm512_shuffle_f32x4(fft4976, fft4976, 78));
__m512 fft4896 = _mm512_fmadd_ps(fft4889, fft4893, _mm512_shuffle_f32x4(fft4889, fft4889, 78));
__m512 fft4983 = _mm512_fmadd_ps(fft4977, fft4893, _mm512_shuffle_f32x4(fft4977, fft4977, 78));
__m512 fft4897 = _mm512_fmadd_ps(fft4890, fft4893, _mm512_shuffle_f32x4(fft4890, fft4890, 78));
__m512 fft4984 = _mm512_fmadd_ps(fft4978, fft4893, _mm512_shuffle_f32x4(fft4978, fft4978, 78));
__m512 fft4898 = _mm512_fmadd_ps(fft4882, fft4893, _mm512_shuffle_f32x4(fft4882, fft4882, 78));
__m512 fft4985 = _mm512_fmadd_ps(fft4970, fft4893, _mm512_shuffle_f32x4(fft4970, fft4970, 78));
__m512 fft4899 = _mm512_fmadd_ps(fft4884, fft4893, _mm512_shuffle_f32x4(fft4884, fft4884, 78));
__m512 fft4986 = _mm512_fmadd_ps(fft4972, fft4893, _mm512_shuffle_f32x4(fft4972, fft4972, 78));
__m512 fft4900 = _mm512_fmadd_ps(fft4891, fft4893, _mm512_shuffle_f32x4(fft4891, fft4891, 78));
__m512 fft4987 = _mm512_fmadd_ps(fft4979, fft4893, _mm512_shuffle_f32x4(fft4979, fft4979, 78));
__m512 fft4901 = _mm512_fmadd_ps(fft4892, fft4893, _mm512_shuffle_f32x4(fft4892, fft4892, 78));
__m512 fft4988 = _mm512_fmadd_ps(fft4980, fft4893, _mm512_shuffle_f32x4(fft4980, fft4980, 78));
// Per-lane scaling of each vector by fft4902: lanes 0-9 are multiplied by 1,
// lanes 10-11 by 0.70710677, lanes 12-13 by 0, lanes 14-15 by -0.70710677.
__m512 fft4902 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4903 = _mm512_mul_ps(fft4894, fft4902);
__m512 fft4989 = _mm512_mul_ps(fft4981, fft4902);
__m512 fft4904 = _mm512_mul_ps(fft4895, fft4902);
__m512 fft4990 = _mm512_mul_ps(fft4982, fft4902);
__m512 fft4905 = _mm512_mul_ps(fft4896, fft4902);
__m512 fft4991 = _mm512_mul_ps(fft4983, fft4902);
__m512 fft4906 = _mm512_mul_ps(fft4897, fft4902);
__m512 fft4992 = _mm512_mul_ps(fft4984, fft4902);
__m512 fft4907 = _mm512_mul_ps(fft4898, fft4902);
__m512 fft4993 = _mm512_mul_ps(fft4985, fft4902);
__m512 fft4908 = _mm512_mul_ps(fft4899, fft4902);
__m512 fft4994 = _mm512_mul_ps(fft4986, fft4902);
__m512 fft4909 = _mm512_mul_ps(fft4900, fft4902);
__m512 fft4995 = _mm512_mul_ps(fft4987, fft4902);
__m512 fft4910 = _mm512_mul_ps(fft4901, fft4902);
__m512 fft4996 = _mm512_mul_ps(fft4988, fft4902);
// Cross term with the companion vector, weighted by fft4911 (0 in lanes 0-9,
// 0.70710677 in lanes 10-11 and 14-15, 1 in lanes 12-13). For each adjacent
// pair (p, q) of vectors from the previous stage:
//   first  = q*fft4911 + p*fft4902
//   second = -(p*fft4911) + q*fft4902
// Lanes 0-9 therefore pass through unchanged.
// NOTE(review): this has the shape of a complex multiply of (p, q) taken as
// (re, im) by a per-lane twiddle factor -- confirm against the generator.
__m512 fft4911 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft4912 = _mm512_fmadd_ps(fft4895, fft4911, fft4903);
__m512 fft4997 = _mm512_fmadd_ps(fft4982, fft4911, fft4989);
__m512 fft4913 = _mm512_fnmadd_ps(fft4894, fft4911, fft4904);
__m512 fft4998 = _mm512_fnmadd_ps(fft4981, fft4911, fft4990);
__m512 fft4914 = _mm512_fmadd_ps(fft4897, fft4911, fft4905);
__m512 fft4999 = _mm512_fmadd_ps(fft4984, fft4911, fft4991);
__m512 fft4915 = _mm512_fnmadd_ps(fft4896, fft4911, fft4906);
__m512 fft5000 = _mm512_fnmadd_ps(fft4983, fft4911, fft4992);
__m512 fft4916 = _mm512_fmadd_ps(fft4899, fft4911, fft4907);
__m512 fft5001 = _mm512_fmadd_ps(fft4986, fft4911, fft4993);
__m512 fft4917 = _mm512_fnmadd_ps(fft4898, fft4911, fft4908);
__m512 fft5002 = _mm512_fnmadd_ps(fft4985, fft4911, fft4994);
__m512 fft4918 = _mm512_fmadd_ps(fft4901, fft4911, fft4909);
__m512 fft5003 = _mm512_fmadd_ps(fft4988, fft4911, fft4995);
__m512 fft4919 = _mm512_fnmadd_ps(fft4900, fft4911, fft4910);
__m512 fft5004 = _mm512_fnmadd_ps(fft4987, fft4911, fft4996);
// Butterfly across adjacent 128-bit lanes. fft4920 is +1 in lanes 0-3 and
// 8-11, -1 in lanes 4-7 and 12-15; shuffle_f32x4 with immediate 177 swaps the
// two 128-bit lanes inside each 256-bit half. Result per half: lower 128 bits
// = a + b, upper 128 bits = a - b.
__m512 fft4920 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft4921 = _mm512_fmadd_ps(fft4912, fft4920, _mm512_shuffle_f32x4(fft4912, fft4912, 177));
__m512 fft5005 = _mm512_fmadd_ps(fft4997, fft4920, _mm512_shuffle_f32x4(fft4997, fft4997, 177));
__m512 fft4922 = _mm512_fmadd_ps(fft4913, fft4920, _mm512_shuffle_f32x4(fft4913, fft4913, 177));
__m512 fft5006 = _mm512_fmadd_ps(fft4998, fft4920, _mm512_shuffle_f32x4(fft4998, fft4998, 177));
__m512 fft4923 = _mm512_fmadd_ps(fft4914, fft4920, _mm512_shuffle_f32x4(fft4914, fft4914, 177));
__m512 fft5007 = _mm512_fmadd_ps(fft4999, fft4920, _mm512_shuffle_f32x4(fft4999, fft4999, 177));
__m512 fft4924 = _mm512_fmadd_ps(fft4915, fft4920, _mm512_shuffle_f32x4(fft4915, fft4915, 177));
__m512 fft5008 = _mm512_fmadd_ps(fft5000, fft4920, _mm512_shuffle_f32x4(fft5000, fft5000, 177));
__m512 fft4925 = _mm512_fmadd_ps(fft4916, fft4920, _mm512_shuffle_f32x4(fft4916, fft4916, 177));
__m512 fft5009 = _mm512_fmadd_ps(fft5001, fft4920, _mm512_shuffle_f32x4(fft5001, fft5001, 177));
__m512 fft4926 = _mm512_fmadd_ps(fft4917, fft4920, _mm512_shuffle_f32x4(fft4917, fft4917, 177));
__m512 fft5010 = _mm512_fmadd_ps(fft5002, fft4920, _mm512_shuffle_f32x4(fft5002, fft5002, 177));
__m512 fft4927 = _mm512_fmadd_ps(fft4918, fft4920, _mm512_shuffle_f32x4(fft4918, fft4918, 177));
__m512 fft5011 = _mm512_fmadd_ps(fft5003, fft4920, _mm512_shuffle_f32x4(fft5003, fft5003, 177));
__m512 fft4928 = _mm512_fmadd_ps(fft4919, fft4920, _mm512_shuffle_f32x4(fft4919, fft4919, 177));
__m512 fft5012 = _mm512_fmadd_ps(fft5004, fft4920, _mm512_shuffle_f32x4(fft5004, fft5004, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes only, each
// adjacent pair (p, q) is replaced by (q, 0 - p); all other lanes keep (p, q).
__m512 fft4929 = _mm512_mask_mov_ps(fft4921, 49344, fft4922);
__m512 fft5013 = _mm512_mask_mov_ps(fft5005, 49344, fft5006);
__m512 fft4930 = _mm512_mask_sub_ps(fft4922, 49344, _mm512_setzero_ps(), fft4921);
__m512 fft5014 = _mm512_mask_sub_ps(fft5006, 49344, _mm512_setzero_ps(), fft5005);
__m512 fft4931 = _mm512_mask_mov_ps(fft4923, 49344, fft4924);
__m512 fft5015 = _mm512_mask_mov_ps(fft5007, 49344, fft5008);
__m512 fft4932 = _mm512_mask_sub_ps(fft4924, 49344, _mm512_setzero_ps(), fft4923);
__m512 fft5016 = _mm512_mask_sub_ps(fft5008, 49344, _mm512_setzero_ps(), fft5007);
__m512 fft4933 = _mm512_mask_mov_ps(fft4925, 49344, fft4926);
__m512 fft5017 = _mm512_mask_mov_ps(fft5009, 49344, fft5010);
__m512 fft4934 = _mm512_mask_sub_ps(fft4926, 49344, _mm512_setzero_ps(), fft4925);
__m512 fft5018 = _mm512_mask_sub_ps(fft5010, 49344, _mm512_setzero_ps(), fft5009);
__m512 fft4935 = _mm512_mask_mov_ps(fft4927, 49344, fft4928);
__m512 fft5019 = _mm512_mask_mov_ps(fft5011, 49344, fft5012);
__m512 fft4936 = _mm512_mask_sub_ps(fft4928, 49344, _mm512_setzero_ps(), fft4927);
__m512 fft5020 = _mm512_mask_sub_ps(fft5012, 49344, _mm512_setzero_ps(), fft5011);
// Butterfly inside each 128-bit lane. fft4937 is +1 in elements 0-1 and -1 in
// elements 2-3 of every 128-bit lane; shuffle_ps with immediate 78 swaps the
// two 64-bit halves of each lane. Result per lane: elements 0-1 = x01 + x23,
// elements 2-3 = x01 - x23.
__m512 fft4937 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft4938 = _mm512_fmadd_ps(fft4929, fft4937, _mm512_shuffle_ps(fft4929, fft4929, 78));
__m512 fft5021 = _mm512_fmadd_ps(fft5013, fft4937, _mm512_shuffle_ps(fft5013, fft5013, 78));
__m512 fft4939 = _mm512_fmadd_ps(fft4930, fft4937, _mm512_shuffle_ps(fft4930, fft4930, 78));
__m512 fft5022 = _mm512_fmadd_ps(fft5014, fft4937, _mm512_shuffle_ps(fft5014, fft5014, 78));
__m512 fft4940 = _mm512_fmadd_ps(fft4931, fft4937, _mm512_shuffle_ps(fft4931, fft4931, 78));
__m512 fft5023 = _mm512_fmadd_ps(fft5015, fft4937, _mm512_shuffle_ps(fft5015, fft5015, 78));
__m512 fft4941 = _mm512_fmadd_ps(fft4932, fft4937, _mm512_shuffle_ps(fft4932, fft4932, 78));
__m512 fft5024 = _mm512_fmadd_ps(fft5016, fft4937, _mm512_shuffle_ps(fft5016, fft5016, 78));
__m512 fft4942 = _mm512_fmadd_ps(fft4933, fft4937, _mm512_shuffle_ps(fft4933, fft4933, 78));
__m512 fft5025 = _mm512_fmadd_ps(fft5017, fft4937, _mm512_shuffle_ps(fft5017, fft5017, 78));
__m512 fft4943 = _mm512_fmadd_ps(fft4934, fft4937, _mm512_shuffle_ps(fft4934, fft4934, 78));
__m512 fft5026 = _mm512_fmadd_ps(fft5018, fft4937, _mm512_shuffle_ps(fft5018, fft5018, 78));
__m512 fft4944 = _mm512_fmadd_ps(fft4935, fft4937, _mm512_shuffle_ps(fft4935, fft4935, 78));
__m512 fft5027 = _mm512_fmadd_ps(fft5019, fft4937, _mm512_shuffle_ps(fft5019, fft5019, 78));
__m512 fft4945 = _mm512_fmadd_ps(fft4936, fft4937, _mm512_shuffle_ps(fft4936, fft4936, 78));
__m512 fft5028 = _mm512_fmadd_ps(fft5020, fft4937, _mm512_shuffle_ps(fft5020, fft5020, 78));
// Extra post-processing applied only to the first pair of each chain
// (fft4938/fft4939 and fft5021/fft5022); the other fourteen vectors from the
// stage above receive no further arithmetic in this section.
// Each of the four inputs is gathered twice with permutexvar, once with index
// vector fft4946 and once with fft4948 (every index is duplicated into two
// adjacent lanes).
// NOTE(review): presumably the real-FFT untangling step for the first pair of
// vectors in each chain -- confirm against the generator.
__m512i fft4946 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft4947 = _mm512_permutexvar_ps(fft4946, fft4938);
__m512 fft5029 = _mm512_permutexvar_ps(fft4946, fft5021);
__m512i fft4948 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft4949 = _mm512_permutexvar_ps(fft4948, fft4938);
__m512 fft5030 = _mm512_permutexvar_ps(fft4948, fft5021);
__m512 fft4950 = _mm512_permutexvar_ps(fft4946, fft4939);
__m512 fft5031 = _mm512_permutexvar_ps(fft4946, fft5022);
__m512 fft4951 = _mm512_permutexvar_ps(fft4948, fft4939);
__m512 fft5032 = _mm512_permutexvar_ps(fft4948, fft5022);
// fft4952 is 0 in lanes 0, 1, 8, 9 and alternates +1 (even lane) / -1 (odd
// lane) elsewhere. fft4953 = fft4947*fft4952 + fft4949 and
// fft4954 = -(fft4951*fft4952) + fft4950.
__m512 fft4952 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft4953 = _mm512_fmadd_ps(fft4947, fft4952, fft4949);
__m512 fft5033 = _mm512_fmadd_ps(fft5029, fft4952, fft5030);
__m512 fft4954 = _mm512_fnmadd_ps(fft4951, fft4952, fft4950);
__m512 fft5034 = _mm512_fnmadd_ps(fft5032, fft4952, fft5031);
// Lane-wise blends that assemble the two output vectors per chain:
//   21845 (0x5555) = even lanes
//   43176 (0xA8A8) = lanes 3, 5, 7, 11, 13, 15
//   22102 (0x5656) = lanes 1, 2, 4, 6, 9, 10, 12, 14
// mask_mov(src, k, a) takes a where the mask bit is set and src elsewhere.
__m512 fft4955 = _mm512_mask_mov_ps(fft4951, 21845, fft4953);
__m512 fft5035 = _mm512_mask_mov_ps(fft5032, 21845, fft5033);
__m512 fft4956 = _mm512_mask_mov_ps(fft4947, 43176, fft4953);
__m512 fft5036 = _mm512_mask_mov_ps(fft5029, 43176, fft5033);
__m512 fft4957 = _mm512_mask_mov_ps(fft4955, 43176, fft4954);
__m512 fft5037 = _mm512_mask_mov_ps(fft5035, 43176, fft5034);
__m512 fft4958 = _mm512_mask_mov_ps(fft4956, 22102, fft4954);
__m512 fft5038 = _mm512_mask_mov_ps(fft5036, 22102, fft5034);
// Mask 64764 (0xFCFC): halve lanes 2-7 and 10-15; lanes 0, 1, 8 and 9 are
// left as they are.
__m512 fft4959 = _mm512_mask_mul_ps(fft4957, 64764, fft4957, _mm512_set1_ps(5e-01f));
__m512 fft5039 = _mm512_mask_mul_ps(fft5037, 64764, fft5037, _mm512_set1_ps(5e-01f));
__m512 fft4960 = _mm512_mask_mul_ps(fft4958, 64764, fft4958, _mm512_set1_ps(5e-01f));
__m512 fft5040 = _mm512_mask_mul_ps(fft5038, 64764, fft5038, _mm512_set1_ps(5e-01f));
// Rename the 16 result vectors to df433..df448. df433-df440 come from the
// first chain (fft4959, fft4960, fft4940..fft4945) and df441-df448 from the
// second (fft5039, fft5040, fft5023..fft5028).
__m512 df433 = fft4959;
__m512 df441 = fft5039;
__m512 df434 = fft4960;
__m512 df442 = fft5040;
__m512 df435 = fft4940;
__m512 df443 = fft5023;
__m512 df436 = fft4941;
__m512 df444 = fft5024;
__m512 df437 = fft4942;
__m512 df445 = fft5025;
__m512 df438 = fft4943;
__m512 df446 = fft5026;
__m512 df439 = fft4944;
__m512 df447 = fft5027;
__m512 df440 = fft4945;
__m512 df448 = fft5028;
// eo30 de-interleaves a vector: after permutexvar, lanes 0-7 hold the
// even-indexed source lanes (0, 2, ..., 14) and lanes 8-15 the odd-indexed
// ones (1, 3, ..., 15). It is applied to df435-df440 and df443-df448 only;
// df433, df434, df441 and df442 are stored without this permute.
__m512i eo30 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Store pattern used for every vector below: two masked stores, one with mask
// 255 (0x00FF, lanes 0-7) and one with mask 65280 (0xFF00, lanes 8-15) whose
// constant offset is larger by 407008. Within a pair of vectors the second
// one is stored 64 further on. Pair base offsets are 0, 101760, 203520 and
// 305280 for df433-df440, and the same plus 814080 for df441-df448.
// NOTE(review): if dfPtr1 is a byte pointer, lane 8 of a masked store lands 32
// bytes past its base address, so the upper halves end up exactly 407040
// (= 407008 + 32) past the matching lower halves, the same stride as the i6
// coefficient -- confirm dfPtr1's declared type.
df435 = _mm512_permutexvar_ps(eo30, df435);
df436 = _mm512_permutexvar_ps(eo30, df436);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df435);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df436);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df435);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df436);
df443 = _mm512_permutexvar_ps(eo30, df443);
df444 = _mm512_permutexvar_ps(eo30, df444);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df443);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df444);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df443);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df444);
df437 = _mm512_permutexvar_ps(eo30, df437);
df438 = _mm512_permutexvar_ps(eo30, df438);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df437);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df438);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df437);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df438);
df445 = _mm512_permutexvar_ps(eo30, df445);
df446 = _mm512_permutexvar_ps(eo30, df446);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df445);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df446);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df445);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df446);
df439 = _mm512_permutexvar_ps(eo30, df439);
df440 = _mm512_permutexvar_ps(eo30, df440);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df439);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df440);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df439);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df440);
df447 = _mm512_permutexvar_ps(eo30, df447);
df448 = _mm512_permutexvar_ps(eo30, df448);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df447);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df448);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df447);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df448);
// First pair of each chain (df433/df434 and df441/df442): stored as is, using
// the same two-mask split.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df433);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df434);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df433);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df434);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df441);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df442);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df441);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df442);
}
// This j2 is finished. Return if it was the last one to process
// (j2 >= last1); otherwise advance j2 and set rel2 to 19, which satisfies the
// `rel2 < 20` test that follows.
if (j2 >= last1) return;
++j2;
rel2 = 19;
}
if (rel2 < 20) {
ptrdiff_t h14 = base2+50;
ptrdiff_t w14 = 0;
ptrdiff_t k15 = 3*s1;
ptrdiff_t kk14 = k15+2;
for (; k15 <= kk14; ++k15) {
ptrdiff_t b31 = 0;
ptrdiff_t m31 = (size_t)b31/2;
ptrdiff_t f32 = (size_t)b31%2;
__m512 dat434 = _mm512_maskz_loadu_ps(65528, datPtr1+0+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat435 = _mm512_maskz_loadu_ps(65528, datPtr1+896+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat436 = _mm512_maskz_loadu_ps(65528, datPtr1+1792+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat437 = _mm512_maskz_loadu_ps(65528, datPtr1+2688+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat438 = _mm512_maskz_loadu_ps(65528, datPtr1+3584+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat439 = _mm512_maskz_loadu_ps(65528, datPtr1+4480+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat440 = _mm512_maskz_loadu_ps(65528, datPtr1+5376+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat441 = _mm512_maskz_loadu_ps(65528, datPtr1+6272+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat442 = _mm512_maskz_loadu_ps(65528, datPtr1+7168+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat443 = _mm512_maskz_loadu_ps(65528, datPtr1+8064+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat444 = _mm512_maskz_loadu_ps(65528, datPtr1+8960+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat445 = _mm512_maskz_loadu_ps(65528, datPtr1+9856+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat446 = _mm512_maskz_loadu_ps(65528, datPtr1+10752+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat447 = _mm512_maskz_loadu_ps(65528, datPtr1+11648+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat448 = _mm512_maskz_loadu_ps(65528, datPtr1+12544+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 dat449 = _mm512_maskz_loadu_ps(65528, datPtr1+13440+602112*i6+200704*k15+896*h14+4*w14+0*b31);
__m512 fft5041 = _mm512_add_ps(dat434, dat442);
__m512 fft5129 = _mm512_add_ps(dat435, dat443);
__m512 fft5042 = _mm512_sub_ps(dat434, dat442);
__m512 fft5130 = _mm512_sub_ps(dat435, dat443);
__m512 fft5043 = _mm512_add_ps(dat436, dat444);
__m512 fft5131 = _mm512_add_ps(dat437, dat445);
__m512 fft5044 = _mm512_sub_ps(dat436, dat444);
__m512 fft5132 = _mm512_sub_ps(dat437, dat445);
__m512 fft5045 = _mm512_add_ps(dat438, dat446);
__m512 fft5133 = _mm512_add_ps(dat439, dat447);
__m512 fft5046 = _mm512_sub_ps(dat438, dat446);
__m512 fft5134 = _mm512_sub_ps(dat439, dat447);
__m512 fft5047 = _mm512_add_ps(dat440, dat448);
__m512 fft5135 = _mm512_add_ps(dat441, dat449);
__m512 fft5048 = _mm512_sub_ps(dat440, dat448);
__m512 fft5136 = _mm512_sub_ps(dat441, dat449);
__m512 fft5049 = _mm512_add_ps(fft5041, fft5045);
__m512 fft5137 = _mm512_add_ps(fft5129, fft5133);
__m512 fft5050 = _mm512_sub_ps(fft5041, fft5045);
__m512 fft5138 = _mm512_sub_ps(fft5129, fft5133);
__m512 fft5051 = _mm512_add_ps(fft5043, fft5047);
__m512 fft5139 = _mm512_add_ps(fft5131, fft5135);
__m512 fft5052 = _mm512_sub_ps(fft5047, fft5043);
__m512 fft5140 = _mm512_sub_ps(fft5135, fft5131);
__m512 fft5053 = _mm512_sub_ps(fft5044, fft5048);
__m512 fft5141 = _mm512_sub_ps(fft5132, fft5136);
__m512 fft5054 = _mm512_add_ps(fft5044, fft5048);
__m512 fft5142 = _mm512_add_ps(fft5132, fft5136);
__m512 fft5055 = _mm512_add_ps(fft5049, fft5051);
__m512 fft5143 = _mm512_add_ps(fft5137, fft5139);
__m512 fft5056 = _mm512_sub_ps(fft5049, fft5051);
__m512 fft5144 = _mm512_sub_ps(fft5137, fft5139);
__m512 fft5057 = _mm512_fmadd_ps(fft5053, _mm512_set1_ps(7.0710677e-01f), fft5042);
__m512 fft5145 = _mm512_fmadd_ps(fft5141, _mm512_set1_ps(7.0710677e-01f), fft5130);
__m512 fft5058 = _mm512_fnmsub_ps(fft5054, _mm512_set1_ps(7.0710677e-01f), fft5046);
__m512 fft5146 = _mm512_fnmsub_ps(fft5142, _mm512_set1_ps(7.0710677e-01f), fft5134);
__m512 fft5059 = _mm512_fnmadd_ps(fft5053, _mm512_set1_ps(7.0710677e-01f), fft5042);
__m512 fft5147 = _mm512_fnmadd_ps(fft5141, _mm512_set1_ps(7.0710677e-01f), fft5130);
__m512 fft5060 = _mm512_fnmadd_ps(fft5054, _mm512_set1_ps(7.0710677e-01f), fft5046);
__m512 fft5148 = _mm512_fnmadd_ps(fft5142, _mm512_set1_ps(7.0710677e-01f), fft5134);
__m512 fft5061 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5062 = _mm512_fmadd_ps(fft5055, fft5061, _mm512_shuffle_f32x4(fft5055, fft5055, 78));
__m512 fft5149 = _mm512_fmadd_ps(fft5143, fft5061, _mm512_shuffle_f32x4(fft5143, fft5143, 78));
__m512 fft5063 = _mm512_fmadd_ps(fft5056, fft5061, _mm512_shuffle_f32x4(fft5056, fft5056, 78));
__m512 fft5150 = _mm512_fmadd_ps(fft5144, fft5061, _mm512_shuffle_f32x4(fft5144, fft5144, 78));
__m512 fft5064 = _mm512_fmadd_ps(fft5057, fft5061, _mm512_shuffle_f32x4(fft5057, fft5057, 78));
__m512 fft5151 = _mm512_fmadd_ps(fft5145, fft5061, _mm512_shuffle_f32x4(fft5145, fft5145, 78));
__m512 fft5065 = _mm512_fmadd_ps(fft5058, fft5061, _mm512_shuffle_f32x4(fft5058, fft5058, 78));
__m512 fft5152 = _mm512_fmadd_ps(fft5146, fft5061, _mm512_shuffle_f32x4(fft5146, fft5146, 78));
__m512 fft5066 = _mm512_fmadd_ps(fft5050, fft5061, _mm512_shuffle_f32x4(fft5050, fft5050, 78));
__m512 fft5153 = _mm512_fmadd_ps(fft5138, fft5061, _mm512_shuffle_f32x4(fft5138, fft5138, 78));
__m512 fft5067 = _mm512_fmadd_ps(fft5052, fft5061, _mm512_shuffle_f32x4(fft5052, fft5052, 78));
__m512 fft5154 = _mm512_fmadd_ps(fft5140, fft5061, _mm512_shuffle_f32x4(fft5140, fft5140, 78));
__m512 fft5068 = _mm512_fmadd_ps(fft5059, fft5061, _mm512_shuffle_f32x4(fft5059, fft5059, 78));
__m512 fft5155 = _mm512_fmadd_ps(fft5147, fft5061, _mm512_shuffle_f32x4(fft5147, fft5147, 78));
__m512 fft5069 = _mm512_fmadd_ps(fft5060, fft5061, _mm512_shuffle_f32x4(fft5060, fft5060, 78));
__m512 fft5156 = _mm512_fmadd_ps(fft5148, fft5061, _mm512_shuffle_f32x4(fft5148, fft5148, 78));
__m512 fft5070 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5071 = _mm512_mul_ps(fft5062, fft5070);
__m512 fft5157 = _mm512_mul_ps(fft5149, fft5070);
__m512 fft5072 = _mm512_mul_ps(fft5063, fft5070);
__m512 fft5158 = _mm512_mul_ps(fft5150, fft5070);
__m512 fft5073 = _mm512_mul_ps(fft5064, fft5070);
__m512 fft5159 = _mm512_mul_ps(fft5151, fft5070);
__m512 fft5074 = _mm512_mul_ps(fft5065, fft5070);
__m512 fft5160 = _mm512_mul_ps(fft5152, fft5070);
__m512 fft5075 = _mm512_mul_ps(fft5066, fft5070);
__m512 fft5161 = _mm512_mul_ps(fft5153, fft5070);
__m512 fft5076 = _mm512_mul_ps(fft5067, fft5070);
__m512 fft5162 = _mm512_mul_ps(fft5154, fft5070);
__m512 fft5077 = _mm512_mul_ps(fft5068, fft5070);
__m512 fft5163 = _mm512_mul_ps(fft5155, fft5070);
__m512 fft5078 = _mm512_mul_ps(fft5069, fft5070);
__m512 fft5164 = _mm512_mul_ps(fft5156, fft5070);
__m512 fft5079 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft5080 = _mm512_fmadd_ps(fft5063, fft5079, fft5071);
__m512 fft5165 = _mm512_fmadd_ps(fft5150, fft5079, fft5157);
__m512 fft5081 = _mm512_fnmadd_ps(fft5062, fft5079, fft5072);
__m512 fft5166 = _mm512_fnmadd_ps(fft5149, fft5079, fft5158);
__m512 fft5082 = _mm512_fmadd_ps(fft5065, fft5079, fft5073);
__m512 fft5167 = _mm512_fmadd_ps(fft5152, fft5079, fft5159);
__m512 fft5083 = _mm512_fnmadd_ps(fft5064, fft5079, fft5074);
__m512 fft5168 = _mm512_fnmadd_ps(fft5151, fft5079, fft5160);
__m512 fft5084 = _mm512_fmadd_ps(fft5067, fft5079, fft5075);
__m512 fft5169 = _mm512_fmadd_ps(fft5154, fft5079, fft5161);
__m512 fft5085 = _mm512_fnmadd_ps(fft5066, fft5079, fft5076);
__m512 fft5170 = _mm512_fnmadd_ps(fft5153, fft5079, fft5162);
__m512 fft5086 = _mm512_fmadd_ps(fft5069, fft5079, fft5077);
__m512 fft5171 = _mm512_fmadd_ps(fft5156, fft5079, fft5163);
__m512 fft5087 = _mm512_fnmadd_ps(fft5068, fft5079, fft5078);
__m512 fft5172 = _mm512_fnmadd_ps(fft5155, fft5079, fft5164);
__m512 fft5088 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft5089 = _mm512_fmadd_ps(fft5080, fft5088, _mm512_shuffle_f32x4(fft5080, fft5080, 177));
__m512 fft5173 = _mm512_fmadd_ps(fft5165, fft5088, _mm512_shuffle_f32x4(fft5165, fft5165, 177));
__m512 fft5090 = _mm512_fmadd_ps(fft5081, fft5088, _mm512_shuffle_f32x4(fft5081, fft5081, 177));
__m512 fft5174 = _mm512_fmadd_ps(fft5166, fft5088, _mm512_shuffle_f32x4(fft5166, fft5166, 177));
__m512 fft5091 = _mm512_fmadd_ps(fft5082, fft5088, _mm512_shuffle_f32x4(fft5082, fft5082, 177));
__m512 fft5175 = _mm512_fmadd_ps(fft5167, fft5088, _mm512_shuffle_f32x4(fft5167, fft5167, 177));
__m512 fft5092 = _mm512_fmadd_ps(fft5083, fft5088, _mm512_shuffle_f32x4(fft5083, fft5083, 177));
__m512 fft5176 = _mm512_fmadd_ps(fft5168, fft5088, _mm512_shuffle_f32x4(fft5168, fft5168, 177));
__m512 fft5093 = _mm512_fmadd_ps(fft5084, fft5088, _mm512_shuffle_f32x4(fft5084, fft5084, 177));
__m512 fft5177 = _mm512_fmadd_ps(fft5169, fft5088, _mm512_shuffle_f32x4(fft5169, fft5169, 177));
__m512 fft5094 = _mm512_fmadd_ps(fft5085, fft5088, _mm512_shuffle_f32x4(fft5085, fft5085, 177));
__m512 fft5178 = _mm512_fmadd_ps(fft5170, fft5088, _mm512_shuffle_f32x4(fft5170, fft5170, 177));
__m512 fft5095 = _mm512_fmadd_ps(fft5086, fft5088, _mm512_shuffle_f32x4(fft5086, fft5086, 177));
__m512 fft5179 = _mm512_fmadd_ps(fft5171, fft5088, _mm512_shuffle_f32x4(fft5171, fft5171, 177));
__m512 fft5096 = _mm512_fmadd_ps(fft5087, fft5088, _mm512_shuffle_f32x4(fft5087, fft5087, 177));
__m512 fft5180 = _mm512_fmadd_ps(fft5172, fft5088, _mm512_shuffle_f32x4(fft5172, fft5172, 177));
__m512 fft5097 = _mm512_mask_mov_ps(fft5089, 49344, fft5090);
__m512 fft5181 = _mm512_mask_mov_ps(fft5173, 49344, fft5174);
__m512 fft5098 = _mm512_mask_sub_ps(fft5090, 49344, _mm512_setzero_ps(), fft5089);
__m512 fft5182 = _mm512_mask_sub_ps(fft5174, 49344, _mm512_setzero_ps(), fft5173);
__m512 fft5099 = _mm512_mask_mov_ps(fft5091, 49344, fft5092);
__m512 fft5183 = _mm512_mask_mov_ps(fft5175, 49344, fft5176);
__m512 fft5100 = _mm512_mask_sub_ps(fft5092, 49344, _mm512_setzero_ps(), fft5091);
__m512 fft5184 = _mm512_mask_sub_ps(fft5176, 49344, _mm512_setzero_ps(), fft5175);
__m512 fft5101 = _mm512_mask_mov_ps(fft5093, 49344, fft5094);
__m512 fft5185 = _mm512_mask_mov_ps(fft5177, 49344, fft5178);
__m512 fft5102 = _mm512_mask_sub_ps(fft5094, 49344, _mm512_setzero_ps(), fft5093);
__m512 fft5186 = _mm512_mask_sub_ps(fft5178, 49344, _mm512_setzero_ps(), fft5177);
__m512 fft5103 = _mm512_mask_mov_ps(fft5095, 49344, fft5096);
__m512 fft5187 = _mm512_mask_mov_ps(fft5179, 49344, fft5180);
__m512 fft5104 = _mm512_mask_sub_ps(fft5096, 49344, _mm512_setzero_ps(), fft5095);
__m512 fft5188 = _mm512_mask_sub_ps(fft5180, 49344, _mm512_setzero_ps(), fft5179);
__m512 fft5105 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft5106 = _mm512_fmadd_ps(fft5097, fft5105, _mm512_shuffle_ps(fft5097, fft5097, 78));
__m512 fft5189 = _mm512_fmadd_ps(fft5181, fft5105, _mm512_shuffle_ps(fft5181, fft5181, 78));
__m512 fft5107 = _mm512_fmadd_ps(fft5098, fft5105, _mm512_shuffle_ps(fft5098, fft5098, 78));
__m512 fft5190 = _mm512_fmadd_ps(fft5182, fft5105, _mm512_shuffle_ps(fft5182, fft5182, 78));
__m512 fft5108 = _mm512_fmadd_ps(fft5099, fft5105, _mm512_shuffle_ps(fft5099, fft5099, 78));
__m512 fft5191 = _mm512_fmadd_ps(fft5183, fft5105, _mm512_shuffle_ps(fft5183, fft5183, 78));
__m512 fft5109 = _mm512_fmadd_ps(fft5100, fft5105, _mm512_shuffle_ps(fft5100, fft5100, 78));
__m512 fft5192 = _mm512_fmadd_ps(fft5184, fft5105, _mm512_shuffle_ps(fft5184, fft5184, 78));
__m512 fft5110 = _mm512_fmadd_ps(fft5101, fft5105, _mm512_shuffle_ps(fft5101, fft5101, 78));
__m512 fft5193 = _mm512_fmadd_ps(fft5185, fft5105, _mm512_shuffle_ps(fft5185, fft5185, 78));
__m512 fft5111 = _mm512_fmadd_ps(fft5102, fft5105, _mm512_shuffle_ps(fft5102, fft5102, 78));
__m512 fft5194 = _mm512_fmadd_ps(fft5186, fft5105, _mm512_shuffle_ps(fft5186, fft5186, 78));
__m512 fft5112 = _mm512_fmadd_ps(fft5103, fft5105, _mm512_shuffle_ps(fft5103, fft5103, 78));
__m512 fft5195 = _mm512_fmadd_ps(fft5187, fft5105, _mm512_shuffle_ps(fft5187, fft5187, 78));
__m512 fft5113 = _mm512_fmadd_ps(fft5104, fft5105, _mm512_shuffle_ps(fft5104, fft5104, 78));
__m512 fft5196 = _mm512_fmadd_ps(fft5188, fft5105, _mm512_shuffle_ps(fft5188, fft5188, 78));
__m512i fft5114 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft5115 = _mm512_permutexvar_ps(fft5114, fft5106);
__m512 fft5197 = _mm512_permutexvar_ps(fft5114, fft5189);
__m512i fft5116 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft5117 = _mm512_permutexvar_ps(fft5116, fft5106);
__m512 fft5198 = _mm512_permutexvar_ps(fft5116, fft5189);
__m512 fft5118 = _mm512_permutexvar_ps(fft5114, fft5107);
__m512 fft5199 = _mm512_permutexvar_ps(fft5114, fft5190);
__m512 fft5119 = _mm512_permutexvar_ps(fft5116, fft5107);
__m512 fft5200 = _mm512_permutexvar_ps(fft5116, fft5190);
__m512 fft5120 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft5121 = _mm512_fmadd_ps(fft5115, fft5120, fft5117);
__m512 fft5201 = _mm512_fmadd_ps(fft5197, fft5120, fft5198);
__m512 fft5122 = _mm512_fnmadd_ps(fft5119, fft5120, fft5118);
__m512 fft5202 = _mm512_fnmadd_ps(fft5200, fft5120, fft5199);
__m512 fft5123 = _mm512_mask_mov_ps(fft5119, 21845, fft5121);
__m512 fft5203 = _mm512_mask_mov_ps(fft5200, 21845, fft5201);
__m512 fft5124 = _mm512_mask_mov_ps(fft5115, 43176, fft5121);
__m512 fft5204 = _mm512_mask_mov_ps(fft5197, 43176, fft5201);
__m512 fft5125 = _mm512_mask_mov_ps(fft5123, 43176, fft5122);
__m512 fft5205 = _mm512_mask_mov_ps(fft5203, 43176, fft5202);
__m512 fft5126 = _mm512_mask_mov_ps(fft5124, 22102, fft5122);
__m512 fft5206 = _mm512_mask_mov_ps(fft5204, 22102, fft5202);
__m512 fft5127 = _mm512_mask_mul_ps(fft5125, 64764, fft5125, _mm512_set1_ps(5e-01f));
__m512 fft5207 = _mm512_mask_mul_ps(fft5205, 64764, fft5205, _mm512_set1_ps(5e-01f));
__m512 fft5128 = _mm512_mask_mul_ps(fft5126, 64764, fft5126, _mm512_set1_ps(5e-01f));
__m512 fft5208 = _mm512_mask_mul_ps(fft5206, 64764, fft5206, _mm512_set1_ps(5e-01f));
__m512 df449 = fft5127;
__m512 df457 = fft5207;
__m512 df450 = fft5128;
__m512 df458 = fft5208;
__m512 df451 = fft5108;
__m512 df459 = fft5191;
__m512 df452 = fft5109;
__m512 df460 = fft5192;
__m512 df453 = fft5110;
__m512 df461 = fft5193;
__m512 df454 = fft5111;
__m512 df462 = fft5194;
__m512 df455 = fft5112;
__m512 df463 = fft5195;
__m512 df456 = fft5113;
__m512 df464 = fft5196;
__m512i eo31 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df451 = _mm512_permutexvar_ps(eo31, df451);
df452 = _mm512_permutexvar_ps(eo31, df452);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df451);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df452);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df451);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df452);
df459 = _mm512_permutexvar_ps(eo31, df459);
df460 = _mm512_permutexvar_ps(eo31, df460);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df459);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df460);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df459);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df460);
df453 = _mm512_permutexvar_ps(eo31, df453);
df454 = _mm512_permutexvar_ps(eo31, df454);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df453);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df454);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df453);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df454);
df461 = _mm512_permutexvar_ps(eo31, df461);
df462 = _mm512_permutexvar_ps(eo31, df462);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df461);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df462);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df461);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df462);
df455 = _mm512_permutexvar_ps(eo31, df455);
df456 = _mm512_permutexvar_ps(eo31, df456);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df455);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df456);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df455);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df456);
df463 = _mm512_permutexvar_ps(eo31, df463);
df464 = _mm512_permutexvar_ps(eo31, df464);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df463);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df464);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df463);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df464);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df449);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df450);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df449);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df450);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df457);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df458);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df457);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df458);
// Tiles b32 = 1..5 for the current (i6, j2, k15, h14, w14) position. Each iteration
// loads 16 vectors from datPtr1, runs them through the fft* arithmetic below, and
// scatters the 16 result vectors into dfPtr1 with half-width masked stores.
// NOTE(review): tile 0 appears to be handled by the code immediately before this
// loop (it uses m31/f32 in the same store pattern) — confirm against the generator.
for (ptrdiff_t b32 = 1; b32 < 6; ++b32) {
// Split the tile index into (m32, f33); both are used only in the dfPtr1 store
// offsets below (128*m32 + 32*f33).
ptrdiff_t m32 = (size_t)b32/2;
ptrdiff_t f33 = (size_t)b32%2;
// Sixteen loads with mask 65535 (all 16 lanes enabled). Consecutive loads are 896
// offset units apart and each tile advances the offset by 40*b32.
// NOTE(review): presumably datPtr1 is a byte pointer, so 896 is one row of 224 floats
// and 40 is a 10-float horizontal step — confirm against the declaration of datPtr1.
__m512 dat450 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat451 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat452 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat453 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat454 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat455 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat456 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat457 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat458 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat459 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat460 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat461 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat462 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat463 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat464 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k15+896*h14+4*w14+40*b32);
__m512 dat465 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k15+896*h14+4*w14+40*b32);
// Stage 1: sum/difference of each loaded vector with the one 8 loads later
// (dat450 with dat458, dat451 with dat459, ...). Results from even-numbered loads
// go to the fft5209.. chain and results from odd-numbered loads to the fft5297..
// chain; the two chains are processed identically and interleaved line by line.
__m512 fft5209 = _mm512_add_ps(dat450, dat458);
__m512 fft5297 = _mm512_add_ps(dat451, dat459);
__m512 fft5210 = _mm512_sub_ps(dat450, dat458);
__m512 fft5298 = _mm512_sub_ps(dat451, dat459);
__m512 fft5211 = _mm512_add_ps(dat452, dat460);
__m512 fft5299 = _mm512_add_ps(dat453, dat461);
__m512 fft5212 = _mm512_sub_ps(dat452, dat460);
__m512 fft5300 = _mm512_sub_ps(dat453, dat461);
__m512 fft5213 = _mm512_add_ps(dat454, dat462);
__m512 fft5301 = _mm512_add_ps(dat455, dat463);
__m512 fft5214 = _mm512_sub_ps(dat454, dat462);
__m512 fft5302 = _mm512_sub_ps(dat455, dat463);
__m512 fft5215 = _mm512_add_ps(dat456, dat464);
__m512 fft5303 = _mm512_add_ps(dat457, dat465);
__m512 fft5216 = _mm512_sub_ps(dat456, dat464);
__m512 fft5304 = _mm512_sub_ps(dat457, dat465);
// Stage 2: further sums/differences of the stage-1 values. Note the operand order
// in fft5220/fft5308 (second minus first), which differs from the other subtractions.
__m512 fft5217 = _mm512_add_ps(fft5209, fft5213);
__m512 fft5305 = _mm512_add_ps(fft5297, fft5301);
__m512 fft5218 = _mm512_sub_ps(fft5209, fft5213);
__m512 fft5306 = _mm512_sub_ps(fft5297, fft5301);
__m512 fft5219 = _mm512_add_ps(fft5211, fft5215);
__m512 fft5307 = _mm512_add_ps(fft5299, fft5303);
__m512 fft5220 = _mm512_sub_ps(fft5215, fft5211);
__m512 fft5308 = _mm512_sub_ps(fft5303, fft5299);
__m512 fft5221 = _mm512_sub_ps(fft5212, fft5216);
__m512 fft5309 = _mm512_sub_ps(fft5300, fft5304);
__m512 fft5222 = _mm512_add_ps(fft5212, fft5216);
__m512 fft5310 = _mm512_add_ps(fft5300, fft5304);
__m512 fft5223 = _mm512_add_ps(fft5217, fft5219);
__m512 fft5311 = _mm512_add_ps(fft5305, fft5307);
__m512 fft5224 = _mm512_sub_ps(fft5217, fft5219);
__m512 fft5312 = _mm512_sub_ps(fft5305, fft5307);
// Combine with the constant 7.0710677e-01f (approximately 1/sqrt(2)):
//   fmadd  ->  x*c + y      fnmsub -> -(x*c) - y      fnmadd -> -(x*c) + y
__m512 fft5225 = _mm512_fmadd_ps(fft5221, _mm512_set1_ps(7.0710677e-01f), fft5210);
__m512 fft5313 = _mm512_fmadd_ps(fft5309, _mm512_set1_ps(7.0710677e-01f), fft5298);
__m512 fft5226 = _mm512_fnmsub_ps(fft5222, _mm512_set1_ps(7.0710677e-01f), fft5214);
__m512 fft5314 = _mm512_fnmsub_ps(fft5310, _mm512_set1_ps(7.0710677e-01f), fft5302);
__m512 fft5227 = _mm512_fnmadd_ps(fft5221, _mm512_set1_ps(7.0710677e-01f), fft5210);
__m512 fft5315 = _mm512_fnmadd_ps(fft5309, _mm512_set1_ps(7.0710677e-01f), fft5298);
__m512 fft5228 = _mm512_fnmadd_ps(fft5222, _mm512_set1_ps(7.0710677e-01f), fft5214);
__m512 fft5316 = _mm512_fnmadd_ps(fft5310, _mm512_set1_ps(7.0710677e-01f), fft5302);
// In-register sum/difference across 256-bit halves. fft5229 is +1 in lanes 0..7 and
// -1 in lanes 8..15 (_mm512_set_ps lists lane 15 first); shuffle_f32x4 with
// immediate 78 swaps the two 256-bit halves. Net effect per vector: lanes 0..7 hold
// low+high, lanes 8..15 hold low-high.
__m512 fft5229 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5230 = _mm512_fmadd_ps(fft5223, fft5229, _mm512_shuffle_f32x4(fft5223, fft5223, 78));
__m512 fft5317 = _mm512_fmadd_ps(fft5311, fft5229, _mm512_shuffle_f32x4(fft5311, fft5311, 78));
__m512 fft5231 = _mm512_fmadd_ps(fft5224, fft5229, _mm512_shuffle_f32x4(fft5224, fft5224, 78));
__m512 fft5318 = _mm512_fmadd_ps(fft5312, fft5229, _mm512_shuffle_f32x4(fft5312, fft5312, 78));
__m512 fft5232 = _mm512_fmadd_ps(fft5225, fft5229, _mm512_shuffle_f32x4(fft5225, fft5225, 78));
__m512 fft5319 = _mm512_fmadd_ps(fft5313, fft5229, _mm512_shuffle_f32x4(fft5313, fft5313, 78));
__m512 fft5233 = _mm512_fmadd_ps(fft5226, fft5229, _mm512_shuffle_f32x4(fft5226, fft5226, 78));
__m512 fft5320 = _mm512_fmadd_ps(fft5314, fft5229, _mm512_shuffle_f32x4(fft5314, fft5314, 78));
__m512 fft5234 = _mm512_fmadd_ps(fft5218, fft5229, _mm512_shuffle_f32x4(fft5218, fft5218, 78));
__m512 fft5321 = _mm512_fmadd_ps(fft5306, fft5229, _mm512_shuffle_f32x4(fft5306, fft5306, 78));
__m512 fft5235 = _mm512_fmadd_ps(fft5220, fft5229, _mm512_shuffle_f32x4(fft5220, fft5220, 78));
__m512 fft5322 = _mm512_fmadd_ps(fft5308, fft5229, _mm512_shuffle_f32x4(fft5308, fft5308, 78));
__m512 fft5236 = _mm512_fmadd_ps(fft5227, fft5229, _mm512_shuffle_f32x4(fft5227, fft5227, 78));
__m512 fft5323 = _mm512_fmadd_ps(fft5315, fft5229, _mm512_shuffle_f32x4(fft5315, fft5315, 78));
__m512 fft5237 = _mm512_fmadd_ps(fft5228, fft5229, _mm512_shuffle_f32x4(fft5228, fft5228, 78));
__m512 fft5324 = _mm512_fmadd_ps(fft5316, fft5229, _mm512_shuffle_f32x4(fft5316, fft5316, 78));
// Per-lane coefficient pair: c = fft5238 (below) and s = fft5247 (further down).
// Consecutive vectors (a, b) = (fft5230, fft5231), (fft5232, fft5233), ... become
// (a*c + b*s, b*c - a*s). Lanes 0..9 have c = 1 and s = 0, so they pass through
// unchanged; only lanes 10..15 are modified.
// NOTE(review): this has the form of a complex multiply by (c - i*s), i.e. a
// twiddle-factor rotation with (a, b) as real/imaginary parts — confirm.
__m512 fft5238 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5239 = _mm512_mul_ps(fft5230, fft5238);
__m512 fft5325 = _mm512_mul_ps(fft5317, fft5238);
__m512 fft5240 = _mm512_mul_ps(fft5231, fft5238);
__m512 fft5326 = _mm512_mul_ps(fft5318, fft5238);
__m512 fft5241 = _mm512_mul_ps(fft5232, fft5238);
__m512 fft5327 = _mm512_mul_ps(fft5319, fft5238);
__m512 fft5242 = _mm512_mul_ps(fft5233, fft5238);
__m512 fft5328 = _mm512_mul_ps(fft5320, fft5238);
__m512 fft5243 = _mm512_mul_ps(fft5234, fft5238);
__m512 fft5329 = _mm512_mul_ps(fft5321, fft5238);
__m512 fft5244 = _mm512_mul_ps(fft5235, fft5238);
__m512 fft5330 = _mm512_mul_ps(fft5322, fft5238);
__m512 fft5245 = _mm512_mul_ps(fft5236, fft5238);
__m512 fft5331 = _mm512_mul_ps(fft5323, fft5238);
__m512 fft5246 = _mm512_mul_ps(fft5237, fft5238);
__m512 fft5332 = _mm512_mul_ps(fft5324, fft5238);
__m512 fft5247 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft5248 = _mm512_fmadd_ps(fft5231, fft5247, fft5239);
__m512 fft5333 = _mm512_fmadd_ps(fft5318, fft5247, fft5325);
__m512 fft5249 = _mm512_fnmadd_ps(fft5230, fft5247, fft5240);
__m512 fft5334 = _mm512_fnmadd_ps(fft5317, fft5247, fft5326);
__m512 fft5250 = _mm512_fmadd_ps(fft5233, fft5247, fft5241);
__m512 fft5335 = _mm512_fmadd_ps(fft5320, fft5247, fft5327);
__m512 fft5251 = _mm512_fnmadd_ps(fft5232, fft5247, fft5242);
__m512 fft5336 = _mm512_fnmadd_ps(fft5319, fft5247, fft5328);
__m512 fft5252 = _mm512_fmadd_ps(fft5235, fft5247, fft5243);
__m512 fft5337 = _mm512_fmadd_ps(fft5322, fft5247, fft5329);
__m512 fft5253 = _mm512_fnmadd_ps(fft5234, fft5247, fft5244);
__m512 fft5338 = _mm512_fnmadd_ps(fft5321, fft5247, fft5330);
__m512 fft5254 = _mm512_fmadd_ps(fft5237, fft5247, fft5245);
__m512 fft5339 = _mm512_fmadd_ps(fft5324, fft5247, fft5331);
__m512 fft5255 = _mm512_fnmadd_ps(fft5236, fft5247, fft5246);
__m512 fft5340 = _mm512_fnmadd_ps(fft5323, fft5247, fft5332);
// Sum/difference across adjacent 128-bit lanes: shuffle_f32x4 with immediate 177
// swaps 128-bit lanes 0<->1 and 2<->3; fft5256 is +1 in lanes 0..3 and 8..11,
// -1 in lanes 4..7 and 12..15.
__m512 fft5256 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft5257 = _mm512_fmadd_ps(fft5248, fft5256, _mm512_shuffle_f32x4(fft5248, fft5248, 177));
__m512 fft5341 = _mm512_fmadd_ps(fft5333, fft5256, _mm512_shuffle_f32x4(fft5333, fft5333, 177));
__m512 fft5258 = _mm512_fmadd_ps(fft5249, fft5256, _mm512_shuffle_f32x4(fft5249, fft5249, 177));
__m512 fft5342 = _mm512_fmadd_ps(fft5334, fft5256, _mm512_shuffle_f32x4(fft5334, fft5334, 177));
__m512 fft5259 = _mm512_fmadd_ps(fft5250, fft5256, _mm512_shuffle_f32x4(fft5250, fft5250, 177));
__m512 fft5343 = _mm512_fmadd_ps(fft5335, fft5256, _mm512_shuffle_f32x4(fft5335, fft5335, 177));
__m512 fft5260 = _mm512_fmadd_ps(fft5251, fft5256, _mm512_shuffle_f32x4(fft5251, fft5251, 177));
__m512 fft5344 = _mm512_fmadd_ps(fft5336, fft5256, _mm512_shuffle_f32x4(fft5336, fft5336, 177));
__m512 fft5261 = _mm512_fmadd_ps(fft5252, fft5256, _mm512_shuffle_f32x4(fft5252, fft5252, 177));
__m512 fft5345 = _mm512_fmadd_ps(fft5337, fft5256, _mm512_shuffle_f32x4(fft5337, fft5337, 177));
__m512 fft5262 = _mm512_fmadd_ps(fft5253, fft5256, _mm512_shuffle_f32x4(fft5253, fft5253, 177));
__m512 fft5346 = _mm512_fmadd_ps(fft5338, fft5256, _mm512_shuffle_f32x4(fft5338, fft5338, 177));
__m512 fft5263 = _mm512_fmadd_ps(fft5254, fft5256, _mm512_shuffle_f32x4(fft5254, fft5254, 177));
__m512 fft5347 = _mm512_fmadd_ps(fft5339, fft5256, _mm512_shuffle_f32x4(fft5339, fft5339, 177));
__m512 fft5264 = _mm512_fmadd_ps(fft5255, fft5256, _mm512_shuffle_f32x4(fft5255, fft5255, 177));
__m512 fft5348 = _mm512_fmadd_ps(fft5340, fft5256, _mm512_shuffle_f32x4(fft5340, fft5340, 177));
// Mask 49344 = 0xC0C0 selects lanes 6, 7, 14 and 15. In those lanes only, each
// vector pair (p, q) is replaced by (q, -p): mask_mov copies q into p's lanes and
// mask_sub writes 0 - p into q's lanes. All other lanes are left as they were.
__m512 fft5265 = _mm512_mask_mov_ps(fft5257, 49344, fft5258);
__m512 fft5349 = _mm512_mask_mov_ps(fft5341, 49344, fft5342);
__m512 fft5266 = _mm512_mask_sub_ps(fft5258, 49344, _mm512_setzero_ps(), fft5257);
__m512 fft5350 = _mm512_mask_sub_ps(fft5342, 49344, _mm512_setzero_ps(), fft5341);
__m512 fft5267 = _mm512_mask_mov_ps(fft5259, 49344, fft5260);
__m512 fft5351 = _mm512_mask_mov_ps(fft5343, 49344, fft5344);
__m512 fft5268 = _mm512_mask_sub_ps(fft5260, 49344, _mm512_setzero_ps(), fft5259);
__m512 fft5352 = _mm512_mask_sub_ps(fft5344, 49344, _mm512_setzero_ps(), fft5343);
__m512 fft5269 = _mm512_mask_mov_ps(fft5261, 49344, fft5262);
__m512 fft5353 = _mm512_mask_mov_ps(fft5345, 49344, fft5346);
__m512 fft5270 = _mm512_mask_sub_ps(fft5262, 49344, _mm512_setzero_ps(), fft5261);
__m512 fft5354 = _mm512_mask_sub_ps(fft5346, 49344, _mm512_setzero_ps(), fft5345);
__m512 fft5271 = _mm512_mask_mov_ps(fft5263, 49344, fft5264);
__m512 fft5355 = _mm512_mask_mov_ps(fft5347, 49344, fft5348);
__m512 fft5272 = _mm512_mask_sub_ps(fft5264, 49344, _mm512_setzero_ps(), fft5263);
__m512 fft5356 = _mm512_mask_sub_ps(fft5348, 49344, _mm512_setzero_ps(), fft5347);
// Sum/difference across 64-bit pairs inside each 128-bit lane: shuffle_ps with
// immediate 78 swaps elements {0,1} with {2,3} in every 128-bit lane; fft5273 is
// +1 on elements 0,1 and -1 on elements 2,3 of each lane.
__m512 fft5273 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft5274 = _mm512_fmadd_ps(fft5265, fft5273, _mm512_shuffle_ps(fft5265, fft5265, 78));
__m512 fft5357 = _mm512_fmadd_ps(fft5349, fft5273, _mm512_shuffle_ps(fft5349, fft5349, 78));
__m512 fft5275 = _mm512_fmadd_ps(fft5266, fft5273, _mm512_shuffle_ps(fft5266, fft5266, 78));
__m512 fft5358 = _mm512_fmadd_ps(fft5350, fft5273, _mm512_shuffle_ps(fft5350, fft5350, 78));
__m512 fft5276 = _mm512_fmadd_ps(fft5267, fft5273, _mm512_shuffle_ps(fft5267, fft5267, 78));
__m512 fft5359 = _mm512_fmadd_ps(fft5351, fft5273, _mm512_shuffle_ps(fft5351, fft5351, 78));
__m512 fft5277 = _mm512_fmadd_ps(fft5268, fft5273, _mm512_shuffle_ps(fft5268, fft5268, 78));
__m512 fft5360 = _mm512_fmadd_ps(fft5352, fft5273, _mm512_shuffle_ps(fft5352, fft5352, 78));
__m512 fft5278 = _mm512_fmadd_ps(fft5269, fft5273, _mm512_shuffle_ps(fft5269, fft5269, 78));
__m512 fft5361 = _mm512_fmadd_ps(fft5353, fft5273, _mm512_shuffle_ps(fft5353, fft5353, 78));
__m512 fft5279 = _mm512_fmadd_ps(fft5270, fft5273, _mm512_shuffle_ps(fft5270, fft5270, 78));
__m512 fft5362 = _mm512_fmadd_ps(fft5354, fft5273, _mm512_shuffle_ps(fft5354, fft5354, 78));
__m512 fft5280 = _mm512_fmadd_ps(fft5271, fft5273, _mm512_shuffle_ps(fft5271, fft5271, 78));
__m512 fft5363 = _mm512_fmadd_ps(fft5355, fft5273, _mm512_shuffle_ps(fft5355, fft5355, 78));
__m512 fft5281 = _mm512_fmadd_ps(fft5272, fft5273, _mm512_shuffle_ps(fft5272, fft5272, 78));
__m512 fft5364 = _mm512_fmadd_ps(fft5356, fft5273, _mm512_shuffle_ps(fft5356, fft5356, 78));
// Extra recombination applied only to the first result pair of each chain
// (fft5274/fft5275 and fft5357/fft5358): two fixed lane permutations of each vector
// are combined using the 0/+1/-1 pattern in fft5288, then merged lane-by-lane with
// masks 21845 (0x5555), 43176 (0xA8A8) and 22102 (0x5656).
// NOTE(review): presumably this separates/untangles the outputs that share the
// first two vectors — the intent is not derivable from this loop alone; confirm.
__m512i fft5282 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft5283 = _mm512_permutexvar_ps(fft5282, fft5274);
__m512 fft5365 = _mm512_permutexvar_ps(fft5282, fft5357);
__m512i fft5284 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft5285 = _mm512_permutexvar_ps(fft5284, fft5274);
__m512 fft5366 = _mm512_permutexvar_ps(fft5284, fft5357);
__m512 fft5286 = _mm512_permutexvar_ps(fft5282, fft5275);
__m512 fft5367 = _mm512_permutexvar_ps(fft5282, fft5358);
__m512 fft5287 = _mm512_permutexvar_ps(fft5284, fft5275);
__m512 fft5368 = _mm512_permutexvar_ps(fft5284, fft5358);
__m512 fft5288 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft5289 = _mm512_fmadd_ps(fft5283, fft5288, fft5285);
__m512 fft5369 = _mm512_fmadd_ps(fft5365, fft5288, fft5366);
__m512 fft5290 = _mm512_fnmadd_ps(fft5287, fft5288, fft5286);
__m512 fft5370 = _mm512_fnmadd_ps(fft5368, fft5288, fft5367);
__m512 fft5291 = _mm512_mask_mov_ps(fft5287, 21845, fft5289);
__m512 fft5371 = _mm512_mask_mov_ps(fft5368, 21845, fft5369);
__m512 fft5292 = _mm512_mask_mov_ps(fft5283, 43176, fft5289);
__m512 fft5372 = _mm512_mask_mov_ps(fft5365, 43176, fft5369);
__m512 fft5293 = _mm512_mask_mov_ps(fft5291, 43176, fft5290);
__m512 fft5373 = _mm512_mask_mov_ps(fft5371, 43176, fft5370);
__m512 fft5294 = _mm512_mask_mov_ps(fft5292, 22102, fft5290);
__m512 fft5374 = _mm512_mask_mov_ps(fft5372, 22102, fft5370);
// Halve every lane except 0, 1, 8 and 9 (mask 64764 = 0xFCFC); the unmasked lanes
// keep their value because the source operand is the same vector.
__m512 fft5295 = _mm512_mask_mul_ps(fft5293, 64764, fft5293, _mm512_set1_ps(5e-01f));
__m512 fft5375 = _mm512_mask_mul_ps(fft5373, 64764, fft5373, _mm512_set1_ps(5e-01f));
__m512 fft5296 = _mm512_mask_mul_ps(fft5294, 64764, fft5294, _mm512_set1_ps(5e-01f));
__m512 fft5376 = _mm512_mask_mul_ps(fft5374, 64764, fft5374, _mm512_set1_ps(5e-01f));
// Rename the 16 results for the store phase: df465..df472 come from the first
// chain, df473..df480 from the second. df465/df466 and df473/df474 are the specially
// recombined vectors; the rest come straight from the fft5276..fft5281 and
// fft5359..fft5364 results.
__m512 df465 = fft5295;
__m512 df473 = fft5375;
__m512 df466 = fft5296;
__m512 df474 = fft5376;
__m512 df467 = fft5276;
__m512 df475 = fft5359;
__m512 df468 = fft5277;
__m512 df476 = fft5360;
__m512 df469 = fft5278;
__m512 df477 = fft5361;
__m512 df470 = fft5279;
__m512 df478 = fft5362;
__m512 df471 = fft5280;
__m512 df479 = fft5363;
__m512 df472 = fft5281;
__m512 df480 = fft5364;
// eo32 moves even-indexed elements into lanes 0..7 and odd-indexed elements into
// lanes 8..15. It is applied to every vector except df465, df466, df473 and df474,
// which are stored without this permutation.
// Each vector is then written with two masked stores: mask 255 writes lanes 0..7
// and mask 65280 writes lanes 8..15 at a different base offset.
// NOTE(review): if dfPtr1 is a byte pointer, lane 8 of a 65280-masked store lands 32
// bytes past the given address, i.e. exactly 407040 beyond the matching 255-masked
// store (e.g. 508768+32 == 101760+407040) — confirm dfPtr1's type and the layout.
__m512i eo32 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df467 = _mm512_permutexvar_ps(eo32, df467);
df468 = _mm512_permutexvar_ps(eo32, df468);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df467);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df468);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df467);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df468);
df475 = _mm512_permutexvar_ps(eo32, df475);
df476 = _mm512_permutexvar_ps(eo32, df476);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df475);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df476);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df475);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df476);
df469 = _mm512_permutexvar_ps(eo32, df469);
df470 = _mm512_permutexvar_ps(eo32, df470);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df469);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df470);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df469);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df470);
df477 = _mm512_permutexvar_ps(eo32, df477);
df478 = _mm512_permutexvar_ps(eo32, df478);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df477);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df478);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df477);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df478);
df471 = _mm512_permutexvar_ps(eo32, df471);
df472 = _mm512_permutexvar_ps(eo32, df472);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df471);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df472);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df471);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df472);
df479 = _mm512_permutexvar_ps(eo32, df479);
df480 = _mm512_permutexvar_ps(eo32, df480);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df479);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df480);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df479);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df480);
// The four specially recombined vectors are stored last, without the eo32
// permutation, at base offsets 0/64 (first chain) and 814080/814144 (second chain).
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df465);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df466);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df465);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df466);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df473);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df474);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df473);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df474);
}
}
if (j2 >= last1) return;
++j2;
rel2 = 20;
}
// Branch taken while rel2 < 22: processes the j2 positions from the current
// j2 through jj7 = 21-rel2+j2 (i.e. 22-rel2 positions), then sets rel2 to 22.
// For every j2 it walks three k16 values (3*s1 .. 3*s1+2) and six b33 steps,
// loading 16 vectors from datPtr1, transforming them, and storing the result
// into dfPtr1. Returns from the enclosing function as soon as j2 >= last1.
// NOTE(review): variable names (fft*) and the 0.70710677 constant suggest this
// is a forward FFT of 16x16 float tiles for the first 7x7 convolution --
// confirm against the generator.
if (rel2 < 22) {
// NOTE(review): 896 = 4*224 and 200704 = 4*224*224 match a float32 224x224
// input (see the graph header), so h15/w15 are presumably a row index and a
// column index with offsets expressed in bytes -- confirm datPtr1 is a byte
// pointer.
ptrdiff_t h15 = base2+50;
ptrdiff_t w15 = -1140+60*rel2;
ptrdiff_t jj7 = 21-rel2+j2;
// j2 is advanced at the bottom of the body; w15 moves by 60 per j2 step, so
// 4*w15 grows by 240 = 6*40, continuing the 40*b33 sequence of the inner loop.
for (; j2 <= jj7; w15 += 60) {
ptrdiff_t k16 = 3*s1;
ptrdiff_t kk15 = k16+2;
for (; k16 <= kk15; ++k16) {
for (ptrdiff_t b33 = 0; b33 < 6; ++b33) {
// Split b33 (0..5) into a pair index m33 (0..2) and a parity f34 (0..1);
// both are only used in the dfPtr1 store offsets (128*m33 + 32*f34).
ptrdiff_t m33 = (size_t)b33/2;
ptrdiff_t f34 = (size_t)b33%2;
// Load 16 vectors of 16 floats each (mask 65535 = all lanes enabled) at
// offsets 896 apart.
__m512 dat466 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat467 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat468 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat469 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat470 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat471 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat472 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat473 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat474 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat475 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat476 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat477 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat478 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat479 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat480 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k16+896*h15+4*w15+40*b33);
__m512 dat481 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k16+896*h15+4*w15+40*b33);
// Add/sub butterflies across the 16 loaded vectors. Two interleaved chains
// run in lockstep: one fed by the even-position loads (dat466, dat468, ...)
// and one fed by the odd-position loads (dat467, dat469, ...). Each butterfly
// pairs a vector with the one eight positions later.
__m512 fft5377 = _mm512_add_ps(dat466, dat474);
__m512 fft5465 = _mm512_add_ps(dat467, dat475);
__m512 fft5378 = _mm512_sub_ps(dat466, dat474);
__m512 fft5466 = _mm512_sub_ps(dat467, dat475);
__m512 fft5379 = _mm512_add_ps(dat468, dat476);
__m512 fft5467 = _mm512_add_ps(dat469, dat477);
__m512 fft5380 = _mm512_sub_ps(dat468, dat476);
__m512 fft5468 = _mm512_sub_ps(dat469, dat477);
__m512 fft5381 = _mm512_add_ps(dat470, dat478);
__m512 fft5469 = _mm512_add_ps(dat471, dat479);
__m512 fft5382 = _mm512_sub_ps(dat470, dat478);
__m512 fft5470 = _mm512_sub_ps(dat471, dat479);
__m512 fft5383 = _mm512_add_ps(dat472, dat480);
__m512 fft5471 = _mm512_add_ps(dat473, dat481);
__m512 fft5384 = _mm512_sub_ps(dat472, dat480);
__m512 fft5472 = _mm512_sub_ps(dat473, dat481);
// Second butterfly level on the sums/differences above. Note the reversed
// operand order in fft5388/fft5476 (later term minus earlier term).
__m512 fft5385 = _mm512_add_ps(fft5377, fft5381);
__m512 fft5473 = _mm512_add_ps(fft5465, fft5469);
__m512 fft5386 = _mm512_sub_ps(fft5377, fft5381);
__m512 fft5474 = _mm512_sub_ps(fft5465, fft5469);
__m512 fft5387 = _mm512_add_ps(fft5379, fft5383);
__m512 fft5475 = _mm512_add_ps(fft5467, fft5471);
__m512 fft5388 = _mm512_sub_ps(fft5383, fft5379);
__m512 fft5476 = _mm512_sub_ps(fft5471, fft5467);
__m512 fft5389 = _mm512_sub_ps(fft5380, fft5384);
__m512 fft5477 = _mm512_sub_ps(fft5468, fft5472);
__m512 fft5390 = _mm512_add_ps(fft5380, fft5384);
__m512 fft5478 = _mm512_add_ps(fft5468, fft5472);
__m512 fft5391 = _mm512_add_ps(fft5385, fft5387);
__m512 fft5479 = _mm512_add_ps(fft5473, fft5475);
__m512 fft5392 = _mm512_sub_ps(fft5385, fft5387);
__m512 fft5480 = _mm512_sub_ps(fft5473, fft5475);
// Combine with the constant 7.0710677e-01f (approximately 1/sqrt(2)):
// fmadd = a*b+c, fnmadd = -(a*b)+c, fnmsub = -(a*b)-c.
__m512 fft5393 = _mm512_fmadd_ps(fft5389, _mm512_set1_ps(7.0710677e-01f), fft5378);
__m512 fft5481 = _mm512_fmadd_ps(fft5477, _mm512_set1_ps(7.0710677e-01f), fft5466);
__m512 fft5394 = _mm512_fnmsub_ps(fft5390, _mm512_set1_ps(7.0710677e-01f), fft5382);
__m512 fft5482 = _mm512_fnmsub_ps(fft5478, _mm512_set1_ps(7.0710677e-01f), fft5470);
__m512 fft5395 = _mm512_fnmadd_ps(fft5389, _mm512_set1_ps(7.0710677e-01f), fft5378);
__m512 fft5483 = _mm512_fnmadd_ps(fft5477, _mm512_set1_ps(7.0710677e-01f), fft5466);
__m512 fft5396 = _mm512_fnmadd_ps(fft5390, _mm512_set1_ps(7.0710677e-01f), fft5382);
__m512 fft5484 = _mm512_fnmadd_ps(fft5478, _mm512_set1_ps(7.0710677e-01f), fft5470);
// In-register butterfly between the two 256-bit halves of each vector:
// shuffle_f32x4 with imm 78 swaps the halves, and the sign vector (+1 in
// lanes 0-7, -1 in lanes 8-15) yields lo+hi in the low half and lo-hi in the
// high half.
__m512 fft5397 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5398 = _mm512_fmadd_ps(fft5391, fft5397, _mm512_shuffle_f32x4(fft5391, fft5391, 78));
__m512 fft5485 = _mm512_fmadd_ps(fft5479, fft5397, _mm512_shuffle_f32x4(fft5479, fft5479, 78));
__m512 fft5399 = _mm512_fmadd_ps(fft5392, fft5397, _mm512_shuffle_f32x4(fft5392, fft5392, 78));
__m512 fft5486 = _mm512_fmadd_ps(fft5480, fft5397, _mm512_shuffle_f32x4(fft5480, fft5480, 78));
__m512 fft5400 = _mm512_fmadd_ps(fft5393, fft5397, _mm512_shuffle_f32x4(fft5393, fft5393, 78));
__m512 fft5487 = _mm512_fmadd_ps(fft5481, fft5397, _mm512_shuffle_f32x4(fft5481, fft5481, 78));
__m512 fft5401 = _mm512_fmadd_ps(fft5394, fft5397, _mm512_shuffle_f32x4(fft5394, fft5394, 78));
__m512 fft5488 = _mm512_fmadd_ps(fft5482, fft5397, _mm512_shuffle_f32x4(fft5482, fft5482, 78));
__m512 fft5402 = _mm512_fmadd_ps(fft5386, fft5397, _mm512_shuffle_f32x4(fft5386, fft5386, 78));
__m512 fft5489 = _mm512_fmadd_ps(fft5474, fft5397, _mm512_shuffle_f32x4(fft5474, fft5474, 78));
__m512 fft5403 = _mm512_fmadd_ps(fft5388, fft5397, _mm512_shuffle_f32x4(fft5388, fft5388, 78));
__m512 fft5490 = _mm512_fmadd_ps(fft5476, fft5397, _mm512_shuffle_f32x4(fft5476, fft5476, 78));
__m512 fft5404 = _mm512_fmadd_ps(fft5395, fft5397, _mm512_shuffle_f32x4(fft5395, fft5395, 78));
__m512 fft5491 = _mm512_fmadd_ps(fft5483, fft5397, _mm512_shuffle_f32x4(fft5483, fft5483, 78));
__m512 fft5405 = _mm512_fmadd_ps(fft5396, fft5397, _mm512_shuffle_f32x4(fft5396, fft5396, 78));
__m512 fft5492 = _mm512_fmadd_ps(fft5484, fft5397, _mm512_shuffle_f32x4(fft5484, fft5484, 78));
// Per-lane scaling by a constant table, then a cross term from the partner
// vector of each pair using a second table. Lanes 0-9 are multiplied by 1 and
// receive no cross term (second table is 0 there); only lanes 10-15 are
// actually rotated. NOTE(review): presumably twiddle-factor multiplication.
__m512 fft5406 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5407 = _mm512_mul_ps(fft5398, fft5406);
__m512 fft5493 = _mm512_mul_ps(fft5485, fft5406);
__m512 fft5408 = _mm512_mul_ps(fft5399, fft5406);
__m512 fft5494 = _mm512_mul_ps(fft5486, fft5406);
__m512 fft5409 = _mm512_mul_ps(fft5400, fft5406);
__m512 fft5495 = _mm512_mul_ps(fft5487, fft5406);
__m512 fft5410 = _mm512_mul_ps(fft5401, fft5406);
__m512 fft5496 = _mm512_mul_ps(fft5488, fft5406);
__m512 fft5411 = _mm512_mul_ps(fft5402, fft5406);
__m512 fft5497 = _mm512_mul_ps(fft5489, fft5406);
__m512 fft5412 = _mm512_mul_ps(fft5403, fft5406);
__m512 fft5498 = _mm512_mul_ps(fft5490, fft5406);
__m512 fft5413 = _mm512_mul_ps(fft5404, fft5406);
__m512 fft5499 = _mm512_mul_ps(fft5491, fft5406);
__m512 fft5414 = _mm512_mul_ps(fft5405, fft5406);
__m512 fft5500 = _mm512_mul_ps(fft5492, fft5406);
__m512 fft5415 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft5416 = _mm512_fmadd_ps(fft5399, fft5415, fft5407);
__m512 fft5501 = _mm512_fmadd_ps(fft5486, fft5415, fft5493);
__m512 fft5417 = _mm512_fnmadd_ps(fft5398, fft5415, fft5408);
__m512 fft5502 = _mm512_fnmadd_ps(fft5485, fft5415, fft5494);
__m512 fft5418 = _mm512_fmadd_ps(fft5401, fft5415, fft5409);
__m512 fft5503 = _mm512_fmadd_ps(fft5488, fft5415, fft5495);
__m512 fft5419 = _mm512_fnmadd_ps(fft5400, fft5415, fft5410);
__m512 fft5504 = _mm512_fnmadd_ps(fft5487, fft5415, fft5496);
__m512 fft5420 = _mm512_fmadd_ps(fft5403, fft5415, fft5411);
__m512 fft5505 = _mm512_fmadd_ps(fft5490, fft5415, fft5497);
__m512 fft5421 = _mm512_fnmadd_ps(fft5402, fft5415, fft5412);
__m512 fft5506 = _mm512_fnmadd_ps(fft5489, fft5415, fft5498);
__m512 fft5422 = _mm512_fmadd_ps(fft5405, fft5415, fft5413);
__m512 fft5507 = _mm512_fmadd_ps(fft5492, fft5415, fft5499);
__m512 fft5423 = _mm512_fnmadd_ps(fft5404, fft5415, fft5414);
__m512 fft5508 = _mm512_fnmadd_ps(fft5491, fft5415, fft5500);
// Butterfly between adjacent 128-bit lanes: shuffle_f32x4 with imm 177 swaps
// lane 0<->1 and lane 2<->3; the sign vector gives sum in the even lanes and
// difference in the odd lanes.
__m512 fft5424 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft5425 = _mm512_fmadd_ps(fft5416, fft5424, _mm512_shuffle_f32x4(fft5416, fft5416, 177));
__m512 fft5509 = _mm512_fmadd_ps(fft5501, fft5424, _mm512_shuffle_f32x4(fft5501, fft5501, 177));
__m512 fft5426 = _mm512_fmadd_ps(fft5417, fft5424, _mm512_shuffle_f32x4(fft5417, fft5417, 177));
__m512 fft5510 = _mm512_fmadd_ps(fft5502, fft5424, _mm512_shuffle_f32x4(fft5502, fft5502, 177));
__m512 fft5427 = _mm512_fmadd_ps(fft5418, fft5424, _mm512_shuffle_f32x4(fft5418, fft5418, 177));
__m512 fft5511 = _mm512_fmadd_ps(fft5503, fft5424, _mm512_shuffle_f32x4(fft5503, fft5503, 177));
__m512 fft5428 = _mm512_fmadd_ps(fft5419, fft5424, _mm512_shuffle_f32x4(fft5419, fft5419, 177));
__m512 fft5512 = _mm512_fmadd_ps(fft5504, fft5424, _mm512_shuffle_f32x4(fft5504, fft5504, 177));
__m512 fft5429 = _mm512_fmadd_ps(fft5420, fft5424, _mm512_shuffle_f32x4(fft5420, fft5420, 177));
__m512 fft5513 = _mm512_fmadd_ps(fft5505, fft5424, _mm512_shuffle_f32x4(fft5505, fft5505, 177));
__m512 fft5430 = _mm512_fmadd_ps(fft5421, fft5424, _mm512_shuffle_f32x4(fft5421, fft5421, 177));
__m512 fft5514 = _mm512_fmadd_ps(fft5506, fft5424, _mm512_shuffle_f32x4(fft5506, fft5506, 177));
__m512 fft5431 = _mm512_fmadd_ps(fft5422, fft5424, _mm512_shuffle_f32x4(fft5422, fft5422, 177));
__m512 fft5515 = _mm512_fmadd_ps(fft5507, fft5424, _mm512_shuffle_f32x4(fft5507, fft5507, 177));
__m512 fft5432 = _mm512_fmadd_ps(fft5423, fft5424, _mm512_shuffle_f32x4(fft5423, fft5423, 177));
__m512 fft5516 = _mm512_fmadd_ps(fft5508, fft5424, _mm512_shuffle_f32x4(fft5508, fft5508, 177));
// Mask 49344 = 0xC0C0 selects lanes 6,7,14,15. In those lanes the first
// vector of each pair takes the partner's value and the partner takes the
// negated first value (0 - x); all other lanes pass through unchanged.
__m512 fft5433 = _mm512_mask_mov_ps(fft5425, 49344, fft5426);
__m512 fft5517 = _mm512_mask_mov_ps(fft5509, 49344, fft5510);
__m512 fft5434 = _mm512_mask_sub_ps(fft5426, 49344, _mm512_setzero_ps(), fft5425);
__m512 fft5518 = _mm512_mask_sub_ps(fft5510, 49344, _mm512_setzero_ps(), fft5509);
__m512 fft5435 = _mm512_mask_mov_ps(fft5427, 49344, fft5428);
__m512 fft5519 = _mm512_mask_mov_ps(fft5511, 49344, fft5512);
__m512 fft5436 = _mm512_mask_sub_ps(fft5428, 49344, _mm512_setzero_ps(), fft5427);
__m512 fft5520 = _mm512_mask_sub_ps(fft5512, 49344, _mm512_setzero_ps(), fft5511);
__m512 fft5437 = _mm512_mask_mov_ps(fft5429, 49344, fft5430);
__m512 fft5521 = _mm512_mask_mov_ps(fft5513, 49344, fft5514);
__m512 fft5438 = _mm512_mask_sub_ps(fft5430, 49344, _mm512_setzero_ps(), fft5429);
__m512 fft5522 = _mm512_mask_sub_ps(fft5514, 49344, _mm512_setzero_ps(), fft5513);
__m512 fft5439 = _mm512_mask_mov_ps(fft5431, 49344, fft5432);
__m512 fft5523 = _mm512_mask_mov_ps(fft5515, 49344, fft5516);
__m512 fft5440 = _mm512_mask_sub_ps(fft5432, 49344, _mm512_setzero_ps(), fft5431);
__m512 fft5524 = _mm512_mask_sub_ps(fft5516, 49344, _mm512_setzero_ps(), fft5515);
// Butterfly within each 128-bit lane: shuffle_ps with imm 78 swaps the two
// 64-bit pairs of a lane; the sign vector gives sum in elements 0-1 and
// difference in elements 2-3 of every lane.
__m512 fft5441 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft5442 = _mm512_fmadd_ps(fft5433, fft5441, _mm512_shuffle_ps(fft5433, fft5433, 78));
__m512 fft5525 = _mm512_fmadd_ps(fft5517, fft5441, _mm512_shuffle_ps(fft5517, fft5517, 78));
__m512 fft5443 = _mm512_fmadd_ps(fft5434, fft5441, _mm512_shuffle_ps(fft5434, fft5434, 78));
__m512 fft5526 = _mm512_fmadd_ps(fft5518, fft5441, _mm512_shuffle_ps(fft5518, fft5518, 78));
__m512 fft5444 = _mm512_fmadd_ps(fft5435, fft5441, _mm512_shuffle_ps(fft5435, fft5435, 78));
__m512 fft5527 = _mm512_fmadd_ps(fft5519, fft5441, _mm512_shuffle_ps(fft5519, fft5519, 78));
__m512 fft5445 = _mm512_fmadd_ps(fft5436, fft5441, _mm512_shuffle_ps(fft5436, fft5436, 78));
__m512 fft5528 = _mm512_fmadd_ps(fft5520, fft5441, _mm512_shuffle_ps(fft5520, fft5520, 78));
__m512 fft5446 = _mm512_fmadd_ps(fft5437, fft5441, _mm512_shuffle_ps(fft5437, fft5437, 78));
__m512 fft5529 = _mm512_fmadd_ps(fft5521, fft5441, _mm512_shuffle_ps(fft5521, fft5521, 78));
__m512 fft5447 = _mm512_fmadd_ps(fft5438, fft5441, _mm512_shuffle_ps(fft5438, fft5438, 78));
__m512 fft5530 = _mm512_fmadd_ps(fft5522, fft5441, _mm512_shuffle_ps(fft5522, fft5522, 78));
__m512 fft5448 = _mm512_fmadd_ps(fft5439, fft5441, _mm512_shuffle_ps(fft5439, fft5439, 78));
__m512 fft5531 = _mm512_fmadd_ps(fft5523, fft5441, _mm512_shuffle_ps(fft5523, fft5523, 78));
__m512 fft5449 = _mm512_fmadd_ps(fft5440, fft5441, _mm512_shuffle_ps(fft5440, fft5440, 78));
__m512 fft5532 = _mm512_fmadd_ps(fft5524, fft5441, _mm512_shuffle_ps(fft5524, fft5524, 78));
// Extra recombination applied only to the first pair of each chain
// (fft5442/fft5443 and fft5525/fft5526): two lane permutations of each vector
// are combined with a +1/-1/0 table and then merged lane-by-lane with the
// masks 21845 (0x5555), 43176 (0xA8A8) and 22102 (0x5656).
__m512i fft5450 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft5451 = _mm512_permutexvar_ps(fft5450, fft5442);
__m512 fft5533 = _mm512_permutexvar_ps(fft5450, fft5525);
__m512i fft5452 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft5453 = _mm512_permutexvar_ps(fft5452, fft5442);
__m512 fft5534 = _mm512_permutexvar_ps(fft5452, fft5525);
__m512 fft5454 = _mm512_permutexvar_ps(fft5450, fft5443);
__m512 fft5535 = _mm512_permutexvar_ps(fft5450, fft5526);
__m512 fft5455 = _mm512_permutexvar_ps(fft5452, fft5443);
__m512 fft5536 = _mm512_permutexvar_ps(fft5452, fft5526);
__m512 fft5456 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft5457 = _mm512_fmadd_ps(fft5451, fft5456, fft5453);
__m512 fft5537 = _mm512_fmadd_ps(fft5533, fft5456, fft5534);
__m512 fft5458 = _mm512_fnmadd_ps(fft5455, fft5456, fft5454);
__m512 fft5538 = _mm512_fnmadd_ps(fft5536, fft5456, fft5535);
__m512 fft5459 = _mm512_mask_mov_ps(fft5455, 21845, fft5457);
__m512 fft5539 = _mm512_mask_mov_ps(fft5536, 21845, fft5537);
__m512 fft5460 = _mm512_mask_mov_ps(fft5451, 43176, fft5457);
__m512 fft5540 = _mm512_mask_mov_ps(fft5533, 43176, fft5537);
__m512 fft5461 = _mm512_mask_mov_ps(fft5459, 43176, fft5458);
__m512 fft5541 = _mm512_mask_mov_ps(fft5539, 43176, fft5538);
__m512 fft5462 = _mm512_mask_mov_ps(fft5460, 22102, fft5458);
__m512 fft5542 = _mm512_mask_mov_ps(fft5540, 22102, fft5538);
// Halve every lane except 0, 1, 8 and 9 (mask 64764 = 0xFCFC); the unselected
// lanes keep their value from the first operand.
__m512 fft5463 = _mm512_mask_mul_ps(fft5461, 64764, fft5461, _mm512_set1_ps(5e-01f));
__m512 fft5543 = _mm512_mask_mul_ps(fft5541, 64764, fft5541, _mm512_set1_ps(5e-01f));
__m512 fft5464 = _mm512_mask_mul_ps(fft5462, 64764, fft5462, _mm512_set1_ps(5e-01f));
__m512 fft5544 = _mm512_mask_mul_ps(fft5542, 64764, fft5542, _mm512_set1_ps(5e-01f));
// Name the 16 result vectors df481..df496 for the store phase.
__m512 df481 = fft5463;
__m512 df489 = fft5543;
__m512 df482 = fft5464;
__m512 df490 = fft5544;
__m512 df483 = fft5444;
__m512 df491 = fft5527;
__m512 df484 = fft5445;
__m512 df492 = fft5528;
__m512 df485 = fft5446;
__m512 df493 = fft5529;
__m512 df486 = fft5447;
__m512 df494 = fft5530;
__m512 df487 = fft5448;
__m512 df495 = fft5531;
__m512 df488 = fft5449;
__m512 df496 = fft5532;
// eo33 gathers the even-indexed lanes into lanes 0-7 and the odd-indexed
// lanes into lanes 8-15. Each permuted vector is then written with two masked
// stores: lanes 0-7 (mask 255) and lanes 8-15 (mask 65280) go to different
// dfPtr1 offsets. The per-iteration part of the offset is
// 407040*i6 + 1152*j2 + 384*k16 + 128*m33 + 32*f34.
__m512i eo33 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df483 = _mm512_permutexvar_ps(eo33, df483);
df484 = _mm512_permutexvar_ps(eo33, df484);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df483);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df484);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df483);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df484);
df491 = _mm512_permutexvar_ps(eo33, df491);
df492 = _mm512_permutexvar_ps(eo33, df492);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df491);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df492);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df491);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df492);
df485 = _mm512_permutexvar_ps(eo33, df485);
df486 = _mm512_permutexvar_ps(eo33, df486);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df485);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df486);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df485);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df486);
df493 = _mm512_permutexvar_ps(eo33, df493);
df494 = _mm512_permutexvar_ps(eo33, df494);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df493);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df494);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df493);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df494);
df487 = _mm512_permutexvar_ps(eo33, df487);
df488 = _mm512_permutexvar_ps(eo33, df488);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df487);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df488);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df487);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df488);
df495 = _mm512_permutexvar_ps(eo33, df495);
df496 = _mm512_permutexvar_ps(eo33, df496);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df495);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df496);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df495);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df496);
// df481/df482/df489/df490 (the vectors that went through the extra
// recombination above) are stored as-is, without the eo33 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df481);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df482);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df481);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df482);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df489);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df490);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df489);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df490);
}
}
// Stop entirely once the last assigned j2 position has been processed;
// otherwise advance to the next position.
if (j2 >= last1) return;
++j2;
}
// Every position below 22 has been handled; the code after this branch
// continues from rel2 == 22.
rel2 = 22;
}
ptrdiff_t h16 = base2+50;
ptrdiff_t w16 = 180;
ptrdiff_t k17 = 3*s1;
ptrdiff_t kk16 = k17+2;
for (; k17 <= kk16; ++k17) {
for (ptrdiff_t b34 = 0; b34 < 4; ++b34) {
ptrdiff_t m34 = (size_t)b34/2;
ptrdiff_t f35 = (size_t)b34%2;
__m512 dat482 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat483 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat484 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat485 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat486 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat487 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat488 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat489 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat490 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat491 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat492 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat493 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat494 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat495 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat496 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 dat497 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k17+896*h16+4*w16+40*b34);
__m512 fft5545 = _mm512_add_ps(dat482, dat490);
__m512 fft5633 = _mm512_add_ps(dat483, dat491);
__m512 fft5546 = _mm512_sub_ps(dat482, dat490);
__m512 fft5634 = _mm512_sub_ps(dat483, dat491);
__m512 fft5547 = _mm512_add_ps(dat484, dat492);
__m512 fft5635 = _mm512_add_ps(dat485, dat493);
__m512 fft5548 = _mm512_sub_ps(dat484, dat492);
__m512 fft5636 = _mm512_sub_ps(dat485, dat493);
__m512 fft5549 = _mm512_add_ps(dat486, dat494);
__m512 fft5637 = _mm512_add_ps(dat487, dat495);
__m512 fft5550 = _mm512_sub_ps(dat486, dat494);
__m512 fft5638 = _mm512_sub_ps(dat487, dat495);
__m512 fft5551 = _mm512_add_ps(dat488, dat496);
__m512 fft5639 = _mm512_add_ps(dat489, dat497);
__m512 fft5552 = _mm512_sub_ps(dat488, dat496);
__m512 fft5640 = _mm512_sub_ps(dat489, dat497);
__m512 fft5553 = _mm512_add_ps(fft5545, fft5549);
__m512 fft5641 = _mm512_add_ps(fft5633, fft5637);
__m512 fft5554 = _mm512_sub_ps(fft5545, fft5549);
__m512 fft5642 = _mm512_sub_ps(fft5633, fft5637);
__m512 fft5555 = _mm512_add_ps(fft5547, fft5551);
__m512 fft5643 = _mm512_add_ps(fft5635, fft5639);
__m512 fft5556 = _mm512_sub_ps(fft5551, fft5547);
__m512 fft5644 = _mm512_sub_ps(fft5639, fft5635);
__m512 fft5557 = _mm512_sub_ps(fft5548, fft5552);
__m512 fft5645 = _mm512_sub_ps(fft5636, fft5640);
__m512 fft5558 = _mm512_add_ps(fft5548, fft5552);
__m512 fft5646 = _mm512_add_ps(fft5636, fft5640);
__m512 fft5559 = _mm512_add_ps(fft5553, fft5555);
__m512 fft5647 = _mm512_add_ps(fft5641, fft5643);
__m512 fft5560 = _mm512_sub_ps(fft5553, fft5555);
__m512 fft5648 = _mm512_sub_ps(fft5641, fft5643);
__m512 fft5561 = _mm512_fmadd_ps(fft5557, _mm512_set1_ps(7.0710677e-01f), fft5546);
__m512 fft5649 = _mm512_fmadd_ps(fft5645, _mm512_set1_ps(7.0710677e-01f), fft5634);
__m512 fft5562 = _mm512_fnmsub_ps(fft5558, _mm512_set1_ps(7.0710677e-01f), fft5550);
__m512 fft5650 = _mm512_fnmsub_ps(fft5646, _mm512_set1_ps(7.0710677e-01f), fft5638);
__m512 fft5563 = _mm512_fnmadd_ps(fft5557, _mm512_set1_ps(7.0710677e-01f), fft5546);
__m512 fft5651 = _mm512_fnmadd_ps(fft5645, _mm512_set1_ps(7.0710677e-01f), fft5634);
__m512 fft5564 = _mm512_fnmadd_ps(fft5558, _mm512_set1_ps(7.0710677e-01f), fft5550);
__m512 fft5652 = _mm512_fnmadd_ps(fft5646, _mm512_set1_ps(7.0710677e-01f), fft5638);
__m512 fft5565 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5566 = _mm512_fmadd_ps(fft5559, fft5565, _mm512_shuffle_f32x4(fft5559, fft5559, 78));
__m512 fft5653 = _mm512_fmadd_ps(fft5647, fft5565, _mm512_shuffle_f32x4(fft5647, fft5647, 78));
__m512 fft5567 = _mm512_fmadd_ps(fft5560, fft5565, _mm512_shuffle_f32x4(fft5560, fft5560, 78));
__m512 fft5654 = _mm512_fmadd_ps(fft5648, fft5565, _mm512_shuffle_f32x4(fft5648, fft5648, 78));
__m512 fft5568 = _mm512_fmadd_ps(fft5561, fft5565, _mm512_shuffle_f32x4(fft5561, fft5561, 78));
__m512 fft5655 = _mm512_fmadd_ps(fft5649, fft5565, _mm512_shuffle_f32x4(fft5649, fft5649, 78));
__m512 fft5569 = _mm512_fmadd_ps(fft5562, fft5565, _mm512_shuffle_f32x4(fft5562, fft5562, 78));
__m512 fft5656 = _mm512_fmadd_ps(fft5650, fft5565, _mm512_shuffle_f32x4(fft5650, fft5650, 78));
__m512 fft5570 = _mm512_fmadd_ps(fft5554, fft5565, _mm512_shuffle_f32x4(fft5554, fft5554, 78));
__m512 fft5657 = _mm512_fmadd_ps(fft5642, fft5565, _mm512_shuffle_f32x4(fft5642, fft5642, 78));
__m512 fft5571 = _mm512_fmadd_ps(fft5556, fft5565, _mm512_shuffle_f32x4(fft5556, fft5556, 78));
__m512 fft5658 = _mm512_fmadd_ps(fft5644, fft5565, _mm512_shuffle_f32x4(fft5644, fft5644, 78));
__m512 fft5572 = _mm512_fmadd_ps(fft5563, fft5565, _mm512_shuffle_f32x4(fft5563, fft5563, 78));
__m512 fft5659 = _mm512_fmadd_ps(fft5651, fft5565, _mm512_shuffle_f32x4(fft5651, fft5651, 78));
__m512 fft5573 = _mm512_fmadd_ps(fft5564, fft5565, _mm512_shuffle_f32x4(fft5564, fft5564, 78));
__m512 fft5660 = _mm512_fmadd_ps(fft5652, fft5565, _mm512_shuffle_f32x4(fft5652, fft5652, 78));
__m512 fft5574 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5575 = _mm512_mul_ps(fft5566, fft5574);
__m512 fft5661 = _mm512_mul_ps(fft5653, fft5574);
__m512 fft5576 = _mm512_mul_ps(fft5567, fft5574);
__m512 fft5662 = _mm512_mul_ps(fft5654, fft5574);
__m512 fft5577 = _mm512_mul_ps(fft5568, fft5574);
__m512 fft5663 = _mm512_mul_ps(fft5655, fft5574);
__m512 fft5578 = _mm512_mul_ps(fft5569, fft5574);
__m512 fft5664 = _mm512_mul_ps(fft5656, fft5574);
__m512 fft5579 = _mm512_mul_ps(fft5570, fft5574);
__m512 fft5665 = _mm512_mul_ps(fft5657, fft5574);
__m512 fft5580 = _mm512_mul_ps(fft5571, fft5574);
__m512 fft5666 = _mm512_mul_ps(fft5658, fft5574);
__m512 fft5581 = _mm512_mul_ps(fft5572, fft5574);
__m512 fft5667 = _mm512_mul_ps(fft5659, fft5574);
__m512 fft5582 = _mm512_mul_ps(fft5573, fft5574);
__m512 fft5668 = _mm512_mul_ps(fft5660, fft5574);
__m512 fft5583 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft5584 = _mm512_fmadd_ps(fft5567, fft5583, fft5575);
__m512 fft5669 = _mm512_fmadd_ps(fft5654, fft5583, fft5661);
__m512 fft5585 = _mm512_fnmadd_ps(fft5566, fft5583, fft5576);
__m512 fft5670 = _mm512_fnmadd_ps(fft5653, fft5583, fft5662);
__m512 fft5586 = _mm512_fmadd_ps(fft5569, fft5583, fft5577);
__m512 fft5671 = _mm512_fmadd_ps(fft5656, fft5583, fft5663);
__m512 fft5587 = _mm512_fnmadd_ps(fft5568, fft5583, fft5578);
__m512 fft5672 = _mm512_fnmadd_ps(fft5655, fft5583, fft5664);
__m512 fft5588 = _mm512_fmadd_ps(fft5571, fft5583, fft5579);
__m512 fft5673 = _mm512_fmadd_ps(fft5658, fft5583, fft5665);
__m512 fft5589 = _mm512_fnmadd_ps(fft5570, fft5583, fft5580);
__m512 fft5674 = _mm512_fnmadd_ps(fft5657, fft5583, fft5666);
__m512 fft5590 = _mm512_fmadd_ps(fft5573, fft5583, fft5581);
__m512 fft5675 = _mm512_fmadd_ps(fft5660, fft5583, fft5667);
__m512 fft5591 = _mm512_fnmadd_ps(fft5572, fft5583, fft5582);
__m512 fft5676 = _mm512_fnmadd_ps(fft5659, fft5583, fft5668);
__m512 fft5592 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft5593 = _mm512_fmadd_ps(fft5584, fft5592, _mm512_shuffle_f32x4(fft5584, fft5584, 177));
__m512 fft5677 = _mm512_fmadd_ps(fft5669, fft5592, _mm512_shuffle_f32x4(fft5669, fft5669, 177));
__m512 fft5594 = _mm512_fmadd_ps(fft5585, fft5592, _mm512_shuffle_f32x4(fft5585, fft5585, 177));
__m512 fft5678 = _mm512_fmadd_ps(fft5670, fft5592, _mm512_shuffle_f32x4(fft5670, fft5670, 177));
__m512 fft5595 = _mm512_fmadd_ps(fft5586, fft5592, _mm512_shuffle_f32x4(fft5586, fft5586, 177));
__m512 fft5679 = _mm512_fmadd_ps(fft5671, fft5592, _mm512_shuffle_f32x4(fft5671, fft5671, 177));
__m512 fft5596 = _mm512_fmadd_ps(fft5587, fft5592, _mm512_shuffle_f32x4(fft5587, fft5587, 177));
__m512 fft5680 = _mm512_fmadd_ps(fft5672, fft5592, _mm512_shuffle_f32x4(fft5672, fft5672, 177));
__m512 fft5597 = _mm512_fmadd_ps(fft5588, fft5592, _mm512_shuffle_f32x4(fft5588, fft5588, 177));
__m512 fft5681 = _mm512_fmadd_ps(fft5673, fft5592, _mm512_shuffle_f32x4(fft5673, fft5673, 177));
__m512 fft5598 = _mm512_fmadd_ps(fft5589, fft5592, _mm512_shuffle_f32x4(fft5589, fft5589, 177));
__m512 fft5682 = _mm512_fmadd_ps(fft5674, fft5592, _mm512_shuffle_f32x4(fft5674, fft5674, 177));
__m512 fft5599 = _mm512_fmadd_ps(fft5590, fft5592, _mm512_shuffle_f32x4(fft5590, fft5590, 177));
__m512 fft5683 = _mm512_fmadd_ps(fft5675, fft5592, _mm512_shuffle_f32x4(fft5675, fft5675, 177));
__m512 fft5600 = _mm512_fmadd_ps(fft5591, fft5592, _mm512_shuffle_f32x4(fft5591, fft5591, 177));
__m512 fft5684 = _mm512_fmadd_ps(fft5676, fft5592, _mm512_shuffle_f32x4(fft5676, fft5676, 177));
__m512 fft5601 = _mm512_mask_mov_ps(fft5593, 49344, fft5594);
__m512 fft5685 = _mm512_mask_mov_ps(fft5677, 49344, fft5678);
__m512 fft5602 = _mm512_mask_sub_ps(fft5594, 49344, _mm512_setzero_ps(), fft5593);
__m512 fft5686 = _mm512_mask_sub_ps(fft5678, 49344, _mm512_setzero_ps(), fft5677);
__m512 fft5603 = _mm512_mask_mov_ps(fft5595, 49344, fft5596);
__m512 fft5687 = _mm512_mask_mov_ps(fft5679, 49344, fft5680);
__m512 fft5604 = _mm512_mask_sub_ps(fft5596, 49344, _mm512_setzero_ps(), fft5595);
__m512 fft5688 = _mm512_mask_sub_ps(fft5680, 49344, _mm512_setzero_ps(), fft5679);
__m512 fft5605 = _mm512_mask_mov_ps(fft5597, 49344, fft5598);
__m512 fft5689 = _mm512_mask_mov_ps(fft5681, 49344, fft5682);
__m512 fft5606 = _mm512_mask_sub_ps(fft5598, 49344, _mm512_setzero_ps(), fft5597);
__m512 fft5690 = _mm512_mask_sub_ps(fft5682, 49344, _mm512_setzero_ps(), fft5681);
__m512 fft5607 = _mm512_mask_mov_ps(fft5599, 49344, fft5600);
__m512 fft5691 = _mm512_mask_mov_ps(fft5683, 49344, fft5684);
__m512 fft5608 = _mm512_mask_sub_ps(fft5600, 49344, _mm512_setzero_ps(), fft5599);
__m512 fft5692 = _mm512_mask_sub_ps(fft5684, 49344, _mm512_setzero_ps(), fft5683);
__m512 fft5609 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft5610 = _mm512_fmadd_ps(fft5601, fft5609, _mm512_shuffle_ps(fft5601, fft5601, 78));
__m512 fft5693 = _mm512_fmadd_ps(fft5685, fft5609, _mm512_shuffle_ps(fft5685, fft5685, 78));
__m512 fft5611 = _mm512_fmadd_ps(fft5602, fft5609, _mm512_shuffle_ps(fft5602, fft5602, 78));
__m512 fft5694 = _mm512_fmadd_ps(fft5686, fft5609, _mm512_shuffle_ps(fft5686, fft5686, 78));
__m512 fft5612 = _mm512_fmadd_ps(fft5603, fft5609, _mm512_shuffle_ps(fft5603, fft5603, 78));
__m512 fft5695 = _mm512_fmadd_ps(fft5687, fft5609, _mm512_shuffle_ps(fft5687, fft5687, 78));
__m512 fft5613 = _mm512_fmadd_ps(fft5604, fft5609, _mm512_shuffle_ps(fft5604, fft5604, 78));
__m512 fft5696 = _mm512_fmadd_ps(fft5688, fft5609, _mm512_shuffle_ps(fft5688, fft5688, 78));
__m512 fft5614 = _mm512_fmadd_ps(fft5605, fft5609, _mm512_shuffle_ps(fft5605, fft5605, 78));
__m512 fft5697 = _mm512_fmadd_ps(fft5689, fft5609, _mm512_shuffle_ps(fft5689, fft5689, 78));
__m512 fft5615 = _mm512_fmadd_ps(fft5606, fft5609, _mm512_shuffle_ps(fft5606, fft5606, 78));
__m512 fft5698 = _mm512_fmadd_ps(fft5690, fft5609, _mm512_shuffle_ps(fft5690, fft5690, 78));
__m512 fft5616 = _mm512_fmadd_ps(fft5607, fft5609, _mm512_shuffle_ps(fft5607, fft5607, 78));
__m512 fft5699 = _mm512_fmadd_ps(fft5691, fft5609, _mm512_shuffle_ps(fft5691, fft5691, 78));
__m512 fft5617 = _mm512_fmadd_ps(fft5608, fft5609, _mm512_shuffle_ps(fft5608, fft5608, 78));
__m512 fft5700 = _mm512_fmadd_ps(fft5692, fft5609, _mm512_shuffle_ps(fft5692, fft5692, 78));
__m512i fft5618 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft5619 = _mm512_permutexvar_ps(fft5618, fft5610);
__m512 fft5701 = _mm512_permutexvar_ps(fft5618, fft5693);
__m512i fft5620 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft5621 = _mm512_permutexvar_ps(fft5620, fft5610);
__m512 fft5702 = _mm512_permutexvar_ps(fft5620, fft5693);
__m512 fft5622 = _mm512_permutexvar_ps(fft5618, fft5611);
__m512 fft5703 = _mm512_permutexvar_ps(fft5618, fft5694);
__m512 fft5623 = _mm512_permutexvar_ps(fft5620, fft5611);
__m512 fft5704 = _mm512_permutexvar_ps(fft5620, fft5694);
__m512 fft5624 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft5625 = _mm512_fmadd_ps(fft5619, fft5624, fft5621);
__m512 fft5705 = _mm512_fmadd_ps(fft5701, fft5624, fft5702);
__m512 fft5626 = _mm512_fnmadd_ps(fft5623, fft5624, fft5622);
__m512 fft5706 = _mm512_fnmadd_ps(fft5704, fft5624, fft5703);
__m512 fft5627 = _mm512_mask_mov_ps(fft5623, 21845, fft5625);
__m512 fft5707 = _mm512_mask_mov_ps(fft5704, 21845, fft5705);
__m512 fft5628 = _mm512_mask_mov_ps(fft5619, 43176, fft5625);
__m512 fft5708 = _mm512_mask_mov_ps(fft5701, 43176, fft5705);
__m512 fft5629 = _mm512_mask_mov_ps(fft5627, 43176, fft5626);
__m512 fft5709 = _mm512_mask_mov_ps(fft5707, 43176, fft5706);
__m512 fft5630 = _mm512_mask_mov_ps(fft5628, 22102, fft5626);
__m512 fft5710 = _mm512_mask_mov_ps(fft5708, 22102, fft5706);
__m512 fft5631 = _mm512_mask_mul_ps(fft5629, 64764, fft5629, _mm512_set1_ps(5e-01f));
__m512 fft5711 = _mm512_mask_mul_ps(fft5709, 64764, fft5709, _mm512_set1_ps(5e-01f));
__m512 fft5632 = _mm512_mask_mul_ps(fft5630, 64764, fft5630, _mm512_set1_ps(5e-01f));
__m512 fft5712 = _mm512_mask_mul_ps(fft5710, 64764, fft5710, _mm512_set1_ps(5e-01f));
// Output phase for the tile addressed by m34/f35 (both set earlier in this scope).
// df497/df498 and df505/df506 alias the masked-multiply results computed just above;
// the remaining twelve alias fft5612..fft5617 and fft5695..fft5700.
__m512 df497 = fft5631;
__m512 df505 = fft5711;
__m512 df498 = fft5632;
__m512 df506 = fft5712;
__m512 df499 = fft5612;
__m512 df507 = fft5695;
__m512 df500 = fft5613;
__m512 df508 = fft5696;
__m512 df501 = fft5614;
__m512 df509 = fft5697;
__m512 df502 = fft5615;
__m512 df510 = fft5698;
__m512 df503 = fft5616;
__m512 df511 = fft5699;
__m512 df504 = fft5617;
__m512 df512 = fft5700;
// eo34 deinterleaves a vector (_mm512_set_epi32 lists lane 15 first): output lanes 0..7
// take source lanes 0,2,...,14 and output lanes 8..15 take source lanes 1,3,...,15.
__m512i eo34 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Each vector is written with two masked stores: mask 255 writes lanes 0..7 and
// mask 65280 writes lanes 8..15. The second pointer offset is always the first plus
// 407008; lanes 8..15 start 32 bytes into the vector, so if dfPtr1 is a byte pointer
// (declaration not visible here - TODO confirm) the upper half lands 407040 past the
// lower half, i.e. one step of the i6 term.
df499 = _mm512_permutexvar_ps(eo34, df499);
df500 = _mm512_permutexvar_ps(eo34, df500);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df499);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df500);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df499);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df500);
df507 = _mm512_permutexvar_ps(eo34, df507);
df508 = _mm512_permutexvar_ps(eo34, df508);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df507);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df508);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df507);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df508);
df501 = _mm512_permutexvar_ps(eo34, df501);
df502 = _mm512_permutexvar_ps(eo34, df502);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df501);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df502);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df501);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df502);
df509 = _mm512_permutexvar_ps(eo34, df509);
df510 = _mm512_permutexvar_ps(eo34, df510);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df509);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df510);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df509);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df510);
df503 = _mm512_permutexvar_ps(eo34, df503);
df504 = _mm512_permutexvar_ps(eo34, df504);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df503);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df504);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df503);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df504);
df511 = _mm512_permutexvar_ps(eo34, df511);
df512 = _mm512_permutexvar_ps(eo34, df512);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df511);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df512);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df511);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df512);
// df497/df498 and df505/df506 are stored without the eo34 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df497);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df498);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df497);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df498);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df505);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df506);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df505);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df506);
}
// Tile with fixed index b35 = 4. m35 (= 2) and f36 (= 0) are its quotient and
// remainder by 2; they are only used in the dfPtr1 store offsets (128*m35+32*f36).
ptrdiff_t b35 = 4;
ptrdiff_t m35 = (size_t)b35/2;
ptrdiff_t f36 = (size_t)b35%2;
// Load 16 vectors from datPtr1 at constant offsets 160, 1056, ..., 13600 (step 896,
// the same factor that multiplies h16). Mask 127 loads lanes 0..6 and zeroes lanes
// 7..15. The trailing 0*b35 term contributes nothing to the address.
// NOTE(review): presumably each load is one input row and the zeroed lanes are
// padding - confirm against the generator.
__m512 dat498 = _mm512_maskz_loadu_ps(127, datPtr1+160+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat499 = _mm512_maskz_loadu_ps(127, datPtr1+1056+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat500 = _mm512_maskz_loadu_ps(127, datPtr1+1952+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat501 = _mm512_maskz_loadu_ps(127, datPtr1+2848+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat502 = _mm512_maskz_loadu_ps(127, datPtr1+3744+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat503 = _mm512_maskz_loadu_ps(127, datPtr1+4640+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat504 = _mm512_maskz_loadu_ps(127, datPtr1+5536+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat505 = _mm512_maskz_loadu_ps(127, datPtr1+6432+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat506 = _mm512_maskz_loadu_ps(127, datPtr1+7328+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat507 = _mm512_maskz_loadu_ps(127, datPtr1+8224+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat508 = _mm512_maskz_loadu_ps(127, datPtr1+9120+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat509 = _mm512_maskz_loadu_ps(127, datPtr1+10016+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat510 = _mm512_maskz_loadu_ps(127, datPtr1+10912+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat511 = _mm512_maskz_loadu_ps(127, datPtr1+11808+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat512 = _mm512_maskz_loadu_ps(127, datPtr1+12704+602112*i6+200704*k17+896*h16+4*w16+0*b35);
__m512 dat513 = _mm512_maskz_loadu_ps(127, datPtr1+13600+602112*i6+200704*k17+896*h16+4*w16+0*b35);
// Lane-wise butterfly network over the 16 vectors dat498..dat513. Two independent
// chains are interleaved line by line: dat498, dat500, ..., dat512 produce
// fft5713..fft5732, and dat499, dat501, ..., dat513 produce fft5801..fft5820.
// NOTE(review): the fft* names suggest this is the cross-vector part of an FFT -
// confirm against the generator.
// Level 1: sum and difference of the two vectors eight loads apart.
__m512 fft5713 = _mm512_add_ps(dat498, dat506);
__m512 fft5801 = _mm512_add_ps(dat499, dat507);
__m512 fft5714 = _mm512_sub_ps(dat498, dat506);
__m512 fft5802 = _mm512_sub_ps(dat499, dat507);
__m512 fft5715 = _mm512_add_ps(dat500, dat508);
__m512 fft5803 = _mm512_add_ps(dat501, dat509);
__m512 fft5716 = _mm512_sub_ps(dat500, dat508);
__m512 fft5804 = _mm512_sub_ps(dat501, dat509);
__m512 fft5717 = _mm512_add_ps(dat502, dat510);
__m512 fft5805 = _mm512_add_ps(dat503, dat511);
__m512 fft5718 = _mm512_sub_ps(dat502, dat510);
__m512 fft5806 = _mm512_sub_ps(dat503, dat511);
__m512 fft5719 = _mm512_add_ps(dat504, dat512);
__m512 fft5807 = _mm512_add_ps(dat505, dat513);
__m512 fft5720 = _mm512_sub_ps(dat504, dat512);
__m512 fft5808 = _mm512_sub_ps(dat505, dat513);
// Level 2: sums and differences of level-1 results. Note that fft5724/fft5812
// subtract in the opposite operand order from the neighbouring lines.
__m512 fft5721 = _mm512_add_ps(fft5713, fft5717);
__m512 fft5809 = _mm512_add_ps(fft5801, fft5805);
__m512 fft5722 = _mm512_sub_ps(fft5713, fft5717);
__m512 fft5810 = _mm512_sub_ps(fft5801, fft5805);
__m512 fft5723 = _mm512_add_ps(fft5715, fft5719);
__m512 fft5811 = _mm512_add_ps(fft5803, fft5807);
__m512 fft5724 = _mm512_sub_ps(fft5719, fft5715);
__m512 fft5812 = _mm512_sub_ps(fft5807, fft5803);
__m512 fft5725 = _mm512_sub_ps(fft5716, fft5720);
__m512 fft5813 = _mm512_sub_ps(fft5804, fft5808);
__m512 fft5726 = _mm512_add_ps(fft5716, fft5720);
__m512 fft5814 = _mm512_add_ps(fft5804, fft5808);
// Level 3: final sums/differences, plus fused multiply-adds by 7.0710677e-01f
// (about 1/sqrt(2)). fmadd(a,b,c) = a*b+c, fnmadd(a,b,c) = c-a*b,
// fnmsub(a,b,c) = -(a*b)-c.
__m512 fft5727 = _mm512_add_ps(fft5721, fft5723);
__m512 fft5815 = _mm512_add_ps(fft5809, fft5811);
__m512 fft5728 = _mm512_sub_ps(fft5721, fft5723);
__m512 fft5816 = _mm512_sub_ps(fft5809, fft5811);
__m512 fft5729 = _mm512_fmadd_ps(fft5725, _mm512_set1_ps(7.0710677e-01f), fft5714);
__m512 fft5817 = _mm512_fmadd_ps(fft5813, _mm512_set1_ps(7.0710677e-01f), fft5802);
__m512 fft5730 = _mm512_fnmsub_ps(fft5726, _mm512_set1_ps(7.0710677e-01f), fft5718);
__m512 fft5818 = _mm512_fnmsub_ps(fft5814, _mm512_set1_ps(7.0710677e-01f), fft5806);
__m512 fft5731 = _mm512_fnmadd_ps(fft5725, _mm512_set1_ps(7.0710677e-01f), fft5714);
__m512 fft5819 = _mm512_fnmadd_ps(fft5813, _mm512_set1_ps(7.0710677e-01f), fft5802);
__m512 fft5732 = _mm512_fnmadd_ps(fft5726, _mm512_set1_ps(7.0710677e-01f), fft5718);
__m512 fft5820 = _mm512_fnmadd_ps(fft5814, _mm512_set1_ps(7.0710677e-01f), fft5806);
// In-register stages for tile b35: the same sequence of shuffles, sign-multiplies and
// blends is applied to each of the 16 vectors, with the two chains interleaved.
// NOTE(review): presumably these are the within-vector passes of an FFT - confirm.
//
// Step 1 (distance 8): shuffle_f32x4 with imm 78 swaps the two 256-bit halves.
// fft5733 is +1 in lanes 0..7 and -1 in lanes 8..15 (_mm512_set_ps lists lane 15
// first), so lanes 0..7 become x[i]+x[i+8] and lanes 8..15 become x[i-8]-x[i].
__m512 fft5733 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5734 = _mm512_fmadd_ps(fft5727, fft5733, _mm512_shuffle_f32x4(fft5727, fft5727, 78));
__m512 fft5821 = _mm512_fmadd_ps(fft5815, fft5733, _mm512_shuffle_f32x4(fft5815, fft5815, 78));
__m512 fft5735 = _mm512_fmadd_ps(fft5728, fft5733, _mm512_shuffle_f32x4(fft5728, fft5728, 78));
__m512 fft5822 = _mm512_fmadd_ps(fft5816, fft5733, _mm512_shuffle_f32x4(fft5816, fft5816, 78));
__m512 fft5736 = _mm512_fmadd_ps(fft5729, fft5733, _mm512_shuffle_f32x4(fft5729, fft5729, 78));
__m512 fft5823 = _mm512_fmadd_ps(fft5817, fft5733, _mm512_shuffle_f32x4(fft5817, fft5817, 78));
__m512 fft5737 = _mm512_fmadd_ps(fft5730, fft5733, _mm512_shuffle_f32x4(fft5730, fft5730, 78));
__m512 fft5824 = _mm512_fmadd_ps(fft5818, fft5733, _mm512_shuffle_f32x4(fft5818, fft5818, 78));
__m512 fft5738 = _mm512_fmadd_ps(fft5722, fft5733, _mm512_shuffle_f32x4(fft5722, fft5722, 78));
__m512 fft5825 = _mm512_fmadd_ps(fft5810, fft5733, _mm512_shuffle_f32x4(fft5810, fft5810, 78));
__m512 fft5739 = _mm512_fmadd_ps(fft5724, fft5733, _mm512_shuffle_f32x4(fft5724, fft5724, 78));
__m512 fft5826 = _mm512_fmadd_ps(fft5812, fft5733, _mm512_shuffle_f32x4(fft5812, fft5812, 78));
__m512 fft5740 = _mm512_fmadd_ps(fft5731, fft5733, _mm512_shuffle_f32x4(fft5731, fft5731, 78));
__m512 fft5827 = _mm512_fmadd_ps(fft5819, fft5733, _mm512_shuffle_f32x4(fft5819, fft5819, 78));
__m512 fft5741 = _mm512_fmadd_ps(fft5732, fft5733, _mm512_shuffle_f32x4(fft5732, fft5732, 78));
__m512 fft5828 = _mm512_fmadd_ps(fft5820, fft5733, _mm512_shuffle_f32x4(fft5820, fft5820, 78));
// Step 2 (per-lane rotation of vector pairs): with c = fft5742 and s = fft5751, each
// consecutive pair (a, b) becomes (a*c + b*s, b*c - a*s). In lanes 0..9 c = 1 and
// s = 0, so those lanes pass through unchanged; only lanes 10..15 are rotated.
// NOTE(review): this has the form of a complex multiply with a/b as real/imaginary
// parts - confirm.
__m512 fft5742 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5743 = _mm512_mul_ps(fft5734, fft5742);
__m512 fft5829 = _mm512_mul_ps(fft5821, fft5742);
__m512 fft5744 = _mm512_mul_ps(fft5735, fft5742);
__m512 fft5830 = _mm512_mul_ps(fft5822, fft5742);
__m512 fft5745 = _mm512_mul_ps(fft5736, fft5742);
__m512 fft5831 = _mm512_mul_ps(fft5823, fft5742);
__m512 fft5746 = _mm512_mul_ps(fft5737, fft5742);
__m512 fft5832 = _mm512_mul_ps(fft5824, fft5742);
__m512 fft5747 = _mm512_mul_ps(fft5738, fft5742);
__m512 fft5833 = _mm512_mul_ps(fft5825, fft5742);
__m512 fft5748 = _mm512_mul_ps(fft5739, fft5742);
__m512 fft5834 = _mm512_mul_ps(fft5826, fft5742);
__m512 fft5749 = _mm512_mul_ps(fft5740, fft5742);
__m512 fft5835 = _mm512_mul_ps(fft5827, fft5742);
__m512 fft5750 = _mm512_mul_ps(fft5741, fft5742);
__m512 fft5836 = _mm512_mul_ps(fft5828, fft5742);
__m512 fft5751 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft5752 = _mm512_fmadd_ps(fft5735, fft5751, fft5743);
__m512 fft5837 = _mm512_fmadd_ps(fft5822, fft5751, fft5829);
__m512 fft5753 = _mm512_fnmadd_ps(fft5734, fft5751, fft5744);
__m512 fft5838 = _mm512_fnmadd_ps(fft5821, fft5751, fft5830);
__m512 fft5754 = _mm512_fmadd_ps(fft5737, fft5751, fft5745);
__m512 fft5839 = _mm512_fmadd_ps(fft5824, fft5751, fft5831);
__m512 fft5755 = _mm512_fnmadd_ps(fft5736, fft5751, fft5746);
__m512 fft5840 = _mm512_fnmadd_ps(fft5823, fft5751, fft5832);
__m512 fft5756 = _mm512_fmadd_ps(fft5739, fft5751, fft5747);
__m512 fft5841 = _mm512_fmadd_ps(fft5826, fft5751, fft5833);
__m512 fft5757 = _mm512_fnmadd_ps(fft5738, fft5751, fft5748);
__m512 fft5842 = _mm512_fnmadd_ps(fft5825, fft5751, fft5834);
__m512 fft5758 = _mm512_fmadd_ps(fft5741, fft5751, fft5749);
__m512 fft5843 = _mm512_fmadd_ps(fft5828, fft5751, fft5835);
__m512 fft5759 = _mm512_fnmadd_ps(fft5740, fft5751, fft5750);
__m512 fft5844 = _mm512_fnmadd_ps(fft5827, fft5751, fft5836);
// Step 3 (distance 4): shuffle_f32x4 with imm 177 swaps adjacent 128-bit lanes.
// fft5760 is +1,+1,+1,+1,-1,-1,-1,-1 repeating from lane 0, so within each 8-lane
// half lanes 0..3 become x[i]+x[i+4] and lanes 4..7 become x[i-4]-x[i].
__m512 fft5760 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft5761 = _mm512_fmadd_ps(fft5752, fft5760, _mm512_shuffle_f32x4(fft5752, fft5752, 177));
__m512 fft5845 = _mm512_fmadd_ps(fft5837, fft5760, _mm512_shuffle_f32x4(fft5837, fft5837, 177));
__m512 fft5762 = _mm512_fmadd_ps(fft5753, fft5760, _mm512_shuffle_f32x4(fft5753, fft5753, 177));
__m512 fft5846 = _mm512_fmadd_ps(fft5838, fft5760, _mm512_shuffle_f32x4(fft5838, fft5838, 177));
__m512 fft5763 = _mm512_fmadd_ps(fft5754, fft5760, _mm512_shuffle_f32x4(fft5754, fft5754, 177));
__m512 fft5847 = _mm512_fmadd_ps(fft5839, fft5760, _mm512_shuffle_f32x4(fft5839, fft5839, 177));
__m512 fft5764 = _mm512_fmadd_ps(fft5755, fft5760, _mm512_shuffle_f32x4(fft5755, fft5755, 177));
__m512 fft5848 = _mm512_fmadd_ps(fft5840, fft5760, _mm512_shuffle_f32x4(fft5840, fft5840, 177));
__m512 fft5765 = _mm512_fmadd_ps(fft5756, fft5760, _mm512_shuffle_f32x4(fft5756, fft5756, 177));
__m512 fft5849 = _mm512_fmadd_ps(fft5841, fft5760, _mm512_shuffle_f32x4(fft5841, fft5841, 177));
__m512 fft5766 = _mm512_fmadd_ps(fft5757, fft5760, _mm512_shuffle_f32x4(fft5757, fft5757, 177));
__m512 fft5850 = _mm512_fmadd_ps(fft5842, fft5760, _mm512_shuffle_f32x4(fft5842, fft5842, 177));
__m512 fft5767 = _mm512_fmadd_ps(fft5758, fft5760, _mm512_shuffle_f32x4(fft5758, fft5758, 177));
__m512 fft5851 = _mm512_fmadd_ps(fft5843, fft5760, _mm512_shuffle_f32x4(fft5843, fft5843, 177));
__m512 fft5768 = _mm512_fmadd_ps(fft5759, fft5760, _mm512_shuffle_f32x4(fft5759, fft5759, 177));
__m512 fft5852 = _mm512_fmadd_ps(fft5844, fft5760, _mm512_shuffle_f32x4(fft5844, fft5844, 177));
// Step 4: mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes each pair
// (a, b) is replaced by (b, -a); all other lanes keep (a, b).
__m512 fft5769 = _mm512_mask_mov_ps(fft5761, 49344, fft5762);
__m512 fft5853 = _mm512_mask_mov_ps(fft5845, 49344, fft5846);
__m512 fft5770 = _mm512_mask_sub_ps(fft5762, 49344, _mm512_setzero_ps(), fft5761);
__m512 fft5854 = _mm512_mask_sub_ps(fft5846, 49344, _mm512_setzero_ps(), fft5845);
__m512 fft5771 = _mm512_mask_mov_ps(fft5763, 49344, fft5764);
__m512 fft5855 = _mm512_mask_mov_ps(fft5847, 49344, fft5848);
__m512 fft5772 = _mm512_mask_sub_ps(fft5764, 49344, _mm512_setzero_ps(), fft5763);
__m512 fft5856 = _mm512_mask_sub_ps(fft5848, 49344, _mm512_setzero_ps(), fft5847);
__m512 fft5773 = _mm512_mask_mov_ps(fft5765, 49344, fft5766);
__m512 fft5857 = _mm512_mask_mov_ps(fft5849, 49344, fft5850);
__m512 fft5774 = _mm512_mask_sub_ps(fft5766, 49344, _mm512_setzero_ps(), fft5765);
__m512 fft5858 = _mm512_mask_sub_ps(fft5850, 49344, _mm512_setzero_ps(), fft5849);
__m512 fft5775 = _mm512_mask_mov_ps(fft5767, 49344, fft5768);
__m512 fft5859 = _mm512_mask_mov_ps(fft5851, 49344, fft5852);
__m512 fft5776 = _mm512_mask_sub_ps(fft5768, 49344, _mm512_setzero_ps(), fft5767);
__m512 fft5860 = _mm512_mask_sub_ps(fft5852, 49344, _mm512_setzero_ps(), fft5851);
// Step 5 (distance 2): shuffle_ps with imm 78 swaps the two 64-bit halves of every
// 128-bit lane. fft5777 is +1,+1,-1,-1 repeating from lane 0, so within each 4-lane
// group lanes 0..1 become x[i]+x[i+2] and lanes 2..3 become x[i-2]-x[i].
__m512 fft5777 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft5778 = _mm512_fmadd_ps(fft5769, fft5777, _mm512_shuffle_ps(fft5769, fft5769, 78));
__m512 fft5861 = _mm512_fmadd_ps(fft5853, fft5777, _mm512_shuffle_ps(fft5853, fft5853, 78));
__m512 fft5779 = _mm512_fmadd_ps(fft5770, fft5777, _mm512_shuffle_ps(fft5770, fft5770, 78));
__m512 fft5862 = _mm512_fmadd_ps(fft5854, fft5777, _mm512_shuffle_ps(fft5854, fft5854, 78));
__m512 fft5780 = _mm512_fmadd_ps(fft5771, fft5777, _mm512_shuffle_ps(fft5771, fft5771, 78));
__m512 fft5863 = _mm512_fmadd_ps(fft5855, fft5777, _mm512_shuffle_ps(fft5855, fft5855, 78));
__m512 fft5781 = _mm512_fmadd_ps(fft5772, fft5777, _mm512_shuffle_ps(fft5772, fft5772, 78));
__m512 fft5864 = _mm512_fmadd_ps(fft5856, fft5777, _mm512_shuffle_ps(fft5856, fft5856, 78));
__m512 fft5782 = _mm512_fmadd_ps(fft5773, fft5777, _mm512_shuffle_ps(fft5773, fft5773, 78));
__m512 fft5865 = _mm512_fmadd_ps(fft5857, fft5777, _mm512_shuffle_ps(fft5857, fft5857, 78));
__m512 fft5783 = _mm512_fmadd_ps(fft5774, fft5777, _mm512_shuffle_ps(fft5774, fft5774, 78));
__m512 fft5866 = _mm512_fmadd_ps(fft5858, fft5777, _mm512_shuffle_ps(fft5858, fft5858, 78));
__m512 fft5784 = _mm512_fmadd_ps(fft5775, fft5777, _mm512_shuffle_ps(fft5775, fft5775, 78));
__m512 fft5867 = _mm512_fmadd_ps(fft5859, fft5777, _mm512_shuffle_ps(fft5859, fft5859, 78));
__m512 fft5785 = _mm512_fmadd_ps(fft5776, fft5777, _mm512_shuffle_ps(fft5776, fft5776, 78));
__m512 fft5868 = _mm512_fmadd_ps(fft5860, fft5777, _mm512_shuffle_ps(fft5860, fft5860, 78));
// Step 6: extra post-processing applied only to the first pair of each chain
// (fft5778/fft5779 and fft5861/fft5862); the other step-5 results are used as-is.
// Two lane permutations (fft5786, fft5788) duplicate selected source lanes into
// adjacent output lanes before they are combined.
// NOTE(review): presumably this untangles a packed real-input transform for the
// first output pair - confirm against the generator.
__m512i fft5786 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft5787 = _mm512_permutexvar_ps(fft5786, fft5778);
__m512 fft5869 = _mm512_permutexvar_ps(fft5786, fft5861);
__m512i fft5788 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft5789 = _mm512_permutexvar_ps(fft5788, fft5778);
__m512 fft5870 = _mm512_permutexvar_ps(fft5788, fft5861);
__m512 fft5790 = _mm512_permutexvar_ps(fft5786, fft5779);
__m512 fft5871 = _mm512_permutexvar_ps(fft5786, fft5862);
__m512 fft5791 = _mm512_permutexvar_ps(fft5788, fft5779);
__m512 fft5872 = _mm512_permutexvar_ps(fft5788, fft5862);
// fft5792 is 0 in lanes 0, 1, 8, 9 and alternates +1 (even lane) / -1 (odd lane)
// in the remaining lanes.
__m512 fft5792 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft5793 = _mm512_fmadd_ps(fft5787, fft5792, fft5789);
__m512 fft5873 = _mm512_fmadd_ps(fft5869, fft5792, fft5870);
__m512 fft5794 = _mm512_fnmadd_ps(fft5791, fft5792, fft5790);
__m512 fft5874 = _mm512_fnmadd_ps(fft5872, fft5792, fft5871);
// Blend the combined values back per lane. Masks: 21845 (0x5555) = even lanes;
// 43176 (0xA8A8) = lanes 3,5,7,11,13,15; 22102 (0x5656) = lanes 1,2,4,6,9,10,12,14.
__m512 fft5795 = _mm512_mask_mov_ps(fft5791, 21845, fft5793);
__m512 fft5875 = _mm512_mask_mov_ps(fft5872, 21845, fft5873);
__m512 fft5796 = _mm512_mask_mov_ps(fft5787, 43176, fft5793);
__m512 fft5876 = _mm512_mask_mov_ps(fft5869, 43176, fft5873);
__m512 fft5797 = _mm512_mask_mov_ps(fft5795, 43176, fft5794);
__m512 fft5877 = _mm512_mask_mov_ps(fft5875, 43176, fft5874);
__m512 fft5798 = _mm512_mask_mov_ps(fft5796, 22102, fft5794);
__m512 fft5878 = _mm512_mask_mov_ps(fft5876, 22102, fft5874);
// Halve lanes 2..7 and 10..15 (mask 64764 = 0xFCFC); lanes 0, 1, 8, 9 are unchanged.
__m512 fft5799 = _mm512_mask_mul_ps(fft5797, 64764, fft5797, _mm512_set1_ps(5e-01f));
__m512 fft5879 = _mm512_mask_mul_ps(fft5877, 64764, fft5877, _mm512_set1_ps(5e-01f));
__m512 fft5800 = _mm512_mask_mul_ps(fft5798, 64764, fft5798, _mm512_set1_ps(5e-01f));
__m512 fft5880 = _mm512_mask_mul_ps(fft5878, 64764, fft5878, _mm512_set1_ps(5e-01f));
// Output phase for tile b35 (store offsets use m35 and f36).
// df513/df514 and df521/df522 alias the masked-multiply results fft5799/fft5800 and
// fft5879/fft5880; the remaining twelve alias fft5780..fft5785 and fft5863..fft5868.
__m512 df513 = fft5799;
__m512 df521 = fft5879;
__m512 df514 = fft5800;
__m512 df522 = fft5880;
__m512 df515 = fft5780;
__m512 df523 = fft5863;
__m512 df516 = fft5781;
__m512 df524 = fft5864;
__m512 df517 = fft5782;
__m512 df525 = fft5865;
__m512 df518 = fft5783;
__m512 df526 = fft5866;
__m512 df519 = fft5784;
__m512 df527 = fft5867;
__m512 df520 = fft5785;
__m512 df528 = fft5868;
// eo35 deinterleaves a vector (_mm512_set_epi32 lists lane 15 first): output lanes 0..7
// take source lanes 0,2,...,14 and output lanes 8..15 take source lanes 1,3,...,15.
__m512i eo35 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Each vector is written with two masked stores: mask 255 writes lanes 0..7 and
// mask 65280 writes lanes 8..15. The second pointer offset is always the first plus
// 407008; lanes 8..15 start 32 bytes into the vector, so if dfPtr1 is a byte pointer
// (declaration not visible here - TODO confirm) the upper half lands 407040 past the
// lower half, i.e. one step of the i6 term.
df515 = _mm512_permutexvar_ps(eo35, df515);
df516 = _mm512_permutexvar_ps(eo35, df516);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df515);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df516);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df515);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df516);
df523 = _mm512_permutexvar_ps(eo35, df523);
df524 = _mm512_permutexvar_ps(eo35, df524);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df523);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df524);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df523);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df524);
df517 = _mm512_permutexvar_ps(eo35, df517);
df518 = _mm512_permutexvar_ps(eo35, df518);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df517);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df518);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df517);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df518);
df525 = _mm512_permutexvar_ps(eo35, df525);
df526 = _mm512_permutexvar_ps(eo35, df526);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df525);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df526);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df525);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df526);
df519 = _mm512_permutexvar_ps(eo35, df519);
df520 = _mm512_permutexvar_ps(eo35, df520);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df519);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df520);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df519);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df520);
df527 = _mm512_permutexvar_ps(eo35, df527);
df528 = _mm512_permutexvar_ps(eo35, df528);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df527);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df528);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df527);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df528);
// df513/df514 and df521/df522 are stored without the eo35 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df513);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df514);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df513);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df514);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df521);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df522);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df521);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df522);
// Tile with fixed index b36 = 5. m36 (= 2) and f37 (= 1) are its quotient and
// remainder by 2.
// NOTE(review): presumably used in the dfPtr1 store offsets for this tile, as for
// the preceding tile - the stores are further down; confirm.
ptrdiff_t b36 = 5;
ptrdiff_t m36 = (size_t)b36/2;
ptrdiff_t f37 = (size_t)b36%2;
// Load 16 vectors from datPtr1 at constant offsets 8240, 9136, ..., 21680 (step 896,
// the same factor that multiplies h16). Mask 65528 (0xFFF8) loads lanes 3..15 and
// zeroes lanes 0..2. The trailing 0*b36 term contributes nothing to the address.
// NOTE(review): presumably the zeroed leading lanes are padding - confirm against
// the generator.
__m512 dat514 = _mm512_maskz_loadu_ps(65528, datPtr1+8240+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat515 = _mm512_maskz_loadu_ps(65528, datPtr1+9136+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat516 = _mm512_maskz_loadu_ps(65528, datPtr1+10032+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat517 = _mm512_maskz_loadu_ps(65528, datPtr1+10928+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat518 = _mm512_maskz_loadu_ps(65528, datPtr1+11824+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat519 = _mm512_maskz_loadu_ps(65528, datPtr1+12720+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat520 = _mm512_maskz_loadu_ps(65528, datPtr1+13616+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat521 = _mm512_maskz_loadu_ps(65528, datPtr1+14512+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat522 = _mm512_maskz_loadu_ps(65528, datPtr1+15408+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat523 = _mm512_maskz_loadu_ps(65528, datPtr1+16304+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat524 = _mm512_maskz_loadu_ps(65528, datPtr1+17200+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat525 = _mm512_maskz_loadu_ps(65528, datPtr1+18096+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat526 = _mm512_maskz_loadu_ps(65528, datPtr1+18992+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat527 = _mm512_maskz_loadu_ps(65528, datPtr1+19888+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat528 = _mm512_maskz_loadu_ps(65528, datPtr1+20784+602112*i6+200704*k17+896*h16+4*w16+0*b36);
__m512 dat529 = _mm512_maskz_loadu_ps(65528, datPtr1+21680+602112*i6+200704*k17+896*h16+4*w16+0*b36);
// Lane-wise butterfly network over the 16 vectors dat514..dat529. Two independent
// chains are interleaved line by line: dat514, dat516, ..., dat528 produce
// fft5881..fft5900, and dat515, dat517, ..., dat529 produce fft5969..fft5988.
// NOTE(review): the fft* names suggest this is the cross-vector part of an FFT -
// confirm against the generator.
// Level 1: sum and difference of the two vectors eight loads apart.
__m512 fft5881 = _mm512_add_ps(dat514, dat522);
__m512 fft5969 = _mm512_add_ps(dat515, dat523);
__m512 fft5882 = _mm512_sub_ps(dat514, dat522);
__m512 fft5970 = _mm512_sub_ps(dat515, dat523);
__m512 fft5883 = _mm512_add_ps(dat516, dat524);
__m512 fft5971 = _mm512_add_ps(dat517, dat525);
__m512 fft5884 = _mm512_sub_ps(dat516, dat524);
__m512 fft5972 = _mm512_sub_ps(dat517, dat525);
__m512 fft5885 = _mm512_add_ps(dat518, dat526);
__m512 fft5973 = _mm512_add_ps(dat519, dat527);
__m512 fft5886 = _mm512_sub_ps(dat518, dat526);
__m512 fft5974 = _mm512_sub_ps(dat519, dat527);
__m512 fft5887 = _mm512_add_ps(dat520, dat528);
__m512 fft5975 = _mm512_add_ps(dat521, dat529);
__m512 fft5888 = _mm512_sub_ps(dat520, dat528);
__m512 fft5976 = _mm512_sub_ps(dat521, dat529);
// Level 2: sums and differences of level-1 results. Note that fft5892/fft5980
// subtract in the opposite operand order from the neighbouring lines.
__m512 fft5889 = _mm512_add_ps(fft5881, fft5885);
__m512 fft5977 = _mm512_add_ps(fft5969, fft5973);
__m512 fft5890 = _mm512_sub_ps(fft5881, fft5885);
__m512 fft5978 = _mm512_sub_ps(fft5969, fft5973);
__m512 fft5891 = _mm512_add_ps(fft5883, fft5887);
__m512 fft5979 = _mm512_add_ps(fft5971, fft5975);
__m512 fft5892 = _mm512_sub_ps(fft5887, fft5883);
__m512 fft5980 = _mm512_sub_ps(fft5975, fft5971);
__m512 fft5893 = _mm512_sub_ps(fft5884, fft5888);
__m512 fft5981 = _mm512_sub_ps(fft5972, fft5976);
__m512 fft5894 = _mm512_add_ps(fft5884, fft5888);
__m512 fft5982 = _mm512_add_ps(fft5972, fft5976);
// Level 3: final sums/differences, plus fused multiply-adds by 7.0710677e-01f
// (about 1/sqrt(2)). fmadd(a,b,c) = a*b+c, fnmadd(a,b,c) = c-a*b,
// fnmsub(a,b,c) = -(a*b)-c.
__m512 fft5895 = _mm512_add_ps(fft5889, fft5891);
__m512 fft5983 = _mm512_add_ps(fft5977, fft5979);
__m512 fft5896 = _mm512_sub_ps(fft5889, fft5891);
__m512 fft5984 = _mm512_sub_ps(fft5977, fft5979);
__m512 fft5897 = _mm512_fmadd_ps(fft5893, _mm512_set1_ps(7.0710677e-01f), fft5882);
__m512 fft5985 = _mm512_fmadd_ps(fft5981, _mm512_set1_ps(7.0710677e-01f), fft5970);
__m512 fft5898 = _mm512_fnmsub_ps(fft5894, _mm512_set1_ps(7.0710677e-01f), fft5886);
__m512 fft5986 = _mm512_fnmsub_ps(fft5982, _mm512_set1_ps(7.0710677e-01f), fft5974);
__m512 fft5899 = _mm512_fnmadd_ps(fft5893, _mm512_set1_ps(7.0710677e-01f), fft5882);
__m512 fft5987 = _mm512_fnmadd_ps(fft5981, _mm512_set1_ps(7.0710677e-01f), fft5970);
__m512 fft5900 = _mm512_fnmadd_ps(fft5894, _mm512_set1_ps(7.0710677e-01f), fft5886);
__m512 fft5988 = _mm512_fnmadd_ps(fft5982, _mm512_set1_ps(7.0710677e-01f), fft5974);
__m512 fft5901 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5902 = _mm512_fmadd_ps(fft5895, fft5901, _mm512_shuffle_f32x4(fft5895, fft5895, 78));
__m512 fft5989 = _mm512_fmadd_ps(fft5983, fft5901, _mm512_shuffle_f32x4(fft5983, fft5983, 78));
__m512 fft5903 = _mm512_fmadd_ps(fft5896, fft5901, _mm512_shuffle_f32x4(fft5896, fft5896, 78));
__m512 fft5990 = _mm512_fmadd_ps(fft5984, fft5901, _mm512_shuffle_f32x4(fft5984, fft5984, 78));
__m512 fft5904 = _mm512_fmadd_ps(fft5897, fft5901, _mm512_shuffle_f32x4(fft5897, fft5897, 78));
__m512 fft5991 = _mm512_fmadd_ps(fft5985, fft5901, _mm512_shuffle_f32x4(fft5985, fft5985, 78));
__m512 fft5905 = _mm512_fmadd_ps(fft5898, fft5901, _mm512_shuffle_f32x4(fft5898, fft5898, 78));
__m512 fft5992 = _mm512_fmadd_ps(fft5986, fft5901, _mm512_shuffle_f32x4(fft5986, fft5986, 78));
__m512 fft5906 = _mm512_fmadd_ps(fft5890, fft5901, _mm512_shuffle_f32x4(fft5890, fft5890, 78));
__m512 fft5993 = _mm512_fmadd_ps(fft5978, fft5901, _mm512_shuffle_f32x4(fft5978, fft5978, 78));
__m512 fft5907 = _mm512_fmadd_ps(fft5892, fft5901, _mm512_shuffle_f32x4(fft5892, fft5892, 78));
__m512 fft5994 = _mm512_fmadd_ps(fft5980, fft5901, _mm512_shuffle_f32x4(fft5980, fft5980, 78));
__m512 fft5908 = _mm512_fmadd_ps(fft5899, fft5901, _mm512_shuffle_f32x4(fft5899, fft5899, 78));
__m512 fft5995 = _mm512_fmadd_ps(fft5987, fft5901, _mm512_shuffle_f32x4(fft5987, fft5987, 78));
__m512 fft5909 = _mm512_fmadd_ps(fft5900, fft5901, _mm512_shuffle_f32x4(fft5900, fft5900, 78));
__m512 fft5996 = _mm512_fmadd_ps(fft5988, fft5901, _mm512_shuffle_f32x4(fft5988, fft5988, 78));
__m512 fft5910 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5911 = _mm512_mul_ps(fft5902, fft5910);
__m512 fft5997 = _mm512_mul_ps(fft5989, fft5910);
__m512 fft5912 = _mm512_mul_ps(fft5903, fft5910);
__m512 fft5998 = _mm512_mul_ps(fft5990, fft5910);
__m512 fft5913 = _mm512_mul_ps(fft5904, fft5910);
__m512 fft5999 = _mm512_mul_ps(fft5991, fft5910);
__m512 fft5914 = _mm512_mul_ps(fft5905, fft5910);
__m512 fft6000 = _mm512_mul_ps(fft5992, fft5910);
__m512 fft5915 = _mm512_mul_ps(fft5906, fft5910);
__m512 fft6001 = _mm512_mul_ps(fft5993, fft5910);
__m512 fft5916 = _mm512_mul_ps(fft5907, fft5910);
__m512 fft6002 = _mm512_mul_ps(fft5994, fft5910);
__m512 fft5917 = _mm512_mul_ps(fft5908, fft5910);
__m512 fft6003 = _mm512_mul_ps(fft5995, fft5910);
__m512 fft5918 = _mm512_mul_ps(fft5909, fft5910);
__m512 fft6004 = _mm512_mul_ps(fft5996, fft5910);
__m512 fft5919 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft5920 = _mm512_fmadd_ps(fft5903, fft5919, fft5911);
__m512 fft6005 = _mm512_fmadd_ps(fft5990, fft5919, fft5997);
__m512 fft5921 = _mm512_fnmadd_ps(fft5902, fft5919, fft5912);
__m512 fft6006 = _mm512_fnmadd_ps(fft5989, fft5919, fft5998);
__m512 fft5922 = _mm512_fmadd_ps(fft5905, fft5919, fft5913);
__m512 fft6007 = _mm512_fmadd_ps(fft5992, fft5919, fft5999);
__m512 fft5923 = _mm512_fnmadd_ps(fft5904, fft5919, fft5914);
__m512 fft6008 = _mm512_fnmadd_ps(fft5991, fft5919, fft6000);
__m512 fft5924 = _mm512_fmadd_ps(fft5907, fft5919, fft5915);
__m512 fft6009 = _mm512_fmadd_ps(fft5994, fft5919, fft6001);
__m512 fft5925 = _mm512_fnmadd_ps(fft5906, fft5919, fft5916);
__m512 fft6010 = _mm512_fnmadd_ps(fft5993, fft5919, fft6002);
__m512 fft5926 = _mm512_fmadd_ps(fft5909, fft5919, fft5917);
__m512 fft6011 = _mm512_fmadd_ps(fft5996, fft5919, fft6003);
__m512 fft5927 = _mm512_fnmadd_ps(fft5908, fft5919, fft5918);
__m512 fft6012 = _mm512_fnmadd_ps(fft5995, fft5919, fft6004);
__m512 fft5928 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft5929 = _mm512_fmadd_ps(fft5920, fft5928, _mm512_shuffle_f32x4(fft5920, fft5920, 177));
__m512 fft6013 = _mm512_fmadd_ps(fft6005, fft5928, _mm512_shuffle_f32x4(fft6005, fft6005, 177));
__m512 fft5930 = _mm512_fmadd_ps(fft5921, fft5928, _mm512_shuffle_f32x4(fft5921, fft5921, 177));
__m512 fft6014 = _mm512_fmadd_ps(fft6006, fft5928, _mm512_shuffle_f32x4(fft6006, fft6006, 177));
__m512 fft5931 = _mm512_fmadd_ps(fft5922, fft5928, _mm512_shuffle_f32x4(fft5922, fft5922, 177));
__m512 fft6015 = _mm512_fmadd_ps(fft6007, fft5928, _mm512_shuffle_f32x4(fft6007, fft6007, 177));
__m512 fft5932 = _mm512_fmadd_ps(fft5923, fft5928, _mm512_shuffle_f32x4(fft5923, fft5923, 177));
__m512 fft6016 = _mm512_fmadd_ps(fft6008, fft5928, _mm512_shuffle_f32x4(fft6008, fft6008, 177));
__m512 fft5933 = _mm512_fmadd_ps(fft5924, fft5928, _mm512_shuffle_f32x4(fft5924, fft5924, 177));
__m512 fft6017 = _mm512_fmadd_ps(fft6009, fft5928, _mm512_shuffle_f32x4(fft6009, fft6009, 177));
__m512 fft5934 = _mm512_fmadd_ps(fft5925, fft5928, _mm512_shuffle_f32x4(fft5925, fft5925, 177));
__m512 fft6018 = _mm512_fmadd_ps(fft6010, fft5928, _mm512_shuffle_f32x4(fft6010, fft6010, 177));
__m512 fft5935 = _mm512_fmadd_ps(fft5926, fft5928, _mm512_shuffle_f32x4(fft5926, fft5926, 177));
__m512 fft6019 = _mm512_fmadd_ps(fft6011, fft5928, _mm512_shuffle_f32x4(fft6011, fft6011, 177));
__m512 fft5936 = _mm512_fmadd_ps(fft5927, fft5928, _mm512_shuffle_f32x4(fft5927, fft5927, 177));
__m512 fft6020 = _mm512_fmadd_ps(fft6012, fft5928, _mm512_shuffle_f32x4(fft6012, fft6012, 177));
__m512 fft5937 = _mm512_mask_mov_ps(fft5929, 49344, fft5930);
__m512 fft6021 = _mm512_mask_mov_ps(fft6013, 49344, fft6014);
__m512 fft5938 = _mm512_mask_sub_ps(fft5930, 49344, _mm512_setzero_ps(), fft5929);
__m512 fft6022 = _mm512_mask_sub_ps(fft6014, 49344, _mm512_setzero_ps(), fft6013);
__m512 fft5939 = _mm512_mask_mov_ps(fft5931, 49344, fft5932);
__m512 fft6023 = _mm512_mask_mov_ps(fft6015, 49344, fft6016);
__m512 fft5940 = _mm512_mask_sub_ps(fft5932, 49344, _mm512_setzero_ps(), fft5931);
__m512 fft6024 = _mm512_mask_sub_ps(fft6016, 49344, _mm512_setzero_ps(), fft6015);
__m512 fft5941 = _mm512_mask_mov_ps(fft5933, 49344, fft5934);
__m512 fft6025 = _mm512_mask_mov_ps(fft6017, 49344, fft6018);
__m512 fft5942 = _mm512_mask_sub_ps(fft5934, 49344, _mm512_setzero_ps(), fft5933);
__m512 fft6026 = _mm512_mask_sub_ps(fft6018, 49344, _mm512_setzero_ps(), fft6017);
__m512 fft5943 = _mm512_mask_mov_ps(fft5935, 49344, fft5936);
__m512 fft6027 = _mm512_mask_mov_ps(fft6019, 49344, fft6020);
__m512 fft5944 = _mm512_mask_sub_ps(fft5936, 49344, _mm512_setzero_ps(), fft5935);
__m512 fft6028 = _mm512_mask_sub_ps(fft6020, 49344, _mm512_setzero_ps(), fft6019);
__m512 fft5945 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft5946 = _mm512_fmadd_ps(fft5937, fft5945, _mm512_shuffle_ps(fft5937, fft5937, 78));
__m512 fft6029 = _mm512_fmadd_ps(fft6021, fft5945, _mm512_shuffle_ps(fft6021, fft6021, 78));
__m512 fft5947 = _mm512_fmadd_ps(fft5938, fft5945, _mm512_shuffle_ps(fft5938, fft5938, 78));
__m512 fft6030 = _mm512_fmadd_ps(fft6022, fft5945, _mm512_shuffle_ps(fft6022, fft6022, 78));
__m512 fft5948 = _mm512_fmadd_ps(fft5939, fft5945, _mm512_shuffle_ps(fft5939, fft5939, 78));
__m512 fft6031 = _mm512_fmadd_ps(fft6023, fft5945, _mm512_shuffle_ps(fft6023, fft6023, 78));
__m512 fft5949 = _mm512_fmadd_ps(fft5940, fft5945, _mm512_shuffle_ps(fft5940, fft5940, 78));
__m512 fft6032 = _mm512_fmadd_ps(fft6024, fft5945, _mm512_shuffle_ps(fft6024, fft6024, 78));
__m512 fft5950 = _mm512_fmadd_ps(fft5941, fft5945, _mm512_shuffle_ps(fft5941, fft5941, 78));
__m512 fft6033 = _mm512_fmadd_ps(fft6025, fft5945, _mm512_shuffle_ps(fft6025, fft6025, 78));
__m512 fft5951 = _mm512_fmadd_ps(fft5942, fft5945, _mm512_shuffle_ps(fft5942, fft5942, 78));
__m512 fft6034 = _mm512_fmadd_ps(fft6026, fft5945, _mm512_shuffle_ps(fft6026, fft6026, 78));
__m512 fft5952 = _mm512_fmadd_ps(fft5943, fft5945, _mm512_shuffle_ps(fft5943, fft5943, 78));
__m512 fft6035 = _mm512_fmadd_ps(fft6027, fft5945, _mm512_shuffle_ps(fft6027, fft6027, 78));
__m512 fft5953 = _mm512_fmadd_ps(fft5944, fft5945, _mm512_shuffle_ps(fft5944, fft5944, 78));
__m512 fft6036 = _mm512_fmadd_ps(fft6028, fft5945, _mm512_shuffle_ps(fft6028, fft6028, 78));
__m512i fft5954 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft5955 = _mm512_permutexvar_ps(fft5954, fft5946);
__m512 fft6037 = _mm512_permutexvar_ps(fft5954, fft6029);
__m512i fft5956 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft5957 = _mm512_permutexvar_ps(fft5956, fft5946);
__m512 fft6038 = _mm512_permutexvar_ps(fft5956, fft6029);
__m512 fft5958 = _mm512_permutexvar_ps(fft5954, fft5947);
__m512 fft6039 = _mm512_permutexvar_ps(fft5954, fft6030);
__m512 fft5959 = _mm512_permutexvar_ps(fft5956, fft5947);
__m512 fft6040 = _mm512_permutexvar_ps(fft5956, fft6030);
__m512 fft5960 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft5961 = _mm512_fmadd_ps(fft5955, fft5960, fft5957);
__m512 fft6041 = _mm512_fmadd_ps(fft6037, fft5960, fft6038);
__m512 fft5962 = _mm512_fnmadd_ps(fft5959, fft5960, fft5958);
__m512 fft6042 = _mm512_fnmadd_ps(fft6040, fft5960, fft6039);
__m512 fft5963 = _mm512_mask_mov_ps(fft5959, 21845, fft5961);
__m512 fft6043 = _mm512_mask_mov_ps(fft6040, 21845, fft6041);
__m512 fft5964 = _mm512_mask_mov_ps(fft5955, 43176, fft5961);
__m512 fft6044 = _mm512_mask_mov_ps(fft6037, 43176, fft6041);
__m512 fft5965 = _mm512_mask_mov_ps(fft5963, 43176, fft5962);
__m512 fft6045 = _mm512_mask_mov_ps(fft6043, 43176, fft6042);
__m512 fft5966 = _mm512_mask_mov_ps(fft5964, 22102, fft5962);
__m512 fft6046 = _mm512_mask_mov_ps(fft6044, 22102, fft6042);
__m512 fft5967 = _mm512_mask_mul_ps(fft5965, 64764, fft5965, _mm512_set1_ps(5e-01f));
__m512 fft6047 = _mm512_mask_mul_ps(fft6045, 64764, fft6045, _mm512_set1_ps(5e-01f));
__m512 fft5968 = _mm512_mask_mul_ps(fft5966, 64764, fft5966, _mm512_set1_ps(5e-01f));
__m512 fft6048 = _mm512_mask_mul_ps(fft6046, 64764, fft6046, _mm512_set1_ps(5e-01f));
__m512 df529 = fft5967;
__m512 df537 = fft6047;
__m512 df530 = fft5968;
__m512 df538 = fft6048;
__m512 df531 = fft5948;
__m512 df539 = fft6031;
__m512 df532 = fft5949;
__m512 df540 = fft6032;
__m512 df533 = fft5950;
__m512 df541 = fft6033;
__m512 df534 = fft5951;
__m512 df542 = fft6034;
__m512 df535 = fft5952;
__m512 df543 = fft6035;
__m512 df536 = fft5953;
__m512 df544 = fft6036;
__m512i eo36 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df531 = _mm512_permutexvar_ps(eo36, df531);
df532 = _mm512_permutexvar_ps(eo36, df532);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df531);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df532);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df531);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df532);
df539 = _mm512_permutexvar_ps(eo36, df539);
df540 = _mm512_permutexvar_ps(eo36, df540);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df539);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df540);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df539);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df540);
df533 = _mm512_permutexvar_ps(eo36, df533);
df534 = _mm512_permutexvar_ps(eo36, df534);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df533);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df534);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df533);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df534);
df541 = _mm512_permutexvar_ps(eo36, df541);
df542 = _mm512_permutexvar_ps(eo36, df542);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df541);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df542);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df541);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df542);
df535 = _mm512_permutexvar_ps(eo36, df535);
df536 = _mm512_permutexvar_ps(eo36, df536);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df535);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df536);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df535);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df536);
df543 = _mm512_permutexvar_ps(eo36, df543);
df544 = _mm512_permutexvar_ps(eo36, df544);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df543);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df544);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df543);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df544);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df529);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df530);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df529);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df530);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df537);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df538);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df537);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df538);
}
if (j2 >= last1) return;
++j2;
}
j2 = 84;
}
// Region starting at j2 == 84: rel3 is j2 measured from 84, and this branch is
// taken only when rel3 < 1. Both h17 and w17 are fixed at 210 here.
ptrdiff_t rel3 = j2-84;
ptrdiff_t base3 = 210;
if (rel3 < 1) {
ptrdiff_t h17 = base3+0;
ptrdiff_t w17 = 210;
// k18 walks three consecutive indices, 3*s1 .. 3*s1+2 (s1 is defined outside this excerpt).
ptrdiff_t k18 = 3*s1;
ptrdiff_t kk17 = k18+2;
for (; k18 <= kk17; ++k18) {
// First tile of this k18 iteration: b37 == 0, hence m37 == 0 and f38 == 0.
ptrdiff_t b37 = 0;
ptrdiff_t m37 = (size_t)b37/2;
ptrdiff_t f38 = (size_t)b37%2;
// Sixteen loads with mask 65535 (all 16 lanes enabled); each address is 896 past
// the previous one. The trailing 0*b37 term contributes nothing to the address.
// NOTE(review): 896 = 224*4, 200704 = 224*224*4 and 602112 = 3*200704 look like
// row / channel / image strides in bytes for 224x224x3 float data, which presumes
// datPtr1 is a byte pointer — confirm against its declaration (not visible here).
__m512 dat530 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat531 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat532 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat533 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat534 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat535 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat536 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat537 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat538 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat539 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat540 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat541 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat542 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat543 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat544 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k18+896*h17+4*w17+0*b37);
__m512 dat545 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k18+896*h17+4*w17+0*b37);
// Cross-vector butterflies, done lane-wise (no data moves between lanes here).
// Two independent chains are interleaved statement by statement:
//   chain A: dat530, dat532, ..., dat544 (every other load)  -> fft6049..fft6068
//   chain B: dat531, dat533, ..., dat545 (the loads between) -> fft6137..fft6156
// Writing x0..x7 for the eight inputs of a chain in that order, the results are
// the terms of an 8-point real-input DFT X[k] = sum_n x_n * exp(-2*pi*i*k*n/8):
//   fft6063 = X0, fft6064 = X4, fft6065/fft6066 = Re/Im X1,
//   fft6058/fft6060 = Re/Im X2, fft6067/fft6068 = Re/Im X3
// (chain B yields the same terms in fft6151, fft6152, fft6153/fft6154,
// fft6146/fft6148, fft6155/fft6156).
// First level: x_n +/- x_(n+4).
__m512 fft6049 = _mm512_add_ps(dat530, dat538);
__m512 fft6137 = _mm512_add_ps(dat531, dat539);
__m512 fft6050 = _mm512_sub_ps(dat530, dat538);
__m512 fft6138 = _mm512_sub_ps(dat531, dat539);
__m512 fft6051 = _mm512_add_ps(dat532, dat540);
__m512 fft6139 = _mm512_add_ps(dat533, dat541);
__m512 fft6052 = _mm512_sub_ps(dat532, dat540);
__m512 fft6140 = _mm512_sub_ps(dat533, dat541);
__m512 fft6053 = _mm512_add_ps(dat534, dat542);
__m512 fft6141 = _mm512_add_ps(dat535, dat543);
__m512 fft6054 = _mm512_sub_ps(dat534, dat542);
__m512 fft6142 = _mm512_sub_ps(dat535, dat543);
__m512 fft6055 = _mm512_add_ps(dat536, dat544);
__m512 fft6143 = _mm512_add_ps(dat537, dat545);
__m512 fft6056 = _mm512_sub_ps(dat536, dat544);
__m512 fft6144 = _mm512_sub_ps(dat537, dat545);
// Second level. Note the reversed operand order in fft6060/fft6148
// (fft6055 - fft6051), which gives the sign needed for Im X2.
__m512 fft6057 = _mm512_add_ps(fft6049, fft6053);
__m512 fft6145 = _mm512_add_ps(fft6137, fft6141);
__m512 fft6058 = _mm512_sub_ps(fft6049, fft6053);
__m512 fft6146 = _mm512_sub_ps(fft6137, fft6141);
__m512 fft6059 = _mm512_add_ps(fft6051, fft6055);
__m512 fft6147 = _mm512_add_ps(fft6139, fft6143);
__m512 fft6060 = _mm512_sub_ps(fft6055, fft6051);
__m512 fft6148 = _mm512_sub_ps(fft6143, fft6139);
__m512 fft6061 = _mm512_sub_ps(fft6052, fft6056);
__m512 fft6149 = _mm512_sub_ps(fft6140, fft6144);
__m512 fft6062 = _mm512_add_ps(fft6052, fft6056);
__m512 fft6150 = _mm512_add_ps(fft6140, fft6144);
// Third level: X0 and X4 are a plain sum and difference.
__m512 fft6063 = _mm512_add_ps(fft6057, fft6059);
__m512 fft6151 = _mm512_add_ps(fft6145, fft6147);
__m512 fft6064 = _mm512_sub_ps(fft6057, fft6059);
__m512 fft6152 = _mm512_sub_ps(fft6145, fft6147);
// X1 and X3 need the factor 7.0710677e-01f (about 1/sqrt(2)), folded into FMAs:
//   fmadd(a,b,c)  =  a*b + c     fnmadd(a,b,c) = -(a*b) + c
//   fnmsub(a,b,c) = -(a*b) - c
__m512 fft6065 = _mm512_fmadd_ps(fft6061, _mm512_set1_ps(7.0710677e-01f), fft6050);
__m512 fft6153 = _mm512_fmadd_ps(fft6149, _mm512_set1_ps(7.0710677e-01f), fft6138);
__m512 fft6066 = _mm512_fnmsub_ps(fft6062, _mm512_set1_ps(7.0710677e-01f), fft6054);
__m512 fft6154 = _mm512_fnmsub_ps(fft6150, _mm512_set1_ps(7.0710677e-01f), fft6142);
__m512 fft6067 = _mm512_fnmadd_ps(fft6061, _mm512_set1_ps(7.0710677e-01f), fft6050);
__m512 fft6155 = _mm512_fnmadd_ps(fft6149, _mm512_set1_ps(7.0710677e-01f), fft6138);
__m512 fft6068 = _mm512_fnmadd_ps(fft6062, _mm512_set1_ps(7.0710677e-01f), fft6054);
__m512 fft6156 = _mm512_fnmadd_ps(fft6150, _mm512_set1_ps(7.0710677e-01f), fft6142);
// Butterfly between the two 256-bit halves of each vector.
// fft6069 holds +1 in lanes 0-7 and -1 in lanes 8-15 (_mm512_set_ps lists lane 15
// first). _mm512_shuffle_f32x4(x, x, 78) swaps the lower and upper 256-bit halves.
// So for each input x the result is:
//   lanes 0-7 :  x[i] + x[i+8]
//   lanes 8-15:  x[i-8] - x[i]
__m512 fft6069 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6070 = _mm512_fmadd_ps(fft6063, fft6069, _mm512_shuffle_f32x4(fft6063, fft6063, 78));
__m512 fft6157 = _mm512_fmadd_ps(fft6151, fft6069, _mm512_shuffle_f32x4(fft6151, fft6151, 78));
__m512 fft6071 = _mm512_fmadd_ps(fft6064, fft6069, _mm512_shuffle_f32x4(fft6064, fft6064, 78));
__m512 fft6158 = _mm512_fmadd_ps(fft6152, fft6069, _mm512_shuffle_f32x4(fft6152, fft6152, 78));
__m512 fft6072 = _mm512_fmadd_ps(fft6065, fft6069, _mm512_shuffle_f32x4(fft6065, fft6065, 78));
__m512 fft6159 = _mm512_fmadd_ps(fft6153, fft6069, _mm512_shuffle_f32x4(fft6153, fft6153, 78));
__m512 fft6073 = _mm512_fmadd_ps(fft6066, fft6069, _mm512_shuffle_f32x4(fft6066, fft6066, 78));
__m512 fft6160 = _mm512_fmadd_ps(fft6154, fft6069, _mm512_shuffle_f32x4(fft6154, fft6154, 78));
__m512 fft6074 = _mm512_fmadd_ps(fft6058, fft6069, _mm512_shuffle_f32x4(fft6058, fft6058, 78));
__m512 fft6161 = _mm512_fmadd_ps(fft6146, fft6069, _mm512_shuffle_f32x4(fft6146, fft6146, 78));
__m512 fft6075 = _mm512_fmadd_ps(fft6060, fft6069, _mm512_shuffle_f32x4(fft6060, fft6060, 78));
__m512 fft6162 = _mm512_fmadd_ps(fft6148, fft6069, _mm512_shuffle_f32x4(fft6148, fft6148, 78));
__m512 fft6076 = _mm512_fmadd_ps(fft6067, fft6069, _mm512_shuffle_f32x4(fft6067, fft6067, 78));
__m512 fft6163 = _mm512_fmadd_ps(fft6155, fft6069, _mm512_shuffle_f32x4(fft6155, fft6155, 78));
__m512 fft6077 = _mm512_fmadd_ps(fft6068, fft6069, _mm512_shuffle_f32x4(fft6068, fft6068, 78));
__m512 fft6164 = _mm512_fmadd_ps(fft6156, fft6069, _mm512_shuffle_f32x4(fft6156, fft6156, 78));
// Per-lane rotation of consecutive vector pairs (a, b) = (fft6070, fft6071),
// (fft6072, fft6073), ... and likewise for the second chain (fft6157, fft6158), ...
// With w1 = fft6078 and w2 = fft6087 the pair becomes
//   a' = a*w1 + b*w2
//   b' = b*w1 - a*w2
// which equals (a + i*b) * (w1 - i*w2) if a/b are read as real/imaginary parts.
// Lane values (c = 7.0710677e-01f; _mm512_set_ps lists lane 15 first):
//   lanes  0-9 : w1 = 1,  w2 = 0   (pair left unchanged)
//   lanes 10-11: w1 = c,  w2 = c
//   lanes 12-13: w1 = 0,  w2 = 1
//   lanes 14-15: w1 = -c, w2 = c
// The products with w1 are formed first ...
__m512 fft6078 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6079 = _mm512_mul_ps(fft6070, fft6078);
__m512 fft6165 = _mm512_mul_ps(fft6157, fft6078);
__m512 fft6080 = _mm512_mul_ps(fft6071, fft6078);
__m512 fft6166 = _mm512_mul_ps(fft6158, fft6078);
__m512 fft6081 = _mm512_mul_ps(fft6072, fft6078);
__m512 fft6167 = _mm512_mul_ps(fft6159, fft6078);
__m512 fft6082 = _mm512_mul_ps(fft6073, fft6078);
__m512 fft6168 = _mm512_mul_ps(fft6160, fft6078);
__m512 fft6083 = _mm512_mul_ps(fft6074, fft6078);
__m512 fft6169 = _mm512_mul_ps(fft6161, fft6078);
__m512 fft6084 = _mm512_mul_ps(fft6075, fft6078);
__m512 fft6170 = _mm512_mul_ps(fft6162, fft6078);
__m512 fft6085 = _mm512_mul_ps(fft6076, fft6078);
__m512 fft6171 = _mm512_mul_ps(fft6163, fft6078);
__m512 fft6086 = _mm512_mul_ps(fft6077, fft6078);
__m512 fft6172 = _mm512_mul_ps(fft6164, fft6078);
// ... then the w2 cross terms are added (fmadd) or subtracted (fnmadd) on top.
__m512 fft6087 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft6088 = _mm512_fmadd_ps(fft6071, fft6087, fft6079);
__m512 fft6173 = _mm512_fmadd_ps(fft6158, fft6087, fft6165);
__m512 fft6089 = _mm512_fnmadd_ps(fft6070, fft6087, fft6080);
__m512 fft6174 = _mm512_fnmadd_ps(fft6157, fft6087, fft6166);
__m512 fft6090 = _mm512_fmadd_ps(fft6073, fft6087, fft6081);
__m512 fft6175 = _mm512_fmadd_ps(fft6160, fft6087, fft6167);
__m512 fft6091 = _mm512_fnmadd_ps(fft6072, fft6087, fft6082);
__m512 fft6176 = _mm512_fnmadd_ps(fft6159, fft6087, fft6168);
__m512 fft6092 = _mm512_fmadd_ps(fft6075, fft6087, fft6083);
__m512 fft6177 = _mm512_fmadd_ps(fft6162, fft6087, fft6169);
__m512 fft6093 = _mm512_fnmadd_ps(fft6074, fft6087, fft6084);
__m512 fft6178 = _mm512_fnmadd_ps(fft6161, fft6087, fft6170);
__m512 fft6094 = _mm512_fmadd_ps(fft6077, fft6087, fft6085);
__m512 fft6179 = _mm512_fmadd_ps(fft6164, fft6087, fft6171);
__m512 fft6095 = _mm512_fnmadd_ps(fft6076, fft6087, fft6086);
__m512 fft6180 = _mm512_fnmadd_ps(fft6163, fft6087, fft6172);
// Butterfly between adjacent 128-bit lanes, done separately in each 256-bit half.
// fft6096 holds +1 in lanes 0-3 and 8-11, -1 in lanes 4-7 and 12-15.
// _mm512_shuffle_f32x4(x, x, 177) swaps 128-bit lane 0 with 1 and lane 2 with 3.
// So within each group of eight lanes:
//   first four lanes :  x[i] + x[i+4]
//   second four lanes:  x[i-4] - x[i]
__m512 fft6096 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft6097 = _mm512_fmadd_ps(fft6088, fft6096, _mm512_shuffle_f32x4(fft6088, fft6088, 177));
__m512 fft6181 = _mm512_fmadd_ps(fft6173, fft6096, _mm512_shuffle_f32x4(fft6173, fft6173, 177));
__m512 fft6098 = _mm512_fmadd_ps(fft6089, fft6096, _mm512_shuffle_f32x4(fft6089, fft6089, 177));
__m512 fft6182 = _mm512_fmadd_ps(fft6174, fft6096, _mm512_shuffle_f32x4(fft6174, fft6174, 177));
__m512 fft6099 = _mm512_fmadd_ps(fft6090, fft6096, _mm512_shuffle_f32x4(fft6090, fft6090, 177));
__m512 fft6183 = _mm512_fmadd_ps(fft6175, fft6096, _mm512_shuffle_f32x4(fft6175, fft6175, 177));
__m512 fft6100 = _mm512_fmadd_ps(fft6091, fft6096, _mm512_shuffle_f32x4(fft6091, fft6091, 177));
__m512 fft6184 = _mm512_fmadd_ps(fft6176, fft6096, _mm512_shuffle_f32x4(fft6176, fft6176, 177));
__m512 fft6101 = _mm512_fmadd_ps(fft6092, fft6096, _mm512_shuffle_f32x4(fft6092, fft6092, 177));
__m512 fft6185 = _mm512_fmadd_ps(fft6177, fft6096, _mm512_shuffle_f32x4(fft6177, fft6177, 177));
__m512 fft6102 = _mm512_fmadd_ps(fft6093, fft6096, _mm512_shuffle_f32x4(fft6093, fft6093, 177));
__m512 fft6186 = _mm512_fmadd_ps(fft6178, fft6096, _mm512_shuffle_f32x4(fft6178, fft6178, 177));
__m512 fft6103 = _mm512_fmadd_ps(fft6094, fft6096, _mm512_shuffle_f32x4(fft6094, fft6094, 177));
__m512 fft6187 = _mm512_fmadd_ps(fft6179, fft6096, _mm512_shuffle_f32x4(fft6179, fft6179, 177));
__m512 fft6104 = _mm512_fmadd_ps(fft6095, fft6096, _mm512_shuffle_f32x4(fft6095, fft6095, 177));
__m512 fft6188 = _mm512_fmadd_ps(fft6180, fft6096, _mm512_shuffle_f32x4(fft6180, fft6180, 177));
// Masked fix-up of each vector pair (a, b) in lanes 6, 7, 14 and 15 only
// (mask 49344 = 0xC0C0); every other lane passes through unchanged.
//   mask_mov(a, m, b)    : selected lanes take b, the rest keep a
//   mask_sub(b, m, 0, a) : selected lanes take 0 - a, the rest keep b
// So in the selected lanes (a, b) becomes (b, -a), i.e. (a + i*b) * (-i) when
// a/b are read as real/imaginary parts.
__m512 fft6105 = _mm512_mask_mov_ps(fft6097, 49344, fft6098);
__m512 fft6189 = _mm512_mask_mov_ps(fft6181, 49344, fft6182);
__m512 fft6106 = _mm512_mask_sub_ps(fft6098, 49344, _mm512_setzero_ps(), fft6097);
__m512 fft6190 = _mm512_mask_sub_ps(fft6182, 49344, _mm512_setzero_ps(), fft6181);
__m512 fft6107 = _mm512_mask_mov_ps(fft6099, 49344, fft6100);
__m512 fft6191 = _mm512_mask_mov_ps(fft6183, 49344, fft6184);
__m512 fft6108 = _mm512_mask_sub_ps(fft6100, 49344, _mm512_setzero_ps(), fft6099);
__m512 fft6192 = _mm512_mask_sub_ps(fft6184, 49344, _mm512_setzero_ps(), fft6183);
__m512 fft6109 = _mm512_mask_mov_ps(fft6101, 49344, fft6102);
__m512 fft6193 = _mm512_mask_mov_ps(fft6185, 49344, fft6186);
__m512 fft6110 = _mm512_mask_sub_ps(fft6102, 49344, _mm512_setzero_ps(), fft6101);
__m512 fft6194 = _mm512_mask_sub_ps(fft6186, 49344, _mm512_setzero_ps(), fft6185);
__m512 fft6111 = _mm512_mask_mov_ps(fft6103, 49344, fft6104);
__m512 fft6195 = _mm512_mask_mov_ps(fft6187, 49344, fft6188);
__m512 fft6112 = _mm512_mask_sub_ps(fft6104, 49344, _mm512_setzero_ps(), fft6103);
__m512 fft6196 = _mm512_mask_sub_ps(fft6188, 49344, _mm512_setzero_ps(), fft6187);
// Butterfly between the two 64-bit halves of every 128-bit lane.
// fft6113 repeats the pattern (+1, +1, -1, -1) from lane 0 upward.
// _mm512_shuffle_ps(x, x, 78) swaps elements {0,1} with {2,3} inside each
// 128-bit lane. So within each group of four lanes:
//   elements 0-1:  x[i] + x[i+2]
//   elements 2-3:  x[i-2] - x[i]
__m512 fft6113 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft6114 = _mm512_fmadd_ps(fft6105, fft6113, _mm512_shuffle_ps(fft6105, fft6105, 78));
__m512 fft6197 = _mm512_fmadd_ps(fft6189, fft6113, _mm512_shuffle_ps(fft6189, fft6189, 78));
__m512 fft6115 = _mm512_fmadd_ps(fft6106, fft6113, _mm512_shuffle_ps(fft6106, fft6106, 78));
__m512 fft6198 = _mm512_fmadd_ps(fft6190, fft6113, _mm512_shuffle_ps(fft6190, fft6190, 78));
__m512 fft6116 = _mm512_fmadd_ps(fft6107, fft6113, _mm512_shuffle_ps(fft6107, fft6107, 78));
__m512 fft6199 = _mm512_fmadd_ps(fft6191, fft6113, _mm512_shuffle_ps(fft6191, fft6191, 78));
__m512 fft6117 = _mm512_fmadd_ps(fft6108, fft6113, _mm512_shuffle_ps(fft6108, fft6108, 78));
__m512 fft6200 = _mm512_fmadd_ps(fft6192, fft6113, _mm512_shuffle_ps(fft6192, fft6192, 78));
__m512 fft6118 = _mm512_fmadd_ps(fft6109, fft6113, _mm512_shuffle_ps(fft6109, fft6109, 78));
__m512 fft6201 = _mm512_fmadd_ps(fft6193, fft6113, _mm512_shuffle_ps(fft6193, fft6193, 78));
__m512 fft6119 = _mm512_fmadd_ps(fft6110, fft6113, _mm512_shuffle_ps(fft6110, fft6110, 78));
__m512 fft6202 = _mm512_fmadd_ps(fft6194, fft6113, _mm512_shuffle_ps(fft6194, fft6194, 78));
__m512 fft6120 = _mm512_fmadd_ps(fft6111, fft6113, _mm512_shuffle_ps(fft6111, fft6111, 78));
__m512 fft6203 = _mm512_fmadd_ps(fft6195, fft6113, _mm512_shuffle_ps(fft6195, fft6195, 78));
__m512 fft6121 = _mm512_fmadd_ps(fft6112, fft6113, _mm512_shuffle_ps(fft6112, fft6112, 78));
__m512 fft6204 = _mm512_fmadd_ps(fft6196, fft6113, _mm512_shuffle_ps(fft6196, fft6196, 78));
// Extra recombination applied only to the first vector pair of each chain
// (fft6114/fft6115 and fft6197/fft6198); the remaining pairs are not touched here.
// NOTE(review): this has the shape of the usual split step that separates the
// spectra of real sequences packed into one complex transform — confirm against
// the generator before relying on that reading.
// Two lane gathers per input (dst[i] = src[idx[i]]; _mm512_set_epi32 lists lane 15 first):
//   fft6122, lanes 0..15: 2,2,4,4,8,8,12,12, 3,3,5,5,9,9,13,13
//   fft6124, lanes 0..15: 0,0,6,6,14,14,10,10, 1,1,7,7,15,15,11,11
__m512i fft6122 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft6123 = _mm512_permutexvar_ps(fft6122, fft6114);
__m512 fft6205 = _mm512_permutexvar_ps(fft6122, fft6197);
__m512i fft6124 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft6125 = _mm512_permutexvar_ps(fft6124, fft6114);
__m512 fft6206 = _mm512_permutexvar_ps(fft6124, fft6197);
__m512 fft6126 = _mm512_permutexvar_ps(fft6122, fft6115);
__m512 fft6207 = _mm512_permutexvar_ps(fft6122, fft6198);
__m512 fft6127 = _mm512_permutexvar_ps(fft6124, fft6115);
__m512 fft6208 = _mm512_permutexvar_ps(fft6124, fft6198);
// fft6128 is 0 in lanes 0, 1, 8, 9; elsewhere +1 in even lanes and -1 in odd lanes.
//   fft6129 = fft6123*fft6128 + fft6125
//   fft6130 = fft6126 - fft6127*fft6128
__m512 fft6128 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft6129 = _mm512_fmadd_ps(fft6123, fft6128, fft6125);
__m512 fft6209 = _mm512_fmadd_ps(fft6205, fft6128, fft6206);
__m512 fft6130 = _mm512_fnmadd_ps(fft6127, fft6128, fft6126);
__m512 fft6210 = _mm512_fnmadd_ps(fft6208, fft6128, fft6207);
// Lane-wise blends assemble the two output vectors. Masks in hex:
//   21845 = 0x5555 (even lanes)
//   43176 = 0xA8A8 (lanes 3,5,7,11,13,15)
//   22102 = 0x5656 (lanes 1,2,4,6,9,10,12,14)
__m512 fft6131 = _mm512_mask_mov_ps(fft6127, 21845, fft6129);
__m512 fft6211 = _mm512_mask_mov_ps(fft6208, 21845, fft6209);
__m512 fft6132 = _mm512_mask_mov_ps(fft6123, 43176, fft6129);
__m512 fft6212 = _mm512_mask_mov_ps(fft6205, 43176, fft6209);
__m512 fft6133 = _mm512_mask_mov_ps(fft6131, 43176, fft6130);
__m512 fft6213 = _mm512_mask_mov_ps(fft6211, 43176, fft6210);
__m512 fft6134 = _mm512_mask_mov_ps(fft6132, 22102, fft6130);
__m512 fft6214 = _mm512_mask_mov_ps(fft6212, 22102, fft6210);
// Halve lanes 2-7 and 10-15 (mask 64764 = 0xFCFC); lanes 0, 1, 8, 9 keep their value.
__m512 fft6135 = _mm512_mask_mul_ps(fft6133, 64764, fft6133, _mm512_set1_ps(5e-01f));
__m512 fft6215 = _mm512_mask_mul_ps(fft6213, 64764, fft6213, _mm512_set1_ps(5e-01f));
__m512 fft6136 = _mm512_mask_mul_ps(fft6134, 64764, fft6134, _mm512_set1_ps(5e-01f));
__m512 fft6216 = _mm512_mask_mul_ps(fft6214, 64764, fft6214, _mm512_set1_ps(5e-01f));
// Write-out of the sixteen result vectors of this tile to dfPtr1.
// df545..df552 alias the first chain's results, df553..df560 the second chain's.
__m512 df545 = fft6135;
__m512 df553 = fft6215;
__m512 df546 = fft6136;
__m512 df554 = fft6216;
__m512 df547 = fft6116;
__m512 df555 = fft6199;
__m512 df548 = fft6117;
__m512 df556 = fft6200;
__m512 df549 = fft6118;
__m512 df557 = fft6201;
__m512 df550 = fft6119;
__m512 df558 = fft6202;
__m512 df551 = fft6120;
__m512 df559 = fft6203;
__m512 df552 = fft6121;
__m512 df560 = fft6204;
// eo37 gathers the even-numbered source lanes into lanes 0-7 and the odd-numbered
// ones into lanes 8-15. It is applied to every vector except df545, df546, df553
// and df554, which are stored as they are.
// Each vector is then written with two masked stores: mask 255 writes lanes 0-7
// at offset X, mask 65280 writes lanes 8-15 using offset X + 407008.
// NOTE(review): 407008 = 407040 - 32; if dfPtr1 is a byte pointer, lanes 8-15 start
// 32 bytes into the vector, so the upper half would land exactly 407040 bytes after
// the lower half's slot — confirm dfPtr1's type (declared outside this excerpt).
// The second chain's offsets equal the first chain's plus 814080 (= 2*407040).
__m512i eo37 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df547 = _mm512_permutexvar_ps(eo37, df547);
df548 = _mm512_permutexvar_ps(eo37, df548);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df547);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df548);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df547);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df548);
df555 = _mm512_permutexvar_ps(eo37, df555);
df556 = _mm512_permutexvar_ps(eo37, df556);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df555);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df556);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df555);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df556);
df549 = _mm512_permutexvar_ps(eo37, df549);
df550 = _mm512_permutexvar_ps(eo37, df550);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df549);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df550);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df549);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df550);
df557 = _mm512_permutexvar_ps(eo37, df557);
df558 = _mm512_permutexvar_ps(eo37, df558);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df557);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df558);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df557);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df558);
df551 = _mm512_permutexvar_ps(eo37, df551);
df552 = _mm512_permutexvar_ps(eo37, df552);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df551);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df552);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df551);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df552);
df559 = _mm512_permutexvar_ps(eo37, df559);
df560 = _mm512_permutexvar_ps(eo37, df560);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df559);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df560);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df559);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df560);
// The first pair of each chain goes to the lowest offsets (0 / 64 and 814080 / 814144),
// without the eo37 gather.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df545);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df546);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df545);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df546);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df553);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df554);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df553);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df554);
// Second tile of this k18 iteration: b38 == 1, hence m38 == 0 and f39 == 1.
ptrdiff_t b38 = 1;
ptrdiff_t m38 = (size_t)b38/2;
ptrdiff_t f39 = (size_t)b38%2;
// Sixteen partial loads: mask 127 (0x007F) reads lanes 0-6 only, and maskz zero-fills
// lanes 7-15. As 0*b38 contributes nothing, the constant offsets carry the tile's
// displacement: each is 40 larger than in the b37 == 0 loads (40, 936, 1832, ...),
// again stepping by 896 from one load to the next.
// NOTE(review): 40 would be 10 floats if datPtr1 is a byte pointer — confirm its type.
__m512 dat546 = _mm512_maskz_loadu_ps(127, datPtr1+40+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat547 = _mm512_maskz_loadu_ps(127, datPtr1+936+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat548 = _mm512_maskz_loadu_ps(127, datPtr1+1832+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat549 = _mm512_maskz_loadu_ps(127, datPtr1+2728+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat550 = _mm512_maskz_loadu_ps(127, datPtr1+3624+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat551 = _mm512_maskz_loadu_ps(127, datPtr1+4520+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat552 = _mm512_maskz_loadu_ps(127, datPtr1+5416+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat553 = _mm512_maskz_loadu_ps(127, datPtr1+6312+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat554 = _mm512_maskz_loadu_ps(127, datPtr1+7208+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat555 = _mm512_maskz_loadu_ps(127, datPtr1+8104+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat556 = _mm512_maskz_loadu_ps(127, datPtr1+9000+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat557 = _mm512_maskz_loadu_ps(127, datPtr1+9896+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat558 = _mm512_maskz_loadu_ps(127, datPtr1+10792+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat559 = _mm512_maskz_loadu_ps(127, datPtr1+11688+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat560 = _mm512_maskz_loadu_ps(127, datPtr1+12584+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 dat561 = _mm512_maskz_loadu_ps(127, datPtr1+13480+602112*i6+200704*k18+896*h17+4*w17+0*b38);
__m512 fft6217 = _mm512_add_ps(dat546, dat554);
__m512 fft6305 = _mm512_add_ps(dat547, dat555);
__m512 fft6218 = _mm512_sub_ps(dat546, dat554);
__m512 fft6306 = _mm512_sub_ps(dat547, dat555);
__m512 fft6219 = _mm512_add_ps(dat548, dat556);
__m512 fft6307 = _mm512_add_ps(dat549, dat557);
__m512 fft6220 = _mm512_sub_ps(dat548, dat556);
__m512 fft6308 = _mm512_sub_ps(dat549, dat557);
__m512 fft6221 = _mm512_add_ps(dat550, dat558);
__m512 fft6309 = _mm512_add_ps(dat551, dat559);
__m512 fft6222 = _mm512_sub_ps(dat550, dat558);
__m512 fft6310 = _mm512_sub_ps(dat551, dat559);
__m512 fft6223 = _mm512_add_ps(dat552, dat560);
__m512 fft6311 = _mm512_add_ps(dat553, dat561);
__m512 fft6224 = _mm512_sub_ps(dat552, dat560);
__m512 fft6312 = _mm512_sub_ps(dat553, dat561);
__m512 fft6225 = _mm512_add_ps(fft6217, fft6221);
__m512 fft6313 = _mm512_add_ps(fft6305, fft6309);
__m512 fft6226 = _mm512_sub_ps(fft6217, fft6221);
__m512 fft6314 = _mm512_sub_ps(fft6305, fft6309);
__m512 fft6227 = _mm512_add_ps(fft6219, fft6223);
__m512 fft6315 = _mm512_add_ps(fft6307, fft6311);
__m512 fft6228 = _mm512_sub_ps(fft6223, fft6219);
__m512 fft6316 = _mm512_sub_ps(fft6311, fft6307);
__m512 fft6229 = _mm512_sub_ps(fft6220, fft6224);
__m512 fft6317 = _mm512_sub_ps(fft6308, fft6312);
__m512 fft6230 = _mm512_add_ps(fft6220, fft6224);
__m512 fft6318 = _mm512_add_ps(fft6308, fft6312);
__m512 fft6231 = _mm512_add_ps(fft6225, fft6227);
__m512 fft6319 = _mm512_add_ps(fft6313, fft6315);
__m512 fft6232 = _mm512_sub_ps(fft6225, fft6227);
__m512 fft6320 = _mm512_sub_ps(fft6313, fft6315);
__m512 fft6233 = _mm512_fmadd_ps(fft6229, _mm512_set1_ps(7.0710677e-01f), fft6218);
__m512 fft6321 = _mm512_fmadd_ps(fft6317, _mm512_set1_ps(7.0710677e-01f), fft6306);
__m512 fft6234 = _mm512_fnmsub_ps(fft6230, _mm512_set1_ps(7.0710677e-01f), fft6222);
__m512 fft6322 = _mm512_fnmsub_ps(fft6318, _mm512_set1_ps(7.0710677e-01f), fft6310);
__m512 fft6235 = _mm512_fnmadd_ps(fft6229, _mm512_set1_ps(7.0710677e-01f), fft6218);
__m512 fft6323 = _mm512_fnmadd_ps(fft6317, _mm512_set1_ps(7.0710677e-01f), fft6306);
__m512 fft6236 = _mm512_fnmadd_ps(fft6230, _mm512_set1_ps(7.0710677e-01f), fft6222);
__m512 fft6324 = _mm512_fnmadd_ps(fft6318, _mm512_set1_ps(7.0710677e-01f), fft6310);
__m512 fft6237 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6238 = _mm512_fmadd_ps(fft6231, fft6237, _mm512_shuffle_f32x4(fft6231, fft6231, 78));
__m512 fft6325 = _mm512_fmadd_ps(fft6319, fft6237, _mm512_shuffle_f32x4(fft6319, fft6319, 78));
__m512 fft6239 = _mm512_fmadd_ps(fft6232, fft6237, _mm512_shuffle_f32x4(fft6232, fft6232, 78));
__m512 fft6326 = _mm512_fmadd_ps(fft6320, fft6237, _mm512_shuffle_f32x4(fft6320, fft6320, 78));
__m512 fft6240 = _mm512_fmadd_ps(fft6233, fft6237, _mm512_shuffle_f32x4(fft6233, fft6233, 78));
__m512 fft6327 = _mm512_fmadd_ps(fft6321, fft6237, _mm512_shuffle_f32x4(fft6321, fft6321, 78));
__m512 fft6241 = _mm512_fmadd_ps(fft6234, fft6237, _mm512_shuffle_f32x4(fft6234, fft6234, 78));
__m512 fft6328 = _mm512_fmadd_ps(fft6322, fft6237, _mm512_shuffle_f32x4(fft6322, fft6322, 78));
__m512 fft6242 = _mm512_fmadd_ps(fft6226, fft6237, _mm512_shuffle_f32x4(fft6226, fft6226, 78));
__m512 fft6329 = _mm512_fmadd_ps(fft6314, fft6237, _mm512_shuffle_f32x4(fft6314, fft6314, 78));
__m512 fft6243 = _mm512_fmadd_ps(fft6228, fft6237, _mm512_shuffle_f32x4(fft6228, fft6228, 78));
__m512 fft6330 = _mm512_fmadd_ps(fft6316, fft6237, _mm512_shuffle_f32x4(fft6316, fft6316, 78));
__m512 fft6244 = _mm512_fmadd_ps(fft6235, fft6237, _mm512_shuffle_f32x4(fft6235, fft6235, 78));
__m512 fft6331 = _mm512_fmadd_ps(fft6323, fft6237, _mm512_shuffle_f32x4(fft6323, fft6323, 78));
__m512 fft6245 = _mm512_fmadd_ps(fft6236, fft6237, _mm512_shuffle_f32x4(fft6236, fft6236, 78));
__m512 fft6332 = _mm512_fmadd_ps(fft6324, fft6237, _mm512_shuffle_f32x4(fft6324, fft6324, 78));
__m512 fft6246 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6247 = _mm512_mul_ps(fft6238, fft6246);
__m512 fft6333 = _mm512_mul_ps(fft6325, fft6246);
__m512 fft6248 = _mm512_mul_ps(fft6239, fft6246);
__m512 fft6334 = _mm512_mul_ps(fft6326, fft6246);
__m512 fft6249 = _mm512_mul_ps(fft6240, fft6246);
__m512 fft6335 = _mm512_mul_ps(fft6327, fft6246);
__m512 fft6250 = _mm512_mul_ps(fft6241, fft6246);
__m512 fft6336 = _mm512_mul_ps(fft6328, fft6246);
__m512 fft6251 = _mm512_mul_ps(fft6242, fft6246);
__m512 fft6337 = _mm512_mul_ps(fft6329, fft6246);
__m512 fft6252 = _mm512_mul_ps(fft6243, fft6246);
__m512 fft6338 = _mm512_mul_ps(fft6330, fft6246);
__m512 fft6253 = _mm512_mul_ps(fft6244, fft6246);
__m512 fft6339 = _mm512_mul_ps(fft6331, fft6246);
__m512 fft6254 = _mm512_mul_ps(fft6245, fft6246);
__m512 fft6340 = _mm512_mul_ps(fft6332, fft6246);
__m512 fft6255 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft6256 = _mm512_fmadd_ps(fft6239, fft6255, fft6247);
__m512 fft6341 = _mm512_fmadd_ps(fft6326, fft6255, fft6333);
__m512 fft6257 = _mm512_fnmadd_ps(fft6238, fft6255, fft6248);
__m512 fft6342 = _mm512_fnmadd_ps(fft6325, fft6255, fft6334);
__m512 fft6258 = _mm512_fmadd_ps(fft6241, fft6255, fft6249);
__m512 fft6343 = _mm512_fmadd_ps(fft6328, fft6255, fft6335);
__m512 fft6259 = _mm512_fnmadd_ps(fft6240, fft6255, fft6250);
__m512 fft6344 = _mm512_fnmadd_ps(fft6327, fft6255, fft6336);
__m512 fft6260 = _mm512_fmadd_ps(fft6243, fft6255, fft6251);
__m512 fft6345 = _mm512_fmadd_ps(fft6330, fft6255, fft6337);
__m512 fft6261 = _mm512_fnmadd_ps(fft6242, fft6255, fft6252);
__m512 fft6346 = _mm512_fnmadd_ps(fft6329, fft6255, fft6338);
__m512 fft6262 = _mm512_fmadd_ps(fft6245, fft6255, fft6253);
__m512 fft6347 = _mm512_fmadd_ps(fft6332, fft6255, fft6339);
__m512 fft6263 = _mm512_fnmadd_ps(fft6244, fft6255, fft6254);
__m512 fft6348 = _mm512_fnmadd_ps(fft6331, fft6255, fft6340);
__m512 fft6264 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft6265 = _mm512_fmadd_ps(fft6256, fft6264, _mm512_shuffle_f32x4(fft6256, fft6256, 177));
__m512 fft6349 = _mm512_fmadd_ps(fft6341, fft6264, _mm512_shuffle_f32x4(fft6341, fft6341, 177));
__m512 fft6266 = _mm512_fmadd_ps(fft6257, fft6264, _mm512_shuffle_f32x4(fft6257, fft6257, 177));
__m512 fft6350 = _mm512_fmadd_ps(fft6342, fft6264, _mm512_shuffle_f32x4(fft6342, fft6342, 177));
__m512 fft6267 = _mm512_fmadd_ps(fft6258, fft6264, _mm512_shuffle_f32x4(fft6258, fft6258, 177));
__m512 fft6351 = _mm512_fmadd_ps(fft6343, fft6264, _mm512_shuffle_f32x4(fft6343, fft6343, 177));
__m512 fft6268 = _mm512_fmadd_ps(fft6259, fft6264, _mm512_shuffle_f32x4(fft6259, fft6259, 177));
__m512 fft6352 = _mm512_fmadd_ps(fft6344, fft6264, _mm512_shuffle_f32x4(fft6344, fft6344, 177));
__m512 fft6269 = _mm512_fmadd_ps(fft6260, fft6264, _mm512_shuffle_f32x4(fft6260, fft6260, 177));
__m512 fft6353 = _mm512_fmadd_ps(fft6345, fft6264, _mm512_shuffle_f32x4(fft6345, fft6345, 177));
__m512 fft6270 = _mm512_fmadd_ps(fft6261, fft6264, _mm512_shuffle_f32x4(fft6261, fft6261, 177));
__m512 fft6354 = _mm512_fmadd_ps(fft6346, fft6264, _mm512_shuffle_f32x4(fft6346, fft6346, 177));
__m512 fft6271 = _mm512_fmadd_ps(fft6262, fft6264, _mm512_shuffle_f32x4(fft6262, fft6262, 177));
__m512 fft6355 = _mm512_fmadd_ps(fft6347, fft6264, _mm512_shuffle_f32x4(fft6347, fft6347, 177));
__m512 fft6272 = _mm512_fmadd_ps(fft6263, fft6264, _mm512_shuffle_f32x4(fft6263, fft6263, 177));
__m512 fft6356 = _mm512_fmadd_ps(fft6348, fft6264, _mm512_shuffle_f32x4(fft6348, fft6348, 177));
__m512 fft6273 = _mm512_mask_mov_ps(fft6265, 49344, fft6266);
__m512 fft6357 = _mm512_mask_mov_ps(fft6349, 49344, fft6350);
__m512 fft6274 = _mm512_mask_sub_ps(fft6266, 49344, _mm512_setzero_ps(), fft6265);
__m512 fft6358 = _mm512_mask_sub_ps(fft6350, 49344, _mm512_setzero_ps(), fft6349);
__m512 fft6275 = _mm512_mask_mov_ps(fft6267, 49344, fft6268);
__m512 fft6359 = _mm512_mask_mov_ps(fft6351, 49344, fft6352);
__m512 fft6276 = _mm512_mask_sub_ps(fft6268, 49344, _mm512_setzero_ps(), fft6267);
__m512 fft6360 = _mm512_mask_sub_ps(fft6352, 49344, _mm512_setzero_ps(), fft6351);
__m512 fft6277 = _mm512_mask_mov_ps(fft6269, 49344, fft6270);
__m512 fft6361 = _mm512_mask_mov_ps(fft6353, 49344, fft6354);
__m512 fft6278 = _mm512_mask_sub_ps(fft6270, 49344, _mm512_setzero_ps(), fft6269);
__m512 fft6362 = _mm512_mask_sub_ps(fft6354, 49344, _mm512_setzero_ps(), fft6353);
__m512 fft6279 = _mm512_mask_mov_ps(fft6271, 49344, fft6272);
__m512 fft6363 = _mm512_mask_mov_ps(fft6355, 49344, fft6356);
__m512 fft6280 = _mm512_mask_sub_ps(fft6272, 49344, _mm512_setzero_ps(), fft6271);
__m512 fft6364 = _mm512_mask_sub_ps(fft6356, 49344, _mm512_setzero_ps(), fft6355);
__m512 fft6281 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft6282 = _mm512_fmadd_ps(fft6273, fft6281, _mm512_shuffle_ps(fft6273, fft6273, 78));
__m512 fft6365 = _mm512_fmadd_ps(fft6357, fft6281, _mm512_shuffle_ps(fft6357, fft6357, 78));
__m512 fft6283 = _mm512_fmadd_ps(fft6274, fft6281, _mm512_shuffle_ps(fft6274, fft6274, 78));
__m512 fft6366 = _mm512_fmadd_ps(fft6358, fft6281, _mm512_shuffle_ps(fft6358, fft6358, 78));
__m512 fft6284 = _mm512_fmadd_ps(fft6275, fft6281, _mm512_shuffle_ps(fft6275, fft6275, 78));
__m512 fft6367 = _mm512_fmadd_ps(fft6359, fft6281, _mm512_shuffle_ps(fft6359, fft6359, 78));
__m512 fft6285 = _mm512_fmadd_ps(fft6276, fft6281, _mm512_shuffle_ps(fft6276, fft6276, 78));
__m512 fft6368 = _mm512_fmadd_ps(fft6360, fft6281, _mm512_shuffle_ps(fft6360, fft6360, 78));
__m512 fft6286 = _mm512_fmadd_ps(fft6277, fft6281, _mm512_shuffle_ps(fft6277, fft6277, 78));
__m512 fft6369 = _mm512_fmadd_ps(fft6361, fft6281, _mm512_shuffle_ps(fft6361, fft6361, 78));
__m512 fft6287 = _mm512_fmadd_ps(fft6278, fft6281, _mm512_shuffle_ps(fft6278, fft6278, 78));
__m512 fft6370 = _mm512_fmadd_ps(fft6362, fft6281, _mm512_shuffle_ps(fft6362, fft6362, 78));
__m512 fft6288 = _mm512_fmadd_ps(fft6279, fft6281, _mm512_shuffle_ps(fft6279, fft6279, 78));
__m512 fft6371 = _mm512_fmadd_ps(fft6363, fft6281, _mm512_shuffle_ps(fft6363, fft6363, 78));
__m512 fft6289 = _mm512_fmadd_ps(fft6280, fft6281, _mm512_shuffle_ps(fft6280, fft6280, 78));
__m512 fft6372 = _mm512_fmadd_ps(fft6364, fft6281, _mm512_shuffle_ps(fft6364, fft6364, 78));
__m512i fft6290 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft6291 = _mm512_permutexvar_ps(fft6290, fft6282);
__m512 fft6373 = _mm512_permutexvar_ps(fft6290, fft6365);
__m512i fft6292 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft6293 = _mm512_permutexvar_ps(fft6292, fft6282);
__m512 fft6374 = _mm512_permutexvar_ps(fft6292, fft6365);
__m512 fft6294 = _mm512_permutexvar_ps(fft6290, fft6283);
__m512 fft6375 = _mm512_permutexvar_ps(fft6290, fft6366);
__m512 fft6295 = _mm512_permutexvar_ps(fft6292, fft6283);
__m512 fft6376 = _mm512_permutexvar_ps(fft6292, fft6366);
__m512 fft6296 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft6297 = _mm512_fmadd_ps(fft6291, fft6296, fft6293);
__m512 fft6377 = _mm512_fmadd_ps(fft6373, fft6296, fft6374);
__m512 fft6298 = _mm512_fnmadd_ps(fft6295, fft6296, fft6294);
__m512 fft6378 = _mm512_fnmadd_ps(fft6376, fft6296, fft6375);
__m512 fft6299 = _mm512_mask_mov_ps(fft6295, 21845, fft6297);
__m512 fft6379 = _mm512_mask_mov_ps(fft6376, 21845, fft6377);
__m512 fft6300 = _mm512_mask_mov_ps(fft6291, 43176, fft6297);
__m512 fft6380 = _mm512_mask_mov_ps(fft6373, 43176, fft6377);
__m512 fft6301 = _mm512_mask_mov_ps(fft6299, 43176, fft6298);
__m512 fft6381 = _mm512_mask_mov_ps(fft6379, 43176, fft6378);
__m512 fft6302 = _mm512_mask_mov_ps(fft6300, 22102, fft6298);
__m512 fft6382 = _mm512_mask_mov_ps(fft6380, 22102, fft6378);
__m512 fft6303 = _mm512_mask_mul_ps(fft6301, 64764, fft6301, _mm512_set1_ps(5e-01f));
__m512 fft6383 = _mm512_mask_mul_ps(fft6381, 64764, fft6381, _mm512_set1_ps(5e-01f));
__m512 fft6304 = _mm512_mask_mul_ps(fft6302, 64764, fft6302, _mm512_set1_ps(5e-01f));
__m512 fft6384 = _mm512_mask_mul_ps(fft6382, 64764, fft6382, _mm512_set1_ps(5e-01f));
__m512 df561 = fft6303;
__m512 df569 = fft6383;
__m512 df562 = fft6304;
__m512 df570 = fft6384;
__m512 df563 = fft6284;
__m512 df571 = fft6367;
__m512 df564 = fft6285;
__m512 df572 = fft6368;
__m512 df565 = fft6286;
__m512 df573 = fft6369;
__m512 df566 = fft6287;
__m512 df574 = fft6370;
__m512 df567 = fft6288;
__m512 df575 = fft6371;
__m512 df568 = fft6289;
__m512 df576 = fft6372;
__m512i eo38 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df563 = _mm512_permutexvar_ps(eo38, df563);
df564 = _mm512_permutexvar_ps(eo38, df564);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df563);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df564);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df563);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df564);
df571 = _mm512_permutexvar_ps(eo38, df571);
df572 = _mm512_permutexvar_ps(eo38, df572);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df571);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df572);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df571);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df572);
df565 = _mm512_permutexvar_ps(eo38, df565);
df566 = _mm512_permutexvar_ps(eo38, df566);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df565);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df566);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df565);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df566);
df573 = _mm512_permutexvar_ps(eo38, df573);
df574 = _mm512_permutexvar_ps(eo38, df574);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df573);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df574);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df573);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df574);
df567 = _mm512_permutexvar_ps(eo38, df567);
df568 = _mm512_permutexvar_ps(eo38, df568);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df567);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df568);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df567);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df568);
df575 = _mm512_permutexvar_ps(eo38, df575);
df576 = _mm512_permutexvar_ps(eo38, df576);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df575);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df576);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df575);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df576);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df561);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df562);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df561);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df562);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df569);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df570);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df569);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df570);
// ---------------------------------------------------------------------------
// Unrolled unit for the fixed index b39 = 2.
// Reads seven vectors from datPtr1, runs them through a fixed sequence of
// add/sub/FMA/shuffle/permute steps (the fft* temporaries), and writes the
// sixteen resulting vectors (df577..df592) to dfPtr1 with half-vector masked
// stores.
// NOTE(review): the fft*/df* naming suggests a forward real-to-complex FFT
// of an input tile for Fourier-domain convolution -- confirm against the
// generator; only the lane-level mechanics below are taken from the code.
// ---------------------------------------------------------------------------
ptrdiff_t b39 = 2;
// With b39 == 2 these evaluate to m39 == 1 and f40 == 0; they are only used
// in the dfPtr1 store offsets (128*m39 + 32*f40) further down.
ptrdiff_t m39 = (size_t)b39/2;
ptrdiff_t f40 = (size_t)b39%2;
// Seven zero-masked loads whose constant offsets step by 896. Mask 65528
// (0xFFF8) zeroes lanes 0..2 and loads lanes 3..15. The trailing "0*b39"
// term contributes nothing to the address.
// NOTE(review): offsets are in units of datPtr1's pointee type, which is
// declared outside this view.
__m512 dat562 = _mm512_maskz_loadu_ps(65528, datPtr1+8120+602112*i6+200704*k18+896*h17+4*w17+0*b39);
__m512 dat563 = _mm512_maskz_loadu_ps(65528, datPtr1+9016+602112*i6+200704*k18+896*h17+4*w17+0*b39);
__m512 dat564 = _mm512_maskz_loadu_ps(65528, datPtr1+9912+602112*i6+200704*k18+896*h17+4*w17+0*b39);
__m512 dat565 = _mm512_maskz_loadu_ps(65528, datPtr1+10808+602112*i6+200704*k18+896*h17+4*w17+0*b39);
__m512 dat566 = _mm512_maskz_loadu_ps(65528, datPtr1+11704+602112*i6+200704*k18+896*h17+4*w17+0*b39);
__m512 dat567 = _mm512_maskz_loadu_ps(65528, datPtr1+12600+602112*i6+200704*k18+896*h17+4*w17+0*b39);
__m512 dat568 = _mm512_maskz_loadu_ps(65528, datPtr1+13496+602112*i6+200704*k18+896*h17+4*w17+0*b39);
// Two interleaved chains are computed in parallel from here on:
//   chain A (fft6385..fft6472) is fed by dat562, dat564, dat566, dat568;
//   chain B (fft6473..fft6552) is fed by dat563, dat565, dat567 and an
//   all-zero vector (there is no eighth load).
// First step: each input is paired with a zero vector, giving x+0 and x-0.
__m512 fft6385 = _mm512_add_ps(dat562, _mm512_setzero_ps());
__m512 fft6473 = _mm512_add_ps(dat563, _mm512_setzero_ps());
__m512 fft6386 = _mm512_sub_ps(dat562, _mm512_setzero_ps());
__m512 fft6474 = _mm512_sub_ps(dat563, _mm512_setzero_ps());
__m512 fft6387 = _mm512_add_ps(dat564, _mm512_setzero_ps());
__m512 fft6475 = _mm512_add_ps(dat565, _mm512_setzero_ps());
__m512 fft6388 = _mm512_sub_ps(dat564, _mm512_setzero_ps());
__m512 fft6476 = _mm512_sub_ps(dat565, _mm512_setzero_ps());
__m512 fft6389 = _mm512_add_ps(dat566, _mm512_setzero_ps());
__m512 fft6477 = _mm512_add_ps(dat567, _mm512_setzero_ps());
__m512 fft6390 = _mm512_sub_ps(dat566, _mm512_setzero_ps());
__m512 fft6478 = _mm512_sub_ps(dat567, _mm512_setzero_ps());
__m512 fft6391 = _mm512_add_ps(dat568, _mm512_setzero_ps());
// Chain B's fourth input is zero, so these two vectors are all zeros.
__m512 fft6479 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6392 = _mm512_sub_ps(dat568, _mm512_setzero_ps());
__m512 fft6480 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
// Second step: sums and differences across the vectors produced above
// (whole-vector operations; no lane movement yet).
__m512 fft6393 = _mm512_add_ps(fft6385, fft6389);
__m512 fft6481 = _mm512_add_ps(fft6473, fft6477);
__m512 fft6394 = _mm512_sub_ps(fft6385, fft6389);
__m512 fft6482 = _mm512_sub_ps(fft6473, fft6477);
__m512 fft6395 = _mm512_add_ps(fft6387, fft6391);
__m512 fft6483 = _mm512_add_ps(fft6475, fft6479);
__m512 fft6396 = _mm512_sub_ps(fft6391, fft6387);
__m512 fft6484 = _mm512_sub_ps(fft6479, fft6475);
__m512 fft6397 = _mm512_sub_ps(fft6388, fft6392);
__m512 fft6485 = _mm512_sub_ps(fft6476, fft6480);
__m512 fft6398 = _mm512_add_ps(fft6388, fft6392);
__m512 fft6486 = _mm512_add_ps(fft6476, fft6480);
// Third step: final whole-vector combinations. The four FMA variants scale
// one operand by 7.0710677e-01f before adding/subtracting:
//   fmadd  =  a*c + b      fnmadd = -(a*c) + b      fnmsub = -(a*c) - b
__m512 fft6399 = _mm512_add_ps(fft6393, fft6395);
__m512 fft6487 = _mm512_add_ps(fft6481, fft6483);
__m512 fft6400 = _mm512_sub_ps(fft6393, fft6395);
__m512 fft6488 = _mm512_sub_ps(fft6481, fft6483);
__m512 fft6401 = _mm512_fmadd_ps(fft6397, _mm512_set1_ps(7.0710677e-01f), fft6386);
__m512 fft6489 = _mm512_fmadd_ps(fft6485, _mm512_set1_ps(7.0710677e-01f), fft6474);
__m512 fft6402 = _mm512_fnmsub_ps(fft6398, _mm512_set1_ps(7.0710677e-01f), fft6390);
__m512 fft6490 = _mm512_fnmsub_ps(fft6486, _mm512_set1_ps(7.0710677e-01f), fft6478);
__m512 fft6403 = _mm512_fnmadd_ps(fft6397, _mm512_set1_ps(7.0710677e-01f), fft6386);
__m512 fft6491 = _mm512_fnmadd_ps(fft6485, _mm512_set1_ps(7.0710677e-01f), fft6474);
__m512 fft6404 = _mm512_fnmadd_ps(fft6398, _mm512_set1_ps(7.0710677e-01f), fft6390);
__m512 fft6492 = _mm512_fnmadd_ps(fft6486, _mm512_set1_ps(7.0710677e-01f), fft6478);
// Within-vector step over a distance of 8 lanes. fft6405 is +1 in lanes 0..7
// and -1 in lanes 8..15 (_mm512_set_ps lists lane 15 first). shuffle_f32x4
// with imm 78 swaps the two 256-bit halves, so each result holds
// lo+hi in lanes 0..7 and lo-hi in lanes 8..15.
__m512 fft6405 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6406 = _mm512_fmadd_ps(fft6399, fft6405, _mm512_shuffle_f32x4(fft6399, fft6399, 78));
__m512 fft6493 = _mm512_fmadd_ps(fft6487, fft6405, _mm512_shuffle_f32x4(fft6487, fft6487, 78));
__m512 fft6407 = _mm512_fmadd_ps(fft6400, fft6405, _mm512_shuffle_f32x4(fft6400, fft6400, 78));
__m512 fft6494 = _mm512_fmadd_ps(fft6488, fft6405, _mm512_shuffle_f32x4(fft6488, fft6488, 78));
__m512 fft6408 = _mm512_fmadd_ps(fft6401, fft6405, _mm512_shuffle_f32x4(fft6401, fft6401, 78));
__m512 fft6495 = _mm512_fmadd_ps(fft6489, fft6405, _mm512_shuffle_f32x4(fft6489, fft6489, 78));
__m512 fft6409 = _mm512_fmadd_ps(fft6402, fft6405, _mm512_shuffle_f32x4(fft6402, fft6402, 78));
__m512 fft6496 = _mm512_fmadd_ps(fft6490, fft6405, _mm512_shuffle_f32x4(fft6490, fft6490, 78));
__m512 fft6410 = _mm512_fmadd_ps(fft6394, fft6405, _mm512_shuffle_f32x4(fft6394, fft6394, 78));
__m512 fft6497 = _mm512_fmadd_ps(fft6482, fft6405, _mm512_shuffle_f32x4(fft6482, fft6482, 78));
__m512 fft6411 = _mm512_fmadd_ps(fft6396, fft6405, _mm512_shuffle_f32x4(fft6396, fft6396, 78));
__m512 fft6498 = _mm512_fmadd_ps(fft6484, fft6405, _mm512_shuffle_f32x4(fft6484, fft6484, 78));
__m512 fft6412 = _mm512_fmadd_ps(fft6403, fft6405, _mm512_shuffle_f32x4(fft6403, fft6403, 78));
__m512 fft6499 = _mm512_fmadd_ps(fft6491, fft6405, _mm512_shuffle_f32x4(fft6491, fft6491, 78));
__m512 fft6413 = _mm512_fmadd_ps(fft6404, fft6405, _mm512_shuffle_f32x4(fft6404, fft6404, 78));
__m512 fft6500 = _mm512_fmadd_ps(fft6492, fft6405, _mm512_shuffle_f32x4(fft6492, fft6492, 78));
// Per-lane scaling, part 1: multiply every vector by fft6414, which is 1 in
// lanes 0..9, 0.70710677 in lanes 10..11, 0 in lanes 12..13 and -0.70710677
// in lanes 14..15.
__m512 fft6414 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6415 = _mm512_mul_ps(fft6406, fft6414);
__m512 fft6501 = _mm512_mul_ps(fft6493, fft6414);
__m512 fft6416 = _mm512_mul_ps(fft6407, fft6414);
__m512 fft6502 = _mm512_mul_ps(fft6494, fft6414);
__m512 fft6417 = _mm512_mul_ps(fft6408, fft6414);
__m512 fft6503 = _mm512_mul_ps(fft6495, fft6414);
__m512 fft6418 = _mm512_mul_ps(fft6409, fft6414);
__m512 fft6504 = _mm512_mul_ps(fft6496, fft6414);
__m512 fft6419 = _mm512_mul_ps(fft6410, fft6414);
__m512 fft6505 = _mm512_mul_ps(fft6497, fft6414);
__m512 fft6420 = _mm512_mul_ps(fft6411, fft6414);
__m512 fft6506 = _mm512_mul_ps(fft6498, fft6414);
__m512 fft6421 = _mm512_mul_ps(fft6412, fft6414);
__m512 fft6507 = _mm512_mul_ps(fft6499, fft6414);
__m512 fft6422 = _mm512_mul_ps(fft6413, fft6414);
__m512 fft6508 = _mm512_mul_ps(fft6500, fft6414);
// Per-lane scaling, part 2: vectors are handled in (p, q) pairs. With
// fft6423 = 0 in lanes 0..9, 0.70710677 in lanes 10..11, 1 in lanes 12..13
// and 0.70710677 in lanes 14..15, each pair becomes
//   p' = p*fft6414 + q*fft6423        (fmadd)
//   q' = q*fft6414 - p*fft6423        (fnmadd)
// Lanes 0..9 therefore pass through unchanged.
__m512 fft6423 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft6424 = _mm512_fmadd_ps(fft6407, fft6423, fft6415);
__m512 fft6509 = _mm512_fmadd_ps(fft6494, fft6423, fft6501);
__m512 fft6425 = _mm512_fnmadd_ps(fft6406, fft6423, fft6416);
__m512 fft6510 = _mm512_fnmadd_ps(fft6493, fft6423, fft6502);
__m512 fft6426 = _mm512_fmadd_ps(fft6409, fft6423, fft6417);
__m512 fft6511 = _mm512_fmadd_ps(fft6496, fft6423, fft6503);
__m512 fft6427 = _mm512_fnmadd_ps(fft6408, fft6423, fft6418);
__m512 fft6512 = _mm512_fnmadd_ps(fft6495, fft6423, fft6504);
__m512 fft6428 = _mm512_fmadd_ps(fft6411, fft6423, fft6419);
__m512 fft6513 = _mm512_fmadd_ps(fft6498, fft6423, fft6505);
__m512 fft6429 = _mm512_fnmadd_ps(fft6410, fft6423, fft6420);
__m512 fft6514 = _mm512_fnmadd_ps(fft6497, fft6423, fft6506);
__m512 fft6430 = _mm512_fmadd_ps(fft6413, fft6423, fft6421);
__m512 fft6515 = _mm512_fmadd_ps(fft6500, fft6423, fft6507);
__m512 fft6431 = _mm512_fnmadd_ps(fft6412, fft6423, fft6422);
__m512 fft6516 = _mm512_fnmadd_ps(fft6499, fft6423, fft6508);
// Within-vector step over a distance of 4 lanes. fft6432 alternates +1/-1
// per 128-bit lane (lanes 0..3 and 8..11 are +1, lanes 4..7 and 12..15 are
// -1). shuffle_f32x4 with imm 177 swaps adjacent 128-bit lanes, giving
// sum / difference of each neighbouring 4-lane group.
__m512 fft6432 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft6433 = _mm512_fmadd_ps(fft6424, fft6432, _mm512_shuffle_f32x4(fft6424, fft6424, 177));
__m512 fft6517 = _mm512_fmadd_ps(fft6509, fft6432, _mm512_shuffle_f32x4(fft6509, fft6509, 177));
__m512 fft6434 = _mm512_fmadd_ps(fft6425, fft6432, _mm512_shuffle_f32x4(fft6425, fft6425, 177));
__m512 fft6518 = _mm512_fmadd_ps(fft6510, fft6432, _mm512_shuffle_f32x4(fft6510, fft6510, 177));
__m512 fft6435 = _mm512_fmadd_ps(fft6426, fft6432, _mm512_shuffle_f32x4(fft6426, fft6426, 177));
__m512 fft6519 = _mm512_fmadd_ps(fft6511, fft6432, _mm512_shuffle_f32x4(fft6511, fft6511, 177));
__m512 fft6436 = _mm512_fmadd_ps(fft6427, fft6432, _mm512_shuffle_f32x4(fft6427, fft6427, 177));
__m512 fft6520 = _mm512_fmadd_ps(fft6512, fft6432, _mm512_shuffle_f32x4(fft6512, fft6512, 177));
__m512 fft6437 = _mm512_fmadd_ps(fft6428, fft6432, _mm512_shuffle_f32x4(fft6428, fft6428, 177));
__m512 fft6521 = _mm512_fmadd_ps(fft6513, fft6432, _mm512_shuffle_f32x4(fft6513, fft6513, 177));
__m512 fft6438 = _mm512_fmadd_ps(fft6429, fft6432, _mm512_shuffle_f32x4(fft6429, fft6429, 177));
__m512 fft6522 = _mm512_fmadd_ps(fft6514, fft6432, _mm512_shuffle_f32x4(fft6514, fft6514, 177));
__m512 fft6439 = _mm512_fmadd_ps(fft6430, fft6432, _mm512_shuffle_f32x4(fft6430, fft6430, 177));
__m512 fft6523 = _mm512_fmadd_ps(fft6515, fft6432, _mm512_shuffle_f32x4(fft6515, fft6515, 177));
__m512 fft6440 = _mm512_fmadd_ps(fft6431, fft6432, _mm512_shuffle_f32x4(fft6431, fft6431, 177));
__m512 fft6524 = _mm512_fmadd_ps(fft6516, fft6432, _mm512_shuffle_f32x4(fft6516, fft6516, 177));
// Cross-over between the two vectors of each (p, q) pair, restricted to
// mask 49344 (0xC0C0 = lanes 6, 7, 14, 15):
//   p'' = p everywhere except those lanes, which take q;
//   q'' = q everywhere except those lanes, which take 0 - p (i.e. -p).
__m512 fft6441 = _mm512_mask_mov_ps(fft6433, 49344, fft6434);
__m512 fft6525 = _mm512_mask_mov_ps(fft6517, 49344, fft6518);
__m512 fft6442 = _mm512_mask_sub_ps(fft6434, 49344, _mm512_setzero_ps(), fft6433);
__m512 fft6526 = _mm512_mask_sub_ps(fft6518, 49344, _mm512_setzero_ps(), fft6517);
__m512 fft6443 = _mm512_mask_mov_ps(fft6435, 49344, fft6436);
__m512 fft6527 = _mm512_mask_mov_ps(fft6519, 49344, fft6520);
__m512 fft6444 = _mm512_mask_sub_ps(fft6436, 49344, _mm512_setzero_ps(), fft6435);
__m512 fft6528 = _mm512_mask_sub_ps(fft6520, 49344, _mm512_setzero_ps(), fft6519);
__m512 fft6445 = _mm512_mask_mov_ps(fft6437, 49344, fft6438);
__m512 fft6529 = _mm512_mask_mov_ps(fft6521, 49344, fft6522);
__m512 fft6446 = _mm512_mask_sub_ps(fft6438, 49344, _mm512_setzero_ps(), fft6437);
__m512 fft6530 = _mm512_mask_sub_ps(fft6522, 49344, _mm512_setzero_ps(), fft6521);
__m512 fft6447 = _mm512_mask_mov_ps(fft6439, 49344, fft6440);
__m512 fft6531 = _mm512_mask_mov_ps(fft6523, 49344, fft6524);
__m512 fft6448 = _mm512_mask_sub_ps(fft6440, 49344, _mm512_setzero_ps(), fft6439);
__m512 fft6532 = _mm512_mask_sub_ps(fft6524, 49344, _mm512_setzero_ps(), fft6523);
// Within-vector step over a distance of 2 lanes. fft6449 is +1,+1,-1,-1
// repeating from lane 0. shuffle_ps with imm 78 swaps the two 64-bit halves
// of every 128-bit lane, giving sum / difference of neighbouring lane pairs.
__m512 fft6449 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft6450 = _mm512_fmadd_ps(fft6441, fft6449, _mm512_shuffle_ps(fft6441, fft6441, 78));
__m512 fft6533 = _mm512_fmadd_ps(fft6525, fft6449, _mm512_shuffle_ps(fft6525, fft6525, 78));
__m512 fft6451 = _mm512_fmadd_ps(fft6442, fft6449, _mm512_shuffle_ps(fft6442, fft6442, 78));
__m512 fft6534 = _mm512_fmadd_ps(fft6526, fft6449, _mm512_shuffle_ps(fft6526, fft6526, 78));
__m512 fft6452 = _mm512_fmadd_ps(fft6443, fft6449, _mm512_shuffle_ps(fft6443, fft6443, 78));
__m512 fft6535 = _mm512_fmadd_ps(fft6527, fft6449, _mm512_shuffle_ps(fft6527, fft6527, 78));
__m512 fft6453 = _mm512_fmadd_ps(fft6444, fft6449, _mm512_shuffle_ps(fft6444, fft6444, 78));
__m512 fft6536 = _mm512_fmadd_ps(fft6528, fft6449, _mm512_shuffle_ps(fft6528, fft6528, 78));
__m512 fft6454 = _mm512_fmadd_ps(fft6445, fft6449, _mm512_shuffle_ps(fft6445, fft6445, 78));
__m512 fft6537 = _mm512_fmadd_ps(fft6529, fft6449, _mm512_shuffle_ps(fft6529, fft6529, 78));
__m512 fft6455 = _mm512_fmadd_ps(fft6446, fft6449, _mm512_shuffle_ps(fft6446, fft6446, 78));
__m512 fft6538 = _mm512_fmadd_ps(fft6530, fft6449, _mm512_shuffle_ps(fft6530, fft6530, 78));
__m512 fft6456 = _mm512_fmadd_ps(fft6447, fft6449, _mm512_shuffle_ps(fft6447, fft6447, 78));
__m512 fft6539 = _mm512_fmadd_ps(fft6531, fft6449, _mm512_shuffle_ps(fft6531, fft6531, 78));
__m512 fft6457 = _mm512_fmadd_ps(fft6448, fft6449, _mm512_shuffle_ps(fft6448, fft6448, 78));
__m512 fft6540 = _mm512_fmadd_ps(fft6532, fft6449, _mm512_shuffle_ps(fft6532, fft6532, 78));
// Extra post-processing applied ONLY to the first pair of each chain
// (fft6450/fft6451 for chain A, fft6533/fft6534 for chain B); the other six
// pairs go to the output unchanged. Two fixed lane permutations (fft6458,
// fft6460) build rearranged copies of each vector.
// NOTE(review): this looks like the special handling of the first output
// row that a real-input transform needs -- confirm against the generator.
__m512i fft6458 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft6459 = _mm512_permutexvar_ps(fft6458, fft6450);
__m512 fft6541 = _mm512_permutexvar_ps(fft6458, fft6533);
__m512i fft6460 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft6461 = _mm512_permutexvar_ps(fft6460, fft6450);
__m512 fft6542 = _mm512_permutexvar_ps(fft6460, fft6533);
__m512 fft6462 = _mm512_permutexvar_ps(fft6458, fft6451);
__m512 fft6543 = _mm512_permutexvar_ps(fft6458, fft6534);
__m512 fft6463 = _mm512_permutexvar_ps(fft6460, fft6451);
__m512 fft6544 = _mm512_permutexvar_ps(fft6460, fft6534);
// fft6464 is (0, 0, +1, -1, +1, -1, +1, -1) from lane 0, repeated for the
// upper 8 lanes; it combines the permuted copies with alternating signs
// (lanes 0, 1, 8, 9 take the addend unchanged because the multiplier is 0).
__m512 fft6464 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft6465 = _mm512_fmadd_ps(fft6459, fft6464, fft6461);
__m512 fft6545 = _mm512_fmadd_ps(fft6541, fft6464, fft6542);
__m512 fft6466 = _mm512_fnmadd_ps(fft6463, fft6464, fft6462);
__m512 fft6546 = _mm512_fnmadd_ps(fft6544, fft6464, fft6543);
// Lane-select the final layout from the intermediate vectors using masks
// 21845 (0x5555), 43176 (0xA8A8) and 22102 (0x5656). In mask_mov(src, k, a)
// lanes with a set mask bit come from a, all others from src.
__m512 fft6467 = _mm512_mask_mov_ps(fft6463, 21845, fft6465);
__m512 fft6547 = _mm512_mask_mov_ps(fft6544, 21845, fft6545);
__m512 fft6468 = _mm512_mask_mov_ps(fft6459, 43176, fft6465);
__m512 fft6548 = _mm512_mask_mov_ps(fft6541, 43176, fft6545);
__m512 fft6469 = _mm512_mask_mov_ps(fft6467, 43176, fft6466);
__m512 fft6549 = _mm512_mask_mov_ps(fft6547, 43176, fft6546);
__m512 fft6470 = _mm512_mask_mov_ps(fft6468, 22102, fft6466);
__m512 fft6550 = _mm512_mask_mov_ps(fft6548, 22102, fft6546);
// Halve the lanes selected by mask 64764 (0xFCFC = every lane except
// 0, 1, 8, 9); the unselected lanes keep their value.
__m512 fft6471 = _mm512_mask_mul_ps(fft6469, 64764, fft6469, _mm512_set1_ps(5e-01f));
__m512 fft6551 = _mm512_mask_mul_ps(fft6549, 64764, fft6549, _mm512_set1_ps(5e-01f));
__m512 fft6472 = _mm512_mask_mul_ps(fft6470, 64764, fft6470, _mm512_set1_ps(5e-01f));
__m512 fft6552 = _mm512_mask_mul_ps(fft6550, 64764, fft6550, _mm512_set1_ps(5e-01f));
// Rename the results for the store phase: df577..df584 are chain A,
// df585..df592 are chain B. The first two of each chain are the
// post-processed vectors; the rest come straight from the 2-lane step.
__m512 df577 = fft6471;
__m512 df585 = fft6551;
__m512 df578 = fft6472;
__m512 df586 = fft6552;
__m512 df579 = fft6452;
__m512 df587 = fft6535;
__m512 df580 = fft6453;
__m512 df588 = fft6536;
__m512 df581 = fft6454;
__m512 df589 = fft6537;
__m512 df582 = fft6455;
__m512 df590 = fft6538;
__m512 df583 = fft6456;
__m512 df591 = fft6539;
__m512 df584 = fft6457;
__m512 df592 = fft6540;
// eo39 de-interleaves a vector: even-indexed elements go to lanes 0..7 and
// odd-indexed elements to lanes 8..15. Each permuted vector is then written
// with two masked stores: mask 255 stores lanes 0..7 and mask 65280 stores
// lanes 8..15, at different dfPtr1 offsets. Masked-off lanes are not
// written.
// NOTE(review): the offset constants are in units of dfPtr1's pointee type,
// which is declared outside this view.
__m512i eo39 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df579 = _mm512_permutexvar_ps(eo39, df579);
df580 = _mm512_permutexvar_ps(eo39, df580);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df579);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df580);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df579);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df580);
df587 = _mm512_permutexvar_ps(eo39, df587);
df588 = _mm512_permutexvar_ps(eo39, df588);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df587);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df588);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df587);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df588);
df581 = _mm512_permutexvar_ps(eo39, df581);
df582 = _mm512_permutexvar_ps(eo39, df582);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df581);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df582);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df581);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df582);
df589 = _mm512_permutexvar_ps(eo39, df589);
df590 = _mm512_permutexvar_ps(eo39, df590);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df589);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df590);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df589);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df590);
df583 = _mm512_permutexvar_ps(eo39, df583);
df584 = _mm512_permutexvar_ps(eo39, df584);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df583);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df584);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df583);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df584);
df591 = _mm512_permutexvar_ps(eo39, df591);
df592 = _mm512_permutexvar_ps(eo39, df592);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df591);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df592);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df591);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df592);
// The post-processed vectors (df577/df578, df585/df586) are stored WITHOUT
// the eo39 permutation; the fft6458/fft6460 permutations above have already
// fixed their lane layout.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df577);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df578);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df577);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df578);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df585);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df586);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df585);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df586);
for (ptrdiff_t b40 = 3; b40 < 6; ++b40) {
ptrdiff_t m40 = (size_t)b40/2;
ptrdiff_t f41 = (size_t)b40%2;
__m512 dat569 = _mm512_maskz_loadu_ps(65535, datPtr1+8040+602112*i6+200704*k18+896*h17+4*w17+40*b40);
__m512 dat570 = _mm512_maskz_loadu_ps(65535, datPtr1+8936+602112*i6+200704*k18+896*h17+4*w17+40*b40);
__m512 dat571 = _mm512_maskz_loadu_ps(65535, datPtr1+9832+602112*i6+200704*k18+896*h17+4*w17+40*b40);
__m512 dat572 = _mm512_maskz_loadu_ps(65535, datPtr1+10728+602112*i6+200704*k18+896*h17+4*w17+40*b40);
__m512 dat573 = _mm512_maskz_loadu_ps(65535, datPtr1+11624+602112*i6+200704*k18+896*h17+4*w17+40*b40);
__m512 dat574 = _mm512_maskz_loadu_ps(65535, datPtr1+12520+602112*i6+200704*k18+896*h17+4*w17+40*b40);
__m512 dat575 = _mm512_maskz_loadu_ps(65535, datPtr1+13416+602112*i6+200704*k18+896*h17+4*w17+40*b40);
__m512 fft6553 = _mm512_add_ps(dat569, _mm512_setzero_ps());
__m512 fft6641 = _mm512_add_ps(dat570, _mm512_setzero_ps());
__m512 fft6554 = _mm512_sub_ps(dat569, _mm512_setzero_ps());
__m512 fft6642 = _mm512_sub_ps(dat570, _mm512_setzero_ps());
__m512 fft6555 = _mm512_add_ps(dat571, _mm512_setzero_ps());
__m512 fft6643 = _mm512_add_ps(dat572, _mm512_setzero_ps());
__m512 fft6556 = _mm512_sub_ps(dat571, _mm512_setzero_ps());
__m512 fft6644 = _mm512_sub_ps(dat572, _mm512_setzero_ps());
__m512 fft6557 = _mm512_add_ps(dat573, _mm512_setzero_ps());
__m512 fft6645 = _mm512_add_ps(dat574, _mm512_setzero_ps());
__m512 fft6558 = _mm512_sub_ps(dat573, _mm512_setzero_ps());
__m512 fft6646 = _mm512_sub_ps(dat574, _mm512_setzero_ps());
__m512 fft6559 = _mm512_add_ps(dat575, _mm512_setzero_ps());
__m512 fft6647 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6560 = _mm512_sub_ps(dat575, _mm512_setzero_ps());
__m512 fft6648 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6561 = _mm512_add_ps(fft6553, fft6557);
__m512 fft6649 = _mm512_add_ps(fft6641, fft6645);
__m512 fft6562 = _mm512_sub_ps(fft6553, fft6557);
__m512 fft6650 = _mm512_sub_ps(fft6641, fft6645);
__m512 fft6563 = _mm512_add_ps(fft6555, fft6559);
__m512 fft6651 = _mm512_add_ps(fft6643, fft6647);
__m512 fft6564 = _mm512_sub_ps(fft6559, fft6555);
__m512 fft6652 = _mm512_sub_ps(fft6647, fft6643);
__m512 fft6565 = _mm512_sub_ps(fft6556, fft6560);
__m512 fft6653 = _mm512_sub_ps(fft6644, fft6648);
__m512 fft6566 = _mm512_add_ps(fft6556, fft6560);
__m512 fft6654 = _mm512_add_ps(fft6644, fft6648);
__m512 fft6567 = _mm512_add_ps(fft6561, fft6563);
__m512 fft6655 = _mm512_add_ps(fft6649, fft6651);
__m512 fft6568 = _mm512_sub_ps(fft6561, fft6563);
__m512 fft6656 = _mm512_sub_ps(fft6649, fft6651);
__m512 fft6569 = _mm512_fmadd_ps(fft6565, _mm512_set1_ps(7.0710677e-01f), fft6554);
__m512 fft6657 = _mm512_fmadd_ps(fft6653, _mm512_set1_ps(7.0710677e-01f), fft6642);
__m512 fft6570 = _mm512_fnmsub_ps(fft6566, _mm512_set1_ps(7.0710677e-01f), fft6558);
__m512 fft6658 = _mm512_fnmsub_ps(fft6654, _mm512_set1_ps(7.0710677e-01f), fft6646);
__m512 fft6571 = _mm512_fnmadd_ps(fft6565, _mm512_set1_ps(7.0710677e-01f), fft6554);
__m512 fft6659 = _mm512_fnmadd_ps(fft6653, _mm512_set1_ps(7.0710677e-01f), fft6642);
__m512 fft6572 = _mm512_fnmadd_ps(fft6566, _mm512_set1_ps(7.0710677e-01f), fft6558);
__m512 fft6660 = _mm512_fnmadd_ps(fft6654, _mm512_set1_ps(7.0710677e-01f), fft6646);
__m512 fft6573 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6574 = _mm512_fmadd_ps(fft6567, fft6573, _mm512_shuffle_f32x4(fft6567, fft6567, 78));
__m512 fft6661 = _mm512_fmadd_ps(fft6655, fft6573, _mm512_shuffle_f32x4(fft6655, fft6655, 78));
__m512 fft6575 = _mm512_fmadd_ps(fft6568, fft6573, _mm512_shuffle_f32x4(fft6568, fft6568, 78));
__m512 fft6662 = _mm512_fmadd_ps(fft6656, fft6573, _mm512_shuffle_f32x4(fft6656, fft6656, 78));
__m512 fft6576 = _mm512_fmadd_ps(fft6569, fft6573, _mm512_shuffle_f32x4(fft6569, fft6569, 78));
__m512 fft6663 = _mm512_fmadd_ps(fft6657, fft6573, _mm512_shuffle_f32x4(fft6657, fft6657, 78));
__m512 fft6577 = _mm512_fmadd_ps(fft6570, fft6573, _mm512_shuffle_f32x4(fft6570, fft6570, 78));
__m512 fft6664 = _mm512_fmadd_ps(fft6658, fft6573, _mm512_shuffle_f32x4(fft6658, fft6658, 78));
__m512 fft6578 = _mm512_fmadd_ps(fft6562, fft6573, _mm512_shuffle_f32x4(fft6562, fft6562, 78));
__m512 fft6665 = _mm512_fmadd_ps(fft6650, fft6573, _mm512_shuffle_f32x4(fft6650, fft6650, 78));
__m512 fft6579 = _mm512_fmadd_ps(fft6564, fft6573, _mm512_shuffle_f32x4(fft6564, fft6564, 78));
__m512 fft6666 = _mm512_fmadd_ps(fft6652, fft6573, _mm512_shuffle_f32x4(fft6652, fft6652, 78));
__m512 fft6580 = _mm512_fmadd_ps(fft6571, fft6573, _mm512_shuffle_f32x4(fft6571, fft6571, 78));
__m512 fft6667 = _mm512_fmadd_ps(fft6659, fft6573, _mm512_shuffle_f32x4(fft6659, fft6659, 78));
__m512 fft6581 = _mm512_fmadd_ps(fft6572, fft6573, _mm512_shuffle_f32x4(fft6572, fft6572, 78));
__m512 fft6668 = _mm512_fmadd_ps(fft6660, fft6573, _mm512_shuffle_f32x4(fft6660, fft6660, 78));
__m512 fft6582 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6583 = _mm512_mul_ps(fft6574, fft6582);
__m512 fft6669 = _mm512_mul_ps(fft6661, fft6582);
__m512 fft6584 = _mm512_mul_ps(fft6575, fft6582);
__m512 fft6670 = _mm512_mul_ps(fft6662, fft6582);
__m512 fft6585 = _mm512_mul_ps(fft6576, fft6582);
__m512 fft6671 = _mm512_mul_ps(fft6663, fft6582);
__m512 fft6586 = _mm512_mul_ps(fft6577, fft6582);
__m512 fft6672 = _mm512_mul_ps(fft6664, fft6582);
__m512 fft6587 = _mm512_mul_ps(fft6578, fft6582);
__m512 fft6673 = _mm512_mul_ps(fft6665, fft6582);
__m512 fft6588 = _mm512_mul_ps(fft6579, fft6582);
__m512 fft6674 = _mm512_mul_ps(fft6666, fft6582);
__m512 fft6589 = _mm512_mul_ps(fft6580, fft6582);
__m512 fft6675 = _mm512_mul_ps(fft6667, fft6582);
__m512 fft6590 = _mm512_mul_ps(fft6581, fft6582);
__m512 fft6676 = _mm512_mul_ps(fft6668, fft6582);
__m512 fft6591 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft6592 = _mm512_fmadd_ps(fft6575, fft6591, fft6583);
__m512 fft6677 = _mm512_fmadd_ps(fft6662, fft6591, fft6669);
__m512 fft6593 = _mm512_fnmadd_ps(fft6574, fft6591, fft6584);
__m512 fft6678 = _mm512_fnmadd_ps(fft6661, fft6591, fft6670);
__m512 fft6594 = _mm512_fmadd_ps(fft6577, fft6591, fft6585);
__m512 fft6679 = _mm512_fmadd_ps(fft6664, fft6591, fft6671);
__m512 fft6595 = _mm512_fnmadd_ps(fft6576, fft6591, fft6586);
__m512 fft6680 = _mm512_fnmadd_ps(fft6663, fft6591, fft6672);
__m512 fft6596 = _mm512_fmadd_ps(fft6579, fft6591, fft6587);
__m512 fft6681 = _mm512_fmadd_ps(fft6666, fft6591, fft6673);
__m512 fft6597 = _mm512_fnmadd_ps(fft6578, fft6591, fft6588);
__m512 fft6682 = _mm512_fnmadd_ps(fft6665, fft6591, fft6674);
__m512 fft6598 = _mm512_fmadd_ps(fft6581, fft6591, fft6589);
__m512 fft6683 = _mm512_fmadd_ps(fft6668, fft6591, fft6675);
__m512 fft6599 = _mm512_fnmadd_ps(fft6580, fft6591, fft6590);
__m512 fft6684 = _mm512_fnmadd_ps(fft6667, fft6591, fft6676);
__m512 fft6600 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft6601 = _mm512_fmadd_ps(fft6592, fft6600, _mm512_shuffle_f32x4(fft6592, fft6592, 177));
__m512 fft6685 = _mm512_fmadd_ps(fft6677, fft6600, _mm512_shuffle_f32x4(fft6677, fft6677, 177));
__m512 fft6602 = _mm512_fmadd_ps(fft6593, fft6600, _mm512_shuffle_f32x4(fft6593, fft6593, 177));
__m512 fft6686 = _mm512_fmadd_ps(fft6678, fft6600, _mm512_shuffle_f32x4(fft6678, fft6678, 177));
__m512 fft6603 = _mm512_fmadd_ps(fft6594, fft6600, _mm512_shuffle_f32x4(fft6594, fft6594, 177));
__m512 fft6687 = _mm512_fmadd_ps(fft6679, fft6600, _mm512_shuffle_f32x4(fft6679, fft6679, 177));
__m512 fft6604 = _mm512_fmadd_ps(fft6595, fft6600, _mm512_shuffle_f32x4(fft6595, fft6595, 177));
__m512 fft6688 = _mm512_fmadd_ps(fft6680, fft6600, _mm512_shuffle_f32x4(fft6680, fft6680, 177));
__m512 fft6605 = _mm512_fmadd_ps(fft6596, fft6600, _mm512_shuffle_f32x4(fft6596, fft6596, 177));
__m512 fft6689 = _mm512_fmadd_ps(fft6681, fft6600, _mm512_shuffle_f32x4(fft6681, fft6681, 177));
__m512 fft6606 = _mm512_fmadd_ps(fft6597, fft6600, _mm512_shuffle_f32x4(fft6597, fft6597, 177));
__m512 fft6690 = _mm512_fmadd_ps(fft6682, fft6600, _mm512_shuffle_f32x4(fft6682, fft6682, 177));
__m512 fft6607 = _mm512_fmadd_ps(fft6598, fft6600, _mm512_shuffle_f32x4(fft6598, fft6598, 177));
__m512 fft6691 = _mm512_fmadd_ps(fft6683, fft6600, _mm512_shuffle_f32x4(fft6683, fft6683, 177));
__m512 fft6608 = _mm512_fmadd_ps(fft6599, fft6600, _mm512_shuffle_f32x4(fft6599, fft6599, 177));
__m512 fft6692 = _mm512_fmadd_ps(fft6684, fft6600, _mm512_shuffle_f32x4(fft6684, fft6684, 177));
__m512 fft6609 = _mm512_mask_mov_ps(fft6601, 49344, fft6602);
__m512 fft6693 = _mm512_mask_mov_ps(fft6685, 49344, fft6686);
__m512 fft6610 = _mm512_mask_sub_ps(fft6602, 49344, _mm512_setzero_ps(), fft6601);
__m512 fft6694 = _mm512_mask_sub_ps(fft6686, 49344, _mm512_setzero_ps(), fft6685);
__m512 fft6611 = _mm512_mask_mov_ps(fft6603, 49344, fft6604);
__m512 fft6695 = _mm512_mask_mov_ps(fft6687, 49344, fft6688);
__m512 fft6612 = _mm512_mask_sub_ps(fft6604, 49344, _mm512_setzero_ps(), fft6603);
__m512 fft6696 = _mm512_mask_sub_ps(fft6688, 49344, _mm512_setzero_ps(), fft6687);
__m512 fft6613 = _mm512_mask_mov_ps(fft6605, 49344, fft6606);
__m512 fft6697 = _mm512_mask_mov_ps(fft6689, 49344, fft6690);
__m512 fft6614 = _mm512_mask_sub_ps(fft6606, 49344, _mm512_setzero_ps(), fft6605);
__m512 fft6698 = _mm512_mask_sub_ps(fft6690, 49344, _mm512_setzero_ps(), fft6689);
__m512 fft6615 = _mm512_mask_mov_ps(fft6607, 49344, fft6608);
__m512 fft6699 = _mm512_mask_mov_ps(fft6691, 49344, fft6692);
__m512 fft6616 = _mm512_mask_sub_ps(fft6608, 49344, _mm512_setzero_ps(), fft6607);
__m512 fft6700 = _mm512_mask_sub_ps(fft6692, 49344, _mm512_setzero_ps(), fft6691);
__m512 fft6617 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft6618 = _mm512_fmadd_ps(fft6609, fft6617, _mm512_shuffle_ps(fft6609, fft6609, 78));
__m512 fft6701 = _mm512_fmadd_ps(fft6693, fft6617, _mm512_shuffle_ps(fft6693, fft6693, 78));
__m512 fft6619 = _mm512_fmadd_ps(fft6610, fft6617, _mm512_shuffle_ps(fft6610, fft6610, 78));
__m512 fft6702 = _mm512_fmadd_ps(fft6694, fft6617, _mm512_shuffle_ps(fft6694, fft6694, 78));
__m512 fft6620 = _mm512_fmadd_ps(fft6611, fft6617, _mm512_shuffle_ps(fft6611, fft6611, 78));
__m512 fft6703 = _mm512_fmadd_ps(fft6695, fft6617, _mm512_shuffle_ps(fft6695, fft6695, 78));
__m512 fft6621 = _mm512_fmadd_ps(fft6612, fft6617, _mm512_shuffle_ps(fft6612, fft6612, 78));
__m512 fft6704 = _mm512_fmadd_ps(fft6696, fft6617, _mm512_shuffle_ps(fft6696, fft6696, 78));
__m512 fft6622 = _mm512_fmadd_ps(fft6613, fft6617, _mm512_shuffle_ps(fft6613, fft6613, 78));
__m512 fft6705 = _mm512_fmadd_ps(fft6697, fft6617, _mm512_shuffle_ps(fft6697, fft6697, 78));
__m512 fft6623 = _mm512_fmadd_ps(fft6614, fft6617, _mm512_shuffle_ps(fft6614, fft6614, 78));
__m512 fft6706 = _mm512_fmadd_ps(fft6698, fft6617, _mm512_shuffle_ps(fft6698, fft6698, 78));
__m512 fft6624 = _mm512_fmadd_ps(fft6615, fft6617, _mm512_shuffle_ps(fft6615, fft6615, 78));
__m512 fft6707 = _mm512_fmadd_ps(fft6699, fft6617, _mm512_shuffle_ps(fft6699, fft6699, 78));
__m512 fft6625 = _mm512_fmadd_ps(fft6616, fft6617, _mm512_shuffle_ps(fft6616, fft6616, 78));
__m512 fft6708 = _mm512_fmadd_ps(fft6700, fft6617, _mm512_shuffle_ps(fft6700, fft6700, 78));
__m512i fft6626 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft6627 = _mm512_permutexvar_ps(fft6626, fft6618);
__m512 fft6709 = _mm512_permutexvar_ps(fft6626, fft6701);
__m512i fft6628 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft6629 = _mm512_permutexvar_ps(fft6628, fft6618);
__m512 fft6710 = _mm512_permutexvar_ps(fft6628, fft6701);
__m512 fft6630 = _mm512_permutexvar_ps(fft6626, fft6619);
__m512 fft6711 = _mm512_permutexvar_ps(fft6626, fft6702);
__m512 fft6631 = _mm512_permutexvar_ps(fft6628, fft6619);
__m512 fft6712 = _mm512_permutexvar_ps(fft6628, fft6702);
__m512 fft6632 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft6633 = _mm512_fmadd_ps(fft6627, fft6632, fft6629);
__m512 fft6713 = _mm512_fmadd_ps(fft6709, fft6632, fft6710);
__m512 fft6634 = _mm512_fnmadd_ps(fft6631, fft6632, fft6630);
__m512 fft6714 = _mm512_fnmadd_ps(fft6712, fft6632, fft6711);
__m512 fft6635 = _mm512_mask_mov_ps(fft6631, 21845, fft6633);
__m512 fft6715 = _mm512_mask_mov_ps(fft6712, 21845, fft6713);
__m512 fft6636 = _mm512_mask_mov_ps(fft6627, 43176, fft6633);
__m512 fft6716 = _mm512_mask_mov_ps(fft6709, 43176, fft6713);
__m512 fft6637 = _mm512_mask_mov_ps(fft6635, 43176, fft6634);
__m512 fft6717 = _mm512_mask_mov_ps(fft6715, 43176, fft6714);
__m512 fft6638 = _mm512_mask_mov_ps(fft6636, 22102, fft6634);
__m512 fft6718 = _mm512_mask_mov_ps(fft6716, 22102, fft6714);
__m512 fft6639 = _mm512_mask_mul_ps(fft6637, 64764, fft6637, _mm512_set1_ps(5e-01f));
__m512 fft6719 = _mm512_mask_mul_ps(fft6717, 64764, fft6717, _mm512_set1_ps(5e-01f));
__m512 fft6640 = _mm512_mask_mul_ps(fft6638, 64764, fft6638, _mm512_set1_ps(5e-01f));
__m512 fft6720 = _mm512_mask_mul_ps(fft6718, 64764, fft6718, _mm512_set1_ps(5e-01f));
__m512 df593 = fft6639;
__m512 df601 = fft6719;
__m512 df594 = fft6640;
__m512 df602 = fft6720;
__m512 df595 = fft6620;
__m512 df603 = fft6703;
__m512 df596 = fft6621;
__m512 df604 = fft6704;
__m512 df597 = fft6622;
__m512 df605 = fft6705;
__m512 df598 = fft6623;
__m512 df606 = fft6706;
__m512 df599 = fft6624;
__m512 df607 = fft6707;
__m512 df600 = fft6625;
__m512 df608 = fft6708;
__m512i eo40 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df595 = _mm512_permutexvar_ps(eo40, df595);
df596 = _mm512_permutexvar_ps(eo40, df596);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df595);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df596);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df595);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df596);
df603 = _mm512_permutexvar_ps(eo40, df603);
df604 = _mm512_permutexvar_ps(eo40, df604);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df603);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df604);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df603);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df604);
df597 = _mm512_permutexvar_ps(eo40, df597);
df598 = _mm512_permutexvar_ps(eo40, df598);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df597);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df598);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df597);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df598);
df605 = _mm512_permutexvar_ps(eo40, df605);
df606 = _mm512_permutexvar_ps(eo40, df606);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df605);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df606);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df605);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df606);
df599 = _mm512_permutexvar_ps(eo40, df599);
df600 = _mm512_permutexvar_ps(eo40, df600);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df599);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df600);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df599);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df600);
df607 = _mm512_permutexvar_ps(eo40, df607);
df608 = _mm512_permutexvar_ps(eo40, df608);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df607);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df608);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df607);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df608);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df593);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df594);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df593);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df594);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df601);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df602);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df601);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df602);
}
}
if (j2 >= last1) return;
++j2;
rel3 = 1;
}
// Section taken while rel3 is below 4. Each pass of the j2 loop handles one
// group of 6 tiles (b41) for 3 consecutive k19 values and then moves the start
// column w18 forward by 60. On exit from the section rel3 is set to 4.
// NOTE(review): rel3, base3, s1, i6, j2, last1, datPtr1 and dfPtr1 are declared
// outside this excerpt; what they represent cannot be confirmed from here.
// NOTE(review): the constant offsets look like byte offsets (896 = 224*4 and
// 200704 = 224*224*4 would match a 224x224 float32 plane, 602112 = 3 planes) --
// this assumes datPtr1/dfPtr1 are byte pointers; confirm at their declaration.
if (rel3 < 4) {
// Starting row and starting column for this section.
ptrdiff_t h18 = base3+10;
ptrdiff_t w18 = -20+60*rel3;
// Last j2 value handled here: 3-rel3 further passes after the current one.
ptrdiff_t jj8 = 3-rel3+j2;
// j2 itself is advanced at the bottom of the body, after the early-return test.
for (; j2 <= jj8; w18 += 60) {
// Three consecutive k19 values: 3*s1, 3*s1+1, 3*s1+2.
ptrdiff_t k19 = 3*s1;
ptrdiff_t kk18 = k19+2;
for (; k19 <= kk18; ++k19) {
// Six tiles per (j2, k19); consecutive tiles start 40 address units apart.
for (ptrdiff_t b41 = 0; b41 < 6; ++b41) {
// b41 is split into a pair index (m41) and a position inside the pair (f42);
// both only feed the store addresses below (128*m41 + 32*f42).
ptrdiff_t m41 = (size_t)b41/2;
ptrdiff_t f42 = (size_t)b41%2;
// Load 7 rows spaced 896 apart; mask 65535 enables all 16 lanes.
__m512 dat576 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k19+896*h18+4*w18+40*b41);
__m512 dat577 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k19+896*h18+4*w18+40*b41);
__m512 dat578 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k19+896*h18+4*w18+40*b41);
__m512 dat579 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k19+896*h18+4*w18+40*b41);
__m512 dat580 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k19+896*h18+4*w18+40*b41);
__m512 dat581 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k19+896*h18+4*w18+40*b41);
__m512 dat582 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k19+896*h18+4*w18+40*b41);
// Row-wise butterfly network. Statements come in interleaved pairs: the first
// of each pair belongs to the chain fed by dat576/578/580/582, the second to
// the chain fed by dat577/579/581 plus an all-zero fourth input
// (fft6815/fft6816). Every loaded row is first combined with a zero vector.
// NOTE(review): the fft* names suggest a Fourier transform of a zero-padded
// tile; that is inferred from naming, not shown by this excerpt.
__m512 fft6721 = _mm512_add_ps(dat576, _mm512_setzero_ps());
__m512 fft6809 = _mm512_add_ps(dat577, _mm512_setzero_ps());
__m512 fft6722 = _mm512_sub_ps(dat576, _mm512_setzero_ps());
__m512 fft6810 = _mm512_sub_ps(dat577, _mm512_setzero_ps());
__m512 fft6723 = _mm512_add_ps(dat578, _mm512_setzero_ps());
__m512 fft6811 = _mm512_add_ps(dat579, _mm512_setzero_ps());
__m512 fft6724 = _mm512_sub_ps(dat578, _mm512_setzero_ps());
__m512 fft6812 = _mm512_sub_ps(dat579, _mm512_setzero_ps());
__m512 fft6725 = _mm512_add_ps(dat580, _mm512_setzero_ps());
__m512 fft6813 = _mm512_add_ps(dat581, _mm512_setzero_ps());
__m512 fft6726 = _mm512_sub_ps(dat580, _mm512_setzero_ps());
__m512 fft6814 = _mm512_sub_ps(dat581, _mm512_setzero_ps());
__m512 fft6727 = _mm512_add_ps(dat582, _mm512_setzero_ps());
__m512 fft6815 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6728 = _mm512_sub_ps(dat582, _mm512_setzero_ps());
__m512 fft6816 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
// Second level: sums and differences between the partial results above.
__m512 fft6729 = _mm512_add_ps(fft6721, fft6725);
__m512 fft6817 = _mm512_add_ps(fft6809, fft6813);
__m512 fft6730 = _mm512_sub_ps(fft6721, fft6725);
__m512 fft6818 = _mm512_sub_ps(fft6809, fft6813);
__m512 fft6731 = _mm512_add_ps(fft6723, fft6727);
__m512 fft6819 = _mm512_add_ps(fft6811, fft6815);
__m512 fft6732 = _mm512_sub_ps(fft6727, fft6723);
__m512 fft6820 = _mm512_sub_ps(fft6815, fft6811);
__m512 fft6733 = _mm512_sub_ps(fft6724, fft6728);
__m512 fft6821 = _mm512_sub_ps(fft6812, fft6816);
__m512 fft6734 = _mm512_add_ps(fft6724, fft6728);
__m512 fft6822 = _mm512_add_ps(fft6812, fft6816);
// Third level; the fused ops scale one operand by 0.70710677 (about sqrt(1/2)).
__m512 fft6735 = _mm512_add_ps(fft6729, fft6731);
__m512 fft6823 = _mm512_add_ps(fft6817, fft6819);
__m512 fft6736 = _mm512_sub_ps(fft6729, fft6731);
__m512 fft6824 = _mm512_sub_ps(fft6817, fft6819);
__m512 fft6737 = _mm512_fmadd_ps(fft6733, _mm512_set1_ps(7.0710677e-01f), fft6722);
__m512 fft6825 = _mm512_fmadd_ps(fft6821, _mm512_set1_ps(7.0710677e-01f), fft6810);
__m512 fft6738 = _mm512_fnmsub_ps(fft6734, _mm512_set1_ps(7.0710677e-01f), fft6726);
__m512 fft6826 = _mm512_fnmsub_ps(fft6822, _mm512_set1_ps(7.0710677e-01f), fft6814);
__m512 fft6739 = _mm512_fnmadd_ps(fft6733, _mm512_set1_ps(7.0710677e-01f), fft6722);
__m512 fft6827 = _mm512_fnmadd_ps(fft6821, _mm512_set1_ps(7.0710677e-01f), fft6810);
__m512 fft6740 = _mm512_fnmadd_ps(fft6734, _mm512_set1_ps(7.0710677e-01f), fft6726);
__m512 fft6828 = _mm512_fnmadd_ps(fft6822, _mm512_set1_ps(7.0710677e-01f), fft6814);
// In-register butterfly across the two 256-bit halves: shuffle_f32x4 with
// immediate 78 swaps the halves, and the sign vector (+1 in lanes 0..7, -1 in
// lanes 8..15) yields low+high in the lower half and low-high in the upper half.
__m512 fft6741 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6742 = _mm512_fmadd_ps(fft6735, fft6741, _mm512_shuffle_f32x4(fft6735, fft6735, 78));
__m512 fft6829 = _mm512_fmadd_ps(fft6823, fft6741, _mm512_shuffle_f32x4(fft6823, fft6823, 78));
__m512 fft6743 = _mm512_fmadd_ps(fft6736, fft6741, _mm512_shuffle_f32x4(fft6736, fft6736, 78));
__m512 fft6830 = _mm512_fmadd_ps(fft6824, fft6741, _mm512_shuffle_f32x4(fft6824, fft6824, 78));
__m512 fft6744 = _mm512_fmadd_ps(fft6737, fft6741, _mm512_shuffle_f32x4(fft6737, fft6737, 78));
__m512 fft6831 = _mm512_fmadd_ps(fft6825, fft6741, _mm512_shuffle_f32x4(fft6825, fft6825, 78));
__m512 fft6745 = _mm512_fmadd_ps(fft6738, fft6741, _mm512_shuffle_f32x4(fft6738, fft6738, 78));
__m512 fft6832 = _mm512_fmadd_ps(fft6826, fft6741, _mm512_shuffle_f32x4(fft6826, fft6826, 78));
__m512 fft6746 = _mm512_fmadd_ps(fft6730, fft6741, _mm512_shuffle_f32x4(fft6730, fft6730, 78));
__m512 fft6833 = _mm512_fmadd_ps(fft6818, fft6741, _mm512_shuffle_f32x4(fft6818, fft6818, 78));
__m512 fft6747 = _mm512_fmadd_ps(fft6732, fft6741, _mm512_shuffle_f32x4(fft6732, fft6732, 78));
__m512 fft6834 = _mm512_fmadd_ps(fft6820, fft6741, _mm512_shuffle_f32x4(fft6820, fft6820, 78));
__m512 fft6748 = _mm512_fmadd_ps(fft6739, fft6741, _mm512_shuffle_f32x4(fft6739, fft6739, 78));
__m512 fft6835 = _mm512_fmadd_ps(fft6827, fft6741, _mm512_shuffle_f32x4(fft6827, fft6827, 78));
__m512 fft6749 = _mm512_fmadd_ps(fft6740, fft6741, _mm512_shuffle_f32x4(fft6740, fft6740, 78));
__m512 fft6836 = _mm512_fmadd_ps(fft6828, fft6741, _mm512_shuffle_f32x4(fft6828, fft6828, 78));
// Per-lane scaling by fft6750: lanes 0..9 keep their value (factor 1), lanes
// 10..15 are scaled by +/-0.70710677 or zeroed.
__m512 fft6750 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6751 = _mm512_mul_ps(fft6742, fft6750);
__m512 fft6837 = _mm512_mul_ps(fft6829, fft6750);
__m512 fft6752 = _mm512_mul_ps(fft6743, fft6750);
__m512 fft6838 = _mm512_mul_ps(fft6830, fft6750);
__m512 fft6753 = _mm512_mul_ps(fft6744, fft6750);
__m512 fft6839 = _mm512_mul_ps(fft6831, fft6750);
__m512 fft6754 = _mm512_mul_ps(fft6745, fft6750);
__m512 fft6840 = _mm512_mul_ps(fft6832, fft6750);
__m512 fft6755 = _mm512_mul_ps(fft6746, fft6750);
__m512 fft6841 = _mm512_mul_ps(fft6833, fft6750);
__m512 fft6756 = _mm512_mul_ps(fft6747, fft6750);
__m512 fft6842 = _mm512_mul_ps(fft6834, fft6750);
__m512 fft6757 = _mm512_mul_ps(fft6748, fft6750);
__m512 fft6843 = _mm512_mul_ps(fft6835, fft6750);
__m512 fft6758 = _mm512_mul_ps(fft6749, fft6750);
__m512 fft6844 = _mm512_mul_ps(fft6836, fft6750);
// Cross terms between the two registers of each pair, weighted by fft6759
// (non-zero only in lanes 10..15): the first of a pair gains +partner*w, the
// second gains -partner*w.
__m512 fft6759 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft6760 = _mm512_fmadd_ps(fft6743, fft6759, fft6751);
__m512 fft6845 = _mm512_fmadd_ps(fft6830, fft6759, fft6837);
__m512 fft6761 = _mm512_fnmadd_ps(fft6742, fft6759, fft6752);
__m512 fft6846 = _mm512_fnmadd_ps(fft6829, fft6759, fft6838);
__m512 fft6762 = _mm512_fmadd_ps(fft6745, fft6759, fft6753);
__m512 fft6847 = _mm512_fmadd_ps(fft6832, fft6759, fft6839);
__m512 fft6763 = _mm512_fnmadd_ps(fft6744, fft6759, fft6754);
__m512 fft6848 = _mm512_fnmadd_ps(fft6831, fft6759, fft6840);
__m512 fft6764 = _mm512_fmadd_ps(fft6747, fft6759, fft6755);
__m512 fft6849 = _mm512_fmadd_ps(fft6834, fft6759, fft6841);
__m512 fft6765 = _mm512_fnmadd_ps(fft6746, fft6759, fft6756);
__m512 fft6850 = _mm512_fnmadd_ps(fft6833, fft6759, fft6842);
__m512 fft6766 = _mm512_fmadd_ps(fft6749, fft6759, fft6757);
__m512 fft6851 = _mm512_fmadd_ps(fft6836, fft6759, fft6843);
__m512 fft6767 = _mm512_fnmadd_ps(fft6748, fft6759, fft6758);
__m512 fft6852 = _mm512_fnmadd_ps(fft6835, fft6759, fft6844);
// Butterfly between adjacent 128-bit lanes: shuffle_f32x4 with immediate 177
// swaps lanes 0<->1 and 2<->3; the sign vector alternates +1/-1 per 128-bit lane.
__m512 fft6768 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft6769 = _mm512_fmadd_ps(fft6760, fft6768, _mm512_shuffle_f32x4(fft6760, fft6760, 177));
__m512 fft6853 = _mm512_fmadd_ps(fft6845, fft6768, _mm512_shuffle_f32x4(fft6845, fft6845, 177));
__m512 fft6770 = _mm512_fmadd_ps(fft6761, fft6768, _mm512_shuffle_f32x4(fft6761, fft6761, 177));
__m512 fft6854 = _mm512_fmadd_ps(fft6846, fft6768, _mm512_shuffle_f32x4(fft6846, fft6846, 177));
__m512 fft6771 = _mm512_fmadd_ps(fft6762, fft6768, _mm512_shuffle_f32x4(fft6762, fft6762, 177));
__m512 fft6855 = _mm512_fmadd_ps(fft6847, fft6768, _mm512_shuffle_f32x4(fft6847, fft6847, 177));
__m512 fft6772 = _mm512_fmadd_ps(fft6763, fft6768, _mm512_shuffle_f32x4(fft6763, fft6763, 177));
__m512 fft6856 = _mm512_fmadd_ps(fft6848, fft6768, _mm512_shuffle_f32x4(fft6848, fft6848, 177));
__m512 fft6773 = _mm512_fmadd_ps(fft6764, fft6768, _mm512_shuffle_f32x4(fft6764, fft6764, 177));
__m512 fft6857 = _mm512_fmadd_ps(fft6849, fft6768, _mm512_shuffle_f32x4(fft6849, fft6849, 177));
__m512 fft6774 = _mm512_fmadd_ps(fft6765, fft6768, _mm512_shuffle_f32x4(fft6765, fft6765, 177));
__m512 fft6858 = _mm512_fmadd_ps(fft6850, fft6768, _mm512_shuffle_f32x4(fft6850, fft6850, 177));
__m512 fft6775 = _mm512_fmadd_ps(fft6766, fft6768, _mm512_shuffle_f32x4(fft6766, fft6766, 177));
__m512 fft6859 = _mm512_fmadd_ps(fft6851, fft6768, _mm512_shuffle_f32x4(fft6851, fft6851, 177));
__m512 fft6776 = _mm512_fmadd_ps(fft6767, fft6768, _mm512_shuffle_f32x4(fft6767, fft6767, 177));
__m512 fft6860 = _mm512_fmadd_ps(fft6852, fft6768, _mm512_shuffle_f32x4(fft6852, fft6852, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes the first
// register of each pair takes the second's value, and the second takes the
// negated first; all other lanes pass through unchanged.
__m512 fft6777 = _mm512_mask_mov_ps(fft6769, 49344, fft6770);
__m512 fft6861 = _mm512_mask_mov_ps(fft6853, 49344, fft6854);
__m512 fft6778 = _mm512_mask_sub_ps(fft6770, 49344, _mm512_setzero_ps(), fft6769);
__m512 fft6862 = _mm512_mask_sub_ps(fft6854, 49344, _mm512_setzero_ps(), fft6853);
__m512 fft6779 = _mm512_mask_mov_ps(fft6771, 49344, fft6772);
__m512 fft6863 = _mm512_mask_mov_ps(fft6855, 49344, fft6856);
__m512 fft6780 = _mm512_mask_sub_ps(fft6772, 49344, _mm512_setzero_ps(), fft6771);
__m512 fft6864 = _mm512_mask_sub_ps(fft6856, 49344, _mm512_setzero_ps(), fft6855);
__m512 fft6781 = _mm512_mask_mov_ps(fft6773, 49344, fft6774);
__m512 fft6865 = _mm512_mask_mov_ps(fft6857, 49344, fft6858);
__m512 fft6782 = _mm512_mask_sub_ps(fft6774, 49344, _mm512_setzero_ps(), fft6773);
__m512 fft6866 = _mm512_mask_sub_ps(fft6858, 49344, _mm512_setzero_ps(), fft6857);
__m512 fft6783 = _mm512_mask_mov_ps(fft6775, 49344, fft6776);
__m512 fft6867 = _mm512_mask_mov_ps(fft6859, 49344, fft6860);
__m512 fft6784 = _mm512_mask_sub_ps(fft6776, 49344, _mm512_setzero_ps(), fft6775);
__m512 fft6868 = _mm512_mask_sub_ps(fft6860, 49344, _mm512_setzero_ps(), fft6859);
// Butterfly inside each 128-bit lane: shuffle_ps with immediate 78 swaps the
// two 64-bit pairs of every lane; the sign vector is +1,+1,-1,-1 per lane.
__m512 fft6785 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft6786 = _mm512_fmadd_ps(fft6777, fft6785, _mm512_shuffle_ps(fft6777, fft6777, 78));
__m512 fft6869 = _mm512_fmadd_ps(fft6861, fft6785, _mm512_shuffle_ps(fft6861, fft6861, 78));
__m512 fft6787 = _mm512_fmadd_ps(fft6778, fft6785, _mm512_shuffle_ps(fft6778, fft6778, 78));
__m512 fft6870 = _mm512_fmadd_ps(fft6862, fft6785, _mm512_shuffle_ps(fft6862, fft6862, 78));
__m512 fft6788 = _mm512_fmadd_ps(fft6779, fft6785, _mm512_shuffle_ps(fft6779, fft6779, 78));
__m512 fft6871 = _mm512_fmadd_ps(fft6863, fft6785, _mm512_shuffle_ps(fft6863, fft6863, 78));
__m512 fft6789 = _mm512_fmadd_ps(fft6780, fft6785, _mm512_shuffle_ps(fft6780, fft6780, 78));
__m512 fft6872 = _mm512_fmadd_ps(fft6864, fft6785, _mm512_shuffle_ps(fft6864, fft6864, 78));
__m512 fft6790 = _mm512_fmadd_ps(fft6781, fft6785, _mm512_shuffle_ps(fft6781, fft6781, 78));
__m512 fft6873 = _mm512_fmadd_ps(fft6865, fft6785, _mm512_shuffle_ps(fft6865, fft6865, 78));
__m512 fft6791 = _mm512_fmadd_ps(fft6782, fft6785, _mm512_shuffle_ps(fft6782, fft6782, 78));
__m512 fft6874 = _mm512_fmadd_ps(fft6866, fft6785, _mm512_shuffle_ps(fft6866, fft6866, 78));
__m512 fft6792 = _mm512_fmadd_ps(fft6783, fft6785, _mm512_shuffle_ps(fft6783, fft6783, 78));
__m512 fft6875 = _mm512_fmadd_ps(fft6867, fft6785, _mm512_shuffle_ps(fft6867, fft6867, 78));
__m512 fft6793 = _mm512_fmadd_ps(fft6784, fft6785, _mm512_shuffle_ps(fft6784, fft6784, 78));
__m512 fft6876 = _mm512_fmadd_ps(fft6868, fft6785, _mm512_shuffle_ps(fft6868, fft6868, 78));
// Extra recombination applied only to the first register pair of each chain
// (fft6786/fft6787 and fft6869/fft6870): two lane permutations of each
// register are blended with +/-1/0 weights, merged through several write
// masks, and finally lanes selected by mask 64764 (0xFCFC) are halved.
// The remaining registers go to the stores below without this step.
__m512i fft6794 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft6795 = _mm512_permutexvar_ps(fft6794, fft6786);
__m512 fft6877 = _mm512_permutexvar_ps(fft6794, fft6869);
__m512i fft6796 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft6797 = _mm512_permutexvar_ps(fft6796, fft6786);
__m512 fft6878 = _mm512_permutexvar_ps(fft6796, fft6869);
__m512 fft6798 = _mm512_permutexvar_ps(fft6794, fft6787);
__m512 fft6879 = _mm512_permutexvar_ps(fft6794, fft6870);
__m512 fft6799 = _mm512_permutexvar_ps(fft6796, fft6787);
__m512 fft6880 = _mm512_permutexvar_ps(fft6796, fft6870);
__m512 fft6800 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft6801 = _mm512_fmadd_ps(fft6795, fft6800, fft6797);
__m512 fft6881 = _mm512_fmadd_ps(fft6877, fft6800, fft6878);
__m512 fft6802 = _mm512_fnmadd_ps(fft6799, fft6800, fft6798);
__m512 fft6882 = _mm512_fnmadd_ps(fft6880, fft6800, fft6879);
// Write masks: 21845 = 0x5555 (even lanes), 43176 = 0xA8A8, 22102 = 0x5656.
__m512 fft6803 = _mm512_mask_mov_ps(fft6799, 21845, fft6801);
__m512 fft6883 = _mm512_mask_mov_ps(fft6880, 21845, fft6881);
__m512 fft6804 = _mm512_mask_mov_ps(fft6795, 43176, fft6801);
__m512 fft6884 = _mm512_mask_mov_ps(fft6877, 43176, fft6881);
__m512 fft6805 = _mm512_mask_mov_ps(fft6803, 43176, fft6802);
__m512 fft6885 = _mm512_mask_mov_ps(fft6883, 43176, fft6882);
__m512 fft6806 = _mm512_mask_mov_ps(fft6804, 22102, fft6802);
__m512 fft6886 = _mm512_mask_mov_ps(fft6884, 22102, fft6882);
__m512 fft6807 = _mm512_mask_mul_ps(fft6805, 64764, fft6805, _mm512_set1_ps(5e-01f));
__m512 fft6887 = _mm512_mask_mul_ps(fft6885, 64764, fft6885, _mm512_set1_ps(5e-01f));
__m512 fft6808 = _mm512_mask_mul_ps(fft6806, 64764, fft6806, _mm512_set1_ps(5e-01f));
__m512 fft6888 = _mm512_mask_mul_ps(fft6886, 64764, fft6886, _mm512_set1_ps(5e-01f));
// Name the 16 result registers: df609..df616 come from the first chain,
// df617..df624 from the second.
__m512 df609 = fft6807;
__m512 df617 = fft6887;
__m512 df610 = fft6808;
__m512 df618 = fft6888;
__m512 df611 = fft6788;
__m512 df619 = fft6871;
__m512 df612 = fft6789;
__m512 df620 = fft6872;
__m512 df613 = fft6790;
__m512 df621 = fft6873;
__m512 df614 = fft6791;
__m512 df622 = fft6874;
__m512 df615 = fft6792;
__m512 df623 = fft6875;
__m512 df616 = fft6793;
__m512 df624 = fft6876;
// eo41 gathers the even-indexed lanes into lanes 0..7 and the odd-indexed
// lanes into lanes 8..15. Each permuted register is then written in two
// halves: mask 255 stores lanes 0..7, mask 65280 stores lanes 8..15.
// NOTE(review): if dfPtr1 is a byte pointer, each 65280-masked store uses a
// base 32 lower than (low-half base + 407040), so lanes 8..15 land exactly
// 407040 bytes after the matching low-half store -- confirm the pointer type.
__m512i eo41 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df611 = _mm512_permutexvar_ps(eo41, df611);
df612 = _mm512_permutexvar_ps(eo41, df612);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df611);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df612);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df611);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df612);
df619 = _mm512_permutexvar_ps(eo41, df619);
df620 = _mm512_permutexvar_ps(eo41, df620);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df619);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df620);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df619);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df620);
df613 = _mm512_permutexvar_ps(eo41, df613);
df614 = _mm512_permutexvar_ps(eo41, df614);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df613);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df614);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df613);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df614);
df621 = _mm512_permutexvar_ps(eo41, df621);
df622 = _mm512_permutexvar_ps(eo41, df622);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df621);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df622);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df621);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df622);
df615 = _mm512_permutexvar_ps(eo41, df615);
df616 = _mm512_permutexvar_ps(eo41, df616);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df615);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df616);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df615);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df616);
df623 = _mm512_permutexvar_ps(eo41, df623);
df624 = _mm512_permutexvar_ps(eo41, df624);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df623);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df624);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df623);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df624);
// df609/df610 and df617/df618 (the specially recombined registers) are stored
// without the eo41 even/odd permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df609);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df610);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df609);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df610);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df617);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df618);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df617);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df618);
}
}
// Leave the enclosing function once the final j2 (last1) has been processed;
// otherwise move on to the next j2.
if (j2 >= last1) return;
++j2;
}
// Record that the rel3 < 4 range is done so the code that follows takes over.
rel3 = 4;
}
ptrdiff_t h19 = base3+10;
ptrdiff_t w19 = 220;
ptrdiff_t k20 = 3*s1;
ptrdiff_t kk19 = k20+2;
for (; k20 <= kk19; ++k20) {
ptrdiff_t b42 = 0;
ptrdiff_t m42 = (size_t)b42/2;
ptrdiff_t f43 = (size_t)b42%2;
__m512 dat583 = _mm512_maskz_loadu_ps(127, datPtr1+0+602112*i6+200704*k20+896*h19+4*w19+0*b42);
__m512 dat584 = _mm512_maskz_loadu_ps(127, datPtr1+896+602112*i6+200704*k20+896*h19+4*w19+0*b42);
__m512 dat585 = _mm512_maskz_loadu_ps(127, datPtr1+1792+602112*i6+200704*k20+896*h19+4*w19+0*b42);
__m512 dat586 = _mm512_maskz_loadu_ps(127, datPtr1+2688+602112*i6+200704*k20+896*h19+4*w19+0*b42);
__m512 dat587 = _mm512_maskz_loadu_ps(127, datPtr1+3584+602112*i6+200704*k20+896*h19+4*w19+0*b42);
__m512 dat588 = _mm512_maskz_loadu_ps(127, datPtr1+4480+602112*i6+200704*k20+896*h19+4*w19+0*b42);
__m512 dat589 = _mm512_maskz_loadu_ps(127, datPtr1+5376+602112*i6+200704*k20+896*h19+4*w19+0*b42);
__m512 fft6889 = _mm512_add_ps(dat583, _mm512_setzero_ps());
__m512 fft6977 = _mm512_add_ps(dat584, _mm512_setzero_ps());
__m512 fft6890 = _mm512_sub_ps(dat583, _mm512_setzero_ps());
__m512 fft6978 = _mm512_sub_ps(dat584, _mm512_setzero_ps());
__m512 fft6891 = _mm512_add_ps(dat585, _mm512_setzero_ps());
__m512 fft6979 = _mm512_add_ps(dat586, _mm512_setzero_ps());
__m512 fft6892 = _mm512_sub_ps(dat585, _mm512_setzero_ps());
__m512 fft6980 = _mm512_sub_ps(dat586, _mm512_setzero_ps());
__m512 fft6893 = _mm512_add_ps(dat587, _mm512_setzero_ps());
__m512 fft6981 = _mm512_add_ps(dat588, _mm512_setzero_ps());
__m512 fft6894 = _mm512_sub_ps(dat587, _mm512_setzero_ps());
__m512 fft6982 = _mm512_sub_ps(dat588, _mm512_setzero_ps());
__m512 fft6895 = _mm512_add_ps(dat589, _mm512_setzero_ps());
__m512 fft6983 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6896 = _mm512_sub_ps(dat589, _mm512_setzero_ps());
__m512 fft6984 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6897 = _mm512_add_ps(fft6889, fft6893);
__m512 fft6985 = _mm512_add_ps(fft6977, fft6981);
__m512 fft6898 = _mm512_sub_ps(fft6889, fft6893);
__m512 fft6986 = _mm512_sub_ps(fft6977, fft6981);
__m512 fft6899 = _mm512_add_ps(fft6891, fft6895);
__m512 fft6987 = _mm512_add_ps(fft6979, fft6983);
__m512 fft6900 = _mm512_sub_ps(fft6895, fft6891);
__m512 fft6988 = _mm512_sub_ps(fft6983, fft6979);
__m512 fft6901 = _mm512_sub_ps(fft6892, fft6896);
__m512 fft6989 = _mm512_sub_ps(fft6980, fft6984);
__m512 fft6902 = _mm512_add_ps(fft6892, fft6896);
__m512 fft6990 = _mm512_add_ps(fft6980, fft6984);
__m512 fft6903 = _mm512_add_ps(fft6897, fft6899);
__m512 fft6991 = _mm512_add_ps(fft6985, fft6987);
__m512 fft6904 = _mm512_sub_ps(fft6897, fft6899);
__m512 fft6992 = _mm512_sub_ps(fft6985, fft6987);
__m512 fft6905 = _mm512_fmadd_ps(fft6901, _mm512_set1_ps(7.0710677e-01f), fft6890);
__m512 fft6993 = _mm512_fmadd_ps(fft6989, _mm512_set1_ps(7.0710677e-01f), fft6978);
__m512 fft6906 = _mm512_fnmsub_ps(fft6902, _mm512_set1_ps(7.0710677e-01f), fft6894);
__m512 fft6994 = _mm512_fnmsub_ps(fft6990, _mm512_set1_ps(7.0710677e-01f), fft6982);
__m512 fft6907 = _mm512_fnmadd_ps(fft6901, _mm512_set1_ps(7.0710677e-01f), fft6890);
__m512 fft6995 = _mm512_fnmadd_ps(fft6989, _mm512_set1_ps(7.0710677e-01f), fft6978);
__m512 fft6908 = _mm512_fnmadd_ps(fft6902, _mm512_set1_ps(7.0710677e-01f), fft6894);
__m512 fft6996 = _mm512_fnmadd_ps(fft6990, _mm512_set1_ps(7.0710677e-01f), fft6982);
__m512 fft6909 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6910 = _mm512_fmadd_ps(fft6903, fft6909, _mm512_shuffle_f32x4(fft6903, fft6903, 78));
__m512 fft6997 = _mm512_fmadd_ps(fft6991, fft6909, _mm512_shuffle_f32x4(fft6991, fft6991, 78));
__m512 fft6911 = _mm512_fmadd_ps(fft6904, fft6909, _mm512_shuffle_f32x4(fft6904, fft6904, 78));
__m512 fft6998 = _mm512_fmadd_ps(fft6992, fft6909, _mm512_shuffle_f32x4(fft6992, fft6992, 78));
__m512 fft6912 = _mm512_fmadd_ps(fft6905, fft6909, _mm512_shuffle_f32x4(fft6905, fft6905, 78));
__m512 fft6999 = _mm512_fmadd_ps(fft6993, fft6909, _mm512_shuffle_f32x4(fft6993, fft6993, 78));
__m512 fft6913 = _mm512_fmadd_ps(fft6906, fft6909, _mm512_shuffle_f32x4(fft6906, fft6906, 78));
__m512 fft7000 = _mm512_fmadd_ps(fft6994, fft6909, _mm512_shuffle_f32x4(fft6994, fft6994, 78));
__m512 fft6914 = _mm512_fmadd_ps(fft6898, fft6909, _mm512_shuffle_f32x4(fft6898, fft6898, 78));
__m512 fft7001 = _mm512_fmadd_ps(fft6986, fft6909, _mm512_shuffle_f32x4(fft6986, fft6986, 78));
__m512 fft6915 = _mm512_fmadd_ps(fft6900, fft6909, _mm512_shuffle_f32x4(fft6900, fft6900, 78));
__m512 fft7002 = _mm512_fmadd_ps(fft6988, fft6909, _mm512_shuffle_f32x4(fft6988, fft6988, 78));
__m512 fft6916 = _mm512_fmadd_ps(fft6907, fft6909, _mm512_shuffle_f32x4(fft6907, fft6907, 78));
__m512 fft7003 = _mm512_fmadd_ps(fft6995, fft6909, _mm512_shuffle_f32x4(fft6995, fft6995, 78));
__m512 fft6917 = _mm512_fmadd_ps(fft6908, fft6909, _mm512_shuffle_f32x4(fft6908, fft6908, 78));
__m512 fft7004 = _mm512_fmadd_ps(fft6996, fft6909, _mm512_shuffle_f32x4(fft6996, fft6996, 78));
__m512 fft6918 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6919 = _mm512_mul_ps(fft6910, fft6918);
__m512 fft7005 = _mm512_mul_ps(fft6997, fft6918);
__m512 fft6920 = _mm512_mul_ps(fft6911, fft6918);
__m512 fft7006 = _mm512_mul_ps(fft6998, fft6918);
__m512 fft6921 = _mm512_mul_ps(fft6912, fft6918);
__m512 fft7007 = _mm512_mul_ps(fft6999, fft6918);
__m512 fft6922 = _mm512_mul_ps(fft6913, fft6918);
__m512 fft7008 = _mm512_mul_ps(fft7000, fft6918);
__m512 fft6923 = _mm512_mul_ps(fft6914, fft6918);
__m512 fft7009 = _mm512_mul_ps(fft7001, fft6918);
__m512 fft6924 = _mm512_mul_ps(fft6915, fft6918);
__m512 fft7010 = _mm512_mul_ps(fft7002, fft6918);
__m512 fft6925 = _mm512_mul_ps(fft6916, fft6918);
__m512 fft7011 = _mm512_mul_ps(fft7003, fft6918);
__m512 fft6926 = _mm512_mul_ps(fft6917, fft6918);
__m512 fft7012 = _mm512_mul_ps(fft7004, fft6918);
__m512 fft6927 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft6928 = _mm512_fmadd_ps(fft6911, fft6927, fft6919);
__m512 fft7013 = _mm512_fmadd_ps(fft6998, fft6927, fft7005);
__m512 fft6929 = _mm512_fnmadd_ps(fft6910, fft6927, fft6920);
__m512 fft7014 = _mm512_fnmadd_ps(fft6997, fft6927, fft7006);
__m512 fft6930 = _mm512_fmadd_ps(fft6913, fft6927, fft6921);
__m512 fft7015 = _mm512_fmadd_ps(fft7000, fft6927, fft7007);
__m512 fft6931 = _mm512_fnmadd_ps(fft6912, fft6927, fft6922);
__m512 fft7016 = _mm512_fnmadd_ps(fft6999, fft6927, fft7008);
__m512 fft6932 = _mm512_fmadd_ps(fft6915, fft6927, fft6923);
__m512 fft7017 = _mm512_fmadd_ps(fft7002, fft6927, fft7009);
__m512 fft6933 = _mm512_fnmadd_ps(fft6914, fft6927, fft6924);
__m512 fft7018 = _mm512_fnmadd_ps(fft7001, fft6927, fft7010);
__m512 fft6934 = _mm512_fmadd_ps(fft6917, fft6927, fft6925);
__m512 fft7019 = _mm512_fmadd_ps(fft7004, fft6927, fft7011);
__m512 fft6935 = _mm512_fnmadd_ps(fft6916, fft6927, fft6926);
__m512 fft7020 = _mm512_fnmadd_ps(fft7003, fft6927, fft7012);
__m512 fft6936 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft6937 = _mm512_fmadd_ps(fft6928, fft6936, _mm512_shuffle_f32x4(fft6928, fft6928, 177));
__m512 fft7021 = _mm512_fmadd_ps(fft7013, fft6936, _mm512_shuffle_f32x4(fft7013, fft7013, 177));
__m512 fft6938 = _mm512_fmadd_ps(fft6929, fft6936, _mm512_shuffle_f32x4(fft6929, fft6929, 177));
__m512 fft7022 = _mm512_fmadd_ps(fft7014, fft6936, _mm512_shuffle_f32x4(fft7014, fft7014, 177));
__m512 fft6939 = _mm512_fmadd_ps(fft6930, fft6936, _mm512_shuffle_f32x4(fft6930, fft6930, 177));
__m512 fft7023 = _mm512_fmadd_ps(fft7015, fft6936, _mm512_shuffle_f32x4(fft7015, fft7015, 177));
__m512 fft6940 = _mm512_fmadd_ps(fft6931, fft6936, _mm512_shuffle_f32x4(fft6931, fft6931, 177));
__m512 fft7024 = _mm512_fmadd_ps(fft7016, fft6936, _mm512_shuffle_f32x4(fft7016, fft7016, 177));
__m512 fft6941 = _mm512_fmadd_ps(fft6932, fft6936, _mm512_shuffle_f32x4(fft6932, fft6932, 177));
__m512 fft7025 = _mm512_fmadd_ps(fft7017, fft6936, _mm512_shuffle_f32x4(fft7017, fft7017, 177));
__m512 fft6942 = _mm512_fmadd_ps(fft6933, fft6936, _mm512_shuffle_f32x4(fft6933, fft6933, 177));
__m512 fft7026 = _mm512_fmadd_ps(fft7018, fft6936, _mm512_shuffle_f32x4(fft7018, fft7018, 177));
__m512 fft6943 = _mm512_fmadd_ps(fft6934, fft6936, _mm512_shuffle_f32x4(fft6934, fft6934, 177));
__m512 fft7027 = _mm512_fmadd_ps(fft7019, fft6936, _mm512_shuffle_f32x4(fft7019, fft7019, 177));
__m512 fft6944 = _mm512_fmadd_ps(fft6935, fft6936, _mm512_shuffle_f32x4(fft6935, fft6935, 177));
__m512 fft7028 = _mm512_fmadd_ps(fft7020, fft6936, _mm512_shuffle_f32x4(fft7020, fft7020, 177));
__m512 fft6945 = _mm512_mask_mov_ps(fft6937, 49344, fft6938);
__m512 fft7029 = _mm512_mask_mov_ps(fft7021, 49344, fft7022);
__m512 fft6946 = _mm512_mask_sub_ps(fft6938, 49344, _mm512_setzero_ps(), fft6937);
__m512 fft7030 = _mm512_mask_sub_ps(fft7022, 49344, _mm512_setzero_ps(), fft7021);
__m512 fft6947 = _mm512_mask_mov_ps(fft6939, 49344, fft6940);
__m512 fft7031 = _mm512_mask_mov_ps(fft7023, 49344, fft7024);
__m512 fft6948 = _mm512_mask_sub_ps(fft6940, 49344, _mm512_setzero_ps(), fft6939);
__m512 fft7032 = _mm512_mask_sub_ps(fft7024, 49344, _mm512_setzero_ps(), fft7023);
__m512 fft6949 = _mm512_mask_mov_ps(fft6941, 49344, fft6942);
__m512 fft7033 = _mm512_mask_mov_ps(fft7025, 49344, fft7026);
__m512 fft6950 = _mm512_mask_sub_ps(fft6942, 49344, _mm512_setzero_ps(), fft6941);
__m512 fft7034 = _mm512_mask_sub_ps(fft7026, 49344, _mm512_setzero_ps(), fft7025);
__m512 fft6951 = _mm512_mask_mov_ps(fft6943, 49344, fft6944);
__m512 fft7035 = _mm512_mask_mov_ps(fft7027, 49344, fft7028);
__m512 fft6952 = _mm512_mask_sub_ps(fft6944, 49344, _mm512_setzero_ps(), fft6943);
__m512 fft7036 = _mm512_mask_sub_ps(fft7028, 49344, _mm512_setzero_ps(), fft7027);
__m512 fft6953 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft6954 = _mm512_fmadd_ps(fft6945, fft6953, _mm512_shuffle_ps(fft6945, fft6945, 78));
__m512 fft7037 = _mm512_fmadd_ps(fft7029, fft6953, _mm512_shuffle_ps(fft7029, fft7029, 78));
__m512 fft6955 = _mm512_fmadd_ps(fft6946, fft6953, _mm512_shuffle_ps(fft6946, fft6946, 78));
__m512 fft7038 = _mm512_fmadd_ps(fft7030, fft6953, _mm512_shuffle_ps(fft7030, fft7030, 78));
__m512 fft6956 = _mm512_fmadd_ps(fft6947, fft6953, _mm512_shuffle_ps(fft6947, fft6947, 78));
__m512 fft7039 = _mm512_fmadd_ps(fft7031, fft6953, _mm512_shuffle_ps(fft7031, fft7031, 78));
__m512 fft6957 = _mm512_fmadd_ps(fft6948, fft6953, _mm512_shuffle_ps(fft6948, fft6948, 78));
__m512 fft7040 = _mm512_fmadd_ps(fft7032, fft6953, _mm512_shuffle_ps(fft7032, fft7032, 78));
__m512 fft6958 = _mm512_fmadd_ps(fft6949, fft6953, _mm512_shuffle_ps(fft6949, fft6949, 78));
__m512 fft7041 = _mm512_fmadd_ps(fft7033, fft6953, _mm512_shuffle_ps(fft7033, fft7033, 78));
__m512 fft6959 = _mm512_fmadd_ps(fft6950, fft6953, _mm512_shuffle_ps(fft6950, fft6950, 78));
__m512 fft7042 = _mm512_fmadd_ps(fft7034, fft6953, _mm512_shuffle_ps(fft7034, fft7034, 78));
__m512 fft6960 = _mm512_fmadd_ps(fft6951, fft6953, _mm512_shuffle_ps(fft6951, fft6951, 78));
__m512 fft7043 = _mm512_fmadd_ps(fft7035, fft6953, _mm512_shuffle_ps(fft7035, fft7035, 78));
__m512 fft6961 = _mm512_fmadd_ps(fft6952, fft6953, _mm512_shuffle_ps(fft6952, fft6952, 78));
__m512 fft7044 = _mm512_fmadd_ps(fft7036, fft6953, _mm512_shuffle_ps(fft7036, fft7036, 78));
__m512i fft6962 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft6963 = _mm512_permutexvar_ps(fft6962, fft6954);
__m512 fft7045 = _mm512_permutexvar_ps(fft6962, fft7037);
__m512i fft6964 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft6965 = _mm512_permutexvar_ps(fft6964, fft6954);
__m512 fft7046 = _mm512_permutexvar_ps(fft6964, fft7037);
__m512 fft6966 = _mm512_permutexvar_ps(fft6962, fft6955);
__m512 fft7047 = _mm512_permutexvar_ps(fft6962, fft7038);
__m512 fft6967 = _mm512_permutexvar_ps(fft6964, fft6955);
__m512 fft7048 = _mm512_permutexvar_ps(fft6964, fft7038);
__m512 fft6968 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft6969 = _mm512_fmadd_ps(fft6963, fft6968, fft6965);
__m512 fft7049 = _mm512_fmadd_ps(fft7045, fft6968, fft7046);
__m512 fft6970 = _mm512_fnmadd_ps(fft6967, fft6968, fft6966);
__m512 fft7050 = _mm512_fnmadd_ps(fft7048, fft6968, fft7047);
__m512 fft6971 = _mm512_mask_mov_ps(fft6967, 21845, fft6969);
__m512 fft7051 = _mm512_mask_mov_ps(fft7048, 21845, fft7049);
__m512 fft6972 = _mm512_mask_mov_ps(fft6963, 43176, fft6969);
__m512 fft7052 = _mm512_mask_mov_ps(fft7045, 43176, fft7049);
__m512 fft6973 = _mm512_mask_mov_ps(fft6971, 43176, fft6970);
__m512 fft7053 = _mm512_mask_mov_ps(fft7051, 43176, fft7050);
__m512 fft6974 = _mm512_mask_mov_ps(fft6972, 22102, fft6970);
__m512 fft7054 = _mm512_mask_mov_ps(fft7052, 22102, fft7050);
__m512 fft6975 = _mm512_mask_mul_ps(fft6973, 64764, fft6973, _mm512_set1_ps(5e-01f));
__m512 fft7055 = _mm512_mask_mul_ps(fft7053, 64764, fft7053, _mm512_set1_ps(5e-01f));
__m512 fft6976 = _mm512_mask_mul_ps(fft6974, 64764, fft6974, _mm512_set1_ps(5e-01f));
__m512 fft7056 = _mm512_mask_mul_ps(fft7054, 64764, fft7054, _mm512_set1_ps(5e-01f));
__m512 df625 = fft6975;
__m512 df633 = fft7055;
__m512 df626 = fft6976;
__m512 df634 = fft7056;
__m512 df627 = fft6956;
__m512 df635 = fft7039;
__m512 df628 = fft6957;
__m512 df636 = fft7040;
__m512 df629 = fft6958;
__m512 df637 = fft7041;
__m512 df630 = fft6959;
__m512 df638 = fft7042;
__m512 df631 = fft6960;
__m512 df639 = fft7043;
__m512 df632 = fft6961;
__m512 df640 = fft7044;
__m512i eo42 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df627 = _mm512_permutexvar_ps(eo42, df627);
df628 = _mm512_permutexvar_ps(eo42, df628);
__m512 rep1 = _mm512_shuffle_f32x4(df627, df627, 68);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep1);
__m512 rep2 = _mm512_shuffle_f32x4(df628, df628, 68);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep2);
__m512 rep3 = _mm512_shuffle_f32x4(df627, df627, 238);
_mm512_mask_storeu_ps(dfPtr1+508800+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep3);
__m512 rep4 = _mm512_shuffle_f32x4(df628, df628, 238);
_mm512_mask_storeu_ps(dfPtr1+508864+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep4);
df635 = _mm512_permutexvar_ps(eo42, df635);
df636 = _mm512_permutexvar_ps(eo42, df636);
__m512 rep5 = _mm512_shuffle_f32x4(df635, df635, 68);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep5);
__m512 rep6 = _mm512_shuffle_f32x4(df636, df636, 68);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep6);
__m512 rep7 = _mm512_shuffle_f32x4(df635, df635, 238);
_mm512_mask_storeu_ps(dfPtr1+1322880+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep7);
__m512 rep8 = _mm512_shuffle_f32x4(df636, df636, 238);
_mm512_mask_storeu_ps(dfPtr1+1322944+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep8);
df629 = _mm512_permutexvar_ps(eo42, df629);
df630 = _mm512_permutexvar_ps(eo42, df630);
__m512 rep9 = _mm512_shuffle_f32x4(df629, df629, 68);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep9);
__m512 rep10 = _mm512_shuffle_f32x4(df630, df630, 68);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep10);
__m512 rep11 = _mm512_shuffle_f32x4(df629, df629, 238);
_mm512_mask_storeu_ps(dfPtr1+610560+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep11);
__m512 rep12 = _mm512_shuffle_f32x4(df630, df630, 238);
_mm512_mask_storeu_ps(dfPtr1+610624+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep12);
df637 = _mm512_permutexvar_ps(eo42, df637);
df638 = _mm512_permutexvar_ps(eo42, df638);
__m512 rep13 = _mm512_shuffle_f32x4(df637, df637, 68);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep13);
__m512 rep14 = _mm512_shuffle_f32x4(df638, df638, 68);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep14);
__m512 rep15 = _mm512_shuffle_f32x4(df637, df637, 238);
_mm512_mask_storeu_ps(dfPtr1+1424640+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep15);
__m512 rep16 = _mm512_shuffle_f32x4(df638, df638, 238);
_mm512_mask_storeu_ps(dfPtr1+1424704+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep16);
df631 = _mm512_permutexvar_ps(eo42, df631);
df632 = _mm512_permutexvar_ps(eo42, df632);
__m512 rep17 = _mm512_shuffle_f32x4(df631, df631, 68);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep17);
__m512 rep18 = _mm512_shuffle_f32x4(df632, df632, 68);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep18);
__m512 rep19 = _mm512_shuffle_f32x4(df631, df631, 238);
_mm512_mask_storeu_ps(dfPtr1+712320+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep19);
__m512 rep20 = _mm512_shuffle_f32x4(df632, df632, 238);
_mm512_mask_storeu_ps(dfPtr1+712384+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep20);
df639 = _mm512_permutexvar_ps(eo42, df639);
df640 = _mm512_permutexvar_ps(eo42, df640);
__m512 rep21 = _mm512_shuffle_f32x4(df639, df639, 68);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep21);
__m512 rep22 = _mm512_shuffle_f32x4(df640, df640, 68);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep22);
__m512 rep23 = _mm512_shuffle_f32x4(df639, df639, 238);
_mm512_mask_storeu_ps(dfPtr1+1526400+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep23);
__m512 rep24 = _mm512_shuffle_f32x4(df640, df640, 238);
_mm512_mask_storeu_ps(dfPtr1+1526464+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep24);
__m512 rep25 = _mm512_shuffle_f32x4(df625, df625, 68);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep25);
__m512 rep26 = _mm512_shuffle_f32x4(df626, df626, 68);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep26);
__m512 rep27 = _mm512_shuffle_f32x4(df625, df625, 238);
_mm512_mask_storeu_ps(dfPtr1+407040+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep27);
__m512 rep28 = _mm512_shuffle_f32x4(df626, df626, 238);
_mm512_mask_storeu_ps(dfPtr1+407104+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep28);
__m512 rep29 = _mm512_shuffle_f32x4(df633, df633, 68);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep29);
__m512 rep30 = _mm512_shuffle_f32x4(df634, df634, 68);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep30);
__m512 rep31 = _mm512_shuffle_f32x4(df633, df633, 238);
_mm512_mask_storeu_ps(dfPtr1+1221120+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep31);
__m512 rep32 = _mm512_shuffle_f32x4(df634, df634, 238);
_mm512_mask_storeu_ps(dfPtr1+1221184+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep32);
}
if (j2 >= last1) return;
++j2;
}

/*
 * Builds a threader task that invokes ResNet50StriderArrangeDats1Callee1
 * with tensors3 as its opaque argument, then submits it to team15 through
 * ResNet50ThreaderDo1. The task has nd1 = 4 and hull extents {1, 11, 1, 1}.
 *
 * NOTE(review): the task object lives on this stack frame, so
 * ResNet50ThreaderDo1 presumably finishes with it before returning --
 * confirm against the threader implementation.
 */
static void ResNet50StriderArrangeDats1(ResNet50ThreaderTeam1* team15, char** tensors3) {
	/* Per-dimension extents of the work hull; only index 1 is larger than 1. */
	static const int extents[4] = {1, 11, 1, 1};
	ResNet50ThreaderTask1 job;
	job.nd1 = 4;
	for (int dim = 0; dim < 4; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = tensors3;
	job.callee1 = ResNet50StriderArrangeDats1Callee1;
	ResNet50ThreaderDo1(team15, &job);
}

static void ResNet50StriderProduceSums1Callee1(ResNet50ThreaderTask1* task8, int64_t* pt9) {
void** tuple2 = task8->any1;
char** tensors6 = tuple2[0];
ptrdiff_t e3 = 0;
ptrdiff_t z2 = (ptrdiff_t)tuple2[2];
ptrdiff_t g4 = 0;
ptrdiff_t p1 = pt9[2];
ptrdiff_t d1 = pt9[1];
ptrdiff_t w20 = 0;
if (__builtin_expect(!(e3|z2), 0)) {
z2 = 0;
char*restrict bfPtr2 = tensors6[0]+256*e3;
char*restrict wfPtr2 = tensors6[0]+256+12976128*e3+24576*z2;
char*restrict dfPtr2 = tensors6[1]+214917120*e3+407040*z2;
char*restrict sfPtr1 = tensors6[2];
ptrdiff_t i7 = 1*g4;
ptrdiff_t j3 = 1*p1;
ptrdiff_t jj9 = j3+0;
if (__builtin_expect(!j3, 0)) {
ptrdiff_t k21 = 6*d1;
ptrdiff_t kk20 = k21+(d1 < 13 ? 5 : 10);
for (; k21 != 88; ++k21) {
ptrdiff_t l1 = 16*w20;
for (; l1 != 16; ++l1) {
__m512 sfRe1 = _mm512_setzero_ps();
__m512 sfIm1 = _mm512_setzero_ps();
__m512 sfRe7 = _mm512_setzero_ps();
__m512 sfIm7 = _mm512_setzero_ps();
sfRe1 = _mm512_mask_mov_ps(sfRe1, 1, _mm512_set1_ps(*(float*)(bfPtr2+0+256*i7+16*l1)));
sfRe1 = _mm512_mask_mov_ps(sfRe1, 256, _mm512_set1_ps(*(float*)(bfPtr2+4+256*i7+16*l1)));
sfRe7 = _mm512_mask_mov_ps(sfRe7, 1, _mm512_set1_ps(*(float*)(bfPtr2+8+256*i7+16*l1)));
sfRe7 = _mm512_mask_mov_ps(sfRe7, 256, _mm512_set1_ps(*(float*)(bfPtr2+12+256*i7+16*l1)));
__m512 sfRe2 = sfRe1;
__m512 sfIm2 = sfIm1;
__m512 sfRe3 = sfRe1;
__m512 sfIm3 = sfIm1;
__m512 sfRe4 = sfRe1;
__m512 sfIm4 = sfIm1;
__m512 sfRe5 = sfRe1;
__m512 sfIm5 = sfIm1;
__m512 sfRe6 = sfRe1;
__m512 sfIm6 = sfIm1;
__m512 sfRe8 = sfRe7;
__m512 sfIm8 = sfIm7;
__m512 sfRe9 = sfRe7;
__m512 sfIm9 = sfIm7;
__m512 sfRe10 = sfRe7;
__m512 sfIm10 = sfIm7;
__m512 sfRe11 = sfRe7;
__m512 sfIm11 = sfIm7;
__m512 sfRe12 = sfRe7;
__m512 sfIm12 = sfIm7;
for (ptrdiff_t s2 = 0; s2 < 3; ++s2) {
__m512i wfLd1 = _mm512_loadu_si512(wfPtr2+0+24576*i7+6144*j3+384*l1+128*s2);
__m512 wfRe1 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd1));
__m512 wfIm1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd1, 1));
__m512 wfMx1 = _mm512_mask_mov_ps(wfIm1, 64764, wfRe1);
__m512i wfLd2 = _mm512_loadu_si512(wfPtr2+64+24576*i7+6144*j3+384*l1+128*s2);
__m512 wfRe2 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd2));
__m512 wfIm2 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd2, 1));
__m512 wfMx2 = _mm512_mask_mov_ps(wfIm2, 64764, wfRe2);
__m512 dfRe1 = _mm512_loadu_ps(dfPtr2+0+407040*i7+101760*j3+1152*k21+384*s2);
__m512 dfIm1 = _mm512_loadu_ps(dfPtr2+64+407040*i7+101760*j3+1152*k21+384*s2);
sfRe1 = _mm512_fmadd_ps(wfRe1, dfRe1, sfRe1);
sfRe1 = _mm512_mask3_fmadd_ps(wfIm1, dfIm1, sfRe1, 64764);
sfIm1 = _mm512_fmadd_ps(wfMx1, dfIm1, sfIm1);
sfIm1 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe1, sfIm1, 64764);
sfRe7 = _mm512_fmadd_ps(wfRe2, dfRe1, sfRe7);
sfRe7 = _mm512_mask3_fmadd_ps(wfIm2, dfIm1, sfRe7, 64764);
sfIm7 = _mm512_fmadd_ps(wfMx2, dfIm1, sfIm7);
sfIm7 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe1, sfIm7, 64764);
dfRe1 = _mm512_shuffle_f32x4(dfRe1, dfRe1, 78);
dfIm1 = _mm512_shuffle_f32x4(dfIm1, dfIm1, 78);
sfRe2 = _mm512_fmadd_ps(wfRe1, dfRe1, sfRe2);
sfRe2 = _mm512_mask3_fmadd_ps(wfIm1, dfIm1, sfRe2, 64764);
sfIm2 = _mm512_fmadd_ps(wfMx1, dfIm1, sfIm2);
sfIm2 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe1, sfIm2, 64764);
sfRe8 = _mm512_fmadd_ps(wfRe2, dfRe1, sfRe8);
sfRe8 = _mm512_mask3_fmadd_ps(wfIm2, dfIm1, sfRe8, 64764);
sfIm8 = _mm512_fmadd_ps(wfMx2, dfIm1, sfIm8);
sfIm8 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe1, sfIm8, 64764);
__m512 dfRe2 = _mm512_loadu_ps(dfPtr2+128+407040*i7+101760*j3+1152*k21+384*s2);
__m512 dfIm2 = _mm512_loadu_ps(dfPtr2+192+407040*i7+101760*j3+1152*k21+384*s2);
sfRe3 = _mm512_fmadd_ps(wfRe1, dfRe2, sfRe3);
sfRe3 = _mm512_mask3_fmadd_ps(wfIm1, dfIm2, sfRe3, 64764);
sfIm3 = _mm512_fmadd_ps(wfMx1, dfIm2, sfIm3);
sfIm3 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe2, sfIm3, 64764);
sfRe9 = _mm512_fmadd_ps(wfRe2, dfRe2, sfRe9);
sfRe9 = _mm512_mask3_fmadd_ps(wfIm2, dfIm2, sfRe9, 64764);
sfIm9 = _mm512_fmadd_ps(wfMx2, dfIm2, sfIm9);
sfIm9 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe2, sfIm9, 64764);
dfRe2 = _mm512_shuffle_f32x4(dfRe2, dfRe2, 78);
dfIm2 = _mm512_shuffle_f32x4(dfIm2, dfIm2, 78);
sfRe4 = _mm512_fmadd_ps(wfRe1, dfRe2, sfRe4);
sfRe4 = _mm512_mask3_fmadd_ps(wfIm1, dfIm2, sfRe4, 64764);
sfIm4 = _mm512_fmadd_ps(wfMx1, dfIm2, sfIm4);
sfIm4 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe2, sfIm4, 64764);
sfRe10 = _mm512_fmadd_ps(wfRe2, dfRe2, sfRe10);
sfRe10 = _mm512_mask3_fmadd_ps(wfIm2, dfIm2, sfRe10, 64764);
sfIm10 = _mm512_fmadd_ps(wfMx2, dfIm2, sfIm10);
sfIm10 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe2, sfIm10, 64764);
__m512 dfRe3 = _mm512_loadu_ps(dfPtr2+256+407040*i7+101760*j3+1152*k21+384*s2);
__m512 dfIm3 = _mm512_loadu_ps(dfPtr2+320+407040*i7+101760*j3+1152*k21+384*s2);
sfRe5 = _mm512_fmadd_ps(wfRe1, dfRe3, sfRe5);
sfRe5 = _mm512_mask3_fmadd_ps(wfIm1, dfIm3, sfRe5, 64764);
sfIm5 = _mm512_fmadd_ps(wfMx1, dfIm3, sfIm5);
sfIm5 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe3, sfIm5, 64764);
sfRe11 = _mm512_fmadd_ps(wfRe2, dfRe3, sfRe11);
sfRe11 = _mm512_mask3_fmadd_ps(wfIm2, dfIm3, sfRe11, 64764);
sfIm11 = _mm512_fmadd_ps(wfMx2, dfIm3, sfIm11);
sfIm11 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe3, sfIm11, 64764);
dfRe3 = _mm512_shuffle_f32x4(dfRe3, dfRe3, 78);
dfIm3 = _mm512_shuffle_f32x4(dfIm3, dfIm3, 78);
sfRe6 = _mm512_fmadd_ps(wfRe1, dfRe3, sfRe6);
sfRe6 = _mm512_mask3_fmadd_ps(wfIm1, dfIm3, sfRe6, 64764);
sfIm6 = _mm512_fmadd_ps(wfMx1, dfIm3, sfIm6);
sfIm6 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe3, sfIm6, 64764);
sfRe12 = _mm512_fmadd_ps(wfRe2, dfRe3, sfRe12);
sfRe12 = _mm512_mask3_fmadd_ps(wfIm2, dfIm3, sfRe12, 64764);
sfIm12 = _mm512_fmadd_ps(wfMx2, dfIm3, sfIm12);
sfIm12 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe3, sfIm12, 64764);
}
_mm512_storeu_ps(sfPtr1+0+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe1);
_mm512_storeu_ps(sfPtr1+64+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm1);
_mm512_storeu_ps(sfPtr1+128+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe2);
_mm512_storeu_ps(sfPtr1+192+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm2);
_mm512_storeu_ps(sfPtr1+256+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe3);
_mm512_storeu_ps(sfPtr1+320+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm3);
_mm512_storeu_ps(sfPtr1+384+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe4);
_mm512_storeu_ps(sfPtr1+448+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm4);
_mm512_storeu_ps(sfPtr1+512+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe5);
_mm512_storeu_ps(sfPtr1+576+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm5);
_mm512_storeu_ps(sfPtr1+640+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe6);
_mm512_storeu_ps(sfPtr1+704+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm6);
_mm512_storeu_ps(sfPtr1+768+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe7);
_mm512_storeu_ps(sfPtr1+832+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm7);
_mm512_storeu_ps(sfPtr1+896+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe8);
_mm512_storeu_ps(sfPtr1+960+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm8);
_mm512_storeu_ps(sfPtr1+1024+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe9);
_mm512_storeu_ps(sfPtr1+1088+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm9);
_mm512_storeu_ps(sfPtr1+1152+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe10);
_mm512_storeu_ps(sfPtr1+1216+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm10);
_mm512_storeu_ps(sfPtr1+1280+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe11);
_mm512_storeu_ps(sfPtr1+1344+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm11);
_mm512_storeu_ps(sfPtr1+1408+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe12);
_mm512_storeu_ps(sfPtr1+1472+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm12);
}
if (k21 >= kk20) return;
}
ptrdiff_t l2 = 16*w20;
for (; l2 != 16; ++l2) {
__m512 sfRe13 = _mm512_setzero_ps();
__m512 sfIm13 = _mm512_setzero_ps();
__m512 sfRe14 = _mm512_setzero_ps();
__m512 sfIm14 = _mm512_setzero_ps();
sfRe13 = _mm512_mask_mov_ps(sfRe13, 1, _mm512_set1_ps(*(float*)(bfPtr2+0+256*i7+16*l2)));
sfRe13 = _mm512_mask_mov_ps(sfRe13, 256, _mm512_set1_ps(*(float*)(bfPtr2+4+256*i7+16*l2)));
sfRe14 = _mm512_mask_mov_ps(sfRe14, 1, _mm512_set1_ps(*(float*)(bfPtr2+8+256*i7+16*l2)));
sfRe14 = _mm512_mask_mov_ps(sfRe14, 256, _mm512_set1_ps(*(float*)(bfPtr2+12+256*i7+16*l2)));
for (ptrdiff_t s3 = 0; s3 < 3; ++s3) {
__m512i wfLd3 = _mm512_loadu_si512(wfPtr2+0+24576*i7+6144*j3+384*l2+128*s3);
__m512 wfRe3 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd3));
__m512 wfIm3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd3, 1));
__m512 wfMx3 = _mm512_mask_mov_ps(wfIm3, 64764, wfRe3);
__m512i wfLd4 = _mm512_loadu_si512(wfPtr2+64+24576*i7+6144*j3+384*l2+128*s3);
__m512 wfRe4 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd4));
__m512 wfIm4 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd4, 1));
__m512 wfMx4 = _mm512_mask_mov_ps(wfIm4, 64764, wfRe4);
__m512 dfRe4 = _mm512_loadu_ps(dfPtr2+0+407040*i7+101760*j3+1152*k21+128*s3);
__m512 dfIm4 = _mm512_loadu_ps(dfPtr2+64+407040*i7+101760*j3+1152*k21+128*s3);
sfRe13 = _mm512_fmadd_ps(wfRe3, dfRe4, sfRe13);
sfRe13 = _mm512_mask3_fmadd_ps(wfIm3, dfIm4, sfRe13, 64764);
sfIm13 = _mm512_fmadd_ps(wfMx3, dfIm4, sfIm13);
sfIm13 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe4, sfIm13, 64764);
sfRe14 = _mm512_fmadd_ps(wfRe4, dfRe4, sfRe14);
sfRe14 = _mm512_mask3_fmadd_ps(wfIm4, dfIm4, sfRe14, 64764);
sfIm14 = _mm512_fmadd_ps(wfMx4, dfIm4, sfIm14);
sfIm14 = _mm512_mask3_fnmadd_ps(wfIm4, dfRe4, sfIm14, 64764);
}
_mm512_storeu_ps(sfPtr1+0+8667136*i7+2166784*j3+24576*k21+256*l2, sfRe13);
_mm512_storeu_ps(sfPtr1+64+8667136*i7+2166784*j3+24576*k21+256*l2, sfIm13);
_mm512_storeu_ps(sfPtr1+128+8667136*i7+2166784*j3+24576*k21+256*l2, sfRe14);
_mm512_storeu_ps(sfPtr1+192+8667136*i7+2166784*j3+24576*k21+256*l2, sfIm14);
}
j3 = 1;
}
for (; j3 <= jj9; ++j3) {
ptrdiff_t k22 = 6*d1;
ptrdiff_t kk21 = k22+(d1 < 13 ? 5 : 10);
for (; k22 != 88; ++k22) {
ptrdiff_t l3 = 16*w20;
for (; l3 != 16; ++l3) {
__m512 sfRe15 = _mm512_setzero_ps();
__m512 sfIm15 = _mm512_setzero_ps();
__m512 sfRe21 = _mm512_setzero_ps();
__m512 sfIm21 = _mm512_setzero_ps();
(void)bfPtr2;
__m512 sfRe16 = sfRe15;
__m512 sfIm16 = sfIm15;
__m512 sfRe17 = sfRe15;
__m512 sfIm17 = sfIm15;
__m512 sfRe18 = sfRe15;
__m512 sfIm18 = sfIm15;
__m512 sfRe19 = sfRe15;
__m512 sfIm19 = sfIm15;
__m512 sfRe20 = sfRe15;
__m512 sfIm20 = sfIm15;
__m512 sfRe22 = sfRe21;
__m512 sfIm22 = sfIm21;
__m512 sfRe23 = sfRe21;
__m512 sfIm23 = sfIm21;
__m512 sfRe24 = sfRe21;
__m512 sfIm24 = sfIm21;
__m512 sfRe25 = sfRe21;
__m512 sfIm25 = sfIm21;
__m512 sfRe26 = sfRe21;
__m512 sfIm26 = sfIm21;
for (ptrdiff_t s4 = 0; s4 < 3; ++s4) {
__m512i wfLd5 = _mm512_loadu_si512(wfPtr2+0+24576*i7+6144*j3+384*l3+128*s4);
__m512 wfRe5 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd5));
__m512 wfIm5 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd5, 1));
__m512i wfLd6 = _mm512_loadu_si512(wfPtr2+64+24576*i7+6144*j3+384*l3+128*s4);
__m512 wfRe6 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd6));
__m512 wfIm6 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd6, 1));
__m512 dfRe5 = _mm512_loadu_ps(dfPtr2+0+407040*i7+101760*j3+1152*k22+384*s4);
__m512 dfIm5 = _mm512_loadu_ps(dfPtr2+64+407040*i7+101760*j3+1152*k22+384*s4);
sfRe15 = _mm512_fmadd_ps(wfRe5, dfRe5, sfRe15);
sfRe15 = _mm512_fmadd_ps(wfIm5, dfIm5, sfRe15);
sfIm15 = _mm512_fmadd_ps(wfRe5, dfIm5, sfIm15);
sfIm15 = _mm512_fnmadd_ps(wfIm5, dfRe5, sfIm15);
sfRe21 = _mm512_fmadd_ps(wfRe6, dfRe5, sfRe21);
sfRe21 = _mm512_fmadd_ps(wfIm6, dfIm5, sfRe21);
sfIm21 = _mm512_fmadd_ps(wfRe6, dfIm5, sfIm21);
sfIm21 = _mm512_fnmadd_ps(wfIm6, dfRe5, sfIm21);
dfRe5 = _mm512_shuffle_f32x4(dfRe5, dfRe5, 78);
dfIm5 = _mm512_shuffle_f32x4(dfIm5, dfIm5, 78);
sfRe16 = _mm512_fmadd_ps(wfRe5, dfRe5, sfRe16);
sfRe16 = _mm512_fmadd_ps(wfIm5, dfIm5, sfRe16);
sfIm16 = _mm512_fmadd_ps(wfRe5, dfIm5, sfIm16);
sfIm16 = _mm512_fnmadd_ps(wfIm5, dfRe5, sfIm16);
sfRe22 = _mm512_fmadd_ps(wfRe6, dfRe5, sfRe22);
sfRe22 = _mm512_fmadd_ps(wfIm6, dfIm5, sfRe22);
sfIm22 = _mm512_fmadd_ps(wfRe6, dfIm5, sfIm22);
sfIm22 = _mm512_fnmadd_ps(wfIm6, dfRe5, sfIm22);
__m512 dfRe6 = _mm512_loadu_ps(dfPtr2+128+407040*i7+101760*j3+1152*k22+384*s4);
__m512 dfIm6 = _mm512_loadu_ps(dfPtr2+192+407040*i7+101760*j3+1152*k22+384*s4);
sfRe17 = _mm512_fmadd_ps(wfRe5, dfRe6, sfRe17);
sfRe17 = _mm512_fmadd_ps(wfIm5, dfIm6, sfRe17);
sfIm17 = _mm512_fmadd_ps(wfRe5, dfIm6, sfIm17);
sfIm17 = _mm512_fnmadd_ps(wfIm5, dfRe6, sfIm17);
sfRe23 = _mm512_fmadd_ps(wfRe6, dfRe6, sfRe23);
sfRe23 = _mm512_fmadd_ps(wfIm6, dfIm6, sfRe23);
sfIm23 = _mm512_fmadd_ps(wfRe6, dfIm6, sfIm23);
sfIm23 = _mm512_fnmadd_ps(wfIm6, dfRe6, sfIm23);
dfRe6 = _mm512_shuffle_f32x4(dfRe6, dfRe6, 78);
dfIm6 = _mm512_shuffle_f32x4(dfIm6, dfIm6, 78);
sfRe18 = _mm512_fmadd_ps(wfRe5, dfRe6, sfRe18);
sfRe18 = _mm512_fmadd_ps(wfIm5, dfIm6, sfRe18);
sfIm18 = _mm512_fmadd_ps(wfRe5, dfIm6, sfIm18);
sfIm18 = _mm512_fnmadd_ps(wfIm5, dfRe6, sfIm18);
sfRe24 = _mm512_fmadd_ps(wfRe6, dfRe6, sfRe24);
sfRe24 = _mm512_fmadd_ps(wfIm6, dfIm6, sfRe24);
sfIm24 = _mm512_fmadd_ps(wfRe6, dfIm6, sfIm24);
sfIm24 = _mm512_fnmadd_ps(wfIm6, dfRe6, sfIm24);
__m512 dfRe7 = _mm512_loadu_ps(dfPtr2+256+407040*i7+101760*j3+1152*k22+384*s4);
__m512 dfIm7 = _mm512_loadu_ps(dfPtr2+320+407040*i7+101760*j3+1152*k22+384*s4);
sfRe19 = _mm512_fmadd_ps(wfRe5, dfRe7, sfRe19);
sfRe19 = _mm512_fmadd_ps(wfIm5, dfIm7, sfRe19);
sfIm19 = _mm512_fmadd_ps(wfRe5, dfIm7, sfIm19);
sfIm19 = _mm512_fnmadd_ps(wfIm5, dfRe7, sfIm19);
sfRe25 = _mm512_fmadd_ps(wfRe6, dfRe7, sfRe25);
sfRe25 = _mm512_fmadd_ps(wfIm6, dfIm7, sfRe25);
sfIm25 = _mm512_fmadd_ps(wfRe6, dfIm7, sfIm25);
sfIm25 = _mm512_fnmadd_ps(wfIm6, dfRe7, sfIm25);
dfRe7 = _mm512_shuffle_f32x4(dfRe7, dfRe7, 78);
dfIm7 = _mm512_shuffle_f32x4(dfIm7, dfIm7, 78);
sfRe20 = _mm512_fmadd_ps(wfRe5, dfRe7, sfRe20);
sfRe20 = _mm512_fmadd_ps(wfIm5, dfIm7, sfRe20);
sfIm20 = _mm512_fmadd_ps(wfRe5, dfIm7, sfIm20);
sfIm20 = _mm512_fnmadd_ps(wfIm5, dfRe7, sfIm20);
sfRe26 = _mm512_fmadd_ps(wfRe6, dfRe7, sfRe26);
sfRe26 = _mm512_fmadd_ps(wfIm6, dfIm7, sfRe26);
sfIm26 = _mm512_fmadd_ps(wfRe6, dfIm7, sfIm26);
sfIm26 = _mm512_fnmadd_ps(wfIm6, dfRe7, sfIm26);
}
_mm512_storeu_ps(sfPtr1+0+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe15);
_mm512_storeu_ps(sfPtr1+64+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm15);
_mm512_storeu_ps(sfPtr1+128+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe16);
_mm512_storeu_ps(sfPtr1+192+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm16);
_mm512_storeu_ps(sfPtr1+256+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe17);
_mm512_storeu_ps(sfPtr1+320+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm17);
_mm512_storeu_ps(sfPtr1+384+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe18);
_mm512_storeu_ps(sfPtr1+448+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm18);
_mm512_storeu_ps(sfPtr1+512+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe19);
_mm512_storeu_ps(sfPtr1+576+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm19);
_mm512_storeu_ps(sfPtr1+640+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe20);
_mm512_storeu_ps(sfPtr1+704+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm20);
_mm512_storeu_ps(sfPtr1+768+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe21);
_mm512_storeu_ps(sfPtr1+832+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm21);
_mm512_storeu_ps(sfPtr1+896+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe22);
_mm512_storeu_ps(sfPtr1+960+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm22);
_mm512_storeu_ps(sfPtr1+1024+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe23);
_mm512_storeu_ps(sfPtr1+1088+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm23);
_mm512_storeu_ps(sfPtr1+1152+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe24);
_mm512_storeu_ps(sfPtr1+1216+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm24);
_mm512_storeu_ps(sfPtr1+1280+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe25);
_mm512_storeu_ps(sfPtr1+1344+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm25);
_mm512_storeu_ps(sfPtr1+1408+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe26);
_mm512_storeu_ps(sfPtr1+1472+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm26);
}
if (k22 >= kk21) return;
}
ptrdiff_t l4 = 16*w20;
for (; l4 != 16; ++l4) {
__m512 sfRe27 = _mm512_setzero_ps();
__m512 sfIm27 = _mm512_setzero_ps();
__m512 sfRe28 = _mm512_setzero_ps();
__m512 sfIm28 = _mm512_setzero_ps();
(void)bfPtr2;
for (ptrdiff_t s5 = 0; s5 < 3; ++s5) {
__m512i wfLd7 = _mm512_loadu_si512(wfPtr2+0+24576*i7+6144*j3+384*l4+128*s5);
__m512 wfRe7 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd7));
__m512 wfIm7 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd7, 1));
__m512i wfLd8 = _mm512_loadu_si512(wfPtr2+64+24576*i7+6144*j3+384*l4+128*s5);
__m512 wfRe8 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd8));
__m512 wfIm8 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd8, 1));
__m512 dfRe8 = _mm512_loadu_ps(dfPtr2+0+407040*i7+101760*j3+1152*k22+128*s5);
__m512 dfIm8 = _mm512_loadu_ps(dfPtr2+64+407040*i7+101760*j3+1152*k22+128*s5);
sfRe27 = _mm512_fmadd_ps(wfRe7, dfRe8, sfRe27);
sfRe27 = _mm512_fmadd_ps(wfIm7, dfIm8, sfRe27);
sfIm27 = _mm512_fmadd_ps(wfRe7, dfIm8, sfIm27);
sfIm27 = _mm512_fnmadd_ps(wfIm7, dfRe8, sfIm27);
sfRe28 = _mm512_fmadd_ps(wfRe8, dfRe8, sfRe28);
sfRe28 = _mm512_fmadd_ps(wfIm8, dfIm8, sfRe28);
sfIm28 = _mm512_fmadd_ps(wfRe8, dfIm8, sfIm28);
sfIm28 = _mm512_fnmadd_ps(wfIm8, dfRe8, sfIm28);
}
_mm512_storeu_ps(sfPtr1+0+8667136*i7+2166784*j3+24576*k22+256*l4, sfRe27);
_mm512_storeu_ps(sfPtr1+64+8667136*i7+2166784*j3+24576*k22+256*l4, sfIm27);
_mm512_storeu_ps(sfPtr1+128+8667136*i7+2166784*j3+24576*k22+256*l4, sfRe28);
_mm512_storeu_ps(sfPtr1+192+8667136*i7+2166784*j3+24576*k22+256*l4, sfIm28);
}
}
return;
}
char*restrict bfPtr3 = tensors6[0]+256*e3;
char*restrict wfPtr3 = tensors6[0]+256+12976128*e3+24576*z2;
char*restrict dfPtr3 = tensors6[1]+214917120*e3+407040*z2;
char*restrict sfPtr2 = tensors6[2];
ptrdiff_t i8 = 1*g4;
ptrdiff_t j4 = 1*p1;
ptrdiff_t jj10 = j4+0;
if (__builtin_expect(!j4, 0)) {
ptrdiff_t k23 = 6*d1;
ptrdiff_t kk22 = k23+(d1 < 13 ? 5 : 10);
for (; k23 != 88; ++k23) {
ptrdiff_t l5 = 16*w20;
for (; l5 != 16; ++l5) {
__m512 sfRe29 = _mm512_setzero_ps();
__m512 sfIm29 = _mm512_setzero_ps();
__m512 sfRe35 = _mm512_setzero_ps();
__m512 sfIm35 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe30 = sfRe29;
__m512 sfIm30 = sfIm29;
__m512 sfRe31 = sfRe29;
__m512 sfIm31 = sfIm29;
__m512 sfRe32 = sfRe29;
__m512 sfIm32 = sfIm29;
__m512 sfRe33 = sfRe29;
__m512 sfIm33 = sfIm29;
__m512 sfRe34 = sfRe29;
__m512 sfIm34 = sfIm29;
__m512 sfRe36 = sfRe35;
__m512 sfIm36 = sfIm35;
__m512 sfRe37 = sfRe35;
__m512 sfIm37 = sfIm35;
__m512 sfRe38 = sfRe35;
__m512 sfIm38 = sfIm35;
__m512 sfRe39 = sfRe35;
__m512 sfIm39 = sfIm35;
__m512 sfRe40 = sfRe35;
__m512 sfIm40 = sfIm35;
for (ptrdiff_t s6 = 0; s6 < 3; ++s6) {
__m512i wfLd9 = _mm512_loadu_si512(wfPtr3+0+24576*i8+6144*j4+384*l5+128*s6);
__m512 wfRe9 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd9));
__m512 wfIm9 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd9, 1));
__m512 wfMx5 = _mm512_mask_mov_ps(wfIm9, 64764, wfRe9);
__m512i wfLd10 = _mm512_loadu_si512(wfPtr3+64+24576*i8+6144*j4+384*l5+128*s6);
__m512 wfRe10 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd10));
__m512 wfIm10 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd10, 1));
__m512 wfMx6 = _mm512_mask_mov_ps(wfIm10, 64764, wfRe10);
__m512 dfRe9 = _mm512_loadu_ps(dfPtr3+0+407040*i8+101760*j4+1152*k23+384*s6);
__m512 dfIm9 = _mm512_loadu_ps(dfPtr3+64+407040*i8+101760*j4+1152*k23+384*s6);
sfRe29 = _mm512_fmadd_ps(wfRe9, dfRe9, sfRe29);
sfRe29 = _mm512_mask3_fmadd_ps(wfIm9, dfIm9, sfRe29, 64764);
sfIm29 = _mm512_fmadd_ps(wfMx5, dfIm9, sfIm29);
sfIm29 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe9, sfIm29, 64764);
sfRe35 = _mm512_fmadd_ps(wfRe10, dfRe9, sfRe35);
sfRe35 = _mm512_mask3_fmadd_ps(wfIm10, dfIm9, sfRe35, 64764);
sfIm35 = _mm512_fmadd_ps(wfMx6, dfIm9, sfIm35);
sfIm35 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe9, sfIm35, 64764);
dfRe9 = _mm512_shuffle_f32x4(dfRe9, dfRe9, 78);
dfIm9 = _mm512_shuffle_f32x4(dfIm9, dfIm9, 78);
sfRe30 = _mm512_fmadd_ps(wfRe9, dfRe9, sfRe30);
sfRe30 = _mm512_mask3_fmadd_ps(wfIm9, dfIm9, sfRe30, 64764);
sfIm30 = _mm512_fmadd_ps(wfMx5, dfIm9, sfIm30);
sfIm30 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe9, sfIm30, 64764);
sfRe36 = _mm512_fmadd_ps(wfRe10, dfRe9, sfRe36);
sfRe36 = _mm512_mask3_fmadd_ps(wfIm10, dfIm9, sfRe36, 64764);
sfIm36 = _mm512_fmadd_ps(wfMx6, dfIm9, sfIm36);
sfIm36 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe9, sfIm36, 64764);
__m512 dfRe10 = _mm512_loadu_ps(dfPtr3+128+407040*i8+101760*j4+1152*k23+384*s6);
__m512 dfIm10 = _mm512_loadu_ps(dfPtr3+192+407040*i8+101760*j4+1152*k23+384*s6);
sfRe31 = _mm512_fmadd_ps(wfRe9, dfRe10, sfRe31);
sfRe31 = _mm512_mask3_fmadd_ps(wfIm9, dfIm10, sfRe31, 64764);
sfIm31 = _mm512_fmadd_ps(wfMx5, dfIm10, sfIm31);
sfIm31 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe10, sfIm31, 64764);
sfRe37 = _mm512_fmadd_ps(wfRe10, dfRe10, sfRe37);
sfRe37 = _mm512_mask3_fmadd_ps(wfIm10, dfIm10, sfRe37, 64764);
sfIm37 = _mm512_fmadd_ps(wfMx6, dfIm10, sfIm37);
sfIm37 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe10, sfIm37, 64764);
dfRe10 = _mm512_shuffle_f32x4(dfRe10, dfRe10, 78);
dfIm10 = _mm512_shuffle_f32x4(dfIm10, dfIm10, 78);
sfRe32 = _mm512_fmadd_ps(wfRe9, dfRe10, sfRe32);
sfRe32 = _mm512_mask3_fmadd_ps(wfIm9, dfIm10, sfRe32, 64764);
sfIm32 = _mm512_fmadd_ps(wfMx5, dfIm10, sfIm32);
sfIm32 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe10, sfIm32, 64764);
sfRe38 = _mm512_fmadd_ps(wfRe10, dfRe10, sfRe38);
sfRe38 = _mm512_mask3_fmadd_ps(wfIm10, dfIm10, sfRe38, 64764);
sfIm38 = _mm512_fmadd_ps(wfMx6, dfIm10, sfIm38);
sfIm38 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe10, sfIm38, 64764);
__m512 dfRe11 = _mm512_loadu_ps(dfPtr3+256+407040*i8+101760*j4+1152*k23+384*s6);
__m512 dfIm11 = _mm512_loadu_ps(dfPtr3+320+407040*i8+101760*j4+1152*k23+384*s6);
sfRe33 = _mm512_fmadd_ps(wfRe9, dfRe11, sfRe33);
sfRe33 = _mm512_mask3_fmadd_ps(wfIm9, dfIm11, sfRe33, 64764);
sfIm33 = _mm512_fmadd_ps(wfMx5, dfIm11, sfIm33);
sfIm33 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe11, sfIm33, 64764);
sfRe39 = _mm512_fmadd_ps(wfRe10, dfRe11, sfRe39);
sfRe39 = _mm512_mask3_fmadd_ps(wfIm10, dfIm11, sfRe39, 64764);
sfIm39 = _mm512_fmadd_ps(wfMx6, dfIm11, sfIm39);
sfIm39 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe11, sfIm39, 64764);
dfRe11 = _mm512_shuffle_f32x4(dfRe11, dfRe11, 78);
dfIm11 = _mm512_shuffle_f32x4(dfIm11, dfIm11, 78);
sfRe34 = _mm512_fmadd_ps(wfRe9, dfRe11, sfRe34);
sfRe34 = _mm512_mask3_fmadd_ps(wfIm9, dfIm11, sfRe34, 64764);
sfIm34 = _mm512_fmadd_ps(wfMx5, dfIm11, sfIm34);
sfIm34 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe11, sfIm34, 64764);
sfRe40 = _mm512_fmadd_ps(wfRe10, dfRe11, sfRe40);
sfRe40 = _mm512_mask3_fmadd_ps(wfIm10, dfIm11, sfRe40, 64764);
sfIm40 = _mm512_fmadd_ps(wfMx6, dfIm11, sfIm40);
sfIm40 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe11, sfIm40, 64764);
}
sfRe29 = _mm512_add_ps(sfRe29, _mm512_loadu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm29 = _mm512_add_ps(sfIm29, _mm512_loadu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe30 = _mm512_add_ps(sfRe30, _mm512_loadu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm30 = _mm512_add_ps(sfIm30, _mm512_loadu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe31 = _mm512_add_ps(sfRe31, _mm512_loadu_ps(sfPtr2+256+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm31 = _mm512_add_ps(sfIm31, _mm512_loadu_ps(sfPtr2+320+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe32 = _mm512_add_ps(sfRe32, _mm512_loadu_ps(sfPtr2+384+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm32 = _mm512_add_ps(sfIm32, _mm512_loadu_ps(sfPtr2+448+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe33 = _mm512_add_ps(sfRe33, _mm512_loadu_ps(sfPtr2+512+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm33 = _mm512_add_ps(sfIm33, _mm512_loadu_ps(sfPtr2+576+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe34 = _mm512_add_ps(sfRe34, _mm512_loadu_ps(sfPtr2+640+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm34 = _mm512_add_ps(sfIm34, _mm512_loadu_ps(sfPtr2+704+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe35 = _mm512_add_ps(sfRe35, _mm512_loadu_ps(sfPtr2+768+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm35 = _mm512_add_ps(sfIm35, _mm512_loadu_ps(sfPtr2+832+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe36 = _mm512_add_ps(sfRe36, _mm512_loadu_ps(sfPtr2+896+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm36 = _mm512_add_ps(sfIm36, _mm512_loadu_ps(sfPtr2+960+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe37 = _mm512_add_ps(sfRe37, _mm512_loadu_ps(sfPtr2+1024+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm37 = _mm512_add_ps(sfIm37, _mm512_loadu_ps(sfPtr2+1088+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe38 = _mm512_add_ps(sfRe38, _mm512_loadu_ps(sfPtr2+1152+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm38 = _mm512_add_ps(sfIm38, _mm512_loadu_ps(sfPtr2+1216+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe39 = _mm512_add_ps(sfRe39, _mm512_loadu_ps(sfPtr2+1280+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm39 = _mm512_add_ps(sfIm39, _mm512_loadu_ps(sfPtr2+1344+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe40 = _mm512_add_ps(sfRe40, _mm512_loadu_ps(sfPtr2+1408+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm40 = _mm512_add_ps(sfIm40, _mm512_loadu_ps(sfPtr2+1472+8667136*i8+2166784*j4+24576*k23+1536*l5));
_mm512_storeu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe29);
_mm512_storeu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm29);
_mm512_storeu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe30);
_mm512_storeu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm30);
_mm512_storeu_ps(sfPtr2+256+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe31);
_mm512_storeu_ps(sfPtr2+320+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm31);
_mm512_storeu_ps(sfPtr2+384+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe32);
_mm512_storeu_ps(sfPtr2+448+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm32);
_mm512_storeu_ps(sfPtr2+512+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe33);
_mm512_storeu_ps(sfPtr2+576+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm33);
_mm512_storeu_ps(sfPtr2+640+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe34);
_mm512_storeu_ps(sfPtr2+704+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm34);
_mm512_storeu_ps(sfPtr2+768+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe35);
_mm512_storeu_ps(sfPtr2+832+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm35);
_mm512_storeu_ps(sfPtr2+896+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe36);
_mm512_storeu_ps(sfPtr2+960+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm36);
_mm512_storeu_ps(sfPtr2+1024+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe37);
_mm512_storeu_ps(sfPtr2+1088+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm37);
_mm512_storeu_ps(sfPtr2+1152+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe38);
_mm512_storeu_ps(sfPtr2+1216+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm38);
_mm512_storeu_ps(sfPtr2+1280+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe39);
_mm512_storeu_ps(sfPtr2+1344+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm39);
_mm512_storeu_ps(sfPtr2+1408+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe40);
_mm512_storeu_ps(sfPtr2+1472+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm40);
}
if (k23 >= kk22) return;
}
ptrdiff_t l6 = 16*w20;
for (; l6 != 16; ++l6) {
__m512 sfRe41 = _mm512_setzero_ps();
__m512 sfIm41 = _mm512_setzero_ps();
__m512 sfRe42 = _mm512_setzero_ps();
__m512 sfIm42 = _mm512_setzero_ps();
(void)bfPtr3;
for (ptrdiff_t s7 = 0; s7 < 3; ++s7) {
__m512i wfLd11 = _mm512_loadu_si512(wfPtr3+0+24576*i8+6144*j4+384*l6+128*s7);
__m512 wfRe11 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd11));
__m512 wfIm11 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd11, 1));
__m512 wfMx7 = _mm512_mask_mov_ps(wfIm11, 64764, wfRe11);
__m512i wfLd12 = _mm512_loadu_si512(wfPtr3+64+24576*i8+6144*j4+384*l6+128*s7);
__m512 wfRe12 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd12));
__m512 wfIm12 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd12, 1));
__m512 wfMx8 = _mm512_mask_mov_ps(wfIm12, 64764, wfRe12);
__m512 dfRe12 = _mm512_loadu_ps(dfPtr3+0+407040*i8+101760*j4+1152*k23+128*s7);
__m512 dfIm12 = _mm512_loadu_ps(dfPtr3+64+407040*i8+101760*j4+1152*k23+128*s7);
sfRe41 = _mm512_fmadd_ps(wfRe11, dfRe12, sfRe41);
sfRe41 = _mm512_mask3_fmadd_ps(wfIm11, dfIm12, sfRe41, 64764);
sfIm41 = _mm512_fmadd_ps(wfMx7, dfIm12, sfIm41);
sfIm41 = _mm512_mask3_fnmadd_ps(wfIm11, dfRe12, sfIm41, 64764);
sfRe42 = _mm512_fmadd_ps(wfRe12, dfRe12, sfRe42);
sfRe42 = _mm512_mask3_fmadd_ps(wfIm12, dfIm12, sfRe42, 64764);
sfIm42 = _mm512_fmadd_ps(wfMx8, dfIm12, sfIm42);
sfIm42 = _mm512_mask3_fnmadd_ps(wfIm12, dfRe12, sfIm42, 64764);
}
sfRe41 = _mm512_add_ps(sfRe41, _mm512_loadu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k23+256*l6));
sfIm41 = _mm512_add_ps(sfIm41, _mm512_loadu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k23+256*l6));
sfRe42 = _mm512_add_ps(sfRe42, _mm512_loadu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k23+256*l6));
sfIm42 = _mm512_add_ps(sfIm42, _mm512_loadu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k23+256*l6));
_mm512_storeu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k23+256*l6, sfRe41);
_mm512_storeu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k23+256*l6, sfIm41);
_mm512_storeu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k23+256*l6, sfRe42);
_mm512_storeu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k23+256*l6, sfIm42);
}
j4 = 1;
}
for (; j4 <= jj10; ++j4) {
ptrdiff_t k24 = 6*d1;
ptrdiff_t kk23 = k24+(d1 < 13 ? 5 : 10);
for (; k24 != 88; ++k24) {
ptrdiff_t l7 = 16*w20;
for (; l7 != 16; ++l7) {
__m512 sfRe43 = _mm512_setzero_ps();
__m512 sfIm43 = _mm512_setzero_ps();
__m512 sfRe49 = _mm512_setzero_ps();
__m512 sfIm49 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe44 = sfRe43;
__m512 sfIm44 = sfIm43;
__m512 sfRe45 = sfRe43;
__m512 sfIm45 = sfIm43;
__m512 sfRe46 = sfRe43;
__m512 sfIm46 = sfIm43;
__m512 sfRe47 = sfRe43;
__m512 sfIm47 = sfIm43;
__m512 sfRe48 = sfRe43;
__m512 sfIm48 = sfIm43;
__m512 sfRe50 = sfRe49;
__m512 sfIm50 = sfIm49;
__m512 sfRe51 = sfRe49;
__m512 sfIm51 = sfIm49;
__m512 sfRe52 = sfRe49;
__m512 sfIm52 = sfIm49;
__m512 sfRe53 = sfRe49;
__m512 sfIm53 = sfIm49;
__m512 sfRe54 = sfRe49;
__m512 sfIm54 = sfIm49;
for (ptrdiff_t s8 = 0; s8 < 3; ++s8) {
__m512i wfLd13 = _mm512_loadu_si512(wfPtr3+0+24576*i8+6144*j4+384*l7+128*s8);
__m512 wfRe13 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd13));
__m512 wfIm13 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd13, 1));
__m512i wfLd14 = _mm512_loadu_si512(wfPtr3+64+24576*i8+6144*j4+384*l7+128*s8);
__m512 wfRe14 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd14));
__m512 wfIm14 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd14, 1));
__m512 dfRe13 = _mm512_loadu_ps(dfPtr3+0+407040*i8+101760*j4+1152*k24+384*s8);
__m512 dfIm13 = _mm512_loadu_ps(dfPtr3+64+407040*i8+101760*j4+1152*k24+384*s8);
sfRe43 = _mm512_fmadd_ps(wfRe13, dfRe13, sfRe43);
sfRe43 = _mm512_fmadd_ps(wfIm13, dfIm13, sfRe43);
sfIm43 = _mm512_fmadd_ps(wfRe13, dfIm13, sfIm43);
sfIm43 = _mm512_fnmadd_ps(wfIm13, dfRe13, sfIm43);
sfRe49 = _mm512_fmadd_ps(wfRe14, dfRe13, sfRe49);
sfRe49 = _mm512_fmadd_ps(wfIm14, dfIm13, sfRe49);
sfIm49 = _mm512_fmadd_ps(wfRe14, dfIm13, sfIm49);
sfIm49 = _mm512_fnmadd_ps(wfIm14, dfRe13, sfIm49);
dfRe13 = _mm512_shuffle_f32x4(dfRe13, dfRe13, 78);
dfIm13 = _mm512_shuffle_f32x4(dfIm13, dfIm13, 78);
sfRe44 = _mm512_fmadd_ps(wfRe13, dfRe13, sfRe44);
sfRe44 = _mm512_fmadd_ps(wfIm13, dfIm13, sfRe44);
sfIm44 = _mm512_fmadd_ps(wfRe13, dfIm13, sfIm44);
sfIm44 = _mm512_fnmadd_ps(wfIm13, dfRe13, sfIm44);
sfRe50 = _mm512_fmadd_ps(wfRe14, dfRe13, sfRe50);
sfRe50 = _mm512_fmadd_ps(wfIm14, dfIm13, sfRe50);
sfIm50 = _mm512_fmadd_ps(wfRe14, dfIm13, sfIm50);
sfIm50 = _mm512_fnmadd_ps(wfIm14, dfRe13, sfIm50);
__m512 dfRe14 = _mm512_loadu_ps(dfPtr3+128+407040*i8+101760*j4+1152*k24+384*s8);
__m512 dfIm14 = _mm512_loadu_ps(dfPtr3+192+407040*i8+101760*j4+1152*k24+384*s8);
sfRe45 = _mm512_fmadd_ps(wfRe13, dfRe14, sfRe45);
sfRe45 = _mm512_fmadd_ps(wfIm13, dfIm14, sfRe45);
sfIm45 = _mm512_fmadd_ps(wfRe13, dfIm14, sfIm45);
sfIm45 = _mm512_fnmadd_ps(wfIm13, dfRe14, sfIm45);
sfRe51 = _mm512_fmadd_ps(wfRe14, dfRe14, sfRe51);
sfRe51 = _mm512_fmadd_ps(wfIm14, dfIm14, sfRe51);
sfIm51 = _mm512_fmadd_ps(wfRe14, dfIm14, sfIm51);
sfIm51 = _mm512_fnmadd_ps(wfIm14, dfRe14, sfIm51);
dfRe14 = _mm512_shuffle_f32x4(dfRe14, dfRe14, 78);
dfIm14 = _mm512_shuffle_f32x4(dfIm14, dfIm14, 78);
sfRe46 = _mm512_fmadd_ps(wfRe13, dfRe14, sfRe46);
sfRe46 = _mm512_fmadd_ps(wfIm13, dfIm14, sfRe46);
sfIm46 = _mm512_fmadd_ps(wfRe13, dfIm14, sfIm46);
sfIm46 = _mm512_fnmadd_ps(wfIm13, dfRe14, sfIm46);
sfRe52 = _mm512_fmadd_ps(wfRe14, dfRe14, sfRe52);
sfRe52 = _mm512_fmadd_ps(wfIm14, dfIm14, sfRe52);
sfIm52 = _mm512_fmadd_ps(wfRe14, dfIm14, sfIm52);
sfIm52 = _mm512_fnmadd_ps(wfIm14, dfRe14, sfIm52);
__m512 dfRe15 = _mm512_loadu_ps(dfPtr3+256+407040*i8+101760*j4+1152*k24+384*s8);
__m512 dfIm15 = _mm512_loadu_ps(dfPtr3+320+407040*i8+101760*j4+1152*k24+384*s8);
sfRe47 = _mm512_fmadd_ps(wfRe13, dfRe15, sfRe47);
sfRe47 = _mm512_fmadd_ps(wfIm13, dfIm15, sfRe47);
sfIm47 = _mm512_fmadd_ps(wfRe13, dfIm15, sfIm47);
sfIm47 = _mm512_fnmadd_ps(wfIm13, dfRe15, sfIm47);
sfRe53 = _mm512_fmadd_ps(wfRe14, dfRe15, sfRe53);
sfRe53 = _mm512_fmadd_ps(wfIm14, dfIm15, sfRe53);
sfIm53 = _mm512_fmadd_ps(wfRe14, dfIm15, sfIm53);
sfIm53 = _mm512_fnmadd_ps(wfIm14, dfRe15, sfIm53);
dfRe15 = _mm512_shuffle_f32x4(dfRe15, dfRe15, 78);
dfIm15 = _mm512_shuffle_f32x4(dfIm15, dfIm15, 78);
sfRe48 = _mm512_fmadd_ps(wfRe13, dfRe15, sfRe48);
sfRe48 = _mm512_fmadd_ps(wfIm13, dfIm15, sfRe48);
sfIm48 = _mm512_fmadd_ps(wfRe13, dfIm15, sfIm48);
sfIm48 = _mm512_fnmadd_ps(wfIm13, dfRe15, sfIm48);
sfRe54 = _mm512_fmadd_ps(wfRe14, dfRe15, sfRe54);
sfRe54 = _mm512_fmadd_ps(wfIm14, dfIm15, sfRe54);
sfIm54 = _mm512_fmadd_ps(wfRe14, dfIm15, sfIm54);
sfIm54 = _mm512_fnmadd_ps(wfIm14, dfRe15, sfIm54);
}
sfRe43 = _mm512_add_ps(sfRe43, _mm512_loadu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm43 = _mm512_add_ps(sfIm43, _mm512_loadu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe44 = _mm512_add_ps(sfRe44, _mm512_loadu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm44 = _mm512_add_ps(sfIm44, _mm512_loadu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe45 = _mm512_add_ps(sfRe45, _mm512_loadu_ps(sfPtr2+256+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm45 = _mm512_add_ps(sfIm45, _mm512_loadu_ps(sfPtr2+320+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe46 = _mm512_add_ps(sfRe46, _mm512_loadu_ps(sfPtr2+384+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm46 = _mm512_add_ps(sfIm46, _mm512_loadu_ps(sfPtr2+448+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe47 = _mm512_add_ps(sfRe47, _mm512_loadu_ps(sfPtr2+512+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm47 = _mm512_add_ps(sfIm47, _mm512_loadu_ps(sfPtr2+576+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe48 = _mm512_add_ps(sfRe48, _mm512_loadu_ps(sfPtr2+640+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm48 = _mm512_add_ps(sfIm48, _mm512_loadu_ps(sfPtr2+704+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe49 = _mm512_add_ps(sfRe49, _mm512_loadu_ps(sfPtr2+768+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm49 = _mm512_add_ps(sfIm49, _mm512_loadu_ps(sfPtr2+832+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe50 = _mm512_add_ps(sfRe50, _mm512_loadu_ps(sfPtr2+896+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm50 = _mm512_add_ps(sfIm50, _mm512_loadu_ps(sfPtr2+960+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe51 = _mm512_add_ps(sfRe51, _mm512_loadu_ps(sfPtr2+1024+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm51 = _mm512_add_ps(sfIm51, _mm512_loadu_ps(sfPtr2+1088+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe52 = _mm512_add_ps(sfRe52, _mm512_loadu_ps(sfPtr2+1152+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm52 = _mm512_add_ps(sfIm52, _mm512_loadu_ps(sfPtr2+1216+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe53 = _mm512_add_ps(sfRe53, _mm512_loadu_ps(sfPtr2+1280+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm53 = _mm512_add_ps(sfIm53, _mm512_loadu_ps(sfPtr2+1344+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe54 = _mm512_add_ps(sfRe54, _mm512_loadu_ps(sfPtr2+1408+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm54 = _mm512_add_ps(sfIm54, _mm512_loadu_ps(sfPtr2+1472+8667136*i8+2166784*j4+24576*k24+1536*l7));
_mm512_storeu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe43);
_mm512_storeu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm43);
_mm512_storeu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe44);
_mm512_storeu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm44);
_mm512_storeu_ps(sfPtr2+256+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe45);
_mm512_storeu_ps(sfPtr2+320+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm45);
_mm512_storeu_ps(sfPtr2+384+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe46);
_mm512_storeu_ps(sfPtr2+448+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm46);
_mm512_storeu_ps(sfPtr2+512+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe47);
_mm512_storeu_ps(sfPtr2+576+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm47);
_mm512_storeu_ps(sfPtr2+640+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe48);
_mm512_storeu_ps(sfPtr2+704+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm48);
_mm512_storeu_ps(sfPtr2+768+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe49);
_mm512_storeu_ps(sfPtr2+832+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm49);
_mm512_storeu_ps(sfPtr2+896+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe50);
_mm512_storeu_ps(sfPtr2+960+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm50);
_mm512_storeu_ps(sfPtr2+1024+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe51);
_mm512_storeu_ps(sfPtr2+1088+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm51);
_mm512_storeu_ps(sfPtr2+1152+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe52);
_mm512_storeu_ps(sfPtr2+1216+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm52);
_mm512_storeu_ps(sfPtr2+1280+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe53);
_mm512_storeu_ps(sfPtr2+1344+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm53);
_mm512_storeu_ps(sfPtr2+1408+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe54);
_mm512_storeu_ps(sfPtr2+1472+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm54);
}
if (k24 >= kk23) return;
}
ptrdiff_t l8 = 16*w20;
for (; l8 != 16; ++l8) {
__m512 sfRe55 = _mm512_setzero_ps();
__m512 sfIm55 = _mm512_setzero_ps();
__m512 sfRe56 = _mm512_setzero_ps();
__m512 sfIm56 = _mm512_setzero_ps();
(void)bfPtr3;
for (ptrdiff_t s9 = 0; s9 < 3; ++s9) {
__m512i wfLd15 = _mm512_loadu_si512(wfPtr3+0+24576*i8+6144*j4+384*l8+128*s9);
__m512 wfRe15 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd15));
__m512 wfIm15 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd15, 1));
__m512i wfLd16 = _mm512_loadu_si512(wfPtr3+64+24576*i8+6144*j4+384*l8+128*s9);
__m512 wfRe16 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd16));
__m512 wfIm16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd16, 1));
__m512 dfRe16 = _mm512_loadu_ps(dfPtr3+0+407040*i8+101760*j4+1152*k24+128*s9);
__m512 dfIm16 = _mm512_loadu_ps(dfPtr3+64+407040*i8+101760*j4+1152*k24+128*s9);
sfRe55 = _mm512_fmadd_ps(wfRe15, dfRe16, sfRe55);
sfRe55 = _mm512_fmadd_ps(wfIm15, dfIm16, sfRe55);
sfIm55 = _mm512_fmadd_ps(wfRe15, dfIm16, sfIm55);
sfIm55 = _mm512_fnmadd_ps(wfIm15, dfRe16, sfIm55);
sfRe56 = _mm512_fmadd_ps(wfRe16, dfRe16, sfRe56);
sfRe56 = _mm512_fmadd_ps(wfIm16, dfIm16, sfRe56);
sfIm56 = _mm512_fmadd_ps(wfRe16, dfIm16, sfIm56);
sfIm56 = _mm512_fnmadd_ps(wfIm16, dfRe16, sfIm56);
}
sfRe55 = _mm512_add_ps(sfRe55, _mm512_loadu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k24+256*l8));
sfIm55 = _mm512_add_ps(sfIm55, _mm512_loadu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k24+256*l8));
sfRe56 = _mm512_add_ps(sfRe56, _mm512_loadu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k24+256*l8));
sfIm56 = _mm512_add_ps(sfIm56, _mm512_loadu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k24+256*l8));
_mm512_storeu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k24+256*l8, sfRe55);
_mm512_storeu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k24+256*l8, sfIm55);
_mm512_storeu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k24+256*l8, sfRe56);
_mm512_storeu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k24+256*l8, sfIm56);
}
}
}

/*
 * Submits the ResNet50StriderProduceSums1Callee1 work to a thread team.
 *
 * team16   - thread team passed unchanged to ResNet50ThreaderDo1.
 * tensors5 - tensor pointer array, forwarded to the callee as slot 0 of
 *            the argument tuple.
 *
 * One task is submitted per (outer, inner) index pair: outer takes the
 * single value 0 and inner takes 0..3, so ResNet50ThreaderDo1 is called
 * four times in total. Each task is described as 4-dimensional (nd1 = 4)
 * with hull1 = {1, 14, 4, 1}.
 *
 * NOTE(review): the argument tuple and the task descriptor live on this
 * function's stack and the tuple is overwritten between submissions, so
 * this presumably relies on ResNet50ThreaderDo1 not returning until the
 * callee is done with them -- confirm against ResNet50ThreaderDo1.
 */
static void ResNet50StriderProduceSums1(ResNet50ThreaderTeam1* team16, char** tensors5) {
	/* Argument tuple reached by the callee through the task's any1 field:
	   [0] tensor pointer array, [1] outer index, [2] inner index.
	   The two indices are carried as integer values cast to void*. */
	void* args[3];
	args[0] = tensors5;

	ptrdiff_t outer = 0;
	while (outer < 1) {
		args[1] = (void*)outer;

		ptrdiff_t inner = 0;
		while (inner < 4) {
			args[2] = (void*)inner;

			/* A fresh descriptor is filled in for every submission. */
			ResNet50ThreaderTask1 job;
			job.callee1 = ResNet50StriderProduceSums1Callee1;
			job.any1 = args;
			job.nd1 = 4;
			job.hull1[0] = 1;
			job.hull1[1] = 14;
			job.hull1[2] = 4;
			job.hull1[3] = 1;
			ResNet50ThreaderDo1(team16, &job);

			++inner;
		}

		++outer;
	}
}

static void ResNet50StriderConsumeSums1Callee1(ResNet50ThreaderTask1* task10, int64_t* pt10) {
char** tensors8 = task10->any1;
ptrdiff_t w21 = 0;
ptrdiff_t d2 = pt10[1];
ptrdiff_t g5 = 0;
char*restrict sfPtr3 = tensors8[0];
char*restrict datPtr2 = tensors8[1];
ptrdiff_t i9 = 1*g5;
ptrdiff_t j5 = 2*d2;
ptrdiff_t last2 = j5+(d2 < 43 ? 1 : 2);
if (j5 < 4) {
ptrdiff_t rel4 = j5-0;
ptrdiff_t base4 = 0;
if (rel4 < 1) {
ptrdiff_t toH1 = base4+0;
ptrdiff_t toW1 = 0;
ptrdiff_t k25 = 16*w21;
for (; k25 != 16; ++k25) {
ptrdiff_t r2 = 0;
for (; r2 != 2; ++r2) {
ptrdiff_t t2 = 0;
__m512 sfRe57 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm57 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe61 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm61 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe58 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm58 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe62 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm62 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe59 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm59 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe63 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm63 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe60 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm60 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe64 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm64 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512i ifft1 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2 = _mm512_permutexvar_ps(ifft1, sfRe57);
__m512 ifft93 = _mm512_permutexvar_ps(ifft1, sfRe61);
__m512i ifft3 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4 = _mm512_permutexvar_ps(ifft3, sfRe57);
__m512 ifft94 = _mm512_permutexvar_ps(ifft3, sfRe61);
__m512 ifft5 = _mm512_permutexvar_ps(ifft1, sfIm57);
__m512 ifft95 = _mm512_permutexvar_ps(ifft1, sfIm61);
__m512 ifft6 = _mm512_permutexvar_ps(ifft3, sfIm57);
__m512 ifft96 = _mm512_permutexvar_ps(ifft3, sfIm61);
__m512 ifft7 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft8 = _mm512_mask_fmadd_ps(ifft6, 65021, ifft7, ifft2);
__m512 ifft97 = _mm512_mask_fmadd_ps(ifft96, 65021, ifft7, ifft93);
__m512 ifft9 = _mm512_mask_fnmadd_ps(ifft5, 65021, ifft7, ifft4);
__m512 ifft98 = _mm512_mask_fnmadd_ps(ifft95, 65021, ifft7, ifft94);
__m512 ifft10 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft11 = _mm512_fmadd_ps(ifft8, ifft10, _mm512_shuffle_ps(ifft8, ifft8, 177));
__m512 ifft99 = _mm512_fmadd_ps(ifft97, ifft10, _mm512_shuffle_ps(ifft97, ifft97, 177));
__m512 ifft12 = _mm512_fmadd_ps(ifft9, ifft10, _mm512_shuffle_ps(ifft9, ifft9, 177));
__m512 ifft100 = _mm512_fmadd_ps(ifft98, ifft10, _mm512_shuffle_ps(ifft98, ifft98, 177));
__m512 ifft13 = _mm512_fmadd_ps(sfRe58, ifft10, _mm512_shuffle_ps(sfRe58, sfRe58, 177));
__m512 ifft101 = _mm512_fmadd_ps(sfRe62, ifft10, _mm512_shuffle_ps(sfRe62, sfRe62, 177));
__m512 ifft14 = _mm512_fmadd_ps(sfIm58, ifft10, _mm512_shuffle_ps(sfIm58, sfIm58, 177));
__m512 ifft102 = _mm512_fmadd_ps(sfIm62, ifft10, _mm512_shuffle_ps(sfIm62, sfIm62, 177));
__m512 ifft15 = _mm512_fmadd_ps(sfRe59, ifft10, _mm512_shuffle_ps(sfRe59, sfRe59, 177));
__m512 ifft103 = _mm512_fmadd_ps(sfRe63, ifft10, _mm512_shuffle_ps(sfRe63, sfRe63, 177));
__m512 ifft16 = _mm512_fmadd_ps(sfIm59, ifft10, _mm512_shuffle_ps(sfIm59, sfIm59, 177));
__m512 ifft104 = _mm512_fmadd_ps(sfIm63, ifft10, _mm512_shuffle_ps(sfIm63, sfIm63, 177));
__m512 ifft17 = _mm512_fmadd_ps(sfRe60, ifft10, _mm512_shuffle_ps(sfRe60, sfRe60, 177));
__m512 ifft105 = _mm512_fmadd_ps(sfRe64, ifft10, _mm512_shuffle_ps(sfRe64, sfRe64, 177));
__m512 ifft18 = _mm512_fmadd_ps(sfIm60, ifft10, _mm512_shuffle_ps(sfIm60, sfIm60, 177));
__m512 ifft106 = _mm512_fmadd_ps(sfIm64, ifft10, _mm512_shuffle_ps(sfIm64, sfIm64, 177));
__m512 ifft19 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft20 = _mm512_mul_ps(ifft11, ifft19);
__m512 ifft107 = _mm512_mul_ps(ifft99, ifft19);
__m512 ifft21 = _mm512_mul_ps(ifft12, ifft19);
__m512 ifft108 = _mm512_mul_ps(ifft100, ifft19);
__m512 ifft22 = _mm512_mul_ps(ifft13, ifft19);
__m512 ifft109 = _mm512_mul_ps(ifft101, ifft19);
__m512 ifft23 = _mm512_mul_ps(ifft14, ifft19);
__m512 ifft110 = _mm512_mul_ps(ifft102, ifft19);
__m512 ifft24 = _mm512_mul_ps(ifft15, ifft19);
__m512 ifft111 = _mm512_mul_ps(ifft103, ifft19);
__m512 ifft25 = _mm512_mul_ps(ifft16, ifft19);
__m512 ifft112 = _mm512_mul_ps(ifft104, ifft19);
__m512 ifft26 = _mm512_mul_ps(ifft17, ifft19);
__m512 ifft113 = _mm512_mul_ps(ifft105, ifft19);
__m512 ifft27 = _mm512_mul_ps(ifft18, ifft19);
__m512 ifft114 = _mm512_mul_ps(ifft106, ifft19);
__m512 ifft28 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft29 = _mm512_fnmadd_ps(ifft12, ifft28, ifft20);
__m512 ifft115 = _mm512_fnmadd_ps(ifft100, ifft28, ifft107);
__m512 ifft30 = _mm512_fmadd_ps(ifft11, ifft28, ifft21);
__m512 ifft116 = _mm512_fmadd_ps(ifft99, ifft28, ifft108);
__m512 ifft31 = _mm512_fnmadd_ps(ifft14, ifft28, ifft22);
__m512 ifft117 = _mm512_fnmadd_ps(ifft102, ifft28, ifft109);
__m512 ifft32 = _mm512_fmadd_ps(ifft13, ifft28, ifft23);
__m512 ifft118 = _mm512_fmadd_ps(ifft101, ifft28, ifft110);
__m512 ifft33 = _mm512_fnmadd_ps(ifft16, ifft28, ifft24);
__m512 ifft119 = _mm512_fnmadd_ps(ifft104, ifft28, ifft111);
__m512 ifft34 = _mm512_fmadd_ps(ifft15, ifft28, ifft25);
__m512 ifft120 = _mm512_fmadd_ps(ifft103, ifft28, ifft112);
__m512 ifft35 = _mm512_fnmadd_ps(ifft18, ifft28, ifft26);
__m512 ifft121 = _mm512_fnmadd_ps(ifft106, ifft28, ifft113);
__m512 ifft36 = _mm512_fmadd_ps(ifft17, ifft28, ifft27);
__m512 ifft122 = _mm512_fmadd_ps(ifft105, ifft28, ifft114);
__m512 ifft37 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft38 = _mm512_fmadd_ps(ifft29, ifft37, _mm512_shuffle_ps(ifft29, ifft29, 78));
__m512 ifft123 = _mm512_fmadd_ps(ifft115, ifft37, _mm512_shuffle_ps(ifft115, ifft115, 78));
__m512 ifft39 = _mm512_fmadd_ps(ifft30, ifft37, _mm512_shuffle_ps(ifft30, ifft30, 78));
__m512 ifft124 = _mm512_fmadd_ps(ifft116, ifft37, _mm512_shuffle_ps(ifft116, ifft116, 78));
__m512 ifft40 = _mm512_fmadd_ps(ifft31, ifft37, _mm512_shuffle_ps(ifft31, ifft31, 78));
__m512 ifft125 = _mm512_fmadd_ps(ifft117, ifft37, _mm512_shuffle_ps(ifft117, ifft117, 78));
__m512 ifft41 = _mm512_fmadd_ps(ifft32, ifft37, _mm512_shuffle_ps(ifft32, ifft32, 78));
__m512 ifft126 = _mm512_fmadd_ps(ifft118, ifft37, _mm512_shuffle_ps(ifft118, ifft118, 78));
__m512 ifft42 = _mm512_fmadd_ps(ifft33, ifft37, _mm512_shuffle_ps(ifft33, ifft33, 78));
__m512 ifft127 = _mm512_fmadd_ps(ifft119, ifft37, _mm512_shuffle_ps(ifft119, ifft119, 78));
__m512 ifft43 = _mm512_fmadd_ps(ifft34, ifft37, _mm512_shuffle_ps(ifft34, ifft34, 78));
__m512 ifft128 = _mm512_fmadd_ps(ifft120, ifft37, _mm512_shuffle_ps(ifft120, ifft120, 78));
__m512 ifft44 = _mm512_fmadd_ps(ifft35, ifft37, _mm512_shuffle_ps(ifft35, ifft35, 78));
__m512 ifft129 = _mm512_fmadd_ps(ifft121, ifft37, _mm512_shuffle_ps(ifft121, ifft121, 78));
__m512 ifft45 = _mm512_fmadd_ps(ifft36, ifft37, _mm512_shuffle_ps(ifft36, ifft36, 78));
__m512 ifft130 = _mm512_fmadd_ps(ifft122, ifft37, _mm512_shuffle_ps(ifft122, ifft122, 78));
__m512 ifft46 = _mm512_mask_sub_ps(ifft38, 49344, _mm512_setzero_ps(), ifft39);
__m512 ifft131 = _mm512_mask_sub_ps(ifft123, 49344, _mm512_setzero_ps(), ifft124);
__m512 ifft47 = _mm512_mask_mov_ps(ifft39, 49344, ifft38);
__m512 ifft132 = _mm512_mask_mov_ps(ifft124, 49344, ifft123);
__m512 ifft48 = _mm512_mask_sub_ps(ifft40, 49344, _mm512_setzero_ps(), ifft41);
__m512 ifft133 = _mm512_mask_sub_ps(ifft125, 49344, _mm512_setzero_ps(), ifft126);
__m512 ifft49 = _mm512_mask_mov_ps(ifft41, 49344, ifft40);
__m512 ifft134 = _mm512_mask_mov_ps(ifft126, 49344, ifft125);
__m512 ifft50 = _mm512_mask_sub_ps(ifft42, 49344, _mm512_setzero_ps(), ifft43);
__m512 ifft135 = _mm512_mask_sub_ps(ifft127, 49344, _mm512_setzero_ps(), ifft128);
__m512 ifft51 = _mm512_mask_mov_ps(ifft43, 49344, ifft42);
__m512 ifft136 = _mm512_mask_mov_ps(ifft128, 49344, ifft127);
__m512 ifft52 = _mm512_mask_sub_ps(ifft44, 49344, _mm512_setzero_ps(), ifft45);
__m512 ifft137 = _mm512_mask_sub_ps(ifft129, 49344, _mm512_setzero_ps(), ifft130);
__m512 ifft53 = _mm512_mask_mov_ps(ifft45, 49344, ifft44);
__m512 ifft138 = _mm512_mask_mov_ps(ifft130, 49344, ifft129);
__m512 ifft54 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft55 = _mm512_fmadd_ps(ifft46, ifft54, _mm512_shuffle_f32x4(ifft46, ifft46, 177));
__m512 ifft139 = _mm512_fmadd_ps(ifft131, ifft54, _mm512_shuffle_f32x4(ifft131, ifft131, 177));
__m512 ifft56 = _mm512_fmadd_ps(ifft47, ifft54, _mm512_shuffle_f32x4(ifft47, ifft47, 177));
__m512 ifft140 = _mm512_fmadd_ps(ifft132, ifft54, _mm512_shuffle_f32x4(ifft132, ifft132, 177));
__m512 ifft57 = _mm512_fmadd_ps(ifft48, ifft54, _mm512_shuffle_f32x4(ifft48, ifft48, 177));
__m512 ifft141 = _mm512_fmadd_ps(ifft133, ifft54, _mm512_shuffle_f32x4(ifft133, ifft133, 177));
__m512 ifft58 = _mm512_fmadd_ps(ifft49, ifft54, _mm512_shuffle_f32x4(ifft49, ifft49, 177));
__m512 ifft142 = _mm512_fmadd_ps(ifft134, ifft54, _mm512_shuffle_f32x4(ifft134, ifft134, 177));
__m512 ifft59 = _mm512_fmadd_ps(ifft50, ifft54, _mm512_shuffle_f32x4(ifft50, ifft50, 177));
__m512 ifft143 = _mm512_fmadd_ps(ifft135, ifft54, _mm512_shuffle_f32x4(ifft135, ifft135, 177));
__m512 ifft60 = _mm512_fnmsub_ps(ifft51, ifft54, _mm512_shuffle_f32x4(ifft51, ifft51, 177));
__m512 ifft144 = _mm512_fnmsub_ps(ifft136, ifft54, _mm512_shuffle_f32x4(ifft136, ifft136, 177));
__m512 ifft61 = _mm512_fmadd_ps(ifft52, ifft54, _mm512_shuffle_f32x4(ifft52, ifft52, 177));
__m512 ifft145 = _mm512_fmadd_ps(ifft137, ifft54, _mm512_shuffle_f32x4(ifft137, ifft137, 177));
__m512 ifft62 = _mm512_fmadd_ps(ifft53, ifft54, _mm512_shuffle_f32x4(ifft53, ifft53, 177));
__m512 ifft146 = _mm512_fmadd_ps(ifft138, ifft54, _mm512_shuffle_f32x4(ifft138, ifft138, 177));
__m512 ifft63 = _mm512_add_ps(ifft55, ifft56);
__m512 ifft147 = _mm512_add_ps(ifft139, ifft140);
__m512 ifft64 = _mm512_sub_ps(ifft55, ifft56);
__m512 ifft148 = _mm512_sub_ps(ifft139, ifft140);
__m512 ifft65 = _mm512_sub_ps(ifft57, ifft61);
__m512 ifft149 = _mm512_sub_ps(ifft141, ifft145);
__m512 ifft66 = _mm512_add_ps(ifft58, ifft62);
__m512 ifft150 = _mm512_add_ps(ifft142, ifft146);
__m512 ifft67 = _mm512_add_ps(ifft57, ifft61);
__m512 ifft151 = _mm512_add_ps(ifft141, ifft145);
__m512 ifft68 = _mm512_sub_ps(ifft58, ifft62);
__m512 ifft152 = _mm512_sub_ps(ifft142, ifft146);
__m512 ifft69 = _mm512_mul_ps(ifft59, _mm512_set1_ps(3.125e-02f));
__m512 ifft153 = _mm512_mul_ps(ifft143, _mm512_set1_ps(3.125e-02f));
__m512 ifft70 = _mm512_mul_ps(ifft60, _mm512_set1_ps(3.125e-02f));
__m512 ifft154 = _mm512_mul_ps(ifft144, _mm512_set1_ps(3.125e-02f));
__m512 ifft71 = _mm512_fmadd_ps(ifft63, _mm512_set1_ps(1.5625e-02f), ifft69);
__m512 ifft155 = _mm512_fmadd_ps(ifft147, _mm512_set1_ps(1.5625e-02f), ifft153);
__m512 ifft72 = _mm512_fmsub_ps(ifft63, _mm512_set1_ps(1.5625e-02f), ifft69);
__m512 ifft156 = _mm512_fmsub_ps(ifft147, _mm512_set1_ps(1.5625e-02f), ifft153);
__m512 ifft73 = _mm512_fmadd_ps(ifft64, _mm512_set1_ps(1.5625e-02f), ifft70);
__m512 ifft157 = _mm512_fmadd_ps(ifft148, _mm512_set1_ps(1.5625e-02f), ifft154);
__m512 ifft74 = _mm512_fmsub_ps(ifft64, _mm512_set1_ps(1.5625e-02f), ifft70);
__m512 ifft158 = _mm512_fmsub_ps(ifft148, _mm512_set1_ps(1.5625e-02f), ifft154);
__m512 ifft75 = _mm512_add_ps(ifft65, ifft66);
__m512 ifft159 = _mm512_add_ps(ifft149, ifft150);
__m512 ifft76 = _mm512_sub_ps(ifft65, ifft66);
__m512 ifft160 = _mm512_sub_ps(ifft149, ifft150);
__m512 ifft77 = _mm512_fnmadd_ps(ifft75, _mm512_set1_ps(7.0710677e-01f), ifft67);
__m512 ifft161 = _mm512_fnmadd_ps(ifft159, _mm512_set1_ps(7.0710677e-01f), ifft151);
__m512 ifft78 = _mm512_fmadd_ps(ifft75, _mm512_set1_ps(7.0710677e-01f), ifft67);
__m512 ifft162 = _mm512_fmadd_ps(ifft159, _mm512_set1_ps(7.0710677e-01f), ifft151);
__m512 ifft79 = _mm512_fmadd_ps(ifft76, _mm512_set1_ps(7.0710677e-01f), ifft68);
__m512 ifft163 = _mm512_fmadd_ps(ifft160, _mm512_set1_ps(7.0710677e-01f), ifft152);
__m512 ifft80 = _mm512_fmsub_ps(ifft76, _mm512_set1_ps(7.0710677e-01f), ifft68);
__m512 ifft164 = _mm512_fmsub_ps(ifft160, _mm512_set1_ps(7.0710677e-01f), ifft152);
__m512 ifft81 = _mm512_add_ps(ifft77, ifft78);
__m512 ifft165 = _mm512_add_ps(ifft161, ifft162);
__m512 ifft82 = _mm512_sub_ps(ifft77, ifft78);
__m512 ifft166 = _mm512_sub_ps(ifft161, ifft162);
__m512 ifft83 = _mm512_add_ps(ifft79, ifft80);
__m512 ifft167 = _mm512_add_ps(ifft163, ifft164);
__m512 ifft84 = _mm512_sub_ps(ifft79, ifft80);
__m512 ifft168 = _mm512_sub_ps(ifft163, ifft164);
__m512 ifft85 = _mm512_fmadd_ps(ifft81, _mm512_set1_ps(1.5625e-02f), ifft71);
__m512 ifft169 = _mm512_fmadd_ps(ifft165, _mm512_set1_ps(1.5625e-02f), ifft155);
__m512 ifft86 = _mm512_fnmadd_ps(ifft81, _mm512_set1_ps(1.5625e-02f), ifft71);
__m512 ifft170 = _mm512_fnmadd_ps(ifft165, _mm512_set1_ps(1.5625e-02f), ifft155);
__m512 ifft87 = _mm512_fmadd_ps(ifft83, _mm512_set1_ps(1.5625e-02f), ifft73);
__m512 ifft171 = _mm512_fmadd_ps(ifft167, _mm512_set1_ps(1.5625e-02f), ifft157);
__m512 ifft88 = _mm512_fnmadd_ps(ifft83, _mm512_set1_ps(1.5625e-02f), ifft73);
__m512 ifft172 = _mm512_fnmadd_ps(ifft167, _mm512_set1_ps(1.5625e-02f), ifft157);
__m512 ifft89 = _mm512_fnmadd_ps(ifft84, _mm512_set1_ps(1.5625e-02f), ifft72);
__m512 ifft173 = _mm512_fnmadd_ps(ifft168, _mm512_set1_ps(1.5625e-02f), ifft156);
__m512 ifft90 = _mm512_fmadd_ps(ifft84, _mm512_set1_ps(1.5625e-02f), ifft72);
__m512 ifft174 = _mm512_fmadd_ps(ifft168, _mm512_set1_ps(1.5625e-02f), ifft156);
__m512 ifft91 = _mm512_fmadd_ps(ifft82, _mm512_set1_ps(1.5625e-02f), ifft74);
__m512 ifft175 = _mm512_fmadd_ps(ifft166, _mm512_set1_ps(1.5625e-02f), ifft158);
__m512 ifft92 = _mm512_fnmadd_ps(ifft82, _mm512_set1_ps(1.5625e-02f), ifft74);
__m512 ifft176 = _mm512_fnmadd_ps(ifft166, _mm512_set1_ps(1.5625e-02f), ifft158);
__m512 dat590 = ifft85;
__m512 dat595 = ifft169;
__m512 dat591 = ifft87;
__m512 dat596 = ifft171;
__m512 dat592 = ifft89;
__m512 dat597 = ifft173;
__m512 dat593 = ifft91;
__m512 dat598 = ifft175;
__m512 dat594 = ifft86;
__m512 dat599 = ifft170;
(void)ifft88;
(void)ifft172;
(void)ifft90;
(void)ifft174;
(void)ifft92;
(void)ifft176;
__m512i pm1 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack1 = _mm512_permutex2var_ps(dat590, pm1, dat595);
__m512i pm2 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack2 = _mm512_permutex2var_ps(dat590, pm2, dat595);
__m512 pack3 = _mm512_permutex2var_ps(dat591, pm1, dat596);
__m512 pack4 = _mm512_permutex2var_ps(dat591, pm2, dat596);
__m512 pack5 = _mm512_permutex2var_ps(dat592, pm1, dat597);
__m512 pack6 = _mm512_permutex2var_ps(dat592, pm2, dat597);
__m512 pack7 = _mm512_permutex2var_ps(dat593, pm1, dat598);
__m512 pack8 = _mm512_permutex2var_ps(dat593, pm2, dat598);
__m512 pack9 = _mm512_permutex2var_ps(dat594, pm1, dat599);
__m512 pack10 = _mm512_permutex2var_ps(dat594, pm2, dat599);
pack1 = _mm512_max_ps(_mm512_setzero_ps(), pack1);
pack2 = _mm512_max_ps(_mm512_setzero_ps(), pack2);
pack3 = _mm512_max_ps(_mm512_setzero_ps(), pack3);
pack4 = _mm512_max_ps(_mm512_setzero_ps(), pack4);
pack5 = _mm512_max_ps(_mm512_setzero_ps(), pack5);
pack6 = _mm512_max_ps(_mm512_setzero_ps(), pack6);
pack7 = _mm512_max_ps(_mm512_setzero_ps(), pack7);
pack8 = _mm512_max_ps(_mm512_setzero_ps(), pack8);
pack9 = _mm512_max_ps(_mm512_setzero_ps(), pack9);
pack10 = _mm512_max_ps(_mm512_setzero_ps(), pack10);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack1);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack2);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack3);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack4);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack5);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack6);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack7);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack8);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack9);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack10);
// For t3 = 0 and t3 = 1: load sixteen vectors from sfPtr3, run them through the
// ifft* arithmetic below, keep ten of the sixteen results, interleave them in
// pairs, clamp at zero and write ten lanes per store to datPtr2.
// i9, j5, k25, r2, toH1, toW1, sfPtr3 and datPtr2 are defined outside this loop.
// NOTE(review): the sfRe/sfIm/ifft names suggest an inverse FFT applied to
// real/imaginary frequency-domain sums; that reading comes from the names only.
ptrdiff_t t3 = 0;
for (; t3 < 2; ++t3) {
// Four groups of loads whose base offsets are 2166784 apart (256, 2167040,
// 4333824, 6500608). Each group reads Re, Im, Re, Im at steps of 64, and t3
// advances every address by 256.
// NOTE(review): a step of 64 between consecutive 16-float loads fits byte
// offsets on a char pointer; the pointer type is declared elsewhere - confirm.
__m512 sfRe65 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm65 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe69 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm69 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe66 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm66 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe70 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm70 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe67 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm67 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe71 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm71 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe68 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm68 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe72 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm72 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
// Only the first group (sfRe65/sfIm65 and sfRe69/sfIm69) gets this extra step:
// two lane permutations of Re and of Im, combined with a masked multiply-add.
__m512i ifft177 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft178 = _mm512_permutexvar_ps(ifft177, sfRe65);
__m512 ifft269 = _mm512_permutexvar_ps(ifft177, sfRe69);
__m512i ifft179 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft180 = _mm512_permutexvar_ps(ifft179, sfRe65);
__m512 ifft270 = _mm512_permutexvar_ps(ifft179, sfRe69);
__m512 ifft181 = _mm512_permutexvar_ps(ifft177, sfIm65);
__m512 ifft271 = _mm512_permutexvar_ps(ifft177, sfIm69);
__m512 ifft182 = _mm512_permutexvar_ps(ifft179, sfIm65);
__m512 ifft272 = _mm512_permutexvar_ps(ifft179, sfIm69);
__m512 ifft183 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
// Mask 65021 is 0xFDFD: every lane except 1 and 9. Unmasked lanes keep the
// first operand unchanged.
__m512 ifft184 = _mm512_mask_fmadd_ps(ifft182, 65021, ifft183, ifft178);
__m512 ifft273 = _mm512_mask_fmadd_ps(ifft272, 65021, ifft183, ifft269);
__m512 ifft185 = _mm512_mask_fnmadd_ps(ifft181, 65021, ifft183, ifft180);
__m512 ifft274 = _mm512_mask_fnmadd_ps(ifft271, 65021, ifft183, ifft270);
// Adjacent-lane sum/difference: shuffle immediate 177 (0xB1) swaps the two
// floats of each pair, and the +1/-1 pattern gives x0+x1 in even lanes and
// x0-x1 in odd lanes. Applied to all sixteen inputs.
__m512 ifft186 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft187 = _mm512_fmadd_ps(ifft184, ifft186, _mm512_shuffle_ps(ifft184, ifft184, 177));
__m512 ifft275 = _mm512_fmadd_ps(ifft273, ifft186, _mm512_shuffle_ps(ifft273, ifft273, 177));
__m512 ifft188 = _mm512_fmadd_ps(ifft185, ifft186, _mm512_shuffle_ps(ifft185, ifft185, 177));
__m512 ifft276 = _mm512_fmadd_ps(ifft274, ifft186, _mm512_shuffle_ps(ifft274, ifft274, 177));
__m512 ifft189 = _mm512_fmadd_ps(sfRe66, ifft186, _mm512_shuffle_ps(sfRe66, sfRe66, 177));
__m512 ifft277 = _mm512_fmadd_ps(sfRe70, ifft186, _mm512_shuffle_ps(sfRe70, sfRe70, 177));
__m512 ifft190 = _mm512_fmadd_ps(sfIm66, ifft186, _mm512_shuffle_ps(sfIm66, sfIm66, 177));
__m512 ifft278 = _mm512_fmadd_ps(sfIm70, ifft186, _mm512_shuffle_ps(sfIm70, sfIm70, 177));
__m512 ifft191 = _mm512_fmadd_ps(sfRe67, ifft186, _mm512_shuffle_ps(sfRe67, sfRe67, 177));
__m512 ifft279 = _mm512_fmadd_ps(sfRe71, ifft186, _mm512_shuffle_ps(sfRe71, sfRe71, 177));
__m512 ifft192 = _mm512_fmadd_ps(sfIm67, ifft186, _mm512_shuffle_ps(sfIm67, sfIm67, 177));
__m512 ifft280 = _mm512_fmadd_ps(sfIm71, ifft186, _mm512_shuffle_ps(sfIm71, sfIm71, 177));
__m512 ifft193 = _mm512_fmadd_ps(sfRe68, ifft186, _mm512_shuffle_ps(sfRe68, sfRe68, 177));
__m512 ifft281 = _mm512_fmadd_ps(sfRe72, ifft186, _mm512_shuffle_ps(sfRe72, sfRe72, 177));
__m512 ifft194 = _mm512_fmadd_ps(sfIm68, ifft186, _mm512_shuffle_ps(sfIm68, sfIm68, 177));
__m512 ifft282 = _mm512_fmadd_ps(sfIm72, ifft186, _mm512_shuffle_ps(sfIm72, sfIm72, 177));
// Per-lane coefficient pair c = ifft195, s = ifft204. For each (a, b) pair the
// two steps below compute a*c - b*s and b*c + a*s. First step: the products
// with c.
__m512 ifft195 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft196 = _mm512_mul_ps(ifft187, ifft195);
__m512 ifft283 = _mm512_mul_ps(ifft275, ifft195);
__m512 ifft197 = _mm512_mul_ps(ifft188, ifft195);
__m512 ifft284 = _mm512_mul_ps(ifft276, ifft195);
__m512 ifft198 = _mm512_mul_ps(ifft189, ifft195);
__m512 ifft285 = _mm512_mul_ps(ifft277, ifft195);
__m512 ifft199 = _mm512_mul_ps(ifft190, ifft195);
__m512 ifft286 = _mm512_mul_ps(ifft278, ifft195);
__m512 ifft200 = _mm512_mul_ps(ifft191, ifft195);
__m512 ifft287 = _mm512_mul_ps(ifft279, ifft195);
__m512 ifft201 = _mm512_mul_ps(ifft192, ifft195);
__m512 ifft288 = _mm512_mul_ps(ifft280, ifft195);
__m512 ifft202 = _mm512_mul_ps(ifft193, ifft195);
__m512 ifft289 = _mm512_mul_ps(ifft281, ifft195);
__m512 ifft203 = _mm512_mul_ps(ifft194, ifft195);
__m512 ifft290 = _mm512_mul_ps(ifft282, ifft195);
// Second step: fold in the products with s (fnmadd subtracts, fmadd adds).
__m512 ifft204 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft205 = _mm512_fnmadd_ps(ifft188, ifft204, ifft196);
__m512 ifft291 = _mm512_fnmadd_ps(ifft276, ifft204, ifft283);
__m512 ifft206 = _mm512_fmadd_ps(ifft187, ifft204, ifft197);
__m512 ifft292 = _mm512_fmadd_ps(ifft275, ifft204, ifft284);
__m512 ifft207 = _mm512_fnmadd_ps(ifft190, ifft204, ifft198);
__m512 ifft293 = _mm512_fnmadd_ps(ifft278, ifft204, ifft285);
__m512 ifft208 = _mm512_fmadd_ps(ifft189, ifft204, ifft199);
__m512 ifft294 = _mm512_fmadd_ps(ifft277, ifft204, ifft286);
__m512 ifft209 = _mm512_fnmadd_ps(ifft192, ifft204, ifft200);
__m512 ifft295 = _mm512_fnmadd_ps(ifft280, ifft204, ifft287);
__m512 ifft210 = _mm512_fmadd_ps(ifft191, ifft204, ifft201);
__m512 ifft296 = _mm512_fmadd_ps(ifft279, ifft204, ifft288);
__m512 ifft211 = _mm512_fnmadd_ps(ifft194, ifft204, ifft202);
__m512 ifft297 = _mm512_fnmadd_ps(ifft282, ifft204, ifft289);
__m512 ifft212 = _mm512_fmadd_ps(ifft193, ifft204, ifft203);
__m512 ifft298 = _mm512_fmadd_ps(ifft281, ifft204, ifft290);
// Sum/difference at a distance of two lanes: shuffle immediate 78 (0x4E) swaps
// the 64-bit halves of each 128-bit group, and the sign pattern is +,+,-,-.
__m512 ifft213 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft214 = _mm512_fmadd_ps(ifft205, ifft213, _mm512_shuffle_ps(ifft205, ifft205, 78));
__m512 ifft299 = _mm512_fmadd_ps(ifft291, ifft213, _mm512_shuffle_ps(ifft291, ifft291, 78));
__m512 ifft215 = _mm512_fmadd_ps(ifft206, ifft213, _mm512_shuffle_ps(ifft206, ifft206, 78));
__m512 ifft300 = _mm512_fmadd_ps(ifft292, ifft213, _mm512_shuffle_ps(ifft292, ifft292, 78));
__m512 ifft216 = _mm512_fmadd_ps(ifft207, ifft213, _mm512_shuffle_ps(ifft207, ifft207, 78));
__m512 ifft301 = _mm512_fmadd_ps(ifft293, ifft213, _mm512_shuffle_ps(ifft293, ifft293, 78));
__m512 ifft217 = _mm512_fmadd_ps(ifft208, ifft213, _mm512_shuffle_ps(ifft208, ifft208, 78));
__m512 ifft302 = _mm512_fmadd_ps(ifft294, ifft213, _mm512_shuffle_ps(ifft294, ifft294, 78));
__m512 ifft218 = _mm512_fmadd_ps(ifft209, ifft213, _mm512_shuffle_ps(ifft209, ifft209, 78));
__m512 ifft303 = _mm512_fmadd_ps(ifft295, ifft213, _mm512_shuffle_ps(ifft295, ifft295, 78));
__m512 ifft219 = _mm512_fmadd_ps(ifft210, ifft213, _mm512_shuffle_ps(ifft210, ifft210, 78));
__m512 ifft304 = _mm512_fmadd_ps(ifft296, ifft213, _mm512_shuffle_ps(ifft296, ifft296, 78));
__m512 ifft220 = _mm512_fmadd_ps(ifft211, ifft213, _mm512_shuffle_ps(ifft211, ifft211, 78));
__m512 ifft305 = _mm512_fmadd_ps(ifft297, ifft213, _mm512_shuffle_ps(ifft297, ifft297, 78));
__m512 ifft221 = _mm512_fmadd_ps(ifft212, ifft213, _mm512_shuffle_ps(ifft212, ifft212, 78));
__m512 ifft306 = _mm512_fmadd_ps(ifft298, ifft213, _mm512_shuffle_ps(ifft298, ifft298, 78));
// Mask 49344 is 0xC0C0: lanes 6, 7, 14 and 15. In those lanes each (a, b) pair
// becomes (-b, a); every other lane passes through unchanged.
__m512 ifft222 = _mm512_mask_sub_ps(ifft214, 49344, _mm512_setzero_ps(), ifft215);
__m512 ifft307 = _mm512_mask_sub_ps(ifft299, 49344, _mm512_setzero_ps(), ifft300);
__m512 ifft223 = _mm512_mask_mov_ps(ifft215, 49344, ifft214);
__m512 ifft308 = _mm512_mask_mov_ps(ifft300, 49344, ifft299);
__m512 ifft224 = _mm512_mask_sub_ps(ifft216, 49344, _mm512_setzero_ps(), ifft217);
__m512 ifft309 = _mm512_mask_sub_ps(ifft301, 49344, _mm512_setzero_ps(), ifft302);
__m512 ifft225 = _mm512_mask_mov_ps(ifft217, 49344, ifft216);
__m512 ifft310 = _mm512_mask_mov_ps(ifft302, 49344, ifft301);
__m512 ifft226 = _mm512_mask_sub_ps(ifft218, 49344, _mm512_setzero_ps(), ifft219);
__m512 ifft311 = _mm512_mask_sub_ps(ifft303, 49344, _mm512_setzero_ps(), ifft304);
__m512 ifft227 = _mm512_mask_mov_ps(ifft219, 49344, ifft218);
__m512 ifft312 = _mm512_mask_mov_ps(ifft304, 49344, ifft303);
__m512 ifft228 = _mm512_mask_sub_ps(ifft220, 49344, _mm512_setzero_ps(), ifft221);
__m512 ifft313 = _mm512_mask_sub_ps(ifft305, 49344, _mm512_setzero_ps(), ifft306);
__m512 ifft229 = _mm512_mask_mov_ps(ifft221, 49344, ifft220);
__m512 ifft314 = _mm512_mask_mov_ps(ifft306, 49344, ifft305);
// Sum/difference at a distance of four lanes: shuffle_f32x4 with immediate 177
// swaps neighbouring 128-bit groups, and the sign pattern is four +1 then four
// -1. ifft236 and ifft320 use fnmsub instead, which yields the negated result.
__m512 ifft230 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft231 = _mm512_fmadd_ps(ifft222, ifft230, _mm512_shuffle_f32x4(ifft222, ifft222, 177));
__m512 ifft315 = _mm512_fmadd_ps(ifft307, ifft230, _mm512_shuffle_f32x4(ifft307, ifft307, 177));
__m512 ifft232 = _mm512_fmadd_ps(ifft223, ifft230, _mm512_shuffle_f32x4(ifft223, ifft223, 177));
__m512 ifft316 = _mm512_fmadd_ps(ifft308, ifft230, _mm512_shuffle_f32x4(ifft308, ifft308, 177));
__m512 ifft233 = _mm512_fmadd_ps(ifft224, ifft230, _mm512_shuffle_f32x4(ifft224, ifft224, 177));
__m512 ifft317 = _mm512_fmadd_ps(ifft309, ifft230, _mm512_shuffle_f32x4(ifft309, ifft309, 177));
__m512 ifft234 = _mm512_fmadd_ps(ifft225, ifft230, _mm512_shuffle_f32x4(ifft225, ifft225, 177));
__m512 ifft318 = _mm512_fmadd_ps(ifft310, ifft230, _mm512_shuffle_f32x4(ifft310, ifft310, 177));
__m512 ifft235 = _mm512_fmadd_ps(ifft226, ifft230, _mm512_shuffle_f32x4(ifft226, ifft226, 177));
__m512 ifft319 = _mm512_fmadd_ps(ifft311, ifft230, _mm512_shuffle_f32x4(ifft311, ifft311, 177));
__m512 ifft236 = _mm512_fnmsub_ps(ifft227, ifft230, _mm512_shuffle_f32x4(ifft227, ifft227, 177));
__m512 ifft320 = _mm512_fnmsub_ps(ifft312, ifft230, _mm512_shuffle_f32x4(ifft312, ifft312, 177));
__m512 ifft237 = _mm512_fmadd_ps(ifft228, ifft230, _mm512_shuffle_f32x4(ifft228, ifft228, 177));
__m512 ifft321 = _mm512_fmadd_ps(ifft313, ifft230, _mm512_shuffle_f32x4(ifft313, ifft313, 177));
__m512 ifft238 = _mm512_fmadd_ps(ifft229, ifft230, _mm512_shuffle_f32x4(ifft229, ifft229, 177));
__m512 ifft322 = _mm512_fmadd_ps(ifft314, ifft230, _mm512_shuffle_f32x4(ifft314, ifft314, 177));
// From here on the work is element-wise across whole vectors: sums and
// differences between the eight vectors of each chain.
__m512 ifft239 = _mm512_add_ps(ifft231, ifft232);
__m512 ifft323 = _mm512_add_ps(ifft315, ifft316);
__m512 ifft240 = _mm512_sub_ps(ifft231, ifft232);
__m512 ifft324 = _mm512_sub_ps(ifft315, ifft316);
__m512 ifft241 = _mm512_sub_ps(ifft233, ifft237);
__m512 ifft325 = _mm512_sub_ps(ifft317, ifft321);
__m512 ifft242 = _mm512_add_ps(ifft234, ifft238);
__m512 ifft326 = _mm512_add_ps(ifft318, ifft322);
__m512 ifft243 = _mm512_add_ps(ifft233, ifft237);
__m512 ifft327 = _mm512_add_ps(ifft317, ifft321);
__m512 ifft244 = _mm512_sub_ps(ifft234, ifft238);
__m512 ifft328 = _mm512_sub_ps(ifft318, ifft322);
// Scaling is folded into these operations: 3.125e-02 is 1/32 and 1.5625e-02 is
// 1/64.
__m512 ifft245 = _mm512_mul_ps(ifft235, _mm512_set1_ps(3.125e-02f));
__m512 ifft329 = _mm512_mul_ps(ifft319, _mm512_set1_ps(3.125e-02f));
__m512 ifft246 = _mm512_mul_ps(ifft236, _mm512_set1_ps(3.125e-02f));
__m512 ifft330 = _mm512_mul_ps(ifft320, _mm512_set1_ps(3.125e-02f));
__m512 ifft247 = _mm512_fmadd_ps(ifft239, _mm512_set1_ps(1.5625e-02f), ifft245);
__m512 ifft331 = _mm512_fmadd_ps(ifft323, _mm512_set1_ps(1.5625e-02f), ifft329);
__m512 ifft248 = _mm512_fmsub_ps(ifft239, _mm512_set1_ps(1.5625e-02f), ifft245);
__m512 ifft332 = _mm512_fmsub_ps(ifft323, _mm512_set1_ps(1.5625e-02f), ifft329);
__m512 ifft249 = _mm512_fmadd_ps(ifft240, _mm512_set1_ps(1.5625e-02f), ifft246);
__m512 ifft333 = _mm512_fmadd_ps(ifft324, _mm512_set1_ps(1.5625e-02f), ifft330);
__m512 ifft250 = _mm512_fmsub_ps(ifft240, _mm512_set1_ps(1.5625e-02f), ifft246);
__m512 ifft334 = _mm512_fmsub_ps(ifft324, _mm512_set1_ps(1.5625e-02f), ifft330);
__m512 ifft251 = _mm512_add_ps(ifft241, ifft242);
__m512 ifft335 = _mm512_add_ps(ifft325, ifft326);
__m512 ifft252 = _mm512_sub_ps(ifft241, ifft242);
__m512 ifft336 = _mm512_sub_ps(ifft325, ifft326);
// Combine with a uniform factor of 7.0710677e-01 (about 1/sqrt(2)).
__m512 ifft253 = _mm512_fnmadd_ps(ifft251, _mm512_set1_ps(7.0710677e-01f), ifft243);
__m512 ifft337 = _mm512_fnmadd_ps(ifft335, _mm512_set1_ps(7.0710677e-01f), ifft327);
__m512 ifft254 = _mm512_fmadd_ps(ifft251, _mm512_set1_ps(7.0710677e-01f), ifft243);
__m512 ifft338 = _mm512_fmadd_ps(ifft335, _mm512_set1_ps(7.0710677e-01f), ifft327);
__m512 ifft255 = _mm512_fmadd_ps(ifft252, _mm512_set1_ps(7.0710677e-01f), ifft244);
__m512 ifft339 = _mm512_fmadd_ps(ifft336, _mm512_set1_ps(7.0710677e-01f), ifft328);
__m512 ifft256 = _mm512_fmsub_ps(ifft252, _mm512_set1_ps(7.0710677e-01f), ifft244);
__m512 ifft340 = _mm512_fmsub_ps(ifft336, _mm512_set1_ps(7.0710677e-01f), ifft328);
__m512 ifft257 = _mm512_add_ps(ifft253, ifft254);
__m512 ifft341 = _mm512_add_ps(ifft337, ifft338);
__m512 ifft258 = _mm512_sub_ps(ifft253, ifft254);
__m512 ifft342 = _mm512_sub_ps(ifft337, ifft338);
__m512 ifft259 = _mm512_add_ps(ifft255, ifft256);
__m512 ifft343 = _mm512_add_ps(ifft339, ifft340);
__m512 ifft260 = _mm512_sub_ps(ifft255, ifft256);
__m512 ifft344 = _mm512_sub_ps(ifft339, ifft340);
// Final combination: eight result vectors per chain (ifft261..ifft268 and
// ifft345..ifft352).
__m512 ifft261 = _mm512_fmadd_ps(ifft257, _mm512_set1_ps(1.5625e-02f), ifft247);
__m512 ifft345 = _mm512_fmadd_ps(ifft341, _mm512_set1_ps(1.5625e-02f), ifft331);
__m512 ifft262 = _mm512_fnmadd_ps(ifft257, _mm512_set1_ps(1.5625e-02f), ifft247);
__m512 ifft346 = _mm512_fnmadd_ps(ifft341, _mm512_set1_ps(1.5625e-02f), ifft331);
__m512 ifft263 = _mm512_fmadd_ps(ifft259, _mm512_set1_ps(1.5625e-02f), ifft249);
__m512 ifft347 = _mm512_fmadd_ps(ifft343, _mm512_set1_ps(1.5625e-02f), ifft333);
__m512 ifft264 = _mm512_fnmadd_ps(ifft259, _mm512_set1_ps(1.5625e-02f), ifft249);
__m512 ifft348 = _mm512_fnmadd_ps(ifft343, _mm512_set1_ps(1.5625e-02f), ifft333);
__m512 ifft265 = _mm512_fnmadd_ps(ifft260, _mm512_set1_ps(1.5625e-02f), ifft248);
__m512 ifft349 = _mm512_fnmadd_ps(ifft344, _mm512_set1_ps(1.5625e-02f), ifft332);
__m512 ifft266 = _mm512_fmadd_ps(ifft260, _mm512_set1_ps(1.5625e-02f), ifft248);
__m512 ifft350 = _mm512_fmadd_ps(ifft344, _mm512_set1_ps(1.5625e-02f), ifft332);
__m512 ifft267 = _mm512_fmadd_ps(ifft258, _mm512_set1_ps(1.5625e-02f), ifft250);
__m512 ifft351 = _mm512_fmadd_ps(ifft342, _mm512_set1_ps(1.5625e-02f), ifft334);
__m512 ifft268 = _mm512_fnmadd_ps(ifft258, _mm512_set1_ps(1.5625e-02f), ifft250);
__m512 ifft352 = _mm512_fnmadd_ps(ifft342, _mm512_set1_ps(1.5625e-02f), ifft334);
// Five of the eight results from each chain are kept (dat600..dat604 from the
// first chain, dat605..dat609 from the second).
__m512 dat600 = ifft261;
__m512 dat605 = ifft345;
__m512 dat601 = ifft263;
__m512 dat606 = ifft347;
__m512 dat602 = ifft265;
__m512 dat607 = ifft349;
__m512 dat603 = ifft267;
__m512 dat608 = ifft351;
__m512 dat604 = ifft262;
__m512 dat609 = ifft346;
// The remaining three results per chain are never stored; the void casts only
// mark them as deliberately unused.
(void)ifft264;
(void)ifft348;
(void)ifft266;
(void)ifft350;
(void)ifft268;
(void)ifft352;
// Interleave each (first-chain, second-chain) pair into two vectors. Indices
// 16 and above pick from the second operand of permutex2var. Among the ten
// lanes that are later stored:
//   pm3: lanes 0-4 come from the first operand's lanes 0-4, lanes 5-9 from the
//        second operand's lanes 0-4.
//   pm4: lanes 0-4 come from the second operand's lanes 8-12, lanes 5-9 from
//        the first operand's lanes 8-12.
__m512i pm3 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack11 = _mm512_permutex2var_ps(dat600, pm3, dat605);
__m512i pm4 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack12 = _mm512_permutex2var_ps(dat600, pm4, dat605);
__m512 pack13 = _mm512_permutex2var_ps(dat601, pm3, dat606);
__m512 pack14 = _mm512_permutex2var_ps(dat601, pm4, dat606);
__m512 pack15 = _mm512_permutex2var_ps(dat602, pm3, dat607);
__m512 pack16 = _mm512_permutex2var_ps(dat602, pm4, dat607);
__m512 pack17 = _mm512_permutex2var_ps(dat603, pm3, dat608);
__m512 pack18 = _mm512_permutex2var_ps(dat603, pm4, dat608);
__m512 pack19 = _mm512_permutex2var_ps(dat604, pm3, dat609);
__m512 pack20 = _mm512_permutex2var_ps(dat604, pm4, dat609);
// Clamp at zero: element-wise max(0, x).
pack11 = _mm512_max_ps(_mm512_setzero_ps(), pack11);
pack12 = _mm512_max_ps(_mm512_setzero_ps(), pack12);
pack13 = _mm512_max_ps(_mm512_setzero_ps(), pack13);
pack14 = _mm512_max_ps(_mm512_setzero_ps(), pack14);
pack15 = _mm512_max_ps(_mm512_setzero_ps(), pack15);
pack16 = _mm512_max_ps(_mm512_setzero_ps(), pack16);
pack17 = _mm512_max_ps(_mm512_setzero_ps(), pack17);
pack18 = _mm512_max_ps(_mm512_setzero_ps(), pack18);
pack19 = _mm512_max_ps(_mm512_setzero_ps(), pack19);
pack20 = _mm512_max_ps(_mm512_setzero_ps(), pack20);
// Mask 1023 (0x3FF) writes only lanes 0-9 of each vector. The pm3 vectors go
// to base offsets 40, 488, 936, 1384 and 1832 (steps of 448). The pm4 vectors
// go to the same offsets plus 50240. t3 shifts every address by 40.
_mm512_mask_storeu_ps(datPtr2+40+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack11);
_mm512_mask_storeu_ps(datPtr2+50280+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack12);
_mm512_mask_storeu_ps(datPtr2+488+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack13);
_mm512_mask_storeu_ps(datPtr2+50728+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack14);
_mm512_mask_storeu_ps(datPtr2+936+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack15);
_mm512_mask_storeu_ps(datPtr2+51176+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack16);
_mm512_mask_storeu_ps(datPtr2+1384+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack17);
_mm512_mask_storeu_ps(datPtr2+51624+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack18);
_mm512_mask_storeu_ps(datPtr2+1832+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack19);
_mm512_mask_storeu_ps(datPtr2+52072+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack20);
}
}
}
if (j5 >= last2) return;
++j5;
rel4 = 1;
}
if (rel4 < 3) {
ptrdiff_t toH2 = base4+0;
ptrdiff_t toW2 = 0+30*rel4;
ptrdiff_t jj11 = 2-rel4+j5;
for (; j5 <= jj11; toW2 += 30) {
ptrdiff_t k26 = 16*w21;
for (; k26 != 16; ++k26) {
ptrdiff_t r3 = 0;
for (; r3 != 2; ++r3) {
ptrdiff_t t4 = 0;
for (; t4 < 3; ++t4) {
__m512 sfRe73 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm73 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe77 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm77 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe74 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm74 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe78 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm78 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe75 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm75 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe79 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm79 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe76 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm76 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe80 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm80 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512i ifft353 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft354 = _mm512_permutexvar_ps(ifft353, sfRe73);
__m512 ifft445 = _mm512_permutexvar_ps(ifft353, sfRe77);
__m512i ifft355 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft356 = _mm512_permutexvar_ps(ifft355, sfRe73);
__m512 ifft446 = _mm512_permutexvar_ps(ifft355, sfRe77);
__m512 ifft357 = _mm512_permutexvar_ps(ifft353, sfIm73);
__m512 ifft447 = _mm512_permutexvar_ps(ifft353, sfIm77);
__m512 ifft358 = _mm512_permutexvar_ps(ifft355, sfIm73);
__m512 ifft448 = _mm512_permutexvar_ps(ifft355, sfIm77);
__m512 ifft359 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft360 = _mm512_mask_fmadd_ps(ifft358, 65021, ifft359, ifft354);
__m512 ifft449 = _mm512_mask_fmadd_ps(ifft448, 65021, ifft359, ifft445);
__m512 ifft361 = _mm512_mask_fnmadd_ps(ifft357, 65021, ifft359, ifft356);
__m512 ifft450 = _mm512_mask_fnmadd_ps(ifft447, 65021, ifft359, ifft446);
__m512 ifft362 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft363 = _mm512_fmadd_ps(ifft360, ifft362, _mm512_shuffle_ps(ifft360, ifft360, 177));
__m512 ifft451 = _mm512_fmadd_ps(ifft449, ifft362, _mm512_shuffle_ps(ifft449, ifft449, 177));
__m512 ifft364 = _mm512_fmadd_ps(ifft361, ifft362, _mm512_shuffle_ps(ifft361, ifft361, 177));
__m512 ifft452 = _mm512_fmadd_ps(ifft450, ifft362, _mm512_shuffle_ps(ifft450, ifft450, 177));
__m512 ifft365 = _mm512_fmadd_ps(sfRe74, ifft362, _mm512_shuffle_ps(sfRe74, sfRe74, 177));
__m512 ifft453 = _mm512_fmadd_ps(sfRe78, ifft362, _mm512_shuffle_ps(sfRe78, sfRe78, 177));
__m512 ifft366 = _mm512_fmadd_ps(sfIm74, ifft362, _mm512_shuffle_ps(sfIm74, sfIm74, 177));
__m512 ifft454 = _mm512_fmadd_ps(sfIm78, ifft362, _mm512_shuffle_ps(sfIm78, sfIm78, 177));
__m512 ifft367 = _mm512_fmadd_ps(sfRe75, ifft362, _mm512_shuffle_ps(sfRe75, sfRe75, 177));
__m512 ifft455 = _mm512_fmadd_ps(sfRe79, ifft362, _mm512_shuffle_ps(sfRe79, sfRe79, 177));
__m512 ifft368 = _mm512_fmadd_ps(sfIm75, ifft362, _mm512_shuffle_ps(sfIm75, sfIm75, 177));
__m512 ifft456 = _mm512_fmadd_ps(sfIm79, ifft362, _mm512_shuffle_ps(sfIm79, sfIm79, 177));
__m512 ifft369 = _mm512_fmadd_ps(sfRe76, ifft362, _mm512_shuffle_ps(sfRe76, sfRe76, 177));
__m512 ifft457 = _mm512_fmadd_ps(sfRe80, ifft362, _mm512_shuffle_ps(sfRe80, sfRe80, 177));
__m512 ifft370 = _mm512_fmadd_ps(sfIm76, ifft362, _mm512_shuffle_ps(sfIm76, sfIm76, 177));
__m512 ifft458 = _mm512_fmadd_ps(sfIm80, ifft362, _mm512_shuffle_ps(sfIm80, sfIm80, 177));
__m512 ifft371 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft372 = _mm512_mul_ps(ifft363, ifft371);
__m512 ifft459 = _mm512_mul_ps(ifft451, ifft371);
__m512 ifft373 = _mm512_mul_ps(ifft364, ifft371);
__m512 ifft460 = _mm512_mul_ps(ifft452, ifft371);
__m512 ifft374 = _mm512_mul_ps(ifft365, ifft371);
__m512 ifft461 = _mm512_mul_ps(ifft453, ifft371);
__m512 ifft375 = _mm512_mul_ps(ifft366, ifft371);
__m512 ifft462 = _mm512_mul_ps(ifft454, ifft371);
__m512 ifft376 = _mm512_mul_ps(ifft367, ifft371);
__m512 ifft463 = _mm512_mul_ps(ifft455, ifft371);
__m512 ifft377 = _mm512_mul_ps(ifft368, ifft371);
__m512 ifft464 = _mm512_mul_ps(ifft456, ifft371);
__m512 ifft378 = _mm512_mul_ps(ifft369, ifft371);
__m512 ifft465 = _mm512_mul_ps(ifft457, ifft371);
__m512 ifft379 = _mm512_mul_ps(ifft370, ifft371);
__m512 ifft466 = _mm512_mul_ps(ifft458, ifft371);
__m512 ifft380 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft381 = _mm512_fnmadd_ps(ifft364, ifft380, ifft372);
__m512 ifft467 = _mm512_fnmadd_ps(ifft452, ifft380, ifft459);
__m512 ifft382 = _mm512_fmadd_ps(ifft363, ifft380, ifft373);
__m512 ifft468 = _mm512_fmadd_ps(ifft451, ifft380, ifft460);
__m512 ifft383 = _mm512_fnmadd_ps(ifft366, ifft380, ifft374);
__m512 ifft469 = _mm512_fnmadd_ps(ifft454, ifft380, ifft461);
__m512 ifft384 = _mm512_fmadd_ps(ifft365, ifft380, ifft375);
__m512 ifft470 = _mm512_fmadd_ps(ifft453, ifft380, ifft462);
__m512 ifft385 = _mm512_fnmadd_ps(ifft368, ifft380, ifft376);
__m512 ifft471 = _mm512_fnmadd_ps(ifft456, ifft380, ifft463);
__m512 ifft386 = _mm512_fmadd_ps(ifft367, ifft380, ifft377);
__m512 ifft472 = _mm512_fmadd_ps(ifft455, ifft380, ifft464);
__m512 ifft387 = _mm512_fnmadd_ps(ifft370, ifft380, ifft378);
__m512 ifft473 = _mm512_fnmadd_ps(ifft458, ifft380, ifft465);
__m512 ifft388 = _mm512_fmadd_ps(ifft369, ifft380, ifft379);
__m512 ifft474 = _mm512_fmadd_ps(ifft457, ifft380, ifft466);
__m512 ifft389 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft390 = _mm512_fmadd_ps(ifft381, ifft389, _mm512_shuffle_ps(ifft381, ifft381, 78));
__m512 ifft475 = _mm512_fmadd_ps(ifft467, ifft389, _mm512_shuffle_ps(ifft467, ifft467, 78));
__m512 ifft391 = _mm512_fmadd_ps(ifft382, ifft389, _mm512_shuffle_ps(ifft382, ifft382, 78));
__m512 ifft476 = _mm512_fmadd_ps(ifft468, ifft389, _mm512_shuffle_ps(ifft468, ifft468, 78));
__m512 ifft392 = _mm512_fmadd_ps(ifft383, ifft389, _mm512_shuffle_ps(ifft383, ifft383, 78));
__m512 ifft477 = _mm512_fmadd_ps(ifft469, ifft389, _mm512_shuffle_ps(ifft469, ifft469, 78));
__m512 ifft393 = _mm512_fmadd_ps(ifft384, ifft389, _mm512_shuffle_ps(ifft384, ifft384, 78));
__m512 ifft478 = _mm512_fmadd_ps(ifft470, ifft389, _mm512_shuffle_ps(ifft470, ifft470, 78));
__m512 ifft394 = _mm512_fmadd_ps(ifft385, ifft389, _mm512_shuffle_ps(ifft385, ifft385, 78));
__m512 ifft479 = _mm512_fmadd_ps(ifft471, ifft389, _mm512_shuffle_ps(ifft471, ifft471, 78));
__m512 ifft395 = _mm512_fmadd_ps(ifft386, ifft389, _mm512_shuffle_ps(ifft386, ifft386, 78));
__m512 ifft480 = _mm512_fmadd_ps(ifft472, ifft389, _mm512_shuffle_ps(ifft472, ifft472, 78));
__m512 ifft396 = _mm512_fmadd_ps(ifft387, ifft389, _mm512_shuffle_ps(ifft387, ifft387, 78));
__m512 ifft481 = _mm512_fmadd_ps(ifft473, ifft389, _mm512_shuffle_ps(ifft473, ifft473, 78));
__m512 ifft397 = _mm512_fmadd_ps(ifft388, ifft389, _mm512_shuffle_ps(ifft388, ifft388, 78));
__m512 ifft482 = _mm512_fmadd_ps(ifft474, ifft389, _mm512_shuffle_ps(ifft474, ifft474, 78));
__m512 ifft398 = _mm512_mask_sub_ps(ifft390, 49344, _mm512_setzero_ps(), ifft391);
__m512 ifft483 = _mm512_mask_sub_ps(ifft475, 49344, _mm512_setzero_ps(), ifft476);
__m512 ifft399 = _mm512_mask_mov_ps(ifft391, 49344, ifft390);
__m512 ifft484 = _mm512_mask_mov_ps(ifft476, 49344, ifft475);
__m512 ifft400 = _mm512_mask_sub_ps(ifft392, 49344, _mm512_setzero_ps(), ifft393);
__m512 ifft485 = _mm512_mask_sub_ps(ifft477, 49344, _mm512_setzero_ps(), ifft478);
__m512 ifft401 = _mm512_mask_mov_ps(ifft393, 49344, ifft392);
__m512 ifft486 = _mm512_mask_mov_ps(ifft478, 49344, ifft477);
__m512 ifft402 = _mm512_mask_sub_ps(ifft394, 49344, _mm512_setzero_ps(), ifft395);
__m512 ifft487 = _mm512_mask_sub_ps(ifft479, 49344, _mm512_setzero_ps(), ifft480);
__m512 ifft403 = _mm512_mask_mov_ps(ifft395, 49344, ifft394);
__m512 ifft488 = _mm512_mask_mov_ps(ifft480, 49344, ifft479);
__m512 ifft404 = _mm512_mask_sub_ps(ifft396, 49344, _mm512_setzero_ps(), ifft397);
__m512 ifft489 = _mm512_mask_sub_ps(ifft481, 49344, _mm512_setzero_ps(), ifft482);
__m512 ifft405 = _mm512_mask_mov_ps(ifft397, 49344, ifft396);
__m512 ifft490 = _mm512_mask_mov_ps(ifft482, 49344, ifft481);
__m512 ifft406 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft407 = _mm512_fmadd_ps(ifft398, ifft406, _mm512_shuffle_f32x4(ifft398, ifft398, 177));
__m512 ifft491 = _mm512_fmadd_ps(ifft483, ifft406, _mm512_shuffle_f32x4(ifft483, ifft483, 177));
__m512 ifft408 = _mm512_fmadd_ps(ifft399, ifft406, _mm512_shuffle_f32x4(ifft399, ifft399, 177));
__m512 ifft492 = _mm512_fmadd_ps(ifft484, ifft406, _mm512_shuffle_f32x4(ifft484, ifft484, 177));
__m512 ifft409 = _mm512_fmadd_ps(ifft400, ifft406, _mm512_shuffle_f32x4(ifft400, ifft400, 177));
__m512 ifft493 = _mm512_fmadd_ps(ifft485, ifft406, _mm512_shuffle_f32x4(ifft485, ifft485, 177));
__m512 ifft410 = _mm512_fmadd_ps(ifft401, ifft406, _mm512_shuffle_f32x4(ifft401, ifft401, 177));
__m512 ifft494 = _mm512_fmadd_ps(ifft486, ifft406, _mm512_shuffle_f32x4(ifft486, ifft486, 177));
__m512 ifft411 = _mm512_fmadd_ps(ifft402, ifft406, _mm512_shuffle_f32x4(ifft402, ifft402, 177));
__m512 ifft495 = _mm512_fmadd_ps(ifft487, ifft406, _mm512_shuffle_f32x4(ifft487, ifft487, 177));
__m512 ifft412 = _mm512_fnmsub_ps(ifft403, ifft406, _mm512_shuffle_f32x4(ifft403, ifft403, 177));
__m512 ifft496 = _mm512_fnmsub_ps(ifft488, ifft406, _mm512_shuffle_f32x4(ifft488, ifft488, 177));
__m512 ifft413 = _mm512_fmadd_ps(ifft404, ifft406, _mm512_shuffle_f32x4(ifft404, ifft404, 177));
__m512 ifft497 = _mm512_fmadd_ps(ifft489, ifft406, _mm512_shuffle_f32x4(ifft489, ifft489, 177));
__m512 ifft414 = _mm512_fmadd_ps(ifft405, ifft406, _mm512_shuffle_f32x4(ifft405, ifft405, 177));
__m512 ifft498 = _mm512_fmadd_ps(ifft490, ifft406, _mm512_shuffle_f32x4(ifft490, ifft490, 177));
__m512 ifft415 = _mm512_add_ps(ifft407, ifft408);
__m512 ifft499 = _mm512_add_ps(ifft491, ifft492);
__m512 ifft416 = _mm512_sub_ps(ifft407, ifft408);
__m512 ifft500 = _mm512_sub_ps(ifft491, ifft492);
__m512 ifft417 = _mm512_sub_ps(ifft409, ifft413);
__m512 ifft501 = _mm512_sub_ps(ifft493, ifft497);
__m512 ifft418 = _mm512_add_ps(ifft410, ifft414);
__m512 ifft502 = _mm512_add_ps(ifft494, ifft498);
__m512 ifft419 = _mm512_add_ps(ifft409, ifft413);
__m512 ifft503 = _mm512_add_ps(ifft493, ifft497);
__m512 ifft420 = _mm512_sub_ps(ifft410, ifft414);
__m512 ifft504 = _mm512_sub_ps(ifft494, ifft498);
__m512 ifft421 = _mm512_mul_ps(ifft411, _mm512_set1_ps(3.125e-02f));
__m512 ifft505 = _mm512_mul_ps(ifft495, _mm512_set1_ps(3.125e-02f));
__m512 ifft422 = _mm512_mul_ps(ifft412, _mm512_set1_ps(3.125e-02f));
__m512 ifft506 = _mm512_mul_ps(ifft496, _mm512_set1_ps(3.125e-02f));
__m512 ifft423 = _mm512_fmadd_ps(ifft415, _mm512_set1_ps(1.5625e-02f), ifft421);
__m512 ifft507 = _mm512_fmadd_ps(ifft499, _mm512_set1_ps(1.5625e-02f), ifft505);
__m512 ifft424 = _mm512_fmsub_ps(ifft415, _mm512_set1_ps(1.5625e-02f), ifft421);
__m512 ifft508 = _mm512_fmsub_ps(ifft499, _mm512_set1_ps(1.5625e-02f), ifft505);
__m512 ifft425 = _mm512_fmadd_ps(ifft416, _mm512_set1_ps(1.5625e-02f), ifft422);
__m512 ifft509 = _mm512_fmadd_ps(ifft500, _mm512_set1_ps(1.5625e-02f), ifft506);
__m512 ifft426 = _mm512_fmsub_ps(ifft416, _mm512_set1_ps(1.5625e-02f), ifft422);
__m512 ifft510 = _mm512_fmsub_ps(ifft500, _mm512_set1_ps(1.5625e-02f), ifft506);
__m512 ifft427 = _mm512_add_ps(ifft417, ifft418);
__m512 ifft511 = _mm512_add_ps(ifft501, ifft502);
__m512 ifft428 = _mm512_sub_ps(ifft417, ifft418);
__m512 ifft512 = _mm512_sub_ps(ifft501, ifft502);
__m512 ifft429 = _mm512_fnmadd_ps(ifft427, _mm512_set1_ps(7.0710677e-01f), ifft419);
__m512 ifft513 = _mm512_fnmadd_ps(ifft511, _mm512_set1_ps(7.0710677e-01f), ifft503);
__m512 ifft430 = _mm512_fmadd_ps(ifft427, _mm512_set1_ps(7.0710677e-01f), ifft419);
__m512 ifft514 = _mm512_fmadd_ps(ifft511, _mm512_set1_ps(7.0710677e-01f), ifft503);
__m512 ifft431 = _mm512_fmadd_ps(ifft428, _mm512_set1_ps(7.0710677e-01f), ifft420);
__m512 ifft515 = _mm512_fmadd_ps(ifft512, _mm512_set1_ps(7.0710677e-01f), ifft504);
__m512 ifft432 = _mm512_fmsub_ps(ifft428, _mm512_set1_ps(7.0710677e-01f), ifft420);
__m512 ifft516 = _mm512_fmsub_ps(ifft512, _mm512_set1_ps(7.0710677e-01f), ifft504);
__m512 ifft433 = _mm512_add_ps(ifft429, ifft430);
__m512 ifft517 = _mm512_add_ps(ifft513, ifft514);
__m512 ifft434 = _mm512_sub_ps(ifft429, ifft430);
__m512 ifft518 = _mm512_sub_ps(ifft513, ifft514);
__m512 ifft435 = _mm512_add_ps(ifft431, ifft432);
__m512 ifft519 = _mm512_add_ps(ifft515, ifft516);
__m512 ifft436 = _mm512_sub_ps(ifft431, ifft432);
__m512 ifft520 = _mm512_sub_ps(ifft515, ifft516);
__m512 ifft437 = _mm512_fmadd_ps(ifft433, _mm512_set1_ps(1.5625e-02f), ifft423);
__m512 ifft521 = _mm512_fmadd_ps(ifft517, _mm512_set1_ps(1.5625e-02f), ifft507);
__m512 ifft438 = _mm512_fnmadd_ps(ifft433, _mm512_set1_ps(1.5625e-02f), ifft423);
__m512 ifft522 = _mm512_fnmadd_ps(ifft517, _mm512_set1_ps(1.5625e-02f), ifft507);
__m512 ifft439 = _mm512_fmadd_ps(ifft435, _mm512_set1_ps(1.5625e-02f), ifft425);
__m512 ifft523 = _mm512_fmadd_ps(ifft519, _mm512_set1_ps(1.5625e-02f), ifft509);
__m512 ifft440 = _mm512_fnmadd_ps(ifft435, _mm512_set1_ps(1.5625e-02f), ifft425);
__m512 ifft524 = _mm512_fnmadd_ps(ifft519, _mm512_set1_ps(1.5625e-02f), ifft509);
__m512 ifft441 = _mm512_fnmadd_ps(ifft436, _mm512_set1_ps(1.5625e-02f), ifft424);
__m512 ifft525 = _mm512_fnmadd_ps(ifft520, _mm512_set1_ps(1.5625e-02f), ifft508);
__m512 ifft442 = _mm512_fmadd_ps(ifft436, _mm512_set1_ps(1.5625e-02f), ifft424);
__m512 ifft526 = _mm512_fmadd_ps(ifft520, _mm512_set1_ps(1.5625e-02f), ifft508);
__m512 ifft443 = _mm512_fmadd_ps(ifft434, _mm512_set1_ps(1.5625e-02f), ifft426);
__m512 ifft527 = _mm512_fmadd_ps(ifft518, _mm512_set1_ps(1.5625e-02f), ifft510);
__m512 ifft444 = _mm512_fnmadd_ps(ifft434, _mm512_set1_ps(1.5625e-02f), ifft426);
__m512 ifft528 = _mm512_fnmadd_ps(ifft518, _mm512_set1_ps(1.5625e-02f), ifft510);
__m512 dat610 = ifft437;
__m512 dat615 = ifft521;
__m512 dat611 = ifft439;
__m512 dat616 = ifft523;
__m512 dat612 = ifft441;
__m512 dat617 = ifft525;
__m512 dat613 = ifft443;
__m512 dat618 = ifft527;
__m512 dat614 = ifft438;
__m512 dat619 = ifft522;
(void)ifft440;
(void)ifft524;
(void)ifft442;
(void)ifft526;
(void)ifft444;
(void)ifft528;
__m512i pm5 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack21 = _mm512_permutex2var_ps(dat610, pm5, dat615);
__m512i pm6 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack22 = _mm512_permutex2var_ps(dat610, pm6, dat615);
__m512 pack23 = _mm512_permutex2var_ps(dat611, pm5, dat616);
__m512 pack24 = _mm512_permutex2var_ps(dat611, pm6, dat616);
__m512 pack25 = _mm512_permutex2var_ps(dat612, pm5, dat617);
__m512 pack26 = _mm512_permutex2var_ps(dat612, pm6, dat617);
__m512 pack27 = _mm512_permutex2var_ps(dat613, pm5, dat618);
__m512 pack28 = _mm512_permutex2var_ps(dat613, pm6, dat618);
__m512 pack29 = _mm512_permutex2var_ps(dat614, pm5, dat619);
__m512 pack30 = _mm512_permutex2var_ps(dat614, pm6, dat619);
pack21 = _mm512_max_ps(_mm512_setzero_ps(), pack21);
pack22 = _mm512_max_ps(_mm512_setzero_ps(), pack22);
pack23 = _mm512_max_ps(_mm512_setzero_ps(), pack23);
pack24 = _mm512_max_ps(_mm512_setzero_ps(), pack24);
pack25 = _mm512_max_ps(_mm512_setzero_ps(), pack25);
pack26 = _mm512_max_ps(_mm512_setzero_ps(), pack26);
pack27 = _mm512_max_ps(_mm512_setzero_ps(), pack27);
pack28 = _mm512_max_ps(_mm512_setzero_ps(), pack28);
pack29 = _mm512_max_ps(_mm512_setzero_ps(), pack29);
pack30 = _mm512_max_ps(_mm512_setzero_ps(), pack30);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack21);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack22);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack23);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack24);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack25);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack26);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack27);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack28);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack29);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack30);
}
}
}
if (j5 >= last2) return;
++j5;
}
rel4 = 3;
}
ptrdiff_t toH3 = base4+0;
ptrdiff_t toW3 = 90;
ptrdiff_t k27 = 16*w21;
for (; k27 != 16; ++k27) {
ptrdiff_t r4 = 0;
for (; r4 != 2; ++r4) {
ptrdiff_t t5 = 0;
// For each t5 (0 and 1; t5 is initialised to 0 just before this loop): load 16
// vectors from sfPtr3, run two interleaved copies of the same transform,
// clamp the results at zero and write 10 lanes per store to datPtr2.
//   Stream A: sfRe81..84 / sfIm81..84 -> ifft613..ifft620
//   Stream B: sfRe85..88 / sfIm85..88 -> ifft697..ifft704
// Every statement for stream A is immediately followed by its stream-B twin;
// the two streams share only the constant vectors.
// NOTE(review): the sfRe/sfIm/ifft names suggest real/imaginary planes and an
// inverse FFT -- confirm against the generator; the comments below describe
// only what the intrinsics literally do.
for (; t5 < 2; ++t5) {
// Loads: four regions at base offsets 0, 2166784, 4333568 and 6500352; inside
// each region +0/+64 feed stream A and +128/+192 feed stream B. Each t5 step
// advances by 256. (Offset units depend on sfPtr3's type, which is declared
// outside this span.)
__m512 sfRe81 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm81 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe85 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm85 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe82 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm82 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe86 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm86 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe83 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm83 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe87 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm87 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe84 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm84 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe88 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm88 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
// Only the first Re/Im pair of each stream (sfRe81/sfIm81, sfRe85/sfIm85) goes
// through this extra step: two lane gathers (the same pattern in each 8-lane
// half) followed by a masked multiply-add with a 0/+1/-1 vector.
__m512i ifft529 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft530 = _mm512_permutexvar_ps(ifft529, sfRe81);
__m512 ifft621 = _mm512_permutexvar_ps(ifft529, sfRe85);
__m512i ifft531 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft532 = _mm512_permutexvar_ps(ifft531, sfRe81);
__m512 ifft622 = _mm512_permutexvar_ps(ifft531, sfRe85);
__m512 ifft533 = _mm512_permutexvar_ps(ifft529, sfIm81);
__m512 ifft623 = _mm512_permutexvar_ps(ifft529, sfIm85);
__m512 ifft534 = _mm512_permutexvar_ps(ifft531, sfIm81);
__m512 ifft624 = _mm512_permutexvar_ps(ifft531, sfIm85);
// Mask 65021 (0xFDFD) selects every lane except 1 and 9; in those two lanes
// the masked FMA passes its first operand through unchanged.
__m512 ifft535 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft536 = _mm512_mask_fmadd_ps(ifft534, 65021, ifft535, ifft530);
__m512 ifft625 = _mm512_mask_fmadd_ps(ifft624, 65021, ifft535, ifft621);
__m512 ifft537 = _mm512_mask_fnmadd_ps(ifft533, 65021, ifft535, ifft532);
__m512 ifft626 = _mm512_mask_fnmadd_ps(ifft623, 65021, ifft535, ifft622);
// Butterfly between adjacent lanes: shuffle control 177 swaps lanes 2k and
// 2k+1, and the sign vector is +1 on even lanes and -1 on odd lanes, so even
// lanes receive x[2k]+x[2k+1] and odd lanes x[2k]-x[2k+1].
__m512 ifft538 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft539 = _mm512_fmadd_ps(ifft536, ifft538, _mm512_shuffle_ps(ifft536, ifft536, 177));
__m512 ifft627 = _mm512_fmadd_ps(ifft625, ifft538, _mm512_shuffle_ps(ifft625, ifft625, 177));
__m512 ifft540 = _mm512_fmadd_ps(ifft537, ifft538, _mm512_shuffle_ps(ifft537, ifft537, 177));
__m512 ifft628 = _mm512_fmadd_ps(ifft626, ifft538, _mm512_shuffle_ps(ifft626, ifft626, 177));
__m512 ifft541 = _mm512_fmadd_ps(sfRe82, ifft538, _mm512_shuffle_ps(sfRe82, sfRe82, 177));
__m512 ifft629 = _mm512_fmadd_ps(sfRe86, ifft538, _mm512_shuffle_ps(sfRe86, sfRe86, 177));
__m512 ifft542 = _mm512_fmadd_ps(sfIm82, ifft538, _mm512_shuffle_ps(sfIm82, sfIm82, 177));
__m512 ifft630 = _mm512_fmadd_ps(sfIm86, ifft538, _mm512_shuffle_ps(sfIm86, sfIm86, 177));
__m512 ifft543 = _mm512_fmadd_ps(sfRe83, ifft538, _mm512_shuffle_ps(sfRe83, sfRe83, 177));
__m512 ifft631 = _mm512_fmadd_ps(sfRe87, ifft538, _mm512_shuffle_ps(sfRe87, sfRe87, 177));
__m512 ifft544 = _mm512_fmadd_ps(sfIm83, ifft538, _mm512_shuffle_ps(sfIm83, sfIm83, 177));
__m512 ifft632 = _mm512_fmadd_ps(sfIm87, ifft538, _mm512_shuffle_ps(sfIm87, sfIm87, 177));
__m512 ifft545 = _mm512_fmadd_ps(sfRe84, ifft538, _mm512_shuffle_ps(sfRe84, sfRe84, 177));
__m512 ifft633 = _mm512_fmadd_ps(sfRe88, ifft538, _mm512_shuffle_ps(sfRe88, sfRe88, 177));
__m512 ifft546 = _mm512_fmadd_ps(sfIm84, ifft538, _mm512_shuffle_ps(sfIm84, sfIm84, 177));
__m512 ifft634 = _mm512_fmadd_ps(sfIm88, ifft538, _mm512_shuffle_ps(sfIm88, sfIm88, 177));
// Per-lane rotation of each (a, b) register pair by (c, s):
//   a' = a*c - b*s,  b' = b*c + a*s
// with c = ifft547 and s = ifft556. Within each 8-lane half (c, s) is (1, 0)
// except lane 3: (0, 1), lane 5: (0.7071, 0.7071), lane 7: (-0.7071, 0.7071).
// First the products with c ...
__m512 ifft547 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft548 = _mm512_mul_ps(ifft539, ifft547);
__m512 ifft635 = _mm512_mul_ps(ifft627, ifft547);
__m512 ifft549 = _mm512_mul_ps(ifft540, ifft547);
__m512 ifft636 = _mm512_mul_ps(ifft628, ifft547);
__m512 ifft550 = _mm512_mul_ps(ifft541, ifft547);
__m512 ifft637 = _mm512_mul_ps(ifft629, ifft547);
__m512 ifft551 = _mm512_mul_ps(ifft542, ifft547);
__m512 ifft638 = _mm512_mul_ps(ifft630, ifft547);
__m512 ifft552 = _mm512_mul_ps(ifft543, ifft547);
__m512 ifft639 = _mm512_mul_ps(ifft631, ifft547);
__m512 ifft553 = _mm512_mul_ps(ifft544, ifft547);
__m512 ifft640 = _mm512_mul_ps(ifft632, ifft547);
__m512 ifft554 = _mm512_mul_ps(ifft545, ifft547);
__m512 ifft641 = _mm512_mul_ps(ifft633, ifft547);
__m512 ifft555 = _mm512_mul_ps(ifft546, ifft547);
__m512 ifft642 = _mm512_mul_ps(ifft634, ifft547);
// ... then the cross terms with s are folded in (fnmadd subtracts, fmadd adds).
__m512 ifft556 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft557 = _mm512_fnmadd_ps(ifft540, ifft556, ifft548);
__m512 ifft643 = _mm512_fnmadd_ps(ifft628, ifft556, ifft635);
__m512 ifft558 = _mm512_fmadd_ps(ifft539, ifft556, ifft549);
__m512 ifft644 = _mm512_fmadd_ps(ifft627, ifft556, ifft636);
__m512 ifft559 = _mm512_fnmadd_ps(ifft542, ifft556, ifft550);
__m512 ifft645 = _mm512_fnmadd_ps(ifft630, ifft556, ifft637);
__m512 ifft560 = _mm512_fmadd_ps(ifft541, ifft556, ifft551);
__m512 ifft646 = _mm512_fmadd_ps(ifft629, ifft556, ifft638);
__m512 ifft561 = _mm512_fnmadd_ps(ifft544, ifft556, ifft552);
__m512 ifft647 = _mm512_fnmadd_ps(ifft632, ifft556, ifft639);
__m512 ifft562 = _mm512_fmadd_ps(ifft543, ifft556, ifft553);
__m512 ifft648 = _mm512_fmadd_ps(ifft631, ifft556, ifft640);
__m512 ifft563 = _mm512_fnmadd_ps(ifft546, ifft556, ifft554);
__m512 ifft649 = _mm512_fnmadd_ps(ifft634, ifft556, ifft641);
__m512 ifft564 = _mm512_fmadd_ps(ifft545, ifft556, ifft555);
__m512 ifft650 = _mm512_fmadd_ps(ifft633, ifft556, ifft642);
// Butterfly at lane distance 2: shuffle control 78 swaps the two 64-bit halves
// of every 128-bit group; signs are +1,+1,-1,-1 within each group.
__m512 ifft565 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft566 = _mm512_fmadd_ps(ifft557, ifft565, _mm512_shuffle_ps(ifft557, ifft557, 78));
__m512 ifft651 = _mm512_fmadd_ps(ifft643, ifft565, _mm512_shuffle_ps(ifft643, ifft643, 78));
__m512 ifft567 = _mm512_fmadd_ps(ifft558, ifft565, _mm512_shuffle_ps(ifft558, ifft558, 78));
__m512 ifft652 = _mm512_fmadd_ps(ifft644, ifft565, _mm512_shuffle_ps(ifft644, ifft644, 78));
__m512 ifft568 = _mm512_fmadd_ps(ifft559, ifft565, _mm512_shuffle_ps(ifft559, ifft559, 78));
__m512 ifft653 = _mm512_fmadd_ps(ifft645, ifft565, _mm512_shuffle_ps(ifft645, ifft645, 78));
__m512 ifft569 = _mm512_fmadd_ps(ifft560, ifft565, _mm512_shuffle_ps(ifft560, ifft560, 78));
__m512 ifft654 = _mm512_fmadd_ps(ifft646, ifft565, _mm512_shuffle_ps(ifft646, ifft646, 78));
__m512 ifft570 = _mm512_fmadd_ps(ifft561, ifft565, _mm512_shuffle_ps(ifft561, ifft561, 78));
__m512 ifft655 = _mm512_fmadd_ps(ifft647, ifft565, _mm512_shuffle_ps(ifft647, ifft647, 78));
__m512 ifft571 = _mm512_fmadd_ps(ifft562, ifft565, _mm512_shuffle_ps(ifft562, ifft562, 78));
__m512 ifft656 = _mm512_fmadd_ps(ifft648, ifft565, _mm512_shuffle_ps(ifft648, ifft648, 78));
__m512 ifft572 = _mm512_fmadd_ps(ifft563, ifft565, _mm512_shuffle_ps(ifft563, ifft563, 78));
__m512 ifft657 = _mm512_fmadd_ps(ifft649, ifft565, _mm512_shuffle_ps(ifft649, ifft649, 78));
__m512 ifft573 = _mm512_fmadd_ps(ifft564, ifft565, _mm512_shuffle_ps(ifft564, ifft564, 78));
__m512 ifft658 = _mm512_fmadd_ps(ifft650, ifft565, _mm512_shuffle_ps(ifft650, ifft650, 78));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes each
// register pair (a, b) becomes (-b, a); all other lanes are left as they were.
__m512 ifft574 = _mm512_mask_sub_ps(ifft566, 49344, _mm512_setzero_ps(), ifft567);
__m512 ifft659 = _mm512_mask_sub_ps(ifft651, 49344, _mm512_setzero_ps(), ifft652);
__m512 ifft575 = _mm512_mask_mov_ps(ifft567, 49344, ifft566);
__m512 ifft660 = _mm512_mask_mov_ps(ifft652, 49344, ifft651);
__m512 ifft576 = _mm512_mask_sub_ps(ifft568, 49344, _mm512_setzero_ps(), ifft569);
__m512 ifft661 = _mm512_mask_sub_ps(ifft653, 49344, _mm512_setzero_ps(), ifft654);
__m512 ifft577 = _mm512_mask_mov_ps(ifft569, 49344, ifft568);
__m512 ifft662 = _mm512_mask_mov_ps(ifft654, 49344, ifft653);
__m512 ifft578 = _mm512_mask_sub_ps(ifft570, 49344, _mm512_setzero_ps(), ifft571);
__m512 ifft663 = _mm512_mask_sub_ps(ifft655, 49344, _mm512_setzero_ps(), ifft656);
__m512 ifft579 = _mm512_mask_mov_ps(ifft571, 49344, ifft570);
__m512 ifft664 = _mm512_mask_mov_ps(ifft656, 49344, ifft655);
__m512 ifft580 = _mm512_mask_sub_ps(ifft572, 49344, _mm512_setzero_ps(), ifft573);
__m512 ifft665 = _mm512_mask_sub_ps(ifft657, 49344, _mm512_setzero_ps(), ifft658);
__m512 ifft581 = _mm512_mask_mov_ps(ifft573, 49344, ifft572);
__m512 ifft666 = _mm512_mask_mov_ps(ifft658, 49344, ifft657);
// Butterfly at lane distance 4: shuffle_f32x4 control 177 swaps adjacent
// 128-bit groups; signs are +1 on groups 0 and 2, -1 on groups 1 and 3.
__m512 ifft582 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft583 = _mm512_fmadd_ps(ifft574, ifft582, _mm512_shuffle_f32x4(ifft574, ifft574, 177));
__m512 ifft667 = _mm512_fmadd_ps(ifft659, ifft582, _mm512_shuffle_f32x4(ifft659, ifft659, 177));
__m512 ifft584 = _mm512_fmadd_ps(ifft575, ifft582, _mm512_shuffle_f32x4(ifft575, ifft575, 177));
__m512 ifft668 = _mm512_fmadd_ps(ifft660, ifft582, _mm512_shuffle_f32x4(ifft660, ifft660, 177));
__m512 ifft585 = _mm512_fmadd_ps(ifft576, ifft582, _mm512_shuffle_f32x4(ifft576, ifft576, 177));
__m512 ifft669 = _mm512_fmadd_ps(ifft661, ifft582, _mm512_shuffle_f32x4(ifft661, ifft661, 177));
__m512 ifft586 = _mm512_fmadd_ps(ifft577, ifft582, _mm512_shuffle_f32x4(ifft577, ifft577, 177));
__m512 ifft670 = _mm512_fmadd_ps(ifft662, ifft582, _mm512_shuffle_f32x4(ifft662, ifft662, 177));
__m512 ifft587 = _mm512_fmadd_ps(ifft578, ifft582, _mm512_shuffle_f32x4(ifft578, ifft578, 177));
__m512 ifft671 = _mm512_fmadd_ps(ifft663, ifft582, _mm512_shuffle_f32x4(ifft663, ifft663, 177));
// These two use fnmsub, i.e. -(x*sign) - swapped: the negation of the fmadd
// form used by the neighbouring registers. The same pattern appears in the
// sibling copy of this code earlier in the file, so it is not a typo.
__m512 ifft588 = _mm512_fnmsub_ps(ifft579, ifft582, _mm512_shuffle_f32x4(ifft579, ifft579, 177));
__m512 ifft672 = _mm512_fnmsub_ps(ifft664, ifft582, _mm512_shuffle_f32x4(ifft664, ifft664, 177));
__m512 ifft589 = _mm512_fmadd_ps(ifft580, ifft582, _mm512_shuffle_f32x4(ifft580, ifft580, 177));
__m512 ifft673 = _mm512_fmadd_ps(ifft665, ifft582, _mm512_shuffle_f32x4(ifft665, ifft665, 177));
__m512 ifft590 = _mm512_fmadd_ps(ifft581, ifft582, _mm512_shuffle_f32x4(ifft581, ifft581, 177));
__m512 ifft674 = _mm512_fmadd_ps(ifft666, ifft582, _mm512_shuffle_f32x4(ifft666, ifft666, 177));
// From here on the combination is between whole registers (no lane
// shuffles). Output scaling is folded into the FMAs:
// 1.5625e-02f = 1/64, 3.125e-02f = 1/32, 7.0710677e-01f ~= sqrt(1/2).
__m512 ifft591 = _mm512_add_ps(ifft583, ifft584);
__m512 ifft675 = _mm512_add_ps(ifft667, ifft668);
__m512 ifft592 = _mm512_sub_ps(ifft583, ifft584);
__m512 ifft676 = _mm512_sub_ps(ifft667, ifft668);
__m512 ifft593 = _mm512_sub_ps(ifft585, ifft589);
__m512 ifft677 = _mm512_sub_ps(ifft669, ifft673);
__m512 ifft594 = _mm512_add_ps(ifft586, ifft590);
__m512 ifft678 = _mm512_add_ps(ifft670, ifft674);
__m512 ifft595 = _mm512_add_ps(ifft585, ifft589);
__m512 ifft679 = _mm512_add_ps(ifft669, ifft673);
__m512 ifft596 = _mm512_sub_ps(ifft586, ifft590);
__m512 ifft680 = _mm512_sub_ps(ifft670, ifft674);
__m512 ifft597 = _mm512_mul_ps(ifft587, _mm512_set1_ps(3.125e-02f));
__m512 ifft681 = _mm512_mul_ps(ifft671, _mm512_set1_ps(3.125e-02f));
__m512 ifft598 = _mm512_mul_ps(ifft588, _mm512_set1_ps(3.125e-02f));
__m512 ifft682 = _mm512_mul_ps(ifft672, _mm512_set1_ps(3.125e-02f));
__m512 ifft599 = _mm512_fmadd_ps(ifft591, _mm512_set1_ps(1.5625e-02f), ifft597);
__m512 ifft683 = _mm512_fmadd_ps(ifft675, _mm512_set1_ps(1.5625e-02f), ifft681);
__m512 ifft600 = _mm512_fmsub_ps(ifft591, _mm512_set1_ps(1.5625e-02f), ifft597);
__m512 ifft684 = _mm512_fmsub_ps(ifft675, _mm512_set1_ps(1.5625e-02f), ifft681);
__m512 ifft601 = _mm512_fmadd_ps(ifft592, _mm512_set1_ps(1.5625e-02f), ifft598);
__m512 ifft685 = _mm512_fmadd_ps(ifft676, _mm512_set1_ps(1.5625e-02f), ifft682);
__m512 ifft602 = _mm512_fmsub_ps(ifft592, _mm512_set1_ps(1.5625e-02f), ifft598);
__m512 ifft686 = _mm512_fmsub_ps(ifft676, _mm512_set1_ps(1.5625e-02f), ifft682);
__m512 ifft603 = _mm512_add_ps(ifft593, ifft594);
__m512 ifft687 = _mm512_add_ps(ifft677, ifft678);
__m512 ifft604 = _mm512_sub_ps(ifft593, ifft594);
__m512 ifft688 = _mm512_sub_ps(ifft677, ifft678);
__m512 ifft605 = _mm512_fnmadd_ps(ifft603, _mm512_set1_ps(7.0710677e-01f), ifft595);
__m512 ifft689 = _mm512_fnmadd_ps(ifft687, _mm512_set1_ps(7.0710677e-01f), ifft679);
__m512 ifft606 = _mm512_fmadd_ps(ifft603, _mm512_set1_ps(7.0710677e-01f), ifft595);
__m512 ifft690 = _mm512_fmadd_ps(ifft687, _mm512_set1_ps(7.0710677e-01f), ifft679);
__m512 ifft607 = _mm512_fmadd_ps(ifft604, _mm512_set1_ps(7.0710677e-01f), ifft596);
__m512 ifft691 = _mm512_fmadd_ps(ifft688, _mm512_set1_ps(7.0710677e-01f), ifft680);
__m512 ifft608 = _mm512_fmsub_ps(ifft604, _mm512_set1_ps(7.0710677e-01f), ifft596);
__m512 ifft692 = _mm512_fmsub_ps(ifft688, _mm512_set1_ps(7.0710677e-01f), ifft680);
__m512 ifft609 = _mm512_add_ps(ifft605, ifft606);
__m512 ifft693 = _mm512_add_ps(ifft689, ifft690);
__m512 ifft610 = _mm512_sub_ps(ifft605, ifft606);
__m512 ifft694 = _mm512_sub_ps(ifft689, ifft690);
__m512 ifft611 = _mm512_add_ps(ifft607, ifft608);
__m512 ifft695 = _mm512_add_ps(ifft691, ifft692);
__m512 ifft612 = _mm512_sub_ps(ifft607, ifft608);
__m512 ifft696 = _mm512_sub_ps(ifft691, ifft692);
__m512 ifft613 = _mm512_fmadd_ps(ifft609, _mm512_set1_ps(1.5625e-02f), ifft599);
__m512 ifft697 = _mm512_fmadd_ps(ifft693, _mm512_set1_ps(1.5625e-02f), ifft683);
__m512 ifft614 = _mm512_fnmadd_ps(ifft609, _mm512_set1_ps(1.5625e-02f), ifft599);
__m512 ifft698 = _mm512_fnmadd_ps(ifft693, _mm512_set1_ps(1.5625e-02f), ifft683);
__m512 ifft615 = _mm512_fmadd_ps(ifft611, _mm512_set1_ps(1.5625e-02f), ifft601);
__m512 ifft699 = _mm512_fmadd_ps(ifft695, _mm512_set1_ps(1.5625e-02f), ifft685);
__m512 ifft616 = _mm512_fnmadd_ps(ifft611, _mm512_set1_ps(1.5625e-02f), ifft601);
__m512 ifft700 = _mm512_fnmadd_ps(ifft695, _mm512_set1_ps(1.5625e-02f), ifft685);
__m512 ifft617 = _mm512_fnmadd_ps(ifft612, _mm512_set1_ps(1.5625e-02f), ifft600);
__m512 ifft701 = _mm512_fnmadd_ps(ifft696, _mm512_set1_ps(1.5625e-02f), ifft684);
__m512 ifft618 = _mm512_fmadd_ps(ifft612, _mm512_set1_ps(1.5625e-02f), ifft600);
__m512 ifft702 = _mm512_fmadd_ps(ifft696, _mm512_set1_ps(1.5625e-02f), ifft684);
__m512 ifft619 = _mm512_fmadd_ps(ifft610, _mm512_set1_ps(1.5625e-02f), ifft602);
__m512 ifft703 = _mm512_fmadd_ps(ifft694, _mm512_set1_ps(1.5625e-02f), ifft686);
__m512 ifft620 = _mm512_fnmadd_ps(ifft610, _mm512_set1_ps(1.5625e-02f), ifft602);
__m512 ifft704 = _mm512_fnmadd_ps(ifft694, _mm512_set1_ps(1.5625e-02f), ifft686);
// Only five of the eight result registers per stream are kept; the other
// three are cast to void so they count as used without being stored.
__m512 dat620 = ifft613;
__m512 dat625 = ifft697;
__m512 dat621 = ifft615;
__m512 dat626 = ifft699;
__m512 dat622 = ifft617;
__m512 dat627 = ifft701;
__m512 dat623 = ifft619;
__m512 dat628 = ifft703;
__m512 dat624 = ifft614;
__m512 dat629 = ifft698;
(void)ifft616;
(void)ifft700;
(void)ifft618;
(void)ifft702;
(void)ifft620;
(void)ifft704;
// Merge one stream-A and one stream-B register into a single vector. In
// _mm512_permutex2var_ps, index values 16..31 pick from the second source.
// pm7: output lanes 0-4 = A[0..4], lanes 5-9 = B[0..4].
// pm8: output lanes 0-4 = B[8..12], lanes 5-9 = A[8..12].
// Output lanes 10-15 are never stored (the stores below use mask 1023).
__m512i pm7 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack31 = _mm512_permutex2var_ps(dat620, pm7, dat625);
__m512i pm8 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack32 = _mm512_permutex2var_ps(dat620, pm8, dat625);
__m512 pack33 = _mm512_permutex2var_ps(dat621, pm7, dat626);
__m512 pack34 = _mm512_permutex2var_ps(dat621, pm8, dat626);
__m512 pack35 = _mm512_permutex2var_ps(dat622, pm7, dat627);
__m512 pack36 = _mm512_permutex2var_ps(dat622, pm8, dat627);
__m512 pack37 = _mm512_permutex2var_ps(dat623, pm7, dat628);
__m512 pack38 = _mm512_permutex2var_ps(dat623, pm8, dat628);
__m512 pack39 = _mm512_permutex2var_ps(dat624, pm7, dat629);
__m512 pack40 = _mm512_permutex2var_ps(dat624, pm8, dat629);
// Clamp negative values to zero (max against 0), i.e. a ReLU.
pack31 = _mm512_max_ps(_mm512_setzero_ps(), pack31);
pack32 = _mm512_max_ps(_mm512_setzero_ps(), pack32);
pack33 = _mm512_max_ps(_mm512_setzero_ps(), pack33);
pack34 = _mm512_max_ps(_mm512_setzero_ps(), pack34);
pack35 = _mm512_max_ps(_mm512_setzero_ps(), pack35);
pack36 = _mm512_max_ps(_mm512_setzero_ps(), pack36);
pack37 = _mm512_max_ps(_mm512_setzero_ps(), pack37);
pack38 = _mm512_max_ps(_mm512_setzero_ps(), pack38);
pack39 = _mm512_max_ps(_mm512_setzero_ps(), pack39);
pack40 = _mm512_max_ps(_mm512_setzero_ps(), pack40);
// Masked stores: mask 1023 (0x3FF) writes only lanes 0-9, i.e. 10 floats each.
// The pm7 results go to offsets 0/448/896/1344/1792 and the pm8 results to the
// same offsets plus 50240; each t5 step advances the destination by 40.
// NOTE(review): 40 equals 10 floats in bytes, which fits datPtr2 being a byte
// pointer -- confirm against its declaration outside this span.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack31);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack32);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack33);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack34);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack35);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack36);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack37);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack38);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack39);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack40);
}
ptrdiff_t t6 = 0;
__m512 sfRe89 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm89 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe93 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm93 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe90 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm90 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe94 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm94 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe91 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm91 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe95 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm95 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe92 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm92 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe96 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm96 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512i ifft705 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft706 = _mm512_permutexvar_ps(ifft705, sfRe89);
__m512 ifft797 = _mm512_permutexvar_ps(ifft705, sfRe93);
__m512i ifft707 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft708 = _mm512_permutexvar_ps(ifft707, sfRe89);
__m512 ifft798 = _mm512_permutexvar_ps(ifft707, sfRe93);
__m512 ifft709 = _mm512_permutexvar_ps(ifft705, sfIm89);
__m512 ifft799 = _mm512_permutexvar_ps(ifft705, sfIm93);
__m512 ifft710 = _mm512_permutexvar_ps(ifft707, sfIm89);
__m512 ifft800 = _mm512_permutexvar_ps(ifft707, sfIm93);
__m512 ifft711 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft712 = _mm512_mask_fmadd_ps(ifft710, 65021, ifft711, ifft706);
__m512 ifft801 = _mm512_mask_fmadd_ps(ifft800, 65021, ifft711, ifft797);
__m512 ifft713 = _mm512_mask_fnmadd_ps(ifft709, 65021, ifft711, ifft708);
__m512 ifft802 = _mm512_mask_fnmadd_ps(ifft799, 65021, ifft711, ifft798);
__m512 ifft714 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft715 = _mm512_fmadd_ps(ifft712, ifft714, _mm512_shuffle_ps(ifft712, ifft712, 177));
__m512 ifft803 = _mm512_fmadd_ps(ifft801, ifft714, _mm512_shuffle_ps(ifft801, ifft801, 177));
__m512 ifft716 = _mm512_fmadd_ps(ifft713, ifft714, _mm512_shuffle_ps(ifft713, ifft713, 177));
__m512 ifft804 = _mm512_fmadd_ps(ifft802, ifft714, _mm512_shuffle_ps(ifft802, ifft802, 177));
__m512 ifft717 = _mm512_fmadd_ps(sfRe90, ifft714, _mm512_shuffle_ps(sfRe90, sfRe90, 177));
__m512 ifft805 = _mm512_fmadd_ps(sfRe94, ifft714, _mm512_shuffle_ps(sfRe94, sfRe94, 177));
__m512 ifft718 = _mm512_fmadd_ps(sfIm90, ifft714, _mm512_shuffle_ps(sfIm90, sfIm90, 177));
__m512 ifft806 = _mm512_fmadd_ps(sfIm94, ifft714, _mm512_shuffle_ps(sfIm94, sfIm94, 177));
__m512 ifft719 = _mm512_fmadd_ps(sfRe91, ifft714, _mm512_shuffle_ps(sfRe91, sfRe91, 177));
__m512 ifft807 = _mm512_fmadd_ps(sfRe95, ifft714, _mm512_shuffle_ps(sfRe95, sfRe95, 177));
__m512 ifft720 = _mm512_fmadd_ps(sfIm91, ifft714, _mm512_shuffle_ps(sfIm91, sfIm91, 177));
__m512 ifft808 = _mm512_fmadd_ps(sfIm95, ifft714, _mm512_shuffle_ps(sfIm95, sfIm95, 177));
__m512 ifft721 = _mm512_fmadd_ps(sfRe92, ifft714, _mm512_shuffle_ps(sfRe92, sfRe92, 177));
__m512 ifft809 = _mm512_fmadd_ps(sfRe96, ifft714, _mm512_shuffle_ps(sfRe96, sfRe96, 177));
__m512 ifft722 = _mm512_fmadd_ps(sfIm92, ifft714, _mm512_shuffle_ps(sfIm92, sfIm92, 177));
__m512 ifft810 = _mm512_fmadd_ps(sfIm96, ifft714, _mm512_shuffle_ps(sfIm96, sfIm96, 177));
__m512 ifft723 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft724 = _mm512_mul_ps(ifft715, ifft723);
__m512 ifft811 = _mm512_mul_ps(ifft803, ifft723);
__m512 ifft725 = _mm512_mul_ps(ifft716, ifft723);
__m512 ifft812 = _mm512_mul_ps(ifft804, ifft723);
__m512 ifft726 = _mm512_mul_ps(ifft717, ifft723);
__m512 ifft813 = _mm512_mul_ps(ifft805, ifft723);
__m512 ifft727 = _mm512_mul_ps(ifft718, ifft723);
__m512 ifft814 = _mm512_mul_ps(ifft806, ifft723);
__m512 ifft728 = _mm512_mul_ps(ifft719, ifft723);
__m512 ifft815 = _mm512_mul_ps(ifft807, ifft723);
__m512 ifft729 = _mm512_mul_ps(ifft720, ifft723);
__m512 ifft816 = _mm512_mul_ps(ifft808, ifft723);
__m512 ifft730 = _mm512_mul_ps(ifft721, ifft723);
__m512 ifft817 = _mm512_mul_ps(ifft809, ifft723);
__m512 ifft731 = _mm512_mul_ps(ifft722, ifft723);
__m512 ifft818 = _mm512_mul_ps(ifft810, ifft723);
__m512 ifft732 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft733 = _mm512_fnmadd_ps(ifft716, ifft732, ifft724);
__m512 ifft819 = _mm512_fnmadd_ps(ifft804, ifft732, ifft811);
__m512 ifft734 = _mm512_fmadd_ps(ifft715, ifft732, ifft725);
__m512 ifft820 = _mm512_fmadd_ps(ifft803, ifft732, ifft812);
__m512 ifft735 = _mm512_fnmadd_ps(ifft718, ifft732, ifft726);
__m512 ifft821 = _mm512_fnmadd_ps(ifft806, ifft732, ifft813);
__m512 ifft736 = _mm512_fmadd_ps(ifft717, ifft732, ifft727);
__m512 ifft822 = _mm512_fmadd_ps(ifft805, ifft732, ifft814);
__m512 ifft737 = _mm512_fnmadd_ps(ifft720, ifft732, ifft728);
__m512 ifft823 = _mm512_fnmadd_ps(ifft808, ifft732, ifft815);
__m512 ifft738 = _mm512_fmadd_ps(ifft719, ifft732, ifft729);
__m512 ifft824 = _mm512_fmadd_ps(ifft807, ifft732, ifft816);
__m512 ifft739 = _mm512_fnmadd_ps(ifft722, ifft732, ifft730);
__m512 ifft825 = _mm512_fnmadd_ps(ifft810, ifft732, ifft817);
__m512 ifft740 = _mm512_fmadd_ps(ifft721, ifft732, ifft731);
__m512 ifft826 = _mm512_fmadd_ps(ifft809, ifft732, ifft818);
__m512 ifft741 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft742 = _mm512_fmadd_ps(ifft733, ifft741, _mm512_shuffle_ps(ifft733, ifft733, 78));
__m512 ifft827 = _mm512_fmadd_ps(ifft819, ifft741, _mm512_shuffle_ps(ifft819, ifft819, 78));
__m512 ifft743 = _mm512_fmadd_ps(ifft734, ifft741, _mm512_shuffle_ps(ifft734, ifft734, 78));
__m512 ifft828 = _mm512_fmadd_ps(ifft820, ifft741, _mm512_shuffle_ps(ifft820, ifft820, 78));
__m512 ifft744 = _mm512_fmadd_ps(ifft735, ifft741, _mm512_shuffle_ps(ifft735, ifft735, 78));
__m512 ifft829 = _mm512_fmadd_ps(ifft821, ifft741, _mm512_shuffle_ps(ifft821, ifft821, 78));
__m512 ifft745 = _mm512_fmadd_ps(ifft736, ifft741, _mm512_shuffle_ps(ifft736, ifft736, 78));
__m512 ifft830 = _mm512_fmadd_ps(ifft822, ifft741, _mm512_shuffle_ps(ifft822, ifft822, 78));
__m512 ifft746 = _mm512_fmadd_ps(ifft737, ifft741, _mm512_shuffle_ps(ifft737, ifft737, 78));
__m512 ifft831 = _mm512_fmadd_ps(ifft823, ifft741, _mm512_shuffle_ps(ifft823, ifft823, 78));
__m512 ifft747 = _mm512_fmadd_ps(ifft738, ifft741, _mm512_shuffle_ps(ifft738, ifft738, 78));
__m512 ifft832 = _mm512_fmadd_ps(ifft824, ifft741, _mm512_shuffle_ps(ifft824, ifft824, 78));
__m512 ifft748 = _mm512_fmadd_ps(ifft739, ifft741, _mm512_shuffle_ps(ifft739, ifft739, 78));
__m512 ifft833 = _mm512_fmadd_ps(ifft825, ifft741, _mm512_shuffle_ps(ifft825, ifft825, 78));
__m512 ifft749 = _mm512_fmadd_ps(ifft740, ifft741, _mm512_shuffle_ps(ifft740, ifft740, 78));
__m512 ifft834 = _mm512_fmadd_ps(ifft826, ifft741, _mm512_shuffle_ps(ifft826, ifft826, 78));
__m512 ifft750 = _mm512_mask_sub_ps(ifft742, 49344, _mm512_setzero_ps(), ifft743);
__m512 ifft835 = _mm512_mask_sub_ps(ifft827, 49344, _mm512_setzero_ps(), ifft828);
__m512 ifft751 = _mm512_mask_mov_ps(ifft743, 49344, ifft742);
__m512 ifft836 = _mm512_mask_mov_ps(ifft828, 49344, ifft827);
__m512 ifft752 = _mm512_mask_sub_ps(ifft744, 49344, _mm512_setzero_ps(), ifft745);
__m512 ifft837 = _mm512_mask_sub_ps(ifft829, 49344, _mm512_setzero_ps(), ifft830);
__m512 ifft753 = _mm512_mask_mov_ps(ifft745, 49344, ifft744);
__m512 ifft838 = _mm512_mask_mov_ps(ifft830, 49344, ifft829);
__m512 ifft754 = _mm512_mask_sub_ps(ifft746, 49344, _mm512_setzero_ps(), ifft747);
__m512 ifft839 = _mm512_mask_sub_ps(ifft831, 49344, _mm512_setzero_ps(), ifft832);
__m512 ifft755 = _mm512_mask_mov_ps(ifft747, 49344, ifft746);
__m512 ifft840 = _mm512_mask_mov_ps(ifft832, 49344, ifft831);
__m512 ifft756 = _mm512_mask_sub_ps(ifft748, 49344, _mm512_setzero_ps(), ifft749);
__m512 ifft841 = _mm512_mask_sub_ps(ifft833, 49344, _mm512_setzero_ps(), ifft834);
__m512 ifft757 = _mm512_mask_mov_ps(ifft749, 49344, ifft748);
__m512 ifft842 = _mm512_mask_mov_ps(ifft834, 49344, ifft833);
__m512 ifft758 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft759 = _mm512_fmadd_ps(ifft750, ifft758, _mm512_shuffle_f32x4(ifft750, ifft750, 177));
__m512 ifft843 = _mm512_fmadd_ps(ifft835, ifft758, _mm512_shuffle_f32x4(ifft835, ifft835, 177));
__m512 ifft760 = _mm512_fmadd_ps(ifft751, ifft758, _mm512_shuffle_f32x4(ifft751, ifft751, 177));
__m512 ifft844 = _mm512_fmadd_ps(ifft836, ifft758, _mm512_shuffle_f32x4(ifft836, ifft836, 177));
__m512 ifft761 = _mm512_fmadd_ps(ifft752, ifft758, _mm512_shuffle_f32x4(ifft752, ifft752, 177));
__m512 ifft845 = _mm512_fmadd_ps(ifft837, ifft758, _mm512_shuffle_f32x4(ifft837, ifft837, 177));
__m512 ifft762 = _mm512_fmadd_ps(ifft753, ifft758, _mm512_shuffle_f32x4(ifft753, ifft753, 177));
__m512 ifft846 = _mm512_fmadd_ps(ifft838, ifft758, _mm512_shuffle_f32x4(ifft838, ifft838, 177));
__m512 ifft763 = _mm512_fmadd_ps(ifft754, ifft758, _mm512_shuffle_f32x4(ifft754, ifft754, 177));
__m512 ifft847 = _mm512_fmadd_ps(ifft839, ifft758, _mm512_shuffle_f32x4(ifft839, ifft839, 177));
__m512 ifft764 = _mm512_fnmsub_ps(ifft755, ifft758, _mm512_shuffle_f32x4(ifft755, ifft755, 177));
__m512 ifft848 = _mm512_fnmsub_ps(ifft840, ifft758, _mm512_shuffle_f32x4(ifft840, ifft840, 177));
__m512 ifft765 = _mm512_fmadd_ps(ifft756, ifft758, _mm512_shuffle_f32x4(ifft756, ifft756, 177));
__m512 ifft849 = _mm512_fmadd_ps(ifft841, ifft758, _mm512_shuffle_f32x4(ifft841, ifft841, 177));
__m512 ifft766 = _mm512_fmadd_ps(ifft757, ifft758, _mm512_shuffle_f32x4(ifft757, ifft757, 177));
__m512 ifft850 = _mm512_fmadd_ps(ifft842, ifft758, _mm512_shuffle_f32x4(ifft842, ifft842, 177));
__m512 ifft767 = _mm512_add_ps(ifft759, ifft760);
__m512 ifft851 = _mm512_add_ps(ifft843, ifft844);
__m512 ifft768 = _mm512_sub_ps(ifft759, ifft760);
__m512 ifft852 = _mm512_sub_ps(ifft843, ifft844);
__m512 ifft769 = _mm512_sub_ps(ifft761, ifft765);
__m512 ifft853 = _mm512_sub_ps(ifft845, ifft849);
__m512 ifft770 = _mm512_add_ps(ifft762, ifft766);
__m512 ifft854 = _mm512_add_ps(ifft846, ifft850);
__m512 ifft771 = _mm512_add_ps(ifft761, ifft765);
__m512 ifft855 = _mm512_add_ps(ifft845, ifft849);
__m512 ifft772 = _mm512_sub_ps(ifft762, ifft766);
__m512 ifft856 = _mm512_sub_ps(ifft846, ifft850);
__m512 ifft773 = _mm512_mul_ps(ifft763, _mm512_set1_ps(3.125e-02f));
__m512 ifft857 = _mm512_mul_ps(ifft847, _mm512_set1_ps(3.125e-02f));
__m512 ifft774 = _mm512_mul_ps(ifft764, _mm512_set1_ps(3.125e-02f));
__m512 ifft858 = _mm512_mul_ps(ifft848, _mm512_set1_ps(3.125e-02f));
__m512 ifft775 = _mm512_fmadd_ps(ifft767, _mm512_set1_ps(1.5625e-02f), ifft773);
__m512 ifft859 = _mm512_fmadd_ps(ifft851, _mm512_set1_ps(1.5625e-02f), ifft857);
__m512 ifft776 = _mm512_fmsub_ps(ifft767, _mm512_set1_ps(1.5625e-02f), ifft773);
__m512 ifft860 = _mm512_fmsub_ps(ifft851, _mm512_set1_ps(1.5625e-02f), ifft857);
__m512 ifft777 = _mm512_fmadd_ps(ifft768, _mm512_set1_ps(1.5625e-02f), ifft774);
__m512 ifft861 = _mm512_fmadd_ps(ifft852, _mm512_set1_ps(1.5625e-02f), ifft858);
__m512 ifft778 = _mm512_fmsub_ps(ifft768, _mm512_set1_ps(1.5625e-02f), ifft774);
__m512 ifft862 = _mm512_fmsub_ps(ifft852, _mm512_set1_ps(1.5625e-02f), ifft858);
__m512 ifft779 = _mm512_add_ps(ifft769, ifft770);
__m512 ifft863 = _mm512_add_ps(ifft853, ifft854);
__m512 ifft780 = _mm512_sub_ps(ifft769, ifft770);
__m512 ifft864 = _mm512_sub_ps(ifft853, ifft854);
__m512 ifft781 = _mm512_fnmadd_ps(ifft779, _mm512_set1_ps(7.0710677e-01f), ifft771);
__m512 ifft865 = _mm512_fnmadd_ps(ifft863, _mm512_set1_ps(7.0710677e-01f), ifft855);
__m512 ifft782 = _mm512_fmadd_ps(ifft779, _mm512_set1_ps(7.0710677e-01f), ifft771);
__m512 ifft866 = _mm512_fmadd_ps(ifft863, _mm512_set1_ps(7.0710677e-01f), ifft855);
__m512 ifft783 = _mm512_fmadd_ps(ifft780, _mm512_set1_ps(7.0710677e-01f), ifft772);
__m512 ifft867 = _mm512_fmadd_ps(ifft864, _mm512_set1_ps(7.0710677e-01f), ifft856);
__m512 ifft784 = _mm512_fmsub_ps(ifft780, _mm512_set1_ps(7.0710677e-01f), ifft772);
__m512 ifft868 = _mm512_fmsub_ps(ifft864, _mm512_set1_ps(7.0710677e-01f), ifft856);
__m512 ifft785 = _mm512_add_ps(ifft781, ifft782);
__m512 ifft869 = _mm512_add_ps(ifft865, ifft866);
__m512 ifft786 = _mm512_sub_ps(ifft781, ifft782);
__m512 ifft870 = _mm512_sub_ps(ifft865, ifft866);
__m512 ifft787 = _mm512_add_ps(ifft783, ifft784);
__m512 ifft871 = _mm512_add_ps(ifft867, ifft868);
__m512 ifft788 = _mm512_sub_ps(ifft783, ifft784);
__m512 ifft872 = _mm512_sub_ps(ifft867, ifft868);
__m512 ifft789 = _mm512_fmadd_ps(ifft785, _mm512_set1_ps(1.5625e-02f), ifft775);
__m512 ifft873 = _mm512_fmadd_ps(ifft869, _mm512_set1_ps(1.5625e-02f), ifft859);
__m512 ifft790 = _mm512_fnmadd_ps(ifft785, _mm512_set1_ps(1.5625e-02f), ifft775);
__m512 ifft874 = _mm512_fnmadd_ps(ifft869, _mm512_set1_ps(1.5625e-02f), ifft859);
__m512 ifft791 = _mm512_fmadd_ps(ifft787, _mm512_set1_ps(1.5625e-02f), ifft777);
__m512 ifft875 = _mm512_fmadd_ps(ifft871, _mm512_set1_ps(1.5625e-02f), ifft861);
__m512 ifft792 = _mm512_fnmadd_ps(ifft787, _mm512_set1_ps(1.5625e-02f), ifft777);
__m512 ifft876 = _mm512_fnmadd_ps(ifft871, _mm512_set1_ps(1.5625e-02f), ifft861);
__m512 ifft793 = _mm512_fnmadd_ps(ifft788, _mm512_set1_ps(1.5625e-02f), ifft776);
__m512 ifft877 = _mm512_fnmadd_ps(ifft872, _mm512_set1_ps(1.5625e-02f), ifft860);
__m512 ifft794 = _mm512_fmadd_ps(ifft788, _mm512_set1_ps(1.5625e-02f), ifft776);
__m512 ifft878 = _mm512_fmadd_ps(ifft872, _mm512_set1_ps(1.5625e-02f), ifft860);
__m512 ifft795 = _mm512_fmadd_ps(ifft786, _mm512_set1_ps(1.5625e-02f), ifft778);
__m512 ifft879 = _mm512_fmadd_ps(ifft870, _mm512_set1_ps(1.5625e-02f), ifft862);
__m512 ifft796 = _mm512_fnmadd_ps(ifft786, _mm512_set1_ps(1.5625e-02f), ifft778);
__m512 ifft880 = _mm512_fnmadd_ps(ifft870, _mm512_set1_ps(1.5625e-02f), ifft862);
__m512 dat630 = ifft789;
__m512 dat635 = ifft873;
__m512 dat631 = ifft791;
__m512 dat636 = ifft875;
__m512 dat632 = ifft793;
__m512 dat637 = ifft877;
__m512 dat633 = ifft795;
__m512 dat638 = ifft879;
__m512 dat634 = ifft790;
__m512 dat639 = ifft874;
(void)ifft792;
(void)ifft876;
(void)ifft794;
(void)ifft878;
(void)ifft796;
(void)ifft880;
dat630 = _mm512_max_ps(_mm512_setzero_ps(), dat630);
dat635 = _mm512_max_ps(_mm512_setzero_ps(), dat635);
dat631 = _mm512_max_ps(_mm512_setzero_ps(), dat631);
dat636 = _mm512_max_ps(_mm512_setzero_ps(), dat636);
dat632 = _mm512_max_ps(_mm512_setzero_ps(), dat632);
dat637 = _mm512_max_ps(_mm512_setzero_ps(), dat637);
dat633 = _mm512_max_ps(_mm512_setzero_ps(), dat633);
dat638 = _mm512_max_ps(_mm512_setzero_ps(), dat638);
dat634 = _mm512_max_ps(_mm512_setzero_ps(), dat634);
dat639 = _mm512_max_ps(_mm512_setzero_ps(), dat639);
_mm512_mask_storeu_ps(datPtr2+80+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 3, dat630);
_mm512_mask_storeu_ps(datPtr2+52088+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 7936, dat630);
_mm512_mask_storeu_ps(datPtr2+1880+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 31, dat635);
_mm512_mask_storeu_ps(datPtr2+50288+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 768, dat635);
_mm512_mask_storeu_ps(datPtr2+528+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 3, dat631);
_mm512_mask_storeu_ps(datPtr2+52536+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 7936, dat631);
_mm512_mask_storeu_ps(datPtr2+2328+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 31, dat636);
_mm512_mask_storeu_ps(datPtr2+50736+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 768, dat636);
_mm512_mask_storeu_ps(datPtr2+976+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 3, dat632);
_mm512_mask_storeu_ps(datPtr2+52984+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 7936, dat632);
_mm512_mask_storeu_ps(datPtr2+2776+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 31, dat637);
_mm512_mask_storeu_ps(datPtr2+51184+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 768, dat637);
_mm512_mask_storeu_ps(datPtr2+1424+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 3, dat633);
_mm512_mask_storeu_ps(datPtr2+53432+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 7936, dat633);
_mm512_mask_storeu_ps(datPtr2+3224+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 31, dat638);
_mm512_mask_storeu_ps(datPtr2+51632+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 768, dat638);
_mm512_mask_storeu_ps(datPtr2+1872+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 3, dat634);
_mm512_mask_storeu_ps(datPtr2+53880+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 7936, dat634);
_mm512_mask_storeu_ps(datPtr2+3672+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 31, dat639);
_mm512_mask_storeu_ps(datPtr2+52080+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 768, dat639);
}
}
if (j5 >= last2) return;
++j5;
j5 = 4;
}
if (j5 < 84) {
ptrdiff_t rel5 = (size_t)(j5-4)%23;
ptrdiff_t base5 = 5+(size_t)(j5-4)/23*30;
for (; ; rel5 = 0, base5 += 30) {
if (rel5 < 11) {
if (rel5 < 4) {
if (rel5 < 3) {
ptrdiff_t toH4 = base5+0;
ptrdiff_t toW4 = 5+30*rel5;
ptrdiff_t jj12 = 2-rel5+j5;
for (; j5 <= jj12; toW4 += 30) {
ptrdiff_t k28 = 16*w21;
for (; k28 != 16; ++k28) {
ptrdiff_t r5 = 0;
for (; r5 != 2; ++r5) {
ptrdiff_t t7 = 0;
for (; t7 < 3; ++t7) {
__m512 sfRe97 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm97 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe101 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm101 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe98 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm98 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe102 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm102 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe99 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm99 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe103 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm103 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe100 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm100 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe104 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm104 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512i ifft881 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft882 = _mm512_permutexvar_ps(ifft881, sfRe97);
__m512 ifft973 = _mm512_permutexvar_ps(ifft881, sfRe101);
__m512i ifft883 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft884 = _mm512_permutexvar_ps(ifft883, sfRe97);
__m512 ifft974 = _mm512_permutexvar_ps(ifft883, sfRe101);
__m512 ifft885 = _mm512_permutexvar_ps(ifft881, sfIm97);
__m512 ifft975 = _mm512_permutexvar_ps(ifft881, sfIm101);
__m512 ifft886 = _mm512_permutexvar_ps(ifft883, sfIm97);
__m512 ifft976 = _mm512_permutexvar_ps(ifft883, sfIm101);
__m512 ifft887 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft888 = _mm512_mask_fmadd_ps(ifft886, 65021, ifft887, ifft882);
__m512 ifft977 = _mm512_mask_fmadd_ps(ifft976, 65021, ifft887, ifft973);
__m512 ifft889 = _mm512_mask_fnmadd_ps(ifft885, 65021, ifft887, ifft884);
__m512 ifft978 = _mm512_mask_fnmadd_ps(ifft975, 65021, ifft887, ifft974);
__m512 ifft890 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft891 = _mm512_fmadd_ps(ifft888, ifft890, _mm512_shuffle_ps(ifft888, ifft888, 177));
__m512 ifft979 = _mm512_fmadd_ps(ifft977, ifft890, _mm512_shuffle_ps(ifft977, ifft977, 177));
__m512 ifft892 = _mm512_fmadd_ps(ifft889, ifft890, _mm512_shuffle_ps(ifft889, ifft889, 177));
__m512 ifft980 = _mm512_fmadd_ps(ifft978, ifft890, _mm512_shuffle_ps(ifft978, ifft978, 177));
__m512 ifft893 = _mm512_fmadd_ps(sfRe98, ifft890, _mm512_shuffle_ps(sfRe98, sfRe98, 177));
__m512 ifft981 = _mm512_fmadd_ps(sfRe102, ifft890, _mm512_shuffle_ps(sfRe102, sfRe102, 177));
__m512 ifft894 = _mm512_fmadd_ps(sfIm98, ifft890, _mm512_shuffle_ps(sfIm98, sfIm98, 177));
__m512 ifft982 = _mm512_fmadd_ps(sfIm102, ifft890, _mm512_shuffle_ps(sfIm102, sfIm102, 177));
__m512 ifft895 = _mm512_fmadd_ps(sfRe99, ifft890, _mm512_shuffle_ps(sfRe99, sfRe99, 177));
__m512 ifft983 = _mm512_fmadd_ps(sfRe103, ifft890, _mm512_shuffle_ps(sfRe103, sfRe103, 177));
__m512 ifft896 = _mm512_fmadd_ps(sfIm99, ifft890, _mm512_shuffle_ps(sfIm99, sfIm99, 177));
__m512 ifft984 = _mm512_fmadd_ps(sfIm103, ifft890, _mm512_shuffle_ps(sfIm103, sfIm103, 177));
__m512 ifft897 = _mm512_fmadd_ps(sfRe100, ifft890, _mm512_shuffle_ps(sfRe100, sfRe100, 177));
__m512 ifft985 = _mm512_fmadd_ps(sfRe104, ifft890, _mm512_shuffle_ps(sfRe104, sfRe104, 177));
__m512 ifft898 = _mm512_fmadd_ps(sfIm100, ifft890, _mm512_shuffle_ps(sfIm100, sfIm100, 177));
__m512 ifft986 = _mm512_fmadd_ps(sfIm104, ifft890, _mm512_shuffle_ps(sfIm104, sfIm104, 177));
__m512 ifft899 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft900 = _mm512_mul_ps(ifft891, ifft899);
__m512 ifft987 = _mm512_mul_ps(ifft979, ifft899);
__m512 ifft901 = _mm512_mul_ps(ifft892, ifft899);
__m512 ifft988 = _mm512_mul_ps(ifft980, ifft899);
__m512 ifft902 = _mm512_mul_ps(ifft893, ifft899);
__m512 ifft989 = _mm512_mul_ps(ifft981, ifft899);
__m512 ifft903 = _mm512_mul_ps(ifft894, ifft899);
__m512 ifft990 = _mm512_mul_ps(ifft982, ifft899);
__m512 ifft904 = _mm512_mul_ps(ifft895, ifft899);
__m512 ifft991 = _mm512_mul_ps(ifft983, ifft899);
__m512 ifft905 = _mm512_mul_ps(ifft896, ifft899);
__m512 ifft992 = _mm512_mul_ps(ifft984, ifft899);
__m512 ifft906 = _mm512_mul_ps(ifft897, ifft899);
__m512 ifft993 = _mm512_mul_ps(ifft985, ifft899);
__m512 ifft907 = _mm512_mul_ps(ifft898, ifft899);
__m512 ifft994 = _mm512_mul_ps(ifft986, ifft899);
__m512 ifft908 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft909 = _mm512_fnmadd_ps(ifft892, ifft908, ifft900);
__m512 ifft995 = _mm512_fnmadd_ps(ifft980, ifft908, ifft987);
__m512 ifft910 = _mm512_fmadd_ps(ifft891, ifft908, ifft901);
__m512 ifft996 = _mm512_fmadd_ps(ifft979, ifft908, ifft988);
__m512 ifft911 = _mm512_fnmadd_ps(ifft894, ifft908, ifft902);
__m512 ifft997 = _mm512_fnmadd_ps(ifft982, ifft908, ifft989);
__m512 ifft912 = _mm512_fmadd_ps(ifft893, ifft908, ifft903);
__m512 ifft998 = _mm512_fmadd_ps(ifft981, ifft908, ifft990);
__m512 ifft913 = _mm512_fnmadd_ps(ifft896, ifft908, ifft904);
__m512 ifft999 = _mm512_fnmadd_ps(ifft984, ifft908, ifft991);
__m512 ifft914 = _mm512_fmadd_ps(ifft895, ifft908, ifft905);
__m512 ifft1000 = _mm512_fmadd_ps(ifft983, ifft908, ifft992);
__m512 ifft915 = _mm512_fnmadd_ps(ifft898, ifft908, ifft906);
__m512 ifft1001 = _mm512_fnmadd_ps(ifft986, ifft908, ifft993);
__m512 ifft916 = _mm512_fmadd_ps(ifft897, ifft908, ifft907);
__m512 ifft1002 = _mm512_fmadd_ps(ifft985, ifft908, ifft994);
__m512 ifft917 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft918 = _mm512_fmadd_ps(ifft909, ifft917, _mm512_shuffle_ps(ifft909, ifft909, 78));
__m512 ifft1003 = _mm512_fmadd_ps(ifft995, ifft917, _mm512_shuffle_ps(ifft995, ifft995, 78));
__m512 ifft919 = _mm512_fmadd_ps(ifft910, ifft917, _mm512_shuffle_ps(ifft910, ifft910, 78));
__m512 ifft1004 = _mm512_fmadd_ps(ifft996, ifft917, _mm512_shuffle_ps(ifft996, ifft996, 78));
__m512 ifft920 = _mm512_fmadd_ps(ifft911, ifft917, _mm512_shuffle_ps(ifft911, ifft911, 78));
__m512 ifft1005 = _mm512_fmadd_ps(ifft997, ifft917, _mm512_shuffle_ps(ifft997, ifft997, 78));
__m512 ifft921 = _mm512_fmadd_ps(ifft912, ifft917, _mm512_shuffle_ps(ifft912, ifft912, 78));
__m512 ifft1006 = _mm512_fmadd_ps(ifft998, ifft917, _mm512_shuffle_ps(ifft998, ifft998, 78));
__m512 ifft922 = _mm512_fmadd_ps(ifft913, ifft917, _mm512_shuffle_ps(ifft913, ifft913, 78));
__m512 ifft1007 = _mm512_fmadd_ps(ifft999, ifft917, _mm512_shuffle_ps(ifft999, ifft999, 78));
__m512 ifft923 = _mm512_fmadd_ps(ifft914, ifft917, _mm512_shuffle_ps(ifft914, ifft914, 78));
__m512 ifft1008 = _mm512_fmadd_ps(ifft1000, ifft917, _mm512_shuffle_ps(ifft1000, ifft1000, 78));
__m512 ifft924 = _mm512_fmadd_ps(ifft915, ifft917, _mm512_shuffle_ps(ifft915, ifft915, 78));
__m512 ifft1009 = _mm512_fmadd_ps(ifft1001, ifft917, _mm512_shuffle_ps(ifft1001, ifft1001, 78));
__m512 ifft925 = _mm512_fmadd_ps(ifft916, ifft917, _mm512_shuffle_ps(ifft916, ifft916, 78));
__m512 ifft1010 = _mm512_fmadd_ps(ifft1002, ifft917, _mm512_shuffle_ps(ifft1002, ifft1002, 78));
__m512 ifft926 = _mm512_mask_sub_ps(ifft918, 49344, _mm512_setzero_ps(), ifft919);
__m512 ifft1011 = _mm512_mask_sub_ps(ifft1003, 49344, _mm512_setzero_ps(), ifft1004);
__m512 ifft927 = _mm512_mask_mov_ps(ifft919, 49344, ifft918);
__m512 ifft1012 = _mm512_mask_mov_ps(ifft1004, 49344, ifft1003);
__m512 ifft928 = _mm512_mask_sub_ps(ifft920, 49344, _mm512_setzero_ps(), ifft921);
__m512 ifft1013 = _mm512_mask_sub_ps(ifft1005, 49344, _mm512_setzero_ps(), ifft1006);
__m512 ifft929 = _mm512_mask_mov_ps(ifft921, 49344, ifft920);
__m512 ifft1014 = _mm512_mask_mov_ps(ifft1006, 49344, ifft1005);
__m512 ifft930 = _mm512_mask_sub_ps(ifft922, 49344, _mm512_setzero_ps(), ifft923);
__m512 ifft1015 = _mm512_mask_sub_ps(ifft1007, 49344, _mm512_setzero_ps(), ifft1008);
__m512 ifft931 = _mm512_mask_mov_ps(ifft923, 49344, ifft922);
__m512 ifft1016 = _mm512_mask_mov_ps(ifft1008, 49344, ifft1007);
__m512 ifft932 = _mm512_mask_sub_ps(ifft924, 49344, _mm512_setzero_ps(), ifft925);
__m512 ifft1017 = _mm512_mask_sub_ps(ifft1009, 49344, _mm512_setzero_ps(), ifft1010);
__m512 ifft933 = _mm512_mask_mov_ps(ifft925, 49344, ifft924);
__m512 ifft1018 = _mm512_mask_mov_ps(ifft1010, 49344, ifft1009);
__m512 ifft934 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft935 = _mm512_fmadd_ps(ifft926, ifft934, _mm512_shuffle_f32x4(ifft926, ifft926, 177));
__m512 ifft1019 = _mm512_fmadd_ps(ifft1011, ifft934, _mm512_shuffle_f32x4(ifft1011, ifft1011, 177));
__m512 ifft936 = _mm512_fmadd_ps(ifft927, ifft934, _mm512_shuffle_f32x4(ifft927, ifft927, 177));
__m512 ifft1020 = _mm512_fmadd_ps(ifft1012, ifft934, _mm512_shuffle_f32x4(ifft1012, ifft1012, 177));
__m512 ifft937 = _mm512_fmadd_ps(ifft928, ifft934, _mm512_shuffle_f32x4(ifft928, ifft928, 177));
__m512 ifft1021 = _mm512_fmadd_ps(ifft1013, ifft934, _mm512_shuffle_f32x4(ifft1013, ifft1013, 177));
__m512 ifft938 = _mm512_fmadd_ps(ifft929, ifft934, _mm512_shuffle_f32x4(ifft929, ifft929, 177));
__m512 ifft1022 = _mm512_fmadd_ps(ifft1014, ifft934, _mm512_shuffle_f32x4(ifft1014, ifft1014, 177));
__m512 ifft939 = _mm512_fmadd_ps(ifft930, ifft934, _mm512_shuffle_f32x4(ifft930, ifft930, 177));
__m512 ifft1023 = _mm512_fmadd_ps(ifft1015, ifft934, _mm512_shuffle_f32x4(ifft1015, ifft1015, 177));
__m512 ifft940 = _mm512_fnmsub_ps(ifft931, ifft934, _mm512_shuffle_f32x4(ifft931, ifft931, 177));
__m512 ifft1024 = _mm512_fnmsub_ps(ifft1016, ifft934, _mm512_shuffle_f32x4(ifft1016, ifft1016, 177));
__m512 ifft941 = _mm512_fmadd_ps(ifft932, ifft934, _mm512_shuffle_f32x4(ifft932, ifft932, 177));
__m512 ifft1025 = _mm512_fmadd_ps(ifft1017, ifft934, _mm512_shuffle_f32x4(ifft1017, ifft1017, 177));
__m512 ifft942 = _mm512_fmadd_ps(ifft933, ifft934, _mm512_shuffle_f32x4(ifft933, ifft933, 177));
__m512 ifft1026 = _mm512_fmadd_ps(ifft1018, ifft934, _mm512_shuffle_f32x4(ifft1018, ifft1018, 177));
__m512 ifft943 = _mm512_add_ps(ifft935, ifft936);
__m512 ifft1027 = _mm512_add_ps(ifft1019, ifft1020);
__m512 ifft944 = _mm512_sub_ps(ifft935, ifft936);
__m512 ifft1028 = _mm512_sub_ps(ifft1019, ifft1020);
__m512 ifft945 = _mm512_sub_ps(ifft937, ifft941);
__m512 ifft1029 = _mm512_sub_ps(ifft1021, ifft1025);
__m512 ifft946 = _mm512_add_ps(ifft938, ifft942);
__m512 ifft1030 = _mm512_add_ps(ifft1022, ifft1026);
__m512 ifft947 = _mm512_add_ps(ifft937, ifft941);
__m512 ifft1031 = _mm512_add_ps(ifft1021, ifft1025);
__m512 ifft948 = _mm512_sub_ps(ifft938, ifft942);
__m512 ifft1032 = _mm512_sub_ps(ifft1022, ifft1026);
__m512 ifft949 = _mm512_mul_ps(ifft939, _mm512_set1_ps(3.125e-02f));
__m512 ifft1033 = _mm512_mul_ps(ifft1023, _mm512_set1_ps(3.125e-02f));
__m512 ifft950 = _mm512_mul_ps(ifft940, _mm512_set1_ps(3.125e-02f));
__m512 ifft1034 = _mm512_mul_ps(ifft1024, _mm512_set1_ps(3.125e-02f));
__m512 ifft951 = _mm512_fmadd_ps(ifft943, _mm512_set1_ps(1.5625e-02f), ifft949);
__m512 ifft1035 = _mm512_fmadd_ps(ifft1027, _mm512_set1_ps(1.5625e-02f), ifft1033);
__m512 ifft952 = _mm512_fmsub_ps(ifft943, _mm512_set1_ps(1.5625e-02f), ifft949);
__m512 ifft1036 = _mm512_fmsub_ps(ifft1027, _mm512_set1_ps(1.5625e-02f), ifft1033);
__m512 ifft953 = _mm512_fmadd_ps(ifft944, _mm512_set1_ps(1.5625e-02f), ifft950);
__m512 ifft1037 = _mm512_fmadd_ps(ifft1028, _mm512_set1_ps(1.5625e-02f), ifft1034);
__m512 ifft954 = _mm512_fmsub_ps(ifft944, _mm512_set1_ps(1.5625e-02f), ifft950);
__m512 ifft1038 = _mm512_fmsub_ps(ifft1028, _mm512_set1_ps(1.5625e-02f), ifft1034);
__m512 ifft955 = _mm512_add_ps(ifft945, ifft946);
__m512 ifft1039 = _mm512_add_ps(ifft1029, ifft1030);
__m512 ifft956 = _mm512_sub_ps(ifft945, ifft946);
__m512 ifft1040 = _mm512_sub_ps(ifft1029, ifft1030);
__m512 ifft957 = _mm512_fnmadd_ps(ifft955, _mm512_set1_ps(7.0710677e-01f), ifft947);
__m512 ifft1041 = _mm512_fnmadd_ps(ifft1039, _mm512_set1_ps(7.0710677e-01f), ifft1031);
__m512 ifft958 = _mm512_fmadd_ps(ifft955, _mm512_set1_ps(7.0710677e-01f), ifft947);
__m512 ifft1042 = _mm512_fmadd_ps(ifft1039, _mm512_set1_ps(7.0710677e-01f), ifft1031);
__m512 ifft959 = _mm512_fmadd_ps(ifft956, _mm512_set1_ps(7.0710677e-01f), ifft948);
__m512 ifft1043 = _mm512_fmadd_ps(ifft1040, _mm512_set1_ps(7.0710677e-01f), ifft1032);
__m512 ifft960 = _mm512_fmsub_ps(ifft956, _mm512_set1_ps(7.0710677e-01f), ifft948);
__m512 ifft1044 = _mm512_fmsub_ps(ifft1040, _mm512_set1_ps(7.0710677e-01f), ifft1032);
__m512 ifft961 = _mm512_add_ps(ifft957, ifft958);
__m512 ifft1045 = _mm512_add_ps(ifft1041, ifft1042);
__m512 ifft962 = _mm512_sub_ps(ifft957, ifft958);
__m512 ifft1046 = _mm512_sub_ps(ifft1041, ifft1042);
__m512 ifft963 = _mm512_add_ps(ifft959, ifft960);
__m512 ifft1047 = _mm512_add_ps(ifft1043, ifft1044);
__m512 ifft964 = _mm512_sub_ps(ifft959, ifft960);
__m512 ifft1048 = _mm512_sub_ps(ifft1043, ifft1044);
__m512 ifft965 = _mm512_fmadd_ps(ifft961, _mm512_set1_ps(1.5625e-02f), ifft951);
__m512 ifft1049 = _mm512_fmadd_ps(ifft1045, _mm512_set1_ps(1.5625e-02f), ifft1035);
__m512 ifft966 = _mm512_fnmadd_ps(ifft961, _mm512_set1_ps(1.5625e-02f), ifft951);
__m512 ifft1050 = _mm512_fnmadd_ps(ifft1045, _mm512_set1_ps(1.5625e-02f), ifft1035);
__m512 ifft967 = _mm512_fmadd_ps(ifft963, _mm512_set1_ps(1.5625e-02f), ifft953);
__m512 ifft1051 = _mm512_fmadd_ps(ifft1047, _mm512_set1_ps(1.5625e-02f), ifft1037);
__m512 ifft968 = _mm512_fnmadd_ps(ifft963, _mm512_set1_ps(1.5625e-02f), ifft953);
__m512 ifft1052 = _mm512_fnmadd_ps(ifft1047, _mm512_set1_ps(1.5625e-02f), ifft1037);
__m512 ifft969 = _mm512_fnmadd_ps(ifft964, _mm512_set1_ps(1.5625e-02f), ifft952);
__m512 ifft1053 = _mm512_fnmadd_ps(ifft1048, _mm512_set1_ps(1.5625e-02f), ifft1036);
__m512 ifft970 = _mm512_fmadd_ps(ifft964, _mm512_set1_ps(1.5625e-02f), ifft952);
__m512 ifft1054 = _mm512_fmadd_ps(ifft1048, _mm512_set1_ps(1.5625e-02f), ifft1036);
__m512 ifft971 = _mm512_fmadd_ps(ifft962, _mm512_set1_ps(1.5625e-02f), ifft954);
__m512 ifft1055 = _mm512_fmadd_ps(ifft1046, _mm512_set1_ps(1.5625e-02f), ifft1038);
__m512 ifft972 = _mm512_fnmadd_ps(ifft962, _mm512_set1_ps(1.5625e-02f), ifft954);
__m512 ifft1056 = _mm512_fnmadd_ps(ifft1046, _mm512_set1_ps(1.5625e-02f), ifft1038);
// Pick the butterfly results that are written out below: dat640..dat644 come
// from the first register group, dat645..dat649 from the second group.
__m512 dat640 = ifft965;
__m512 dat645 = ifft1049;
__m512 dat641 = ifft967;
__m512 dat646 = ifft1051;
__m512 dat642 = ifft969;
__m512 dat647 = ifft1053;
__m512 dat643 = ifft971;
__m512 dat648 = ifft1055;
__m512 dat644 = ifft966;
__m512 dat649 = ifft1050;
// These results are not used by the statements that follow; the casts only
// mark them as deliberately unused (presumably to avoid compiler warnings).
(void)ifft968;
(void)ifft1052;
(void)ifft970;
(void)ifft1054;
(void)ifft972;
(void)ifft1056;
// Two-source permutes: index values 0..15 pick from the first operand and
// 16..31 from the last operand. With pm9, lanes 0-4 are lanes 0-4 of the
// first operand and lanes 5-9 are lanes 0-4 of the last operand.
__m512i pm9 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack41 = _mm512_permutex2var_ps(dat640, pm9, dat645);
// With pm10, lanes 0-4 are lanes 8-12 of the last operand and lanes 5-9 are
// lanes 8-12 of the first operand. Only lanes 0-9 of each pack are stored.
__m512i pm10 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack42 = _mm512_permutex2var_ps(dat640, pm10, dat645);
__m512 pack43 = _mm512_permutex2var_ps(dat641, pm9, dat646);
__m512 pack44 = _mm512_permutex2var_ps(dat641, pm10, dat646);
__m512 pack45 = _mm512_permutex2var_ps(dat642, pm9, dat647);
__m512 pack46 = _mm512_permutex2var_ps(dat642, pm10, dat647);
__m512 pack47 = _mm512_permutex2var_ps(dat643, pm9, dat648);
__m512 pack48 = _mm512_permutex2var_ps(dat643, pm10, dat648);
__m512 pack49 = _mm512_permutex2var_ps(dat644, pm9, dat649);
__m512 pack50 = _mm512_permutex2var_ps(dat644, pm10, dat649);
// Element-wise max with zero, i.e. negative lanes become 0 (ReLU).
pack41 = _mm512_max_ps(_mm512_setzero_ps(), pack41);
pack42 = _mm512_max_ps(_mm512_setzero_ps(), pack42);
pack43 = _mm512_max_ps(_mm512_setzero_ps(), pack43);
pack44 = _mm512_max_ps(_mm512_setzero_ps(), pack44);
pack45 = _mm512_max_ps(_mm512_setzero_ps(), pack45);
pack46 = _mm512_max_ps(_mm512_setzero_ps(), pack46);
pack47 = _mm512_max_ps(_mm512_setzero_ps(), pack47);
pack48 = _mm512_max_ps(_mm512_setzero_ps(), pack48);
pack49 = _mm512_max_ps(_mm512_setzero_ps(), pack49);
pack50 = _mm512_max_ps(_mm512_setzero_ps(), pack50);
// Masked unaligned stores: mask 1023 (0x3FF) writes lanes 0-9 only.
// Each pm9 pack goes to a base offset (0, 448, 896, 1344, 1792) and the
// matching pm10 pack goes 50240 further.
// NOTE(review): the offsets look like byte counts (448 = 112 floats,
// 50240 = one channel plane?) -- confirm against the declaration of datPtr2.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack41);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack42);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack43);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack44);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack45);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack46);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack47);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack48);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack49);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack50);
}
}
}
// Return from the function once j5 has reached last2; otherwise move on to
// the next j5.
if (j5 >= last2) return;
++j5;
}
rel5 = 3;
}
// Fixed position used by the address expressions of the tile below.
// NOTE(review): toH/toW presumably mean output row/column -- confirm.
ptrdiff_t toH5 = base5+0;
ptrdiff_t toW5 = 95;
// k29 starts at 16*w21 and the loop runs until k29 equals 16.
// NOTE(review): the != test only terminates if 16*w21 <= 16; w21 is set
// outside this view -- confirm its range.
ptrdiff_t k29 = 16*w21;
for (; k29 != 16; ++k29) {
// r6 takes the values 0 and 1.
ptrdiff_t r6 = 0;
for (; r6 != 2; ++r6) {
// t8 is initialised to 0; the address expressions that follow multiply it
// by 256 (loads) and by 40 (stores).
ptrdiff_t t8 = 0;
// Sixteen unaligned 16-float loads from sfPtr3, addressed by i9, j5, k29, r6
// and t8. They form four groups whose base offsets are 0, 2166784, 4333568
// and 6500352 (multiples of 2166784). Within a group the offsets +0, +64,
// +128, +192 fill Re(n), Im(n), Re(n+4), Im(n+4) for n = 105..108.
// NOTE(review): Re/Im presumably hold real and imaginary parts, and the
// 64-step offsets suggest sfPtr3 is a byte pointer -- confirm.
__m512 sfRe105 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm105 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe109 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm109 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe106 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm106 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe110 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm110 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe107 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm107 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe111 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm111 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe108 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm108 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe112 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm112 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
// Transform of the sixteen sfRe/sfIm registers into the ifft1141..ifft1232
// results. Two register groups are processed in lock-step: the group fed by
// sfRe105..sfIm108 and the group fed by sfRe109..sfIm112.
// NOTE(review): the ifft* naming and the 1/64 and 1/32 scale factors suggest
// an inverse FFT with normalisation folded in -- confirm against the generator.
//
// Step 1: lane permutations of the first Re/Im pair of each group only
// (sfRe105/sfIm105 and sfRe109/sfIm109), using index vectors ifft1057 and
// ifft1059.
__m512i ifft1057 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1058 = _mm512_permutexvar_ps(ifft1057, sfRe105);
__m512 ifft1149 = _mm512_permutexvar_ps(ifft1057, sfRe109);
__m512i ifft1059 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1060 = _mm512_permutexvar_ps(ifft1059, sfRe105);
__m512 ifft1150 = _mm512_permutexvar_ps(ifft1059, sfRe109);
__m512 ifft1061 = _mm512_permutexvar_ps(ifft1057, sfIm105);
__m512 ifft1151 = _mm512_permutexvar_ps(ifft1057, sfIm109);
__m512 ifft1062 = _mm512_permutexvar_ps(ifft1059, sfIm105);
__m512 ifft1152 = _mm512_permutexvar_ps(ifft1059, sfIm109);
// Masked fused multiply-adds combine the permuted values. Mask 65021 (0xFDFD)
// selects every lane except 1 and 9; unselected lanes keep the first operand.
// ifft1063 holds +1/-1 signs, with 0 in lanes 0, 1, 8 and 9.
__m512 ifft1063 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft1064 = _mm512_mask_fmadd_ps(ifft1062, 65021, ifft1063, ifft1058);
__m512 ifft1153 = _mm512_mask_fmadd_ps(ifft1152, 65021, ifft1063, ifft1149);
__m512 ifft1065 = _mm512_mask_fnmadd_ps(ifft1061, 65021, ifft1063, ifft1060);
__m512 ifft1154 = _mm512_mask_fnmadd_ps(ifft1151, 65021, ifft1063, ifft1150);
// Step 2: butterfly over adjacent lanes. ifft1066 is +1 in even lanes and -1
// in odd lanes; shuffle immediate 177 swaps the two floats of each adjacent
// pair. Each result holds x[2i]+x[2i+1] in even lanes and x[2i]-x[2i+1] in
// odd lanes.
__m512 ifft1066 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1067 = _mm512_fmadd_ps(ifft1064, ifft1066, _mm512_shuffle_ps(ifft1064, ifft1064, 177));
__m512 ifft1155 = _mm512_fmadd_ps(ifft1153, ifft1066, _mm512_shuffle_ps(ifft1153, ifft1153, 177));
__m512 ifft1068 = _mm512_fmadd_ps(ifft1065, ifft1066, _mm512_shuffle_ps(ifft1065, ifft1065, 177));
__m512 ifft1156 = _mm512_fmadd_ps(ifft1154, ifft1066, _mm512_shuffle_ps(ifft1154, ifft1154, 177));
__m512 ifft1069 = _mm512_fmadd_ps(sfRe106, ifft1066, _mm512_shuffle_ps(sfRe106, sfRe106, 177));
__m512 ifft1157 = _mm512_fmadd_ps(sfRe110, ifft1066, _mm512_shuffle_ps(sfRe110, sfRe110, 177));
__m512 ifft1070 = _mm512_fmadd_ps(sfIm106, ifft1066, _mm512_shuffle_ps(sfIm106, sfIm106, 177));
__m512 ifft1158 = _mm512_fmadd_ps(sfIm110, ifft1066, _mm512_shuffle_ps(sfIm110, sfIm110, 177));
__m512 ifft1071 = _mm512_fmadd_ps(sfRe107, ifft1066, _mm512_shuffle_ps(sfRe107, sfRe107, 177));
__m512 ifft1159 = _mm512_fmadd_ps(sfRe111, ifft1066, _mm512_shuffle_ps(sfRe111, sfRe111, 177));
__m512 ifft1072 = _mm512_fmadd_ps(sfIm107, ifft1066, _mm512_shuffle_ps(sfIm107, sfIm107, 177));
__m512 ifft1160 = _mm512_fmadd_ps(sfIm111, ifft1066, _mm512_shuffle_ps(sfIm111, sfIm111, 177));
__m512 ifft1073 = _mm512_fmadd_ps(sfRe108, ifft1066, _mm512_shuffle_ps(sfRe108, sfRe108, 177));
__m512 ifft1161 = _mm512_fmadd_ps(sfRe112, ifft1066, _mm512_shuffle_ps(sfRe112, sfRe112, 177));
__m512 ifft1074 = _mm512_fmadd_ps(sfIm108, ifft1066, _mm512_shuffle_ps(sfIm108, sfIm108, 177));
__m512 ifft1162 = _mm512_fmadd_ps(sfIm112, ifft1066, _mm512_shuffle_ps(sfIm112, sfIm112, 177));
// Step 3: per-lane multiply by constant pairs. With c = ifft1075 and
// s = ifft1084, each register pair (a, b) becomes (a*c - b*s, b*c + a*s):
// the products with c are formed first, then the fnmadd/fmadd adds the
// s terms.
// NOTE(review): this has the shape of a complex multiply by c + i*s
// (twiddle factors, 0.70710677 ~ sqrt(1/2)) -- confirm.
__m512 ifft1075 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft1076 = _mm512_mul_ps(ifft1067, ifft1075);
__m512 ifft1163 = _mm512_mul_ps(ifft1155, ifft1075);
__m512 ifft1077 = _mm512_mul_ps(ifft1068, ifft1075);
__m512 ifft1164 = _mm512_mul_ps(ifft1156, ifft1075);
__m512 ifft1078 = _mm512_mul_ps(ifft1069, ifft1075);
__m512 ifft1165 = _mm512_mul_ps(ifft1157, ifft1075);
__m512 ifft1079 = _mm512_mul_ps(ifft1070, ifft1075);
__m512 ifft1166 = _mm512_mul_ps(ifft1158, ifft1075);
__m512 ifft1080 = _mm512_mul_ps(ifft1071, ifft1075);
__m512 ifft1167 = _mm512_mul_ps(ifft1159, ifft1075);
__m512 ifft1081 = _mm512_mul_ps(ifft1072, ifft1075);
__m512 ifft1168 = _mm512_mul_ps(ifft1160, ifft1075);
__m512 ifft1082 = _mm512_mul_ps(ifft1073, ifft1075);
__m512 ifft1169 = _mm512_mul_ps(ifft1161, ifft1075);
__m512 ifft1083 = _mm512_mul_ps(ifft1074, ifft1075);
__m512 ifft1170 = _mm512_mul_ps(ifft1162, ifft1075);
__m512 ifft1084 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft1085 = _mm512_fnmadd_ps(ifft1068, ifft1084, ifft1076);
__m512 ifft1171 = _mm512_fnmadd_ps(ifft1156, ifft1084, ifft1163);
__m512 ifft1086 = _mm512_fmadd_ps(ifft1067, ifft1084, ifft1077);
__m512 ifft1172 = _mm512_fmadd_ps(ifft1155, ifft1084, ifft1164);
__m512 ifft1087 = _mm512_fnmadd_ps(ifft1070, ifft1084, ifft1078);
__m512 ifft1173 = _mm512_fnmadd_ps(ifft1158, ifft1084, ifft1165);
__m512 ifft1088 = _mm512_fmadd_ps(ifft1069, ifft1084, ifft1079);
__m512 ifft1174 = _mm512_fmadd_ps(ifft1157, ifft1084, ifft1166);
__m512 ifft1089 = _mm512_fnmadd_ps(ifft1072, ifft1084, ifft1080);
__m512 ifft1175 = _mm512_fnmadd_ps(ifft1160, ifft1084, ifft1167);
__m512 ifft1090 = _mm512_fmadd_ps(ifft1071, ifft1084, ifft1081);
__m512 ifft1176 = _mm512_fmadd_ps(ifft1159, ifft1084, ifft1168);
__m512 ifft1091 = _mm512_fnmadd_ps(ifft1074, ifft1084, ifft1082);
__m512 ifft1177 = _mm512_fnmadd_ps(ifft1162, ifft1084, ifft1169);
__m512 ifft1092 = _mm512_fmadd_ps(ifft1073, ifft1084, ifft1083);
__m512 ifft1178 = _mm512_fmadd_ps(ifft1161, ifft1084, ifft1170);
// Step 4: butterfly at distance 2 inside each group of four lanes.
// ifft1093 is +1,+1,-1,-1 repeating (from lane 0); shuffle immediate 78 swaps
// the two halves of each 4-lane group. Lanes 0-1 of a group get x[j]+x[j+2],
// lanes 2-3 get x[j-2]-x[j].
__m512 ifft1093 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1094 = _mm512_fmadd_ps(ifft1085, ifft1093, _mm512_shuffle_ps(ifft1085, ifft1085, 78));
__m512 ifft1179 = _mm512_fmadd_ps(ifft1171, ifft1093, _mm512_shuffle_ps(ifft1171, ifft1171, 78));
__m512 ifft1095 = _mm512_fmadd_ps(ifft1086, ifft1093, _mm512_shuffle_ps(ifft1086, ifft1086, 78));
__m512 ifft1180 = _mm512_fmadd_ps(ifft1172, ifft1093, _mm512_shuffle_ps(ifft1172, ifft1172, 78));
__m512 ifft1096 = _mm512_fmadd_ps(ifft1087, ifft1093, _mm512_shuffle_ps(ifft1087, ifft1087, 78));
__m512 ifft1181 = _mm512_fmadd_ps(ifft1173, ifft1093, _mm512_shuffle_ps(ifft1173, ifft1173, 78));
__m512 ifft1097 = _mm512_fmadd_ps(ifft1088, ifft1093, _mm512_shuffle_ps(ifft1088, ifft1088, 78));
__m512 ifft1182 = _mm512_fmadd_ps(ifft1174, ifft1093, _mm512_shuffle_ps(ifft1174, ifft1174, 78));
__m512 ifft1098 = _mm512_fmadd_ps(ifft1089, ifft1093, _mm512_shuffle_ps(ifft1089, ifft1089, 78));
__m512 ifft1183 = _mm512_fmadd_ps(ifft1175, ifft1093, _mm512_shuffle_ps(ifft1175, ifft1175, 78));
__m512 ifft1099 = _mm512_fmadd_ps(ifft1090, ifft1093, _mm512_shuffle_ps(ifft1090, ifft1090, 78));
__m512 ifft1184 = _mm512_fmadd_ps(ifft1176, ifft1093, _mm512_shuffle_ps(ifft1176, ifft1176, 78));
__m512 ifft1100 = _mm512_fmadd_ps(ifft1091, ifft1093, _mm512_shuffle_ps(ifft1091, ifft1091, 78));
__m512 ifft1185 = _mm512_fmadd_ps(ifft1177, ifft1093, _mm512_shuffle_ps(ifft1177, ifft1177, 78));
__m512 ifft1101 = _mm512_fmadd_ps(ifft1092, ifft1093, _mm512_shuffle_ps(ifft1092, ifft1092, 78));
__m512 ifft1186 = _mm512_fmadd_ps(ifft1178, ifft1093, _mm512_shuffle_ps(ifft1178, ifft1178, 78));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes each
// register pair (a, b) becomes (-b, a); all other lanes pass through
// unchanged.
__m512 ifft1102 = _mm512_mask_sub_ps(ifft1094, 49344, _mm512_setzero_ps(), ifft1095);
__m512 ifft1187 = _mm512_mask_sub_ps(ifft1179, 49344, _mm512_setzero_ps(), ifft1180);
__m512 ifft1103 = _mm512_mask_mov_ps(ifft1095, 49344, ifft1094);
__m512 ifft1188 = _mm512_mask_mov_ps(ifft1180, 49344, ifft1179);
__m512 ifft1104 = _mm512_mask_sub_ps(ifft1096, 49344, _mm512_setzero_ps(), ifft1097);
__m512 ifft1189 = _mm512_mask_sub_ps(ifft1181, 49344, _mm512_setzero_ps(), ifft1182);
__m512 ifft1105 = _mm512_mask_mov_ps(ifft1097, 49344, ifft1096);
__m512 ifft1190 = _mm512_mask_mov_ps(ifft1182, 49344, ifft1181);
__m512 ifft1106 = _mm512_mask_sub_ps(ifft1098, 49344, _mm512_setzero_ps(), ifft1099);
__m512 ifft1191 = _mm512_mask_sub_ps(ifft1183, 49344, _mm512_setzero_ps(), ifft1184);
__m512 ifft1107 = _mm512_mask_mov_ps(ifft1099, 49344, ifft1098);
__m512 ifft1192 = _mm512_mask_mov_ps(ifft1184, 49344, ifft1183);
__m512 ifft1108 = _mm512_mask_sub_ps(ifft1100, 49344, _mm512_setzero_ps(), ifft1101);
__m512 ifft1193 = _mm512_mask_sub_ps(ifft1185, 49344, _mm512_setzero_ps(), ifft1186);
__m512 ifft1109 = _mm512_mask_mov_ps(ifft1101, 49344, ifft1100);
__m512 ifft1194 = _mm512_mask_mov_ps(ifft1186, 49344, ifft1185);
// Step 5: butterfly between adjacent 128-bit lanes. ifft1110 is +1 in lanes
// 0-3 and 8-11 and -1 in lanes 4-7 and 12-15; _mm512_shuffle_f32x4 with
// immediate 177 swaps adjacent 128-bit lanes. ifft1116 and ifft1200 use
// fnmsub instead of fmadd, so those two results are the negated butterfly.
__m512 ifft1110 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1111 = _mm512_fmadd_ps(ifft1102, ifft1110, _mm512_shuffle_f32x4(ifft1102, ifft1102, 177));
__m512 ifft1195 = _mm512_fmadd_ps(ifft1187, ifft1110, _mm512_shuffle_f32x4(ifft1187, ifft1187, 177));
__m512 ifft1112 = _mm512_fmadd_ps(ifft1103, ifft1110, _mm512_shuffle_f32x4(ifft1103, ifft1103, 177));
__m512 ifft1196 = _mm512_fmadd_ps(ifft1188, ifft1110, _mm512_shuffle_f32x4(ifft1188, ifft1188, 177));
__m512 ifft1113 = _mm512_fmadd_ps(ifft1104, ifft1110, _mm512_shuffle_f32x4(ifft1104, ifft1104, 177));
__m512 ifft1197 = _mm512_fmadd_ps(ifft1189, ifft1110, _mm512_shuffle_f32x4(ifft1189, ifft1189, 177));
__m512 ifft1114 = _mm512_fmadd_ps(ifft1105, ifft1110, _mm512_shuffle_f32x4(ifft1105, ifft1105, 177));
__m512 ifft1198 = _mm512_fmadd_ps(ifft1190, ifft1110, _mm512_shuffle_f32x4(ifft1190, ifft1190, 177));
__m512 ifft1115 = _mm512_fmadd_ps(ifft1106, ifft1110, _mm512_shuffle_f32x4(ifft1106, ifft1106, 177));
__m512 ifft1199 = _mm512_fmadd_ps(ifft1191, ifft1110, _mm512_shuffle_f32x4(ifft1191, ifft1191, 177));
__m512 ifft1116 = _mm512_fnmsub_ps(ifft1107, ifft1110, _mm512_shuffle_f32x4(ifft1107, ifft1107, 177));
__m512 ifft1200 = _mm512_fnmsub_ps(ifft1192, ifft1110, _mm512_shuffle_f32x4(ifft1192, ifft1192, 177));
__m512 ifft1117 = _mm512_fmadd_ps(ifft1108, ifft1110, _mm512_shuffle_f32x4(ifft1108, ifft1108, 177));
__m512 ifft1201 = _mm512_fmadd_ps(ifft1193, ifft1110, _mm512_shuffle_f32x4(ifft1193, ifft1193, 177));
__m512 ifft1118 = _mm512_fmadd_ps(ifft1109, ifft1110, _mm512_shuffle_f32x4(ifft1109, ifft1109, 177));
__m512 ifft1202 = _mm512_fmadd_ps(ifft1194, ifft1110, _mm512_shuffle_f32x4(ifft1194, ifft1194, 177));
// Step 6: sums and differences across whole registers.
__m512 ifft1119 = _mm512_add_ps(ifft1111, ifft1112);
__m512 ifft1203 = _mm512_add_ps(ifft1195, ifft1196);
__m512 ifft1120 = _mm512_sub_ps(ifft1111, ifft1112);
__m512 ifft1204 = _mm512_sub_ps(ifft1195, ifft1196);
__m512 ifft1121 = _mm512_sub_ps(ifft1113, ifft1117);
__m512 ifft1205 = _mm512_sub_ps(ifft1197, ifft1201);
__m512 ifft1122 = _mm512_add_ps(ifft1114, ifft1118);
__m512 ifft1206 = _mm512_add_ps(ifft1198, ifft1202);
__m512 ifft1123 = _mm512_add_ps(ifft1113, ifft1117);
__m512 ifft1207 = _mm512_add_ps(ifft1197, ifft1201);
__m512 ifft1124 = _mm512_sub_ps(ifft1114, ifft1118);
__m512 ifft1208 = _mm512_sub_ps(ifft1198, ifft1202);
// Scaling is applied here: two terms are multiplied by 3.125e-02 (1/32) and
// the sums/differences are multiplied by 1.5625e-02 (1/64) inside the fused
// multiply-adds.
__m512 ifft1125 = _mm512_mul_ps(ifft1115, _mm512_set1_ps(3.125e-02f));
__m512 ifft1209 = _mm512_mul_ps(ifft1199, _mm512_set1_ps(3.125e-02f));
__m512 ifft1126 = _mm512_mul_ps(ifft1116, _mm512_set1_ps(3.125e-02f));
__m512 ifft1210 = _mm512_mul_ps(ifft1200, _mm512_set1_ps(3.125e-02f));
__m512 ifft1127 = _mm512_fmadd_ps(ifft1119, _mm512_set1_ps(1.5625e-02f), ifft1125);
__m512 ifft1211 = _mm512_fmadd_ps(ifft1203, _mm512_set1_ps(1.5625e-02f), ifft1209);
__m512 ifft1128 = _mm512_fmsub_ps(ifft1119, _mm512_set1_ps(1.5625e-02f), ifft1125);
__m512 ifft1212 = _mm512_fmsub_ps(ifft1203, _mm512_set1_ps(1.5625e-02f), ifft1209);
__m512 ifft1129 = _mm512_fmadd_ps(ifft1120, _mm512_set1_ps(1.5625e-02f), ifft1126);
__m512 ifft1213 = _mm512_fmadd_ps(ifft1204, _mm512_set1_ps(1.5625e-02f), ifft1210);
__m512 ifft1130 = _mm512_fmsub_ps(ifft1120, _mm512_set1_ps(1.5625e-02f), ifft1126);
__m512 ifft1214 = _mm512_fmsub_ps(ifft1204, _mm512_set1_ps(1.5625e-02f), ifft1210);
// The sum and difference of ifft1121/ifft1122 (and ifft1205/ifft1206) are
// weighted by 0.70710677 and combined with ifft1123/ifft1124 (ifft1207/ifft1208).
__m512 ifft1131 = _mm512_add_ps(ifft1121, ifft1122);
__m512 ifft1215 = _mm512_add_ps(ifft1205, ifft1206);
__m512 ifft1132 = _mm512_sub_ps(ifft1121, ifft1122);
__m512 ifft1216 = _mm512_sub_ps(ifft1205, ifft1206);
__m512 ifft1133 = _mm512_fnmadd_ps(ifft1131, _mm512_set1_ps(7.0710677e-01f), ifft1123);
__m512 ifft1217 = _mm512_fnmadd_ps(ifft1215, _mm512_set1_ps(7.0710677e-01f), ifft1207);
__m512 ifft1134 = _mm512_fmadd_ps(ifft1131, _mm512_set1_ps(7.0710677e-01f), ifft1123);
__m512 ifft1218 = _mm512_fmadd_ps(ifft1215, _mm512_set1_ps(7.0710677e-01f), ifft1207);
__m512 ifft1135 = _mm512_fmadd_ps(ifft1132, _mm512_set1_ps(7.0710677e-01f), ifft1124);
__m512 ifft1219 = _mm512_fmadd_ps(ifft1216, _mm512_set1_ps(7.0710677e-01f), ifft1208);
__m512 ifft1136 = _mm512_fmsub_ps(ifft1132, _mm512_set1_ps(7.0710677e-01f), ifft1124);
__m512 ifft1220 = _mm512_fmsub_ps(ifft1216, _mm512_set1_ps(7.0710677e-01f), ifft1208);
__m512 ifft1137 = _mm512_add_ps(ifft1133, ifft1134);
__m512 ifft1221 = _mm512_add_ps(ifft1217, ifft1218);
__m512 ifft1138 = _mm512_sub_ps(ifft1133, ifft1134);
__m512 ifft1222 = _mm512_sub_ps(ifft1217, ifft1218);
__m512 ifft1139 = _mm512_add_ps(ifft1135, ifft1136);
__m512 ifft1223 = _mm512_add_ps(ifft1219, ifft1220);
__m512 ifft1140 = _mm512_sub_ps(ifft1135, ifft1136);
__m512 ifft1224 = _mm512_sub_ps(ifft1219, ifft1220);
// Final combine: each result is a previously scaled term plus or minus 1/64
// of one of the sums/differences above, giving eight results per group.
__m512 ifft1141 = _mm512_fmadd_ps(ifft1137, _mm512_set1_ps(1.5625e-02f), ifft1127);
__m512 ifft1225 = _mm512_fmadd_ps(ifft1221, _mm512_set1_ps(1.5625e-02f), ifft1211);
__m512 ifft1142 = _mm512_fnmadd_ps(ifft1137, _mm512_set1_ps(1.5625e-02f), ifft1127);
__m512 ifft1226 = _mm512_fnmadd_ps(ifft1221, _mm512_set1_ps(1.5625e-02f), ifft1211);
__m512 ifft1143 = _mm512_fmadd_ps(ifft1139, _mm512_set1_ps(1.5625e-02f), ifft1129);
__m512 ifft1227 = _mm512_fmadd_ps(ifft1223, _mm512_set1_ps(1.5625e-02f), ifft1213);
__m512 ifft1144 = _mm512_fnmadd_ps(ifft1139, _mm512_set1_ps(1.5625e-02f), ifft1129);
__m512 ifft1228 = _mm512_fnmadd_ps(ifft1223, _mm512_set1_ps(1.5625e-02f), ifft1213);
__m512 ifft1145 = _mm512_fnmadd_ps(ifft1140, _mm512_set1_ps(1.5625e-02f), ifft1128);
__m512 ifft1229 = _mm512_fnmadd_ps(ifft1224, _mm512_set1_ps(1.5625e-02f), ifft1212);
__m512 ifft1146 = _mm512_fmadd_ps(ifft1140, _mm512_set1_ps(1.5625e-02f), ifft1128);
__m512 ifft1230 = _mm512_fmadd_ps(ifft1224, _mm512_set1_ps(1.5625e-02f), ifft1212);
__m512 ifft1147 = _mm512_fmadd_ps(ifft1138, _mm512_set1_ps(1.5625e-02f), ifft1130);
__m512 ifft1231 = _mm512_fmadd_ps(ifft1222, _mm512_set1_ps(1.5625e-02f), ifft1214);
__m512 ifft1148 = _mm512_fnmadd_ps(ifft1138, _mm512_set1_ps(1.5625e-02f), ifft1130);
__m512 ifft1232 = _mm512_fnmadd_ps(ifft1222, _mm512_set1_ps(1.5625e-02f), ifft1214);
// Pick the butterfly results that are written out below: dat650..dat654 come
// from the first register group, dat655..dat659 from the second group.
__m512 dat650 = ifft1141;
__m512 dat655 = ifft1225;
__m512 dat651 = ifft1143;
__m512 dat656 = ifft1227;
__m512 dat652 = ifft1145;
__m512 dat657 = ifft1229;
__m512 dat653 = ifft1147;
__m512 dat658 = ifft1231;
__m512 dat654 = ifft1142;
__m512 dat659 = ifft1226;
// These results are not used by the statements that follow; the casts only
// mark them as deliberately unused (presumably to avoid compiler warnings).
(void)ifft1144;
(void)ifft1228;
(void)ifft1146;
(void)ifft1230;
(void)ifft1148;
(void)ifft1232;
// Two-source permutes: index values 0..15 pick from the first operand and
// 16..31 from the last operand. With pm11, lanes 0-4 are lanes 0-4 of the
// first operand and lanes 5-9 are lanes 0-4 of the last operand.
__m512i pm11 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack51 = _mm512_permutex2var_ps(dat650, pm11, dat655);
// With pm12, lanes 0-4 are lanes 8-12 of the last operand and lanes 5-9 are
// lanes 8-12 of the first operand. Only lanes 0-9 of each pack are stored.
__m512i pm12 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack52 = _mm512_permutex2var_ps(dat650, pm12, dat655);
__m512 pack53 = _mm512_permutex2var_ps(dat651, pm11, dat656);
__m512 pack54 = _mm512_permutex2var_ps(dat651, pm12, dat656);
__m512 pack55 = _mm512_permutex2var_ps(dat652, pm11, dat657);
__m512 pack56 = _mm512_permutex2var_ps(dat652, pm12, dat657);
__m512 pack57 = _mm512_permutex2var_ps(dat653, pm11, dat658);
__m512 pack58 = _mm512_permutex2var_ps(dat653, pm12, dat658);
__m512 pack59 = _mm512_permutex2var_ps(dat654, pm11, dat659);
__m512 pack60 = _mm512_permutex2var_ps(dat654, pm12, dat659);
// Element-wise max with zero, i.e. negative lanes become 0 (ReLU).
pack51 = _mm512_max_ps(_mm512_setzero_ps(), pack51);
pack52 = _mm512_max_ps(_mm512_setzero_ps(), pack52);
pack53 = _mm512_max_ps(_mm512_setzero_ps(), pack53);
pack54 = _mm512_max_ps(_mm512_setzero_ps(), pack54);
pack55 = _mm512_max_ps(_mm512_setzero_ps(), pack55);
pack56 = _mm512_max_ps(_mm512_setzero_ps(), pack56);
pack57 = _mm512_max_ps(_mm512_setzero_ps(), pack57);
pack58 = _mm512_max_ps(_mm512_setzero_ps(), pack58);
pack59 = _mm512_max_ps(_mm512_setzero_ps(), pack59);
pack60 = _mm512_max_ps(_mm512_setzero_ps(), pack60);
// Masked unaligned stores: mask 1023 (0x3FF) writes lanes 0-9 only.
// Each pm11 pack goes to a base offset (0, 448, 896, 1344, 1792) and the
// matching pm12 pack goes 50240 further.
// NOTE(review): the offsets look like byte counts (448 = 112 floats,
// 50240 = one channel plane?) -- confirm against the declaration of datPtr2.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack51);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack52);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack53);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack54);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack55);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack56);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack57);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack58);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack59);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack60);
ptrdiff_t t9 = 0;
__m512 sfRe113 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm113 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe117 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm117 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe114 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm114 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe118 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm118 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe115 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm115 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe119 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm119 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe116 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm116 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe120 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm120 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512i ifft1233 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1234 = _mm512_permutexvar_ps(ifft1233, sfRe113);
__m512 ifft1325 = _mm512_permutexvar_ps(ifft1233, sfRe117);
__m512i ifft1235 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1236 = _mm512_permutexvar_ps(ifft1235, sfRe113);
__m512 ifft1326 = _mm512_permutexvar_ps(ifft1235, sfRe117);
__m512 ifft1237 = _mm512_permutexvar_ps(ifft1233, sfIm113);
__m512 ifft1327 = _mm512_permutexvar_ps(ifft1233, sfIm117);
__m512 ifft1238 = _mm512_permutexvar_ps(ifft1235, sfIm113);
__m512 ifft1328 = _mm512_permutexvar_ps(ifft1235, sfIm117);
__m512 ifft1239 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft1240 = _mm512_mask_fmadd_ps(ifft1238, 65021, ifft1239, ifft1234);
__m512 ifft1329 = _mm512_mask_fmadd_ps(ifft1328, 65021, ifft1239, ifft1325);
__m512 ifft1241 = _mm512_mask_fnmadd_ps(ifft1237, 65021, ifft1239, ifft1236);
__m512 ifft1330 = _mm512_mask_fnmadd_ps(ifft1327, 65021, ifft1239, ifft1326);
__m512 ifft1242 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1243 = _mm512_fmadd_ps(ifft1240, ifft1242, _mm512_shuffle_ps(ifft1240, ifft1240, 177));
__m512 ifft1331 = _mm512_fmadd_ps(ifft1329, ifft1242, _mm512_shuffle_ps(ifft1329, ifft1329, 177));
__m512 ifft1244 = _mm512_fmadd_ps(ifft1241, ifft1242, _mm512_shuffle_ps(ifft1241, ifft1241, 177));
__m512 ifft1332 = _mm512_fmadd_ps(ifft1330, ifft1242, _mm512_shuffle_ps(ifft1330, ifft1330, 177));
__m512 ifft1245 = _mm512_fmadd_ps(sfRe114, ifft1242, _mm512_shuffle_ps(sfRe114, sfRe114, 177));
__m512 ifft1333 = _mm512_fmadd_ps(sfRe118, ifft1242, _mm512_shuffle_ps(sfRe118, sfRe118, 177));
__m512 ifft1246 = _mm512_fmadd_ps(sfIm114, ifft1242, _mm512_shuffle_ps(sfIm114, sfIm114, 177));
__m512 ifft1334 = _mm512_fmadd_ps(sfIm118, ifft1242, _mm512_shuffle_ps(sfIm118, sfIm118, 177));
__m512 ifft1247 = _mm512_fmadd_ps(sfRe115, ifft1242, _mm512_shuffle_ps(sfRe115, sfRe115, 177));
__m512 ifft1335 = _mm512_fmadd_ps(sfRe119, ifft1242, _mm512_shuffle_ps(sfRe119, sfRe119, 177));
__m512 ifft1248 = _mm512_fmadd_ps(sfIm115, ifft1242, _mm512_shuffle_ps(sfIm115, sfIm115, 177));
__m512 ifft1336 = _mm512_fmadd_ps(sfIm119, ifft1242, _mm512_shuffle_ps(sfIm119, sfIm119, 177));
__m512 ifft1249 = _mm512_fmadd_ps(sfRe116, ifft1242, _mm512_shuffle_ps(sfRe116, sfRe116, 177));
__m512 ifft1337 = _mm512_fmadd_ps(sfRe120, ifft1242, _mm512_shuffle_ps(sfRe120, sfRe120, 177));
__m512 ifft1250 = _mm512_fmadd_ps(sfIm116, ifft1242, _mm512_shuffle_ps(sfIm116, sfIm116, 177));
__m512 ifft1338 = _mm512_fmadd_ps(sfIm120, ifft1242, _mm512_shuffle_ps(sfIm120, sfIm120, 177));
__m512 ifft1251 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft1252 = _mm512_mul_ps(ifft1243, ifft1251);
__m512 ifft1339 = _mm512_mul_ps(ifft1331, ifft1251);
__m512 ifft1253 = _mm512_mul_ps(ifft1244, ifft1251);
__m512 ifft1340 = _mm512_mul_ps(ifft1332, ifft1251);
__m512 ifft1254 = _mm512_mul_ps(ifft1245, ifft1251);
__m512 ifft1341 = _mm512_mul_ps(ifft1333, ifft1251);
__m512 ifft1255 = _mm512_mul_ps(ifft1246, ifft1251);
__m512 ifft1342 = _mm512_mul_ps(ifft1334, ifft1251);
__m512 ifft1256 = _mm512_mul_ps(ifft1247, ifft1251);
__m512 ifft1343 = _mm512_mul_ps(ifft1335, ifft1251);
__m512 ifft1257 = _mm512_mul_ps(ifft1248, ifft1251);
__m512 ifft1344 = _mm512_mul_ps(ifft1336, ifft1251);
__m512 ifft1258 = _mm512_mul_ps(ifft1249, ifft1251);
__m512 ifft1345 = _mm512_mul_ps(ifft1337, ifft1251);
__m512 ifft1259 = _mm512_mul_ps(ifft1250, ifft1251);
__m512 ifft1346 = _mm512_mul_ps(ifft1338, ifft1251);
__m512 ifft1260 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft1261 = _mm512_fnmadd_ps(ifft1244, ifft1260, ifft1252);
__m512 ifft1347 = _mm512_fnmadd_ps(ifft1332, ifft1260, ifft1339);
__m512 ifft1262 = _mm512_fmadd_ps(ifft1243, ifft1260, ifft1253);
__m512 ifft1348 = _mm512_fmadd_ps(ifft1331, ifft1260, ifft1340);
__m512 ifft1263 = _mm512_fnmadd_ps(ifft1246, ifft1260, ifft1254);
__m512 ifft1349 = _mm512_fnmadd_ps(ifft1334, ifft1260, ifft1341);
__m512 ifft1264 = _mm512_fmadd_ps(ifft1245, ifft1260, ifft1255);
__m512 ifft1350 = _mm512_fmadd_ps(ifft1333, ifft1260, ifft1342);
__m512 ifft1265 = _mm512_fnmadd_ps(ifft1248, ifft1260, ifft1256);
__m512 ifft1351 = _mm512_fnmadd_ps(ifft1336, ifft1260, ifft1343);
__m512 ifft1266 = _mm512_fmadd_ps(ifft1247, ifft1260, ifft1257);
__m512 ifft1352 = _mm512_fmadd_ps(ifft1335, ifft1260, ifft1344);
__m512 ifft1267 = _mm512_fnmadd_ps(ifft1250, ifft1260, ifft1258);
__m512 ifft1353 = _mm512_fnmadd_ps(ifft1338, ifft1260, ifft1345);
__m512 ifft1268 = _mm512_fmadd_ps(ifft1249, ifft1260, ifft1259);
__m512 ifft1354 = _mm512_fmadd_ps(ifft1337, ifft1260, ifft1346);
__m512 ifft1269 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1270 = _mm512_fmadd_ps(ifft1261, ifft1269, _mm512_shuffle_ps(ifft1261, ifft1261, 78));
__m512 ifft1355 = _mm512_fmadd_ps(ifft1347, ifft1269, _mm512_shuffle_ps(ifft1347, ifft1347, 78));
__m512 ifft1271 = _mm512_fmadd_ps(ifft1262, ifft1269, _mm512_shuffle_ps(ifft1262, ifft1262, 78));
__m512 ifft1356 = _mm512_fmadd_ps(ifft1348, ifft1269, _mm512_shuffle_ps(ifft1348, ifft1348, 78));
__m512 ifft1272 = _mm512_fmadd_ps(ifft1263, ifft1269, _mm512_shuffle_ps(ifft1263, ifft1263, 78));
__m512 ifft1357 = _mm512_fmadd_ps(ifft1349, ifft1269, _mm512_shuffle_ps(ifft1349, ifft1349, 78));
__m512 ifft1273 = _mm512_fmadd_ps(ifft1264, ifft1269, _mm512_shuffle_ps(ifft1264, ifft1264, 78));
__m512 ifft1358 = _mm512_fmadd_ps(ifft1350, ifft1269, _mm512_shuffle_ps(ifft1350, ifft1350, 78));
__m512 ifft1274 = _mm512_fmadd_ps(ifft1265, ifft1269, _mm512_shuffle_ps(ifft1265, ifft1265, 78));
__m512 ifft1359 = _mm512_fmadd_ps(ifft1351, ifft1269, _mm512_shuffle_ps(ifft1351, ifft1351, 78));
__m512 ifft1275 = _mm512_fmadd_ps(ifft1266, ifft1269, _mm512_shuffle_ps(ifft1266, ifft1266, 78));
__m512 ifft1360 = _mm512_fmadd_ps(ifft1352, ifft1269, _mm512_shuffle_ps(ifft1352, ifft1352, 78));
__m512 ifft1276 = _mm512_fmadd_ps(ifft1267, ifft1269, _mm512_shuffle_ps(ifft1267, ifft1267, 78));
__m512 ifft1361 = _mm512_fmadd_ps(ifft1353, ifft1269, _mm512_shuffle_ps(ifft1353, ifft1353, 78));
__m512 ifft1277 = _mm512_fmadd_ps(ifft1268, ifft1269, _mm512_shuffle_ps(ifft1268, ifft1268, 78));
__m512 ifft1362 = _mm512_fmadd_ps(ifft1354, ifft1269, _mm512_shuffle_ps(ifft1354, ifft1354, 78));
__m512 ifft1278 = _mm512_mask_sub_ps(ifft1270, 49344, _mm512_setzero_ps(), ifft1271);
__m512 ifft1363 = _mm512_mask_sub_ps(ifft1355, 49344, _mm512_setzero_ps(), ifft1356);
__m512 ifft1279 = _mm512_mask_mov_ps(ifft1271, 49344, ifft1270);
__m512 ifft1364 = _mm512_mask_mov_ps(ifft1356, 49344, ifft1355);
__m512 ifft1280 = _mm512_mask_sub_ps(ifft1272, 49344, _mm512_setzero_ps(), ifft1273);
__m512 ifft1365 = _mm512_mask_sub_ps(ifft1357, 49344, _mm512_setzero_ps(), ifft1358);
__m512 ifft1281 = _mm512_mask_mov_ps(ifft1273, 49344, ifft1272);
__m512 ifft1366 = _mm512_mask_mov_ps(ifft1358, 49344, ifft1357);
__m512 ifft1282 = _mm512_mask_sub_ps(ifft1274, 49344, _mm512_setzero_ps(), ifft1275);
__m512 ifft1367 = _mm512_mask_sub_ps(ifft1359, 49344, _mm512_setzero_ps(), ifft1360);
__m512 ifft1283 = _mm512_mask_mov_ps(ifft1275, 49344, ifft1274);
__m512 ifft1368 = _mm512_mask_mov_ps(ifft1360, 49344, ifft1359);
__m512 ifft1284 = _mm512_mask_sub_ps(ifft1276, 49344, _mm512_setzero_ps(), ifft1277);
__m512 ifft1369 = _mm512_mask_sub_ps(ifft1361, 49344, _mm512_setzero_ps(), ifft1362);
__m512 ifft1285 = _mm512_mask_mov_ps(ifft1277, 49344, ifft1276);
__m512 ifft1370 = _mm512_mask_mov_ps(ifft1362, 49344, ifft1361);
__m512 ifft1286 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1287 = _mm512_fmadd_ps(ifft1278, ifft1286, _mm512_shuffle_f32x4(ifft1278, ifft1278, 177));
__m512 ifft1371 = _mm512_fmadd_ps(ifft1363, ifft1286, _mm512_shuffle_f32x4(ifft1363, ifft1363, 177));
__m512 ifft1288 = _mm512_fmadd_ps(ifft1279, ifft1286, _mm512_shuffle_f32x4(ifft1279, ifft1279, 177));
__m512 ifft1372 = _mm512_fmadd_ps(ifft1364, ifft1286, _mm512_shuffle_f32x4(ifft1364, ifft1364, 177));
__m512 ifft1289 = _mm512_fmadd_ps(ifft1280, ifft1286, _mm512_shuffle_f32x4(ifft1280, ifft1280, 177));
__m512 ifft1373 = _mm512_fmadd_ps(ifft1365, ifft1286, _mm512_shuffle_f32x4(ifft1365, ifft1365, 177));
__m512 ifft1290 = _mm512_fmadd_ps(ifft1281, ifft1286, _mm512_shuffle_f32x4(ifft1281, ifft1281, 177));
__m512 ifft1374 = _mm512_fmadd_ps(ifft1366, ifft1286, _mm512_shuffle_f32x4(ifft1366, ifft1366, 177));
__m512 ifft1291 = _mm512_fmadd_ps(ifft1282, ifft1286, _mm512_shuffle_f32x4(ifft1282, ifft1282, 177));
__m512 ifft1375 = _mm512_fmadd_ps(ifft1367, ifft1286, _mm512_shuffle_f32x4(ifft1367, ifft1367, 177));
__m512 ifft1292 = _mm512_fnmsub_ps(ifft1283, ifft1286, _mm512_shuffle_f32x4(ifft1283, ifft1283, 177));
__m512 ifft1376 = _mm512_fnmsub_ps(ifft1368, ifft1286, _mm512_shuffle_f32x4(ifft1368, ifft1368, 177));
__m512 ifft1293 = _mm512_fmadd_ps(ifft1284, ifft1286, _mm512_shuffle_f32x4(ifft1284, ifft1284, 177));
__m512 ifft1377 = _mm512_fmadd_ps(ifft1369, ifft1286, _mm512_shuffle_f32x4(ifft1369, ifft1369, 177));
__m512 ifft1294 = _mm512_fmadd_ps(ifft1285, ifft1286, _mm512_shuffle_f32x4(ifft1285, ifft1285, 177));
__m512 ifft1378 = _mm512_fmadd_ps(ifft1370, ifft1286, _mm512_shuffle_f32x4(ifft1370, ifft1370, 177));
__m512 ifft1295 = _mm512_add_ps(ifft1287, ifft1288);
__m512 ifft1379 = _mm512_add_ps(ifft1371, ifft1372);
__m512 ifft1296 = _mm512_sub_ps(ifft1287, ifft1288);
__m512 ifft1380 = _mm512_sub_ps(ifft1371, ifft1372);
__m512 ifft1297 = _mm512_sub_ps(ifft1289, ifft1293);
__m512 ifft1381 = _mm512_sub_ps(ifft1373, ifft1377);
__m512 ifft1298 = _mm512_add_ps(ifft1290, ifft1294);
__m512 ifft1382 = _mm512_add_ps(ifft1374, ifft1378);
__m512 ifft1299 = _mm512_add_ps(ifft1289, ifft1293);
__m512 ifft1383 = _mm512_add_ps(ifft1373, ifft1377);
__m512 ifft1300 = _mm512_sub_ps(ifft1290, ifft1294);
__m512 ifft1384 = _mm512_sub_ps(ifft1374, ifft1378);
__m512 ifft1301 = _mm512_mul_ps(ifft1291, _mm512_set1_ps(3.125e-02f));
__m512 ifft1385 = _mm512_mul_ps(ifft1375, _mm512_set1_ps(3.125e-02f));
__m512 ifft1302 = _mm512_mul_ps(ifft1292, _mm512_set1_ps(3.125e-02f));
__m512 ifft1386 = _mm512_mul_ps(ifft1376, _mm512_set1_ps(3.125e-02f));
__m512 ifft1303 = _mm512_fmadd_ps(ifft1295, _mm512_set1_ps(1.5625e-02f), ifft1301);
__m512 ifft1387 = _mm512_fmadd_ps(ifft1379, _mm512_set1_ps(1.5625e-02f), ifft1385);
__m512 ifft1304 = _mm512_fmsub_ps(ifft1295, _mm512_set1_ps(1.5625e-02f), ifft1301);
__m512 ifft1388 = _mm512_fmsub_ps(ifft1379, _mm512_set1_ps(1.5625e-02f), ifft1385);
__m512 ifft1305 = _mm512_fmadd_ps(ifft1296, _mm512_set1_ps(1.5625e-02f), ifft1302);
__m512 ifft1389 = _mm512_fmadd_ps(ifft1380, _mm512_set1_ps(1.5625e-02f), ifft1386);
__m512 ifft1306 = _mm512_fmsub_ps(ifft1296, _mm512_set1_ps(1.5625e-02f), ifft1302);
__m512 ifft1390 = _mm512_fmsub_ps(ifft1380, _mm512_set1_ps(1.5625e-02f), ifft1386);
__m512 ifft1307 = _mm512_add_ps(ifft1297, ifft1298);
__m512 ifft1391 = _mm512_add_ps(ifft1381, ifft1382);
__m512 ifft1308 = _mm512_sub_ps(ifft1297, ifft1298);
__m512 ifft1392 = _mm512_sub_ps(ifft1381, ifft1382);
__m512 ifft1309 = _mm512_fnmadd_ps(ifft1307, _mm512_set1_ps(7.0710677e-01f), ifft1299);
__m512 ifft1393 = _mm512_fnmadd_ps(ifft1391, _mm512_set1_ps(7.0710677e-01f), ifft1383);
__m512 ifft1310 = _mm512_fmadd_ps(ifft1307, _mm512_set1_ps(7.0710677e-01f), ifft1299);
__m512 ifft1394 = _mm512_fmadd_ps(ifft1391, _mm512_set1_ps(7.0710677e-01f), ifft1383);
__m512 ifft1311 = _mm512_fmadd_ps(ifft1308, _mm512_set1_ps(7.0710677e-01f), ifft1300);
__m512 ifft1395 = _mm512_fmadd_ps(ifft1392, _mm512_set1_ps(7.0710677e-01f), ifft1384);
__m512 ifft1312 = _mm512_fmsub_ps(ifft1308, _mm512_set1_ps(7.0710677e-01f), ifft1300);
__m512 ifft1396 = _mm512_fmsub_ps(ifft1392, _mm512_set1_ps(7.0710677e-01f), ifft1384);
__m512 ifft1313 = _mm512_add_ps(ifft1309, ifft1310);
__m512 ifft1397 = _mm512_add_ps(ifft1393, ifft1394);
__m512 ifft1314 = _mm512_sub_ps(ifft1309, ifft1310);
__m512 ifft1398 = _mm512_sub_ps(ifft1393, ifft1394);
__m512 ifft1315 = _mm512_add_ps(ifft1311, ifft1312);
__m512 ifft1399 = _mm512_add_ps(ifft1395, ifft1396);
__m512 ifft1316 = _mm512_sub_ps(ifft1311, ifft1312);
__m512 ifft1400 = _mm512_sub_ps(ifft1395, ifft1396);
__m512 ifft1317 = _mm512_fmadd_ps(ifft1313, _mm512_set1_ps(1.5625e-02f), ifft1303);
__m512 ifft1401 = _mm512_fmadd_ps(ifft1397, _mm512_set1_ps(1.5625e-02f), ifft1387);
__m512 ifft1318 = _mm512_fnmadd_ps(ifft1313, _mm512_set1_ps(1.5625e-02f), ifft1303);
__m512 ifft1402 = _mm512_fnmadd_ps(ifft1397, _mm512_set1_ps(1.5625e-02f), ifft1387);
__m512 ifft1319 = _mm512_fmadd_ps(ifft1315, _mm512_set1_ps(1.5625e-02f), ifft1305);
__m512 ifft1403 = _mm512_fmadd_ps(ifft1399, _mm512_set1_ps(1.5625e-02f), ifft1389);
__m512 ifft1320 = _mm512_fnmadd_ps(ifft1315, _mm512_set1_ps(1.5625e-02f), ifft1305);
__m512 ifft1404 = _mm512_fnmadd_ps(ifft1399, _mm512_set1_ps(1.5625e-02f), ifft1389);
__m512 ifft1321 = _mm512_fnmadd_ps(ifft1316, _mm512_set1_ps(1.5625e-02f), ifft1304);
__m512 ifft1405 = _mm512_fnmadd_ps(ifft1400, _mm512_set1_ps(1.5625e-02f), ifft1388);
__m512 ifft1322 = _mm512_fmadd_ps(ifft1316, _mm512_set1_ps(1.5625e-02f), ifft1304);
__m512 ifft1406 = _mm512_fmadd_ps(ifft1400, _mm512_set1_ps(1.5625e-02f), ifft1388);
__m512 ifft1323 = _mm512_fmadd_ps(ifft1314, _mm512_set1_ps(1.5625e-02f), ifft1306);
__m512 ifft1407 = _mm512_fmadd_ps(ifft1398, _mm512_set1_ps(1.5625e-02f), ifft1390);
__m512 ifft1324 = _mm512_fnmadd_ps(ifft1314, _mm512_set1_ps(1.5625e-02f), ifft1306);
__m512 ifft1408 = _mm512_fnmadd_ps(ifft1398, _mm512_set1_ps(1.5625e-02f), ifft1390);
__m512 dat660 = ifft1317;
__m512 dat665 = ifft1401;
__m512 dat661 = ifft1319;
__m512 dat666 = ifft1403;
__m512 dat662 = ifft1321;
__m512 dat667 = ifft1405;
__m512 dat663 = ifft1323;
__m512 dat668 = ifft1407;
__m512 dat664 = ifft1318;
__m512 dat669 = ifft1402;
(void)ifft1320;
(void)ifft1404;
(void)ifft1322;
(void)ifft1406;
(void)ifft1324;
(void)ifft1408;
__m512i pm13 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack61 = _mm512_permutex2var_ps(dat660, pm13, dat665);
__m512i pm14 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack62 = _mm512_permutex2var_ps(dat660, pm14, dat665);
__m512 pack63 = _mm512_permutex2var_ps(dat661, pm13, dat666);
__m512 pack64 = _mm512_permutex2var_ps(dat661, pm14, dat666);
__m512 pack65 = _mm512_permutex2var_ps(dat662, pm13, dat667);
__m512 pack66 = _mm512_permutex2var_ps(dat662, pm14, dat667);
__m512 pack67 = _mm512_permutex2var_ps(dat663, pm13, dat668);
__m512 pack68 = _mm512_permutex2var_ps(dat663, pm14, dat668);
__m512 pack69 = _mm512_permutex2var_ps(dat664, pm13, dat669);
__m512 pack70 = _mm512_permutex2var_ps(dat664, pm14, dat669);
pack61 = _mm512_max_ps(_mm512_setzero_ps(), pack61);
pack62 = _mm512_max_ps(_mm512_setzero_ps(), pack62);
pack63 = _mm512_max_ps(_mm512_setzero_ps(), pack63);
pack64 = _mm512_max_ps(_mm512_setzero_ps(), pack64);
pack65 = _mm512_max_ps(_mm512_setzero_ps(), pack65);
pack66 = _mm512_max_ps(_mm512_setzero_ps(), pack66);
pack67 = _mm512_max_ps(_mm512_setzero_ps(), pack67);
pack68 = _mm512_max_ps(_mm512_setzero_ps(), pack68);
pack69 = _mm512_max_ps(_mm512_setzero_ps(), pack69);
pack70 = _mm512_max_ps(_mm512_setzero_ps(), pack70);
_mm512_mask_storeu_ps(datPtr2+40+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack61);
_mm512_mask_storeu_ps(datPtr2+50280+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack62);
_mm512_mask_storeu_ps(datPtr2+488+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack63);
_mm512_mask_storeu_ps(datPtr2+50728+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack64);
_mm512_mask_storeu_ps(datPtr2+936+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack65);
_mm512_mask_storeu_ps(datPtr2+51176+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack66);
_mm512_mask_storeu_ps(datPtr2+1384+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack67);
_mm512_mask_storeu_ps(datPtr2+51624+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack68);
_mm512_mask_storeu_ps(datPtr2+1832+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack69);
_mm512_mask_storeu_ps(datPtr2+52072+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack70);
// Load the frequency-domain inputs for the next tile. t10 is fixed at 0, so every
// 256*t10 term contributes nothing and the constant base offset (+512, ...) alone
// picks the group that is read.
// Sixteen 512-bit vectors are read from sfPtr3: four Re/Im pairs for a first data
// set (sfRe121..124 / sfIm121..124) and four for a second (sfRe125..128 /
// sfIm125..128). Within a group the four vectors are 64 apart; successive groups
// are 2166784 apart.
// NOTE(review): the 64-step suggests byte offsets (one __m512 = 64 bytes) — confirm
// against the declaration of sfPtr3.
ptrdiff_t t10 = 0;
__m512 sfRe121 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm121 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe125 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm125 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe122 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm122 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe126 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm126 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe123 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm123 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe127 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm127 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe124 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm124 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe128 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm128 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
// Pre-processing applied only to the first Re/Im pair of each data set
// (sfRe121/sfIm121 and sfRe125/sfIm125).
// ifft1409 gathers source lanes {0,1,2,2,4,6,6,4} into lanes 0-7 (and the same
// pattern +8 into lanes 8-15).
__m512i ifft1409 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1410 = _mm512_permutexvar_ps(ifft1409, sfRe121);
__m512 ifft1501 = _mm512_permutexvar_ps(ifft1409, sfRe125);
// ifft1411 gathers source lanes {1,0,3,3,5,7,7,5} (+8 for the upper half).
__m512i ifft1411 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1412 = _mm512_permutexvar_ps(ifft1411, sfRe121);
__m512 ifft1502 = _mm512_permutexvar_ps(ifft1411, sfRe125);
__m512 ifft1413 = _mm512_permutexvar_ps(ifft1409, sfIm121);
__m512 ifft1503 = _mm512_permutexvar_ps(ifft1409, sfIm125);
__m512 ifft1414 = _mm512_permutexvar_ps(ifft1411, sfIm121);
__m512 ifft1504 = _mm512_permutexvar_ps(ifft1411, sfIm125);
// Per-lane factors, lanes 0..7 = {0, 0, -1, 1, -1, 1, -1, 1}, repeated for 8..15.
__m512 ifft1415 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
// Mask 65021 (0xFDFD) covers every lane except 1 and 9.
// Masked lanes: ifft1416 = ifft1414*ifft1415 + ifft1410; lanes 1 and 9 keep ifft1414.
__m512 ifft1416 = _mm512_mask_fmadd_ps(ifft1414, 65021, ifft1415, ifft1410);
__m512 ifft1505 = _mm512_mask_fmadd_ps(ifft1504, 65021, ifft1415, ifft1501);
// Masked lanes: ifft1417 = ifft1412 - ifft1413*ifft1415; lanes 1 and 9 keep ifft1413.
__m512 ifft1417 = _mm512_mask_fnmadd_ps(ifft1413, 65021, ifft1415, ifft1412);
__m512 ifft1506 = _mm512_mask_fnmadd_ps(ifft1503, 65021, ifft1415, ifft1502);
// Butterflies between adjacent lanes. ifft1418 holds +1 in even lanes and -1 in odd
// lanes; shuffle_ps(x, x, 177) swaps the two floats of every adjacent pair.
// x*sign + swap(x) therefore leaves (x_even + x_odd) in each even lane and
// (x_even - x_odd) in each odd lane.
// Applied to the pre-processed first pair (ifft1416/ifft1417, ifft1505/ifft1506)
// and directly to the remaining loaded Re/Im vectors of both data sets.
__m512 ifft1418 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1419 = _mm512_fmadd_ps(ifft1416, ifft1418, _mm512_shuffle_ps(ifft1416, ifft1416, 177));
__m512 ifft1507 = _mm512_fmadd_ps(ifft1505, ifft1418, _mm512_shuffle_ps(ifft1505, ifft1505, 177));
__m512 ifft1420 = _mm512_fmadd_ps(ifft1417, ifft1418, _mm512_shuffle_ps(ifft1417, ifft1417, 177));
__m512 ifft1508 = _mm512_fmadd_ps(ifft1506, ifft1418, _mm512_shuffle_ps(ifft1506, ifft1506, 177));
__m512 ifft1421 = _mm512_fmadd_ps(sfRe122, ifft1418, _mm512_shuffle_ps(sfRe122, sfRe122, 177));
__m512 ifft1509 = _mm512_fmadd_ps(sfRe126, ifft1418, _mm512_shuffle_ps(sfRe126, sfRe126, 177));
__m512 ifft1422 = _mm512_fmadd_ps(sfIm122, ifft1418, _mm512_shuffle_ps(sfIm122, sfIm122, 177));
__m512 ifft1510 = _mm512_fmadd_ps(sfIm126, ifft1418, _mm512_shuffle_ps(sfIm126, sfIm126, 177));
__m512 ifft1423 = _mm512_fmadd_ps(sfRe123, ifft1418, _mm512_shuffle_ps(sfRe123, sfRe123, 177));
__m512 ifft1511 = _mm512_fmadd_ps(sfRe127, ifft1418, _mm512_shuffle_ps(sfRe127, sfRe127, 177));
__m512 ifft1424 = _mm512_fmadd_ps(sfIm123, ifft1418, _mm512_shuffle_ps(sfIm123, sfIm123, 177));
__m512 ifft1512 = _mm512_fmadd_ps(sfIm127, ifft1418, _mm512_shuffle_ps(sfIm127, sfIm127, 177));
__m512 ifft1425 = _mm512_fmadd_ps(sfRe124, ifft1418, _mm512_shuffle_ps(sfRe124, sfRe124, 177));
__m512 ifft1513 = _mm512_fmadd_ps(sfRe128, ifft1418, _mm512_shuffle_ps(sfRe128, sfRe128, 177));
__m512 ifft1426 = _mm512_fmadd_ps(sfIm124, ifft1418, _mm512_shuffle_ps(sfIm124, sfIm124, 177));
__m512 ifft1514 = _mm512_fmadd_ps(sfIm128, ifft1418, _mm512_shuffle_ps(sfIm128, sfIm128, 177));
// Per-lane complex multiply of each (x, y) register pair by a constant (c + i*s):
//   x' = x*c - y*s,   y' = x*s + y*c
// The pairs are (ifft1419, ifft1420), (ifft1421, ifft1422), (ifft1423, ifft1424),
// (ifft1425, ifft1426) and their second-data-set counterparts.
// c (ifft1427), lanes 0..7 = {1, 1, 1, 0, 1, sqrt(1/2), 1, -sqrt(1/2)}, repeated for 8..15.
__m512 ifft1427 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
// First the c-scaled terms x*c and y*c.
__m512 ifft1428 = _mm512_mul_ps(ifft1419, ifft1427);
__m512 ifft1515 = _mm512_mul_ps(ifft1507, ifft1427);
__m512 ifft1429 = _mm512_mul_ps(ifft1420, ifft1427);
__m512 ifft1516 = _mm512_mul_ps(ifft1508, ifft1427);
__m512 ifft1430 = _mm512_mul_ps(ifft1421, ifft1427);
__m512 ifft1517 = _mm512_mul_ps(ifft1509, ifft1427);
__m512 ifft1431 = _mm512_mul_ps(ifft1422, ifft1427);
__m512 ifft1518 = _mm512_mul_ps(ifft1510, ifft1427);
__m512 ifft1432 = _mm512_mul_ps(ifft1423, ifft1427);
__m512 ifft1519 = _mm512_mul_ps(ifft1511, ifft1427);
__m512 ifft1433 = _mm512_mul_ps(ifft1424, ifft1427);
__m512 ifft1520 = _mm512_mul_ps(ifft1512, ifft1427);
__m512 ifft1434 = _mm512_mul_ps(ifft1425, ifft1427);
__m512 ifft1521 = _mm512_mul_ps(ifft1513, ifft1427);
__m512 ifft1435 = _mm512_mul_ps(ifft1426, ifft1427);
__m512 ifft1522 = _mm512_mul_ps(ifft1514, ifft1427);
// s (ifft1436), lanes 0..7 = {0, 0, 0, 1, 0, sqrt(1/2), 0, sqrt(1/2)}, repeated for 8..15.
__m512 ifft1436 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
// fnmadd gives x*c - y*s; fmadd gives y*c + x*s.
__m512 ifft1437 = _mm512_fnmadd_ps(ifft1420, ifft1436, ifft1428);
__m512 ifft1523 = _mm512_fnmadd_ps(ifft1508, ifft1436, ifft1515);
__m512 ifft1438 = _mm512_fmadd_ps(ifft1419, ifft1436, ifft1429);
__m512 ifft1524 = _mm512_fmadd_ps(ifft1507, ifft1436, ifft1516);
__m512 ifft1439 = _mm512_fnmadd_ps(ifft1422, ifft1436, ifft1430);
__m512 ifft1525 = _mm512_fnmadd_ps(ifft1510, ifft1436, ifft1517);
__m512 ifft1440 = _mm512_fmadd_ps(ifft1421, ifft1436, ifft1431);
__m512 ifft1526 = _mm512_fmadd_ps(ifft1509, ifft1436, ifft1518);
__m512 ifft1441 = _mm512_fnmadd_ps(ifft1424, ifft1436, ifft1432);
__m512 ifft1527 = _mm512_fnmadd_ps(ifft1512, ifft1436, ifft1519);
__m512 ifft1442 = _mm512_fmadd_ps(ifft1423, ifft1436, ifft1433);
__m512 ifft1528 = _mm512_fmadd_ps(ifft1511, ifft1436, ifft1520);
__m512 ifft1443 = _mm512_fnmadd_ps(ifft1426, ifft1436, ifft1434);
__m512 ifft1529 = _mm512_fnmadd_ps(ifft1514, ifft1436, ifft1521);
__m512 ifft1444 = _mm512_fmadd_ps(ifft1425, ifft1436, ifft1435);
__m512 ifft1530 = _mm512_fmadd_ps(ifft1513, ifft1436, ifft1522);
// Butterflies at a distance of two lanes. ifft1445 is +1 in lanes 0,1 and -1 in
// lanes 2,3 of every group of four; shuffle_ps(x, x, 78) swaps the two 64-bit
// halves of each 128-bit group. x*sign + swap(x) leaves (x0+x2, x1+x3) in the low
// half of each group and (x0-x2, x1-x3) in the high half.
__m512 ifft1445 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1446 = _mm512_fmadd_ps(ifft1437, ifft1445, _mm512_shuffle_ps(ifft1437, ifft1437, 78));
__m512 ifft1531 = _mm512_fmadd_ps(ifft1523, ifft1445, _mm512_shuffle_ps(ifft1523, ifft1523, 78));
__m512 ifft1447 = _mm512_fmadd_ps(ifft1438, ifft1445, _mm512_shuffle_ps(ifft1438, ifft1438, 78));
__m512 ifft1532 = _mm512_fmadd_ps(ifft1524, ifft1445, _mm512_shuffle_ps(ifft1524, ifft1524, 78));
__m512 ifft1448 = _mm512_fmadd_ps(ifft1439, ifft1445, _mm512_shuffle_ps(ifft1439, ifft1439, 78));
__m512 ifft1533 = _mm512_fmadd_ps(ifft1525, ifft1445, _mm512_shuffle_ps(ifft1525, ifft1525, 78));
__m512 ifft1449 = _mm512_fmadd_ps(ifft1440, ifft1445, _mm512_shuffle_ps(ifft1440, ifft1440, 78));
__m512 ifft1534 = _mm512_fmadd_ps(ifft1526, ifft1445, _mm512_shuffle_ps(ifft1526, ifft1526, 78));
__m512 ifft1450 = _mm512_fmadd_ps(ifft1441, ifft1445, _mm512_shuffle_ps(ifft1441, ifft1441, 78));
__m512 ifft1535 = _mm512_fmadd_ps(ifft1527, ifft1445, _mm512_shuffle_ps(ifft1527, ifft1527, 78));
__m512 ifft1451 = _mm512_fmadd_ps(ifft1442, ifft1445, _mm512_shuffle_ps(ifft1442, ifft1442, 78));
__m512 ifft1536 = _mm512_fmadd_ps(ifft1528, ifft1445, _mm512_shuffle_ps(ifft1528, ifft1528, 78));
__m512 ifft1452 = _mm512_fmadd_ps(ifft1443, ifft1445, _mm512_shuffle_ps(ifft1443, ifft1443, 78));
__m512 ifft1537 = _mm512_fmadd_ps(ifft1529, ifft1445, _mm512_shuffle_ps(ifft1529, ifft1529, 78));
__m512 ifft1453 = _mm512_fmadd_ps(ifft1444, ifft1445, _mm512_shuffle_ps(ifft1444, ifft1444, 78));
__m512 ifft1538 = _mm512_fmadd_ps(ifft1530, ifft1445, _mm512_shuffle_ps(ifft1530, ifft1530, 78));
// Lane-selective rotation of each (x, y) register pair. Mask 49344 (0xC0C0) covers
// lanes 6, 7, 14 and 15. In those lanes the pair becomes (-y, x): mask_sub writes
// 0 - y into the first register and mask_mov copies x into the second. That is a
// multiplication by i when the pair is read as x + i*y. All other lanes pass
// through unchanged.
__m512 ifft1454 = _mm512_mask_sub_ps(ifft1446, 49344, _mm512_setzero_ps(), ifft1447);
__m512 ifft1539 = _mm512_mask_sub_ps(ifft1531, 49344, _mm512_setzero_ps(), ifft1532);
__m512 ifft1455 = _mm512_mask_mov_ps(ifft1447, 49344, ifft1446);
__m512 ifft1540 = _mm512_mask_mov_ps(ifft1532, 49344, ifft1531);
__m512 ifft1456 = _mm512_mask_sub_ps(ifft1448, 49344, _mm512_setzero_ps(), ifft1449);
__m512 ifft1541 = _mm512_mask_sub_ps(ifft1533, 49344, _mm512_setzero_ps(), ifft1534);
__m512 ifft1457 = _mm512_mask_mov_ps(ifft1449, 49344, ifft1448);
__m512 ifft1542 = _mm512_mask_mov_ps(ifft1534, 49344, ifft1533);
__m512 ifft1458 = _mm512_mask_sub_ps(ifft1450, 49344, _mm512_setzero_ps(), ifft1451);
__m512 ifft1543 = _mm512_mask_sub_ps(ifft1535, 49344, _mm512_setzero_ps(), ifft1536);
__m512 ifft1459 = _mm512_mask_mov_ps(ifft1451, 49344, ifft1450);
__m512 ifft1544 = _mm512_mask_mov_ps(ifft1536, 49344, ifft1535);
__m512 ifft1460 = _mm512_mask_sub_ps(ifft1452, 49344, _mm512_setzero_ps(), ifft1453);
__m512 ifft1545 = _mm512_mask_sub_ps(ifft1537, 49344, _mm512_setzero_ps(), ifft1538);
__m512 ifft1461 = _mm512_mask_mov_ps(ifft1453, 49344, ifft1452);
__m512 ifft1546 = _mm512_mask_mov_ps(ifft1538, 49344, ifft1537);
// Butterflies between neighbouring 128-bit groups. ifft1462 is +1 in lanes 0-3 and
// 8-11, -1 in lanes 4-7 and 12-15; shuffle_f32x4(x, x, 177) swaps groups 0<->1 and
// 2<->3. x*sign + swap(x) leaves (g0 + g1) in the even group and (g0 - g1) in the
// odd group.
__m512 ifft1462 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1463 = _mm512_fmadd_ps(ifft1454, ifft1462, _mm512_shuffle_f32x4(ifft1454, ifft1454, 177));
__m512 ifft1547 = _mm512_fmadd_ps(ifft1539, ifft1462, _mm512_shuffle_f32x4(ifft1539, ifft1539, 177));
__m512 ifft1464 = _mm512_fmadd_ps(ifft1455, ifft1462, _mm512_shuffle_f32x4(ifft1455, ifft1455, 177));
__m512 ifft1548 = _mm512_fmadd_ps(ifft1540, ifft1462, _mm512_shuffle_f32x4(ifft1540, ifft1540, 177));
__m512 ifft1465 = _mm512_fmadd_ps(ifft1456, ifft1462, _mm512_shuffle_f32x4(ifft1456, ifft1456, 177));
__m512 ifft1549 = _mm512_fmadd_ps(ifft1541, ifft1462, _mm512_shuffle_f32x4(ifft1541, ifft1541, 177));
__m512 ifft1466 = _mm512_fmadd_ps(ifft1457, ifft1462, _mm512_shuffle_f32x4(ifft1457, ifft1457, 177));
__m512 ifft1550 = _mm512_fmadd_ps(ifft1542, ifft1462, _mm512_shuffle_f32x4(ifft1542, ifft1542, 177));
__m512 ifft1467 = _mm512_fmadd_ps(ifft1458, ifft1462, _mm512_shuffle_f32x4(ifft1458, ifft1458, 177));
__m512 ifft1551 = _mm512_fmadd_ps(ifft1543, ifft1462, _mm512_shuffle_f32x4(ifft1543, ifft1543, 177));
// This one register per data set uses fnmsub, i.e. -(x*sign) - swap(x): the same
// butterfly with the result negated.
__m512 ifft1468 = _mm512_fnmsub_ps(ifft1459, ifft1462, _mm512_shuffle_f32x4(ifft1459, ifft1459, 177));
__m512 ifft1552 = _mm512_fnmsub_ps(ifft1544, ifft1462, _mm512_shuffle_f32x4(ifft1544, ifft1544, 177));
__m512 ifft1469 = _mm512_fmadd_ps(ifft1460, ifft1462, _mm512_shuffle_f32x4(ifft1460, ifft1460, 177));
__m512 ifft1553 = _mm512_fmadd_ps(ifft1545, ifft1462, _mm512_shuffle_f32x4(ifft1545, ifft1545, 177));
__m512 ifft1470 = _mm512_fmadd_ps(ifft1461, ifft1462, _mm512_shuffle_f32x4(ifft1461, ifft1461, 177));
__m512 ifft1554 = _mm512_fmadd_ps(ifft1546, ifft1462, _mm512_shuffle_f32x4(ifft1546, ifft1546, 177));
// Whole-register butterflies: from here on values are combined across registers
// (same lane, different register) rather than across lanes. Each data set's eight
// inputs (ifft1463..1470 / ifft1547..1554) become eight outputs
// (ifft1493..1500 / ifft1577..1584).
// Constants: 1.5625e-02 = 1/64, 3.125e-02 = 1/32, 7.0710677e-01 = sqrt(1/2).
// NOTE(review): presumably the second-dimension pass of a 2-D inverse FFT with the
// normalisation folded into the FMAs — confirm against the generator.
// Sums and differences of register pairs.
__m512 ifft1471 = _mm512_add_ps(ifft1463, ifft1464);
__m512 ifft1555 = _mm512_add_ps(ifft1547, ifft1548);
__m512 ifft1472 = _mm512_sub_ps(ifft1463, ifft1464);
__m512 ifft1556 = _mm512_sub_ps(ifft1547, ifft1548);
__m512 ifft1473 = _mm512_sub_ps(ifft1465, ifft1469);
__m512 ifft1557 = _mm512_sub_ps(ifft1549, ifft1553);
__m512 ifft1474 = _mm512_add_ps(ifft1466, ifft1470);
__m512 ifft1558 = _mm512_add_ps(ifft1550, ifft1554);
__m512 ifft1475 = _mm512_add_ps(ifft1465, ifft1469);
__m512 ifft1559 = _mm512_add_ps(ifft1549, ifft1553);
__m512 ifft1476 = _mm512_sub_ps(ifft1466, ifft1470);
__m512 ifft1560 = _mm512_sub_ps(ifft1550, ifft1554);
// Two registers are only scaled by 1/32.
__m512 ifft1477 = _mm512_mul_ps(ifft1467, _mm512_set1_ps(3.125e-02f));
__m512 ifft1561 = _mm512_mul_ps(ifft1551, _mm512_set1_ps(3.125e-02f));
__m512 ifft1478 = _mm512_mul_ps(ifft1468, _mm512_set1_ps(3.125e-02f));
__m512 ifft1562 = _mm512_mul_ps(ifft1552, _mm512_set1_ps(3.125e-02f));
// (sum or difference)/64 plus/minus the 1/32-scaled terms.
__m512 ifft1479 = _mm512_fmadd_ps(ifft1471, _mm512_set1_ps(1.5625e-02f), ifft1477);
__m512 ifft1563 = _mm512_fmadd_ps(ifft1555, _mm512_set1_ps(1.5625e-02f), ifft1561);
__m512 ifft1480 = _mm512_fmsub_ps(ifft1471, _mm512_set1_ps(1.5625e-02f), ifft1477);
__m512 ifft1564 = _mm512_fmsub_ps(ifft1555, _mm512_set1_ps(1.5625e-02f), ifft1561);
__m512 ifft1481 = _mm512_fmadd_ps(ifft1472, _mm512_set1_ps(1.5625e-02f), ifft1478);
__m512 ifft1565 = _mm512_fmadd_ps(ifft1556, _mm512_set1_ps(1.5625e-02f), ifft1562);
__m512 ifft1482 = _mm512_fmsub_ps(ifft1472, _mm512_set1_ps(1.5625e-02f), ifft1478);
__m512 ifft1566 = _mm512_fmsub_ps(ifft1556, _mm512_set1_ps(1.5625e-02f), ifft1562);
__m512 ifft1483 = _mm512_add_ps(ifft1473, ifft1474);
__m512 ifft1567 = _mm512_add_ps(ifft1557, ifft1558);
__m512 ifft1484 = _mm512_sub_ps(ifft1473, ifft1474);
__m512 ifft1568 = _mm512_sub_ps(ifft1557, ifft1558);
// Terms weighted by sqrt(1/2).
__m512 ifft1485 = _mm512_fnmadd_ps(ifft1483, _mm512_set1_ps(7.0710677e-01f), ifft1475);
__m512 ifft1569 = _mm512_fnmadd_ps(ifft1567, _mm512_set1_ps(7.0710677e-01f), ifft1559);
__m512 ifft1486 = _mm512_fmadd_ps(ifft1483, _mm512_set1_ps(7.0710677e-01f), ifft1475);
__m512 ifft1570 = _mm512_fmadd_ps(ifft1567, _mm512_set1_ps(7.0710677e-01f), ifft1559);
__m512 ifft1487 = _mm512_fmadd_ps(ifft1484, _mm512_set1_ps(7.0710677e-01f), ifft1476);
__m512 ifft1571 = _mm512_fmadd_ps(ifft1568, _mm512_set1_ps(7.0710677e-01f), ifft1560);
__m512 ifft1488 = _mm512_fmsub_ps(ifft1484, _mm512_set1_ps(7.0710677e-01f), ifft1476);
__m512 ifft1572 = _mm512_fmsub_ps(ifft1568, _mm512_set1_ps(7.0710677e-01f), ifft1560);
__m512 ifft1489 = _mm512_add_ps(ifft1485, ifft1486);
__m512 ifft1573 = _mm512_add_ps(ifft1569, ifft1570);
__m512 ifft1490 = _mm512_sub_ps(ifft1485, ifft1486);
__m512 ifft1574 = _mm512_sub_ps(ifft1569, ifft1570);
__m512 ifft1491 = _mm512_add_ps(ifft1487, ifft1488);
__m512 ifft1575 = _mm512_add_ps(ifft1571, ifft1572);
__m512 ifft1492 = _mm512_sub_ps(ifft1487, ifft1488);
__m512 ifft1576 = _mm512_sub_ps(ifft1571, ifft1572);
// Final combination: each output is (earlier partial) +/- (later partial)/64.
__m512 ifft1493 = _mm512_fmadd_ps(ifft1489, _mm512_set1_ps(1.5625e-02f), ifft1479);
__m512 ifft1577 = _mm512_fmadd_ps(ifft1573, _mm512_set1_ps(1.5625e-02f), ifft1563);
__m512 ifft1494 = _mm512_fnmadd_ps(ifft1489, _mm512_set1_ps(1.5625e-02f), ifft1479);
__m512 ifft1578 = _mm512_fnmadd_ps(ifft1573, _mm512_set1_ps(1.5625e-02f), ifft1563);
__m512 ifft1495 = _mm512_fmadd_ps(ifft1491, _mm512_set1_ps(1.5625e-02f), ifft1481);
__m512 ifft1579 = _mm512_fmadd_ps(ifft1575, _mm512_set1_ps(1.5625e-02f), ifft1565);
__m512 ifft1496 = _mm512_fnmadd_ps(ifft1491, _mm512_set1_ps(1.5625e-02f), ifft1481);
__m512 ifft1580 = _mm512_fnmadd_ps(ifft1575, _mm512_set1_ps(1.5625e-02f), ifft1565);
__m512 ifft1497 = _mm512_fnmadd_ps(ifft1492, _mm512_set1_ps(1.5625e-02f), ifft1480);
__m512 ifft1581 = _mm512_fnmadd_ps(ifft1576, _mm512_set1_ps(1.5625e-02f), ifft1564);
__m512 ifft1498 = _mm512_fmadd_ps(ifft1492, _mm512_set1_ps(1.5625e-02f), ifft1480);
__m512 ifft1582 = _mm512_fmadd_ps(ifft1576, _mm512_set1_ps(1.5625e-02f), ifft1564);
__m512 ifft1499 = _mm512_fmadd_ps(ifft1490, _mm512_set1_ps(1.5625e-02f), ifft1482);
__m512 ifft1583 = _mm512_fmadd_ps(ifft1574, _mm512_set1_ps(1.5625e-02f), ifft1566);
__m512 ifft1500 = _mm512_fnmadd_ps(ifft1490, _mm512_set1_ps(1.5625e-02f), ifft1482);
__m512 ifft1584 = _mm512_fnmadd_ps(ifft1574, _mm512_set1_ps(1.5625e-02f), ifft1566);
__m512 dat670 = ifft1493;
__m512 dat675 = ifft1577;
__m512 dat671 = ifft1495;
__m512 dat676 = ifft1579;
__m512 dat672 = ifft1497;
__m512 dat677 = ifft1581;
__m512 dat673 = ifft1499;
__m512 dat678 = ifft1583;
__m512 dat674 = ifft1494;
__m512 dat679 = ifft1578;
(void)ifft1496;
(void)ifft1580;
(void)ifft1498;
(void)ifft1582;
(void)ifft1500;
(void)ifft1584;
__m512i pm15 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack71 = _mm512_permutex2var_ps(dat670, pm15, dat675);
__m512i pm16 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack72 = _mm512_permutex2var_ps(dat670, pm16, dat675);
__m512 pack73 = _mm512_permutex2var_ps(dat671, pm15, dat676);
__m512 pack74 = _mm512_permutex2var_ps(dat671, pm16, dat676);
__m512 pack75 = _mm512_permutex2var_ps(dat672, pm15, dat677);
__m512 pack76 = _mm512_permutex2var_ps(dat672, pm16, dat677);
__m512 pack77 = _mm512_permutex2var_ps(dat673, pm15, dat678);
__m512 pack78 = _mm512_permutex2var_ps(dat673, pm16, dat678);
__m512 pack79 = _mm512_permutex2var_ps(dat674, pm15, dat679);
__m512 pack80 = _mm512_permutex2var_ps(dat674, pm16, dat679);
pack71 = _mm512_max_ps(_mm512_setzero_ps(), pack71);
pack72 = _mm512_max_ps(_mm512_setzero_ps(), pack72);
pack73 = _mm512_max_ps(_mm512_setzero_ps(), pack73);
pack74 = _mm512_max_ps(_mm512_setzero_ps(), pack74);
pack75 = _mm512_max_ps(_mm512_setzero_ps(), pack75);
pack76 = _mm512_max_ps(_mm512_setzero_ps(), pack76);
pack77 = _mm512_max_ps(_mm512_setzero_ps(), pack77);
pack78 = _mm512_max_ps(_mm512_setzero_ps(), pack78);
pack79 = _mm512_max_ps(_mm512_setzero_ps(), pack79);
pack80 = _mm512_max_ps(_mm512_setzero_ps(), pack80);
_mm512_mask_storeu_ps(datPtr2+1860+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack71);
_mm512_mask_storeu_ps(datPtr2+52100+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack72);
_mm512_mask_storeu_ps(datPtr2+2308+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack73);
_mm512_mask_storeu_ps(datPtr2+52548+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack74);
_mm512_mask_storeu_ps(datPtr2+2756+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack75);
_mm512_mask_storeu_ps(datPtr2+52996+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack76);
_mm512_mask_storeu_ps(datPtr2+3204+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack77);
_mm512_mask_storeu_ps(datPtr2+53444+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack78);
_mm512_mask_storeu_ps(datPtr2+3652+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack79);
_mm512_mask_storeu_ps(datPtr2+53892+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack80);
}
}
if (j5 >= last2) return;
++j5;
rel5 = 4;
}
if (rel5 < 7) {
ptrdiff_t toH6 = base5+5;
ptrdiff_t toW6 = -110+30*rel5;
ptrdiff_t jj13 = 6-rel5+j5;
for (; j5 <= jj13; toW6 += 30) {
ptrdiff_t k30 = 16*w21;
for (; k30 != 16; ++k30) {
ptrdiff_t r7 = 0;
for (; r7 != 2; ++r7) {
ptrdiff_t t11 = 0;
for (; t11 < 3; ++t11) {
__m512 sfRe129 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm129 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe133 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm133 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe130 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm130 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe134 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm134 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe131 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm131 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe135 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm135 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe132 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm132 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe136 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm136 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512i ifft1585 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1586 = _mm512_permutexvar_ps(ifft1585, sfRe129);
__m512 ifft1677 = _mm512_permutexvar_ps(ifft1585, sfRe133);
__m512i ifft1587 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1588 = _mm512_permutexvar_ps(ifft1587, sfRe129);
__m512 ifft1678 = _mm512_permutexvar_ps(ifft1587, sfRe133);
__m512 ifft1589 = _mm512_permutexvar_ps(ifft1585, sfIm129);
__m512 ifft1679 = _mm512_permutexvar_ps(ifft1585, sfIm133);
__m512 ifft1590 = _mm512_permutexvar_ps(ifft1587, sfIm129);
__m512 ifft1680 = _mm512_permutexvar_ps(ifft1587, sfIm133);
__m512 ifft1591 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft1592 = _mm512_mask_fmadd_ps(ifft1590, 65021, ifft1591, ifft1586);
__m512 ifft1681 = _mm512_mask_fmadd_ps(ifft1680, 65021, ifft1591, ifft1677);
__m512 ifft1593 = _mm512_mask_fnmadd_ps(ifft1589, 65021, ifft1591, ifft1588);
__m512 ifft1682 = _mm512_mask_fnmadd_ps(ifft1679, 65021, ifft1591, ifft1678);
__m512 ifft1594 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1595 = _mm512_fmadd_ps(ifft1592, ifft1594, _mm512_shuffle_ps(ifft1592, ifft1592, 177));
__m512 ifft1683 = _mm512_fmadd_ps(ifft1681, ifft1594, _mm512_shuffle_ps(ifft1681, ifft1681, 177));
__m512 ifft1596 = _mm512_fmadd_ps(ifft1593, ifft1594, _mm512_shuffle_ps(ifft1593, ifft1593, 177));
__m512 ifft1684 = _mm512_fmadd_ps(ifft1682, ifft1594, _mm512_shuffle_ps(ifft1682, ifft1682, 177));
__m512 ifft1597 = _mm512_fmadd_ps(sfRe130, ifft1594, _mm512_shuffle_ps(sfRe130, sfRe130, 177));
__m512 ifft1685 = _mm512_fmadd_ps(sfRe134, ifft1594, _mm512_shuffle_ps(sfRe134, sfRe134, 177));
__m512 ifft1598 = _mm512_fmadd_ps(sfIm130, ifft1594, _mm512_shuffle_ps(sfIm130, sfIm130, 177));
__m512 ifft1686 = _mm512_fmadd_ps(sfIm134, ifft1594, _mm512_shuffle_ps(sfIm134, sfIm134, 177));
__m512 ifft1599 = _mm512_fmadd_ps(sfRe131, ifft1594, _mm512_shuffle_ps(sfRe131, sfRe131, 177));
__m512 ifft1687 = _mm512_fmadd_ps(sfRe135, ifft1594, _mm512_shuffle_ps(sfRe135, sfRe135, 177));
__m512 ifft1600 = _mm512_fmadd_ps(sfIm131, ifft1594, _mm512_shuffle_ps(sfIm131, sfIm131, 177));
__m512 ifft1688 = _mm512_fmadd_ps(sfIm135, ifft1594, _mm512_shuffle_ps(sfIm135, sfIm135, 177));
__m512 ifft1601 = _mm512_fmadd_ps(sfRe132, ifft1594, _mm512_shuffle_ps(sfRe132, sfRe132, 177));
__m512 ifft1689 = _mm512_fmadd_ps(sfRe136, ifft1594, _mm512_shuffle_ps(sfRe136, sfRe136, 177));
__m512 ifft1602 = _mm512_fmadd_ps(sfIm132, ifft1594, _mm512_shuffle_ps(sfIm132, sfIm132, 177));
__m512 ifft1690 = _mm512_fmadd_ps(sfIm136, ifft1594, _mm512_shuffle_ps(sfIm136, sfIm136, 177));
__m512 ifft1603 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft1604 = _mm512_mul_ps(ifft1595, ifft1603);
__m512 ifft1691 = _mm512_mul_ps(ifft1683, ifft1603);
__m512 ifft1605 = _mm512_mul_ps(ifft1596, ifft1603);
__m512 ifft1692 = _mm512_mul_ps(ifft1684, ifft1603);
__m512 ifft1606 = _mm512_mul_ps(ifft1597, ifft1603);
__m512 ifft1693 = _mm512_mul_ps(ifft1685, ifft1603);
__m512 ifft1607 = _mm512_mul_ps(ifft1598, ifft1603);
__m512 ifft1694 = _mm512_mul_ps(ifft1686, ifft1603);
__m512 ifft1608 = _mm512_mul_ps(ifft1599, ifft1603);
__m512 ifft1695 = _mm512_mul_ps(ifft1687, ifft1603);
__m512 ifft1609 = _mm512_mul_ps(ifft1600, ifft1603);
__m512 ifft1696 = _mm512_mul_ps(ifft1688, ifft1603);
__m512 ifft1610 = _mm512_mul_ps(ifft1601, ifft1603);
__m512 ifft1697 = _mm512_mul_ps(ifft1689, ifft1603);
__m512 ifft1611 = _mm512_mul_ps(ifft1602, ifft1603);
__m512 ifft1698 = _mm512_mul_ps(ifft1690, ifft1603);
__m512 ifft1612 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft1613 = _mm512_fnmadd_ps(ifft1596, ifft1612, ifft1604);
__m512 ifft1699 = _mm512_fnmadd_ps(ifft1684, ifft1612, ifft1691);
__m512 ifft1614 = _mm512_fmadd_ps(ifft1595, ifft1612, ifft1605);
__m512 ifft1700 = _mm512_fmadd_ps(ifft1683, ifft1612, ifft1692);
__m512 ifft1615 = _mm512_fnmadd_ps(ifft1598, ifft1612, ifft1606);
__m512 ifft1701 = _mm512_fnmadd_ps(ifft1686, ifft1612, ifft1693);
__m512 ifft1616 = _mm512_fmadd_ps(ifft1597, ifft1612, ifft1607);
__m512 ifft1702 = _mm512_fmadd_ps(ifft1685, ifft1612, ifft1694);
__m512 ifft1617 = _mm512_fnmadd_ps(ifft1600, ifft1612, ifft1608);
__m512 ifft1703 = _mm512_fnmadd_ps(ifft1688, ifft1612, ifft1695);
__m512 ifft1618 = _mm512_fmadd_ps(ifft1599, ifft1612, ifft1609);
__m512 ifft1704 = _mm512_fmadd_ps(ifft1687, ifft1612, ifft1696);
__m512 ifft1619 = _mm512_fnmadd_ps(ifft1602, ifft1612, ifft1610);
__m512 ifft1705 = _mm512_fnmadd_ps(ifft1690, ifft1612, ifft1697);
__m512 ifft1620 = _mm512_fmadd_ps(ifft1601, ifft1612, ifft1611);
__m512 ifft1706 = _mm512_fmadd_ps(ifft1689, ifft1612, ifft1698);
__m512 ifft1621 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1622 = _mm512_fmadd_ps(ifft1613, ifft1621, _mm512_shuffle_ps(ifft1613, ifft1613, 78));
__m512 ifft1707 = _mm512_fmadd_ps(ifft1699, ifft1621, _mm512_shuffle_ps(ifft1699, ifft1699, 78));
__m512 ifft1623 = _mm512_fmadd_ps(ifft1614, ifft1621, _mm512_shuffle_ps(ifft1614, ifft1614, 78));
__m512 ifft1708 = _mm512_fmadd_ps(ifft1700, ifft1621, _mm512_shuffle_ps(ifft1700, ifft1700, 78));
__m512 ifft1624 = _mm512_fmadd_ps(ifft1615, ifft1621, _mm512_shuffle_ps(ifft1615, ifft1615, 78));
__m512 ifft1709 = _mm512_fmadd_ps(ifft1701, ifft1621, _mm512_shuffle_ps(ifft1701, ifft1701, 78));
__m512 ifft1625 = _mm512_fmadd_ps(ifft1616, ifft1621, _mm512_shuffle_ps(ifft1616, ifft1616, 78));
__m512 ifft1710 = _mm512_fmadd_ps(ifft1702, ifft1621, _mm512_shuffle_ps(ifft1702, ifft1702, 78));
__m512 ifft1626 = _mm512_fmadd_ps(ifft1617, ifft1621, _mm512_shuffle_ps(ifft1617, ifft1617, 78));
__m512 ifft1711 = _mm512_fmadd_ps(ifft1703, ifft1621, _mm512_shuffle_ps(ifft1703, ifft1703, 78));
__m512 ifft1627 = _mm512_fmadd_ps(ifft1618, ifft1621, _mm512_shuffle_ps(ifft1618, ifft1618, 78));
__m512 ifft1712 = _mm512_fmadd_ps(ifft1704, ifft1621, _mm512_shuffle_ps(ifft1704, ifft1704, 78));
__m512 ifft1628 = _mm512_fmadd_ps(ifft1619, ifft1621, _mm512_shuffle_ps(ifft1619, ifft1619, 78));
__m512 ifft1713 = _mm512_fmadd_ps(ifft1705, ifft1621, _mm512_shuffle_ps(ifft1705, ifft1705, 78));
__m512 ifft1629 = _mm512_fmadd_ps(ifft1620, ifft1621, _mm512_shuffle_ps(ifft1620, ifft1620, 78));
__m512 ifft1714 = _mm512_fmadd_ps(ifft1706, ifft1621, _mm512_shuffle_ps(ifft1706, ifft1706, 78));
__m512 ifft1630 = _mm512_mask_sub_ps(ifft1622, 49344, _mm512_setzero_ps(), ifft1623);
__m512 ifft1715 = _mm512_mask_sub_ps(ifft1707, 49344, _mm512_setzero_ps(), ifft1708);
__m512 ifft1631 = _mm512_mask_mov_ps(ifft1623, 49344, ifft1622);
__m512 ifft1716 = _mm512_mask_mov_ps(ifft1708, 49344, ifft1707);
__m512 ifft1632 = _mm512_mask_sub_ps(ifft1624, 49344, _mm512_setzero_ps(), ifft1625);
__m512 ifft1717 = _mm512_mask_sub_ps(ifft1709, 49344, _mm512_setzero_ps(), ifft1710);
__m512 ifft1633 = _mm512_mask_mov_ps(ifft1625, 49344, ifft1624);
__m512 ifft1718 = _mm512_mask_mov_ps(ifft1710, 49344, ifft1709);
__m512 ifft1634 = _mm512_mask_sub_ps(ifft1626, 49344, _mm512_setzero_ps(), ifft1627);
__m512 ifft1719 = _mm512_mask_sub_ps(ifft1711, 49344, _mm512_setzero_ps(), ifft1712);
__m512 ifft1635 = _mm512_mask_mov_ps(ifft1627, 49344, ifft1626);
__m512 ifft1720 = _mm512_mask_mov_ps(ifft1712, 49344, ifft1711);
__m512 ifft1636 = _mm512_mask_sub_ps(ifft1628, 49344, _mm512_setzero_ps(), ifft1629);
__m512 ifft1721 = _mm512_mask_sub_ps(ifft1713, 49344, _mm512_setzero_ps(), ifft1714);
__m512 ifft1637 = _mm512_mask_mov_ps(ifft1629, 49344, ifft1628);
__m512 ifft1722 = _mm512_mask_mov_ps(ifft1714, 49344, ifft1713);
__m512 ifft1638 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1639 = _mm512_fmadd_ps(ifft1630, ifft1638, _mm512_shuffle_f32x4(ifft1630, ifft1630, 177));
__m512 ifft1723 = _mm512_fmadd_ps(ifft1715, ifft1638, _mm512_shuffle_f32x4(ifft1715, ifft1715, 177));
__m512 ifft1640 = _mm512_fmadd_ps(ifft1631, ifft1638, _mm512_shuffle_f32x4(ifft1631, ifft1631, 177));
__m512 ifft1724 = _mm512_fmadd_ps(ifft1716, ifft1638, _mm512_shuffle_f32x4(ifft1716, ifft1716, 177));
__m512 ifft1641 = _mm512_fmadd_ps(ifft1632, ifft1638, _mm512_shuffle_f32x4(ifft1632, ifft1632, 177));
__m512 ifft1725 = _mm512_fmadd_ps(ifft1717, ifft1638, _mm512_shuffle_f32x4(ifft1717, ifft1717, 177));
__m512 ifft1642 = _mm512_fmadd_ps(ifft1633, ifft1638, _mm512_shuffle_f32x4(ifft1633, ifft1633, 177));
__m512 ifft1726 = _mm512_fmadd_ps(ifft1718, ifft1638, _mm512_shuffle_f32x4(ifft1718, ifft1718, 177));
__m512 ifft1643 = _mm512_fmadd_ps(ifft1634, ifft1638, _mm512_shuffle_f32x4(ifft1634, ifft1634, 177));
__m512 ifft1727 = _mm512_fmadd_ps(ifft1719, ifft1638, _mm512_shuffle_f32x4(ifft1719, ifft1719, 177));
__m512 ifft1644 = _mm512_fnmsub_ps(ifft1635, ifft1638, _mm512_shuffle_f32x4(ifft1635, ifft1635, 177));
__m512 ifft1728 = _mm512_fnmsub_ps(ifft1720, ifft1638, _mm512_shuffle_f32x4(ifft1720, ifft1720, 177));
__m512 ifft1645 = _mm512_fmadd_ps(ifft1636, ifft1638, _mm512_shuffle_f32x4(ifft1636, ifft1636, 177));
__m512 ifft1729 = _mm512_fmadd_ps(ifft1721, ifft1638, _mm512_shuffle_f32x4(ifft1721, ifft1721, 177));
__m512 ifft1646 = _mm512_fmadd_ps(ifft1637, ifft1638, _mm512_shuffle_f32x4(ifft1637, ifft1637, 177));
__m512 ifft1730 = _mm512_fmadd_ps(ifft1722, ifft1638, _mm512_shuffle_f32x4(ifft1722, ifft1722, 177));
__m512 ifft1647 = _mm512_add_ps(ifft1639, ifft1640);
__m512 ifft1731 = _mm512_add_ps(ifft1723, ifft1724);
__m512 ifft1648 = _mm512_sub_ps(ifft1639, ifft1640);
__m512 ifft1732 = _mm512_sub_ps(ifft1723, ifft1724);
__m512 ifft1649 = _mm512_sub_ps(ifft1641, ifft1645);
__m512 ifft1733 = _mm512_sub_ps(ifft1725, ifft1729);
__m512 ifft1650 = _mm512_add_ps(ifft1642, ifft1646);
__m512 ifft1734 = _mm512_add_ps(ifft1726, ifft1730);
__m512 ifft1651 = _mm512_add_ps(ifft1641, ifft1645);
__m512 ifft1735 = _mm512_add_ps(ifft1725, ifft1729);
__m512 ifft1652 = _mm512_sub_ps(ifft1642, ifft1646);
__m512 ifft1736 = _mm512_sub_ps(ifft1726, ifft1730);
__m512 ifft1653 = _mm512_mul_ps(ifft1643, _mm512_set1_ps(3.125e-02f));
__m512 ifft1737 = _mm512_mul_ps(ifft1727, _mm512_set1_ps(3.125e-02f));
__m512 ifft1654 = _mm512_mul_ps(ifft1644, _mm512_set1_ps(3.125e-02f));
__m512 ifft1738 = _mm512_mul_ps(ifft1728, _mm512_set1_ps(3.125e-02f));
__m512 ifft1655 = _mm512_fmadd_ps(ifft1647, _mm512_set1_ps(1.5625e-02f), ifft1653);
__m512 ifft1739 = _mm512_fmadd_ps(ifft1731, _mm512_set1_ps(1.5625e-02f), ifft1737);
__m512 ifft1656 = _mm512_fmsub_ps(ifft1647, _mm512_set1_ps(1.5625e-02f), ifft1653);
__m512 ifft1740 = _mm512_fmsub_ps(ifft1731, _mm512_set1_ps(1.5625e-02f), ifft1737);
__m512 ifft1657 = _mm512_fmadd_ps(ifft1648, _mm512_set1_ps(1.5625e-02f), ifft1654);
__m512 ifft1741 = _mm512_fmadd_ps(ifft1732, _mm512_set1_ps(1.5625e-02f), ifft1738);
__m512 ifft1658 = _mm512_fmsub_ps(ifft1648, _mm512_set1_ps(1.5625e-02f), ifft1654);
__m512 ifft1742 = _mm512_fmsub_ps(ifft1732, _mm512_set1_ps(1.5625e-02f), ifft1738);
__m512 ifft1659 = _mm512_add_ps(ifft1649, ifft1650);
__m512 ifft1743 = _mm512_add_ps(ifft1733, ifft1734);
__m512 ifft1660 = _mm512_sub_ps(ifft1649, ifft1650);
__m512 ifft1744 = _mm512_sub_ps(ifft1733, ifft1734);
__m512 ifft1661 = _mm512_fnmadd_ps(ifft1659, _mm512_set1_ps(7.0710677e-01f), ifft1651);
__m512 ifft1745 = _mm512_fnmadd_ps(ifft1743, _mm512_set1_ps(7.0710677e-01f), ifft1735);
__m512 ifft1662 = _mm512_fmadd_ps(ifft1659, _mm512_set1_ps(7.0710677e-01f), ifft1651);
__m512 ifft1746 = _mm512_fmadd_ps(ifft1743, _mm512_set1_ps(7.0710677e-01f), ifft1735);
__m512 ifft1663 = _mm512_fmadd_ps(ifft1660, _mm512_set1_ps(7.0710677e-01f), ifft1652);
__m512 ifft1747 = _mm512_fmadd_ps(ifft1744, _mm512_set1_ps(7.0710677e-01f), ifft1736);
__m512 ifft1664 = _mm512_fmsub_ps(ifft1660, _mm512_set1_ps(7.0710677e-01f), ifft1652);
__m512 ifft1748 = _mm512_fmsub_ps(ifft1744, _mm512_set1_ps(7.0710677e-01f), ifft1736);
__m512 ifft1665 = _mm512_add_ps(ifft1661, ifft1662);
__m512 ifft1749 = _mm512_add_ps(ifft1745, ifft1746);
__m512 ifft1666 = _mm512_sub_ps(ifft1661, ifft1662);
__m512 ifft1750 = _mm512_sub_ps(ifft1745, ifft1746);
__m512 ifft1667 = _mm512_add_ps(ifft1663, ifft1664);
__m512 ifft1751 = _mm512_add_ps(ifft1747, ifft1748);
__m512 ifft1668 = _mm512_sub_ps(ifft1663, ifft1664);
__m512 ifft1752 = _mm512_sub_ps(ifft1747, ifft1748);
__m512 ifft1669 = _mm512_fmadd_ps(ifft1665, _mm512_set1_ps(1.5625e-02f), ifft1655);
__m512 ifft1753 = _mm512_fmadd_ps(ifft1749, _mm512_set1_ps(1.5625e-02f), ifft1739);
__m512 ifft1670 = _mm512_fnmadd_ps(ifft1665, _mm512_set1_ps(1.5625e-02f), ifft1655);
__m512 ifft1754 = _mm512_fnmadd_ps(ifft1749, _mm512_set1_ps(1.5625e-02f), ifft1739);
__m512 ifft1671 = _mm512_fmadd_ps(ifft1667, _mm512_set1_ps(1.5625e-02f), ifft1657);
__m512 ifft1755 = _mm512_fmadd_ps(ifft1751, _mm512_set1_ps(1.5625e-02f), ifft1741);
__m512 ifft1672 = _mm512_fnmadd_ps(ifft1667, _mm512_set1_ps(1.5625e-02f), ifft1657);
__m512 ifft1756 = _mm512_fnmadd_ps(ifft1751, _mm512_set1_ps(1.5625e-02f), ifft1741);
__m512 ifft1673 = _mm512_fnmadd_ps(ifft1668, _mm512_set1_ps(1.5625e-02f), ifft1656);
__m512 ifft1757 = _mm512_fnmadd_ps(ifft1752, _mm512_set1_ps(1.5625e-02f), ifft1740);
__m512 ifft1674 = _mm512_fmadd_ps(ifft1668, _mm512_set1_ps(1.5625e-02f), ifft1656);
__m512 ifft1758 = _mm512_fmadd_ps(ifft1752, _mm512_set1_ps(1.5625e-02f), ifft1740);
__m512 ifft1675 = _mm512_fmadd_ps(ifft1666, _mm512_set1_ps(1.5625e-02f), ifft1658);
__m512 ifft1759 = _mm512_fmadd_ps(ifft1750, _mm512_set1_ps(1.5625e-02f), ifft1742);
__m512 ifft1676 = _mm512_fnmadd_ps(ifft1666, _mm512_set1_ps(1.5625e-02f), ifft1658);
__m512 ifft1760 = _mm512_fnmadd_ps(ifft1750, _mm512_set1_ps(1.5625e-02f), ifft1742);
__m512 dat680 = ifft1669;
__m512 dat685 = ifft1753;
__m512 dat681 = ifft1671;
__m512 dat686 = ifft1755;
__m512 dat682 = ifft1673;
__m512 dat687 = ifft1757;
__m512 dat683 = ifft1675;
__m512 dat688 = ifft1759;
__m512 dat684 = ifft1670;
__m512 dat689 = ifft1754;
(void)ifft1672;
(void)ifft1756;
(void)ifft1674;
(void)ifft1758;
(void)ifft1676;
(void)ifft1760;
__m512i pm17 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack81 = _mm512_permutex2var_ps(dat680, pm17, dat685);
__m512i pm18 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack82 = _mm512_permutex2var_ps(dat680, pm18, dat685);
__m512 pack83 = _mm512_permutex2var_ps(dat681, pm17, dat686);
__m512 pack84 = _mm512_permutex2var_ps(dat681, pm18, dat686);
__m512 pack85 = _mm512_permutex2var_ps(dat682, pm17, dat687);
__m512 pack86 = _mm512_permutex2var_ps(dat682, pm18, dat687);
__m512 pack87 = _mm512_permutex2var_ps(dat683, pm17, dat688);
__m512 pack88 = _mm512_permutex2var_ps(dat683, pm18, dat688);
__m512 pack89 = _mm512_permutex2var_ps(dat684, pm17, dat689);
__m512 pack90 = _mm512_permutex2var_ps(dat684, pm18, dat689);
pack81 = _mm512_max_ps(_mm512_setzero_ps(), pack81);
pack82 = _mm512_max_ps(_mm512_setzero_ps(), pack82);
pack83 = _mm512_max_ps(_mm512_setzero_ps(), pack83);
pack84 = _mm512_max_ps(_mm512_setzero_ps(), pack84);
pack85 = _mm512_max_ps(_mm512_setzero_ps(), pack85);
pack86 = _mm512_max_ps(_mm512_setzero_ps(), pack86);
pack87 = _mm512_max_ps(_mm512_setzero_ps(), pack87);
pack88 = _mm512_max_ps(_mm512_setzero_ps(), pack88);
pack89 = _mm512_max_ps(_mm512_setzero_ps(), pack89);
pack90 = _mm512_max_ps(_mm512_setzero_ps(), pack90);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack81);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack82);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack83);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack84);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack85);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack86);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack87);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack88);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack89);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack90);
}
}
}
if (j5 >= last2) return;
++j5;
}
rel5 = 7;
}
// Section executed only while rel5 is below 8.
if (rel5 < 8) {
// Destination coordinates for this section. They enter the store addresses
// further down as the terms 448*toH7 and 4*toW7.
ptrdiff_t toH7 = base5+5;
ptrdiff_t toW7 = 100;
// k31 starts at 16*w21 and the loop ends only when k31 == 16 exactly.
// NOTE(review): this presumably relies on w21 being 0 or 1 (w21 is defined
// outside this excerpt) — confirm, since a larger start value would never hit 16.
ptrdiff_t k31 = 16*w21;
for (; k31 != 16; ++k31) {
// r8 takes the two values 0 and 1.
ptrdiff_t r8 = 0;
for (; r8 != 2; ++r8) {
// t12 is fixed at 0; it is kept as a named term in the address expressions below.
ptrdiff_t t12 = 0;
// Load sixteen vectors from sfPtr3 as four groups of (Re, Im, Re, Im).
// Every address shares the terms 8667136*i9 + 24576*j5 + 1536*k31 + 768*r8 + 256*t12.
// Within a group the loads are 64 apart; the groups start at 0, 2166784,
// 4333568 and 6500352 (multiples of 2166784).
// NOTE(review): a spacing of 64 matches one __m512 if sfPtr3 is byte-addressed —
// the pointer type is declared outside this excerpt, confirm.
// NOTE(review): the Re/Im naming suggests real and imaginary parts — confirm.
__m512 sfRe137 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm137 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe141 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm141 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe138 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm138 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe142 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm142 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe139 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm139 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe143 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm143 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe140 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm140 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe144 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm144 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
// First in-register stages of the transform (variables are named ifft*,
// presumably an inverse FFT — confirm against the generator).
// Two chains are interleaved statement by statement: one fed by
// sfRe137..140 / sfIm137..140, the other by sfRe141..144 / sfIm141..144.
//
// Only the first group (sfRe137/sfIm137 and sfRe141/sfIm141) is rearranged
// with the two index vectors below; the other three groups skip this step.
__m512i ifft1761 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1762 = _mm512_permutexvar_ps(ifft1761, sfRe137);
__m512 ifft1853 = _mm512_permutexvar_ps(ifft1761, sfRe141);
__m512i ifft1763 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1764 = _mm512_permutexvar_ps(ifft1763, sfRe137);
__m512 ifft1854 = _mm512_permutexvar_ps(ifft1763, sfRe141);
__m512 ifft1765 = _mm512_permutexvar_ps(ifft1761, sfIm137);
__m512 ifft1855 = _mm512_permutexvar_ps(ifft1761, sfIm141);
__m512 ifft1766 = _mm512_permutexvar_ps(ifft1763, sfIm137);
__m512 ifft1856 = _mm512_permutexvar_ps(ifft1763, sfIm141);
// Merge the permuted Re/Im copies with per-lane coefficients of +1, -1 or 0.
// Mask 65021 (0xFDFD) covers every lane except 1 and 9; those two lanes keep
// the first operand (the permuted Im vector) unchanged.
__m512 ifft1767 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft1768 = _mm512_mask_fmadd_ps(ifft1766, 65021, ifft1767, ifft1762);
__m512 ifft1857 = _mm512_mask_fmadd_ps(ifft1856, 65021, ifft1767, ifft1853);
__m512 ifft1769 = _mm512_mask_fnmadd_ps(ifft1765, 65021, ifft1767, ifft1764);
__m512 ifft1858 = _mm512_mask_fnmadd_ps(ifft1855, 65021, ifft1767, ifft1854);
// Butterfly on adjacent lanes: shuffle immediate 177 swaps the two elements of
// each pair, and the +1/-1 pattern gives (x0 + x1, x0 - x1) per pair.
// It is applied to the merged first group and to the raw loads of the other three.
__m512 ifft1770 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1771 = _mm512_fmadd_ps(ifft1768, ifft1770, _mm512_shuffle_ps(ifft1768, ifft1768, 177));
__m512 ifft1859 = _mm512_fmadd_ps(ifft1857, ifft1770, _mm512_shuffle_ps(ifft1857, ifft1857, 177));
__m512 ifft1772 = _mm512_fmadd_ps(ifft1769, ifft1770, _mm512_shuffle_ps(ifft1769, ifft1769, 177));
__m512 ifft1860 = _mm512_fmadd_ps(ifft1858, ifft1770, _mm512_shuffle_ps(ifft1858, ifft1858, 177));
__m512 ifft1773 = _mm512_fmadd_ps(sfRe138, ifft1770, _mm512_shuffle_ps(sfRe138, sfRe138, 177));
__m512 ifft1861 = _mm512_fmadd_ps(sfRe142, ifft1770, _mm512_shuffle_ps(sfRe142, sfRe142, 177));
__m512 ifft1774 = _mm512_fmadd_ps(sfIm138, ifft1770, _mm512_shuffle_ps(sfIm138, sfIm138, 177));
__m512 ifft1862 = _mm512_fmadd_ps(sfIm142, ifft1770, _mm512_shuffle_ps(sfIm142, sfIm142, 177));
__m512 ifft1775 = _mm512_fmadd_ps(sfRe139, ifft1770, _mm512_shuffle_ps(sfRe139, sfRe139, 177));
__m512 ifft1863 = _mm512_fmadd_ps(sfRe143, ifft1770, _mm512_shuffle_ps(sfRe143, sfRe143, 177));
__m512 ifft1776 = _mm512_fmadd_ps(sfIm139, ifft1770, _mm512_shuffle_ps(sfIm139, sfIm139, 177));
__m512 ifft1864 = _mm512_fmadd_ps(sfIm143, ifft1770, _mm512_shuffle_ps(sfIm143, sfIm143, 177));
__m512 ifft1777 = _mm512_fmadd_ps(sfRe140, ifft1770, _mm512_shuffle_ps(sfRe140, sfRe140, 177));
__m512 ifft1865 = _mm512_fmadd_ps(sfRe144, ifft1770, _mm512_shuffle_ps(sfRe144, sfRe144, 177));
__m512 ifft1778 = _mm512_fmadd_ps(sfIm140, ifft1770, _mm512_shuffle_ps(sfIm140, sfIm140, 177));
__m512 ifft1866 = _mm512_fmadd_ps(sfIm144, ifft1770, _mm512_shuffle_ps(sfIm144, sfIm144, 177));
// Per-lane coefficient c. In each 8-lane half, lanes 0,1,2,4,6 hold 1,
// lane 3 holds 0, lane 5 holds 0.70710677 and lane 7 holds -0.70710677.
// Every vector is first multiplied by c.
__m512 ifft1779 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft1780 = _mm512_mul_ps(ifft1771, ifft1779);
__m512 ifft1867 = _mm512_mul_ps(ifft1859, ifft1779);
__m512 ifft1781 = _mm512_mul_ps(ifft1772, ifft1779);
__m512 ifft1868 = _mm512_mul_ps(ifft1860, ifft1779);
__m512 ifft1782 = _mm512_mul_ps(ifft1773, ifft1779);
__m512 ifft1869 = _mm512_mul_ps(ifft1861, ifft1779);
__m512 ifft1783 = _mm512_mul_ps(ifft1774, ifft1779);
__m512 ifft1870 = _mm512_mul_ps(ifft1862, ifft1779);
__m512 ifft1784 = _mm512_mul_ps(ifft1775, ifft1779);
__m512 ifft1871 = _mm512_mul_ps(ifft1863, ifft1779);
__m512 ifft1785 = _mm512_mul_ps(ifft1776, ifft1779);
__m512 ifft1872 = _mm512_mul_ps(ifft1864, ifft1779);
__m512 ifft1786 = _mm512_mul_ps(ifft1777, ifft1779);
__m512 ifft1873 = _mm512_mul_ps(ifft1865, ifft1779);
__m512 ifft1787 = _mm512_mul_ps(ifft1778, ifft1779);
__m512 ifft1874 = _mm512_mul_ps(ifft1866, ifft1779);
// Per-lane coefficient s: 1 in lane 3, 0.70710677 in lanes 5 and 7, 0 elsewhere
// (per 8-lane half). Combined with the products above, each vector pair (X, Y)
// becomes (X*c - Y*s, X*s + Y*c); lanes where c = 1 and s = 0 pass through.
__m512 ifft1788 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft1789 = _mm512_fnmadd_ps(ifft1772, ifft1788, ifft1780);
__m512 ifft1875 = _mm512_fnmadd_ps(ifft1860, ifft1788, ifft1867);
__m512 ifft1790 = _mm512_fmadd_ps(ifft1771, ifft1788, ifft1781);
__m512 ifft1876 = _mm512_fmadd_ps(ifft1859, ifft1788, ifft1868);
__m512 ifft1791 = _mm512_fnmadd_ps(ifft1774, ifft1788, ifft1782);
__m512 ifft1877 = _mm512_fnmadd_ps(ifft1862, ifft1788, ifft1869);
__m512 ifft1792 = _mm512_fmadd_ps(ifft1773, ifft1788, ifft1783);
__m512 ifft1878 = _mm512_fmadd_ps(ifft1861, ifft1788, ifft1870);
__m512 ifft1793 = _mm512_fnmadd_ps(ifft1776, ifft1788, ifft1784);
__m512 ifft1879 = _mm512_fnmadd_ps(ifft1864, ifft1788, ifft1871);
__m512 ifft1794 = _mm512_fmadd_ps(ifft1775, ifft1788, ifft1785);
__m512 ifft1880 = _mm512_fmadd_ps(ifft1863, ifft1788, ifft1872);
__m512 ifft1795 = _mm512_fnmadd_ps(ifft1778, ifft1788, ifft1786);
__m512 ifft1881 = _mm512_fnmadd_ps(ifft1866, ifft1788, ifft1873);
__m512 ifft1796 = _mm512_fmadd_ps(ifft1777, ifft1788, ifft1787);
__m512 ifft1882 = _mm512_fmadd_ps(ifft1865, ifft1788, ifft1874);
// Butterfly at a distance of two lanes: shuffle immediate 78 swaps the low and
// high element pairs inside every 128-bit lane, and the sign pattern
// (+1, +1, -1, -1) yields (x0 + x2, x1 + x3, x0 - x2, x1 - x3).
__m512 ifft1797 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1798 = _mm512_fmadd_ps(ifft1789, ifft1797, _mm512_shuffle_ps(ifft1789, ifft1789, 78));
__m512 ifft1883 = _mm512_fmadd_ps(ifft1875, ifft1797, _mm512_shuffle_ps(ifft1875, ifft1875, 78));
__m512 ifft1799 = _mm512_fmadd_ps(ifft1790, ifft1797, _mm512_shuffle_ps(ifft1790, ifft1790, 78));
__m512 ifft1884 = _mm512_fmadd_ps(ifft1876, ifft1797, _mm512_shuffle_ps(ifft1876, ifft1876, 78));
__m512 ifft1800 = _mm512_fmadd_ps(ifft1791, ifft1797, _mm512_shuffle_ps(ifft1791, ifft1791, 78));
__m512 ifft1885 = _mm512_fmadd_ps(ifft1877, ifft1797, _mm512_shuffle_ps(ifft1877, ifft1877, 78));
__m512 ifft1801 = _mm512_fmadd_ps(ifft1792, ifft1797, _mm512_shuffle_ps(ifft1792, ifft1792, 78));
__m512 ifft1886 = _mm512_fmadd_ps(ifft1878, ifft1797, _mm512_shuffle_ps(ifft1878, ifft1878, 78));
__m512 ifft1802 = _mm512_fmadd_ps(ifft1793, ifft1797, _mm512_shuffle_ps(ifft1793, ifft1793, 78));
__m512 ifft1887 = _mm512_fmadd_ps(ifft1879, ifft1797, _mm512_shuffle_ps(ifft1879, ifft1879, 78));
__m512 ifft1803 = _mm512_fmadd_ps(ifft1794, ifft1797, _mm512_shuffle_ps(ifft1794, ifft1794, 78));
__m512 ifft1888 = _mm512_fmadd_ps(ifft1880, ifft1797, _mm512_shuffle_ps(ifft1880, ifft1880, 78));
__m512 ifft1804 = _mm512_fmadd_ps(ifft1795, ifft1797, _mm512_shuffle_ps(ifft1795, ifft1795, 78));
__m512 ifft1889 = _mm512_fmadd_ps(ifft1881, ifft1797, _mm512_shuffle_ps(ifft1881, ifft1881, 78));
__m512 ifft1805 = _mm512_fmadd_ps(ifft1796, ifft1797, _mm512_shuffle_ps(ifft1796, ifft1796, 78));
__m512 ifft1890 = _mm512_fmadd_ps(ifft1882, ifft1797, _mm512_shuffle_ps(ifft1882, ifft1882, 78));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes only, each
// vector pair (X, Y) is replaced by (0 - Y, X); all other lanes pass through.
__m512 ifft1806 = _mm512_mask_sub_ps(ifft1798, 49344, _mm512_setzero_ps(), ifft1799);
__m512 ifft1891 = _mm512_mask_sub_ps(ifft1883, 49344, _mm512_setzero_ps(), ifft1884);
__m512 ifft1807 = _mm512_mask_mov_ps(ifft1799, 49344, ifft1798);
__m512 ifft1892 = _mm512_mask_mov_ps(ifft1884, 49344, ifft1883);
__m512 ifft1808 = _mm512_mask_sub_ps(ifft1800, 49344, _mm512_setzero_ps(), ifft1801);
__m512 ifft1893 = _mm512_mask_sub_ps(ifft1885, 49344, _mm512_setzero_ps(), ifft1886);
__m512 ifft1809 = _mm512_mask_mov_ps(ifft1801, 49344, ifft1800);
__m512 ifft1894 = _mm512_mask_mov_ps(ifft1886, 49344, ifft1885);
__m512 ifft1810 = _mm512_mask_sub_ps(ifft1802, 49344, _mm512_setzero_ps(), ifft1803);
__m512 ifft1895 = _mm512_mask_sub_ps(ifft1887, 49344, _mm512_setzero_ps(), ifft1888);
__m512 ifft1811 = _mm512_mask_mov_ps(ifft1803, 49344, ifft1802);
__m512 ifft1896 = _mm512_mask_mov_ps(ifft1888, 49344, ifft1887);
__m512 ifft1812 = _mm512_mask_sub_ps(ifft1804, 49344, _mm512_setzero_ps(), ifft1805);
__m512 ifft1897 = _mm512_mask_sub_ps(ifft1889, 49344, _mm512_setzero_ps(), ifft1890);
__m512 ifft1813 = _mm512_mask_mov_ps(ifft1805, 49344, ifft1804);
__m512 ifft1898 = _mm512_mask_mov_ps(ifft1890, 49344, ifft1889);
// Butterfly across neighbouring 128-bit lanes: shuffle_f32x4 immediate 177
// swaps lanes 0<->1 and 2<->3, and the sign pattern is +1 for elements 0-3 and
// 8-11, -1 for elements 4-7 and 12-15.
__m512 ifft1814 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1815 = _mm512_fmadd_ps(ifft1806, ifft1814, _mm512_shuffle_f32x4(ifft1806, ifft1806, 177));
__m512 ifft1899 = _mm512_fmadd_ps(ifft1891, ifft1814, _mm512_shuffle_f32x4(ifft1891, ifft1891, 177));
__m512 ifft1816 = _mm512_fmadd_ps(ifft1807, ifft1814, _mm512_shuffle_f32x4(ifft1807, ifft1807, 177));
__m512 ifft1900 = _mm512_fmadd_ps(ifft1892, ifft1814, _mm512_shuffle_f32x4(ifft1892, ifft1892, 177));
__m512 ifft1817 = _mm512_fmadd_ps(ifft1808, ifft1814, _mm512_shuffle_f32x4(ifft1808, ifft1808, 177));
__m512 ifft1901 = _mm512_fmadd_ps(ifft1893, ifft1814, _mm512_shuffle_f32x4(ifft1893, ifft1893, 177));
__m512 ifft1818 = _mm512_fmadd_ps(ifft1809, ifft1814, _mm512_shuffle_f32x4(ifft1809, ifft1809, 177));
__m512 ifft1902 = _mm512_fmadd_ps(ifft1894, ifft1814, _mm512_shuffle_f32x4(ifft1894, ifft1894, 177));
__m512 ifft1819 = _mm512_fmadd_ps(ifft1810, ifft1814, _mm512_shuffle_f32x4(ifft1810, ifft1810, 177));
__m512 ifft1903 = _mm512_fmadd_ps(ifft1895, ifft1814, _mm512_shuffle_f32x4(ifft1895, ifft1895, 177));
// Unlike its neighbours, this pair uses fnmsub, so the result is the negation
// of what the fmadd form would produce: -(x*sign) - swapped(x).
__m512 ifft1820 = _mm512_fnmsub_ps(ifft1811, ifft1814, _mm512_shuffle_f32x4(ifft1811, ifft1811, 177));
__m512 ifft1904 = _mm512_fnmsub_ps(ifft1896, ifft1814, _mm512_shuffle_f32x4(ifft1896, ifft1896, 177));
__m512 ifft1821 = _mm512_fmadd_ps(ifft1812, ifft1814, _mm512_shuffle_f32x4(ifft1812, ifft1812, 177));
__m512 ifft1905 = _mm512_fmadd_ps(ifft1897, ifft1814, _mm512_shuffle_f32x4(ifft1897, ifft1897, 177));
__m512 ifft1822 = _mm512_fmadd_ps(ifft1813, ifft1814, _mm512_shuffle_f32x4(ifft1813, ifft1813, 177));
__m512 ifft1906 = _mm512_fmadd_ps(ifft1898, ifft1814, _mm512_shuffle_f32x4(ifft1898, ifft1898, 177));
// Cross-vector stage: from here on, lanes are no longer shuffled; whole vectors
// are added, subtracted and scaled against each other. The two interleaved
// chains (ifft1823.. and ifft1907..) perform the same steps.
//
// Sums and differences of the first, second and fourth vector pairs.
__m512 ifft1823 = _mm512_add_ps(ifft1815, ifft1816);
__m512 ifft1907 = _mm512_add_ps(ifft1899, ifft1900);
__m512 ifft1824 = _mm512_sub_ps(ifft1815, ifft1816);
__m512 ifft1908 = _mm512_sub_ps(ifft1899, ifft1900);
__m512 ifft1825 = _mm512_sub_ps(ifft1817, ifft1821);
__m512 ifft1909 = _mm512_sub_ps(ifft1901, ifft1905);
__m512 ifft1826 = _mm512_add_ps(ifft1818, ifft1822);
__m512 ifft1910 = _mm512_add_ps(ifft1902, ifft1906);
__m512 ifft1827 = _mm512_add_ps(ifft1817, ifft1821);
__m512 ifft1911 = _mm512_add_ps(ifft1901, ifft1905);
__m512 ifft1828 = _mm512_sub_ps(ifft1818, ifft1822);
__m512 ifft1912 = _mm512_sub_ps(ifft1902, ifft1906);
// The third pair is scaled by 3.125e-02 (= 1/32); the sums/differences it is
// combined with are scaled by 1.5625e-02 (= 1/64) inside the fused operations.
// NOTE(review): these factors look like the transform's normalisation — confirm.
__m512 ifft1829 = _mm512_mul_ps(ifft1819, _mm512_set1_ps(3.125e-02f));
__m512 ifft1913 = _mm512_mul_ps(ifft1903, _mm512_set1_ps(3.125e-02f));
__m512 ifft1830 = _mm512_mul_ps(ifft1820, _mm512_set1_ps(3.125e-02f));
__m512 ifft1914 = _mm512_mul_ps(ifft1904, _mm512_set1_ps(3.125e-02f));
__m512 ifft1831 = _mm512_fmadd_ps(ifft1823, _mm512_set1_ps(1.5625e-02f), ifft1829);
__m512 ifft1915 = _mm512_fmadd_ps(ifft1907, _mm512_set1_ps(1.5625e-02f), ifft1913);
__m512 ifft1832 = _mm512_fmsub_ps(ifft1823, _mm512_set1_ps(1.5625e-02f), ifft1829);
__m512 ifft1916 = _mm512_fmsub_ps(ifft1907, _mm512_set1_ps(1.5625e-02f), ifft1913);
__m512 ifft1833 = _mm512_fmadd_ps(ifft1824, _mm512_set1_ps(1.5625e-02f), ifft1830);
__m512 ifft1917 = _mm512_fmadd_ps(ifft1908, _mm512_set1_ps(1.5625e-02f), ifft1914);
__m512 ifft1834 = _mm512_fmsub_ps(ifft1824, _mm512_set1_ps(1.5625e-02f), ifft1830);
__m512 ifft1918 = _mm512_fmsub_ps(ifft1908, _mm512_set1_ps(1.5625e-02f), ifft1914);
// Combine the remaining sums/differences, weighting one side by 0.70710677
// (approximately sqrt(1/2)).
__m512 ifft1835 = _mm512_add_ps(ifft1825, ifft1826);
__m512 ifft1919 = _mm512_add_ps(ifft1909, ifft1910);
__m512 ifft1836 = _mm512_sub_ps(ifft1825, ifft1826);
__m512 ifft1920 = _mm512_sub_ps(ifft1909, ifft1910);
__m512 ifft1837 = _mm512_fnmadd_ps(ifft1835, _mm512_set1_ps(7.0710677e-01f), ifft1827);
__m512 ifft1921 = _mm512_fnmadd_ps(ifft1919, _mm512_set1_ps(7.0710677e-01f), ifft1911);
__m512 ifft1838 = _mm512_fmadd_ps(ifft1835, _mm512_set1_ps(7.0710677e-01f), ifft1827);
__m512 ifft1922 = _mm512_fmadd_ps(ifft1919, _mm512_set1_ps(7.0710677e-01f), ifft1911);
__m512 ifft1839 = _mm512_fmadd_ps(ifft1836, _mm512_set1_ps(7.0710677e-01f), ifft1828);
__m512 ifft1923 = _mm512_fmadd_ps(ifft1920, _mm512_set1_ps(7.0710677e-01f), ifft1912);
__m512 ifft1840 = _mm512_fmsub_ps(ifft1836, _mm512_set1_ps(7.0710677e-01f), ifft1828);
__m512 ifft1924 = _mm512_fmsub_ps(ifft1920, _mm512_set1_ps(7.0710677e-01f), ifft1912);
__m512 ifft1841 = _mm512_add_ps(ifft1837, ifft1838);
__m512 ifft1925 = _mm512_add_ps(ifft1921, ifft1922);
__m512 ifft1842 = _mm512_sub_ps(ifft1837, ifft1838);
__m512 ifft1926 = _mm512_sub_ps(ifft1921, ifft1922);
__m512 ifft1843 = _mm512_add_ps(ifft1839, ifft1840);
__m512 ifft1927 = _mm512_add_ps(ifft1923, ifft1924);
__m512 ifft1844 = _mm512_sub_ps(ifft1839, ifft1840);
__m512 ifft1928 = _mm512_sub_ps(ifft1923, ifft1924);
// Final combination: eight result vectors per chain (ifft1845..1852 and
// ifft1929..1936), each a scaled-by-1/64 term added to or subtracted from one
// of the earlier partial results.
__m512 ifft1845 = _mm512_fmadd_ps(ifft1841, _mm512_set1_ps(1.5625e-02f), ifft1831);
__m512 ifft1929 = _mm512_fmadd_ps(ifft1925, _mm512_set1_ps(1.5625e-02f), ifft1915);
__m512 ifft1846 = _mm512_fnmadd_ps(ifft1841, _mm512_set1_ps(1.5625e-02f), ifft1831);
__m512 ifft1930 = _mm512_fnmadd_ps(ifft1925, _mm512_set1_ps(1.5625e-02f), ifft1915);
__m512 ifft1847 = _mm512_fmadd_ps(ifft1843, _mm512_set1_ps(1.5625e-02f), ifft1833);
__m512 ifft1931 = _mm512_fmadd_ps(ifft1927, _mm512_set1_ps(1.5625e-02f), ifft1917);
__m512 ifft1848 = _mm512_fnmadd_ps(ifft1843, _mm512_set1_ps(1.5625e-02f), ifft1833);
__m512 ifft1932 = _mm512_fnmadd_ps(ifft1927, _mm512_set1_ps(1.5625e-02f), ifft1917);
__m512 ifft1849 = _mm512_fnmadd_ps(ifft1844, _mm512_set1_ps(1.5625e-02f), ifft1832);
__m512 ifft1933 = _mm512_fnmadd_ps(ifft1928, _mm512_set1_ps(1.5625e-02f), ifft1916);
__m512 ifft1850 = _mm512_fmadd_ps(ifft1844, _mm512_set1_ps(1.5625e-02f), ifft1832);
__m512 ifft1934 = _mm512_fmadd_ps(ifft1928, _mm512_set1_ps(1.5625e-02f), ifft1916);
__m512 ifft1851 = _mm512_fmadd_ps(ifft1842, _mm512_set1_ps(1.5625e-02f), ifft1834);
__m512 ifft1935 = _mm512_fmadd_ps(ifft1926, _mm512_set1_ps(1.5625e-02f), ifft1918);
__m512 ifft1852 = _mm512_fnmadd_ps(ifft1842, _mm512_set1_ps(1.5625e-02f), ifft1834);
__m512 ifft1936 = _mm512_fnmadd_ps(ifft1926, _mm512_set1_ps(1.5625e-02f), ifft1918);
// Output stage: keep five of the eight result vectors from each chain,
// pack them, clamp at zero and store.
//
// dat690..dat694 come from the first chain, dat695..dat699 from the second.
__m512 dat690 = ifft1845;
__m512 dat695 = ifft1929;
__m512 dat691 = ifft1847;
__m512 dat696 = ifft1931;
__m512 dat692 = ifft1849;
__m512 dat697 = ifft1933;
__m512 dat693 = ifft1851;
__m512 dat698 = ifft1935;
__m512 dat694 = ifft1846;
__m512 dat699 = ifft1930;
// The other three results of each chain are not used here; the casts to void
// only mark them as intentionally unused.
(void)ifft1848;
(void)ifft1932;
(void)ifft1850;
(void)ifft1934;
(void)ifft1852;
(void)ifft1936;
// pm19: output lanes 0-4 take elements 0-4 of the first operand, lanes 5-9 take
// elements 0-4 of the second operand (indices 16-20). Lanes 10-15 repeat data
// and are never written, because the stores below use a 10-lane mask.
__m512i pm19 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack91 = _mm512_permutex2var_ps(dat690, pm19, dat695);
// pm20: output lanes 0-4 take elements 8-12 of the second operand (indices
// 24-28), lanes 5-9 take elements 8-12 of the first operand — the operand order
// is the reverse of pm19.
__m512i pm20 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack92 = _mm512_permutex2var_ps(dat690, pm20, dat695);
__m512 pack93 = _mm512_permutex2var_ps(dat691, pm19, dat696);
__m512 pack94 = _mm512_permutex2var_ps(dat691, pm20, dat696);
__m512 pack95 = _mm512_permutex2var_ps(dat692, pm19, dat697);
__m512 pack96 = _mm512_permutex2var_ps(dat692, pm20, dat697);
__m512 pack97 = _mm512_permutex2var_ps(dat693, pm19, dat698);
__m512 pack98 = _mm512_permutex2var_ps(dat693, pm20, dat698);
__m512 pack99 = _mm512_permutex2var_ps(dat694, pm19, dat699);
__m512 pack100 = _mm512_permutex2var_ps(dat694, pm20, dat699);
// Clamp every element at zero: max(0, x), i.e. a ReLU-style activation.
pack91 = _mm512_max_ps(_mm512_setzero_ps(), pack91);
pack92 = _mm512_max_ps(_mm512_setzero_ps(), pack92);
pack93 = _mm512_max_ps(_mm512_setzero_ps(), pack93);
pack94 = _mm512_max_ps(_mm512_setzero_ps(), pack94);
pack95 = _mm512_max_ps(_mm512_setzero_ps(), pack95);
pack96 = _mm512_max_ps(_mm512_setzero_ps(), pack96);
pack97 = _mm512_max_ps(_mm512_setzero_ps(), pack97);
pack98 = _mm512_max_ps(_mm512_setzero_ps(), pack98);
pack99 = _mm512_max_ps(_mm512_setzero_ps(), pack99);
pack100 = _mm512_max_ps(_mm512_setzero_ps(), pack100);
// Masked stores: mask 1023 (0x3FF) writes lanes 0-9 only.
// The pm19-packed vectors go to offsets 0, 448, 896, 1344, 1792; the
// pm20-packed vectors go to the same offsets plus 50240.
// NOTE(review): 448 presumably is one output row and 50240 one output plane,
// in the units of datPtr2 (declared outside this excerpt) — confirm.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack91);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack92);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack93);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack94);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack95);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack96);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack97);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack98);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack99);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack100);
ptrdiff_t t13 = 0;
__m512 sfRe145 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm145 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe149 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm149 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe146 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm146 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe150 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm150 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe147 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm147 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe151 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm151 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe148 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm148 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe152 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm152 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512i ifft1937 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1938 = _mm512_permutexvar_ps(ifft1937, sfRe145);
__m512 ifft2029 = _mm512_permutexvar_ps(ifft1937, sfRe149);
__m512i ifft1939 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1940 = _mm512_permutexvar_ps(ifft1939, sfRe145);
__m512 ifft2030 = _mm512_permutexvar_ps(ifft1939, sfRe149);
__m512 ifft1941 = _mm512_permutexvar_ps(ifft1937, sfIm145);
__m512 ifft2031 = _mm512_permutexvar_ps(ifft1937, sfIm149);
__m512 ifft1942 = _mm512_permutexvar_ps(ifft1939, sfIm145);
__m512 ifft2032 = _mm512_permutexvar_ps(ifft1939, sfIm149);
__m512 ifft1943 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft1944 = _mm512_mask_fmadd_ps(ifft1942, 65021, ifft1943, ifft1938);
__m512 ifft2033 = _mm512_mask_fmadd_ps(ifft2032, 65021, ifft1943, ifft2029);
__m512 ifft1945 = _mm512_mask_fnmadd_ps(ifft1941, 65021, ifft1943, ifft1940);
__m512 ifft2034 = _mm512_mask_fnmadd_ps(ifft2031, 65021, ifft1943, ifft2030);
__m512 ifft1946 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1947 = _mm512_fmadd_ps(ifft1944, ifft1946, _mm512_shuffle_ps(ifft1944, ifft1944, 177));
__m512 ifft2035 = _mm512_fmadd_ps(ifft2033, ifft1946, _mm512_shuffle_ps(ifft2033, ifft2033, 177));
__m512 ifft1948 = _mm512_fmadd_ps(ifft1945, ifft1946, _mm512_shuffle_ps(ifft1945, ifft1945, 177));
__m512 ifft2036 = _mm512_fmadd_ps(ifft2034, ifft1946, _mm512_shuffle_ps(ifft2034, ifft2034, 177));
__m512 ifft1949 = _mm512_fmadd_ps(sfRe146, ifft1946, _mm512_shuffle_ps(sfRe146, sfRe146, 177));
__m512 ifft2037 = _mm512_fmadd_ps(sfRe150, ifft1946, _mm512_shuffle_ps(sfRe150, sfRe150, 177));
__m512 ifft1950 = _mm512_fmadd_ps(sfIm146, ifft1946, _mm512_shuffle_ps(sfIm146, sfIm146, 177));
__m512 ifft2038 = _mm512_fmadd_ps(sfIm150, ifft1946, _mm512_shuffle_ps(sfIm150, sfIm150, 177));
__m512 ifft1951 = _mm512_fmadd_ps(sfRe147, ifft1946, _mm512_shuffle_ps(sfRe147, sfRe147, 177));
__m512 ifft2039 = _mm512_fmadd_ps(sfRe151, ifft1946, _mm512_shuffle_ps(sfRe151, sfRe151, 177));
__m512 ifft1952 = _mm512_fmadd_ps(sfIm147, ifft1946, _mm512_shuffle_ps(sfIm147, sfIm147, 177));
__m512 ifft2040 = _mm512_fmadd_ps(sfIm151, ifft1946, _mm512_shuffle_ps(sfIm151, sfIm151, 177));
__m512 ifft1953 = _mm512_fmadd_ps(sfRe148, ifft1946, _mm512_shuffle_ps(sfRe148, sfRe148, 177));
__m512 ifft2041 = _mm512_fmadd_ps(sfRe152, ifft1946, _mm512_shuffle_ps(sfRe152, sfRe152, 177));
__m512 ifft1954 = _mm512_fmadd_ps(sfIm148, ifft1946, _mm512_shuffle_ps(sfIm148, sfIm148, 177));
__m512 ifft2042 = _mm512_fmadd_ps(sfIm152, ifft1946, _mm512_shuffle_ps(sfIm152, sfIm152, 177));
__m512 ifft1955 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft1956 = _mm512_mul_ps(ifft1947, ifft1955);
__m512 ifft2043 = _mm512_mul_ps(ifft2035, ifft1955);
__m512 ifft1957 = _mm512_mul_ps(ifft1948, ifft1955);
__m512 ifft2044 = _mm512_mul_ps(ifft2036, ifft1955);
__m512 ifft1958 = _mm512_mul_ps(ifft1949, ifft1955);
__m512 ifft2045 = _mm512_mul_ps(ifft2037, ifft1955);
__m512 ifft1959 = _mm512_mul_ps(ifft1950, ifft1955);
__m512 ifft2046 = _mm512_mul_ps(ifft2038, ifft1955);
__m512 ifft1960 = _mm512_mul_ps(ifft1951, ifft1955);
__m512 ifft2047 = _mm512_mul_ps(ifft2039, ifft1955);
__m512 ifft1961 = _mm512_mul_ps(ifft1952, ifft1955);
__m512 ifft2048 = _mm512_mul_ps(ifft2040, ifft1955);
__m512 ifft1962 = _mm512_mul_ps(ifft1953, ifft1955);
__m512 ifft2049 = _mm512_mul_ps(ifft2041, ifft1955);
__m512 ifft1963 = _mm512_mul_ps(ifft1954, ifft1955);
__m512 ifft2050 = _mm512_mul_ps(ifft2042, ifft1955);
__m512 ifft1964 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft1965 = _mm512_fnmadd_ps(ifft1948, ifft1964, ifft1956);
__m512 ifft2051 = _mm512_fnmadd_ps(ifft2036, ifft1964, ifft2043);
__m512 ifft1966 = _mm512_fmadd_ps(ifft1947, ifft1964, ifft1957);
__m512 ifft2052 = _mm512_fmadd_ps(ifft2035, ifft1964, ifft2044);
__m512 ifft1967 = _mm512_fnmadd_ps(ifft1950, ifft1964, ifft1958);
__m512 ifft2053 = _mm512_fnmadd_ps(ifft2038, ifft1964, ifft2045);
__m512 ifft1968 = _mm512_fmadd_ps(ifft1949, ifft1964, ifft1959);
__m512 ifft2054 = _mm512_fmadd_ps(ifft2037, ifft1964, ifft2046);
__m512 ifft1969 = _mm512_fnmadd_ps(ifft1952, ifft1964, ifft1960);
__m512 ifft2055 = _mm512_fnmadd_ps(ifft2040, ifft1964, ifft2047);
__m512 ifft1970 = _mm512_fmadd_ps(ifft1951, ifft1964, ifft1961);
__m512 ifft2056 = _mm512_fmadd_ps(ifft2039, ifft1964, ifft2048);
__m512 ifft1971 = _mm512_fnmadd_ps(ifft1954, ifft1964, ifft1962);
__m512 ifft2057 = _mm512_fnmadd_ps(ifft2042, ifft1964, ifft2049);
__m512 ifft1972 = _mm512_fmadd_ps(ifft1953, ifft1964, ifft1963);
__m512 ifft2058 = _mm512_fmadd_ps(ifft2041, ifft1964, ifft2050);
__m512 ifft1973 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1974 = _mm512_fmadd_ps(ifft1965, ifft1973, _mm512_shuffle_ps(ifft1965, ifft1965, 78));
__m512 ifft2059 = _mm512_fmadd_ps(ifft2051, ifft1973, _mm512_shuffle_ps(ifft2051, ifft2051, 78));
__m512 ifft1975 = _mm512_fmadd_ps(ifft1966, ifft1973, _mm512_shuffle_ps(ifft1966, ifft1966, 78));
__m512 ifft2060 = _mm512_fmadd_ps(ifft2052, ifft1973, _mm512_shuffle_ps(ifft2052, ifft2052, 78));
__m512 ifft1976 = _mm512_fmadd_ps(ifft1967, ifft1973, _mm512_shuffle_ps(ifft1967, ifft1967, 78));
__m512 ifft2061 = _mm512_fmadd_ps(ifft2053, ifft1973, _mm512_shuffle_ps(ifft2053, ifft2053, 78));
__m512 ifft1977 = _mm512_fmadd_ps(ifft1968, ifft1973, _mm512_shuffle_ps(ifft1968, ifft1968, 78));
__m512 ifft2062 = _mm512_fmadd_ps(ifft2054, ifft1973, _mm512_shuffle_ps(ifft2054, ifft2054, 78));
__m512 ifft1978 = _mm512_fmadd_ps(ifft1969, ifft1973, _mm512_shuffle_ps(ifft1969, ifft1969, 78));
__m512 ifft2063 = _mm512_fmadd_ps(ifft2055, ifft1973, _mm512_shuffle_ps(ifft2055, ifft2055, 78));
__m512 ifft1979 = _mm512_fmadd_ps(ifft1970, ifft1973, _mm512_shuffle_ps(ifft1970, ifft1970, 78));
__m512 ifft2064 = _mm512_fmadd_ps(ifft2056, ifft1973, _mm512_shuffle_ps(ifft2056, ifft2056, 78));
__m512 ifft1980 = _mm512_fmadd_ps(ifft1971, ifft1973, _mm512_shuffle_ps(ifft1971, ifft1971, 78));
__m512 ifft2065 = _mm512_fmadd_ps(ifft2057, ifft1973, _mm512_shuffle_ps(ifft2057, ifft2057, 78));
__m512 ifft1981 = _mm512_fmadd_ps(ifft1972, ifft1973, _mm512_shuffle_ps(ifft1972, ifft1972, 78));
__m512 ifft2066 = _mm512_fmadd_ps(ifft2058, ifft1973, _mm512_shuffle_ps(ifft2058, ifft2058, 78));
__m512 ifft1982 = _mm512_mask_sub_ps(ifft1974, 49344, _mm512_setzero_ps(), ifft1975);
__m512 ifft2067 = _mm512_mask_sub_ps(ifft2059, 49344, _mm512_setzero_ps(), ifft2060);
__m512 ifft1983 = _mm512_mask_mov_ps(ifft1975, 49344, ifft1974);
__m512 ifft2068 = _mm512_mask_mov_ps(ifft2060, 49344, ifft2059);
__m512 ifft1984 = _mm512_mask_sub_ps(ifft1976, 49344, _mm512_setzero_ps(), ifft1977);
__m512 ifft2069 = _mm512_mask_sub_ps(ifft2061, 49344, _mm512_setzero_ps(), ifft2062);
__m512 ifft1985 = _mm512_mask_mov_ps(ifft1977, 49344, ifft1976);
__m512 ifft2070 = _mm512_mask_mov_ps(ifft2062, 49344, ifft2061);
__m512 ifft1986 = _mm512_mask_sub_ps(ifft1978, 49344, _mm512_setzero_ps(), ifft1979);
__m512 ifft2071 = _mm512_mask_sub_ps(ifft2063, 49344, _mm512_setzero_ps(), ifft2064);
__m512 ifft1987 = _mm512_mask_mov_ps(ifft1979, 49344, ifft1978);
__m512 ifft2072 = _mm512_mask_mov_ps(ifft2064, 49344, ifft2063);
__m512 ifft1988 = _mm512_mask_sub_ps(ifft1980, 49344, _mm512_setzero_ps(), ifft1981);
__m512 ifft2073 = _mm512_mask_sub_ps(ifft2065, 49344, _mm512_setzero_ps(), ifft2066);
__m512 ifft1989 = _mm512_mask_mov_ps(ifft1981, 49344, ifft1980);
__m512 ifft2074 = _mm512_mask_mov_ps(ifft2066, 49344, ifft2065);
__m512 ifft1990 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1991 = _mm512_fmadd_ps(ifft1982, ifft1990, _mm512_shuffle_f32x4(ifft1982, ifft1982, 177));
__m512 ifft2075 = _mm512_fmadd_ps(ifft2067, ifft1990, _mm512_shuffle_f32x4(ifft2067, ifft2067, 177));
__m512 ifft1992 = _mm512_fmadd_ps(ifft1983, ifft1990, _mm512_shuffle_f32x4(ifft1983, ifft1983, 177));
__m512 ifft2076 = _mm512_fmadd_ps(ifft2068, ifft1990, _mm512_shuffle_f32x4(ifft2068, ifft2068, 177));
__m512 ifft1993 = _mm512_fmadd_ps(ifft1984, ifft1990, _mm512_shuffle_f32x4(ifft1984, ifft1984, 177));
__m512 ifft2077 = _mm512_fmadd_ps(ifft2069, ifft1990, _mm512_shuffle_f32x4(ifft2069, ifft2069, 177));
__m512 ifft1994 = _mm512_fmadd_ps(ifft1985, ifft1990, _mm512_shuffle_f32x4(ifft1985, ifft1985, 177));
__m512 ifft2078 = _mm512_fmadd_ps(ifft2070, ifft1990, _mm512_shuffle_f32x4(ifft2070, ifft2070, 177));
__m512 ifft1995 = _mm512_fmadd_ps(ifft1986, ifft1990, _mm512_shuffle_f32x4(ifft1986, ifft1986, 177));
__m512 ifft2079 = _mm512_fmadd_ps(ifft2071, ifft1990, _mm512_shuffle_f32x4(ifft2071, ifft2071, 177));
__m512 ifft1996 = _mm512_fnmsub_ps(ifft1987, ifft1990, _mm512_shuffle_f32x4(ifft1987, ifft1987, 177));
__m512 ifft2080 = _mm512_fnmsub_ps(ifft2072, ifft1990, _mm512_shuffle_f32x4(ifft2072, ifft2072, 177));
__m512 ifft1997 = _mm512_fmadd_ps(ifft1988, ifft1990, _mm512_shuffle_f32x4(ifft1988, ifft1988, 177));
__m512 ifft2081 = _mm512_fmadd_ps(ifft2073, ifft1990, _mm512_shuffle_f32x4(ifft2073, ifft2073, 177));
__m512 ifft1998 = _mm512_fmadd_ps(ifft1989, ifft1990, _mm512_shuffle_f32x4(ifft1989, ifft1989, 177));
__m512 ifft2082 = _mm512_fmadd_ps(ifft2074, ifft1990, _mm512_shuffle_f32x4(ifft2074, ifft2074, 177));
__m512 ifft1999 = _mm512_add_ps(ifft1991, ifft1992);
__m512 ifft2083 = _mm512_add_ps(ifft2075, ifft2076);
__m512 ifft2000 = _mm512_sub_ps(ifft1991, ifft1992);
__m512 ifft2084 = _mm512_sub_ps(ifft2075, ifft2076);
__m512 ifft2001 = _mm512_sub_ps(ifft1993, ifft1997);
__m512 ifft2085 = _mm512_sub_ps(ifft2077, ifft2081);
__m512 ifft2002 = _mm512_add_ps(ifft1994, ifft1998);
__m512 ifft2086 = _mm512_add_ps(ifft2078, ifft2082);
__m512 ifft2003 = _mm512_add_ps(ifft1993, ifft1997);
__m512 ifft2087 = _mm512_add_ps(ifft2077, ifft2081);
__m512 ifft2004 = _mm512_sub_ps(ifft1994, ifft1998);
__m512 ifft2088 = _mm512_sub_ps(ifft2078, ifft2082);
__m512 ifft2005 = _mm512_mul_ps(ifft1995, _mm512_set1_ps(3.125e-02f));
__m512 ifft2089 = _mm512_mul_ps(ifft2079, _mm512_set1_ps(3.125e-02f));
__m512 ifft2006 = _mm512_mul_ps(ifft1996, _mm512_set1_ps(3.125e-02f));
__m512 ifft2090 = _mm512_mul_ps(ifft2080, _mm512_set1_ps(3.125e-02f));
__m512 ifft2007 = _mm512_fmadd_ps(ifft1999, _mm512_set1_ps(1.5625e-02f), ifft2005);
__m512 ifft2091 = _mm512_fmadd_ps(ifft2083, _mm512_set1_ps(1.5625e-02f), ifft2089);
__m512 ifft2008 = _mm512_fmsub_ps(ifft1999, _mm512_set1_ps(1.5625e-02f), ifft2005);
__m512 ifft2092 = _mm512_fmsub_ps(ifft2083, _mm512_set1_ps(1.5625e-02f), ifft2089);
__m512 ifft2009 = _mm512_fmadd_ps(ifft2000, _mm512_set1_ps(1.5625e-02f), ifft2006);
__m512 ifft2093 = _mm512_fmadd_ps(ifft2084, _mm512_set1_ps(1.5625e-02f), ifft2090);
__m512 ifft2010 = _mm512_fmsub_ps(ifft2000, _mm512_set1_ps(1.5625e-02f), ifft2006);
__m512 ifft2094 = _mm512_fmsub_ps(ifft2084, _mm512_set1_ps(1.5625e-02f), ifft2090);
__m512 ifft2011 = _mm512_add_ps(ifft2001, ifft2002);
__m512 ifft2095 = _mm512_add_ps(ifft2085, ifft2086);
__m512 ifft2012 = _mm512_sub_ps(ifft2001, ifft2002);
__m512 ifft2096 = _mm512_sub_ps(ifft2085, ifft2086);
__m512 ifft2013 = _mm512_fnmadd_ps(ifft2011, _mm512_set1_ps(7.0710677e-01f), ifft2003);
__m512 ifft2097 = _mm512_fnmadd_ps(ifft2095, _mm512_set1_ps(7.0710677e-01f), ifft2087);
__m512 ifft2014 = _mm512_fmadd_ps(ifft2011, _mm512_set1_ps(7.0710677e-01f), ifft2003);
__m512 ifft2098 = _mm512_fmadd_ps(ifft2095, _mm512_set1_ps(7.0710677e-01f), ifft2087);
__m512 ifft2015 = _mm512_fmadd_ps(ifft2012, _mm512_set1_ps(7.0710677e-01f), ifft2004);
__m512 ifft2099 = _mm512_fmadd_ps(ifft2096, _mm512_set1_ps(7.0710677e-01f), ifft2088);
__m512 ifft2016 = _mm512_fmsub_ps(ifft2012, _mm512_set1_ps(7.0710677e-01f), ifft2004);
__m512 ifft2100 = _mm512_fmsub_ps(ifft2096, _mm512_set1_ps(7.0710677e-01f), ifft2088);
__m512 ifft2017 = _mm512_add_ps(ifft2013, ifft2014);
__m512 ifft2101 = _mm512_add_ps(ifft2097, ifft2098);
__m512 ifft2018 = _mm512_sub_ps(ifft2013, ifft2014);
__m512 ifft2102 = _mm512_sub_ps(ifft2097, ifft2098);
__m512 ifft2019 = _mm512_add_ps(ifft2015, ifft2016);
__m512 ifft2103 = _mm512_add_ps(ifft2099, ifft2100);
__m512 ifft2020 = _mm512_sub_ps(ifft2015, ifft2016);
__m512 ifft2104 = _mm512_sub_ps(ifft2099, ifft2100);
__m512 ifft2021 = _mm512_fmadd_ps(ifft2017, _mm512_set1_ps(1.5625e-02f), ifft2007);
__m512 ifft2105 = _mm512_fmadd_ps(ifft2101, _mm512_set1_ps(1.5625e-02f), ifft2091);
__m512 ifft2022 = _mm512_fnmadd_ps(ifft2017, _mm512_set1_ps(1.5625e-02f), ifft2007);
__m512 ifft2106 = _mm512_fnmadd_ps(ifft2101, _mm512_set1_ps(1.5625e-02f), ifft2091);
__m512 ifft2023 = _mm512_fmadd_ps(ifft2019, _mm512_set1_ps(1.5625e-02f), ifft2009);
__m512 ifft2107 = _mm512_fmadd_ps(ifft2103, _mm512_set1_ps(1.5625e-02f), ifft2093);
__m512 ifft2024 = _mm512_fnmadd_ps(ifft2019, _mm512_set1_ps(1.5625e-02f), ifft2009);
__m512 ifft2108 = _mm512_fnmadd_ps(ifft2103, _mm512_set1_ps(1.5625e-02f), ifft2093);
__m512 ifft2025 = _mm512_fnmadd_ps(ifft2020, _mm512_set1_ps(1.5625e-02f), ifft2008);
__m512 ifft2109 = _mm512_fnmadd_ps(ifft2104, _mm512_set1_ps(1.5625e-02f), ifft2092);
__m512 ifft2026 = _mm512_fmadd_ps(ifft2020, _mm512_set1_ps(1.5625e-02f), ifft2008);
__m512 ifft2110 = _mm512_fmadd_ps(ifft2104, _mm512_set1_ps(1.5625e-02f), ifft2092);
__m512 ifft2027 = _mm512_fmadd_ps(ifft2018, _mm512_set1_ps(1.5625e-02f), ifft2010);
__m512 ifft2111 = _mm512_fmadd_ps(ifft2102, _mm512_set1_ps(1.5625e-02f), ifft2094);
__m512 ifft2028 = _mm512_fnmadd_ps(ifft2018, _mm512_set1_ps(1.5625e-02f), ifft2010);
__m512 ifft2112 = _mm512_fnmadd_ps(ifft2102, _mm512_set1_ps(1.5625e-02f), ifft2094);
// Output stage for the two transform chains computed above. Five of the eight
// results of each chain are kept (ifft2021/2023/2025/2027/2022 and
// ifft2105/2107/2109/2111/2106); the other three per chain are discarded.
__m512 dat700 = ifft2021;
__m512 dat705 = ifft2105;
__m512 dat701 = ifft2023;
__m512 dat706 = ifft2107;
__m512 dat702 = ifft2025;
__m512 dat707 = ifft2109;
__m512 dat703 = ifft2027;
__m512 dat708 = ifft2111;
__m512 dat704 = ifft2022;
__m512 dat709 = ifft2106;
// Results that are not stored, cast to void (presumably to keep the compiler
// from warning about unused variables).
(void)ifft2024;
(void)ifft2108;
(void)ifft2026;
(void)ifft2110;
(void)ifft2028;
(void)ifft2112;
// Clamp every lane at zero from below: dat = max(0, dat).
dat700 = _mm512_max_ps(_mm512_setzero_ps(), dat700);
dat705 = _mm512_max_ps(_mm512_setzero_ps(), dat705);
dat701 = _mm512_max_ps(_mm512_setzero_ps(), dat701);
dat706 = _mm512_max_ps(_mm512_setzero_ps(), dat706);
dat702 = _mm512_max_ps(_mm512_setzero_ps(), dat702);
dat707 = _mm512_max_ps(_mm512_setzero_ps(), dat707);
dat703 = _mm512_max_ps(_mm512_setzero_ps(), dat703);
dat708 = _mm512_max_ps(_mm512_setzero_ps(), dat708);
dat704 = _mm512_max_ps(_mm512_setzero_ps(), dat704);
dat709 = _mm512_max_ps(_mm512_setzero_ps(), dat709);
// Masked stores into datPtr2; each vector is written in two pieces.
// dat700..dat704: mask 3 (lanes 0-1) and mask 7936 = 0x1F00 (lanes 8-12).
// dat705..dat709: mask 31 (lanes 0-4) and mask 768 = 0x300 (lanes 8-9).
// Lanes outside the mask leave memory untouched. Successive vectors of a
// group are 448 apart in the constant offset, the same value as the toH7
// coefficient. The t13 term has coefficient 0, so t13 does not affect the
// address.
_mm512_mask_storeu_ps(datPtr2+40+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 3, dat700);
_mm512_mask_storeu_ps(datPtr2+52048+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 7936, dat700);
_mm512_mask_storeu_ps(datPtr2+1840+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 31, dat705);
_mm512_mask_storeu_ps(datPtr2+50248+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 768, dat705);
_mm512_mask_storeu_ps(datPtr2+488+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 3, dat701);
_mm512_mask_storeu_ps(datPtr2+52496+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 7936, dat701);
_mm512_mask_storeu_ps(datPtr2+2288+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 31, dat706);
_mm512_mask_storeu_ps(datPtr2+50696+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 768, dat706);
_mm512_mask_storeu_ps(datPtr2+936+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 3, dat702);
_mm512_mask_storeu_ps(datPtr2+52944+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 7936, dat702);
_mm512_mask_storeu_ps(datPtr2+2736+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 31, dat707);
_mm512_mask_storeu_ps(datPtr2+51144+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 768, dat707);
_mm512_mask_storeu_ps(datPtr2+1384+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 3, dat703);
_mm512_mask_storeu_ps(datPtr2+53392+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 7936, dat703);
_mm512_mask_storeu_ps(datPtr2+3184+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 31, dat708);
_mm512_mask_storeu_ps(datPtr2+51592+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 768, dat708);
_mm512_mask_storeu_ps(datPtr2+1832+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 3, dat704);
_mm512_mask_storeu_ps(datPtr2+53840+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 7936, dat704);
_mm512_mask_storeu_ps(datPtr2+3632+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 31, dat709);
_mm512_mask_storeu_ps(datPtr2+52040+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 768, dat709);
// t14 is fixed at 0 in this section, so every 256*t14 term below is zero.
ptrdiff_t t14 = 0;
// Load sixteen 512-bit vectors from sfPtr3 with unaligned loads. They form
// four groups (suffixes 153/157, 154/158, 155/159, 156/160) whose constant
// offsets start at 512, 2167296, 4334080 and 6500864, i.e. 2166784 apart.
// Within a group the four vectors (Re, Im, Re, Im) are 64 apart.
// NOTE(review): the Re/Im names suggest real and imaginary parts, and a step
// of 64 would be exactly one 512-bit vector if sfPtr3 is a byte pointer; its
// declaration is not visible here - confirm.
__m512 sfRe153 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm153 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe157 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm157 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe154 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm154 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe158 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm158 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe155 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm155 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe159 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm159 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe156 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm156 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe160 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm160 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
// NOTE(review): the ifft* names suggest an inverse FFT; that reading comes
// from the names only - confirm against the code generator.
// Two independent chains are computed in lock step, one statement each:
// one fed by sfRe/sfIm153..156, the other by sfRe/sfIm157..160.
//
// Lane index vectors (_mm512_set_epi32 lists lane 15 first, lane 0 last).
// They are applied only to the first Re/Im pair of each chain (153 and 157).
__m512i ifft2113 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2114 = _mm512_permutexvar_ps(ifft2113, sfRe153);
__m512 ifft2205 = _mm512_permutexvar_ps(ifft2113, sfRe157);
__m512i ifft2115 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft2116 = _mm512_permutexvar_ps(ifft2115, sfRe153);
__m512 ifft2206 = _mm512_permutexvar_ps(ifft2115, sfRe157);
__m512 ifft2117 = _mm512_permutexvar_ps(ifft2113, sfIm153);
__m512 ifft2207 = _mm512_permutexvar_ps(ifft2113, sfIm157);
__m512 ifft2118 = _mm512_permutexvar_ps(ifft2115, sfIm153);
__m512 ifft2208 = _mm512_permutexvar_ps(ifft2115, sfIm157);
// Combine the permuted vectors under mask 65021 (0xFDFD: every lane except
// 1 and 9): a*sign + c for the fmadd, c - a*sign for the fnmadd. Lanes 1 and
// 9 keep the first operand unchanged.
__m512 ifft2119 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft2120 = _mm512_mask_fmadd_ps(ifft2118, 65021, ifft2119, ifft2114);
__m512 ifft2209 = _mm512_mask_fmadd_ps(ifft2208, 65021, ifft2119, ifft2205);
__m512 ifft2121 = _mm512_mask_fnmadd_ps(ifft2117, 65021, ifft2119, ifft2116);
__m512 ifft2210 = _mm512_mask_fnmadd_ps(ifft2207, 65021, ifft2119, ifft2206);
// Adjacent-lane butterfly: shuffle immediate 177 swaps the two lanes of every
// pair, and the +1/-1 pattern leaves the sum in the even lane and the
// difference (even minus odd) in the odd lane. The remaining three Re/Im
// pairs of each chain enter here directly from the loads.
__m512 ifft2122 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft2123 = _mm512_fmadd_ps(ifft2120, ifft2122, _mm512_shuffle_ps(ifft2120, ifft2120, 177));
__m512 ifft2211 = _mm512_fmadd_ps(ifft2209, ifft2122, _mm512_shuffle_ps(ifft2209, ifft2209, 177));
__m512 ifft2124 = _mm512_fmadd_ps(ifft2121, ifft2122, _mm512_shuffle_ps(ifft2121, ifft2121, 177));
__m512 ifft2212 = _mm512_fmadd_ps(ifft2210, ifft2122, _mm512_shuffle_ps(ifft2210, ifft2210, 177));
__m512 ifft2125 = _mm512_fmadd_ps(sfRe154, ifft2122, _mm512_shuffle_ps(sfRe154, sfRe154, 177));
__m512 ifft2213 = _mm512_fmadd_ps(sfRe158, ifft2122, _mm512_shuffle_ps(sfRe158, sfRe158, 177));
__m512 ifft2126 = _mm512_fmadd_ps(sfIm154, ifft2122, _mm512_shuffle_ps(sfIm154, sfIm154, 177));
__m512 ifft2214 = _mm512_fmadd_ps(sfIm158, ifft2122, _mm512_shuffle_ps(sfIm158, sfIm158, 177));
__m512 ifft2127 = _mm512_fmadd_ps(sfRe155, ifft2122, _mm512_shuffle_ps(sfRe155, sfRe155, 177));
__m512 ifft2215 = _mm512_fmadd_ps(sfRe159, ifft2122, _mm512_shuffle_ps(sfRe159, sfRe159, 177));
__m512 ifft2128 = _mm512_fmadd_ps(sfIm155, ifft2122, _mm512_shuffle_ps(sfIm155, sfIm155, 177));
__m512 ifft2216 = _mm512_fmadd_ps(sfIm159, ifft2122, _mm512_shuffle_ps(sfIm159, sfIm159, 177));
__m512 ifft2129 = _mm512_fmadd_ps(sfRe156, ifft2122, _mm512_shuffle_ps(sfRe156, sfRe156, 177));
__m512 ifft2217 = _mm512_fmadd_ps(sfRe160, ifft2122, _mm512_shuffle_ps(sfRe160, sfRe160, 177));
__m512 ifft2130 = _mm512_fmadd_ps(sfIm156, ifft2122, _mm512_shuffle_ps(sfIm156, sfIm156, 177));
__m512 ifft2218 = _mm512_fmadd_ps(sfIm160, ifft2122, _mm512_shuffle_ps(sfIm160, sfIm160, 177));
// Per-lane scaling of every vector by the constants in ifft2131.
__m512 ifft2131 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft2132 = _mm512_mul_ps(ifft2123, ifft2131);
__m512 ifft2219 = _mm512_mul_ps(ifft2211, ifft2131);
__m512 ifft2133 = _mm512_mul_ps(ifft2124, ifft2131);
__m512 ifft2220 = _mm512_mul_ps(ifft2212, ifft2131);
__m512 ifft2134 = _mm512_mul_ps(ifft2125, ifft2131);
__m512 ifft2221 = _mm512_mul_ps(ifft2213, ifft2131);
__m512 ifft2135 = _mm512_mul_ps(ifft2126, ifft2131);
__m512 ifft2222 = _mm512_mul_ps(ifft2214, ifft2131);
__m512 ifft2136 = _mm512_mul_ps(ifft2127, ifft2131);
__m512 ifft2223 = _mm512_mul_ps(ifft2215, ifft2131);
__m512 ifft2137 = _mm512_mul_ps(ifft2128, ifft2131);
__m512 ifft2224 = _mm512_mul_ps(ifft2216, ifft2131);
__m512 ifft2138 = _mm512_mul_ps(ifft2129, ifft2131);
__m512 ifft2225 = _mm512_mul_ps(ifft2217, ifft2131);
__m512 ifft2139 = _mm512_mul_ps(ifft2130, ifft2131);
__m512 ifft2226 = _mm512_mul_ps(ifft2218, ifft2131);
// Each consecutive pair (a, b) from the butterfly above becomes
// (a*ifft2131 - b*ifft2140, b*ifft2131 + a*ifft2140).
// NOTE(review): this has the shape of a per-lane complex multiplication by
// (ifft2131 + i*ifft2140) if a/b are real/imaginary parts - confirm.
__m512 ifft2140 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft2141 = _mm512_fnmadd_ps(ifft2124, ifft2140, ifft2132);
__m512 ifft2227 = _mm512_fnmadd_ps(ifft2212, ifft2140, ifft2219);
__m512 ifft2142 = _mm512_fmadd_ps(ifft2123, ifft2140, ifft2133);
__m512 ifft2228 = _mm512_fmadd_ps(ifft2211, ifft2140, ifft2220);
__m512 ifft2143 = _mm512_fnmadd_ps(ifft2126, ifft2140, ifft2134);
__m512 ifft2229 = _mm512_fnmadd_ps(ifft2214, ifft2140, ifft2221);
__m512 ifft2144 = _mm512_fmadd_ps(ifft2125, ifft2140, ifft2135);
__m512 ifft2230 = _mm512_fmadd_ps(ifft2213, ifft2140, ifft2222);
__m512 ifft2145 = _mm512_fnmadd_ps(ifft2128, ifft2140, ifft2136);
__m512 ifft2231 = _mm512_fnmadd_ps(ifft2216, ifft2140, ifft2223);
__m512 ifft2146 = _mm512_fmadd_ps(ifft2127, ifft2140, ifft2137);
__m512 ifft2232 = _mm512_fmadd_ps(ifft2215, ifft2140, ifft2224);
__m512 ifft2147 = _mm512_fnmadd_ps(ifft2130, ifft2140, ifft2138);
__m512 ifft2233 = _mm512_fnmadd_ps(ifft2218, ifft2140, ifft2225);
__m512 ifft2148 = _mm512_fmadd_ps(ifft2129, ifft2140, ifft2139);
__m512 ifft2234 = _mm512_fmadd_ps(ifft2217, ifft2140, ifft2226);
// Butterfly at lane distance 2: shuffle immediate 78 swaps the two 2-lane
// halves of every 4-lane group. With signs (+,+,-,-) the lower half of each
// group holds x[i]+x[i+2] and the upper half holds x[i-2]-x[i].
__m512 ifft2149 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft2150 = _mm512_fmadd_ps(ifft2141, ifft2149, _mm512_shuffle_ps(ifft2141, ifft2141, 78));
__m512 ifft2235 = _mm512_fmadd_ps(ifft2227, ifft2149, _mm512_shuffle_ps(ifft2227, ifft2227, 78));
__m512 ifft2151 = _mm512_fmadd_ps(ifft2142, ifft2149, _mm512_shuffle_ps(ifft2142, ifft2142, 78));
__m512 ifft2236 = _mm512_fmadd_ps(ifft2228, ifft2149, _mm512_shuffle_ps(ifft2228, ifft2228, 78));
__m512 ifft2152 = _mm512_fmadd_ps(ifft2143, ifft2149, _mm512_shuffle_ps(ifft2143, ifft2143, 78));
__m512 ifft2237 = _mm512_fmadd_ps(ifft2229, ifft2149, _mm512_shuffle_ps(ifft2229, ifft2229, 78));
__m512 ifft2153 = _mm512_fmadd_ps(ifft2144, ifft2149, _mm512_shuffle_ps(ifft2144, ifft2144, 78));
__m512 ifft2238 = _mm512_fmadd_ps(ifft2230, ifft2149, _mm512_shuffle_ps(ifft2230, ifft2230, 78));
__m512 ifft2154 = _mm512_fmadd_ps(ifft2145, ifft2149, _mm512_shuffle_ps(ifft2145, ifft2145, 78));
__m512 ifft2239 = _mm512_fmadd_ps(ifft2231, ifft2149, _mm512_shuffle_ps(ifft2231, ifft2231, 78));
__m512 ifft2155 = _mm512_fmadd_ps(ifft2146, ifft2149, _mm512_shuffle_ps(ifft2146, ifft2146, 78));
__m512 ifft2240 = _mm512_fmadd_ps(ifft2232, ifft2149, _mm512_shuffle_ps(ifft2232, ifft2232, 78));
__m512 ifft2156 = _mm512_fmadd_ps(ifft2147, ifft2149, _mm512_shuffle_ps(ifft2147, ifft2147, 78));
__m512 ifft2241 = _mm512_fmadd_ps(ifft2233, ifft2149, _mm512_shuffle_ps(ifft2233, ifft2233, 78));
__m512 ifft2157 = _mm512_fmadd_ps(ifft2148, ifft2149, _mm512_shuffle_ps(ifft2148, ifft2148, 78));
__m512 ifft2242 = _mm512_fmadd_ps(ifft2234, ifft2149, _mm512_shuffle_ps(ifft2234, ifft2234, 78));
// In lanes 6, 7, 14 and 15 (mask 49344 = 0xC0C0) each pair (a, b) is replaced
// by (-b, a); all other lanes pass through unchanged.
__m512 ifft2158 = _mm512_mask_sub_ps(ifft2150, 49344, _mm512_setzero_ps(), ifft2151);
__m512 ifft2243 = _mm512_mask_sub_ps(ifft2235, 49344, _mm512_setzero_ps(), ifft2236);
__m512 ifft2159 = _mm512_mask_mov_ps(ifft2151, 49344, ifft2150);
__m512 ifft2244 = _mm512_mask_mov_ps(ifft2236, 49344, ifft2235);
__m512 ifft2160 = _mm512_mask_sub_ps(ifft2152, 49344, _mm512_setzero_ps(), ifft2153);
__m512 ifft2245 = _mm512_mask_sub_ps(ifft2237, 49344, _mm512_setzero_ps(), ifft2238);
__m512 ifft2161 = _mm512_mask_mov_ps(ifft2153, 49344, ifft2152);
__m512 ifft2246 = _mm512_mask_mov_ps(ifft2238, 49344, ifft2237);
__m512 ifft2162 = _mm512_mask_sub_ps(ifft2154, 49344, _mm512_setzero_ps(), ifft2155);
__m512 ifft2247 = _mm512_mask_sub_ps(ifft2239, 49344, _mm512_setzero_ps(), ifft2240);
__m512 ifft2163 = _mm512_mask_mov_ps(ifft2155, 49344, ifft2154);
__m512 ifft2248 = _mm512_mask_mov_ps(ifft2240, 49344, ifft2239);
__m512 ifft2164 = _mm512_mask_sub_ps(ifft2156, 49344, _mm512_setzero_ps(), ifft2157);
__m512 ifft2249 = _mm512_mask_sub_ps(ifft2241, 49344, _mm512_setzero_ps(), ifft2242);
__m512 ifft2165 = _mm512_mask_mov_ps(ifft2157, 49344, ifft2156);
__m512 ifft2250 = _mm512_mask_mov_ps(ifft2242, 49344, ifft2241);
// Butterfly at lane distance 4: _mm512_shuffle_f32x4 with immediate 177 swaps
// neighbouring 4-lane blocks. The sign pattern puts the sum in lanes 0-3 and
// 8-11 and the difference in lanes 4-7 and 12-15. ifft2172 and ifft2256 use
// fnmsub, which yields the negation of what fmadd would give.
__m512 ifft2166 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft2167 = _mm512_fmadd_ps(ifft2158, ifft2166, _mm512_shuffle_f32x4(ifft2158, ifft2158, 177));
__m512 ifft2251 = _mm512_fmadd_ps(ifft2243, ifft2166, _mm512_shuffle_f32x4(ifft2243, ifft2243, 177));
__m512 ifft2168 = _mm512_fmadd_ps(ifft2159, ifft2166, _mm512_shuffle_f32x4(ifft2159, ifft2159, 177));
__m512 ifft2252 = _mm512_fmadd_ps(ifft2244, ifft2166, _mm512_shuffle_f32x4(ifft2244, ifft2244, 177));
__m512 ifft2169 = _mm512_fmadd_ps(ifft2160, ifft2166, _mm512_shuffle_f32x4(ifft2160, ifft2160, 177));
__m512 ifft2253 = _mm512_fmadd_ps(ifft2245, ifft2166, _mm512_shuffle_f32x4(ifft2245, ifft2245, 177));
__m512 ifft2170 = _mm512_fmadd_ps(ifft2161, ifft2166, _mm512_shuffle_f32x4(ifft2161, ifft2161, 177));
__m512 ifft2254 = _mm512_fmadd_ps(ifft2246, ifft2166, _mm512_shuffle_f32x4(ifft2246, ifft2246, 177));
__m512 ifft2171 = _mm512_fmadd_ps(ifft2162, ifft2166, _mm512_shuffle_f32x4(ifft2162, ifft2162, 177));
__m512 ifft2255 = _mm512_fmadd_ps(ifft2247, ifft2166, _mm512_shuffle_f32x4(ifft2247, ifft2247, 177));
__m512 ifft2172 = _mm512_fnmsub_ps(ifft2163, ifft2166, _mm512_shuffle_f32x4(ifft2163, ifft2163, 177));
__m512 ifft2256 = _mm512_fnmsub_ps(ifft2248, ifft2166, _mm512_shuffle_f32x4(ifft2248, ifft2248, 177));
__m512 ifft2173 = _mm512_fmadd_ps(ifft2164, ifft2166, _mm512_shuffle_f32x4(ifft2164, ifft2164, 177));
__m512 ifft2257 = _mm512_fmadd_ps(ifft2249, ifft2166, _mm512_shuffle_f32x4(ifft2249, ifft2249, 177));
__m512 ifft2174 = _mm512_fmadd_ps(ifft2165, ifft2166, _mm512_shuffle_f32x4(ifft2165, ifft2165, 177));
__m512 ifft2258 = _mm512_fmadd_ps(ifft2250, ifft2166, _mm512_shuffle_f32x4(ifft2250, ifft2250, 177));
// From here on whole vectors are combined lane by lane (no more shuffles):
// sums and differences across the eight vectors of each chain.
__m512 ifft2175 = _mm512_add_ps(ifft2167, ifft2168);
__m512 ifft2259 = _mm512_add_ps(ifft2251, ifft2252);
__m512 ifft2176 = _mm512_sub_ps(ifft2167, ifft2168);
__m512 ifft2260 = _mm512_sub_ps(ifft2251, ifft2252);
__m512 ifft2177 = _mm512_sub_ps(ifft2169, ifft2173);
__m512 ifft2261 = _mm512_sub_ps(ifft2253, ifft2257);
__m512 ifft2178 = _mm512_add_ps(ifft2170, ifft2174);
__m512 ifft2262 = _mm512_add_ps(ifft2254, ifft2258);
__m512 ifft2179 = _mm512_add_ps(ifft2169, ifft2173);
__m512 ifft2263 = _mm512_add_ps(ifft2253, ifft2257);
__m512 ifft2180 = _mm512_sub_ps(ifft2170, ifft2174);
__m512 ifft2264 = _mm512_sub_ps(ifft2254, ifft2258);
// Scaling is folded into these steps: 3.125e-02 is 1/32 and 1.5625e-02 is 1/64.
__m512 ifft2181 = _mm512_mul_ps(ifft2171, _mm512_set1_ps(3.125e-02f));
__m512 ifft2265 = _mm512_mul_ps(ifft2255, _mm512_set1_ps(3.125e-02f));
__m512 ifft2182 = _mm512_mul_ps(ifft2172, _mm512_set1_ps(3.125e-02f));
__m512 ifft2266 = _mm512_mul_ps(ifft2256, _mm512_set1_ps(3.125e-02f));
__m512 ifft2183 = _mm512_fmadd_ps(ifft2175, _mm512_set1_ps(1.5625e-02f), ifft2181);
__m512 ifft2267 = _mm512_fmadd_ps(ifft2259, _mm512_set1_ps(1.5625e-02f), ifft2265);
__m512 ifft2184 = _mm512_fmsub_ps(ifft2175, _mm512_set1_ps(1.5625e-02f), ifft2181);
__m512 ifft2268 = _mm512_fmsub_ps(ifft2259, _mm512_set1_ps(1.5625e-02f), ifft2265);
__m512 ifft2185 = _mm512_fmadd_ps(ifft2176, _mm512_set1_ps(1.5625e-02f), ifft2182);
__m512 ifft2269 = _mm512_fmadd_ps(ifft2260, _mm512_set1_ps(1.5625e-02f), ifft2266);
__m512 ifft2186 = _mm512_fmsub_ps(ifft2176, _mm512_set1_ps(1.5625e-02f), ifft2182);
__m512 ifft2270 = _mm512_fmsub_ps(ifft2260, _mm512_set1_ps(1.5625e-02f), ifft2266);
__m512 ifft2187 = _mm512_add_ps(ifft2177, ifft2178);
__m512 ifft2271 = _mm512_add_ps(ifft2261, ifft2262);
__m512 ifft2188 = _mm512_sub_ps(ifft2177, ifft2178);
__m512 ifft2272 = _mm512_sub_ps(ifft2261, ifft2262);
// Terms weighted by 7.0710677e-01 are added to or subtracted from the
// unweighted sums and differences computed above.
__m512 ifft2189 = _mm512_fnmadd_ps(ifft2187, _mm512_set1_ps(7.0710677e-01f), ifft2179);
__m512 ifft2273 = _mm512_fnmadd_ps(ifft2271, _mm512_set1_ps(7.0710677e-01f), ifft2263);
__m512 ifft2190 = _mm512_fmadd_ps(ifft2187, _mm512_set1_ps(7.0710677e-01f), ifft2179);
__m512 ifft2274 = _mm512_fmadd_ps(ifft2271, _mm512_set1_ps(7.0710677e-01f), ifft2263);
__m512 ifft2191 = _mm512_fmadd_ps(ifft2188, _mm512_set1_ps(7.0710677e-01f), ifft2180);
__m512 ifft2275 = _mm512_fmadd_ps(ifft2272, _mm512_set1_ps(7.0710677e-01f), ifft2264);
__m512 ifft2192 = _mm512_fmsub_ps(ifft2188, _mm512_set1_ps(7.0710677e-01f), ifft2180);
__m512 ifft2276 = _mm512_fmsub_ps(ifft2272, _mm512_set1_ps(7.0710677e-01f), ifft2264);
__m512 ifft2193 = _mm512_add_ps(ifft2189, ifft2190);
__m512 ifft2277 = _mm512_add_ps(ifft2273, ifft2274);
__m512 ifft2194 = _mm512_sub_ps(ifft2189, ifft2190);
__m512 ifft2278 = _mm512_sub_ps(ifft2273, ifft2274);
__m512 ifft2195 = _mm512_add_ps(ifft2191, ifft2192);
__m512 ifft2279 = _mm512_add_ps(ifft2275, ifft2276);
__m512 ifft2196 = _mm512_sub_ps(ifft2191, ifft2192);
__m512 ifft2280 = _mm512_sub_ps(ifft2275, ifft2276);
// Final combination: eight result vectors per chain, each the 1/64-scaled
// partial added to or subtracted from one of the earlier scaled terms.
__m512 ifft2197 = _mm512_fmadd_ps(ifft2193, _mm512_set1_ps(1.5625e-02f), ifft2183);
__m512 ifft2281 = _mm512_fmadd_ps(ifft2277, _mm512_set1_ps(1.5625e-02f), ifft2267);
__m512 ifft2198 = _mm512_fnmadd_ps(ifft2193, _mm512_set1_ps(1.5625e-02f), ifft2183);
__m512 ifft2282 = _mm512_fnmadd_ps(ifft2277, _mm512_set1_ps(1.5625e-02f), ifft2267);
__m512 ifft2199 = _mm512_fmadd_ps(ifft2195, _mm512_set1_ps(1.5625e-02f), ifft2185);
__m512 ifft2283 = _mm512_fmadd_ps(ifft2279, _mm512_set1_ps(1.5625e-02f), ifft2269);
__m512 ifft2200 = _mm512_fnmadd_ps(ifft2195, _mm512_set1_ps(1.5625e-02f), ifft2185);
__m512 ifft2284 = _mm512_fnmadd_ps(ifft2279, _mm512_set1_ps(1.5625e-02f), ifft2269);
__m512 ifft2201 = _mm512_fnmadd_ps(ifft2196, _mm512_set1_ps(1.5625e-02f), ifft2184);
__m512 ifft2285 = _mm512_fnmadd_ps(ifft2280, _mm512_set1_ps(1.5625e-02f), ifft2268);
__m512 ifft2202 = _mm512_fmadd_ps(ifft2196, _mm512_set1_ps(1.5625e-02f), ifft2184);
__m512 ifft2286 = _mm512_fmadd_ps(ifft2280, _mm512_set1_ps(1.5625e-02f), ifft2268);
__m512 ifft2203 = _mm512_fmadd_ps(ifft2194, _mm512_set1_ps(1.5625e-02f), ifft2186);
__m512 ifft2287 = _mm512_fmadd_ps(ifft2278, _mm512_set1_ps(1.5625e-02f), ifft2270);
__m512 ifft2204 = _mm512_fnmadd_ps(ifft2194, _mm512_set1_ps(1.5625e-02f), ifft2186);
__m512 ifft2288 = _mm512_fnmadd_ps(ifft2278, _mm512_set1_ps(1.5625e-02f), ifft2270);
// Keep five results per chain for the output stage that follows.
__m512 dat710 = ifft2197;
__m512 dat715 = ifft2281;
__m512 dat711 = ifft2199;
__m512 dat716 = ifft2283;
__m512 dat712 = ifft2201;
__m512 dat717 = ifft2285;
__m512 dat713 = ifft2203;
__m512 dat718 = ifft2287;
__m512 dat714 = ifft2198;
__m512 dat719 = ifft2282;
// The other three results per chain are not used; cast to void (presumably to
// avoid unused-variable warnings).
(void)ifft2200;
(void)ifft2284;
(void)ifft2202;
(void)ifft2286;
(void)ifft2204;
(void)ifft2288;
// Merge each pair of result vectors (dat71x from the first chain, dat71y from
// the second) into vectors whose low ten lanes are the ones stored below.
// In _mm512_permutex2var_ps(a, idx, b), indices 0-15 pick from a and 16-31
// from b; _mm512_set_epi32 lists lane 15 first, lane 0 last.
// pm21: lanes 0-4 <- a[0..4], lanes 5-9 <- b[0..4].
__m512i pm21 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack101 = _mm512_permutex2var_ps(dat710, pm21, dat715);
// pm22: lanes 0-4 <- b[8..12], lanes 5-9 <- a[8..12].
__m512i pm22 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack102 = _mm512_permutex2var_ps(dat710, pm22, dat715);
__m512 pack103 = _mm512_permutex2var_ps(dat711, pm21, dat716);
__m512 pack104 = _mm512_permutex2var_ps(dat711, pm22, dat716);
__m512 pack105 = _mm512_permutex2var_ps(dat712, pm21, dat717);
__m512 pack106 = _mm512_permutex2var_ps(dat712, pm22, dat717);
__m512 pack107 = _mm512_permutex2var_ps(dat713, pm21, dat718);
__m512 pack108 = _mm512_permutex2var_ps(dat713, pm22, dat718);
__m512 pack109 = _mm512_permutex2var_ps(dat714, pm21, dat719);
__m512 pack110 = _mm512_permutex2var_ps(dat714, pm22, dat719);
// Clamp every lane at zero from below: pack = max(0, pack).
pack101 = _mm512_max_ps(_mm512_setzero_ps(), pack101);
pack102 = _mm512_max_ps(_mm512_setzero_ps(), pack102);
pack103 = _mm512_max_ps(_mm512_setzero_ps(), pack103);
pack104 = _mm512_max_ps(_mm512_setzero_ps(), pack104);
pack105 = _mm512_max_ps(_mm512_setzero_ps(), pack105);
pack106 = _mm512_max_ps(_mm512_setzero_ps(), pack106);
pack107 = _mm512_max_ps(_mm512_setzero_ps(), pack107);
pack108 = _mm512_max_ps(_mm512_setzero_ps(), pack108);
pack109 = _mm512_max_ps(_mm512_setzero_ps(), pack109);
pack110 = _mm512_max_ps(_mm512_setzero_ps(), pack110);
// Store lanes 0-9 of each packed vector (mask 1023 = 0x3FF) into datPtr2;
// lanes 10-15 are never written. The pm21 and pm22 results of one pair go to
// constant offsets 50240 apart, and successive pairs are 448 apart (the same
// value as the toH7 coefficient). t14 is 0 here, so the 40*t14 term vanishes.
_mm512_mask_storeu_ps(datPtr2+1860+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack101);
_mm512_mask_storeu_ps(datPtr2+52100+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack102);
_mm512_mask_storeu_ps(datPtr2+2308+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack103);
_mm512_mask_storeu_ps(datPtr2+52548+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack104);
_mm512_mask_storeu_ps(datPtr2+2756+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack105);
_mm512_mask_storeu_ps(datPtr2+52996+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack106);
_mm512_mask_storeu_ps(datPtr2+3204+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack107);
_mm512_mask_storeu_ps(datPtr2+53444+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack108);
_mm512_mask_storeu_ps(datPtr2+3652+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack109);
_mm512_mask_storeu_ps(datPtr2+53892+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack110);
}
}
if (j5 >= last2) return;
++j5;
rel5 = 8;
}
// Tile pass over j5 in [current j5, jj14]. For every (k32, r9, t15) it loads
// sixteen 512-bit vectors from sfPtr3, runs the generated "ifft" arithmetic,
// clamps the result at zero and writes 10-lane rows to datPtr2.
// toH8 and toW8 enter the store address scaled by 448 and 4 respectively
// (presumably an output row/column position -- confirm against the code that
// sets base5 and rel5, which is outside this block).
ptrdiff_t toH8 = base5+10;
ptrdiff_t toW8 = -225+30*rel5;
ptrdiff_t jj14 = 10-rel5+j5;
// j5 is advanced at the bottom of the body; toW8 moves by 30 per iteration.
for (; j5 <= jj14; toW8 += 30) {
// k32 starts at 16*w21 and runs until it equals 16.
ptrdiff_t k32 = 16*w21;
for (; k32 != 16; ++k32) {
// r9 takes the values 0 and 1.
ptrdiff_t r9 = 0;
for (; r9 != 2; ++r9) {
// t15 takes the values 0, 1 and 2.
ptrdiff_t t15 = 0;
for (; t15 < 3; ++t15) {
// Sixteen unaligned loads from sfPtr3, in four groups whose base offsets are
// 2166784 apart. The Re/Im naming suggests real and imaginary parts of
// frequency-domain data -- assumption, confirm with the producer of sfPtr3.
__m512 sfRe161 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm161 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe165 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm165 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe162 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm162 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe166 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm166 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe163 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm163 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe167 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm167 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe164 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm164 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe168 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm168 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
// Only the first group (sfRe161/sfIm161 and sfRe165/sfIm165) goes through this
// lane rearrangement and masked combine; the other groups are used as loaded.
__m512i ifft2289 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2290 = _mm512_permutexvar_ps(ifft2289, sfRe161);
__m512 ifft2381 = _mm512_permutexvar_ps(ifft2289, sfRe165);
__m512i ifft2291 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft2292 = _mm512_permutexvar_ps(ifft2291, sfRe161);
__m512 ifft2382 = _mm512_permutexvar_ps(ifft2291, sfRe165);
__m512 ifft2293 = _mm512_permutexvar_ps(ifft2289, sfIm161);
__m512 ifft2383 = _mm512_permutexvar_ps(ifft2289, sfIm165);
__m512 ifft2294 = _mm512_permutexvar_ps(ifft2291, sfIm161);
__m512 ifft2384 = _mm512_permutexvar_ps(ifft2291, sfIm165);
// Mask 65021 (0xFDFD) selects every lane except 1 and 9; those two lanes keep
// the first operand of the masked fmadd/fnmadd unchanged.
__m512 ifft2295 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft2296 = _mm512_mask_fmadd_ps(ifft2294, 65021, ifft2295, ifft2290);
__m512 ifft2385 = _mm512_mask_fmadd_ps(ifft2384, 65021, ifft2295, ifft2381);
__m512 ifft2297 = _mm512_mask_fnmadd_ps(ifft2293, 65021, ifft2295, ifft2292);
__m512 ifft2386 = _mm512_mask_fnmadd_ps(ifft2383, 65021, ifft2295, ifft2382);
// Distance-1 butterfly: shuffle immediate 177 swaps adjacent lanes, and the
// +1/-1 pattern (lane 0 = +1) leaves the sum in even lanes and the difference
// in odd lanes.
__m512 ifft2298 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft2299 = _mm512_fmadd_ps(ifft2296, ifft2298, _mm512_shuffle_ps(ifft2296, ifft2296, 177));
__m512 ifft2387 = _mm512_fmadd_ps(ifft2385, ifft2298, _mm512_shuffle_ps(ifft2385, ifft2385, 177));
__m512 ifft2300 = _mm512_fmadd_ps(ifft2297, ifft2298, _mm512_shuffle_ps(ifft2297, ifft2297, 177));
__m512 ifft2388 = _mm512_fmadd_ps(ifft2386, ifft2298, _mm512_shuffle_ps(ifft2386, ifft2386, 177));
__m512 ifft2301 = _mm512_fmadd_ps(sfRe162, ifft2298, _mm512_shuffle_ps(sfRe162, sfRe162, 177));
__m512 ifft2389 = _mm512_fmadd_ps(sfRe166, ifft2298, _mm512_shuffle_ps(sfRe166, sfRe166, 177));
__m512 ifft2302 = _mm512_fmadd_ps(sfIm162, ifft2298, _mm512_shuffle_ps(sfIm162, sfIm162, 177));
__m512 ifft2390 = _mm512_fmadd_ps(sfIm166, ifft2298, _mm512_shuffle_ps(sfIm166, sfIm166, 177));
__m512 ifft2303 = _mm512_fmadd_ps(sfRe163, ifft2298, _mm512_shuffle_ps(sfRe163, sfRe163, 177));
__m512 ifft2391 = _mm512_fmadd_ps(sfRe167, ifft2298, _mm512_shuffle_ps(sfRe167, sfRe167, 177));
__m512 ifft2304 = _mm512_fmadd_ps(sfIm163, ifft2298, _mm512_shuffle_ps(sfIm163, sfIm163, 177));
__m512 ifft2392 = _mm512_fmadd_ps(sfIm167, ifft2298, _mm512_shuffle_ps(sfIm167, sfIm167, 177));
__m512 ifft2305 = _mm512_fmadd_ps(sfRe164, ifft2298, _mm512_shuffle_ps(sfRe164, sfRe164, 177));
__m512 ifft2393 = _mm512_fmadd_ps(sfRe168, ifft2298, _mm512_shuffle_ps(sfRe168, sfRe168, 177));
__m512 ifft2306 = _mm512_fmadd_ps(sfIm164, ifft2298, _mm512_shuffle_ps(sfIm164, sfIm164, 177));
__m512 ifft2394 = _mm512_fmadd_ps(sfIm168, ifft2298, _mm512_shuffle_ps(sfIm168, sfIm168, 177));
// Per-lane complex multiply of each (re, im) pair by (c, s), with c taken from
// ifft2307 and s from ifft2316: re' = re*c - im*s, im' = im*c + re*s.
// The products with c are formed first, the s terms are folded in below.
__m512 ifft2307 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft2308 = _mm512_mul_ps(ifft2299, ifft2307);
__m512 ifft2395 = _mm512_mul_ps(ifft2387, ifft2307);
__m512 ifft2309 = _mm512_mul_ps(ifft2300, ifft2307);
__m512 ifft2396 = _mm512_mul_ps(ifft2388, ifft2307);
__m512 ifft2310 = _mm512_mul_ps(ifft2301, ifft2307);
__m512 ifft2397 = _mm512_mul_ps(ifft2389, ifft2307);
__m512 ifft2311 = _mm512_mul_ps(ifft2302, ifft2307);
__m512 ifft2398 = _mm512_mul_ps(ifft2390, ifft2307);
__m512 ifft2312 = _mm512_mul_ps(ifft2303, ifft2307);
__m512 ifft2399 = _mm512_mul_ps(ifft2391, ifft2307);
__m512 ifft2313 = _mm512_mul_ps(ifft2304, ifft2307);
__m512 ifft2400 = _mm512_mul_ps(ifft2392, ifft2307);
__m512 ifft2314 = _mm512_mul_ps(ifft2305, ifft2307);
__m512 ifft2401 = _mm512_mul_ps(ifft2393, ifft2307);
__m512 ifft2315 = _mm512_mul_ps(ifft2306, ifft2307);
__m512 ifft2402 = _mm512_mul_ps(ifft2394, ifft2307);
__m512 ifft2316 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft2317 = _mm512_fnmadd_ps(ifft2300, ifft2316, ifft2308);
__m512 ifft2403 = _mm512_fnmadd_ps(ifft2388, ifft2316, ifft2395);
__m512 ifft2318 = _mm512_fmadd_ps(ifft2299, ifft2316, ifft2309);
__m512 ifft2404 = _mm512_fmadd_ps(ifft2387, ifft2316, ifft2396);
__m512 ifft2319 = _mm512_fnmadd_ps(ifft2302, ifft2316, ifft2310);
__m512 ifft2405 = _mm512_fnmadd_ps(ifft2390, ifft2316, ifft2397);
__m512 ifft2320 = _mm512_fmadd_ps(ifft2301, ifft2316, ifft2311);
__m512 ifft2406 = _mm512_fmadd_ps(ifft2389, ifft2316, ifft2398);
__m512 ifft2321 = _mm512_fnmadd_ps(ifft2304, ifft2316, ifft2312);
__m512 ifft2407 = _mm512_fnmadd_ps(ifft2392, ifft2316, ifft2399);
__m512 ifft2322 = _mm512_fmadd_ps(ifft2303, ifft2316, ifft2313);
__m512 ifft2408 = _mm512_fmadd_ps(ifft2391, ifft2316, ifft2400);
__m512 ifft2323 = _mm512_fnmadd_ps(ifft2306, ifft2316, ifft2314);
__m512 ifft2409 = _mm512_fnmadd_ps(ifft2394, ifft2316, ifft2401);
__m512 ifft2324 = _mm512_fmadd_ps(ifft2305, ifft2316, ifft2315);
__m512 ifft2410 = _mm512_fmadd_ps(ifft2393, ifft2316, ifft2402);
// Distance-2 butterfly: shuffle immediate 78 swaps the two 64-bit halves of
// every 128-bit group; the sign pattern is +,+,-,- from lane 0 upward.
__m512 ifft2325 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft2326 = _mm512_fmadd_ps(ifft2317, ifft2325, _mm512_shuffle_ps(ifft2317, ifft2317, 78));
__m512 ifft2411 = _mm512_fmadd_ps(ifft2403, ifft2325, _mm512_shuffle_ps(ifft2403, ifft2403, 78));
__m512 ifft2327 = _mm512_fmadd_ps(ifft2318, ifft2325, _mm512_shuffle_ps(ifft2318, ifft2318, 78));
__m512 ifft2412 = _mm512_fmadd_ps(ifft2404, ifft2325, _mm512_shuffle_ps(ifft2404, ifft2404, 78));
__m512 ifft2328 = _mm512_fmadd_ps(ifft2319, ifft2325, _mm512_shuffle_ps(ifft2319, ifft2319, 78));
__m512 ifft2413 = _mm512_fmadd_ps(ifft2405, ifft2325, _mm512_shuffle_ps(ifft2405, ifft2405, 78));
__m512 ifft2329 = _mm512_fmadd_ps(ifft2320, ifft2325, _mm512_shuffle_ps(ifft2320, ifft2320, 78));
__m512 ifft2414 = _mm512_fmadd_ps(ifft2406, ifft2325, _mm512_shuffle_ps(ifft2406, ifft2406, 78));
__m512 ifft2330 = _mm512_fmadd_ps(ifft2321, ifft2325, _mm512_shuffle_ps(ifft2321, ifft2321, 78));
__m512 ifft2415 = _mm512_fmadd_ps(ifft2407, ifft2325, _mm512_shuffle_ps(ifft2407, ifft2407, 78));
__m512 ifft2331 = _mm512_fmadd_ps(ifft2322, ifft2325, _mm512_shuffle_ps(ifft2322, ifft2322, 78));
__m512 ifft2416 = _mm512_fmadd_ps(ifft2408, ifft2325, _mm512_shuffle_ps(ifft2408, ifft2408, 78));
__m512 ifft2332 = _mm512_fmadd_ps(ifft2323, ifft2325, _mm512_shuffle_ps(ifft2323, ifft2323, 78));
__m512 ifft2417 = _mm512_fmadd_ps(ifft2409, ifft2325, _mm512_shuffle_ps(ifft2409, ifft2409, 78));
__m512 ifft2333 = _mm512_fmadd_ps(ifft2324, ifft2325, _mm512_shuffle_ps(ifft2324, ifft2324, 78));
__m512 ifft2418 = _mm512_fmadd_ps(ifft2410, ifft2325, _mm512_shuffle_ps(ifft2410, ifft2410, 78));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes each pair
// (a, b) becomes (-b, a); all other lanes pass through unchanged.
__m512 ifft2334 = _mm512_mask_sub_ps(ifft2326, 49344, _mm512_setzero_ps(), ifft2327);
__m512 ifft2419 = _mm512_mask_sub_ps(ifft2411, 49344, _mm512_setzero_ps(), ifft2412);
__m512 ifft2335 = _mm512_mask_mov_ps(ifft2327, 49344, ifft2326);
__m512 ifft2420 = _mm512_mask_mov_ps(ifft2412, 49344, ifft2411);
__m512 ifft2336 = _mm512_mask_sub_ps(ifft2328, 49344, _mm512_setzero_ps(), ifft2329);
__m512 ifft2421 = _mm512_mask_sub_ps(ifft2413, 49344, _mm512_setzero_ps(), ifft2414);
__m512 ifft2337 = _mm512_mask_mov_ps(ifft2329, 49344, ifft2328);
__m512 ifft2422 = _mm512_mask_mov_ps(ifft2414, 49344, ifft2413);
__m512 ifft2338 = _mm512_mask_sub_ps(ifft2330, 49344, _mm512_setzero_ps(), ifft2331);
__m512 ifft2423 = _mm512_mask_sub_ps(ifft2415, 49344, _mm512_setzero_ps(), ifft2416);
__m512 ifft2339 = _mm512_mask_mov_ps(ifft2331, 49344, ifft2330);
__m512 ifft2424 = _mm512_mask_mov_ps(ifft2416, 49344, ifft2415);
__m512 ifft2340 = _mm512_mask_sub_ps(ifft2332, 49344, _mm512_setzero_ps(), ifft2333);
__m512 ifft2425 = _mm512_mask_sub_ps(ifft2417, 49344, _mm512_setzero_ps(), ifft2418);
__m512 ifft2341 = _mm512_mask_mov_ps(ifft2333, 49344, ifft2332);
__m512 ifft2426 = _mm512_mask_mov_ps(ifft2418, 49344, ifft2417);
// Distance-4 butterfly: shuffle_f32x4 immediate 177 swaps neighbouring 128-bit
// groups. Note that ifft2348/ifft2432 use fnmsub, i.e. the negated result.
__m512 ifft2342 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft2343 = _mm512_fmadd_ps(ifft2334, ifft2342, _mm512_shuffle_f32x4(ifft2334, ifft2334, 177));
__m512 ifft2427 = _mm512_fmadd_ps(ifft2419, ifft2342, _mm512_shuffle_f32x4(ifft2419, ifft2419, 177));
__m512 ifft2344 = _mm512_fmadd_ps(ifft2335, ifft2342, _mm512_shuffle_f32x4(ifft2335, ifft2335, 177));
__m512 ifft2428 = _mm512_fmadd_ps(ifft2420, ifft2342, _mm512_shuffle_f32x4(ifft2420, ifft2420, 177));
__m512 ifft2345 = _mm512_fmadd_ps(ifft2336, ifft2342, _mm512_shuffle_f32x4(ifft2336, ifft2336, 177));
__m512 ifft2429 = _mm512_fmadd_ps(ifft2421, ifft2342, _mm512_shuffle_f32x4(ifft2421, ifft2421, 177));
__m512 ifft2346 = _mm512_fmadd_ps(ifft2337, ifft2342, _mm512_shuffle_f32x4(ifft2337, ifft2337, 177));
__m512 ifft2430 = _mm512_fmadd_ps(ifft2422, ifft2342, _mm512_shuffle_f32x4(ifft2422, ifft2422, 177));
__m512 ifft2347 = _mm512_fmadd_ps(ifft2338, ifft2342, _mm512_shuffle_f32x4(ifft2338, ifft2338, 177));
__m512 ifft2431 = _mm512_fmadd_ps(ifft2423, ifft2342, _mm512_shuffle_f32x4(ifft2423, ifft2423, 177));
__m512 ifft2348 = _mm512_fnmsub_ps(ifft2339, ifft2342, _mm512_shuffle_f32x4(ifft2339, ifft2339, 177));
__m512 ifft2432 = _mm512_fnmsub_ps(ifft2424, ifft2342, _mm512_shuffle_f32x4(ifft2424, ifft2424, 177));
__m512 ifft2349 = _mm512_fmadd_ps(ifft2340, ifft2342, _mm512_shuffle_f32x4(ifft2340, ifft2340, 177));
__m512 ifft2433 = _mm512_fmadd_ps(ifft2425, ifft2342, _mm512_shuffle_f32x4(ifft2425, ifft2425, 177));
__m512 ifft2350 = _mm512_fmadd_ps(ifft2341, ifft2342, _mm512_shuffle_f32x4(ifft2341, ifft2341, 177));
__m512 ifft2434 = _mm512_fmadd_ps(ifft2426, ifft2342, _mm512_shuffle_f32x4(ifft2426, ifft2426, 177));
// From here on the combination is across registers rather than across lanes:
// sums/differences of the eight intermediate vectors per half, with factors
// 7.0710677e-01f, 3.125e-02f (1/32) and 1.5625e-02f (1/64) folded in.
__m512 ifft2351 = _mm512_add_ps(ifft2343, ifft2344);
__m512 ifft2435 = _mm512_add_ps(ifft2427, ifft2428);
__m512 ifft2352 = _mm512_sub_ps(ifft2343, ifft2344);
__m512 ifft2436 = _mm512_sub_ps(ifft2427, ifft2428);
__m512 ifft2353 = _mm512_sub_ps(ifft2345, ifft2349);
__m512 ifft2437 = _mm512_sub_ps(ifft2429, ifft2433);
__m512 ifft2354 = _mm512_add_ps(ifft2346, ifft2350);
__m512 ifft2438 = _mm512_add_ps(ifft2430, ifft2434);
__m512 ifft2355 = _mm512_add_ps(ifft2345, ifft2349);
__m512 ifft2439 = _mm512_add_ps(ifft2429, ifft2433);
__m512 ifft2356 = _mm512_sub_ps(ifft2346, ifft2350);
__m512 ifft2440 = _mm512_sub_ps(ifft2430, ifft2434);
__m512 ifft2357 = _mm512_mul_ps(ifft2347, _mm512_set1_ps(3.125e-02f));
__m512 ifft2441 = _mm512_mul_ps(ifft2431, _mm512_set1_ps(3.125e-02f));
__m512 ifft2358 = _mm512_mul_ps(ifft2348, _mm512_set1_ps(3.125e-02f));
__m512 ifft2442 = _mm512_mul_ps(ifft2432, _mm512_set1_ps(3.125e-02f));
__m512 ifft2359 = _mm512_fmadd_ps(ifft2351, _mm512_set1_ps(1.5625e-02f), ifft2357);
__m512 ifft2443 = _mm512_fmadd_ps(ifft2435, _mm512_set1_ps(1.5625e-02f), ifft2441);
__m512 ifft2360 = _mm512_fmsub_ps(ifft2351, _mm512_set1_ps(1.5625e-02f), ifft2357);
__m512 ifft2444 = _mm512_fmsub_ps(ifft2435, _mm512_set1_ps(1.5625e-02f), ifft2441);
__m512 ifft2361 = _mm512_fmadd_ps(ifft2352, _mm512_set1_ps(1.5625e-02f), ifft2358);
__m512 ifft2445 = _mm512_fmadd_ps(ifft2436, _mm512_set1_ps(1.5625e-02f), ifft2442);
__m512 ifft2362 = _mm512_fmsub_ps(ifft2352, _mm512_set1_ps(1.5625e-02f), ifft2358);
__m512 ifft2446 = _mm512_fmsub_ps(ifft2436, _mm512_set1_ps(1.5625e-02f), ifft2442);
__m512 ifft2363 = _mm512_add_ps(ifft2353, ifft2354);
__m512 ifft2447 = _mm512_add_ps(ifft2437, ifft2438);
__m512 ifft2364 = _mm512_sub_ps(ifft2353, ifft2354);
__m512 ifft2448 = _mm512_sub_ps(ifft2437, ifft2438);
__m512 ifft2365 = _mm512_fnmadd_ps(ifft2363, _mm512_set1_ps(7.0710677e-01f), ifft2355);
__m512 ifft2449 = _mm512_fnmadd_ps(ifft2447, _mm512_set1_ps(7.0710677e-01f), ifft2439);
__m512 ifft2366 = _mm512_fmadd_ps(ifft2363, _mm512_set1_ps(7.0710677e-01f), ifft2355);
__m512 ifft2450 = _mm512_fmadd_ps(ifft2447, _mm512_set1_ps(7.0710677e-01f), ifft2439);
__m512 ifft2367 = _mm512_fmadd_ps(ifft2364, _mm512_set1_ps(7.0710677e-01f), ifft2356);
__m512 ifft2451 = _mm512_fmadd_ps(ifft2448, _mm512_set1_ps(7.0710677e-01f), ifft2440);
__m512 ifft2368 = _mm512_fmsub_ps(ifft2364, _mm512_set1_ps(7.0710677e-01f), ifft2356);
__m512 ifft2452 = _mm512_fmsub_ps(ifft2448, _mm512_set1_ps(7.0710677e-01f), ifft2440);
__m512 ifft2369 = _mm512_add_ps(ifft2365, ifft2366);
__m512 ifft2453 = _mm512_add_ps(ifft2449, ifft2450);
__m512 ifft2370 = _mm512_sub_ps(ifft2365, ifft2366);
__m512 ifft2454 = _mm512_sub_ps(ifft2449, ifft2450);
__m512 ifft2371 = _mm512_add_ps(ifft2367, ifft2368);
__m512 ifft2455 = _mm512_add_ps(ifft2451, ifft2452);
__m512 ifft2372 = _mm512_sub_ps(ifft2367, ifft2368);
__m512 ifft2456 = _mm512_sub_ps(ifft2451, ifft2452);
__m512 ifft2373 = _mm512_fmadd_ps(ifft2369, _mm512_set1_ps(1.5625e-02f), ifft2359);
__m512 ifft2457 = _mm512_fmadd_ps(ifft2453, _mm512_set1_ps(1.5625e-02f), ifft2443);
__m512 ifft2374 = _mm512_fnmadd_ps(ifft2369, _mm512_set1_ps(1.5625e-02f), ifft2359);
__m512 ifft2458 = _mm512_fnmadd_ps(ifft2453, _mm512_set1_ps(1.5625e-02f), ifft2443);
__m512 ifft2375 = _mm512_fmadd_ps(ifft2371, _mm512_set1_ps(1.5625e-02f), ifft2361);
__m512 ifft2459 = _mm512_fmadd_ps(ifft2455, _mm512_set1_ps(1.5625e-02f), ifft2445);
__m512 ifft2376 = _mm512_fnmadd_ps(ifft2371, _mm512_set1_ps(1.5625e-02f), ifft2361);
__m512 ifft2460 = _mm512_fnmadd_ps(ifft2455, _mm512_set1_ps(1.5625e-02f), ifft2445);
__m512 ifft2377 = _mm512_fnmadd_ps(ifft2372, _mm512_set1_ps(1.5625e-02f), ifft2360);
__m512 ifft2461 = _mm512_fnmadd_ps(ifft2456, _mm512_set1_ps(1.5625e-02f), ifft2444);
__m512 ifft2378 = _mm512_fmadd_ps(ifft2372, _mm512_set1_ps(1.5625e-02f), ifft2360);
__m512 ifft2462 = _mm512_fmadd_ps(ifft2456, _mm512_set1_ps(1.5625e-02f), ifft2444);
__m512 ifft2379 = _mm512_fmadd_ps(ifft2370, _mm512_set1_ps(1.5625e-02f), ifft2362);
__m512 ifft2463 = _mm512_fmadd_ps(ifft2454, _mm512_set1_ps(1.5625e-02f), ifft2446);
__m512 ifft2380 = _mm512_fnmadd_ps(ifft2370, _mm512_set1_ps(1.5625e-02f), ifft2362);
__m512 ifft2464 = _mm512_fnmadd_ps(ifft2454, _mm512_set1_ps(1.5625e-02f), ifft2446);
// Ten of the sixteen result vectors are kept (five per half); the remaining six
// are computed but unused here, and the (void) casts mark them as such.
__m512 dat720 = ifft2373;
__m512 dat725 = ifft2457;
__m512 dat721 = ifft2375;
__m512 dat726 = ifft2459;
__m512 dat722 = ifft2377;
__m512 dat727 = ifft2461;
__m512 dat723 = ifft2379;
__m512 dat728 = ifft2463;
__m512 dat724 = ifft2374;
__m512 dat729 = ifft2458;
(void)ifft2376;
(void)ifft2460;
(void)ifft2378;
(void)ifft2462;
(void)ifft2380;
(void)ifft2464;
// Two-source permutes (index values 16..31 pick from the second source).
// In the ten lanes that get stored: pm23 yields first-source lanes 0..4
// followed by second-source lanes 0..4; pm24 yields second-source lanes 8..12
// followed by first-source lanes 8..12.
__m512i pm23 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack111 = _mm512_permutex2var_ps(dat720, pm23, dat725);
__m512i pm24 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack112 = _mm512_permutex2var_ps(dat720, pm24, dat725);
__m512 pack113 = _mm512_permutex2var_ps(dat721, pm23, dat726);
__m512 pack114 = _mm512_permutex2var_ps(dat721, pm24, dat726);
__m512 pack115 = _mm512_permutex2var_ps(dat722, pm23, dat727);
__m512 pack116 = _mm512_permutex2var_ps(dat722, pm24, dat727);
__m512 pack117 = _mm512_permutex2var_ps(dat723, pm23, dat728);
__m512 pack118 = _mm512_permutex2var_ps(dat723, pm24, dat728);
__m512 pack119 = _mm512_permutex2var_ps(dat724, pm23, dat729);
__m512 pack120 = _mm512_permutex2var_ps(dat724, pm24, dat729);
// Element-wise max against zero (ReLU-style clamp of negative values).
pack111 = _mm512_max_ps(_mm512_setzero_ps(), pack111);
pack112 = _mm512_max_ps(_mm512_setzero_ps(), pack112);
pack113 = _mm512_max_ps(_mm512_setzero_ps(), pack113);
pack114 = _mm512_max_ps(_mm512_setzero_ps(), pack114);
pack115 = _mm512_max_ps(_mm512_setzero_ps(), pack115);
pack116 = _mm512_max_ps(_mm512_setzero_ps(), pack116);
pack117 = _mm512_max_ps(_mm512_setzero_ps(), pack117);
pack118 = _mm512_max_ps(_mm512_setzero_ps(), pack118);
pack119 = _mm512_max_ps(_mm512_setzero_ps(), pack119);
pack120 = _mm512_max_ps(_mm512_setzero_ps(), pack120);
// Masked stores: mask 1023 writes only the low ten lanes of each vector.
// Odd-numbered packs land at offsets 0, 448, ... 1792; even-numbered packs at
// the same offsets plus 50240.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack111);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack112);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack113);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack114);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack115);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack116);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack117);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack118);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack119);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack120);
}
}
}
// Leave the enclosing function once j5 has reached last2; otherwise advance
// to the next j5 (the for-header then also moves toW8 by 30).
if (j5 >= last2) return;
++j5;
}
if (j5 >= 84) break;
rel5 = 11;
}
if (rel5 < 16) {
if (rel5 < 12) {
ptrdiff_t toH9 = base5+10;
ptrdiff_t toW9 = 105;
ptrdiff_t k33 = 16*w21;
for (; k33 != 16; ++k33) {
ptrdiff_t r10 = 0;
for (; r10 != 2; ++r10) {
ptrdiff_t t16 = 0;
__m512 sfRe169 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm169 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe173 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm173 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe170 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm170 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe174 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm174 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe171 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm171 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe175 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm175 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe172 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm172 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe176 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm176 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512i ifft2465 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2466 = _mm512_permutexvar_ps(ifft2465, sfRe169);
__m512 ifft2557 = _mm512_permutexvar_ps(ifft2465, sfRe173);
__m512i ifft2467 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft2468 = _mm512_permutexvar_ps(ifft2467, sfRe169);
__m512 ifft2558 = _mm512_permutexvar_ps(ifft2467, sfRe173);
__m512 ifft2469 = _mm512_permutexvar_ps(ifft2465, sfIm169);
__m512 ifft2559 = _mm512_permutexvar_ps(ifft2465, sfIm173);
__m512 ifft2470 = _mm512_permutexvar_ps(ifft2467, sfIm169);
__m512 ifft2560 = _mm512_permutexvar_ps(ifft2467, sfIm173);
__m512 ifft2471 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft2472 = _mm512_mask_fmadd_ps(ifft2470, 65021, ifft2471, ifft2466);
__m512 ifft2561 = _mm512_mask_fmadd_ps(ifft2560, 65021, ifft2471, ifft2557);
__m512 ifft2473 = _mm512_mask_fnmadd_ps(ifft2469, 65021, ifft2471, ifft2468);
__m512 ifft2562 = _mm512_mask_fnmadd_ps(ifft2559, 65021, ifft2471, ifft2558);
__m512 ifft2474 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft2475 = _mm512_fmadd_ps(ifft2472, ifft2474, _mm512_shuffle_ps(ifft2472, ifft2472, 177));
__m512 ifft2563 = _mm512_fmadd_ps(ifft2561, ifft2474, _mm512_shuffle_ps(ifft2561, ifft2561, 177));
__m512 ifft2476 = _mm512_fmadd_ps(ifft2473, ifft2474, _mm512_shuffle_ps(ifft2473, ifft2473, 177));
__m512 ifft2564 = _mm512_fmadd_ps(ifft2562, ifft2474, _mm512_shuffle_ps(ifft2562, ifft2562, 177));
__m512 ifft2477 = _mm512_fmadd_ps(sfRe170, ifft2474, _mm512_shuffle_ps(sfRe170, sfRe170, 177));
__m512 ifft2565 = _mm512_fmadd_ps(sfRe174, ifft2474, _mm512_shuffle_ps(sfRe174, sfRe174, 177));
__m512 ifft2478 = _mm512_fmadd_ps(sfIm170, ifft2474, _mm512_shuffle_ps(sfIm170, sfIm170, 177));
__m512 ifft2566 = _mm512_fmadd_ps(sfIm174, ifft2474, _mm512_shuffle_ps(sfIm174, sfIm174, 177));
__m512 ifft2479 = _mm512_fmadd_ps(sfRe171, ifft2474, _mm512_shuffle_ps(sfRe171, sfRe171, 177));
__m512 ifft2567 = _mm512_fmadd_ps(sfRe175, ifft2474, _mm512_shuffle_ps(sfRe175, sfRe175, 177));
__m512 ifft2480 = _mm512_fmadd_ps(sfIm171, ifft2474, _mm512_shuffle_ps(sfIm171, sfIm171, 177));
__m512 ifft2568 = _mm512_fmadd_ps(sfIm175, ifft2474, _mm512_shuffle_ps(sfIm175, sfIm175, 177));
__m512 ifft2481 = _mm512_fmadd_ps(sfRe172, ifft2474, _mm512_shuffle_ps(sfRe172, sfRe172, 177));
__m512 ifft2569 = _mm512_fmadd_ps(sfRe176, ifft2474, _mm512_shuffle_ps(sfRe176, sfRe176, 177));
__m512 ifft2482 = _mm512_fmadd_ps(sfIm172, ifft2474, _mm512_shuffle_ps(sfIm172, sfIm172, 177));
__m512 ifft2570 = _mm512_fmadd_ps(sfIm176, ifft2474, _mm512_shuffle_ps(sfIm176, sfIm176, 177));
__m512 ifft2483 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft2484 = _mm512_mul_ps(ifft2475, ifft2483);
__m512 ifft2571 = _mm512_mul_ps(ifft2563, ifft2483);
__m512 ifft2485 = _mm512_mul_ps(ifft2476, ifft2483);
__m512 ifft2572 = _mm512_mul_ps(ifft2564, ifft2483);
__m512 ifft2486 = _mm512_mul_ps(ifft2477, ifft2483);
__m512 ifft2573 = _mm512_mul_ps(ifft2565, ifft2483);
__m512 ifft2487 = _mm512_mul_ps(ifft2478, ifft2483);
__m512 ifft2574 = _mm512_mul_ps(ifft2566, ifft2483);
__m512 ifft2488 = _mm512_mul_ps(ifft2479, ifft2483);
__m512 ifft2575 = _mm512_mul_ps(ifft2567, ifft2483);
__m512 ifft2489 = _mm512_mul_ps(ifft2480, ifft2483);
__m512 ifft2576 = _mm512_mul_ps(ifft2568, ifft2483);
__m512 ifft2490 = _mm512_mul_ps(ifft2481, ifft2483);
__m512 ifft2577 = _mm512_mul_ps(ifft2569, ifft2483);
__m512 ifft2491 = _mm512_mul_ps(ifft2482, ifft2483);
__m512 ifft2578 = _mm512_mul_ps(ifft2570, ifft2483);
__m512 ifft2492 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft2493 = _mm512_fnmadd_ps(ifft2476, ifft2492, ifft2484);
__m512 ifft2579 = _mm512_fnmadd_ps(ifft2564, ifft2492, ifft2571);
__m512 ifft2494 = _mm512_fmadd_ps(ifft2475, ifft2492, ifft2485);
__m512 ifft2580 = _mm512_fmadd_ps(ifft2563, ifft2492, ifft2572);
__m512 ifft2495 = _mm512_fnmadd_ps(ifft2478, ifft2492, ifft2486);
__m512 ifft2581 = _mm512_fnmadd_ps(ifft2566, ifft2492, ifft2573);
__m512 ifft2496 = _mm512_fmadd_ps(ifft2477, ifft2492, ifft2487);
__m512 ifft2582 = _mm512_fmadd_ps(ifft2565, ifft2492, ifft2574);
__m512 ifft2497 = _mm512_fnmadd_ps(ifft2480, ifft2492, ifft2488);
__m512 ifft2583 = _mm512_fnmadd_ps(ifft2568, ifft2492, ifft2575);
__m512 ifft2498 = _mm512_fmadd_ps(ifft2479, ifft2492, ifft2489);
__m512 ifft2584 = _mm512_fmadd_ps(ifft2567, ifft2492, ifft2576);
__m512 ifft2499 = _mm512_fnmadd_ps(ifft2482, ifft2492, ifft2490);
__m512 ifft2585 = _mm512_fnmadd_ps(ifft2570, ifft2492, ifft2577);
__m512 ifft2500 = _mm512_fmadd_ps(ifft2481, ifft2492, ifft2491);
__m512 ifft2586 = _mm512_fmadd_ps(ifft2569, ifft2492, ifft2578);
__m512 ifft2501 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft2502 = _mm512_fmadd_ps(ifft2493, ifft2501, _mm512_shuffle_ps(ifft2493, ifft2493, 78));
__m512 ifft2587 = _mm512_fmadd_ps(ifft2579, ifft2501, _mm512_shuffle_ps(ifft2579, ifft2579, 78));
__m512 ifft2503 = _mm512_fmadd_ps(ifft2494, ifft2501, _mm512_shuffle_ps(ifft2494, ifft2494, 78));
__m512 ifft2588 = _mm512_fmadd_ps(ifft2580, ifft2501, _mm512_shuffle_ps(ifft2580, ifft2580, 78));
__m512 ifft2504 = _mm512_fmadd_ps(ifft2495, ifft2501, _mm512_shuffle_ps(ifft2495, ifft2495, 78));
__m512 ifft2589 = _mm512_fmadd_ps(ifft2581, ifft2501, _mm512_shuffle_ps(ifft2581, ifft2581, 78));
__m512 ifft2505 = _mm512_fmadd_ps(ifft2496, ifft2501, _mm512_shuffle_ps(ifft2496, ifft2496, 78));
__m512 ifft2590 = _mm512_fmadd_ps(ifft2582, ifft2501, _mm512_shuffle_ps(ifft2582, ifft2582, 78));
__m512 ifft2506 = _mm512_fmadd_ps(ifft2497, ifft2501, _mm512_shuffle_ps(ifft2497, ifft2497, 78));
__m512 ifft2591 = _mm512_fmadd_ps(ifft2583, ifft2501, _mm512_shuffle_ps(ifft2583, ifft2583, 78));
__m512 ifft2507 = _mm512_fmadd_ps(ifft2498, ifft2501, _mm512_shuffle_ps(ifft2498, ifft2498, 78));
__m512 ifft2592 = _mm512_fmadd_ps(ifft2584, ifft2501, _mm512_shuffle_ps(ifft2584, ifft2584, 78));
__m512 ifft2508 = _mm512_fmadd_ps(ifft2499, ifft2501, _mm512_shuffle_ps(ifft2499, ifft2499, 78));
__m512 ifft2593 = _mm512_fmadd_ps(ifft2585, ifft2501, _mm512_shuffle_ps(ifft2585, ifft2585, 78));
__m512 ifft2509 = _mm512_fmadd_ps(ifft2500, ifft2501, _mm512_shuffle_ps(ifft2500, ifft2500, 78));
__m512 ifft2594 = _mm512_fmadd_ps(ifft2586, ifft2501, _mm512_shuffle_ps(ifft2586, ifft2586, 78));
__m512 ifft2510 = _mm512_mask_sub_ps(ifft2502, 49344, _mm512_setzero_ps(), ifft2503);
__m512 ifft2595 = _mm512_mask_sub_ps(ifft2587, 49344, _mm512_setzero_ps(), ifft2588);
__m512 ifft2511 = _mm512_mask_mov_ps(ifft2503, 49344, ifft2502);
__m512 ifft2596 = _mm512_mask_mov_ps(ifft2588, 49344, ifft2587);
__m512 ifft2512 = _mm512_mask_sub_ps(ifft2504, 49344, _mm512_setzero_ps(), ifft2505);
__m512 ifft2597 = _mm512_mask_sub_ps(ifft2589, 49344, _mm512_setzero_ps(), ifft2590);
__m512 ifft2513 = _mm512_mask_mov_ps(ifft2505, 49344, ifft2504);
__m512 ifft2598 = _mm512_mask_mov_ps(ifft2590, 49344, ifft2589);
__m512 ifft2514 = _mm512_mask_sub_ps(ifft2506, 49344, _mm512_setzero_ps(), ifft2507);
__m512 ifft2599 = _mm512_mask_sub_ps(ifft2591, 49344, _mm512_setzero_ps(), ifft2592);
__m512 ifft2515 = _mm512_mask_mov_ps(ifft2507, 49344, ifft2506);
__m512 ifft2600 = _mm512_mask_mov_ps(ifft2592, 49344, ifft2591);
__m512 ifft2516 = _mm512_mask_sub_ps(ifft2508, 49344, _mm512_setzero_ps(), ifft2509);
__m512 ifft2601 = _mm512_mask_sub_ps(ifft2593, 49344, _mm512_setzero_ps(), ifft2594);
__m512 ifft2517 = _mm512_mask_mov_ps(ifft2509, 49344, ifft2508);
__m512 ifft2602 = _mm512_mask_mov_ps(ifft2594, 49344, ifft2593);
__m512 ifft2518 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft2519 = _mm512_fmadd_ps(ifft2510, ifft2518, _mm512_shuffle_f32x4(ifft2510, ifft2510, 177));
__m512 ifft2603 = _mm512_fmadd_ps(ifft2595, ifft2518, _mm512_shuffle_f32x4(ifft2595, ifft2595, 177));
__m512 ifft2520 = _mm512_fmadd_ps(ifft2511, ifft2518, _mm512_shuffle_f32x4(ifft2511, ifft2511, 177));
__m512 ifft2604 = _mm512_fmadd_ps(ifft2596, ifft2518, _mm512_shuffle_f32x4(ifft2596, ifft2596, 177));
__m512 ifft2521 = _mm512_fmadd_ps(ifft2512, ifft2518, _mm512_shuffle_f32x4(ifft2512, ifft2512, 177));
__m512 ifft2605 = _mm512_fmadd_ps(ifft2597, ifft2518, _mm512_shuffle_f32x4(ifft2597, ifft2597, 177));
__m512 ifft2522 = _mm512_fmadd_ps(ifft2513, ifft2518, _mm512_shuffle_f32x4(ifft2513, ifft2513, 177));
__m512 ifft2606 = _mm512_fmadd_ps(ifft2598, ifft2518, _mm512_shuffle_f32x4(ifft2598, ifft2598, 177));
__m512 ifft2523 = _mm512_fmadd_ps(ifft2514, ifft2518, _mm512_shuffle_f32x4(ifft2514, ifft2514, 177));
__m512 ifft2607 = _mm512_fmadd_ps(ifft2599, ifft2518, _mm512_shuffle_f32x4(ifft2599, ifft2599, 177));
__m512 ifft2524 = _mm512_fnmsub_ps(ifft2515, ifft2518, _mm512_shuffle_f32x4(ifft2515, ifft2515, 177));
__m512 ifft2608 = _mm512_fnmsub_ps(ifft2600, ifft2518, _mm512_shuffle_f32x4(ifft2600, ifft2600, 177));
__m512 ifft2525 = _mm512_fmadd_ps(ifft2516, ifft2518, _mm512_shuffle_f32x4(ifft2516, ifft2516, 177));
__m512 ifft2609 = _mm512_fmadd_ps(ifft2601, ifft2518, _mm512_shuffle_f32x4(ifft2601, ifft2601, 177));
__m512 ifft2526 = _mm512_fmadd_ps(ifft2517, ifft2518, _mm512_shuffle_f32x4(ifft2517, ifft2517, 177));
__m512 ifft2610 = _mm512_fmadd_ps(ifft2602, ifft2518, _mm512_shuffle_f32x4(ifft2602, ifft2602, 177));
// Final add/sub/scale network of this tile. Two independent value sets are
// processed in lock-step: ifft2519..ifft2526 and ifft2603..ifft2610. Each set
// yields eight results (ifft2549..ifft2556 and ifft2633..ifft2640).
// NOTE(review): the `ifft` prefix suggests this is the last pass of an inverse
// FFT with the normalization folded into the constants -- confirm against the
// generator.
// Sums and differences of the inputs.
__m512 ifft2527 = _mm512_add_ps(ifft2519, ifft2520);
__m512 ifft2611 = _mm512_add_ps(ifft2603, ifft2604);
__m512 ifft2528 = _mm512_sub_ps(ifft2519, ifft2520);
__m512 ifft2612 = _mm512_sub_ps(ifft2603, ifft2604);
__m512 ifft2529 = _mm512_sub_ps(ifft2521, ifft2525);
__m512 ifft2613 = _mm512_sub_ps(ifft2605, ifft2609);
__m512 ifft2530 = _mm512_add_ps(ifft2522, ifft2526);
__m512 ifft2614 = _mm512_add_ps(ifft2606, ifft2610);
__m512 ifft2531 = _mm512_add_ps(ifft2521, ifft2525);
__m512 ifft2615 = _mm512_add_ps(ifft2605, ifft2609);
__m512 ifft2532 = _mm512_sub_ps(ifft2522, ifft2526);
__m512 ifft2616 = _mm512_sub_ps(ifft2606, ifft2610);
// Scaling: 3.125e-02f is 1/32 and 1.5625e-02f is 1/64.
__m512 ifft2533 = _mm512_mul_ps(ifft2523, _mm512_set1_ps(3.125e-02f));
__m512 ifft2617 = _mm512_mul_ps(ifft2607, _mm512_set1_ps(3.125e-02f));
__m512 ifft2534 = _mm512_mul_ps(ifft2524, _mm512_set1_ps(3.125e-02f));
__m512 ifft2618 = _mm512_mul_ps(ifft2608, _mm512_set1_ps(3.125e-02f));
// fmadd gives x/64 + y, fmsub gives x/64 - y.
__m512 ifft2535 = _mm512_fmadd_ps(ifft2527, _mm512_set1_ps(1.5625e-02f), ifft2533);
__m512 ifft2619 = _mm512_fmadd_ps(ifft2611, _mm512_set1_ps(1.5625e-02f), ifft2617);
__m512 ifft2536 = _mm512_fmsub_ps(ifft2527, _mm512_set1_ps(1.5625e-02f), ifft2533);
__m512 ifft2620 = _mm512_fmsub_ps(ifft2611, _mm512_set1_ps(1.5625e-02f), ifft2617);
__m512 ifft2537 = _mm512_fmadd_ps(ifft2528, _mm512_set1_ps(1.5625e-02f), ifft2534);
__m512 ifft2621 = _mm512_fmadd_ps(ifft2612, _mm512_set1_ps(1.5625e-02f), ifft2618);
__m512 ifft2538 = _mm512_fmsub_ps(ifft2528, _mm512_set1_ps(1.5625e-02f), ifft2534);
__m512 ifft2622 = _mm512_fmsub_ps(ifft2612, _mm512_set1_ps(1.5625e-02f), ifft2618);
__m512 ifft2539 = _mm512_add_ps(ifft2529, ifft2530);
__m512 ifft2623 = _mm512_add_ps(ifft2613, ifft2614);
__m512 ifft2540 = _mm512_sub_ps(ifft2529, ifft2530);
__m512 ifft2624 = _mm512_sub_ps(ifft2613, ifft2614);
// 7.0710677e-01f is sqrt(1/2) rounded to float.
__m512 ifft2541 = _mm512_fnmadd_ps(ifft2539, _mm512_set1_ps(7.0710677e-01f), ifft2531);
__m512 ifft2625 = _mm512_fnmadd_ps(ifft2623, _mm512_set1_ps(7.0710677e-01f), ifft2615);
__m512 ifft2542 = _mm512_fmadd_ps(ifft2539, _mm512_set1_ps(7.0710677e-01f), ifft2531);
__m512 ifft2626 = _mm512_fmadd_ps(ifft2623, _mm512_set1_ps(7.0710677e-01f), ifft2615);
__m512 ifft2543 = _mm512_fmadd_ps(ifft2540, _mm512_set1_ps(7.0710677e-01f), ifft2532);
__m512 ifft2627 = _mm512_fmadd_ps(ifft2624, _mm512_set1_ps(7.0710677e-01f), ifft2616);
__m512 ifft2544 = _mm512_fmsub_ps(ifft2540, _mm512_set1_ps(7.0710677e-01f), ifft2532);
__m512 ifft2628 = _mm512_fmsub_ps(ifft2624, _mm512_set1_ps(7.0710677e-01f), ifft2616);
__m512 ifft2545 = _mm512_add_ps(ifft2541, ifft2542);
__m512 ifft2629 = _mm512_add_ps(ifft2625, ifft2626);
__m512 ifft2546 = _mm512_sub_ps(ifft2541, ifft2542);
__m512 ifft2630 = _mm512_sub_ps(ifft2625, ifft2626);
__m512 ifft2547 = _mm512_add_ps(ifft2543, ifft2544);
__m512 ifft2631 = _mm512_add_ps(ifft2627, ifft2628);
__m512 ifft2548 = _mm512_sub_ps(ifft2543, ifft2544);
__m512 ifft2632 = _mm512_sub_ps(ifft2627, ifft2628);
// Eight results per set: each is (scaled partial) +/- (another partial)/64.
__m512 ifft2549 = _mm512_fmadd_ps(ifft2545, _mm512_set1_ps(1.5625e-02f), ifft2535);
__m512 ifft2633 = _mm512_fmadd_ps(ifft2629, _mm512_set1_ps(1.5625e-02f), ifft2619);
__m512 ifft2550 = _mm512_fnmadd_ps(ifft2545, _mm512_set1_ps(1.5625e-02f), ifft2535);
__m512 ifft2634 = _mm512_fnmadd_ps(ifft2629, _mm512_set1_ps(1.5625e-02f), ifft2619);
__m512 ifft2551 = _mm512_fmadd_ps(ifft2547, _mm512_set1_ps(1.5625e-02f), ifft2537);
__m512 ifft2635 = _mm512_fmadd_ps(ifft2631, _mm512_set1_ps(1.5625e-02f), ifft2621);
__m512 ifft2552 = _mm512_fnmadd_ps(ifft2547, _mm512_set1_ps(1.5625e-02f), ifft2537);
__m512 ifft2636 = _mm512_fnmadd_ps(ifft2631, _mm512_set1_ps(1.5625e-02f), ifft2621);
__m512 ifft2553 = _mm512_fnmadd_ps(ifft2548, _mm512_set1_ps(1.5625e-02f), ifft2536);
__m512 ifft2637 = _mm512_fnmadd_ps(ifft2632, _mm512_set1_ps(1.5625e-02f), ifft2620);
__m512 ifft2554 = _mm512_fmadd_ps(ifft2548, _mm512_set1_ps(1.5625e-02f), ifft2536);
__m512 ifft2638 = _mm512_fmadd_ps(ifft2632, _mm512_set1_ps(1.5625e-02f), ifft2620);
__m512 ifft2555 = _mm512_fmadd_ps(ifft2546, _mm512_set1_ps(1.5625e-02f), ifft2538);
__m512 ifft2639 = _mm512_fmadd_ps(ifft2630, _mm512_set1_ps(1.5625e-02f), ifft2622);
__m512 ifft2556 = _mm512_fnmadd_ps(ifft2546, _mm512_set1_ps(1.5625e-02f), ifft2538);
__m512 ifft2640 = _mm512_fnmadd_ps(ifft2630, _mm512_set1_ps(1.5625e-02f), ifft2622);
// Select, pack and clamp the results of this tile.
// Five of the eight results from each set are kept (dat730..dat734 from the
// first set, dat735..dat739 from the second).
__m512 dat730 = ifft2549;
__m512 dat735 = ifft2633;
__m512 dat731 = ifft2551;
__m512 dat736 = ifft2635;
__m512 dat732 = ifft2553;
__m512 dat737 = ifft2637;
__m512 dat733 = ifft2555;
__m512 dat738 = ifft2639;
__m512 dat734 = ifft2550;
__m512 dat739 = ifft2634;
// The other three results per set are not used; the (void) casts mark them as
// intentionally ignored (presumably to silence unused-variable warnings).
(void)ifft2552;
(void)ifft2636;
(void)ifft2554;
(void)ifft2638;
(void)ifft2556;
(void)ifft2640;
// Two-source permutes: index bit 4 picks the second source, the low four bits
// pick the element. _mm512_set_epi32 lists lane 15 first and lane 0 last.
// pm25: lanes 0-4 = first source [0..4], lanes 5-9 = second source [0..4],
//       lanes 10-15 repeat that pattern.
__m512i pm25 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack121 = _mm512_permutex2var_ps(dat730, pm25, dat735);
// pm26: lanes 0-4 = second source [8..12], lanes 5-9 = first source [8..12],
//       lanes 10-15 repeat that pattern.
__m512i pm26 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack122 = _mm512_permutex2var_ps(dat730, pm26, dat735);
__m512 pack123 = _mm512_permutex2var_ps(dat731, pm25, dat736);
__m512 pack124 = _mm512_permutex2var_ps(dat731, pm26, dat736);
__m512 pack125 = _mm512_permutex2var_ps(dat732, pm25, dat737);
__m512 pack126 = _mm512_permutex2var_ps(dat732, pm26, dat737);
__m512 pack127 = _mm512_permutex2var_ps(dat733, pm25, dat738);
__m512 pack128 = _mm512_permutex2var_ps(dat733, pm26, dat738);
__m512 pack129 = _mm512_permutex2var_ps(dat734, pm25, dat739);
__m512 pack130 = _mm512_permutex2var_ps(dat734, pm26, dat739);
// Element-wise max against zero (ReLU-shaped clamp of negative values).
pack121 = _mm512_max_ps(_mm512_setzero_ps(), pack121);
pack122 = _mm512_max_ps(_mm512_setzero_ps(), pack122);
pack123 = _mm512_max_ps(_mm512_setzero_ps(), pack123);
pack124 = _mm512_max_ps(_mm512_setzero_ps(), pack124);
pack125 = _mm512_max_ps(_mm512_setzero_ps(), pack125);
pack126 = _mm512_max_ps(_mm512_setzero_ps(), pack126);
pack127 = _mm512_max_ps(_mm512_setzero_ps(), pack127);
pack128 = _mm512_max_ps(_mm512_setzero_ps(), pack128);
pack129 = _mm512_max_ps(_mm512_setzero_ps(), pack129);
pack130 = _mm512_max_ps(_mm512_setzero_ps(), pack130);
// Masked unaligned stores of pack121..pack130 to datPtr2. Mask 127 (0x7F)
// writes only lanes 0-6 of each vector; the remaining lanes are not stored.
// The constant offsets come in pairs 50240 apart (0/50240, 448/50688, ...),
// advancing by 448 from one pair to the next. The t16 term is multiplied by 0
// and so contributes nothing to the address.
// NOTE(review): 448 is also the toH9 multiplier, so each pair presumably
// targets the next output row -- confirm against the tensor layout.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack121);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack122);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack123);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack124);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack125);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack126);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack127);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack128);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack129);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack130);
// Next tile. t17 is fixed at 0, so the 256*t17 term in the addresses below
// contributes nothing.
ptrdiff_t t17 = 0;
// Load sixteen vectors from sfPtr3 as Re/Im pairs. There are four groups with
// base offsets 256, 2167040, 4333824 and 6500608 (2166784 apart). Each group
// holds two Re/Im pairs at consecutive offsets of 64.
// NOTE(review): a step of 64 equals the byte size of one __m512, which suggests
// sfPtr3 is a byte pointer -- confirm at its declaration.
__m512 sfRe177 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm177 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe181 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm181 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe178 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm178 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe182 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm182 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe179 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm179 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe183 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm183 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe180 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm180 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe184 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm184 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
// First pass over the loaded vectors. Only the first Re/Im pair of each set
// (sfRe177/sfIm177 and sfRe181/sfIm181) is permuted and recombined here; the
// other pairs go straight to the adjacent-pair step below.
// _mm512_set_epi32/_mm512_set_ps list lane 15 first and lane 0 last.
// ifft2641, lanes 0..7: 0,1,2,2,4,6,6,4 (lanes 8..15: the same plus 8).
__m512i ifft2641 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2642 = _mm512_permutexvar_ps(ifft2641, sfRe177);
__m512 ifft2733 = _mm512_permutexvar_ps(ifft2641, sfRe181);
// ifft2643, lanes 0..7: 1,0,3,3,5,7,7,5 (lanes 8..15: the same plus 8).
__m512i ifft2643 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft2644 = _mm512_permutexvar_ps(ifft2643, sfRe177);
__m512 ifft2734 = _mm512_permutexvar_ps(ifft2643, sfRe181);
__m512 ifft2645 = _mm512_permutexvar_ps(ifft2641, sfIm177);
__m512 ifft2735 = _mm512_permutexvar_ps(ifft2641, sfIm181);
__m512 ifft2646 = _mm512_permutexvar_ps(ifft2643, sfIm177);
__m512 ifft2736 = _mm512_permutexvar_ps(ifft2643, sfIm181);
// Masked FMA with mask 65021 (0xFDFD): every lane except 1 and 9 receives
// a*b+c (fmadd) or -(a*b)+c (fnmadd); lanes 1 and 9 keep the first operand.
__m512 ifft2647 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft2648 = _mm512_mask_fmadd_ps(ifft2646, 65021, ifft2647, ifft2642);
__m512 ifft2737 = _mm512_mask_fmadd_ps(ifft2736, 65021, ifft2647, ifft2733);
__m512 ifft2649 = _mm512_mask_fnmadd_ps(ifft2645, 65021, ifft2647, ifft2644);
__m512 ifft2738 = _mm512_mask_fnmadd_ps(ifft2735, 65021, ifft2647, ifft2734);
// Adjacent-pair sum/difference: shuffle immediate 177 swaps elements 0<->1 and
// 2<->3 inside each 128-bit lane, and ifft2650 is +1 in even lanes and -1 in
// odd lanes. Even lane e gets x[e]+x[e+1]; odd lane o gets x[o-1]-x[o].
__m512 ifft2650 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft2651 = _mm512_fmadd_ps(ifft2648, ifft2650, _mm512_shuffle_ps(ifft2648, ifft2648, 177));
__m512 ifft2739 = _mm512_fmadd_ps(ifft2737, ifft2650, _mm512_shuffle_ps(ifft2737, ifft2737, 177));
__m512 ifft2652 = _mm512_fmadd_ps(ifft2649, ifft2650, _mm512_shuffle_ps(ifft2649, ifft2649, 177));
__m512 ifft2740 = _mm512_fmadd_ps(ifft2738, ifft2650, _mm512_shuffle_ps(ifft2738, ifft2738, 177));
__m512 ifft2653 = _mm512_fmadd_ps(sfRe178, ifft2650, _mm512_shuffle_ps(sfRe178, sfRe178, 177));
__m512 ifft2741 = _mm512_fmadd_ps(sfRe182, ifft2650, _mm512_shuffle_ps(sfRe182, sfRe182, 177));
__m512 ifft2654 = _mm512_fmadd_ps(sfIm178, ifft2650, _mm512_shuffle_ps(sfIm178, sfIm178, 177));
__m512 ifft2742 = _mm512_fmadd_ps(sfIm182, ifft2650, _mm512_shuffle_ps(sfIm182, sfIm182, 177));
__m512 ifft2655 = _mm512_fmadd_ps(sfRe179, ifft2650, _mm512_shuffle_ps(sfRe179, sfRe179, 177));
__m512 ifft2743 = _mm512_fmadd_ps(sfRe183, ifft2650, _mm512_shuffle_ps(sfRe183, sfRe183, 177));
__m512 ifft2656 = _mm512_fmadd_ps(sfIm179, ifft2650, _mm512_shuffle_ps(sfIm179, sfIm179, 177));
__m512 ifft2744 = _mm512_fmadd_ps(sfIm183, ifft2650, _mm512_shuffle_ps(sfIm183, sfIm183, 177));
__m512 ifft2657 = _mm512_fmadd_ps(sfRe180, ifft2650, _mm512_shuffle_ps(sfRe180, sfRe180, 177));
__m512 ifft2745 = _mm512_fmadd_ps(sfRe184, ifft2650, _mm512_shuffle_ps(sfRe184, sfRe184, 177));
__m512 ifft2658 = _mm512_fmadd_ps(sfIm180, ifft2650, _mm512_shuffle_ps(sfIm180, sfIm180, 177));
__m512 ifft2746 = _mm512_fmadd_ps(sfIm184, ifft2650, _mm512_shuffle_ps(sfIm184, sfIm184, 177));
// Per-lane multiplication of each (re, im) vector pair by the constant
// (c1 + i*c2), where c1 = ifft2659 and c2 = ifft2668:
//   re' = re*c1 - im*c2   (mul, then fnmadd)
//   im' = im*c1 + re*c2   (mul, then fmadd)
// Lanes 0..7 of (c1, c2), repeated for lanes 8..15:
//   (1,0) (1,0) (1,0) (0,1) (1,0) (0.7071,0.7071) (1,0) (-0.7071,0.7071)
// so lanes 0, 1, 2, 4 and 6 are left unchanged.
// NOTE(review): presumably these are the FFT twiddle factors -- confirm against
// the generator.
__m512 ifft2659 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft2660 = _mm512_mul_ps(ifft2651, ifft2659);
__m512 ifft2747 = _mm512_mul_ps(ifft2739, ifft2659);
__m512 ifft2661 = _mm512_mul_ps(ifft2652, ifft2659);
__m512 ifft2748 = _mm512_mul_ps(ifft2740, ifft2659);
__m512 ifft2662 = _mm512_mul_ps(ifft2653, ifft2659);
__m512 ifft2749 = _mm512_mul_ps(ifft2741, ifft2659);
__m512 ifft2663 = _mm512_mul_ps(ifft2654, ifft2659);
__m512 ifft2750 = _mm512_mul_ps(ifft2742, ifft2659);
__m512 ifft2664 = _mm512_mul_ps(ifft2655, ifft2659);
__m512 ifft2751 = _mm512_mul_ps(ifft2743, ifft2659);
__m512 ifft2665 = _mm512_mul_ps(ifft2656, ifft2659);
__m512 ifft2752 = _mm512_mul_ps(ifft2744, ifft2659);
__m512 ifft2666 = _mm512_mul_ps(ifft2657, ifft2659);
__m512 ifft2753 = _mm512_mul_ps(ifft2745, ifft2659);
__m512 ifft2667 = _mm512_mul_ps(ifft2658, ifft2659);
__m512 ifft2754 = _mm512_mul_ps(ifft2746, ifft2659);
__m512 ifft2668 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft2669 = _mm512_fnmadd_ps(ifft2652, ifft2668, ifft2660);
__m512 ifft2755 = _mm512_fnmadd_ps(ifft2740, ifft2668, ifft2747);
__m512 ifft2670 = _mm512_fmadd_ps(ifft2651, ifft2668, ifft2661);
__m512 ifft2756 = _mm512_fmadd_ps(ifft2739, ifft2668, ifft2748);
__m512 ifft2671 = _mm512_fnmadd_ps(ifft2654, ifft2668, ifft2662);
__m512 ifft2757 = _mm512_fnmadd_ps(ifft2742, ifft2668, ifft2749);
__m512 ifft2672 = _mm512_fmadd_ps(ifft2653, ifft2668, ifft2663);
__m512 ifft2758 = _mm512_fmadd_ps(ifft2741, ifft2668, ifft2750);
__m512 ifft2673 = _mm512_fnmadd_ps(ifft2656, ifft2668, ifft2664);
__m512 ifft2759 = _mm512_fnmadd_ps(ifft2744, ifft2668, ifft2751);
__m512 ifft2674 = _mm512_fmadd_ps(ifft2655, ifft2668, ifft2665);
__m512 ifft2760 = _mm512_fmadd_ps(ifft2743, ifft2668, ifft2752);
__m512 ifft2675 = _mm512_fnmadd_ps(ifft2658, ifft2668, ifft2666);
__m512 ifft2761 = _mm512_fnmadd_ps(ifft2746, ifft2668, ifft2753);
__m512 ifft2676 = _mm512_fmadd_ps(ifft2657, ifft2668, ifft2667);
__m512 ifft2762 = _mm512_fmadd_ps(ifft2745, ifft2668, ifft2754);
// Sum/difference across the two 64-bit halves of each 128-bit lane. Shuffle
// immediate 78 swaps elements {0,1} with {2,3}, and ifft2677 is +1 in elements
// 0-1 and -1 in elements 2-3 of every 128-bit lane. Elements 0-1 get
// x[k]+x[k+2]; elements 2-3 get x[k-2]-x[k].
__m512 ifft2677 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft2678 = _mm512_fmadd_ps(ifft2669, ifft2677, _mm512_shuffle_ps(ifft2669, ifft2669, 78));
__m512 ifft2763 = _mm512_fmadd_ps(ifft2755, ifft2677, _mm512_shuffle_ps(ifft2755, ifft2755, 78));
__m512 ifft2679 = _mm512_fmadd_ps(ifft2670, ifft2677, _mm512_shuffle_ps(ifft2670, ifft2670, 78));
__m512 ifft2764 = _mm512_fmadd_ps(ifft2756, ifft2677, _mm512_shuffle_ps(ifft2756, ifft2756, 78));
__m512 ifft2680 = _mm512_fmadd_ps(ifft2671, ifft2677, _mm512_shuffle_ps(ifft2671, ifft2671, 78));
__m512 ifft2765 = _mm512_fmadd_ps(ifft2757, ifft2677, _mm512_shuffle_ps(ifft2757, ifft2757, 78));
__m512 ifft2681 = _mm512_fmadd_ps(ifft2672, ifft2677, _mm512_shuffle_ps(ifft2672, ifft2672, 78));
__m512 ifft2766 = _mm512_fmadd_ps(ifft2758, ifft2677, _mm512_shuffle_ps(ifft2758, ifft2758, 78));
__m512 ifft2682 = _mm512_fmadd_ps(ifft2673, ifft2677, _mm512_shuffle_ps(ifft2673, ifft2673, 78));
__m512 ifft2767 = _mm512_fmadd_ps(ifft2759, ifft2677, _mm512_shuffle_ps(ifft2759, ifft2759, 78));
__m512 ifft2683 = _mm512_fmadd_ps(ifft2674, ifft2677, _mm512_shuffle_ps(ifft2674, ifft2674, 78));
__m512 ifft2768 = _mm512_fmadd_ps(ifft2760, ifft2677, _mm512_shuffle_ps(ifft2760, ifft2760, 78));
__m512 ifft2684 = _mm512_fmadd_ps(ifft2675, ifft2677, _mm512_shuffle_ps(ifft2675, ifft2675, 78));
__m512 ifft2769 = _mm512_fmadd_ps(ifft2761, ifft2677, _mm512_shuffle_ps(ifft2761, ifft2761, 78));
__m512 ifft2685 = _mm512_fmadd_ps(ifft2676, ifft2677, _mm512_shuffle_ps(ifft2676, ifft2676, 78));
__m512 ifft2770 = _mm512_fmadd_ps(ifft2762, ifft2677, _mm512_shuffle_ps(ifft2762, ifft2762, 78));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes each
// (re, im) pair becomes (-im, re), i.e. is multiplied by i: mask_sub writes
// 0 - im into the first vector and mask_mov copies re into the second.
// All other lanes pass through unchanged.
__m512 ifft2686 = _mm512_mask_sub_ps(ifft2678, 49344, _mm512_setzero_ps(), ifft2679);
__m512 ifft2771 = _mm512_mask_sub_ps(ifft2763, 49344, _mm512_setzero_ps(), ifft2764);
__m512 ifft2687 = _mm512_mask_mov_ps(ifft2679, 49344, ifft2678);
__m512 ifft2772 = _mm512_mask_mov_ps(ifft2764, 49344, ifft2763);
__m512 ifft2688 = _mm512_mask_sub_ps(ifft2680, 49344, _mm512_setzero_ps(), ifft2681);
__m512 ifft2773 = _mm512_mask_sub_ps(ifft2765, 49344, _mm512_setzero_ps(), ifft2766);
__m512 ifft2689 = _mm512_mask_mov_ps(ifft2681, 49344, ifft2680);
__m512 ifft2774 = _mm512_mask_mov_ps(ifft2766, 49344, ifft2765);
__m512 ifft2690 = _mm512_mask_sub_ps(ifft2682, 49344, _mm512_setzero_ps(), ifft2683);
__m512 ifft2775 = _mm512_mask_sub_ps(ifft2767, 49344, _mm512_setzero_ps(), ifft2768);
__m512 ifft2691 = _mm512_mask_mov_ps(ifft2683, 49344, ifft2682);
__m512 ifft2776 = _mm512_mask_mov_ps(ifft2768, 49344, ifft2767);
__m512 ifft2692 = _mm512_mask_sub_ps(ifft2684, 49344, _mm512_setzero_ps(), ifft2685);
__m512 ifft2777 = _mm512_mask_sub_ps(ifft2769, 49344, _mm512_setzero_ps(), ifft2770);
__m512 ifft2693 = _mm512_mask_mov_ps(ifft2685, 49344, ifft2684);
__m512 ifft2778 = _mm512_mask_mov_ps(ifft2770, 49344, ifft2769);
// Sum/difference across adjacent 128-bit lanes. _mm512_shuffle_f32x4 with
// immediate 177 swaps 128-bit lanes 0<->1 and 2<->3, and ifft2694 is +1 in
// lanes 0 and 2 and -1 in lanes 1 and 3. Lanes 0 and 2 get x + neighbour;
// lanes 1 and 3 get neighbour - x.
__m512 ifft2694 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft2695 = _mm512_fmadd_ps(ifft2686, ifft2694, _mm512_shuffle_f32x4(ifft2686, ifft2686, 177));
__m512 ifft2779 = _mm512_fmadd_ps(ifft2771, ifft2694, _mm512_shuffle_f32x4(ifft2771, ifft2771, 177));
__m512 ifft2696 = _mm512_fmadd_ps(ifft2687, ifft2694, _mm512_shuffle_f32x4(ifft2687, ifft2687, 177));
__m512 ifft2780 = _mm512_fmadd_ps(ifft2772, ifft2694, _mm512_shuffle_f32x4(ifft2772, ifft2772, 177));
__m512 ifft2697 = _mm512_fmadd_ps(ifft2688, ifft2694, _mm512_shuffle_f32x4(ifft2688, ifft2688, 177));
__m512 ifft2781 = _mm512_fmadd_ps(ifft2773, ifft2694, _mm512_shuffle_f32x4(ifft2773, ifft2773, 177));
__m512 ifft2698 = _mm512_fmadd_ps(ifft2689, ifft2694, _mm512_shuffle_f32x4(ifft2689, ifft2689, 177));
__m512 ifft2782 = _mm512_fmadd_ps(ifft2774, ifft2694, _mm512_shuffle_f32x4(ifft2774, ifft2774, 177));
__m512 ifft2699 = _mm512_fmadd_ps(ifft2690, ifft2694, _mm512_shuffle_f32x4(ifft2690, ifft2690, 177));
__m512 ifft2783 = _mm512_fmadd_ps(ifft2775, ifft2694, _mm512_shuffle_f32x4(ifft2775, ifft2775, 177));
// Only this pair uses fnmsub, -(x*sign) - neighbour: the same combination as
// the others with the whole result negated.
__m512 ifft2700 = _mm512_fnmsub_ps(ifft2691, ifft2694, _mm512_shuffle_f32x4(ifft2691, ifft2691, 177));
__m512 ifft2784 = _mm512_fnmsub_ps(ifft2776, ifft2694, _mm512_shuffle_f32x4(ifft2776, ifft2776, 177));
__m512 ifft2701 = _mm512_fmadd_ps(ifft2692, ifft2694, _mm512_shuffle_f32x4(ifft2692, ifft2692, 177));
__m512 ifft2785 = _mm512_fmadd_ps(ifft2777, ifft2694, _mm512_shuffle_f32x4(ifft2777, ifft2777, 177));
__m512 ifft2702 = _mm512_fmadd_ps(ifft2693, ifft2694, _mm512_shuffle_f32x4(ifft2693, ifft2693, 177));
__m512 ifft2786 = _mm512_fmadd_ps(ifft2778, ifft2694, _mm512_shuffle_f32x4(ifft2778, ifft2778, 177));
// Final add/sub/scale network of this tile. Two independent value sets are
// processed in lock-step: ifft2695..ifft2702 and ifft2779..ifft2786. Each set
// yields eight results (ifft2725..ifft2732 and ifft2809..ifft2816).
// NOTE(review): presumably the last pass of an inverse FFT with the
// normalization folded into the constants -- confirm against the generator.
// Sums and differences of the inputs.
__m512 ifft2703 = _mm512_add_ps(ifft2695, ifft2696);
__m512 ifft2787 = _mm512_add_ps(ifft2779, ifft2780);
__m512 ifft2704 = _mm512_sub_ps(ifft2695, ifft2696);
__m512 ifft2788 = _mm512_sub_ps(ifft2779, ifft2780);
__m512 ifft2705 = _mm512_sub_ps(ifft2697, ifft2701);
__m512 ifft2789 = _mm512_sub_ps(ifft2781, ifft2785);
__m512 ifft2706 = _mm512_add_ps(ifft2698, ifft2702);
__m512 ifft2790 = _mm512_add_ps(ifft2782, ifft2786);
__m512 ifft2707 = _mm512_add_ps(ifft2697, ifft2701);
__m512 ifft2791 = _mm512_add_ps(ifft2781, ifft2785);
__m512 ifft2708 = _mm512_sub_ps(ifft2698, ifft2702);
__m512 ifft2792 = _mm512_sub_ps(ifft2782, ifft2786);
// Scaling: 3.125e-02f is 1/32 and 1.5625e-02f is 1/64.
__m512 ifft2709 = _mm512_mul_ps(ifft2699, _mm512_set1_ps(3.125e-02f));
__m512 ifft2793 = _mm512_mul_ps(ifft2783, _mm512_set1_ps(3.125e-02f));
__m512 ifft2710 = _mm512_mul_ps(ifft2700, _mm512_set1_ps(3.125e-02f));
__m512 ifft2794 = _mm512_mul_ps(ifft2784, _mm512_set1_ps(3.125e-02f));
// fmadd gives x/64 + y, fmsub gives x/64 - y.
__m512 ifft2711 = _mm512_fmadd_ps(ifft2703, _mm512_set1_ps(1.5625e-02f), ifft2709);
__m512 ifft2795 = _mm512_fmadd_ps(ifft2787, _mm512_set1_ps(1.5625e-02f), ifft2793);
__m512 ifft2712 = _mm512_fmsub_ps(ifft2703, _mm512_set1_ps(1.5625e-02f), ifft2709);
__m512 ifft2796 = _mm512_fmsub_ps(ifft2787, _mm512_set1_ps(1.5625e-02f), ifft2793);
__m512 ifft2713 = _mm512_fmadd_ps(ifft2704, _mm512_set1_ps(1.5625e-02f), ifft2710);
__m512 ifft2797 = _mm512_fmadd_ps(ifft2788, _mm512_set1_ps(1.5625e-02f), ifft2794);
__m512 ifft2714 = _mm512_fmsub_ps(ifft2704, _mm512_set1_ps(1.5625e-02f), ifft2710);
__m512 ifft2798 = _mm512_fmsub_ps(ifft2788, _mm512_set1_ps(1.5625e-02f), ifft2794);
__m512 ifft2715 = _mm512_add_ps(ifft2705, ifft2706);
__m512 ifft2799 = _mm512_add_ps(ifft2789, ifft2790);
__m512 ifft2716 = _mm512_sub_ps(ifft2705, ifft2706);
__m512 ifft2800 = _mm512_sub_ps(ifft2789, ifft2790);
// 7.0710677e-01f is sqrt(1/2) rounded to float.
__m512 ifft2717 = _mm512_fnmadd_ps(ifft2715, _mm512_set1_ps(7.0710677e-01f), ifft2707);
__m512 ifft2801 = _mm512_fnmadd_ps(ifft2799, _mm512_set1_ps(7.0710677e-01f), ifft2791);
__m512 ifft2718 = _mm512_fmadd_ps(ifft2715, _mm512_set1_ps(7.0710677e-01f), ifft2707);
__m512 ifft2802 = _mm512_fmadd_ps(ifft2799, _mm512_set1_ps(7.0710677e-01f), ifft2791);
__m512 ifft2719 = _mm512_fmadd_ps(ifft2716, _mm512_set1_ps(7.0710677e-01f), ifft2708);
__m512 ifft2803 = _mm512_fmadd_ps(ifft2800, _mm512_set1_ps(7.0710677e-01f), ifft2792);
__m512 ifft2720 = _mm512_fmsub_ps(ifft2716, _mm512_set1_ps(7.0710677e-01f), ifft2708);
__m512 ifft2804 = _mm512_fmsub_ps(ifft2800, _mm512_set1_ps(7.0710677e-01f), ifft2792);
__m512 ifft2721 = _mm512_add_ps(ifft2717, ifft2718);
__m512 ifft2805 = _mm512_add_ps(ifft2801, ifft2802);
__m512 ifft2722 = _mm512_sub_ps(ifft2717, ifft2718);
__m512 ifft2806 = _mm512_sub_ps(ifft2801, ifft2802);
__m512 ifft2723 = _mm512_add_ps(ifft2719, ifft2720);
__m512 ifft2807 = _mm512_add_ps(ifft2803, ifft2804);
__m512 ifft2724 = _mm512_sub_ps(ifft2719, ifft2720);
__m512 ifft2808 = _mm512_sub_ps(ifft2803, ifft2804);
// Eight results per set: each is (scaled partial) +/- (another partial)/64.
__m512 ifft2725 = _mm512_fmadd_ps(ifft2721, _mm512_set1_ps(1.5625e-02f), ifft2711);
__m512 ifft2809 = _mm512_fmadd_ps(ifft2805, _mm512_set1_ps(1.5625e-02f), ifft2795);
__m512 ifft2726 = _mm512_fnmadd_ps(ifft2721, _mm512_set1_ps(1.5625e-02f), ifft2711);
__m512 ifft2810 = _mm512_fnmadd_ps(ifft2805, _mm512_set1_ps(1.5625e-02f), ifft2795);
__m512 ifft2727 = _mm512_fmadd_ps(ifft2723, _mm512_set1_ps(1.5625e-02f), ifft2713);
__m512 ifft2811 = _mm512_fmadd_ps(ifft2807, _mm512_set1_ps(1.5625e-02f), ifft2797);
__m512 ifft2728 = _mm512_fnmadd_ps(ifft2723, _mm512_set1_ps(1.5625e-02f), ifft2713);
__m512 ifft2812 = _mm512_fnmadd_ps(ifft2807, _mm512_set1_ps(1.5625e-02f), ifft2797);
__m512 ifft2729 = _mm512_fnmadd_ps(ifft2724, _mm512_set1_ps(1.5625e-02f), ifft2712);
__m512 ifft2813 = _mm512_fnmadd_ps(ifft2808, _mm512_set1_ps(1.5625e-02f), ifft2796);
__m512 ifft2730 = _mm512_fmadd_ps(ifft2724, _mm512_set1_ps(1.5625e-02f), ifft2712);
__m512 ifft2814 = _mm512_fmadd_ps(ifft2808, _mm512_set1_ps(1.5625e-02f), ifft2796);
__m512 ifft2731 = _mm512_fmadd_ps(ifft2722, _mm512_set1_ps(1.5625e-02f), ifft2714);
__m512 ifft2815 = _mm512_fmadd_ps(ifft2806, _mm512_set1_ps(1.5625e-02f), ifft2798);
__m512 ifft2732 = _mm512_fnmadd_ps(ifft2722, _mm512_set1_ps(1.5625e-02f), ifft2714);
__m512 ifft2816 = _mm512_fnmadd_ps(ifft2806, _mm512_set1_ps(1.5625e-02f), ifft2798);
// Select, pack and clamp the results of this tile.
// Five of the eight results from each set are kept (dat740..dat744 from the
// first set, dat745..dat749 from the second).
__m512 dat740 = ifft2725;
__m512 dat745 = ifft2809;
__m512 dat741 = ifft2727;
__m512 dat746 = ifft2811;
__m512 dat742 = ifft2729;
__m512 dat747 = ifft2813;
__m512 dat743 = ifft2731;
__m512 dat748 = ifft2815;
__m512 dat744 = ifft2726;
__m512 dat749 = ifft2810;
// The other three results per set are not used; the (void) casts mark them as
// intentionally ignored (presumably to silence unused-variable warnings).
(void)ifft2728;
(void)ifft2812;
(void)ifft2730;
(void)ifft2814;
(void)ifft2732;
(void)ifft2816;
// Two-source permutes: index bit 4 picks the second source, the low four bits
// pick the element. _mm512_set_epi32 lists lane 15 first and lane 0 last.
// pm27: lanes 0-4 = first source [0..4], lanes 5-9 = second source [0..4],
//       lanes 10-15 repeat that pattern.
__m512i pm27 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack131 = _mm512_permutex2var_ps(dat740, pm27, dat745);
// pm28: lanes 0-4 = second source [8..12], lanes 5-9 = first source [8..12],
//       lanes 10-15 repeat that pattern.
__m512i pm28 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack132 = _mm512_permutex2var_ps(dat740, pm28, dat745);
__m512 pack133 = _mm512_permutex2var_ps(dat741, pm27, dat746);
__m512 pack134 = _mm512_permutex2var_ps(dat741, pm28, dat746);
__m512 pack135 = _mm512_permutex2var_ps(dat742, pm27, dat747);
__m512 pack136 = _mm512_permutex2var_ps(dat742, pm28, dat747);
__m512 pack137 = _mm512_permutex2var_ps(dat743, pm27, dat748);
__m512 pack138 = _mm512_permutex2var_ps(dat743, pm28, dat748);
__m512 pack139 = _mm512_permutex2var_ps(dat744, pm27, dat749);
__m512 pack140 = _mm512_permutex2var_ps(dat744, pm28, dat749);
// Element-wise max against zero (ReLU-shaped clamp of negative values).
pack131 = _mm512_max_ps(_mm512_setzero_ps(), pack131);
pack132 = _mm512_max_ps(_mm512_setzero_ps(), pack132);
pack133 = _mm512_max_ps(_mm512_setzero_ps(), pack133);
pack134 = _mm512_max_ps(_mm512_setzero_ps(), pack134);
pack135 = _mm512_max_ps(_mm512_setzero_ps(), pack135);
pack136 = _mm512_max_ps(_mm512_setzero_ps(), pack136);
pack137 = _mm512_max_ps(_mm512_setzero_ps(), pack137);
pack138 = _mm512_max_ps(_mm512_setzero_ps(), pack138);
pack139 = _mm512_max_ps(_mm512_setzero_ps(), pack139);
pack140 = _mm512_max_ps(_mm512_setzero_ps(), pack140);
// Masked unaligned stores of pack131..pack140 to datPtr2. Mask 1023 (0x3FF)
// writes only lanes 0-9 of each vector; the remaining lanes are not stored.
// The constant offsets come in pairs 50240 apart (1820/52060, 2268/52508, ...),
// advancing by 448 from one pair to the next. The t17 term is multiplied by 0
// and so contributes nothing to the address.
// NOTE(review): 448 is also the toH9 multiplier, so each pair presumably
// targets the next output row -- confirm against the tensor layout.
_mm512_mask_storeu_ps(datPtr2+1820+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack131);
_mm512_mask_storeu_ps(datPtr2+52060+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack132);
_mm512_mask_storeu_ps(datPtr2+2268+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack133);
_mm512_mask_storeu_ps(datPtr2+52508+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack134);
_mm512_mask_storeu_ps(datPtr2+2716+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack135);
_mm512_mask_storeu_ps(datPtr2+52956+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack136);
_mm512_mask_storeu_ps(datPtr2+3164+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack137);
_mm512_mask_storeu_ps(datPtr2+53404+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack138);
_mm512_mask_storeu_ps(datPtr2+3612+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack139);
_mm512_mask_storeu_ps(datPtr2+53852+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack140);
// Next tile. t18 is fixed at 0, so the 256*t18 term in the addresses below
// contributes nothing.
ptrdiff_t t18 = 0;
// Load sixteen vectors from sfPtr3 as Re/Im pairs. There are four groups with
// base offsets 512, 2167296, 4334080 and 6500864 (2166784 apart). Each group
// holds two Re/Im pairs at consecutive offsets of 64.
// NOTE(review): a step of 64 equals the byte size of one __m512, which suggests
// sfPtr3 is a byte pointer -- confirm at its declaration.
__m512 sfRe185 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm185 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe189 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm189 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe186 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm186 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe190 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm190 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe187 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm187 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe191 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm191 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe188 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm188 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe192 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm192 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
// First pass over the loaded vectors. Only the first Re/Im pair of each set
// (sfRe185/sfIm185 and sfRe189/sfIm189) is permuted and recombined here; the
// other pairs go straight to the adjacent-pair step below.
// _mm512_set_epi32/_mm512_set_ps list lane 15 first and lane 0 last.
// ifft2817, lanes 0..7: 0,1,2,2,4,6,6,4 (lanes 8..15: the same plus 8).
__m512i ifft2817 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2818 = _mm512_permutexvar_ps(ifft2817, sfRe185);
__m512 ifft2909 = _mm512_permutexvar_ps(ifft2817, sfRe189);
// ifft2819, lanes 0..7: 1,0,3,3,5,7,7,5 (lanes 8..15: the same plus 8).
__m512i ifft2819 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft2820 = _mm512_permutexvar_ps(ifft2819, sfRe185);
__m512 ifft2910 = _mm512_permutexvar_ps(ifft2819, sfRe189);
__m512 ifft2821 = _mm512_permutexvar_ps(ifft2817, sfIm185);
__m512 ifft2911 = _mm512_permutexvar_ps(ifft2817, sfIm189);
__m512 ifft2822 = _mm512_permutexvar_ps(ifft2819, sfIm185);
__m512 ifft2912 = _mm512_permutexvar_ps(ifft2819, sfIm189);
// Masked FMA with mask 65021 (0xFDFD): every lane except 1 and 9 receives
// a*b+c (fmadd) or -(a*b)+c (fnmadd); lanes 1 and 9 keep the first operand.
__m512 ifft2823 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft2824 = _mm512_mask_fmadd_ps(ifft2822, 65021, ifft2823, ifft2818);
__m512 ifft2913 = _mm512_mask_fmadd_ps(ifft2912, 65021, ifft2823, ifft2909);
__m512 ifft2825 = _mm512_mask_fnmadd_ps(ifft2821, 65021, ifft2823, ifft2820);
__m512 ifft2914 = _mm512_mask_fnmadd_ps(ifft2911, 65021, ifft2823, ifft2910);
// Adjacent-pair sum/difference: shuffle immediate 177 swaps elements 0<->1 and
// 2<->3 inside each 128-bit lane, and ifft2826 is +1 in even lanes and -1 in
// odd lanes. Even lane e gets x[e]+x[e+1]; odd lane o gets x[o-1]-x[o].
__m512 ifft2826 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft2827 = _mm512_fmadd_ps(ifft2824, ifft2826, _mm512_shuffle_ps(ifft2824, ifft2824, 177));
__m512 ifft2915 = _mm512_fmadd_ps(ifft2913, ifft2826, _mm512_shuffle_ps(ifft2913, ifft2913, 177));
__m512 ifft2828 = _mm512_fmadd_ps(ifft2825, ifft2826, _mm512_shuffle_ps(ifft2825, ifft2825, 177));
__m512 ifft2916 = _mm512_fmadd_ps(ifft2914, ifft2826, _mm512_shuffle_ps(ifft2914, ifft2914, 177));
__m512 ifft2829 = _mm512_fmadd_ps(sfRe186, ifft2826, _mm512_shuffle_ps(sfRe186, sfRe186, 177));
__m512 ifft2917 = _mm512_fmadd_ps(sfRe190, ifft2826, _mm512_shuffle_ps(sfRe190, sfRe190, 177));
__m512 ifft2830 = _mm512_fmadd_ps(sfIm186, ifft2826, _mm512_shuffle_ps(sfIm186, sfIm186, 177));
__m512 ifft2918 = _mm512_fmadd_ps(sfIm190, ifft2826, _mm512_shuffle_ps(sfIm190, sfIm190, 177));
__m512 ifft2831 = _mm512_fmadd_ps(sfRe187, ifft2826, _mm512_shuffle_ps(sfRe187, sfRe187, 177));
__m512 ifft2919 = _mm512_fmadd_ps(sfRe191, ifft2826, _mm512_shuffle_ps(sfRe191, sfRe191, 177));
__m512 ifft2832 = _mm512_fmadd_ps(sfIm187, ifft2826, _mm512_shuffle_ps(sfIm187, sfIm187, 177));
__m512 ifft2920 = _mm512_fmadd_ps(sfIm191, ifft2826, _mm512_shuffle_ps(sfIm191, sfIm191, 177));
__m512 ifft2833 = _mm512_fmadd_ps(sfRe188, ifft2826, _mm512_shuffle_ps(sfRe188, sfRe188, 177));
__m512 ifft2921 = _mm512_fmadd_ps(sfRe192, ifft2826, _mm512_shuffle_ps(sfRe192, sfRe192, 177));
__m512 ifft2834 = _mm512_fmadd_ps(sfIm188, ifft2826, _mm512_shuffle_ps(sfIm188, sfIm188, 177));
__m512 ifft2922 = _mm512_fmadd_ps(sfIm192, ifft2826, _mm512_shuffle_ps(sfIm192, sfIm192, 177));
// Per-lane multiplication of each (re, im) vector pair by the constant
// (c1 + i*c2), where c1 = ifft2835 and c2 = ifft2844:
//   re' = re*c1 - im*c2   (mul, then fnmadd)
//   im' = im*c1 + re*c2   (mul, then fmadd)
// Lanes 0..7 of (c1, c2), repeated for lanes 8..15:
//   (1,0) (1,0) (1,0) (0,1) (1,0) (0.7071,0.7071) (1,0) (-0.7071,0.7071)
// so lanes 0, 1, 2, 4 and 6 are left unchanged.
// NOTE(review): presumably these are the FFT twiddle factors -- confirm against
// the generator.
__m512 ifft2835 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft2836 = _mm512_mul_ps(ifft2827, ifft2835);
__m512 ifft2923 = _mm512_mul_ps(ifft2915, ifft2835);
__m512 ifft2837 = _mm512_mul_ps(ifft2828, ifft2835);
__m512 ifft2924 = _mm512_mul_ps(ifft2916, ifft2835);
__m512 ifft2838 = _mm512_mul_ps(ifft2829, ifft2835);
__m512 ifft2925 = _mm512_mul_ps(ifft2917, ifft2835);
__m512 ifft2839 = _mm512_mul_ps(ifft2830, ifft2835);
__m512 ifft2926 = _mm512_mul_ps(ifft2918, ifft2835);
__m512 ifft2840 = _mm512_mul_ps(ifft2831, ifft2835);
__m512 ifft2927 = _mm512_mul_ps(ifft2919, ifft2835);
__m512 ifft2841 = _mm512_mul_ps(ifft2832, ifft2835);
__m512 ifft2928 = _mm512_mul_ps(ifft2920, ifft2835);
__m512 ifft2842 = _mm512_mul_ps(ifft2833, ifft2835);
__m512 ifft2929 = _mm512_mul_ps(ifft2921, ifft2835);
__m512 ifft2843 = _mm512_mul_ps(ifft2834, ifft2835);
__m512 ifft2930 = _mm512_mul_ps(ifft2922, ifft2835);
__m512 ifft2844 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft2845 = _mm512_fnmadd_ps(ifft2828, ifft2844, ifft2836);
__m512 ifft2931 = _mm512_fnmadd_ps(ifft2916, ifft2844, ifft2923);
__m512 ifft2846 = _mm512_fmadd_ps(ifft2827, ifft2844, ifft2837);
__m512 ifft2932 = _mm512_fmadd_ps(ifft2915, ifft2844, ifft2924);
__m512 ifft2847 = _mm512_fnmadd_ps(ifft2830, ifft2844, ifft2838);
__m512 ifft2933 = _mm512_fnmadd_ps(ifft2918, ifft2844, ifft2925);
__m512 ifft2848 = _mm512_fmadd_ps(ifft2829, ifft2844, ifft2839);
__m512 ifft2934 = _mm512_fmadd_ps(ifft2917, ifft2844, ifft2926);
__m512 ifft2849 = _mm512_fnmadd_ps(ifft2832, ifft2844, ifft2840);
__m512 ifft2935 = _mm512_fnmadd_ps(ifft2920, ifft2844, ifft2927);
__m512 ifft2850 = _mm512_fmadd_ps(ifft2831, ifft2844, ifft2841);
__m512 ifft2936 = _mm512_fmadd_ps(ifft2919, ifft2844, ifft2928);
__m512 ifft2851 = _mm512_fnmadd_ps(ifft2834, ifft2844, ifft2842);
__m512 ifft2937 = _mm512_fnmadd_ps(ifft2922, ifft2844, ifft2929);
__m512 ifft2852 = _mm512_fmadd_ps(ifft2833, ifft2844, ifft2843);
__m512 ifft2938 = _mm512_fmadd_ps(ifft2921, ifft2844, ifft2930);
// Sum/difference across the two 64-bit halves of each 128-bit lane. Shuffle
// immediate 78 swaps elements {0,1} with {2,3}, and ifft2853 is +1 in elements
// 0-1 and -1 in elements 2-3 of every 128-bit lane. Elements 0-1 get
// x[k]+x[k+2]; elements 2-3 get x[k-2]-x[k].
__m512 ifft2853 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft2854 = _mm512_fmadd_ps(ifft2845, ifft2853, _mm512_shuffle_ps(ifft2845, ifft2845, 78));
__m512 ifft2939 = _mm512_fmadd_ps(ifft2931, ifft2853, _mm512_shuffle_ps(ifft2931, ifft2931, 78));
__m512 ifft2855 = _mm512_fmadd_ps(ifft2846, ifft2853, _mm512_shuffle_ps(ifft2846, ifft2846, 78));
__m512 ifft2940 = _mm512_fmadd_ps(ifft2932, ifft2853, _mm512_shuffle_ps(ifft2932, ifft2932, 78));
__m512 ifft2856 = _mm512_fmadd_ps(ifft2847, ifft2853, _mm512_shuffle_ps(ifft2847, ifft2847, 78));
__m512 ifft2941 = _mm512_fmadd_ps(ifft2933, ifft2853, _mm512_shuffle_ps(ifft2933, ifft2933, 78));
__m512 ifft2857 = _mm512_fmadd_ps(ifft2848, ifft2853, _mm512_shuffle_ps(ifft2848, ifft2848, 78));
__m512 ifft2942 = _mm512_fmadd_ps(ifft2934, ifft2853, _mm512_shuffle_ps(ifft2934, ifft2934, 78));
__m512 ifft2858 = _mm512_fmadd_ps(ifft2849, ifft2853, _mm512_shuffle_ps(ifft2849, ifft2849, 78));
__m512 ifft2943 = _mm512_fmadd_ps(ifft2935, ifft2853, _mm512_shuffle_ps(ifft2935, ifft2935, 78));
__m512 ifft2859 = _mm512_fmadd_ps(ifft2850, ifft2853, _mm512_shuffle_ps(ifft2850, ifft2850, 78));
__m512 ifft2944 = _mm512_fmadd_ps(ifft2936, ifft2853, _mm512_shuffle_ps(ifft2936, ifft2936, 78));
__m512 ifft2860 = _mm512_fmadd_ps(ifft2851, ifft2853, _mm512_shuffle_ps(ifft2851, ifft2851, 78));
__m512 ifft2945 = _mm512_fmadd_ps(ifft2937, ifft2853, _mm512_shuffle_ps(ifft2937, ifft2937, 78));
__m512 ifft2861 = _mm512_fmadd_ps(ifft2852, ifft2853, _mm512_shuffle_ps(ifft2852, ifft2852, 78));
__m512 ifft2946 = _mm512_fmadd_ps(ifft2938, ifft2853, _mm512_shuffle_ps(ifft2938, ifft2938, 78));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes each
// (re, im) pair becomes (-im, re), i.e. is multiplied by i: mask_sub writes
// 0 - im into the first vector and mask_mov copies re into the second.
// All other lanes pass through unchanged.
__m512 ifft2862 = _mm512_mask_sub_ps(ifft2854, 49344, _mm512_setzero_ps(), ifft2855);
__m512 ifft2947 = _mm512_mask_sub_ps(ifft2939, 49344, _mm512_setzero_ps(), ifft2940);
__m512 ifft2863 = _mm512_mask_mov_ps(ifft2855, 49344, ifft2854);
__m512 ifft2948 = _mm512_mask_mov_ps(ifft2940, 49344, ifft2939);
__m512 ifft2864 = _mm512_mask_sub_ps(ifft2856, 49344, _mm512_setzero_ps(), ifft2857);
__m512 ifft2949 = _mm512_mask_sub_ps(ifft2941, 49344, _mm512_setzero_ps(), ifft2942);
__m512 ifft2865 = _mm512_mask_mov_ps(ifft2857, 49344, ifft2856);
__m512 ifft2950 = _mm512_mask_mov_ps(ifft2942, 49344, ifft2941);
__m512 ifft2866 = _mm512_mask_sub_ps(ifft2858, 49344, _mm512_setzero_ps(), ifft2859);
__m512 ifft2951 = _mm512_mask_sub_ps(ifft2943, 49344, _mm512_setzero_ps(), ifft2944);
__m512 ifft2867 = _mm512_mask_mov_ps(ifft2859, 49344, ifft2858);
__m512 ifft2952 = _mm512_mask_mov_ps(ifft2944, 49344, ifft2943);
__m512 ifft2868 = _mm512_mask_sub_ps(ifft2860, 49344, _mm512_setzero_ps(), ifft2861);
__m512 ifft2953 = _mm512_mask_sub_ps(ifft2945, 49344, _mm512_setzero_ps(), ifft2946);
__m512 ifft2869 = _mm512_mask_mov_ps(ifft2861, 49344, ifft2860);
__m512 ifft2954 = _mm512_mask_mov_ps(ifft2946, 49344, ifft2945);
__m512 ifft2870 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft2871 = _mm512_fmadd_ps(ifft2862, ifft2870, _mm512_shuffle_f32x4(ifft2862, ifft2862, 177));
__m512 ifft2955 = _mm512_fmadd_ps(ifft2947, ifft2870, _mm512_shuffle_f32x4(ifft2947, ifft2947, 177));
__m512 ifft2872 = _mm512_fmadd_ps(ifft2863, ifft2870, _mm512_shuffle_f32x4(ifft2863, ifft2863, 177));
__m512 ifft2956 = _mm512_fmadd_ps(ifft2948, ifft2870, _mm512_shuffle_f32x4(ifft2948, ifft2948, 177));
__m512 ifft2873 = _mm512_fmadd_ps(ifft2864, ifft2870, _mm512_shuffle_f32x4(ifft2864, ifft2864, 177));
__m512 ifft2957 = _mm512_fmadd_ps(ifft2949, ifft2870, _mm512_shuffle_f32x4(ifft2949, ifft2949, 177));
__m512 ifft2874 = _mm512_fmadd_ps(ifft2865, ifft2870, _mm512_shuffle_f32x4(ifft2865, ifft2865, 177));
__m512 ifft2958 = _mm512_fmadd_ps(ifft2950, ifft2870, _mm512_shuffle_f32x4(ifft2950, ifft2950, 177));
__m512 ifft2875 = _mm512_fmadd_ps(ifft2866, ifft2870, _mm512_shuffle_f32x4(ifft2866, ifft2866, 177));
__m512 ifft2959 = _mm512_fmadd_ps(ifft2951, ifft2870, _mm512_shuffle_f32x4(ifft2951, ifft2951, 177));
__m512 ifft2876 = _mm512_fnmsub_ps(ifft2867, ifft2870, _mm512_shuffle_f32x4(ifft2867, ifft2867, 177));
__m512 ifft2960 = _mm512_fnmsub_ps(ifft2952, ifft2870, _mm512_shuffle_f32x4(ifft2952, ifft2952, 177));
__m512 ifft2877 = _mm512_fmadd_ps(ifft2868, ifft2870, _mm512_shuffle_f32x4(ifft2868, ifft2868, 177));
__m512 ifft2961 = _mm512_fmadd_ps(ifft2953, ifft2870, _mm512_shuffle_f32x4(ifft2953, ifft2953, 177));
__m512 ifft2878 = _mm512_fmadd_ps(ifft2869, ifft2870, _mm512_shuffle_f32x4(ifft2869, ifft2869, 177));
__m512 ifft2962 = _mm512_fmadd_ps(ifft2954, ifft2870, _mm512_shuffle_f32x4(ifft2954, ifft2954, 177));
__m512 ifft2879 = _mm512_add_ps(ifft2871, ifft2872);
__m512 ifft2963 = _mm512_add_ps(ifft2955, ifft2956);
__m512 ifft2880 = _mm512_sub_ps(ifft2871, ifft2872);
__m512 ifft2964 = _mm512_sub_ps(ifft2955, ifft2956);
__m512 ifft2881 = _mm512_sub_ps(ifft2873, ifft2877);
__m512 ifft2965 = _mm512_sub_ps(ifft2957, ifft2961);
__m512 ifft2882 = _mm512_add_ps(ifft2874, ifft2878);
__m512 ifft2966 = _mm512_add_ps(ifft2958, ifft2962);
__m512 ifft2883 = _mm512_add_ps(ifft2873, ifft2877);
__m512 ifft2967 = _mm512_add_ps(ifft2957, ifft2961);
__m512 ifft2884 = _mm512_sub_ps(ifft2874, ifft2878);
__m512 ifft2968 = _mm512_sub_ps(ifft2958, ifft2962);
__m512 ifft2885 = _mm512_mul_ps(ifft2875, _mm512_set1_ps(3.125e-02f));
__m512 ifft2969 = _mm512_mul_ps(ifft2959, _mm512_set1_ps(3.125e-02f));
__m512 ifft2886 = _mm512_mul_ps(ifft2876, _mm512_set1_ps(3.125e-02f));
__m512 ifft2970 = _mm512_mul_ps(ifft2960, _mm512_set1_ps(3.125e-02f));
__m512 ifft2887 = _mm512_fmadd_ps(ifft2879, _mm512_set1_ps(1.5625e-02f), ifft2885);
__m512 ifft2971 = _mm512_fmadd_ps(ifft2963, _mm512_set1_ps(1.5625e-02f), ifft2969);
__m512 ifft2888 = _mm512_fmsub_ps(ifft2879, _mm512_set1_ps(1.5625e-02f), ifft2885);
__m512 ifft2972 = _mm512_fmsub_ps(ifft2963, _mm512_set1_ps(1.5625e-02f), ifft2969);
__m512 ifft2889 = _mm512_fmadd_ps(ifft2880, _mm512_set1_ps(1.5625e-02f), ifft2886);
__m512 ifft2973 = _mm512_fmadd_ps(ifft2964, _mm512_set1_ps(1.5625e-02f), ifft2970);
__m512 ifft2890 = _mm512_fmsub_ps(ifft2880, _mm512_set1_ps(1.5625e-02f), ifft2886);
__m512 ifft2974 = _mm512_fmsub_ps(ifft2964, _mm512_set1_ps(1.5625e-02f), ifft2970);
__m512 ifft2891 = _mm512_add_ps(ifft2881, ifft2882);
__m512 ifft2975 = _mm512_add_ps(ifft2965, ifft2966);
__m512 ifft2892 = _mm512_sub_ps(ifft2881, ifft2882);
__m512 ifft2976 = _mm512_sub_ps(ifft2965, ifft2966);
__m512 ifft2893 = _mm512_fnmadd_ps(ifft2891, _mm512_set1_ps(7.0710677e-01f), ifft2883);
__m512 ifft2977 = _mm512_fnmadd_ps(ifft2975, _mm512_set1_ps(7.0710677e-01f), ifft2967);
__m512 ifft2894 = _mm512_fmadd_ps(ifft2891, _mm512_set1_ps(7.0710677e-01f), ifft2883);
__m512 ifft2978 = _mm512_fmadd_ps(ifft2975, _mm512_set1_ps(7.0710677e-01f), ifft2967);
__m512 ifft2895 = _mm512_fmadd_ps(ifft2892, _mm512_set1_ps(7.0710677e-01f), ifft2884);
__m512 ifft2979 = _mm512_fmadd_ps(ifft2976, _mm512_set1_ps(7.0710677e-01f), ifft2968);
__m512 ifft2896 = _mm512_fmsub_ps(ifft2892, _mm512_set1_ps(7.0710677e-01f), ifft2884);
__m512 ifft2980 = _mm512_fmsub_ps(ifft2976, _mm512_set1_ps(7.0710677e-01f), ifft2968);
__m512 ifft2897 = _mm512_add_ps(ifft2893, ifft2894);
__m512 ifft2981 = _mm512_add_ps(ifft2977, ifft2978);
__m512 ifft2898 = _mm512_sub_ps(ifft2893, ifft2894);
__m512 ifft2982 = _mm512_sub_ps(ifft2977, ifft2978);
__m512 ifft2899 = _mm512_add_ps(ifft2895, ifft2896);
__m512 ifft2983 = _mm512_add_ps(ifft2979, ifft2980);
__m512 ifft2900 = _mm512_sub_ps(ifft2895, ifft2896);
__m512 ifft2984 = _mm512_sub_ps(ifft2979, ifft2980);
__m512 ifft2901 = _mm512_fmadd_ps(ifft2897, _mm512_set1_ps(1.5625e-02f), ifft2887);
__m512 ifft2985 = _mm512_fmadd_ps(ifft2981, _mm512_set1_ps(1.5625e-02f), ifft2971);
__m512 ifft2902 = _mm512_fnmadd_ps(ifft2897, _mm512_set1_ps(1.5625e-02f), ifft2887);
__m512 ifft2986 = _mm512_fnmadd_ps(ifft2981, _mm512_set1_ps(1.5625e-02f), ifft2971);
__m512 ifft2903 = _mm512_fmadd_ps(ifft2899, _mm512_set1_ps(1.5625e-02f), ifft2889);
__m512 ifft2987 = _mm512_fmadd_ps(ifft2983, _mm512_set1_ps(1.5625e-02f), ifft2973);
__m512 ifft2904 = _mm512_fnmadd_ps(ifft2899, _mm512_set1_ps(1.5625e-02f), ifft2889);
__m512 ifft2988 = _mm512_fnmadd_ps(ifft2983, _mm512_set1_ps(1.5625e-02f), ifft2973);
__m512 ifft2905 = _mm512_fnmadd_ps(ifft2900, _mm512_set1_ps(1.5625e-02f), ifft2888);
__m512 ifft2989 = _mm512_fnmadd_ps(ifft2984, _mm512_set1_ps(1.5625e-02f), ifft2972);
__m512 ifft2906 = _mm512_fmadd_ps(ifft2900, _mm512_set1_ps(1.5625e-02f), ifft2888);
__m512 ifft2990 = _mm512_fmadd_ps(ifft2984, _mm512_set1_ps(1.5625e-02f), ifft2972);
__m512 ifft2907 = _mm512_fmadd_ps(ifft2898, _mm512_set1_ps(1.5625e-02f), ifft2890);
__m512 ifft2991 = _mm512_fmadd_ps(ifft2982, _mm512_set1_ps(1.5625e-02f), ifft2974);
__m512 ifft2908 = _mm512_fnmadd_ps(ifft2898, _mm512_set1_ps(1.5625e-02f), ifft2890);
__m512 ifft2992 = _mm512_fnmadd_ps(ifft2982, _mm512_set1_ps(1.5625e-02f), ifft2974);
__m512 dat750 = ifft2901;
__m512 dat755 = ifft2985;
__m512 dat751 = ifft2903;
__m512 dat756 = ifft2987;
__m512 dat752 = ifft2905;
__m512 dat757 = ifft2989;
__m512 dat753 = ifft2907;
__m512 dat758 = ifft2991;
__m512 dat754 = ifft2902;
__m512 dat759 = ifft2986;
(void)ifft2904;
(void)ifft2988;
(void)ifft2906;
(void)ifft2990;
(void)ifft2908;
(void)ifft2992;
__m512i pm29 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack141 = _mm512_permutex2var_ps(dat750, pm29, dat755);
__m512i pm30 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack142 = _mm512_permutex2var_ps(dat750, pm30, dat755);
__m512 pack143 = _mm512_permutex2var_ps(dat751, pm29, dat756);
__m512 pack144 = _mm512_permutex2var_ps(dat751, pm30, dat756);
__m512 pack145 = _mm512_permutex2var_ps(dat752, pm29, dat757);
__m512 pack146 = _mm512_permutex2var_ps(dat752, pm30, dat757);
__m512 pack147 = _mm512_permutex2var_ps(dat753, pm29, dat758);
__m512 pack148 = _mm512_permutex2var_ps(dat753, pm30, dat758);
__m512 pack149 = _mm512_permutex2var_ps(dat754, pm29, dat759);
__m512 pack150 = _mm512_permutex2var_ps(dat754, pm30, dat759);
pack141 = _mm512_max_ps(_mm512_setzero_ps(), pack141);
pack142 = _mm512_max_ps(_mm512_setzero_ps(), pack142);
pack143 = _mm512_max_ps(_mm512_setzero_ps(), pack143);
pack144 = _mm512_max_ps(_mm512_setzero_ps(), pack144);
pack145 = _mm512_max_ps(_mm512_setzero_ps(), pack145);
pack146 = _mm512_max_ps(_mm512_setzero_ps(), pack146);
pack147 = _mm512_max_ps(_mm512_setzero_ps(), pack147);
pack148 = _mm512_max_ps(_mm512_setzero_ps(), pack148);
pack149 = _mm512_max_ps(_mm512_setzero_ps(), pack149);
pack150 = _mm512_max_ps(_mm512_setzero_ps(), pack150);
_mm512_mask_storeu_ps(datPtr2+1860+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack141);
_mm512_mask_storeu_ps(datPtr2+52100+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack142);
_mm512_mask_storeu_ps(datPtr2+2308+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack143);
_mm512_mask_storeu_ps(datPtr2+52548+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack144);
_mm512_mask_storeu_ps(datPtr2+2756+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack145);
_mm512_mask_storeu_ps(datPtr2+52996+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack146);
_mm512_mask_storeu_ps(datPtr2+3204+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack147);
_mm512_mask_storeu_ps(datPtr2+53444+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack148);
_mm512_mask_storeu_ps(datPtr2+3652+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack149);
_mm512_mask_storeu_ps(datPtr2+53892+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack150);
}
}
if (j5 >= last2) return;
++j5;
rel5 = 12;
}
// Tile loop taken only while rel5 < 15. It walks j5 from its current value up to
// jj15 = 14-rel5+j5 (that is 15-rel5 tiles, unless the early `return` below fires),
// writing every tile at the fixed toH10 = base5+15 while toW10 starts at
// -340+30*rel5 and advances by 30 per tile. On normal exit rel5 is set to 15.
// For each tile the generator-named "ifft" arithmetic turns the sfRe*/sfIm* vectors
// loaded from sfPtr3 into dat* vectors, which are packed, clamped at zero and stored
// to datPtr2. NOTE(review): the offsets look like byte offsets (64 between
// consecutive 16-float vectors), so sfPtr3/datPtr2 are presumably char pointers -
// their declarations are outside this excerpt, confirm there.
if (rel5 < 15) {
ptrdiff_t toH10 = base5+15;
ptrdiff_t toW10 = -340+30*rel5;
ptrdiff_t jj15 = 14-rel5+j5;
for (; j5 <= jj15; toW10 += 30) {
// k34 starts at 16*w21 and runs until it equals 16; r11 takes 0..1; t19 takes 0..2.
// All three indices feed both the load offsets (1536*k34, 768*r11, 256*t19) and
// the store offsets (200960*k34, 100480*r11, 40*t19).
ptrdiff_t k34 = 16*w21;
for (; k34 != 16; ++k34) {
ptrdiff_t r11 = 0;
for (; r11 != 2; ++r11) {
ptrdiff_t t19 = 0;
for (; t19 < 3; ++t19) {
// Load 16 vectors: four groups spaced 2166784 apart, each group holding
// Re/Im for the first data set (193..196) followed by Re/Im for the second
// data set (197..200). The two data sets are processed independently and in
// lock-step through every stage below.
__m512 sfRe193 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm193 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe197 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm197 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe194 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm194 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe198 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm198 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe195 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm195 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe199 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm199 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe196 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm196 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe200 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm200 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
// Only the first group (Re/Im 193 and 197) is rearranged with the two index
// vectors below before being combined; groups 194..196 and 198..200 go
// straight into the butterfly stage.
__m512i ifft2993 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2994 = _mm512_permutexvar_ps(ifft2993, sfRe193);
__m512 ifft3085 = _mm512_permutexvar_ps(ifft2993, sfRe197);
__m512i ifft2995 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft2996 = _mm512_permutexvar_ps(ifft2995, sfRe193);
__m512 ifft3086 = _mm512_permutexvar_ps(ifft2995, sfRe197);
__m512 ifft2997 = _mm512_permutexvar_ps(ifft2993, sfIm193);
__m512 ifft3087 = _mm512_permutexvar_ps(ifft2993, sfIm197);
__m512 ifft2998 = _mm512_permutexvar_ps(ifft2995, sfIm193);
__m512 ifft3088 = _mm512_permutexvar_ps(ifft2995, sfIm197);
// Mask 65021 is 0xFDFD: every lane except 1 and 9 receives a*b+c (or -(a*b)+c
// for fnmadd); lanes 1 and 9 keep the first operand unchanged.
__m512 ifft2999 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft3000 = _mm512_mask_fmadd_ps(ifft2998, 65021, ifft2999, ifft2994);
__m512 ifft3089 = _mm512_mask_fmadd_ps(ifft3088, 65021, ifft2999, ifft3085);
__m512 ifft3001 = _mm512_mask_fnmadd_ps(ifft2997, 65021, ifft2999, ifft2996);
__m512 ifft3090 = _mm512_mask_fnmadd_ps(ifft3087, 65021, ifft2999, ifft3086);
// Butterfly between adjacent lanes: shuffle immediate 177 swaps each pair of
// neighbouring floats, and the +1/-1 pattern makes even lanes hold x[2k]+x[2k+1]
// and odd lanes hold x[2k]-x[2k+1].
__m512 ifft3002 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft3003 = _mm512_fmadd_ps(ifft3000, ifft3002, _mm512_shuffle_ps(ifft3000, ifft3000, 177));
__m512 ifft3091 = _mm512_fmadd_ps(ifft3089, ifft3002, _mm512_shuffle_ps(ifft3089, ifft3089, 177));
__m512 ifft3004 = _mm512_fmadd_ps(ifft3001, ifft3002, _mm512_shuffle_ps(ifft3001, ifft3001, 177));
__m512 ifft3092 = _mm512_fmadd_ps(ifft3090, ifft3002, _mm512_shuffle_ps(ifft3090, ifft3090, 177));
__m512 ifft3005 = _mm512_fmadd_ps(sfRe194, ifft3002, _mm512_shuffle_ps(sfRe194, sfRe194, 177));
__m512 ifft3093 = _mm512_fmadd_ps(sfRe198, ifft3002, _mm512_shuffle_ps(sfRe198, sfRe198, 177));
__m512 ifft3006 = _mm512_fmadd_ps(sfIm194, ifft3002, _mm512_shuffle_ps(sfIm194, sfIm194, 177));
__m512 ifft3094 = _mm512_fmadd_ps(sfIm198, ifft3002, _mm512_shuffle_ps(sfIm198, sfIm198, 177));
__m512 ifft3007 = _mm512_fmadd_ps(sfRe195, ifft3002, _mm512_shuffle_ps(sfRe195, sfRe195, 177));
__m512 ifft3095 = _mm512_fmadd_ps(sfRe199, ifft3002, _mm512_shuffle_ps(sfRe199, sfRe199, 177));
__m512 ifft3008 = _mm512_fmadd_ps(sfIm195, ifft3002, _mm512_shuffle_ps(sfIm195, sfIm195, 177));
__m512 ifft3096 = _mm512_fmadd_ps(sfIm199, ifft3002, _mm512_shuffle_ps(sfIm199, sfIm199, 177));
__m512 ifft3009 = _mm512_fmadd_ps(sfRe196, ifft3002, _mm512_shuffle_ps(sfRe196, sfRe196, 177));
__m512 ifft3097 = _mm512_fmadd_ps(sfRe200, ifft3002, _mm512_shuffle_ps(sfRe200, sfRe200, 177));
__m512 ifft3010 = _mm512_fmadd_ps(sfIm196, ifft3002, _mm512_shuffle_ps(sfIm196, sfIm196, 177));
__m512 ifft3098 = _mm512_fmadd_ps(sfIm200, ifft3002, _mm512_shuffle_ps(sfIm200, sfIm200, 177));
// Per-lane rotation of each (p, q) register pair using c = ifft3011 and
// s = ifft3020: first the products p*c and q*c, then p' = p*c - q*s and
// q' = p*s + q*c, i.e. the form of the complex product (p + i*q)*(c + i*s).
__m512 ifft3011 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft3012 = _mm512_mul_ps(ifft3003, ifft3011);
__m512 ifft3099 = _mm512_mul_ps(ifft3091, ifft3011);
__m512 ifft3013 = _mm512_mul_ps(ifft3004, ifft3011);
__m512 ifft3100 = _mm512_mul_ps(ifft3092, ifft3011);
__m512 ifft3014 = _mm512_mul_ps(ifft3005, ifft3011);
__m512 ifft3101 = _mm512_mul_ps(ifft3093, ifft3011);
__m512 ifft3015 = _mm512_mul_ps(ifft3006, ifft3011);
__m512 ifft3102 = _mm512_mul_ps(ifft3094, ifft3011);
__m512 ifft3016 = _mm512_mul_ps(ifft3007, ifft3011);
__m512 ifft3103 = _mm512_mul_ps(ifft3095, ifft3011);
__m512 ifft3017 = _mm512_mul_ps(ifft3008, ifft3011);
__m512 ifft3104 = _mm512_mul_ps(ifft3096, ifft3011);
__m512 ifft3018 = _mm512_mul_ps(ifft3009, ifft3011);
__m512 ifft3105 = _mm512_mul_ps(ifft3097, ifft3011);
__m512 ifft3019 = _mm512_mul_ps(ifft3010, ifft3011);
__m512 ifft3106 = _mm512_mul_ps(ifft3098, ifft3011);
__m512 ifft3020 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft3021 = _mm512_fnmadd_ps(ifft3004, ifft3020, ifft3012);
__m512 ifft3107 = _mm512_fnmadd_ps(ifft3092, ifft3020, ifft3099);
__m512 ifft3022 = _mm512_fmadd_ps(ifft3003, ifft3020, ifft3013);
__m512 ifft3108 = _mm512_fmadd_ps(ifft3091, ifft3020, ifft3100);
__m512 ifft3023 = _mm512_fnmadd_ps(ifft3006, ifft3020, ifft3014);
__m512 ifft3109 = _mm512_fnmadd_ps(ifft3094, ifft3020, ifft3101);
__m512 ifft3024 = _mm512_fmadd_ps(ifft3005, ifft3020, ifft3015);
__m512 ifft3110 = _mm512_fmadd_ps(ifft3093, ifft3020, ifft3102);
__m512 ifft3025 = _mm512_fnmadd_ps(ifft3008, ifft3020, ifft3016);
__m512 ifft3111 = _mm512_fnmadd_ps(ifft3096, ifft3020, ifft3103);
__m512 ifft3026 = _mm512_fmadd_ps(ifft3007, ifft3020, ifft3017);
__m512 ifft3112 = _mm512_fmadd_ps(ifft3095, ifft3020, ifft3104);
__m512 ifft3027 = _mm512_fnmadd_ps(ifft3010, ifft3020, ifft3018);
__m512 ifft3113 = _mm512_fnmadd_ps(ifft3098, ifft3020, ifft3105);
__m512 ifft3028 = _mm512_fmadd_ps(ifft3009, ifft3020, ifft3019);
__m512 ifft3114 = _mm512_fmadd_ps(ifft3097, ifft3020, ifft3106);
// Butterfly at a distance of two lanes: shuffle immediate 78 swaps the two
// 64-bit halves of every 128-bit lane; lanes 0-1 of each group of four get the
// sum, lanes 2-3 get the difference.
__m512 ifft3029 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft3030 = _mm512_fmadd_ps(ifft3021, ifft3029, _mm512_shuffle_ps(ifft3021, ifft3021, 78));
__m512 ifft3115 = _mm512_fmadd_ps(ifft3107, ifft3029, _mm512_shuffle_ps(ifft3107, ifft3107, 78));
__m512 ifft3031 = _mm512_fmadd_ps(ifft3022, ifft3029, _mm512_shuffle_ps(ifft3022, ifft3022, 78));
__m512 ifft3116 = _mm512_fmadd_ps(ifft3108, ifft3029, _mm512_shuffle_ps(ifft3108, ifft3108, 78));
__m512 ifft3032 = _mm512_fmadd_ps(ifft3023, ifft3029, _mm512_shuffle_ps(ifft3023, ifft3023, 78));
__m512 ifft3117 = _mm512_fmadd_ps(ifft3109, ifft3029, _mm512_shuffle_ps(ifft3109, ifft3109, 78));
__m512 ifft3033 = _mm512_fmadd_ps(ifft3024, ifft3029, _mm512_shuffle_ps(ifft3024, ifft3024, 78));
__m512 ifft3118 = _mm512_fmadd_ps(ifft3110, ifft3029, _mm512_shuffle_ps(ifft3110, ifft3110, 78));
__m512 ifft3034 = _mm512_fmadd_ps(ifft3025, ifft3029, _mm512_shuffle_ps(ifft3025, ifft3025, 78));
__m512 ifft3119 = _mm512_fmadd_ps(ifft3111, ifft3029, _mm512_shuffle_ps(ifft3111, ifft3111, 78));
__m512 ifft3035 = _mm512_fmadd_ps(ifft3026, ifft3029, _mm512_shuffle_ps(ifft3026, ifft3026, 78));
__m512 ifft3120 = _mm512_fmadd_ps(ifft3112, ifft3029, _mm512_shuffle_ps(ifft3112, ifft3112, 78));
__m512 ifft3036 = _mm512_fmadd_ps(ifft3027, ifft3029, _mm512_shuffle_ps(ifft3027, ifft3027, 78));
__m512 ifft3121 = _mm512_fmadd_ps(ifft3113, ifft3029, _mm512_shuffle_ps(ifft3113, ifft3113, 78));
__m512 ifft3037 = _mm512_fmadd_ps(ifft3028, ifft3029, _mm512_shuffle_ps(ifft3028, ifft3028, 78));
__m512 ifft3122 = _mm512_fmadd_ps(ifft3114, ifft3029, _mm512_shuffle_ps(ifft3114, ifft3114, 78));
// Mask 49344 is 0xC0C0 (lanes 6, 7, 14, 15). In those lanes each (p, q) pair
// becomes (-q, p); every other lane passes through unchanged.
__m512 ifft3038 = _mm512_mask_sub_ps(ifft3030, 49344, _mm512_setzero_ps(), ifft3031);
__m512 ifft3123 = _mm512_mask_sub_ps(ifft3115, 49344, _mm512_setzero_ps(), ifft3116);
__m512 ifft3039 = _mm512_mask_mov_ps(ifft3031, 49344, ifft3030);
__m512 ifft3124 = _mm512_mask_mov_ps(ifft3116, 49344, ifft3115);
__m512 ifft3040 = _mm512_mask_sub_ps(ifft3032, 49344, _mm512_setzero_ps(), ifft3033);
__m512 ifft3125 = _mm512_mask_sub_ps(ifft3117, 49344, _mm512_setzero_ps(), ifft3118);
__m512 ifft3041 = _mm512_mask_mov_ps(ifft3033, 49344, ifft3032);
__m512 ifft3126 = _mm512_mask_mov_ps(ifft3118, 49344, ifft3117);
__m512 ifft3042 = _mm512_mask_sub_ps(ifft3034, 49344, _mm512_setzero_ps(), ifft3035);
__m512 ifft3127 = _mm512_mask_sub_ps(ifft3119, 49344, _mm512_setzero_ps(), ifft3120);
__m512 ifft3043 = _mm512_mask_mov_ps(ifft3035, 49344, ifft3034);
__m512 ifft3128 = _mm512_mask_mov_ps(ifft3120, 49344, ifft3119);
__m512 ifft3044 = _mm512_mask_sub_ps(ifft3036, 49344, _mm512_setzero_ps(), ifft3037);
__m512 ifft3129 = _mm512_mask_sub_ps(ifft3121, 49344, _mm512_setzero_ps(), ifft3122);
__m512 ifft3045 = _mm512_mask_mov_ps(ifft3037, 49344, ifft3036);
__m512 ifft3130 = _mm512_mask_mov_ps(ifft3122, 49344, ifft3121);
// Butterfly at a distance of four lanes: shuffle_f32x4 immediate 177 swaps
// adjacent 128-bit lanes; lanes 0-3 and 8-11 get the sum, lanes 4-7 and 12-15
// the difference. The one fnmsub pair (ifft3052 / ifft3136) yields the negated
// result -(x*sign) - swapped(x) instead.
__m512 ifft3046 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft3047 = _mm512_fmadd_ps(ifft3038, ifft3046, _mm512_shuffle_f32x4(ifft3038, ifft3038, 177));
__m512 ifft3131 = _mm512_fmadd_ps(ifft3123, ifft3046, _mm512_shuffle_f32x4(ifft3123, ifft3123, 177));
__m512 ifft3048 = _mm512_fmadd_ps(ifft3039, ifft3046, _mm512_shuffle_f32x4(ifft3039, ifft3039, 177));
__m512 ifft3132 = _mm512_fmadd_ps(ifft3124, ifft3046, _mm512_shuffle_f32x4(ifft3124, ifft3124, 177));
__m512 ifft3049 = _mm512_fmadd_ps(ifft3040, ifft3046, _mm512_shuffle_f32x4(ifft3040, ifft3040, 177));
__m512 ifft3133 = _mm512_fmadd_ps(ifft3125, ifft3046, _mm512_shuffle_f32x4(ifft3125, ifft3125, 177));
__m512 ifft3050 = _mm512_fmadd_ps(ifft3041, ifft3046, _mm512_shuffle_f32x4(ifft3041, ifft3041, 177));
__m512 ifft3134 = _mm512_fmadd_ps(ifft3126, ifft3046, _mm512_shuffle_f32x4(ifft3126, ifft3126, 177));
__m512 ifft3051 = _mm512_fmadd_ps(ifft3042, ifft3046, _mm512_shuffle_f32x4(ifft3042, ifft3042, 177));
__m512 ifft3135 = _mm512_fmadd_ps(ifft3127, ifft3046, _mm512_shuffle_f32x4(ifft3127, ifft3127, 177));
__m512 ifft3052 = _mm512_fnmsub_ps(ifft3043, ifft3046, _mm512_shuffle_f32x4(ifft3043, ifft3043, 177));
__m512 ifft3136 = _mm512_fnmsub_ps(ifft3128, ifft3046, _mm512_shuffle_f32x4(ifft3128, ifft3128, 177));
__m512 ifft3053 = _mm512_fmadd_ps(ifft3044, ifft3046, _mm512_shuffle_f32x4(ifft3044, ifft3044, 177));
__m512 ifft3137 = _mm512_fmadd_ps(ifft3129, ifft3046, _mm512_shuffle_f32x4(ifft3129, ifft3129, 177));
__m512 ifft3054 = _mm512_fmadd_ps(ifft3045, ifft3046, _mm512_shuffle_f32x4(ifft3045, ifft3045, 177));
__m512 ifft3138 = _mm512_fmadd_ps(ifft3130, ifft3046, _mm512_shuffle_f32x4(ifft3130, ifft3130, 177));
// From here on the combining is done between whole registers (add/sub of the
// eight vectors per data set) rather than between lanes.
__m512 ifft3055 = _mm512_add_ps(ifft3047, ifft3048);
__m512 ifft3139 = _mm512_add_ps(ifft3131, ifft3132);
__m512 ifft3056 = _mm512_sub_ps(ifft3047, ifft3048);
__m512 ifft3140 = _mm512_sub_ps(ifft3131, ifft3132);
__m512 ifft3057 = _mm512_sub_ps(ifft3049, ifft3053);
__m512 ifft3141 = _mm512_sub_ps(ifft3133, ifft3137);
__m512 ifft3058 = _mm512_add_ps(ifft3050, ifft3054);
__m512 ifft3142 = _mm512_add_ps(ifft3134, ifft3138);
__m512 ifft3059 = _mm512_add_ps(ifft3049, ifft3053);
__m512 ifft3143 = _mm512_add_ps(ifft3133, ifft3137);
__m512 ifft3060 = _mm512_sub_ps(ifft3050, ifft3054);
__m512 ifft3144 = _mm512_sub_ps(ifft3134, ifft3138);
// Scaling is folded into this stage: 3.125e-02 is 1/32 and 1.5625e-02 is 1/64.
__m512 ifft3061 = _mm512_mul_ps(ifft3051, _mm512_set1_ps(3.125e-02f));
__m512 ifft3145 = _mm512_mul_ps(ifft3135, _mm512_set1_ps(3.125e-02f));
__m512 ifft3062 = _mm512_mul_ps(ifft3052, _mm512_set1_ps(3.125e-02f));
__m512 ifft3146 = _mm512_mul_ps(ifft3136, _mm512_set1_ps(3.125e-02f));
__m512 ifft3063 = _mm512_fmadd_ps(ifft3055, _mm512_set1_ps(1.5625e-02f), ifft3061);
__m512 ifft3147 = _mm512_fmadd_ps(ifft3139, _mm512_set1_ps(1.5625e-02f), ifft3145);
__m512 ifft3064 = _mm512_fmsub_ps(ifft3055, _mm512_set1_ps(1.5625e-02f), ifft3061);
__m512 ifft3148 = _mm512_fmsub_ps(ifft3139, _mm512_set1_ps(1.5625e-02f), ifft3145);
__m512 ifft3065 = _mm512_fmadd_ps(ifft3056, _mm512_set1_ps(1.5625e-02f), ifft3062);
__m512 ifft3149 = _mm512_fmadd_ps(ifft3140, _mm512_set1_ps(1.5625e-02f), ifft3146);
__m512 ifft3066 = _mm512_fmsub_ps(ifft3056, _mm512_set1_ps(1.5625e-02f), ifft3062);
__m512 ifft3150 = _mm512_fmsub_ps(ifft3140, _mm512_set1_ps(1.5625e-02f), ifft3146);
__m512 ifft3067 = _mm512_add_ps(ifft3057, ifft3058);
__m512 ifft3151 = _mm512_add_ps(ifft3141, ifft3142);
__m512 ifft3068 = _mm512_sub_ps(ifft3057, ifft3058);
__m512 ifft3152 = _mm512_sub_ps(ifft3141, ifft3142);
// 7.0710677e-01f is the float nearest sqrt(1/2).
__m512 ifft3069 = _mm512_fnmadd_ps(ifft3067, _mm512_set1_ps(7.0710677e-01f), ifft3059);
__m512 ifft3153 = _mm512_fnmadd_ps(ifft3151, _mm512_set1_ps(7.0710677e-01f), ifft3143);
__m512 ifft3070 = _mm512_fmadd_ps(ifft3067, _mm512_set1_ps(7.0710677e-01f), ifft3059);
__m512 ifft3154 = _mm512_fmadd_ps(ifft3151, _mm512_set1_ps(7.0710677e-01f), ifft3143);
__m512 ifft3071 = _mm512_fmadd_ps(ifft3068, _mm512_set1_ps(7.0710677e-01f), ifft3060);
__m512 ifft3155 = _mm512_fmadd_ps(ifft3152, _mm512_set1_ps(7.0710677e-01f), ifft3144);
__m512 ifft3072 = _mm512_fmsub_ps(ifft3068, _mm512_set1_ps(7.0710677e-01f), ifft3060);
__m512 ifft3156 = _mm512_fmsub_ps(ifft3152, _mm512_set1_ps(7.0710677e-01f), ifft3144);
__m512 ifft3073 = _mm512_add_ps(ifft3069, ifft3070);
__m512 ifft3157 = _mm512_add_ps(ifft3153, ifft3154);
__m512 ifft3074 = _mm512_sub_ps(ifft3069, ifft3070);
__m512 ifft3158 = _mm512_sub_ps(ifft3153, ifft3154);
__m512 ifft3075 = _mm512_add_ps(ifft3071, ifft3072);
__m512 ifft3159 = _mm512_add_ps(ifft3155, ifft3156);
__m512 ifft3076 = _mm512_sub_ps(ifft3071, ifft3072);
__m512 ifft3160 = _mm512_sub_ps(ifft3155, ifft3156);
// Final combination: eight result vectors per data set, of which only five
// are consumed below.
__m512 ifft3077 = _mm512_fmadd_ps(ifft3073, _mm512_set1_ps(1.5625e-02f), ifft3063);
__m512 ifft3161 = _mm512_fmadd_ps(ifft3157, _mm512_set1_ps(1.5625e-02f), ifft3147);
__m512 ifft3078 = _mm512_fnmadd_ps(ifft3073, _mm512_set1_ps(1.5625e-02f), ifft3063);
__m512 ifft3162 = _mm512_fnmadd_ps(ifft3157, _mm512_set1_ps(1.5625e-02f), ifft3147);
__m512 ifft3079 = _mm512_fmadd_ps(ifft3075, _mm512_set1_ps(1.5625e-02f), ifft3065);
__m512 ifft3163 = _mm512_fmadd_ps(ifft3159, _mm512_set1_ps(1.5625e-02f), ifft3149);
__m512 ifft3080 = _mm512_fnmadd_ps(ifft3075, _mm512_set1_ps(1.5625e-02f), ifft3065);
__m512 ifft3164 = _mm512_fnmadd_ps(ifft3159, _mm512_set1_ps(1.5625e-02f), ifft3149);
__m512 ifft3081 = _mm512_fnmadd_ps(ifft3076, _mm512_set1_ps(1.5625e-02f), ifft3064);
__m512 ifft3165 = _mm512_fnmadd_ps(ifft3160, _mm512_set1_ps(1.5625e-02f), ifft3148);
__m512 ifft3082 = _mm512_fmadd_ps(ifft3076, _mm512_set1_ps(1.5625e-02f), ifft3064);
__m512 ifft3166 = _mm512_fmadd_ps(ifft3160, _mm512_set1_ps(1.5625e-02f), ifft3148);
__m512 ifft3083 = _mm512_fmadd_ps(ifft3074, _mm512_set1_ps(1.5625e-02f), ifft3066);
__m512 ifft3167 = _mm512_fmadd_ps(ifft3158, _mm512_set1_ps(1.5625e-02f), ifft3150);
__m512 ifft3084 = _mm512_fnmadd_ps(ifft3074, _mm512_set1_ps(1.5625e-02f), ifft3066);
__m512 ifft3168 = _mm512_fnmadd_ps(ifft3158, _mm512_set1_ps(1.5625e-02f), ifft3150);
// dat760..764 come from the first data set, dat765..769 from the second.
__m512 dat760 = ifft3077;
__m512 dat765 = ifft3161;
__m512 dat761 = ifft3079;
__m512 dat766 = ifft3163;
__m512 dat762 = ifft3081;
__m512 dat767 = ifft3165;
__m512 dat763 = ifft3083;
__m512 dat768 = ifft3167;
__m512 dat764 = ifft3078;
__m512 dat769 = ifft3162;
// These six results are computed but not stored by this block; the (void)
// casts mark them as deliberately unused.
(void)ifft3080;
(void)ifft3164;
(void)ifft3082;
(void)ifft3166;
(void)ifft3084;
(void)ifft3168;
// Pack ten floats per output vector (only lanes 0..9 are stored below).
// pm31 yields lanes 0..4 of the first operand followed by lanes 0..4 of the
// second; pm32 yields lanes 8..12 of the second operand followed by lanes
// 8..12 of the first (index values >= 16 select from the second operand).
__m512i pm31 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack151 = _mm512_permutex2var_ps(dat760, pm31, dat765);
__m512i pm32 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack152 = _mm512_permutex2var_ps(dat760, pm32, dat765);
__m512 pack153 = _mm512_permutex2var_ps(dat761, pm31, dat766);
__m512 pack154 = _mm512_permutex2var_ps(dat761, pm32, dat766);
__m512 pack155 = _mm512_permutex2var_ps(dat762, pm31, dat767);
__m512 pack156 = _mm512_permutex2var_ps(dat762, pm32, dat767);
__m512 pack157 = _mm512_permutex2var_ps(dat763, pm31, dat768);
__m512 pack158 = _mm512_permutex2var_ps(dat763, pm32, dat768);
__m512 pack159 = _mm512_permutex2var_ps(dat764, pm31, dat769);
__m512 pack160 = _mm512_permutex2var_ps(dat764, pm32, dat769);
// Clamp every packed value at zero (max with 0), i.e. a ReLU on the outputs.
pack151 = _mm512_max_ps(_mm512_setzero_ps(), pack151);
pack152 = _mm512_max_ps(_mm512_setzero_ps(), pack152);
pack153 = _mm512_max_ps(_mm512_setzero_ps(), pack153);
pack154 = _mm512_max_ps(_mm512_setzero_ps(), pack154);
pack155 = _mm512_max_ps(_mm512_setzero_ps(), pack155);
pack156 = _mm512_max_ps(_mm512_setzero_ps(), pack156);
pack157 = _mm512_max_ps(_mm512_setzero_ps(), pack157);
pack158 = _mm512_max_ps(_mm512_setzero_ps(), pack158);
pack159 = _mm512_max_ps(_mm512_setzero_ps(), pack159);
pack160 = _mm512_max_ps(_mm512_setzero_ps(), pack160);
// Masked stores: mask 1023 (0x3FF) writes only the lowest ten floats. The
// pm31 packs go to offsets 0, 448, 896, 1344, 1792 (the same step as the
// 448*toH10 term); the pm32 packs go to the same offsets plus 50240.
// NOTE(review): 448 presumably is one output row and 50240 one output plane -
// confirm against the tensor layout defined elsewhere in the file.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack151);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack152);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack153);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack154);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack155);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack156);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack157);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack158);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack159);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack160);
}
}
}
// Stop the whole function once the last assigned tile (last2) has been
// written; otherwise advance to the next tile.
if (j5 >= last2) return;
++j5;
}
// Record that this row of tiles is finished so the code that follows starts
// from rel5 == 15.
rel5 = 15;
}
ptrdiff_t toH11 = base5+15;
ptrdiff_t toW11 = 110;
ptrdiff_t k35 = 16*w21;
for (; k35 != 16; ++k35) {
ptrdiff_t r12 = 0;
for (; r12 != 2; ++r12) {
ptrdiff_t t20 = 0;
__m512 sfRe201 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm201 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe205 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm205 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe202 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm202 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe206 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm206 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe203 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm203 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe207 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm207 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe204 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm204 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe208 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm208 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512i ifft3169 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft3170 = _mm512_permutexvar_ps(ifft3169, sfRe201);
__m512 ifft3261 = _mm512_permutexvar_ps(ifft3169, sfRe205);
__m512i ifft3171 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft3172 = _mm512_permutexvar_ps(ifft3171, sfRe201);
__m512 ifft3262 = _mm512_permutexvar_ps(ifft3171, sfRe205);
__m512 ifft3173 = _mm512_permutexvar_ps(ifft3169, sfIm201);
__m512 ifft3263 = _mm512_permutexvar_ps(ifft3169, sfIm205);
__m512 ifft3174 = _mm512_permutexvar_ps(ifft3171, sfIm201);
__m512 ifft3264 = _mm512_permutexvar_ps(ifft3171, sfIm205);
__m512 ifft3175 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft3176 = _mm512_mask_fmadd_ps(ifft3174, 65021, ifft3175, ifft3170);
__m512 ifft3265 = _mm512_mask_fmadd_ps(ifft3264, 65021, ifft3175, ifft3261);
__m512 ifft3177 = _mm512_mask_fnmadd_ps(ifft3173, 65021, ifft3175, ifft3172);
__m512 ifft3266 = _mm512_mask_fnmadd_ps(ifft3263, 65021, ifft3175, ifft3262);
__m512 ifft3178 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft3179 = _mm512_fmadd_ps(ifft3176, ifft3178, _mm512_shuffle_ps(ifft3176, ifft3176, 177));
__m512 ifft3267 = _mm512_fmadd_ps(ifft3265, ifft3178, _mm512_shuffle_ps(ifft3265, ifft3265, 177));
__m512 ifft3180 = _mm512_fmadd_ps(ifft3177, ifft3178, _mm512_shuffle_ps(ifft3177, ifft3177, 177));
__m512 ifft3268 = _mm512_fmadd_ps(ifft3266, ifft3178, _mm512_shuffle_ps(ifft3266, ifft3266, 177));
__m512 ifft3181 = _mm512_fmadd_ps(sfRe202, ifft3178, _mm512_shuffle_ps(sfRe202, sfRe202, 177));
__m512 ifft3269 = _mm512_fmadd_ps(sfRe206, ifft3178, _mm512_shuffle_ps(sfRe206, sfRe206, 177));
__m512 ifft3182 = _mm512_fmadd_ps(sfIm202, ifft3178, _mm512_shuffle_ps(sfIm202, sfIm202, 177));
__m512 ifft3270 = _mm512_fmadd_ps(sfIm206, ifft3178, _mm512_shuffle_ps(sfIm206, sfIm206, 177));
__m512 ifft3183 = _mm512_fmadd_ps(sfRe203, ifft3178, _mm512_shuffle_ps(sfRe203, sfRe203, 177));
__m512 ifft3271 = _mm512_fmadd_ps(sfRe207, ifft3178, _mm512_shuffle_ps(sfRe207, sfRe207, 177));
__m512 ifft3184 = _mm512_fmadd_ps(sfIm203, ifft3178, _mm512_shuffle_ps(sfIm203, sfIm203, 177));
__m512 ifft3272 = _mm512_fmadd_ps(sfIm207, ifft3178, _mm512_shuffle_ps(sfIm207, sfIm207, 177));
__m512 ifft3185 = _mm512_fmadd_ps(sfRe204, ifft3178, _mm512_shuffle_ps(sfRe204, sfRe204, 177));
__m512 ifft3273 = _mm512_fmadd_ps(sfRe208, ifft3178, _mm512_shuffle_ps(sfRe208, sfRe208, 177));
__m512 ifft3186 = _mm512_fmadd_ps(sfIm204, ifft3178, _mm512_shuffle_ps(sfIm204, sfIm204, 177));
__m512 ifft3274 = _mm512_fmadd_ps(sfIm208, ifft3178, _mm512_shuffle_ps(sfIm208, sfIm208, 177));
__m512 ifft3187 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft3188 = _mm512_mul_ps(ifft3179, ifft3187);
__m512 ifft3275 = _mm512_mul_ps(ifft3267, ifft3187);
__m512 ifft3189 = _mm512_mul_ps(ifft3180, ifft3187);
__m512 ifft3276 = _mm512_mul_ps(ifft3268, ifft3187);
__m512 ifft3190 = _mm512_mul_ps(ifft3181, ifft3187);
__m512 ifft3277 = _mm512_mul_ps(ifft3269, ifft3187);
__m512 ifft3191 = _mm512_mul_ps(ifft3182, ifft3187);
__m512 ifft3278 = _mm512_mul_ps(ifft3270, ifft3187);
__m512 ifft3192 = _mm512_mul_ps(ifft3183, ifft3187);
__m512 ifft3279 = _mm512_mul_ps(ifft3271, ifft3187);
__m512 ifft3193 = _mm512_mul_ps(ifft3184, ifft3187);
__m512 ifft3280 = _mm512_mul_ps(ifft3272, ifft3187);
__m512 ifft3194 = _mm512_mul_ps(ifft3185, ifft3187);
__m512 ifft3281 = _mm512_mul_ps(ifft3273, ifft3187);
__m512 ifft3195 = _mm512_mul_ps(ifft3186, ifft3187);
__m512 ifft3282 = _mm512_mul_ps(ifft3274, ifft3187);
__m512 ifft3196 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft3197 = _mm512_fnmadd_ps(ifft3180, ifft3196, ifft3188);
__m512 ifft3283 = _mm512_fnmadd_ps(ifft3268, ifft3196, ifft3275);
__m512 ifft3198 = _mm512_fmadd_ps(ifft3179, ifft3196, ifft3189);
__m512 ifft3284 = _mm512_fmadd_ps(ifft3267, ifft3196, ifft3276);
__m512 ifft3199 = _mm512_fnmadd_ps(ifft3182, ifft3196, ifft3190);
__m512 ifft3285 = _mm512_fnmadd_ps(ifft3270, ifft3196, ifft3277);
__m512 ifft3200 = _mm512_fmadd_ps(ifft3181, ifft3196, ifft3191);
__m512 ifft3286 = _mm512_fmadd_ps(ifft3269, ifft3196, ifft3278);
__m512 ifft3201 = _mm512_fnmadd_ps(ifft3184, ifft3196, ifft3192);
__m512 ifft3287 = _mm512_fnmadd_ps(ifft3272, ifft3196, ifft3279);
__m512 ifft3202 = _mm512_fmadd_ps(ifft3183, ifft3196, ifft3193);
__m512 ifft3288 = _mm512_fmadd_ps(ifft3271, ifft3196, ifft3280);
__m512 ifft3203 = _mm512_fnmadd_ps(ifft3186, ifft3196, ifft3194);
__m512 ifft3289 = _mm512_fnmadd_ps(ifft3274, ifft3196, ifft3281);
__m512 ifft3204 = _mm512_fmadd_ps(ifft3185, ifft3196, ifft3195);
__m512 ifft3290 = _mm512_fmadd_ps(ifft3273, ifft3196, ifft3282);
__m512 ifft3205 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft3206 = _mm512_fmadd_ps(ifft3197, ifft3205, _mm512_shuffle_ps(ifft3197, ifft3197, 78));
__m512 ifft3291 = _mm512_fmadd_ps(ifft3283, ifft3205, _mm512_shuffle_ps(ifft3283, ifft3283, 78));
__m512 ifft3207 = _mm512_fmadd_ps(ifft3198, ifft3205, _mm512_shuffle_ps(ifft3198, ifft3198, 78));
__m512 ifft3292 = _mm512_fmadd_ps(ifft3284, ifft3205, _mm512_shuffle_ps(ifft3284, ifft3284, 78));
__m512 ifft3208 = _mm512_fmadd_ps(ifft3199, ifft3205, _mm512_shuffle_ps(ifft3199, ifft3199, 78));
__m512 ifft3293 = _mm512_fmadd_ps(ifft3285, ifft3205, _mm512_shuffle_ps(ifft3285, ifft3285, 78));
__m512 ifft3209 = _mm512_fmadd_ps(ifft3200, ifft3205, _mm512_shuffle_ps(ifft3200, ifft3200, 78));
__m512 ifft3294 = _mm512_fmadd_ps(ifft3286, ifft3205, _mm512_shuffle_ps(ifft3286, ifft3286, 78));
__m512 ifft3210 = _mm512_fmadd_ps(ifft3201, ifft3205, _mm512_shuffle_ps(ifft3201, ifft3201, 78));
__m512 ifft3295 = _mm512_fmadd_ps(ifft3287, ifft3205, _mm512_shuffle_ps(ifft3287, ifft3287, 78));
__m512 ifft3211 = _mm512_fmadd_ps(ifft3202, ifft3205, _mm512_shuffle_ps(ifft3202, ifft3202, 78));
__m512 ifft3296 = _mm512_fmadd_ps(ifft3288, ifft3205, _mm512_shuffle_ps(ifft3288, ifft3288, 78));
__m512 ifft3212 = _mm512_fmadd_ps(ifft3203, ifft3205, _mm512_shuffle_ps(ifft3203, ifft3203, 78));
__m512 ifft3297 = _mm512_fmadd_ps(ifft3289, ifft3205, _mm512_shuffle_ps(ifft3289, ifft3289, 78));
__m512 ifft3213 = _mm512_fmadd_ps(ifft3204, ifft3205, _mm512_shuffle_ps(ifft3204, ifft3204, 78));
__m512 ifft3298 = _mm512_fmadd_ps(ifft3290, ifft3205, _mm512_shuffle_ps(ifft3290, ifft3290, 78));
__m512 ifft3214 = _mm512_mask_sub_ps(ifft3206, 49344, _mm512_setzero_ps(), ifft3207);
__m512 ifft3299 = _mm512_mask_sub_ps(ifft3291, 49344, _mm512_setzero_ps(), ifft3292);
__m512 ifft3215 = _mm512_mask_mov_ps(ifft3207, 49344, ifft3206);
__m512 ifft3300 = _mm512_mask_mov_ps(ifft3292, 49344, ifft3291);
__m512 ifft3216 = _mm512_mask_sub_ps(ifft3208, 49344, _mm512_setzero_ps(), ifft3209);
__m512 ifft3301 = _mm512_mask_sub_ps(ifft3293, 49344, _mm512_setzero_ps(), ifft3294);
__m512 ifft3217 = _mm512_mask_mov_ps(ifft3209, 49344, ifft3208);
__m512 ifft3302 = _mm512_mask_mov_ps(ifft3294, 49344, ifft3293);
__m512 ifft3218 = _mm512_mask_sub_ps(ifft3210, 49344, _mm512_setzero_ps(), ifft3211);
__m512 ifft3303 = _mm512_mask_sub_ps(ifft3295, 49344, _mm512_setzero_ps(), ifft3296);
__m512 ifft3219 = _mm512_mask_mov_ps(ifft3211, 49344, ifft3210);
__m512 ifft3304 = _mm512_mask_mov_ps(ifft3296, 49344, ifft3295);
__m512 ifft3220 = _mm512_mask_sub_ps(ifft3212, 49344, _mm512_setzero_ps(), ifft3213);
__m512 ifft3305 = _mm512_mask_sub_ps(ifft3297, 49344, _mm512_setzero_ps(), ifft3298);
__m512 ifft3221 = _mm512_mask_mov_ps(ifft3213, 49344, ifft3212);
__m512 ifft3306 = _mm512_mask_mov_ps(ifft3298, 49344, ifft3297);
__m512 ifft3222 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft3223 = _mm512_fmadd_ps(ifft3214, ifft3222, _mm512_shuffle_f32x4(ifft3214, ifft3214, 177));
__m512 ifft3307 = _mm512_fmadd_ps(ifft3299, ifft3222, _mm512_shuffle_f32x4(ifft3299, ifft3299, 177));
__m512 ifft3224 = _mm512_fmadd_ps(ifft3215, ifft3222, _mm512_shuffle_f32x4(ifft3215, ifft3215, 177));
__m512 ifft3308 = _mm512_fmadd_ps(ifft3300, ifft3222, _mm512_shuffle_f32x4(ifft3300, ifft3300, 177));
__m512 ifft3225 = _mm512_fmadd_ps(ifft3216, ifft3222, _mm512_shuffle_f32x4(ifft3216, ifft3216, 177));
__m512 ifft3309 = _mm512_fmadd_ps(ifft3301, ifft3222, _mm512_shuffle_f32x4(ifft3301, ifft3301, 177));
__m512 ifft3226 = _mm512_fmadd_ps(ifft3217, ifft3222, _mm512_shuffle_f32x4(ifft3217, ifft3217, 177));
__m512 ifft3310 = _mm512_fmadd_ps(ifft3302, ifft3222, _mm512_shuffle_f32x4(ifft3302, ifft3302, 177));
__m512 ifft3227 = _mm512_fmadd_ps(ifft3218, ifft3222, _mm512_shuffle_f32x4(ifft3218, ifft3218, 177));
__m512 ifft3311 = _mm512_fmadd_ps(ifft3303, ifft3222, _mm512_shuffle_f32x4(ifft3303, ifft3303, 177));
__m512 ifft3228 = _mm512_fnmsub_ps(ifft3219, ifft3222, _mm512_shuffle_f32x4(ifft3219, ifft3219, 177));
__m512 ifft3312 = _mm512_fnmsub_ps(ifft3304, ifft3222, _mm512_shuffle_f32x4(ifft3304, ifft3304, 177));
__m512 ifft3229 = _mm512_fmadd_ps(ifft3220, ifft3222, _mm512_shuffle_f32x4(ifft3220, ifft3220, 177));
__m512 ifft3313 = _mm512_fmadd_ps(ifft3305, ifft3222, _mm512_shuffle_f32x4(ifft3305, ifft3305, 177));
__m512 ifft3230 = _mm512_fmadd_ps(ifft3221, ifft3222, _mm512_shuffle_f32x4(ifft3221, ifft3221, 177));
__m512 ifft3314 = _mm512_fmadd_ps(ifft3306, ifft3222, _mm512_shuffle_f32x4(ifft3306, ifft3306, 177));
__m512 ifft3231 = _mm512_add_ps(ifft3223, ifft3224);
__m512 ifft3315 = _mm512_add_ps(ifft3307, ifft3308);
__m512 ifft3232 = _mm512_sub_ps(ifft3223, ifft3224);
__m512 ifft3316 = _mm512_sub_ps(ifft3307, ifft3308);
__m512 ifft3233 = _mm512_sub_ps(ifft3225, ifft3229);
__m512 ifft3317 = _mm512_sub_ps(ifft3309, ifft3313);
__m512 ifft3234 = _mm512_add_ps(ifft3226, ifft3230);
__m512 ifft3318 = _mm512_add_ps(ifft3310, ifft3314);
__m512 ifft3235 = _mm512_add_ps(ifft3225, ifft3229);
__m512 ifft3319 = _mm512_add_ps(ifft3309, ifft3313);
__m512 ifft3236 = _mm512_sub_ps(ifft3226, ifft3230);
__m512 ifft3320 = _mm512_sub_ps(ifft3310, ifft3314);
__m512 ifft3237 = _mm512_mul_ps(ifft3227, _mm512_set1_ps(3.125e-02f));
__m512 ifft3321 = _mm512_mul_ps(ifft3311, _mm512_set1_ps(3.125e-02f));
__m512 ifft3238 = _mm512_mul_ps(ifft3228, _mm512_set1_ps(3.125e-02f));
__m512 ifft3322 = _mm512_mul_ps(ifft3312, _mm512_set1_ps(3.125e-02f));
__m512 ifft3239 = _mm512_fmadd_ps(ifft3231, _mm512_set1_ps(1.5625e-02f), ifft3237);
__m512 ifft3323 = _mm512_fmadd_ps(ifft3315, _mm512_set1_ps(1.5625e-02f), ifft3321);
__m512 ifft3240 = _mm512_fmsub_ps(ifft3231, _mm512_set1_ps(1.5625e-02f), ifft3237);
__m512 ifft3324 = _mm512_fmsub_ps(ifft3315, _mm512_set1_ps(1.5625e-02f), ifft3321);
__m512 ifft3241 = _mm512_fmadd_ps(ifft3232, _mm512_set1_ps(1.5625e-02f), ifft3238);
__m512 ifft3325 = _mm512_fmadd_ps(ifft3316, _mm512_set1_ps(1.5625e-02f), ifft3322);
__m512 ifft3242 = _mm512_fmsub_ps(ifft3232, _mm512_set1_ps(1.5625e-02f), ifft3238);
__m512 ifft3326 = _mm512_fmsub_ps(ifft3316, _mm512_set1_ps(1.5625e-02f), ifft3322);
__m512 ifft3243 = _mm512_add_ps(ifft3233, ifft3234);
__m512 ifft3327 = _mm512_add_ps(ifft3317, ifft3318);
__m512 ifft3244 = _mm512_sub_ps(ifft3233, ifft3234);
__m512 ifft3328 = _mm512_sub_ps(ifft3317, ifft3318);
__m512 ifft3245 = _mm512_fnmadd_ps(ifft3243, _mm512_set1_ps(7.0710677e-01f), ifft3235);
__m512 ifft3329 = _mm512_fnmadd_ps(ifft3327, _mm512_set1_ps(7.0710677e-01f), ifft3319);
__m512 ifft3246 = _mm512_fmadd_ps(ifft3243, _mm512_set1_ps(7.0710677e-01f), ifft3235);
__m512 ifft3330 = _mm512_fmadd_ps(ifft3327, _mm512_set1_ps(7.0710677e-01f), ifft3319);
__m512 ifft3247 = _mm512_fmadd_ps(ifft3244, _mm512_set1_ps(7.0710677e-01f), ifft3236);
__m512 ifft3331 = _mm512_fmadd_ps(ifft3328, _mm512_set1_ps(7.0710677e-01f), ifft3320);
__m512 ifft3248 = _mm512_fmsub_ps(ifft3244, _mm512_set1_ps(7.0710677e-01f), ifft3236);
__m512 ifft3332 = _mm512_fmsub_ps(ifft3328, _mm512_set1_ps(7.0710677e-01f), ifft3320);
__m512 ifft3249 = _mm512_add_ps(ifft3245, ifft3246);
__m512 ifft3333 = _mm512_add_ps(ifft3329, ifft3330);
__m512 ifft3250 = _mm512_sub_ps(ifft3245, ifft3246);
__m512 ifft3334 = _mm512_sub_ps(ifft3329, ifft3330);
__m512 ifft3251 = _mm512_add_ps(ifft3247, ifft3248);
__m512 ifft3335 = _mm512_add_ps(ifft3331, ifft3332);
__m512 ifft3252 = _mm512_sub_ps(ifft3247, ifft3248);
__m512 ifft3336 = _mm512_sub_ps(ifft3331, ifft3332);
__m512 ifft3253 = _mm512_fmadd_ps(ifft3249, _mm512_set1_ps(1.5625e-02f), ifft3239);
__m512 ifft3337 = _mm512_fmadd_ps(ifft3333, _mm512_set1_ps(1.5625e-02f), ifft3323);
__m512 ifft3254 = _mm512_fnmadd_ps(ifft3249, _mm512_set1_ps(1.5625e-02f), ifft3239);
__m512 ifft3338 = _mm512_fnmadd_ps(ifft3333, _mm512_set1_ps(1.5625e-02f), ifft3323);
__m512 ifft3255 = _mm512_fmadd_ps(ifft3251, _mm512_set1_ps(1.5625e-02f), ifft3241);
__m512 ifft3339 = _mm512_fmadd_ps(ifft3335, _mm512_set1_ps(1.5625e-02f), ifft3325);
__m512 ifft3256 = _mm512_fnmadd_ps(ifft3251, _mm512_set1_ps(1.5625e-02f), ifft3241);
__m512 ifft3340 = _mm512_fnmadd_ps(ifft3335, _mm512_set1_ps(1.5625e-02f), ifft3325);
__m512 ifft3257 = _mm512_fnmadd_ps(ifft3252, _mm512_set1_ps(1.5625e-02f), ifft3240);
__m512 ifft3341 = _mm512_fnmadd_ps(ifft3336, _mm512_set1_ps(1.5625e-02f), ifft3324);
__m512 ifft3258 = _mm512_fmadd_ps(ifft3252, _mm512_set1_ps(1.5625e-02f), ifft3240);
__m512 ifft3342 = _mm512_fmadd_ps(ifft3336, _mm512_set1_ps(1.5625e-02f), ifft3324);
__m512 ifft3259 = _mm512_fmadd_ps(ifft3250, _mm512_set1_ps(1.5625e-02f), ifft3242);
__m512 ifft3343 = _mm512_fmadd_ps(ifft3334, _mm512_set1_ps(1.5625e-02f), ifft3326);
__m512 ifft3260 = _mm512_fnmadd_ps(ifft3250, _mm512_set1_ps(1.5625e-02f), ifft3242);
__m512 ifft3344 = _mm512_fnmadd_ps(ifft3334, _mm512_set1_ps(1.5625e-02f), ifft3326);
__m512 dat770 = ifft3253;
__m512 dat775 = ifft3337;
__m512 dat771 = ifft3255;
__m512 dat776 = ifft3339;
__m512 dat772 = ifft3257;
__m512 dat777 = ifft3341;
__m512 dat773 = ifft3259;
__m512 dat778 = ifft3343;
__m512 dat774 = ifft3254;
__m512 dat779 = ifft3338;
(void)ifft3256;
(void)ifft3340;
(void)ifft3258;
(void)ifft3342;
(void)ifft3260;
(void)ifft3344;
dat770 = _mm512_max_ps(_mm512_setzero_ps(), dat770);
dat775 = _mm512_max_ps(_mm512_setzero_ps(), dat775);
dat771 = _mm512_max_ps(_mm512_setzero_ps(), dat771);
dat776 = _mm512_max_ps(_mm512_setzero_ps(), dat776);
dat772 = _mm512_max_ps(_mm512_setzero_ps(), dat772);
dat777 = _mm512_max_ps(_mm512_setzero_ps(), dat777);
dat773 = _mm512_max_ps(_mm512_setzero_ps(), dat773);
dat778 = _mm512_max_ps(_mm512_setzero_ps(), dat778);
dat774 = _mm512_max_ps(_mm512_setzero_ps(), dat774);
dat779 = _mm512_max_ps(_mm512_setzero_ps(), dat779);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 3, dat770);
_mm512_mask_storeu_ps(datPtr2+52008+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 7936, dat770);
_mm512_mask_storeu_ps(datPtr2+1800+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 31, dat775);
_mm512_mask_storeu_ps(datPtr2+50208+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 768, dat775);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 3, dat771);
_mm512_mask_storeu_ps(datPtr2+52456+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 7936, dat771);
_mm512_mask_storeu_ps(datPtr2+2248+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 31, dat776);
_mm512_mask_storeu_ps(datPtr2+50656+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 768, dat776);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 3, dat772);
_mm512_mask_storeu_ps(datPtr2+52904+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 7936, dat772);
_mm512_mask_storeu_ps(datPtr2+2696+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 31, dat777);
_mm512_mask_storeu_ps(datPtr2+51104+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 768, dat777);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 3, dat773);
_mm512_mask_storeu_ps(datPtr2+53352+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 7936, dat773);
_mm512_mask_storeu_ps(datPtr2+3144+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 31, dat778);
_mm512_mask_storeu_ps(datPtr2+51552+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 768, dat778);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 3, dat774);
_mm512_mask_storeu_ps(datPtr2+53800+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 7936, dat774);
_mm512_mask_storeu_ps(datPtr2+3592+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 31, dat779);
_mm512_mask_storeu_ps(datPtr2+52000+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 768, dat779);
// Two passes (t21 = 0, 1). Each pass loads sixteen vectors from sfPtr3, runs
// two interleaved copies of the same inverse-transform arithmetic on them,
// clamps the results at zero and writes ten masked vectors to datPtr2.
// NOTE(review): offsets look like byte offsets (adjacent vectors are 64 apart,
// i.e. one 16-float register) -- confirm against the sfPtr3/datPtr2 declarations.
ptrdiff_t t21 = 0;
for (; t21 < 2; ++t21) {
// Loads: four groups whose base offsets are 2166784 apart; within each group
// the vectors sit at +0/+64/+128/+192 and are named Re, Im, Re, Im.
// The 256*t21 term selects the slice for this pass.
__m512 sfRe209 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm209 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe213 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm213 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe210 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm210 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe214 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm214 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe211 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm211 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe215 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm215 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe212 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm212 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe216 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm216 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
// Only the first group (sfRe/sfIm 209 and 213) gets this extra step: two fixed
// lane permutations of Re and Im are combined with a +/-1 (or 0) weight vector.
// Mask 65021 (0xFDFD) covers every lane except 1 and 9; those two lanes keep
// the value of the first operand unchanged.
__m512i ifft3345 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft3346 = _mm512_permutexvar_ps(ifft3345, sfRe209);
__m512 ifft3437 = _mm512_permutexvar_ps(ifft3345, sfRe213);
__m512i ifft3347 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft3348 = _mm512_permutexvar_ps(ifft3347, sfRe209);
__m512 ifft3438 = _mm512_permutexvar_ps(ifft3347, sfRe213);
__m512 ifft3349 = _mm512_permutexvar_ps(ifft3345, sfIm209);
__m512 ifft3439 = _mm512_permutexvar_ps(ifft3345, sfIm213);
__m512 ifft3350 = _mm512_permutexvar_ps(ifft3347, sfIm209);
__m512 ifft3440 = _mm512_permutexvar_ps(ifft3347, sfIm213);
__m512 ifft3351 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft3352 = _mm512_mask_fmadd_ps(ifft3350, 65021, ifft3351, ifft3346);
__m512 ifft3441 = _mm512_mask_fmadd_ps(ifft3440, 65021, ifft3351, ifft3437);
__m512 ifft3353 = _mm512_mask_fnmadd_ps(ifft3349, 65021, ifft3351, ifft3348);
__m512 ifft3442 = _mm512_mask_fnmadd_ps(ifft3439, 65021, ifft3351, ifft3438);
// Butterfly on adjacent lanes: shuffle 177 swaps each even/odd lane pair, so
// x*sign + swap(x) yields (sum, difference) in every pair of lanes.
__m512 ifft3354 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft3355 = _mm512_fmadd_ps(ifft3352, ifft3354, _mm512_shuffle_ps(ifft3352, ifft3352, 177));
__m512 ifft3443 = _mm512_fmadd_ps(ifft3441, ifft3354, _mm512_shuffle_ps(ifft3441, ifft3441, 177));
__m512 ifft3356 = _mm512_fmadd_ps(ifft3353, ifft3354, _mm512_shuffle_ps(ifft3353, ifft3353, 177));
__m512 ifft3444 = _mm512_fmadd_ps(ifft3442, ifft3354, _mm512_shuffle_ps(ifft3442, ifft3442, 177));
__m512 ifft3357 = _mm512_fmadd_ps(sfRe210, ifft3354, _mm512_shuffle_ps(sfRe210, sfRe210, 177));
__m512 ifft3445 = _mm512_fmadd_ps(sfRe214, ifft3354, _mm512_shuffle_ps(sfRe214, sfRe214, 177));
__m512 ifft3358 = _mm512_fmadd_ps(sfIm210, ifft3354, _mm512_shuffle_ps(sfIm210, sfIm210, 177));
__m512 ifft3446 = _mm512_fmadd_ps(sfIm214, ifft3354, _mm512_shuffle_ps(sfIm214, sfIm214, 177));
__m512 ifft3359 = _mm512_fmadd_ps(sfRe211, ifft3354, _mm512_shuffle_ps(sfRe211, sfRe211, 177));
__m512 ifft3447 = _mm512_fmadd_ps(sfRe215, ifft3354, _mm512_shuffle_ps(sfRe215, sfRe215, 177));
__m512 ifft3360 = _mm512_fmadd_ps(sfIm211, ifft3354, _mm512_shuffle_ps(sfIm211, sfIm211, 177));
__m512 ifft3448 = _mm512_fmadd_ps(sfIm215, ifft3354, _mm512_shuffle_ps(sfIm215, sfIm215, 177));
__m512 ifft3361 = _mm512_fmadd_ps(sfRe212, ifft3354, _mm512_shuffle_ps(sfRe212, sfRe212, 177));
__m512 ifft3449 = _mm512_fmadd_ps(sfRe216, ifft3354, _mm512_shuffle_ps(sfRe216, sfRe216, 177));
__m512 ifft3362 = _mm512_fmadd_ps(sfIm212, ifft3354, _mm512_shuffle_ps(sfIm212, sfIm212, 177));
__m512 ifft3450 = _mm512_fmadd_ps(sfIm216, ifft3354, _mm512_shuffle_ps(sfIm216, sfIm216, 177));
// Per-lane rotation of each (Re-derived, Im-derived) register pair:
//   re' = re*c - im*s,  im' = re*s + im*c
// with c taken from ifft3363 and s from ifft3372 (values 0, 1, +/-0.70710677).
// NOTE(review): presumably the FFT twiddle factors -- inferred from the
// constants and the sfRe/sfIm naming, not stated anywhere in view.
__m512 ifft3363 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft3364 = _mm512_mul_ps(ifft3355, ifft3363);
__m512 ifft3451 = _mm512_mul_ps(ifft3443, ifft3363);
__m512 ifft3365 = _mm512_mul_ps(ifft3356, ifft3363);
__m512 ifft3452 = _mm512_mul_ps(ifft3444, ifft3363);
__m512 ifft3366 = _mm512_mul_ps(ifft3357, ifft3363);
__m512 ifft3453 = _mm512_mul_ps(ifft3445, ifft3363);
__m512 ifft3367 = _mm512_mul_ps(ifft3358, ifft3363);
__m512 ifft3454 = _mm512_mul_ps(ifft3446, ifft3363);
__m512 ifft3368 = _mm512_mul_ps(ifft3359, ifft3363);
__m512 ifft3455 = _mm512_mul_ps(ifft3447, ifft3363);
__m512 ifft3369 = _mm512_mul_ps(ifft3360, ifft3363);
__m512 ifft3456 = _mm512_mul_ps(ifft3448, ifft3363);
__m512 ifft3370 = _mm512_mul_ps(ifft3361, ifft3363);
__m512 ifft3457 = _mm512_mul_ps(ifft3449, ifft3363);
__m512 ifft3371 = _mm512_mul_ps(ifft3362, ifft3363);
__m512 ifft3458 = _mm512_mul_ps(ifft3450, ifft3363);
__m512 ifft3372 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft3373 = _mm512_fnmadd_ps(ifft3356, ifft3372, ifft3364);
__m512 ifft3459 = _mm512_fnmadd_ps(ifft3444, ifft3372, ifft3451);
__m512 ifft3374 = _mm512_fmadd_ps(ifft3355, ifft3372, ifft3365);
__m512 ifft3460 = _mm512_fmadd_ps(ifft3443, ifft3372, ifft3452);
__m512 ifft3375 = _mm512_fnmadd_ps(ifft3358, ifft3372, ifft3366);
__m512 ifft3461 = _mm512_fnmadd_ps(ifft3446, ifft3372, ifft3453);
__m512 ifft3376 = _mm512_fmadd_ps(ifft3357, ifft3372, ifft3367);
__m512 ifft3462 = _mm512_fmadd_ps(ifft3445, ifft3372, ifft3454);
__m512 ifft3377 = _mm512_fnmadd_ps(ifft3360, ifft3372, ifft3368);
__m512 ifft3463 = _mm512_fnmadd_ps(ifft3448, ifft3372, ifft3455);
__m512 ifft3378 = _mm512_fmadd_ps(ifft3359, ifft3372, ifft3369);
__m512 ifft3464 = _mm512_fmadd_ps(ifft3447, ifft3372, ifft3456);
__m512 ifft3379 = _mm512_fnmadd_ps(ifft3362, ifft3372, ifft3370);
__m512 ifft3465 = _mm512_fnmadd_ps(ifft3450, ifft3372, ifft3457);
__m512 ifft3380 = _mm512_fmadd_ps(ifft3361, ifft3372, ifft3371);
__m512 ifft3466 = _mm512_fmadd_ps(ifft3449, ifft3372, ifft3458);
// Butterfly at distance two: shuffle 78 swaps the two 64-bit halves of every
// 128-bit lane, giving sums in lanes 0-1 and differences in lanes 2-3 of each.
__m512 ifft3381 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft3382 = _mm512_fmadd_ps(ifft3373, ifft3381, _mm512_shuffle_ps(ifft3373, ifft3373, 78));
__m512 ifft3467 = _mm512_fmadd_ps(ifft3459, ifft3381, _mm512_shuffle_ps(ifft3459, ifft3459, 78));
__m512 ifft3383 = _mm512_fmadd_ps(ifft3374, ifft3381, _mm512_shuffle_ps(ifft3374, ifft3374, 78));
__m512 ifft3468 = _mm512_fmadd_ps(ifft3460, ifft3381, _mm512_shuffle_ps(ifft3460, ifft3460, 78));
__m512 ifft3384 = _mm512_fmadd_ps(ifft3375, ifft3381, _mm512_shuffle_ps(ifft3375, ifft3375, 78));
__m512 ifft3469 = _mm512_fmadd_ps(ifft3461, ifft3381, _mm512_shuffle_ps(ifft3461, ifft3461, 78));
__m512 ifft3385 = _mm512_fmadd_ps(ifft3376, ifft3381, _mm512_shuffle_ps(ifft3376, ifft3376, 78));
__m512 ifft3470 = _mm512_fmadd_ps(ifft3462, ifft3381, _mm512_shuffle_ps(ifft3462, ifft3462, 78));
__m512 ifft3386 = _mm512_fmadd_ps(ifft3377, ifft3381, _mm512_shuffle_ps(ifft3377, ifft3377, 78));
__m512 ifft3471 = _mm512_fmadd_ps(ifft3463, ifft3381, _mm512_shuffle_ps(ifft3463, ifft3463, 78));
__m512 ifft3387 = _mm512_fmadd_ps(ifft3378, ifft3381, _mm512_shuffle_ps(ifft3378, ifft3378, 78));
__m512 ifft3472 = _mm512_fmadd_ps(ifft3464, ifft3381, _mm512_shuffle_ps(ifft3464, ifft3464, 78));
__m512 ifft3388 = _mm512_fmadd_ps(ifft3379, ifft3381, _mm512_shuffle_ps(ifft3379, ifft3379, 78));
__m512 ifft3473 = _mm512_fmadd_ps(ifft3465, ifft3381, _mm512_shuffle_ps(ifft3465, ifft3465, 78));
__m512 ifft3389 = _mm512_fmadd_ps(ifft3380, ifft3381, _mm512_shuffle_ps(ifft3380, ifft3380, 78));
__m512 ifft3474 = _mm512_fmadd_ps(ifft3466, ifft3381, _mm512_shuffle_ps(ifft3466, ifft3466, 78));
// In lanes 6, 7, 14 and 15 only (mask 49344 = 0xC0C0) each register pair (a, b)
// becomes (-b, a); all other lanes pass through unchanged.
__m512 ifft3390 = _mm512_mask_sub_ps(ifft3382, 49344, _mm512_setzero_ps(), ifft3383);
__m512 ifft3475 = _mm512_mask_sub_ps(ifft3467, 49344, _mm512_setzero_ps(), ifft3468);
__m512 ifft3391 = _mm512_mask_mov_ps(ifft3383, 49344, ifft3382);
__m512 ifft3476 = _mm512_mask_mov_ps(ifft3468, 49344, ifft3467);
__m512 ifft3392 = _mm512_mask_sub_ps(ifft3384, 49344, _mm512_setzero_ps(), ifft3385);
__m512 ifft3477 = _mm512_mask_sub_ps(ifft3469, 49344, _mm512_setzero_ps(), ifft3470);
__m512 ifft3393 = _mm512_mask_mov_ps(ifft3385, 49344, ifft3384);
__m512 ifft3478 = _mm512_mask_mov_ps(ifft3470, 49344, ifft3469);
__m512 ifft3394 = _mm512_mask_sub_ps(ifft3386, 49344, _mm512_setzero_ps(), ifft3387);
__m512 ifft3479 = _mm512_mask_sub_ps(ifft3471, 49344, _mm512_setzero_ps(), ifft3472);
__m512 ifft3395 = _mm512_mask_mov_ps(ifft3387, 49344, ifft3386);
__m512 ifft3480 = _mm512_mask_mov_ps(ifft3472, 49344, ifft3471);
__m512 ifft3396 = _mm512_mask_sub_ps(ifft3388, 49344, _mm512_setzero_ps(), ifft3389);
__m512 ifft3481 = _mm512_mask_sub_ps(ifft3473, 49344, _mm512_setzero_ps(), ifft3474);
__m512 ifft3397 = _mm512_mask_mov_ps(ifft3389, 49344, ifft3388);
__m512 ifft3482 = _mm512_mask_mov_ps(ifft3474, 49344, ifft3473);
// Butterfly at distance four: shuffle_f32x4 with 177 swaps neighbouring 128-bit
// lanes. ifft3404/ifft3488 use fnmsub instead of fmadd, i.e. the same butterfly
// with the whole result negated.
__m512 ifft3398 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft3399 = _mm512_fmadd_ps(ifft3390, ifft3398, _mm512_shuffle_f32x4(ifft3390, ifft3390, 177));
__m512 ifft3483 = _mm512_fmadd_ps(ifft3475, ifft3398, _mm512_shuffle_f32x4(ifft3475, ifft3475, 177));
__m512 ifft3400 = _mm512_fmadd_ps(ifft3391, ifft3398, _mm512_shuffle_f32x4(ifft3391, ifft3391, 177));
__m512 ifft3484 = _mm512_fmadd_ps(ifft3476, ifft3398, _mm512_shuffle_f32x4(ifft3476, ifft3476, 177));
__m512 ifft3401 = _mm512_fmadd_ps(ifft3392, ifft3398, _mm512_shuffle_f32x4(ifft3392, ifft3392, 177));
__m512 ifft3485 = _mm512_fmadd_ps(ifft3477, ifft3398, _mm512_shuffle_f32x4(ifft3477, ifft3477, 177));
__m512 ifft3402 = _mm512_fmadd_ps(ifft3393, ifft3398, _mm512_shuffle_f32x4(ifft3393, ifft3393, 177));
__m512 ifft3486 = _mm512_fmadd_ps(ifft3478, ifft3398, _mm512_shuffle_f32x4(ifft3478, ifft3478, 177));
__m512 ifft3403 = _mm512_fmadd_ps(ifft3394, ifft3398, _mm512_shuffle_f32x4(ifft3394, ifft3394, 177));
__m512 ifft3487 = _mm512_fmadd_ps(ifft3479, ifft3398, _mm512_shuffle_f32x4(ifft3479, ifft3479, 177));
__m512 ifft3404 = _mm512_fnmsub_ps(ifft3395, ifft3398, _mm512_shuffle_f32x4(ifft3395, ifft3395, 177));
__m512 ifft3488 = _mm512_fnmsub_ps(ifft3480, ifft3398, _mm512_shuffle_f32x4(ifft3480, ifft3480, 177));
__m512 ifft3405 = _mm512_fmadd_ps(ifft3396, ifft3398, _mm512_shuffle_f32x4(ifft3396, ifft3396, 177));
__m512 ifft3489 = _mm512_fmadd_ps(ifft3481, ifft3398, _mm512_shuffle_f32x4(ifft3481, ifft3481, 177));
__m512 ifft3406 = _mm512_fmadd_ps(ifft3397, ifft3398, _mm512_shuffle_f32x4(ifft3397, ifft3397, 177));
__m512 ifft3490 = _mm512_fmadd_ps(ifft3482, ifft3398, _mm512_shuffle_f32x4(ifft3482, ifft3482, 177));
// Cross-register combination of the eight intermediates of each chain into
// eight outputs. The scale factors are folded in here: 1.5625e-02 is 1/64,
// 3.125e-02 is 1/32, and 7.0710677e-01 is sqrt(1/2).
__m512 ifft3407 = _mm512_add_ps(ifft3399, ifft3400);
__m512 ifft3491 = _mm512_add_ps(ifft3483, ifft3484);
__m512 ifft3408 = _mm512_sub_ps(ifft3399, ifft3400);
__m512 ifft3492 = _mm512_sub_ps(ifft3483, ifft3484);
__m512 ifft3409 = _mm512_sub_ps(ifft3401, ifft3405);
__m512 ifft3493 = _mm512_sub_ps(ifft3485, ifft3489);
__m512 ifft3410 = _mm512_add_ps(ifft3402, ifft3406);
__m512 ifft3494 = _mm512_add_ps(ifft3486, ifft3490);
__m512 ifft3411 = _mm512_add_ps(ifft3401, ifft3405);
__m512 ifft3495 = _mm512_add_ps(ifft3485, ifft3489);
__m512 ifft3412 = _mm512_sub_ps(ifft3402, ifft3406);
__m512 ifft3496 = _mm512_sub_ps(ifft3486, ifft3490);
__m512 ifft3413 = _mm512_mul_ps(ifft3403, _mm512_set1_ps(3.125e-02f));
__m512 ifft3497 = _mm512_mul_ps(ifft3487, _mm512_set1_ps(3.125e-02f));
__m512 ifft3414 = _mm512_mul_ps(ifft3404, _mm512_set1_ps(3.125e-02f));
__m512 ifft3498 = _mm512_mul_ps(ifft3488, _mm512_set1_ps(3.125e-02f));
__m512 ifft3415 = _mm512_fmadd_ps(ifft3407, _mm512_set1_ps(1.5625e-02f), ifft3413);
__m512 ifft3499 = _mm512_fmadd_ps(ifft3491, _mm512_set1_ps(1.5625e-02f), ifft3497);
__m512 ifft3416 = _mm512_fmsub_ps(ifft3407, _mm512_set1_ps(1.5625e-02f), ifft3413);
__m512 ifft3500 = _mm512_fmsub_ps(ifft3491, _mm512_set1_ps(1.5625e-02f), ifft3497);
__m512 ifft3417 = _mm512_fmadd_ps(ifft3408, _mm512_set1_ps(1.5625e-02f), ifft3414);
__m512 ifft3501 = _mm512_fmadd_ps(ifft3492, _mm512_set1_ps(1.5625e-02f), ifft3498);
__m512 ifft3418 = _mm512_fmsub_ps(ifft3408, _mm512_set1_ps(1.5625e-02f), ifft3414);
__m512 ifft3502 = _mm512_fmsub_ps(ifft3492, _mm512_set1_ps(1.5625e-02f), ifft3498);
__m512 ifft3419 = _mm512_add_ps(ifft3409, ifft3410);
__m512 ifft3503 = _mm512_add_ps(ifft3493, ifft3494);
__m512 ifft3420 = _mm512_sub_ps(ifft3409, ifft3410);
__m512 ifft3504 = _mm512_sub_ps(ifft3493, ifft3494);
__m512 ifft3421 = _mm512_fnmadd_ps(ifft3419, _mm512_set1_ps(7.0710677e-01f), ifft3411);
__m512 ifft3505 = _mm512_fnmadd_ps(ifft3503, _mm512_set1_ps(7.0710677e-01f), ifft3495);
__m512 ifft3422 = _mm512_fmadd_ps(ifft3419, _mm512_set1_ps(7.0710677e-01f), ifft3411);
__m512 ifft3506 = _mm512_fmadd_ps(ifft3503, _mm512_set1_ps(7.0710677e-01f), ifft3495);
__m512 ifft3423 = _mm512_fmadd_ps(ifft3420, _mm512_set1_ps(7.0710677e-01f), ifft3412);
__m512 ifft3507 = _mm512_fmadd_ps(ifft3504, _mm512_set1_ps(7.0710677e-01f), ifft3496);
__m512 ifft3424 = _mm512_fmsub_ps(ifft3420, _mm512_set1_ps(7.0710677e-01f), ifft3412);
__m512 ifft3508 = _mm512_fmsub_ps(ifft3504, _mm512_set1_ps(7.0710677e-01f), ifft3496);
__m512 ifft3425 = _mm512_add_ps(ifft3421, ifft3422);
__m512 ifft3509 = _mm512_add_ps(ifft3505, ifft3506);
__m512 ifft3426 = _mm512_sub_ps(ifft3421, ifft3422);
__m512 ifft3510 = _mm512_sub_ps(ifft3505, ifft3506);
__m512 ifft3427 = _mm512_add_ps(ifft3423, ifft3424);
__m512 ifft3511 = _mm512_add_ps(ifft3507, ifft3508);
__m512 ifft3428 = _mm512_sub_ps(ifft3423, ifft3424);
__m512 ifft3512 = _mm512_sub_ps(ifft3507, ifft3508);
__m512 ifft3429 = _mm512_fmadd_ps(ifft3425, _mm512_set1_ps(1.5625e-02f), ifft3415);
__m512 ifft3513 = _mm512_fmadd_ps(ifft3509, _mm512_set1_ps(1.5625e-02f), ifft3499);
__m512 ifft3430 = _mm512_fnmadd_ps(ifft3425, _mm512_set1_ps(1.5625e-02f), ifft3415);
__m512 ifft3514 = _mm512_fnmadd_ps(ifft3509, _mm512_set1_ps(1.5625e-02f), ifft3499);
__m512 ifft3431 = _mm512_fmadd_ps(ifft3427, _mm512_set1_ps(1.5625e-02f), ifft3417);
__m512 ifft3515 = _mm512_fmadd_ps(ifft3511, _mm512_set1_ps(1.5625e-02f), ifft3501);
__m512 ifft3432 = _mm512_fnmadd_ps(ifft3427, _mm512_set1_ps(1.5625e-02f), ifft3417);
__m512 ifft3516 = _mm512_fnmadd_ps(ifft3511, _mm512_set1_ps(1.5625e-02f), ifft3501);
__m512 ifft3433 = _mm512_fnmadd_ps(ifft3428, _mm512_set1_ps(1.5625e-02f), ifft3416);
__m512 ifft3517 = _mm512_fnmadd_ps(ifft3512, _mm512_set1_ps(1.5625e-02f), ifft3500);
__m512 ifft3434 = _mm512_fmadd_ps(ifft3428, _mm512_set1_ps(1.5625e-02f), ifft3416);
__m512 ifft3518 = _mm512_fmadd_ps(ifft3512, _mm512_set1_ps(1.5625e-02f), ifft3500);
__m512 ifft3435 = _mm512_fmadd_ps(ifft3426, _mm512_set1_ps(1.5625e-02f), ifft3418);
__m512 ifft3519 = _mm512_fmadd_ps(ifft3510, _mm512_set1_ps(1.5625e-02f), ifft3502);
__m512 ifft3436 = _mm512_fnmadd_ps(ifft3426, _mm512_set1_ps(1.5625e-02f), ifft3418);
__m512 ifft3520 = _mm512_fnmadd_ps(ifft3510, _mm512_set1_ps(1.5625e-02f), ifft3502);
// Five of the eight outputs of each chain are kept; the other three are cast
// to void so that they are marked as intentionally unused.
__m512 dat780 = ifft3429;
__m512 dat785 = ifft3513;
__m512 dat781 = ifft3431;
__m512 dat786 = ifft3515;
__m512 dat782 = ifft3433;
__m512 dat787 = ifft3517;
__m512 dat783 = ifft3435;
__m512 dat788 = ifft3519;
__m512 dat784 = ifft3430;
__m512 dat789 = ifft3514;
(void)ifft3432;
(void)ifft3516;
(void)ifft3434;
(void)ifft3518;
(void)ifft3436;
(void)ifft3520;
// Pack the two chains into one register per store. In the low ten lanes (the
// only ones stored below), pm33 yields lanes 0-4 of the first operand followed
// by lanes 0-4 of the second; pm34 yields lanes 8-12 of the second operand
// followed by lanes 8-12 of the first (index bit 4 selects the second operand).
__m512i pm33 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack161 = _mm512_permutex2var_ps(dat780, pm33, dat785);
__m512i pm34 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack162 = _mm512_permutex2var_ps(dat780, pm34, dat785);
__m512 pack163 = _mm512_permutex2var_ps(dat781, pm33, dat786);
__m512 pack164 = _mm512_permutex2var_ps(dat781, pm34, dat786);
__m512 pack165 = _mm512_permutex2var_ps(dat782, pm33, dat787);
__m512 pack166 = _mm512_permutex2var_ps(dat782, pm34, dat787);
__m512 pack167 = _mm512_permutex2var_ps(dat783, pm33, dat788);
__m512 pack168 = _mm512_permutex2var_ps(dat783, pm34, dat788);
__m512 pack169 = _mm512_permutex2var_ps(dat784, pm33, dat789);
__m512 pack170 = _mm512_permutex2var_ps(dat784, pm34, dat789);
// Clamp from below at zero (max with 0), i.e. a ReLU applied before storing.
pack161 = _mm512_max_ps(_mm512_setzero_ps(), pack161);
pack162 = _mm512_max_ps(_mm512_setzero_ps(), pack162);
pack163 = _mm512_max_ps(_mm512_setzero_ps(), pack163);
pack164 = _mm512_max_ps(_mm512_setzero_ps(), pack164);
pack165 = _mm512_max_ps(_mm512_setzero_ps(), pack165);
pack166 = _mm512_max_ps(_mm512_setzero_ps(), pack166);
pack167 = _mm512_max_ps(_mm512_setzero_ps(), pack167);
pack168 = _mm512_max_ps(_mm512_setzero_ps(), pack168);
pack169 = _mm512_max_ps(_mm512_setzero_ps(), pack169);
pack170 = _mm512_max_ps(_mm512_setzero_ps(), pack170);
// Mask 1023 writes only the low ten lanes of each register. The pm33 results
// go to base offsets 1820, 2268, ... and the pm34 results to 52060, 52508, ...;
// both series step by 448, the same coefficient that multiplies toH11.
// Each t21 pass shifts every destination by 40.
_mm512_mask_storeu_ps(datPtr2+1820+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack161);
_mm512_mask_storeu_ps(datPtr2+52060+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack162);
_mm512_mask_storeu_ps(datPtr2+2268+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack163);
_mm512_mask_storeu_ps(datPtr2+52508+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack164);
_mm512_mask_storeu_ps(datPtr2+2716+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack165);
_mm512_mask_storeu_ps(datPtr2+52956+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack166);
_mm512_mask_storeu_ps(datPtr2+3164+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack167);
_mm512_mask_storeu_ps(datPtr2+53404+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack168);
_mm512_mask_storeu_ps(datPtr2+3612+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack169);
_mm512_mask_storeu_ps(datPtr2+53852+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack170);
}
}
}
if (j5 >= last2) return;
++j5;
rel5 = 16;
}
if (rel5 < 19) {
// Taken only while rel5 < 18. Each pass of the j5 loop below loads one group
// of vectors from sfPtr3, runs them through the butterfly network whose
// temporaries are named "ifft..." (presumably an inverse FFT -- that reading
// comes from the generated names only, confirm against the generator), clamps
// the results at zero and writes them through datPtr2. On exit rel5 is 18.
if (rel5 < 18) {
// toH12 is scaled by 448 and toW12 by 4 in the store addresses further down.
ptrdiff_t toH12 = base5+20;
ptrdiff_t toW12 = -455+30*rel5;
// Last j5 value this loop may process: at most 18-rel5 passes, because j5 is
// incremented once at the bottom of every pass.
ptrdiff_t jj16 = 17-rel5+j5;
// The loop header only advances toW12 (by 30); j5 itself is stepped in the body.
for (; j5 <= jj16; toW12 += 30) {
// k36 starts at 16*w21 and stops when it reaches 16 (w21 is defined outside this view).
ptrdiff_t k36 = 16*w21;
for (; k36 != 16; ++k36) {
ptrdiff_t r13 = 0;
for (; r13 != 2; ++r13) {
ptrdiff_t t22 = 0;
for (; t22 < 3; ++t22) {
// Sixteen unaligned 16-float loads: four groups at base offsets 0, 2166784,
// 4333568 and 6500352, each holding four vectors at +0, +64, +128, +192.
// Offsets are in units of sfPtr3's pointee type, which is declared outside
// this view. The Re/Im names suggest real and imaginary parts -- TODO confirm.
__m512 sfRe217 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm217 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe221 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm221 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe218 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm218 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe222 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm222 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe219 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm219 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe223 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm223 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe220 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm220 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe224 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm224 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
// Two independent chains are computed in lockstep from here on: one fed by the
// 217..220 vectors, the other by the 221..224 vectors.
// Only the first group (217 / 221) gets this extra lane rearrangement; the
// same two index vectors are applied to both its Re and Im vectors.
__m512i ifft3521 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft3522 = _mm512_permutexvar_ps(ifft3521, sfRe217);
__m512 ifft3613 = _mm512_permutexvar_ps(ifft3521, sfRe221);
__m512i ifft3523 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft3524 = _mm512_permutexvar_ps(ifft3523, sfRe217);
__m512 ifft3614 = _mm512_permutexvar_ps(ifft3523, sfRe221);
__m512 ifft3525 = _mm512_permutexvar_ps(ifft3521, sfIm217);
__m512 ifft3615 = _mm512_permutexvar_ps(ifft3521, sfIm221);
__m512 ifft3526 = _mm512_permutexvar_ps(ifft3523, sfIm217);
__m512 ifft3616 = _mm512_permutexvar_ps(ifft3523, sfIm221);
// Masked multiply-add with a 0/+1/-1 coefficient vector. Mask 65021 (0xFDFD)
// covers every lane except 1 and 9; those two lanes keep the first operand
// unchanged.
__m512 ifft3527 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft3528 = _mm512_mask_fmadd_ps(ifft3526, 65021, ifft3527, ifft3522);
__m512 ifft3617 = _mm512_mask_fmadd_ps(ifft3616, 65021, ifft3527, ifft3613);
__m512 ifft3529 = _mm512_mask_fnmadd_ps(ifft3525, 65021, ifft3527, ifft3524);
__m512 ifft3618 = _mm512_mask_fnmadd_ps(ifft3615, 65021, ifft3527, ifft3614);
// Butterfly across adjacent lanes: shuffle 177 swaps the two lanes of each
// pair, and the +1/-1 pattern leaves the sum in the even lane and the
// difference (even minus odd) in the odd lane.
__m512 ifft3530 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft3531 = _mm512_fmadd_ps(ifft3528, ifft3530, _mm512_shuffle_ps(ifft3528, ifft3528, 177));
__m512 ifft3619 = _mm512_fmadd_ps(ifft3617, ifft3530, _mm512_shuffle_ps(ifft3617, ifft3617, 177));
__m512 ifft3532 = _mm512_fmadd_ps(ifft3529, ifft3530, _mm512_shuffle_ps(ifft3529, ifft3529, 177));
__m512 ifft3620 = _mm512_fmadd_ps(ifft3618, ifft3530, _mm512_shuffle_ps(ifft3618, ifft3618, 177));
__m512 ifft3533 = _mm512_fmadd_ps(sfRe218, ifft3530, _mm512_shuffle_ps(sfRe218, sfRe218, 177));
__m512 ifft3621 = _mm512_fmadd_ps(sfRe222, ifft3530, _mm512_shuffle_ps(sfRe222, sfRe222, 177));
__m512 ifft3534 = _mm512_fmadd_ps(sfIm218, ifft3530, _mm512_shuffle_ps(sfIm218, sfIm218, 177));
__m512 ifft3622 = _mm512_fmadd_ps(sfIm222, ifft3530, _mm512_shuffle_ps(sfIm222, sfIm222, 177));
__m512 ifft3535 = _mm512_fmadd_ps(sfRe219, ifft3530, _mm512_shuffle_ps(sfRe219, sfRe219, 177));
__m512 ifft3623 = _mm512_fmadd_ps(sfRe223, ifft3530, _mm512_shuffle_ps(sfRe223, sfRe223, 177));
__m512 ifft3536 = _mm512_fmadd_ps(sfIm219, ifft3530, _mm512_shuffle_ps(sfIm219, sfIm219, 177));
__m512 ifft3624 = _mm512_fmadd_ps(sfIm223, ifft3530, _mm512_shuffle_ps(sfIm223, sfIm223, 177));
__m512 ifft3537 = _mm512_fmadd_ps(sfRe220, ifft3530, _mm512_shuffle_ps(sfRe220, sfRe220, 177));
__m512 ifft3625 = _mm512_fmadd_ps(sfRe224, ifft3530, _mm512_shuffle_ps(sfRe224, sfRe224, 177));
__m512 ifft3538 = _mm512_fmadd_ps(sfIm220, ifft3530, _mm512_shuffle_ps(sfIm220, sfIm220, 177));
__m512 ifft3626 = _mm512_fmadd_ps(sfIm224, ifft3530, _mm512_shuffle_ps(sfIm224, sfIm224, 177));
// Per-lane factors. Together with ifft3548 below, each pair (a, b) becomes
// (a*c - b*s, a*s + b*c), i.e. the algebraic form of a complex multiply by
// (c + i*s) with c = ifft3539 and s = ifft3548. 7.0710677e-01f is ~sqrt(2)/2.
__m512 ifft3539 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft3540 = _mm512_mul_ps(ifft3531, ifft3539);
__m512 ifft3627 = _mm512_mul_ps(ifft3619, ifft3539);
__m512 ifft3541 = _mm512_mul_ps(ifft3532, ifft3539);
__m512 ifft3628 = _mm512_mul_ps(ifft3620, ifft3539);
__m512 ifft3542 = _mm512_mul_ps(ifft3533, ifft3539);
__m512 ifft3629 = _mm512_mul_ps(ifft3621, ifft3539);
__m512 ifft3543 = _mm512_mul_ps(ifft3534, ifft3539);
__m512 ifft3630 = _mm512_mul_ps(ifft3622, ifft3539);
__m512 ifft3544 = _mm512_mul_ps(ifft3535, ifft3539);
__m512 ifft3631 = _mm512_mul_ps(ifft3623, ifft3539);
__m512 ifft3545 = _mm512_mul_ps(ifft3536, ifft3539);
__m512 ifft3632 = _mm512_mul_ps(ifft3624, ifft3539);
__m512 ifft3546 = _mm512_mul_ps(ifft3537, ifft3539);
__m512 ifft3633 = _mm512_mul_ps(ifft3625, ifft3539);
__m512 ifft3547 = _mm512_mul_ps(ifft3538, ifft3539);
__m512 ifft3634 = _mm512_mul_ps(ifft3626, ifft3539);
__m512 ifft3548 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft3549 = _mm512_fnmadd_ps(ifft3532, ifft3548, ifft3540);
__m512 ifft3635 = _mm512_fnmadd_ps(ifft3620, ifft3548, ifft3627);
__m512 ifft3550 = _mm512_fmadd_ps(ifft3531, ifft3548, ifft3541);
__m512 ifft3636 = _mm512_fmadd_ps(ifft3619, ifft3548, ifft3628);
__m512 ifft3551 = _mm512_fnmadd_ps(ifft3534, ifft3548, ifft3542);
__m512 ifft3637 = _mm512_fnmadd_ps(ifft3622, ifft3548, ifft3629);
__m512 ifft3552 = _mm512_fmadd_ps(ifft3533, ifft3548, ifft3543);
__m512 ifft3638 = _mm512_fmadd_ps(ifft3621, ifft3548, ifft3630);
__m512 ifft3553 = _mm512_fnmadd_ps(ifft3536, ifft3548, ifft3544);
__m512 ifft3639 = _mm512_fnmadd_ps(ifft3624, ifft3548, ifft3631);
__m512 ifft3554 = _mm512_fmadd_ps(ifft3535, ifft3548, ifft3545);
__m512 ifft3640 = _mm512_fmadd_ps(ifft3623, ifft3548, ifft3632);
__m512 ifft3555 = _mm512_fnmadd_ps(ifft3538, ifft3548, ifft3546);
__m512 ifft3641 = _mm512_fnmadd_ps(ifft3626, ifft3548, ifft3633);
__m512 ifft3556 = _mm512_fmadd_ps(ifft3537, ifft3548, ifft3547);
__m512 ifft3642 = _mm512_fmadd_ps(ifft3625, ifft3548, ifft3634);
// Butterfly at a distance of two lanes: shuffle 78 swaps the two halves of
// every 4-lane group; lanes 0-1 of a group receive sums, lanes 2-3 differences.
__m512 ifft3557 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft3558 = _mm512_fmadd_ps(ifft3549, ifft3557, _mm512_shuffle_ps(ifft3549, ifft3549, 78));
__m512 ifft3643 = _mm512_fmadd_ps(ifft3635, ifft3557, _mm512_shuffle_ps(ifft3635, ifft3635, 78));
__m512 ifft3559 = _mm512_fmadd_ps(ifft3550, ifft3557, _mm512_shuffle_ps(ifft3550, ifft3550, 78));
__m512 ifft3644 = _mm512_fmadd_ps(ifft3636, ifft3557, _mm512_shuffle_ps(ifft3636, ifft3636, 78));
__m512 ifft3560 = _mm512_fmadd_ps(ifft3551, ifft3557, _mm512_shuffle_ps(ifft3551, ifft3551, 78));
__m512 ifft3645 = _mm512_fmadd_ps(ifft3637, ifft3557, _mm512_shuffle_ps(ifft3637, ifft3637, 78));
__m512 ifft3561 = _mm512_fmadd_ps(ifft3552, ifft3557, _mm512_shuffle_ps(ifft3552, ifft3552, 78));
__m512 ifft3646 = _mm512_fmadd_ps(ifft3638, ifft3557, _mm512_shuffle_ps(ifft3638, ifft3638, 78));
__m512 ifft3562 = _mm512_fmadd_ps(ifft3553, ifft3557, _mm512_shuffle_ps(ifft3553, ifft3553, 78));
__m512 ifft3647 = _mm512_fmadd_ps(ifft3639, ifft3557, _mm512_shuffle_ps(ifft3639, ifft3639, 78));
__m512 ifft3563 = _mm512_fmadd_ps(ifft3554, ifft3557, _mm512_shuffle_ps(ifft3554, ifft3554, 78));
__m512 ifft3648 = _mm512_fmadd_ps(ifft3640, ifft3557, _mm512_shuffle_ps(ifft3640, ifft3640, 78));
__m512 ifft3564 = _mm512_fmadd_ps(ifft3555, ifft3557, _mm512_shuffle_ps(ifft3555, ifft3555, 78));
__m512 ifft3649 = _mm512_fmadd_ps(ifft3641, ifft3557, _mm512_shuffle_ps(ifft3641, ifft3641, 78));
__m512 ifft3565 = _mm512_fmadd_ps(ifft3556, ifft3557, _mm512_shuffle_ps(ifft3556, ifft3556, 78));
__m512 ifft3650 = _mm512_fmadd_ps(ifft3642, ifft3557, _mm512_shuffle_ps(ifft3642, ifft3642, 78));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes only, the
// two vectors of each pair trade places and the one moved into the first slot
// is negated (0 - x); every other lane passes through unchanged.
__m512 ifft3566 = _mm512_mask_sub_ps(ifft3558, 49344, _mm512_setzero_ps(), ifft3559);
__m512 ifft3651 = _mm512_mask_sub_ps(ifft3643, 49344, _mm512_setzero_ps(), ifft3644);
__m512 ifft3567 = _mm512_mask_mov_ps(ifft3559, 49344, ifft3558);
__m512 ifft3652 = _mm512_mask_mov_ps(ifft3644, 49344, ifft3643);
__m512 ifft3568 = _mm512_mask_sub_ps(ifft3560, 49344, _mm512_setzero_ps(), ifft3561);
__m512 ifft3653 = _mm512_mask_sub_ps(ifft3645, 49344, _mm512_setzero_ps(), ifft3646);
__m512 ifft3569 = _mm512_mask_mov_ps(ifft3561, 49344, ifft3560);
__m512 ifft3654 = _mm512_mask_mov_ps(ifft3646, 49344, ifft3645);
__m512 ifft3570 = _mm512_mask_sub_ps(ifft3562, 49344, _mm512_setzero_ps(), ifft3563);
__m512 ifft3655 = _mm512_mask_sub_ps(ifft3647, 49344, _mm512_setzero_ps(), ifft3648);
__m512 ifft3571 = _mm512_mask_mov_ps(ifft3563, 49344, ifft3562);
__m512 ifft3656 = _mm512_mask_mov_ps(ifft3648, 49344, ifft3647);
__m512 ifft3572 = _mm512_mask_sub_ps(ifft3564, 49344, _mm512_setzero_ps(), ifft3565);
__m512 ifft3657 = _mm512_mask_sub_ps(ifft3649, 49344, _mm512_setzero_ps(), ifft3650);
__m512 ifft3573 = _mm512_mask_mov_ps(ifft3565, 49344, ifft3564);
__m512 ifft3658 = _mm512_mask_mov_ps(ifft3650, 49344, ifft3649);
// Butterfly at a distance of four lanes: shuffle_f32x4 with 177 swaps
// neighbouring 128-bit blocks; the first block of each pair receives sums, the
// second differences. ifft3580 / ifft3664 use fnmsub instead, so those two
// results come out with the opposite sign of the fmadd form.
__m512 ifft3574 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft3575 = _mm512_fmadd_ps(ifft3566, ifft3574, _mm512_shuffle_f32x4(ifft3566, ifft3566, 177));
__m512 ifft3659 = _mm512_fmadd_ps(ifft3651, ifft3574, _mm512_shuffle_f32x4(ifft3651, ifft3651, 177));
__m512 ifft3576 = _mm512_fmadd_ps(ifft3567, ifft3574, _mm512_shuffle_f32x4(ifft3567, ifft3567, 177));
__m512 ifft3660 = _mm512_fmadd_ps(ifft3652, ifft3574, _mm512_shuffle_f32x4(ifft3652, ifft3652, 177));
__m512 ifft3577 = _mm512_fmadd_ps(ifft3568, ifft3574, _mm512_shuffle_f32x4(ifft3568, ifft3568, 177));
__m512 ifft3661 = _mm512_fmadd_ps(ifft3653, ifft3574, _mm512_shuffle_f32x4(ifft3653, ifft3653, 177));
__m512 ifft3578 = _mm512_fmadd_ps(ifft3569, ifft3574, _mm512_shuffle_f32x4(ifft3569, ifft3569, 177));
__m512 ifft3662 = _mm512_fmadd_ps(ifft3654, ifft3574, _mm512_shuffle_f32x4(ifft3654, ifft3654, 177));
__m512 ifft3579 = _mm512_fmadd_ps(ifft3570, ifft3574, _mm512_shuffle_f32x4(ifft3570, ifft3570, 177));
__m512 ifft3663 = _mm512_fmadd_ps(ifft3655, ifft3574, _mm512_shuffle_f32x4(ifft3655, ifft3655, 177));
__m512 ifft3580 = _mm512_fnmsub_ps(ifft3571, ifft3574, _mm512_shuffle_f32x4(ifft3571, ifft3571, 177));
__m512 ifft3664 = _mm512_fnmsub_ps(ifft3656, ifft3574, _mm512_shuffle_f32x4(ifft3656, ifft3656, 177));
__m512 ifft3581 = _mm512_fmadd_ps(ifft3572, ifft3574, _mm512_shuffle_f32x4(ifft3572, ifft3572, 177));
__m512 ifft3665 = _mm512_fmadd_ps(ifft3657, ifft3574, _mm512_shuffle_f32x4(ifft3657, ifft3657, 177));
__m512 ifft3582 = _mm512_fmadd_ps(ifft3573, ifft3574, _mm512_shuffle_f32x4(ifft3573, ifft3573, 177));
__m512 ifft3666 = _mm512_fmadd_ps(ifft3658, ifft3574, _mm512_shuffle_f32x4(ifft3658, ifft3658, 177));
// From here the combination is between whole vectors rather than between
// lanes: sums and differences of the eight per-chain vectors, with the scale
// factors 3.125e-02 (1/32) and 1.5625e-02 (1/64) folded into the
// multiply-adds, plus one more ~sqrt(2)/2 factor.
__m512 ifft3583 = _mm512_add_ps(ifft3575, ifft3576);
__m512 ifft3667 = _mm512_add_ps(ifft3659, ifft3660);
__m512 ifft3584 = _mm512_sub_ps(ifft3575, ifft3576);
__m512 ifft3668 = _mm512_sub_ps(ifft3659, ifft3660);
__m512 ifft3585 = _mm512_sub_ps(ifft3577, ifft3581);
__m512 ifft3669 = _mm512_sub_ps(ifft3661, ifft3665);
__m512 ifft3586 = _mm512_add_ps(ifft3578, ifft3582);
__m512 ifft3670 = _mm512_add_ps(ifft3662, ifft3666);
__m512 ifft3587 = _mm512_add_ps(ifft3577, ifft3581);
__m512 ifft3671 = _mm512_add_ps(ifft3661, ifft3665);
__m512 ifft3588 = _mm512_sub_ps(ifft3578, ifft3582);
__m512 ifft3672 = _mm512_sub_ps(ifft3662, ifft3666);
__m512 ifft3589 = _mm512_mul_ps(ifft3579, _mm512_set1_ps(3.125e-02f));
__m512 ifft3673 = _mm512_mul_ps(ifft3663, _mm512_set1_ps(3.125e-02f));
__m512 ifft3590 = _mm512_mul_ps(ifft3580, _mm512_set1_ps(3.125e-02f));
__m512 ifft3674 = _mm512_mul_ps(ifft3664, _mm512_set1_ps(3.125e-02f));
__m512 ifft3591 = _mm512_fmadd_ps(ifft3583, _mm512_set1_ps(1.5625e-02f), ifft3589);
__m512 ifft3675 = _mm512_fmadd_ps(ifft3667, _mm512_set1_ps(1.5625e-02f), ifft3673);
__m512 ifft3592 = _mm512_fmsub_ps(ifft3583, _mm512_set1_ps(1.5625e-02f), ifft3589);
__m512 ifft3676 = _mm512_fmsub_ps(ifft3667, _mm512_set1_ps(1.5625e-02f), ifft3673);
__m512 ifft3593 = _mm512_fmadd_ps(ifft3584, _mm512_set1_ps(1.5625e-02f), ifft3590);
__m512 ifft3677 = _mm512_fmadd_ps(ifft3668, _mm512_set1_ps(1.5625e-02f), ifft3674);
__m512 ifft3594 = _mm512_fmsub_ps(ifft3584, _mm512_set1_ps(1.5625e-02f), ifft3590);
__m512 ifft3678 = _mm512_fmsub_ps(ifft3668, _mm512_set1_ps(1.5625e-02f), ifft3674);
__m512 ifft3595 = _mm512_add_ps(ifft3585, ifft3586);
__m512 ifft3679 = _mm512_add_ps(ifft3669, ifft3670);
__m512 ifft3596 = _mm512_sub_ps(ifft3585, ifft3586);
__m512 ifft3680 = _mm512_sub_ps(ifft3669, ifft3670);
__m512 ifft3597 = _mm512_fnmadd_ps(ifft3595, _mm512_set1_ps(7.0710677e-01f), ifft3587);
__m512 ifft3681 = _mm512_fnmadd_ps(ifft3679, _mm512_set1_ps(7.0710677e-01f), ifft3671);
__m512 ifft3598 = _mm512_fmadd_ps(ifft3595, _mm512_set1_ps(7.0710677e-01f), ifft3587);
__m512 ifft3682 = _mm512_fmadd_ps(ifft3679, _mm512_set1_ps(7.0710677e-01f), ifft3671);
__m512 ifft3599 = _mm512_fmadd_ps(ifft3596, _mm512_set1_ps(7.0710677e-01f), ifft3588);
__m512 ifft3683 = _mm512_fmadd_ps(ifft3680, _mm512_set1_ps(7.0710677e-01f), ifft3672);
__m512 ifft3600 = _mm512_fmsub_ps(ifft3596, _mm512_set1_ps(7.0710677e-01f), ifft3588);
__m512 ifft3684 = _mm512_fmsub_ps(ifft3680, _mm512_set1_ps(7.0710677e-01f), ifft3672);
__m512 ifft3601 = _mm512_add_ps(ifft3597, ifft3598);
__m512 ifft3685 = _mm512_add_ps(ifft3681, ifft3682);
__m512 ifft3602 = _mm512_sub_ps(ifft3597, ifft3598);
__m512 ifft3686 = _mm512_sub_ps(ifft3681, ifft3682);
__m512 ifft3603 = _mm512_add_ps(ifft3599, ifft3600);
__m512 ifft3687 = _mm512_add_ps(ifft3683, ifft3684);
__m512 ifft3604 = _mm512_sub_ps(ifft3599, ifft3600);
__m512 ifft3688 = _mm512_sub_ps(ifft3683, ifft3684);
__m512 ifft3605 = _mm512_fmadd_ps(ifft3601, _mm512_set1_ps(1.5625e-02f), ifft3591);
__m512 ifft3689 = _mm512_fmadd_ps(ifft3685, _mm512_set1_ps(1.5625e-02f), ifft3675);
__m512 ifft3606 = _mm512_fnmadd_ps(ifft3601, _mm512_set1_ps(1.5625e-02f), ifft3591);
__m512 ifft3690 = _mm512_fnmadd_ps(ifft3685, _mm512_set1_ps(1.5625e-02f), ifft3675);
__m512 ifft3607 = _mm512_fmadd_ps(ifft3603, _mm512_set1_ps(1.5625e-02f), ifft3593);
__m512 ifft3691 = _mm512_fmadd_ps(ifft3687, _mm512_set1_ps(1.5625e-02f), ifft3677);
__m512 ifft3608 = _mm512_fnmadd_ps(ifft3603, _mm512_set1_ps(1.5625e-02f), ifft3593);
__m512 ifft3692 = _mm512_fnmadd_ps(ifft3687, _mm512_set1_ps(1.5625e-02f), ifft3677);
__m512 ifft3609 = _mm512_fnmadd_ps(ifft3604, _mm512_set1_ps(1.5625e-02f), ifft3592);
__m512 ifft3693 = _mm512_fnmadd_ps(ifft3688, _mm512_set1_ps(1.5625e-02f), ifft3676);
__m512 ifft3610 = _mm512_fmadd_ps(ifft3604, _mm512_set1_ps(1.5625e-02f), ifft3592);
__m512 ifft3694 = _mm512_fmadd_ps(ifft3688, _mm512_set1_ps(1.5625e-02f), ifft3676);
__m512 ifft3611 = _mm512_fmadd_ps(ifft3602, _mm512_set1_ps(1.5625e-02f), ifft3594);
__m512 ifft3695 = _mm512_fmadd_ps(ifft3686, _mm512_set1_ps(1.5625e-02f), ifft3678);
__m512 ifft3612 = _mm512_fnmadd_ps(ifft3602, _mm512_set1_ps(1.5625e-02f), ifft3594);
__m512 ifft3696 = _mm512_fnmadd_ps(ifft3686, _mm512_set1_ps(1.5625e-02f), ifft3678);
// Five of the eight result vectors per chain are kept: dat790..dat794 for the
// first chain, dat795..dat799 for the second.
__m512 dat790 = ifft3605;
__m512 dat795 = ifft3689;
__m512 dat791 = ifft3607;
__m512 dat796 = ifft3691;
__m512 dat792 = ifft3609;
__m512 dat797 = ifft3693;
__m512 dat793 = ifft3611;
__m512 dat798 = ifft3695;
__m512 dat794 = ifft3606;
__m512 dat799 = ifft3690;
// The other three results per chain are not used in this branch; the void
// casts only mark them as referenced.
(void)ifft3608;
(void)ifft3692;
(void)ifft3610;
(void)ifft3694;
(void)ifft3612;
(void)ifft3696;
// Merge the two chains. In permutex2var, indices 0-15 pick from the first
// source and 16-31 from the second. pm35 places lanes 0-4 of the first source
// in result lanes 0-4 and lanes 0-4 of the second in result lanes 5-9. pm36
// places lanes 8-12 of the second source in result lanes 0-4 and lanes 8-12 of
// the first in result lanes 5-9. Result lanes 10-15 are never stored.
__m512i pm35 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack171 = _mm512_permutex2var_ps(dat790, pm35, dat795);
__m512i pm36 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack172 = _mm512_permutex2var_ps(dat790, pm36, dat795);
__m512 pack173 = _mm512_permutex2var_ps(dat791, pm35, dat796);
__m512 pack174 = _mm512_permutex2var_ps(dat791, pm36, dat796);
__m512 pack175 = _mm512_permutex2var_ps(dat792, pm35, dat797);
__m512 pack176 = _mm512_permutex2var_ps(dat792, pm36, dat797);
__m512 pack177 = _mm512_permutex2var_ps(dat793, pm35, dat798);
__m512 pack178 = _mm512_permutex2var_ps(dat793, pm36, dat798);
__m512 pack179 = _mm512_permutex2var_ps(dat794, pm35, dat799);
__m512 pack180 = _mm512_permutex2var_ps(dat794, pm36, dat799);
// Clamp at zero: max(0, x) per lane (ReLU-shaped).
pack171 = _mm512_max_ps(_mm512_setzero_ps(), pack171);
pack172 = _mm512_max_ps(_mm512_setzero_ps(), pack172);
pack173 = _mm512_max_ps(_mm512_setzero_ps(), pack173);
pack174 = _mm512_max_ps(_mm512_setzero_ps(), pack174);
pack175 = _mm512_max_ps(_mm512_setzero_ps(), pack175);
pack176 = _mm512_max_ps(_mm512_setzero_ps(), pack176);
pack177 = _mm512_max_ps(_mm512_setzero_ps(), pack177);
pack178 = _mm512_max_ps(_mm512_setzero_ps(), pack178);
pack179 = _mm512_max_ps(_mm512_setzero_ps(), pack179);
pack180 = _mm512_max_ps(_mm512_setzero_ps(), pack180);
// Mask 1023 (0x3FF) writes only the low ten floats of each vector. The pm35
// results go to base offsets 0, 448, 896, 1344, 1792; the pm36 results go to
// the same offsets plus 50240. The meaning of the 448 / 50240 strides depends
// on datPtr2's type and the tensor layout, neither visible here -- TODO confirm.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack171);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack172);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack173);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack174);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack175);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack176);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack177);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack178);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack179);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack180);
}
}
}
// Leave the enclosing function once j5 has reached last2; otherwise move on to
// the next j5 (the for header then advances toW12 by 30).
if (j5 >= last2) return;
++j5;
}
// Record the position reached so the code after this block continues from 18.
rel5 = 18;
}
ptrdiff_t toH13 = base5+20;
ptrdiff_t toW13 = 85;
ptrdiff_t k37 = 16*w21;
for (; k37 != 16; ++k37) {
ptrdiff_t r14 = 0;
for (; r14 != 2; ++r14) {
ptrdiff_t t23 = 0;
for (; t23 < 2; ++t23) {
__m512 sfRe225 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm225 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe229 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm229 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe226 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm226 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe230 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm230 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe227 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm227 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe231 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm231 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe228 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm228 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe232 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm232 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512i ifft3697 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft3698 = _mm512_permutexvar_ps(ifft3697, sfRe225);
__m512 ifft3789 = _mm512_permutexvar_ps(ifft3697, sfRe229);
__m512i ifft3699 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft3700 = _mm512_permutexvar_ps(ifft3699, sfRe225);
__m512 ifft3790 = _mm512_permutexvar_ps(ifft3699, sfRe229);
__m512 ifft3701 = _mm512_permutexvar_ps(ifft3697, sfIm225);
__m512 ifft3791 = _mm512_permutexvar_ps(ifft3697, sfIm229);
__m512 ifft3702 = _mm512_permutexvar_ps(ifft3699, sfIm225);
__m512 ifft3792 = _mm512_permutexvar_ps(ifft3699, sfIm229);
__m512 ifft3703 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft3704 = _mm512_mask_fmadd_ps(ifft3702, 65021, ifft3703, ifft3698);
__m512 ifft3793 = _mm512_mask_fmadd_ps(ifft3792, 65021, ifft3703, ifft3789);
__m512 ifft3705 = _mm512_mask_fnmadd_ps(ifft3701, 65021, ifft3703, ifft3700);
__m512 ifft3794 = _mm512_mask_fnmadd_ps(ifft3791, 65021, ifft3703, ifft3790);
__m512 ifft3706 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft3707 = _mm512_fmadd_ps(ifft3704, ifft3706, _mm512_shuffle_ps(ifft3704, ifft3704, 177));
__m512 ifft3795 = _mm512_fmadd_ps(ifft3793, ifft3706, _mm512_shuffle_ps(ifft3793, ifft3793, 177));
__m512 ifft3708 = _mm512_fmadd_ps(ifft3705, ifft3706, _mm512_shuffle_ps(ifft3705, ifft3705, 177));
__m512 ifft3796 = _mm512_fmadd_ps(ifft3794, ifft3706, _mm512_shuffle_ps(ifft3794, ifft3794, 177));
__m512 ifft3709 = _mm512_fmadd_ps(sfRe226, ifft3706, _mm512_shuffle_ps(sfRe226, sfRe226, 177));
__m512 ifft3797 = _mm512_fmadd_ps(sfRe230, ifft3706, _mm512_shuffle_ps(sfRe230, sfRe230, 177));
__m512 ifft3710 = _mm512_fmadd_ps(sfIm226, ifft3706, _mm512_shuffle_ps(sfIm226, sfIm226, 177));
__m512 ifft3798 = _mm512_fmadd_ps(sfIm230, ifft3706, _mm512_shuffle_ps(sfIm230, sfIm230, 177));
__m512 ifft3711 = _mm512_fmadd_ps(sfRe227, ifft3706, _mm512_shuffle_ps(sfRe227, sfRe227, 177));
__m512 ifft3799 = _mm512_fmadd_ps(sfRe231, ifft3706, _mm512_shuffle_ps(sfRe231, sfRe231, 177));
__m512 ifft3712 = _mm512_fmadd_ps(sfIm227, ifft3706, _mm512_shuffle_ps(sfIm227, sfIm227, 177));
__m512 ifft3800 = _mm512_fmadd_ps(sfIm231, ifft3706, _mm512_shuffle_ps(sfIm231, sfIm231, 177));
__m512 ifft3713 = _mm512_fmadd_ps(sfRe228, ifft3706, _mm512_shuffle_ps(sfRe228, sfRe228, 177));
__m512 ifft3801 = _mm512_fmadd_ps(sfRe232, ifft3706, _mm512_shuffle_ps(sfRe232, sfRe232, 177));
__m512 ifft3714 = _mm512_fmadd_ps(sfIm228, ifft3706, _mm512_shuffle_ps(sfIm228, sfIm228, 177));
__m512 ifft3802 = _mm512_fmadd_ps(sfIm232, ifft3706, _mm512_shuffle_ps(sfIm232, sfIm232, 177));
__m512 ifft3715 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft3716 = _mm512_mul_ps(ifft3707, ifft3715);
__m512 ifft3803 = _mm512_mul_ps(ifft3795, ifft3715);
__m512 ifft3717 = _mm512_mul_ps(ifft3708, ifft3715);
__m512 ifft3804 = _mm512_mul_ps(ifft3796, ifft3715);
__m512 ifft3718 = _mm512_mul_ps(ifft3709, ifft3715);
__m512 ifft3805 = _mm512_mul_ps(ifft3797, ifft3715);
__m512 ifft3719 = _mm512_mul_ps(ifft3710, ifft3715);
__m512 ifft3806 = _mm512_mul_ps(ifft3798, ifft3715);
__m512 ifft3720 = _mm512_mul_ps(ifft3711, ifft3715);
__m512 ifft3807 = _mm512_mul_ps(ifft3799, ifft3715);
__m512 ifft3721 = _mm512_mul_ps(ifft3712, ifft3715);
__m512 ifft3808 = _mm512_mul_ps(ifft3800, ifft3715);
__m512 ifft3722 = _mm512_mul_ps(ifft3713, ifft3715);
__m512 ifft3809 = _mm512_mul_ps(ifft3801, ifft3715);
__m512 ifft3723 = _mm512_mul_ps(ifft3714, ifft3715);
__m512 ifft3810 = _mm512_mul_ps(ifft3802, ifft3715);
__m512 ifft3724 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft3725 = _mm512_fnmadd_ps(ifft3708, ifft3724, ifft3716);
__m512 ifft3811 = _mm512_fnmadd_ps(ifft3796, ifft3724, ifft3803);
__m512 ifft3726 = _mm512_fmadd_ps(ifft3707, ifft3724, ifft3717);
__m512 ifft3812 = _mm512_fmadd_ps(ifft3795, ifft3724, ifft3804);
__m512 ifft3727 = _mm512_fnmadd_ps(ifft3710, ifft3724, ifft3718);
__m512 ifft3813 = _mm512_fnmadd_ps(ifft3798, ifft3724, ifft3805);
__m512 ifft3728 = _mm512_fmadd_ps(ifft3709, ifft3724, ifft3719);
__m512 ifft3814 = _mm512_fmadd_ps(ifft3797, ifft3724, ifft3806);
__m512 ifft3729 = _mm512_fnmadd_ps(ifft3712, ifft3724, ifft3720);
__m512 ifft3815 = _mm512_fnmadd_ps(ifft3800, ifft3724, ifft3807);
__m512 ifft3730 = _mm512_fmadd_ps(ifft3711, ifft3724, ifft3721);
__m512 ifft3816 = _mm512_fmadd_ps(ifft3799, ifft3724, ifft3808);
__m512 ifft3731 = _mm512_fnmadd_ps(ifft3714, ifft3724, ifft3722);
__m512 ifft3817 = _mm512_fnmadd_ps(ifft3802, ifft3724, ifft3809);
__m512 ifft3732 = _mm512_fmadd_ps(ifft3713, ifft3724, ifft3723);
__m512 ifft3818 = _mm512_fmadd_ps(ifft3801, ifft3724, ifft3810);
__m512 ifft3733 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft3734 = _mm512_fmadd_ps(ifft3725, ifft3733, _mm512_shuffle_ps(ifft3725, ifft3725, 78));
__m512 ifft3819 = _mm512_fmadd_ps(ifft3811, ifft3733, _mm512_shuffle_ps(ifft3811, ifft3811, 78));
__m512 ifft3735 = _mm512_fmadd_ps(ifft3726, ifft3733, _mm512_shuffle_ps(ifft3726, ifft3726, 78));
__m512 ifft3820 = _mm512_fmadd_ps(ifft3812, ifft3733, _mm512_shuffle_ps(ifft3812, ifft3812, 78));
__m512 ifft3736 = _mm512_fmadd_ps(ifft3727, ifft3733, _mm512_shuffle_ps(ifft3727, ifft3727, 78));
__m512 ifft3821 = _mm512_fmadd_ps(ifft3813, ifft3733, _mm512_shuffle_ps(ifft3813, ifft3813, 78));
__m512 ifft3737 = _mm512_fmadd_ps(ifft3728, ifft3733, _mm512_shuffle_ps(ifft3728, ifft3728, 78));
__m512 ifft3822 = _mm512_fmadd_ps(ifft3814, ifft3733, _mm512_shuffle_ps(ifft3814, ifft3814, 78));
__m512 ifft3738 = _mm512_fmadd_ps(ifft3729, ifft3733, _mm512_shuffle_ps(ifft3729, ifft3729, 78));
__m512 ifft3823 = _mm512_fmadd_ps(ifft3815, ifft3733, _mm512_shuffle_ps(ifft3815, ifft3815, 78));
__m512 ifft3739 = _mm512_fmadd_ps(ifft3730, ifft3733, _mm512_shuffle_ps(ifft3730, ifft3730, 78));
__m512 ifft3824 = _mm512_fmadd_ps(ifft3816, ifft3733, _mm512_shuffle_ps(ifft3816, ifft3816, 78));
__m512 ifft3740 = _mm512_fmadd_ps(ifft3731, ifft3733, _mm512_shuffle_ps(ifft3731, ifft3731, 78));
__m512 ifft3825 = _mm512_fmadd_ps(ifft3817, ifft3733, _mm512_shuffle_ps(ifft3817, ifft3817, 78));
__m512 ifft3741 = _mm512_fmadd_ps(ifft3732, ifft3733, _mm512_shuffle_ps(ifft3732, ifft3732, 78));
__m512 ifft3826 = _mm512_fmadd_ps(ifft3818, ifft3733, _mm512_shuffle_ps(ifft3818, ifft3818, 78));
__m512 ifft3742 = _mm512_mask_sub_ps(ifft3734, 49344, _mm512_setzero_ps(), ifft3735);
__m512 ifft3827 = _mm512_mask_sub_ps(ifft3819, 49344, _mm512_setzero_ps(), ifft3820);
__m512 ifft3743 = _mm512_mask_mov_ps(ifft3735, 49344, ifft3734);
__m512 ifft3828 = _mm512_mask_mov_ps(ifft3820, 49344, ifft3819);
__m512 ifft3744 = _mm512_mask_sub_ps(ifft3736, 49344, _mm512_setzero_ps(), ifft3737);
__m512 ifft3829 = _mm512_mask_sub_ps(ifft3821, 49344, _mm512_setzero_ps(), ifft3822);
__m512 ifft3745 = _mm512_mask_mov_ps(ifft3737, 49344, ifft3736);
__m512 ifft3830 = _mm512_mask_mov_ps(ifft3822, 49344, ifft3821);
__m512 ifft3746 = _mm512_mask_sub_ps(ifft3738, 49344, _mm512_setzero_ps(), ifft3739);
__m512 ifft3831 = _mm512_mask_sub_ps(ifft3823, 49344, _mm512_setzero_ps(), ifft3824);
__m512 ifft3747 = _mm512_mask_mov_ps(ifft3739, 49344, ifft3738);
__m512 ifft3832 = _mm512_mask_mov_ps(ifft3824, 49344, ifft3823);
__m512 ifft3748 = _mm512_mask_sub_ps(ifft3740, 49344, _mm512_setzero_ps(), ifft3741);
__m512 ifft3833 = _mm512_mask_sub_ps(ifft3825, 49344, _mm512_setzero_ps(), ifft3826);
__m512 ifft3749 = _mm512_mask_mov_ps(ifft3741, 49344, ifft3740);
__m512 ifft3834 = _mm512_mask_mov_ps(ifft3826, 49344, ifft3825);
__m512 ifft3750 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft3751 = _mm512_fmadd_ps(ifft3742, ifft3750, _mm512_shuffle_f32x4(ifft3742, ifft3742, 177));
__m512 ifft3835 = _mm512_fmadd_ps(ifft3827, ifft3750, _mm512_shuffle_f32x4(ifft3827, ifft3827, 177));
__m512 ifft3752 = _mm512_fmadd_ps(ifft3743, ifft3750, _mm512_shuffle_f32x4(ifft3743, ifft3743, 177));
__m512 ifft3836 = _mm512_fmadd_ps(ifft3828, ifft3750, _mm512_shuffle_f32x4(ifft3828, ifft3828, 177));
__m512 ifft3753 = _mm512_fmadd_ps(ifft3744, ifft3750, _mm512_shuffle_f32x4(ifft3744, ifft3744, 177));
__m512 ifft3837 = _mm512_fmadd_ps(ifft3829, ifft3750, _mm512_shuffle_f32x4(ifft3829, ifft3829, 177));
__m512 ifft3754 = _mm512_fmadd_ps(ifft3745, ifft3750, _mm512_shuffle_f32x4(ifft3745, ifft3745, 177));
__m512 ifft3838 = _mm512_fmadd_ps(ifft3830, ifft3750, _mm512_shuffle_f32x4(ifft3830, ifft3830, 177));
__m512 ifft3755 = _mm512_fmadd_ps(ifft3746, ifft3750, _mm512_shuffle_f32x4(ifft3746, ifft3746, 177));
__m512 ifft3839 = _mm512_fmadd_ps(ifft3831, ifft3750, _mm512_shuffle_f32x4(ifft3831, ifft3831, 177));
__m512 ifft3756 = _mm512_fnmsub_ps(ifft3747, ifft3750, _mm512_shuffle_f32x4(ifft3747, ifft3747, 177));
__m512 ifft3840 = _mm512_fnmsub_ps(ifft3832, ifft3750, _mm512_shuffle_f32x4(ifft3832, ifft3832, 177));
__m512 ifft3757 = _mm512_fmadd_ps(ifft3748, ifft3750, _mm512_shuffle_f32x4(ifft3748, ifft3748, 177));
__m512 ifft3841 = _mm512_fmadd_ps(ifft3833, ifft3750, _mm512_shuffle_f32x4(ifft3833, ifft3833, 177));
__m512 ifft3758 = _mm512_fmadd_ps(ifft3749, ifft3750, _mm512_shuffle_f32x4(ifft3749, ifft3749, 177));
__m512 ifft3842 = _mm512_fmadd_ps(ifft3834, ifft3750, _mm512_shuffle_f32x4(ifft3834, ifft3834, 177));
__m512 ifft3759 = _mm512_add_ps(ifft3751, ifft3752);
__m512 ifft3843 = _mm512_add_ps(ifft3835, ifft3836);
__m512 ifft3760 = _mm512_sub_ps(ifft3751, ifft3752);
__m512 ifft3844 = _mm512_sub_ps(ifft3835, ifft3836);
__m512 ifft3761 = _mm512_sub_ps(ifft3753, ifft3757);
__m512 ifft3845 = _mm512_sub_ps(ifft3837, ifft3841);
__m512 ifft3762 = _mm512_add_ps(ifft3754, ifft3758);
__m512 ifft3846 = _mm512_add_ps(ifft3838, ifft3842);
__m512 ifft3763 = _mm512_add_ps(ifft3753, ifft3757);
__m512 ifft3847 = _mm512_add_ps(ifft3837, ifft3841);
__m512 ifft3764 = _mm512_sub_ps(ifft3754, ifft3758);
__m512 ifft3848 = _mm512_sub_ps(ifft3838, ifft3842);
__m512 ifft3765 = _mm512_mul_ps(ifft3755, _mm512_set1_ps(3.125e-02f));
__m512 ifft3849 = _mm512_mul_ps(ifft3839, _mm512_set1_ps(3.125e-02f));
__m512 ifft3766 = _mm512_mul_ps(ifft3756, _mm512_set1_ps(3.125e-02f));
__m512 ifft3850 = _mm512_mul_ps(ifft3840, _mm512_set1_ps(3.125e-02f));
__m512 ifft3767 = _mm512_fmadd_ps(ifft3759, _mm512_set1_ps(1.5625e-02f), ifft3765);
__m512 ifft3851 = _mm512_fmadd_ps(ifft3843, _mm512_set1_ps(1.5625e-02f), ifft3849);
__m512 ifft3768 = _mm512_fmsub_ps(ifft3759, _mm512_set1_ps(1.5625e-02f), ifft3765);
__m512 ifft3852 = _mm512_fmsub_ps(ifft3843, _mm512_set1_ps(1.5625e-02f), ifft3849);
__m512 ifft3769 = _mm512_fmadd_ps(ifft3760, _mm512_set1_ps(1.5625e-02f), ifft3766);
__m512 ifft3853 = _mm512_fmadd_ps(ifft3844, _mm512_set1_ps(1.5625e-02f), ifft3850);
__m512 ifft3770 = _mm512_fmsub_ps(ifft3760, _mm512_set1_ps(1.5625e-02f), ifft3766);
__m512 ifft3854 = _mm512_fmsub_ps(ifft3844, _mm512_set1_ps(1.5625e-02f), ifft3850);
__m512 ifft3771 = _mm512_add_ps(ifft3761, ifft3762);
__m512 ifft3855 = _mm512_add_ps(ifft3845, ifft3846);
__m512 ifft3772 = _mm512_sub_ps(ifft3761, ifft3762);
__m512 ifft3856 = _mm512_sub_ps(ifft3845, ifft3846);
__m512 ifft3773 = _mm512_fnmadd_ps(ifft3771, _mm512_set1_ps(7.0710677e-01f), ifft3763);
__m512 ifft3857 = _mm512_fnmadd_ps(ifft3855, _mm512_set1_ps(7.0710677e-01f), ifft3847);
__m512 ifft3774 = _mm512_fmadd_ps(ifft3771, _mm512_set1_ps(7.0710677e-01f), ifft3763);
__m512 ifft3858 = _mm512_fmadd_ps(ifft3855, _mm512_set1_ps(7.0710677e-01f), ifft3847);
__m512 ifft3775 = _mm512_fmadd_ps(ifft3772, _mm512_set1_ps(7.0710677e-01f), ifft3764);
__m512 ifft3859 = _mm512_fmadd_ps(ifft3856, _mm512_set1_ps(7.0710677e-01f), ifft3848);
__m512 ifft3776 = _mm512_fmsub_ps(ifft3772, _mm512_set1_ps(7.0710677e-01f), ifft3764);
__m512 ifft3860 = _mm512_fmsub_ps(ifft3856, _mm512_set1_ps(7.0710677e-01f), ifft3848);
__m512 ifft3777 = _mm512_add_ps(ifft3773, ifft3774);
__m512 ifft3861 = _mm512_add_ps(ifft3857, ifft3858);
__m512 ifft3778 = _mm512_sub_ps(ifft3773, ifft3774);
__m512 ifft3862 = _mm512_sub_ps(ifft3857, ifft3858);
__m512 ifft3779 = _mm512_add_ps(ifft3775, ifft3776);
__m512 ifft3863 = _mm512_add_ps(ifft3859, ifft3860);
__m512 ifft3780 = _mm512_sub_ps(ifft3775, ifft3776);
__m512 ifft3864 = _mm512_sub_ps(ifft3859, ifft3860);
__m512 ifft3781 = _mm512_fmadd_ps(ifft3777, _mm512_set1_ps(1.5625e-02f), ifft3767);
__m512 ifft3865 = _mm512_fmadd_ps(ifft3861, _mm512_set1_ps(1.5625e-02f), ifft3851);
__m512 ifft3782 = _mm512_fnmadd_ps(ifft3777, _mm512_set1_ps(1.5625e-02f), ifft3767);
__m512 ifft3866 = _mm512_fnmadd_ps(ifft3861, _mm512_set1_ps(1.5625e-02f), ifft3851);
__m512 ifft3783 = _mm512_fmadd_ps(ifft3779, _mm512_set1_ps(1.5625e-02f), ifft3769);
__m512 ifft3867 = _mm512_fmadd_ps(ifft3863, _mm512_set1_ps(1.5625e-02f), ifft3853);
__m512 ifft3784 = _mm512_fnmadd_ps(ifft3779, _mm512_set1_ps(1.5625e-02f), ifft3769);
__m512 ifft3868 = _mm512_fnmadd_ps(ifft3863, _mm512_set1_ps(1.5625e-02f), ifft3853);
__m512 ifft3785 = _mm512_fnmadd_ps(ifft3780, _mm512_set1_ps(1.5625e-02f), ifft3768);
__m512 ifft3869 = _mm512_fnmadd_ps(ifft3864, _mm512_set1_ps(1.5625e-02f), ifft3852);
__m512 ifft3786 = _mm512_fmadd_ps(ifft3780, _mm512_set1_ps(1.5625e-02f), ifft3768);
__m512 ifft3870 = _mm512_fmadd_ps(ifft3864, _mm512_set1_ps(1.5625e-02f), ifft3852);
__m512 ifft3787 = _mm512_fmadd_ps(ifft3778, _mm512_set1_ps(1.5625e-02f), ifft3770);
__m512 ifft3871 = _mm512_fmadd_ps(ifft3862, _mm512_set1_ps(1.5625e-02f), ifft3854);
__m512 ifft3788 = _mm512_fnmadd_ps(ifft3778, _mm512_set1_ps(1.5625e-02f), ifft3770);
__m512 ifft3872 = _mm512_fnmadd_ps(ifft3862, _mm512_set1_ps(1.5625e-02f), ifft3854);
__m512 dat800 = ifft3781;
__m512 dat805 = ifft3865;
__m512 dat801 = ifft3783;
__m512 dat806 = ifft3867;
__m512 dat802 = ifft3785;
__m512 dat807 = ifft3869;
__m512 dat803 = ifft3787;
__m512 dat808 = ifft3871;
__m512 dat804 = ifft3782;
__m512 dat809 = ifft3866;
(void)ifft3784;
(void)ifft3868;
(void)ifft3786;
(void)ifft3870;
(void)ifft3788;
(void)ifft3872;
__m512i pm37 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack181 = _mm512_permutex2var_ps(dat800, pm37, dat805);
__m512i pm38 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack182 = _mm512_permutex2var_ps(dat800, pm38, dat805);
__m512 pack183 = _mm512_permutex2var_ps(dat801, pm37, dat806);
__m512 pack184 = _mm512_permutex2var_ps(dat801, pm38, dat806);
__m512 pack185 = _mm512_permutex2var_ps(dat802, pm37, dat807);
__m512 pack186 = _mm512_permutex2var_ps(dat802, pm38, dat807);
__m512 pack187 = _mm512_permutex2var_ps(dat803, pm37, dat808);
__m512 pack188 = _mm512_permutex2var_ps(dat803, pm38, dat808);
__m512 pack189 = _mm512_permutex2var_ps(dat804, pm37, dat809);
__m512 pack190 = _mm512_permutex2var_ps(dat804, pm38, dat809);
pack181 = _mm512_max_ps(_mm512_setzero_ps(), pack181);
pack182 = _mm512_max_ps(_mm512_setzero_ps(), pack182);
pack183 = _mm512_max_ps(_mm512_setzero_ps(), pack183);
pack184 = _mm512_max_ps(_mm512_setzero_ps(), pack184);
pack185 = _mm512_max_ps(_mm512_setzero_ps(), pack185);
pack186 = _mm512_max_ps(_mm512_setzero_ps(), pack186);
pack187 = _mm512_max_ps(_mm512_setzero_ps(), pack187);
pack188 = _mm512_max_ps(_mm512_setzero_ps(), pack188);
pack189 = _mm512_max_ps(_mm512_setzero_ps(), pack189);
pack190 = _mm512_max_ps(_mm512_setzero_ps(), pack190);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack181);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack182);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack183);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack184);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack185);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack186);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack187);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack188);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack189);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack190);
}
// Tail step with t24 fixed at 0: load sixteen vectors from sfPtr3, push them
// through the permute/butterfly network below, clamp at zero and write ten
// masked rows to datPtr2. Two independent streams are interleaved statement
// by statement: stream A uses sf{Re,Im}233..236 (ifft3874..ifft3964) and
// stream B uses sf{Re,Im}237..240 (ifft3965..ifft4048).
// NOTE(review): the "ifft"/"sfRe"/"sfIm" names suggest an inverse FFT over
// real/imaginary spectral planes -- confirm against the generator.
ptrdiff_t t24 = 0;
// Four groups of four loads; consecutive groups sit 2166784 apart in sfPtr3.
__m512 sfRe233 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm233 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe237 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm237 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe234 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm234 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe238 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm238 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe235 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm235 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe239 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm239 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe236 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm236 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe240 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm240 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
// Only the first Re/Im pair of each stream (233 and 237) is lane-permuted and
// recombined here. Mask 65021 (0xFDFD) excludes lanes 1 and 9, which keep the
// permuted Im value unchanged.
__m512i ifft3873 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft3874 = _mm512_permutexvar_ps(ifft3873, sfRe233);
__m512 ifft3965 = _mm512_permutexvar_ps(ifft3873, sfRe237);
__m512i ifft3875 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft3876 = _mm512_permutexvar_ps(ifft3875, sfRe233);
__m512 ifft3966 = _mm512_permutexvar_ps(ifft3875, sfRe237);
__m512 ifft3877 = _mm512_permutexvar_ps(ifft3873, sfIm233);
__m512 ifft3967 = _mm512_permutexvar_ps(ifft3873, sfIm237);
__m512 ifft3878 = _mm512_permutexvar_ps(ifft3875, sfIm233);
__m512 ifft3968 = _mm512_permutexvar_ps(ifft3875, sfIm237);
__m512 ifft3879 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft3880 = _mm512_mask_fmadd_ps(ifft3878, 65021, ifft3879, ifft3874);
__m512 ifft3969 = _mm512_mask_fmadd_ps(ifft3968, 65021, ifft3879, ifft3965);
__m512 ifft3881 = _mm512_mask_fnmadd_ps(ifft3877, 65021, ifft3879, ifft3876);
__m512 ifft3970 = _mm512_mask_fnmadd_ps(ifft3967, 65021, ifft3879, ifft3966);
// Butterfly between adjacent lanes: shuffle 177 swaps each even/odd lane pair,
// so even lanes receive x[even]+x[odd] and odd lanes x[even]-x[odd].
__m512 ifft3882 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft3883 = _mm512_fmadd_ps(ifft3880, ifft3882, _mm512_shuffle_ps(ifft3880, ifft3880, 177));
__m512 ifft3971 = _mm512_fmadd_ps(ifft3969, ifft3882, _mm512_shuffle_ps(ifft3969, ifft3969, 177));
__m512 ifft3884 = _mm512_fmadd_ps(ifft3881, ifft3882, _mm512_shuffle_ps(ifft3881, ifft3881, 177));
__m512 ifft3972 = _mm512_fmadd_ps(ifft3970, ifft3882, _mm512_shuffle_ps(ifft3970, ifft3970, 177));
__m512 ifft3885 = _mm512_fmadd_ps(sfRe234, ifft3882, _mm512_shuffle_ps(sfRe234, sfRe234, 177));
__m512 ifft3973 = _mm512_fmadd_ps(sfRe238, ifft3882, _mm512_shuffle_ps(sfRe238, sfRe238, 177));
__m512 ifft3886 = _mm512_fmadd_ps(sfIm234, ifft3882, _mm512_shuffle_ps(sfIm234, sfIm234, 177));
__m512 ifft3974 = _mm512_fmadd_ps(sfIm238, ifft3882, _mm512_shuffle_ps(sfIm238, sfIm238, 177));
__m512 ifft3887 = _mm512_fmadd_ps(sfRe235, ifft3882, _mm512_shuffle_ps(sfRe235, sfRe235, 177));
__m512 ifft3975 = _mm512_fmadd_ps(sfRe239, ifft3882, _mm512_shuffle_ps(sfRe239, sfRe239, 177));
__m512 ifft3888 = _mm512_fmadd_ps(sfIm235, ifft3882, _mm512_shuffle_ps(sfIm235, sfIm235, 177));
__m512 ifft3976 = _mm512_fmadd_ps(sfIm239, ifft3882, _mm512_shuffle_ps(sfIm239, sfIm239, 177));
__m512 ifft3889 = _mm512_fmadd_ps(sfRe236, ifft3882, _mm512_shuffle_ps(sfRe236, sfRe236, 177));
__m512 ifft3977 = _mm512_fmadd_ps(sfRe240, ifft3882, _mm512_shuffle_ps(sfRe240, sfRe240, 177));
__m512 ifft3890 = _mm512_fmadd_ps(sfIm236, ifft3882, _mm512_shuffle_ps(sfIm236, sfIm236, 177));
__m512 ifft3978 = _mm512_fmadd_ps(sfIm240, ifft3882, _mm512_shuffle_ps(sfIm240, sfIm240, 177));
// Per-lane rotation of each (a, b) register pair: with c = ifft3891 and
// s = ifft3900, the pair becomes (a*c - b*s, a*s + b*c). The products with c
// are formed first; the s terms are folded in by the fnmadd/fmadd that follow.
__m512 ifft3891 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft3892 = _mm512_mul_ps(ifft3883, ifft3891);
__m512 ifft3979 = _mm512_mul_ps(ifft3971, ifft3891);
__m512 ifft3893 = _mm512_mul_ps(ifft3884, ifft3891);
__m512 ifft3980 = _mm512_mul_ps(ifft3972, ifft3891);
__m512 ifft3894 = _mm512_mul_ps(ifft3885, ifft3891);
__m512 ifft3981 = _mm512_mul_ps(ifft3973, ifft3891);
__m512 ifft3895 = _mm512_mul_ps(ifft3886, ifft3891);
__m512 ifft3982 = _mm512_mul_ps(ifft3974, ifft3891);
__m512 ifft3896 = _mm512_mul_ps(ifft3887, ifft3891);
__m512 ifft3983 = _mm512_mul_ps(ifft3975, ifft3891);
__m512 ifft3897 = _mm512_mul_ps(ifft3888, ifft3891);
__m512 ifft3984 = _mm512_mul_ps(ifft3976, ifft3891);
__m512 ifft3898 = _mm512_mul_ps(ifft3889, ifft3891);
__m512 ifft3985 = _mm512_mul_ps(ifft3977, ifft3891);
__m512 ifft3899 = _mm512_mul_ps(ifft3890, ifft3891);
__m512 ifft3986 = _mm512_mul_ps(ifft3978, ifft3891);
__m512 ifft3900 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft3901 = _mm512_fnmadd_ps(ifft3884, ifft3900, ifft3892);
__m512 ifft3987 = _mm512_fnmadd_ps(ifft3972, ifft3900, ifft3979);
__m512 ifft3902 = _mm512_fmadd_ps(ifft3883, ifft3900, ifft3893);
__m512 ifft3988 = _mm512_fmadd_ps(ifft3971, ifft3900, ifft3980);
__m512 ifft3903 = _mm512_fnmadd_ps(ifft3886, ifft3900, ifft3894);
__m512 ifft3989 = _mm512_fnmadd_ps(ifft3974, ifft3900, ifft3981);
__m512 ifft3904 = _mm512_fmadd_ps(ifft3885, ifft3900, ifft3895);
__m512 ifft3990 = _mm512_fmadd_ps(ifft3973, ifft3900, ifft3982);
__m512 ifft3905 = _mm512_fnmadd_ps(ifft3888, ifft3900, ifft3896);
__m512 ifft3991 = _mm512_fnmadd_ps(ifft3976, ifft3900, ifft3983);
__m512 ifft3906 = _mm512_fmadd_ps(ifft3887, ifft3900, ifft3897);
__m512 ifft3992 = _mm512_fmadd_ps(ifft3975, ifft3900, ifft3984);
__m512 ifft3907 = _mm512_fnmadd_ps(ifft3890, ifft3900, ifft3898);
__m512 ifft3993 = _mm512_fnmadd_ps(ifft3978, ifft3900, ifft3985);
__m512 ifft3908 = _mm512_fmadd_ps(ifft3889, ifft3900, ifft3899);
__m512 ifft3994 = _mm512_fmadd_ps(ifft3977, ifft3900, ifft3986);
// Butterfly at lane distance 2: shuffle 78 swaps the two 64-bit halves of each
// 128-bit group; the lower half receives the sum, the upper half the difference.
__m512 ifft3909 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft3910 = _mm512_fmadd_ps(ifft3901, ifft3909, _mm512_shuffle_ps(ifft3901, ifft3901, 78));
__m512 ifft3995 = _mm512_fmadd_ps(ifft3987, ifft3909, _mm512_shuffle_ps(ifft3987, ifft3987, 78));
__m512 ifft3911 = _mm512_fmadd_ps(ifft3902, ifft3909, _mm512_shuffle_ps(ifft3902, ifft3902, 78));
__m512 ifft3996 = _mm512_fmadd_ps(ifft3988, ifft3909, _mm512_shuffle_ps(ifft3988, ifft3988, 78));
__m512 ifft3912 = _mm512_fmadd_ps(ifft3903, ifft3909, _mm512_shuffle_ps(ifft3903, ifft3903, 78));
__m512 ifft3997 = _mm512_fmadd_ps(ifft3989, ifft3909, _mm512_shuffle_ps(ifft3989, ifft3989, 78));
__m512 ifft3913 = _mm512_fmadd_ps(ifft3904, ifft3909, _mm512_shuffle_ps(ifft3904, ifft3904, 78));
__m512 ifft3998 = _mm512_fmadd_ps(ifft3990, ifft3909, _mm512_shuffle_ps(ifft3990, ifft3990, 78));
__m512 ifft3914 = _mm512_fmadd_ps(ifft3905, ifft3909, _mm512_shuffle_ps(ifft3905, ifft3905, 78));
__m512 ifft3999 = _mm512_fmadd_ps(ifft3991, ifft3909, _mm512_shuffle_ps(ifft3991, ifft3991, 78));
__m512 ifft3915 = _mm512_fmadd_ps(ifft3906, ifft3909, _mm512_shuffle_ps(ifft3906, ifft3906, 78));
__m512 ifft4000 = _mm512_fmadd_ps(ifft3992, ifft3909, _mm512_shuffle_ps(ifft3992, ifft3992, 78));
__m512 ifft3916 = _mm512_fmadd_ps(ifft3907, ifft3909, _mm512_shuffle_ps(ifft3907, ifft3907, 78));
__m512 ifft4001 = _mm512_fmadd_ps(ifft3993, ifft3909, _mm512_shuffle_ps(ifft3993, ifft3993, 78));
__m512 ifft3917 = _mm512_fmadd_ps(ifft3908, ifft3909, _mm512_shuffle_ps(ifft3908, ifft3908, 78));
__m512 ifft4002 = _mm512_fmadd_ps(ifft3994, ifft3909, _mm512_shuffle_ps(ifft3994, ifft3994, 78));
// In lanes 6, 7, 14 and 15 (mask 49344 = 0xC0C0) each register pair (a, b)
// becomes (-b, a); every other lane passes through unchanged.
__m512 ifft3918 = _mm512_mask_sub_ps(ifft3910, 49344, _mm512_setzero_ps(), ifft3911);
__m512 ifft4003 = _mm512_mask_sub_ps(ifft3995, 49344, _mm512_setzero_ps(), ifft3996);
__m512 ifft3919 = _mm512_mask_mov_ps(ifft3911, 49344, ifft3910);
__m512 ifft4004 = _mm512_mask_mov_ps(ifft3996, 49344, ifft3995);
__m512 ifft3920 = _mm512_mask_sub_ps(ifft3912, 49344, _mm512_setzero_ps(), ifft3913);
__m512 ifft4005 = _mm512_mask_sub_ps(ifft3997, 49344, _mm512_setzero_ps(), ifft3998);
__m512 ifft3921 = _mm512_mask_mov_ps(ifft3913, 49344, ifft3912);
__m512 ifft4006 = _mm512_mask_mov_ps(ifft3998, 49344, ifft3997);
__m512 ifft3922 = _mm512_mask_sub_ps(ifft3914, 49344, _mm512_setzero_ps(), ifft3915);
__m512 ifft4007 = _mm512_mask_sub_ps(ifft3999, 49344, _mm512_setzero_ps(), ifft4000);
__m512 ifft3923 = _mm512_mask_mov_ps(ifft3915, 49344, ifft3914);
__m512 ifft4008 = _mm512_mask_mov_ps(ifft4000, 49344, ifft3999);
__m512 ifft3924 = _mm512_mask_sub_ps(ifft3916, 49344, _mm512_setzero_ps(), ifft3917);
__m512 ifft4009 = _mm512_mask_sub_ps(ifft4001, 49344, _mm512_setzero_ps(), ifft4002);
__m512 ifft3925 = _mm512_mask_mov_ps(ifft3917, 49344, ifft3916);
__m512 ifft4010 = _mm512_mask_mov_ps(ifft4002, 49344, ifft4001);
// Butterfly at lane distance 4: shuffle_f32x4 with 177 swaps neighbouring
// 128-bit groups. ifft3932/ifft4016 use fnmsub instead of fmadd, so that one
// result per stream comes out with its sign flipped.
__m512 ifft3926 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft3927 = _mm512_fmadd_ps(ifft3918, ifft3926, _mm512_shuffle_f32x4(ifft3918, ifft3918, 177));
__m512 ifft4011 = _mm512_fmadd_ps(ifft4003, ifft3926, _mm512_shuffle_f32x4(ifft4003, ifft4003, 177));
__m512 ifft3928 = _mm512_fmadd_ps(ifft3919, ifft3926, _mm512_shuffle_f32x4(ifft3919, ifft3919, 177));
__m512 ifft4012 = _mm512_fmadd_ps(ifft4004, ifft3926, _mm512_shuffle_f32x4(ifft4004, ifft4004, 177));
__m512 ifft3929 = _mm512_fmadd_ps(ifft3920, ifft3926, _mm512_shuffle_f32x4(ifft3920, ifft3920, 177));
__m512 ifft4013 = _mm512_fmadd_ps(ifft4005, ifft3926, _mm512_shuffle_f32x4(ifft4005, ifft4005, 177));
__m512 ifft3930 = _mm512_fmadd_ps(ifft3921, ifft3926, _mm512_shuffle_f32x4(ifft3921, ifft3921, 177));
__m512 ifft4014 = _mm512_fmadd_ps(ifft4006, ifft3926, _mm512_shuffle_f32x4(ifft4006, ifft4006, 177));
__m512 ifft3931 = _mm512_fmadd_ps(ifft3922, ifft3926, _mm512_shuffle_f32x4(ifft3922, ifft3922, 177));
__m512 ifft4015 = _mm512_fmadd_ps(ifft4007, ifft3926, _mm512_shuffle_f32x4(ifft4007, ifft4007, 177));
__m512 ifft3932 = _mm512_fnmsub_ps(ifft3923, ifft3926, _mm512_shuffle_f32x4(ifft3923, ifft3923, 177));
__m512 ifft4016 = _mm512_fnmsub_ps(ifft4008, ifft3926, _mm512_shuffle_f32x4(ifft4008, ifft4008, 177));
__m512 ifft3933 = _mm512_fmadd_ps(ifft3924, ifft3926, _mm512_shuffle_f32x4(ifft3924, ifft3924, 177));
__m512 ifft4017 = _mm512_fmadd_ps(ifft4009, ifft3926, _mm512_shuffle_f32x4(ifft4009, ifft4009, 177));
__m512 ifft3934 = _mm512_fmadd_ps(ifft3925, ifft3926, _mm512_shuffle_f32x4(ifft3925, ifft3925, 177));
__m512 ifft4018 = _mm512_fmadd_ps(ifft4010, ifft3926, _mm512_shuffle_f32x4(ifft4010, ifft4010, 177));
// From here on the combination is across whole registers rather than across
// lanes. The constant scale factors are folded in as well:
// 1.5625e-02 = 1/64 and 3.125e-02 = 1/32.
__m512 ifft3935 = _mm512_add_ps(ifft3927, ifft3928);
__m512 ifft4019 = _mm512_add_ps(ifft4011, ifft4012);
__m512 ifft3936 = _mm512_sub_ps(ifft3927, ifft3928);
__m512 ifft4020 = _mm512_sub_ps(ifft4011, ifft4012);
__m512 ifft3937 = _mm512_sub_ps(ifft3929, ifft3933);
__m512 ifft4021 = _mm512_sub_ps(ifft4013, ifft4017);
__m512 ifft3938 = _mm512_add_ps(ifft3930, ifft3934);
__m512 ifft4022 = _mm512_add_ps(ifft4014, ifft4018);
__m512 ifft3939 = _mm512_add_ps(ifft3929, ifft3933);
__m512 ifft4023 = _mm512_add_ps(ifft4013, ifft4017);
__m512 ifft3940 = _mm512_sub_ps(ifft3930, ifft3934);
__m512 ifft4024 = _mm512_sub_ps(ifft4014, ifft4018);
__m512 ifft3941 = _mm512_mul_ps(ifft3931, _mm512_set1_ps(3.125e-02f));
__m512 ifft4025 = _mm512_mul_ps(ifft4015, _mm512_set1_ps(3.125e-02f));
__m512 ifft3942 = _mm512_mul_ps(ifft3932, _mm512_set1_ps(3.125e-02f));
__m512 ifft4026 = _mm512_mul_ps(ifft4016, _mm512_set1_ps(3.125e-02f));
__m512 ifft3943 = _mm512_fmadd_ps(ifft3935, _mm512_set1_ps(1.5625e-02f), ifft3941);
__m512 ifft4027 = _mm512_fmadd_ps(ifft4019, _mm512_set1_ps(1.5625e-02f), ifft4025);
__m512 ifft3944 = _mm512_fmsub_ps(ifft3935, _mm512_set1_ps(1.5625e-02f), ifft3941);
__m512 ifft4028 = _mm512_fmsub_ps(ifft4019, _mm512_set1_ps(1.5625e-02f), ifft4025);
__m512 ifft3945 = _mm512_fmadd_ps(ifft3936, _mm512_set1_ps(1.5625e-02f), ifft3942);
__m512 ifft4029 = _mm512_fmadd_ps(ifft4020, _mm512_set1_ps(1.5625e-02f), ifft4026);
__m512 ifft3946 = _mm512_fmsub_ps(ifft3936, _mm512_set1_ps(1.5625e-02f), ifft3942);
__m512 ifft4030 = _mm512_fmsub_ps(ifft4020, _mm512_set1_ps(1.5625e-02f), ifft4026);
__m512 ifft3947 = _mm512_add_ps(ifft3937, ifft3938);
__m512 ifft4031 = _mm512_add_ps(ifft4021, ifft4022);
__m512 ifft3948 = _mm512_sub_ps(ifft3937, ifft3938);
__m512 ifft4032 = _mm512_sub_ps(ifft4021, ifft4022);
__m512 ifft3949 = _mm512_fnmadd_ps(ifft3947, _mm512_set1_ps(7.0710677e-01f), ifft3939);
__m512 ifft4033 = _mm512_fnmadd_ps(ifft4031, _mm512_set1_ps(7.0710677e-01f), ifft4023);
__m512 ifft3950 = _mm512_fmadd_ps(ifft3947, _mm512_set1_ps(7.0710677e-01f), ifft3939);
__m512 ifft4034 = _mm512_fmadd_ps(ifft4031, _mm512_set1_ps(7.0710677e-01f), ifft4023);
__m512 ifft3951 = _mm512_fmadd_ps(ifft3948, _mm512_set1_ps(7.0710677e-01f), ifft3940);
__m512 ifft4035 = _mm512_fmadd_ps(ifft4032, _mm512_set1_ps(7.0710677e-01f), ifft4024);
__m512 ifft3952 = _mm512_fmsub_ps(ifft3948, _mm512_set1_ps(7.0710677e-01f), ifft3940);
__m512 ifft4036 = _mm512_fmsub_ps(ifft4032, _mm512_set1_ps(7.0710677e-01f), ifft4024);
__m512 ifft3953 = _mm512_add_ps(ifft3949, ifft3950);
__m512 ifft4037 = _mm512_add_ps(ifft4033, ifft4034);
__m512 ifft3954 = _mm512_sub_ps(ifft3949, ifft3950);
__m512 ifft4038 = _mm512_sub_ps(ifft4033, ifft4034);
__m512 ifft3955 = _mm512_add_ps(ifft3951, ifft3952);
__m512 ifft4039 = _mm512_add_ps(ifft4035, ifft4036);
__m512 ifft3956 = _mm512_sub_ps(ifft3951, ifft3952);
__m512 ifft4040 = _mm512_sub_ps(ifft4035, ifft4036);
__m512 ifft3957 = _mm512_fmadd_ps(ifft3953, _mm512_set1_ps(1.5625e-02f), ifft3943);
__m512 ifft4041 = _mm512_fmadd_ps(ifft4037, _mm512_set1_ps(1.5625e-02f), ifft4027);
__m512 ifft3958 = _mm512_fnmadd_ps(ifft3953, _mm512_set1_ps(1.5625e-02f), ifft3943);
__m512 ifft4042 = _mm512_fnmadd_ps(ifft4037, _mm512_set1_ps(1.5625e-02f), ifft4027);
__m512 ifft3959 = _mm512_fmadd_ps(ifft3955, _mm512_set1_ps(1.5625e-02f), ifft3945);
__m512 ifft4043 = _mm512_fmadd_ps(ifft4039, _mm512_set1_ps(1.5625e-02f), ifft4029);
__m512 ifft3960 = _mm512_fnmadd_ps(ifft3955, _mm512_set1_ps(1.5625e-02f), ifft3945);
__m512 ifft4044 = _mm512_fnmadd_ps(ifft4039, _mm512_set1_ps(1.5625e-02f), ifft4029);
__m512 ifft3961 = _mm512_fnmadd_ps(ifft3956, _mm512_set1_ps(1.5625e-02f), ifft3944);
__m512 ifft4045 = _mm512_fnmadd_ps(ifft4040, _mm512_set1_ps(1.5625e-02f), ifft4028);
__m512 ifft3962 = _mm512_fmadd_ps(ifft3956, _mm512_set1_ps(1.5625e-02f), ifft3944);
__m512 ifft4046 = _mm512_fmadd_ps(ifft4040, _mm512_set1_ps(1.5625e-02f), ifft4028);
__m512 ifft3963 = _mm512_fmadd_ps(ifft3954, _mm512_set1_ps(1.5625e-02f), ifft3946);
__m512 ifft4047 = _mm512_fmadd_ps(ifft4038, _mm512_set1_ps(1.5625e-02f), ifft4030);
__m512 ifft3964 = _mm512_fnmadd_ps(ifft3954, _mm512_set1_ps(1.5625e-02f), ifft3946);
__m512 ifft4048 = _mm512_fnmadd_ps(ifft4038, _mm512_set1_ps(1.5625e-02f), ifft4030);
// Five of the eight results per stream are kept (dat810..814 for stream A,
// dat815..819 for stream B).
__m512 dat810 = ifft3957;
__m512 dat815 = ifft4041;
__m512 dat811 = ifft3959;
__m512 dat816 = ifft4043;
__m512 dat812 = ifft3961;
__m512 dat817 = ifft4045;
__m512 dat813 = ifft3963;
__m512 dat818 = ifft4047;
__m512 dat814 = ifft3958;
__m512 dat819 = ifft4042;
// The remaining three results per stream are never read; the (void) casts
// just mark them as intentionally unused.
(void)ifft3960;
(void)ifft4044;
(void)ifft3962;
(void)ifft4046;
(void)ifft3964;
(void)ifft4048;
// Merge the two streams. In permutex2var indices, values 0..15 pick from the
// first data operand and 16..31 from the second. pm39 yields lanes 0..4 of
// stream A followed by lanes 0..4 of stream B; pm40 yields lanes 8..12 of
// stream B followed by lanes 8..12 of stream A. Lanes 10..15 of both patterns
// are filler that the store mask below never writes.
__m512i pm39 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack191 = _mm512_permutex2var_ps(dat810, pm39, dat815);
__m512i pm40 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack192 = _mm512_permutex2var_ps(dat810, pm40, dat815);
__m512 pack193 = _mm512_permutex2var_ps(dat811, pm39, dat816);
__m512 pack194 = _mm512_permutex2var_ps(dat811, pm40, dat816);
__m512 pack195 = _mm512_permutex2var_ps(dat812, pm39, dat817);
__m512 pack196 = _mm512_permutex2var_ps(dat812, pm40, dat817);
__m512 pack197 = _mm512_permutex2var_ps(dat813, pm39, dat818);
__m512 pack198 = _mm512_permutex2var_ps(dat813, pm40, dat818);
__m512 pack199 = _mm512_permutex2var_ps(dat814, pm39, dat819);
__m512 pack200 = _mm512_permutex2var_ps(dat814, pm40, dat819);
// Clamp at zero, i.e. max(0, x) per lane.
pack191 = _mm512_max_ps(_mm512_setzero_ps(), pack191);
pack192 = _mm512_max_ps(_mm512_setzero_ps(), pack192);
pack193 = _mm512_max_ps(_mm512_setzero_ps(), pack193);
pack194 = _mm512_max_ps(_mm512_setzero_ps(), pack194);
pack195 = _mm512_max_ps(_mm512_setzero_ps(), pack195);
pack196 = _mm512_max_ps(_mm512_setzero_ps(), pack196);
pack197 = _mm512_max_ps(_mm512_setzero_ps(), pack197);
pack198 = _mm512_max_ps(_mm512_setzero_ps(), pack198);
pack199 = _mm512_max_ps(_mm512_setzero_ps(), pack199);
pack200 = _mm512_max_ps(_mm512_setzero_ps(), pack200);
// Mask 127 (0x7F) stores only lanes 0..6, so each of the ten rows receives
// seven floats. Rows from the pm39 and pm40 packs alternate; each pm40 row is
// written at an offset 50240 above its pm39 partner.
_mm512_mask_storeu_ps(datPtr2+80+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack191);
_mm512_mask_storeu_ps(datPtr2+50320+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack192);
_mm512_mask_storeu_ps(datPtr2+528+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack193);
_mm512_mask_storeu_ps(datPtr2+50768+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack194);
_mm512_mask_storeu_ps(datPtr2+976+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack195);
_mm512_mask_storeu_ps(datPtr2+51216+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack196);
_mm512_mask_storeu_ps(datPtr2+1424+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack197);
_mm512_mask_storeu_ps(datPtr2+51664+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack198);
_mm512_mask_storeu_ps(datPtr2+1872+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack199);
_mm512_mask_storeu_ps(datPtr2+52112+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack200);
}
}
if (j5 >= last2) return;
++j5;
rel5 = 19;
}
if (rel5 < 20) {
ptrdiff_t toH14 = base5+25;
ptrdiff_t toW14 = 0;
ptrdiff_t k38 = 16*w21;
for (; k38 != 16; ++k38) {
ptrdiff_t r15 = 0;
for (; r15 != 2; ++r15) {
ptrdiff_t t25 = 0;
__m512 sfRe241 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm241 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe245 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm245 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe242 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm242 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe246 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm246 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe243 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm243 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe247 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm247 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe244 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm244 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe248 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm248 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512i ifft4049 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft4050 = _mm512_permutexvar_ps(ifft4049, sfRe241);
__m512 ifft4141 = _mm512_permutexvar_ps(ifft4049, sfRe245);
__m512i ifft4051 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4052 = _mm512_permutexvar_ps(ifft4051, sfRe241);
__m512 ifft4142 = _mm512_permutexvar_ps(ifft4051, sfRe245);
__m512 ifft4053 = _mm512_permutexvar_ps(ifft4049, sfIm241);
__m512 ifft4143 = _mm512_permutexvar_ps(ifft4049, sfIm245);
__m512 ifft4054 = _mm512_permutexvar_ps(ifft4051, sfIm241);
__m512 ifft4144 = _mm512_permutexvar_ps(ifft4051, sfIm245);
__m512 ifft4055 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft4056 = _mm512_mask_fmadd_ps(ifft4054, 65021, ifft4055, ifft4050);
__m512 ifft4145 = _mm512_mask_fmadd_ps(ifft4144, 65021, ifft4055, ifft4141);
__m512 ifft4057 = _mm512_mask_fnmadd_ps(ifft4053, 65021, ifft4055, ifft4052);
__m512 ifft4146 = _mm512_mask_fnmadd_ps(ifft4143, 65021, ifft4055, ifft4142);
__m512 ifft4058 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft4059 = _mm512_fmadd_ps(ifft4056, ifft4058, _mm512_shuffle_ps(ifft4056, ifft4056, 177));
__m512 ifft4147 = _mm512_fmadd_ps(ifft4145, ifft4058, _mm512_shuffle_ps(ifft4145, ifft4145, 177));
__m512 ifft4060 = _mm512_fmadd_ps(ifft4057, ifft4058, _mm512_shuffle_ps(ifft4057, ifft4057, 177));
__m512 ifft4148 = _mm512_fmadd_ps(ifft4146, ifft4058, _mm512_shuffle_ps(ifft4146, ifft4146, 177));
__m512 ifft4061 = _mm512_fmadd_ps(sfRe242, ifft4058, _mm512_shuffle_ps(sfRe242, sfRe242, 177));
__m512 ifft4149 = _mm512_fmadd_ps(sfRe246, ifft4058, _mm512_shuffle_ps(sfRe246, sfRe246, 177));
__m512 ifft4062 = _mm512_fmadd_ps(sfIm242, ifft4058, _mm512_shuffle_ps(sfIm242, sfIm242, 177));
__m512 ifft4150 = _mm512_fmadd_ps(sfIm246, ifft4058, _mm512_shuffle_ps(sfIm246, sfIm246, 177));
__m512 ifft4063 = _mm512_fmadd_ps(sfRe243, ifft4058, _mm512_shuffle_ps(sfRe243, sfRe243, 177));
__m512 ifft4151 = _mm512_fmadd_ps(sfRe247, ifft4058, _mm512_shuffle_ps(sfRe247, sfRe247, 177));
__m512 ifft4064 = _mm512_fmadd_ps(sfIm243, ifft4058, _mm512_shuffle_ps(sfIm243, sfIm243, 177));
__m512 ifft4152 = _mm512_fmadd_ps(sfIm247, ifft4058, _mm512_shuffle_ps(sfIm247, sfIm247, 177));
__m512 ifft4065 = _mm512_fmadd_ps(sfRe244, ifft4058, _mm512_shuffle_ps(sfRe244, sfRe244, 177));
__m512 ifft4153 = _mm512_fmadd_ps(sfRe248, ifft4058, _mm512_shuffle_ps(sfRe248, sfRe248, 177));
__m512 ifft4066 = _mm512_fmadd_ps(sfIm244, ifft4058, _mm512_shuffle_ps(sfIm244, sfIm244, 177));
__m512 ifft4154 = _mm512_fmadd_ps(sfIm248, ifft4058, _mm512_shuffle_ps(sfIm248, sfIm248, 177));
__m512 ifft4067 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft4068 = _mm512_mul_ps(ifft4059, ifft4067);
__m512 ifft4155 = _mm512_mul_ps(ifft4147, ifft4067);
__m512 ifft4069 = _mm512_mul_ps(ifft4060, ifft4067);
__m512 ifft4156 = _mm512_mul_ps(ifft4148, ifft4067);
__m512 ifft4070 = _mm512_mul_ps(ifft4061, ifft4067);
__m512 ifft4157 = _mm512_mul_ps(ifft4149, ifft4067);
__m512 ifft4071 = _mm512_mul_ps(ifft4062, ifft4067);
__m512 ifft4158 = _mm512_mul_ps(ifft4150, ifft4067);
__m512 ifft4072 = _mm512_mul_ps(ifft4063, ifft4067);
__m512 ifft4159 = _mm512_mul_ps(ifft4151, ifft4067);
__m512 ifft4073 = _mm512_mul_ps(ifft4064, ifft4067);
__m512 ifft4160 = _mm512_mul_ps(ifft4152, ifft4067);
__m512 ifft4074 = _mm512_mul_ps(ifft4065, ifft4067);
__m512 ifft4161 = _mm512_mul_ps(ifft4153, ifft4067);
__m512 ifft4075 = _mm512_mul_ps(ifft4066, ifft4067);
__m512 ifft4162 = _mm512_mul_ps(ifft4154, ifft4067);
__m512 ifft4076 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft4077 = _mm512_fnmadd_ps(ifft4060, ifft4076, ifft4068);
__m512 ifft4163 = _mm512_fnmadd_ps(ifft4148, ifft4076, ifft4155);
__m512 ifft4078 = _mm512_fmadd_ps(ifft4059, ifft4076, ifft4069);
__m512 ifft4164 = _mm512_fmadd_ps(ifft4147, ifft4076, ifft4156);
__m512 ifft4079 = _mm512_fnmadd_ps(ifft4062, ifft4076, ifft4070);
__m512 ifft4165 = _mm512_fnmadd_ps(ifft4150, ifft4076, ifft4157);
__m512 ifft4080 = _mm512_fmadd_ps(ifft4061, ifft4076, ifft4071);
__m512 ifft4166 = _mm512_fmadd_ps(ifft4149, ifft4076, ifft4158);
__m512 ifft4081 = _mm512_fnmadd_ps(ifft4064, ifft4076, ifft4072);
__m512 ifft4167 = _mm512_fnmadd_ps(ifft4152, ifft4076, ifft4159);
__m512 ifft4082 = _mm512_fmadd_ps(ifft4063, ifft4076, ifft4073);
__m512 ifft4168 = _mm512_fmadd_ps(ifft4151, ifft4076, ifft4160);
__m512 ifft4083 = _mm512_fnmadd_ps(ifft4066, ifft4076, ifft4074);
__m512 ifft4169 = _mm512_fnmadd_ps(ifft4154, ifft4076, ifft4161);
__m512 ifft4084 = _mm512_fmadd_ps(ifft4065, ifft4076, ifft4075);
__m512 ifft4170 = _mm512_fmadd_ps(ifft4153, ifft4076, ifft4162);
__m512 ifft4085 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft4086 = _mm512_fmadd_ps(ifft4077, ifft4085, _mm512_shuffle_ps(ifft4077, ifft4077, 78));
__m512 ifft4171 = _mm512_fmadd_ps(ifft4163, ifft4085, _mm512_shuffle_ps(ifft4163, ifft4163, 78));
__m512 ifft4087 = _mm512_fmadd_ps(ifft4078, ifft4085, _mm512_shuffle_ps(ifft4078, ifft4078, 78));
__m512 ifft4172 = _mm512_fmadd_ps(ifft4164, ifft4085, _mm512_shuffle_ps(ifft4164, ifft4164, 78));
__m512 ifft4088 = _mm512_fmadd_ps(ifft4079, ifft4085, _mm512_shuffle_ps(ifft4079, ifft4079, 78));
__m512 ifft4173 = _mm512_fmadd_ps(ifft4165, ifft4085, _mm512_shuffle_ps(ifft4165, ifft4165, 78));
__m512 ifft4089 = _mm512_fmadd_ps(ifft4080, ifft4085, _mm512_shuffle_ps(ifft4080, ifft4080, 78));
__m512 ifft4174 = _mm512_fmadd_ps(ifft4166, ifft4085, _mm512_shuffle_ps(ifft4166, ifft4166, 78));
__m512 ifft4090 = _mm512_fmadd_ps(ifft4081, ifft4085, _mm512_shuffle_ps(ifft4081, ifft4081, 78));
__m512 ifft4175 = _mm512_fmadd_ps(ifft4167, ifft4085, _mm512_shuffle_ps(ifft4167, ifft4167, 78));
__m512 ifft4091 = _mm512_fmadd_ps(ifft4082, ifft4085, _mm512_shuffle_ps(ifft4082, ifft4082, 78));
__m512 ifft4176 = _mm512_fmadd_ps(ifft4168, ifft4085, _mm512_shuffle_ps(ifft4168, ifft4168, 78));
__m512 ifft4092 = _mm512_fmadd_ps(ifft4083, ifft4085, _mm512_shuffle_ps(ifft4083, ifft4083, 78));
__m512 ifft4177 = _mm512_fmadd_ps(ifft4169, ifft4085, _mm512_shuffle_ps(ifft4169, ifft4169, 78));
__m512 ifft4093 = _mm512_fmadd_ps(ifft4084, ifft4085, _mm512_shuffle_ps(ifft4084, ifft4084, 78));
__m512 ifft4178 = _mm512_fmadd_ps(ifft4170, ifft4085, _mm512_shuffle_ps(ifft4170, ifft4170, 78));
__m512 ifft4094 = _mm512_mask_sub_ps(ifft4086, 49344, _mm512_setzero_ps(), ifft4087);
__m512 ifft4179 = _mm512_mask_sub_ps(ifft4171, 49344, _mm512_setzero_ps(), ifft4172);
__m512 ifft4095 = _mm512_mask_mov_ps(ifft4087, 49344, ifft4086);
__m512 ifft4180 = _mm512_mask_mov_ps(ifft4172, 49344, ifft4171);
__m512 ifft4096 = _mm512_mask_sub_ps(ifft4088, 49344, _mm512_setzero_ps(), ifft4089);
__m512 ifft4181 = _mm512_mask_sub_ps(ifft4173, 49344, _mm512_setzero_ps(), ifft4174);
__m512 ifft4097 = _mm512_mask_mov_ps(ifft4089, 49344, ifft4088);
__m512 ifft4182 = _mm512_mask_mov_ps(ifft4174, 49344, ifft4173);
__m512 ifft4098 = _mm512_mask_sub_ps(ifft4090, 49344, _mm512_setzero_ps(), ifft4091);
__m512 ifft4183 = _mm512_mask_sub_ps(ifft4175, 49344, _mm512_setzero_ps(), ifft4176);
__m512 ifft4099 = _mm512_mask_mov_ps(ifft4091, 49344, ifft4090);
__m512 ifft4184 = _mm512_mask_mov_ps(ifft4176, 49344, ifft4175);
__m512 ifft4100 = _mm512_mask_sub_ps(ifft4092, 49344, _mm512_setzero_ps(), ifft4093);
__m512 ifft4185 = _mm512_mask_sub_ps(ifft4177, 49344, _mm512_setzero_ps(), ifft4178);
__m512 ifft4101 = _mm512_mask_mov_ps(ifft4093, 49344, ifft4092);
__m512 ifft4186 = _mm512_mask_mov_ps(ifft4178, 49344, ifft4177);
__m512 ifft4102 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft4103 = _mm512_fmadd_ps(ifft4094, ifft4102, _mm512_shuffle_f32x4(ifft4094, ifft4094, 177));
__m512 ifft4187 = _mm512_fmadd_ps(ifft4179, ifft4102, _mm512_shuffle_f32x4(ifft4179, ifft4179, 177));
__m512 ifft4104 = _mm512_fmadd_ps(ifft4095, ifft4102, _mm512_shuffle_f32x4(ifft4095, ifft4095, 177));
__m512 ifft4188 = _mm512_fmadd_ps(ifft4180, ifft4102, _mm512_shuffle_f32x4(ifft4180, ifft4180, 177));
__m512 ifft4105 = _mm512_fmadd_ps(ifft4096, ifft4102, _mm512_shuffle_f32x4(ifft4096, ifft4096, 177));
__m512 ifft4189 = _mm512_fmadd_ps(ifft4181, ifft4102, _mm512_shuffle_f32x4(ifft4181, ifft4181, 177));
__m512 ifft4106 = _mm512_fmadd_ps(ifft4097, ifft4102, _mm512_shuffle_f32x4(ifft4097, ifft4097, 177));
__m512 ifft4190 = _mm512_fmadd_ps(ifft4182, ifft4102, _mm512_shuffle_f32x4(ifft4182, ifft4182, 177));
__m512 ifft4107 = _mm512_fmadd_ps(ifft4098, ifft4102, _mm512_shuffle_f32x4(ifft4098, ifft4098, 177));
__m512 ifft4191 = _mm512_fmadd_ps(ifft4183, ifft4102, _mm512_shuffle_f32x4(ifft4183, ifft4183, 177));
__m512 ifft4108 = _mm512_fnmsub_ps(ifft4099, ifft4102, _mm512_shuffle_f32x4(ifft4099, ifft4099, 177));
__m512 ifft4192 = _mm512_fnmsub_ps(ifft4184, ifft4102, _mm512_shuffle_f32x4(ifft4184, ifft4184, 177));
__m512 ifft4109 = _mm512_fmadd_ps(ifft4100, ifft4102, _mm512_shuffle_f32x4(ifft4100, ifft4100, 177));
__m512 ifft4193 = _mm512_fmadd_ps(ifft4185, ifft4102, _mm512_shuffle_f32x4(ifft4185, ifft4185, 177));
__m512 ifft4110 = _mm512_fmadd_ps(ifft4101, ifft4102, _mm512_shuffle_f32x4(ifft4101, ifft4101, 177));
__m512 ifft4194 = _mm512_fmadd_ps(ifft4186, ifft4102, _mm512_shuffle_f32x4(ifft4186, ifft4186, 177));
__m512 ifft4111 = _mm512_add_ps(ifft4103, ifft4104);
__m512 ifft4195 = _mm512_add_ps(ifft4187, ifft4188);
__m512 ifft4112 = _mm512_sub_ps(ifft4103, ifft4104);
__m512 ifft4196 = _mm512_sub_ps(ifft4187, ifft4188);
__m512 ifft4113 = _mm512_sub_ps(ifft4105, ifft4109);
__m512 ifft4197 = _mm512_sub_ps(ifft4189, ifft4193);
__m512 ifft4114 = _mm512_add_ps(ifft4106, ifft4110);
__m512 ifft4198 = _mm512_add_ps(ifft4190, ifft4194);
__m512 ifft4115 = _mm512_add_ps(ifft4105, ifft4109);
__m512 ifft4199 = _mm512_add_ps(ifft4189, ifft4193);
__m512 ifft4116 = _mm512_sub_ps(ifft4106, ifft4110);
__m512 ifft4200 = _mm512_sub_ps(ifft4190, ifft4194);
__m512 ifft4117 = _mm512_mul_ps(ifft4107, _mm512_set1_ps(3.125e-02f));
__m512 ifft4201 = _mm512_mul_ps(ifft4191, _mm512_set1_ps(3.125e-02f));
__m512 ifft4118 = _mm512_mul_ps(ifft4108, _mm512_set1_ps(3.125e-02f));
__m512 ifft4202 = _mm512_mul_ps(ifft4192, _mm512_set1_ps(3.125e-02f));
__m512 ifft4119 = _mm512_fmadd_ps(ifft4111, _mm512_set1_ps(1.5625e-02f), ifft4117);
__m512 ifft4203 = _mm512_fmadd_ps(ifft4195, _mm512_set1_ps(1.5625e-02f), ifft4201);
__m512 ifft4120 = _mm512_fmsub_ps(ifft4111, _mm512_set1_ps(1.5625e-02f), ifft4117);
__m512 ifft4204 = _mm512_fmsub_ps(ifft4195, _mm512_set1_ps(1.5625e-02f), ifft4201);
__m512 ifft4121 = _mm512_fmadd_ps(ifft4112, _mm512_set1_ps(1.5625e-02f), ifft4118);
__m512 ifft4205 = _mm512_fmadd_ps(ifft4196, _mm512_set1_ps(1.5625e-02f), ifft4202);
__m512 ifft4122 = _mm512_fmsub_ps(ifft4112, _mm512_set1_ps(1.5625e-02f), ifft4118);
__m512 ifft4206 = _mm512_fmsub_ps(ifft4196, _mm512_set1_ps(1.5625e-02f), ifft4202);
__m512 ifft4123 = _mm512_add_ps(ifft4113, ifft4114);
__m512 ifft4207 = _mm512_add_ps(ifft4197, ifft4198);
__m512 ifft4124 = _mm512_sub_ps(ifft4113, ifft4114);
__m512 ifft4208 = _mm512_sub_ps(ifft4197, ifft4198);
__m512 ifft4125 = _mm512_fnmadd_ps(ifft4123, _mm512_set1_ps(7.0710677e-01f), ifft4115);
__m512 ifft4209 = _mm512_fnmadd_ps(ifft4207, _mm512_set1_ps(7.0710677e-01f), ifft4199);
__m512 ifft4126 = _mm512_fmadd_ps(ifft4123, _mm512_set1_ps(7.0710677e-01f), ifft4115);
__m512 ifft4210 = _mm512_fmadd_ps(ifft4207, _mm512_set1_ps(7.0710677e-01f), ifft4199);
__m512 ifft4127 = _mm512_fmadd_ps(ifft4124, _mm512_set1_ps(7.0710677e-01f), ifft4116);
__m512 ifft4211 = _mm512_fmadd_ps(ifft4208, _mm512_set1_ps(7.0710677e-01f), ifft4200);
__m512 ifft4128 = _mm512_fmsub_ps(ifft4124, _mm512_set1_ps(7.0710677e-01f), ifft4116);
__m512 ifft4212 = _mm512_fmsub_ps(ifft4208, _mm512_set1_ps(7.0710677e-01f), ifft4200);
__m512 ifft4129 = _mm512_add_ps(ifft4125, ifft4126);
__m512 ifft4213 = _mm512_add_ps(ifft4209, ifft4210);
__m512 ifft4130 = _mm512_sub_ps(ifft4125, ifft4126);
__m512 ifft4214 = _mm512_sub_ps(ifft4209, ifft4210);
__m512 ifft4131 = _mm512_add_ps(ifft4127, ifft4128);
__m512 ifft4215 = _mm512_add_ps(ifft4211, ifft4212);
__m512 ifft4132 = _mm512_sub_ps(ifft4127, ifft4128);
__m512 ifft4216 = _mm512_sub_ps(ifft4211, ifft4212);
__m512 ifft4133 = _mm512_fmadd_ps(ifft4129, _mm512_set1_ps(1.5625e-02f), ifft4119);
__m512 ifft4217 = _mm512_fmadd_ps(ifft4213, _mm512_set1_ps(1.5625e-02f), ifft4203);
__m512 ifft4134 = _mm512_fnmadd_ps(ifft4129, _mm512_set1_ps(1.5625e-02f), ifft4119);
__m512 ifft4218 = _mm512_fnmadd_ps(ifft4213, _mm512_set1_ps(1.5625e-02f), ifft4203);
__m512 ifft4135 = _mm512_fmadd_ps(ifft4131, _mm512_set1_ps(1.5625e-02f), ifft4121);
__m512 ifft4219 = _mm512_fmadd_ps(ifft4215, _mm512_set1_ps(1.5625e-02f), ifft4205);
__m512 ifft4136 = _mm512_fnmadd_ps(ifft4131, _mm512_set1_ps(1.5625e-02f), ifft4121);
__m512 ifft4220 = _mm512_fnmadd_ps(ifft4215, _mm512_set1_ps(1.5625e-02f), ifft4205);
__m512 ifft4137 = _mm512_fnmadd_ps(ifft4132, _mm512_set1_ps(1.5625e-02f), ifft4120);
__m512 ifft4221 = _mm512_fnmadd_ps(ifft4216, _mm512_set1_ps(1.5625e-02f), ifft4204);
__m512 ifft4138 = _mm512_fmadd_ps(ifft4132, _mm512_set1_ps(1.5625e-02f), ifft4120);
__m512 ifft4222 = _mm512_fmadd_ps(ifft4216, _mm512_set1_ps(1.5625e-02f), ifft4204);
__m512 ifft4139 = _mm512_fmadd_ps(ifft4130, _mm512_set1_ps(1.5625e-02f), ifft4122);
__m512 ifft4223 = _mm512_fmadd_ps(ifft4214, _mm512_set1_ps(1.5625e-02f), ifft4206);
__m512 ifft4140 = _mm512_fnmadd_ps(ifft4130, _mm512_set1_ps(1.5625e-02f), ifft4122);
__m512 ifft4224 = _mm512_fnmadd_ps(ifft4214, _mm512_set1_ps(1.5625e-02f), ifft4206);
__m512 dat820 = ifft4133;
__m512 dat825 = ifft4217;
__m512 dat821 = ifft4135;
__m512 dat826 = ifft4219;
__m512 dat822 = ifft4137;
__m512 dat827 = ifft4221;
__m512 dat823 = ifft4139;
__m512 dat828 = ifft4223;
__m512 dat824 = ifft4134;
__m512 dat829 = ifft4218;
(void)ifft4136;
(void)ifft4220;
(void)ifft4138;
(void)ifft4222;
(void)ifft4140;
(void)ifft4224;
__m512i pm41 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack201 = _mm512_permutex2var_ps(dat820, pm41, dat825);
__m512i pm42 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack202 = _mm512_permutex2var_ps(dat820, pm42, dat825);
__m512 pack203 = _mm512_permutex2var_ps(dat821, pm41, dat826);
__m512 pack204 = _mm512_permutex2var_ps(dat821, pm42, dat826);
__m512 pack205 = _mm512_permutex2var_ps(dat822, pm41, dat827);
__m512 pack206 = _mm512_permutex2var_ps(dat822, pm42, dat827);
__m512 pack207 = _mm512_permutex2var_ps(dat823, pm41, dat828);
__m512 pack208 = _mm512_permutex2var_ps(dat823, pm42, dat828);
__m512 pack209 = _mm512_permutex2var_ps(dat824, pm41, dat829);
__m512 pack210 = _mm512_permutex2var_ps(dat824, pm42, dat829);
pack201 = _mm512_max_ps(_mm512_setzero_ps(), pack201);
pack202 = _mm512_max_ps(_mm512_setzero_ps(), pack202);
pack203 = _mm512_max_ps(_mm512_setzero_ps(), pack203);
pack204 = _mm512_max_ps(_mm512_setzero_ps(), pack204);
pack205 = _mm512_max_ps(_mm512_setzero_ps(), pack205);
pack206 = _mm512_max_ps(_mm512_setzero_ps(), pack206);
pack207 = _mm512_max_ps(_mm512_setzero_ps(), pack207);
pack208 = _mm512_max_ps(_mm512_setzero_ps(), pack208);
pack209 = _mm512_max_ps(_mm512_setzero_ps(), pack209);
pack210 = _mm512_max_ps(_mm512_setzero_ps(), pack210);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack201);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack202);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack203);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack204);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack205);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack206);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack207);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack208);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack209);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack210);
// Two-iteration stage (t26 = 0, 1): each pass loads sixteen 512-bit vectors
// from sfPtr3, runs them through the "ifft"-named butterfly pipeline below,
// clamps the results at zero and writes ten lanes per vector to datPtr2.
// Two independent chains are interleaved statement by statement: one fed by
// sf*249..252 (ifft4226..ifft4316) and one fed by sf*253..256
// (ifft4317..ifft4400); both share the same constant vectors.
// NOTE(review): the variable names suggest an inverse FFT of a spectral
// tile -- confirm against the generator; only the arithmetic is visible here.
ptrdiff_t t26 = 0;
for (; t26 < 2; ++t26) {
// Loads: four groups spaced 2166784 apart in the sfPtr3 offset, each group
// holding Re/Im for the first chain at +256/+320 and for the second chain at
// +384/+448; successive t26 iterations advance by 256.
__m512 sfRe249 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm249 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe253 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm253 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe250 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm250 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe254 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm254 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe251 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm251 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe255 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm255 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe252 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm252 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe256 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm256 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
// First group only (sf*249 / sf*253): lane-permute Re and Im with two index
// vectors, then combine them with a masked fused multiply-add. Mask 65021 is
// 0xFDFD, so lanes 1 and 9 skip the arithmetic and keep the first operand
// (the permuted Im value) unchanged.
__m512i ifft4225 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft4226 = _mm512_permutexvar_ps(ifft4225, sfRe249);
__m512 ifft4317 = _mm512_permutexvar_ps(ifft4225, sfRe253);
__m512i ifft4227 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4228 = _mm512_permutexvar_ps(ifft4227, sfRe249);
__m512 ifft4318 = _mm512_permutexvar_ps(ifft4227, sfRe253);
__m512 ifft4229 = _mm512_permutexvar_ps(ifft4225, sfIm249);
__m512 ifft4319 = _mm512_permutexvar_ps(ifft4225, sfIm253);
__m512 ifft4230 = _mm512_permutexvar_ps(ifft4227, sfIm249);
__m512 ifft4320 = _mm512_permutexvar_ps(ifft4227, sfIm253);
__m512 ifft4231 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft4232 = _mm512_mask_fmadd_ps(ifft4230, 65021, ifft4231, ifft4226);
__m512 ifft4321 = _mm512_mask_fmadd_ps(ifft4320, 65021, ifft4231, ifft4317);
__m512 ifft4233 = _mm512_mask_fnmadd_ps(ifft4229, 65021, ifft4231, ifft4228);
__m512 ifft4322 = _mm512_mask_fnmadd_ps(ifft4319, 65021, ifft4231, ifft4318);
// Distance-1 butterflies inside each vector: shuffle immediate 177 swaps
// adjacent elements and the +1/-1 pattern gives x[i]+x[i+1] in even lanes and
// x[i-1]-x[i] in odd lanes.
__m512 ifft4234 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft4235 = _mm512_fmadd_ps(ifft4232, ifft4234, _mm512_shuffle_ps(ifft4232, ifft4232, 177));
__m512 ifft4323 = _mm512_fmadd_ps(ifft4321, ifft4234, _mm512_shuffle_ps(ifft4321, ifft4321, 177));
__m512 ifft4236 = _mm512_fmadd_ps(ifft4233, ifft4234, _mm512_shuffle_ps(ifft4233, ifft4233, 177));
__m512 ifft4324 = _mm512_fmadd_ps(ifft4322, ifft4234, _mm512_shuffle_ps(ifft4322, ifft4322, 177));
__m512 ifft4237 = _mm512_fmadd_ps(sfRe250, ifft4234, _mm512_shuffle_ps(sfRe250, sfRe250, 177));
__m512 ifft4325 = _mm512_fmadd_ps(sfRe254, ifft4234, _mm512_shuffle_ps(sfRe254, sfRe254, 177));
__m512 ifft4238 = _mm512_fmadd_ps(sfIm250, ifft4234, _mm512_shuffle_ps(sfIm250, sfIm250, 177));
__m512 ifft4326 = _mm512_fmadd_ps(sfIm254, ifft4234, _mm512_shuffle_ps(sfIm254, sfIm254, 177));
__m512 ifft4239 = _mm512_fmadd_ps(sfRe251, ifft4234, _mm512_shuffle_ps(sfRe251, sfRe251, 177));
__m512 ifft4327 = _mm512_fmadd_ps(sfRe255, ifft4234, _mm512_shuffle_ps(sfRe255, sfRe255, 177));
__m512 ifft4240 = _mm512_fmadd_ps(sfIm251, ifft4234, _mm512_shuffle_ps(sfIm251, sfIm251, 177));
__m512 ifft4328 = _mm512_fmadd_ps(sfIm255, ifft4234, _mm512_shuffle_ps(sfIm255, sfIm255, 177));
__m512 ifft4241 = _mm512_fmadd_ps(sfRe252, ifft4234, _mm512_shuffle_ps(sfRe252, sfRe252, 177));
__m512 ifft4329 = _mm512_fmadd_ps(sfRe256, ifft4234, _mm512_shuffle_ps(sfRe256, sfRe256, 177));
__m512 ifft4242 = _mm512_fmadd_ps(sfIm252, ifft4234, _mm512_shuffle_ps(sfIm252, sfIm252, 177));
__m512 ifft4330 = _mm512_fmadd_ps(sfIm256, ifft4234, _mm512_shuffle_ps(sfIm256, sfIm256, 177));
// Per-lane constant multiply on each (a, b) partner pair: with c = ifft4243
// and s = ifft4252 the next two groups compute a*c - b*s and b*c + a*s,
// i.e. the form of a complex multiply by (c + i*s) when a/b are the Re/Im
// parts. 7.0710677e-01f is approximately sqrt(1/2).
__m512 ifft4243 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft4244 = _mm512_mul_ps(ifft4235, ifft4243);
__m512 ifft4331 = _mm512_mul_ps(ifft4323, ifft4243);
__m512 ifft4245 = _mm512_mul_ps(ifft4236, ifft4243);
__m512 ifft4332 = _mm512_mul_ps(ifft4324, ifft4243);
__m512 ifft4246 = _mm512_mul_ps(ifft4237, ifft4243);
__m512 ifft4333 = _mm512_mul_ps(ifft4325, ifft4243);
__m512 ifft4247 = _mm512_mul_ps(ifft4238, ifft4243);
__m512 ifft4334 = _mm512_mul_ps(ifft4326, ifft4243);
__m512 ifft4248 = _mm512_mul_ps(ifft4239, ifft4243);
__m512 ifft4335 = _mm512_mul_ps(ifft4327, ifft4243);
__m512 ifft4249 = _mm512_mul_ps(ifft4240, ifft4243);
__m512 ifft4336 = _mm512_mul_ps(ifft4328, ifft4243);
__m512 ifft4250 = _mm512_mul_ps(ifft4241, ifft4243);
__m512 ifft4337 = _mm512_mul_ps(ifft4329, ifft4243);
__m512 ifft4251 = _mm512_mul_ps(ifft4242, ifft4243);
__m512 ifft4338 = _mm512_mul_ps(ifft4330, ifft4243);
__m512 ifft4252 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft4253 = _mm512_fnmadd_ps(ifft4236, ifft4252, ifft4244);
__m512 ifft4339 = _mm512_fnmadd_ps(ifft4324, ifft4252, ifft4331);
__m512 ifft4254 = _mm512_fmadd_ps(ifft4235, ifft4252, ifft4245);
__m512 ifft4340 = _mm512_fmadd_ps(ifft4323, ifft4252, ifft4332);
__m512 ifft4255 = _mm512_fnmadd_ps(ifft4238, ifft4252, ifft4246);
__m512 ifft4341 = _mm512_fnmadd_ps(ifft4326, ifft4252, ifft4333);
__m512 ifft4256 = _mm512_fmadd_ps(ifft4237, ifft4252, ifft4247);
__m512 ifft4342 = _mm512_fmadd_ps(ifft4325, ifft4252, ifft4334);
__m512 ifft4257 = _mm512_fnmadd_ps(ifft4240, ifft4252, ifft4248);
__m512 ifft4343 = _mm512_fnmadd_ps(ifft4328, ifft4252, ifft4335);
__m512 ifft4258 = _mm512_fmadd_ps(ifft4239, ifft4252, ifft4249);
__m512 ifft4344 = _mm512_fmadd_ps(ifft4327, ifft4252, ifft4336);
__m512 ifft4259 = _mm512_fnmadd_ps(ifft4242, ifft4252, ifft4250);
__m512 ifft4345 = _mm512_fnmadd_ps(ifft4330, ifft4252, ifft4337);
__m512 ifft4260 = _mm512_fmadd_ps(ifft4241, ifft4252, ifft4251);
__m512 ifft4346 = _mm512_fmadd_ps(ifft4329, ifft4252, ifft4338);
// Distance-2 butterflies: shuffle immediate 78 swaps the two 64-bit halves of
// every 128-bit lane; elements 0-1 of each lane become x[i]+x[i+2], elements
// 2-3 become x[i-2]-x[i].
__m512 ifft4261 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft4262 = _mm512_fmadd_ps(ifft4253, ifft4261, _mm512_shuffle_ps(ifft4253, ifft4253, 78));
__m512 ifft4347 = _mm512_fmadd_ps(ifft4339, ifft4261, _mm512_shuffle_ps(ifft4339, ifft4339, 78));
__m512 ifft4263 = _mm512_fmadd_ps(ifft4254, ifft4261, _mm512_shuffle_ps(ifft4254, ifft4254, 78));
__m512 ifft4348 = _mm512_fmadd_ps(ifft4340, ifft4261, _mm512_shuffle_ps(ifft4340, ifft4340, 78));
__m512 ifft4264 = _mm512_fmadd_ps(ifft4255, ifft4261, _mm512_shuffle_ps(ifft4255, ifft4255, 78));
__m512 ifft4349 = _mm512_fmadd_ps(ifft4341, ifft4261, _mm512_shuffle_ps(ifft4341, ifft4341, 78));
__m512 ifft4265 = _mm512_fmadd_ps(ifft4256, ifft4261, _mm512_shuffle_ps(ifft4256, ifft4256, 78));
__m512 ifft4350 = _mm512_fmadd_ps(ifft4342, ifft4261, _mm512_shuffle_ps(ifft4342, ifft4342, 78));
__m512 ifft4266 = _mm512_fmadd_ps(ifft4257, ifft4261, _mm512_shuffle_ps(ifft4257, ifft4257, 78));
__m512 ifft4351 = _mm512_fmadd_ps(ifft4343, ifft4261, _mm512_shuffle_ps(ifft4343, ifft4343, 78));
__m512 ifft4267 = _mm512_fmadd_ps(ifft4258, ifft4261, _mm512_shuffle_ps(ifft4258, ifft4258, 78));
__m512 ifft4352 = _mm512_fmadd_ps(ifft4344, ifft4261, _mm512_shuffle_ps(ifft4344, ifft4344, 78));
__m512 ifft4268 = _mm512_fmadd_ps(ifft4259, ifft4261, _mm512_shuffle_ps(ifft4259, ifft4259, 78));
__m512 ifft4353 = _mm512_fmadd_ps(ifft4345, ifft4261, _mm512_shuffle_ps(ifft4345, ifft4345, 78));
__m512 ifft4269 = _mm512_fmadd_ps(ifft4260, ifft4261, _mm512_shuffle_ps(ifft4260, ifft4260, 78));
__m512 ifft4354 = _mm512_fmadd_ps(ifft4346, ifft4261, _mm512_shuffle_ps(ifft4346, ifft4346, 78));
// Mask 49344 is 0xC0C0 (lanes 6, 7, 14, 15). In those lanes each partner pair
// (a, b) is replaced by (-b, a); every other lane passes through unchanged.
__m512 ifft4270 = _mm512_mask_sub_ps(ifft4262, 49344, _mm512_setzero_ps(), ifft4263);
__m512 ifft4355 = _mm512_mask_sub_ps(ifft4347, 49344, _mm512_setzero_ps(), ifft4348);
__m512 ifft4271 = _mm512_mask_mov_ps(ifft4263, 49344, ifft4262);
__m512 ifft4356 = _mm512_mask_mov_ps(ifft4348, 49344, ifft4347);
__m512 ifft4272 = _mm512_mask_sub_ps(ifft4264, 49344, _mm512_setzero_ps(), ifft4265);
__m512 ifft4357 = _mm512_mask_sub_ps(ifft4349, 49344, _mm512_setzero_ps(), ifft4350);
__m512 ifft4273 = _mm512_mask_mov_ps(ifft4265, 49344, ifft4264);
__m512 ifft4358 = _mm512_mask_mov_ps(ifft4350, 49344, ifft4349);
__m512 ifft4274 = _mm512_mask_sub_ps(ifft4266, 49344, _mm512_setzero_ps(), ifft4267);
__m512 ifft4359 = _mm512_mask_sub_ps(ifft4351, 49344, _mm512_setzero_ps(), ifft4352);
__m512 ifft4275 = _mm512_mask_mov_ps(ifft4267, 49344, ifft4266);
__m512 ifft4360 = _mm512_mask_mov_ps(ifft4352, 49344, ifft4351);
__m512 ifft4276 = _mm512_mask_sub_ps(ifft4268, 49344, _mm512_setzero_ps(), ifft4269);
__m512 ifft4361 = _mm512_mask_sub_ps(ifft4353, 49344, _mm512_setzero_ps(), ifft4354);
__m512 ifft4277 = _mm512_mask_mov_ps(ifft4269, 49344, ifft4268);
__m512 ifft4362 = _mm512_mask_mov_ps(ifft4354, 49344, ifft4353);
// Distance-4 butterflies: shuffle_f32x4 immediate 177 swaps neighbouring
// 128-bit lanes, giving x+x[+4] in lanes 0-3/8-11 and x[-4]-x in lanes
// 4-7/12-15. ifft4284 and ifft4368 use fnmsub, i.e. the same butterfly with
// the whole result negated.
__m512 ifft4278 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft4279 = _mm512_fmadd_ps(ifft4270, ifft4278, _mm512_shuffle_f32x4(ifft4270, ifft4270, 177));
__m512 ifft4363 = _mm512_fmadd_ps(ifft4355, ifft4278, _mm512_shuffle_f32x4(ifft4355, ifft4355, 177));
__m512 ifft4280 = _mm512_fmadd_ps(ifft4271, ifft4278, _mm512_shuffle_f32x4(ifft4271, ifft4271, 177));
__m512 ifft4364 = _mm512_fmadd_ps(ifft4356, ifft4278, _mm512_shuffle_f32x4(ifft4356, ifft4356, 177));
__m512 ifft4281 = _mm512_fmadd_ps(ifft4272, ifft4278, _mm512_shuffle_f32x4(ifft4272, ifft4272, 177));
__m512 ifft4365 = _mm512_fmadd_ps(ifft4357, ifft4278, _mm512_shuffle_f32x4(ifft4357, ifft4357, 177));
__m512 ifft4282 = _mm512_fmadd_ps(ifft4273, ifft4278, _mm512_shuffle_f32x4(ifft4273, ifft4273, 177));
__m512 ifft4366 = _mm512_fmadd_ps(ifft4358, ifft4278, _mm512_shuffle_f32x4(ifft4358, ifft4358, 177));
__m512 ifft4283 = _mm512_fmadd_ps(ifft4274, ifft4278, _mm512_shuffle_f32x4(ifft4274, ifft4274, 177));
__m512 ifft4367 = _mm512_fmadd_ps(ifft4359, ifft4278, _mm512_shuffle_f32x4(ifft4359, ifft4359, 177));
__m512 ifft4284 = _mm512_fnmsub_ps(ifft4275, ifft4278, _mm512_shuffle_f32x4(ifft4275, ifft4275, 177));
__m512 ifft4368 = _mm512_fnmsub_ps(ifft4360, ifft4278, _mm512_shuffle_f32x4(ifft4360, ifft4360, 177));
__m512 ifft4285 = _mm512_fmadd_ps(ifft4276, ifft4278, _mm512_shuffle_f32x4(ifft4276, ifft4276, 177));
__m512 ifft4369 = _mm512_fmadd_ps(ifft4361, ifft4278, _mm512_shuffle_f32x4(ifft4361, ifft4361, 177));
__m512 ifft4286 = _mm512_fmadd_ps(ifft4277, ifft4278, _mm512_shuffle_f32x4(ifft4277, ifft4277, 177));
__m512 ifft4370 = _mm512_fmadd_ps(ifft4362, ifft4278, _mm512_shuffle_f32x4(ifft4362, ifft4362, 177));
// Cross-vector butterflies: from here on the eight vectors of each chain are
// combined with each other (no more in-register shuffles). Scaling is folded
// into the FMAs: 3.125e-02f = 1/32, 1.5625e-02f = 1/64, and 7.0710677e-01f is
// approximately sqrt(1/2).
__m512 ifft4287 = _mm512_add_ps(ifft4279, ifft4280);
__m512 ifft4371 = _mm512_add_ps(ifft4363, ifft4364);
__m512 ifft4288 = _mm512_sub_ps(ifft4279, ifft4280);
__m512 ifft4372 = _mm512_sub_ps(ifft4363, ifft4364);
__m512 ifft4289 = _mm512_sub_ps(ifft4281, ifft4285);
__m512 ifft4373 = _mm512_sub_ps(ifft4365, ifft4369);
__m512 ifft4290 = _mm512_add_ps(ifft4282, ifft4286);
__m512 ifft4374 = _mm512_add_ps(ifft4366, ifft4370);
__m512 ifft4291 = _mm512_add_ps(ifft4281, ifft4285);
__m512 ifft4375 = _mm512_add_ps(ifft4365, ifft4369);
__m512 ifft4292 = _mm512_sub_ps(ifft4282, ifft4286);
__m512 ifft4376 = _mm512_sub_ps(ifft4366, ifft4370);
__m512 ifft4293 = _mm512_mul_ps(ifft4283, _mm512_set1_ps(3.125e-02f));
__m512 ifft4377 = _mm512_mul_ps(ifft4367, _mm512_set1_ps(3.125e-02f));
__m512 ifft4294 = _mm512_mul_ps(ifft4284, _mm512_set1_ps(3.125e-02f));
__m512 ifft4378 = _mm512_mul_ps(ifft4368, _mm512_set1_ps(3.125e-02f));
__m512 ifft4295 = _mm512_fmadd_ps(ifft4287, _mm512_set1_ps(1.5625e-02f), ifft4293);
__m512 ifft4379 = _mm512_fmadd_ps(ifft4371, _mm512_set1_ps(1.5625e-02f), ifft4377);
__m512 ifft4296 = _mm512_fmsub_ps(ifft4287, _mm512_set1_ps(1.5625e-02f), ifft4293);
__m512 ifft4380 = _mm512_fmsub_ps(ifft4371, _mm512_set1_ps(1.5625e-02f), ifft4377);
__m512 ifft4297 = _mm512_fmadd_ps(ifft4288, _mm512_set1_ps(1.5625e-02f), ifft4294);
__m512 ifft4381 = _mm512_fmadd_ps(ifft4372, _mm512_set1_ps(1.5625e-02f), ifft4378);
__m512 ifft4298 = _mm512_fmsub_ps(ifft4288, _mm512_set1_ps(1.5625e-02f), ifft4294);
__m512 ifft4382 = _mm512_fmsub_ps(ifft4372, _mm512_set1_ps(1.5625e-02f), ifft4378);
__m512 ifft4299 = _mm512_add_ps(ifft4289, ifft4290);
__m512 ifft4383 = _mm512_add_ps(ifft4373, ifft4374);
__m512 ifft4300 = _mm512_sub_ps(ifft4289, ifft4290);
__m512 ifft4384 = _mm512_sub_ps(ifft4373, ifft4374);
__m512 ifft4301 = _mm512_fnmadd_ps(ifft4299, _mm512_set1_ps(7.0710677e-01f), ifft4291);
__m512 ifft4385 = _mm512_fnmadd_ps(ifft4383, _mm512_set1_ps(7.0710677e-01f), ifft4375);
__m512 ifft4302 = _mm512_fmadd_ps(ifft4299, _mm512_set1_ps(7.0710677e-01f), ifft4291);
__m512 ifft4386 = _mm512_fmadd_ps(ifft4383, _mm512_set1_ps(7.0710677e-01f), ifft4375);
__m512 ifft4303 = _mm512_fmadd_ps(ifft4300, _mm512_set1_ps(7.0710677e-01f), ifft4292);
__m512 ifft4387 = _mm512_fmadd_ps(ifft4384, _mm512_set1_ps(7.0710677e-01f), ifft4376);
__m512 ifft4304 = _mm512_fmsub_ps(ifft4300, _mm512_set1_ps(7.0710677e-01f), ifft4292);
__m512 ifft4388 = _mm512_fmsub_ps(ifft4384, _mm512_set1_ps(7.0710677e-01f), ifft4376);
__m512 ifft4305 = _mm512_add_ps(ifft4301, ifft4302);
__m512 ifft4389 = _mm512_add_ps(ifft4385, ifft4386);
__m512 ifft4306 = _mm512_sub_ps(ifft4301, ifft4302);
__m512 ifft4390 = _mm512_sub_ps(ifft4385, ifft4386);
__m512 ifft4307 = _mm512_add_ps(ifft4303, ifft4304);
__m512 ifft4391 = _mm512_add_ps(ifft4387, ifft4388);
__m512 ifft4308 = _mm512_sub_ps(ifft4303, ifft4304);
__m512 ifft4392 = _mm512_sub_ps(ifft4387, ifft4388);
__m512 ifft4309 = _mm512_fmadd_ps(ifft4305, _mm512_set1_ps(1.5625e-02f), ifft4295);
__m512 ifft4393 = _mm512_fmadd_ps(ifft4389, _mm512_set1_ps(1.5625e-02f), ifft4379);
__m512 ifft4310 = _mm512_fnmadd_ps(ifft4305, _mm512_set1_ps(1.5625e-02f), ifft4295);
__m512 ifft4394 = _mm512_fnmadd_ps(ifft4389, _mm512_set1_ps(1.5625e-02f), ifft4379);
__m512 ifft4311 = _mm512_fmadd_ps(ifft4307, _mm512_set1_ps(1.5625e-02f), ifft4297);
__m512 ifft4395 = _mm512_fmadd_ps(ifft4391, _mm512_set1_ps(1.5625e-02f), ifft4381);
__m512 ifft4312 = _mm512_fnmadd_ps(ifft4307, _mm512_set1_ps(1.5625e-02f), ifft4297);
__m512 ifft4396 = _mm512_fnmadd_ps(ifft4391, _mm512_set1_ps(1.5625e-02f), ifft4381);
__m512 ifft4313 = _mm512_fnmadd_ps(ifft4308, _mm512_set1_ps(1.5625e-02f), ifft4296);
__m512 ifft4397 = _mm512_fnmadd_ps(ifft4392, _mm512_set1_ps(1.5625e-02f), ifft4380);
__m512 ifft4314 = _mm512_fmadd_ps(ifft4308, _mm512_set1_ps(1.5625e-02f), ifft4296);
__m512 ifft4398 = _mm512_fmadd_ps(ifft4392, _mm512_set1_ps(1.5625e-02f), ifft4380);
__m512 ifft4315 = _mm512_fmadd_ps(ifft4306, _mm512_set1_ps(1.5625e-02f), ifft4298);
__m512 ifft4399 = _mm512_fmadd_ps(ifft4390, _mm512_set1_ps(1.5625e-02f), ifft4382);
__m512 ifft4316 = _mm512_fnmadd_ps(ifft4306, _mm512_set1_ps(1.5625e-02f), ifft4298);
__m512 ifft4400 = _mm512_fnmadd_ps(ifft4390, _mm512_set1_ps(1.5625e-02f), ifft4382);
// Only five of the eight result vectors per chain are kept; the other three
// are referenced through (void) casts and never stored.
__m512 dat830 = ifft4309;
__m512 dat835 = ifft4393;
__m512 dat831 = ifft4311;
__m512 dat836 = ifft4395;
__m512 dat832 = ifft4313;
__m512 dat837 = ifft4397;
__m512 dat833 = ifft4315;
__m512 dat838 = ifft4399;
__m512 dat834 = ifft4310;
__m512 dat839 = ifft4394;
(void)ifft4312;
(void)ifft4396;
(void)ifft4314;
(void)ifft4398;
(void)ifft4316;
(void)ifft4400;
// Merge the two chains into one vector per destination. Index bit 4 (values
// >= 16) picks from the second operand: pm43 yields first[0..4] followed by
// second[0..4] in lanes 0-9, pm44 yields second[8..12] followed by
// first[8..12]. Lanes 10-15 also receive values but are never stored.
__m512i pm43 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack211 = _mm512_permutex2var_ps(dat830, pm43, dat835);
__m512i pm44 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack212 = _mm512_permutex2var_ps(dat830, pm44, dat835);
__m512 pack213 = _mm512_permutex2var_ps(dat831, pm43, dat836);
__m512 pack214 = _mm512_permutex2var_ps(dat831, pm44, dat836);
__m512 pack215 = _mm512_permutex2var_ps(dat832, pm43, dat837);
__m512 pack216 = _mm512_permutex2var_ps(dat832, pm44, dat837);
__m512 pack217 = _mm512_permutex2var_ps(dat833, pm43, dat838);
__m512 pack218 = _mm512_permutex2var_ps(dat833, pm44, dat838);
__m512 pack219 = _mm512_permutex2var_ps(dat834, pm43, dat839);
__m512 pack220 = _mm512_permutex2var_ps(dat834, pm44, dat839);
// Clamp every element at zero (max(0, x)), i.e. a ReLU applied before the store.
pack211 = _mm512_max_ps(_mm512_setzero_ps(), pack211);
pack212 = _mm512_max_ps(_mm512_setzero_ps(), pack212);
pack213 = _mm512_max_ps(_mm512_setzero_ps(), pack213);
pack214 = _mm512_max_ps(_mm512_setzero_ps(), pack214);
pack215 = _mm512_max_ps(_mm512_setzero_ps(), pack215);
pack216 = _mm512_max_ps(_mm512_setzero_ps(), pack216);
pack217 = _mm512_max_ps(_mm512_setzero_ps(), pack217);
pack218 = _mm512_max_ps(_mm512_setzero_ps(), pack218);
pack219 = _mm512_max_ps(_mm512_setzero_ps(), pack219);
pack220 = _mm512_max_ps(_mm512_setzero_ps(), pack220);
// Masked stores: mask 1023 (0x3FF) writes lanes 0-9 only. The pm43 results go
// to five destinations 448 apart; the pm44 results go to the same positions
// plus 50240. Each t26 step advances the destination by 40.
// NOTE(review): 40 equals ten floats in bytes, which is consistent with
// datPtr2 being a byte pointer -- its declaration is outside this chunk, confirm.
_mm512_mask_storeu_ps(datPtr2+40+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack211);
_mm512_mask_storeu_ps(datPtr2+50280+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack212);
_mm512_mask_storeu_ps(datPtr2+488+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack213);
_mm512_mask_storeu_ps(datPtr2+50728+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack214);
_mm512_mask_storeu_ps(datPtr2+936+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack215);
_mm512_mask_storeu_ps(datPtr2+51176+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack216);
_mm512_mask_storeu_ps(datPtr2+1384+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack217);
_mm512_mask_storeu_ps(datPtr2+51624+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack218);
_mm512_mask_storeu_ps(datPtr2+1832+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack219);
_mm512_mask_storeu_ps(datPtr2+52072+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack220);
}
}
}
if (j5 >= last2) return;
++j5;
rel5 = 20;
}
if (rel5 < 22) {
ptrdiff_t toH15 = base5+25;
ptrdiff_t toW15 = -570+30*rel5;
ptrdiff_t jj17 = 21-rel5+j5;
for (; j5 <= jj17; toW15 += 30) {
ptrdiff_t k39 = 16*w21;
for (; k39 != 16; ++k39) {
ptrdiff_t r16 = 0;
for (; r16 != 2; ++r16) {
ptrdiff_t t27 = 0;
for (; t27 < 3; ++t27) {
__m512 sfRe257 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm257 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe261 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm261 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe258 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm258 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe262 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm262 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe259 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm259 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe263 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm263 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe260 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm260 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe264 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm264 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512i ifft4401 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft4402 = _mm512_permutexvar_ps(ifft4401, sfRe257);
__m512 ifft4493 = _mm512_permutexvar_ps(ifft4401, sfRe261);
__m512i ifft4403 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4404 = _mm512_permutexvar_ps(ifft4403, sfRe257);
__m512 ifft4494 = _mm512_permutexvar_ps(ifft4403, sfRe261);
__m512 ifft4405 = _mm512_permutexvar_ps(ifft4401, sfIm257);
__m512 ifft4495 = _mm512_permutexvar_ps(ifft4401, sfIm261);
__m512 ifft4406 = _mm512_permutexvar_ps(ifft4403, sfIm257);
__m512 ifft4496 = _mm512_permutexvar_ps(ifft4403, sfIm261);
__m512 ifft4407 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft4408 = _mm512_mask_fmadd_ps(ifft4406, 65021, ifft4407, ifft4402);
__m512 ifft4497 = _mm512_mask_fmadd_ps(ifft4496, 65021, ifft4407, ifft4493);
__m512 ifft4409 = _mm512_mask_fnmadd_ps(ifft4405, 65021, ifft4407, ifft4404);
__m512 ifft4498 = _mm512_mask_fnmadd_ps(ifft4495, 65021, ifft4407, ifft4494);
__m512 ifft4410 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft4411 = _mm512_fmadd_ps(ifft4408, ifft4410, _mm512_shuffle_ps(ifft4408, ifft4408, 177));
__m512 ifft4499 = _mm512_fmadd_ps(ifft4497, ifft4410, _mm512_shuffle_ps(ifft4497, ifft4497, 177));
__m512 ifft4412 = _mm512_fmadd_ps(ifft4409, ifft4410, _mm512_shuffle_ps(ifft4409, ifft4409, 177));
__m512 ifft4500 = _mm512_fmadd_ps(ifft4498, ifft4410, _mm512_shuffle_ps(ifft4498, ifft4498, 177));
__m512 ifft4413 = _mm512_fmadd_ps(sfRe258, ifft4410, _mm512_shuffle_ps(sfRe258, sfRe258, 177));
__m512 ifft4501 = _mm512_fmadd_ps(sfRe262, ifft4410, _mm512_shuffle_ps(sfRe262, sfRe262, 177));
__m512 ifft4414 = _mm512_fmadd_ps(sfIm258, ifft4410, _mm512_shuffle_ps(sfIm258, sfIm258, 177));
__m512 ifft4502 = _mm512_fmadd_ps(sfIm262, ifft4410, _mm512_shuffle_ps(sfIm262, sfIm262, 177));
__m512 ifft4415 = _mm512_fmadd_ps(sfRe259, ifft4410, _mm512_shuffle_ps(sfRe259, sfRe259, 177));
__m512 ifft4503 = _mm512_fmadd_ps(sfRe263, ifft4410, _mm512_shuffle_ps(sfRe263, sfRe263, 177));
__m512 ifft4416 = _mm512_fmadd_ps(sfIm259, ifft4410, _mm512_shuffle_ps(sfIm259, sfIm259, 177));
__m512 ifft4504 = _mm512_fmadd_ps(sfIm263, ifft4410, _mm512_shuffle_ps(sfIm263, sfIm263, 177));
__m512 ifft4417 = _mm512_fmadd_ps(sfRe260, ifft4410, _mm512_shuffle_ps(sfRe260, sfRe260, 177));
__m512 ifft4505 = _mm512_fmadd_ps(sfRe264, ifft4410, _mm512_shuffle_ps(sfRe264, sfRe264, 177));
__m512 ifft4418 = _mm512_fmadd_ps(sfIm260, ifft4410, _mm512_shuffle_ps(sfIm260, sfIm260, 177));
__m512 ifft4506 = _mm512_fmadd_ps(sfIm264, ifft4410, _mm512_shuffle_ps(sfIm264, sfIm264, 177));
__m512 ifft4419 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft4420 = _mm512_mul_ps(ifft4411, ifft4419);
__m512 ifft4507 = _mm512_mul_ps(ifft4499, ifft4419);
__m512 ifft4421 = _mm512_mul_ps(ifft4412, ifft4419);
__m512 ifft4508 = _mm512_mul_ps(ifft4500, ifft4419);
__m512 ifft4422 = _mm512_mul_ps(ifft4413, ifft4419);
__m512 ifft4509 = _mm512_mul_ps(ifft4501, ifft4419);
__m512 ifft4423 = _mm512_mul_ps(ifft4414, ifft4419);
__m512 ifft4510 = _mm512_mul_ps(ifft4502, ifft4419);
__m512 ifft4424 = _mm512_mul_ps(ifft4415, ifft4419);
__m512 ifft4511 = _mm512_mul_ps(ifft4503, ifft4419);
__m512 ifft4425 = _mm512_mul_ps(ifft4416, ifft4419);
__m512 ifft4512 = _mm512_mul_ps(ifft4504, ifft4419);
__m512 ifft4426 = _mm512_mul_ps(ifft4417, ifft4419);
__m512 ifft4513 = _mm512_mul_ps(ifft4505, ifft4419);
__m512 ifft4427 = _mm512_mul_ps(ifft4418, ifft4419);
__m512 ifft4514 = _mm512_mul_ps(ifft4506, ifft4419);
__m512 ifft4428 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft4429 = _mm512_fnmadd_ps(ifft4412, ifft4428, ifft4420);
__m512 ifft4515 = _mm512_fnmadd_ps(ifft4500, ifft4428, ifft4507);
__m512 ifft4430 = _mm512_fmadd_ps(ifft4411, ifft4428, ifft4421);
__m512 ifft4516 = _mm512_fmadd_ps(ifft4499, ifft4428, ifft4508);
__m512 ifft4431 = _mm512_fnmadd_ps(ifft4414, ifft4428, ifft4422);
__m512 ifft4517 = _mm512_fnmadd_ps(ifft4502, ifft4428, ifft4509);
__m512 ifft4432 = _mm512_fmadd_ps(ifft4413, ifft4428, ifft4423);
__m512 ifft4518 = _mm512_fmadd_ps(ifft4501, ifft4428, ifft4510);
__m512 ifft4433 = _mm512_fnmadd_ps(ifft4416, ifft4428, ifft4424);
__m512 ifft4519 = _mm512_fnmadd_ps(ifft4504, ifft4428, ifft4511);
__m512 ifft4434 = _mm512_fmadd_ps(ifft4415, ifft4428, ifft4425);
__m512 ifft4520 = _mm512_fmadd_ps(ifft4503, ifft4428, ifft4512);
__m512 ifft4435 = _mm512_fnmadd_ps(ifft4418, ifft4428, ifft4426);
__m512 ifft4521 = _mm512_fnmadd_ps(ifft4506, ifft4428, ifft4513);
__m512 ifft4436 = _mm512_fmadd_ps(ifft4417, ifft4428, ifft4427);
__m512 ifft4522 = _mm512_fmadd_ps(ifft4505, ifft4428, ifft4514);
__m512 ifft4437 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft4438 = _mm512_fmadd_ps(ifft4429, ifft4437, _mm512_shuffle_ps(ifft4429, ifft4429, 78));
__m512 ifft4523 = _mm512_fmadd_ps(ifft4515, ifft4437, _mm512_shuffle_ps(ifft4515, ifft4515, 78));
__m512 ifft4439 = _mm512_fmadd_ps(ifft4430, ifft4437, _mm512_shuffle_ps(ifft4430, ifft4430, 78));
__m512 ifft4524 = _mm512_fmadd_ps(ifft4516, ifft4437, _mm512_shuffle_ps(ifft4516, ifft4516, 78));
__m512 ifft4440 = _mm512_fmadd_ps(ifft4431, ifft4437, _mm512_shuffle_ps(ifft4431, ifft4431, 78));
__m512 ifft4525 = _mm512_fmadd_ps(ifft4517, ifft4437, _mm512_shuffle_ps(ifft4517, ifft4517, 78));
__m512 ifft4441 = _mm512_fmadd_ps(ifft4432, ifft4437, _mm512_shuffle_ps(ifft4432, ifft4432, 78));
__m512 ifft4526 = _mm512_fmadd_ps(ifft4518, ifft4437, _mm512_shuffle_ps(ifft4518, ifft4518, 78));
__m512 ifft4442 = _mm512_fmadd_ps(ifft4433, ifft4437, _mm512_shuffle_ps(ifft4433, ifft4433, 78));
__m512 ifft4527 = _mm512_fmadd_ps(ifft4519, ifft4437, _mm512_shuffle_ps(ifft4519, ifft4519, 78));
__m512 ifft4443 = _mm512_fmadd_ps(ifft4434, ifft4437, _mm512_shuffle_ps(ifft4434, ifft4434, 78));
__m512 ifft4528 = _mm512_fmadd_ps(ifft4520, ifft4437, _mm512_shuffle_ps(ifft4520, ifft4520, 78));
__m512 ifft4444 = _mm512_fmadd_ps(ifft4435, ifft4437, _mm512_shuffle_ps(ifft4435, ifft4435, 78));
__m512 ifft4529 = _mm512_fmadd_ps(ifft4521, ifft4437, _mm512_shuffle_ps(ifft4521, ifft4521, 78));
__m512 ifft4445 = _mm512_fmadd_ps(ifft4436, ifft4437, _mm512_shuffle_ps(ifft4436, ifft4436, 78));
__m512 ifft4530 = _mm512_fmadd_ps(ifft4522, ifft4437, _mm512_shuffle_ps(ifft4522, ifft4522, 78));
__m512 ifft4446 = _mm512_mask_sub_ps(ifft4438, 49344, _mm512_setzero_ps(), ifft4439);
__m512 ifft4531 = _mm512_mask_sub_ps(ifft4523, 49344, _mm512_setzero_ps(), ifft4524);
__m512 ifft4447 = _mm512_mask_mov_ps(ifft4439, 49344, ifft4438);
__m512 ifft4532 = _mm512_mask_mov_ps(ifft4524, 49344, ifft4523);
__m512 ifft4448 = _mm512_mask_sub_ps(ifft4440, 49344, _mm512_setzero_ps(), ifft4441);
__m512 ifft4533 = _mm512_mask_sub_ps(ifft4525, 49344, _mm512_setzero_ps(), ifft4526);
__m512 ifft4449 = _mm512_mask_mov_ps(ifft4441, 49344, ifft4440);
__m512 ifft4534 = _mm512_mask_mov_ps(ifft4526, 49344, ifft4525);
__m512 ifft4450 = _mm512_mask_sub_ps(ifft4442, 49344, _mm512_setzero_ps(), ifft4443);
__m512 ifft4535 = _mm512_mask_sub_ps(ifft4527, 49344, _mm512_setzero_ps(), ifft4528);
__m512 ifft4451 = _mm512_mask_mov_ps(ifft4443, 49344, ifft4442);
__m512 ifft4536 = _mm512_mask_mov_ps(ifft4528, 49344, ifft4527);
__m512 ifft4452 = _mm512_mask_sub_ps(ifft4444, 49344, _mm512_setzero_ps(), ifft4445);
__m512 ifft4537 = _mm512_mask_sub_ps(ifft4529, 49344, _mm512_setzero_ps(), ifft4530);
__m512 ifft4453 = _mm512_mask_mov_ps(ifft4445, 49344, ifft4444);
__m512 ifft4538 = _mm512_mask_mov_ps(ifft4530, 49344, ifft4529);
__m512 ifft4454 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft4455 = _mm512_fmadd_ps(ifft4446, ifft4454, _mm512_shuffle_f32x4(ifft4446, ifft4446, 177));
__m512 ifft4539 = _mm512_fmadd_ps(ifft4531, ifft4454, _mm512_shuffle_f32x4(ifft4531, ifft4531, 177));
__m512 ifft4456 = _mm512_fmadd_ps(ifft4447, ifft4454, _mm512_shuffle_f32x4(ifft4447, ifft4447, 177));
__m512 ifft4540 = _mm512_fmadd_ps(ifft4532, ifft4454, _mm512_shuffle_f32x4(ifft4532, ifft4532, 177));
__m512 ifft4457 = _mm512_fmadd_ps(ifft4448, ifft4454, _mm512_shuffle_f32x4(ifft4448, ifft4448, 177));
__m512 ifft4541 = _mm512_fmadd_ps(ifft4533, ifft4454, _mm512_shuffle_f32x4(ifft4533, ifft4533, 177));
__m512 ifft4458 = _mm512_fmadd_ps(ifft4449, ifft4454, _mm512_shuffle_f32x4(ifft4449, ifft4449, 177));
__m512 ifft4542 = _mm512_fmadd_ps(ifft4534, ifft4454, _mm512_shuffle_f32x4(ifft4534, ifft4534, 177));
__m512 ifft4459 = _mm512_fmadd_ps(ifft4450, ifft4454, _mm512_shuffle_f32x4(ifft4450, ifft4450, 177));
__m512 ifft4543 = _mm512_fmadd_ps(ifft4535, ifft4454, _mm512_shuffle_f32x4(ifft4535, ifft4535, 177));
__m512 ifft4460 = _mm512_fnmsub_ps(ifft4451, ifft4454, _mm512_shuffle_f32x4(ifft4451, ifft4451, 177));
__m512 ifft4544 = _mm512_fnmsub_ps(ifft4536, ifft4454, _mm512_shuffle_f32x4(ifft4536, ifft4536, 177));
__m512 ifft4461 = _mm512_fmadd_ps(ifft4452, ifft4454, _mm512_shuffle_f32x4(ifft4452, ifft4452, 177));
__m512 ifft4545 = _mm512_fmadd_ps(ifft4537, ifft4454, _mm512_shuffle_f32x4(ifft4537, ifft4537, 177));
__m512 ifft4462 = _mm512_fmadd_ps(ifft4453, ifft4454, _mm512_shuffle_f32x4(ifft4453, ifft4453, 177));
__m512 ifft4546 = _mm512_fmadd_ps(ifft4538, ifft4454, _mm512_shuffle_f32x4(ifft4538, ifft4538, 177));
__m512 ifft4463 = _mm512_add_ps(ifft4455, ifft4456);
__m512 ifft4547 = _mm512_add_ps(ifft4539, ifft4540);
__m512 ifft4464 = _mm512_sub_ps(ifft4455, ifft4456);
__m512 ifft4548 = _mm512_sub_ps(ifft4539, ifft4540);
__m512 ifft4465 = _mm512_sub_ps(ifft4457, ifft4461);
__m512 ifft4549 = _mm512_sub_ps(ifft4541, ifft4545);
__m512 ifft4466 = _mm512_add_ps(ifft4458, ifft4462);
__m512 ifft4550 = _mm512_add_ps(ifft4542, ifft4546);
__m512 ifft4467 = _mm512_add_ps(ifft4457, ifft4461);
__m512 ifft4551 = _mm512_add_ps(ifft4541, ifft4545);
__m512 ifft4468 = _mm512_sub_ps(ifft4458, ifft4462);
__m512 ifft4552 = _mm512_sub_ps(ifft4542, ifft4546);
__m512 ifft4469 = _mm512_mul_ps(ifft4459, _mm512_set1_ps(3.125e-02f));
__m512 ifft4553 = _mm512_mul_ps(ifft4543, _mm512_set1_ps(3.125e-02f));
__m512 ifft4470 = _mm512_mul_ps(ifft4460, _mm512_set1_ps(3.125e-02f));
__m512 ifft4554 = _mm512_mul_ps(ifft4544, _mm512_set1_ps(3.125e-02f));
__m512 ifft4471 = _mm512_fmadd_ps(ifft4463, _mm512_set1_ps(1.5625e-02f), ifft4469);
__m512 ifft4555 = _mm512_fmadd_ps(ifft4547, _mm512_set1_ps(1.5625e-02f), ifft4553);
__m512 ifft4472 = _mm512_fmsub_ps(ifft4463, _mm512_set1_ps(1.5625e-02f), ifft4469);
__m512 ifft4556 = _mm512_fmsub_ps(ifft4547, _mm512_set1_ps(1.5625e-02f), ifft4553);
__m512 ifft4473 = _mm512_fmadd_ps(ifft4464, _mm512_set1_ps(1.5625e-02f), ifft4470);
__m512 ifft4557 = _mm512_fmadd_ps(ifft4548, _mm512_set1_ps(1.5625e-02f), ifft4554);
__m512 ifft4474 = _mm512_fmsub_ps(ifft4464, _mm512_set1_ps(1.5625e-02f), ifft4470);
__m512 ifft4558 = _mm512_fmsub_ps(ifft4548, _mm512_set1_ps(1.5625e-02f), ifft4554);
__m512 ifft4475 = _mm512_add_ps(ifft4465, ifft4466);
__m512 ifft4559 = _mm512_add_ps(ifft4549, ifft4550);
__m512 ifft4476 = _mm512_sub_ps(ifft4465, ifft4466);
__m512 ifft4560 = _mm512_sub_ps(ifft4549, ifft4550);
__m512 ifft4477 = _mm512_fnmadd_ps(ifft4475, _mm512_set1_ps(7.0710677e-01f), ifft4467);
__m512 ifft4561 = _mm512_fnmadd_ps(ifft4559, _mm512_set1_ps(7.0710677e-01f), ifft4551);
__m512 ifft4478 = _mm512_fmadd_ps(ifft4475, _mm512_set1_ps(7.0710677e-01f), ifft4467);
__m512 ifft4562 = _mm512_fmadd_ps(ifft4559, _mm512_set1_ps(7.0710677e-01f), ifft4551);
__m512 ifft4479 = _mm512_fmadd_ps(ifft4476, _mm512_set1_ps(7.0710677e-01f), ifft4468);
__m512 ifft4563 = _mm512_fmadd_ps(ifft4560, _mm512_set1_ps(7.0710677e-01f), ifft4552);
__m512 ifft4480 = _mm512_fmsub_ps(ifft4476, _mm512_set1_ps(7.0710677e-01f), ifft4468);
__m512 ifft4564 = _mm512_fmsub_ps(ifft4560, _mm512_set1_ps(7.0710677e-01f), ifft4552);
__m512 ifft4481 = _mm512_add_ps(ifft4477, ifft4478);
__m512 ifft4565 = _mm512_add_ps(ifft4561, ifft4562);
__m512 ifft4482 = _mm512_sub_ps(ifft4477, ifft4478);
__m512 ifft4566 = _mm512_sub_ps(ifft4561, ifft4562);
__m512 ifft4483 = _mm512_add_ps(ifft4479, ifft4480);
__m512 ifft4567 = _mm512_add_ps(ifft4563, ifft4564);
__m512 ifft4484 = _mm512_sub_ps(ifft4479, ifft4480);
__m512 ifft4568 = _mm512_sub_ps(ifft4563, ifft4564);
__m512 ifft4485 = _mm512_fmadd_ps(ifft4481, _mm512_set1_ps(1.5625e-02f), ifft4471);
__m512 ifft4569 = _mm512_fmadd_ps(ifft4565, _mm512_set1_ps(1.5625e-02f), ifft4555);
__m512 ifft4486 = _mm512_fnmadd_ps(ifft4481, _mm512_set1_ps(1.5625e-02f), ifft4471);
__m512 ifft4570 = _mm512_fnmadd_ps(ifft4565, _mm512_set1_ps(1.5625e-02f), ifft4555);
__m512 ifft4487 = _mm512_fmadd_ps(ifft4483, _mm512_set1_ps(1.5625e-02f), ifft4473);
__m512 ifft4571 = _mm512_fmadd_ps(ifft4567, _mm512_set1_ps(1.5625e-02f), ifft4557);
__m512 ifft4488 = _mm512_fnmadd_ps(ifft4483, _mm512_set1_ps(1.5625e-02f), ifft4473);
__m512 ifft4572 = _mm512_fnmadd_ps(ifft4567, _mm512_set1_ps(1.5625e-02f), ifft4557);
__m512 ifft4489 = _mm512_fnmadd_ps(ifft4484, _mm512_set1_ps(1.5625e-02f), ifft4472);
__m512 ifft4573 = _mm512_fnmadd_ps(ifft4568, _mm512_set1_ps(1.5625e-02f), ifft4556);
__m512 ifft4490 = _mm512_fmadd_ps(ifft4484, _mm512_set1_ps(1.5625e-02f), ifft4472);
__m512 ifft4574 = _mm512_fmadd_ps(ifft4568, _mm512_set1_ps(1.5625e-02f), ifft4556);
__m512 ifft4491 = _mm512_fmadd_ps(ifft4482, _mm512_set1_ps(1.5625e-02f), ifft4474);
__m512 ifft4575 = _mm512_fmadd_ps(ifft4566, _mm512_set1_ps(1.5625e-02f), ifft4558);
__m512 ifft4492 = _mm512_fnmadd_ps(ifft4482, _mm512_set1_ps(1.5625e-02f), ifft4474);
__m512 ifft4576 = _mm512_fnmadd_ps(ifft4566, _mm512_set1_ps(1.5625e-02f), ifft4558);
__m512 dat840 = ifft4485;
__m512 dat845 = ifft4569;
__m512 dat841 = ifft4487;
__m512 dat846 = ifft4571;
__m512 dat842 = ifft4489;
__m512 dat847 = ifft4573;
__m512 dat843 = ifft4491;
__m512 dat848 = ifft4575;
__m512 dat844 = ifft4486;
__m512 dat849 = ifft4570;
(void)ifft4488;
(void)ifft4572;
(void)ifft4490;
(void)ifft4574;
(void)ifft4492;
(void)ifft4576;
__m512i pm45 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack221 = _mm512_permutex2var_ps(dat840, pm45, dat845);
__m512i pm46 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack222 = _mm512_permutex2var_ps(dat840, pm46, dat845);
__m512 pack223 = _mm512_permutex2var_ps(dat841, pm45, dat846);
__m512 pack224 = _mm512_permutex2var_ps(dat841, pm46, dat846);
__m512 pack225 = _mm512_permutex2var_ps(dat842, pm45, dat847);
__m512 pack226 = _mm512_permutex2var_ps(dat842, pm46, dat847);
__m512 pack227 = _mm512_permutex2var_ps(dat843, pm45, dat848);
__m512 pack228 = _mm512_permutex2var_ps(dat843, pm46, dat848);
__m512 pack229 = _mm512_permutex2var_ps(dat844, pm45, dat849);
__m512 pack230 = _mm512_permutex2var_ps(dat844, pm46, dat849);
pack221 = _mm512_max_ps(_mm512_setzero_ps(), pack221);
pack222 = _mm512_max_ps(_mm512_setzero_ps(), pack222);
pack223 = _mm512_max_ps(_mm512_setzero_ps(), pack223);
pack224 = _mm512_max_ps(_mm512_setzero_ps(), pack224);
pack225 = _mm512_max_ps(_mm512_setzero_ps(), pack225);
pack226 = _mm512_max_ps(_mm512_setzero_ps(), pack226);
pack227 = _mm512_max_ps(_mm512_setzero_ps(), pack227);
pack228 = _mm512_max_ps(_mm512_setzero_ps(), pack228);
pack229 = _mm512_max_ps(_mm512_setzero_ps(), pack229);
pack230 = _mm512_max_ps(_mm512_setzero_ps(), pack230);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack221);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack222);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack223);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack224);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack225);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack226);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack227);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack228);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack229);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack230);
}
}
}
if (j5 >= last2) return;
++j5;
}
rel5 = 22;
}
ptrdiff_t toH16 = base5+25;
ptrdiff_t toW16 = 90;
ptrdiff_t k40 = 16*w21;
for (; k40 != 16; ++k40) {
ptrdiff_t r17 = 0;
for (; r17 != 2; ++r17) {
// Loop over t28 = 0, 1. Each iteration loads 16 vectors from sfPtr3, runs two
// interleaved copies of the same transform network on them (variables
// ifft4577..ifft4668 for inputs sf*265..268, and ifft4669..ifft4752 for inputs
// sf*269..272), clamps the results at zero and writes ten 10-lane rows to
// datPtr2. The variable names suggest an inverse FFT of a spectral-domain
// convolution result -- NOTE(review): inferred from the "ifft"/"sfRe"/"sfIm"
// names only; confirm against the generator.
ptrdiff_t t28 = 0;
for (; t28 < 2; ++t28) {
// Load four groups of vectors; consecutive groups are 2166784 apart in the
// pointer offset. Within a group the four vectors sit at +0/+64/+128/+192 and
// are bound to Re/Im of the first stream, then Re/Im of the second stream.
// Offsets are presumably bytes (64 = one 512-bit vector) -- TODO confirm that
// sfPtr3 is a byte pointer.
__m512 sfRe265 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm265 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe269 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm269 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe266 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm266 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe270 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm270 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe267 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm267 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe271 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm271 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe268 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm268 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe272 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm272 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
// First group only (sf*265 / sf*269): rearrange lanes with two index vectors,
// then combine the permuted Re and Im vectors with a sign vector. Mask 65021
// (0xFDFD) covers every lane except 1 and 9; those two lanes keep the first
// operand of the masked op unchanged.
// NOTE(review): presumably this expands a packed first row before the
// butterflies -- confirm against the matching forward transform.
__m512i ifft4577 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft4578 = _mm512_permutexvar_ps(ifft4577, sfRe265);
__m512 ifft4669 = _mm512_permutexvar_ps(ifft4577, sfRe269);
__m512i ifft4579 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4580 = _mm512_permutexvar_ps(ifft4579, sfRe265);
__m512 ifft4670 = _mm512_permutexvar_ps(ifft4579, sfRe269);
__m512 ifft4581 = _mm512_permutexvar_ps(ifft4577, sfIm265);
__m512 ifft4671 = _mm512_permutexvar_ps(ifft4577, sfIm269);
__m512 ifft4582 = _mm512_permutexvar_ps(ifft4579, sfIm265);
__m512 ifft4672 = _mm512_permutexvar_ps(ifft4579, sfIm269);
__m512 ifft4583 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft4584 = _mm512_mask_fmadd_ps(ifft4582, 65021, ifft4583, ifft4578);
__m512 ifft4673 = _mm512_mask_fmadd_ps(ifft4672, 65021, ifft4583, ifft4669);
__m512 ifft4585 = _mm512_mask_fnmadd_ps(ifft4581, 65021, ifft4583, ifft4580);
__m512 ifft4674 = _mm512_mask_fnmadd_ps(ifft4671, 65021, ifft4583, ifft4670);
// Butterfly on adjacent lanes: shuffle immediate 177 (0xB1) swaps the two
// floats of every pair, and the sign vector is +1 in even lanes and -1 in odd
// lanes, so even lanes receive a+b and odd lanes receive a-b of their pair.
__m512 ifft4586 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft4587 = _mm512_fmadd_ps(ifft4584, ifft4586, _mm512_shuffle_ps(ifft4584, ifft4584, 177));
__m512 ifft4675 = _mm512_fmadd_ps(ifft4673, ifft4586, _mm512_shuffle_ps(ifft4673, ifft4673, 177));
__m512 ifft4588 = _mm512_fmadd_ps(ifft4585, ifft4586, _mm512_shuffle_ps(ifft4585, ifft4585, 177));
__m512 ifft4676 = _mm512_fmadd_ps(ifft4674, ifft4586, _mm512_shuffle_ps(ifft4674, ifft4674, 177));
__m512 ifft4589 = _mm512_fmadd_ps(sfRe266, ifft4586, _mm512_shuffle_ps(sfRe266, sfRe266, 177));
__m512 ifft4677 = _mm512_fmadd_ps(sfRe270, ifft4586, _mm512_shuffle_ps(sfRe270, sfRe270, 177));
__m512 ifft4590 = _mm512_fmadd_ps(sfIm266, ifft4586, _mm512_shuffle_ps(sfIm266, sfIm266, 177));
__m512 ifft4678 = _mm512_fmadd_ps(sfIm270, ifft4586, _mm512_shuffle_ps(sfIm270, sfIm270, 177));
__m512 ifft4591 = _mm512_fmadd_ps(sfRe267, ifft4586, _mm512_shuffle_ps(sfRe267, sfRe267, 177));
__m512 ifft4679 = _mm512_fmadd_ps(sfRe271, ifft4586, _mm512_shuffle_ps(sfRe271, sfRe271, 177));
__m512 ifft4592 = _mm512_fmadd_ps(sfIm267, ifft4586, _mm512_shuffle_ps(sfIm267, sfIm267, 177));
__m512 ifft4680 = _mm512_fmadd_ps(sfIm271, ifft4586, _mm512_shuffle_ps(sfIm271, sfIm271, 177));
__m512 ifft4593 = _mm512_fmadd_ps(sfRe268, ifft4586, _mm512_shuffle_ps(sfRe268, sfRe268, 177));
__m512 ifft4681 = _mm512_fmadd_ps(sfRe272, ifft4586, _mm512_shuffle_ps(sfRe272, sfRe272, 177));
__m512 ifft4594 = _mm512_fmadd_ps(sfIm268, ifft4586, _mm512_shuffle_ps(sfIm268, sfIm268, 177));
__m512 ifft4682 = _mm512_fmadd_ps(sfIm272, ifft4586, _mm512_shuffle_ps(sfIm272, sfIm272, 177));
// Per-lane complex multiply of each (re, im) register pair by the factor
// (ifft4595 + i*ifft4604). The mul below forms re*c and im*c; the
// fnmadd/fmadd that follow finish re*c - im*s and re*s + im*c.
// Factors repeat every 8 lanes: 1, 1, 1, i, 1, 0.7071+0.7071i, 1, -0.7071+0.7071i.
__m512 ifft4595 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft4596 = _mm512_mul_ps(ifft4587, ifft4595);
__m512 ifft4683 = _mm512_mul_ps(ifft4675, ifft4595);
__m512 ifft4597 = _mm512_mul_ps(ifft4588, ifft4595);
__m512 ifft4684 = _mm512_mul_ps(ifft4676, ifft4595);
__m512 ifft4598 = _mm512_mul_ps(ifft4589, ifft4595);
__m512 ifft4685 = _mm512_mul_ps(ifft4677, ifft4595);
__m512 ifft4599 = _mm512_mul_ps(ifft4590, ifft4595);
__m512 ifft4686 = _mm512_mul_ps(ifft4678, ifft4595);
__m512 ifft4600 = _mm512_mul_ps(ifft4591, ifft4595);
__m512 ifft4687 = _mm512_mul_ps(ifft4679, ifft4595);
__m512 ifft4601 = _mm512_mul_ps(ifft4592, ifft4595);
__m512 ifft4688 = _mm512_mul_ps(ifft4680, ifft4595);
__m512 ifft4602 = _mm512_mul_ps(ifft4593, ifft4595);
__m512 ifft4689 = _mm512_mul_ps(ifft4681, ifft4595);
__m512 ifft4603 = _mm512_mul_ps(ifft4594, ifft4595);
__m512 ifft4690 = _mm512_mul_ps(ifft4682, ifft4595);
__m512 ifft4604 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft4605 = _mm512_fnmadd_ps(ifft4588, ifft4604, ifft4596);
__m512 ifft4691 = _mm512_fnmadd_ps(ifft4676, ifft4604, ifft4683);
__m512 ifft4606 = _mm512_fmadd_ps(ifft4587, ifft4604, ifft4597);
__m512 ifft4692 = _mm512_fmadd_ps(ifft4675, ifft4604, ifft4684);
__m512 ifft4607 = _mm512_fnmadd_ps(ifft4590, ifft4604, ifft4598);
__m512 ifft4693 = _mm512_fnmadd_ps(ifft4678, ifft4604, ifft4685);
__m512 ifft4608 = _mm512_fmadd_ps(ifft4589, ifft4604, ifft4599);
__m512 ifft4694 = _mm512_fmadd_ps(ifft4677, ifft4604, ifft4686);
__m512 ifft4609 = _mm512_fnmadd_ps(ifft4592, ifft4604, ifft4600);
__m512 ifft4695 = _mm512_fnmadd_ps(ifft4680, ifft4604, ifft4687);
__m512 ifft4610 = _mm512_fmadd_ps(ifft4591, ifft4604, ifft4601);
__m512 ifft4696 = _mm512_fmadd_ps(ifft4679, ifft4604, ifft4688);
__m512 ifft4611 = _mm512_fnmadd_ps(ifft4594, ifft4604, ifft4602);
__m512 ifft4697 = _mm512_fnmadd_ps(ifft4682, ifft4604, ifft4689);
__m512 ifft4612 = _mm512_fmadd_ps(ifft4593, ifft4604, ifft4603);
__m512 ifft4698 = _mm512_fmadd_ps(ifft4681, ifft4604, ifft4690);
// Butterfly at distance 2: shuffle immediate 78 (0x4E) swaps the two 64-bit
// halves of every 128-bit lane; signs are +1 in floats 0-1 and -1 in floats
// 2-3 of each lane.
__m512 ifft4613 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft4614 = _mm512_fmadd_ps(ifft4605, ifft4613, _mm512_shuffle_ps(ifft4605, ifft4605, 78));
__m512 ifft4699 = _mm512_fmadd_ps(ifft4691, ifft4613, _mm512_shuffle_ps(ifft4691, ifft4691, 78));
__m512 ifft4615 = _mm512_fmadd_ps(ifft4606, ifft4613, _mm512_shuffle_ps(ifft4606, ifft4606, 78));
__m512 ifft4700 = _mm512_fmadd_ps(ifft4692, ifft4613, _mm512_shuffle_ps(ifft4692, ifft4692, 78));
__m512 ifft4616 = _mm512_fmadd_ps(ifft4607, ifft4613, _mm512_shuffle_ps(ifft4607, ifft4607, 78));
__m512 ifft4701 = _mm512_fmadd_ps(ifft4693, ifft4613, _mm512_shuffle_ps(ifft4693, ifft4693, 78));
__m512 ifft4617 = _mm512_fmadd_ps(ifft4608, ifft4613, _mm512_shuffle_ps(ifft4608, ifft4608, 78));
__m512 ifft4702 = _mm512_fmadd_ps(ifft4694, ifft4613, _mm512_shuffle_ps(ifft4694, ifft4694, 78));
__m512 ifft4618 = _mm512_fmadd_ps(ifft4609, ifft4613, _mm512_shuffle_ps(ifft4609, ifft4609, 78));
__m512 ifft4703 = _mm512_fmadd_ps(ifft4695, ifft4613, _mm512_shuffle_ps(ifft4695, ifft4695, 78));
__m512 ifft4619 = _mm512_fmadd_ps(ifft4610, ifft4613, _mm512_shuffle_ps(ifft4610, ifft4610, 78));
__m512 ifft4704 = _mm512_fmadd_ps(ifft4696, ifft4613, _mm512_shuffle_ps(ifft4696, ifft4696, 78));
__m512 ifft4620 = _mm512_fmadd_ps(ifft4611, ifft4613, _mm512_shuffle_ps(ifft4611, ifft4611, 78));
__m512 ifft4705 = _mm512_fmadd_ps(ifft4697, ifft4613, _mm512_shuffle_ps(ifft4697, ifft4697, 78));
__m512 ifft4621 = _mm512_fmadd_ps(ifft4612, ifft4613, _mm512_shuffle_ps(ifft4612, ifft4612, 78));
__m512 ifft4706 = _mm512_fmadd_ps(ifft4698, ifft4613, _mm512_shuffle_ps(ifft4698, ifft4698, 78));
// In lanes 6, 7, 14 and 15 only (mask 49344 = 0xC0C0) each register pair
// (a, b) becomes (-b, a), which is a multiplication by i when the pair is read
// as (re, im). All other lanes pass through unchanged.
__m512 ifft4622 = _mm512_mask_sub_ps(ifft4614, 49344, _mm512_setzero_ps(), ifft4615);
__m512 ifft4707 = _mm512_mask_sub_ps(ifft4699, 49344, _mm512_setzero_ps(), ifft4700);
__m512 ifft4623 = _mm512_mask_mov_ps(ifft4615, 49344, ifft4614);
__m512 ifft4708 = _mm512_mask_mov_ps(ifft4700, 49344, ifft4699);
__m512 ifft4624 = _mm512_mask_sub_ps(ifft4616, 49344, _mm512_setzero_ps(), ifft4617);
__m512 ifft4709 = _mm512_mask_sub_ps(ifft4701, 49344, _mm512_setzero_ps(), ifft4702);
__m512 ifft4625 = _mm512_mask_mov_ps(ifft4617, 49344, ifft4616);
__m512 ifft4710 = _mm512_mask_mov_ps(ifft4702, 49344, ifft4701);
__m512 ifft4626 = _mm512_mask_sub_ps(ifft4618, 49344, _mm512_setzero_ps(), ifft4619);
__m512 ifft4711 = _mm512_mask_sub_ps(ifft4703, 49344, _mm512_setzero_ps(), ifft4704);
__m512 ifft4627 = _mm512_mask_mov_ps(ifft4619, 49344, ifft4618);
__m512 ifft4712 = _mm512_mask_mov_ps(ifft4704, 49344, ifft4703);
__m512 ifft4628 = _mm512_mask_sub_ps(ifft4620, 49344, _mm512_setzero_ps(), ifft4621);
__m512 ifft4713 = _mm512_mask_sub_ps(ifft4705, 49344, _mm512_setzero_ps(), ifft4706);
__m512 ifft4629 = _mm512_mask_mov_ps(ifft4621, 49344, ifft4620);
__m512 ifft4714 = _mm512_mask_mov_ps(ifft4706, 49344, ifft4705);
// Butterfly at distance 4: shuffle_f32x4 with immediate 177 swaps adjacent
// 128-bit lanes; signs are +1 in the lower and -1 in the upper 128-bit lane of
// each 256-bit half. ifft4636 / ifft4720 use fnmsub, so that one output per
// stream is produced with the opposite overall sign.
__m512 ifft4630 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft4631 = _mm512_fmadd_ps(ifft4622, ifft4630, _mm512_shuffle_f32x4(ifft4622, ifft4622, 177));
__m512 ifft4715 = _mm512_fmadd_ps(ifft4707, ifft4630, _mm512_shuffle_f32x4(ifft4707, ifft4707, 177));
__m512 ifft4632 = _mm512_fmadd_ps(ifft4623, ifft4630, _mm512_shuffle_f32x4(ifft4623, ifft4623, 177));
__m512 ifft4716 = _mm512_fmadd_ps(ifft4708, ifft4630, _mm512_shuffle_f32x4(ifft4708, ifft4708, 177));
__m512 ifft4633 = _mm512_fmadd_ps(ifft4624, ifft4630, _mm512_shuffle_f32x4(ifft4624, ifft4624, 177));
__m512 ifft4717 = _mm512_fmadd_ps(ifft4709, ifft4630, _mm512_shuffle_f32x4(ifft4709, ifft4709, 177));
__m512 ifft4634 = _mm512_fmadd_ps(ifft4625, ifft4630, _mm512_shuffle_f32x4(ifft4625, ifft4625, 177));
__m512 ifft4718 = _mm512_fmadd_ps(ifft4710, ifft4630, _mm512_shuffle_f32x4(ifft4710, ifft4710, 177));
__m512 ifft4635 = _mm512_fmadd_ps(ifft4626, ifft4630, _mm512_shuffle_f32x4(ifft4626, ifft4626, 177));
__m512 ifft4719 = _mm512_fmadd_ps(ifft4711, ifft4630, _mm512_shuffle_f32x4(ifft4711, ifft4711, 177));
__m512 ifft4636 = _mm512_fnmsub_ps(ifft4627, ifft4630, _mm512_shuffle_f32x4(ifft4627, ifft4627, 177));
__m512 ifft4720 = _mm512_fnmsub_ps(ifft4712, ifft4630, _mm512_shuffle_f32x4(ifft4712, ifft4712, 177));
__m512 ifft4637 = _mm512_fmadd_ps(ifft4628, ifft4630, _mm512_shuffle_f32x4(ifft4628, ifft4628, 177));
__m512 ifft4721 = _mm512_fmadd_ps(ifft4713, ifft4630, _mm512_shuffle_f32x4(ifft4713, ifft4713, 177));
__m512 ifft4638 = _mm512_fmadd_ps(ifft4629, ifft4630, _mm512_shuffle_f32x4(ifft4629, ifft4629, 177));
__m512 ifft4722 = _mm512_fmadd_ps(ifft4714, ifft4630, _mm512_shuffle_f32x4(ifft4714, ifft4714, 177));
// From here on the arithmetic is element-wise and combines whole registers
// that originate from the four load groups.
__m512 ifft4639 = _mm512_add_ps(ifft4631, ifft4632);
__m512 ifft4723 = _mm512_add_ps(ifft4715, ifft4716);
__m512 ifft4640 = _mm512_sub_ps(ifft4631, ifft4632);
__m512 ifft4724 = _mm512_sub_ps(ifft4715, ifft4716);
__m512 ifft4641 = _mm512_sub_ps(ifft4633, ifft4637);
__m512 ifft4725 = _mm512_sub_ps(ifft4717, ifft4721);
__m512 ifft4642 = _mm512_add_ps(ifft4634, ifft4638);
__m512 ifft4726 = _mm512_add_ps(ifft4718, ifft4722);
__m512 ifft4643 = _mm512_add_ps(ifft4633, ifft4637);
__m512 ifft4727 = _mm512_add_ps(ifft4717, ifft4721);
__m512 ifft4644 = _mm512_sub_ps(ifft4634, ifft4638);
__m512 ifft4728 = _mm512_sub_ps(ifft4718, ifft4722);
// Scale factors are folded into the combination: 3.125e-02 = 1/32 and
// 1.5625e-02 = 1/64.
// NOTE(review): presumably the inverse-transform normalisation -- confirm.
__m512 ifft4645 = _mm512_mul_ps(ifft4635, _mm512_set1_ps(3.125e-02f));
__m512 ifft4729 = _mm512_mul_ps(ifft4719, _mm512_set1_ps(3.125e-02f));
__m512 ifft4646 = _mm512_mul_ps(ifft4636, _mm512_set1_ps(3.125e-02f));
__m512 ifft4730 = _mm512_mul_ps(ifft4720, _mm512_set1_ps(3.125e-02f));
__m512 ifft4647 = _mm512_fmadd_ps(ifft4639, _mm512_set1_ps(1.5625e-02f), ifft4645);
__m512 ifft4731 = _mm512_fmadd_ps(ifft4723, _mm512_set1_ps(1.5625e-02f), ifft4729);
__m512 ifft4648 = _mm512_fmsub_ps(ifft4639, _mm512_set1_ps(1.5625e-02f), ifft4645);
__m512 ifft4732 = _mm512_fmsub_ps(ifft4723, _mm512_set1_ps(1.5625e-02f), ifft4729);
__m512 ifft4649 = _mm512_fmadd_ps(ifft4640, _mm512_set1_ps(1.5625e-02f), ifft4646);
__m512 ifft4733 = _mm512_fmadd_ps(ifft4724, _mm512_set1_ps(1.5625e-02f), ifft4730);
__m512 ifft4650 = _mm512_fmsub_ps(ifft4640, _mm512_set1_ps(1.5625e-02f), ifft4646);
__m512 ifft4734 = _mm512_fmsub_ps(ifft4724, _mm512_set1_ps(1.5625e-02f), ifft4730);
__m512 ifft4651 = _mm512_add_ps(ifft4641, ifft4642);
__m512 ifft4735 = _mm512_add_ps(ifft4725, ifft4726);
__m512 ifft4652 = _mm512_sub_ps(ifft4641, ifft4642);
__m512 ifft4736 = _mm512_sub_ps(ifft4725, ifft4726);
// Sums and differences weighted by 7.0710677e-01 (about sqrt(2)/2).
__m512 ifft4653 = _mm512_fnmadd_ps(ifft4651, _mm512_set1_ps(7.0710677e-01f), ifft4643);
__m512 ifft4737 = _mm512_fnmadd_ps(ifft4735, _mm512_set1_ps(7.0710677e-01f), ifft4727);
__m512 ifft4654 = _mm512_fmadd_ps(ifft4651, _mm512_set1_ps(7.0710677e-01f), ifft4643);
__m512 ifft4738 = _mm512_fmadd_ps(ifft4735, _mm512_set1_ps(7.0710677e-01f), ifft4727);
__m512 ifft4655 = _mm512_fmadd_ps(ifft4652, _mm512_set1_ps(7.0710677e-01f), ifft4644);
__m512 ifft4739 = _mm512_fmadd_ps(ifft4736, _mm512_set1_ps(7.0710677e-01f), ifft4728);
__m512 ifft4656 = _mm512_fmsub_ps(ifft4652, _mm512_set1_ps(7.0710677e-01f), ifft4644);
__m512 ifft4740 = _mm512_fmsub_ps(ifft4736, _mm512_set1_ps(7.0710677e-01f), ifft4728);
__m512 ifft4657 = _mm512_add_ps(ifft4653, ifft4654);
__m512 ifft4741 = _mm512_add_ps(ifft4737, ifft4738);
__m512 ifft4658 = _mm512_sub_ps(ifft4653, ifft4654);
__m512 ifft4742 = _mm512_sub_ps(ifft4737, ifft4738);
__m512 ifft4659 = _mm512_add_ps(ifft4655, ifft4656);
__m512 ifft4743 = _mm512_add_ps(ifft4739, ifft4740);
__m512 ifft4660 = _mm512_sub_ps(ifft4655, ifft4656);
__m512 ifft4744 = _mm512_sub_ps(ifft4739, ifft4740);
// Final combination: eight result vectors per stream.
__m512 ifft4661 = _mm512_fmadd_ps(ifft4657, _mm512_set1_ps(1.5625e-02f), ifft4647);
__m512 ifft4745 = _mm512_fmadd_ps(ifft4741, _mm512_set1_ps(1.5625e-02f), ifft4731);
__m512 ifft4662 = _mm512_fnmadd_ps(ifft4657, _mm512_set1_ps(1.5625e-02f), ifft4647);
__m512 ifft4746 = _mm512_fnmadd_ps(ifft4741, _mm512_set1_ps(1.5625e-02f), ifft4731);
__m512 ifft4663 = _mm512_fmadd_ps(ifft4659, _mm512_set1_ps(1.5625e-02f), ifft4649);
__m512 ifft4747 = _mm512_fmadd_ps(ifft4743, _mm512_set1_ps(1.5625e-02f), ifft4733);
__m512 ifft4664 = _mm512_fnmadd_ps(ifft4659, _mm512_set1_ps(1.5625e-02f), ifft4649);
__m512 ifft4748 = _mm512_fnmadd_ps(ifft4743, _mm512_set1_ps(1.5625e-02f), ifft4733);
__m512 ifft4665 = _mm512_fnmadd_ps(ifft4660, _mm512_set1_ps(1.5625e-02f), ifft4648);
__m512 ifft4749 = _mm512_fnmadd_ps(ifft4744, _mm512_set1_ps(1.5625e-02f), ifft4732);
__m512 ifft4666 = _mm512_fmadd_ps(ifft4660, _mm512_set1_ps(1.5625e-02f), ifft4648);
__m512 ifft4750 = _mm512_fmadd_ps(ifft4744, _mm512_set1_ps(1.5625e-02f), ifft4732);
__m512 ifft4667 = _mm512_fmadd_ps(ifft4658, _mm512_set1_ps(1.5625e-02f), ifft4650);
__m512 ifft4751 = _mm512_fmadd_ps(ifft4742, _mm512_set1_ps(1.5625e-02f), ifft4734);
__m512 ifft4668 = _mm512_fnmadd_ps(ifft4658, _mm512_set1_ps(1.5625e-02f), ifft4650);
__m512 ifft4752 = _mm512_fnmadd_ps(ifft4742, _mm512_set1_ps(1.5625e-02f), ifft4734);
// Five of the eight result vectors per stream are kept under dat* names.
__m512 dat850 = ifft4661;
__m512 dat855 = ifft4745;
__m512 dat851 = ifft4663;
__m512 dat856 = ifft4747;
__m512 dat852 = ifft4665;
__m512 dat857 = ifft4749;
__m512 dat853 = ifft4667;
__m512 dat858 = ifft4751;
__m512 dat854 = ifft4662;
__m512 dat859 = ifft4746;
// The remaining three per stream are not used here; the void casts mark them
// as intentionally unused.
(void)ifft4664;
(void)ifft4748;
(void)ifft4666;
(void)ifft4750;
(void)ifft4668;
(void)ifft4752;
// Merge the two streams into 10-float rows. In the index vectors a value of 16
// or more selects from the second data operand (dat855..dat859).
// pm47: lanes 0-4 <- first[0..4],   lanes 5-9 <- second[0..4].
// pm48: lanes 0-4 <- second[8..12], lanes 5-9 <- first[8..12].
// Lanes 10-15 are filled too but never stored (store mask 1023 below).
__m512i pm47 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack231 = _mm512_permutex2var_ps(dat850, pm47, dat855);
__m512i pm48 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack232 = _mm512_permutex2var_ps(dat850, pm48, dat855);
__m512 pack233 = _mm512_permutex2var_ps(dat851, pm47, dat856);
__m512 pack234 = _mm512_permutex2var_ps(dat851, pm48, dat856);
__m512 pack235 = _mm512_permutex2var_ps(dat852, pm47, dat857);
__m512 pack236 = _mm512_permutex2var_ps(dat852, pm48, dat857);
__m512 pack237 = _mm512_permutex2var_ps(dat853, pm47, dat858);
__m512 pack238 = _mm512_permutex2var_ps(dat853, pm48, dat858);
__m512 pack239 = _mm512_permutex2var_ps(dat854, pm47, dat859);
__m512 pack240 = _mm512_permutex2var_ps(dat854, pm48, dat859);
// Clamp negative values to zero (ReLU: max(0, x)).
pack231 = _mm512_max_ps(_mm512_setzero_ps(), pack231);
pack232 = _mm512_max_ps(_mm512_setzero_ps(), pack232);
pack233 = _mm512_max_ps(_mm512_setzero_ps(), pack233);
pack234 = _mm512_max_ps(_mm512_setzero_ps(), pack234);
pack235 = _mm512_max_ps(_mm512_setzero_ps(), pack235);
pack236 = _mm512_max_ps(_mm512_setzero_ps(), pack236);
pack237 = _mm512_max_ps(_mm512_setzero_ps(), pack237);
pack238 = _mm512_max_ps(_mm512_setzero_ps(), pack238);
pack239 = _mm512_max_ps(_mm512_setzero_ps(), pack239);
pack240 = _mm512_max_ps(_mm512_setzero_ps(), pack240);
// Masked stores: mask 1023 (0x3FF) writes only the low 10 floats of each
// vector. The pm47 rows go to offsets 0, 448, 896, 1344, 1792 and the pm48
// rows to the same offsets plus 50240. t28 advances the destination by 40 per
// iteration, which equals 10 floats if the offsets are bytes.
// NOTE(review): assumes datPtr2 is a byte pointer and 448 is the output row
// pitch -- confirm at the declaration.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack231);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack232);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack233);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack234);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack235);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack236);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack237);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack238);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack239);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack240);
}
// ---------------------------------------------------------------------------
// Tile with t29 == 0: load 16 vectors from sfPtr3, run them through a fixed
// butterfly network, clamp the results at zero, and scatter selected lanes
// into datPtr2 with masked stores.
// NOTE(review): the sfRe*/sfIm*/ifft* names suggest this is the inverse FFT
// that turns frequency-domain sums back into spatial output — confirm against
// the generator; the comments below only describe what the intrinsics do.
// ---------------------------------------------------------------------------
// t29 is fixed at 0 here; it is scaled by 256 in the load addresses and by 0
// in the store addresses, so it contributes nothing to either.
ptrdiff_t t29 = 0;
// Four groups of four consecutive 64-byte loads (Re, Im, Re, Im). The group
// base offsets (512, 2167296, 4334080, 6500864) are 2166784 apart.
// Suffixes 273..276 form one data set and 277..280 a second one; both sets go
// through identical arithmetic below, interleaved statement by statement.
// NOTE(review): offsets look like byte offsets (64 per __m512) — confirm the
// declared type of sfPtr3.
__m512 sfRe273 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm273 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe277 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm277 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe274 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm274 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe278 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm278 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe275 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm275 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe279 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm279 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe276 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm276 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe280 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm280 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
// Pre-pass applied only to the first group (273 / 277): lanes are gathered
// with two index vectors, then combined with a masked fused multiply-add.
// Mask 65021 (0xFDFD) enables every lane except 1 and 9; in those two lanes
// the result keeps the first operand (the ifft4755-permuted Im / Re value).
// The multiplier ifft4759 is 0 in lanes 0, 1, 8, 9 and alternates -1 / +1 in
// the remaining lanes.
__m512i ifft4753 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft4754 = _mm512_permutexvar_ps(ifft4753, sfRe273);
__m512 ifft4845 = _mm512_permutexvar_ps(ifft4753, sfRe277);
__m512i ifft4755 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4756 = _mm512_permutexvar_ps(ifft4755, sfRe273);
__m512 ifft4846 = _mm512_permutexvar_ps(ifft4755, sfRe277);
__m512 ifft4757 = _mm512_permutexvar_ps(ifft4753, sfIm273);
__m512 ifft4847 = _mm512_permutexvar_ps(ifft4753, sfIm277);
__m512 ifft4758 = _mm512_permutexvar_ps(ifft4755, sfIm273);
__m512 ifft4848 = _mm512_permutexvar_ps(ifft4755, sfIm277);
__m512 ifft4759 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft4760 = _mm512_mask_fmadd_ps(ifft4758, 65021, ifft4759, ifft4754);
__m512 ifft4849 = _mm512_mask_fmadd_ps(ifft4848, 65021, ifft4759, ifft4845);
__m512 ifft4761 = _mm512_mask_fnmadd_ps(ifft4757, 65021, ifft4759, ifft4756);
__m512 ifft4850 = _mm512_mask_fnmadd_ps(ifft4847, 65021, ifft4759, ifft4846);
// Butterfly over adjacent lanes: shuffle immediate 177 swaps each even/odd
// lane pair, and the +1/-1 pattern makes even lanes hold (even + odd) and odd
// lanes hold (even - odd). Applied to the pre-pass outputs and, directly, to
// the raw loads of the other three groups.
__m512 ifft4762 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft4763 = _mm512_fmadd_ps(ifft4760, ifft4762, _mm512_shuffle_ps(ifft4760, ifft4760, 177));
__m512 ifft4851 = _mm512_fmadd_ps(ifft4849, ifft4762, _mm512_shuffle_ps(ifft4849, ifft4849, 177));
__m512 ifft4764 = _mm512_fmadd_ps(ifft4761, ifft4762, _mm512_shuffle_ps(ifft4761, ifft4761, 177));
__m512 ifft4852 = _mm512_fmadd_ps(ifft4850, ifft4762, _mm512_shuffle_ps(ifft4850, ifft4850, 177));
__m512 ifft4765 = _mm512_fmadd_ps(sfRe274, ifft4762, _mm512_shuffle_ps(sfRe274, sfRe274, 177));
__m512 ifft4853 = _mm512_fmadd_ps(sfRe278, ifft4762, _mm512_shuffle_ps(sfRe278, sfRe278, 177));
__m512 ifft4766 = _mm512_fmadd_ps(sfIm274, ifft4762, _mm512_shuffle_ps(sfIm274, sfIm274, 177));
__m512 ifft4854 = _mm512_fmadd_ps(sfIm278, ifft4762, _mm512_shuffle_ps(sfIm278, sfIm278, 177));
__m512 ifft4767 = _mm512_fmadd_ps(sfRe275, ifft4762, _mm512_shuffle_ps(sfRe275, sfRe275, 177));
__m512 ifft4855 = _mm512_fmadd_ps(sfRe279, ifft4762, _mm512_shuffle_ps(sfRe279, sfRe279, 177));
__m512 ifft4768 = _mm512_fmadd_ps(sfIm275, ifft4762, _mm512_shuffle_ps(sfIm275, sfIm275, 177));
__m512 ifft4856 = _mm512_fmadd_ps(sfIm279, ifft4762, _mm512_shuffle_ps(sfIm279, sfIm279, 177));
__m512 ifft4769 = _mm512_fmadd_ps(sfRe276, ifft4762, _mm512_shuffle_ps(sfRe276, sfRe276, 177));
__m512 ifft4857 = _mm512_fmadd_ps(sfRe280, ifft4762, _mm512_shuffle_ps(sfRe280, sfRe280, 177));
__m512 ifft4770 = _mm512_fmadd_ps(sfIm276, ifft4762, _mm512_shuffle_ps(sfIm276, sfIm276, 177));
__m512 ifft4858 = _mm512_fmadd_ps(sfIm280, ifft4762, _mm512_shuffle_ps(sfIm280, sfIm280, 177));
// Per-lane complex multiply of each (Re, Im) pair by (ifft4771 + i*ifft4780):
// first the products with ifft4771, then
//   Re' = Re*ifft4771 - Im*ifft4780   (fnmadd)
//   Im' = Im*ifft4771 + Re*ifft4780   (fmadd)
// The factor is 1 in most lanes; 7.0710677e-01f approximates 1/sqrt(2).
__m512 ifft4771 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft4772 = _mm512_mul_ps(ifft4763, ifft4771);
__m512 ifft4859 = _mm512_mul_ps(ifft4851, ifft4771);
__m512 ifft4773 = _mm512_mul_ps(ifft4764, ifft4771);
__m512 ifft4860 = _mm512_mul_ps(ifft4852, ifft4771);
__m512 ifft4774 = _mm512_mul_ps(ifft4765, ifft4771);
__m512 ifft4861 = _mm512_mul_ps(ifft4853, ifft4771);
__m512 ifft4775 = _mm512_mul_ps(ifft4766, ifft4771);
__m512 ifft4862 = _mm512_mul_ps(ifft4854, ifft4771);
__m512 ifft4776 = _mm512_mul_ps(ifft4767, ifft4771);
__m512 ifft4863 = _mm512_mul_ps(ifft4855, ifft4771);
__m512 ifft4777 = _mm512_mul_ps(ifft4768, ifft4771);
__m512 ifft4864 = _mm512_mul_ps(ifft4856, ifft4771);
__m512 ifft4778 = _mm512_mul_ps(ifft4769, ifft4771);
__m512 ifft4865 = _mm512_mul_ps(ifft4857, ifft4771);
__m512 ifft4779 = _mm512_mul_ps(ifft4770, ifft4771);
__m512 ifft4866 = _mm512_mul_ps(ifft4858, ifft4771);
__m512 ifft4780 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft4781 = _mm512_fnmadd_ps(ifft4764, ifft4780, ifft4772);
__m512 ifft4867 = _mm512_fnmadd_ps(ifft4852, ifft4780, ifft4859);
__m512 ifft4782 = _mm512_fmadd_ps(ifft4763, ifft4780, ifft4773);
__m512 ifft4868 = _mm512_fmadd_ps(ifft4851, ifft4780, ifft4860);
__m512 ifft4783 = _mm512_fnmadd_ps(ifft4766, ifft4780, ifft4774);
__m512 ifft4869 = _mm512_fnmadd_ps(ifft4854, ifft4780, ifft4861);
__m512 ifft4784 = _mm512_fmadd_ps(ifft4765, ifft4780, ifft4775);
__m512 ifft4870 = _mm512_fmadd_ps(ifft4853, ifft4780, ifft4862);
__m512 ifft4785 = _mm512_fnmadd_ps(ifft4768, ifft4780, ifft4776);
__m512 ifft4871 = _mm512_fnmadd_ps(ifft4856, ifft4780, ifft4863);
__m512 ifft4786 = _mm512_fmadd_ps(ifft4767, ifft4780, ifft4777);
__m512 ifft4872 = _mm512_fmadd_ps(ifft4855, ifft4780, ifft4864);
__m512 ifft4787 = _mm512_fnmadd_ps(ifft4770, ifft4780, ifft4778);
__m512 ifft4873 = _mm512_fnmadd_ps(ifft4858, ifft4780, ifft4865);
__m512 ifft4788 = _mm512_fmadd_ps(ifft4769, ifft4780, ifft4779);
__m512 ifft4874 = _mm512_fmadd_ps(ifft4857, ifft4780, ifft4866);
// Butterfly at lane distance 2: shuffle immediate 78 swaps the two 64-bit
// halves of every 128-bit lane; lanes 0-1 of each quad get the sum, lanes 2-3
// get (swapped - self).
__m512 ifft4789 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft4790 = _mm512_fmadd_ps(ifft4781, ifft4789, _mm512_shuffle_ps(ifft4781, ifft4781, 78));
__m512 ifft4875 = _mm512_fmadd_ps(ifft4867, ifft4789, _mm512_shuffle_ps(ifft4867, ifft4867, 78));
__m512 ifft4791 = _mm512_fmadd_ps(ifft4782, ifft4789, _mm512_shuffle_ps(ifft4782, ifft4782, 78));
__m512 ifft4876 = _mm512_fmadd_ps(ifft4868, ifft4789, _mm512_shuffle_ps(ifft4868, ifft4868, 78));
__m512 ifft4792 = _mm512_fmadd_ps(ifft4783, ifft4789, _mm512_shuffle_ps(ifft4783, ifft4783, 78));
__m512 ifft4877 = _mm512_fmadd_ps(ifft4869, ifft4789, _mm512_shuffle_ps(ifft4869, ifft4869, 78));
__m512 ifft4793 = _mm512_fmadd_ps(ifft4784, ifft4789, _mm512_shuffle_ps(ifft4784, ifft4784, 78));
__m512 ifft4878 = _mm512_fmadd_ps(ifft4870, ifft4789, _mm512_shuffle_ps(ifft4870, ifft4870, 78));
__m512 ifft4794 = _mm512_fmadd_ps(ifft4785, ifft4789, _mm512_shuffle_ps(ifft4785, ifft4785, 78));
__m512 ifft4879 = _mm512_fmadd_ps(ifft4871, ifft4789, _mm512_shuffle_ps(ifft4871, ifft4871, 78));
__m512 ifft4795 = _mm512_fmadd_ps(ifft4786, ifft4789, _mm512_shuffle_ps(ifft4786, ifft4786, 78));
__m512 ifft4880 = _mm512_fmadd_ps(ifft4872, ifft4789, _mm512_shuffle_ps(ifft4872, ifft4872, 78));
__m512 ifft4796 = _mm512_fmadd_ps(ifft4787, ifft4789, _mm512_shuffle_ps(ifft4787, ifft4787, 78));
__m512 ifft4881 = _mm512_fmadd_ps(ifft4873, ifft4789, _mm512_shuffle_ps(ifft4873, ifft4873, 78));
__m512 ifft4797 = _mm512_fmadd_ps(ifft4788, ifft4789, _mm512_shuffle_ps(ifft4788, ifft4788, 78));
__m512 ifft4882 = _mm512_fmadd_ps(ifft4874, ifft4789, _mm512_shuffle_ps(ifft4874, ifft4874, 78));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14, 15. In those lanes the pair
// (Re, Im) becomes (-Im, Re), i.e. a multiplication by i; every other lane
// passes through unchanged.
__m512 ifft4798 = _mm512_mask_sub_ps(ifft4790, 49344, _mm512_setzero_ps(), ifft4791);
__m512 ifft4883 = _mm512_mask_sub_ps(ifft4875, 49344, _mm512_setzero_ps(), ifft4876);
__m512 ifft4799 = _mm512_mask_mov_ps(ifft4791, 49344, ifft4790);
__m512 ifft4884 = _mm512_mask_mov_ps(ifft4876, 49344, ifft4875);
__m512 ifft4800 = _mm512_mask_sub_ps(ifft4792, 49344, _mm512_setzero_ps(), ifft4793);
__m512 ifft4885 = _mm512_mask_sub_ps(ifft4877, 49344, _mm512_setzero_ps(), ifft4878);
__m512 ifft4801 = _mm512_mask_mov_ps(ifft4793, 49344, ifft4792);
__m512 ifft4886 = _mm512_mask_mov_ps(ifft4878, 49344, ifft4877);
__m512 ifft4802 = _mm512_mask_sub_ps(ifft4794, 49344, _mm512_setzero_ps(), ifft4795);
__m512 ifft4887 = _mm512_mask_sub_ps(ifft4879, 49344, _mm512_setzero_ps(), ifft4880);
__m512 ifft4803 = _mm512_mask_mov_ps(ifft4795, 49344, ifft4794);
__m512 ifft4888 = _mm512_mask_mov_ps(ifft4880, 49344, ifft4879);
__m512 ifft4804 = _mm512_mask_sub_ps(ifft4796, 49344, _mm512_setzero_ps(), ifft4797);
__m512 ifft4889 = _mm512_mask_sub_ps(ifft4881, 49344, _mm512_setzero_ps(), ifft4882);
__m512 ifft4805 = _mm512_mask_mov_ps(ifft4797, 49344, ifft4796);
__m512 ifft4890 = _mm512_mask_mov_ps(ifft4882, 49344, ifft4881);
// Butterfly at lane distance 4: shuffle_f32x4 immediate 177 swaps adjacent
// 128-bit lanes; the lower quad of each 256-bit half gets the sum, the upper
// quad gets (swapped - self).
// Note the single fnmsub (ifft4812 / ifft4896): it yields the negated value of
// what fmadd would give for the Im vector of the third group.
__m512 ifft4806 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft4807 = _mm512_fmadd_ps(ifft4798, ifft4806, _mm512_shuffle_f32x4(ifft4798, ifft4798, 177));
__m512 ifft4891 = _mm512_fmadd_ps(ifft4883, ifft4806, _mm512_shuffle_f32x4(ifft4883, ifft4883, 177));
__m512 ifft4808 = _mm512_fmadd_ps(ifft4799, ifft4806, _mm512_shuffle_f32x4(ifft4799, ifft4799, 177));
__m512 ifft4892 = _mm512_fmadd_ps(ifft4884, ifft4806, _mm512_shuffle_f32x4(ifft4884, ifft4884, 177));
__m512 ifft4809 = _mm512_fmadd_ps(ifft4800, ifft4806, _mm512_shuffle_f32x4(ifft4800, ifft4800, 177));
__m512 ifft4893 = _mm512_fmadd_ps(ifft4885, ifft4806, _mm512_shuffle_f32x4(ifft4885, ifft4885, 177));
__m512 ifft4810 = _mm512_fmadd_ps(ifft4801, ifft4806, _mm512_shuffle_f32x4(ifft4801, ifft4801, 177));
__m512 ifft4894 = _mm512_fmadd_ps(ifft4886, ifft4806, _mm512_shuffle_f32x4(ifft4886, ifft4886, 177));
__m512 ifft4811 = _mm512_fmadd_ps(ifft4802, ifft4806, _mm512_shuffle_f32x4(ifft4802, ifft4802, 177));
__m512 ifft4895 = _mm512_fmadd_ps(ifft4887, ifft4806, _mm512_shuffle_f32x4(ifft4887, ifft4887, 177));
__m512 ifft4812 = _mm512_fnmsub_ps(ifft4803, ifft4806, _mm512_shuffle_f32x4(ifft4803, ifft4803, 177));
__m512 ifft4896 = _mm512_fnmsub_ps(ifft4888, ifft4806, _mm512_shuffle_f32x4(ifft4888, ifft4888, 177));
__m512 ifft4813 = _mm512_fmadd_ps(ifft4804, ifft4806, _mm512_shuffle_f32x4(ifft4804, ifft4804, 177));
__m512 ifft4897 = _mm512_fmadd_ps(ifft4889, ifft4806, _mm512_shuffle_f32x4(ifft4889, ifft4889, 177));
__m512 ifft4814 = _mm512_fmadd_ps(ifft4805, ifft4806, _mm512_shuffle_f32x4(ifft4805, ifft4805, 177));
__m512 ifft4898 = _mm512_fmadd_ps(ifft4890, ifft4806, _mm512_shuffle_f32x4(ifft4890, ifft4890, 177));
// From here on the arithmetic is across whole registers (no lane shuffles):
// the four groups are combined with add/sub, a 1/sqrt(2) rotation
// (7.0710677e-01f), and the scale factors 1.5625e-02f (= 1/64) and
// 3.125e-02f (= 1/32) folded into the fused multiply-adds.
// NOTE(review): this looks like the column pass plus normalization of a 2-D
// inverse transform — confirm against the generator.
__m512 ifft4815 = _mm512_add_ps(ifft4807, ifft4808);
__m512 ifft4899 = _mm512_add_ps(ifft4891, ifft4892);
__m512 ifft4816 = _mm512_sub_ps(ifft4807, ifft4808);
__m512 ifft4900 = _mm512_sub_ps(ifft4891, ifft4892);
__m512 ifft4817 = _mm512_sub_ps(ifft4809, ifft4813);
__m512 ifft4901 = _mm512_sub_ps(ifft4893, ifft4897);
__m512 ifft4818 = _mm512_add_ps(ifft4810, ifft4814);
__m512 ifft4902 = _mm512_add_ps(ifft4894, ifft4898);
__m512 ifft4819 = _mm512_add_ps(ifft4809, ifft4813);
__m512 ifft4903 = _mm512_add_ps(ifft4893, ifft4897);
__m512 ifft4820 = _mm512_sub_ps(ifft4810, ifft4814);
__m512 ifft4904 = _mm512_sub_ps(ifft4894, ifft4898);
__m512 ifft4821 = _mm512_mul_ps(ifft4811, _mm512_set1_ps(3.125e-02f));
__m512 ifft4905 = _mm512_mul_ps(ifft4895, _mm512_set1_ps(3.125e-02f));
__m512 ifft4822 = _mm512_mul_ps(ifft4812, _mm512_set1_ps(3.125e-02f));
__m512 ifft4906 = _mm512_mul_ps(ifft4896, _mm512_set1_ps(3.125e-02f));
__m512 ifft4823 = _mm512_fmadd_ps(ifft4815, _mm512_set1_ps(1.5625e-02f), ifft4821);
__m512 ifft4907 = _mm512_fmadd_ps(ifft4899, _mm512_set1_ps(1.5625e-02f), ifft4905);
__m512 ifft4824 = _mm512_fmsub_ps(ifft4815, _mm512_set1_ps(1.5625e-02f), ifft4821);
__m512 ifft4908 = _mm512_fmsub_ps(ifft4899, _mm512_set1_ps(1.5625e-02f), ifft4905);
__m512 ifft4825 = _mm512_fmadd_ps(ifft4816, _mm512_set1_ps(1.5625e-02f), ifft4822);
__m512 ifft4909 = _mm512_fmadd_ps(ifft4900, _mm512_set1_ps(1.5625e-02f), ifft4906);
__m512 ifft4826 = _mm512_fmsub_ps(ifft4816, _mm512_set1_ps(1.5625e-02f), ifft4822);
__m512 ifft4910 = _mm512_fmsub_ps(ifft4900, _mm512_set1_ps(1.5625e-02f), ifft4906);
__m512 ifft4827 = _mm512_add_ps(ifft4817, ifft4818);
__m512 ifft4911 = _mm512_add_ps(ifft4901, ifft4902);
__m512 ifft4828 = _mm512_sub_ps(ifft4817, ifft4818);
__m512 ifft4912 = _mm512_sub_ps(ifft4901, ifft4902);
__m512 ifft4829 = _mm512_fnmadd_ps(ifft4827, _mm512_set1_ps(7.0710677e-01f), ifft4819);
__m512 ifft4913 = _mm512_fnmadd_ps(ifft4911, _mm512_set1_ps(7.0710677e-01f), ifft4903);
__m512 ifft4830 = _mm512_fmadd_ps(ifft4827, _mm512_set1_ps(7.0710677e-01f), ifft4819);
__m512 ifft4914 = _mm512_fmadd_ps(ifft4911, _mm512_set1_ps(7.0710677e-01f), ifft4903);
__m512 ifft4831 = _mm512_fmadd_ps(ifft4828, _mm512_set1_ps(7.0710677e-01f), ifft4820);
__m512 ifft4915 = _mm512_fmadd_ps(ifft4912, _mm512_set1_ps(7.0710677e-01f), ifft4904);
__m512 ifft4832 = _mm512_fmsub_ps(ifft4828, _mm512_set1_ps(7.0710677e-01f), ifft4820);
__m512 ifft4916 = _mm512_fmsub_ps(ifft4912, _mm512_set1_ps(7.0710677e-01f), ifft4904);
__m512 ifft4833 = _mm512_add_ps(ifft4829, ifft4830);
__m512 ifft4917 = _mm512_add_ps(ifft4913, ifft4914);
__m512 ifft4834 = _mm512_sub_ps(ifft4829, ifft4830);
__m512 ifft4918 = _mm512_sub_ps(ifft4913, ifft4914);
__m512 ifft4835 = _mm512_add_ps(ifft4831, ifft4832);
__m512 ifft4919 = _mm512_add_ps(ifft4915, ifft4916);
__m512 ifft4836 = _mm512_sub_ps(ifft4831, ifft4832);
__m512 ifft4920 = _mm512_sub_ps(ifft4915, ifft4916);
__m512 ifft4837 = _mm512_fmadd_ps(ifft4833, _mm512_set1_ps(1.5625e-02f), ifft4823);
__m512 ifft4921 = _mm512_fmadd_ps(ifft4917, _mm512_set1_ps(1.5625e-02f), ifft4907);
__m512 ifft4838 = _mm512_fnmadd_ps(ifft4833, _mm512_set1_ps(1.5625e-02f), ifft4823);
__m512 ifft4922 = _mm512_fnmadd_ps(ifft4917, _mm512_set1_ps(1.5625e-02f), ifft4907);
__m512 ifft4839 = _mm512_fmadd_ps(ifft4835, _mm512_set1_ps(1.5625e-02f), ifft4825);
__m512 ifft4923 = _mm512_fmadd_ps(ifft4919, _mm512_set1_ps(1.5625e-02f), ifft4909);
__m512 ifft4840 = _mm512_fnmadd_ps(ifft4835, _mm512_set1_ps(1.5625e-02f), ifft4825);
__m512 ifft4924 = _mm512_fnmadd_ps(ifft4919, _mm512_set1_ps(1.5625e-02f), ifft4909);
__m512 ifft4841 = _mm512_fnmadd_ps(ifft4836, _mm512_set1_ps(1.5625e-02f), ifft4824);
__m512 ifft4925 = _mm512_fnmadd_ps(ifft4920, _mm512_set1_ps(1.5625e-02f), ifft4908);
__m512 ifft4842 = _mm512_fmadd_ps(ifft4836, _mm512_set1_ps(1.5625e-02f), ifft4824);
__m512 ifft4926 = _mm512_fmadd_ps(ifft4920, _mm512_set1_ps(1.5625e-02f), ifft4908);
__m512 ifft4843 = _mm512_fmadd_ps(ifft4834, _mm512_set1_ps(1.5625e-02f), ifft4826);
__m512 ifft4927 = _mm512_fmadd_ps(ifft4918, _mm512_set1_ps(1.5625e-02f), ifft4910);
__m512 ifft4844 = _mm512_fnmadd_ps(ifft4834, _mm512_set1_ps(1.5625e-02f), ifft4826);
__m512 ifft4928 = _mm512_fnmadd_ps(ifft4918, _mm512_set1_ps(1.5625e-02f), ifft4910);
// Of the eight results per data set, five are kept (dat860..864 for the first
// set, dat865..869 for the second). The other three are computed but unused;
// the (void) casts mark them as intentionally discarded.
__m512 dat860 = ifft4837;
__m512 dat865 = ifft4921;
__m512 dat861 = ifft4839;
__m512 dat866 = ifft4923;
__m512 dat862 = ifft4841;
__m512 dat867 = ifft4925;
__m512 dat863 = ifft4843;
__m512 dat868 = ifft4927;
__m512 dat864 = ifft4838;
__m512 dat869 = ifft4922;
(void)ifft4840;
(void)ifft4924;
(void)ifft4842;
(void)ifft4926;
(void)ifft4844;
(void)ifft4928;
// Clamp every kept vector at zero: max(0, x) per lane (ReLU).
dat860 = _mm512_max_ps(_mm512_setzero_ps(), dat860);
dat865 = _mm512_max_ps(_mm512_setzero_ps(), dat865);
dat861 = _mm512_max_ps(_mm512_setzero_ps(), dat861);
dat866 = _mm512_max_ps(_mm512_setzero_ps(), dat866);
dat862 = _mm512_max_ps(_mm512_setzero_ps(), dat862);
dat867 = _mm512_max_ps(_mm512_setzero_ps(), dat867);
dat863 = _mm512_max_ps(_mm512_setzero_ps(), dat863);
dat868 = _mm512_max_ps(_mm512_setzero_ps(), dat868);
dat864 = _mm512_max_ps(_mm512_setzero_ps(), dat864);
dat869 = _mm512_max_ps(_mm512_setzero_ps(), dat869);
// Masked scatter into datPtr2. Each vector is stored twice, to two different
// offsets, with disjoint lane masks:
//   first-set vectors  (dat860..864): mask 3    -> lanes 0-1,
//                                     mask 7936 -> lanes 8-12 (0x1F00)
//   second-set vectors (dat865..869): mask 31   -> lanes 0-4,
//                                     mask 768  -> lanes 8-9  (0x0300)
// Only the selected lanes are written; a lane k lands at (address + 4*k).
// The paired offsets differ by about 50240, which is the same constant the
// r17/k40 strides are multiples of (100480 = 2*50240, 200960 = 4*50240).
// NOTE(review): presumably lanes 0-7 and 8-15 belong to two adjacent output
// channels whose planes are 50240 bytes apart — confirm with the tensor layout.
_mm512_mask_storeu_ps(datPtr2+80+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 3, dat860);
_mm512_mask_storeu_ps(datPtr2+52088+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 7936, dat860);
_mm512_mask_storeu_ps(datPtr2+1880+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 31, dat865);
_mm512_mask_storeu_ps(datPtr2+50288+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 768, dat865);
_mm512_mask_storeu_ps(datPtr2+528+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 3, dat861);
_mm512_mask_storeu_ps(datPtr2+52536+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 7936, dat861);
_mm512_mask_storeu_ps(datPtr2+2328+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 31, dat866);
_mm512_mask_storeu_ps(datPtr2+50736+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 768, dat866);
_mm512_mask_storeu_ps(datPtr2+976+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 3, dat862);
_mm512_mask_storeu_ps(datPtr2+52984+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 7936, dat862);
_mm512_mask_storeu_ps(datPtr2+2776+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 31, dat867);
_mm512_mask_storeu_ps(datPtr2+51184+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 768, dat867);
_mm512_mask_storeu_ps(datPtr2+1424+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 3, dat863);
_mm512_mask_storeu_ps(datPtr2+53432+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 7936, dat863);
_mm512_mask_storeu_ps(datPtr2+3224+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 31, dat868);
_mm512_mask_storeu_ps(datPtr2+51632+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 768, dat868);
_mm512_mask_storeu_ps(datPtr2+1872+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 3, dat864);
_mm512_mask_storeu_ps(datPtr2+53880+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 7936, dat864);
_mm512_mask_storeu_ps(datPtr2+3672+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 31, dat869);
_mm512_mask_storeu_ps(datPtr2+52080+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 768, dat869);
}
}
if (j5 >= last2) return;
++j5;
}
j5 = 84;
}
ptrdiff_t rel6 = j5-84;
ptrdiff_t base6 = 105;
if (rel6 < 1) {
ptrdiff_t toH17 = base6+0;
ptrdiff_t toW17 = 105;
ptrdiff_t k41 = 16*w21;
for (; k41 != 16; ++k41) {
ptrdiff_t r18 = 0;
for (; r18 != 2; ++r18) {
ptrdiff_t t30 = 0;
__m512 sfRe281 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm281 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe285 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm285 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe282 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm282 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe286 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm286 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe283 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm283 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe287 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm287 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe284 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm284 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe288 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm288 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512i ifft4929 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft4930 = _mm512_permutexvar_ps(ifft4929, sfRe281);
__m512 ifft5021 = _mm512_permutexvar_ps(ifft4929, sfRe285);
__m512i ifft4931 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4932 = _mm512_permutexvar_ps(ifft4931, sfRe281);
__m512 ifft5022 = _mm512_permutexvar_ps(ifft4931, sfRe285);
__m512 ifft4933 = _mm512_permutexvar_ps(ifft4929, sfIm281);
__m512 ifft5023 = _mm512_permutexvar_ps(ifft4929, sfIm285);
__m512 ifft4934 = _mm512_permutexvar_ps(ifft4931, sfIm281);
__m512 ifft5024 = _mm512_permutexvar_ps(ifft4931, sfIm285);
__m512 ifft4935 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft4936 = _mm512_mask_fmadd_ps(ifft4934, 65021, ifft4935, ifft4930);
__m512 ifft5025 = _mm512_mask_fmadd_ps(ifft5024, 65021, ifft4935, ifft5021);
__m512 ifft4937 = _mm512_mask_fnmadd_ps(ifft4933, 65021, ifft4935, ifft4932);
__m512 ifft5026 = _mm512_mask_fnmadd_ps(ifft5023, 65021, ifft4935, ifft5022);
__m512 ifft4938 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft4939 = _mm512_fmadd_ps(ifft4936, ifft4938, _mm512_shuffle_ps(ifft4936, ifft4936, 177));
__m512 ifft5027 = _mm512_fmadd_ps(ifft5025, ifft4938, _mm512_shuffle_ps(ifft5025, ifft5025, 177));
__m512 ifft4940 = _mm512_fmadd_ps(ifft4937, ifft4938, _mm512_shuffle_ps(ifft4937, ifft4937, 177));
__m512 ifft5028 = _mm512_fmadd_ps(ifft5026, ifft4938, _mm512_shuffle_ps(ifft5026, ifft5026, 177));
__m512 ifft4941 = _mm512_fmadd_ps(sfRe282, ifft4938, _mm512_shuffle_ps(sfRe282, sfRe282, 177));
__m512 ifft5029 = _mm512_fmadd_ps(sfRe286, ifft4938, _mm512_shuffle_ps(sfRe286, sfRe286, 177));
__m512 ifft4942 = _mm512_fmadd_ps(sfIm282, ifft4938, _mm512_shuffle_ps(sfIm282, sfIm282, 177));
__m512 ifft5030 = _mm512_fmadd_ps(sfIm286, ifft4938, _mm512_shuffle_ps(sfIm286, sfIm286, 177));
__m512 ifft4943 = _mm512_fmadd_ps(sfRe283, ifft4938, _mm512_shuffle_ps(sfRe283, sfRe283, 177));
__m512 ifft5031 = _mm512_fmadd_ps(sfRe287, ifft4938, _mm512_shuffle_ps(sfRe287, sfRe287, 177));
__m512 ifft4944 = _mm512_fmadd_ps(sfIm283, ifft4938, _mm512_shuffle_ps(sfIm283, sfIm283, 177));
__m512 ifft5032 = _mm512_fmadd_ps(sfIm287, ifft4938, _mm512_shuffle_ps(sfIm287, sfIm287, 177));
__m512 ifft4945 = _mm512_fmadd_ps(sfRe284, ifft4938, _mm512_shuffle_ps(sfRe284, sfRe284, 177));
__m512 ifft5033 = _mm512_fmadd_ps(sfRe288, ifft4938, _mm512_shuffle_ps(sfRe288, sfRe288, 177));
__m512 ifft4946 = _mm512_fmadd_ps(sfIm284, ifft4938, _mm512_shuffle_ps(sfIm284, sfIm284, 177));
__m512 ifft5034 = _mm512_fmadd_ps(sfIm288, ifft4938, _mm512_shuffle_ps(sfIm288, sfIm288, 177));
__m512 ifft4947 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft4948 = _mm512_mul_ps(ifft4939, ifft4947);
__m512 ifft5035 = _mm512_mul_ps(ifft5027, ifft4947);
__m512 ifft4949 = _mm512_mul_ps(ifft4940, ifft4947);
__m512 ifft5036 = _mm512_mul_ps(ifft5028, ifft4947);
__m512 ifft4950 = _mm512_mul_ps(ifft4941, ifft4947);
__m512 ifft5037 = _mm512_mul_ps(ifft5029, ifft4947);
__m512 ifft4951 = _mm512_mul_ps(ifft4942, ifft4947);
__m512 ifft5038 = _mm512_mul_ps(ifft5030, ifft4947);
__m512 ifft4952 = _mm512_mul_ps(ifft4943, ifft4947);
__m512 ifft5039 = _mm512_mul_ps(ifft5031, ifft4947);
__m512 ifft4953 = _mm512_mul_ps(ifft4944, ifft4947);
__m512 ifft5040 = _mm512_mul_ps(ifft5032, ifft4947);
__m512 ifft4954 = _mm512_mul_ps(ifft4945, ifft4947);
__m512 ifft5041 = _mm512_mul_ps(ifft5033, ifft4947);
__m512 ifft4955 = _mm512_mul_ps(ifft4946, ifft4947);
__m512 ifft5042 = _mm512_mul_ps(ifft5034, ifft4947);
__m512 ifft4956 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft4957 = _mm512_fnmadd_ps(ifft4940, ifft4956, ifft4948);
__m512 ifft5043 = _mm512_fnmadd_ps(ifft5028, ifft4956, ifft5035);
__m512 ifft4958 = _mm512_fmadd_ps(ifft4939, ifft4956, ifft4949);
__m512 ifft5044 = _mm512_fmadd_ps(ifft5027, ifft4956, ifft5036);
__m512 ifft4959 = _mm512_fnmadd_ps(ifft4942, ifft4956, ifft4950);
__m512 ifft5045 = _mm512_fnmadd_ps(ifft5030, ifft4956, ifft5037);
__m512 ifft4960 = _mm512_fmadd_ps(ifft4941, ifft4956, ifft4951);
__m512 ifft5046 = _mm512_fmadd_ps(ifft5029, ifft4956, ifft5038);
__m512 ifft4961 = _mm512_fnmadd_ps(ifft4944, ifft4956, ifft4952);
__m512 ifft5047 = _mm512_fnmadd_ps(ifft5032, ifft4956, ifft5039);
__m512 ifft4962 = _mm512_fmadd_ps(ifft4943, ifft4956, ifft4953);
__m512 ifft5048 = _mm512_fmadd_ps(ifft5031, ifft4956, ifft5040);
__m512 ifft4963 = _mm512_fnmadd_ps(ifft4946, ifft4956, ifft4954);
__m512 ifft5049 = _mm512_fnmadd_ps(ifft5034, ifft4956, ifft5041);
__m512 ifft4964 = _mm512_fmadd_ps(ifft4945, ifft4956, ifft4955);
__m512 ifft5050 = _mm512_fmadd_ps(ifft5033, ifft4956, ifft5042);
__m512 ifft4965 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft4966 = _mm512_fmadd_ps(ifft4957, ifft4965, _mm512_shuffle_ps(ifft4957, ifft4957, 78));
__m512 ifft5051 = _mm512_fmadd_ps(ifft5043, ifft4965, _mm512_shuffle_ps(ifft5043, ifft5043, 78));
__m512 ifft4967 = _mm512_fmadd_ps(ifft4958, ifft4965, _mm512_shuffle_ps(ifft4958, ifft4958, 78));
__m512 ifft5052 = _mm512_fmadd_ps(ifft5044, ifft4965, _mm512_shuffle_ps(ifft5044, ifft5044, 78));
__m512 ifft4968 = _mm512_fmadd_ps(ifft4959, ifft4965, _mm512_shuffle_ps(ifft4959, ifft4959, 78));
__m512 ifft5053 = _mm512_fmadd_ps(ifft5045, ifft4965, _mm512_shuffle_ps(ifft5045, ifft5045, 78));
__m512 ifft4969 = _mm512_fmadd_ps(ifft4960, ifft4965, _mm512_shuffle_ps(ifft4960, ifft4960, 78));
__m512 ifft5054 = _mm512_fmadd_ps(ifft5046, ifft4965, _mm512_shuffle_ps(ifft5046, ifft5046, 78));
__m512 ifft4970 = _mm512_fmadd_ps(ifft4961, ifft4965, _mm512_shuffle_ps(ifft4961, ifft4961, 78));
__m512 ifft5055 = _mm512_fmadd_ps(ifft5047, ifft4965, _mm512_shuffle_ps(ifft5047, ifft5047, 78));
__m512 ifft4971 = _mm512_fmadd_ps(ifft4962, ifft4965, _mm512_shuffle_ps(ifft4962, ifft4962, 78));
__m512 ifft5056 = _mm512_fmadd_ps(ifft5048, ifft4965, _mm512_shuffle_ps(ifft5048, ifft5048, 78));
__m512 ifft4972 = _mm512_fmadd_ps(ifft4963, ifft4965, _mm512_shuffle_ps(ifft4963, ifft4963, 78));
__m512 ifft5057 = _mm512_fmadd_ps(ifft5049, ifft4965, _mm512_shuffle_ps(ifft5049, ifft5049, 78));
__m512 ifft4973 = _mm512_fmadd_ps(ifft4964, ifft4965, _mm512_shuffle_ps(ifft4964, ifft4964, 78));
__m512 ifft5058 = _mm512_fmadd_ps(ifft5050, ifft4965, _mm512_shuffle_ps(ifft5050, ifft5050, 78));
__m512 ifft4974 = _mm512_mask_sub_ps(ifft4966, 49344, _mm512_setzero_ps(), ifft4967);
__m512 ifft5059 = _mm512_mask_sub_ps(ifft5051, 49344, _mm512_setzero_ps(), ifft5052);
__m512 ifft4975 = _mm512_mask_mov_ps(ifft4967, 49344, ifft4966);
__m512 ifft5060 = _mm512_mask_mov_ps(ifft5052, 49344, ifft5051);
__m512 ifft4976 = _mm512_mask_sub_ps(ifft4968, 49344, _mm512_setzero_ps(), ifft4969);
__m512 ifft5061 = _mm512_mask_sub_ps(ifft5053, 49344, _mm512_setzero_ps(), ifft5054);
__m512 ifft4977 = _mm512_mask_mov_ps(ifft4969, 49344, ifft4968);
__m512 ifft5062 = _mm512_mask_mov_ps(ifft5054, 49344, ifft5053);
__m512 ifft4978 = _mm512_mask_sub_ps(ifft4970, 49344, _mm512_setzero_ps(), ifft4971);
__m512 ifft5063 = _mm512_mask_sub_ps(ifft5055, 49344, _mm512_setzero_ps(), ifft5056);
__m512 ifft4979 = _mm512_mask_mov_ps(ifft4971, 49344, ifft4970);
__m512 ifft5064 = _mm512_mask_mov_ps(ifft5056, 49344, ifft5055);
__m512 ifft4980 = _mm512_mask_sub_ps(ifft4972, 49344, _mm512_setzero_ps(), ifft4973);
__m512 ifft5065 = _mm512_mask_sub_ps(ifft5057, 49344, _mm512_setzero_ps(), ifft5058);
__m512 ifft4981 = _mm512_mask_mov_ps(ifft4973, 49344, ifft4972);
__m512 ifft5066 = _mm512_mask_mov_ps(ifft5058, 49344, ifft5057);
__m512 ifft4982 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft4983 = _mm512_fmadd_ps(ifft4974, ifft4982, _mm512_shuffle_f32x4(ifft4974, ifft4974, 177));
__m512 ifft5067 = _mm512_fmadd_ps(ifft5059, ifft4982, _mm512_shuffle_f32x4(ifft5059, ifft5059, 177));
__m512 ifft4984 = _mm512_fmadd_ps(ifft4975, ifft4982, _mm512_shuffle_f32x4(ifft4975, ifft4975, 177));
__m512 ifft5068 = _mm512_fmadd_ps(ifft5060, ifft4982, _mm512_shuffle_f32x4(ifft5060, ifft5060, 177));
__m512 ifft4985 = _mm512_fmadd_ps(ifft4976, ifft4982, _mm512_shuffle_f32x4(ifft4976, ifft4976, 177));
__m512 ifft5069 = _mm512_fmadd_ps(ifft5061, ifft4982, _mm512_shuffle_f32x4(ifft5061, ifft5061, 177));
__m512 ifft4986 = _mm512_fmadd_ps(ifft4977, ifft4982, _mm512_shuffle_f32x4(ifft4977, ifft4977, 177));
__m512 ifft5070 = _mm512_fmadd_ps(ifft5062, ifft4982, _mm512_shuffle_f32x4(ifft5062, ifft5062, 177));
__m512 ifft4987 = _mm512_fmadd_ps(ifft4978, ifft4982, _mm512_shuffle_f32x4(ifft4978, ifft4978, 177));
__m512 ifft5071 = _mm512_fmadd_ps(ifft5063, ifft4982, _mm512_shuffle_f32x4(ifft5063, ifft5063, 177));
__m512 ifft4988 = _mm512_fnmsub_ps(ifft4979, ifft4982, _mm512_shuffle_f32x4(ifft4979, ifft4979, 177));
__m512 ifft5072 = _mm512_fnmsub_ps(ifft5064, ifft4982, _mm512_shuffle_f32x4(ifft5064, ifft5064, 177));
__m512 ifft4989 = _mm512_fmadd_ps(ifft4980, ifft4982, _mm512_shuffle_f32x4(ifft4980, ifft4980, 177));
__m512 ifft5073 = _mm512_fmadd_ps(ifft5065, ifft4982, _mm512_shuffle_f32x4(ifft5065, ifft5065, 177));
__m512 ifft4990 = _mm512_fmadd_ps(ifft4981, ifft4982, _mm512_shuffle_f32x4(ifft4981, ifft4981, 177));
__m512 ifft5074 = _mm512_fmadd_ps(ifft5066, ifft4982, _mm512_shuffle_f32x4(ifft5066, ifft5066, 177));
__m512 ifft4991 = _mm512_add_ps(ifft4983, ifft4984);
__m512 ifft5075 = _mm512_add_ps(ifft5067, ifft5068);
__m512 ifft4992 = _mm512_sub_ps(ifft4983, ifft4984);
__m512 ifft5076 = _mm512_sub_ps(ifft5067, ifft5068);
__m512 ifft4993 = _mm512_sub_ps(ifft4985, ifft4989);
__m512 ifft5077 = _mm512_sub_ps(ifft5069, ifft5073);
__m512 ifft4994 = _mm512_add_ps(ifft4986, ifft4990);
__m512 ifft5078 = _mm512_add_ps(ifft5070, ifft5074);
__m512 ifft4995 = _mm512_add_ps(ifft4985, ifft4989);
__m512 ifft5079 = _mm512_add_ps(ifft5069, ifft5073);
__m512 ifft4996 = _mm512_sub_ps(ifft4986, ifft4990);
__m512 ifft5080 = _mm512_sub_ps(ifft5070, ifft5074);
__m512 ifft4997 = _mm512_mul_ps(ifft4987, _mm512_set1_ps(3.125e-02f));
__m512 ifft5081 = _mm512_mul_ps(ifft5071, _mm512_set1_ps(3.125e-02f));
__m512 ifft4998 = _mm512_mul_ps(ifft4988, _mm512_set1_ps(3.125e-02f));
__m512 ifft5082 = _mm512_mul_ps(ifft5072, _mm512_set1_ps(3.125e-02f));
__m512 ifft4999 = _mm512_fmadd_ps(ifft4991, _mm512_set1_ps(1.5625e-02f), ifft4997);
__m512 ifft5083 = _mm512_fmadd_ps(ifft5075, _mm512_set1_ps(1.5625e-02f), ifft5081);
__m512 ifft5000 = _mm512_fmsub_ps(ifft4991, _mm512_set1_ps(1.5625e-02f), ifft4997);
__m512 ifft5084 = _mm512_fmsub_ps(ifft5075, _mm512_set1_ps(1.5625e-02f), ifft5081);
__m512 ifft5001 = _mm512_fmadd_ps(ifft4992, _mm512_set1_ps(1.5625e-02f), ifft4998);
__m512 ifft5085 = _mm512_fmadd_ps(ifft5076, _mm512_set1_ps(1.5625e-02f), ifft5082);
__m512 ifft5002 = _mm512_fmsub_ps(ifft4992, _mm512_set1_ps(1.5625e-02f), ifft4998);
__m512 ifft5086 = _mm512_fmsub_ps(ifft5076, _mm512_set1_ps(1.5625e-02f), ifft5082);
__m512 ifft5003 = _mm512_add_ps(ifft4993, ifft4994);
__m512 ifft5087 = _mm512_add_ps(ifft5077, ifft5078);
__m512 ifft5004 = _mm512_sub_ps(ifft4993, ifft4994);
__m512 ifft5088 = _mm512_sub_ps(ifft5077, ifft5078);
__m512 ifft5005 = _mm512_fnmadd_ps(ifft5003, _mm512_set1_ps(7.0710677e-01f), ifft4995);
__m512 ifft5089 = _mm512_fnmadd_ps(ifft5087, _mm512_set1_ps(7.0710677e-01f), ifft5079);
__m512 ifft5006 = _mm512_fmadd_ps(ifft5003, _mm512_set1_ps(7.0710677e-01f), ifft4995);
__m512 ifft5090 = _mm512_fmadd_ps(ifft5087, _mm512_set1_ps(7.0710677e-01f), ifft5079);
__m512 ifft5007 = _mm512_fmadd_ps(ifft5004, _mm512_set1_ps(7.0710677e-01f), ifft4996);
__m512 ifft5091 = _mm512_fmadd_ps(ifft5088, _mm512_set1_ps(7.0710677e-01f), ifft5080);
__m512 ifft5008 = _mm512_fmsub_ps(ifft5004, _mm512_set1_ps(7.0710677e-01f), ifft4996);
__m512 ifft5092 = _mm512_fmsub_ps(ifft5088, _mm512_set1_ps(7.0710677e-01f), ifft5080);
__m512 ifft5009 = _mm512_add_ps(ifft5005, ifft5006);
__m512 ifft5093 = _mm512_add_ps(ifft5089, ifft5090);
__m512 ifft5010 = _mm512_sub_ps(ifft5005, ifft5006);
__m512 ifft5094 = _mm512_sub_ps(ifft5089, ifft5090);
__m512 ifft5011 = _mm512_add_ps(ifft5007, ifft5008);
__m512 ifft5095 = _mm512_add_ps(ifft5091, ifft5092);
__m512 ifft5012 = _mm512_sub_ps(ifft5007, ifft5008);
__m512 ifft5096 = _mm512_sub_ps(ifft5091, ifft5092);
__m512 ifft5013 = _mm512_fmadd_ps(ifft5009, _mm512_set1_ps(1.5625e-02f), ifft4999);
__m512 ifft5097 = _mm512_fmadd_ps(ifft5093, _mm512_set1_ps(1.5625e-02f), ifft5083);
__m512 ifft5014 = _mm512_fnmadd_ps(ifft5009, _mm512_set1_ps(1.5625e-02f), ifft4999);
__m512 ifft5098 = _mm512_fnmadd_ps(ifft5093, _mm512_set1_ps(1.5625e-02f), ifft5083);
__m512 ifft5015 = _mm512_fmadd_ps(ifft5011, _mm512_set1_ps(1.5625e-02f), ifft5001);
__m512 ifft5099 = _mm512_fmadd_ps(ifft5095, _mm512_set1_ps(1.5625e-02f), ifft5085);
__m512 ifft5016 = _mm512_fnmadd_ps(ifft5011, _mm512_set1_ps(1.5625e-02f), ifft5001);
__m512 ifft5100 = _mm512_fnmadd_ps(ifft5095, _mm512_set1_ps(1.5625e-02f), ifft5085);
__m512 ifft5017 = _mm512_fnmadd_ps(ifft5012, _mm512_set1_ps(1.5625e-02f), ifft5000);
__m512 ifft5101 = _mm512_fnmadd_ps(ifft5096, _mm512_set1_ps(1.5625e-02f), ifft5084);
__m512 ifft5018 = _mm512_fmadd_ps(ifft5012, _mm512_set1_ps(1.5625e-02f), ifft5000);
__m512 ifft5102 = _mm512_fmadd_ps(ifft5096, _mm512_set1_ps(1.5625e-02f), ifft5084);
__m512 ifft5019 = _mm512_fmadd_ps(ifft5010, _mm512_set1_ps(1.5625e-02f), ifft5002);
__m512 ifft5103 = _mm512_fmadd_ps(ifft5094, _mm512_set1_ps(1.5625e-02f), ifft5086);
__m512 ifft5020 = _mm512_fnmadd_ps(ifft5010, _mm512_set1_ps(1.5625e-02f), ifft5002);
__m512 ifft5104 = _mm512_fnmadd_ps(ifft5094, _mm512_set1_ps(1.5625e-02f), ifft5086);
__m512 dat870 = ifft5013;
__m512 dat875 = ifft5097;
__m512 dat871 = ifft5015;
__m512 dat876 = ifft5099;
__m512 dat872 = ifft5017;
__m512 dat877 = ifft5101;
__m512 dat873 = ifft5019;
__m512 dat878 = ifft5103;
__m512 dat874 = ifft5014;
__m512 dat879 = ifft5098;
(void)ifft5016;
(void)ifft5100;
(void)ifft5018;
(void)ifft5102;
(void)ifft5020;
(void)ifft5104;
__m512i pm49 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack241 = _mm512_permutex2var_ps(dat870, pm49, dat875);
__m512i pm50 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack242 = _mm512_permutex2var_ps(dat870, pm50, dat875);
__m512 pack243 = _mm512_permutex2var_ps(dat871, pm49, dat876);
__m512 pack244 = _mm512_permutex2var_ps(dat871, pm50, dat876);
__m512 pack245 = _mm512_permutex2var_ps(dat872, pm49, dat877);
__m512 pack246 = _mm512_permutex2var_ps(dat872, pm50, dat877);
__m512 pack247 = _mm512_permutex2var_ps(dat873, pm49, dat878);
__m512 pack248 = _mm512_permutex2var_ps(dat873, pm50, dat878);
__m512 pack249 = _mm512_permutex2var_ps(dat874, pm49, dat879);
__m512 pack250 = _mm512_permutex2var_ps(dat874, pm50, dat879);
pack241 = _mm512_max_ps(_mm512_setzero_ps(), pack241);
pack242 = _mm512_max_ps(_mm512_setzero_ps(), pack242);
pack243 = _mm512_max_ps(_mm512_setzero_ps(), pack243);
pack244 = _mm512_max_ps(_mm512_setzero_ps(), pack244);
pack245 = _mm512_max_ps(_mm512_setzero_ps(), pack245);
pack246 = _mm512_max_ps(_mm512_setzero_ps(), pack246);
pack247 = _mm512_max_ps(_mm512_setzero_ps(), pack247);
pack248 = _mm512_max_ps(_mm512_setzero_ps(), pack248);
pack249 = _mm512_max_ps(_mm512_setzero_ps(), pack249);
pack250 = _mm512_max_ps(_mm512_setzero_ps(), pack250);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack241);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack242);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack243);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack244);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack245);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack246);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack247);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack248);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack249);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack250);
// ---- Tile t31: load 16 vectors from sfPtr3, run the inverse-transform
// ---- butterfly network on two interleaved data sets, pack, clamp at zero,
// ---- and store 10 lanes per row to datPtr2.
// t31 is fixed at 0 here; it scales the sfPtr3 offsets by 256 and the
// datPtr2 offsets by 0 (so it never affects the store addresses).
ptrdiff_t t31 = 0;
// Sixteen unaligned 16-float loads. Four base offsets (256, 2167040, 4333824,
// 6500608) each supply four consecutive vectors spaced 64 apart.
// NOTE(review): the sfRe*/sfIm* names suggest real/imaginary halves of
// frequency-domain sums -- confirm against the code that fills sfPtr3.
__m512 sfRe289 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm289 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfRe293 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm293 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfRe290 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm290 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfRe294 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm294 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfRe291 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm291 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfRe295 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm295 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfRe292 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm292 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfRe296 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm296 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
// First-row pre-processing (only the 289/293 vectors): two lane permutations
// of the Re and Im vectors are combined with a masked fused multiply-add.
// Mask 65021 == 0xFDFD covers every lane except 1 and 9; those two lanes keep
// the first operand (ifft5110 / ifft5109 and their 52xx twins) unchanged.
// The multiplier ifft5111 is 0 in lanes 0,1,8,9 and alternates -1,+1 elsewhere.
// NOTE(review): presumably this unpacks a packed real-FFT layout -- confirm.
__m512i ifft5105 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft5106 = _mm512_permutexvar_ps(ifft5105, sfRe289);
__m512 ifft5197 = _mm512_permutexvar_ps(ifft5105, sfRe293);
__m512i ifft5107 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft5108 = _mm512_permutexvar_ps(ifft5107, sfRe289);
__m512 ifft5198 = _mm512_permutexvar_ps(ifft5107, sfRe293);
__m512 ifft5109 = _mm512_permutexvar_ps(ifft5105, sfIm289);
__m512 ifft5199 = _mm512_permutexvar_ps(ifft5105, sfIm293);
__m512 ifft5110 = _mm512_permutexvar_ps(ifft5107, sfIm289);
__m512 ifft5200 = _mm512_permutexvar_ps(ifft5107, sfIm293);
__m512 ifft5111 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft5112 = _mm512_mask_fmadd_ps(ifft5110, 65021, ifft5111, ifft5106);
__m512 ifft5201 = _mm512_mask_fmadd_ps(ifft5200, 65021, ifft5111, ifft5197);
__m512 ifft5113 = _mm512_mask_fnmadd_ps(ifft5109, 65021, ifft5111, ifft5108);
__m512 ifft5202 = _mm512_mask_fnmadd_ps(ifft5199, 65021, ifft5111, ifft5198);
// Distance-1 butterfly inside each vector: shuffle control 177 (0xB1) swaps
// adjacent float pairs, and the +1/-1 multiplier yields (x0+x1, x0-x1) per pair.
__m512 ifft5114 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft5115 = _mm512_fmadd_ps(ifft5112, ifft5114, _mm512_shuffle_ps(ifft5112, ifft5112, 177));
__m512 ifft5203 = _mm512_fmadd_ps(ifft5201, ifft5114, _mm512_shuffle_ps(ifft5201, ifft5201, 177));
__m512 ifft5116 = _mm512_fmadd_ps(ifft5113, ifft5114, _mm512_shuffle_ps(ifft5113, ifft5113, 177));
__m512 ifft5204 = _mm512_fmadd_ps(ifft5202, ifft5114, _mm512_shuffle_ps(ifft5202, ifft5202, 177));
__m512 ifft5117 = _mm512_fmadd_ps(sfRe290, ifft5114, _mm512_shuffle_ps(sfRe290, sfRe290, 177));
__m512 ifft5205 = _mm512_fmadd_ps(sfRe294, ifft5114, _mm512_shuffle_ps(sfRe294, sfRe294, 177));
__m512 ifft5118 = _mm512_fmadd_ps(sfIm290, ifft5114, _mm512_shuffle_ps(sfIm290, sfIm290, 177));
__m512 ifft5206 = _mm512_fmadd_ps(sfIm294, ifft5114, _mm512_shuffle_ps(sfIm294, sfIm294, 177));
__m512 ifft5119 = _mm512_fmadd_ps(sfRe291, ifft5114, _mm512_shuffle_ps(sfRe291, sfRe291, 177));
__m512 ifft5207 = _mm512_fmadd_ps(sfRe295, ifft5114, _mm512_shuffle_ps(sfRe295, sfRe295, 177));
__m512 ifft5120 = _mm512_fmadd_ps(sfIm291, ifft5114, _mm512_shuffle_ps(sfIm291, sfIm291, 177));
__m512 ifft5208 = _mm512_fmadd_ps(sfIm295, ifft5114, _mm512_shuffle_ps(sfIm295, sfIm295, 177));
__m512 ifft5121 = _mm512_fmadd_ps(sfRe292, ifft5114, _mm512_shuffle_ps(sfRe292, sfRe292, 177));
__m512 ifft5209 = _mm512_fmadd_ps(sfRe296, ifft5114, _mm512_shuffle_ps(sfRe296, sfRe296, 177));
__m512 ifft5122 = _mm512_fmadd_ps(sfIm292, ifft5114, _mm512_shuffle_ps(sfIm292, sfIm292, 177));
__m512 ifft5210 = _mm512_fmadd_ps(sfIm296, ifft5114, _mm512_shuffle_ps(sfIm296, sfIm296, 177));
// Per-lane complex multiply of each (a, b) vector pair by the constant
// (c, d) = (ifft5123, ifft5132). First half: the products a*c and b*c.
__m512 ifft5123 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft5124 = _mm512_mul_ps(ifft5115, ifft5123);
__m512 ifft5211 = _mm512_mul_ps(ifft5203, ifft5123);
__m512 ifft5125 = _mm512_mul_ps(ifft5116, ifft5123);
__m512 ifft5212 = _mm512_mul_ps(ifft5204, ifft5123);
__m512 ifft5126 = _mm512_mul_ps(ifft5117, ifft5123);
__m512 ifft5213 = _mm512_mul_ps(ifft5205, ifft5123);
__m512 ifft5127 = _mm512_mul_ps(ifft5118, ifft5123);
__m512 ifft5214 = _mm512_mul_ps(ifft5206, ifft5123);
__m512 ifft5128 = _mm512_mul_ps(ifft5119, ifft5123);
__m512 ifft5215 = _mm512_mul_ps(ifft5207, ifft5123);
__m512 ifft5129 = _mm512_mul_ps(ifft5120, ifft5123);
__m512 ifft5216 = _mm512_mul_ps(ifft5208, ifft5123);
__m512 ifft5130 = _mm512_mul_ps(ifft5121, ifft5123);
__m512 ifft5217 = _mm512_mul_ps(ifft5209, ifft5123);
__m512 ifft5131 = _mm512_mul_ps(ifft5122, ifft5123);
__m512 ifft5218 = _mm512_mul_ps(ifft5210, ifft5123);
// Second half of the complex multiply: first result = a*c - b*d (fnmadd),
// second result = a*d + b*c (fmadd).
__m512 ifft5132 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft5133 = _mm512_fnmadd_ps(ifft5116, ifft5132, ifft5124);
__m512 ifft5219 = _mm512_fnmadd_ps(ifft5204, ifft5132, ifft5211);
__m512 ifft5134 = _mm512_fmadd_ps(ifft5115, ifft5132, ifft5125);
__m512 ifft5220 = _mm512_fmadd_ps(ifft5203, ifft5132, ifft5212);
__m512 ifft5135 = _mm512_fnmadd_ps(ifft5118, ifft5132, ifft5126);
__m512 ifft5221 = _mm512_fnmadd_ps(ifft5206, ifft5132, ifft5213);
__m512 ifft5136 = _mm512_fmadd_ps(ifft5117, ifft5132, ifft5127);
__m512 ifft5222 = _mm512_fmadd_ps(ifft5205, ifft5132, ifft5214);
__m512 ifft5137 = _mm512_fnmadd_ps(ifft5120, ifft5132, ifft5128);
__m512 ifft5223 = _mm512_fnmadd_ps(ifft5208, ifft5132, ifft5215);
__m512 ifft5138 = _mm512_fmadd_ps(ifft5119, ifft5132, ifft5129);
__m512 ifft5224 = _mm512_fmadd_ps(ifft5207, ifft5132, ifft5216);
__m512 ifft5139 = _mm512_fnmadd_ps(ifft5122, ifft5132, ifft5130);
__m512 ifft5225 = _mm512_fnmadd_ps(ifft5210, ifft5132, ifft5217);
__m512 ifft5140 = _mm512_fmadd_ps(ifft5121, ifft5132, ifft5131);
__m512 ifft5226 = _mm512_fmadd_ps(ifft5209, ifft5132, ifft5218);
// Distance-2 butterfly: shuffle control 78 (0x4E) swaps the two 64-bit halves
// of every 128-bit lane; the (+1,+1,-1,-1) multiplier gives sum in the low
// half and difference in the high half.
__m512 ifft5141 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft5142 = _mm512_fmadd_ps(ifft5133, ifft5141, _mm512_shuffle_ps(ifft5133, ifft5133, 78));
__m512 ifft5227 = _mm512_fmadd_ps(ifft5219, ifft5141, _mm512_shuffle_ps(ifft5219, ifft5219, 78));
__m512 ifft5143 = _mm512_fmadd_ps(ifft5134, ifft5141, _mm512_shuffle_ps(ifft5134, ifft5134, 78));
__m512 ifft5228 = _mm512_fmadd_ps(ifft5220, ifft5141, _mm512_shuffle_ps(ifft5220, ifft5220, 78));
__m512 ifft5144 = _mm512_fmadd_ps(ifft5135, ifft5141, _mm512_shuffle_ps(ifft5135, ifft5135, 78));
__m512 ifft5229 = _mm512_fmadd_ps(ifft5221, ifft5141, _mm512_shuffle_ps(ifft5221, ifft5221, 78));
__m512 ifft5145 = _mm512_fmadd_ps(ifft5136, ifft5141, _mm512_shuffle_ps(ifft5136, ifft5136, 78));
__m512 ifft5230 = _mm512_fmadd_ps(ifft5222, ifft5141, _mm512_shuffle_ps(ifft5222, ifft5222, 78));
__m512 ifft5146 = _mm512_fmadd_ps(ifft5137, ifft5141, _mm512_shuffle_ps(ifft5137, ifft5137, 78));
__m512 ifft5231 = _mm512_fmadd_ps(ifft5223, ifft5141, _mm512_shuffle_ps(ifft5223, ifft5223, 78));
__m512 ifft5147 = _mm512_fmadd_ps(ifft5138, ifft5141, _mm512_shuffle_ps(ifft5138, ifft5138, 78));
__m512 ifft5232 = _mm512_fmadd_ps(ifft5224, ifft5141, _mm512_shuffle_ps(ifft5224, ifft5224, 78));
__m512 ifft5148 = _mm512_fmadd_ps(ifft5139, ifft5141, _mm512_shuffle_ps(ifft5139, ifft5139, 78));
__m512 ifft5233 = _mm512_fmadd_ps(ifft5225, ifft5141, _mm512_shuffle_ps(ifft5225, ifft5225, 78));
__m512 ifft5149 = _mm512_fmadd_ps(ifft5140, ifft5141, _mm512_shuffle_ps(ifft5140, ifft5140, 78));
__m512 ifft5234 = _mm512_fmadd_ps(ifft5226, ifft5141, _mm512_shuffle_ps(ifft5226, ifft5226, 78));
// In lanes 6,7,14,15 only (mask 49344 == 0xC0C0) each (p, q) vector pair
// becomes (-q, p); all other lanes pass through unchanged. Treating the pair
// as (re, im), that is a multiplication by i in those lanes.
__m512 ifft5150 = _mm512_mask_sub_ps(ifft5142, 49344, _mm512_setzero_ps(), ifft5143);
__m512 ifft5235 = _mm512_mask_sub_ps(ifft5227, 49344, _mm512_setzero_ps(), ifft5228);
__m512 ifft5151 = _mm512_mask_mov_ps(ifft5143, 49344, ifft5142);
__m512 ifft5236 = _mm512_mask_mov_ps(ifft5228, 49344, ifft5227);
__m512 ifft5152 = _mm512_mask_sub_ps(ifft5144, 49344, _mm512_setzero_ps(), ifft5145);
__m512 ifft5237 = _mm512_mask_sub_ps(ifft5229, 49344, _mm512_setzero_ps(), ifft5230);
__m512 ifft5153 = _mm512_mask_mov_ps(ifft5145, 49344, ifft5144);
__m512 ifft5238 = _mm512_mask_mov_ps(ifft5230, 49344, ifft5229);
__m512 ifft5154 = _mm512_mask_sub_ps(ifft5146, 49344, _mm512_setzero_ps(), ifft5147);
__m512 ifft5239 = _mm512_mask_sub_ps(ifft5231, 49344, _mm512_setzero_ps(), ifft5232);
__m512 ifft5155 = _mm512_mask_mov_ps(ifft5147, 49344, ifft5146);
__m512 ifft5240 = _mm512_mask_mov_ps(ifft5232, 49344, ifft5231);
__m512 ifft5156 = _mm512_mask_sub_ps(ifft5148, 49344, _mm512_setzero_ps(), ifft5149);
__m512 ifft5241 = _mm512_mask_sub_ps(ifft5233, 49344, _mm512_setzero_ps(), ifft5234);
__m512 ifft5157 = _mm512_mask_mov_ps(ifft5149, 49344, ifft5148);
__m512 ifft5242 = _mm512_mask_mov_ps(ifft5234, 49344, ifft5233);
// Distance-4 butterfly: shuffle_f32x4 control 177 swaps neighbouring 128-bit
// lanes (0<->1, 2<->3); the multiplier is +1 in the even 128-bit lanes and -1
// in the odd ones. The ifft5164/ifft5248 line uses fnmsub instead of fmadd,
// so that one output is the negation of what fmadd would produce.
__m512 ifft5158 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft5159 = _mm512_fmadd_ps(ifft5150, ifft5158, _mm512_shuffle_f32x4(ifft5150, ifft5150, 177));
__m512 ifft5243 = _mm512_fmadd_ps(ifft5235, ifft5158, _mm512_shuffle_f32x4(ifft5235, ifft5235, 177));
__m512 ifft5160 = _mm512_fmadd_ps(ifft5151, ifft5158, _mm512_shuffle_f32x4(ifft5151, ifft5151, 177));
__m512 ifft5244 = _mm512_fmadd_ps(ifft5236, ifft5158, _mm512_shuffle_f32x4(ifft5236, ifft5236, 177));
__m512 ifft5161 = _mm512_fmadd_ps(ifft5152, ifft5158, _mm512_shuffle_f32x4(ifft5152, ifft5152, 177));
__m512 ifft5245 = _mm512_fmadd_ps(ifft5237, ifft5158, _mm512_shuffle_f32x4(ifft5237, ifft5237, 177));
__m512 ifft5162 = _mm512_fmadd_ps(ifft5153, ifft5158, _mm512_shuffle_f32x4(ifft5153, ifft5153, 177));
__m512 ifft5246 = _mm512_fmadd_ps(ifft5238, ifft5158, _mm512_shuffle_f32x4(ifft5238, ifft5238, 177));
__m512 ifft5163 = _mm512_fmadd_ps(ifft5154, ifft5158, _mm512_shuffle_f32x4(ifft5154, ifft5154, 177));
__m512 ifft5247 = _mm512_fmadd_ps(ifft5239, ifft5158, _mm512_shuffle_f32x4(ifft5239, ifft5239, 177));
__m512 ifft5164 = _mm512_fnmsub_ps(ifft5155, ifft5158, _mm512_shuffle_f32x4(ifft5155, ifft5155, 177));
__m512 ifft5248 = _mm512_fnmsub_ps(ifft5240, ifft5158, _mm512_shuffle_f32x4(ifft5240, ifft5240, 177));
__m512 ifft5165 = _mm512_fmadd_ps(ifft5156, ifft5158, _mm512_shuffle_f32x4(ifft5156, ifft5156, 177));
__m512 ifft5249 = _mm512_fmadd_ps(ifft5241, ifft5158, _mm512_shuffle_f32x4(ifft5241, ifft5241, 177));
__m512 ifft5166 = _mm512_fmadd_ps(ifft5157, ifft5158, _mm512_shuffle_f32x4(ifft5157, ifft5157, 177));
__m512 ifft5250 = _mm512_fmadd_ps(ifft5242, ifft5158, _mm512_shuffle_f32x4(ifft5242, ifft5242, 177));
// From here on the butterflies run across whole vectors (between the eight
// intermediate vectors of each data set) rather than across lanes.
__m512 ifft5167 = _mm512_add_ps(ifft5159, ifft5160);
__m512 ifft5251 = _mm512_add_ps(ifft5243, ifft5244);
__m512 ifft5168 = _mm512_sub_ps(ifft5159, ifft5160);
__m512 ifft5252 = _mm512_sub_ps(ifft5243, ifft5244);
__m512 ifft5169 = _mm512_sub_ps(ifft5161, ifft5165);
__m512 ifft5253 = _mm512_sub_ps(ifft5245, ifft5249);
__m512 ifft5170 = _mm512_add_ps(ifft5162, ifft5166);
__m512 ifft5254 = _mm512_add_ps(ifft5246, ifft5250);
__m512 ifft5171 = _mm512_add_ps(ifft5161, ifft5165);
__m512 ifft5255 = _mm512_add_ps(ifft5245, ifft5249);
__m512 ifft5172 = _mm512_sub_ps(ifft5162, ifft5166);
__m512 ifft5256 = _mm512_sub_ps(ifft5246, ifft5250);
// Scaling is folded into the butterflies: 3.125e-02 == 1/32 and
// 1.5625e-02 == 1/64.
// NOTE(review): presumably the inverse-transform normalisation -- confirm.
__m512 ifft5173 = _mm512_mul_ps(ifft5163, _mm512_set1_ps(3.125e-02f));
__m512 ifft5257 = _mm512_mul_ps(ifft5247, _mm512_set1_ps(3.125e-02f));
__m512 ifft5174 = _mm512_mul_ps(ifft5164, _mm512_set1_ps(3.125e-02f));
__m512 ifft5258 = _mm512_mul_ps(ifft5248, _mm512_set1_ps(3.125e-02f));
__m512 ifft5175 = _mm512_fmadd_ps(ifft5167, _mm512_set1_ps(1.5625e-02f), ifft5173);
__m512 ifft5259 = _mm512_fmadd_ps(ifft5251, _mm512_set1_ps(1.5625e-02f), ifft5257);
__m512 ifft5176 = _mm512_fmsub_ps(ifft5167, _mm512_set1_ps(1.5625e-02f), ifft5173);
__m512 ifft5260 = _mm512_fmsub_ps(ifft5251, _mm512_set1_ps(1.5625e-02f), ifft5257);
__m512 ifft5177 = _mm512_fmadd_ps(ifft5168, _mm512_set1_ps(1.5625e-02f), ifft5174);
__m512 ifft5261 = _mm512_fmadd_ps(ifft5252, _mm512_set1_ps(1.5625e-02f), ifft5258);
__m512 ifft5178 = _mm512_fmsub_ps(ifft5168, _mm512_set1_ps(1.5625e-02f), ifft5174);
__m512 ifft5262 = _mm512_fmsub_ps(ifft5252, _mm512_set1_ps(1.5625e-02f), ifft5258);
// Sum/difference, then combinations weighted by 7.0710677e-01 (~ 1/sqrt(2)).
__m512 ifft5179 = _mm512_add_ps(ifft5169, ifft5170);
__m512 ifft5263 = _mm512_add_ps(ifft5253, ifft5254);
__m512 ifft5180 = _mm512_sub_ps(ifft5169, ifft5170);
__m512 ifft5264 = _mm512_sub_ps(ifft5253, ifft5254);
__m512 ifft5181 = _mm512_fnmadd_ps(ifft5179, _mm512_set1_ps(7.0710677e-01f), ifft5171);
__m512 ifft5265 = _mm512_fnmadd_ps(ifft5263, _mm512_set1_ps(7.0710677e-01f), ifft5255);
__m512 ifft5182 = _mm512_fmadd_ps(ifft5179, _mm512_set1_ps(7.0710677e-01f), ifft5171);
__m512 ifft5266 = _mm512_fmadd_ps(ifft5263, _mm512_set1_ps(7.0710677e-01f), ifft5255);
__m512 ifft5183 = _mm512_fmadd_ps(ifft5180, _mm512_set1_ps(7.0710677e-01f), ifft5172);
__m512 ifft5267 = _mm512_fmadd_ps(ifft5264, _mm512_set1_ps(7.0710677e-01f), ifft5256);
__m512 ifft5184 = _mm512_fmsub_ps(ifft5180, _mm512_set1_ps(7.0710677e-01f), ifft5172);
__m512 ifft5268 = _mm512_fmsub_ps(ifft5264, _mm512_set1_ps(7.0710677e-01f), ifft5256);
__m512 ifft5185 = _mm512_add_ps(ifft5181, ifft5182);
__m512 ifft5269 = _mm512_add_ps(ifft5265, ifft5266);
__m512 ifft5186 = _mm512_sub_ps(ifft5181, ifft5182);
__m512 ifft5270 = _mm512_sub_ps(ifft5265, ifft5266);
__m512 ifft5187 = _mm512_add_ps(ifft5183, ifft5184);
__m512 ifft5271 = _mm512_add_ps(ifft5267, ifft5268);
__m512 ifft5188 = _mm512_sub_ps(ifft5183, ifft5184);
__m512 ifft5272 = _mm512_sub_ps(ifft5267, ifft5268);
// Final stage: eight output vectors per data set, each a 1/64-scaled term
// added to or subtracted from one of the pre-scaled partial results.
__m512 ifft5189 = _mm512_fmadd_ps(ifft5185, _mm512_set1_ps(1.5625e-02f), ifft5175);
__m512 ifft5273 = _mm512_fmadd_ps(ifft5269, _mm512_set1_ps(1.5625e-02f), ifft5259);
__m512 ifft5190 = _mm512_fnmadd_ps(ifft5185, _mm512_set1_ps(1.5625e-02f), ifft5175);
__m512 ifft5274 = _mm512_fnmadd_ps(ifft5269, _mm512_set1_ps(1.5625e-02f), ifft5259);
__m512 ifft5191 = _mm512_fmadd_ps(ifft5187, _mm512_set1_ps(1.5625e-02f), ifft5177);
__m512 ifft5275 = _mm512_fmadd_ps(ifft5271, _mm512_set1_ps(1.5625e-02f), ifft5261);
__m512 ifft5192 = _mm512_fnmadd_ps(ifft5187, _mm512_set1_ps(1.5625e-02f), ifft5177);
__m512 ifft5276 = _mm512_fnmadd_ps(ifft5271, _mm512_set1_ps(1.5625e-02f), ifft5261);
__m512 ifft5193 = _mm512_fnmadd_ps(ifft5188, _mm512_set1_ps(1.5625e-02f), ifft5176);
__m512 ifft5277 = _mm512_fnmadd_ps(ifft5272, _mm512_set1_ps(1.5625e-02f), ifft5260);
__m512 ifft5194 = _mm512_fmadd_ps(ifft5188, _mm512_set1_ps(1.5625e-02f), ifft5176);
__m512 ifft5278 = _mm512_fmadd_ps(ifft5272, _mm512_set1_ps(1.5625e-02f), ifft5260);
__m512 ifft5195 = _mm512_fmadd_ps(ifft5186, _mm512_set1_ps(1.5625e-02f), ifft5178);
__m512 ifft5279 = _mm512_fmadd_ps(ifft5270, _mm512_set1_ps(1.5625e-02f), ifft5262);
__m512 ifft5196 = _mm512_fnmadd_ps(ifft5186, _mm512_set1_ps(1.5625e-02f), ifft5178);
__m512 ifft5280 = _mm512_fnmadd_ps(ifft5270, _mm512_set1_ps(1.5625e-02f), ifft5262);
// Only the first two output vectors of each data set are kept for this tile;
// the remaining twelve are cast to void to mark them as deliberately unused.
__m512 dat880 = ifft5189;
__m512 dat882 = ifft5273;
__m512 dat881 = ifft5191;
__m512 dat883 = ifft5275;
(void)ifft5193;
(void)ifft5277;
(void)ifft5195;
(void)ifft5279;
(void)ifft5190;
(void)ifft5274;
(void)ifft5192;
(void)ifft5276;
(void)ifft5194;
(void)ifft5278;
(void)ifft5196;
(void)ifft5280;
// Two-source permutes interleave the paired data sets (index values >= 16
// select from the second source operand).
// pm51 -> lanes 0-4 = first[0..4],  lanes 5-9 = second[0..4]
// pm52 -> lanes 0-4 = second[8..12], lanes 5-9 = first[8..12]
// Lanes 10-15 are also populated but are never written out by the stores below.
__m512i pm51 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack251 = _mm512_permutex2var_ps(dat880, pm51, dat882);
__m512i pm52 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack252 = _mm512_permutex2var_ps(dat880, pm52, dat882);
__m512 pack253 = _mm512_permutex2var_ps(dat881, pm51, dat883);
__m512 pack254 = _mm512_permutex2var_ps(dat881, pm52, dat883);
// ReLU: clamp every lane to max(0, x).
pack251 = _mm512_max_ps(_mm512_setzero_ps(), pack251);
pack252 = _mm512_max_ps(_mm512_setzero_ps(), pack252);
pack253 = _mm512_max_ps(_mm512_setzero_ps(), pack253);
pack254 = _mm512_max_ps(_mm512_setzero_ps(), pack254);
// Masked stores write only the low 10 lanes (mask 1023 == 0x3FF) of each
// packed vector. The four destinations differ by 448 (pack251 vs pack253)
// and by 50240 (pack251 vs pack252).
// NOTE(review): presumably a row stride and a channel stride -- confirm
// against the datPtr2 layout.
_mm512_mask_storeu_ps(datPtr2+1820+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t31, 1023, pack251);
_mm512_mask_storeu_ps(datPtr2+52060+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t31, 1023, pack252);
_mm512_mask_storeu_ps(datPtr2+2268+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t31, 1023, pack253);
_mm512_mask_storeu_ps(datPtr2+52508+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t31, 1023, pack254);
ptrdiff_t t32 = 0;
__m512 sfRe297 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm297 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfRe301 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm301 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfRe298 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm298 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfRe302 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm302 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfRe299 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm299 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfRe303 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm303 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfRe300 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm300 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfRe304 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm304 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512i ifft5281 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft5282 = _mm512_permutexvar_ps(ifft5281, sfRe297);
__m512 ifft5373 = _mm512_permutexvar_ps(ifft5281, sfRe301);
__m512i ifft5283 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft5284 = _mm512_permutexvar_ps(ifft5283, sfRe297);
__m512 ifft5374 = _mm512_permutexvar_ps(ifft5283, sfRe301);
__m512 ifft5285 = _mm512_permutexvar_ps(ifft5281, sfIm297);
__m512 ifft5375 = _mm512_permutexvar_ps(ifft5281, sfIm301);
__m512 ifft5286 = _mm512_permutexvar_ps(ifft5283, sfIm297);
__m512 ifft5376 = _mm512_permutexvar_ps(ifft5283, sfIm301);
__m512 ifft5287 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft5288 = _mm512_mask_fmadd_ps(ifft5286, 65021, ifft5287, ifft5282);
__m512 ifft5377 = _mm512_mask_fmadd_ps(ifft5376, 65021, ifft5287, ifft5373);
__m512 ifft5289 = _mm512_mask_fnmadd_ps(ifft5285, 65021, ifft5287, ifft5284);
__m512 ifft5378 = _mm512_mask_fnmadd_ps(ifft5375, 65021, ifft5287, ifft5374);
__m512 ifft5290 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft5291 = _mm512_fmadd_ps(ifft5288, ifft5290, _mm512_shuffle_ps(ifft5288, ifft5288, 177));
__m512 ifft5379 = _mm512_fmadd_ps(ifft5377, ifft5290, _mm512_shuffle_ps(ifft5377, ifft5377, 177));
__m512 ifft5292 = _mm512_fmadd_ps(ifft5289, ifft5290, _mm512_shuffle_ps(ifft5289, ifft5289, 177));
__m512 ifft5380 = _mm512_fmadd_ps(ifft5378, ifft5290, _mm512_shuffle_ps(ifft5378, ifft5378, 177));
__m512 ifft5293 = _mm512_fmadd_ps(sfRe298, ifft5290, _mm512_shuffle_ps(sfRe298, sfRe298, 177));
__m512 ifft5381 = _mm512_fmadd_ps(sfRe302, ifft5290, _mm512_shuffle_ps(sfRe302, sfRe302, 177));
__m512 ifft5294 = _mm512_fmadd_ps(sfIm298, ifft5290, _mm512_shuffle_ps(sfIm298, sfIm298, 177));
__m512 ifft5382 = _mm512_fmadd_ps(sfIm302, ifft5290, _mm512_shuffle_ps(sfIm302, sfIm302, 177));
__m512 ifft5295 = _mm512_fmadd_ps(sfRe299, ifft5290, _mm512_shuffle_ps(sfRe299, sfRe299, 177));
__m512 ifft5383 = _mm512_fmadd_ps(sfRe303, ifft5290, _mm512_shuffle_ps(sfRe303, sfRe303, 177));
__m512 ifft5296 = _mm512_fmadd_ps(sfIm299, ifft5290, _mm512_shuffle_ps(sfIm299, sfIm299, 177));
__m512 ifft5384 = _mm512_fmadd_ps(sfIm303, ifft5290, _mm512_shuffle_ps(sfIm303, sfIm303, 177));
__m512 ifft5297 = _mm512_fmadd_ps(sfRe300, ifft5290, _mm512_shuffle_ps(sfRe300, sfRe300, 177));
__m512 ifft5385 = _mm512_fmadd_ps(sfRe304, ifft5290, _mm512_shuffle_ps(sfRe304, sfRe304, 177));
__m512 ifft5298 = _mm512_fmadd_ps(sfIm300, ifft5290, _mm512_shuffle_ps(sfIm300, sfIm300, 177));
__m512 ifft5386 = _mm512_fmadd_ps(sfIm304, ifft5290, _mm512_shuffle_ps(sfIm304, sfIm304, 177));
__m512 ifft5299 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft5300 = _mm512_mul_ps(ifft5291, ifft5299);
__m512 ifft5387 = _mm512_mul_ps(ifft5379, ifft5299);
__m512 ifft5301 = _mm512_mul_ps(ifft5292, ifft5299);
__m512 ifft5388 = _mm512_mul_ps(ifft5380, ifft5299);
__m512 ifft5302 = _mm512_mul_ps(ifft5293, ifft5299);
__m512 ifft5389 = _mm512_mul_ps(ifft5381, ifft5299);
__m512 ifft5303 = _mm512_mul_ps(ifft5294, ifft5299);
__m512 ifft5390 = _mm512_mul_ps(ifft5382, ifft5299);
__m512 ifft5304 = _mm512_mul_ps(ifft5295, ifft5299);
__m512 ifft5391 = _mm512_mul_ps(ifft5383, ifft5299);
__m512 ifft5305 = _mm512_mul_ps(ifft5296, ifft5299);
__m512 ifft5392 = _mm512_mul_ps(ifft5384, ifft5299);
__m512 ifft5306 = _mm512_mul_ps(ifft5297, ifft5299);
__m512 ifft5393 = _mm512_mul_ps(ifft5385, ifft5299);
__m512 ifft5307 = _mm512_mul_ps(ifft5298, ifft5299);
__m512 ifft5394 = _mm512_mul_ps(ifft5386, ifft5299);
__m512 ifft5308 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft5309 = _mm512_fnmadd_ps(ifft5292, ifft5308, ifft5300);
__m512 ifft5395 = _mm512_fnmadd_ps(ifft5380, ifft5308, ifft5387);
__m512 ifft5310 = _mm512_fmadd_ps(ifft5291, ifft5308, ifft5301);
__m512 ifft5396 = _mm512_fmadd_ps(ifft5379, ifft5308, ifft5388);
__m512 ifft5311 = _mm512_fnmadd_ps(ifft5294, ifft5308, ifft5302);
__m512 ifft5397 = _mm512_fnmadd_ps(ifft5382, ifft5308, ifft5389);
__m512 ifft5312 = _mm512_fmadd_ps(ifft5293, ifft5308, ifft5303);
__m512 ifft5398 = _mm512_fmadd_ps(ifft5381, ifft5308, ifft5390);
__m512 ifft5313 = _mm512_fnmadd_ps(ifft5296, ifft5308, ifft5304);
__m512 ifft5399 = _mm512_fnmadd_ps(ifft5384, ifft5308, ifft5391);
__m512 ifft5314 = _mm512_fmadd_ps(ifft5295, ifft5308, ifft5305);
__m512 ifft5400 = _mm512_fmadd_ps(ifft5383, ifft5308, ifft5392);
__m512 ifft5315 = _mm512_fnmadd_ps(ifft5298, ifft5308, ifft5306);
__m512 ifft5401 = _mm512_fnmadd_ps(ifft5386, ifft5308, ifft5393);
__m512 ifft5316 = _mm512_fmadd_ps(ifft5297, ifft5308, ifft5307);
__m512 ifft5402 = _mm512_fmadd_ps(ifft5385, ifft5308, ifft5394);
__m512 ifft5317 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft5318 = _mm512_fmadd_ps(ifft5309, ifft5317, _mm512_shuffle_ps(ifft5309, ifft5309, 78));
__m512 ifft5403 = _mm512_fmadd_ps(ifft5395, ifft5317, _mm512_shuffle_ps(ifft5395, ifft5395, 78));
__m512 ifft5319 = _mm512_fmadd_ps(ifft5310, ifft5317, _mm512_shuffle_ps(ifft5310, ifft5310, 78));
__m512 ifft5404 = _mm512_fmadd_ps(ifft5396, ifft5317, _mm512_shuffle_ps(ifft5396, ifft5396, 78));
__m512 ifft5320 = _mm512_fmadd_ps(ifft5311, ifft5317, _mm512_shuffle_ps(ifft5311, ifft5311, 78));
__m512 ifft5405 = _mm512_fmadd_ps(ifft5397, ifft5317, _mm512_shuffle_ps(ifft5397, ifft5397, 78));
__m512 ifft5321 = _mm512_fmadd_ps(ifft5312, ifft5317, _mm512_shuffle_ps(ifft5312, ifft5312, 78));
__m512 ifft5406 = _mm512_fmadd_ps(ifft5398, ifft5317, _mm512_shuffle_ps(ifft5398, ifft5398, 78));
__m512 ifft5322 = _mm512_fmadd_ps(ifft5313, ifft5317, _mm512_shuffle_ps(ifft5313, ifft5313, 78));
__m512 ifft5407 = _mm512_fmadd_ps(ifft5399, ifft5317, _mm512_shuffle_ps(ifft5399, ifft5399, 78));
__m512 ifft5323 = _mm512_fmadd_ps(ifft5314, ifft5317, _mm512_shuffle_ps(ifft5314, ifft5314, 78));
__m512 ifft5408 = _mm512_fmadd_ps(ifft5400, ifft5317, _mm512_shuffle_ps(ifft5400, ifft5400, 78));
__m512 ifft5324 = _mm512_fmadd_ps(ifft5315, ifft5317, _mm512_shuffle_ps(ifft5315, ifft5315, 78));
__m512 ifft5409 = _mm512_fmadd_ps(ifft5401, ifft5317, _mm512_shuffle_ps(ifft5401, ifft5401, 78));
__m512 ifft5325 = _mm512_fmadd_ps(ifft5316, ifft5317, _mm512_shuffle_ps(ifft5316, ifft5316, 78));
__m512 ifft5410 = _mm512_fmadd_ps(ifft5402, ifft5317, _mm512_shuffle_ps(ifft5402, ifft5402, 78));
__m512 ifft5326 = _mm512_mask_sub_ps(ifft5318, 49344, _mm512_setzero_ps(), ifft5319);
__m512 ifft5411 = _mm512_mask_sub_ps(ifft5403, 49344, _mm512_setzero_ps(), ifft5404);
__m512 ifft5327 = _mm512_mask_mov_ps(ifft5319, 49344, ifft5318);
__m512 ifft5412 = _mm512_mask_mov_ps(ifft5404, 49344, ifft5403);
__m512 ifft5328 = _mm512_mask_sub_ps(ifft5320, 49344, _mm512_setzero_ps(), ifft5321);
__m512 ifft5413 = _mm512_mask_sub_ps(ifft5405, 49344, _mm512_setzero_ps(), ifft5406);
__m512 ifft5329 = _mm512_mask_mov_ps(ifft5321, 49344, ifft5320);
__m512 ifft5414 = _mm512_mask_mov_ps(ifft5406, 49344, ifft5405);
__m512 ifft5330 = _mm512_mask_sub_ps(ifft5322, 49344, _mm512_setzero_ps(), ifft5323);
__m512 ifft5415 = _mm512_mask_sub_ps(ifft5407, 49344, _mm512_setzero_ps(), ifft5408);
__m512 ifft5331 = _mm512_mask_mov_ps(ifft5323, 49344, ifft5322);
__m512 ifft5416 = _mm512_mask_mov_ps(ifft5408, 49344, ifft5407);
__m512 ifft5332 = _mm512_mask_sub_ps(ifft5324, 49344, _mm512_setzero_ps(), ifft5325);
__m512 ifft5417 = _mm512_mask_sub_ps(ifft5409, 49344, _mm512_setzero_ps(), ifft5410);
__m512 ifft5333 = _mm512_mask_mov_ps(ifft5325, 49344, ifft5324);
__m512 ifft5418 = _mm512_mask_mov_ps(ifft5410, 49344, ifft5409);
__m512 ifft5334 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft5335 = _mm512_fmadd_ps(ifft5326, ifft5334, _mm512_shuffle_f32x4(ifft5326, ifft5326, 177));
__m512 ifft5419 = _mm512_fmadd_ps(ifft5411, ifft5334, _mm512_shuffle_f32x4(ifft5411, ifft5411, 177));
__m512 ifft5336 = _mm512_fmadd_ps(ifft5327, ifft5334, _mm512_shuffle_f32x4(ifft5327, ifft5327, 177));
__m512 ifft5420 = _mm512_fmadd_ps(ifft5412, ifft5334, _mm512_shuffle_f32x4(ifft5412, ifft5412, 177));
__m512 ifft5337 = _mm512_fmadd_ps(ifft5328, ifft5334, _mm512_shuffle_f32x4(ifft5328, ifft5328, 177));
__m512 ifft5421 = _mm512_fmadd_ps(ifft5413, ifft5334, _mm512_shuffle_f32x4(ifft5413, ifft5413, 177));
__m512 ifft5338 = _mm512_fmadd_ps(ifft5329, ifft5334, _mm512_shuffle_f32x4(ifft5329, ifft5329, 177));
__m512 ifft5422 = _mm512_fmadd_ps(ifft5414, ifft5334, _mm512_shuffle_f32x4(ifft5414, ifft5414, 177));
__m512 ifft5339 = _mm512_fmadd_ps(ifft5330, ifft5334, _mm512_shuffle_f32x4(ifft5330, ifft5330, 177));
__m512 ifft5423 = _mm512_fmadd_ps(ifft5415, ifft5334, _mm512_shuffle_f32x4(ifft5415, ifft5415, 177));
__m512 ifft5340 = _mm512_fnmsub_ps(ifft5331, ifft5334, _mm512_shuffle_f32x4(ifft5331, ifft5331, 177));
__m512 ifft5424 = _mm512_fnmsub_ps(ifft5416, ifft5334, _mm512_shuffle_f32x4(ifft5416, ifft5416, 177));
__m512 ifft5341 = _mm512_fmadd_ps(ifft5332, ifft5334, _mm512_shuffle_f32x4(ifft5332, ifft5332, 177));
__m512 ifft5425 = _mm512_fmadd_ps(ifft5417, ifft5334, _mm512_shuffle_f32x4(ifft5417, ifft5417, 177));
__m512 ifft5342 = _mm512_fmadd_ps(ifft5333, ifft5334, _mm512_shuffle_f32x4(ifft5333, ifft5333, 177));
__m512 ifft5426 = _mm512_fmadd_ps(ifft5418, ifft5334, _mm512_shuffle_f32x4(ifft5418, ifft5418, 177));
__m512 ifft5343 = _mm512_add_ps(ifft5335, ifft5336);
__m512 ifft5427 = _mm512_add_ps(ifft5419, ifft5420);
__m512 ifft5344 = _mm512_sub_ps(ifft5335, ifft5336);
__m512 ifft5428 = _mm512_sub_ps(ifft5419, ifft5420);
__m512 ifft5345 = _mm512_sub_ps(ifft5337, ifft5341);
__m512 ifft5429 = _mm512_sub_ps(ifft5421, ifft5425);
__m512 ifft5346 = _mm512_add_ps(ifft5338, ifft5342);
__m512 ifft5430 = _mm512_add_ps(ifft5422, ifft5426);
__m512 ifft5347 = _mm512_add_ps(ifft5337, ifft5341);
__m512 ifft5431 = _mm512_add_ps(ifft5421, ifft5425);
__m512 ifft5348 = _mm512_sub_ps(ifft5338, ifft5342);
__m512 ifft5432 = _mm512_sub_ps(ifft5422, ifft5426);
__m512 ifft5349 = _mm512_mul_ps(ifft5339, _mm512_set1_ps(3.125e-02f));
__m512 ifft5433 = _mm512_mul_ps(ifft5423, _mm512_set1_ps(3.125e-02f));
__m512 ifft5350 = _mm512_mul_ps(ifft5340, _mm512_set1_ps(3.125e-02f));
__m512 ifft5434 = _mm512_mul_ps(ifft5424, _mm512_set1_ps(3.125e-02f));
__m512 ifft5351 = _mm512_fmadd_ps(ifft5343, _mm512_set1_ps(1.5625e-02f), ifft5349);
__m512 ifft5435 = _mm512_fmadd_ps(ifft5427, _mm512_set1_ps(1.5625e-02f), ifft5433);
__m512 ifft5352 = _mm512_fmsub_ps(ifft5343, _mm512_set1_ps(1.5625e-02f), ifft5349);
__m512 ifft5436 = _mm512_fmsub_ps(ifft5427, _mm512_set1_ps(1.5625e-02f), ifft5433);
__m512 ifft5353 = _mm512_fmadd_ps(ifft5344, _mm512_set1_ps(1.5625e-02f), ifft5350);
__m512 ifft5437 = _mm512_fmadd_ps(ifft5428, _mm512_set1_ps(1.5625e-02f), ifft5434);
__m512 ifft5354 = _mm512_fmsub_ps(ifft5344, _mm512_set1_ps(1.5625e-02f), ifft5350);
__m512 ifft5438 = _mm512_fmsub_ps(ifft5428, _mm512_set1_ps(1.5625e-02f), ifft5434);
__m512 ifft5355 = _mm512_add_ps(ifft5345, ifft5346);
__m512 ifft5439 = _mm512_add_ps(ifft5429, ifft5430);
__m512 ifft5356 = _mm512_sub_ps(ifft5345, ifft5346);
__m512 ifft5440 = _mm512_sub_ps(ifft5429, ifft5430);
__m512 ifft5357 = _mm512_fnmadd_ps(ifft5355, _mm512_set1_ps(7.0710677e-01f), ifft5347);
__m512 ifft5441 = _mm512_fnmadd_ps(ifft5439, _mm512_set1_ps(7.0710677e-01f), ifft5431);
__m512 ifft5358 = _mm512_fmadd_ps(ifft5355, _mm512_set1_ps(7.0710677e-01f), ifft5347);
__m512 ifft5442 = _mm512_fmadd_ps(ifft5439, _mm512_set1_ps(7.0710677e-01f), ifft5431);
__m512 ifft5359 = _mm512_fmadd_ps(ifft5356, _mm512_set1_ps(7.0710677e-01f), ifft5348);
__m512 ifft5443 = _mm512_fmadd_ps(ifft5440, _mm512_set1_ps(7.0710677e-01f), ifft5432);
__m512 ifft5360 = _mm512_fmsub_ps(ifft5356, _mm512_set1_ps(7.0710677e-01f), ifft5348);
__m512 ifft5444 = _mm512_fmsub_ps(ifft5440, _mm512_set1_ps(7.0710677e-01f), ifft5432);
__m512 ifft5361 = _mm512_add_ps(ifft5357, ifft5358);
__m512 ifft5445 = _mm512_add_ps(ifft5441, ifft5442);
__m512 ifft5362 = _mm512_sub_ps(ifft5357, ifft5358);
__m512 ifft5446 = _mm512_sub_ps(ifft5441, ifft5442);
__m512 ifft5363 = _mm512_add_ps(ifft5359, ifft5360);
__m512 ifft5447 = _mm512_add_ps(ifft5443, ifft5444);
__m512 ifft5364 = _mm512_sub_ps(ifft5359, ifft5360);
__m512 ifft5448 = _mm512_sub_ps(ifft5443, ifft5444);
__m512 ifft5365 = _mm512_fmadd_ps(ifft5361, _mm512_set1_ps(1.5625e-02f), ifft5351);
__m512 ifft5449 = _mm512_fmadd_ps(ifft5445, _mm512_set1_ps(1.5625e-02f), ifft5435);
__m512 ifft5366 = _mm512_fnmadd_ps(ifft5361, _mm512_set1_ps(1.5625e-02f), ifft5351);
__m512 ifft5450 = _mm512_fnmadd_ps(ifft5445, _mm512_set1_ps(1.5625e-02f), ifft5435);
__m512 ifft5367 = _mm512_fmadd_ps(ifft5363, _mm512_set1_ps(1.5625e-02f), ifft5353);
__m512 ifft5451 = _mm512_fmadd_ps(ifft5447, _mm512_set1_ps(1.5625e-02f), ifft5437);
__m512 ifft5368 = _mm512_fnmadd_ps(ifft5363, _mm512_set1_ps(1.5625e-02f), ifft5353);
__m512 ifft5452 = _mm512_fnmadd_ps(ifft5447, _mm512_set1_ps(1.5625e-02f), ifft5437);
__m512 ifft5369 = _mm512_fnmadd_ps(ifft5364, _mm512_set1_ps(1.5625e-02f), ifft5352);
__m512 ifft5453 = _mm512_fnmadd_ps(ifft5448, _mm512_set1_ps(1.5625e-02f), ifft5436);
__m512 ifft5370 = _mm512_fmadd_ps(ifft5364, _mm512_set1_ps(1.5625e-02f), ifft5352);
__m512 ifft5454 = _mm512_fmadd_ps(ifft5448, _mm512_set1_ps(1.5625e-02f), ifft5436);
__m512 ifft5371 = _mm512_fmadd_ps(ifft5362, _mm512_set1_ps(1.5625e-02f), ifft5354);
__m512 ifft5455 = _mm512_fmadd_ps(ifft5446, _mm512_set1_ps(1.5625e-02f), ifft5438);
__m512 ifft5372 = _mm512_fnmadd_ps(ifft5362, _mm512_set1_ps(1.5625e-02f), ifft5354);
__m512 ifft5456 = _mm512_fnmadd_ps(ifft5446, _mm512_set1_ps(1.5625e-02f), ifft5438);
__m512 dat884 = ifft5365;
__m512 dat886 = ifft5449;
__m512 dat885 = ifft5367;
__m512 dat887 = ifft5451;
(void)ifft5369;
(void)ifft5453;
(void)ifft5371;
(void)ifft5455;
(void)ifft5366;
(void)ifft5450;
(void)ifft5368;
(void)ifft5452;
(void)ifft5370;
(void)ifft5454;
(void)ifft5372;
(void)ifft5456;
__m512i pm53 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack255 = _mm512_permutex2var_ps(dat884, pm53, dat886);
__m512i pm54 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack256 = _mm512_permutex2var_ps(dat884, pm54, dat886);
__m512 pack257 = _mm512_permutex2var_ps(dat885, pm53, dat887);
__m512 pack258 = _mm512_permutex2var_ps(dat885, pm54, dat887);
pack255 = _mm512_max_ps(_mm512_setzero_ps(), pack255);
pack256 = _mm512_max_ps(_mm512_setzero_ps(), pack256);
pack257 = _mm512_max_ps(_mm512_setzero_ps(), pack257);
pack258 = _mm512_max_ps(_mm512_setzero_ps(), pack258);
_mm512_mask_storeu_ps(datPtr2+1860+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+40*t32, 1023, pack255);
_mm512_mask_storeu_ps(datPtr2+52100+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+40*t32, 1023, pack256);
_mm512_mask_storeu_ps(datPtr2+2308+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+40*t32, 1023, pack257);
_mm512_mask_storeu_ps(datPtr2+52548+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+40*t32, 1023, pack258);
}
}
if (j5 >= last2) return;
++j5;
rel6 = 1;
}
if (rel6 < 4) {
ptrdiff_t toH18 = base6+5;
ptrdiff_t toW18 = -10+30*rel6;
ptrdiff_t jj18 = 3-rel6+j5;
for (; j5 <= jj18; toW18 += 30) {
ptrdiff_t k42 = 16*w21;
for (; k42 != 16; ++k42) {
ptrdiff_t r19 = 0;
for (; r19 != 2; ++r19) {
ptrdiff_t t33 = 0;
for (; t33 < 3; ++t33) {
__m512 sfRe305 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm305 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe309 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm309 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe306 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm306 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe310 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm310 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe307 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm307 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe311 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm311 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe308 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm308 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe312 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm312 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512i ifft5457 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft5458 = _mm512_permutexvar_ps(ifft5457, sfRe305);
__m512 ifft5549 = _mm512_permutexvar_ps(ifft5457, sfRe309);
__m512i ifft5459 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft5460 = _mm512_permutexvar_ps(ifft5459, sfRe305);
__m512 ifft5550 = _mm512_permutexvar_ps(ifft5459, sfRe309);
__m512 ifft5461 = _mm512_permutexvar_ps(ifft5457, sfIm305);
__m512 ifft5551 = _mm512_permutexvar_ps(ifft5457, sfIm309);
__m512 ifft5462 = _mm512_permutexvar_ps(ifft5459, sfIm305);
__m512 ifft5552 = _mm512_permutexvar_ps(ifft5459, sfIm309);
__m512 ifft5463 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft5464 = _mm512_mask_fmadd_ps(ifft5462, 65021, ifft5463, ifft5458);
__m512 ifft5553 = _mm512_mask_fmadd_ps(ifft5552, 65021, ifft5463, ifft5549);
__m512 ifft5465 = _mm512_mask_fnmadd_ps(ifft5461, 65021, ifft5463, ifft5460);
__m512 ifft5554 = _mm512_mask_fnmadd_ps(ifft5551, 65021, ifft5463, ifft5550);
__m512 ifft5466 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft5467 = _mm512_fmadd_ps(ifft5464, ifft5466, _mm512_shuffle_ps(ifft5464, ifft5464, 177));
__m512 ifft5555 = _mm512_fmadd_ps(ifft5553, ifft5466, _mm512_shuffle_ps(ifft5553, ifft5553, 177));
__m512 ifft5468 = _mm512_fmadd_ps(ifft5465, ifft5466, _mm512_shuffle_ps(ifft5465, ifft5465, 177));
__m512 ifft5556 = _mm512_fmadd_ps(ifft5554, ifft5466, _mm512_shuffle_ps(ifft5554, ifft5554, 177));
__m512 ifft5469 = _mm512_fmadd_ps(sfRe306, ifft5466, _mm512_shuffle_ps(sfRe306, sfRe306, 177));
__m512 ifft5557 = _mm512_fmadd_ps(sfRe310, ifft5466, _mm512_shuffle_ps(sfRe310, sfRe310, 177));
__m512 ifft5470 = _mm512_fmadd_ps(sfIm306, ifft5466, _mm512_shuffle_ps(sfIm306, sfIm306, 177));
__m512 ifft5558 = _mm512_fmadd_ps(sfIm310, ifft5466, _mm512_shuffle_ps(sfIm310, sfIm310, 177));
__m512 ifft5471 = _mm512_fmadd_ps(sfRe307, ifft5466, _mm512_shuffle_ps(sfRe307, sfRe307, 177));
__m512 ifft5559 = _mm512_fmadd_ps(sfRe311, ifft5466, _mm512_shuffle_ps(sfRe311, sfRe311, 177));
__m512 ifft5472 = _mm512_fmadd_ps(sfIm307, ifft5466, _mm512_shuffle_ps(sfIm307, sfIm307, 177));
__m512 ifft5560 = _mm512_fmadd_ps(sfIm311, ifft5466, _mm512_shuffle_ps(sfIm311, sfIm311, 177));
__m512 ifft5473 = _mm512_fmadd_ps(sfRe308, ifft5466, _mm512_shuffle_ps(sfRe308, sfRe308, 177));
__m512 ifft5561 = _mm512_fmadd_ps(sfRe312, ifft5466, _mm512_shuffle_ps(sfRe312, sfRe312, 177));
__m512 ifft5474 = _mm512_fmadd_ps(sfIm308, ifft5466, _mm512_shuffle_ps(sfIm308, sfIm308, 177));
__m512 ifft5562 = _mm512_fmadd_ps(sfIm312, ifft5466, _mm512_shuffle_ps(sfIm312, sfIm312, 177));
__m512 ifft5475 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft5476 = _mm512_mul_ps(ifft5467, ifft5475);
__m512 ifft5563 = _mm512_mul_ps(ifft5555, ifft5475);
__m512 ifft5477 = _mm512_mul_ps(ifft5468, ifft5475);
__m512 ifft5564 = _mm512_mul_ps(ifft5556, ifft5475);
__m512 ifft5478 = _mm512_mul_ps(ifft5469, ifft5475);
__m512 ifft5565 = _mm512_mul_ps(ifft5557, ifft5475);
__m512 ifft5479 = _mm512_mul_ps(ifft5470, ifft5475);
__m512 ifft5566 = _mm512_mul_ps(ifft5558, ifft5475);
__m512 ifft5480 = _mm512_mul_ps(ifft5471, ifft5475);
__m512 ifft5567 = _mm512_mul_ps(ifft5559, ifft5475);
__m512 ifft5481 = _mm512_mul_ps(ifft5472, ifft5475);
__m512 ifft5568 = _mm512_mul_ps(ifft5560, ifft5475);
__m512 ifft5482 = _mm512_mul_ps(ifft5473, ifft5475);
__m512 ifft5569 = _mm512_mul_ps(ifft5561, ifft5475);
__m512 ifft5483 = _mm512_mul_ps(ifft5474, ifft5475);
__m512 ifft5570 = _mm512_mul_ps(ifft5562, ifft5475);
__m512 ifft5484 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft5485 = _mm512_fnmadd_ps(ifft5468, ifft5484, ifft5476);
__m512 ifft5571 = _mm512_fnmadd_ps(ifft5556, ifft5484, ifft5563);
__m512 ifft5486 = _mm512_fmadd_ps(ifft5467, ifft5484, ifft5477);
__m512 ifft5572 = _mm512_fmadd_ps(ifft5555, ifft5484, ifft5564);
__m512 ifft5487 = _mm512_fnmadd_ps(ifft5470, ifft5484, ifft5478);
__m512 ifft5573 = _mm512_fnmadd_ps(ifft5558, ifft5484, ifft5565);
__m512 ifft5488 = _mm512_fmadd_ps(ifft5469, ifft5484, ifft5479);
__m512 ifft5574 = _mm512_fmadd_ps(ifft5557, ifft5484, ifft5566);
__m512 ifft5489 = _mm512_fnmadd_ps(ifft5472, ifft5484, ifft5480);
__m512 ifft5575 = _mm512_fnmadd_ps(ifft5560, ifft5484, ifft5567);
__m512 ifft5490 = _mm512_fmadd_ps(ifft5471, ifft5484, ifft5481);
__m512 ifft5576 = _mm512_fmadd_ps(ifft5559, ifft5484, ifft5568);
__m512 ifft5491 = _mm512_fnmadd_ps(ifft5474, ifft5484, ifft5482);
__m512 ifft5577 = _mm512_fnmadd_ps(ifft5562, ifft5484, ifft5569);
__m512 ifft5492 = _mm512_fmadd_ps(ifft5473, ifft5484, ifft5483);
__m512 ifft5578 = _mm512_fmadd_ps(ifft5561, ifft5484, ifft5570);
__m512 ifft5493 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft5494 = _mm512_fmadd_ps(ifft5485, ifft5493, _mm512_shuffle_ps(ifft5485, ifft5485, 78));
__m512 ifft5579 = _mm512_fmadd_ps(ifft5571, ifft5493, _mm512_shuffle_ps(ifft5571, ifft5571, 78));
__m512 ifft5495 = _mm512_fmadd_ps(ifft5486, ifft5493, _mm512_shuffle_ps(ifft5486, ifft5486, 78));
__m512 ifft5580 = _mm512_fmadd_ps(ifft5572, ifft5493, _mm512_shuffle_ps(ifft5572, ifft5572, 78));
__m512 ifft5496 = _mm512_fmadd_ps(ifft5487, ifft5493, _mm512_shuffle_ps(ifft5487, ifft5487, 78));
__m512 ifft5581 = _mm512_fmadd_ps(ifft5573, ifft5493, _mm512_shuffle_ps(ifft5573, ifft5573, 78));
__m512 ifft5497 = _mm512_fmadd_ps(ifft5488, ifft5493, _mm512_shuffle_ps(ifft5488, ifft5488, 78));
__m512 ifft5582 = _mm512_fmadd_ps(ifft5574, ifft5493, _mm512_shuffle_ps(ifft5574, ifft5574, 78));
__m512 ifft5498 = _mm512_fmadd_ps(ifft5489, ifft5493, _mm512_shuffle_ps(ifft5489, ifft5489, 78));
__m512 ifft5583 = _mm512_fmadd_ps(ifft5575, ifft5493, _mm512_shuffle_ps(ifft5575, ifft5575, 78));
__m512 ifft5499 = _mm512_fmadd_ps(ifft5490, ifft5493, _mm512_shuffle_ps(ifft5490, ifft5490, 78));
__m512 ifft5584 = _mm512_fmadd_ps(ifft5576, ifft5493, _mm512_shuffle_ps(ifft5576, ifft5576, 78));
__m512 ifft5500 = _mm512_fmadd_ps(ifft5491, ifft5493, _mm512_shuffle_ps(ifft5491, ifft5491, 78));
__m512 ifft5585 = _mm512_fmadd_ps(ifft5577, ifft5493, _mm512_shuffle_ps(ifft5577, ifft5577, 78));
__m512 ifft5501 = _mm512_fmadd_ps(ifft5492, ifft5493, _mm512_shuffle_ps(ifft5492, ifft5492, 78));
__m512 ifft5586 = _mm512_fmadd_ps(ifft5578, ifft5493, _mm512_shuffle_ps(ifft5578, ifft5578, 78));
__m512 ifft5502 = _mm512_mask_sub_ps(ifft5494, 49344, _mm512_setzero_ps(), ifft5495);
__m512 ifft5587 = _mm512_mask_sub_ps(ifft5579, 49344, _mm512_setzero_ps(), ifft5580);
__m512 ifft5503 = _mm512_mask_mov_ps(ifft5495, 49344, ifft5494);
__m512 ifft5588 = _mm512_mask_mov_ps(ifft5580, 49344, ifft5579);
__m512 ifft5504 = _mm512_mask_sub_ps(ifft5496, 49344, _mm512_setzero_ps(), ifft5497);
__m512 ifft5589 = _mm512_mask_sub_ps(ifft5581, 49344, _mm512_setzero_ps(), ifft5582);
__m512 ifft5505 = _mm512_mask_mov_ps(ifft5497, 49344, ifft5496);
__m512 ifft5590 = _mm512_mask_mov_ps(ifft5582, 49344, ifft5581);
__m512 ifft5506 = _mm512_mask_sub_ps(ifft5498, 49344, _mm512_setzero_ps(), ifft5499);
__m512 ifft5591 = _mm512_mask_sub_ps(ifft5583, 49344, _mm512_setzero_ps(), ifft5584);
__m512 ifft5507 = _mm512_mask_mov_ps(ifft5499, 49344, ifft5498);
__m512 ifft5592 = _mm512_mask_mov_ps(ifft5584, 49344, ifft5583);
__m512 ifft5508 = _mm512_mask_sub_ps(ifft5500, 49344, _mm512_setzero_ps(), ifft5501);
__m512 ifft5593 = _mm512_mask_sub_ps(ifft5585, 49344, _mm512_setzero_ps(), ifft5586);
__m512 ifft5509 = _mm512_mask_mov_ps(ifft5501, 49344, ifft5500);
__m512 ifft5594 = _mm512_mask_mov_ps(ifft5586, 49344, ifft5585);
__m512 ifft5510 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft5511 = _mm512_fmadd_ps(ifft5502, ifft5510, _mm512_shuffle_f32x4(ifft5502, ifft5502, 177));
__m512 ifft5595 = _mm512_fmadd_ps(ifft5587, ifft5510, _mm512_shuffle_f32x4(ifft5587, ifft5587, 177));
__m512 ifft5512 = _mm512_fmadd_ps(ifft5503, ifft5510, _mm512_shuffle_f32x4(ifft5503, ifft5503, 177));
__m512 ifft5596 = _mm512_fmadd_ps(ifft5588, ifft5510, _mm512_shuffle_f32x4(ifft5588, ifft5588, 177));
__m512 ifft5513 = _mm512_fmadd_ps(ifft5504, ifft5510, _mm512_shuffle_f32x4(ifft5504, ifft5504, 177));
__m512 ifft5597 = _mm512_fmadd_ps(ifft5589, ifft5510, _mm512_shuffle_f32x4(ifft5589, ifft5589, 177));
__m512 ifft5514 = _mm512_fmadd_ps(ifft5505, ifft5510, _mm512_shuffle_f32x4(ifft5505, ifft5505, 177));
__m512 ifft5598 = _mm512_fmadd_ps(ifft5590, ifft5510, _mm512_shuffle_f32x4(ifft5590, ifft5590, 177));
__m512 ifft5515 = _mm512_fmadd_ps(ifft5506, ifft5510, _mm512_shuffle_f32x4(ifft5506, ifft5506, 177));
__m512 ifft5599 = _mm512_fmadd_ps(ifft5591, ifft5510, _mm512_shuffle_f32x4(ifft5591, ifft5591, 177));
__m512 ifft5516 = _mm512_fnmsub_ps(ifft5507, ifft5510, _mm512_shuffle_f32x4(ifft5507, ifft5507, 177));
__m512 ifft5600 = _mm512_fnmsub_ps(ifft5592, ifft5510, _mm512_shuffle_f32x4(ifft5592, ifft5592, 177));
__m512 ifft5517 = _mm512_fmadd_ps(ifft5508, ifft5510, _mm512_shuffle_f32x4(ifft5508, ifft5508, 177));
__m512 ifft5601 = _mm512_fmadd_ps(ifft5593, ifft5510, _mm512_shuffle_f32x4(ifft5593, ifft5593, 177));
__m512 ifft5518 = _mm512_fmadd_ps(ifft5509, ifft5510, _mm512_shuffle_f32x4(ifft5509, ifft5509, 177));
__m512 ifft5602 = _mm512_fmadd_ps(ifft5594, ifft5510, _mm512_shuffle_f32x4(ifft5594, ifft5594, 177));
__m512 ifft5519 = _mm512_add_ps(ifft5511, ifft5512);
__m512 ifft5603 = _mm512_add_ps(ifft5595, ifft5596);
__m512 ifft5520 = _mm512_sub_ps(ifft5511, ifft5512);
__m512 ifft5604 = _mm512_sub_ps(ifft5595, ifft5596);
__m512 ifft5521 = _mm512_sub_ps(ifft5513, ifft5517);
__m512 ifft5605 = _mm512_sub_ps(ifft5597, ifft5601);
__m512 ifft5522 = _mm512_add_ps(ifft5514, ifft5518);
__m512 ifft5606 = _mm512_add_ps(ifft5598, ifft5602);
__m512 ifft5523 = _mm512_add_ps(ifft5513, ifft5517);
__m512 ifft5607 = _mm512_add_ps(ifft5597, ifft5601);
__m512 ifft5524 = _mm512_sub_ps(ifft5514, ifft5518);
__m512 ifft5608 = _mm512_sub_ps(ifft5598, ifft5602);
__m512 ifft5525 = _mm512_mul_ps(ifft5515, _mm512_set1_ps(3.125e-02f));
__m512 ifft5609 = _mm512_mul_ps(ifft5599, _mm512_set1_ps(3.125e-02f));
__m512 ifft5526 = _mm512_mul_ps(ifft5516, _mm512_set1_ps(3.125e-02f));
__m512 ifft5610 = _mm512_mul_ps(ifft5600, _mm512_set1_ps(3.125e-02f));
__m512 ifft5527 = _mm512_fmadd_ps(ifft5519, _mm512_set1_ps(1.5625e-02f), ifft5525);
__m512 ifft5611 = _mm512_fmadd_ps(ifft5603, _mm512_set1_ps(1.5625e-02f), ifft5609);
__m512 ifft5528 = _mm512_fmsub_ps(ifft5519, _mm512_set1_ps(1.5625e-02f), ifft5525);
__m512 ifft5612 = _mm512_fmsub_ps(ifft5603, _mm512_set1_ps(1.5625e-02f), ifft5609);
__m512 ifft5529 = _mm512_fmadd_ps(ifft5520, _mm512_set1_ps(1.5625e-02f), ifft5526);
__m512 ifft5613 = _mm512_fmadd_ps(ifft5604, _mm512_set1_ps(1.5625e-02f), ifft5610);
__m512 ifft5530 = _mm512_fmsub_ps(ifft5520, _mm512_set1_ps(1.5625e-02f), ifft5526);
__m512 ifft5614 = _mm512_fmsub_ps(ifft5604, _mm512_set1_ps(1.5625e-02f), ifft5610);
__m512 ifft5531 = _mm512_add_ps(ifft5521, ifft5522);
__m512 ifft5615 = _mm512_add_ps(ifft5605, ifft5606);
__m512 ifft5532 = _mm512_sub_ps(ifft5521, ifft5522);
__m512 ifft5616 = _mm512_sub_ps(ifft5605, ifft5606);
__m512 ifft5533 = _mm512_fnmadd_ps(ifft5531, _mm512_set1_ps(7.0710677e-01f), ifft5523);
__m512 ifft5617 = _mm512_fnmadd_ps(ifft5615, _mm512_set1_ps(7.0710677e-01f), ifft5607);
__m512 ifft5534 = _mm512_fmadd_ps(ifft5531, _mm512_set1_ps(7.0710677e-01f), ifft5523);
__m512 ifft5618 = _mm512_fmadd_ps(ifft5615, _mm512_set1_ps(7.0710677e-01f), ifft5607);
__m512 ifft5535 = _mm512_fmadd_ps(ifft5532, _mm512_set1_ps(7.0710677e-01f), ifft5524);
__m512 ifft5619 = _mm512_fmadd_ps(ifft5616, _mm512_set1_ps(7.0710677e-01f), ifft5608);
__m512 ifft5536 = _mm512_fmsub_ps(ifft5532, _mm512_set1_ps(7.0710677e-01f), ifft5524);
__m512 ifft5620 = _mm512_fmsub_ps(ifft5616, _mm512_set1_ps(7.0710677e-01f), ifft5608);
__m512 ifft5537 = _mm512_add_ps(ifft5533, ifft5534);
__m512 ifft5621 = _mm512_add_ps(ifft5617, ifft5618);
__m512 ifft5538 = _mm512_sub_ps(ifft5533, ifft5534);
__m512 ifft5622 = _mm512_sub_ps(ifft5617, ifft5618);
__m512 ifft5539 = _mm512_add_ps(ifft5535, ifft5536);
__m512 ifft5623 = _mm512_add_ps(ifft5619, ifft5620);
__m512 ifft5540 = _mm512_sub_ps(ifft5535, ifft5536);
__m512 ifft5624 = _mm512_sub_ps(ifft5619, ifft5620);
__m512 ifft5541 = _mm512_fmadd_ps(ifft5537, _mm512_set1_ps(1.5625e-02f), ifft5527);
__m512 ifft5625 = _mm512_fmadd_ps(ifft5621, _mm512_set1_ps(1.5625e-02f), ifft5611);
__m512 ifft5542 = _mm512_fnmadd_ps(ifft5537, _mm512_set1_ps(1.5625e-02f), ifft5527);
__m512 ifft5626 = _mm512_fnmadd_ps(ifft5621, _mm512_set1_ps(1.5625e-02f), ifft5611);
__m512 ifft5543 = _mm512_fmadd_ps(ifft5539, _mm512_set1_ps(1.5625e-02f), ifft5529);
__m512 ifft5627 = _mm512_fmadd_ps(ifft5623, _mm512_set1_ps(1.5625e-02f), ifft5613);
__m512 ifft5544 = _mm512_fnmadd_ps(ifft5539, _mm512_set1_ps(1.5625e-02f), ifft5529);
__m512 ifft5628 = _mm512_fnmadd_ps(ifft5623, _mm512_set1_ps(1.5625e-02f), ifft5613);
__m512 ifft5545 = _mm512_fnmadd_ps(ifft5540, _mm512_set1_ps(1.5625e-02f), ifft5528);
__m512 ifft5629 = _mm512_fnmadd_ps(ifft5624, _mm512_set1_ps(1.5625e-02f), ifft5612);
__m512 ifft5546 = _mm512_fmadd_ps(ifft5540, _mm512_set1_ps(1.5625e-02f), ifft5528);
__m512 ifft5630 = _mm512_fmadd_ps(ifft5624, _mm512_set1_ps(1.5625e-02f), ifft5612);
__m512 ifft5547 = _mm512_fmadd_ps(ifft5538, _mm512_set1_ps(1.5625e-02f), ifft5530);
__m512 ifft5631 = _mm512_fmadd_ps(ifft5622, _mm512_set1_ps(1.5625e-02f), ifft5614);
__m512 ifft5548 = _mm512_fnmadd_ps(ifft5538, _mm512_set1_ps(1.5625e-02f), ifft5530);
__m512 ifft5632 = _mm512_fnmadd_ps(ifft5622, _mm512_set1_ps(1.5625e-02f), ifft5614);
__m512 dat888 = ifft5541;
__m512 dat890 = ifft5625;
__m512 dat889 = ifft5543;
__m512 dat891 = ifft5627;
(void)ifft5545;
(void)ifft5629;
(void)ifft5547;
(void)ifft5631;
(void)ifft5542;
(void)ifft5626;
(void)ifft5544;
(void)ifft5628;
(void)ifft5546;
(void)ifft5630;
(void)ifft5548;
(void)ifft5632;
__m512i pm55 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack259 = _mm512_permutex2var_ps(dat888, pm55, dat890);
__m512i pm56 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack260 = _mm512_permutex2var_ps(dat888, pm56, dat890);
__m512 pack261 = _mm512_permutex2var_ps(dat889, pm55, dat891);
__m512 pack262 = _mm512_permutex2var_ps(dat889, pm56, dat891);
pack259 = _mm512_max_ps(_mm512_setzero_ps(), pack259);
pack260 = _mm512_max_ps(_mm512_setzero_ps(), pack260);
pack261 = _mm512_max_ps(_mm512_setzero_ps(), pack261);
pack262 = _mm512_max_ps(_mm512_setzero_ps(), pack262);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k42+100480*r19+448*toH18+4*toW18+40*t33, 1023, pack259);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k42+100480*r19+448*toH18+4*toW18+40*t33, 1023, pack260);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k42+100480*r19+448*toH18+4*toW18+40*t33, 1023, pack261);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k42+100480*r19+448*toH18+4*toW18+40*t33, 1023, pack262);
}
}
}
if (j5 >= last2) return;
++j5;
}
rel6 = 4;
}
ptrdiff_t toH19 = base6+5;
ptrdiff_t toW19 = 110;
ptrdiff_t k43 = 16*w21;
for (; k43 != 16; ++k43) {
ptrdiff_t r20 = 0;
for (; r20 != 2; ++r20) {
ptrdiff_t t34 = 0;
__m512 sfRe313 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfIm313 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfRe314 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfIm314 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfRe315 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfIm315 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfRe316 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfIm316 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512i ifft5633 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft5634 = _mm512_permutexvar_ps(ifft5633, sfRe313);
__m512i ifft5635 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft5636 = _mm512_permutexvar_ps(ifft5635, sfRe313);
__m512 ifft5637 = _mm512_permutexvar_ps(ifft5633, sfIm313);
__m512 ifft5638 = _mm512_permutexvar_ps(ifft5635, sfIm313);
__m512 ifft5639 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft5640 = _mm512_mask_fmadd_ps(ifft5638, 65021, ifft5639, ifft5634);
__m512 ifft5641 = _mm512_mask_fnmadd_ps(ifft5637, 65021, ifft5639, ifft5636);
__m512 ifft5642 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft5643 = _mm512_fmadd_ps(ifft5640, ifft5642, _mm512_shuffle_ps(ifft5640, ifft5640, 177));
__m512 ifft5644 = _mm512_fmadd_ps(ifft5641, ifft5642, _mm512_shuffle_ps(ifft5641, ifft5641, 177));
__m512 ifft5645 = _mm512_fmadd_ps(sfRe314, ifft5642, _mm512_shuffle_ps(sfRe314, sfRe314, 177));
__m512 ifft5646 = _mm512_fmadd_ps(sfIm314, ifft5642, _mm512_shuffle_ps(sfIm314, sfIm314, 177));
__m512 ifft5647 = _mm512_fmadd_ps(sfRe315, ifft5642, _mm512_shuffle_ps(sfRe315, sfRe315, 177));
__m512 ifft5648 = _mm512_fmadd_ps(sfIm315, ifft5642, _mm512_shuffle_ps(sfIm315, sfIm315, 177));
__m512 ifft5649 = _mm512_fmadd_ps(sfRe316, ifft5642, _mm512_shuffle_ps(sfRe316, sfRe316, 177));
__m512 ifft5650 = _mm512_fmadd_ps(sfIm316, ifft5642, _mm512_shuffle_ps(sfIm316, sfIm316, 177));
__m512 ifft5651 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft5652 = _mm512_mul_ps(ifft5643, ifft5651);
__m512 ifft5653 = _mm512_mul_ps(ifft5644, ifft5651);
__m512 ifft5654 = _mm512_mul_ps(ifft5645, ifft5651);
__m512 ifft5655 = _mm512_mul_ps(ifft5646, ifft5651);
__m512 ifft5656 = _mm512_mul_ps(ifft5647, ifft5651);
__m512 ifft5657 = _mm512_mul_ps(ifft5648, ifft5651);
__m512 ifft5658 = _mm512_mul_ps(ifft5649, ifft5651);
__m512 ifft5659 = _mm512_mul_ps(ifft5650, ifft5651);
__m512 ifft5660 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft5661 = _mm512_fnmadd_ps(ifft5644, ifft5660, ifft5652);
__m512 ifft5662 = _mm512_fmadd_ps(ifft5643, ifft5660, ifft5653);
__m512 ifft5663 = _mm512_fnmadd_ps(ifft5646, ifft5660, ifft5654);
__m512 ifft5664 = _mm512_fmadd_ps(ifft5645, ifft5660, ifft5655);
__m512 ifft5665 = _mm512_fnmadd_ps(ifft5648, ifft5660, ifft5656);
__m512 ifft5666 = _mm512_fmadd_ps(ifft5647, ifft5660, ifft5657);
__m512 ifft5667 = _mm512_fnmadd_ps(ifft5650, ifft5660, ifft5658);
__m512 ifft5668 = _mm512_fmadd_ps(ifft5649, ifft5660, ifft5659);
__m512 ifft5669 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft5670 = _mm512_fmadd_ps(ifft5661, ifft5669, _mm512_shuffle_ps(ifft5661, ifft5661, 78));
__m512 ifft5671 = _mm512_fmadd_ps(ifft5662, ifft5669, _mm512_shuffle_ps(ifft5662, ifft5662, 78));
__m512 ifft5672 = _mm512_fmadd_ps(ifft5663, ifft5669, _mm512_shuffle_ps(ifft5663, ifft5663, 78));
__m512 ifft5673 = _mm512_fmadd_ps(ifft5664, ifft5669, _mm512_shuffle_ps(ifft5664, ifft5664, 78));
__m512 ifft5674 = _mm512_fmadd_ps(ifft5665, ifft5669, _mm512_shuffle_ps(ifft5665, ifft5665, 78));
__m512 ifft5675 = _mm512_fmadd_ps(ifft5666, ifft5669, _mm512_shuffle_ps(ifft5666, ifft5666, 78));
__m512 ifft5676 = _mm512_fmadd_ps(ifft5667, ifft5669, _mm512_shuffle_ps(ifft5667, ifft5667, 78));
__m512 ifft5677 = _mm512_fmadd_ps(ifft5668, ifft5669, _mm512_shuffle_ps(ifft5668, ifft5668, 78));
__m512 ifft5678 = _mm512_mask_sub_ps(ifft5670, 49344, _mm512_setzero_ps(), ifft5671);
__m512 ifft5679 = _mm512_mask_mov_ps(ifft5671, 49344, ifft5670);
__m512 ifft5680 = _mm512_mask_sub_ps(ifft5672, 49344, _mm512_setzero_ps(), ifft5673);
__m512 ifft5681 = _mm512_mask_mov_ps(ifft5673, 49344, ifft5672);
__m512 ifft5682 = _mm512_mask_sub_ps(ifft5674, 49344, _mm512_setzero_ps(), ifft5675);
__m512 ifft5683 = _mm512_mask_mov_ps(ifft5675, 49344, ifft5674);
__m512 ifft5684 = _mm512_mask_sub_ps(ifft5676, 49344, _mm512_setzero_ps(), ifft5677);
__m512 ifft5685 = _mm512_mask_mov_ps(ifft5677, 49344, ifft5676);
__m512 ifft5686 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft5687 = _mm512_fmadd_ps(ifft5678, ifft5686, _mm512_shuffle_f32x4(ifft5678, ifft5678, 177));
__m512 ifft5688 = _mm512_fmadd_ps(ifft5679, ifft5686, _mm512_shuffle_f32x4(ifft5679, ifft5679, 177));
__m512 ifft5689 = _mm512_fmadd_ps(ifft5680, ifft5686, _mm512_shuffle_f32x4(ifft5680, ifft5680, 177));
__m512 ifft5690 = _mm512_fmadd_ps(ifft5681, ifft5686, _mm512_shuffle_f32x4(ifft5681, ifft5681, 177));
__m512 ifft5691 = _mm512_fmadd_ps(ifft5682, ifft5686, _mm512_shuffle_f32x4(ifft5682, ifft5682, 177));
__m512 ifft5692 = _mm512_fnmsub_ps(ifft5683, ifft5686, _mm512_shuffle_f32x4(ifft5683, ifft5683, 177));
__m512 ifft5693 = _mm512_fmadd_ps(ifft5684, ifft5686, _mm512_shuffle_f32x4(ifft5684, ifft5684, 177));
__m512 ifft5694 = _mm512_fmadd_ps(ifft5685, ifft5686, _mm512_shuffle_f32x4(ifft5685, ifft5685, 177));
__m512 ifft5695 = _mm512_add_ps(ifft5687, ifft5688);
__m512 ifft5696 = _mm512_sub_ps(ifft5687, ifft5688);
__m512 ifft5697 = _mm512_sub_ps(ifft5689, ifft5693);
__m512 ifft5698 = _mm512_add_ps(ifft5690, ifft5694);
__m512 ifft5699 = _mm512_add_ps(ifft5689, ifft5693);
__m512 ifft5700 = _mm512_sub_ps(ifft5690, ifft5694);
__m512 ifft5701 = _mm512_mul_ps(ifft5691, _mm512_set1_ps(3.125e-02f));
__m512 ifft5702 = _mm512_mul_ps(ifft5692, _mm512_set1_ps(3.125e-02f));
__m512 ifft5703 = _mm512_fmadd_ps(ifft5695, _mm512_set1_ps(1.5625e-02f), ifft5701);
__m512 ifft5704 = _mm512_fmsub_ps(ifft5695, _mm512_set1_ps(1.5625e-02f), ifft5701);
__m512 ifft5705 = _mm512_fmadd_ps(ifft5696, _mm512_set1_ps(1.5625e-02f), ifft5702);
__m512 ifft5706 = _mm512_fmsub_ps(ifft5696, _mm512_set1_ps(1.5625e-02f), ifft5702);
__m512 ifft5707 = _mm512_add_ps(ifft5697, ifft5698);
__m512 ifft5708 = _mm512_sub_ps(ifft5697, ifft5698);
__m512 ifft5709 = _mm512_fnmadd_ps(ifft5707, _mm512_set1_ps(7.0710677e-01f), ifft5699);
__m512 ifft5710 = _mm512_fmadd_ps(ifft5707, _mm512_set1_ps(7.0710677e-01f), ifft5699);
__m512 ifft5711 = _mm512_fmadd_ps(ifft5708, _mm512_set1_ps(7.0710677e-01f), ifft5700);
__m512 ifft5712 = _mm512_fmsub_ps(ifft5708, _mm512_set1_ps(7.0710677e-01f), ifft5700);
__m512 ifft5713 = _mm512_add_ps(ifft5709, ifft5710);
__m512 ifft5714 = _mm512_sub_ps(ifft5709, ifft5710);
__m512 ifft5715 = _mm512_add_ps(ifft5711, ifft5712);
__m512 ifft5716 = _mm512_sub_ps(ifft5711, ifft5712);
__m512 ifft5717 = _mm512_fmadd_ps(ifft5713, _mm512_set1_ps(1.5625e-02f), ifft5703);
__m512 ifft5718 = _mm512_fnmadd_ps(ifft5713, _mm512_set1_ps(1.5625e-02f), ifft5703);
__m512 ifft5719 = _mm512_fmadd_ps(ifft5715, _mm512_set1_ps(1.5625e-02f), ifft5705);
__m512 ifft5720 = _mm512_fnmadd_ps(ifft5715, _mm512_set1_ps(1.5625e-02f), ifft5705);
__m512 ifft5721 = _mm512_fnmadd_ps(ifft5716, _mm512_set1_ps(1.5625e-02f), ifft5704);
__m512 ifft5722 = _mm512_fmadd_ps(ifft5716, _mm512_set1_ps(1.5625e-02f), ifft5704);
__m512 ifft5723 = _mm512_fmadd_ps(ifft5714, _mm512_set1_ps(1.5625e-02f), ifft5706);
__m512 ifft5724 = _mm512_fnmadd_ps(ifft5714, _mm512_set1_ps(1.5625e-02f), ifft5706);
__m512 dat892 = ifft5717;
__m512 dat893 = ifft5719;
(void)ifft5721;
(void)ifft5723;
(void)ifft5718;
(void)ifft5720;
(void)ifft5722;
(void)ifft5724;
dat892 = _mm512_max_ps(_mm512_setzero_ps(), dat892);
dat893 = _mm512_max_ps(_mm512_setzero_ps(), dat893);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k43+100480*r20+448*toH19+4*toW19+0*t34, 3, dat892);
_mm512_mask_storeu_ps(datPtr2+50208+3215360*i9+200960*k43+100480*r20+448*toH19+4*toW19+0*t34, 768, dat892);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k43+100480*r20+448*toH19+4*toW19+0*t34, 3, dat893);
_mm512_mask_storeu_ps(datPtr2+50656+3215360*i9+200960*k43+100480*r20+448*toH19+4*toW19+0*t34, 768, dat893);
}
}
if (j5 >= last2) return;
++j5;
}

/*
 * Build the task descriptor for ResNet50StriderConsumeSums1Callee1 and
 * submit it through ResNet50ThreaderDo1.
 *
 * team17:   thread team the task is handed to.
 * tensors7: pointer table stored in the task's any1 field, passed along
 *           unchanged.
 *
 * The task is declared with three dimensions and a hull of 1 x 44 x 1.
 */
static void ResNet50StriderConsumeSums1(ResNet50ThreaderTeam1* team17, char** tensors7) {
    ResNet50ThreaderTask1 job;

    /* Iteration hull: three dimensions with extents 1, 44 and 1. */
    job.nd1 = 3;
    job.hull1[0] = 1;
    job.hull1[1] = 44;
    job.hull1[2] = 1;

    /* Worker entry point and the argument it will be given. */
    job.callee1 = ResNet50StriderConsumeSums1Callee1;
    job.any1 = tensors7;

    ResNet50ThreaderDo1(team17, &job);
}

/*
 * Per-network state object. ResNet50NetDestroy releases it by calling
 * free() on alloc1 and then on the struct itself.
 */
struct ResNet50Net {
/* Heap pointer owned by the net; ResNet50NetDestroy passes it to free(). */
char* alloc1;
/*
 * NOTE(review): presumably a 64-byte-aligned address inside the alloc1
 * block (ResNet50NetCreate rounds its malloc result up with (+63)&-64);
 * confirm where this field is assigned. It is never passed to free().
 */
char* align1;
};

// Releases a network created by ResNet50NetCreate: both the parameter
// buffer it owns and the handle itself. `net2` must be a valid handle.
void ResNet50NetDestroy(ResNet50Net* net2) {
    // Grab the buffer pointer before the handle that stores it goes away.
    char* paramBuffer = net2->alloc1;
    free(net2);
    free(paramBuffer);
}

// Builds a ResNet50Net from caller-supplied parameters.
//
// Allocates the long-lived parameter buffer, then, one layer group at a time,
// runs ResNet50BnSimplifyN on the batch-norm means/variances/scales/shifts
// (writing into a temporary buffer) and calls the matching Arrange function
// with the raw weights/biases, that temporary data and a fixed offset into
// the parameter buffer. A thread team is created for the duration of this
// call only.
//
// net1:     receives the new handle on success; not written on failure.
// params1:  source parameter tensors; the pointer is not stored in the handle.
// threads1: forwarded to ResNet50ThreaderCreate1.
//
// Returns 0 on success. On failure returns the message produced by
// ResNet50Errmsg1 or ResNet50ThreaderCreate1, with everything allocated
// here already released.
// NOTE(review): ownership of the returned message is not visible here —
// presumably the caller frees it; confirm against the header.
char* ResNet50NetCreate(
ResNet50Net** net1,
ResNet50Params* params1,
ptrdiff_t threads1
) {
// Bail out early on CPUs that do not report AVX512F.
if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
return ResNet50Errmsg1(__LINE__, "CPU does not support AVX512F");
}
// Long-lived parameter buffer (ends up owned by the returned handle).
// The size presumably includes 63 bytes of slack for the round-up below.
char* alloc3 = malloc(213782239);
if (__builtin_expect(!alloc3, 0)) {
return ResNet50Errmsg1(__LINE__, "errno %d", errno);
}
// Round the base address up to a 64-byte boundary.
char* align4 = (void*)(((size_t)alloc3+63)&-64);
// Temporary buffer for the BnSimplify outputs; freed before returning.
char* tmpAlloc1 = malloc(20543);
if (__builtin_expect(!tmpAlloc1, 0)) {
char* msg6 = ResNet50Errmsg1(__LINE__, "errno %d", errno);
free(alloc3);
return msg6;
}
char* tmpAlign1 = (void*)(((size_t)tmpAlloc1+63)&-64);
// Thread team used only for the Arrange calls below.
ResNet50ThreaderTeam1* team12 = 0;
char* err8 = ResNet50ThreaderCreate1(&team12, threads1);
if (__builtin_expect(!!err8, 0)) {
free(tmpAlloc1);
free(alloc3);
return err8;
}
// Each block below handles one layer group. tmpAlign1 is reused by every
// block, so the BnSimplify output only needs to live until the Arrange call
// that follows it. The last tensors entry (align4+offset) is presumably the
// destination of the arranged data — the same offsets are read back as
// netAlign1+offset in ResNet50EngineInference.
{
ResNet50BnSimplify1(
params1->bn1Means,
params1->bn1Variances,
params1->bn1Scales,
params1->bn1Shifts,
tmpAlign1+0
);
char* tensors275[] = {
(char*)params1->sevenDSWeights,
(char*)params1->sevenDSBiases,
tmpAlign1+0,
align4+0
};
ResNet50StriderArrangeFilts1(team12, tensors275);
}
// Two convolutions (one1, one2) are arranged together in a single call.
{
ResNet50BnSimplify2(
params1->bn2Means,
params1->bn2Variances,
params1->bn2Scales,
params1->bn2Shifts,
tmpAlign1+0
);
ResNet50BnSimplify1(
params1->bn3Means,
params1->bn3Variances,
params1->bn3Scales,
params1->bn3Shifts,
tmpAlign1+2048
);
char* tensors276[] = {
(char*)params1->one1Weights,
(char*)params1->one1Biases,
tmpAlign1+0,
(char*)params1->one2Weights,
(char*)params1->one2Biases,
tmpAlign1+2048,
align4+98560
};
ResNet50OneArrangeWts1(team12, tensors276);
}
{
ResNet50BnSimplify1(
params1->bn4Means,
params1->bn4Variances,
params1->bn4Scales,
params1->bn4Shifts,
tmpAlign1+0
);
char* tensors277[] = {
(char*)params1->three1Weights,
(char*)params1->three1Biases,
tmpAlign1+0,
align4+181760
};
ResNet50ThreeArrangeFilts1(team12, tensors277);
}
{
ResNet50BnSimplify2(
params1->bn5Means,
params1->bn5Variances,
params1->bn5Scales,
params1->bn5Shifts,
tmpAlign1+0
);
char* tensors278[] = {
(char*)params1->one3Weights,
(char*)params1->one3Biases,
tmpAlign1+0,
align4+706304
};
ResNet50OneArrangeWts2(team12, tensors278);
}
{
ResNet50BnSimplify1(
params1->bn6Means,
params1->bn6Variances,
params1->bn6Scales,
params1->bn6Shifts,
tmpAlign1+0
);
char* tensors279[] = {
(char*)params1->one4Weights,
(char*)params1->one4Biases,
tmpAlign1+0,
align4+772864
};
ResNet50OneArrangeWts3(team12, tensors279);
}
{
ResNet50BnSimplify1(
params1->bn7Means,
params1->bn7Variances,
params1->bn7Scales,
params1->bn7Shifts,
tmpAlign1+0
);
char* tensors280[] = {
(char*)params1->three2Weights,
(char*)params1->three2Biases,
tmpAlign1+0,
align4+838656
};
ResNet50ThreeArrangeFilts2(team12, tensors280);
}
{
ResNet50BnSimplify2(
params1->bn8Means,
params1->bn8Variances,
params1->bn8Scales,
params1->bn8Shifts,
tmpAlign1+0
);
char* tensors281[] = {
(char*)params1->one5Weights,
(char*)params1->one5Biases,
tmpAlign1+0,
align4+1363200
};
ResNet50OneArrangeWts2(team12, tensors281);
}
{
ResNet50BnSimplify1(
params1->bn9Means,
params1->bn9Variances,
params1->bn9Scales,
params1->bn9Shifts,
tmpAlign1+0
);
char* tensors282[] = {
(char*)params1->one6Weights,
(char*)params1->one6Biases,
tmpAlign1+0,
align4+1429760
};
ResNet50OneArrangeWts3(team12, tensors282);
}
{
ResNet50BnSimplify1(
params1->bn10Means,
params1->bn10Variances,
params1->bn10Scales,
params1->bn10Shifts,
tmpAlign1+0
);
char* tensors283[] = {
(char*)params1->three3Weights,
(char*)params1->three3Biases,
tmpAlign1+0,
align4+1495552
};
ResNet50ThreeArrangeFilts2(team12, tensors283);
}
{
ResNet50BnSimplify2(
params1->bn11Means,
params1->bn11Variances,
params1->bn11Scales,
params1->bn11Shifts,
tmpAlign1+0
);
char* tensors284[] = {
(char*)params1->one7Weights,
(char*)params1->one7Biases,
tmpAlign1+0,
align4+2020096
};
ResNet50OneArrangeWts2(team12, tensors284);
}
// Two convolutions (oneDS1, oneDS2) are arranged together in a single call.
{
ResNet50BnSimplify3(
params1->bn12Means,
params1->bn12Variances,
params1->bn12Scales,
params1->bn12Shifts,
tmpAlign1+0
);
ResNet50BnSimplify4(
params1->bn13Means,
params1->bn13Variances,
params1->bn13Scales,
params1->bn13Shifts,
tmpAlign1+4096
);
char* tensors285[] = {
(char*)params1->oneDS1Weights,
(char*)params1->oneDS1Biases,
tmpAlign1+0,
(char*)params1->oneDS2Weights,
(char*)params1->oneDS2Biases,
tmpAlign1+4096,
align4+2086656
};
ResNet50OneArrangeWts4(team12, tensors285);
}
{
ResNet50BnSimplify4(
params1->bn14Means,
params1->bn14Variances,
params1->bn14Scales,
params1->bn14Shifts,
tmpAlign1+0
);
char* tensors286[] = {
(char*)params1->three4Weights,
(char*)params1->three4Biases,
tmpAlign1+0,
align4+2744576
};
ResNet50ThreeArrangeFilts3(team12, tensors286);
}
{
ResNet50BnSimplify3(
params1->bn15Means,
params1->bn15Variances,
params1->bn15Scales,
params1->bn15Shifts,
tmpAlign1+0
);
char* tensors287[] = {
(char*)params1->one8Weights,
(char*)params1->one8Biases,
tmpAlign1+0,
align4+4842240
};
ResNet50OneArrangeWts5(team12, tensors287);
}
{
ResNet50BnSimplify4(
params1->bn16Means,
params1->bn16Variances,
params1->bn16Scales,
params1->bn16Shifts,
tmpAlign1+0
);
char* tensors288[] = {
(char*)params1->one9Weights,
(char*)params1->one9Biases,
tmpAlign1+0,
align4+5106432
};
ResNet50OneArrangeWts6(team12, tensors288);
}
{
ResNet50BnSimplify4(
params1->bn17Means,
params1->bn17Variances,
params1->bn17Scales,
params1->bn17Shifts,
tmpAlign1+0
);
char* tensors289[] = {
(char*)params1->three5Weights,
(char*)params1->three5Biases,
tmpAlign1+0,
align4+5369088
};
ResNet50ThreeArrangeFilts4(team12, tensors289);
}
{
ResNet50BnSimplify3(
params1->bn18Means,
params1->bn18Variances,
params1->bn18Scales,
params1->bn18Shifts,
tmpAlign1+0
);
char* tensors290[] = {
(char*)params1->one10Weights,
(char*)params1->one10Biases,
tmpAlign1+0,
align4+7466752
};
ResNet50OneArrangeWts5(team12, tensors290);
}
{
ResNet50BnSimplify4(
params1->bn19Means,
params1->bn19Variances,
params1->bn19Scales,
params1->bn19Shifts,
tmpAlign1+0
);
char* tensors291[] = {
(char*)params1->one11Weights,
(char*)params1->one11Biases,
tmpAlign1+0,
align4+7730944
};
ResNet50OneArrangeWts6(team12, tensors291);
}
{
ResNet50BnSimplify4(
params1->bn20Means,
params1->bn20Variances,
params1->bn20Scales,
params1->bn20Shifts,
tmpAlign1+0
);
char* tensors292[] = {
(char*)params1->three6Weights,
(char*)params1->three6Biases,
tmpAlign1+0,
align4+7993600
};
ResNet50ThreeArrangeFilts4(team12, tensors292);
}
{
ResNet50BnSimplify3(
params1->bn21Means,
params1->bn21Variances,
params1->bn21Scales,
params1->bn21Shifts,
tmpAlign1+0
);
char* tensors293[] = {
(char*)params1->one12Weights,
(char*)params1->one12Biases,
tmpAlign1+0,
align4+10091264
};
ResNet50OneArrangeWts5(team12, tensors293);
}
{
ResNet50BnSimplify4(
params1->bn22Means,
params1->bn22Variances,
params1->bn22Scales,
params1->bn22Shifts,
tmpAlign1+0
);
char* tensors294[] = {
(char*)params1->one13Weights,
(char*)params1->one13Biases,
tmpAlign1+0,
align4+10355456
};
ResNet50OneArrangeWts6(team12, tensors294);
}
{
ResNet50BnSimplify4(
params1->bn23Means,
params1->bn23Variances,
params1->bn23Scales,
params1->bn23Shifts,
tmpAlign1+0
);
char* tensors295[] = {
(char*)params1->three7Weights,
(char*)params1->three7Biases,
tmpAlign1+0,
align4+10618112
};
ResNet50ThreeArrangeFilts4(team12, tensors295);
}
{
ResNet50BnSimplify3(
params1->bn24Means,
params1->bn24Variances,
params1->bn24Scales,
params1->bn24Shifts,
tmpAlign1+0
);
char* tensors296[] = {
(char*)params1->one14Weights,
(char*)params1->one14Biases,
tmpAlign1+0,
align4+12715776
};
ResNet50OneArrangeWts5(team12, tensors296);
}
// Two convolutions (oneDS3, oneDS4) are arranged together in a single call.
{
ResNet50BnSimplify5(
params1->bn25Means,
params1->bn25Variances,
params1->bn25Scales,
params1->bn25Shifts,
tmpAlign1+0
);
ResNet50BnSimplify2(
params1->bn26Means,
params1->bn26Variances,
params1->bn26Scales,
params1->bn26Shifts,
tmpAlign1+8192
);
char* tensors297[] = {
(char*)params1->oneDS3Weights,
(char*)params1->oneDS3Biases,
tmpAlign1+0,
(char*)params1->oneDS4Weights,
(char*)params1->oneDS4Biases,
tmpAlign1+8192,
align4+12979968
};
ResNet50OneArrangeWts7(team12, tensors297);
}
{
ResNet50BnSimplify2(
params1->bn27Means,
params1->bn27Variances,
params1->bn27Scales,
params1->bn27Shifts,
tmpAlign1+0
);
char* tensors298[] = {
(char*)params1->three8Weights,
(char*)params1->three8Biases,
tmpAlign1+0,
align4+15606528
};
ResNet50ThreeArrangeFilts5(team12, tensors298);
}
{
ResNet50BnSimplify5(
params1->bn28Means,
params1->bn28Variances,
params1->bn28Scales,
params1->bn28Shifts,
tmpAlign1+0
);
char* tensors299[] = {
(char*)params1->one15Weights,
(char*)params1->one15Biases,
tmpAlign1+0,
align4+23996160
};
ResNet50OneArrangeWts8(team12, tensors299);
}
{
ResNet50BnSimplify2(
params1->bn29Means,
params1->bn29Variances,
params1->bn29Scales,
params1->bn29Shifts,
tmpAlign1+0
);
char* tensors300[] = {
(char*)params1->one16Weights,
(char*)params1->one16Biases,
tmpAlign1+0,
align4+25048832
};
ResNet50OneArrangeWts9(team12, tensors300);
}
{
ResNet50BnSimplify2(
params1->bn30Means,
params1->bn30Variances,
params1->bn30Scales,
params1->bn30Shifts,
tmpAlign1+0
);
char* tensors301[] = {
(char*)params1->three9Weights,
(char*)params1->three9Biases,
tmpAlign1+0,
align4+26098432
};
ResNet50ThreeArrangeFilts6(team12, tensors301);
}
{
ResNet50BnSimplify5(
params1->bn31Means,
params1->bn31Variances,
params1->bn31Scales,
params1->bn31Shifts,
tmpAlign1+0
);
char* tensors302[] = {
(char*)params1->one17Weights,
(char*)params1->one17Biases,
tmpAlign1+0,
align4+34488064
};
ResNet50OneArrangeWts8(team12, tensors302);
}
{
ResNet50BnSimplify2(
params1->bn32Means,
params1->bn32Variances,
params1->bn32Scales,
params1->bn32Shifts,
tmpAlign1+0
);
char* tensors303[] = {
(char*)params1->one18Weights,
(char*)params1->one18Biases,
tmpAlign1+0,
align4+35540736
};
ResNet50OneArrangeWts9(team12, tensors303);
}
{
ResNet50BnSimplify2(
params1->bn33Means,
params1->bn33Variances,
params1->bn33Scales,
params1->bn33Shifts,
tmpAlign1+0
);
char* tensors304[] = {
(char*)params1->three10Weights,
(char*)params1->three10Biases,
tmpAlign1+0,
align4+36590336
};
ResNet50ThreeArrangeFilts6(team12, tensors304);
}
{
ResNet50BnSimplify5(
params1->bn34Means,
params1->bn34Variances,
params1->bn34Scales,
params1->bn34Shifts,
tmpAlign1+0
);
char* tensors305[] = {
(char*)params1->one19Weights,
(char*)params1->one19Biases,
tmpAlign1+0,
align4+44979968
};
ResNet50OneArrangeWts8(team12, tensors305);
}
{
ResNet50BnSimplify2(
params1->bn35Means,
params1->bn35Variances,
params1->bn35Scales,
params1->bn35Shifts,
tmpAlign1+0
);
char* tensors306[] = {
(char*)params1->one20Weights,
(char*)params1->one20Biases,
tmpAlign1+0,
align4+46032640
};
ResNet50OneArrangeWts9(team12, tensors306);
}
{
ResNet50BnSimplify2(
params1->bn36Means,
params1->bn36Variances,
params1->bn36Scales,
params1->bn36Shifts,
tmpAlign1+0
);
char* tensors307[] = {
(char*)params1->three11Weights,
(char*)params1->three11Biases,
tmpAlign1+0,
align4+47082240
};
ResNet50ThreeArrangeFilts6(team12, tensors307);
}
{
ResNet50BnSimplify5(
params1->bn37Means,
params1->bn37Variances,
params1->bn37Scales,
params1->bn37Shifts,
tmpAlign1+0
);
char* tensors308[] = {
(char*)params1->one21Weights,
(char*)params1->one21Biases,
tmpAlign1+0,
align4+55471872
};
ResNet50OneArrangeWts8(team12, tensors308);
}
{
ResNet50BnSimplify2(
params1->bn38Means,
params1->bn38Variances,
params1->bn38Scales,
params1->bn38Shifts,
tmpAlign1+0
);
char* tensors309[] = {
(char*)params1->one22Weights,
(char*)params1->one22Biases,
tmpAlign1+0,
align4+56524544
};
ResNet50OneArrangeWts9(team12, tensors309);
}
{
ResNet50BnSimplify2(
params1->bn39Means,
params1->bn39Variances,
params1->bn39Scales,
params1->bn39Shifts,
tmpAlign1+0
);
char* tensors310[] = {
(char*)params1->three12Weights,
(char*)params1->three12Biases,
tmpAlign1+0,
align4+57574144
};
ResNet50ThreeArrangeFilts6(team12, tensors310);
}
{
ResNet50BnSimplify5(
params1->bn40Means,
params1->bn40Variances,
params1->bn40Scales,
params1->bn40Shifts,
tmpAlign1+0
);
char* tensors311[] = {
(char*)params1->one23Weights,
(char*)params1->one23Biases,
tmpAlign1+0,
align4+65963776
};
ResNet50OneArrangeWts8(team12, tensors311);
}
{
ResNet50BnSimplify2(
params1->bn41Means,
params1->bn41Variances,
params1->bn41Scales,
params1->bn41Shifts,
tmpAlign1+0
);
char* tensors312[] = {
(char*)params1->one24Weights,
(char*)params1->one24Biases,
tmpAlign1+0,
align4+67016448
};
ResNet50OneArrangeWts9(team12, tensors312);
}
{
ResNet50BnSimplify2(
params1->bn42Means,
params1->bn42Variances,
params1->bn42Scales,
params1->bn42Shifts,
tmpAlign1+0
);
char* tensors313[] = {
(char*)params1->three13Weights,
(char*)params1->three13Biases,
tmpAlign1+0,
align4+68066048
};
ResNet50ThreeArrangeFilts6(team12, tensors313);
}
{
ResNet50BnSimplify5(
params1->bn43Means,
params1->bn43Variances,
params1->bn43Scales,
params1->bn43Shifts,
tmpAlign1+0
);
char* tensors314[] = {
(char*)params1->one25Weights,
(char*)params1->one25Biases,
tmpAlign1+0,
align4+76455680
};
ResNet50OneArrangeWts8(team12, tensors314);
}
// Two convolutions (oneDS5, oneDS6) are arranged together in a single call.
{
ResNet50BnSimplify6(
params1->bn44Means,
params1->bn44Variances,
params1->bn44Scales,
params1->bn44Shifts,
tmpAlign1+0
);
ResNet50BnSimplify3(
params1->bn45Means,
params1->bn45Variances,
params1->bn45Scales,
params1->bn45Shifts,
tmpAlign1+16384
);
char* tensors315[] = {
(char*)params1->oneDS5Weights,
(char*)params1->oneDS5Biases,
tmpAlign1+0,
(char*)params1->oneDS6Weights,
(char*)params1->oneDS6Biases,
tmpAlign1+16384,
align4+77508352
};
ResNet50OneArrangeWts10(team12, tensors315);
}
{
ResNet50BnSimplify3(
params1->bn46Means,
params1->bn46Variances,
params1->bn46Scales,
params1->bn46Shifts,
tmpAlign1+0
);
char* tensors316[] = {
(char*)params1->three14Weights,
(char*)params1->three14Biases,
tmpAlign1+0,
align4+88004352
};
ResNet50ThreeArrangeFilts7(team12, tensors316);
}
{
ResNet50BnSimplify6(
params1->bn47Means,
params1->bn47Variances,
params1->bn47Scales,
params1->bn47Shifts,
tmpAlign1+0
);
char* tensors317[] = {
(char*)params1->one26Weights,
(char*)params1->one26Biases,
tmpAlign1+0,
align4+121560832
};
ResNet50OneArrangeWts11(team12, tensors317);
}
{
ResNet50BnSimplify3(
params1->bn48Means,
params1->bn48Variances,
params1->bn48Scales,
params1->bn48Shifts,
tmpAlign1+0
);
char* tensors318[] = {
(char*)params1->one27Weights,
(char*)params1->one27Biases,
tmpAlign1+0,
align4+125763328
};
ResNet50OneArrangeWts12(team12, tensors318);
}
{
ResNet50BnSimplify3(
params1->bn49Means,
params1->bn49Variances,
params1->bn49Scales,
params1->bn49Shifts,
tmpAlign1+0
);
char* tensors319[] = {
(char*)params1->three15Weights,
(char*)params1->three15Biases,
tmpAlign1+0,
align4+129963776
};
ResNet50ThreeArrangeFilts8(team12, tensors319);
}
{
ResNet50BnSimplify6(
params1->bn50Means,
params1->bn50Variances,
params1->bn50Scales,
params1->bn50Shifts,
tmpAlign1+0
);
char* tensors320[] = {
(char*)params1->one28Weights,
(char*)params1->one28Biases,
tmpAlign1+0,
align4+163520256
};
ResNet50OneArrangeWts11(team12, tensors320);
}
{
ResNet50BnSimplify3(
params1->bn51Means,
params1->bn51Variances,
params1->bn51Scales,
params1->bn51Shifts,
tmpAlign1+0
);
char* tensors321[] = {
(char*)params1->one29Weights,
(char*)params1->one29Biases,
tmpAlign1+0,
align4+167722752
};
ResNet50OneArrangeWts12(team12, tensors321);
}
{
ResNet50BnSimplify3(
params1->bn52Means,
params1->bn52Variances,
params1->bn52Scales,
params1->bn52Shifts,
tmpAlign1+0
);
char* tensors322[] = {
(char*)params1->three16Weights,
(char*)params1->three16Biases,
tmpAlign1+0,
align4+171923200
};
ResNet50ThreeArrangeFilts8(team12, tensors322);
}
{
ResNet50BnSimplify6(
params1->bn53Means,
params1->bn53Variances,
params1->bn53Scales,
params1->bn53Shifts,
tmpAlign1+0
);
char* tensors323[] = {
(char*)params1->one30Weights,
(char*)params1->one30Biases,
tmpAlign1+0,
align4+205479680
};
ResNet50OneArrangeWts11(team12, tensors323);
}
// The fc layer has no batch-norm step and does not use the temporary buffer.
{
char* tensors324[] = {
(char*)params1->fcWeights,
(char*)params1->fcBiases,
align4+209682176
};
ResNet50FcArrange1(team12, tensors324);
}
// All arranging is done: the team and the temporary buffer are no longer needed.
ResNet50ThreaderDestroy1(team12);
free(tmpAlloc1);
// The handle is allocated last; if that fails, release the packed buffer.
ResNet50Net* net5 = malloc(sizeof(ResNet50Net));
if (__builtin_expect(!net5, 0)) {
char* msg7 = ResNet50Errmsg1(__LINE__, "errno %d", errno);
free(alloc3);
return msg7;
}
net5->alloc1 = alloc3;
net5->align1 = align4;
*net1 = net5;
return 0;
}

// Per-engine inference state built by ResNet50EngineCreate.
struct ResNet50Engine {
// Network handle passed to ResNet50EngineCreate. ResNet50EngineDestroy does
// not free it, so it must stay valid for as long as the engine is used.
ResNet50Net* net3;
// Thread team created in ResNet50EngineCreate and destroyed in
// ResNet50EngineDestroy.
ResNet50ThreaderTeam1* team11;
// Raw pointer returned by malloc for the engine's working buffer; this is
// the pointer ResNet50EngineDestroy frees.
char* alloc2;
// alloc2 rounded up to a 64-byte boundary; ResNet50EngineInference uses it
// as the base for all of its working-buffer offsets.
char* align2;
};

// Thin wrapper over ResNet50ThreaderPthreadT1 for the engine's own thread
// team: forwards `to1` and `idx2` and returns that call's result unchanged.
char* ResNet50EnginePthreadT(
    ResNet50Engine* eng2,
    ptrdiff_t idx2,
    pthread_t* to1
) {
    ResNet50ThreaderTeam1* engineTeam = eng2->team11;
    char* outcome = ResNet50ThreaderPthreadT1(to1, engineTeam, idx2);
    return outcome;
}

// Tears down an engine created by ResNet50EngineCreate: destroys its thread
// team, then frees the working buffer and the engine object. The network the
// engine refers to is left alone.
void ResNet50EngineDestroy(ResNet50Engine* eng3) {
    ResNet50ThreaderTeam1* engineTeam = eng3->team11;
    ResNet50ThreaderDestroy1(engineTeam);

    // Read the buffer pointer before the struct holding it is released.
    char* workBuffer = eng3->alloc2;
    free(workBuffer);
    free(eng3);
}

// Creates an inference engine bound to `net4`.
//
// Allocates the engine object and its working buffer, then creates a thread
// team with `threads2` forwarded to ResNet50ThreaderCreate1.
//
// On success stores the engine in *eng4 and returns 0. On failure returns an
// error message (from ResNet50Errmsg1 or ResNet50ThreaderCreate1), releases
// everything allocated here, and leaves *eng4 untouched.
char* ResNet50EngineCreate(
    ResNet50Engine** eng4,
    ResNet50Net* net4,
    ptrdiff_t threads2
) {
    ResNet50Engine* engine = malloc(sizeof *engine);
    if (__builtin_expect(engine == 0, 0)) {
        return ResNet50Errmsg1(__LINE__, "errno %d", errno);
    }

    char* workBuffer = malloc(18364671);
    if (__builtin_expect(workBuffer == 0, 0)) {
        char* allocMsg = ResNet50Errmsg1(__LINE__, "errno %d", errno);
        free(engine);
        return allocMsg;
    }

    engine->net3 = net4;
    engine->alloc2 = workBuffer;
    // Round the buffer's base address up to a 64-byte boundary.
    engine->align2 = (void*)(((size_t)workBuffer + 63) & -64);

    char* teamMsg = ResNet50ThreaderCreate1(&engine->team11, threads2);
    if (__builtin_expect(teamMsg != 0, 0)) {
        free(workBuffer);
        free(engine);
        return teamMsg;
    }

    *eng4 = engine;
    return 0;
}

// Runs one inference pass by calling the generated layer routines in a fixed
// order on the engine's thread team.
//
// eng1:      engine from ResNet50EngineCreate.
// imageData: input tensor; only handed to ResNet50StriderArrangeDats1. The
//            graph declares the input as Channels=3 Height=224 Width=224.
// probData:  output tensor; only handed to ResNet50Softmax1.
//            NOTE(review): required length is not visible here — confirm
//            against the header.
//
// Addressing convention used throughout:
//   netAlign1+N  offset into the network's packed parameter buffer; the same
//                offsets appear as align4+N in ResNet50NetCreate.
//   align3+N     offset into the engine's working buffer. Regions are reused
//                across steps, so the order of the blocks below matters.
// Each tensors array lists the pointers one routine works on; which entries
// are inputs and which are outputs is defined by the callee.
void ResNet50EngineInference(
ResNet50Engine* eng1,
float* imageData,
float* probData
) {
char* netAlign1 = eng1->net3->align1;
ResNet50ThreaderTeam1* team14 = eng1->team11;
char* align3 = eng1->align2;
// First layer: uses the parameters at netAlign1+0, which ResNet50NetCreate
// fills from sevenDSWeights/sevenDSBiases.
{
char* tensors156[] = {
(char*)imageData,
align3+8069312
};
ResNet50StriderArrangeDats1(team14, tensors156);
char* tensors157[] = {
netAlign1+0,
align3+8069312,
align3+9697472
};
ResNet50StriderProduceSums1(team14, tensors157);
char* tensors158[] = {
align3+9697472,
align3+0
};
ResNet50StriderConsumeSums1(team14, tensors158);
}
// presumably the Max3x3Stride2 pooling step from the graph — TODO confirm
{
char* tensors159[] = {
align3+0,
align3+4034624
};
ResNet50Thrpl1(team14, tensors159);
}
// From here on the blocks follow two recurring shapes:
//   One*:   ArrangeDats then Apply
//   Three*: ArrangeDats, ProduceSums, then ConsumeSums
{
char* tensors160[] = {
align3+4034624,
align3+8069312
};
ResNet50OneArrangeDats1(team14, tensors160);
char* tensors161[] = {
netAlign1+98560,
align3+8069312,
align3+0
};
ResNet50OneApply1(team14, tensors161);
}
{
char* tensors162[] = {
align3+3227648,
align3+8069312
};
ResNet50ThreeArrangeDats1(team14, tensors162);
char* tensors163[] = {
netAlign1+181760,
align3+8069312,
align3+9707712
};
ResNet50ThreeProduceSums1(team14, tensors163);
char* tensors164[] = {
align3+9707712,
align3+7262336
};
ResNet50ThreeConsumeSums1(team14, tensors164);
}
{
char* tensors165[] = {
align3+7262336,
align3+8069312
};
ResNet50OneArrangeDats2(team14, tensors165);
char* tensors166[] = {
netAlign1+706304,
align3+8069312,
align3+0,
align3+4034624
};
ResNet50OneApply2(team14, tensors166);
}
{
char* tensors167[] = {
align3+4034624,
align3+8069312
};
ResNet50OneArrangeDats3(team14, tensors167);
char* tensors168[] = {
netAlign1+772864,
align3+8069312,
align3+0
};
ResNet50OneApply3(team14, tensors168);
}
{
char* tensors169[] = {
align3+0,
align3+8069312
};
ResNet50ThreeArrangeDats2(team14, tensors169);
char* tensors170[] = {
netAlign1+838656,
align3+8069312,
align3+9707712
};
ResNet50ThreeProduceSums2(team14, tensors170);
char* tensors171[] = {
align3+9707712,
align3+7262336
};
ResNet50ThreeConsumeSums2(team14, tensors171);
}
{
char* tensors172[] = {
align3+7262336,
align3+8069312
};
ResNet50OneArrangeDats2(team14, tensors172);
char* tensors173[] = {
netAlign1+1363200,
align3+8069312,
align3+4034624,
align3+0
};
ResNet50OneApply2(team14, tensors173);
}
{
char* tensors174[] = {
align3+0,
align3+8069312
};
ResNet50OneArrangeDats3(team14, tensors174);
char* tensors175[] = {
netAlign1+1429760,
align3+8069312,
align3+3227712
};
ResNet50OneApply3(team14, tensors175);
}
{
char* tensors176[] = {
align3+3227712,
align3+8069312
};
ResNet50ThreeArrangeDats2(team14, tensors176);
char* tensors177[] = {
netAlign1+1495552,
align3+8069312,
align3+9707712
};
ResNet50ThreeProduceSums2(team14, tensors177);
char* tensors178[] = {
align3+9707712,
align3+6455424
};
ResNet50ThreeConsumeSums2(team14, tensors178);
}
{
char* tensors179[] = {
align3+6455424,
align3+8069312
};
ResNet50OneArrangeDats2(team14, tensors179);
char* tensors180[] = {
netAlign1+2020096,
align3+8069312,
align3+0,
align3+3227712
};
ResNet50OneApply2(team14, tensors180);
}
{
char* tensors181[] = {
align3+3227712,
align3+8069312
};
ResNet50OneArrangeDats4(team14, tensors181);
char* tensors182[] = {
netAlign1+2086656,
align3+8069312,
align3+0
};
ResNet50OneApply4(team14, tensors182);
}
{
char* tensors183[] = {
align3+1605632,
align3+8069312
};
ResNet50ThreeArrangeDats3(team14, tensors183);
char* tensors184[] = {
netAlign1+2744576,
align3+8069312,
align3+8888512
};
ResNet50ThreeProduceSums3(team14, tensors184);
char* tensors185[] = {
align3+8888512,
align3+3612800
};
ResNet50ThreeConsumeSums3(team14, tensors185);
}
{
char* tensors186[] = {
align3+3612800,
align3+8069312
};
ResNet50OneArrangeDats5(team14, tensors186);
char* tensors187[] = {
netAlign1+4842240,
align3+8069312,
align3+0,
align3+2007104
};
ResNet50OneApply5(team14, tensors187);
}
{
char* tensors188[] = {
align3+2007104,
align3+8069312
};
ResNet50OneArrangeDats6(team14, tensors188);
char* tensors189[] = {
netAlign1+5106432,
align3+8069312,
align3+0
};
ResNet50OneApply6(team14, tensors189);
}
{
char* tensors190[] = {
align3+0,
align3+8069312
};
ResNet50ThreeArrangeDats4(team14, tensors190);
char* tensors191[] = {
netAlign1+5369088,
align3+8069312,
align3+8888512
};
ResNet50ThreeProduceSums4(team14, tensors191);
char* tensors192[] = {
align3+8888512,
align3+3612800
};
ResNet50ThreeConsumeSums4(team14, tensors192);
}
{
char* tensors193[] = {
align3+3612800,
align3+8069312
};
ResNet50OneArrangeDats5(team14, tensors193);
char* tensors194[] = {
netAlign1+7466752,
align3+8069312,
align3+2007104,
align3+0
};
ResNet50OneApply5(team14, tensors194);
}
{
char* tensors195[] = {
align3+0,
align3+8069312
};
ResNet50OneArrangeDats6(team14, tensors195);
char* tensors196[] = {
netAlign1+7730944,
align3+8069312,
align3+1605696
};
ResNet50OneApply6(team14, tensors196);
}
{
char* tensors197[] = {
align3+1605696,
align3+8069312
};
ResNet50ThreeArrangeDats4(team14, tensors197);
char* tensors198[] = {
netAlign1+7993600,
align3+8069312,
align3+8888512
};
ResNet50ThreeProduceSums4(team14, tensors198);
char* tensors199[] = {
align3+8888512,
align3+3211392
};
ResNet50ThreeConsumeSums4(team14, tensors199);
}
{
char* tensors200[] = {
align3+3211392,
align3+8069312
};
ResNet50OneArrangeDats5(team14, tensors200);
char* tensors201[] = {
netAlign1+10091264,
align3+8069312,
align3+0,
align3+1605696
};
ResNet50OneApply5(team14, tensors201);
}
{
char* tensors202[] = {
align3+1605696,
align3+8069312
};
ResNet50OneArrangeDats6(team14, tensors202);
char* tensors203[] = {
netAlign1+10355456,
align3+8069312,
align3+0
};
ResNet50OneApply6(team14, tensors203);
}
{
char* tensors204[] = {
align3+0,
align3+8069312
};
ResNet50ThreeArrangeDats4(team14, tensors204);
char* tensors205[] = {
netAlign1+10618112,
align3+8069312,
align3+8888512
};
ResNet50ThreeProduceSums4(team14, tensors205);
char* tensors206[] = {
align3+8888512,
align3+3211392
};
ResNet50ThreeConsumeSums4(team14, tensors206);
}
{
char* tensors207[] = {
align3+3211392,
align3+8069312
};
ResNet50OneArrangeDats5(team14, tensors207);
char* tensors208[] = {
netAlign1+12715776,
align3+8069312,
align3+1605696,
align3+0
};
ResNet50OneApply5(team14, tensors208);
}
{
char* tensors209[] = {
align3+0,
align3+8069312
};
ResNet50OneArrangeDats7(team14, tensors209);
char* tensors210[] = {
netAlign1+12979968,
align3+8069312,
align3+1605696
};
ResNet50OneApply7(team14, tensors210);
}
{
char* tensors211[] = {
align3+2457664,
align3+8069312
};
ResNet50ThreeArrangeDats5(team14, tensors211);
char* tensors212[] = {
netAlign1+15606528,
align3+8069312,
align3+8659136
};
ResNet50ThreeProduceSums5(team14, tensors212);
char* tensors213[] = {
align3+8659136,
align3+852032
};
ResNet50ThreeConsumeSums5(team14, tensors213);
}
{
char* tensors214[] = {
align3+852032,
align3+8069312
};
ResNet50OneArrangeDats8(team14, tensors214);
char* tensors215[] = {
netAlign1+23996160,
align3+8069312,
align3+1605696,
align3+0
};
ResNet50OneApply8(team14, tensors215);
}
{
char* tensors216[] = {
align3+0,
align3+8069312
};
ResNet50OneArrangeDats9(team14, tensors216);
char* tensors217[] = {
netAlign1+25048832,
align3+8069312,
align3+852032
};
ResNet50OneApply9(team14, tensors217);
}
{
char* tensors218[] = {
align3+852032,
align3+8069312
};
ResNet50ThreeArrangeDats6(team14, tensors218);
char* tensors219[] = {
netAlign1+26098432,
align3+8069312,
align3+8659136
};
ResNet50ThreeProduceSums6(team14, tensors219);
char* tensors220[] = {
align3+8659136,
align3+1704064
};
ResNet50ThreeConsumeSums6(team14, tensors220);
}
{
char* tensors221[] = {
align3+1704064,
align3+8069312
};
ResNet50OneArrangeDats8(team14, tensors221);
char* tensors222[] = {
netAlign1+34488064,
align3+8069312,
align3+0,
align3+852032
};
ResNet50OneApply8(team14, tensors222);
}
{
char* tensors223[] = {
align3+852032,
align3+8069312
};
ResNet50OneArrangeDats9(team14, tensors223);
char* tensors224[] = {
netAlign1+35540736,
align3+8069312,
align3+0
};
ResNet50OneApply9(team14, tensors224);
}
{
char* tensors225[] = {
align3+0,
align3+8069312
};
ResNet50ThreeArrangeDats6(team14, tensors225);
char* tensors226[] = {
netAlign1+36590336,
align3+8069312,
align3+8659136
};
ResNet50ThreeProduceSums6(team14, tensors226);
char* tensors227[] = {
align3+8659136,
align3+1704064
};
ResNet50ThreeConsumeSums6(team14, tensors227);
}
{
char* tensors228[] = {
align3+1704064,
align3+8069312
};
ResNet50OneArrangeDats8(team14, tensors228);
char* tensors229[] = {
netAlign1+44979968,
align3+8069312,
align3+852032,
align3+0
};
ResNet50OneApply8(team14, tensors229);
}
{
char* tensors230[] = {
align3+0,
align3+8069312
};
ResNet50OneArrangeDats9(team14, tensors230);
char* tensors231[] = {
netAlign1+46032640,
align3+8069312,
align3+852032
};
ResNet50OneApply9(team14, tensors231);
}
{
char* tensors232[] = {
align3+852032,
align3+8069312
};
ResNet50ThreeArrangeDats6(team14, tensors232);
char* tensors233[] = {
netAlign1+47082240,
align3+8069312,
align3+8659136
};
ResNet50ThreeProduceSums6(team14, tensors233);
char* tensors234[] = {
align3+8659136,
align3+1704064
};
ResNet50ThreeConsumeSums6(team14, tensors234);
}
{
char* tensors235[] = {
align3+1704064,
align3+8069312
};
ResNet50OneArrangeDats8(team14, tensors235);
char* tensors236[] = {
netAlign1+55471872,
align3+8069312,
align3+0,
align3+852032
};
ResNet50OneApply8(team14, tensors236);
}
{
char* tensors237[] = {
align3+852032,
align3+8069312
};
ResNet50OneArrangeDats9(team14, tensors237);
char* tensors238[] = {
netAlign1+56524544,
align3+8069312,
align3+0
};
ResNet50OneApply9(team14, tensors238);
}
{
char* tensors239[] = {
align3+0,
align3+8069312
};
ResNet50ThreeArrangeDats6(team14, tensors239);
char* tensors240[] = {
netAlign1+57574144,
align3+8069312,
align3+8659136
};
ResNet50ThreeProduceSums6(team14, tensors240);
char* tensors241[] = {
align3+8659136,
align3+1704064
};
ResNet50ThreeConsumeSums6(team14, tensors241);
}
{
char* tensors242[] = {
align3+1704064,
align3+8069312
};
ResNet50OneArrangeDats8(team14, tensors242);
char* tensors243[] = {
netAlign1+65963776,
align3+8069312,
align3+852032,
align3+0
};
ResNet50OneApply8(team14, tensors243);
}
{
char* tensors244[] = {
align3+0,
align3+8069312
};
ResNet50OneArrangeDats9(team14, tensors244);
char* tensors245[] = {
netAlign1+67016448,
align3+8069312,
align3+852032
};
ResNet50OneApply9(team14, tensors245);
}
{
char* tensors246[] = {
align3+852032,
align3+8069312
};
ResNet50ThreeArrangeDats6(team14, tensors246);
char* tensors247[] = {
netAlign1+68066048,
align3+8069312,
align3+8659136
};
ResNet50ThreeProduceSums6(team14, tensors247);
char* tensors248[] = {
align3+8659136,
align3+1704064
};
ResNet50ThreeConsumeSums6(team14, tensors248);
}
{
char* tensors249[] = {
align3+1704064,
align3+8069312
};
ResNet50OneArrangeDats8(team14, tensors249);
char* tensors250[] = {
netAlign1+76455680,
align3+8069312,
align3+0,
align3+852032
};
ResNet50OneApply8(team14, tensors250);
}
{
char* tensors251[] = {
align3+852032,
align3+8069312
};
ResNet50OneArrangeDats10(team14, tensors251);
char* tensors252[] = {
netAlign1+77508352,
align3+8069312,
align3+0
};
ResNet50OneApply10(team14, tensors252);
}
{
char* tensors253[] = {
align3+655360,
align3+8069312
};
ResNet50ThreeArrangeDats7(team14, tensors253);
char* tensors254[] = {
netAlign1+88004352,
align3+8069312,
align3+8593600
};
ResNet50ThreeProduceSums7(team14, tensors254);
char* tensors255[] = {
align3+8593600,
align3+1474688
};
ResNet50ThreeConsumeSums7(team14, tensors255);
}
{
char* tensors256[] = {
align3+1474688,
align3+8069312
};
ResNet50OneArrangeDats11(team14, tensors256);
char* tensors257[] = {
netAlign1+121560832,
align3+8069312,
align3+0,
align3+819264
};
ResNet50OneApply11(team14, tensors257);
}
{
char* tensors258[] = {
align3+819264,
align3+8069312
};
ResNet50OneArrangeDats12(team14, tensors258);
char* tensors259[] = {
netAlign1+125763328,
align3+8069312,
align3+0
};
ResNet50OneApply12(team14, tensors259);
}
{
char* tensors260[] = {
align3+0,
align3+8069312
};
ResNet50ThreeArrangeDats8(team14, tensors260);
char* tensors261[] = {
netAlign1+129963776,
align3+8069312,
align3+8593600
};
ResNet50ThreeProduceSums8(team14, tensors261);
char* tensors262[] = {
align3+8593600,
align3+1474688
};
ResNet50ThreeConsumeSums8(team14, tensors262);
}
{
char* tensors263[] = {
align3+1474688,
align3+8069312
};
ResNet50OneArrangeDats11(team14, tensors263);
char* tensors264[] = {
netAlign1+163520256,
align3+8069312,
align3+819264,
align3+0
};
ResNet50OneApply11(team14, tensors264);
}
{
char* tensors265[] = {
align3+0,
align3+8069312
};
ResNet50OneArrangeDats12(team14, tensors265);
char* tensors266[] = {
netAlign1+167722752,
align3+8069312,
align3+655424
};
ResNet50OneApply12(team14, tensors266);
}
{
char* tensors267[] = {
align3+655424,
align3+8069312
};
ResNet50ThreeArrangeDats8(team14, tensors267);
char* tensors268[] = {
netAlign1+171923200,
align3+8069312,
align3+8593600
};
ResNet50ThreeProduceSums8(team14, tensors268);
char* tensors269[] = {
align3+8593600,
align3+1310848
};
ResNet50ThreeConsumeSums8(team14, tensors269);
}
{
char* tensors270[] = {
align3+1310848,
align3+8069312
};
ResNet50OneArrangeDats11(team14, tensors270);
char* tensors271[] = {
netAlign1+205479680,
align3+8069312,
align3+0,
align3+655424
};
ResNet50OneApply11(team14, tensors271);
}
// presumably a global pooling step, judging by the name — TODO confirm
{
char* tensors272[] = {
align3+655424,
align3+0
};
ResNet50Glopl1(team14, tensors272);
}
// Uses the parameters at netAlign1+209682176, which ResNet50NetCreate fills
// from fcWeights/fcBiases.
{
char* tensors273[] = {
netAlign1+209682176,
align3+0,
align3+8192
};
ResNet50FcApply1(team14, tensors273);
}
// Final step: the only place probData is used.
{
char* tensors274[] = {
align3+8192,
(char*)probData
};
ResNet50Softmax1(team14, tensors274);
}
}

// End of file.

Top