#include "src/internal_includes/HLSLCrossCompilerContext.h" #include "src/internal_includes/LoopTransform.h" #include "src/internal_includes/Shader.h" #include "src/internal_includes/debug.h" #include #include #include namespace HLSLcc { struct LoopInfo { public: LoopInfo() : m_StartLoop(0), m_EndLoop(0), m_ExitPoints(), m_IsSwitch(false) {} Instruction * m_StartLoop; // OPCODE_LOOP Instruction * m_EndLoop; // OPCODE_ENDLOOP that matches the LOOP above. std::vector m_ExitPoints; // Any BREAK/RET/BREAKC instructions within the same loop depth bool m_IsSwitch; // True if this is a switch-case and not a LOOP/ENDLOOP pair. Used as a helper when parsing. }; typedef std::list Loops; // Build a loopinfo array of all the loops in this shader phase void BuildLoopInfo(ShaderPhase &phase, Loops &res) { using namespace std; res.clear(); // A stack of loopinfo elements (stored in res) list loopStack; // Storage for dummy LoopInfo elements to be used for switch-cases. We don't want them cluttering the Loops list so store them here. list dummyLIForSwitches; for (std::vector::iterator instItr = phase.psInst.begin(); instItr != phase.psInst.end(); instItr++) { Instruction *i = &*instItr; if (i->eOpcode == OPCODE_LOOP) { LoopInfo *currLoopInfo = &*res.insert(res.end(), LoopInfo()); currLoopInfo->m_StartLoop = i; loopStack.push_front(currLoopInfo); } else if (i->eOpcode == OPCODE_ENDLOOP) { ASSERT(!loopStack.empty()); LoopInfo *li = *loopStack.begin(); loopStack.pop_front(); li->m_EndLoop = i; } else if (i->eOpcode == OPCODE_SWITCH) { // Create a dummy entry into the stack LoopInfo *li = &*dummyLIForSwitches.insert(dummyLIForSwitches.end(), LoopInfo()); li->m_IsSwitch = true; loopStack.push_front(li); } else if (i->eOpcode == OPCODE_ENDSWITCH) { ASSERT(!loopStack.empty()); LoopInfo *li = *loopStack.begin(); loopStack.pop_front(); ASSERT(li->m_IsSwitch); } else if (i->eOpcode == OPCODE_BREAK || i->eOpcode == OPCODE_BREAKC) { // Get the current loopstack head ASSERT(!loopStack.empty()); LoopInfo *li = *loopStack.begin(); // Ignore breaks from switch-cases if (!li->m_IsSwitch) { li->m_ExitPoints.push_back(i); } } } } // Returns true if the given instruction is a non-vectorized int or uint comparison instruction that reads from at least one temp and writes to a temp static bool IsScalarTempComparisonInstruction(const Instruction *i) { switch (i->eOpcode) { default: return false; case OPCODE_IGE: case OPCODE_ILT: case OPCODE_IEQ: case OPCODE_INE: case OPCODE_UGE: case OPCODE_ULT: break; } if (i->asOperands[0].eType != OPERAND_TYPE_TEMP) return false; int tempOp = -1; if (i->asOperands[1].eType == OPERAND_TYPE_TEMP) tempOp = 1; else if (i->asOperands[2].eType == OPERAND_TYPE_TEMP) tempOp = 2; // Also reject comparisons where we compare temp.x vs temp.y if (i->asOperands[1].eType == OPERAND_TYPE_TEMP && i->asOperands[2].eType == OPERAND_TYPE_TEMP && i->asOperands[1].ui32RegisterNumber == i->asOperands[2].ui32RegisterNumber) return false; if (tempOp == -1) return false; if (i->asOperands[0].GetNumSwizzleElements() != 1) return false; return true; } // Returns true iff both instructions perform identical operation. For the purposes of Loop transformation, we only consider operations of type tX = tX imm32 static bool AreInstructionsIdentical(const Instruction *a, const Instruction *b) { if (a->eOpcode != b->eOpcode) return false; ASSERT(a->ui32NumOperands == b->ui32NumOperands); uint32_t dstReg = 0; if (a->asOperands[0].eType != OPERAND_TYPE_TEMP) return false; dstReg = a->asOperands[0].ui32RegisterNumber; for (uint32_t i = 0; i < a->ui32NumOperands; i++) { const Operand &aop = a->asOperands[i]; const Operand &bop = b->asOperands[i]; if (aop.eType != bop.eType) return false; if (aop.GetAccessMask() != bop.GetAccessMask()) return false; if (aop.GetNumSwizzleElements() != 1) return false; if (aop.eType == OPERAND_TYPE_TEMP) { if (aop.ui32RegisterNumber != bop.ui32RegisterNumber) return false; if (aop.ui32RegisterNumber != dstReg) return false; } else if (aop.eType == OPERAND_TYPE_IMMEDIATE32) { if (memcmp(aop.afImmediates, bop.afImmediates, 4 * sizeof(float)) != 0) return false; } } return true; } // Attempt to transform a single loop into a for-statement static void AttemptLoopTransform(HLSLCrossCompilerContext *psContext, ShaderPhase &phase, LoopInfo &li) { // In order to transform a loop into a for, the following has to hold: // - The loop must start with a comparison instruction where one of the src operands is a temp (induction variable), followed by OPCODE_BREAKC. // - The loop must end with an arithmetic operation (SUB or ADD) where the dest operand is the same temp as one of the sources in the comparison instruction above // Additionally, if the loop induction variable is initialized before the start of the loop and it has only uses inside the LOOP/ENDLOOP pair, we can declare that inside the for statement. // Also, the loop induction variable must be standalone (as in, never used as part of a larger vector) Instruction *cmpInst = li.m_StartLoop + 1; if (!IsScalarTempComparisonInstruction(cmpInst)) return; Instruction *breakInst = li.m_StartLoop + 2; if (breakInst->eOpcode != OPCODE_BREAKC) return; if (breakInst->asOperands[0].eType != OPERAND_TYPE_TEMP) return; if (breakInst->asOperands[0].ui32RegisterNumber != cmpInst->asOperands[0].ui32RegisterNumber) return; // Check that the comparison result isn't used anywhere else if (cmpInst->m_Uses.size() != 1) return; ASSERT(cmpInst->m_Uses[0].m_Inst == breakInst); // Ok, at least we have the comparison + breakc combo at top. Try to find the induction variable uint32_t inductionVarIdx = 0; Instruction *lastInst = li.m_EndLoop - 1; if (lastInst->eOpcode != OPCODE_IADD) return; if (lastInst->asOperands[0].eType != OPERAND_TYPE_TEMP) return; if (lastInst->asOperands[0].GetNumSwizzleElements() != 1) return; uint32_t indVar = lastInst->asOperands[0].ui32RegisterNumber; // Verify that the induction variable actually matches. if (cmpInst->asOperands[1].eType == OPERAND_TYPE_TEMP && cmpInst->asOperands[1].ui32RegisterNumber == indVar) inductionVarIdx = 1; else if (cmpInst->asOperands[2].eType == OPERAND_TYPE_TEMP && cmpInst->asOperands[2].ui32RegisterNumber == indVar) inductionVarIdx = 2; else return; // Verify that we also read from the induction variable in the last instruction if (!((lastInst->asOperands[1].eType == OPERAND_TYPE_TEMP && lastInst->asOperands[1].ui32RegisterNumber == indVar) || (lastInst->asOperands[2].eType == OPERAND_TYPE_TEMP && lastInst->asOperands[2].ui32RegisterNumber == indVar))) return; // Nvidia compiler bug workaround: The shader compiler tries to be smart and unrolls constant loops, // but then fails miserably if the loop variable is used as an index to UAV loads/stores or some other cases ("array access too complex") // This is also triggered when the driver optimizer sees "simple enough" arithmetics (whatever that is) done on the loop variable before indexing. // So, disable for-loop transformation altogether whenever we see a UAV load or store inside a loop. if (psContext->psShader->eTargetLanguage >= LANG_400 && psContext->psShader->eTargetLanguage < LANG_GL_LAST && !psContext->IsVulkan()) { for (auto itr = li.m_StartLoop; itr != li.m_EndLoop; itr++) { switch (itr->eOpcode) { case OPCODE_LD_RAW: case OPCODE_LD_STRUCTURED: case OPCODE_LD_UAV_TYPED: case OPCODE_STORE_RAW: case OPCODE_STORE_STRUCTURED: case OPCODE_STORE_UAV_TYPED: return; // Nope, can't do a for, not even a partial one. default: break; } } } // One more thing to check: The comparison input may only see 1 definition that originates from inside the loop range: the one in lastInst. // Anything else means that there's a continue statement, or another break/breakc and that means that lastInst wouldn't get called. // Of course, if all those instructions are identical, then it's fine. // Ideally, if there's only one definition that's from outside the loop range, then we can use that as the initializer, as well. Instruction *initializer = NULL; std::vector definitionsOutsideRange; std::vector definitionsInsideRange; std::for_each(cmpInst->asOperands[inductionVarIdx].m_Defines.begin(), cmpInst->asOperands[inductionVarIdx].m_Defines.end(), [&](const Operand::Define &def) { if (def.m_Inst < li.m_StartLoop || def.m_Inst > li.m_EndLoop) definitionsOutsideRange.push_back(&def); else definitionsInsideRange.push_back(&def); }); if (definitionsInsideRange.size() != 1) { // All definitions must be identical for (std::vector::iterator itr = definitionsInsideRange.begin() + 1; itr != definitionsInsideRange.end(); itr++) { if (!AreInstructionsIdentical((*itr)->m_Inst, definitionsInsideRange[0]->m_Inst)) return; } } ASSERT(definitionsOutsideRange.size() > 0); if (definitionsOutsideRange.size() == 1) initializer = definitionsOutsideRange[0]->m_Inst; // Initializer must only write to one component if (initializer && initializer->asOperands[0].GetNumSwizzleElements() != 1) initializer = 0; // Initializer data type must be int or uint if (initializer) { SHADER_VARIABLE_TYPE dataType = initializer->asOperands[0].GetDataType(psContext); if (dataType != SVT_INT && dataType != SVT_UINT) return; } // Check that the initializer is only used within the range so we can move it to for statement if (initializer) { bool hasUsesOutsideRange = false; std::for_each(initializer->m_Uses.begin(), initializer->m_Uses.end(), [&](const Instruction::Use &u) { if (u.m_Inst < li.m_StartLoop || u.m_Inst > li.m_EndLoop) hasUsesOutsideRange = true; }); // Has outside uses? we cannot pull that up to the for statement if (hasUsesOutsideRange) initializer = 0; } // Check that the loop adder instruction only has uses inside the loop range, otherwise we cannot move the initializer either if (initializer) { bool cannotDoInitializer = false; for (auto itr = lastInst->m_Uses.begin(); itr != lastInst->m_Uses.end(); itr++) { const Instruction::Use &u = *itr; if (u.m_Inst < li.m_StartLoop || u.m_Inst > li.m_EndLoop) { cannotDoInitializer = true; break; } // Also check that the uses are not vector ops (temp splitting has already pulled everything to .x if this is a standalone var) if (u.m_Op->GetAccessMask() != 1) { cannotDoInitializer = true; break; } } // Has outside uses? we cannot pull that up to the for statement if (cannotDoInitializer) initializer = 0; } if (initializer) { // We can declare the initializer in the for loop header, allocate a new number for it and change all uses into that. uint32_t newRegister = phase.m_NextFreeTempRegister++; li.m_StartLoop->m_InductorRegister = newRegister; std::for_each(initializer->m_Uses.begin(), initializer->m_Uses.end(), [newRegister](const Instruction::Use &u) { u.m_Op->m_ForLoopInductorName = newRegister; }); // Also tweak the destinations for cmpInst, and lastInst if (cmpInst->asOperands[1].eType == OPERAND_TYPE_TEMP && cmpInst->asOperands[1].ui32RegisterNumber == initializer->asOperands[0].ui32RegisterNumber) cmpInst->asOperands[1].m_ForLoopInductorName = newRegister; else cmpInst->asOperands[2].m_ForLoopInductorName = newRegister; if (lastInst->asOperands[1].eType == OPERAND_TYPE_TEMP && lastInst->asOperands[1].ui32RegisterNumber == initializer->asOperands[0].ui32RegisterNumber) lastInst->asOperands[1].m_ForLoopInductorName = newRegister; else lastInst->asOperands[2].m_ForLoopInductorName = newRegister; lastInst->asOperands[0].m_ForLoopInductorName = newRegister; initializer->asOperands[0].m_ForLoopInductorName = newRegister; } // This loop can be transformed to for-loop. Do the necessary magicks. li.m_StartLoop->m_LoopInductors[0] = initializer; li.m_StartLoop->m_LoopInductors[1] = cmpInst; li.m_StartLoop->m_LoopInductors[2] = breakInst; li.m_StartLoop->m_LoopInductors[3] = lastInst; if (initializer) initializer->m_SkipTranslation = true; cmpInst->m_SkipTranslation = true; breakInst->m_SkipTranslation = true; lastInst->m_SkipTranslation = true; } void DoLoopTransform(HLSLCrossCompilerContext *psContext, ShaderPhase &phase) { Loops loops; BuildLoopInfo(phase, loops); std::for_each(loops.begin(), loops.end(), [&phase, psContext](LoopInfo &li) { // Some sanity checks: start and end points must be initialized, we shouldn't have any switches here, and each loop must have at least one exit point // Also that there's at least 2 instructions in loop body ASSERT(li.m_StartLoop != 0); ASSERT(li.m_EndLoop != 0); ASSERT(li.m_EndLoop > li.m_StartLoop + 2); ASSERT(!li.m_IsSwitch); ASSERT(!li.m_ExitPoints.empty()); AttemptLoopTransform(psContext, phase, li); }); } }