@@ -2077,10 +2077,22 @@ void HWConformityPro::fixMadw(INST_LIST_ITER it, G4_BB *bb) {
20772077 mullhDst = builder.duplicateOperand (dst);
20782078 tmpType = dstType;
20792079 } else {
2080- // If src2 is not 0, then madw will convert to gen mullh + addc + add:
2081- // mullh (16) mullh_dst<1>:d src0<1;1,0>:d src1<1;1,0>:d
2082- // addc (16) dst_lo32<1>:d mullh_dst_lo32<1;1,0>:d src2<1;1,0>:d
2083- // add (16) dst_hi32<1>:d acc0.0<1;1,0>:d mullh_dst_hi32<1;1,0>:d
2080+ // If src2 is not 0 and has unsigned datatype, then madw will convert to gen
2081+ // mullh + addc + add:
2082+ // madw (16) dst<1>:ud src0<1;1,0>:ud src1<1;1,0>:ud src2<1;1,0>:ud
2083+ // =>
2084+ // mullh (16) mullh_dst<1>:ud src0<1;1,0>:ud src1<1;1,0>:ud
2085+ // addc (16) dst_lo32<1>:ud mullh_dst_lo32<1;1,0>:ud src2<1;1,0>:ud
2086+ // add (16) dst_hi32<1>:ud acc0.0<1;1,0>:ud mullh_dst_hi32<1;1,0>:ud
2087+ // If src2 is not 0 and has signed datatype, then madw will convert to gen
2088+ // mullh + addc + mov + add3:
2089+ // madw (16) dst<1>:d src0<1;1,0>:ud src1<1;1,0>:ud src2<1;1,0>:d
2090+ // =>
2091+ // mullh (16) mullh_dst<1>:d src0<1;1,0>:ud src1<1;1,0>:ud
2092+ // addc (16) dst_lo32<1>:ud mullh_dst_lo32<1;1,0>:ud src2<1;1,0>:ud
2093+ // mov (16) signExt<1>:q src2<1;1,0>:d
2094+ // add3 (16) dst_hi32<1>:d signExt.1<2;1,0>:d acc0.0<1;1,0>:d
2095+ // mullh_dst_hi32<1;1,0>:d
20842096 tmpType =
20852097 (IS_UNSIGNED_INT (src0->getType ()) && IS_UNSIGNED_INT (src1->getType ()) &&
20862098 IS_UNSIGNED_INT (src2->getType ()))
@@ -2133,8 +2145,27 @@ void HWConformityPro::fixMadw(INST_LIST_ITER it, G4_BB *bb) {
21332145 addcInst->setOptionOn (InstOpt_AccWrCtrl);
21342146 auto insertIter = bb->insertAfter (it, addcInst);
21352147
2136- // Create add instruction:
2148+ // If src2 is signed datatype, we need to extend the sign bit of src2 which
2149+ // is the addend for higher 32-bits result calculation:
2150+ // mov (16) signExt<1>:q src2<1;1,0>:d
2151+ G4_Declare *signExtDclQword = nullptr ;
2152+ if (src2->getType () == Type_D) {
2153+ signExtDclQword = builder.createTempVar (
2154+ builder.numEltPerGRF (Type_Q) * execSize, Type_Q, builder.getGRFAlign ());
2155+ auto movDst = builder.createDstRegRegion (signExtDclQword, 1 );
2156+ auto movInst = builder.createMov (
2157+ execSize, movDst, builder.duplicateOperand (src2), origOptions, false );
2158+ movInst->setPredicate (builder.duplicateOperand (origPredicate));
2159+ movInst->setOptionOff (InstOpt_AccWrCtrl);
2160+ insertIter = bb->insertAfter (insertIter, movInst);
2161+ }
2162+
2163+ // Create add or add3 instruction:
2164+ // If src2 is unsigned datatype:
21372165 // add (16) dst_hi32<1>:d acc0.0<1;1,0>:d mullh_dst_hi32<1;1,0>:d
2166+ // Otherwise:
2167+ // add3 (16) dst_hi32<1>:d signExt.1<2;1,0>:d acc0.0<1;1,0>:d
2168+ // mullh_dst_hi32<1;1,0>:d
21382169 int DstHiRegOffset = (int )std::ceil (
21392170 (float )(execSize * dst->getExecTypeSize ()) / builder.getGRFSize ());
21402171 auto *dstHi32 =
@@ -2145,19 +2176,35 @@ void HWConformityPro::fixMadw(INST_LIST_ITER it, G4_BB *bb) {
21452176 tmpType, builder.getGRFAlign ());
21462177 mullhTmpDclHi->setAliasDeclare (mullhTmpDcl,
21472178 mullhDstLowGRFNum * builder.getGRFSize ());
2148- auto src1Add = builder.createSrcRegRegion (
2179+ auto srcAdd = builder.createSrcRegRegion (
21492180 mullhTmpDclHi, execSize == g4::SIMD1 ? builder.getRegionScalar ()
21502181 : builder.getRegionStride1 ());
21512182 auto accSrcOpnd =
21522183 builder.createSrc (builder.phyregpool .getAcc0Reg (), 0 , 0 ,
21532184 execSize == g4::SIMD1 ? builder.getRegionScalar ()
21542185 : builder.getRegionStride1 (),
21552186 tmpType);
2156- auto addInst = builder.createBinOp (G4_add, execSize, dstHi32, accSrcOpnd,
2157- src1Add, origOptions, false );
2158- addInst->setPredicate (builder.duplicateOperand (origPredicate));
2159- addInst->setOptionOff (InstOpt_AccWrCtrl);
2160- bb->insertAfter (insertIter, addInst);
2187+ G4_INST *addOrAdd3Inst = nullptr ;
2188+ if (src2->getType () == Type_D) {
2189+ G4_Declare *signExtDclDword =
2190+ builder.createTempVar (builder.numEltPerGRF (tmpType) * execSize * 2 ,
2191+ tmpType, builder.getGRFAlign ());
2192+ signExtDclDword->setAliasDeclare (signExtDclQword, 0 );
2193+ auto src0Add3 =
2194+ builder.createSrc (signExtDclDword->getRegVar (), 0 , 1 ,
2195+ execSize == g4::SIMD1 ? builder.getRegionScalar ()
2196+ : builder.getRegionStride2 (),
2197+ tmpType);
2198+ addOrAdd3Inst = builder.createInternalInst (
2199+ nullptr , G4_add3, nullptr , g4::NOSAT, execSize, dstHi32, src0Add3,
2200+ accSrcOpnd, srcAdd, origOptions);
2201+ } else {
2202+ addOrAdd3Inst = builder.createBinOp (G4_add, execSize, dstHi32, accSrcOpnd,
2203+ srcAdd, origOptions, false );
2204+ }
2205+ addOrAdd3Inst->setPredicate (builder.duplicateOperand (origPredicate));
2206+ addOrAdd3Inst->setOptionOff (InstOpt_AccWrCtrl);
2207+ bb->insertAfter (insertIter, addOrAdd3Inst);
21612208}
21622209
21632210// Restrictions for fcvt instruction:
0 commit comments