From 329efd732fef17da82c90c19535848fe326cefe5 Mon Sep 17 00:00:00 2001 From: jianwenyyy Date: Thu, 27 Jun 2024 15:54:00 +0800 Subject: [PATCH] [compiler] fix inf/nan convert on x86_64 arch --- compiler/include/byteir/Conversion/Passes.td | 6 +- .../byteir/Conversion/ToLinalg/ToLinalg.h | 10 +- .../byteir/Pipelines/LinalgTensorOpt.h | 3 + .../lib/Conversion/ToLinalg/HloToLinalg.cpp | 137 ++++++++++++++++-- compiler/lib/Pipelines/LinalgTensorOpt.cpp | 12 +- compiler/python/byteir/compile.py | 3 +- tests/numerical_test/execute.py | 3 + .../cpu_ops/convert_f32_i32_special_val.mlir | 4 + 8 files changed, 155 insertions(+), 23 deletions(-) create mode 100644 tests/numerical_test/mlir_tests/cpu_ops/convert_f32_i32_special_val.mlir diff --git a/compiler/include/byteir/Conversion/Passes.td b/compiler/include/byteir/Conversion/Passes.td index f7d79311c..01dd52c7f 100644 --- a/compiler/include/byteir/Conversion/Passes.td +++ b/compiler/include/byteir/Conversion/Passes.td @@ -63,7 +63,11 @@ def HloFusionToLinalg : Pass<"hlo-fusion-to-linalg", "func::FuncOp"> { Option<"enablePrimitiveOps", "enable-primitive-ops", "bool", /*default=*/"false", "Lower to primitive Linalg ops (map, reduce and " - "transpose) when possible, instead of linalg.generic"> + "transpose) when possible, instead of linalg.generic">, + Option<"target", "target", "std::string", /*default*/ "", + "Specify the target">, + Option<"arch", "arch", "std::string", /*default*/ "", + "Specify the target arch"> ]; } diff --git a/compiler/include/byteir/Conversion/ToLinalg/ToLinalg.h b/compiler/include/byteir/Conversion/ToLinalg/ToLinalg.h index 8e64ce9d3..8a8c0f114 100644 --- a/compiler/include/byteir/Conversion/ToLinalg/ToLinalg.h +++ b/compiler/include/byteir/Conversion/ToLinalg/ToLinalg.h @@ -41,11 +41,13 @@ void populateTensorToLinalgConversionPatterns(RewritePatternSet &patterns); void populateLinalgExtToLinalgConversionPatterns(RewritePatternSet &patterns); void
populateHloToLinalgExtConversionPattern(TypeConverter &typeConverter, - RewritePatternSet &patterns); + RewritePatternSet &patterns, + const std::string &target = "", + const std::string &arch = ""); -std::unique_ptr> -createHloFusionToLinalgPass(llvm::StringRef anchorTag = "", - bool enablePrimitiveOps = false); +std::unique_ptr> createHloFusionToLinalgPass( + llvm::StringRef anchorTag = "", bool enablePrimitiveOps = false, + const std::string &target = "", const std::string &arch = ""); std::unique_ptr> createUnrealizedCastToLinalgPass(); diff --git a/compiler/include/byteir/Pipelines/LinalgTensorOpt.h b/compiler/include/byteir/Pipelines/LinalgTensorOpt.h index e8428ba4b..a5d521b2d 100644 --- a/compiler/include/byteir/Pipelines/LinalgTensorOpt.h +++ b/compiler/include/byteir/Pipelines/LinalgTensorOpt.h @@ -30,6 +30,9 @@ struct LinalgTensorOptPipelineOptions *this, "target", llvm::cl::desc("An optional attribute to speicify target."), llvm::cl::init("")}; + Option arch{ + *this, "arch", llvm::cl::desc("An optional attribute to specify arch."), + llvm::cl::init("")}; }; void createLinalgTensorOptPipeline( diff --git a/compiler/lib/Conversion/ToLinalg/HloToLinalg.cpp b/compiler/lib/Conversion/ToLinalg/HloToLinalg.cpp index 2cba82b6f..0174da139 100644 --- a/compiler/lib/Conversion/ToLinalg/HloToLinalg.cpp +++ b/compiler/lib/Conversion/ToLinalg/HloToLinalg.cpp @@ -1267,13 +1267,116 @@ class ByteirRepeatCustomCallConverter } }; +/// Code below is copied from legalize_to_linalg.cc +/// Remove this when upstream FPToSIOp solves inf/nan convert.
+Value coerceTensorShape(OpBuilder &builder, Location loc, + TypedValue value, ShapedType targetType) { + return builder.createOrFold( + loc, targetType.cloneWith(std::nullopt, value.getType().getElementType()), + value); +} + +inline Value mapFPToSIConvertOpToStdScalarOp(Location loc, + ArrayRef targetTypes, + ArrayRef resultTypes, + ValueRange args, OpBuilder *b) { + assert(targetTypes.size() == 1 && "ConvertOp should return a single result"); + assert(resultTypes.size() == 1 && "ConvertOp should return a single result"); + assert(args.size() == 1 && "ConvertOp should take a single argument"); + + Type targetType = getElementTypeOrSelf(targetTypes.front()); + Type convertedSourceType = getElementTypeOrSelf(args.front()); + + if (mlir::arith::FPToSIOp::areCastCompatible(convertedSourceType, + targetType)) { + Value infValue = b->create( + loc, + b->getFloatAttr( + convertedSourceType, + APFloat::getInf( + dyn_cast(convertedSourceType).getFloatSemantics()))); + Value isInf = b->create(loc, arith::CmpFPredicate::OEQ, + args.front(), infValue); + Value isNan = b->create(loc, arith::CmpFPredicate::UNO, + args.front(), args.front()); + Value maxIntval = b->create( + loc, + b->getIntegerAttr(targetType, + APInt::getSignedMaxValue( + dyn_cast(targetType).getWidth()))); + Value zeroIntval = + b->create(loc, b->getZeroAttr(targetType)); + return b->create<::mlir::arith::SelectOp>( + loc, isInf, maxIntval, + b->create<::mlir::arith::SelectOp>( + loc, isNan, zeroIntval, + b->create(loc, resultTypes, args, + std::nullopt))); + } + return nullptr; +} + +class FPToSIConvertOpConverter : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(mhlo::ConvertOp op, typename mhlo::ConvertOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + auto loc = op.getLoc(); + RankedTensorType type = dyn_cast(op.getType()); + if (!type || !type.hasStaticShape()) { + return failure(); + } + // Apply only 
if convert type is FPToInt32 + if (!mlir::arith::FPToSIOp::areCastCompatible(op.getOperand().getType(), + op.getType())) { + return failure(); + } + auto targetType = op.getType().getElementType(); + if (isa(targetType) && + (cast(targetType).getWidth() != 32 || + cast(targetType).isUnsigned())) { + return failure(); + } + // Find input/output values and types. + std::optional resultTy = + this->typeConverter->convertType(op->getResultTypes().front()) + .template dyn_cast(); + Value emptyTensor = + getEmptyTensorFor(rewriter, loc, *resultTy, op, adaptor.getOperands()); + // Mapped inputs are cast to the same shape as the init tensor. + SmallVector mappedInputs; + for (Value input : adaptor.getOperands()) { + mappedInputs.push_back( + coerceTensorShape(rewriter, loc, cast>(input), + cast(emptyTensor.getType()))); + } + + auto mapOp = rewriter.create( + loc, mappedInputs, emptyTensor, + [&](OpBuilder &b, Location loc, ValueRange args) { + Value innerResult = mapFPToSIConvertOpToStdScalarOp( + op.getLoc(), op.getType(), getElementTypeOrSelf(emptyTensor), + args, &b); + b.create(loc, innerResult); + }, + linalg::getPrunedAttributeList(op)); + rewriter.replaceOp(op, mapOp->getResults()); + return success(); + } +}; + struct HloFusionToLinalgPass : public HloFusionToLinalgBase { - HloFusionToLinalgPass(StringRef tag, bool enablePrimitiveOps) + HloFusionToLinalgPass(StringRef tag, bool enablePrimitiveOps, + StringRef target, StringRef arch) : HloFusionToLinalgBase() { anchorTag = tag.str(); this->enablePrimitiveOps = enablePrimitiveOps; + this->target = target.str(); + this->arch = arch.str(); } void getDependentDialects(DialectRegistry ®istry) const final { @@ -1293,13 +1396,13 @@ struct HloFusionToLinalgPass MLIRContext &ctx = getContext(); RewritePatternSet patterns(&ctx); - ConversionTarget target(ctx); - target.addLegalDialect< + ConversionTarget conversionTarget(ctx); + conversionTarget.addLegalDialect< arith::ArithDialect, cf::ControlFlowDialect, func::FuncDialect, 
linalg::LinalgDialect, math::MathDialect, tensor::TensorDialect, scf::SCFDialect, shape::ShapeDialect, linalg_ext::LinalgExtDialect>(); - target.addLegalOp(); + conversionTarget.addLegalOp(); auto typeConverter = createHloToLinalgTypeConverter(); @@ -1308,22 +1411,31 @@ struct HloFusionToLinalgPass [](Operation *op) { return isInBodyOfLinalgOps(op); }); mhlo::populateHloToLinalgConversionPattern(&ctx, *typeConverter, &patterns, enablePrimitiveOps); - populateHloToLinalgExtConversionPattern(*typeConverter, patterns); + populateHloToLinalgExtConversionPattern(*typeConverter, patterns, + this->target, this->arch); FrozenRewritePatternSet frozenPatterns(std::move(patterns)); - if (failed(applyPartialConversion(func, target, frozenPatterns))) { + if (failed( + applyPartialConversion(func, conversionTarget, frozenPatterns))) { signalPassFailure(); } } }; + } // namespace -void mlir::populateHloToLinalgExtConversionPattern( - TypeConverter &typeConverter, RewritePatternSet &patterns) { +void mlir::populateHloToLinalgExtConversionPattern(TypeConverter &typeConverter, + RewritePatternSet &patterns, + const std::string &target, + const std::string &arch) { auto ctx = patterns.getContext(); patterns.add(typeConverter, ctx, PatternBenefit(2)); patterns.add(typeConverter, ctx, PatternBenefit(2)); + if (target == "cpu" && arch == "x86_64") { + patterns.add(typeConverter, ctx, + PatternBenefit(2)); + } patterns.add(ctx); patterns.add(ctx); patterns.add(ctx); @@ -1333,8 +1445,9 @@ void mlir::populateHloToLinalgExtConversionPattern( patterns.add(ctx); } -std::unique_ptr> -mlir::createHloFusionToLinalgPass(llvm::StringRef anchorTag, - bool enablePrimitiveOps) { - return std::make_unique(anchorTag, enablePrimitiveOps); +std::unique_ptr> mlir::createHloFusionToLinalgPass( + llvm::StringRef anchorTag, bool enablePrimitiveOps, + const std::string &target, const std::string &arch) { + return std::make_unique(anchorTag, enablePrimitiveOps, + target, arch); } diff --git 
a/compiler/lib/Pipelines/LinalgTensorOpt.cpp b/compiler/lib/Pipelines/LinalgTensorOpt.cpp index 4cc957771..b1d75341c 100644 --- a/compiler/lib/Pipelines/LinalgTensorOpt.cpp +++ b/compiler/lib/Pipelines/LinalgTensorOpt.cpp @@ -228,9 +228,10 @@ void addGenericLinalgPasses(OpPassManager &pm) { } } -void addCPULinalgOptPasses(OpPassManager &pm) { +void addCPULinalgOptPasses(OpPassManager &pm, const std::string &target, + const std::string &arch) { pm.addNestedPass(createHloFusionToLinalgPass( - getByteIRHloAggressiveFusionAttrName(), true)); + getByteIRHloAggressiveFusionAttrName(), true, target, arch)); pm.addNestedPass(createUnrealizedCastToLinalgPass()); { TileAndVectorizeTransposeOptions options; @@ -248,9 +249,10 @@ void addCPULinalgOptPasses(OpPassManager &pm) { } void createLinalgTensorOptPipelineImpl(OpPassManager &pm, - const std::string &target) { + const std::string &target, + const std::string &arch) { if (target == "cpu") { - addCPULinalgOptPasses(pm); + addCPULinalgOptPasses(pm, target, arch); } else { addGenericLinalgPasses(pm); } @@ -260,5 +262,5 @@ void createLinalgTensorOptPipelineImpl(OpPassManager &pm, void mlir::createLinalgTensorOptPipeline( OpPassManager &pm, const LinalgTensorOptPipelineOptions &options) { invokeOpPassPipelineBuilder(createLinalgTensorOptPipelineImpl, pm, - options.target); + options.target, options.arch); } diff --git a/compiler/python/byteir/compile.py b/compiler/python/byteir/compile.py index 1b7cbe902..7f98549e2 100644 --- a/compiler/python/byteir/compile.py +++ b/compiler/python/byteir/compile.py @@ -297,6 +297,7 @@ def _compile_cpu( entry_func_str = "entry-func={}".format(entry_func) target_str = "target={}".format(target) + arch_str="arch={}".format(cpu_arch) with context: PassManager().parse("builtin.module(hlo-graph-opt{" + entry_func_str + " " + target_str + "})").run(module.operation) _print_verbose(module, "// IR Dump After Hlo Graph Opt:") if verbose else ... 
@@ -304,7 +305,7 @@ def _compile_cpu( PassManager().parse("builtin.module(hlo-fusion-opt{" + entry_func_str + " " + target_str + " outline-single-elemwise-op})").run(module.operation) _print_verbose(module, "// IR Dump After Hlo Fusion Opt:") if verbose else ... with context: - PassManager.parse("builtin.module(linalg-tensor-opt{" + target_str + "})").run(module.operation) + PassManager.parse("builtin.module(linalg-tensor-opt{" + target_str + " " + arch_str + "})").run(module.operation) _print_verbose(module, "// IR Dump After Linalg Tensor Opt:") if verbose else ... with context: PassManager.parse("builtin.module(byre-tensor-opt{{append-arg-types {}}})".format(entry_func_str)).run(module.operation) diff --git a/tests/numerical_test/execute.py b/tests/numerical_test/execute.py index 3095df438..6339605e5 100644 --- a/tests/numerical_test/execute.py +++ b/tests/numerical_test/execute.py @@ -33,6 +33,9 @@ MLIR_TEST_SPECIAL_INPUTS = { "cpu@log_plus_one.mlir": [ np.random.uniform(low=0.5, high=1.0, size=(256, 64)).astype(np.float16) + ], + "cpu@convert_f32_i32_special_val.mlir": [ + np.array([[np.inf, -np.inf, np.nan], [1., 999.999, -np.inf]], dtype=np.float32), ] } diff --git a/tests/numerical_test/mlir_tests/cpu_ops/convert_f32_i32_special_val.mlir b/tests/numerical_test/mlir_tests/cpu_ops/convert_f32_i32_special_val.mlir new file mode 100644 index 000000000..abc656c89 --- /dev/null +++ b/tests/numerical_test/mlir_tests/cpu_ops/convert_f32_i32_special_val.mlir @@ -0,0 +1,4 @@ +func.func @convert_f32_i32_special_val(%arg0 : tensor<2x3xf32>) -> tensor<2x3xi32> { + %0 = stablehlo.convert %arg0 : (tensor<2x3xf32>) -> tensor<2x3xi32> + func.return %0 : tensor<2x3xi32> +} \ No newline at end of file