@@ -444,3 +444,158 @@ loop:
444
444
exit:
445
445
ret i64 %rdx.next
446
446
}
447
+
448
+ ; Integer add reduction over %src whose running sum is also stored to %sum on
+ ; every iteration. The CHECK lines below expect the loop unrolled by 4 with a
+ ; single serial chain of adds and one store to %sum per unrolled iteration.
+ define void @reduction_with_intermediate_store (ptr %src , ptr %sum ) {
449
+ ; CHECK-LABEL: define void @reduction_with_intermediate_store(
450
+ ; CHECK-SAME: ptr [[SRC:%.*]], ptr [[SUM:%.*]]) {
451
+ ; CHECK-NEXT: [[ENTRY:.*]]:
452
+ ; CHECK-NEXT: [[SUM_PROMOTED:%.*]] = load i32, ptr [[SUM]], align 4
453
+ ; CHECK-NEXT: br label %[[LOOP:.*]]
454
+ ; CHECK: [[LOOP]]:
455
+ ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
456
+ ; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[SUM_PROMOTED]], %[[ENTRY]] ], [ [[RED_NEXT_3:%.*]], %[[LOOP]] ]
457
+ ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw i32, ptr [[SRC]], i64 [[IV]]
458
+ ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
459
+ ; CHECK-NEXT: [[RED_NEXT:%.*]] = add nsw i32 [[RED]], [[L]]
460
+ ; CHECK-NEXT: store i32 [[RED_NEXT]], ptr [[SUM]], align 4
461
+ ; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
462
+ ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds nuw i32, ptr [[SRC]], i64 [[IV_NEXT]]
463
+ ; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
464
+ ; CHECK-NEXT: [[RED_NEXT_1:%.*]] = add nsw i32 [[RED_NEXT]], [[L_1]]
465
+ ; CHECK-NEXT: store i32 [[RED_NEXT_1]], ptr [[SUM]], align 4
466
+ ; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
467
+ ; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds nuw i32, ptr [[SRC]], i64 [[IV_NEXT_1]]
468
+ ; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
469
+ ; CHECK-NEXT: [[RED_NEXT_2:%.*]] = add nsw i32 [[RED_NEXT_1]], [[L_2]]
470
+ ; CHECK-NEXT: store i32 [[RED_NEXT_2]], ptr [[SUM]], align 4
471
+ ; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
472
+ ; CHECK-NEXT: [[GEP_SRC_3:%.*]] = getelementptr inbounds nuw i32, ptr [[SRC]], i64 [[IV_NEXT_2]]
473
+ ; CHECK-NEXT: [[L_3:%.*]] = load i32, ptr [[GEP_SRC_3]], align 4
474
+ ; CHECK-NEXT: [[RED_NEXT_3]] = add nsw i32 [[RED_NEXT_2]], [[L_3]]
475
+ ; CHECK-NEXT: store i32 [[RED_NEXT_3]], ptr [[SUM]], align 4
476
+ ; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
477
+ ; CHECK-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 10000
478
+ ; CHECK-NEXT: br i1 [[EC_3]], label %[[EXIT:.*]], label %[[LOOP]]
479
+ ; CHECK: [[EXIT]]:
480
+ ; CHECK-NEXT: ret void
481
+ ;
482
+ entry:
483
+ ; The reduction starts from the value already held in %sum.
+ %sum.promoted = load i32 , ptr %sum , align 4
484
+ br label %loop
485
+
486
+ loop:
487
+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %loop ]
488
+ %red = phi i32 [ %sum.promoted , %entry ], [ %red.next , %loop ]
489
+ %gep.src = getelementptr inbounds nuw i32 , ptr %src , i64 %iv
490
+ %l = load i32 , ptr %gep.src , align 4
491
+ %red.next = add nsw i32 %red , %l
492
+ ; Intermediate store: the partial sum is written back to %sum in every
+ ; iteration, not only after the loop.
+ store i32 %red.next , ptr %sum , align 4
493
+ %iv.next = add nuw nsw i64 %iv , 1
494
+ ; Constant trip count of 10000.
+ %ec = icmp eq i64 %iv.next , 10000
495
+ br i1 %ec , label %exit , label %loop
496
+
497
+ exit:
498
+ ret void
499
+ }
500
+
501
+ ; External function declared without a body or attributes; it is called from
+ ; the loop in @test_add_with_call.
+ declare i32 @foo ()
502
+
503
+ ; Loop with a call cannot be handled by LoopVectorize, introducing additional
504
+ ; accumulators when unrolling increases throughput.
505
+ ; The CHECK lines below expect the loop unrolled by 4 with a single serial
+ ; chain of adds ([[RDX]] through [[RDX_NEXT_3]]) feeding the returned value.
+ ; Note: %n is not used; the exit condition compares against the constant 1000.
+ define i32 @test_add_with_call (i64 %n , i32 %start ) {
506
+ ; CHECK-LABEL: define i32 @test_add_with_call(
507
+ ; CHECK-SAME: i64 [[N:%.*]], i32 [[START:%.*]]) {
508
+ ; CHECK-NEXT: [[ENTRY:.*]]:
509
+ ; CHECK-NEXT: br label %[[LOOP:.*]]
510
+ ; CHECK: [[LOOP]]:
511
+ ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
512
+ ; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
513
+ ; CHECK-NEXT: [[L:%.*]] = call i32 @foo()
514
+ ; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]]
515
+ ; CHECK-NEXT: [[L_1:%.*]] = call i32 @foo()
516
+ ; CHECK-NEXT: [[RDX_2:%.*]] = add i32 [[RDX_NEXT]], [[L_1]]
517
+ ; CHECK-NEXT: [[L_2:%.*]] = call i32 @foo()
518
+ ; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_2]], [[L_2]]
519
+ ; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
520
+ ; CHECK-NEXT: [[L_3:%.*]] = call i32 @foo()
521
+ ; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_3]]
522
+ ; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
523
+ ; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
524
+ ; CHECK: [[EXIT]]:
525
+ ; CHECK-NEXT: [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
526
+ ; CHECK-NEXT: ret i32 [[BIN_RDX2]]
527
+ ;
528
+ entry:
529
+ br label %loop
530
+
531
+ loop:
532
+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %loop ]
533
+ %rdx = phi i32 [ %start , %entry ], [ %rdx.next , %loop ]
534
+ %iv.next = add i64 %iv , 1
535
+ ; The value added to the reduction comes from a call to @foo in each iteration.
+ %l = call i32 @foo ()
536
+ %rdx.next = add i32 %rdx , %l
537
+ ; Loop continues while %iv.next != 1000.
+ %ec = icmp ne i64 %iv.next , 1000
538
+ br i1 %ec , label %loop , label %exit
539
+
540
+ exit:
541
+ ret i32 %rdx.next
542
+ }
543
+
544
+ ; Loop with backward dependence cannot be handled by LoopVectorize, introducing additional
545
+ ; accumulators when unrolling increases throughput.
546
+ ; The CHECK lines below expect the loop unrolled by 4 with a single serial
+ ; chain of adds ([[RDX]] through [[RDX_NEXT_3]]) feeding the returned value.
+ ; Note: %n is not used; the exit condition compares against the constant 1000.
+ define i32 @test_add_with_backward_dep (ptr %p , i64 %n , i32 %start ) {
547
+ ; CHECK-LABEL: define i32 @test_add_with_backward_dep(
548
+ ; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) {
549
+ ; CHECK-NEXT: [[ENTRY:.*]]:
550
+ ; CHECK-NEXT: br label %[[LOOP:.*]]
551
+ ; CHECK: [[LOOP]]:
552
+ ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
553
+ ; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
554
+ ; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
555
+ ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]]
556
+ ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4
557
+ ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT]]
558
+ ; CHECK-NEXT: store i32 0, ptr [[GEP_1]], align 4
559
+ ; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]]
560
+ ; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
561
+ ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT]]
562
+ ; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_11]], align 4
563
+ ; CHECK-NEXT: [[GEP_1_1:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_1]]
564
+ ; CHECK-NEXT: store i32 0, ptr [[GEP_1_1]], align 4
565
+ ; CHECK-NEXT: [[RDX_2:%.*]] = add i32 [[RDX_NEXT]], [[L_1]]
566
+ ; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
567
+ ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_1]]
568
+ ; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_2]], align 4
569
+ ; CHECK-NEXT: [[GEP_1_2:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_2]]
570
+ ; CHECK-NEXT: store i32 0, ptr [[GEP_1_2]], align 4
571
+ ; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_2]], [[L_2]]
572
+ ; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
573
+ ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_2]]
574
+ ; CHECK-NEXT: [[L_3:%.*]] = load i32, ptr [[GEP_3]], align 4
575
+ ; CHECK-NEXT: [[GEP_1_3:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_3]]
576
+ ; CHECK-NEXT: store i32 0, ptr [[GEP_1_3]], align 4
577
+ ; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_3]]
578
+ ; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
579
+ ; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
580
+ ; CHECK: [[EXIT]]:
581
+ ; CHECK-NEXT: [[BIN_RDX3:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
582
+ ; CHECK-NEXT: ret i32 [[BIN_RDX3]]
583
+ ;
584
+ entry:
585
+ br label %loop
586
+
587
+ loop:
588
+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %loop ]
589
+ %rdx = phi i32 [ %start , %entry ], [ %rdx.next , %loop ]
590
+ %iv.next = add i64 %iv , 1
591
+ ; The load reads element %iv of %p and the store below writes 0 to element
+ ; %iv + 1, so every iteration after the first loads the slot that the
+ ; previous iteration stored to.
+ %gep = getelementptr inbounds nuw i32 , ptr %p , i64 %iv
592
+ %l = load i32 , ptr %gep
593
+ %gep.1 = getelementptr inbounds nuw i32 , ptr %p , i64 %iv.next
594
+ store i32 0 , ptr %gep.1
595
+ %rdx.next = add i32 %rdx , %l
596
+ ; Loop continues while %iv.next != 1000.
+ %ec = icmp ne i64 %iv.next , 1000
597
+ br i1 %ec , label %loop , label %exit
598
+
599
+ exit:
600
+ ret i32 %rdx.next
601
+ }
0 commit comments