@@ -398,6 +398,144 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
  out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
}

+ // CHECK-LABEL: @test_cvt_pk_bf8_f16(
+ // CHECK-NEXT: entry:
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+ // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+ // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+ // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.bf8.f16(<2 x half> [[TMP0]])
+ // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
+ // CHECK-NEXT: ret void
+ //
+ void test_cvt_pk_bf8_f16(global short* out, half2 a)
+ {
+   *out = __builtin_amdgcn_cvt_pk_bf8_f16(a);
+ }
+
+ // CHECK-LABEL: @test_cvt_pk_fp8_f16(
+ // CHECK-NEXT: entry:
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+ // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+ // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+ // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.fp8.f16(<2 x half> [[TMP0]])
+ // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
+ // CHECK-NEXT: ret void
+ //
+ void test_cvt_pk_fp8_f16(global short* out, half2 a)
+ {
+   *out = __builtin_amdgcn_cvt_pk_fp8_f16(a);
+ }
+
+ // CHECK-LABEL: @test_cvt_sr_bf8_f16(
+ // CHECK-NEXT: entry:
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+ // CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5)
+ // CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+ // CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+ // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+ // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+ // CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
+ // CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 0)
+ // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+ // CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i32 1)
+ // CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
+ // CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP10]], i32 [[TMP11]], i32 [[TMP12]], i32 2)
+ // CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
+ // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP15]], i32 [[TMP16]], i32 [[TMP17]], i32 3)
+ // CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
+ // CHECK-NEXT: ret void
+ //
+ void test_cvt_sr_bf8_f16(global int* out, half a, uint sr, int old)
+ {
+   *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 0);
+   *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 1);
+   *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 2);
+   *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 3);
+ }
+
+ // CHECK-LABEL: @test_cvt_sr_fp8_f16(
+ // CHECK-NEXT: entry:
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+ // CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5)
+ // CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
+ // CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+ // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+ // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+ // CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
+ // CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: store i16 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32
+ // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP0]], i32 [[CONV]], i32 [[TMP2]], i32 0)
+ // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+ // CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[TMP6]] to i32
+ // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP5]], i32 [[CONV1]], i32 [[TMP7]], i32 1)
+ // CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
+ // CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP11]] to i32
+ // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP10]], i32 [[CONV2]], i32 [[TMP12]], i32 2)
+ // CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
+ // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: [[TMP16:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+ // CHECK-NEXT: [[CONV3:%.*]] = sext i16 [[TMP16]] to i32
+ // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+ // CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP15]], i32 [[CONV3]], i32 [[TMP17]], i32 3)
+ // CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+ // CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
+ // CHECK-NEXT: ret void
+ //
+ void test_cvt_sr_fp8_f16(global int* out, half a, short sr, int old)
+ {
+   *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 0);
+   *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 1);
+   *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 2);
+   *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 3);
+ }
+
// CHECK-LABEL: @test_sat_pk4_i4_i8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)