Skip to content

Commit f9f401d

Browse files
committed
[X86][AVX] Add additional 256/512-bit test cases for PACKSS/PACKUS shuffle patterns
Also add a lowerShuffleWithPACK call to lowerV32I16Shuffle — shuffle combining was already catching this pattern, but catching it at lowering first avoids creating a lot of temporary shuffles.
1 parent 3c9064e commit f9f401d

File tree

5 files changed

+274
-3
lines changed

5 files changed

+274
-3
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17216,6 +17216,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
1721617216
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
1721717217
return V;
1721817218

17219+
// Use dedicated pack instructions for masks that match their pattern.
17220+
if (SDValue V =
17221+
lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17222+
return V;
17223+
1721917224
// Try to use shift instructions.
1722017225
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
1722117226
Zeroable, Subtarget, DAG))
@@ -17237,13 +17242,13 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
1723717242
// As this is a single-input shuffle, the repeated mask should be
1723817243
// a strictly valid v8i16 mask that we can pass through to the v8i16
1723917244
// lowering to handle even the v32 case.
17240-
return lowerV8I16GeneralSingleInputShuffle(
17241-
DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
17245+
return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17246+
RepeatedMask, Subtarget, DAG);
1724217247
}
1724317248
}
1724417249

1724517250
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17246-
Zeroable, Subtarget, DAG))
17251+
Zeroable, Subtarget, DAG))
1724717252
return Blend;
1724817253

1724917254
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,

llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6914,6 +6914,102 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2
69146914
ret <16 x i16> %4
69156915
}
69166916

6917+
define <16 x i16> @shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30(<8 x i32> %a0, <8 x i32> %a1) {
6918+
; AVX1-LABEL: shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30:
6919+
; AVX1: # %bb.0:
6920+
; AVX1-NEXT: vpsrad $25, %xmm0, %xmm2
6921+
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
6922+
; AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
6923+
; AVX1-NEXT: vpsrad $25, %xmm1, %xmm3
6924+
; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
6925+
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
6926+
; AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
6927+
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
6928+
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
6929+
; AVX1-NEXT: retq
6930+
;
6931+
; AVX2OR512VL-LABEL: shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30:
6932+
; AVX2OR512VL: # %bb.0:
6933+
; AVX2OR512VL-NEXT: vpsrad $25, %ymm0, %ymm0
6934+
; AVX2OR512VL-NEXT: vpsrad $25, %ymm1, %ymm1
6935+
; AVX2OR512VL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
6936+
; AVX2OR512VL-NEXT: retq
6937+
;
6938+
; XOPAVX1-LABEL: shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30:
6939+
; XOPAVX1: # %bb.0:
6940+
; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm2
6941+
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
6942+
; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0
6943+
; XOPAVX1-NEXT: vpsrad $25, %xmm1, %xmm3
6944+
; XOPAVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
6945+
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
6946+
; XOPAVX1-NEXT: vpsrad $25, %xmm1, %xmm1
6947+
; XOPAVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
6948+
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
6949+
; XOPAVX1-NEXT: retq
6950+
;
6951+
; XOPAVX2-LABEL: shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30:
6952+
; XOPAVX2: # %bb.0:
6953+
; XOPAVX2-NEXT: vpsrad $25, %ymm0, %ymm0
6954+
; XOPAVX2-NEXT: vpsrad $25, %ymm1, %ymm1
6955+
; XOPAVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
6956+
; XOPAVX2-NEXT: retq
6957+
%1 = ashr <8 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
6958+
%2 = ashr <8 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
6959+
%3 = bitcast <8 x i32> %1 to <16 x i16>
6960+
%4 = bitcast <8 x i32> %2 to <16 x i16>
6961+
%5 = shufflevector <16 x i16> %3, <16 x i16> %4, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
6962+
ret <16 x i16> %5
6963+
}
6964+
6965+
define <16 x i16> @shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30(<8 x i32> %a0, <8 x i32> %a1) {
6966+
; AVX1-LABEL: shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30:
6967+
; AVX1: # %bb.0:
6968+
; AVX1-NEXT: vpsrld $25, %xmm0, %xmm2
6969+
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
6970+
; AVX1-NEXT: vpsrld $25, %xmm0, %xmm0
6971+
; AVX1-NEXT: vpsrld $25, %xmm1, %xmm3
6972+
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
6973+
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
6974+
; AVX1-NEXT: vpsrld $25, %xmm1, %xmm1
6975+
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
6976+
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
6977+
; AVX1-NEXT: retq
6978+
;
6979+
; AVX2OR512VL-LABEL: shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30:
6980+
; AVX2OR512VL: # %bb.0:
6981+
; AVX2OR512VL-NEXT: vpsrld $25, %ymm0, %ymm0
6982+
; AVX2OR512VL-NEXT: vpsrld $25, %ymm1, %ymm1
6983+
; AVX2OR512VL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
6984+
; AVX2OR512VL-NEXT: retq
6985+
;
6986+
; XOPAVX1-LABEL: shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30:
6987+
; XOPAVX1: # %bb.0:
6988+
; XOPAVX1-NEXT: vpsrld $25, %xmm0, %xmm2
6989+
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
6990+
; XOPAVX1-NEXT: vpsrld $25, %xmm0, %xmm0
6991+
; XOPAVX1-NEXT: vpsrld $25, %xmm1, %xmm3
6992+
; XOPAVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
6993+
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
6994+
; XOPAVX1-NEXT: vpsrld $25, %xmm1, %xmm1
6995+
; XOPAVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
6996+
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
6997+
; XOPAVX1-NEXT: retq
6998+
;
6999+
; XOPAVX2-LABEL: shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30:
7000+
; XOPAVX2: # %bb.0:
7001+
; XOPAVX2-NEXT: vpsrld $25, %ymm0, %ymm0
7002+
; XOPAVX2-NEXT: vpsrld $25, %ymm1, %ymm1
7003+
; XOPAVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
7004+
; XOPAVX2-NEXT: retq
7005+
%1 = lshr <8 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
7006+
%2 = lshr <8 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
7007+
%3 = bitcast <8 x i32> %1 to <16 x i16>
7008+
%4 = bitcast <8 x i32> %2 to <16 x i16>
7009+
%5 = shufflevector <16 x i16> %3, <16 x i16> %4, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
7010+
ret <16 x i16> %5
7011+
}
7012+
69177013
define <16 x i16> @shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13(<16 x i16> %a) {
69187014
; AVX1-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
69197015
; AVX1: # %bb.0:

llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,58 @@ define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a
216216
ret <32 x i16> %shuffle
217217
}
218218

219+
define <32 x i16> @shuffle_v32i16_ashr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62(<16 x i32> %a0, <16 x i32> %a1) nounwind {
220+
; KNL-LABEL: shuffle_v32i16_ashr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62:
221+
; KNL: ## %bb.0:
222+
; KNL-NEXT: vpsrad $25, %zmm0, %zmm0
223+
; KNL-NEXT: vpsrad $25, %zmm1, %zmm1
224+
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
225+
; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
226+
; KNL-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
227+
; KNL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
228+
; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
229+
; KNL-NEXT: retq
230+
;
231+
; SKX-LABEL: shuffle_v32i16_ashr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62:
232+
; SKX: ## %bb.0:
233+
; SKX-NEXT: vpsrad $25, %zmm0, %zmm0
234+
; SKX-NEXT: vpsrad $25, %zmm1, %zmm1
235+
; SKX-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
236+
; SKX-NEXT: retq
237+
%1 = ashr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
238+
%2 = ashr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
239+
%3 = bitcast <16 x i32> %1 to <32 x i16>
240+
%4 = bitcast <16 x i32> %2 to <32 x i16>
241+
%5 = shufflevector <32 x i16> %3, <32 x i16> %4, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 32, i32 34, i32 36, i32 38, i32 8, i32 10, i32 12, i32 14, i32 40, i32 42, i32 44, i32 46, i32 16, i32 18, i32 20, i32 22, i32 48, i32 50, i32 52, i32 54, i32 24, i32 26, i32 28, i32 30, i32 56, i32 58, i32 60, i32 62>
242+
ret <32 x i16> %5
243+
}
244+
245+
define <32 x i16> @shuffle_v32i16_lshr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62(<16 x i32> %a0, <16 x i32> %a1) nounwind {
246+
; KNL-LABEL: shuffle_v32i16_lshr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62:
247+
; KNL: ## %bb.0:
248+
; KNL-NEXT: vpsrld $25, %zmm0, %zmm0
249+
; KNL-NEXT: vpsrld $25, %zmm1, %zmm1
250+
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
251+
; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
252+
; KNL-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
253+
; KNL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
254+
; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
255+
; KNL-NEXT: retq
256+
;
257+
; SKX-LABEL: shuffle_v32i16_lshr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62:
258+
; SKX: ## %bb.0:
259+
; SKX-NEXT: vpsrld $25, %zmm0, %zmm0
260+
; SKX-NEXT: vpsrld $25, %zmm1, %zmm1
261+
; SKX-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
262+
; SKX-NEXT: retq
263+
%1 = lshr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
264+
%2 = lshr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
265+
%3 = bitcast <16 x i32> %1 to <32 x i16>
266+
%4 = bitcast <16 x i32> %2 to <32 x i16>
267+
%5 = shufflevector <32 x i16> %3, <32 x i16> %4, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 32, i32 34, i32 36, i32 38, i32 8, i32 10, i32 12, i32 14, i32 40, i32 42, i32 44, i32 46, i32 16, i32 18, i32 20, i32 22, i32 48, i32 50, i32 52, i32 54, i32 24, i32 26, i32 28, i32 30, i32 56, i32 58, i32 60, i32 62>
268+
ret <32 x i16> %5
269+
}
270+
219271
define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) {
220272
; KNL-LABEL: insert_dup_mem_v32i16_i32:
221273
; KNL: ## %bb.0:

llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,94 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_
546546
ret <64 x i8> %shuffle
547547
}
548548

549+
define <64 x i8> @shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125(<16 x i32> %a0, <16 x i32> %a1) nounwind {
550+
; AVX512F-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
551+
; AVX512F: # %bb.0:
552+
; AVX512F-NEXT: vpsrad $25, %zmm0, %zmm0
553+
; AVX512F-NEXT: vpsrad $25, %zmm1, %zmm1
554+
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
555+
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
556+
; AVX512F-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
557+
; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
558+
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
559+
; AVX512F-NEXT: retq
560+
;
561+
; AVX512BW-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
562+
; AVX512BW: # %bb.0:
563+
; AVX512BW-NEXT: vpsrad $25, %zmm0, %zmm0
564+
; AVX512BW-NEXT: vpsrad $25, %zmm1, %zmm1
565+
; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
566+
; AVX512BW-NEXT: retq
567+
;
568+
; AVX512DQ-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
569+
; AVX512DQ: # %bb.0:
570+
; AVX512DQ-NEXT: vpsrad $25, %zmm0, %zmm0
571+
; AVX512DQ-NEXT: vpsrad $25, %zmm1, %zmm1
572+
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
573+
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
574+
; AVX512DQ-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
575+
; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
576+
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
577+
; AVX512DQ-NEXT: retq
578+
;
579+
; AVX512VBMI-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
580+
; AVX512VBMI: # %bb.0:
581+
; AVX512VBMI-NEXT: vpsrad $25, %zmm0, %zmm0
582+
; AVX512VBMI-NEXT: vpsrad $25, %zmm1, %zmm1
583+
; AVX512VBMI-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
584+
; AVX512VBMI-NEXT: retq
585+
%1 = ashr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
586+
%2 = ashr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
587+
%3 = bitcast <16 x i32> %1 to <64 x i8>
588+
%4 = bitcast <16 x i32> %2 to <64 x i8>
589+
%5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 64, i32 65, i32 68, i32 69, i32 72, i32 73, i32 76, i32 77, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29, i32 80, i32 81, i32 84, i32 85, i32 88, i32 89, i32 92, i32 93, i32 32, i32 33, i32 36, i32 37, i32 40, i32 41, i32 44, i32 45, i32 96, i32 97, i32 100, i32 101, i32 104, i32 105, i32 108, i32 109, i32 48, i32 49, i32 52, i32 53, i32 56, i32 57, i32 60, i32 61, i32 112, i32 113, i32 116, i32 117, i32 120, i32 121, i32 124, i32 125>
590+
ret <64 x i8> %5
591+
}
592+
593+
define <64 x i8> @shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125(<16 x i32> %a0, <16 x i32> %a1) nounwind {
594+
; AVX512F-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
595+
; AVX512F: # %bb.0:
596+
; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm0
597+
; AVX512F-NEXT: vpsrld $25, %zmm1, %zmm1
598+
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
599+
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
600+
; AVX512F-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
601+
; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
602+
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
603+
; AVX512F-NEXT: retq
604+
;
605+
; AVX512BW-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
606+
; AVX512BW: # %bb.0:
607+
; AVX512BW-NEXT: vpsrld $25, %zmm0, %zmm0
608+
; AVX512BW-NEXT: vpsrld $25, %zmm1, %zmm1
609+
; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
610+
; AVX512BW-NEXT: retq
611+
;
612+
; AVX512DQ-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
613+
; AVX512DQ: # %bb.0:
614+
; AVX512DQ-NEXT: vpsrld $25, %zmm0, %zmm0
615+
; AVX512DQ-NEXT: vpsrld $25, %zmm1, %zmm1
616+
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
617+
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
618+
; AVX512DQ-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
619+
; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
620+
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
621+
; AVX512DQ-NEXT: retq
622+
;
623+
; AVX512VBMI-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
624+
; AVX512VBMI: # %bb.0:
625+
; AVX512VBMI-NEXT: vpsrld $25, %zmm0, %zmm0
626+
; AVX512VBMI-NEXT: vpsrld $25, %zmm1, %zmm1
627+
; AVX512VBMI-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
628+
; AVX512VBMI-NEXT: retq
629+
%1 = lshr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
630+
%2 = lshr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
631+
%3 = bitcast <16 x i32> %1 to <64 x i8>
632+
%4 = bitcast <16 x i32> %2 to <64 x i8>
633+
%5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 64, i32 65, i32 68, i32 69, i32 72, i32 73, i32 76, i32 77, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29, i32 80, i32 81, i32 84, i32 85, i32 88, i32 89, i32 92, i32 93, i32 32, i32 33, i32 36, i32 37, i32 40, i32 41, i32 44, i32 45, i32 96, i32 97, i32 100, i32 101, i32 104, i32 105, i32 108, i32 109, i32 48, i32 49, i32 52, i32 53, i32 56, i32 57, i32 60, i32 61, i32 112, i32 113, i32 116, i32 117, i32 120, i32 121, i32 124, i32 125>
634+
ret <64 x i8> %5
635+
}
636+
549637
define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
550638
; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
551639
; AVX512F: # %bb.0:

llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,36 @@ define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
159159
ret <32 x i16> %1
160160
}
161161

162+
define <32 x i16> @combine_vpermi2var_as_packssdw(<16 x i32> %a0, <16 x i32> %a1) nounwind {
163+
; CHECK-LABEL: combine_vpermi2var_as_packssdw:
164+
; CHECK: # %bb.0:
165+
; CHECK-NEXT: vpsrad $25, %zmm0, %zmm0
166+
; CHECK-NEXT: vpsrad $25, %zmm1, %zmm1
167+
; CHECK-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
168+
; CHECK-NEXT: ret{{[l|q]}}
169+
%1 = ashr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
170+
%2 = ashr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
171+
%3 = bitcast <16 x i32> %1 to <32 x i16>
172+
%4 = bitcast <16 x i32> %2 to <32 x i16>
173+
%5 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %3, <32 x i16> <i16 0, i16 2, i16 4, i16 6, i16 32, i16 34, i16 36, i16 38, i16 8, i16 10, i16 12, i16 14, i16 40, i16 42, i16 44, i16 46, i16 16, i16 18, i16 20, i16 22, i16 48, i16 50, i16 52, i16 54, i16 24, i16 26, i16 28, i16 30, i16 56, i16 58, i16 60, i16 62>, <32 x i16> %4, i32 -1)
174+
ret <32 x i16> %5
175+
}
176+
177+
define <32 x i16> @combine_vpermi2var_as_packusdw(<16 x i32> %a0, <16 x i32> %a1) nounwind {
178+
; CHECK-LABEL: combine_vpermi2var_as_packusdw:
179+
; CHECK: # %bb.0:
180+
; CHECK-NEXT: vpsrld $25, %zmm0, %zmm0
181+
; CHECK-NEXT: vpsrld $25, %zmm1, %zmm1
182+
; CHECK-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
183+
; CHECK-NEXT: ret{{[l|q]}}
184+
%1 = lshr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
185+
%2 = lshr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
186+
%3 = bitcast <16 x i32> %1 to <32 x i16>
187+
%4 = bitcast <16 x i32> %2 to <32 x i16>
188+
%5 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %3, <32 x i16> <i16 0, i16 2, i16 4, i16 6, i16 32, i16 34, i16 36, i16 38, i16 8, i16 10, i16 12, i16 14, i16 40, i16 42, i16 44, i16 46, i16 16, i16 18, i16 20, i16 22, i16 48, i16 50, i16 52, i16 54, i16 24, i16 26, i16 28, i16 30, i16 56, i16 58, i16 60, i16 62>, <32 x i16> %4, i32 -1)
189+
ret <32 x i16> %5
190+
}
191+
162192
define <64 x i8> @combine_pshufb_as_packsswb(<32 x i16> %a0, <32 x i16> %a1) nounwind {
163193
; CHECK-LABEL: combine_pshufb_as_packsswb:
164194
; CHECK: # %bb.0:

0 commit comments

Comments
 (0)