DAG: Allow select ptr combine for non-0 address spaces #167909
Conversation
@llvm/pr-subscribers-backend-nvptx @llvm/pr-subscribers-llvm-selectiondag

Author: Matt Arsenault (arsenm)

Changes

Patch is 44.32 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/167909.diff

12 Files Affected:
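For context, the combine in question turns a select between two loaded values into a single load through a selected pointer. Here is a minimal sketch of the pattern in LLVM IR (hypothetical function and value names, using addrspace(3) as in the tests below). Before this patch the DAG combine fired only for loads in the default address space 0; with it, the combine fires whenever both loads agree on the address space:

; Input pattern: two loads feeding a select.
define i32 @select_of_loads(i1 %cond, ptr addrspace(3) %a, ptr addrspace(3) %b) {
  %x = load i32, ptr addrspace(3) %a, align 4
  %y = load i32, ptr addrspace(3) %b, align 4
  %r = select i1 %cond, i32 %x, i32 %y
  ret i32 %r
}

; Roughly what the combine produces, expressed at the IR level:
; select the pointer first, then issue one load.
define i32 @load_of_select(i1 %cond, ptr addrspace(3) %a, ptr addrspace(3) %b) {
  %p = select i1 %cond, ptr addrspace(3) %a, ptr addrspace(3) %b
  %r = load i32, ptr addrspace(3) %p, align 4
  ret i32 %r
}

As the diff below shows, the rebuilt load takes the minimum alignment of the two originals and, because the source pointer info is discarded, keeps only the shared address space on its memory operand.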
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index df353c4d91b1a..c5f3cd29f684e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -29031,9 +29031,9 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
// over-conservative. It would be beneficial to be able to remember
// both potential memory locations. Since we are discarding
// src value info, don't do the transformation if the memory
- // locations are not in the default address space.
- LLD->getPointerInfo().getAddrSpace() != 0 ||
- RLD->getPointerInfo().getAddrSpace() != 0 ||
+ // locations are not in the same address space.
+ LLD->getPointerInfo().getAddrSpace() !=
+ RLD->getPointerInfo().getAddrSpace() ||
// We can't produce a CMOV of a TargetFrameIndex since we won't
// generate the address generation required.
LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
@@ -29115,6 +29115,9 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
// but the new load must be the minimum (most restrictive) alignment of the
// inputs.
Align Alignment = std::min(LLD->getAlign(), RLD->getAlign());
+ unsigned AddrSpace = LLD->getAddressSpace();
+ assert(AddrSpace == RLD->getAddressSpace());
+
MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
if (!RLD->isInvariant())
MMOFlags &= ~MachineMemOperand::MOInvariant;
@@ -29123,15 +29126,16 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
// FIXME: Discards pointer and AA info.
Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
- LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
- MMOFlags);
+ LLD->getChain(), Addr, MachinePointerInfo(AddrSpace),
+ Alignment, MMOFlags);
} else {
// FIXME: Discards pointer and AA info.
Load = DAG.getExtLoad(
LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
: LLD->getExtensionType(),
SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
- MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
+ MachinePointerInfo(AddrSpace), LLD->getMemoryVT(), Alignment,
+ MMOFlags);
}
// Users of the select now use the result of the load.
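The key detail in the hunk above is that the rebuilt load's MachinePointerInfo now records the (asserted-equal) address space instead of defaulting to 0. A hedged C++ sketch of what that constructor captures, assuming the upstream MachinePointerInfo API; the helper name is made up for illustration:

#include "llvm/CodeGen/MachineMemOperand.h"

// Hypothetical helper, not part of the patch: build pointer info that keeps
// only the address space. The underlying IR value and AA info are dropped
// (hence the FIXME comments in the combine above), but anything downstream
// that keys off MachineMemOperand::getAddrSpace() still sees the right one.
static llvm::MachinePointerInfo pointerInfoForAddrSpace(unsigned AddrSpace) {
  return llvm::MachinePointerInfo(AddrSpace); // V = nullptr, Offset = 0
}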
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index d9ad9590d9762..5aabad682ad30 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -7,27 +7,31 @@
define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %ptr0, [8 x i32], ptr %ptr1, [8 x i32], ptr addrspace(1) %ptr2) {
; GCN-LABEL: select_ptr_crash_i64_flat:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[8:9], 0x0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x28
-; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x50
-; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x78
; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_load_dword s2, s[8:9], 0x0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x78
+; GCN-NEXT: s_add_u32 s4, s8, 40
+; GCN-NEXT: s_addc_u32 s3, s9, 0
+; GCN-NEXT: s_add_u32 s5, s8, 0x50
+; GCN-NEXT: s_addc_u32 s6, s9, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_eq_u32 s6, 0
-; GCN-NEXT: s_cselect_b32 s0, s0, s2
-; GCN-NEXT: s_cselect_b32 s1, s1, s3
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: s_add_u32 s0, s0, 4
+; GCN-NEXT: s_cmp_eq_u32 s2, 0
+; GCN-NEXT: s_cselect_b32 s3, s3, s6
+; GCN-NEXT: s_cselect_b32 s2, s4, s5
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_add_u32 s2, s2, 4
; GCN-NEXT: flat_load_dword v0, v[0:1]
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: s_addc_u32 s3, s3, 0
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: flat_load_dword v1, v[1:2]
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
@@ -45,25 +49,28 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %p
define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], ptr addrspace(1) %ptr0, [8 x i32], ptr addrspace(1) %ptr1, [8 x i32], ptr addrspace(1) %ptr2) {
; GCN-LABEL: select_ptr_crash_i64_global:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x28
-; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x50
-; GCN-NEXT: s_load_dword s6, s[8:9], 0x0
-; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x78
; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_load_dword s2, s[8:9], 0x0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x78
+; GCN-NEXT: s_add_u32 s4, s8, 40
+; GCN-NEXT: s_addc_u32 s3, s9, 0
+; GCN-NEXT: s_add_u32 s5, s8, 0x50
+; GCN-NEXT: s_addc_u32 s6, s9, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GCN-NEXT: s_cmp_eq_u32 s2, 0
+; GCN-NEXT: s_cselect_b32 s3, s3, s6
+; GCN-NEXT: s_cselect_b32 s2, s4, s5
; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NEXT: s_cmp_eq_u32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cselect_b32 s1, s1, s3
-; GCN-NEXT: s_cselect_b32 s0, s0, s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
%tmp2 = icmp eq i32 %tmp, 0
%tmp3 = load i64, ptr addrspace(1) %ptr0, align 8
@@ -78,22 +85,18 @@ define amdgpu_kernel void @select_ptr_crash_i64_local(i32 %tmp, ptr addrspace(3)
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NEXT: ds_read_b64 v[0:1], v0
-; GCN-NEXT: ds_read_b64 v[2:3], v2
; GCN-NEXT: s_cmp_eq_u32 s0, 0
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-NEXT: s_cselect_b32 s0, s1, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ds_read_b64 v[0:1], v0
; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
%tmp2 = icmp eq i32 %tmp, 0
@@ -112,22 +115,20 @@ define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, ptr addr
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128
-; GCN-NEXT: ds_read_b64 v[2:3], v2 offset:512
+; GCN-NEXT: s_addk_i32 s1, 0x80
+; GCN-NEXT: s_addk_i32 s2, 0x200
; GCN-NEXT: s_cmp_eq_u32 s0, 0
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-NEXT: s_cselect_b32 s0, s1, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ds_read_b64 v[0:1], v0
; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
%tmp2 = icmp eq i32 %tmp, 0
diff --git a/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll b/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll
index 423fb7d52d3e3..cc5ae2717faf0 100644
--- a/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll
@@ -22,12 +22,12 @@ define i32 @select_load_i32_p1(i1 %cond, ptr addrspace(1) %a, ptr addrspace(1) %
; CHECK-LABEL: select_load_i32_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dword v5, v[1:2], off
-; CHECK-NEXT: global_load_dword v6, v[3:4], off
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT: global_load_dword v0, v[1:2], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%ld0 = load i32, ptr addrspace(1) %a
%ld1 = load i32, ptr addrspace(1) %b
@@ -39,12 +39,11 @@ define i32 @select_load_i32_p3(i1 %cond, ptr addrspace(3) %a, ptr addrspace(3) %
; CHECK-LABEL: select_load_i32_p3:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v1, v1
-; CHECK-NEXT: ds_read_b32 v2, v2
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; CHECK-NEXT: ds_read_b32 v0, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%ld0 = load i32, ptr addrspace(3) %a
%ld1 = load i32, ptr addrspace(3) %b
@@ -90,12 +89,12 @@ define i8 @select_load_i8_p1(i1 %cond, ptr addrspace(1) %a, ptr addrspace(1) %b)
; CHECK-LABEL: select_load_i8_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_ubyte v5, v[1:2], off
-; CHECK-NEXT: global_load_ubyte v6, v[3:4], off
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT: global_load_ubyte v0, v[1:2], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%ld0 = load i8, ptr addrspace(1) %a
%ld1 = load i8, ptr addrspace(1) %b
@@ -107,12 +106,16 @@ define i32 @select_load_i32_p1_offset(i1 %cond, ptr addrspace(1) %a, ptr addrspa
; CHECK-LABEL: select_load_i32_p1_offset:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dword v3, v[1:2], off offset:256
-; CHECK-NEXT: global_load_dword v4, v[1:2], off offset:512
+; CHECK-NEXT: v_add_co_u32_e32 v3, vcc, 0x100, v1
+; CHECK-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v2, vcc
+; CHECK-NEXT: v_add_co_u32_e32 v5, vcc, 0x200, v1
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
+; CHECK-NEXT: global_load_dword v0, v[0:1], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%gep.a = getelementptr i8, ptr addrspace(1) %a, i64 256
%gep.b = getelementptr i8, ptr addrspace(1) %a, i64 512
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index bee00f6efbd12..e754f665c5f43 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -16,9 +16,9 @@
; SelectionDAGBuilder for some reason changes the select type.
; VI: s_cselect_b64
; VI: v_cndmask_b32
-define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
%a = load <2 x i8>, ptr addrspace(1) %a.ptr, align 2
- %b = load <2 x i8>, ptr addrspace(1) %b.ptr, align 2
+ %b = load <2 x i8>, ptr addrspace(4) %b.ptr, align 2
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
store <2 x i8> %select, ptr addrspace(1) %out, align 2
@@ -28,9 +28,9 @@ define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1)
; GCN-LABEL: {{^}}v_select_v4i8:
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
%a = load <4 x i8>, ptr addrspace(1) %a.ptr
- %b = load <4 x i8>, ptr addrspace(1) %b.ptr
+ %b = load <4 x i8>, ptr addrspace(4) %b.ptr
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
store <4 x i8> %select, ptr addrspace(1) %out, align 4
@@ -41,9 +41,9 @@ define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1)
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
%a = load <8 x i8>, ptr addrspace(1) %a.ptr
- %b = load <8 x i8>, ptr addrspace(1) %b.ptr
+ %b = load <8 x i8>, ptr addrspace(4) %b.ptr
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
store <8 x i8> %select, ptr addrspace(1) %out, align 4
@@ -56,9 +56,9 @@ define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1)
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v16i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v16i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
%a = load <16 x i8>, ptr addrspace(1) %a.ptr
- %b = load <16 x i8>, ptr addrspace(1) %b.ptr
+ %b = load <16 x i8>, ptr addrspace(4) %b.ptr
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
store <16 x i8> %select, ptr addrspace(1) %out, align 4
@@ -93,13 +93,16 @@ define amdgpu_kernel void @select_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2
}
; GCN-LABEL: {{^}}v_select_v2i16:
-; GCN: buffer_load_dword v
-; GCN: buffer_load_dword v
+; GCN: {{buffer|flat|global}}_load_dword v
+; GCN: {{buffer|flat|global}}_load_dword v
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
- %a = load <2 x i16>, ptr addrspace(1) %a.ptr
- %b = load <2 x i16>, ptr addrspace(1) %b.ptr
+define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.a = getelementptr <2 x i16>, ptr addrspace(1) %a.ptr, i32 %id
+ %gep.b = getelementptr <2 x i16>, ptr addrspace(4) %b.ptr, i32 %id
+ %a = load <2 x i16>, ptr addrspace(1) %gep.a
+ %b = load <2 x i16>, ptr addrspace(4) %gep.b
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
store <2 x i16> %select, ptr addrspace(1) %out, align 4
@@ -114,9 +117,9 @@ define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1
; VI: s_cselect_b64
; GFX9: cndmask
; GFX9: cndmask
-define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
%a = load <3 x i16>, ptr addrspace(1) %a.ptr
- %b = load <3 x i16>, ptr addrspace(1) %b.ptr
+ %b = load <3 x i16>, ptr addrspace(4) %b.ptr
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
store <3 x i16> %select, ptr addrspace(1) %out, align 4
@@ -127,9 +130,9 @@ define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
%a = load <4 x i16>, ptr addrspace(1) %a.ptr
- %b = load <4 x i16>, ptr addrspace(1) %b.ptr
+ %b = load <4 x i16>, ptr addrspace(4) %b.ptr
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
store <4 x i16> %select, ptr addrspace(1) %out, align 4
@@ -142,9 +145,9 @@ define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
%a = load <8 x i16>, ptr addrspace(1) %a.ptr
- %b = load <8 x i16>, ptr addrspace(1) %b.ptr
+ %b = load <8 x i16>, ptr addrspace(4) %b.ptr
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
store <8 x i16> %select, ptr addrspace(1) %out, align 4
@@ -161,9 +164,9 @@ define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
%a = load <16 x i16>, ptr addrspace(1) %a.ptr
- %b = load <16 x i16>, ptr addrspace(1) %b.ptr
+ %b = load <16 x i16>, ptr addrspace(4) %b.ptr
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <16 x i16> %a, <16 x i16> %b
store <16 x i16> %select, ptr addrspace(1) %out, align 4
@@ -188,9 +191,9 @@ define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
%a = load <32 x i16>, ptr addr...
[truncated]
shiltian left a comment:
Interesting. The change looks pretty mechanical, but I'm quite surprised to see this wasn't done already.
I've had this sitting in a branch since 2018.
Force-pushed from 780323f to 57cf94b.
