Bug 108949 - RADV: Subgroup codegen is sub-optimal
Summary: RADV: Subgroup codegen is sub-optimal
Status: NEW
Alias: None
Product: Mesa
Classification: Unclassified
Component: Drivers/Vulkan/radeon (show other bugs)
Version: 18.2
Hardware: Other All
Importance: medium normal
Assignee: mesa-dev
QA Contact: mesa-dev
URL:
Whiteboard:
Keywords:
Depends on:
Blocks:
 
Reported: 2018-12-05 11:20 UTC by maister
Modified: 2018-12-05 11:48 UTC (History)
0 users

See Also:
i915 platform:
i915 features:


Attachments

Note: You need to log in before you can comment on or make changes to this bug.
Description maister 2018-12-05 11:20:29 UTC
I have some code using subgroups which generates suboptimal code where I expect more use of SGPRs, but I see a lot of VGPRs/vector loads being used instead. The code-gen is worse than AMDVLK and much worse than AMD's Windows driver as a result. I filed a similar issue here: https://github.com/GPUOpen-Drivers/AMDVLK/issues/68.
On a more useful, more complicated test, I get 0% uplift from using subgroup operations on RADV, 5% on AMDVLK and 15% on Windows. The GPU is an RX 470 (Polaris) and the Mesa version is 18.2.5.

With a trivial test: https://github.com/Themaister/Granite/blob/master/tests/assets/shaders/subgroup.comp
I expect the use of subgroupBroadcastFirst(subgroupOr(...)) to result in all loads being scalar, but in the loop I get:

BB629_1:
	s_load_dwordx4 s[8:11], s[0:1], 0x0                                  ; C00A0200 00000000
	s_ff1_i32_b32 s3, s2                                                 ; BE831002
	v_mul_u32_u24_e64 v7, s3, 48                                         ; D1080007 00016003
	v_or_b32_e32 v5, 4, v7                                               ; 280A0E84
	v_mad_u32_u24 v10, s3, 48, 20                                        ; D1C3000A 02516003
	v_mad_u32_u24 v8, s3, 48, 16                                         ; D1C30008 02416003
	s_waitcnt lgkmcnt(0)                                                 ; BF8C007F
*	buffer_load_dwordx2 v[5:6], v5, s[8:11], 0 offen                     ; E0541000 80020505
*	buffer_load_dword v10, v10, s[8:11], 0 offen                         ; E0501000 80020A0A
*	buffer_load_dword v14, v7, s[8:11], 0 offen                          ; E0501000 80020E07
*	buffer_load_dword v8, v8, s[8:11], 0 offen                           ; E0501000 80020808
	v_mad_u32_u24 v11, s3, 48, 24                                        ; D1C3000B 02616003
	v_or_b32_e32 v7, 12, v7                                              ; 280E0E8C
	v_mad_u32_u24 v12, s3, 48, 28                                        ; D1C3000C 02716003
*	buffer_load_dword v7, v7, s[8:11], 0 offen                           ; E0501000 80020707
	v_mad_u32_u24 v9, s3, 48, 32                                         ; D1C30009 02816003
	buffer_load_dword v11, v11, s[8:11], 0 offen                         ; E0501000 80020B0B
	v_mad_u32_u24 v13, s3, 48, 36                                        ; D1C3000D 02916003
*	buffer_load_dword v12, v12, s[8:11], 0 offen                         ; E0501000 80020C0C
*	buffer_load_dword v9, v9, s[8:11], 0 offen                           ; E0501000 80020909
...

where Windows codegen is:

label_0028:
  s_cmp_eq_i32  s0, 0                                   // 0000000000A0: BF008000
  s_cbranch_scc1  label_0052                            // 0000000000A4: BF850028
  s_and_b32     s1, s3, 0x0000ffff                      // 0000000000A8: 8601FF03 0000FFFF
  s_ff1_i32_b32  s4, s0                                 // 0000000000B0: BE841000
  s_andn2_b32   s1, s1, 0x3fff0000                      // 0000000000B4: 8901FF01 3FFF0000
  s_mul_i32     s5, s4, 48                              // 0000000000BC: 9205B004
  s_mov_b32     s12, s2                                 // 0000000000C0: BE8C0002
  s_mov_b32     s13, s1                                 // 0000000000C4: BE8D0001
  s_movk_i32    s14, 0xffff                             // 0000000000C8: B00EFFFF
  s_mov_b32     s15, 0x00024fac                         // 0000000000CC: BE8F00FF 00024FAC
  s_buffer_load_dwordx8  s[16:23], s[12:15], s5         // 0000000000D4: C02C0406 00000005
  s_add_u32     s1, s5, 32                              // 0000000000DC: 8001A005
  s_buffer_load_dwordx4  s[12:15], s[12:15], s1         // 0000000000E0: C0280306 00000001
  s_lshl_b32    s1, 1, s4                               // 0000000000E8: 8E010481
  s_xor_b32     s0, s0, s1                              // 0000000000EC: 88000100
  s_waitcnt     vmcnt(0) & lgkmcnt(0)                   // 0000000000F0: BF8C0070
...

The subgroupOr is also implemented strangely: RADV generates code similar to AMDVLK's, i.e. this:

	v_mov_b32_dpp v7, v7  quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf ; 7E0E02FA FF00B107
	v_or_b32_e32 v5, v5, v7                                              ; 280A0F05
	v_mov_b32_e32 v7, v5                                                 ; 7E0E0305
	s_nop 1                                                              ; BF800001
	v_mov_b32_dpp v7, v7  quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xf ; 7E0E02FA FF004E07
	v_or_b32_e32 v5, v5, v7                                              ; 280A0F05
	v_mov_b32_e32 v7, 0                                                  ; 7E0E0280
	s_nop 1                                                              ; BF800001
	v_mov_b32_dpp v7, v5  row_half_mirror row_mask:0xf bank_mask:0xf     ; 7E0E02FA FF014105
	v_or_b32_e32 v5, v5, v7                                              ; 280A0F05
	v_mov_b32_e32 v7, 0                                                  ; 7E0E0280
	s_nop 1                                                              ; BF800001
	v_mov_b32_dpp v7, v5  row_mirror row_mask:0xf bank_mask:0xf          ; 7E0E02FA FF014005
	v_or_b32_e32 v5, v5, v7                                              ; 280A0F05
	v_mov_b32_e32 v7, 0                                                  ; 7E0E0280
	s_nop 1                                                              ; BF800001
	v_mov_b32_dpp v7, v5  row_bcast:15 row_mask:0xa bank_mask:0xf        ; 7E0E02FA AF014205
	v_or_b32_e32 v5, v5, v7                                              ; 280A0F05
	s_nop 1                                                              ; BF800001
	v_mov_b32_dpp v6, v5  row_bcast:31 row_mask:0xc bank_mask:0xf        ; 7E0C02FA CF014305
	v_or_b32_e32 v5, v5, v6                                              ; 280A0D05
	v_readlane_b32 s8, v5, 63                                            ; D2890008 00017F05

instead of:
  v_or_b32      v13, v13, v13 row_shr:1                 // 000000000048: 281A1AFA FF01110D
  s_nop         0x0001                                  // 000000000050: BF800001
  v_or_b32      v13, v13, v13 row_shr:2                 // 000000000054: 281A1AFA FF01120D
  s_nop         0x0001                                  // 00000000005C: BF800001
  v_or_b32      v13, v13, v13 row_shr:4                 // 000000000060: 281A1AFA FF01140D
  s_nop         0x0001                                  // 000000000068: BF800001
  v_or_b32      v13, v13, v13 row_shr:8                 // 00000000006C: 281A1AFA FF01180D
  s_nop         0x0001                                  // 000000000074: BF800001
  v_or_b32      v13, v13, v13 row_bcast:15 row_mask:0xa // 000000000078: 281A1AFA AF01420D
  s_nop         0x0001                                  // 000000000080: BF800001
  v_or_b32      v13, v13, v13 row_bcast:31 row_mask:0xc // 000000000084: 281A1AFA CF01430D
  s_mov_b64     exec, s[0:1]                            // 00000000008C: BEFE0100
  v_readlane_b32  s0, v13, 63                           // 000000000090: D2890000 00017F0D
Comment 1 Connor Abbott 2018-12-05 11:35:21 UTC
This should be fixed by https://github.com/llvm-mirror/llvm/commit/e3924b1c15606bb5bf98392e0c20e731b4965311 which was just committed 5 days ago. You'll need to build LLVM and Mesa master to try it out.
Comment 2 maister 2018-12-05 11:48:12 UTC
Interesting. No, haven't tried with an LLVM that recent. I'll post when I have results.


Use of freedesktop.org services, including Bugzilla, is subject to our Code of Conduct. How we collect and use information is described in our Privacy Policy.