111496 – dEQP-GLES31.functional.shaders.builtin_functions.integer.umulextended.uint_highp_vertex fails with bad intrinsic

Bug 111496 - dEQP-GLES31.functional.shaders.builtin_functions.integer.umulextended.uint_highp_vertex fails with bad intrinsic

Summary: dEQP-GLES31.functional.shaders.builtin_functions.integer.umulextended.uint_highp_vertex fails with bad intrinsic

Status:	RESOLVED FIXED

Alias:	None

Product:	Mesa
Classification:	Unclassified
Component:	Drivers/Gallium/llvmpipe (show other bugs)
Version:	unspecified
Hardware:	Other All

Importance:	not set not set
Assignee:	mesa-dev
QA Contact:	mesa-dev

URL:
Whiteboard:
Keywords:

Depends on:
Blocks:

Reported:	2019-08-27 07:32 UTC by Dave Airlie
Modified:	2019-08-29 14:58 UTC (History)
CC List:	0 users

See Also:
i915 platform:
i915 features:

Attachments

Description Dave Airlie 2019-08-27 07:32:44 UTC

Debug output below:

llvm (version 0x800) found no intrinsic for llvm.x86.avx2.pmulu.dq, going to crash...
This occurs on a Skylake CPU.

llvmpipe: Fragment shader #131 variant #0:
FRAG
DCL IN[0].xy, GENERIC[9], CONSTANT
DCL OUT[0], COLOR
DCL OUT[1], COLOR[1]
  0: MOV OUT[1].x, IN[0].xxxx
  1: MOV OUT[0].x, IN[0].yyyy
  2: END
fs variant 0x1f0f0bc:
cbuf_format[0] = PIPE_FORMAT_R32_UINT
cbuf_format[1] = PIPE_FORMAT_R32_UINT
blend.colormask = 0x1
variant->opaque = 0

; ModuleID = 'fs131_variant0'
source_filename = "fs131_variant0"
target datalayout = "e-p:64:64:64-i64:64:64-a0:0:64-s0:64:64"

; Function Attrs: nounwind readnone speculatable
declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>) #0

; Function Attrs: nounwind readnone
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) #1

; Function Attrs: nounwind readnone
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) #1

define void @fs131_variant0_partial({ [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* noalias %context, i32 %x, i32 %y, i32, float* noalias %a0, float* noalias %dadx, float* noalias %dady, <16 x i8>** noalias %color_ptr_ptr, i8* noalias %depth, i32 %mask_input, { { [2048 x i32], [128 x i64] }*, i64, i64, i32 }* noalias %thread_data, i32* noalias %stride_ptr, i32 %depth_stride) {
entry:
  %output16 = alloca <8 x float>
  %output15 = alloca <8 x float>
  %output14 = alloca <8 x float>
  %output13 = alloca <8 x float>
  %output12 = alloca <8 x float>
  %output11 = alloca <8 x float>
  %output10 = alloca <8 x float>
  %output = alloca <8 x float>
  %looplimiter = alloca i32
  %execution_mask = alloca <8 x i32>
  %color9 = alloca <8 x float>, i32 2
  %color8 = alloca <8 x float>, i32 2
  %color7 = alloca <8 x float>, i32 2
  %color6 = alloca <8 x float>, i32 2
  %color5 = alloca <8 x float>, i32 2
  %color4 = alloca <8 x float>, i32 2
  %color3 = alloca <8 x float>, i32 2
  %color = alloca <8 x float>, i32 2
  %loop_counter = alloca i32
  %1 = alloca <8 x float>, i32 2
  %2 = alloca <8 x float>, i32 2
  %mask_store = alloca <8 x i32>, i32 2
  %thread_data.invocs_ptr = getelementptr { { [2048 x i32], [128 x i64] }*, i64, i64, i32 }, { { [2048 x i32], [128 x i64] }*, i64, i64, i32 }* %thread_data, i32 0, i32 2
  %3 = load i64, i64* %thread_data.invocs_ptr
  %invoc_count = add i64 %3, 1
  store i64 %invoc_count, i64* %thread_data.invocs_ptr
  %4 = sitofp i32 %x to float
  %5 = sitofp i32 %y to float
  %6 = getelementptr <8 x float>, <8 x float>* %2, i32 0
  store <8 x float> <float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 2.000000e+00, float 3.000000e+00>, <8 x float>* %6
  %7 = getelementptr <8 x float>, <8 x float>* %1, i32 0
  store <8 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, <8 x float>* %7
  %8 = getelementptr <8 x float>, <8 x float>* %2, i32 1
  store <8 x float> <float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 2.000000e+00, float 3.000000e+00>, <8 x float>* %8
  %9 = getelementptr <8 x float>, <8 x float>* %1, i32 1
  store <8 x float> <float 2.000000e+00, float 2.000000e+00, float 3.000000e+00, float 3.000000e+00, float 2.000000e+00, float 2.000000e+00, float 3.000000e+00, float 3.000000e+00>, <8 x float>* %9
  %10 = getelementptr float, float* %dadx, i32 0
  %11 = bitcast float* %10 to <4 x float>*
  %pos.x.dadxaos = load <4 x float>, <4 x float>* %11
  %12 = getelementptr float, float* %dady, i32 0
  %13 = bitcast float* %12 to <4 x float>*
  %pos.x.dadyaos = load <4 x float>, <4 x float>* %13
  %14 = getelementptr float, float* %a0, i32 0
  %15 = bitcast float* %14 to <4 x float>*
  %pos.x.a0aos = load <4 x float>, <4 x float>* %15
  %16 = getelementptr float, float* %a0, i32 4
  %17 = bitcast float* %16 to <4 x float>*
  %input0.x.a0aos = load <4 x float>, <4 x float>* %17
  %mask_ptr = getelementptr <8 x i32>, <8 x i32>* %mask_store, i32 0
  %18 = lshr i32 %mask_input, 0
  %19 = insertelement <8 x i32> undef, i32 %18, i32 0
  %20 = shufflevector <8 x i32> %19, <8 x i32> undef, <8 x i32> zeroinitializer
  %21 = and <8 x i32> %20, <i32 1, i32 2, i32 16, i32 32, i32 4, i32 8, i32 64, i32 128>
  %22 = icmp eq <8 x i32> %21, <i32 1, i32 2, i32 16, i32 32, i32 4, i32 8, i32 64, i32 128>
  %23 = sext <8 x i1> %22 to <8 x i32>
  store <8 x i32> %23, <8 x i32>* %mask_ptr
  %mask_ptr1 = getelementptr <8 x i32>, <8 x i32>* %mask_store, i32 1
  %24 = lshr i32 %mask_input, 8
  %25 = insertelement <8 x i32> undef, i32 %24, i32 0
  %26 = shufflevector <8 x i32> %25, <8 x i32> undef, <8 x i32> zeroinitializer
  %27 = and <8 x i32> %26, <i32 1, i32 2, i32 16, i32 32, i32 4, i32 8, i32 64, i32 128>
  %28 = icmp eq <8 x i32> %27, <i32 1, i32 2, i32 16, i32 32, i32 4, i32 8, i32 64, i32 128>
  %29 = sext <8 x i1> %28 to <8 x i32>
  store <8 x i32> %29, <8 x i32>* %mask_ptr1
  %context.stencil_ref_front_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 6
  %context.stencil_ref_front = load i32, i32* %context.stencil_ref_front_ptr
  %context.stencil_ref_back_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 7
  %context.stencil_ref_back = load i32, i32* %context.stencil_ref_back_ptr
  %30 = insertelement <8 x i32> undef, i32 %context.stencil_ref_front, i32 0
  %31 = shufflevector <8 x i32> %30, <8 x i32> undef, <8 x i32> zeroinitializer
  %32 = insertelement <8 x i32> undef, i32 %context.stencil_ref_back, i32 0
  %33 = shufflevector <8 x i32> %32, <8 x i32> undef, <8 x i32> zeroinitializer
  %context.constants_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 0
  %context.num_constants_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 1
  %context.ssbos_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 11
  %context.num_ssbos_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 12
  store i32 0, i32* %loop_counter
  store i32 0, i32* %loop_counter
  br label %loop_begin

loop_begin:                                       ; preds = %skip, %entry
  %34 = load i32, i32* %loop_counter
  %35 = icmp ult i32 %34, 2
  br i1 %35, label %loop_body, label %loop_exit

loop_body:                                        ; preds = %loop_begin
  %mask_ptr2 = getelementptr <8 x i32>, <8 x i32>* %mask_store, i32 %34
  %36 = load <8 x i32>, <8 x i32>* %mask_ptr2
  store <8 x i32> zeroinitializer, <8 x i32>* %execution_mask
  store <8 x i32> %36, <8 x i32>* %execution_mask
  %37 = load <8 x i32>, <8 x i32>* %execution_mask
  %38 = bitcast <8 x i32> %37 to i256
  %39 = icmp eq i256 %38, 0
  br i1 %39, label %skip, label %40

; <label>:40:                                     ; preds = %loop_body
  %41 = getelementptr <8 x float>, <8 x float>* %2, i32 %34
  %42 = load <8 x float>, <8 x float>* %41
  %43 = getelementptr <8 x float>, <8 x float>* %1, i32 %34
  %44 = load <8 x float>, <8 x float>* %43
  %45 = insertelement <8 x float> undef, float %4, i32 0
  %46 = shufflevector <8 x float> %45, <8 x float> undef, <8 x i32> zeroinitializer
  %47 = fadd <8 x float> %42, %46
  %48 = insertelement <8 x float> undef, float %5, i32 0
  %49 = shufflevector <8 x float> %48, <8 x float> undef, <8 x i32> zeroinitializer
  %50 = fadd <8 x float> %44, %49
  %51 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <8 x float> %47, <8 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>) #2
  %52 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> zeroinitializer, <8 x float> %50, <8 x float> %51) #2
  %53 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> zeroinitializer, <8 x float> %47, <8 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>) #2
  %54 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <8 x float> %50, <8 x float> %53) #2
  %55 = shufflevector <4 x float> %pos.x.dadxaos, <4 x float> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %56 = shufflevector <4 x float> %pos.x.dadyaos, <4 x float> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %57 = shufflevector <4 x float> %pos.x.a0aos, <4 x float> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %58 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %55, <8 x float> %47, <8 x float> %57) #2
  %59 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %56, <8 x float> %50, <8 x float> %58) #2
  %60 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %59, <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>) #2
  %61 = shufflevector <4 x float> %pos.x.dadxaos, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %62 = shufflevector <4 x float> %pos.x.dadyaos, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %63 = shufflevector <4 x float> %pos.x.a0aos, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %64 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %61, <8 x float> %47, <8 x float> %63) #2
  %65 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %62, <8 x float> %50, <8 x float> %64) #2
  %66 = getelementptr <8 x float>, <8 x float>* %2, i32 %34
  %67 = load <8 x float>, <8 x float>* %66
  %68 = getelementptr <8 x float>, <8 x float>* %1, i32 %34
  %69 = load <8 x float>, <8 x float>* %68
  %70 = insertelement <8 x float> undef, float %4, i32 0
  %71 = shufflevector <8 x float> %70, <8 x float> undef, <8 x i32> zeroinitializer
  %72 = fadd <8 x float> %67, %71
  %73 = insertelement <8 x float> undef, float %5, i32 0
  %74 = shufflevector <8 x float> %73, <8 x float> undef, <8 x i32> zeroinitializer
  %75 = fadd <8 x float> %69, %74
  %76 = shufflevector <4 x float> %input0.x.a0aos, <4 x float> undef, <8 x i32> zeroinitializer
  %77 = shufflevector <4 x float> %input0.x.a0aos, <4 x float> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  store i32 0, i32* %looplimiter
  store i32 65535, i32* %looplimiter
  store <8 x float> zeroinitializer, <8 x float>* %output
  store <8 x float> zeroinitializer, <8 x float>* %output10
  store <8 x float> zeroinitializer, <8 x float>* %output11
  store <8 x float> zeroinitializer, <8 x float>* %output12
  store <8 x float> zeroinitializer, <8 x float>* %output13
  store <8 x float> zeroinitializer, <8 x float>* %output14
  store <8 x float> zeroinitializer, <8 x float>* %output15
  store <8 x float> zeroinitializer, <8 x float>* %output16
  store <8 x float> %76, <8 x float>* %output13
  store <8 x float> %77, <8 x float>* %output
  %color0.r = load <8 x float>, <8 x float>* %output
  %78 = getelementptr <8 x float>, <8 x float>* %color, i32 %34
  store <8 x float> %color0.r, <8 x float>* %78
  %color0.g = load <8 x float>, <8 x float>* %output10
  %79 = getelementptr <8 x float>, <8 x float>* %color3, i32 %34
  store <8 x float> %color0.g, <8 x float>* %79
  %color0.b = load <8 x float>, <8 x float>* %output11
  %80 = getelementptr <8 x float>, <8 x float>* %color4, i32 %34
  store <8 x float> %color0.b, <8 x float>* %80
  %color0.a = load <8 x float>, <8 x float>* %output12
  %81 = getelementptr <8 x float>, <8 x float>* %color5, i32 %34
  store <8 x float> %color0.a, <8 x float>* %81
  %color1.r = load <8 x float>, <8 x float>* %output13
  %82 = getelementptr <8 x float>, <8 x float>* %color6, i32 %34
  store <8 x float> %color1.r, <8 x float>* %82
  %color1.g = load <8 x float>, <8 x float>* %output14
  %83 = getelementptr <8 x float>, <8 x float>* %color7, i32 %34
  store <8 x float> %color1.g, <8 x float>* %83
  %color1.b = load <8 x float>, <8 x float>* %output15
  %84 = getelementptr <8 x float>, <8 x float>* %color8, i32 %34
  store <8 x float> %color1.b, <8 x float>* %84
  %color1.a = load <8 x float>, <8 x float>* %output16
  %85 = getelementptr <8 x float>, <8 x float>* %color9, i32 %34
  store <8 x float> %color1.a, <8 x float>* %85
  br label %skip

skip:                                             ; preds = %40, %loop_body
  %86 = load <8 x i32>, <8 x i32>* %execution_mask
  store <8 x i32> %86, <8 x i32>* %mask_ptr2
  %87 = add i32 %34, 1
  store i32 %87, i32* %loop_counter
  br label %loop_begin

loop_exit:                                        ; preds = %loop_begin
  %88 = getelementptr <8 x i32>, <8 x i32>* %mask_store, i32 0
  %mask = load <8 x i32>, <8 x i32>* %88
  %89 = getelementptr <8 x float>, <8 x float>* %color, i32 0
  %90 = getelementptr <8 x float>, <8 x float>* %color3, i32 0
  %91 = getelementptr <8 x float>, <8 x float>* %color4, i32 0
  %92 = getelementptr <8 x float>, <8 x float>* %color5, i32 0
  %93 = getelementptr <8 x float>, <8 x float>* %color6, i32 0
  %94 = getelementptr <8 x float>, <8 x float>* %color7, i32 0
  %95 = getelementptr <8 x float>, <8 x float>* %color8, i32 0
  %96 = getelementptr <8 x float>, <8 x float>* %color9, i32 0
  %97 = getelementptr <8 x i32>, <8 x i32>* %mask_store, i32 1
  %mask17 = load <8 x i32>, <8 x i32>* %97
  %98 = getelementptr <8 x float>, <8 x float>* %color, i32 1
  %99 = getelementptr <8 x float>, <8 x float>* %color3, i32 1
  %100 = getelementptr <8 x float>, <8 x float>* %color4, i32 1
  %101 = getelementptr <8 x float>, <8 x float>* %color5, i32 1
  %102 = getelementptr <8 x float>, <8 x float>* %color6, i32 1
  %103 = getelementptr <8 x float>, <8 x float>* %color7, i32 1
  %104 = getelementptr <8 x float>, <8 x float>* %color8, i32 1
  %105 = getelementptr <8 x float>, <8 x float>* %color9, i32 1
  %106 = getelementptr <16 x i8>*, <16 x i8>** %color_ptr_ptr, i32 0
  %color_ptr0 = load <16 x i8>*, <16 x i8>** %106
  %107 = getelementptr i32, i32* %stride_ptr, i32 0
  %108 = load i32, i32* %107
  %109 = load <8 x float>, <8 x float>* %92
  %110 = load <8 x float>, <8 x float>* %89
  %111 = shufflevector <8 x i32> %mask, <8 x i32> %mask, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %112 = shufflevector <8 x i32> %mask, <8 x i32> %mask, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %113 = shufflevector <8 x float> %109, <8 x float> %109, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %114 = shufflevector <8 x float> %109, <8 x float> %109, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %115 = load <8 x float>, <8 x float>* %101
  %116 = load <8 x float>, <8 x float>* %98
  %117 = shufflevector <8 x i32> %mask17, <8 x i32> %mask17, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %118 = shufflevector <8 x i32> %mask17, <8 x i32> %mask17, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %119 = shufflevector <8 x float> %115, <8 x float> %115, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %120 = shufflevector <8 x float> %115, <8 x float> %115, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %121 = bitcast <8 x float> %110 to <8 x i32>
  %122 = bitcast <8 x float> %116 to <8 x i32>
  %123 = shufflevector <8 x i32> %122, <8 x i32> %122, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %124 = shufflevector <8 x i32> %122, <8 x i32> %122, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %125 = shufflevector <8 x i32> %121, <8 x i32> %121, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %126 = shufflevector <8 x i32> %121, <8 x i32> %121, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %127 = bitcast <4 x i32> %126 to <2 x i64>
  %128 = bitcast <4 x i32> %125 to <2 x i64>
  %129 = shufflevector <2 x i64> %127, <2 x i64> %128, <2 x i32> <i32 0, i32 2>
  %130 = shufflevector <2 x i64> %127, <2 x i64> %128, <2 x i32> <i32 1, i32 3>
  %131 = bitcast <2 x i64> %129 to <4 x i32>
  %132 = bitcast <2 x i64> %130 to <4 x i32>
  %133 = bitcast <4 x i32> %124 to <2 x i64>
  %134 = bitcast <4 x i32> %123 to <2 x i64>
  %135 = shufflevector <2 x i64> %133, <2 x i64> %134, <2 x i32> <i32 0, i32 2>
  %136 = shufflevector <2 x i64> %133, <2 x i64> %134, <2 x i32> <i32 1, i32 3>
  %137 = bitcast <2 x i64> %135 to <4 x i32>
  %138 = bitcast <2 x i64> %136 to <4 x i32>
  %context.f_blend_color_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 9
  %context.f_blend_color = load float*, float** %context.f_blend_color_ptr
  %139 = bitcast float* %context.f_blend_color to <4 x i32>*
  %140 = getelementptr <4 x i32>, <4 x i32>* %139, i32 0
  %141 = load <4 x i32>, <4 x i32>* %140
  %142 = shufflevector <4 x i32> %141, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %143 = shufflevector <4 x i32> %141, <4 x i32> undef, <4 x i32> zeroinitializer
  %144 = bitcast <4 x i32> %111 to <2 x i64>
  %145 = bitcast <4 x i32> %112 to <2 x i64>
  %146 = shufflevector <2 x i64> %144, <2 x i64> %145, <2 x i32> <i32 0, i32 2>
  %147 = shufflevector <2 x i64> %144, <2 x i64> %145, <2 x i32> <i32 1, i32 3>
  %148 = bitcast <2 x i64> %146 to <4 x i32>
  %149 = bitcast <2 x i64> %147 to <4 x i32>
  %150 = bitcast <4 x i32> %117 to <2 x i64>
  %151 = bitcast <4 x i32> %118 to <2 x i64>
  %152 = shufflevector <2 x i64> %150, <2 x i64> %151, <2 x i32> <i32 0, i32 2>
  %153 = shufflevector <2 x i64> %150, <2 x i64> %151, <2 x i32> <i32 1, i32 3>
  %154 = bitcast <2 x i64> %152 to <4 x i32>
  %155 = bitcast <2 x i64> %153 to <4 x i32>
  %156 = bitcast <4 x float> %113 to <2 x i64>
  %157 = bitcast <4 x float> %114 to <2 x i64>
  %158 = shufflevector <2 x i64> %156, <2 x i64> %157, <2 x i32> <i32 0, i32 2>
  %159 = shufflevector <2 x i64> %156, <2 x i64> %157, <2 x i32> <i32 1, i32 3>
  %160 = bitcast <2 x i64> %158 to <4 x i32>
  %161 = bitcast <2 x i64> %159 to <4 x i32>
  %162 = bitcast <4 x float> %119 to <2 x i64>
  %163 = bitcast <4 x float> %120 to <2 x i64>
  %164 = shufflevector <2 x i64> %162, <2 x i64> %163, <2 x i32> <i32 0, i32 2>
  %165 = shufflevector <2 x i64> %162, <2 x i64> %163, <2 x i32> <i32 1, i32 3>
  %166 = bitcast <2 x i64> %164 to <4 x i32>
  %167 = bitcast <2 x i64> %165 to <4 x i32>
  %168 = shufflevector <4 x i32> %160, <4 x i32> %160, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %169 = shufflevector <4 x i32> %161, <4 x i32> %161, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %170 = shufflevector <4 x i32> %166, <4 x i32> %166, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %171 = shufflevector <4 x i32> %167, <4 x i32> %167, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %172 = mul i32 0, %108
  %173 = add i32 0, %172
  %174 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %173
  %175 = bitcast i8* %174 to <4 x i32>*
  %176 = load <4 x i32>, <4 x i32>* %175, align 16
  %177 = mul i32 1, %108
  %178 = add i32 0, %177
  %179 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %178
  %180 = bitcast i8* %179 to <4 x i32>*
  %181 = load <4 x i32>, <4 x i32>* %180, align 16
  %182 = mul i32 2, %108
  %183 = add i32 0, %182
  %184 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %183
  %185 = bitcast i8* %184 to <4 x i32>*
  %186 = load <4 x i32>, <4 x i32>* %185, align 16
  %187 = mul i32 3, %108
  %188 = add i32 0, %187
  %189 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %188
  %190 = bitcast i8* %189 to <4 x i32>*
  %191 = load <4 x i32>, <4 x i32>* %190, align 16
  %192 = bitcast <4 x i32> %148 to <16 x i8>
  %193 = bitcast <4 x i32> %131 to <16 x i8>
  %194 = bitcast <4 x i32> %176 to <16 x i8>
  %195 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %194, <16 x i8> %193, <16 x i8> %192) #2
  %196 = bitcast <16 x i8> %195 to <4 x i32>
  %197 = bitcast <4 x i32> %149 to <16 x i8>
  %198 = bitcast <4 x i32> %132 to <16 x i8>
  %199 = bitcast <4 x i32> %181 to <16 x i8>
  %200 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %199, <16 x i8> %198, <16 x i8> %197) #2
  %201 = bitcast <16 x i8> %200 to <4 x i32>
  %202 = bitcast <4 x i32> %154 to <16 x i8>
  %203 = bitcast <4 x i32> %137 to <16 x i8>
  %204 = bitcast <4 x i32> %186 to <16 x i8>
  %205 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %204, <16 x i8> %203, <16 x i8> %202) #2
  %206 = bitcast <16 x i8> %205 to <4 x i32>
  %207 = bitcast <4 x i32> %155 to <16 x i8>
  %208 = bitcast <4 x i32> %138 to <16 x i8>
  %209 = bitcast <4 x i32> %191 to <16 x i8>
  %210 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %209, <16 x i8> %208, <16 x i8> %207) #2
  %211 = bitcast <16 x i8> %210 to <4 x i32>
  %212 = mul i32 0, %108
  %213 = add i32 0, %212
  %214 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %213
  %215 = bitcast i8* %214 to <4 x i32>*
  store <4 x i32> %196, <4 x i32>* %215, align 16
  %216 = mul i32 1, %108
  %217 = add i32 0, %216
  %218 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %217
  %219 = bitcast i8* %218 to <4 x i32>*
  store <4 x i32> %201, <4 x i32>* %219, align 16
  %220 = mul i32 2, %108
  %221 = add i32 0, %220
  %222 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %221
  %223 = bitcast i8* %222 to <4 x i32>*
  store <4 x i32> %206, <4 x i32>* %223, align 16
  %224 = mul i32 3, %108
  %225 = add i32 0, %224
  %226 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %225
  %227 = bitcast i8* %226 to <4 x i32>*
  store <4 x i32> %211, <4 x i32>* %227, align 16
  %228 = getelementptr <16 x i8>*, <16 x i8>** %color_ptr_ptr, i32 1
  %color_ptr1 = load <16 x i8>*, <16 x i8>** %228
  %229 = getelementptr i32, i32* %stride_ptr, i32 1
  %230 = load i32, i32* %229
  %231 = load <8 x float>, <8 x float>* %96
  %232 = load <8 x float>, <8 x float>* %93
  %233 = shufflevector <8 x i32> %mask, <8 x i32> %mask, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %234 = shufflevector <8 x i32> %mask, <8 x i32> %mask, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %235 = shufflevector <8 x float> %231, <8 x float> %231, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %236 = shufflevector <8 x float> %231, <8 x float> %231, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %237 = load <8 x float>, <8 x float>* %105
  %238 = load <8 x float>, <8 x float>* %102
  %239 = shufflevector <8 x i32> %mask17, <8 x i32> %mask17, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %240 = shufflevector <8 x i32> %mask17, <8 x i32> %mask17, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %241 = shufflevector <8 x float> %237, <8 x float> %237, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %242 = shufflevector <8 x float> %237, <8 x float> %237, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %243 = bitcast <8 x float> %232 to <8 x i32>
  %244 = bitcast <8 x float> %238 to <8 x i32>
  %245 = shufflevector <8 x i32> %244, <8 x i32> %244, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %246 = shufflevector <8 x i32> %244, <8 x i32> %244, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %247 = shufflevector <8 x i32> %243, <8 x i32> %243, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %248 = shufflevector <8 x i32> %243, <8 x i32> %243, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %249 = bitcast <4 x i32> %248 to <2 x i64>
  %250 = bitcast <4 x i32> %247 to <2 x i64>
  %251 = shufflevector <2 x i64> %249, <2 x i64> %250, <2 x i32> <i32 0, i32 2>
  %252 = shufflevector <2 x i64> %249, <2 x i64> %250, <2 x i32> <i32 1, i32 3>
  %253 = bitcast <2 x i64> %251 to <4 x i32>
  %254 = bitcast <2 x i64> %252 to <4 x i32>
  %255 = bitcast <4 x i32> %246 to <2 x i64>
  %256 = bitcast <4 x i32> %245 to <2 x i64>
  %257 = shufflevector <2 x i64> %255, <2 x i64> %256, <2 x i32> <i32 0, i32 2>
  %258 = shufflevector <2 x i64> %255, <2 x i64> %256, <2 x i32> <i32 1, i32 3>
  %259 = bitcast <2 x i64> %257 to <4 x i32>
  %260 = bitcast <2 x i64> %258 to <4 x i32>
  %context.f_blend_color_ptr18 = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 9
  %context.f_blend_color19 = load float*, float** %context.f_blend_color_ptr18
  %261 = bitcast float* %context.f_blend_color19 to <4 x i32>*
  %262 = getelementptr <4 x i32>, <4 x i32>* %261, i32 0
  %263 = load <4 x i32>, <4 x i32>* %262
  %264 = shufflevector <4 x i32> %263, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %265 = shufflevector <4 x i32> %263, <4 x i32> undef, <4 x i32> zeroinitializer
  %266 = bitcast <4 x i32> %233 to <2 x i64>
  %267 = bitcast <4 x i32> %234 to <2 x i64>
  %268 = shufflevector <2 x i64> %266, <2 x i64> %267, <2 x i32> <i32 0, i32 2>
  %269 = shufflevector <2 x i64> %266, <2 x i64> %267, <2 x i32> <i32 1, i32 3>
  %270 = bitcast <2 x i64> %268 to <4 x i32>
  %271 = bitcast <2 x i64> %269 to <4 x i32>
  %272 = bitcast <4 x i32> %239 to <2 x i64>
  %273 = bitcast <4 x i32> %240 to <2 x i64>
  %274 = shufflevector <2 x i64> %272, <2 x i64> %273, <2 x i32> <i32 0, i32 2>
  %275 = shufflevector <2 x i64> %272, <2 x i64> %273, <2 x i32> <i32 1, i32 3>
  %276 = bitcast <2 x i64> %274 to <4 x i32>
  %277 = bitcast <2 x i64> %275 to <4 x i32>
  %278 = bitcast <4 x float> %235 to <2 x i64>
  %279 = bitcast <4 x float> %236 to <2 x i64>
  %280 = shufflevector <2 x i64> %278, <2 x i64> %279, <2 x i32> <i32 0, i32 2>
  %281 = shufflevector <2 x i64> %278, <2 x i64> %279, <2 x i32> <i32 1, i32 3>
  %282 = bitcast <2 x i64> %280 to <4 x i32>
  %283 = bitcast <2 x i64> %281 to <4 x i32>
  %284 = bitcast <4 x float> %241 to <2 x i64>
  %285 = bitcast <4 x float> %242 to <2 x i64>
  %286 = shufflevector <2 x i64> %284, <2 x i64> %285, <2 x i32> <i32 0, i32 2>
  %287 = shufflevector <2 x i64> %284, <2 x i64> %285, <2 x i32> <i32 1, i32 3>
  %288 = bitcast <2 x i64> %286 to <4 x i32>
  %289 = bitcast <2 x i64> %287 to <4 x i32>
  %290 = shufflevector <4 x i32> %282, <4 x i32> %282, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %291 = shufflevector <4 x i32> %283, <4 x i32> %283, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %292 = shufflevector <4 x i32> %288, <4 x i32> %288, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %293 = shufflevector <4 x i32> %289, <4 x i32> %289, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %294 = mul i32 0, %230
  %295 = add i32 0, %294
  %296 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %295
  %297 = bitcast i8* %296 to <4 x i32>*
  %298 = load <4 x i32>, <4 x i32>* %297, align 16
  %299 = mul i32 1, %230
  %300 = add i32 0, %299
  %301 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %300
  %302 = bitcast i8* %301 to <4 x i32>*
  %303 = load <4 x i32>, <4 x i32>* %302, align 16
  %304 = mul i32 2, %230
  %305 = add i32 0, %304
  %306 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %305
  %307 = bitcast i8* %306 to <4 x i32>*
  %308 = load <4 x i32>, <4 x i32>* %307, align 16
  %309 = mul i32 3, %230
  %310 = add i32 0, %309
  %311 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %310
  %312 = bitcast i8* %311 to <4 x i32>*
  %313 = load <4 x i32>, <4 x i32>* %312, align 16
  %314 = bitcast <4 x i32> %270 to <16 x i8>
  %315 = bitcast <4 x i32> %253 to <16 x i8>
  %316 = bitcast <4 x i32> %298 to <16 x i8>
  %317 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %316, <16 x i8> %315, <16 x i8> %314) #2
  %318 = bitcast <16 x i8> %317 to <4 x i32>
  %319 = bitcast <4 x i32> %271 to <16 x i8>
  %320 = bitcast <4 x i32> %254 to <16 x i8>
  %321 = bitcast <4 x i32> %303 to <16 x i8>
  %322 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %321, <16 x i8> %320, <16 x i8> %319) #2
  %323 = bitcast <16 x i8> %322 to <4 x i32>
  %324 = bitcast <4 x i32> %276 to <16 x i8>
  %325 = bitcast <4 x i32> %259 to <16 x i8>
  %326 = bitcast <4 x i32> %308 to <16 x i8>
  %327 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %326, <16 x i8> %325, <16 x i8> %324) #2
  %328 = bitcast <16 x i8> %327 to <4 x i32>
  %329 = bitcast <4 x i32> %277 to <16 x i8>
  %330 = bitcast <4 x i32> %260 to <16 x i8>
  %331 = bitcast <4 x i32> %313 to <16 x i8>
  %332 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %331, <16 x i8> %330, <16 x i8> %329) #2
  %333 = bitcast <16 x i8> %332 to <4 x i32>
  %334 = mul i32 0, %230
  %335 = add i32 0, %334
  %336 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %335
  %337 = bitcast i8* %336 to <4 x i32>*
  store <4 x i32> %318, <4 x i32>* %337, align 16
  %338 = mul i32 1, %230
  %339 = add i32 0, %338
  %340 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %339
  %341 = bitcast i8* %340 to <4 x i32>*
  store <4 x i32> %323, <4 x i32>* %341, align 16
  %342 = mul i32 2, %230
  %343 = add i32 0, %342
  %344 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %343
  %345 = bitcast i8* %344 to <4 x i32>*
  store <4 x i32> %328, <4 x i32>* %345, align 16
  %346 = mul i32 3, %230
  %347 = add i32 0, %346
  %348 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %347
  %349 = bitcast i8* %348 to <4 x i32>*
  store <4 x i32> %333, <4 x i32>* %349, align 16
  ret void
}

llc -mattr option(s): +sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,+xsavec,+popcnt,+aes,-avx512bitalg,-movdiri,+xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-movdir64b,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,+invpcid,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3
llc -mcpu option: skylake

; setup_variant_0: reads two <4 x float> slots (index 0 and 1) from each of
; %in_v0, %in_v1 and %in_v2 and writes two slots to each of %out_a0,
; %out_dadx and %out_dady.
;   slot 0: the outputs are computed from the differences between the three
;           slot-0 inputs, scaled by the reciprocal of a scalar built from
;           lanes 0 and 1 of those differences (see below).
;   slot 1: %out_a0[1] receives %in_v2[1] unchanged; %out_dadx[1] and
;           %out_dady[1] are set to zero.
; %in_facing is not referenced anywhere in the body.
; NOTE(review): the value names suggest lanes 0/1 are x/y and that slot 0 is
; the position attribute of a triangle setup — not established by this dump.
define void @setup_variant_0(<4 x float>* noalias %in_v0, <4 x float>* noalias %in_v1, <4 x float>* noalias %in_v2, i32 %in_facing, <4 x float>* noalias %out_a0, <4 x float>* noalias %out_dadx, <4 x float>* noalias %out_dady) {
entry:
  ; Load slot 0 of each of the three inputs.
  %0 = getelementptr <4 x float>, <4 x float>* %in_v0, i32 0
  %v0a = load <4 x float>, <4 x float>* %0
  %1 = getelementptr <4 x float>, <4 x float>* %in_v1, i32 0
  %v1a = load <4 x float>, <4 x float>* %1
  %2 = getelementptr <4 x float>, <4 x float>* %in_v2, i32 0
  %v2a = load <4 x float>, <4 x float>* %2
  ; %xy0_center = %v0a - 0.5 in every lane.
  %xy0_center = fsub <4 x float> %v0a, <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
  %dxy01 = fsub <4 x float> %v0a, %v1a
  %dxy20 = fsub <4 x float> %v2a, %v0a
  ; Swap lanes 0 and 1 of %dxy20 (lanes 2 and 3 are undef) so that
  ; %ef[0] = dxy01[0] * dxy20[1] and %ef[1] = dxy01[1] * dxy20[0].
  %3 = shufflevector <4 x float> %dxy20, <4 x float> %dxy20, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
  %ef = fmul <4 x float> %dxy01, %3
  %4 = extractelement <4 x float> %ef, i32 0
  %5 = extractelement <4 x float> %ef, i32 1
  %6 = fsub float %4, %5
  ; %ooa = 1.0 / (%ef[0] - %ef[1]); no check for a zero divisor is visible here.
  %ooa = fdiv float 1.000000e+00, %6
  ; Broadcast %ooa to all four lanes.
  %7 = insertelement <4 x float> undef, float %ooa, i32 0
  %8 = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> zeroinitializer
  %9 = fmul <4 x float> %dxy20, %8
  %10 = fmul <4 x float> %dxy01, %8
  ; Broadcasts: %11/%12 replicate lane 1 of %9/%10, %13/%14 replicate lane 0
  ; of %9/%10, and %15/%16 replicate lane 0/lane 1 of %xy0_center.
  %11 = shufflevector <4 x float> %9, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %12 = shufflevector <4 x float> %10, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %13 = shufflevector <4 x float> %9, <4 x float> undef, <4 x i32> zeroinitializer
  %14 = shufflevector <4 x float> %10, <4 x float> undef, <4 x i32> zeroinitializer
  %15 = shufflevector <4 x float> %xy0_center, <4 x float> undef, <4 x i32> zeroinitializer
  %16 = shufflevector <4 x float> %xy0_center, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ; %da01 and %da20 are computed from the same operands as %dxy01 and %dxy20.
  %da01 = fsub <4 x float> %v0a, %v1a
  %da20 = fsub <4 x float> %v2a, %v0a
  ; %dadx = da01 * %11 - da20 * %12
  %da01_dy20_ooa = fmul <4 x float> %da01, %11
  %da20_dy01_ooa = fmul <4 x float> %da20, %12
  %dadx = fsub <4 x float> %da01_dy20_ooa, %da20_dy01_ooa
  ; %dady = da20 * %14 - da01 * %13
  %da01_dx20_ooa = fmul <4 x float> %da01, %13
  %da20_dx01_ooa = fmul <4 x float> %da20, %14
  %dady = fsub <4 x float> %da20_dx01_ooa, %da01_dx20_ooa
  ; %attr_0 = %v0a - (%dadx * %15 + %dady * %16)
  %dadx_x0 = fmul <4 x float> %dadx, %15
  %dady_y0 = fmul <4 x float> %dady, %16
  %attr_v0 = fadd <4 x float> %dadx_x0, %dady_y0
  %attr_0 = fsub <4 x float> %v0a, %attr_v0
  ; Store the slot-0 results.
  %17 = getelementptr <4 x float>, <4 x float>* %out_a0, i32 0
  store <4 x float> %attr_0, <4 x float>* %17
  %18 = getelementptr <4 x float>, <4 x float>* %out_dadx, i32 0
  store <4 x float> %dadx, <4 x float>* %18
  %19 = getelementptr <4 x float>, <4 x float>* %out_dady, i32 0
  store <4 x float> %dady, <4 x float>* %19
  ; Slot 1: all three inputs are loaded, but only %v2a3 (from %in_v2) is used below.
  %20 = getelementptr <4 x float>, <4 x float>* %in_v0, i32 1
  %v0a1 = load <4 x float>, <4 x float>* %20
  %21 = getelementptr <4 x float>, <4 x float>* %in_v1, i32 1
  %v1a2 = load <4 x float>, <4 x float>* %21
  %22 = getelementptr <4 x float>, <4 x float>* %in_v2, i32 1
  %v2a3 = load <4 x float>, <4 x float>* %22
  ; %out_a0[1] gets %in_v2[1] as-is; both slot-1 derivative outputs are zero.
  %23 = getelementptr <4 x float>, <4 x float>* %out_a0, i32 1
  store <4 x float> %v2a3, <4 x float>* %23
  %24 = getelementptr <4 x float>, <4 x float>* %out_dadx, i32 1
  store <4 x float> zeroinitializer, <4 x float>* %24
  %25 = getelementptr <4 x float>, <4 x float>* %out_dady, i32 1
  store <4 x float> zeroinitializer, <4 x float>* %25
  ret void
}

llc -mattr option(s): +sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,+xsavec,+popcnt,+aes,-avx512bitalg,-movdiri,+xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-movdir64b,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,+invpcid,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3
llc -mcpu option: skylake
VERT
PROPERTY NEXT_SHADER FRAG
DCL IN[0]
DCL IN[1]
DCL IN[2]
DCL OUT[0], POSITION
DCL OUT[1].x, PSIZE
DCL OUT[2].xy, GENERIC[9]
DCL TEMP[0..1], LOCAL
IMM[0] FLT32 {    1.0000,     0.0000,     0.0000,     0.0000}
  0: UMUL TEMP[0].x, IN[1].xxxx, IN[2].xxxx
  1: UMUL_HI TEMP[1].x, IN[1].xxxx, IN[2].xxxx
  2: MOV OUT[0], IN[0]
  3: MOV OUT[2].y, TEMP[1].xxxx
  4: MOV OUT[1].x, IMM[0].xxxx
  5: MOV OUT[2].x, TEMP[0].xxxx
  6: END
clamp_vertex_color = 0
clip_xy = 1
clip_z = 1
clip_user = 0
bypass_viewport = 0
clip_halfz = 0
need_edgeflags = 0
has_gs = 0
ucp_enable = 0
vertex_element[0].src_offset = 0
vertex_element[0].instance_divisor = 0
vertex_element[0].vertex_buffer_index = 0
vertex_element[0].src_format = PIPE_FORMAT_R32G32_FLOAT
vertex_element[1].src_offset = 0
vertex_element[1].instance_divisor = 0
vertex_element[1].vertex_buffer_index = 1
vertex_element[1].src_format = PIPE_FORMAT_R32_UINT
vertex_element[2].src_offset = 400
vertex_element[2].instance_divisor = 0
vertex_element[2].vertex_buffer_index = 1
vertex_element[2].src_format = PIPE_FORMAT_R32_UINT

; Function Attrs: nounwind readnone speculatable
declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #0

; Function Attrs: nounwind readonly
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) #1
llvm (version 0x800) found no intrinsic for llvm.x86.avx2.pmulu.dq, going to crash...

Comment 1 Roland Scheidegger 2019-08-28 19:00:50 UTC

It looks like LLVM 7.0+ got rid of the umul intrinsic (llvm.x86.avx2.pmulu.dq in the log above); the optimized umul code was once used by draw, but nowadays it's only used for these 32x32->64-bit muls.
The code needs to be updated to follow LLVM's autoupgrade pattern instead.
(For unsigned mul that would be bitcast/and/mul/shuffle, for signed it's bitcast/shl/ashr/mul/shuffle.)

Comment 2 Roland Scheidegger 2019-08-29 14:58:33 UTC

Fixed by 332b21db55e6e6ec777b940f1b95843010d22157

Use of freedesktop.org services, including Bugzilla, is subject to our Code of Conduct. How we collect and use information is described in our Privacy Policy.