| Summary: |
dEQP-GLES31.functional.shaders.builtin_functions.integer.umulextended.uint_highp_vertex fails with bad intrinsic |
| Product: |
Mesa
|
Reporter: |
Dave Airlie <airlied> |
| Component: |
Drivers/Gallium/llvmpipe | Assignee: |
mesa-dev |
| Status: |
RESOLVED
FIXED
|
QA Contact: |
mesa-dev |
| Severity: |
not set
|
|
|
| Priority: |
not set
|
|
|
| Version: |
unspecified | |
|
| Hardware: |
Other | |
|
| OS: |
All | |
|
| Whiteboard: |
|
|
i915 platform:
|
|
i915 features:
|
|
Use of freedesktop.org services, including Bugzilla, is subject to our Code of Conduct. How we collect and use information is described in our Privacy Policy.
Debug output below: llvm (version 0x800) found no intrinsic for llvm.x86.avx2.pmulu.dq, going to crash... This occurs on a Skylake CPU. llvmpipe: Fragment shader #131 variant #0: FRAG DCL IN[0].xy, GENERIC[9], CONSTANT DCL OUT[0], COLOR DCL OUT[1], COLOR[1] 0: MOV OUT[1].x, IN[0].xxxx 1: MOV OUT[0].x, IN[0].yyyy 2: END fs variant 0x1f0f0bc: cbuf_format[0] = PIPE_FORMAT_R32_UINT cbuf_format[1] = PIPE_FORMAT_R32_UINT blend.colormask = 0x1 variant->opaque = 0 ; ModuleID = 'fs131_variant0' source_filename = "fs131_variant0" target datalayout = "e-p:64:64:64-i64:64:64-a0:0:64-s0:64:64" ; Function Attrs: nounwind readnone speculatable declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>) #0 ; Function Attrs: nounwind readnone declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) #1 ; Function Attrs: nounwind readnone declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) #1 define void @fs131_variant0_partial({ [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* noalias %context, i32 %x, i32 %y, i32, float* noalias %a0, float* noalias %dadx, float* noalias %dady, <16 x i8>** noalias %color_ptr_ptr, i8* noalias %depth, i32 %mask_input, { { [2048 x i32], [128 x i64] }*, i64, i64, i32 }* noalias %thread_data, i32* noalias %stride_ptr, i32 %depth_stride) { entry: %output16 = alloca <8 x float> %output15 = alloca <8 x float> %output14 = alloca <8 x float> %output13 = alloca <8 x float> %output12 = alloca <8 x float> %output11 = alloca <8 x float> %output10 = alloca <8 x float> %output = alloca <8 x float> %looplimiter = alloca i32 %execution_mask = alloca <8 x i32> %color9 = alloca <8 x float>, i32 2 %color8 = alloca <8 x float>, i32 2 %color7 = alloca <8 x float>, i32 2 %color6 = alloca <8 x float>, i32 2 %color5 = 
alloca <8 x float>, i32 2 %color4 = alloca <8 x float>, i32 2 %color3 = alloca <8 x float>, i32 2 %color = alloca <8 x float>, i32 2 %loop_counter = alloca i32 %1 = alloca <8 x float>, i32 2 %2 = alloca <8 x float>, i32 2 %mask_store = alloca <8 x i32>, i32 2 %thread_data.invocs_ptr = getelementptr { { [2048 x i32], [128 x i64] }*, i64, i64, i32 }, { { [2048 x i32], [128 x i64] }*, i64, i64, i32 }* %thread_data, i32 0, i32 2 %3 = load i64, i64* %thread_data.invocs_ptr %invoc_count = add i64 %3, 1 store i64 %invoc_count, i64* %thread_data.invocs_ptr %4 = sitofp i32 %x to float %5 = sitofp i32 %y to float %6 = getelementptr <8 x float>, <8 x float>* %2, i32 0 store <8 x float> <float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 2.000000e+00, float 3.000000e+00>, <8 x float>* %6 %7 = getelementptr <8 x float>, <8 x float>* %1, i32 0 store <8 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, <8 x float>* %7 %8 = getelementptr <8 x float>, <8 x float>* %2, i32 1 store <8 x float> <float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 2.000000e+00, float 3.000000e+00>, <8 x float>* %8 %9 = getelementptr <8 x float>, <8 x float>* %1, i32 1 store <8 x float> <float 2.000000e+00, float 2.000000e+00, float 3.000000e+00, float 3.000000e+00, float 2.000000e+00, float 2.000000e+00, float 3.000000e+00, float 3.000000e+00>, <8 x float>* %9 %10 = getelementptr float, float* %dadx, i32 0 %11 = bitcast float* %10 to <4 x float>* %pos.x.dadxaos = load <4 x float>, <4 x float>* %11 %12 = getelementptr float, float* %dady, i32 0 %13 = bitcast float* %12 to <4 x float>* %pos.x.dadyaos = load <4 x float>, <4 x float>* %13 %14 = getelementptr float, float* %a0, i32 0 %15 = bitcast float* %14 to <4 x float>* %pos.x.a0aos = 
load <4 x float>, <4 x float>* %15 %16 = getelementptr float, float* %a0, i32 4 %17 = bitcast float* %16 to <4 x float>* %input0.x.a0aos = load <4 x float>, <4 x float>* %17 %mask_ptr = getelementptr <8 x i32>, <8 x i32>* %mask_store, i32 0 %18 = lshr i32 %mask_input, 0 %19 = insertelement <8 x i32> undef, i32 %18, i32 0 %20 = shufflevector <8 x i32> %19, <8 x i32> undef, <8 x i32> zeroinitializer %21 = and <8 x i32> %20, <i32 1, i32 2, i32 16, i32 32, i32 4, i32 8, i32 64, i32 128> %22 = icmp eq <8 x i32> %21, <i32 1, i32 2, i32 16, i32 32, i32 4, i32 8, i32 64, i32 128> %23 = sext <8 x i1> %22 to <8 x i32> store <8 x i32> %23, <8 x i32>* %mask_ptr %mask_ptr1 = getelementptr <8 x i32>, <8 x i32>* %mask_store, i32 1 %24 = lshr i32 %mask_input, 8 %25 = insertelement <8 x i32> undef, i32 %24, i32 0 %26 = shufflevector <8 x i32> %25, <8 x i32> undef, <8 x i32> zeroinitializer %27 = and <8 x i32> %26, <i32 1, i32 2, i32 16, i32 32, i32 4, i32 8, i32 64, i32 128> %28 = icmp eq <8 x i32> %27, <i32 1, i32 2, i32 16, i32 32, i32 4, i32 8, i32 64, i32 128> %29 = sext <8 x i1> %28 to <8 x i32> store <8 x i32> %29, <8 x i32>* %mask_ptr1 %context.stencil_ref_front_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 6 %context.stencil_ref_front = load i32, i32* %context.stencil_ref_front_ptr %context.stencil_ref_back_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, 
float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 7 %context.stencil_ref_back = load i32, i32* %context.stencil_ref_back_ptr %30 = insertelement <8 x i32> undef, i32 %context.stencil_ref_front, i32 0 %31 = shufflevector <8 x i32> %30, <8 x i32> undef, <8 x i32> zeroinitializer %32 = insertelement <8 x i32> undef, i32 %context.stencil_ref_back, i32 0 %33 = shufflevector <8 x i32> %32, <8 x i32> undef, <8 x i32> zeroinitializer %context.constants_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 0 %context.num_constants_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* 
%context, i32 0, i32 1 %context.ssbos_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 11 %context.num_ssbos_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 12 store i32 0, i32* %loop_counter store i32 0, i32* %loop_counter br label %loop_begin loop_begin: ; preds = %skip, %entry %34 = load i32, i32* %loop_counter %35 = icmp ult i32 %34, 2 br i1 %35, label %loop_body, label %loop_exit loop_body: ; preds = %loop_begin %mask_ptr2 = getelementptr <8 x i32>, <8 x i32>* %mask_store, i32 %34 %36 = load <8 x i32>, <8 x i32>* %mask_ptr2 store <8 x i32> zeroinitializer, <8 x i32>* %execution_mask store <8 x i32> %36, <8 x i32>* %execution_mask %37 = load <8 x i32>, <8 x i32>* %execution_mask %38 = bitcast <8 x i32> %37 to i256 %39 = icmp eq i256 %38, 0 br i1 %39, label %skip, label %40 ; <label>:40: ; preds = %loop_body %41 = getelementptr <8 x float>, <8 x float>* %2, i32 %34 %42 = load <8 x float>, <8 x float>* %41 %43 = 
getelementptr <8 x float>, <8 x float>* %1, i32 %34 %44 = load <8 x float>, <8 x float>* %43 %45 = insertelement <8 x float> undef, float %4, i32 0 %46 = shufflevector <8 x float> %45, <8 x float> undef, <8 x i32> zeroinitializer %47 = fadd <8 x float> %42, %46 %48 = insertelement <8 x float> undef, float %5, i32 0 %49 = shufflevector <8 x float> %48, <8 x float> undef, <8 x i32> zeroinitializer %50 = fadd <8 x float> %44, %49 %51 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <8 x float> %47, <8 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>) #2 %52 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> zeroinitializer, <8 x float> %50, <8 x float> %51) #2 %53 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> zeroinitializer, <8 x float> %47, <8 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>) #2 %54 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <8 x float> %50, <8 x float> %53) #2 %55 = shufflevector <4 x float> %pos.x.dadxaos, <4 x float> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> %56 = shufflevector <4 x float> %pos.x.dadyaos, <4 x float> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> %57 = shufflevector <4 x float> %pos.x.a0aos, <4 x float> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> %58 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %55, <8 x float> %47, <8 x float> %57) #2 %59 = call <8 x float> 
@llvm.fmuladd.v8f32(<8 x float> %56, <8 x float> %50, <8 x float> %58) #2 %60 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %59, <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>) #2 %61 = shufflevector <4 x float> %pos.x.dadxaos, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> %62 = shufflevector <4 x float> %pos.x.dadyaos, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> %63 = shufflevector <4 x float> %pos.x.a0aos, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> %64 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %61, <8 x float> %47, <8 x float> %63) #2 %65 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %62, <8 x float> %50, <8 x float> %64) #2 %66 = getelementptr <8 x float>, <8 x float>* %2, i32 %34 %67 = load <8 x float>, <8 x float>* %66 %68 = getelementptr <8 x float>, <8 x float>* %1, i32 %34 %69 = load <8 x float>, <8 x float>* %68 %70 = insertelement <8 x float> undef, float %4, i32 0 %71 = shufflevector <8 x float> %70, <8 x float> undef, <8 x i32> zeroinitializer %72 = fadd <8 x float> %67, %71 %73 = insertelement <8 x float> undef, float %5, i32 0 %74 = shufflevector <8 x float> %73, <8 x float> undef, <8 x i32> zeroinitializer %75 = fadd <8 x float> %69, %74 %76 = shufflevector <4 x float> %input0.x.a0aos, <4 x float> undef, <8 x i32> zeroinitializer %77 = shufflevector <4 x float> %input0.x.a0aos, <4 x float> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> store i32 0, i32* %looplimiter store i32 65535, i32* %looplimiter store <8 x float> zeroinitializer, <8 x float>* %output store <8 x float> zeroinitializer, <8 x float>* %output10 store <8 x float> zeroinitializer, <8 x float>* %output11 store <8 x float> zeroinitializer, <8 x float>* %output12 store <8 x float> 
zeroinitializer, <8 x float>* %output13 store <8 x float> zeroinitializer, <8 x float>* %output14 store <8 x float> zeroinitializer, <8 x float>* %output15 store <8 x float> zeroinitializer, <8 x float>* %output16 store <8 x float> %76, <8 x float>* %output13 store <8 x float> %77, <8 x float>* %output %color0.r = load <8 x float>, <8 x float>* %output %78 = getelementptr <8 x float>, <8 x float>* %color, i32 %34 store <8 x float> %color0.r, <8 x float>* %78 %color0.g = load <8 x float>, <8 x float>* %output10 %79 = getelementptr <8 x float>, <8 x float>* %color3, i32 %34 store <8 x float> %color0.g, <8 x float>* %79 %color0.b = load <8 x float>, <8 x float>* %output11 %80 = getelementptr <8 x float>, <8 x float>* %color4, i32 %34 store <8 x float> %color0.b, <8 x float>* %80 %color0.a = load <8 x float>, <8 x float>* %output12 %81 = getelementptr <8 x float>, <8 x float>* %color5, i32 %34 store <8 x float> %color0.a, <8 x float>* %81 %color1.r = load <8 x float>, <8 x float>* %output13 %82 = getelementptr <8 x float>, <8 x float>* %color6, i32 %34 store <8 x float> %color1.r, <8 x float>* %82 %color1.g = load <8 x float>, <8 x float>* %output14 %83 = getelementptr <8 x float>, <8 x float>* %color7, i32 %34 store <8 x float> %color1.g, <8 x float>* %83 %color1.b = load <8 x float>, <8 x float>* %output15 %84 = getelementptr <8 x float>, <8 x float>* %color8, i32 %34 store <8 x float> %color1.b, <8 x float>* %84 %color1.a = load <8 x float>, <8 x float>* %output16 %85 = getelementptr <8 x float>, <8 x float>* %color9, i32 %34 store <8 x float> %color1.a, <8 x float>* %85 br label %skip skip: ; preds = %40, %loop_body %86 = load <8 x i32>, <8 x i32>* %execution_mask store <8 x i32> %86, <8 x i32>* %mask_ptr2 %87 = add i32 %34, 1 store i32 %87, i32* %loop_counter br label %loop_begin loop_exit: ; preds = %loop_begin %88 = getelementptr <8 x i32>, <8 x i32>* %mask_store, i32 0 %mask = load <8 x i32>, <8 x i32>* %88 %89 = getelementptr <8 x float>, <8 x float>* %color, 
i32 0 %90 = getelementptr <8 x float>, <8 x float>* %color3, i32 0 %91 = getelementptr <8 x float>, <8 x float>* %color4, i32 0 %92 = getelementptr <8 x float>, <8 x float>* %color5, i32 0 %93 = getelementptr <8 x float>, <8 x float>* %color6, i32 0 %94 = getelementptr <8 x float>, <8 x float>* %color7, i32 0 %95 = getelementptr <8 x float>, <8 x float>* %color8, i32 0 %96 = getelementptr <8 x float>, <8 x float>* %color9, i32 0 %97 = getelementptr <8 x i32>, <8 x i32>* %mask_store, i32 1 %mask17 = load <8 x i32>, <8 x i32>* %97 %98 = getelementptr <8 x float>, <8 x float>* %color, i32 1 %99 = getelementptr <8 x float>, <8 x float>* %color3, i32 1 %100 = getelementptr <8 x float>, <8 x float>* %color4, i32 1 %101 = getelementptr <8 x float>, <8 x float>* %color5, i32 1 %102 = getelementptr <8 x float>, <8 x float>* %color6, i32 1 %103 = getelementptr <8 x float>, <8 x float>* %color7, i32 1 %104 = getelementptr <8 x float>, <8 x float>* %color8, i32 1 %105 = getelementptr <8 x float>, <8 x float>* %color9, i32 1 %106 = getelementptr <16 x i8>*, <16 x i8>** %color_ptr_ptr, i32 0 %color_ptr0 = load <16 x i8>*, <16 x i8>** %106 %107 = getelementptr i32, i32* %stride_ptr, i32 0 %108 = load i32, i32* %107 %109 = load <8 x float>, <8 x float>* %92 %110 = load <8 x float>, <8 x float>* %89 %111 = shufflevector <8 x i32> %mask, <8 x i32> %mask, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %112 = shufflevector <8 x i32> %mask, <8 x i32> %mask, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %113 = shufflevector <8 x float> %109, <8 x float> %109, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %114 = shufflevector <8 x float> %109, <8 x float> %109, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %115 = load <8 x float>, <8 x float>* %101 %116 = load <8 x float>, <8 x float>* %98 %117 = shufflevector <8 x i32> %mask17, <8 x i32> %mask17, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %118 = shufflevector <8 x i32> %mask17, <8 x i32> %mask17, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %119 = shufflevector <8 x float> 
%115, <8 x float> %115, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %120 = shufflevector <8 x float> %115, <8 x float> %115, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %121 = bitcast <8 x float> %110 to <8 x i32> %122 = bitcast <8 x float> %116 to <8 x i32> %123 = shufflevector <8 x i32> %122, <8 x i32> %122, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %124 = shufflevector <8 x i32> %122, <8 x i32> %122, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %125 = shufflevector <8 x i32> %121, <8 x i32> %121, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %126 = shufflevector <8 x i32> %121, <8 x i32> %121, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %127 = bitcast <4 x i32> %126 to <2 x i64> %128 = bitcast <4 x i32> %125 to <2 x i64> %129 = shufflevector <2 x i64> %127, <2 x i64> %128, <2 x i32> <i32 0, i32 2> %130 = shufflevector <2 x i64> %127, <2 x i64> %128, <2 x i32> <i32 1, i32 3> %131 = bitcast <2 x i64> %129 to <4 x i32> %132 = bitcast <2 x i64> %130 to <4 x i32> %133 = bitcast <4 x i32> %124 to <2 x i64> %134 = bitcast <4 x i32> %123 to <2 x i64> %135 = shufflevector <2 x i64> %133, <2 x i64> %134, <2 x i32> <i32 0, i32 2> %136 = shufflevector <2 x i64> %133, <2 x i64> %134, <2 x i32> <i32 1, i32 3> %137 = bitcast <2 x i64> %135 to <4 x i32> %138 = bitcast <2 x i64> %136 to <4 x i32> %context.f_blend_color_ptr = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 9 %context.f_blend_color = load float*, float** %context.f_blend_color_ptr %139 = bitcast float* %context.f_blend_color to 
<4 x i32>* %140 = getelementptr <4 x i32>, <4 x i32>* %139, i32 0 %141 = load <4 x i32>, <4 x i32>* %140 %142 = shufflevector <4 x i32> %141, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %143 = shufflevector <4 x i32> %141, <4 x i32> undef, <4 x i32> zeroinitializer %144 = bitcast <4 x i32> %111 to <2 x i64> %145 = bitcast <4 x i32> %112 to <2 x i64> %146 = shufflevector <2 x i64> %144, <2 x i64> %145, <2 x i32> <i32 0, i32 2> %147 = shufflevector <2 x i64> %144, <2 x i64> %145, <2 x i32> <i32 1, i32 3> %148 = bitcast <2 x i64> %146 to <4 x i32> %149 = bitcast <2 x i64> %147 to <4 x i32> %150 = bitcast <4 x i32> %117 to <2 x i64> %151 = bitcast <4 x i32> %118 to <2 x i64> %152 = shufflevector <2 x i64> %150, <2 x i64> %151, <2 x i32> <i32 0, i32 2> %153 = shufflevector <2 x i64> %150, <2 x i64> %151, <2 x i32> <i32 1, i32 3> %154 = bitcast <2 x i64> %152 to <4 x i32> %155 = bitcast <2 x i64> %153 to <4 x i32> %156 = bitcast <4 x float> %113 to <2 x i64> %157 = bitcast <4 x float> %114 to <2 x i64> %158 = shufflevector <2 x i64> %156, <2 x i64> %157, <2 x i32> <i32 0, i32 2> %159 = shufflevector <2 x i64> %156, <2 x i64> %157, <2 x i32> <i32 1, i32 3> %160 = bitcast <2 x i64> %158 to <4 x i32> %161 = bitcast <2 x i64> %159 to <4 x i32> %162 = bitcast <4 x float> %119 to <2 x i64> %163 = bitcast <4 x float> %120 to <2 x i64> %164 = shufflevector <2 x i64> %162, <2 x i64> %163, <2 x i32> <i32 0, i32 2> %165 = shufflevector <2 x i64> %162, <2 x i64> %163, <2 x i32> <i32 1, i32 3> %166 = bitcast <2 x i64> %164 to <4 x i32> %167 = bitcast <2 x i64> %165 to <4 x i32> %168 = shufflevector <4 x i32> %160, <4 x i32> %160, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %169 = shufflevector <4 x i32> %161, <4 x i32> %161, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %170 = shufflevector <4 x i32> %166, <4 x i32> %166, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %171 = shufflevector <4 x i32> %167, <4 x i32> %167, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %172 = mul i32 0, %108 %173 = add 
i32 0, %172 %174 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %173 %175 = bitcast i8* %174 to <4 x i32>* %176 = load <4 x i32>, <4 x i32>* %175, align 16 %177 = mul i32 1, %108 %178 = add i32 0, %177 %179 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %178 %180 = bitcast i8* %179 to <4 x i32>* %181 = load <4 x i32>, <4 x i32>* %180, align 16 %182 = mul i32 2, %108 %183 = add i32 0, %182 %184 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %183 %185 = bitcast i8* %184 to <4 x i32>* %186 = load <4 x i32>, <4 x i32>* %185, align 16 %187 = mul i32 3, %108 %188 = add i32 0, %187 %189 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %188 %190 = bitcast i8* %189 to <4 x i32>* %191 = load <4 x i32>, <4 x i32>* %190, align 16 %192 = bitcast <4 x i32> %148 to <16 x i8> %193 = bitcast <4 x i32> %131 to <16 x i8> %194 = bitcast <4 x i32> %176 to <16 x i8> %195 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %194, <16 x i8> %193, <16 x i8> %192) #2 %196 = bitcast <16 x i8> %195 to <4 x i32> %197 = bitcast <4 x i32> %149 to <16 x i8> %198 = bitcast <4 x i32> %132 to <16 x i8> %199 = bitcast <4 x i32> %181 to <16 x i8> %200 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %199, <16 x i8> %198, <16 x i8> %197) #2 %201 = bitcast <16 x i8> %200 to <4 x i32> %202 = bitcast <4 x i32> %154 to <16 x i8> %203 = bitcast <4 x i32> %137 to <16 x i8> %204 = bitcast <4 x i32> %186 to <16 x i8> %205 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %204, <16 x i8> %203, <16 x i8> %202) #2 %206 = bitcast <16 x i8> %205 to <4 x i32> %207 = bitcast <4 x i32> %155 to <16 x i8> %208 = bitcast <4 x i32> %138 to <16 x i8> %209 = bitcast <4 x i32> %191 to <16 x i8> %210 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %209, <16 x i8> %208, <16 x i8> %207) #2 %211 = bitcast <16 x i8> %210 to <4 x i32> %212 = mul i32 0, %108 %213 = add i32 0, %212 %214 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %213 %215 = bitcast 
i8* %214 to <4 x i32>* store <4 x i32> %196, <4 x i32>* %215, align 16 %216 = mul i32 1, %108 %217 = add i32 0, %216 %218 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %217 %219 = bitcast i8* %218 to <4 x i32>* store <4 x i32> %201, <4 x i32>* %219, align 16 %220 = mul i32 2, %108 %221 = add i32 0, %220 %222 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %221 %223 = bitcast i8* %222 to <4 x i32>* store <4 x i32> %206, <4 x i32>* %223, align 16 %224 = mul i32 3, %108 %225 = add i32 0, %224 %226 = getelementptr <16 x i8>, <16 x i8>* %color_ptr0, i32 0, i32 %225 %227 = bitcast i8* %226 to <4 x i32>* store <4 x i32> %211, <4 x i32>* %227, align 16 %228 = getelementptr <16 x i8>*, <16 x i8>** %color_ptr_ptr, i32 1 %color_ptr1 = load <16 x i8>*, <16 x i8>** %228 %229 = getelementptr i32, i32* %stride_ptr, i32 1 %230 = load i32, i32* %229 %231 = load <8 x float>, <8 x float>* %96 %232 = load <8 x float>, <8 x float>* %93 %233 = shufflevector <8 x i32> %mask, <8 x i32> %mask, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %234 = shufflevector <8 x i32> %mask, <8 x i32> %mask, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %235 = shufflevector <8 x float> %231, <8 x float> %231, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %236 = shufflevector <8 x float> %231, <8 x float> %231, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %237 = load <8 x float>, <8 x float>* %105 %238 = load <8 x float>, <8 x float>* %102 %239 = shufflevector <8 x i32> %mask17, <8 x i32> %mask17, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %240 = shufflevector <8 x i32> %mask17, <8 x i32> %mask17, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %241 = shufflevector <8 x float> %237, <8 x float> %237, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %242 = shufflevector <8 x float> %237, <8 x float> %237, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %243 = bitcast <8 x float> %232 to <8 x i32> %244 = bitcast <8 x float> %238 to <8 x i32> %245 = shufflevector <8 x i32> %244, <8 x i32> %244, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %246 
= shufflevector <8 x i32> %244, <8 x i32> %244, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %247 = shufflevector <8 x i32> %243, <8 x i32> %243, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %248 = shufflevector <8 x i32> %243, <8 x i32> %243, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %249 = bitcast <4 x i32> %248 to <2 x i64> %250 = bitcast <4 x i32> %247 to <2 x i64> %251 = shufflevector <2 x i64> %249, <2 x i64> %250, <2 x i32> <i32 0, i32 2> %252 = shufflevector <2 x i64> %249, <2 x i64> %250, <2 x i32> <i32 1, i32 3> %253 = bitcast <2 x i64> %251 to <4 x i32> %254 = bitcast <2 x i64> %252 to <4 x i32> %255 = bitcast <4 x i32> %246 to <2 x i64> %256 = bitcast <4 x i32> %245 to <2 x i64> %257 = shufflevector <2 x i64> %255, <2 x i64> %256, <2 x i32> <i32 0, i32 2> %258 = shufflevector <2 x i64> %255, <2 x i64> %256, <2 x i32> <i32 1, i32 3> %259 = bitcast <2 x i64> %257 to <4 x i32> %260 = bitcast <2 x i64> %258 to <4 x i32> %context.f_blend_color_ptr18 = getelementptr { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }, { [16 x float*], [16 x i32], [128 x { i32, i32, i32, i8*, [14 x i32], [14 x i32], i32, i32, [14 x i32] }], [32 x { float, float, float, [4 x float] }], [32 x { i32, i32, i32, i8*, i32, i32 }], float, i32, i32, i8*, float*, { float, float }*, [16 x i32*], [16 x i32] }* %context, i32 0, i32 9 %context.f_blend_color19 = load float*, float** %context.f_blend_color_ptr18 %261 = bitcast float* %context.f_blend_color19 to <4 x i32>* %262 = getelementptr <4 x i32>, <4 x i32>* %261, i32 0 %263 = load <4 x i32>, <4 x i32>* %262 %264 = shufflevector <4 x i32> %263, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %265 = shufflevector <4 x i32> %263, <4 x i32> undef, <4 x i32> zeroinitializer %266 = bitcast <4 x i32> %233 to <2 x i64> %267 = bitcast <4 
x i32> %234 to <2 x i64> %268 = shufflevector <2 x i64> %266, <2 x i64> %267, <2 x i32> <i32 0, i32 2> %269 = shufflevector <2 x i64> %266, <2 x i64> %267, <2 x i32> <i32 1, i32 3> %270 = bitcast <2 x i64> %268 to <4 x i32> %271 = bitcast <2 x i64> %269 to <4 x i32> %272 = bitcast <4 x i32> %239 to <2 x i64> %273 = bitcast <4 x i32> %240 to <2 x i64> %274 = shufflevector <2 x i64> %272, <2 x i64> %273, <2 x i32> <i32 0, i32 2> %275 = shufflevector <2 x i64> %272, <2 x i64> %273, <2 x i32> <i32 1, i32 3> %276 = bitcast <2 x i64> %274 to <4 x i32> %277 = bitcast <2 x i64> %275 to <4 x i32> %278 = bitcast <4 x float> %235 to <2 x i64> %279 = bitcast <4 x float> %236 to <2 x i64> %280 = shufflevector <2 x i64> %278, <2 x i64> %279, <2 x i32> <i32 0, i32 2> %281 = shufflevector <2 x i64> %278, <2 x i64> %279, <2 x i32> <i32 1, i32 3> %282 = bitcast <2 x i64> %280 to <4 x i32> %283 = bitcast <2 x i64> %281 to <4 x i32> %284 = bitcast <4 x float> %241 to <2 x i64> %285 = bitcast <4 x float> %242 to <2 x i64> %286 = shufflevector <2 x i64> %284, <2 x i64> %285, <2 x i32> <i32 0, i32 2> %287 = shufflevector <2 x i64> %284, <2 x i64> %285, <2 x i32> <i32 1, i32 3> %288 = bitcast <2 x i64> %286 to <4 x i32> %289 = bitcast <2 x i64> %287 to <4 x i32> %290 = shufflevector <4 x i32> %282, <4 x i32> %282, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %291 = shufflevector <4 x i32> %283, <4 x i32> %283, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %292 = shufflevector <4 x i32> %288, <4 x i32> %288, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %293 = shufflevector <4 x i32> %289, <4 x i32> %289, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %294 = mul i32 0, %230 %295 = add i32 0, %294 %296 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %295 %297 = bitcast i8* %296 to <4 x i32>* %298 = load <4 x i32>, <4 x i32>* %297, align 16 %299 = mul i32 1, %230 %300 = add i32 0, %299 %301 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %300 %302 = bitcast i8* %301 to <4 x i32>* %303 = load 
<4 x i32>, <4 x i32>* %302, align 16 %304 = mul i32 2, %230 %305 = add i32 0, %304 %306 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %305 %307 = bitcast i8* %306 to <4 x i32>* %308 = load <4 x i32>, <4 x i32>* %307, align 16 %309 = mul i32 3, %230 %310 = add i32 0, %309 %311 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %310 %312 = bitcast i8* %311 to <4 x i32>* %313 = load <4 x i32>, <4 x i32>* %312, align 16 %314 = bitcast <4 x i32> %270 to <16 x i8> %315 = bitcast <4 x i32> %253 to <16 x i8> %316 = bitcast <4 x i32> %298 to <16 x i8> %317 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %316, <16 x i8> %315, <16 x i8> %314) #2 %318 = bitcast <16 x i8> %317 to <4 x i32> %319 = bitcast <4 x i32> %271 to <16 x i8> %320 = bitcast <4 x i32> %254 to <16 x i8> %321 = bitcast <4 x i32> %303 to <16 x i8> %322 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %321, <16 x i8> %320, <16 x i8> %319) #2 %323 = bitcast <16 x i8> %322 to <4 x i32> %324 = bitcast <4 x i32> %276 to <16 x i8> %325 = bitcast <4 x i32> %259 to <16 x i8> %326 = bitcast <4 x i32> %308 to <16 x i8> %327 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %326, <16 x i8> %325, <16 x i8> %324) #2 %328 = bitcast <16 x i8> %327 to <4 x i32> %329 = bitcast <4 x i32> %277 to <16 x i8> %330 = bitcast <4 x i32> %260 to <16 x i8> %331 = bitcast <4 x i32> %313 to <16 x i8> %332 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %331, <16 x i8> %330, <16 x i8> %329) #2 %333 = bitcast <16 x i8> %332 to <4 x i32> %334 = mul i32 0, %230 %335 = add i32 0, %334 %336 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %335 %337 = bitcast i8* %336 to <4 x i32>* store <4 x i32> %318, <4 x i32>* %337, align 16 %338 = mul i32 1, %230 %339 = add i32 0, %338 %340 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %339 %341 = bitcast i8* %340 to <4 x i32>* store <4 x i32> %323, <4 x i32>* %341, align 16 %342 = mul i32 2, %230 %343 = add i32 0, %342 %344 = 
getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %343 %345 = bitcast i8* %344 to <4 x i32>* store <4 x i32> %328, <4 x i32>* %345, align 16 %346 = mul i32 3, %230 %347 = add i32 0, %346 %348 = getelementptr <16 x i8>, <16 x i8>* %color_ptr1, i32 0, i32 %347 %349 = bitcast i8* %348 to <4 x i32>* store <4 x i32> %333, <4 x i32>* %349, align 16 ret void } llc -mattr option(s): +sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,+xsavec,+popcnt,+aes,-avx512bitalg,-movdiri,+xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-movdir64b,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,+invpcid,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3 llc -mcpu option: skylake define void @setup_variant_0(<4 x float>* noalias %in_v0, <4 x float>* noalias %in_v1, <4 x float>* noalias %in_v2, i32 %in_facing, <4 x float>* noalias %out_a0, <4 x float>* noalias %out_dadx, <4 x float>* noalias %out_dady) { entry: %0 = getelementptr <4 x float>, <4 x float>* %in_v0, i32 0 %v0a = load <4 x float>, <4 x float>* %0 %1 = getelementptr <4 x float>, <4 x float>* %in_v1, i32 0 %v1a = load <4 x float>, <4 x float>* %1 %2 = getelementptr <4 x float>, <4 x float>* %in_v2, i32 0 %v2a = load <4 x float>, <4 x float>* %2 %xy0_center = fsub <4 x float> %v0a, <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> %dxy01 = fsub <4 x float> %v0a, %v1a %dxy20 = fsub <4 x float> %v2a, %v0a %3 = shufflevector <4 x float> %dxy20, <4 x float> %dxy20, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef> %ef = fmul <4 x float> %dxy01, %3 %4 = extractelement <4 x float> %ef, i32 0 %5 = extractelement <4 x float> %ef, i32 1 %6 = fsub float %4, %5 %ooa = fdiv float 1.000000e+00, %6 %7 = 
insertelement <4 x float> undef, float %ooa, i32 0 %8 = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> zeroinitializer %9 = fmul <4 x float> %dxy20, %8 %10 = fmul <4 x float> %dxy01, %8 %11 = shufflevector <4 x float> %9, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %12 = shufflevector <4 x float> %10, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %13 = shufflevector <4 x float> %9, <4 x float> undef, <4 x i32> zeroinitializer %14 = shufflevector <4 x float> %10, <4 x float> undef, <4 x i32> zeroinitializer %15 = shufflevector <4 x float> %xy0_center, <4 x float> undef, <4 x i32> zeroinitializer %16 = shufflevector <4 x float> %xy0_center, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %da01 = fsub <4 x float> %v0a, %v1a %da20 = fsub <4 x float> %v2a, %v0a %da01_dy20_ooa = fmul <4 x float> %da01, %11 %da20_dy01_ooa = fmul <4 x float> %da20, %12 %dadx = fsub <4 x float> %da01_dy20_ooa, %da20_dy01_ooa %da01_dx20_ooa = fmul <4 x float> %da01, %13 %da20_dx01_ooa = fmul <4 x float> %da20, %14 %dady = fsub <4 x float> %da20_dx01_ooa, %da01_dx20_ooa %dadx_x0 = fmul <4 x float> %dadx, %15 %dady_y0 = fmul <4 x float> %dady, %16 %attr_v0 = fadd <4 x float> %dadx_x0, %dady_y0 %attr_0 = fsub <4 x float> %v0a, %attr_v0 %17 = getelementptr <4 x float>, <4 x float>* %out_a0, i32 0 store <4 x float> %attr_0, <4 x float>* %17 %18 = getelementptr <4 x float>, <4 x float>* %out_dadx, i32 0 store <4 x float> %dadx, <4 x float>* %18 %19 = getelementptr <4 x float>, <4 x float>* %out_dady, i32 0 store <4 x float> %dady, <4 x float>* %19 %20 = getelementptr <4 x float>, <4 x float>* %in_v0, i32 1 %v0a1 = load <4 x float>, <4 x float>* %20 %21 = getelementptr <4 x float>, <4 x float>* %in_v1, i32 1 %v1a2 = load <4 x float>, <4 x float>* %21 %22 = getelementptr <4 x float>, <4 x float>* %in_v2, i32 1 %v2a3 = load <4 x float>, <4 x float>* %22 %23 = getelementptr <4 x float>, <4 x float>* %out_a0, i32 1 store <4 x float> %v2a3, <4 x float>* 
%23 %24 = getelementptr <4 x float>, <4 x float>* %out_dadx, i32 1 store <4 x float> zeroinitializer, <4 x float>* %24 %25 = getelementptr <4 x float>, <4 x float>* %out_dady, i32 1 store <4 x float> zeroinitializer, <4 x float>* %25 ret void } llc -mattr option(s): +sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,+xsavec,+popcnt,+aes,-avx512bitalg,-movdiri,+xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-movdir64b,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,+invpcid,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3 llc -mcpu option: skylake VERT PROPERTY NEXT_SHADER FRAG DCL IN[0] DCL IN[1] DCL IN[2] DCL OUT[0], POSITION DCL OUT[1].x, PSIZE DCL OUT[2].xy, GENERIC[9] DCL TEMP[0..1], LOCAL IMM[0] FLT32 { 1.0000, 0.0000, 0.0000, 0.0000} 0: UMUL TEMP[0].x, IN[1].xxxx, IN[2].xxxx 1: UMUL_HI TEMP[1].x, IN[1].xxxx, IN[2].xxxx 2: MOV OUT[0], IN[0] 3: MOV OUT[2].y, TEMP[1].xxxx 4: MOV OUT[1].x, IMM[0].xxxx 5: MOV OUT[2].x, TEMP[0].xxxx 6: END clamp_vertex_color = 0 clip_xy = 1 clip_z = 1 clip_user = 0 bypass_viewport = 0 clip_halfz = 0 need_edgeflags = 0 has_gs = 0 ucp_enable = 0 vertex_element[0].src_offset = 0 vertex_element[0].instance_divisor = 0 vertex_element[0].vertex_buffer_index = 0 vertex_element[0].src_format = PIPE_FORMAT_R32G32_FLOAT vertex_element[1].src_offset = 0 vertex_element[1].instance_divisor = 0 vertex_element[1].vertex_buffer_index = 1 vertex_element[1].src_format = PIPE_FORMAT_R32_UINT vertex_element[2].src_offset = 400 vertex_element[2].instance_divisor = 0 vertex_element[2].vertex_buffer_index = 1 vertex_element[2].src_format = PIPE_FORMAT_R32_UINT ; Function Attrs: nounwind readnone speculatable declare { i32, i1 } 
@llvm.usub.with.overflow.i32(i32, i32) #0 ; Function Attrs: nounwind readonly declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) #1 llvm (version 0x800) found no intrinsic for llvm.x86.avx2.pmulu.dq, going to crash...