r300: DRM version: 2.50.0, Name: ATI RS690, ID: 0x791f, GB: 1, Z: 1
r300: GART size: 509 MB, VRAM size: 288 MB
r300: AA compression RAM: YES, Z compression RAM: NO, HiZ RAM: NO
r300: DRM version: 2.50.0, Name: ATI RS690, ID: 0x791f, GB: 1, Z: 1
r300: GART size: 509 MB, VRAM size: 288 MB
r300: AA compression RAM: YES, Z compression RAM: NO, HiZ RAM: NO
VERT
PROPERTY NEXT_SHADER FRAG
DCL IN[0]
DCL IN[1]
DCL OUT[0], POSITION
DCL OUT[1], GENERIC[0]
DCL OUT[2], GENERIC[1]
DCL TEMP[0]
  0: MOV TEMP[0], IN[0]
  1: MOV OUT[1], IN[1]
  2: MOV OUT[0], TEMP[0]
  3: MOV OUT[2], TEMP[0]
  4: END
clamp_vertex_color = 0
clip_xy = 1
clip_z = 1
clip_user = 0
bypass_viewport = 0
clip_halfz = 0
need_edgeflags = 0
has_gs = 0
ucp_enable = 0
vertex_element[0].src_offset = 0
vertex_element[0].instance_divisor = 0
vertex_element[0].vertex_buffer_index = 0
vertex_element[0].src_format = PIPE_FORMAT_R32G32B32_FLOAT
vertex_element[1].src_offset = 12
vertex_element[1].instance_divisor = 0
vertex_element[1].vertex_buffer_index = 0
vertex_element[1].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT

; Function Attrs: nounwind readnone
; Intrinsic: unsigned 32-bit subtraction. Returns a { i32, i1 } pair holding
; the difference (field 0) and a flag (field 1) that is set when the
; subtraction wrapped below zero.
; In draw_llvm_vs_variant0 below it is applied twice per vertex element:
; first a constant is subtracted from the vertex buffer size, then the
; element's byte offset is subtracted from that result. The two overflow
; flags are OR-ed together; when either is set the fetch pointer is
; redirected to a local scratch alloca and the usable size is forced to 0.
declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #0

; Function Attrs: nounwind readnone
; Intrinsic: multiply-add on <4 x float> vectors, computing (a * b) + c.
; Per the LLVM language reference the multiply and add may or may not be
; fused, depending on the target.
; In draw_llvm_vs_variant0 below it is called once for each of the first
; three position components: the component, already multiplied by 1/w, is
; multiplied by a broadcast "scale" float and added to a broadcast "trans"
; float, both loaded from the pointer named %context.viewports.
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #0

define i8 @draw_llvm_vs_variant0({ [16 x float*], [16 x i32], [14 x [4 x float]]*, float*, [128 x { i32, i32, i32, i32, i32, i8*, [16 x i32], [16 x i32], [16 x i32] }], [32 x { float, float, float, [4 x float] }] }* noalias %context, { i32, [4 x float], [3 x [4 x float]] }* noalias %io, { i8*, i32 }* noalias %vbuffers, i32 %count, i32 %start_or_maxelt, i32 %stride, { i16, i8, i32, i8* }* noalias %vb, i32 %instance_id, i32 %vertex_id_offset, i32 %start_instance, i32* noalias %fetch_elts) {
entry:
  %temp37 = alloca <4 x float>
  %temp36 = alloca <4 x float>
  %temp35 = alloca <4 x float>
  %temp = alloca <4 x float>
  %output34 = alloca <4 x float>
  %output33 = alloca <4 x float>
  %output32 = alloca <4 x float>
  %output31 = alloca <4 x float>
  %output30 = alloca <4 x float>
  %output29 = alloca <4 x float>
  %output28 = alloca <4 x float>
  %output27 = alloca <4 x float>
  %output26 = alloca <4 x float>
  %output25 = alloca <4 x float>
  %output24 = alloca <4 x float>
  %output = alloca <4 x float>
  %looplimiter = alloca i32
  %index_store = alloca <4 x i32>
  %loop_counter = alloca i32
  %0 = alloca i8*
  %1 = alloca i8*
  %2 = alloca <4 x i64>
  %3 = alloca <4 x i32>
  store <4 x i32> zeroinitializer, <4 x i32>* %3
  %4 = bitcast <4 x i64>* %2 to i8*
  %5 = getelementptr i8, i8* %4, i32 0
  %6 = icmp ne i32* null, %fetch_elts
  %fetch_max = sub i32 %count, 1
  %7 = insertelement <4 x i32> undef, i32 %fetch_max, i32 0
  %8 = shufflevector <4 x i32> %7, <4 x i32> undef, <4 x i32> zeroinitializer
  %9 = insertelement <4 x i32> undef, i32 %start_or_maxelt, i32 0
  %10 = shufflevector <4 x i32> %9, <4 x i32> undef, <4 x i32> zeroinitializer
  %11 = getelementptr { i8*, i32 }, { i8*, i32 }* %vbuffers, i32 0
  %12 = getelementptr { i16, i8, i32, i8* }, { i16, i8, i32, i8* }* %vb, i32 0
  %.stride_ptr = getelementptr { i16, i8, i32, i8* }, { i16, i8, i32, i8* }* %12, i32 0, i32 0
  %.stride = load i16, i16* %.stride_ptr
  %13 = zext i16 %.stride to i32
  %.buffer_offset_ptr = getelementptr { i16, i8, i32, i8* }, { i16, i8, i32, i8* }* %12, i32 0, i32 2
  %.buffer_offset = load i32, i32* %.buffer_offset_ptr
  %.map_ptr = getelementptr { i8*, i32 }, { i8*, i32 }* %11, i32 0, i32 0
  %.map = load i8*, i8** %.map_ptr
  %.size_ptr = getelementptr { i8*, i32 }, { i8*, i32 }* %11, i32 0, i32 1
  %.size = load i32, i32* %.size_ptr
  %14 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %.size, i32 11) #1
  %15 = extractvalue { i32, i1 } %14, 1
  %16 = extractvalue { i32, i1 } %14, 0
  %17 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %16, i32 %.buffer_offset) #1
  %18 = extractvalue { i32, i1 } %17, 1
  %19 = or i1 %15, %18
  %20 = extractvalue { i32, i1 } %17, 0
  %21 = select i1 %19, i32 0, i32 %20
  br i1 %19, label %if-true-block, label %if-false-block

if-true-block:                                    ; preds = %entry
  store i8* %5, i8** %1
  br label %endif-block

if-false-block:                                   ; preds = %entry
  %22 = getelementptr i8, i8* %.map, i32 %.buffer_offset
  store i8* %22, i8** %1
  br label %endif-block

endif-block:                                      ; preds = %if-false-block, %if-true-block
  %map_ptr = load i8*, i8** %1
  %23 = getelementptr { i8*, i32 }, { i8*, i32 }* %vbuffers, i32 0
  %24 = getelementptr { i16, i8, i32, i8* }, { i16, i8, i32, i8* }* %vb, i32 0
  %.stride_ptr1 = getelementptr { i16, i8, i32, i8* }, { i16, i8, i32, i8* }* %24, i32 0, i32 0
  %.stride2 = load i16, i16* %.stride_ptr1
  %25 = zext i16 %.stride2 to i32
  %.buffer_offset_ptr3 = getelementptr { i16, i8, i32, i8* }, { i16, i8, i32, i8* }* %24, i32 0, i32 2
  %.buffer_offset4 = load i32, i32* %.buffer_offset_ptr3
  %.map_ptr5 = getelementptr { i8*, i32 }, { i8*, i32 }* %23, i32 0, i32 0
  %.map6 = load i8*, i8** %.map_ptr5
  %.size_ptr7 = getelementptr { i8*, i32 }, { i8*, i32 }* %23, i32 0, i32 1
  %.size8 = load i32, i32* %.size_ptr7
  %26 = add i32 %.buffer_offset4, 12
  %27 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %.size8, i32 15) #1
  %28 = extractvalue { i32, i1 } %27, 1
  %29 = extractvalue { i32, i1 } %27, 0
  %30 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %29, i32 %26) #1
  %31 = extractvalue { i32, i1 } %30, 1
  %32 = or i1 %28, %31
  %33 = extractvalue { i32, i1 } %30, 0
  %34 = select i1 %32, i32 0, i32 %33
  br i1 %32, label %if-true-block10, label %if-false-block11

if-true-block10:                                  ; preds = %endif-block
  store i8* %5, i8** %0
  br label %endif-block9

if-false-block11:                                 ; preds = %endif-block
  %35 = getelementptr i8, i8* %.map6, i32 %26
  store i8* %35, i8** %0
  br label %endif-block9

endif-block9:                                     ; preds = %if-false-block11, %if-true-block10
  %map_ptr12 = load i8*, i8** %0
  store i32 0, i32* %loop_counter
  store i32 0, i32* %loop_counter
  br label %loop_begin

loop_begin:                                       ; preds = %endif-block13, %endif-block9
  %36 = load i32, i32* %loop_counter
  %37 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %io, i32 %36
  %38 = insertelement <4 x i32> undef, i32 %36, i32 0
  %39 = shufflevector <4 x i32> %38, <4 x i32> undef, <4 x i32> zeroinitializer
  %40 = add <4 x i32> %39, <i32 0, i32 1, i32 2, i32 3>
  %41 = icmp ult <4 x i32> %40, %8
  %42 = sext <4 x i1> %41 to <4 x i32>
  %43 = trunc <4 x i32> %42 to <4 x i1>
  %44 = select <4 x i1> %43, <4 x i32> %40, <4 x i32> %8
  br i1 %6, label %if-true-block14, label %if-false-block15

if-true-block14:                                  ; preds = %loop_begin
  %45 = shl <4 x i32> %44, <i32 2, i32 2, i32 2, i32 2>
  %46 = bitcast i32* %fetch_elts to i8*
  %47 = extractelement <4 x i32> %45, i32 0
  %48 = getelementptr i8, i8* %46, i32 %47
  %49 = bitcast i8* %48 to i32*
  %50 = load i32, i32* %49
  %51 = insertelement <4 x i32> undef, i32 %50, i32 0
  %52 = extractelement <4 x i32> %45, i32 1
  %53 = getelementptr i8, i8* %46, i32 %52
  %54 = bitcast i8* %53 to i32*
  %55 = load i32, i32* %54
  %56 = insertelement <4 x i32> %51, i32 %55, i32 1
  %57 = extractelement <4 x i32> %45, i32 2
  %58 = getelementptr i8, i8* %46, i32 %57
  %59 = bitcast i8* %58 to i32*
  %60 = load i32, i32* %59
  %61 = insertelement <4 x i32> %56, i32 %60, i32 2
  %62 = extractelement <4 x i32> %45, i32 3
  %63 = getelementptr i8, i8* %46, i32 %62
  %64 = bitcast i8* %63 to i32*
  %65 = load i32, i32* %64
  %66 = insertelement <4 x i32> %61, i32 %65, i32 3
  store <4 x i32> %66, <4 x i32>* %index_store
  br label %endif-block13

if-false-block15:                                 ; preds = %loop_begin
  %67 = add <4 x i32> %44, %10
  store <4 x i32> %67, <4 x i32>* %index_store
  br label %endif-block13

endif-block13:                                    ; preds = %if-false-block15, %if-true-block14
  %68 = load <4 x i32>, <4 x i32>* %index_store
  %69 = insertelement <4 x i32> undef, i32 %13, i32 0
  %70 = shufflevector <4 x i32> %69, <4 x i32> undef, <4 x i32> zeroinitializer
  %71 = insertelement <4 x i32> undef, i32 %21, i32 0
  %72 = shufflevector <4 x i32> %71, <4 x i32> undef, <4 x i32> zeroinitializer
  %73 = mul <4 x i32> %70, %68
  %74 = icmp ult <4 x i32> %73, %72
  %75 = sext <4 x i1> %74 to <4 x i32>
  %76 = and <4 x i32> %73, %75
  %77 = shufflevector <4 x i32> %76, <4 x i32> %76, <1 x i32> zeroinitializer
  %78 = extractelement <1 x i32> %77, i32 0
  %79 = getelementptr i8, i8* %map_ptr, i32 %78
  %80 = bitcast i8* %79 to <3 x i32>*
  %81 = load <3 x i32>, <3 x i32>* %80, align 1
  %82 = shufflevector <3 x i32> %81, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %83 = shufflevector <4 x i32> %76, <4 x i32> %76, <1 x i32> <i32 1>
  %84 = extractelement <1 x i32> %83, i32 0
  %85 = getelementptr i8, i8* %map_ptr, i32 %84
  %86 = bitcast i8* %85 to <3 x i32>*
  %87 = load <3 x i32>, <3 x i32>* %86, align 1
  %88 = shufflevector <3 x i32> %87, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %89 = shufflevector <4 x i32> %76, <4 x i32> %76, <1 x i32> <i32 2>
  %90 = extractelement <1 x i32> %89, i32 0
  %91 = getelementptr i8, i8* %map_ptr, i32 %90
  %92 = bitcast i8* %91 to <3 x i32>*
  %93 = load <3 x i32>, <3 x i32>* %92, align 1
  %94 = shufflevector <3 x i32> %93, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %95 = shufflevector <4 x i32> %76, <4 x i32> %76, <1 x i32> <i32 3>
  %96 = extractelement <1 x i32> %95, i32 0
  %97 = getelementptr i8, i8* %map_ptr, i32 %96
  %98 = bitcast i8* %97 to <3 x i32>*
  %99 = load <3 x i32>, <3 x i32>* %98, align 1
  %100 = shufflevector <3 x i32> %99, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %101 = shufflevector <4 x i32> %82, <4 x i32> %88, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %102 = shufflevector <4 x i32> %94, <4 x i32> %100, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %103 = shufflevector <4 x i32> %82, <4 x i32> %88, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %104 = shufflevector <4 x i32> %94, <4 x i32> %100, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %t0 = bitcast <4 x i32> %101 to <2 x i64>
  %t1 = bitcast <4 x i32> %102 to <2 x i64>
  %t2 = bitcast <4 x i32> %103 to <2 x i64>
  %t3 = bitcast <4 x i32> %104 to <2 x i64>
  %105 = shufflevector <2 x i64> %t0, <2 x i64> %t1, <2 x i32> <i32 0, i32 2>
  %106 = shufflevector <2 x i64> %t0, <2 x i64> %t1, <2 x i32> <i32 1, i32 3>
  %107 = shufflevector <2 x i64> %t2, <2 x i64> %t3, <2 x i32> <i32 0, i32 2>
  %108 = shufflevector <2 x i64> %t2, <2 x i64> %t3, <2 x i32> <i32 1, i32 3>
  %dst0 = bitcast <2 x i64> %105 to <4 x i32>
  %dst1 = bitcast <2 x i64> %106 to <4 x i32>
  %dst2 = bitcast <2 x i64> %107 to <4 x i32>
  %dst3 = bitcast <2 x i64> %108 to <4 x i32>
  %109 = bitcast <4 x i32> %dst0 to <4 x float>
  %110 = bitcast <4 x i32> %dst1 to <4 x float>
  %111 = bitcast <4 x i32> %dst2 to <4 x float>
  %112 = bitcast <4 x float> %109 to <4 x i32>
  %113 = and <4 x i32> %112, %75
  %114 = bitcast <4 x i32> %113 to <4 x float>
  %115 = bitcast <4 x float> %110 to <4 x i32>
  %116 = and <4 x i32> %115, %75
  %117 = bitcast <4 x i32> %116 to <4 x float>
  %118 = bitcast <4 x float> %111 to <4 x i32>
  %119 = and <4 x i32> %118, %75
  %120 = bitcast <4 x i32> %119 to <4 x float>
  %121 = and <4 x i32> <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>, %75
  %122 = bitcast <4 x i32> %121 to <4 x float>
  %123 = insertelement <4 x i32> undef, i32 %25, i32 0
  %124 = shufflevector <4 x i32> %123, <4 x i32> undef, <4 x i32> zeroinitializer
  %125 = insertelement <4 x i32> undef, i32 %34, i32 0
  %126 = shufflevector <4 x i32> %125, <4 x i32> undef, <4 x i32> zeroinitializer
  %127 = mul <4 x i32> %124, %68
  %128 = icmp ult <4 x i32> %127, %126
  %129 = sext <4 x i1> %128 to <4 x i32>
  %130 = and <4 x i32> %127, %129
  %131 = shufflevector <4 x i32> %130, <4 x i32> %130, <1 x i32> zeroinitializer
  %132 = extractelement <1 x i32> %131, i32 0
  %133 = getelementptr i8, i8* %map_ptr12, i32 %132
  %134 = bitcast i8* %133 to <4 x i32>*
  %135 = load <4 x i32>, <4 x i32>* %134, align 1
  %136 = shufflevector <4 x i32> %130, <4 x i32> %130, <1 x i32> <i32 1>
  %137 = extractelement <1 x i32> %136, i32 0
  %138 = getelementptr i8, i8* %map_ptr12, i32 %137
  %139 = bitcast i8* %138 to <4 x i32>*
  %140 = load <4 x i32>, <4 x i32>* %139, align 1
  %141 = shufflevector <4 x i32> %130, <4 x i32> %130, <1 x i32> <i32 2>
  %142 = extractelement <1 x i32> %141, i32 0
  %143 = getelementptr i8, i8* %map_ptr12, i32 %142
  %144 = bitcast i8* %143 to <4 x i32>*
  %145 = load <4 x i32>, <4 x i32>* %144, align 1
  %146 = shufflevector <4 x i32> %130, <4 x i32> %130, <1 x i32> <i32 3>
  %147 = extractelement <1 x i32> %146, i32 0
  %148 = getelementptr i8, i8* %map_ptr12, i32 %147
  %149 = bitcast i8* %148 to <4 x i32>*
  %150 = load <4 x i32>, <4 x i32>* %149, align 1
  %151 = shufflevector <4 x i32> %135, <4 x i32> %140, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %152 = shufflevector <4 x i32> %145, <4 x i32> %150, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %153 = shufflevector <4 x i32> %135, <4 x i32> %140, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %154 = shufflevector <4 x i32> %145, <4 x i32> %150, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %t016 = bitcast <4 x i32> %151 to <2 x i64>
  %t117 = bitcast <4 x i32> %152 to <2 x i64>
  %t218 = bitcast <4 x i32> %153 to <2 x i64>
  %t319 = bitcast <4 x i32> %154 to <2 x i64>
  %155 = shufflevector <2 x i64> %t016, <2 x i64> %t117, <2 x i32> <i32 0, i32 2>
  %156 = shufflevector <2 x i64> %t016, <2 x i64> %t117, <2 x i32> <i32 1, i32 3>
  %157 = shufflevector <2 x i64> %t218, <2 x i64> %t319, <2 x i32> <i32 0, i32 2>
  %158 = shufflevector <2 x i64> %t218, <2 x i64> %t319, <2 x i32> <i32 1, i32 3>
  %dst020 = bitcast <2 x i64> %155 to <4 x i32>
  %dst121 = bitcast <2 x i64> %156 to <4 x i32>
  %dst222 = bitcast <2 x i64> %157 to <4 x i32>
  %dst323 = bitcast <2 x i64> %158 to <4 x i32>
  %159 = bitcast <4 x i32> %dst020 to <4 x float>
  %160 = bitcast <4 x i32> %dst121 to <4 x float>
  %161 = bitcast <4 x i32> %dst222 to <4 x float>
  %162 = bitcast <4 x i32> %dst323 to <4 x float>
  %163 = bitcast <4 x float> %159 to <4 x i32>
  %164 = and <4 x i32> %163, %129
  %165 = bitcast <4 x i32> %164 to <4 x float>
  %166 = bitcast <4 x float> %160 to <4 x i32>
  %167 = and <4 x i32> %166, %129
  %168 = bitcast <4 x i32> %167 to <4 x float>
  %169 = bitcast <4 x float> %161 to <4 x i32>
  %170 = and <4 x i32> %169, %129
  %171 = bitcast <4 x i32> %170 to <4 x float>
  %172 = bitcast <4 x float> %162 to <4 x i32>
  %173 = and <4 x i32> %172, %129
  %174 = bitcast <4 x i32> %173 to <4 x float>
  %175 = insertelement <4 x i32> undef, i32 %vertex_id_offset, i32 0
  %176 = shufflevector <4 x i32> %175, <4 x i32> undef, <4 x i32> zeroinitializer
  %177 = sub <4 x i32> %68, %176
  %context.vs_constants_ptr = getelementptr { [16 x float*], [16 x i32], [14 x [4 x float]]*, float*, [128 x { i32, i32, i32, i32, i32, i8*, [16 x i32], [16 x i32], [16 x i32] }], [32 x { float, float, float, [4 x float] }] }, { [16 x float*], [16 x i32], [14 x [4 x float]]*, float*, [128 x { i32, i32, i32, i32, i32, i8*, [16 x i32], [16 x i32], [16 x i32] }], [32 x { float, float, float, [4 x float] }] }* %context, i32 0, i32 0
  %context.num_vs_constants_ptr = getelementptr { [16 x float*], [16 x i32], [14 x [4 x float]]*, float*, [128 x { i32, i32, i32, i32, i32, i8*, [16 x i32], [16 x i32], [16 x i32] }], [32 x { float, float, float, [4 x float] }] }, { [16 x float*], [16 x i32], [14 x [4 x float]]*, float*, [128 x { i32, i32, i32, i32, i32, i8*, [16 x i32], [16 x i32], [16 x i32] }], [32 x { float, float, float, [4 x float] }] }* %context, i32 0, i32 1
  store i32 0, i32* %looplimiter
  store i32 65535, i32* %looplimiter
  store <4 x float> zeroinitializer, <4 x float>* %output
  store <4 x float> zeroinitializer, <4 x float>* %output24
  store <4 x float> zeroinitializer, <4 x float>* %output25
  store <4 x float> zeroinitializer, <4 x float>* %output26
  store <4 x float> zeroinitializer, <4 x float>* %output27
  store <4 x float> zeroinitializer, <4 x float>* %output28
  store <4 x float> zeroinitializer, <4 x float>* %output29
  store <4 x float> zeroinitializer, <4 x float>* %output30
  store <4 x float> zeroinitializer, <4 x float>* %output31
  store <4 x float> zeroinitializer, <4 x float>* %output32
  store <4 x float> zeroinitializer, <4 x float>* %output33
  store <4 x float> zeroinitializer, <4 x float>* %output34
  store <4 x float> zeroinitializer, <4 x float>* %temp
  store <4 x float> zeroinitializer, <4 x float>* %temp35
  store <4 x float> zeroinitializer, <4 x float>* %temp36
  store <4 x float> zeroinitializer, <4 x float>* %temp37
  store <4 x float> %114, <4 x float>* %temp
  store <4 x float> %117, <4 x float>* %temp35
  store <4 x float> %120, <4 x float>* %temp36
  store <4 x float> %122, <4 x float>* %temp37
  store <4 x float> %165, <4 x float>* %output27
  store <4 x float> %168, <4 x float>* %output28
  store <4 x float> %171, <4 x float>* %output29
  store <4 x float> %174, <4 x float>* %output30
  %178 = load <4 x float>, <4 x float>* %temp
  %179 = load <4 x float>, <4 x float>* %temp35
  %180 = load <4 x float>, <4 x float>* %temp36
  %181 = load <4 x float>, <4 x float>* %temp37
  store <4 x float> %178, <4 x float>* %output
  store <4 x float> %179, <4 x float>* %output24
  store <4 x float> %180, <4 x float>* %output25
  store <4 x float> %181, <4 x float>* %output26
  %182 = load <4 x float>, <4 x float>* %temp
  %183 = load <4 x float>, <4 x float>* %temp35
  %184 = load <4 x float>, <4 x float>* %temp36
  %185 = load <4 x float>, <4 x float>* %temp37
  store <4 x float> %182, <4 x float>* %output31
  store <4 x float> %183, <4 x float>* %output32
  store <4 x float> %184, <4 x float>* %output33
  store <4 x float> %185, <4 x float>* %output34
  %186 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 0
  %187 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 1
  %188 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 2
  %189 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 3
  %190 = load <4 x float>, <4 x float>* %output
  %191 = load <4 x float>, <4 x float>* %output24
  %192 = load <4 x float>, <4 x float>* %output25
  %193 = load <4 x float>, <4 x float>* %output26
  %.clip_pos_ptr = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %186, i32 0, i32 1
  %.clip_pos_ptr38 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %187, i32 0, i32 1
  %.clip_pos_ptr39 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %188, i32 0, i32 1
  %.clip_pos_ptr40 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %189, i32 0, i32 1
  %194 = shufflevector <4 x float> %190, <4 x float> %191, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %195 = shufflevector <4 x float> %192, <4 x float> %193, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %196 = shufflevector <4 x float> %190, <4 x float> %191, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %197 = shufflevector <4 x float> %192, <4 x float> %193, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %t041 = bitcast <4 x float> %194 to <2 x double>
  %t142 = bitcast <4 x float> %195 to <2 x double>
  %t243 = bitcast <4 x float> %196 to <2 x double>
  %t344 = bitcast <4 x float> %197 to <2 x double>
  %198 = shufflevector <2 x double> %t041, <2 x double> %t142, <2 x i32> <i32 0, i32 2>
  %199 = shufflevector <2 x double> %t041, <2 x double> %t142, <2 x i32> <i32 1, i32 3>
  %200 = shufflevector <2 x double> %t243, <2 x double> %t344, <2 x i32> <i32 0, i32 2>
  %201 = shufflevector <2 x double> %t243, <2 x double> %t344, <2 x i32> <i32 1, i32 3>
  %dst045 = bitcast <2 x double> %198 to <4 x float>
  %dst146 = bitcast <2 x double> %199 to <4 x float>
  %dst247 = bitcast <2 x double> %200 to <4 x float>
  %dst348 = bitcast <2 x double> %201 to <4 x float>
  %202 = shufflevector <4 x float> %dst045, <4 x float> %dst045, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %203 = shufflevector <4 x float> %dst146, <4 x float> %dst146, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %204 = shufflevector <4 x float> %dst247, <4 x float> %dst247, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %205 = shufflevector <4 x float> %dst348, <4 x float> %dst348, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %clipo = getelementptr [4 x float], [4 x float]* %.clip_pos_ptr, i32 0, i32 0
  %206 = bitcast float* %clipo to <4 x float>*
  store <4 x float> %202, <4 x float>* %206, align 4
  %clipo49 = getelementptr [4 x float], [4 x float]* %.clip_pos_ptr38, i32 0, i32 0
  %207 = bitcast float* %clipo49 to <4 x float>*
  store <4 x float> %203, <4 x float>* %207, align 4
  %clipo50 = getelementptr [4 x float], [4 x float]* %.clip_pos_ptr39, i32 0, i32 0
  %208 = bitcast float* %clipo50 to <4 x float>*
  store <4 x float> %204, <4 x float>* %208, align 4
  %clipo51 = getelementptr [4 x float], [4 x float]* %.clip_pos_ptr40, i32 0, i32 0
  %209 = bitcast float* %clipo51 to <4 x float>*
  store <4 x float> %205, <4 x float>* %209, align 4
  %210 = load <4 x i32>, <4 x i32>* %3
  %211 = load <4 x float>, <4 x float>* %output
  %212 = load <4 x float>, <4 x float>* %output24
  %213 = load <4 x float>, <4 x float>* %output25
  %214 = load <4 x float>, <4 x float>* %output26
  %215 = fcmp ugt <4 x float> %211, %214
  %216 = sext <4 x i1> %215 to <4 x i32>
  %217 = and <4 x i32> %216, <i32 1, i32 1, i32 1, i32 1>
  %218 = fadd <4 x float> %211, %214
  %219 = fcmp ugt <4 x float> zeroinitializer, %218
  %220 = sext <4 x i1> %219 to <4 x i32>
  %221 = and <4 x i32> %220, <i32 2, i32 2, i32 2, i32 2>
  %222 = or <4 x i32> %217, %221
  %223 = fcmp ugt <4 x float> %212, %214
  %224 = sext <4 x i1> %223 to <4 x i32>
  %225 = and <4 x i32> %224, <i32 4, i32 4, i32 4, i32 4>
  %226 = or <4 x i32> %222, %225
  %227 = fadd <4 x float> %212, %214
  %228 = fcmp ugt <4 x float> zeroinitializer, %227
  %229 = sext <4 x i1> %228 to <4 x i32>
  %230 = and <4 x i32> %229, <i32 8, i32 8, i32 8, i32 8>
  %231 = or <4 x i32> %226, %230
  %232 = fadd <4 x float> %213, %214
  %233 = fcmp ugt <4 x float> zeroinitializer, %232
  %234 = sext <4 x i1> %233 to <4 x i32>
  %235 = and <4 x i32> %234, <i32 16, i32 16, i32 16, i32 16>
  %236 = or <4 x i32> %231, %235
  %237 = fcmp ugt <4 x float> %213, %214
  %238 = sext <4 x i1> %237 to <4 x i32>
  %239 = and <4 x i32> %238, <i32 32, i32 32, i32 32, i32 32>
  %240 = or <4 x i32> %236, %239
  %241 = or <4 x i32> %240, %210
  store <4 x i32> %241, <4 x i32>* %3
  %242 = load <4 x float>, <4 x float>* %output26
  %context.viewports_ptr = getelementptr { [16 x float*], [16 x i32], [14 x [4 x float]]*, float*, [128 x { i32, i32, i32, i32, i32, i8*, [16 x i32], [16 x i32], [16 x i32] }], [32 x { float, float, float, [4 x float] }] }, { [16 x float*], [16 x i32], [14 x [4 x float]]*, float*, [128 x { i32, i32, i32, i32, i32, i8*, [16 x i32], [16 x i32], [16 x i32] }], [32 x { float, float, float, [4 x float] }] }* %context, i32 0, i32 3
  %context.viewports = load float*, float** %context.viewports_ptr
  %243 = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %242
  store <4 x float> %243, <4 x float>* %output26
  %244 = load <4 x float>, <4 x float>* %output
  %245 = getelementptr float, float* %context.viewports, i32 0
  %246 = getelementptr float, float* %context.viewports, i32 3
  %scale = load float, float* %245
  %247 = insertelement <4 x float> undef, float %scale, i32 0
  %248 = shufflevector <4 x float> %247, <4 x float> undef, <4 x i32> zeroinitializer
  %trans = load float, float* %246
  %249 = insertelement <4 x float> undef, float %trans, i32 0
  %250 = shufflevector <4 x float> %249, <4 x float> undef, <4 x i32> zeroinitializer
  %251 = fmul <4 x float> %244, %243
  %252 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %251, <4 x float> %248, <4 x float> %250) #1
  store <4 x float> %252, <4 x float>* %output
  %253 = load <4 x float>, <4 x float>* %output24
  %254 = getelementptr float, float* %context.viewports, i32 1
  %255 = getelementptr float, float* %context.viewports, i32 4
  %scale52 = load float, float* %254
  %256 = insertelement <4 x float> undef, float %scale52, i32 0
  %257 = shufflevector <4 x float> %256, <4 x float> undef, <4 x i32> zeroinitializer
  %trans53 = load float, float* %255
  %258 = insertelement <4 x float> undef, float %trans53, i32 0
  %259 = shufflevector <4 x float> %258, <4 x float> undef, <4 x i32> zeroinitializer
  %260 = fmul <4 x float> %253, %243
  %261 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %260, <4 x float> %257, <4 x float> %259) #1
  store <4 x float> %261, <4 x float>* %output24
  %262 = load <4 x float>, <4 x float>* %output25
  %263 = getelementptr float, float* %context.viewports, i32 2
  %264 = getelementptr float, float* %context.viewports, i32 5
  %scale54 = load float, float* %263
  %265 = insertelement <4 x float> undef, float %scale54, i32 0
  %266 = shufflevector <4 x float> %265, <4 x float> undef, <4 x i32> zeroinitializer
  %trans55 = load float, float* %264
  %267 = insertelement <4 x float> undef, float %trans55, i32 0
  %268 = shufflevector <4 x float> %267, <4 x float> undef, <4 x i32> zeroinitializer
  %269 = fmul <4 x float> %262, %243
  %270 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %269, <4 x float> %266, <4 x float> %268) #1
  store <4 x float> %270, <4 x float>* %output25
  %output0.x = load <4 x float>, <4 x float>* %output
  %output0.y = load <4 x float>, <4 x float>* %output24
  %output0.z = load <4 x float>, <4 x float>* %output25
  %output0.w = load <4 x float>, <4 x float>* %output26
  %271 = shufflevector <4 x float> %output0.x, <4 x float> %output0.y, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %272 = shufflevector <4 x float> %output0.z, <4 x float> %output0.w, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %273 = shufflevector <4 x float> %output0.x, <4 x float> %output0.y, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %274 = shufflevector <4 x float> %output0.z, <4 x float> %output0.w, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %t056 = bitcast <4 x float> %271 to <2 x double>
  %t157 = bitcast <4 x float> %272 to <2 x double>
  %t258 = bitcast <4 x float> %273 to <2 x double>
  %t359 = bitcast <4 x float> %274 to <2 x double>
  %275 = shufflevector <2 x double> %t056, <2 x double> %t157, <2 x i32> <i32 0, i32 2>
  %276 = shufflevector <2 x double> %t056, <2 x double> %t157, <2 x i32> <i32 1, i32 3>
  %277 = shufflevector <2 x double> %t258, <2 x double> %t359, <2 x i32> <i32 0, i32 2>
  %278 = shufflevector <2 x double> %t258, <2 x double> %t359, <2 x i32> <i32 1, i32 3>
  %dst060 = bitcast <2 x double> %275 to <4 x float>
  %dst161 = bitcast <2 x double> %276 to <4 x float>
  %dst262 = bitcast <2 x double> %277 to <4 x float>
  %dst363 = bitcast <2 x double> %278 to <4 x float>
  %279 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 0
  %280 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 1
  %281 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 2
  %282 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 3
  %283 = or <4 x i32> <i32 -49152, i32 -49152, i32 -49152, i32 -49152>, %240
  %.id_ptr = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %279, i32 0, i32 0
  %284 = extractelement <4 x i32> %283, i32 0
  store i32 %284, i32* %.id_ptr
  %.id_ptr64 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %280, i32 0, i32 0
  %285 = extractelement <4 x i32> %283, i32 1
  store i32 %285, i32* %.id_ptr64
  %.id_ptr65 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %281, i32 0, i32 0
  %286 = extractelement <4 x i32> %283, i32 2
  store i32 %286, i32* %.id_ptr65
  %.id_ptr66 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %282, i32 0, i32 0
  %287 = extractelement <4 x i32> %283, i32 3
  store i32 %287, i32* %.id_ptr66
  %.data_ptr = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %279, i32 0, i32 2
  %288 = getelementptr [3 x [4 x float]], [3 x [4 x float]]* %.data_ptr, i32 0, i32 0, i32 0
  %289 = bitcast float* %288 to <4 x float>*
  store <4 x float> %dst060, <4 x float>* %289, align 4
  %.data_ptr67 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %280, i32 0, i32 2
  %290 = getelementptr [3 x [4 x float]], [3 x [4 x float]]* %.data_ptr67, i32 0, i32 0, i32 0
  %291 = bitcast float* %290 to <4 x float>*
  store <4 x float> %dst161, <4 x float>* %291, align 4
  %.data_ptr68 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %281, i32 0, i32 2
  %292 = getelementptr [3 x [4 x float]], [3 x [4 x float]]* %.data_ptr68, i32 0, i32 0, i32 0
  %293 = bitcast float* %292 to <4 x float>*
  store <4 x float> %dst262, <4 x float>* %293, align 4
  %.data_ptr69 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %282, i32 0, i32 2
  %294 = getelementptr [3 x [4 x float]], [3 x [4 x float]]* %.data_ptr69, i32 0, i32 0, i32 0
  %295 = bitcast float* %294 to <4 x float>*
  store <4 x float> %dst363, <4 x float>* %295, align 4
  %output1.x = load <4 x float>, <4 x float>* %output27
  %output1.y = load <4 x float>, <4 x float>* %output28
  %output1.z = load <4 x float>, <4 x float>* %output29
  %output1.w = load <4 x float>, <4 x float>* %output30
  %296 = shufflevector <4 x float> %output1.x, <4 x float> %output1.y, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %297 = shufflevector <4 x float> %output1.z, <4 x float> %output1.w, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %298 = shufflevector <4 x float> %output1.x, <4 x float> %output1.y, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %299 = shufflevector <4 x float> %output1.z, <4 x float> %output1.w, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %t070 = bitcast <4 x float> %296 to <2 x double>
  %t171 = bitcast <4 x float> %297 to <2 x double>
  %t272 = bitcast <4 x float> %298 to <2 x double>
  %t373 = bitcast <4 x float> %299 to <2 x double>
  %300 = shufflevector <2 x double> %t070, <2 x double> %t171, <2 x i32> <i32 0, i32 2>
  %301 = shufflevector <2 x double> %t070, <2 x double> %t171, <2 x i32> <i32 1, i32 3>
  %302 = shufflevector <2 x double> %t272, <2 x double> %t373, <2 x i32> <i32 0, i32 2>
  %303 = shufflevector <2 x double> %t272, <2 x double> %t373, <2 x i32> <i32 1, i32 3>
  %dst074 = bitcast <2 x double> %300 to <4 x float>
  %dst175 = bitcast <2 x double> %301 to <4 x float>
  %dst276 = bitcast <2 x double> %302 to <4 x float>
  %dst377 = bitcast <2 x double> %303 to <4 x float>
  %304 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 0
  %305 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 1
  %306 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 2
  %307 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 3
  %.data_ptr78 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %304, i32 0, i32 2
  %308 = getelementptr [3 x [4 x float]], [3 x [4 x float]]* %.data_ptr78, i32 0, i32 1, i32 0
  %309 = bitcast float* %308 to <4 x float>*
  store <4 x float> %dst074, <4 x float>* %309, align 4
  %.data_ptr79 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %305, i32 0, i32 2
  %310 = getelementptr [3 x [4 x float]], [3 x [4 x float]]* %.data_ptr79, i32 0, i32 1, i32 0
  %311 = bitcast float* %310 to <4 x float>*
  store <4 x float> %dst175, <4 x float>* %311, align 4
  %.data_ptr80 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %306, i32 0, i32 2
  %312 = getelementptr [3 x [4 x float]], [3 x [4 x float]]* %.data_ptr80, i32 0, i32 1, i32 0
  %313 = bitcast float* %312 to <4 x float>*
  store <4 x float> %dst276, <4 x float>* %313, align 4
  %.data_ptr81 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %307, i32 0, i32 2
  %314 = getelementptr [3 x [4 x float]], [3 x [4 x float]]* %.data_ptr81, i32 0, i32 1, i32 0
  %315 = bitcast float* %314 to <4 x float>*
  store <4 x float> %dst377, <4 x float>* %315, align 4
  %output2.x = load <4 x float>, <4 x float>* %output31
  %output2.y = load <4 x float>, <4 x float>* %output32
  %output2.z = load <4 x float>, <4 x float>* %output33
  %output2.w = load <4 x float>, <4 x float>* %output34
  %316 = shufflevector <4 x float> %output2.x, <4 x float> %output2.y, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %317 = shufflevector <4 x float> %output2.z, <4 x float> %output2.w, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %318 = shufflevector <4 x float> %output2.x, <4 x float> %output2.y, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %319 = shufflevector <4 x float> %output2.z, <4 x float> %output2.w, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %t082 = bitcast <4 x float> %316 to <2 x double>
  %t183 = bitcast <4 x float> %317 to <2 x double>
  %t284 = bitcast <4 x float> %318 to <2 x double>
  %t385 = bitcast <4 x float> %319 to <2 x double>
  %320 = shufflevector <2 x double> %t082, <2 x double> %t183, <2 x i32> <i32 0, i32 2>
  %321 = shufflevector <2 x double> %t082, <2 x double> %t183, <2 x i32> <i32 1, i32 3>
  %322 = shufflevector <2 x double> %t284, <2 x double> %t385, <2 x i32> <i32 0, i32 2>
  %323 = shufflevector <2 x double> %t284, <2 x double> %t385, <2 x i32> <i32 1, i32 3>
  %dst086 = bitcast <2 x double> %320 to <4 x float>
  %dst187 = bitcast <2 x double> %321 to <4 x float>
  %dst288 = bitcast <2 x double> %322 to <4 x float>
  %dst389 = bitcast <2 x double> %323 to <4 x float>
  %324 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 0
  %325 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 1
  %326 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 2
  %327 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %37, i32 3
  %.data_ptr90 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %324, i32 0, i32 2
  %328 = getelementptr [3 x [4 x float]], [3 x [4 x float]]* %.data_ptr90, i32 0, i32 2, i32 0
  %329 = bitcast float* %328 to <4 x float>*
  store <4 x float> %dst086, <4 x float>* %329, align 4
  %.data_ptr91 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %325, i32 0, i32 2
  %330 = getelementptr [3 x [4 x float]], [3 x [4 x float]]* %.data_ptr91, i32 0, i32 2, i32 0
  %331 = bitcast float* %330 to <4 x float>*
  store <4 x float> %dst187, <4 x float>* %331, align 4
  %.data_ptr92 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %326, i32 0, i32 2
  %332 = getelementptr [3 x [4 x float]], [3 x [4 x float]]* %.data_ptr92, i32 0, i32 2, i32 0
  %333 = bitcast float* %332 to <4 x float>*
  store <4 x float> %dst288, <4 x float>* %333, align 4
  %.data_ptr93 = getelementptr { i32, [4 x float], [3 x [4 x float]] }, { i32, [4 x float], [3 x [4 x float]] }* %327, i32 0, i32 2
  %334 = getelementptr [3 x [4 x float]], [3 x [4 x float]]* %.data_ptr93, i32 0, i32 2, i32 0
  %335 = bitcast float* %334 to <4 x float>*
  store <4 x float> %dst389, <4 x float>* %335, align 4
  %336 = add i32 %36, 4
  store i32 %336, i32* %loop_counter
  %337 = icmp uge i32 %336, %count
  br i1 %337, label %loop_end, label %loop_begin

loop_end:                                         ; preds = %endif-block13
  %338 = load i32, i32* %loop_counter
  %339 = load <4 x i32>, <4 x i32>* %3
  %340 = bitcast <4 x i32> %339 to i128
  %341 = icmp ne i128 %340, 0
  %342 = zext i1 %341 to i8
  ret i8 %342
}

llc -mattr option(s): +sse2,+cx16,-tbm,-avx512ifma,-avx512dq,-fma4,+prfchw,-bmi2,-xsavec,-fsgsbase,-popcnt,-aes,-pcommit,-xsaves,-avx512er,-clwb,-avx512f,-pku,-smap,+mmx,-xop,-rdseed,-hle,-sse4a,-avx512bw,-clflushopt,-xsave,-avx512vl,-invpcid,-avx512cd,-avx,-rtm,-fma,-bmi,-mwaitx,-rdrnd,-sse4.1,-sse4.2,-avx2,+sse,-lzcnt,-pclmul,-prefetchwt1,-f16c,-ssse3,-sgx,+cmov,-avx512vbmi,-movbe,-xsaveopt,-sha,-adx,-avx512pf,+sse3
llc -mcpu option: k8-sse3
draw_llvm_vs_variant0:
     0:		pushq	%rbp
     1:		movq	%rsp, %rbp
     4:		pushq	%r15
     6:		pushq	%r14
     8:		pushq	%r13
    10:		pushq	%r12
    12:		pushq	%rbx
    13:		andq	$-32, %rsp
    17:		subq	$384, %rsp
    24:		movq	%rdi, %r15
    27:		movq	48(%rbp), %r9
    31:		movq	16(%rbp), %rax
    35:		leal	-1(%rcx), %ebx
    38:		movd	%ebx, %xmm0
    42:		pshufd	$0, %xmm0, %xmm1
    47:		movd	%r8d, %xmm0
    52:		pshufd	$0, %xmm0, %xmm0
    57:		movdqa	%xmm0, 48(%rsp)
    63:		movzwl	(%rax), %edi
    66:		movl	%edi, (%rsp)
    69:		movslq	4(%rax), %r8
    73:		movq	(%rdx), %r14
    76:		movl	8(%rdx), %eax
    79:		movl	%eax, %ebx
    81:		subl	$11, %ebx
    84:		setb	%dil
    88:		subl	%r8d, %ebx
    91:		setb	%r12b
    95:		leal	12(%r8), %r13d
    99:		addq	%r14, %r8
   102:		xorl	%r10d, %r10d
   105:		orb	%dil, %r12b
   108:		cmovnel	%r10d, %ebx
   112:		leaq	320(%rsp), %r11
   120:		cmovneq	%r11, %r8
   124:		subl	$15, %eax
   127:		setb	%r12b
   131:		subl	%r13d, %eax
   134:		setb	%dil
   138:		movslq	%r13d, %r13
   141:		addq	%r14, %r13
   144:		orb	%r12b, %dil
   147:		cmovnel	%r10d, %eax
   151:		cmovneq	%r11, %r13
   155:		xorps	%xmm5, %xmm5
   158:		movabsq	$140126383685632, %rdi
   168:		movaps	(%rdi), %xmm0
   171:		movaps	%xmm0, 256(%rsp)
   179:		movabsq	$140126383685648, %rdi
   189:		movdqa	(%rdi), %xmm4
   193:		movdqa	%xmm1, 272(%rsp)
   202:		pxor	%xmm4, %xmm1
   206:		movdqa	%xmm1, 224(%rsp)
   215:		movd	(%rsp), %xmm0
   220:		pshufd	$0, %xmm0, %xmm0
   225:		movdqa	%xmm0, 208(%rsp)
   234:		movd	%ebx, %xmm0
   238:		pshufd	$0, %xmm0, %xmm0
   243:		pxor	%xmm4, %xmm0
   247:		movdqa	%xmm0, 192(%rsp)
   256:		movabsq	$140126383685664, %rdi
   266:		movaps	(%rdi), %xmm0
   269:		movaps	%xmm0, 16(%rsp)
   274:		movd	%eax, %xmm0
   278:		pshufd	$0, %xmm0, %xmm0
   283:		pxor	%xmm4, %xmm0
   287:		movdqa	%xmm0, 176(%rsp)
   296:		movabsq	$140126383685680, %rax
   306:		movaps	(%rax), %xmm0
   309:		movaps	%xmm0, 160(%rsp)
   317:		movabsq	$140126383685696, %rax
   327:		movaps	(%rax), %xmm0
   330:		movaps	%xmm0, 144(%rsp)
   338:		movabsq	$140126383685712, %rax
   348:		movaps	(%rax), %xmm0
   351:		movaps	%xmm0, 128(%rsp)
   359:		movabsq	$140126383685728, %rax
   369:		movaps	(%rax), %xmm0
   372:		movaps	%xmm0, 112(%rsp)
   377:		movabsq	$140126383685744, %rax
   387:		movaps	(%rax), %xmm0
   390:		movaps	%xmm0, 96(%rsp)
   395:		movabsq	$140126383685760, %rax
   405:		movaps	(%rax), %xmm0
   408:		movaps	%xmm0, 80(%rsp)
   413:		movabsq	$140126383685776, %rax
   423:		movaps	(%rax), %xmm0
   426:		movaps	%xmm0, 64(%rsp)
   431:		movdqa	%xmm4, 240(%rsp)
   440:		nopl	(%rax,%rax)
   448:		movslq	%r10d, %rdi
   451:		imulq	$68, %rdi, %r11
   455:		leaq	(%rsi,%r11), %rax
   459:		testq	%r9, %r9
   462:		movd	%edi, %xmm0
   466:		pshufd	$0, %xmm0, %xmm1
   471:		paddd	256(%rsp), %xmm1
   480:		movdqa	%xmm1, %xmm2
   484:		pxor	%xmm4, %xmm2
   488:		movdqa	224(%rsp), %xmm0
   497:		pcmpgtd	%xmm2, %xmm0
   501:		pand	%xmm0, %xmm1
   505:		pandn	272(%rsp), %xmm0
   514:		por	%xmm1, %xmm0
   518:		movaps	%xmm5, 304(%rsp)
   526:		je	80
   528:		pslld	$2, %xmm0
   533:		pshufd	$78, %xmm0, %xmm1
   538:		movd	%xmm1, %rdi
   543:		movd	%xmm0, %rbx
   548:		movslq	%ebx, %r14
   551:		sarq	$32, %rbx
   555:		movslq	%edi, %rdx
   558:		sarq	$32, %rdi
   562:		movd	(%r9,%rbx), %xmm1
   568:		movd	(%r9,%rdi), %xmm0
   574:		punpckldq	%xmm0, %xmm1
   578:		movd	(%r9,%r14), %xmm0
   584:		movd	(%r9,%rdx), %xmm2
   590:		punpckldq	%xmm2, %xmm0
   594:		punpckldq	%xmm1, %xmm0
   598:		jmp	14
   600:		nopl	(%rax,%rax)
   608:		paddd	48(%rsp), %xmm0
   614:		pshufd	$245, %xmm0, %xmm1
   619:		movdqa	208(%rsp), %xmm2
   628:		pmuludq	%xmm2, %xmm1
   632:		pshufd	$232, %xmm1, %xmm1
   637:		pmuludq	%xmm2, %xmm0
   641:		pshufd	$232, %xmm0, %xmm0
   646:		punpckldq	%xmm1, %xmm0
   650:		movdqa	%xmm0, %xmm3
   654:		pxor	%xmm4, %xmm3
   658:		movdqa	192(%rsp), %xmm7
   667:		pcmpgtd	%xmm3, %xmm7
   671:		movdqa	%xmm7, %xmm1
   675:		pand	%xmm0, %xmm1
   679:		pshufd	$78, %xmm1, %xmm2
   684:		movd	%xmm2, %rdx
   689:		movd	%xmm1, %rdi
   694:		movslq	%edi, %rbx
   697:		movsd	(%r8,%rbx), %xmm11
   703:		movss	8(%r8,%rbx), %xmm1
   710:		shufps	$48, %xmm11, %xmm1
   715:		shufps	$132, %xmm1, %xmm11
   720:		sarq	$32, %rdi
   724:		movsd	(%r8,%rdi), %xmm1
   730:		movss	8(%r8,%rdi), %xmm2
   737:		shufps	$48, %xmm1, %xmm2
   741:		shufps	$132, %xmm2, %xmm1
   745:		movslq	%edx, %rdi
   748:		movsd	(%r8,%rdi), %xmm2
   754:		movss	8(%r8,%rdi), %xmm4
   761:		shufps	$48, %xmm2, %xmm4
   765:		shufps	$132, %xmm4, %xmm2
   769:		sarq	$32, %rdx
   773:		movsd	(%r8,%rdx), %xmm4
   779:		movss	8(%r8,%rdx), %xmm5
   786:		shufps	$48, %xmm4, %xmm5
   790:		shufps	$132, %xmm5, %xmm4
   794:		movaps	%xmm11, %xmm6
   798:		punpckldq	%xmm1, %xmm6
   802:		movaps	%xmm2, %xmm5
   805:		punpckldq	%xmm4, %xmm5
   809:		punpckhdq	%xmm1, %xmm11
   814:		punpckhdq	%xmm4, %xmm2
   818:		movdqa	%xmm6, %xmm10
   823:		punpcklqdq	%xmm5, %xmm10
   828:		pand	%xmm7, %xmm10
   833:		punpckhqdq	%xmm5, %xmm6
   837:		pand	%xmm7, %xmm6
   841:		punpcklqdq	%xmm2, %xmm11
   846:		pand	%xmm7, %xmm11
   851:		pand	16(%rsp), %xmm7
   857:		movdqa	176(%rsp), %xmm1
   866:		pcmpgtd	%xmm3, %xmm1
   870:		pand	%xmm1, %xmm0
   874:		pshufd	$78, %xmm0, %xmm2
   879:		movd	%xmm2, %rdx
   884:		movd	%xmm0, %rdi
   889:		movslq	%edi, %rbx
   892:		movdqu	(%r13,%rbx), %xmm8
   899:		sarq	$32, %rdi
   903:		movdqu	(%r13,%rdi), %xmm0
   910:		movslq	%edx, %rdi
   913:		movdqu	(%r13,%rdi), %xmm2
   920:		sarq	$32, %rdx
   924:		movdqu	(%r13,%rdx), %xmm3
   931:		movdqa	%xmm8, %xmm15
   936:		punpckldq	%xmm0, %xmm15
   941:		movdqa	%xmm2, %xmm4
   945:		punpckldq	%xmm3, %xmm4
   949:		punpckhdq	%xmm0, %xmm8
   954:		punpckhdq	%xmm3, %xmm2
   958:		movdqa	%xmm15, %xmm14
   963:		punpcklqdq	%xmm4, %xmm14
   968:		pand	%xmm1, %xmm14
   973:		punpckhqdq	%xmm4, %xmm15
   978:		pand	%xmm1, %xmm15
   983:		movdqa	%xmm8, %xmm12
   988:		punpcklqdq	%xmm2, %xmm12
   993:		pand	%xmm1, %xmm12
   998:		punpckhqdq	%xmm2, %xmm8
  1003:		pand	%xmm1, %xmm8
  1008:		movdqa	%xmm10, %xmm0
  1013:		unpcklps	%xmm6, %xmm0
  1016:		movdqa	%xmm11, %xmm2
  1021:		unpcklps	%xmm7, %xmm2
  1024:		movdqa	%xmm10, %xmm1
  1029:		unpckhps	%xmm6, %xmm1
  1032:		movdqa	%xmm11, %xmm13
  1037:		unpckhps	%xmm7, %xmm13
  1041:		movaps	%xmm0, %xmm3
  1044:		unpcklpd	%xmm2, %xmm3
  1048:		movapd	%xmm3, 288(%rsp)
  1057:		movhlps	%xmm0, %xmm2
  1060:		movaps	%xmm2, (%rsp)
  1064:		movaps	%xmm1, %xmm0
  1067:		unpcklpd	%xmm13, %xmm0
  1072:		movapd	%xmm0, 32(%rsp)
  1078:		movhlps	%xmm1, %xmm13
  1082:		movdqa	%xmm10, %xmm5
  1087:		cmpnleps	%xmm7, %xmm5
  1091:		movaps	%xmm7, %xmm2
  1094:		movaps	%xmm7, %xmm0
  1097:		addps	%xmm10, %xmm0
  1101:		xorpd	%xmm3, %xmm3
  1105:		cmpnleps	%xmm0, %xmm3
  1109:		addps	%xmm6, %xmm2
  1112:		xorps	%xmm1, %xmm1
  1115:		cmpnleps	%xmm2, %xmm1
  1119:		movaps	%xmm6, %xmm2
  1122:		cmpnleps	%xmm7, %xmm2
  1126:		movdqa	%xmm11, %xmm9
  1131:		cmpnleps	%xmm7, %xmm9
  1136:		movaps	16(%rsp), %xmm4
  1141:		divps	%xmm7, %xmm4
  1144:		addps	%xmm11, %xmm7
  1148:		xorps	%xmm0, %xmm0
  1151:		cmpnleps	%xmm7, %xmm0
  1155:		andps	160(%rsp), %xmm5
  1163:		andps	80(%rsp), %xmm9
  1169:		orps	%xmm5, %xmm9
  1173:		andps	144(%rsp), %xmm3
  1181:		orps	%xmm3, %xmm9
  1185:		andps	112(%rsp), %xmm1
  1190:		orps	%xmm1, %xmm9
  1194:		andps	96(%rsp), %xmm0
  1199:		orps	%xmm0, %xmm9
  1203:		movaps	288(%rsp), %xmm7
  1211:		movups	%xmm7, 4(%rsi,%r11)
  1217:		movaps	(%rsp), %xmm0
  1221:		movups	%xmm0, 72(%rax)
  1225:		movaps	32(%rsp), %xmm0
  1230:		movups	%xmm0, 140(%rax)
  1237:		movups	%xmm13, 208(%rax)
  1245:		andps	128(%rsp), %xmm2
  1253:		orps	%xmm2, %xmm9
  1257:		movaps	304(%rsp), %xmm5
  1265:		orps	%xmm9, %xmm5
  1269:		movq	200(%r15), %rdx
  1276:		movss	(%rdx), %xmm0
  1280:		shufps	$0, %xmm0, %xmm0
  1284:		mulps	%xmm4, %xmm10
  1288:		mulps	%xmm0, %xmm10
  1292:		movss	12(%rdx), %xmm0
  1297:		shufps	$0, %xmm0, %xmm0
  1301:		addps	%xmm0, %xmm10
  1305:		movss	4(%rdx), %xmm0
  1310:		shufps	$0, %xmm0, %xmm0
  1314:		mulps	%xmm4, %xmm6
  1317:		mulps	%xmm0, %xmm6
  1320:		movss	16(%rdx), %xmm0
  1325:		shufps	$0, %xmm0, %xmm0
  1329:		addps	%xmm0, %xmm6
  1332:		movss	8(%rdx), %xmm0
  1337:		shufps	$0, %xmm0, %xmm0
  1341:		mulps	%xmm4, %xmm11
  1345:		mulps	%xmm0, %xmm11
  1349:		movss	20(%rdx), %xmm0
  1354:		shufps	$0, %xmm0, %xmm0
  1358:		addps	%xmm0, %xmm11
  1362:		movaps	%xmm10, %xmm0
  1366:		unpcklps	%xmm6, %xmm0
  1369:		unpckhps	%xmm6, %xmm10
  1373:		movaps	%xmm11, %xmm1
  1377:		unpcklps	%xmm4, %xmm1
  1380:		unpckhps	%xmm4, %xmm11
  1384:		movdqa	240(%rsp), %xmm4
  1393:		movaps	%xmm0, %xmm2
  1396:		unpcklpd	%xmm1, %xmm2
  1400:		movhlps	%xmm0, %xmm1
  1403:		movaps	%xmm10, %xmm0
  1407:		unpcklpd	%xmm11, %xmm0
  1412:		movhlps	%xmm10, %xmm11
  1416:		orps	64(%rsp), %xmm9
  1422:		movss	%xmm9, (%rax)
  1427:		pshufd	$229, %xmm9, %xmm3
  1433:		movd	%xmm3, 68(%rax)
  1438:		pshufd	$78, %xmm9, %xmm3
  1444:		movd	%xmm3, 136(%rax)
  1452:		pshufd	$231, %xmm9, %xmm3
  1458:		movd	%xmm3, 204(%rax)
  1466:		movupd	%xmm2, 20(%rsi,%r11)
  1473:		movups	%xmm1, 88(%rax)
  1477:		movupd	%xmm0, 156(%rax)
  1485:		movups	%xmm11, 224(%rax)
  1493:		movdqa	%xmm14, %xmm0
  1498:		unpcklps	%xmm15, %xmm0
  1502:		unpckhps	%xmm15, %xmm14
  1506:		movdqa	%xmm12, %xmm1
  1511:		unpcklps	%xmm8, %xmm1
  1515:		unpckhps	%xmm8, %xmm12
  1519:		movaps	%xmm0, %xmm2
  1522:		unpcklpd	%xmm1, %xmm2
  1526:		movhlps	%xmm0, %xmm1
  1529:		movaps	%xmm14, %xmm0
  1533:		unpcklpd	%xmm12, %xmm0
  1538:		movhlps	%xmm14, %xmm12
  1542:		movupd	%xmm2, 36(%rsi,%r11)
  1549:		movups	%xmm1, 104(%rax)
  1553:		movupd	%xmm0, 172(%rax)
  1561:		movups	%xmm12, 240(%rax)
  1569:		movups	%xmm7, 52(%rsi,%r11)
  1575:		movaps	(%rsp), %xmm0
  1579:		movups	%xmm0, 120(%rax)
  1583:		movaps	32(%rsp), %xmm0
  1588:		movups	%xmm0, 188(%rax)
  1595:		movups	%xmm13, 256(%rax)
  1603:		addl	$4, %r10d
  1607:		cmpl	%ecx, %r10d
  1610:		jb	-1168
  1616:		movd	%xmm5, %rax
  1621:		pshufd	$78, %xmm5, %xmm0
  1626:		movd	%xmm0, %rcx
  1631:		orq	%rax, %rcx
  1634:		setne	%al
  1637:		leaq	-40(%rbp), %rsp
  1641:		popq	%rbx
  1642:		popq	%r12
  1644:		popq	%r13
  1646:		popq	%r14
  1648:		popq	%r15
  1650:		popq	%rbp
  1651:		retq