From d8e683386f8c04a875c16deaad737a2fc3ffa89b Mon Sep 17 00:00:00 2001
From: Ray Strode <rstrode@redhat.com>
Date: Fri, 21 Apr 2017 15:22:14 -0400
Subject: [PATCH] gallivm: fix misrendering on big-endian systems

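When unpacking formats whose block is wider than one vector element,
select the gathered vector for each channel by channel index (dst[i])
instead of deriving it from the channel shift (chan_desc.shift /
type.width).

I am not sure this change is right; I only know that it makes the
problem go away. Note that dst[] only holds num_gather vectors, so
indexing it by channel presumably needs re-checking for formats with
more channels than gathered vectors (e.g. 4x16-bit).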
---
 src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 98eb694..a0a5507 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -413,442 +413,441 @@ void
 lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                         const struct util_format_description *format_desc,
                         struct lp_type type,
                         boolean aligned,
                         LLVMValueRef base_ptr,
                         LLVMValueRef offset,
                         LLVMValueRef i,
                         LLVMValueRef j,
                         LLVMValueRef cache,
                         LLVMValueRef rgba_out[4])
 {
    LLVMBuilderRef builder = gallivm->builder;
    enum pipe_format format = format_desc->format;
    struct lp_type fetch_type;
 
    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
        format_desc->block.width == 1 &&
        format_desc->block.height == 1 &&
        format_desc->block.bits <= type.width &&
        (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
         format_desc->channel[0].size == 32 ||
         format_desc->channel[0].size == 16))
    {
       /*
        * The packed pixel fits into an element of the destination format. Put
        * the packed pixels into a vector and extract each component for all
        * vector elements in parallel.
        */
 
       LLVMValueRef packed;
 
       /*
        * gather the texels from the texture
        * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
        */
       assert(format_desc->block.bits <= type.width);
       fetch_type = lp_type_uint(type.width);
       packed = lp_build_gather(gallivm,
                                type.length,
                                format_desc->block.bits,
                                fetch_type,
                                aligned,
                                base_ptr, offset, FALSE);
 
       /*
        * convert texels to float rgba
        */
       lp_build_unpack_rgba_soa(gallivm,
                                format_desc,
                                type,
                                packed, rgba_out);
       return;
    }
 
 
    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
        format_desc->block.width == 1 &&
        format_desc->block.height == 1 &&
        format_desc->block.bits > type.width &&
        ((format_desc->block.bits <= type.width * type.length &&
          format_desc->channel[0].size <= type.width) ||
         (format_desc->channel[0].size == 64 &&
          format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
          type.floating)))
    {
       /*
        * Similar to above, but the packed pixel is larger than what fits
        * into an element of the destination format. The packed pixels will be
        * shuffled into SoA vectors appropriately, and then the extraction will
        * be done in parallel as much as possible.
        * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
        * the gathered vectors can be shuffled easily (even with avx).
        * 64xn float -> 32xn float is handled too but it's a bit special as
        * it does the conversion pre-shuffle.
        */
 
       LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
       struct lp_type fetch_type, gather_type = type;
       unsigned num_gather, fetch_width, i, j;
       struct lp_build_context bld;
       boolean fp64 = format_desc->channel[0].size == 64;
 
       lp_build_context_init(&bld, gallivm, type);
 
       assert(type.width == 32);
       assert(format_desc->block.bits > type.width);
 
       /*
        * First, figure out fetch order.
        */
       fetch_width = util_next_power_of_two(format_desc->block.bits);
       /*
        * fp64 are treated like fp32 except we fetch twice wide values
        * (as we shuffle after trunc). The shuffles for that work out
        * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
        * albeit we miss the potential opportunity for hw gather (as it
        * only handles native size).
        */
       num_gather = fetch_width / type.width;
       gather_type.width *= num_gather;
       if (fp64) {
          num_gather /= 2;
       }
       gather_type.length /= num_gather;
 
       for (i = 0; i < num_gather; i++) {
          LLVMValueRef offsetr, shuf_vec;
          if(num_gather == 4) {
             for (j = 0; j < gather_type.length; j++) {
                unsigned idx = i + 4*j;
                shuffles[j] = lp_build_const_int32(gallivm, idx);
             }
             shuf_vec = LLVMConstVector(shuffles, gather_type.length);
             offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
 
          }
          else if (num_gather == 2) {
             assert(num_gather == 2);
             for (j = 0; j < gather_type.length; j++) {
                unsigned idx = i*2 + (j%2) + (j/2)*4;
                shuffles[j] = lp_build_const_int32(gallivm, idx);
             }
             shuf_vec = LLVMConstVector(shuffles, gather_type.length);
             offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
          }
          else {
             assert(num_gather == 1);
             offsetr = offset;
          }
          if (gather_type.length == 1) {
             LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
             offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
          }
 
          /*
           * Determine whether to use float or int loads. This is mostly
           * to outsmart the (stupid) llvm int/float shuffle logic, we
           * don't really care much if the data is floats or ints...
           * But llvm will refuse to use single float shuffle with int data
           * and instead use 3 int shuffles instead, the code looks atrocious.
           * (Note bitcasts often won't help, as llvm is too smart to be
           * fooled by that.)
           * Nobody cares about simd float<->int domain transition penalties,
           * which usually don't even exist for shuffles anyway.
           * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
           * going into transpose, which is unpacks, so doesn't really matter
           * much).
           * With 2x32bit or 4x16bit fetch, we use float vec, since those
           * go into the weird channel separation shuffle. With floats,
           * this is (with 128bit vectors):
           * - 2 movq, 2 movhpd, 2 shufps
           * With ints it would be:
           * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
           * I've seen texture functions increase in code size by 15% just due
           * to that (there's lots of such fetches in them...)
           * (We could chose a different gather order to improve this somewhat
           * for the int path, but it would basically just drop the blends,
           * so the float path with this order really is optimal.)
           * Albeit it is tricky sometimes llvm doesn't ignore the float->int
           * casts so must avoid them until we're done with the float shuffle...
           * 3x16bit formats (the same is also true for 3x8) are pretty bad but
           * there's nothing we can do about them (we could overallocate by
           * those couple bytes and use unaligned but pot sized load).
           * Note that this is very much x86 specific. I don't know if this
           * affect other archs at all.
           */
          if (num_gather > 1) {
             /*
              * We always want some float type here (with x86)
              * due to shuffles being float ones afterwards (albeit for
              * the num_gather == 4 case int should work fine too
              * (unless there's some problems with avx but not avx2).
              */
             if (format_desc->channel[0].size == 64) {
                fetch_type = lp_type_float_vec(64, gather_type.width);
             } else {
                fetch_type = lp_type_int_vec(32, gather_type.width);
             }
          }
          else {
             /* type doesn't matter much */
             if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
                 (format_desc->channel[0].size == 32 ||
                  format_desc->channel[0].size == 64)) {
             fetch_type = lp_type_float(gather_type.width);
             } else {
                fetch_type = lp_type_uint(gather_type.width);
             }
          }
 
          /* Now finally gather the values */
          packed[i] = lp_build_gather(gallivm, gather_type.length,
                                      format_desc->block.bits,
                                      fetch_type, aligned,
                                      base_ptr, offsetr, FALSE);
          if (fp64) {
             struct lp_type conv_type = type;
             conv_type.width *= 2;
             packed[i] = LLVMBuildBitCast(builder, packed[i],
                                          lp_build_vec_type(gallivm, conv_type), "");
             packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
          }
       }
 
       /* shuffle the gathered values to SoA */
       if (num_gather == 2) {
          for (i = 0; i < num_gather; i++) {
             for (j = 0; j < type.length; j++) {
                unsigned idx = (j%2)*2 + (j/4)*4 + i;
                if ((j/2)%2)
                   idx += type.length;
                shuffles[j] = lp_build_const_int32(gallivm, idx);
             }
             dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
                                             LLVMConstVector(shuffles, type.length), "");
          }
       }
       else if (num_gather == 4) {
          lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
       }
       else {
          assert(num_gather == 1);
          dst[0] = packed[0];
       }
 
       /*
        * And finally unpack exactly as above, except that
        * chan shift is adjusted and the right vector selected.
        */
       if (!fp64) {
          for (i = 0; i < num_gather; i++) {
             dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
          }
          for (i = 0; i < format_desc->nr_channels; i++) {
             struct util_format_channel_description chan_desc = format_desc->channel[i];
             unsigned blockbits = type.width;
-            unsigned vec_nr = chan_desc.shift / type.width;
             chan_desc.shift %= type.width;
 
             output[i] = lp_build_extract_soa_chan(&bld,
                                                   blockbits,
                                                   FALSE,
                                                   chan_desc,
-                                                  dst[vec_nr]);
+                                                  dst[i]);
          }
       }
       else {
          for (i = 0; i < format_desc->nr_channels; i++)  {
             output[i] = dst[i];
          }
       }
 
       lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
       return;
    }
 
    if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
        format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
       /*
        * similar conceptually to above but requiring special
        * AoS packed -> SoA float conversion code.
        */
       LLVMValueRef packed;
       struct lp_type fetch_type = lp_type_uint(type.width);
 
       assert(type.floating);
       assert(type.width == 32);
 
       packed = lp_build_gather(gallivm, type.length,
                                format_desc->block.bits,
                                fetch_type, aligned,
                                base_ptr, offset, FALSE);
       if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
          lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
       }
       else {
          lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
       }
       return;
    }
 
    if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
        format_desc->block.bits == 64) {
       /*
        * special case the format is 64 bits but we only require
        * 32bit (or 8bit) from each block.
        */
       LLVMValueRef packed;
       struct lp_type fetch_type = lp_type_uint(type.width);
 
       if (format == PIPE_FORMAT_X32_S8X24_UINT) {
          /*
           * for stencil simply fix up offsets - could in fact change
           * base_ptr instead even outside the shader.
           */
          unsigned mask = (1 << 8) - 1;
          LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
          offset = LLVMBuildAdd(builder, offset, s_offset, "");
          packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                   aligned, base_ptr, offset, FALSE);
          packed = LLVMBuildAnd(builder, packed,
                                lp_build_const_int_vec(gallivm, type, mask), "");
       }
       else {
          assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
          packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                   aligned, base_ptr, offset, TRUE);
          packed = LLVMBuildBitCast(builder, packed,
                                    lp_build_vec_type(gallivm, type), "");
       }
       /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
       rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
       rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
       return;
    }
 
    /*
     * Try calling lp_build_fetch_rgba_aos for all pixels.
     * Should only really hit subsampled, compressed
     * (for s3tc srgb too, for rgtc the unorm ones only) by now.
     * (This is invalid for plain 8unorm formats because we're lazy with
     * the swizzle since some results would arrive swizzled, some not.)
     */
 
    if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
        (util_format_fits_8unorm(format_desc) ||
         format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
        type.floating && type.width == 32 &&
        (type.length == 1 || (type.length % 4 == 0))) {
       struct lp_type tmp_type;
       struct lp_build_context bld;
       LLVMValueRef packed, rgba[4];
       const struct util_format_description *flinear_desc;
       const struct util_format_description *frgba8_desc;
       unsigned chan;
 
       lp_build_context_init(&bld, gallivm, type);
 
       /*
        * Make sure the conversion in aos really only does convert to rgba8
        * and not anything more (so use linear format, adjust type).
        */
       flinear_desc = util_format_description(util_format_linear(format));
       memset(&tmp_type, 0, sizeof tmp_type);
       tmp_type.width = 8;
       tmp_type.length = type.length * 4;
       tmp_type.norm = TRUE;
 
       packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
                                        aligned, base_ptr, offset, i, j, cache);
       packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");
 
       /*
        * The values are now packed so they match ordinary (srgb) RGBA8 format,
        * hence need to use matching format for unpack.
        */
       frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
       if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
          assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
          frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
       }
       lp_build_unpack_rgba_soa(gallivm,
                                frgba8_desc,
                                type,
                                packed, rgba);
 
       /*
        * We converted 4 channels. Make sure llvm can drop unneeded ones
        * (luckily the rgba order is fixed, only LA needs special case).
        */
       for (chan = 0; chan < 4; chan++) {
          enum pipe_swizzle swizzle = format_desc->swizzle[chan];
          if (chan == 3 && util_format_is_luminance_alpha(format)) {
             swizzle = PIPE_SWIZZLE_W;
          }
          rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
       }
       return;
    }
 
 
    /*
     * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
     *
     * This is not the most efficient way of fetching pixels, as we
     * miss some opportunities to do vectorization, but this is
     * convenient for formats or scenarios for which there was no
     * opportunity or incentive to optimize.
     *
     * We do NOT want to end up here, this typically is quite terrible,
     * in particular if the formats have less than 4 channels.
     *
     * Right now, this should only be hit for:
     * - RGTC snorm formats
     *   (those miss fast fetch functions hence they are terrible anyway)
     */
 
    {
       unsigned k;
       struct lp_type tmp_type;
       LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
 
       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
          debug_printf("%s: AoS fetch fallback for %s\n",
                       __FUNCTION__, format_desc->short_name);
       }
 
       tmp_type = type;
       tmp_type.length = 4;
 
       /*
        * Note that vector transpose can be worse compared to insert/extract
        * for aos->soa conversion (for formats with 1 or 2 channels). However,
        * we should try to avoid getting here for just about all formats, so
        * don't bother.
        */
 
       /* loop over number of pixels */
       for(k = 0; k < type.length; ++k) {
          LLVMValueRef index = lp_build_const_int32(gallivm, k);
          LLVMValueRef offset_elem;
          LLVMValueRef i_elem, j_elem;
 
          offset_elem = LLVMBuildExtractElement(builder, offset,
                                                index, "");
 
          i_elem = LLVMBuildExtractElement(builder, i, index, "");
          j_elem = LLVMBuildExtractElement(builder, j, index, "");
 
          /* Get a single float[4]={R,G,B,A} pixel */
          aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                                 aligned, base_ptr, offset_elem,
                                                 i_elem, j_elem, cache);
 
       }
       convert_to_soa(gallivm, aos_fetch, rgba_out, type);
    }
 }
-- 
1.8.3.1