diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index d930f09..d565996 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1648,10 +1648,62 @@ lp_build_floor(struct lp_build_context *bld,
       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
    }
    else {
-      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
+      const struct lp_type type = bld->type;
       LLVMValueRef res;
-      res = lp_build_ifloor(bld, a);
-      res = LLVMBuildSIToFP(builder, res, vec_type, "");
+      LLVMTypeRef int_vec_type = bld->int_vec_type;
+      LLVMTypeRef vec_type = bld->vec_type;
+
+      if (type.sign) {
+         struct lp_type inttype;
+         struct lp_build_context intbld;
+         LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
+         LLVMValueRef trunc, truncminusone, mask, anosign;
+
+         assert(type.floating);
+         assert(type.width == 32); /* might want to handle doubles at some point */
+         assert(lp_check_value(type, a));
+
+         inttype = type;
+         inttype.floating = 0;
+         lp_build_context_init(&intbld, bld->gallivm, inttype);
+
+         /* round by truncation (float -> int -> float) */
+         trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
+         trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
+
+         /*
+          * fix values if rounding is wrong (for non-special cases)
+          * - this is the case if trunc > a, then one must be subtracted
+          */
+         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
+         /*
+          * instead of sub/select could do
+          * resint = add(itrunc, mask) (mask is minus one / zero)
+          * res = SiToFP(resint)
+          */
+         truncminusone = lp_build_sub(bld, trunc, bld->one);
+         res = lp_build_select(bld, mask, truncminusone, trunc);
+
+         /* mask out sign bit */
+         anosign = lp_build_abs(bld, a);
+         /*
+          * mask out all values if anosign > 2^24 (cmpval is built as 1<<24,
+          * since '^' is xor in C). This should work both for large ints (floor
+          * is a no-op for them because such floats are always exact) as well
+          * as special cases like NaNs, Infs (they use max exponent).
+          * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
+          */
+         anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
+         cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
+         mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
+         res = lp_build_select(bld, mask, a, res);
+      }
+      else {
+         /* round by truncation */
+         res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
+         res = LLVMBuildSIToFP(builder, res, vec_type, "floor.trunc");
+
+      }
       return res;
    }
 }
@@ -1826,32 +1878,30 @@ lp_build_ifloor(struct lp_build_context *bld,
          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
       }
       else {
-         /* Take the sign bit and add it to 1 constant */
-         LLVMTypeRef vec_type = bld->vec_type;
-         unsigned mantissa = lp_mantissa(type);
-         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
-                                  (unsigned long long)1 << (type.width - 1));
-         LLVMValueRef sign;
-         LLVMValueRef offset;
+         struct lp_type inttype;
+         struct lp_build_context intbld;
+         LLVMValueRef trunc, itrunc, mask;
 
-         /* sign = a < 0 ? ~0 : 0 */
-         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
-         sign = LLVMBuildAnd(builder, sign, mask, "");
-         sign = LLVMBuildAShr(builder, sign,
-                              lp_build_const_int_vec(bld->gallivm, type,
-                                                     type.width - 1),
-                              "ifloor.sign");
+         assert(type.floating);
+         assert(lp_check_value(type, a));
 
-         /* offset = -0.99999(9)f */
-         offset = lp_build_const_vec(bld->gallivm, type,
-                                     -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
-         offset = LLVMConstBitCast(offset, int_vec_type);
+         inttype = type;
+         inttype.floating = 0;
+         lp_build_context_init(&intbld, bld->gallivm, inttype);
 
-         /* offset = a < 0 ? offset : 0.0f */
-         offset = LLVMBuildAnd(builder, offset, sign, "");
-         offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");
+         /* round by truncation, keeping both the int and the float result */
+         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
+         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
 
-         res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
+         /*
+          * fix values if rounding is wrong (for non-special cases)
+          * - this is the case if trunc > a, i.e. itrunc is one too large
+          * The results of doing this with NaNs, very large values etc.
+          * are undefined, but that seems to have been the case anyway.
+          */
+         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
+         /* cheap minus one: the mask is minus one / zero, so just add it */
+         return lp_build_add(&intbld, itrunc, mask);
       }
    }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 050eba7..d677dbb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -489,7 +489,7 @@ lp_build_init(void)
 
    gallivm_initialized = TRUE;
 
-#if 0
+#if 0 /* debug-only override; must stay disabled in committed code */
    /* For simulating less capable machines */
    util_cpu_caps.has_sse3 = 0;
    util_cpu_caps.has_ssse3 = 0;