Created attachment 113313 [details] Full glxinfo output The following fragment shader code falls back to software emulation: uniform sampler2D tex; varying vec2 tx1,tx2,tx3,tx4,tx5; void main () { vec4 a = texture2D(tex,tx1); vec4 b = texture2D(tex,tx2); vec4 c = texture2D(tex,tx3); vec4 d = texture2D(tex,tx4); vec4 e = texture2D(tex,tx5); vec4 f = texture2D(tex,tx5); vec4 res = ( a * 0.3 + b * 0.1 + c * 0.2 + d * 0.08 + e* 0.2 + f * 0.099) / 6.456; gl_FragColor = res; } With output: # Fragment Program/Shader 9 0: TEX TEMP[0], INPUT[26].xyyy, texture[0], 2D; 1: TEX TEMP[1], INPUT[25].zwww, texture[0], 2D; 2: TEX TEMP[2], INPUT[25].xyyy, texture[0], 2D; 3: TEX TEMP[3], INPUT[24].zwww, texture[0], 2D; 4: TEX TEMP[4], INPUT[24].xyyy, texture[0], 2D; 5: MUL TEMP[5], TEMP[4], CONST[2].yyyy; 6: MAD TEMP[4], TEMP[3], CONST[2].xxxx, TEMP[5]; 7: MAD TEMP[3], TEMP[2], CONST[1].wwww, TEMP[4]; 8: MAD TEMP[2], TEMP[1], CONST[1].zzzz, TEMP[3]; 9: MAD TEMP[1], TEMP[0], CONST[1].yyyy, TEMP[2]; 10: MAD TEMP[2], TEMP[0], CONST[1].xxxx, TEMP[1]; 11: MUL OUTPUT[2], TEMP[2], CONST[2].zzzz; 12: END i915_program_error: Exceeded max nr indirect texture lookups (6 out of 4) ENTER FALLBACK 10000: Program As I understand it, the varying packing optimization leads to a swizzle operation, which generates a dependent texture lookup for every single texture access. I consider this a bug, since it makes all shaders with more than 3 texture accesses unusable on i915 chips. The bug is reproduced on an Atom 230 board with Intel 945G video, running Ubuntu 14.04 with stock Mesa 10.1. A bit of GL info (more in attachment): OpenGL vendor string: Intel Open Source Technology Center OpenGL renderer string: Mesa DRI Intel(R) 945G OpenGL version string: 2.1 Mesa 10.1.3 OpenGL shading language version string: 1.20
I've done several extra tests: In the following program you can see, that calculations and texture accesses got mixed by GLSL optimizer, which leads to a lot of indirections by itself (even if swizzling in TEX instructions could be allowed): uniform sampler2D tex; varying vec2 tx1,tx2,tx3,tx4,tx5,tx0; void main () { vec4 a = texture2D(tex,tx1); vec4 b = texture2D(tex,tx2); vec4 c = texture2D(tex,tx3); vec4 d = texture2D(tex,tx4); vec4 e = texture2D(tex,tx5); vec4 f = texture2D(tex,tx0); vec4 res =( a * .3 + b * .1 + c* .2 + d * .08 + e* .02 + f * .099)/6.456; vec2 q = tx0 + vec2(0.9); vec2 w = tx1 + vec2(0.8); vec2 s = tx2 - vec2(0.7); vec2 r = tx3 + vec2(0.6); vec2 t = tx4 - vec2(0.5); vec2 y = tx5 + vec2(0.4); a = texture2D(tex,q); b = texture2D(tex,w); c = texture2D(tex,s); d = texture2D(tex,r); e = texture2D(tex,t); f = texture2D(tex,y); res +=( a* .3456 + b * .314 + c* .1 + d * .271 + e* .3 * .6 + f * 1.3)/7.627; gl_FragColor = res; } Output: # Fragment Program/Shader 9 0: TEX TEMP[0], INPUT[26].zwww, texture[0], 2D; 1: TEX TEMP[1], INPUT[26].xyyy, texture[0], 2D; 2: TEX TEMP[2], INPUT[25].zwww, texture[0], 2D; 3: TEX TEMP[3], INPUT[25].xyyy, texture[0], 2D; 4: TEX TEMP[4], INPUT[24].zwww, texture[0], 2D; 5: TEX TEMP[5], INPUT[24].xyyy, texture[0], 2D; 6: MUL TEMP[6], TEMP[5], CONST[2].yyyy; 7: MAD TEMP[5], TEMP[4], CONST[2].xxxx, TEMP[6]; 8: MAD TEMP[4], TEMP[3], CONST[1].wwww, TEMP[5]; 9: MAD TEMP[3], TEMP[2], CONST[1].zzzz, TEMP[4]; 10: MAD TEMP[2], TEMP[1], CONST[1].yyyy, TEMP[3]; 11: MAD TEMP[1], TEMP[0], CONST[1].xxxx, TEMP[2]; 12: MUL TEMP[0], TEMP[1], CONST[2].zzzz; 13: ADD TEMP[1].xy, INPUT[26].xyyy, CONST[3]; 14: TEX TEMP[2], TEMP[1].xyyy, texture[0], 2D; 15: ADD TEMP[1].xy, INPUT[25].zwww, CONST[4]; 16: TEX TEMP[3], TEMP[1].xyyy, texture[0], 2D; 17: ADD TEMP[1].xy, INPUT[25].xyyy, CONST[5]; 18: TEX TEMP[4], TEMP[1].xyyy, texture[0], 2D; 19: ADD TEMP[1].xy, INPUT[24].zwww, CONST[6]; 20: TEX TEMP[5], TEMP[1].xyyy, texture[0], 2D; 21: ADD TEMP[1].xy, 
INPUT[24].xyyy, CONST[7]; 22: TEX TEMP[6], TEMP[1].xyyy, texture[0], 2D; 23: ADD TEMP[1].xy, INPUT[26].zwww, CONST[8]; 24: TEX TEMP[7], TEMP[1].xyyy, texture[0], 2D; 25: MUL TEMP[1], TEMP[7], CONST[4].wwww; 26: MAD TEMP[7], TEMP[6], CONST[4].zzzz, TEMP[1]; 27: MAD TEMP[1], TEMP[5], CONST[2].xxxx, TEMP[7]; 28: MAD TEMP[5], TEMP[4], CONST[3].wwww, TEMP[1]; 29: MAD TEMP[1], CONST[3].zzzz, TEMP[3], TEMP[5]; 30: MAD TEMP[3], TEMP[2], CONST[2].wwww, TEMP[1]; 31: MAD OUTPUT[2], TEMP[3], CONST[5].zzzz, TEMP[0]; 32: END i915_program_error: Exceeded max nr indirect texture lookups (13 out of 4) ENTER FALLBACK 10000: Program Then I tried texture2DProj(), which requires vec3 or vec4, so varyings aren't packed for it. You can see both examples rewritten with it below. In the case of texture2DProj() with vec3 arguments, I got lots of extra moves in the output, so texture2DProj() with vec4 seems to be "native" for i945g (GMA 950): uniform sampler2D tex; varying vec4 tx1,tx2,tx3,tx4,tx5,tx0; void main () { vec4 a = texture2DProj(tex,tx1); vec4 b = texture2DProj(tex,tx2); vec4 c = texture2DProj(tex,tx3); vec4 d = texture2DProj(tex,tx4); vec4 e = texture2DProj(tex,tx5); vec4 f = texture2DProj(tex,tx0); vec4 res =( a * .3 + b * .1 + c* .2 + d * .8 + e* .2 + f * .99)/6.456; gl_FragColor = res; } 0: TXP TEMP[0], INPUT[29].xyyw, texture[0], 2D; 1: TXP TEMP[1], INPUT[28].xyyw, texture[0], 2D; 2: TXP TEMP[2], INPUT[27].xyyw, texture[0], 2D; 3: TXP TEMP[3], INPUT[26].xyyw, texture[0], 2D; 4: TXP TEMP[4], INPUT[25].xyyw, texture[0], 2D; 5: TXP TEMP[5], INPUT[24].xyyw, texture[0], 2D; 6: MUL TEMP[6], TEMP[5], CONST[2].xxxx; 7: MAD TEMP[5], TEMP[4], CONST[1].wwww, TEMP[6]; 8: MAD TEMP[4], TEMP[3], CONST[1].yyyy, TEMP[5]; 9: MAD TEMP[3], TEMP[2], CONST[1].zzzz, TEMP[4]; 10: MAD TEMP[2], TEMP[1], CONST[1].yyyy, TEMP[3]; 11: MAD TEMP[1], TEMP[0], CONST[1].xxxx, TEMP[2]; 12: MUL OUTPUT[2], TEMP[1], CONST[2].yyyy; 13: END i915_program_error: Exceeded max nr indirect texture lookups (7 out of 4) 
ENTER FALLBACK 10000: Program uniform sampler2D tex; varying vec4 tx1,tx2,tx3,tx4,tx5,tx0; void main () { vec4 a = texture2DProj(tex,tx1); vec4 b = texture2DProj(tex,tx2); vec4 c = texture2DProj(tex,tx3); vec4 d = texture2DProj(tex,tx4); vec4 e = texture2DProj(tex,tx5); vec4 f = texture2DProj(tex,tx0); vec4 res =( a * .3 + b * .1 + c* .2 + d * .8 + e* .2 + f * .99)/6.456; vec4 q = tx0 + vec4(0.9); vec4 w = tx1 + vec4(0.8); vec4 s = tx2 - vec4(0.7); vec4 r = tx3 + vec4(0.6); vec4 t = tx4 - vec4(0.5); vec4 y = tx5 + vec4(0.4); a = texture2DProj(tex,q); b = texture2DProj(tex,w); c = texture2DProj(tex,s); d = texture2DProj(tex,r); e = texture2DProj(tex,t); f = texture2DProj(tex,y); res +=( a* .3456 + b * .314 + c* .1 + d * .271 + e* .3 * .6 + f * 1.3)/7.627; gl_FragColor = res; } # Fragment Program/Shader 9 0: TXP TEMP[0], INPUT[29].xyyw, texture[0], 2D; 1: TXP TEMP[1], INPUT[28].xyyw, texture[0], 2D; 2: TXP TEMP[2], INPUT[27].xyyw, texture[0], 2D; 3: TXP TEMP[3], INPUT[26].xyyw, texture[0], 2D; 4: TXP TEMP[4], INPUT[25].xyyw, texture[0], 2D; 5: TXP TEMP[5], INPUT[24].xyyw, texture[0], 2D; 6: MUL TEMP[6], TEMP[5], CONST[2].xxxx; 7: MAD TEMP[5], TEMP[4], CONST[1].wwww, TEMP[6]; 8: MAD TEMP[4], TEMP[3], CONST[1].yyyy, TEMP[5]; 9: MAD TEMP[3], TEMP[2], CONST[1].zzzz, TEMP[4]; 10: MAD TEMP[2], TEMP[1], CONST[1].yyyy, TEMP[3]; 11: MAD TEMP[1], TEMP[0], CONST[1].xxxx, TEMP[2]; 12: MUL TEMP[0], TEMP[1], CONST[2].yyyy; 13: ADD TEMP[1], INPUT[24], CONST[1].zzzz; 14: ADD TEMP[2], INPUT[25], CONST[3]; 15: ADD TEMP[3], INPUT[26], CONST[4]; 16: ADD TEMP[4], INPUT[27], CONST[5]; 17: ADD TEMP[5], INPUT[28], CONST[6]; 18: ADD TEMP[6], INPUT[29], CONST[7]; 19: TXP TEMP[7], TEMP[5].xyyw, texture[0], 2D; 20: TXP TEMP[5], TEMP[4].xyyw, texture[0], 2D; 21: TXP TEMP[4], TEMP[3].xyyw, texture[0], 2D; 22: TXP TEMP[3], TEMP[2].xyyw, texture[0], 2D; 23: TXP TEMP[2], TEMP[1].xyyw, texture[0], 2D; 24: TXP TEMP[1], TEMP[6].xyyw, texture[0], 2D; 25: MUL TEMP[6], TEMP[1], CONST[8].zzzz; 26: MAD 
TEMP[1], TEMP[2], CONST[8].yyyy, TEMP[6]; 27: MAD TEMP[2], TEMP[3], CONST[1].wwww, TEMP[1]; 28: MAD TEMP[1], TEMP[4], CONST[8].xxxx, TEMP[2]; 29: MAD TEMP[2], CONST[2].wwww, TEMP[5], TEMP[1]; 30: MAD TEMP[1], TEMP[7], CONST[2].zzzz, TEMP[2]; 31: MAD OUTPUT[2], TEMP[1], CONST[8].wwww, TEMP[0]; 32: END i915_program_error: Exceeded max nr indirect texture lookups (13 out of 4) ENTER FALLBACK 10000: Program As you can see, every texture access still generates a texture indirection, since swizzling is still used. However, in .xyyw the second 'y' seems to be a stub, because when I tried texture2DProj(tex,tx0.xywz) explicitly, I got TEMP[2].xyyz in the output. Could INPUT[29].xyyw then be replaced during code generation with something like INPUT[29].xyzw => INPUT[29] (without swizzling)? The other interesting point is that calculations and texture accesses weren't reordered in the last example. If we didn't have any swizzling in TXP, we would get fully hardware accelerated shaders in the last two examples, since the native limits are 64 ALU and 32 TEX instructions. So the basic problems seem to be swizzling in texture access instructions and varying packing. There could also be problems with the texture2DProj(sampler2D, vec3) variant, since it generates more instructions and leads to instructions of the kind TXP TEMP[3], TEMP[2].xyyw. It probably also makes sense to disable, for i945g, those optimizations which lead to reordering of texture accesses and calculations. It would be very nice to have this bug fixed, since chips like the i945g (GMA 950) or X3150 (as in the Atom D525) are capable of far more than just 3 texture accesses! By the way, it looks like some problems in BUG 87478 could be connected with this one.
I've just found a discussion of a similar issue in the gallium driver, where they seem to disable varying packing for GPUs with too few supported texture indirections. http://lists.freedesktop.org/archives/mesa-commit/2012-December/040778.html Could it be solved in the "classic" driver as well?
The problem, as can be seen from the assembly code in comment #1, is that the i915 driver is just being dumb. If it had a scheduler that would group TEX instructions properly, all of these programs would work. I think scheduling is still the problem with the original shader in comment #0. If you run with the environment variable INTEL_DEBUG=fs the driver will log the actual machine instructions generated. What is shown in the bug report is one of the intermediate representations. My guess is that the driver is inserting some extra MOV instructions before each TEX instruction. (In reply to gnn from comment #2) > Could it be solved in "classic" driver? Based on "OpenGL vendor string: Intel Open Source Technology Center" in the glxinfo output, you are using the classic driver.
I found out, that i915 assembly is printed only if code could be hardware accelerated. I tried several extra tests with just 3 texture accesses (it makes 4 tex. indirections) and I've got full output. uniform sampler2D tex; varying vec2 tx1,tx2,tx3,tx4,tx5,tx0; void main () { vec4 a = texture2D(tex,tx1); vec4 b = texture2D(tex,tx2); vec4 c = texture2D(tex,tx3); vec4 res = (a * .3 + b * .1 + c * .2) / 1.456; gl_FragColor = res; } Fragment Program/Shader 9 0: TEX TEMP[0], INPUT[25].xyyy, texture[0], 2D; 1: TEX TEMP[1], INPUT[24].zwww, texture[0], 2D; 2: TEX TEMP[2], INPUT[24].xyyy, texture[0], 2D; 3: MUL TEMP[3], TEMP[2], CONST[1].zzzz; 4: MAD TEMP[2], TEMP[1], CONST[1].yyyy, TEMP[3]; 5: MAD TEMP[1], TEMP[0], CONST[1].xxxx, TEMP[2]; 6: MUL OUTPUT[2], TEMP[1], CONST[1].wwww; 7: END i915: BEGIN DCL S[0] DCL T_TEX1 DCL T_TEX0 R[0] = MOV T_TEX1.xyyy R[0] = TEXLD S[0],R[0] R[1] = MOV T_TEX0.zwww R[1] = TEXLD S[0],R[1] R[2] = MOV T_TEX0.xyyy R[2] = TEXLD S[0],R[2] R[3] = MUL R[2], CONST[0].zzzz R[2] = MAD R[1], CONST[0].yyyy, R[3] R[1] = MAD R[0], CONST[0].xxxx, R[2] oC = MUL R[1], CONST[0].wwww END uniform sampler2D tex; varying vec4 tx1,tx2,tx3,tx4,tx5,tx0; void main () { vec4 a = texture2DProj(tex,tx1); vec4 b = texture2DProj(tex,tx2); vec4 c = texture2DProj(tex,tx3); vec4 res = (a * .3 + b * .1 + c * .2) / 2.456; gl_FragColor = res; } # Fragment Program/Shader 9 0: TXP TEMP[0], INPUT[26].xyyw, texture[0], 2D; 1: TXP TEMP[1], INPUT[25].xyyw, texture[0], 2D; 2: TXP TEMP[2], INPUT[24].xyyw, texture[0], 2D; 3: MUL TEMP[3], TEMP[2], CONST[1].zzzz; 4: MAD TEMP[2], TEMP[1], CONST[1].yyyy, TEMP[3]; 5: MAD TEMP[1], TEMP[0], CONST[1].xxxx, TEMP[2]; 6: MUL OUTPUT[2], TEMP[1], CONST[1].wwww; 7: END i915: BEGIN DCL S[0] DCL T_TEX2 DCL T_TEX1 DCL T_TEX0 R[0] = MOV T_TEX2.xyyw R[0] = TEXLDP S[0],R[0] R[1] = MOV T_TEX1.xyyw R[1] = TEXLDP S[0],R[1] R[2] = MOV T_TEX0.xyyw R[2] = TEXLDP S[0],R[2] R[3] = MUL R[2], CONST[0].zzzz R[2] = MAD R[1], CONST[0].yyyy, R[3] R[1] = MAD R[0], 
CONST[0].xxxx, R[2] oC = MUL R[1], CONST[0].wwww END Yes, every swizzle generates a MOV in the final code. BTW, could "INPUT[25].xyyw" be replaced with "INPUT[25].xyzw" => "INPUT[25]" to omit the swizzling?
-- GitLab Migration Automatic Message -- This bug has been migrated to freedesktop.org's GitLab instance and has been closed from further activity. You can subscribe and participate further through the new bug through this link to our GitLab instance: https://gitlab.freedesktop.org/mesa/mesa/issues/749.
Use of freedesktop.org services, including Bugzilla, is subject to our Code of Conduct. How we collect and use information is described in our Privacy Policy.