89062 – i915 needs an instruction scheduler

Bug 89062 - i915 needs an instruction scheduler

Summary: i915 needs an instruction scheduler

Status:	RESOLVED MOVED

Alias:	None

Product:	Mesa
Classification:	Unclassified
Component:	Drivers/DRI/i915 (show other bugs)
Version:	git
Hardware:	x86-64 (AMD64) Linux (All)

Importance:	medium normal
Assignee:	Ian Romanick
QA Contact:

URL:
Whiteboard:
Keywords:

Depends on:
Blocks:

Reported:	2015-02-10 11:37 UTC by gnn
Modified:	2019-09-18 19:38 UTC (History)
CC List:	1 user (show)

See Also:
i915 platform:
i915 features:

Attachments
Full glxinfo output (13.73 KB, text/plain) 2015-02-10 11:37 UTC, gnn	Details
View All

Description gnn 2015-02-10 11:37:49 UTC

Created attachment 113313 [details]
Full glxinfo output

The following fragment shader code falls back to software emulation:

uniform sampler2D tex;
varying vec2 tx1,tx2,tx3,tx4,tx5;
void main () {
  vec4 a = texture2D(tex,tx1);
  vec4 b = texture2D(tex,tx2);
  vec4 c = texture2D(tex,tx3);
  vec4 d = texture2D(tex,tx4);
  vec4 e = texture2D(tex,tx5);
  vec4 f = texture2D(tex,tx5);
  vec4 res = ( a * 0.3 + b * 0.1 + c * 0.2 + d * 0.08 + e* 0.2 + f * 0.099) / 6.456; 
  gl_FragColor = res;
}

With output:

# Fragment Program/Shader 9
  0: TEX TEMP[0], INPUT[26].xyyy, texture[0], 2D;
  1: TEX TEMP[1], INPUT[25].zwww, texture[0], 2D;
  2: TEX TEMP[2], INPUT[25].xyyy, texture[0], 2D;
  3: TEX TEMP[3], INPUT[24].zwww, texture[0], 2D;
  4: TEX TEMP[4], INPUT[24].xyyy, texture[0], 2D;
  5: MUL TEMP[5], TEMP[4], CONST[2].yyyy;
  6: MAD TEMP[4], TEMP[3], CONST[2].xxxx, TEMP[5];
  7: MAD TEMP[3], TEMP[2], CONST[1].wwww, TEMP[4];
  8: MAD TEMP[2], TEMP[1], CONST[1].zzzz, TEMP[3];
  9: MAD TEMP[1], TEMP[0], CONST[1].yyyy, TEMP[2];
 10: MAD TEMP[2], TEMP[0], CONST[1].xxxx, TEMP[1];
 11: MUL OUTPUT[2], TEMP[2], CONST[2].zzzz;
 12: END
i915_program_error: Exceeded max nr indirect texture lookups (6 out of 4)
ENTER FALLBACK 10000: Program

As I understand it, the varying packing optimization leads to a swizzle operation, which generates a dependent texture lookup for every single texture access. I consider this a bug, since it makes all shaders with more than 3 texture accesses unusable on i915 chips.

Bug is reproduced on Atom 230 board with Intel 945g video + Linux Ubuntu 14.04 with stock Mesa 10.1

A bit of GL info (more in attachment):
OpenGL vendor string: Intel Open Source Technology Center
OpenGL renderer string: Mesa DRI Intel(R) 945G 
OpenGL version string: 2.1 Mesa 10.1.3
OpenGL shading language version string: 1.20

Comment 1 gnn 2015-02-12 08:59:40 UTC

I've done several extra tests:
In the following program you can see that calculations and texture accesses get mixed by the GLSL optimizer, which by itself leads to a lot of indirections (even if swizzling in TEX instructions were allowed):

uniform sampler2D tex;
varying vec2 tx1,tx2,tx3,tx4,tx5,tx0;

void main () {
	
 vec4 a = texture2D(tex,tx1);
  vec4 b = texture2D(tex,tx2);
  vec4 c = texture2D(tex,tx3);
  vec4 d = texture2D(tex,tx4);
  vec4 e = texture2D(tex,tx5);
  vec4 f = texture2D(tex,tx0);
  vec4 res =( a * .3 + b * .1 + c* .2 + d * .08 + e* .02 + f * .099)/6.456; 
  
  vec2 q = tx0 + vec2(0.9);
  vec2 w = tx1 + vec2(0.8);
  vec2 s = tx2 - vec2(0.7);
  vec2 r = tx3 + vec2(0.6);
  vec2 t = tx4 - vec2(0.5);
  vec2 y = tx5 + vec2(0.4);
  
  a = texture2D(tex,q);
  b = texture2D(tex,w);
  c = texture2D(tex,s);
  d = texture2D(tex,r);
  e = texture2D(tex,t);
  f = texture2D(tex,y);  
  res +=( a* .3456 + b * .314 + c* .1 + d * .271 + e* .3 * .6 + f * 1.3)/7.627; 
  gl_FragColor = res;
}

Output:

# Fragment Program/Shader 9
  0: TEX TEMP[0], INPUT[26].zwww, texture[0], 2D;
  1: TEX TEMP[1], INPUT[26].xyyy, texture[0], 2D;
  2: TEX TEMP[2], INPUT[25].zwww, texture[0], 2D;
  3: TEX TEMP[3], INPUT[25].xyyy, texture[0], 2D;
  4: TEX TEMP[4], INPUT[24].zwww, texture[0], 2D;
  5: TEX TEMP[5], INPUT[24].xyyy, texture[0], 2D;
  6: MUL TEMP[6], TEMP[5], CONST[2].yyyy;
  7: MAD TEMP[5], TEMP[4], CONST[2].xxxx, TEMP[6];
  8: MAD TEMP[4], TEMP[3], CONST[1].wwww, TEMP[5];
  9: MAD TEMP[3], TEMP[2], CONST[1].zzzz, TEMP[4];
 10: MAD TEMP[2], TEMP[1], CONST[1].yyyy, TEMP[3];
 11: MAD TEMP[1], TEMP[0], CONST[1].xxxx, TEMP[2];
 12: MUL TEMP[0], TEMP[1], CONST[2].zzzz;
 13: ADD TEMP[1].xy, INPUT[26].xyyy, CONST[3];
 14: TEX TEMP[2], TEMP[1].xyyy, texture[0], 2D;
 15: ADD TEMP[1].xy, INPUT[25].zwww, CONST[4];
 16: TEX TEMP[3], TEMP[1].xyyy, texture[0], 2D;
 17: ADD TEMP[1].xy, INPUT[25].xyyy, CONST[5];
 18: TEX TEMP[4], TEMP[1].xyyy, texture[0], 2D;
 19: ADD TEMP[1].xy, INPUT[24].zwww, CONST[6];
 20: TEX TEMP[5], TEMP[1].xyyy, texture[0], 2D;
 21: ADD TEMP[1].xy, INPUT[24].xyyy, CONST[7];
 22: TEX TEMP[6], TEMP[1].xyyy, texture[0], 2D;
 23: ADD TEMP[1].xy, INPUT[26].zwww, CONST[8];
 24: TEX TEMP[7], TEMP[1].xyyy, texture[0], 2D;
 25: MUL TEMP[1], TEMP[7], CONST[4].wwww;
 26: MAD TEMP[7], TEMP[6], CONST[4].zzzz, TEMP[1];
 27: MAD TEMP[1], TEMP[5], CONST[2].xxxx, TEMP[7];
 28: MAD TEMP[5], TEMP[4], CONST[3].wwww, TEMP[1];
 29: MAD TEMP[1], CONST[3].zzzz, TEMP[3], TEMP[5];
 30: MAD TEMP[3], TEMP[2], CONST[2].wwww, TEMP[1];
 31: MAD OUTPUT[2], TEMP[3], CONST[5].zzzz, TEMP[0];
 32: END
i915_program_error: Exceeded max nr indirect texture lookups (13 out of 4)
ENTER FALLBACK 10000: Program


Then I tried texture2DProj(), which requires vec3 or vec4, so varyings aren't packed for it. You can see both examples rewritten with it below.
In case of texture2DProj() with vec3 arguments, I got lots of extra moves in the output, so texture2DProj() with vec4 seems to be "native" for i945g (GMA 950):

uniform sampler2D tex;
varying vec4 tx1,tx2,tx3,tx4,tx5,tx0;

void main () {
	
 vec4 a = texture2DProj(tex,tx1);
  vec4 b = texture2DProj(tex,tx2);
  vec4 c = texture2DProj(tex,tx3);
  vec4 d = texture2DProj(tex,tx4);
  vec4 e = texture2DProj(tex,tx5);
  vec4 f = texture2DProj(tex,tx0);
  vec4 res =( a * .3 + b * .1 + c* .2 + d * .8 + e* .2 + f * .99)/6.456; 
  gl_FragColor = res;
}

  0: TXP TEMP[0], INPUT[29].xyyw, texture[0], 2D;
  1: TXP TEMP[1], INPUT[28].xyyw, texture[0], 2D;
  2: TXP TEMP[2], INPUT[27].xyyw, texture[0], 2D;
  3: TXP TEMP[3], INPUT[26].xyyw, texture[0], 2D;
  4: TXP TEMP[4], INPUT[25].xyyw, texture[0], 2D;
  5: TXP TEMP[5], INPUT[24].xyyw, texture[0], 2D;
  6: MUL TEMP[6], TEMP[5], CONST[2].xxxx;
  7: MAD TEMP[5], TEMP[4], CONST[1].wwww, TEMP[6];
  8: MAD TEMP[4], TEMP[3], CONST[1].yyyy, TEMP[5];
  9: MAD TEMP[3], TEMP[2], CONST[1].zzzz, TEMP[4];
 10: MAD TEMP[2], TEMP[1], CONST[1].yyyy, TEMP[3];
 11: MAD TEMP[1], TEMP[0], CONST[1].xxxx, TEMP[2];
 12: MUL OUTPUT[2], TEMP[1], CONST[2].yyyy;
 13: END
i915_program_error: Exceeded max nr indirect texture lookups (7 out of 4)
ENTER FALLBACK 10000: Program


uniform sampler2D tex;
varying vec4 tx1,tx2,tx3,tx4,tx5,tx0;

void main () {
	
 vec4 a = texture2DProj(tex,tx1);
  vec4 b = texture2DProj(tex,tx2);
  vec4 c = texture2DProj(tex,tx3);
  vec4 d = texture2DProj(tex,tx4);
  vec4 e = texture2DProj(tex,tx5);
  vec4 f = texture2DProj(tex,tx0);
  vec4 res =( a * .3 + b * .1 + c* .2 + d * .8 + e* .2 + f * .99)/6.456; 
  
  vec4 q = tx0 + vec4(0.9);
  vec4 w = tx1 + vec4(0.8);
  vec4 s = tx2 - vec4(0.7);
  vec4 r = tx3 + vec4(0.6);
  vec4 t = tx4 - vec4(0.5);
  vec4 y = tx5 + vec4(0.4);
  
  a = texture2DProj(tex,q);
  b = texture2DProj(tex,w);
  c = texture2DProj(tex,s);
  d = texture2DProj(tex,r);
  e = texture2DProj(tex,t);
  f = texture2DProj(tex,y);  
  res +=( a* .3456 + b * .314 + c* .1 + d * .271 + e* .3 * .6 + f * 1.3)/7.627;
  gl_FragColor = res;
}

# Fragment Program/Shader 9
  0: TXP TEMP[0], INPUT[29].xyyw, texture[0], 2D;
  1: TXP TEMP[1], INPUT[28].xyyw, texture[0], 2D;
  2: TXP TEMP[2], INPUT[27].xyyw, texture[0], 2D;
  3: TXP TEMP[3], INPUT[26].xyyw, texture[0], 2D;
  4: TXP TEMP[4], INPUT[25].xyyw, texture[0], 2D;
  5: TXP TEMP[5], INPUT[24].xyyw, texture[0], 2D;
  6: MUL TEMP[6], TEMP[5], CONST[2].xxxx;
  7: MAD TEMP[5], TEMP[4], CONST[1].wwww, TEMP[6];
  8: MAD TEMP[4], TEMP[3], CONST[1].yyyy, TEMP[5];
  9: MAD TEMP[3], TEMP[2], CONST[1].zzzz, TEMP[4];
 10: MAD TEMP[2], TEMP[1], CONST[1].yyyy, TEMP[3];
 11: MAD TEMP[1], TEMP[0], CONST[1].xxxx, TEMP[2];
 12: MUL TEMP[0], TEMP[1], CONST[2].yyyy;
 13: ADD TEMP[1], INPUT[24], CONST[1].zzzz;
 14: ADD TEMP[2], INPUT[25], CONST[3];
 15: ADD TEMP[3], INPUT[26], CONST[4];
 16: ADD TEMP[4], INPUT[27], CONST[5];
 17: ADD TEMP[5], INPUT[28], CONST[6];
 18: ADD TEMP[6], INPUT[29], CONST[7];
 19: TXP TEMP[7], TEMP[5].xyyw, texture[0], 2D;
 20: TXP TEMP[5], TEMP[4].xyyw, texture[0], 2D;
 21: TXP TEMP[4], TEMP[3].xyyw, texture[0], 2D;
 22: TXP TEMP[3], TEMP[2].xyyw, texture[0], 2D;
 23: TXP TEMP[2], TEMP[1].xyyw, texture[0], 2D;
 24: TXP TEMP[1], TEMP[6].xyyw, texture[0], 2D;
 25: MUL TEMP[6], TEMP[1], CONST[8].zzzz;
 26: MAD TEMP[1], TEMP[2], CONST[8].yyyy, TEMP[6];
 27: MAD TEMP[2], TEMP[3], CONST[1].wwww, TEMP[1];
 28: MAD TEMP[1], TEMP[4], CONST[8].xxxx, TEMP[2];
 29: MAD TEMP[2], CONST[2].wwww, TEMP[5], TEMP[1];
 30: MAD TEMP[1], TEMP[7], CONST[2].zzzz, TEMP[2];
 31: MAD OUTPUT[2], TEMP[1], CONST[8].wwww, TEMP[0];
 32: END
i915_program_error: Exceeded max nr indirect texture lookups (13 out of 4)
ENTER FALLBACK 10000: Program



As you can see, every texture access still generates a texture indirection, since swizzling is still used. However, the second 'y' in .xyyw seems to be a stub, because when I tried texture2DProj(tex,tx0.xywz) explicitly, I got TEMP[2].xyyz in the output. Could INPUT[29].xyyw then be replaced during code generation with something like INPUT[29].xyzw => INPUT[29] (without swizzling)?
The other interesting point is that calculations and texture accesses weren't reordered in the last example. If we didn't have any swizzling in TXP, we would get fully hardware accelerated shaders in the last two examples, since the native limits are 64 ALU and 32 TEX instructions.
So the basic problems seem to be swizzling in texture access instructions and varying packing. There could also be a problem with the texture2DProj(sampler2D, vec3) variant, since it generates more instructions and leads to instructions of the kind TXP TEMP[3], TEMP[2].xyyw.
It probably also makes sense to disable, for i945g, those optimizations which lead to reordering of texture accesses and calculations.

It would be very nice to have this bug fixed, since chips like the i945g (GMA 950) or X3150 (as in the Atom D525) are capable of far more than just 3 texture accesses!

By the way, it looks like some problems in BUG 87478 could be connected with this one.

Comment 2 gnn 2015-02-13 07:54:07 UTC

I've just found discussion of the similar issue in gallium driver, where they seem to disable varying packing for GPUs with too few supported texture indirections.
http://lists.freedesktop.org/archives/mesa-commit/2012-December/040778.html

Could it be solved in "classic" driver?

Comment 3 Ian Romanick 2015-02-13 17:00:41 UTC

The problem, as can be seen from the assembly code in comment #1, is that the i915 driver is just being dumb.  If it had a scheduler that would group TEX instructions properly, all of these programs would work.

I think scheduling is still the problem with the original shader in comment #0.  If you run with the environment variable INTEL_DEBUG=fs the driver will log the actual machine instructions generated.  What is shown in the bug report is one of the intermediate representations.  My guess is that the driver is inserting some extra MOV instructions before each TEX instruction.

(In reply to gnn from comment #2)
> Could it be solved in "classic" driver?

Based on "OpenGL vendor string: Intel Open Source Technology Center" in the glxinfo output, you are using the classic driver.

Comment 4 gnn 2015-02-16 07:01:48 UTC

I found out that the i915 assembly is printed only if the code can be hardware accelerated.
I tried several extra tests with just 3 texture accesses (it makes 4 tex. indirections) and I've got full output.

uniform sampler2D tex;
varying vec2 tx1,tx2,tx3,tx4,tx5,tx0;

void main () {
	
 vec4 a = texture2D(tex,tx1);
  vec4 b = texture2D(tex,tx2);
  vec4 c = texture2D(tex,tx3);
  vec4 res = (a * .3 + b * .1 + c * .2) / 1.456; 
  gl_FragColor = res;
}

 Fragment Program/Shader 9
  0: TEX TEMP[0], INPUT[25].xyyy, texture[0], 2D;
  1: TEX TEMP[1], INPUT[24].zwww, texture[0], 2D;
  2: TEX TEMP[2], INPUT[24].xyyy, texture[0], 2D;
  3: MUL TEMP[3], TEMP[2], CONST[1].zzzz;
  4: MAD TEMP[2], TEMP[1], CONST[1].yyyy, TEMP[3];
  5: MAD TEMP[1], TEMP[0], CONST[1].xxxx, TEMP[2];
  6: MUL OUTPUT[2], TEMP[1], CONST[1].wwww;
  7: END
i915:
		BEGIN
		DCL S[0]
		DCL T_TEX1
		DCL T_TEX0
		R[0] = MOV T_TEX1.xyyy
		R[0] = TEXLD S[0],R[0]
		R[1] = MOV T_TEX0.zwww
		R[1] = TEXLD S[0],R[1]
		R[2] = MOV T_TEX0.xyyy
		R[2] = TEXLD S[0],R[2]
		R[3] = MUL R[2], CONST[0].zzzz
		R[2] = MAD R[1], CONST[0].yyyy, R[3]
		R[1] = MAD R[0], CONST[0].xxxx, R[2]
		oC = MUL R[1], CONST[0].wwww
		END



uniform sampler2D tex;
varying vec4 tx1,tx2,tx3,tx4,tx5,tx0;

void main () {
	
  vec4 a = texture2DProj(tex,tx1);
  vec4 b = texture2DProj(tex,tx2);
  vec4 c = texture2DProj(tex,tx3);
  vec4 res = (a * .3 + b * .1 + c * .2) / 2.456; 
  gl_FragColor = res;
}

# Fragment Program/Shader 9
  0: TXP TEMP[0], INPUT[26].xyyw, texture[0], 2D;
  1: TXP TEMP[1], INPUT[25].xyyw, texture[0], 2D;
  2: TXP TEMP[2], INPUT[24].xyyw, texture[0], 2D;
  3: MUL TEMP[3], TEMP[2], CONST[1].zzzz;
  4: MAD TEMP[2], TEMP[1], CONST[1].yyyy, TEMP[3];
  5: MAD TEMP[1], TEMP[0], CONST[1].xxxx, TEMP[2];
  6: MUL OUTPUT[2], TEMP[1], CONST[1].wwww;
  7: END
i915:
		BEGIN
		DCL S[0]
		DCL T_TEX2
		DCL T_TEX1
		DCL T_TEX0
		R[0] = MOV T_TEX2.xyyw
		R[0] = TEXLDP S[0],R[0]
		R[1] = MOV T_TEX1.xyyw
		R[1] = TEXLDP S[0],R[1]
		R[2] = MOV T_TEX0.xyyw
		R[2] = TEXLDP S[0],R[2]
		R[3] = MUL R[2], CONST[0].zzzz
		R[2] = MAD R[1], CONST[0].yyyy, R[3]
		R[1] = MAD R[0], CONST[0].xxxx, R[2]
		oC = MUL R[1], CONST[0].wwww
		END


Yes, every swizzle generates a MOV in the final code.

BTW, could "INPUT[25].xyyw" be replaced with "INPUT[25].xyzw" => "INPUT[25]" - to omit swizzling?

Comment 5 GitLab Migration User 2019-09-18 19:38:44 UTC

-- GitLab Migration Automatic Message --

This bug has been migrated to freedesktop.org's GitLab instance and has been closed from further activity.

You can subscribe and participate further through the new bug through this link to our GitLab instance: https://gitlab.freedesktop.org/mesa/mesa/issues/749.

Use of freedesktop.org services, including Bugzilla, is subject to our Code of Conduct. How we collect and use information is described in our Privacy Policy.