Bug 98506

Summary: Pagefault in gf100_vm_flush
Product: xorg Reporter: Karol Herbst <karolherbst>
Component: Driver/nouveau Assignee: Nouveau Project <nouveau>
Status: NEW --- QA Contact: Xorg Project Team <xorg-team>
Severity: normal    
Priority: medium    
Version: git   
Hardware: Other   
OS: All   
Whiteboard:
i915 platform: i915 features:
Attachments:
Description Flags
kernel oops none

Description Karol Herbst 2016-10-30 17:34:12 UTC
Created attachment 127629 [details]
kernel oops

I got a page fault today; the kernel oops is attached.

gf100_vm_flush+0xc8 is https://github.com/karolherbst/nouveau/blob/caab1df6e3b32d2eca7e82819a7dd4cb99911d71/drm/nouveau/nvkm/subdev/mmu/gf100.c#L183 (give or take a gdb mistake)

I was running Chromium PRIME-offloaded and Civilization V with Marek's GL threading branch.
Comment 1 Ilia Mirkin 2016-10-30 17:37:40 UTC
Karol, can you load up nouveau.ko and run "disassemble gf100_vm_flush" (to completion) and include that?

All code
========
   0:   38 e1                   cmp    %ah,%cl
   2:   a9 00 00 ff 00          test   $0xff0000,%eax
   7:   74 cb                   je     0xffffffffffffffd4
   9:   48 81 fb ff 93 35 77    cmp    $0x773593ff,%rbx
  10:   0f 87 ae 00 00 00       ja     0xc4
  16:   49 8b 84 24 80 00 00    mov    0x80(%r12),%rax
  1d:   00 
  1e:   31 db                   xor    %ebx,%ebx
  20:   48 8d b0 b8 0c 10 00    lea    0x100cb8(%rax),%rsi
  27:   49 8b 46 10             mov    0x10(%r14),%rax
  2b:*  48 8b b8 90 00 00 00    mov    0x90(%rax),%rdi          <-- trapping instruction
  32:   48 c1 ef 08             shr    $0x8,%rdi
  36:   e8 68 6f 38 e1          callq  0xffffffffe1386fa3
  3b:   49                      rex.WB
  3c:   8b                      .byte 0x8b
  3d:   84 24 80                test   %ah,(%rax,%rax,4)

Code starting with the faulting instruction
===========================================
   0:   48 8b b8 90 00 00 00    mov    0x90(%rax),%rdi
   7:   48 c1 ef 08             shr    $0x8,%rdi
   b:   e8 68 6f 38 e1          callq  0xffffffffe1386f78
  10:   49                      rex.WB
  11:   8b                      .byte 0x8b
  12:   84 24 80                test   %ah,(%rax,%rax,4)
Comment 2 Karol Herbst 2016-10-30 17:41:16 UTC
(gdb) disassemble gf100_vm_flush
Dump of assembler code for function gf100_vm_flush:
   0x0000000000050f60 <+0>:     push   %rbp
   0x0000000000050f61 <+1>:     mov    %rsp,%rbp
   0x0000000000050f64 <+4>:     push   %r15
   0x0000000000050f66 <+6>:     push   %r14
   0x0000000000050f68 <+8>:     push   %r13
   0x0000000000050f6a <+10>:    mov    %rdi,%r13
   0x0000000000050f6d <+13>:    push   %r12
   0x0000000000050f6f <+15>:    push   %rbx
   0x0000000000050f70 <+16>:    sub    $0x18,%rsp
   0x0000000000050f74 <+20>:    mov    0xb0(%rdi),%edx
   0x0000000000050f7a <+26>:    mov    (%rdi),%rax
   0x0000000000050f7d <+29>:    cmp    $0x1,%edx
   0x0000000000050f80 <+32>:    mov    0x10(%rax),%r12
   0x0000000000050f84 <+36>:    sbb    %ebx,%ebx
   0x0000000000050f86 <+38>:    add    $0x20,%rax
   0x0000000000050f8a <+42>:    mov    %rax,%rdi
   0x0000000000050f8d <+45>:    mov    %rax,-0x40(%rbp)
   0x0000000000050f91 <+49>:    and    $0xfffffffc,%ebx
   0x0000000000050f94 <+52>:    callq  0x50f99 <gf100_vm_flush+57>
   0x0000000000050f99 <+57>:    mov    0x60(%r13),%r14
   0x0000000000050f9d <+61>:    lea    0x60(%r13),%rax
   0x0000000000050fa1 <+65>:    add    $0x5,%ebx
   0x0000000000050fa4 <+68>:    or     $0x80000000,%ebx
   0x0000000000050faa <+74>:    mov    %rax,-0x30(%rbp)
   0x0000000000050fae <+78>:    mov    %ebx,-0x34(%rbp)
   0x0000000000050fb1 <+81>:    cmp    %rax,%r14
   0x0000000000050fb4 <+84>:    je     0x510a9 <gf100_vm_flush+329>
   0x0000000000050fba <+90>:    mov    0x198(%r12),%r13
   0x0000000000050fc2 <+98>:    xor    %ebx,%ebx
   0x0000000000050fc4 <+100>:   mov    %r13,%rdi
   0x0000000000050fc7 <+103>:   callq  0x50fcc <gf100_vm_flush+108>
   0x0000000000050fcc <+108>:   mov    %rax,%r15
   0x0000000000050fcf <+111>:   jmp    0x50feb <gf100_vm_flush+139>
   0x0000000000050fd1 <+113>:   mov    %r13,%rdi
   0x0000000000050fd4 <+116>:   callq  0x50fd9 <gf100_vm_flush+121>
   0x0000000000050fd9 <+121>:   sub    %r15,%rax
   0x0000000000050fdc <+124>:   mov    %rax,%rbx
   0x0000000000050fdf <+127>:   cmp    $0x773593ff,%rax
   0x0000000000050fe5 <+133>:   ja     0x510c1 <gf100_vm_flush+353>
   0x0000000000050feb <+139>:   mov    0x80(%r12),%rax
   0x0000000000050ff3 <+147>:   lea    0x100c80(%rax),%rdi
   0x0000000000050ffa <+154>:   callq  0x50fff <gf100_vm_flush+159>
   0x0000000000050fff <+159>:   test   $0xff0000,%eax
   0x0000000000051004 <+164>:   je     0x50fd1 <gf100_vm_flush+113>
   0x0000000000051006 <+166>:   cmp    $0x773593ff,%rbx
   0x000000000005100d <+173>:   ja     0x510c1 <gf100_vm_flush+353>
   0x0000000000051013 <+179>:   mov    0x80(%r12),%rax
   0x000000000005101b <+187>:   xor    %ebx,%ebx
   0x000000000005101d <+189>:   lea    0x100cb8(%rax),%rsi
   0x0000000000051024 <+196>:   mov    0x10(%r14),%rax
   0x0000000000051028 <+200>:   mov    0x90(%rax),%rdi
   0x000000000005102f <+207>:   shr    $0x8,%rdi
   0x0000000000051033 <+211>:   callq  0x51038 <gf100_vm_flush+216>
   0x0000000000051038 <+216>:   mov    0x80(%r12),%rax
   0x0000000000051040 <+224>:   mov    -0x34(%rbp),%edi
   0x0000000000051043 <+227>:   lea    0x100cbc(%rax),%rsi
   0x000000000005104a <+234>:   callq  0x5104f <gf100_vm_flush+239>
   0x000000000005104f <+239>:   mov    0x198(%r12),%r13
   0x0000000000051057 <+247>:   mov    %r13,%rdi
   0x000000000005105a <+250>:   callq  0x5105f <gf100_vm_flush+255>
   0x000000000005105f <+255>:   mov    %rax,%r15
   0x0000000000051062 <+258>:   jmp    0x5107a <gf100_vm_flush+282>
   0x0000000000051064 <+260>:   mov    %r13,%rdi
   0x0000000000051067 <+263>:   callq  0x5106c <gf100_vm_flush+268>
   0x000000000005106c <+268>:   sub    %r15,%rax
   0x000000000005106f <+271>:   mov    %rax,%rbx
   0x0000000000051072 <+274>:   cmp    $0x773593ff,%rax
   0x0000000000051078 <+280>:   ja     0x510ea <gf100_vm_flush+394>
   0x000000000005107a <+282>:   mov    0x80(%r12),%rax
   0x0000000000051082 <+290>:   lea    0x100c80(%rax),%rdi
   0x0000000000051089 <+297>:   callq  0x5108e <gf100_vm_flush+302>
   0x000000000005108e <+302>:   test   $0x80,%ah
   0x0000000000051091 <+305>:   je     0x51064 <gf100_vm_flush+260>
   0x0000000000051093 <+307>:   cmp    $0x773593ff,%rbx
   0x000000000005109a <+314>:   ja     0x510ea <gf100_vm_flush+394>
   0x000000000005109c <+316>:   mov    (%r14),%r14
   0x000000000005109f <+319>:   cmp    -0x30(%rbp),%r14
   0x00000000000510a3 <+323>:   jne    0x50fba <gf100_vm_flush+90>
   0x00000000000510a9 <+329>:   mov    -0x40(%rbp),%rdi
   0x00000000000510ad <+333>:   callq  0x510b2 <gf100_vm_flush+338>
   0x00000000000510b2 <+338>:   add    $0x18,%rsp
   0x00000000000510b6 <+342>:   pop    %rbx
   0x00000000000510b7 <+343>:   pop    %r12
   0x00000000000510b9 <+345>:   pop    %r13
   0x00000000000510bb <+347>:   pop    %r14
   0x00000000000510bd <+349>:   pop    %r15
   0x00000000000510bf <+351>:   pop    %rbp
   0x00000000000510c0 <+352>:   retq   
   0x00000000000510c1 <+353>:   mov    0x10(%r12),%rdi
   0x00000000000510c6 <+358>:   mov    $0x0,%r8
   0x00000000000510cd <+365>:   mov    $0xb5,%ecx
   0x00000000000510d2 <+370>:   mov    $0x0,%rdx
   0x00000000000510d9 <+377>:   mov    $0x0,%rsi
   0x00000000000510e0 <+384>:   callq  0x510e5 <gf100_vm_flush+389>
   0x00000000000510e5 <+389>:   jmpq   0x51013 <gf100_vm_flush+179>
   0x00000000000510ea <+394>:   mov    0x10(%r12),%rdi
   0x00000000000510ef <+399>:   mov    $0x0,%r8
   0x00000000000510f6 <+406>:   mov    $0xbe,%ecx
   0x00000000000510fb <+411>:   mov    $0x0,%rdx
   0x0000000000051102 <+418>:   mov    $0x0,%rsi
   0x0000000000051109 <+425>:   callq  0x5110e <gf100_vm_flush+430>
   0x000000000005110e <+430>:   jmp    0x5109c <gf100_vm_flush+316>
Comment 3 Ilia Mirkin 2016-10-30 18:06:08 UTC
OK, so it's very clearly this line:

nvkm_wr32(device, 0x100cb8, vpgd->obj->addr >> 8);

(note the shift by 8 that comes right after in the decoded code sequence), and the issue is that vpgd->obj is bad:

RAX: 000000041ad09000
CR2: 000000041ad09090

where the faulting instruction is "mov    0x90(%rax),%rdi" (note that CR2 is exactly RAX + 0x90, i.e. the address this load dereferences). It seems like obj is an nvkm_gpuobj, which in turn is just a regular object that should come out of kernel memory. However, the RAX value doesn't look like a usual kernel memory address, which means the pointer is uninitialized.

Use of freedesktop.org services, including Bugzilla, is subject to our Code of Conduct. How we collect and use information is described in our Privacy Policy.