Hello, I am investigating an issue that our test group reported concerning this driver. Their test loads and unloads every kernel module included in the Oracle 4.14.35 kernel release. You don’t even need an AMD platform. It occurs on any Intel platform, or a KVM VM instance too. Kernel panic while running “modprobe amdkfd ; modprobe -r amdkfd”: [ 329.425334] ? __slab_free+0x9b/0x2ba [ 329.427836] ? process_slab+0x3c1/0x45c [ 329.430336] dev_printk_emit+0x4e/0x65 [ 329.432829] __dev_printk+0x46/0x8b [ 329.435183] _dev_info+0x6c/0x85 [ 329.437435] ? kfree+0x141/0x182 [ 329.439646] kfd_module_exit+0x37/0x39 [amdkfd] [ 329.442258] SyS_delete_module+0x1c3/0x26f [ 329.444722] ? entry_SYSCALL_64_after_hwframe+0xaa/0x0 [ 329.447479] ? entry_SYSCALL_64_after_hwframe+0xa3/0x0 [ 329.450206] ? entry_SYSCALL_64_after_hwframe+0x9c/0x0 [ 329.452912] ? entry_SYSCALL_64_after_hwframe+0x95/0x0 [ 329.455586] do_syscall_64+0x79/0x1ae [ 329.457766] entry_SYSCALL_64_after_hwframe+0x151/0x0 [ 329.460369] RIP: 0033:0x7f1757a1b457 [ 329.462502] RSP: 002b:00007ffd62ce1f48 EFLAGS: 00000206 ORIG_RAX: Looks like some memory corruption. Sometimes the unload works but the message logged is garbage: [root@jpd-vmbase02 ~]# modprobe -r amdkfd [ 144.449981] ???????????? hn??蟟??xn??ן??kfd: Removed module Is this something one of the team members could possibly have corrected in an upstream version? #define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" #define KFD_DRIVER_DATE "20150421" #define KFD_DRIVER_MAJOR 0 #define KFD_DRIVER_MINOR 7 #define KFD_DRIVER_PATCHLEVEL 2 Thank you, John
This has been fixed in 4.20.x upstream: commit c393e9b2d51540b74e18e555df14706098dbf2cc Author: Randy Dunlap <rdunlap@infradead.org> Date: Mon Nov 13 18:08:48 2017 +0200 drm/amdkfd: fix amdkfd use-after-free GP fault Fix GP fault caused by dev_info() reference to a struct device* after the device has been freed (use after free). kfd_chardev_exit() frees the device so 'kfd_device' should not be used after calling kfd_chardev_exit(). Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index 6c5a9ca..f744cae 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -24,6 +24,7 @@ #include <linux/sched.h> #include <linux/moduleparam.h> #include <linux/device.h> +#include <linux/printk.h> #include "kfd_priv.h" #define KFD_DRIVER_AUTHOR "AMD Inc. and others" @@ -132,7 +133,7 @@ static void __exit kfd_module_exit(void) kfd_process_destroy_wq(); kfd_topology_shutdown(); kfd_chardev_exit(); - dev_info(kfd_device, "Removed module\n"); + pr_info("amdkfd: Removed module\n"); } [root@jpd-vmbase02 ~]# modprobe amdkfd [ 132.453287] AMD IOMMUv2 driver by Joerg Roedel <jroedel@suse.de> [ 132.454004] AMD IOMMUv2 functionality not available on this system [ 132.507733] CRAT table not found [ 132.508139] Finished initializing topology ret=0 [ 132.508802] kfd kfd: Initialized module [root@jpd-vmbase02 ~]# modprobe -r amdkfd [ 137.447829] amdkfd: Removed module [root@jpd-vmbase02 ~]# [root@jpd-vmbase02 ~]#
Note - SyzKaller will also report this error: [ 3718.925349] AMD IOMMUv2 driver by Joerg Roedel <jroedel@suse.de> [ 3718.926551] AMD IOMMUv2 functionality not available on this system [ 3719.045519] CRAT table not found [ 3719.046153] Finished initializing topology ret=0 [ 3719.046921] kfd kfd: Initialized module [ 3720.058664] ================================================================== [ 3720.060042] BUG: KASAN: use-after-free in __dev_printk+0x222/0x26a [ 3720.061030] Read of size 8 at addr ffff88004be04950 by task modprobe/15273 [ 3720.062110] [ 3720.062384] CPU: 0 PID: 15273 Comm: modprobe Not tainted 4.14.35-1911.0.20190312_0000.syzkaller #12 [ 3720.063815] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 [ 3720.065559] Call Trace: [ 3720.066096] dump_stack+0x83/0xaf [ 3720.066627] ? __dev_printk+0x222/0x26a [ 3720.067171] print_address_description+0x6a/0x22d [ 3720.068215] ? __dev_printk+0x222/0x26a [ 3720.069693] kasan_report.cold.6+0x11a/0x2d3 [ 3720.070511] __asan_report_load8_noabort+0x19/0x1b [ 3720.071524] __dev_printk+0x222/0x26a [ 3720.072345] _dev_info+0xdd/0x112 [ 3720.072981] ? dev_notice+0x120/0x112 [ 3720.073694] ? kasan_slab_free+0x88/0xb9 [ 3720.074613] ? kfd_pasid_exit+0x1a/0x20 [amdkfd] [ 3720.075631] kfd_module_exit+0x5c/0x6d [amdkfd] [ 3720.076573] SyS_delete_module+0x346/0x4ad [ 3720.077131] ? free_module+0x6d0/0x6c5 [ 3720.077627] ? entry_SYSCALL_64_after_hwframe+0xfe/0x0 [ 3720.078261] ? entry_SYSCALL_64_after_hwframe+0xf0/0x0 [ 3720.078894] ? entry_SYSCALL_64_after_hwframe+0xe9/0x0 [ 3720.079491] ? entry_SYSCALL_64_after_hwframe+0xe2/0x0 [ 3720.080441] ? entry_SYSCALL_64_after_hwframe+0xdb/0x0 [ 3720.081511] ? entry_SYSCALL_64_after_hwframe+0xd4/0x0 [ 3720.082778] ? entry_SYSCALL_64_after_hwframe+0xc6/0x0 [ 3720.083809] ? entry_SYSCALL_64_after_hwframe+0xbf/0x0 [ 3720.084929] ? entry_SYSCALL_64_after_hwframe+0xb8/0x0 [ 3720.086024] ? 
free_module+0x6d0/0x6c5 [ 3720.086790] do_syscall_64+0x1bc/0x53f [ 3720.087502] ? entry_SYSCALL_64_after_hwframe+0x79/0x0 [ 3720.088402] ? entry_SYSCALL_64_after_hwframe+0x72/0x0 [ 3720.089251] ? entry_SYSCALL_64_after_hwframe+0x6b/0x0 [ 3720.090105] ? entry_SYSCALL_64_after_hwframe+0x64/0x0 [ 3720.091079] ? entry_SYSCALL_64_after_hwframe+0x5d/0x0 [ 3720.091978] entry_SYSCALL_64_after_hwframe+0x151/0x0 [ 3720.092759] RIP: 0033:0x7f29dad1d457 [ 3720.093414] RSP: 002b:00007ffcf29f40c8 EFLAGS: 00000202 ORIG_RAX: 00000000000000b0 [ 3720.094599] RAX: ffffffffffffffda RBX: 0000000000794550 RCX: 00007f29dad1d457 [ 3720.095927] RDX: 0000000000000000 RSI: 0000000000000800 RDI: 00000000007945b8 [ 3720.097450] RBP: 0000000000000000 R08: 00007f29dafe6060 R09: 00007f29dad91be0 [ 3720.098515] R10: 00007ffcf29f3c90 R11: 0000000000000202 R12: 0000000000000000 [ 3720.100023] R13: 0000000000000001 R14: 00000000007945b8 R15: 0000000000000000 [ 3720.101374] [ 3720.101643] Allocated by task 15263: [ 3720.102241] save_stack+0x43/0xc4 [ 3720.102853] kasan_kmalloc+0xc4/0xd8 [ 3720.103506] kmem_cache_alloc_trace+0xed/0x1f7 [ 3720.104238] device_create_groups_vargs+0x9d/0x246 [ 3720.105026] device_create+0xe0/0x10c [ 3720.105530] kfd_chardev_init+0xc0/0xf0 [amdkfd] [ 3720.106194] 0xffffffffc0440077 [ 3720.106668] do_one_initcall+0xaf/0x1ba [ 3720.107279] do_init_module+0x1fa/0x67f [ 3720.108040] load_module+0x385c/0x511e [ 3720.108790] SYSC_finit_module+0x141/0x1cd [ 3720.109502] SyS_finit_module+0xe/0x10 [ 3720.110129] do_syscall_64+0x1bc/0x53f [ 3720.110765] entry_SYSCALL_64_after_hwframe+0x151/0x0 [ 3720.111730] [ 3720.111992] Freed by task 15273: [ 3720.112546] save_stack+0x43/0xc4 [ 3720.113103] kasan_slab_free+0x72/0xb9 [ 3720.113738] kfree+0x94/0x190 [ 3720.114340] device_create_release+0x2b/0xb1 [ 3720.115069] device_release+0x83/0x1aa [ 3720.115599] kobject_release+0x165/0x41f [ 3720.116399] kobject_put+0x76/0x87 [ 3720.117129] device_unregister+0x3e/0xc9 [ 3720.117909] 
device_destroy+0x98/0xcc [ 3720.118659] kfd_chardev_exit+0x23/0x50 [amdkfd] [ 3720.119640] kfd_module_exit+0x22/0x6d [amdkfd] [ 3720.120662] SyS_delete_module+0x346/0x4ad [ 3720.121445] do_syscall_64+0x1bc/0x53f [ 3720.122083] entry_SYSCALL_64_after_hwframe+0x151/0x0 [ 3720.122850] [ 3720.123084] The buggy address belongs to the object at ffff88004be04900 [ 3720.123084] which belongs to the cache kmalloc-1024 of size 1024 [ 3720.125280] The buggy address is located 80 bytes inside of [ 3720.125280] 1024-byte region [ffff88004be04900, ffff88004be04d00) [ 3720.127200] The buggy address belongs to the page: [ 3720.128005] page:ffffea00012f8100 count:1 mapcount:0 mapping: (null) index:0x0 compound_mapcount: 0 [ 3720.129647] flags: 0xfffffc0008100(slab|head) [ 3720.130439] raw: 000fffffc0008100 0000000000000000 0000000000000000 00000001000e000e [ 3720.131923] raw: dead000000000100 dead000000000200 ffff880055402c40 0000000000000000 [ 3720.133104] page dumped because: kasan: bad access detected [ 3720.134023] [ 3720.134310] Memory state around the buggy address: [ 3720.135022] ffff88004be04800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 3720.136104] ffff88004be04880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 3720.137096] >ffff88004be04900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 3720.138282] ^ [ 3720.139368] ffff88004be04980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 3720.140487] ffff88004be04a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 3720.141658] ================================================================== [ 3720.142700] Disabling lock debugging due to kernel taint [ 3720.143766] Kernel panic - not syncing: panic_on_warn set ... 
[ 3720.143766] [ 3720.145028] CPU: 0 PID: 15273 Comm: modprobe Tainted: G B 4.14.35-1911.0.20190312_0000.syzkaller #12 [ 3720.146624] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 [ 3720.148042] Call Trace: [ 3720.148549] dump_stack+0x83/0xaf [ 3720.149108] panic+0x1bf/0x3bc [ 3720.149624] ? add_taint.cold.6+0x16/0x16 [ 3720.150294] ? __dev_printk+0x222/0x26a [ 3720.150937] kasan_end_report+0x4c/0x54 [ 3720.151578] kasan_report.cold.6+0x76/0x2d3 [ 3720.152273] __asan_report_load8_noabort+0x19/0x1b [ 3720.152987] __dev_printk+0x222/0x26a [ 3720.153456] _dev_info+0xdd/0x112 [ 3720.153883] ? dev_notice+0x120/0x112 [ 3720.154349] ? kasan_slab_free+0x88/0xb9 [ 3720.154852] ? kfd_pasid_exit+0x1a/0x20 [amdkfd] [ 3720.155466] kfd_module_exit+0x5c/0x6d [amdkfd] [ 3720.156041] SyS_delete_module+0x346/0x4ad [ 3720.156578] ? free_module+0x6d0/0x6c5 [ 3720.157065] ? entry_SYSCALL_64_after_hwframe+0xfe/0x0 [ 3720.157712] ? entry_SYSCALL_64_after_hwframe+0xf0/0x0 [ 3720.158354] ? entry_SYSCALL_64_after_hwframe+0xe9/0x0 [ 3720.159257] ? entry_SYSCALL_64_after_hwframe+0xe2/0x0 [ 3720.160249] ? entry_SYSCALL_64_after_hwframe+0xdb/0x0 [ 3720.161260] ? entry_SYSCALL_64_after_hwframe+0xd4/0x0 [ 3720.162241] ? entry_SYSCALL_64_after_hwframe+0xc6/0x0 [ 3720.163201] ? entry_SYSCALL_64_after_hwframe+0xbf/0x0 [ 3720.164023] ? entry_SYSCALL_64_after_hwframe+0xb8/0x0 [ 3720.164857] ? free_module+0x6d0/0x6c5 [ 3720.165551] do_syscall_64+0x1bc/0x53f [ 3720.166251] ? entry_SYSCALL_64_after_hwframe+0x79/0x0 [ 3720.167226] ? entry_SYSCALL_64_after_hwframe+0x72/0x0 [ 3720.168265] ? entry_SYSCALL_64_after_hwframe+0x6b/0x0 [ 3720.169358] ? entry_SYSCALL_64_after_hwframe+0x64/0x0 [ 3720.170369] ? 
entry_SYSCALL_64_after_hwframe+0x5d/0x0 [ 3720.171338] entry_SYSCALL_64_after_hwframe+0x151/0x0 [ 3720.172356] RIP: 0033:0x7f29dad1d457 [ 3720.173026] RSP: 002b:00007ffcf29f40c8 EFLAGS: 00000202 ORIG_RAX: 00000000000000b0 [ 3720.174545] RAX: ffffffffffffffda RBX: 0000000000794550 RCX: 00007f29dad1d457 [ 3720.176024] RDX: 0000000000000000 RSI: 0000000000000800 RDI: 00000000007945b8 [ 3720.177366] RBP: 0000000000000000 R08: 00007f29dafe6060 R09: 00007f29dad91be0 [ 3720.178849] R10: 00007ffcf29f3c90 R11: 0000000000000202 R12: 0000000000000000 [ 3720.180187] R13: 0000000000000001 R14: 00000000007945b8 R15: 0000000000000000 [ 3720.181638] Dumping ftrace buffer: [ 3720.182163] (ftrace buffer empty) [ 3720.182779] Kernel Offset: 0x34000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 3720.184747] Rebooting in 86400 seconds.
Use of freedesktop.org services, including Bugzilla, is subject to our Code of Conduct. How we collect and use information is described in our Privacy Policy.