From 11b3b9f461c5c4f700f6c8da202fcc2fd6418e1f Mon Sep 17 00:00:00 2001 From: Vitaly Prosyak Date: Thu, 6 Apr 2023 16:00:54 -0400 Subject: drm/sched: Check scheduler ready before calling timeout handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During an IGT GPU reset test we see the following oops, [ +0.000003] ------------[ cut here ]------------ [ +0.000000] WARNING: CPU: 9 PID: 0 at kernel/workqueue.c:1656 __queue_delayed_work+0x6d/0xa0 [ +0.000004] Modules linked in: iptable_filter bpfilter amdgpu(OE) nls_iso8859_1 snd_hda_codec_realtek snd_hda_codec_generic intel_rapl_msr ledtrig_audio snd_hda_codec_hdmi intel_rapl_common snd_hda_intel edac_mce_amd snd_intel_dspcfg snd_intel_sdw_acpi snd_hda_codec snd_hda_core iommu_v2 gpu_sched(OE) kvm_amd drm_buddy snd_hwdep kvm video drm_ttm_helper snd_pcm ttm snd_seq_midi drm_display_helper snd_seq_midi_event snd_rawmidi cec crct10dif_pclmul ghash_clmulni_intel sha512_ssse3 snd_seq aesni_intel rc_core crypto_simd cryptd binfmt_misc drm_kms_helper rapl snd_seq_device input_leds joydev snd_timer i2c_algo_bit syscopyarea snd ccp sysfillrect sysimgblt wmi_bmof k10temp soundcore mac_hid sch_fq_codel msr parport_pc ppdev drm lp parport ramoops reed_solomon pstore_blk pstore_zone efi_pstore ip_tables x_tables autofs4 hid_generic usbhid hid r8169 ahci xhci_pci gpio_amdpt realtek i2c_piix4 wmi crc32_pclmul xhci_pci_renesas libahci gpio_generic [ +0.000070] CPU: 9 PID: 0 Comm: swapper/9 Tainted: G W OE 6.1.11+ #2 [ +0.000003] Hardware name: Gigabyte Technology Co., Ltd. AB350-Gaming 3/AB350-Gaming 3-CF, BIOS F7 06/16/2017 [ +0.000001] RIP: 0010:__queue_delayed_work+0x6d/0xa0 [ +0.000003] Code: 7a 50 48 01 c1 48 89 4a 30 81 ff 00 20 00 00 75 38 4c 89 cf e8 64 3e 0a 00 5d e9 1e c5 11 01 e8 99 f7 ff ff 5d e9 13 c5 11 01 <0f> 0b eb c1 0f 0b 48 81 7a 38 70 5c 0e 81 74 9f 0f 0b 48 8b 42 28 [ +0.000002] RSP: 0018:ffffc90000398d60 EFLAGS: 00010007 [ +0.000002] RAX: ffff88810d589c60 RBX: 0000000000000000 RCX: 0000000000000000 [ +0.000002] RDX: ffff88810d589c58 RSI: 0000000000000000 RDI: 0000000000002000 [ +0.000001] RBP: ffffc90000398d60 R08: 0000000000000000 R09: ffff88810d589c78 [ +0.000002] R10: 72705f305f39765f R11: 7866673a6d72645b R12: ffff88810d589c58 [ +0.000001] R13: 0000000000002000 R14: 0000000000000000 R15: 0000000000000000 [ +0.000002] FS: 0000000000000000(0000) GS:ffff8887fee40000(0000) knlGS:0000000000000000 [ +0.000001] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ +0.000002] CR2: 00005562c4797fa0 CR3: 0000000110da0000 CR4: 00000000003506e0 [ +0.000002] Call Trace: [ +0.000001] [ +0.000001] mod_delayed_work_on+0x5e/0xa0 [ +0.000004] drm_sched_fault+0x23/0x30 [gpu_sched] [ +0.000007] gfx_v9_0_fault.isra.0+0xa6/0xd0 [amdgpu] [ +0.000258] gfx_v9_0_priv_reg_irq+0x29/0x40 [amdgpu] [ +0.000254] amdgpu_irq_dispatch+0x1ac/0x2b0 [amdgpu] [ +0.000243] amdgpu_ih_process+0x89/0x130 [amdgpu] [ +0.000245] amdgpu_irq_handler+0x24/0x60 [amdgpu] [ +0.000165] __handle_irq_event_percpu+0x4f/0x1a0 [ +0.000003] handle_irq_event_percpu+0x15/0x50 [ +0.000001] handle_irq_event+0x39/0x60 [ +0.000002] handle_edge_irq+0xa8/0x250 [ +0.000003] __common_interrupt+0x7b/0x150 [ +0.000002] common_interrupt+0xc1/0xe0 [ +0.000003] [ +0.000000] [ +0.000001] asm_common_interrupt+0x27/0x40 [ +0.000002] RIP: 0010:native_safe_halt+0xb/0x10 [ +0.000003] Code: 46 ff ff ff cc cc cc cc cc cc cc cc cc cc cc eb 07 0f 00 2d 69 f2 5e 00 f4 e9 f1 3b 3e 00 90 eb 07 0f 00 2d 59 f2 5e 00 fb f4 e0 3b 3e 00 0f 1f 44 00 00 55 48 89 e5 53 e8 b1 d4 fe ff 66 90 [ +0.000002] RSP: 0018:ffffc9000018fdc8 EFLAGS: 00000246 [ +0.000002] RAX: 0000000000004000 RBX: 000000000002e5a8 RCX: 000000000000001f [ +0.000001] RDX: 0000000000000001 RSI: ffff888101298800 RDI: ffff888101298864 [ +0.000001] RBP: ffffc9000018fdd0 R08: 000000527f64bd8b R09: 000000000001dc90 [ +0.000001] R10: 000000000001dc90 R11: 0000000000000003 R12: 0000000000000001 [ +0.000001] R13: ffff888101298864 R14: ffffffff832d9e20 R15: ffff888193aa8c00 [ +0.000003] ? acpi_idle_do_entry+0x5e/0x70 [ +0.000002] acpi_idle_enter+0xd1/0x160 [ +0.000003] cpuidle_enter_state+0x9a/0x6e0 [ +0.000003] cpuidle_enter+0x2e/0x50 [ +0.000003] call_cpuidle+0x23/0x50 [ +0.000002] do_idle+0x1de/0x260 [ +0.000002] cpu_startup_entry+0x20/0x30 [ +0.000002] start_secondary+0x120/0x150 [ +0.000003] secondary_startup_64_no_verify+0xe5/0xeb [ +0.000004] [ +0.000000] ---[ end trace 0000000000000000 ]--- [ +0.000003] BUG: kernel NULL pointer dereference, address: 0000000000000102 [ +0.006233] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring gfx_low timeout, signaled seq=3, emitted seq=4 [ +0.000734] #PF: supervisor read access in kernel mode [ +0.009670] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process amd_deadlock pid 2002 thread amd_deadlock pid 2002 [ +0.005135] #PF: error_code(0x0000) - not-present page [ +0.000002] PGD 0 P4D 0 [ +0.000002] Oops: 0000 [#1] PREEMPT SMP NOPTI [ +0.000002] CPU: 9 PID: 0 Comm: swapper/9 Tainted: G W OE 6.1.11+ #2 [ +0.000002] Hardware name: Gigabyte Technology Co., Ltd. AB350-Gaming 3/AB350-Gaming 3-CF, BIOS F7 06/16/2017 [ +0.012101] amdgpu 0000:0c:00.0: amdgpu: GPU reset begin! [ +0.005136] RIP: 0010:__queue_work+0x1f/0x4e0 [ +0.000004] Code: 87 cd 11 01 0f 1f 80 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 41 55 49 89 d5 41 54 49 89 f4 53 48 83 ec 10 89 7d d4 86 02 01 00 00 01 0f 85 6c 03 00 00 e8 7f 36 08 00 8b 45 d4 48 For gfx_rings the schedulers may not be initialized by amdgpu_device_init_schedulers() due to ring->no_scheduler flag being set to true and thus the timeout_wq is NULL. As a result, since all ASICs call drm_sched_fault() unconditionally even for schedulers which have not been initialized, it is simpler to use the ready condition which indicates whether the given scheduler worker thread runs and whether the timeout_wq of the reset domain has been initialized. Signed-off-by: Vitaly Prosyak Cc: Christian König Reviewed-by: Luben Tuikov Signed-off-by: Luben Tuikov Link: https://lore.kernel.org/r/20230406200054.633379-1-luben.tuikov@amd.com --- drivers/gpu/drm/scheduler/sched_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 0e4378420271..1e08cc5a1702 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -308,7 +308,8 @@ static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched) */ void drm_sched_fault(struct drm_gpu_scheduler *sched) { - mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0); + if (sched->ready) + mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0); } EXPORT_SYMBOL(drm_sched_fault); -- cgit v1.2.3 From afa965a45e01e541cdbe5c8018226eff117610f0 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Thu, 13 Apr 2023 16:43:47 +0200 Subject: drm/rockchip: vop2: fix suspend/resume During a suspend/resume cycle the VO power domain will be disabled and the VOP2 registers will reset to their default values. After that the cached register values will be out of sync and the read/modify/write operations we do on the window registers will result in bogus values written. Fix this by re-initializing the register cache each time we enable the VOP2. With this the VOP2 will show a picture after a suspend/resume cycle whereas without this the screen stays dark. Fixes: 604be85547ce4 ("drm/rockchip: Add VOP2 driver") Cc: stable@vger.kernel.org Signed-off-by: Sascha Hauer Tested-by: Chris Morgan Signed-off-by: Heiko Stuebner Link: https://patchwork.freedesktop.org/patch/msgid/20230413144347.3506023-1-s.hauer@pengutronix.de --- drivers/gpu/drm/rockchip/rockchip_drm_vop2.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c index ba3b81789509..d9daa686b014 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c @@ -215,6 +215,8 @@ struct vop2 { struct vop2_win win[]; }; +static const struct regmap_config vop2_regmap_config; + static struct vop2_video_port *to_vop2_video_port(struct drm_crtc *crtc) { return container_of(crtc, struct vop2_video_port, crtc); @@ -839,6 +841,12 @@ static void vop2_enable(struct vop2 *vop2) return; } + ret = regmap_reinit_cache(vop2->map, &vop2_regmap_config); + if (ret) { + drm_err(vop2->drm, "failed to reinit cache: %d\n", ret); + return; + } + if (vop2->data->soc_id == 3566) vop2_writel(vop2, RK3568_OTP_WIN_EN, 1); -- cgit v1.2.3 From 7363d6bedc000f6f9d09cfe498da2f3aca653778 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Sat, 15 Apr 2023 04:08:50 +0206 Subject: drm/nouveau: fix incorrect conversion to dma_resv_wait_timeout() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 41d351f29528 ("drm/nouveau: stop using ttm_bo_wait") converted from ttm_bo_wait_ctx() to dma_resv_wait_timeout(). However, dma_resv_wait_timeout() returns greater than zero on success as opposed to ttm_bo_wait_ctx(). As a result, relocs will fail and log errors even when it was a success. Change the return code handling to match that of nouveau_gem_ioctl_cpu_prep(), which was already using dma_resv_wait_timeout() correctly. Fixes: 41d351f29528 ("drm/nouveau: stop using ttm_bo_wait") Reported-by: Tanmay Bhushan <007047221b@gmail.com> Link: https://lore.kernel.org/lkml/20230119225351.71657-1-007047221b@gmail.com Signed-off-by: John Ogness Reviewed-by: Christian König Reviewed-by: Karol Herbst Signed-off-by: Karol Herbst Link: https://patchwork.freedesktop.org/patch/msgid/87edolaomt.fsf@jogness.linutronix.de --- drivers/gpu/drm/nouveau/nouveau_gem.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index f77e44958037..ab9062e50977 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -645,7 +645,7 @@ nouveau_gem_pushbuf_reloc_apply(struct nouveau_cli *cli, struct drm_nouveau_gem_pushbuf_reloc *reloc, struct drm_nouveau_gem_pushbuf_bo *bo) { - long ret = 0; + int ret = 0; unsigned i; for (i = 0; i < req->nr_relocs; i++) { @@ -653,6 +653,7 @@ nouveau_gem_pushbuf_reloc_apply(struct nouveau_cli *cli, struct drm_nouveau_gem_pushbuf_bo *b; struct nouveau_bo *nvbo; uint32_t data; + long lret; if (unlikely(r->bo_index >= req->nr_buffers)) { NV_PRINTK(err, cli, "reloc bo index invalid\n"); @@ -703,13 +704,18 @@ nouveau_gem_pushbuf_reloc_apply(struct nouveau_cli *cli, data |= r->vor; } - ret = dma_resv_wait_timeout(nvbo->bo.base.resv, - DMA_RESV_USAGE_BOOKKEEP, - false, 15 * HZ); - if (ret == 0) + lret = dma_resv_wait_timeout(nvbo->bo.base.resv, + DMA_RESV_USAGE_BOOKKEEP, + false, 15 * HZ); + if (!lret) ret = -EBUSY; + else if (lret > 0) + ret = 0; + else + ret = lret; + if (ret) { - NV_PRINTK(err, cli, "reloc wait_idle failed: %ld\n", + NV_PRINTK(err, cli, "reloc wait_idle failed: %d\n", ret); break; } -- cgit v1.2.3 From b63a553e8f5aa6574eeb535a551817a93c426d8c Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Mon, 17 Apr 2023 14:37:47 +0200 Subject: drm/rockchip: vop2: Use regcache_sync() to fix suspend/resume afa965a45e01 ("drm/rockchip: vop2: fix suspend/resume") uses regmap_reinit_cache() to fix the suspend/resume issue with the VOP2 driver. During discussion it came up that we should rather use regcache_sync() instead. As the original patch is already applied fix this up in this follow-up patch. Fixes: afa965a45e01 ("drm/rockchip: vop2: fix suspend/resume") Cc: stable@vger.kernel.org Signed-off-by: Sascha Hauer Signed-off-by: Heiko Stuebner Link: https://patchwork.freedesktop.org/patch/msgid/20230417123747.2179695-1-s.hauer@pengutronix.de --- drivers/gpu/drm/rockchip/rockchip_drm_vop2.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c index d9daa686b014..293c228a83f9 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c @@ -215,8 +215,6 @@ struct vop2 { struct vop2_win win[]; }; -static const struct regmap_config vop2_regmap_config; - static struct vop2_video_port *to_vop2_video_port(struct drm_crtc *crtc) { return container_of(crtc, struct vop2_video_port, crtc); @@ -841,11 +839,7 @@ static void vop2_enable(struct vop2 *vop2) return; } - ret = regmap_reinit_cache(vop2->map, &vop2_regmap_config); - if (ret) { - drm_err(vop2->drm, "failed to reinit cache: %d\n", ret); - return; - } + regcache_sync(vop2->map); if (vop2->data->soc_id == 3566) vop2_writel(vop2, RK3568_OTP_WIN_EN, 1); @@ -875,6 +869,8 @@ static void vop2_disable(struct vop2 *vop2) pm_runtime_put_sync(vop2->dev); + regcache_mark_dirty(vop2->map); + clk_disable_unprepare(vop2->aclk); clk_disable_unprepare(vop2->hclk); } -- cgit v1.2.3