drm/amdgpu: Improve RAS documentation (v2)

Clarify some areas, clean up formatting, add section for unrecoverable error handling. v2: fix grammatical errors Reviewed-by: Yong Zhao <yong.zhao@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
2026-05-02 05:22:49 -04:00 · 2019-10-30 14:40:09 -04:00
parent 7158ca8476
commit ef177d11d6
2 changed files with 68 additions and 7 deletions
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -220,7 +220,7 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
 * As their names indicate, inject operation will write the
 * value to the address.
 *
- * Second member: struct ras_debug_if::op.
+ * The second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 *
 * - 0: disable RAS on the block. Take ::head as its data.
@@ -228,14 +228,20 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
 * - 2: inject errors on the block. Take ::inject as its data.
 *
 * How to use the interface?
- * programs:
- * copy the struct ras_debug_if in your codes and initialize it.
- * write the struct to the control node.
+ *
+ * Programs
+ *
+ * Copy the struct ras_debug_if in your codes and initialize it.
+ * Write the struct to the control node.
+ *
+ * Shells
 *
 * .. code-block:: bash
 *
 *	echo op block [error [sub_block address value]] > .../ras/ras_ctrl
 *
+ * Parameters:
+ *
 * op: disable, enable, inject
 *	disable: only block is needed
 *	enable: block and error are needed
@@ -265,8 +271,10 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * .. note::
- *	Operation is only allowed on blocks which are supported.
+ *	Operations are only allowed on blocks which are supported.
 *	Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
+ *	to see which blocks support RAS on a particular asic.
+ *
 */
 static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
 		size_t size, loff_t *pos)
@@ -322,7 +330,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 * DOC: AMDGPU RAS debugfs EEPROM table reset interface
 *
 * Some boards contain an EEPROM which is used to persistently store a list of
- * bad pages containing ECC errors detected in vram.  This interface provides
+ * bad pages which experiences ECC errors in vram.  This interface provides
 * a way to reset the EEPROM, e.g., after testing error injection.
 *
 * Usage:
@@ -362,7 +370,7 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
 /**
 * DOC: AMDGPU RAS sysfs Error Count Interface
 *
- * It allows user to read the error count for each IP block on the gpu through
+ * It allows the user to read the error count for each IP block on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * It outputs the multiple lines which report the uncorrected (ue) and corrected
@@ -1027,6 +1035,24 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
 }
 /* sysfs end */

+/**
+ * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
+ *
+ * Normally when there is an uncorrectable error, the driver will reset
+ * the GPU to recover.  However, in the event of an unrecoverable error,
+ * the driver provides an interface to reboot the system automatically
+ * in that event.
+ *
+ * The following file in debugfs provides that interface:
+ * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
+ *
+ * Usage:
+ *
+ * .. code-block:: bash
+ *
+ *	echo true > .../ras/auto_reboot
+ *
+ */
 /* debugfs begin */
 static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 {