linux crash分析案例之进程同步

时间:2024-04-14 14:05:20

声明:本文是真实案例分析,而非crash分析工具使用说明,不熟悉相关工具的同学,请参考官方文档

1、问题现场:

Unable to handle kernel NULL pointer dereference at virtual address 00000001
Mem abort info:
  Exception class = DABT (current EL), IL = 32 bits
  SET = 0, FnV = 0
  EA = 0, S1PTW = 0
Data abort info:
  ISV = 0, ISS = 0x00000006
  CM = 0, WnR = 0
user pgtable: 4k pages, 39-bit VAs, pgd = ffffffc03c784000
[0000000000000001] *pgd=00000000070f8003, *pud=00000000070f8003, *pmd=0000000000000000
Internal error: Oops: 96000006 [#1] PREEMPT SMP
Modules linked in:
CPU: 0 PID: 1948 Comm: pt-osd Not tainted 4.14.61-00003-g8d661a5-dirty #39
Hardware name:
task: ffffffc031325700 task.stack: ffffff800cc50000
PC is at i2c_dw_isr+0x4fc/0x654
LR is at i2c_dw_isr+0x428/0x654
pc : [<ffffff80086c0f70>] lr : [<ffffff80086c0e9c>] pstate: 604001c9
sp : ffffff8008003e10
x29: ffffff8008003e10 x28: 000000000000ff80
x27: 0000000000000000 x26: 0000000000000001
x25: 0000000000000400 x24: ffffff800c34b7f8
x23: 0000000000000001 x22: ffffff8008fb1000
x21: 0000000000000010 x20: 00000000ffff0081
x19: ffffffc03e170018 x18: 000000000000671f
x17: 0000007f80918620 x16: 0000007f80c54698
x15: 0000000000001d55 x14: 0000000000000000
x13: 0000000000000000 x12: 0000000000000000
x11: 0000000000000000 x10: 0000000000000000
x9 : 0000000000000000 x8 : ffffff80086c10c8
x7 : 0000000000000001 x6 : 0000000000000400
x5 : 0000000000000c34 x4 : 0000000000000065
x3 : 0000000000000001 x2 : 0000000000000000
x1 : 0000000000000000 x0 : 0000000000000000

X19: 0xffffffc03e16ff98:
ff98  00000000 00000000 57d3a303 00000000 08ae0e10 ffffff80 00000000 00000000
ffb8  00001000 00000000 00000000 00000000 08f0a328 ffffff80 000044ae 00000001
ffd8  81a40052 00000000 00000000 00000000 00000000 00000000 00000000 00000000
fff8  00000000 00000000 3e04ee00 ffffffc0 0474a698 ffffffc0 08559eb0 ffffff80
0018  0474a410 ffffffc0 09105000 ffffff80 00000000 00000000 91e791e7 00000000
0038  0c34b778 ffffff80 0c34b778 ffffff80 3e04e980 ffffffc0 00000000 00000000
0058  00000000 00000000 086c1978 ffffff80 00000000 00000000 00000000 00000000
0078  0c34b888 ffffff80 00000001 00000000 00000000 00000000 0c34b9a7 ffffff80

Process pt-osd (pid: 1948, stack limit = 0xffffff800cc50000)
Call trace:
Exception stack(0xffffff8008003cd0 to 0xffffff8008003e10)
3cc0:                                   0000000000000000 0000000000000000
3ce0: 0000000000000000 0000000000000001 0000000000000065 0000000000000c34
3d00: 0000000000000400 0000000000000001 ffffff80086c10c8 0000000000000000
3d20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
3d40: 0000000000000000 0000000000001d55 0000007f80c54698 0000007f80918620
3d60: 000000000000671f ffffffc03e170018 00000000ffff0081 0000000000000010
3d80: ffffff8008fb1000 0000000000000001 ffffff800c34b7f8 0000000000000400
3da0: 0000000000000001 0000000000000000 000000000000ff80 ffffff8008003e10
3dc0: ffffff80086c0e9c ffffff8008003e10 ffffff80086c0f70 00000000604001c9
3de0: ffffff8008003e10 ffffff80086c0b90 0000007fffffffff 0000000000000001
3e00: ffffff8008003e10 ffffff80086c0f70
[<ffffff80086c0f70>] i2c_dw_isr+0x4fc/0x654
[<ffffff8008106a28>] __handle_irq_event_percpu+0x7c/0x148
[<ffffff8008106bc8>] handle_irq_event+0x58/0xc0
[<ffffff800810a740>] handle_fasteoi_irq+0x98/0x180
[<ffffff8008105ad8>] generic_handle_irq+0x24/0x38
[<ffffff8008106220>] __handle_domain_irq+0x60/0xac
[<ffffff8008081384>] gic_handle_irq+0x104/0x1c4
Exception stack(0xffffff800cc53ec0 to 0xffffff800cc54000)
3ec0: 0000000000000035 0000007f2408bee0 0000000000000000 0000007f24000030
3ee0: 0000000000000000 0000007f240cbf60 0000007f2408a2d0 0000000000000001
3f00: 00000021000001bd 000001be0001f0be 0000000000000071 0000007f24000778
3f20: 0001f0bb00000021 00000021000001bb 0000000000000000 0000000000001d55
3f40: 0000007f80c54698 0000007f80918620 000000000000671f 0000007f2408bef0
3f60: 0000007f2408bec0 0000007ea56fa680 0000007ea56fa680 00000000007e9000
3f80: 0000007ea56fa750 0000000000000d24 000000000000003a 0000007ea56fa670
3fa0: 0000000000000002 0000007ea56fa510 0000007f806f43c0 0000007ea56fa510
3fc0: 0000007f80918648 0000000060000000 0000000000000009 00000000ffffffff
3fe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[<ffffff80080833f8>] el0_irq_naked+0x50/0x58
Code: 6b1f037f 32160026 1a8110c1 3707fc85 (394002e5)
---[ end trace 9acff67b837bbb81 ]---
Kernel panic - not syncing: Fatal exception in interrupt
SMP: stopping secondary CPUs
oops, flush dcache for cpu3
oops, flush dcache for cpu2
oops, flush dcache for cpu1
Kernel Offset: disabled
CPU features: 0x0a02a38
Memory Limit: none
panic_handler:kernel panic:0.

2、分析问题思路

1)首先通过PC,查看对应的汇编代码

crash_arm64> dis -l ffffff80086c0f70
drivers/i2c/busses/i2c-designware-master.c: 336     0xffffff80086c0f70 <i2c_dw_isr+1276>:   ldrb    w5, [x23]

对应一个ldr指令,地址是x23,但发现x23:0000000000000001,与上面问题现场Unable to handle kernel NULL pointer dereference at virtual address 00000001刚好吻合。

2)分析i2c_dw_isr()对应的汇编代码,发现x19寄存器就是struct dw_i2c_dev结构体指针,x23寄存器是struct i2c_msg结构体buf成员指针,x24寄存器是struct i2c_msg结构体指针

crash_arm64> struct i2c_msg ffffff800c34b7f8 -x
struct i2c_msg {
  addr = 0xb888,
  flags = 0xc34,
  len = 0xff80,
  buf = 0x1 <Address 0x1 out of bounds>
}
crash_arm64>

3)但是从x19寄存器来看,msgs指针好像和x24寄存器值不相同

crash_arm64>
crash_arm64> struct dw_i2c_dev.msgs ffffffc03e170018
  msgs = 0xffffff800c34b888
crash_arm64>

crash_arm64> rd ffffff800c34b7f8 -x
ffffff800c34b7f8:  ffffff800c34b888
crash_arm64>

4)从这里看出x24指向的struct i2c_msg结构体内容,居然是当前struct i2c_msg结构体指针

5)为了弄清楚执行i2c_dw_isr()时struct i2c_msg结构体真实内容,故在开机时预先申请了一个struct dw_i2c_dev结构体和struct i2c_msg结构体空间,当代码执行到i2c_dw_isr()时,拷贝当时的struct dw_i2c_dev结构体和struct i2c_msg结构体内容

6)比较两者发现,只有cmd_complete和msgs不同,其余均相同,故怀疑是进程同步问题,而非DDR跳变,或者踩内存问题

linux crash分析案例之进程同步

怀疑进程同步理由:

cmd_complete.wait.lock.rlock.raw_lock.owner,两者的值只相差1,说明是两次连续的i2c_transfer()

7)由于平台使用SMP,有没有一种可能是cpu0在执行中断处理程序,而另外一个CPU在发起I2C访问呢?

crash_arm64> struct dw_i2c_dev.adapter.bus_lock ffffffc03e170018
  adapter.bus_lock = {
    wait_lock = {
      raw_lock = {
        owner = 0,
        next = 0
      }
    },
    waiters = {
      rb_root = {
        rb_node = 0x0
      },
      rb_leftmost = 0x0
    },
    owner = 0xffffffc03df20000
  },
crash_arm64>
 

crash_arm64> task 0xffffffc03df20000
PID: 1750   TASK: ffffffc03df20000  CPU: 3   COMMAND: "jmfserver"
struct task_struct {
  thread_info = {
    flags = 8,
    addr_limit = 549755813887,
    ttbr0 = 32989184,
    preempt_count = 2
  },
  state = 2,
  stack = 0xffffff800c348000,
 

crash_arm64> bt 1750
PID: 1750   TASK: ffffffc03df20000  CPU: 3   COMMAND: "jmfserver"
 #0 [ffffff800c34b590] __switch_to at ffffff8008085df0
 #1 [ffffff800c34b5b0] __schedule at ffffff8008aa54ac
 #2 [ffffff800c34b640] schedule at ffffff8008aa5a24
 #3 [ffffff800c34b660] schedule_timeout at ffffff8008aa90e4
 #4 [ffffff800c34b700] wait_for_common at ffffff8008aa6890
 #5 [ffffff800c34b790] wait_for_completion_timeout at ffffff8008aa696c
 #6 [ffffff800c34b7a0] i2c_dw_xfer at ffffff80086c1748
 #7 [ffffff800c34b7e0] __i2c_transfer at ffffff80086bc164
 #8 [ffffff800c34b830] i2c_transfer at ffffff80086bc338
 #9 [ffffff800c34b860] i2c_master_send at ffffff80086bc3d8
#10 [ffffff800c34b8a0] regmap_i2c_write at ffffff8008575584
#11 [ffffff800c34b8c0] _regmap_raw_write at ffffff80085716bc
#12 [ffffff800c34b930] regmap_bulk_write at ffffff8008571e60
从上面来看,的确存在这种情况。

8)在修改msgs成员处添加相应锁保护,问题不再复现