20150501 调试分析之 修改内核来定位系统僵死问题

时间:2021-11-11 14:38:26

20150501 调试分析之 修改内核来定位系统僵死问题

2015-05-01 Lover雪儿


今天还是研究内核调试,

死机,这个词语,大家应该不陌生.

当我们写程序时,如果加入到内核中的代码里出现了死循环,那么内核一旦运行到这段代码,系统就会直接进入类似死机的僵死状态.

那么怎么可以解决这个问题呢?


我们都知道,人的心脏是一直在跳动的,与此类似,内核也有它的"心跳",那就是tick中断,

所以我们可以从tick中断入手,解决上面的死机问题.

 

在开发板上运行cat /proc/interrupts 可以查看系统当前的各种中断号,

可以看到一个中断名为i.MX Timer Tick 的中断,那么它就是我们今天的主角.

 1 root@EasyARM-iMX257 /mnt/nfs/module# cat /proc/interrupts 
 2          CPU0
 3   9:          0           -  mxsdhci
 4  14:          0           -  CSPI_IRQ
 5  25:          2           -  imxdi     -  mxcsdma
 6  35:          0           -  ehci_hcd:usb1
 7  37:       2453           -  mxcintuart
 8  46:          3           -  m -  i.MX Timer Tick  
 9  57:          0           -  mxsdhci
10 Err:           0

 


在内核中查找 Timer Tick的源代码,如下所示:

 1 /* linux-2.6.31/arch/arm/plat-mxc/time.c
 2  * IRQ handler for the timer
 3  */
 4 static irqreturn_t mxc_timer_interrupt(int irq, void *dev_id)
 5 {
 6     struct clock_event_device *evt = &clockevent_mxc;
 7     uint32_t tstat;
 8 
 9     if (timer_is_v2())
10         tstat = __raw_readl(timer_base + MX3_TSTAT);
11     else
12         tstat = __raw_readl(timer_base + MX1_2_TSTAT);
13 
14     gpt_irq_acknowledge();
15 
16     evt->event_handler(evt);
17 
18     return IRQ_HANDLED;
19 }
20 
21 static struct irqaction mxc_timer_irq = {
22     .name        = "i.MX Timer Tick",
23     .flags        = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
24     .handler    = mxc_timer_interrupt,
25 };

 

在这个函数中,我们可以增加一些代码:有点类似看门狗



一.在mxc_timer_interrupt中增加打印语句


在mxc_timer_interrupt 中断函数中检测系统当前正在运行的进程,如果10s之内都是同一个进程在运行,那我们就把这个进程打印出来(先从简单入手,此处先不做太多复杂的事情)

步骤:

首先备份 linux-2.6.31/arch/arm/plat-mxc/time.c,

接着修改time.c的内容,

最后编译内核,重新给板子启动新内核

 

root@Lover:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# cp time.c time.c.bak

修改time.c,在中断函数中加入打印语句

root@Lover:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# vi time.c

************************************************************************************************

root@Lover:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# cd ../../..

编译内核

root@Lover:/home/study/nfs_home/system/linux-2.6.31# make uImage

CHK include/linux/version.h

make[1]: 'include/asm-arm/mach-types.h' is up to date.

CHK include/linux/utsrelease.h

SYMLINK include/asm -> include/asm-arm

************************************************************************************************

Data Size: 2180620 Bytes = 2129.51 kB = 2.08 MB

Load Address: 80008000

Entry Point: 80008000

Image arch/arm/boot/uImage is ready

root@Lover:/home/study/nfs_home/system/linux-2.6.31# cp arch/arm/boot/uImage /tftpboot/uImage

root@Lover:/home/study/nfs_home/system/linux-2.6.31#

************************************************************************************************

在开发板上重新烧写内核

MX25 U-Boot > run upsystem

FEC: enable RMII gasket

TFTP from server 192.168.31.179; our IP address is 192.168.31.180

Filename '00

Loading: #################################################################

#################################################################

###################

done

************************************************************************************************

加载完毕后,如果不动开发板,会发现,每隔10s,就会有进程pid=0,名字name=swapper的打印消息.

root@EasyARM-iMX257 ~# mxc_timer_interrupt: pid = 0, name = swapper

root@EasyARM-iMX257 ~# mxc_timer_interrupt: pid = 0, name = swapper

root@EasyARM-iMX257 ~#


修改time.c如下所示:

 1 /*   linux-2.6.31/arch/arm/plat-mxc/time.c
 2  * IRQ handler for the timer
 3  */
 4 static irqreturn_t mxc_timer_interrupt(int irq, void *dev_id)
 5 {
 6     struct clock_event_device *evt = &clockevent_mxc;
 7     uint32_t tstat;
 8 ////////////////////////////////////////
 9     static pid_t pre_pid;
10     static int  cnt = 0;
11     if(pre_pid == current->pid){
12         cnt++;
13     }else{
14         cnt = 0;
15         pre_pid = current->pid;
16     }
17     if(cnt == 10*HZ){
18         cnt = 0;
19         printk("mxc_timer_interrupt: pid = %d, name = %s\n",current->pid, current->comm);
20     }
21 //////////////////////////////////////////
22     if (timer_is_v2())
23         tstat = __raw_readl(timer_base + MX3_TSTAT);
24     else
25         tstat = __raw_readl(timer_base + MX1_2_TSTAT);
26 
27     gpt_irq_acknowledge();
28 
29     evt->event_handler(evt);
30 
31     return IRQ_HANDLED;
32 }

 

二.修改驱动代码,故意在代码中增加死循环

还是沿用我们前面的err_led.c的驱动程序.

参考博客地址:http://www.cnblogs.com/lihaiyan/p/4470390.html

open函数中,我们故意加入一个死循环.

 /* err_led.c
 */
 44 static int key_open(struct inode *inode, struct file *file)
 45 {
 46     printk("<0>function open!\n\n");
 47     //在此加入一个死循环
 48     while(1);
 49     return 0;
 50 }

 

20150501 调试分析之 修改内核来定位系统僵死问题

编译后,接着在开发板中加载错误驱动程序,使用cat 命令打开设备.

root@EasyARM-iMX257 ~# ifconfig eth0 192.168.31.181;mount -t nfs 192.168.31.179:

/home/study/nfs_home /mnt/nfs -o nolock;cd /mnt/nfs/module/

root@EasyARM-iMX257 /mnt/nfs/module#

root@EasyARM-iMX257 /mnt/nfs/module# cd 39_debug_with_timer/

root@EasyARM-iMX257 /mnt/nfs/module/39_debug_with_timer# insmod err_led.ko


Hello,this is err_led_dev module!


addr base_iomux : c4a26000

addr base_gpio3 : c4a2a000

addCTL : c4a26270

addr GDIR_GPIO3a2a000

root@EasyARM-iMX257 /mnt/nfs/module/39_debug_with_timer# cat /dev/err_led_dev

function open!

#################################################################

可以发现,打开设备后,进入open函数,系统直接进入死机状态,每隔10s便会打印出我们的进程号pid=1805

mxc_timer_interrupt: pid = 1805, name = cat

mxc_timer_interrupt: pid = 1805, name = cat

mxc_timer_interrupt: pid = 1805, name = cat


三.在asm_do_IRQ中增加打印语句,打印出PC值


接着恢复上面的time.c的代码,然后在linux-2.6.31/arch/arm/kernel/irq.c文件中找到系统中断的总调用者asm_do_IRQ,

我们在asm_do_IRQ函数里加入前面time.c中的打印代码.

root@Lover:/home/study/nfs_home/system/linux-2.6.31# cd arch/arm/plat-mxc/

root@Lover:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# mv time.c.bak time.c

root@Lover:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# cd ..

root@Lover:/home/study/nfs_home/system/linux-2.6.31/arch/arm# cd kernel/

root@Lover:/home/study/nfs_home/system/linux-2.6.31/arch/arm/kernel# vi irq.c

root@Lover:/home/study/nfs_home/system/linux-2.6.31/arch/arm/kernel# cd ../../../

root@Lover:/home/study/nfs_home/system/linux-2.6.31# make uImage

########################################################

Load Address: 80008000

Entry Point: 80008000

Image arch/arm/boot/uImage is ready

root@Lover:/home/study/nfs_home/system/linux-2.6.31# cp arch/arm/boot/uImage /tftpboot/uImage

root@Lover:/home/study/nfs_home/system/linux-2.6.31#


########################################################

从开发板重新烧写新内核

启动开发板


irq.c修改内容如下:

 1 /* linux-2.6.31/arch/arm/kernel/irq.c
 2  * do_IRQ handles all hardware IRQ's.  Decoded IRQs should not
 3  * come via this function.  Instead, they should provide their
 4  * own 'handler'
 5  */
 6 asmlinkage void __exception asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
 7 {
 8     struct pt_regs *old_regs = set_irq_regs(regs);
 9 ////////////////////////////////////////////////////////////////////
10 //从  cat /proc/interrupts  中得到我们的tick中断为46
11    if(irq == 46)
12    {
13     ////////////////////////////////////////
14         static pid_t pre_pid;
15         static int  cnt = 0;
16         if(pre_pid == current->pid){
17             cnt++;
18         }else{
19             cnt = 0;
20             pre_pid = current->pid;
21         }
22         if(cnt == 10*HZ){
23             cnt = 0;
24             printk("asm_do_IRQ => mxc_timer_interrupt: pid = %d, name = %s\n",current->pid, current->comm);
25             printk("pc = %08x\n",regs->ARM_pc);//ptract.h
26         }
27     /////////////////////////////////////////
28     }
29 ////////////////////////////////////////////////////////////////////
30 
31     irq_enter();
32 
33     /*
34      * Some hardware gives randomly wrong interrupts.  Rather
35      * than crashing, do something sensible.
36      */
37     if (unlikely(irq >= NR_IRQS)) {
38         if (printk_ratelimit())
39             printk(KERN_WARNING "Bad IRQ%u\n", irq);
40         ack_bad_irq(irq);
41     } else {
42         generic_handle_irq(irq);
43     }
44 
45     /* AT91 specific workaround */
46     irq_finish(irq);
47 
48     irq_exit();
49     set_irq_regs(old_regs);
50 }

 

启动开发板,加载错误的驱动程序,根据打印出来的PC值来反推错误地址:

root@EasyARM-iMX257 /mnt/nfs/module/39_debug_with_timer# insmod err_led.ko

root@EasyARM-iMX257 /mnt/nfs/module/39_debug_with_timer# cat /dev/err_led_dev

function open!

根据打印出来的pc值,对照cat /proc/kallsyms的输出,找到出错的驱动err_led.ko,对其进行反汇编,然后找到出错的函数,进而反推出C语言代码的出错位置.

92 00000130 <key_open>:

93 130: e52de004 str lr, [sp, #-4]!

94 134: e59f0008 ldr r0, [pc, #8] ; 144 <.text+0x144>

95 138: e24dd004 sub sp, sp, #4 ; 0x4

96 13c: ebfffffe bl 0 <printk>

97 140: eafffffe b 140 <key_open+0x10> //很容易就找到了错误地址,此处一直b 140就为死循环

98 144: 000000cc andeq r0, r0, ip, asr #1

99



步骤和前面的博客文章

<20150430 调试分析之 根据内核报错信息PC指针分析错误>一样了,

此处不再赘述,博客地址:http://www.cnblogs.com/lihaiyan/p/4470353.html


如果要调试应用程序,可以使用strace,具体的用法,百度上有很详细的解释

 

附上驱动程序err_led.c

20150501 调试分析之 修改内核来定位系统僵死问题20150501 调试分析之 修改内核来定位系统僵死问题
  1 #include<linux/cdev.h>
  2 #include<linux/module.h>
  3 #include<linux/types.h>
  4 #include<linux/fs.h>
  5 #include<linux/errno.h>
  6 #include<linux/mm.h>
  7 #include<linux/sched.h>
  8 #include<linux/init.h>
  9 #include<asm/io.h>
 10 #include<asm/system.h>
 11 #include<asm/uaccess.h>
 12 #include<linux/device.h>
 13 #include <linux/delay.h>
 14 
 15 #define Driver_NAME "err_led_dev"
 16 #define DEVICE_NAME "err_led_dev"
 17 
 18 static int major = 0;
 19 
 20 #define LED_ON     0
 21 #define LED_OFF 1
 22 
 23 
 24 //auto to create device node
 25 static struct class *drv_class = NULL;
 26 static struct class_device *drv_class_dev = NULL;
 27 
 28 //寄存器基址;
 29 static unsigned long mem_iomux;
 30 static unsigned long mem_gpio3;
 31 static unsigned long base_iomux;      //iomux基址 0X 43FA C000 -  0X 43FA FFFF
 32 static unsigned long base_gpio3;    //gpio3      0X 53FA 4000 -  0X 53FA 7FFF
 33 // MUX_CTL模式选择  配置寄存器
 34 #define MUX_CTL  (*(volatile unsigned long *)(base_iomux + 0x0060))
 35 // PAD_CTL GPIO常用功能设置
 36 #define PAD_CTL  (*(volatile unsigned long *)(base_iomux + 0x0270))
 37 // GPIO DR   数据寄存器  DR
 38 #define DR_GPIO3 (*(volatile unsigned long *)(base_gpio3 + 0x0000))
 39 // GPIO GDIR 方向控制寄存器  GDIR
 40 #define GDIR_GPIO3 (*(volatile unsigned long *)(base_gpio3 + 0x0004))
 41 
 42 
 43 static int key_open(struct inode *inode, struct file *file)
 44 {
 45     printk("<0>function open!\n\n");
 46 ////////////////////////////////////////////////////////
 47     //在此加入一个死循环
 48     while(1);
 49 ////////////////////////////////////////////////////////
 50     return 0;
 51 }
 52 
 53 static int key_read(struct file *filp, char __user *buff, size_t count, loff_t *offp)
 54 {
 55     return 0;
 56 }
 57 
 58 static ssize_t key_write(struct file *file, const char __user *buf, size_t count, loff_t * ppos)
 59 {
 60     printk("<0>function write!\n\n");
 61     return 1;
 62 }
 63 
 64 static int  key_release(struct inode *inode, struct file *filp)
 65 {
 66     printk("<0>function write!\n\n");
 67     return 0;
 68 }
 69 
 70 static int key_ioctl(struct inode *inode,struct file *flip,unsigned int command,unsigned long arg)
 71 {
 72     printk("<0>function ioctl!\n\n");
 73     
 74     switch(command)
 75     {
 76         case LED_ON:
 77             DR_GPIO3 &= ~(0x01 << 23);        //将GPIO2_23清零  亮
 78             break;
 79         case LED_OFF:
 80             
 81             DR_GPIO3 |= (0x01 << 23);        //将GPIO2_23置1  灭
 82             break;
 83         default:
 84             break;
 85     }
 86     
 87     return 0;
 88 }
 89 static struct file_operations key_fops = {
 90     .owner  =   THIS_MODULE,    /* 这是一个宏,推向编译模块时自动创建的__this_module变量 */
 91     .open   =   key_open,
 92     .read   =   key_read,
 93     .write  =   key_write,
 94     .release=   key_release,
 95     .ioctl  =   key_ioctl,
 96 };
 97 
 98 void gpio_addr(void){
 99     printk("<0>addr base_iomux : %x \n",base_iomux);
100     printk("<0>addr base_gpio3 : %x \n",base_gpio3);
101     printk("<0>addr MUX_CTL : %x \n",&MUX_CTL);
102     printk("<0>addr PAD_CTL : %x \n",&PAD_CTL);
103     printk("<0>addr GDIR_GPIO3 : %x \n",&GDIR_GPIO3);
104     printk("<0>addr DR_GPIO3 : %x \n",&DR_GPIO3);
105 }
106 
107 
108 
109 void led_on_off(void){
110     ssleep(1);
111     DR_GPIO3 |= (0x01 << 23);        //将GPIO2_23置1
112     ssleep(1);
113     DR_GPIO3 &= ~(0x01 << 23);        //将GPIO2_23清零
114     ssleep(1);
115     DR_GPIO3 |= (0x01 << 23);        //将GPIO2_23置1
116     ssleep(1);
117     DR_GPIO3 &= ~(0x01 << 23);        //将GPIO2_23清零
118     ssleep(1);
119     DR_GPIO3 |= (0x01 << 23);        //将GPIO2_23置1
120     ssleep(1);
121     DR_GPIO3 &= ~(0x01 << 23);        //将GPIO2_23清零
122     ssleep(1);
123     DR_GPIO3 |= (0x01 << 23);        //将GPIO2_23置1
124     ssleep(1);
125     DR_GPIO3 &= ~(0x01 << 23);        //将GPIO2_23清零
126     ssleep(1);
127     DR_GPIO3 |= (0x01 << 23);        //将GPIO2_23置1
128 }
129 
130 static int __init  key_irq_init(void)
131 {
132     printk("<0>\nHello,this is %s module!\n\n",Driver_NAME);
133     //register and mknod
134     major = register_chrdev(0,Driver_NAME,&key_fops);
135     drv_class = class_create(THIS_MODULE,Driver_NAME);
136     drv_class_dev = device_create(drv_class,NULL,MKDEV(major,0),NULL,DEVICE_NAME);  /*/dev/key_query*/
137     
138     //IO端口申请 ioremap  可以直接通过指针来访问这些地址
139     base_iomux = ioremap(0x43FAC000,0xFFF);
140     base_gpio3 = ioremap(0x53FA4000,0xFFF);
141 
142     //MUX_CTL
143     MUX_CTL &= ~(0x07 << 0);    
144     MUX_CTL |= (0X05 << 0);    //设置为ALT5  GPIO3_23 ERR_LED
145     //PAD_CTL
146     PAD_CTL &= ~(0x01<<13 | 0x01<<3 | 0x03<<1 | 0x01<<0);   //1.8v 不需要上拉下拉  CMOS输出 slew rate
147     //GDIR_GPIO3    配置为输出模式
148     GDIR_GPIO3 &= ~(0x01 << 23);    
149     GDIR_GPIO3 |= (0x01 << 23);    //配置为输出模式    
150 
151     //DR_GPIO3        配置为输出0 点亮ERR_LED
152     DR_GPIO3 &= ~(0x01 << 23);        //将GPIO2_23清零
153     DR_GPIO3 &= ~(0x01 << 23);        //将GPIO2_23清零
154     gpio_addr();
155     led_on_off();
156     return 0; 
157 }
158                      
159 static void __exit key_irq_exit(void)
160 {
161     gpio_addr();
162     printk("<0>\nGoodbye,%s!\n\n",Driver_NAME);
163     led_on_off();
164 
165        unregister_chrdev(major,Driver_NAME);
166     device_unregister(drv_class_dev);
167     class_destroy(drv_class);
168     
169     //释放IO端口
170     iounmap(base_iomux);
171     iounmap(base_gpio3);
172 }
173 
174 
175 /* 这两行指定驱动程序的初始化函数和卸载函数 */
176 module_init(key_irq_init);
177 module_exit(key_irq_exit);
178 
179 /* 描述驱动程序的一些信息,不是必须的 */
180 MODULE_AUTHOR("Lover雪儿");
181 MODULE_VERSION("0.1.0");
182 MODULE_DESCRIPTION("IMX257 key Driver");
183 MODULE_LICENSE("GPL");
err_led.ko