2

I have wrote this simple module to handle a device and call some of its power management methods such as .suspend and .resume. At its initialization, the module simple lookup for a particular device and try to call its methods.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/pci.h>

static int __init mfps_driver_init(void){

struct pci_dev    *dev      = NULL;
struct pci_driver *driver   = NULL;
struct device     *device   = NULL;

dev = pci_get_device(0x8086, 0x15a2, NULL);

if((dev == NULL) || (dev == 0)){

    printk(KERN_INFO "LEONZO: NOTHING FOUND SIZE %ld\n", sizeof(dev));

} else {

    driver = dev->driver;

    printk(KERN_INFO "LEONZO: I FOUND THE DEVICE OF THE SIZE %ld\n", sizeof(dev));
    printk(KERN_INFO "LEONZO: HERE IS ITS DRIVER NAME %s\n", driver->name);
    printk(KERN_INFO "LEONZO: CALLING IT SUSPEND METHOD\n");

    *device = dev->dev;

    device_lock(device);

    device_unlock(device);
}

return 0;

}

static void __exit mfps_driver_exit(void){

}


module_init(mfps_driver_init);
module_exit(mfps_driver_exit);

The code compile successfully. But the I get a kernel bug when I load the module:

sudo insmod MyFirstPowerState.ko

And the dmesg show the following output

[   59.545180] MyFirstPowerState: module license 'unspecified' taints   kernel. 
[   59.545183] Disabling lock debugging due to kernel taint
[   59.546010] LEONZO: I FOUND THE DEVICE OF THE SIZE 8
[   59.546012] LEONZO: HERE IS ITS DRIVER NAME e1000e
[   59.546013] LEONZO: CALLING IT SUSPEND METHOD
[   59.546021] BUG: unable to handle kernel NULL pointer dereference         at           (null)
[   59.546051] IP: [<ffffffffc011907e>] mfps_driver_init+0x7e/0x1000         [MyFirstPowerState]
[   59.546077] PGD 0 
[   59.546085] Oops: 0002 [#1] SMP 
[   59.546097] Modules linked in: MyFirstPowerState(POE+) xt_CHECKSUM arc4 iwlmvm mac80211 snd_hda_codec_hdmi snd_hda_codec_realtek iwlwifi snd_hda_codec_generic rtsx_pci_ms memstick cfg80211 nf_conntrack_netbios_ns nf_conntrack_broadcast ipt_MASQUERADE nf_nat_masquerade_ipv4 xt_tcpudp ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter ip_tables x_tables dm_crypt hp_wmi sparse_keymap intel_rapl iosf_mbi x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm crct10dif_pclmul dm_multipath crc32_pclmul scsi_dh aesni_intel aes_x86_64 lrw gf128mul glue_helper ablk_helper cryptd joydev serio_raw lpc_ich uvcvideo snd_seq_midi snd_seq_midi_event snd_rawmidi snd_hda_intel snd_hda_controller snd_hda_codec videobuf2_vmalloc snd_hwdep shpchp snd_pcm videobuf2_memops videobuf2_core v4l2_common snd_seq e1000e(OE) i915_bpo ptp mei_me pps_core mei videodev media snd_seq_device intel_ips snd_timer drm_kms_helper drm btusb snd i2c_algo_bit soundcore 8250_fintek hp_accel lis3lv02d input_polldev tpm_infineon hp_wireless mac_hid parport_pc ppdev lp parport rfcomm bnep bluetooth binfmt_misc btrfs xor raid6_pq dm_mirror dm_region_hash dm_log uas usb_storage hid_generic usbhid hid rtsx_pci_sdmmc ahci psmouse libahci rtsx_pci wmi video
[   59.546577] CPU: 1 PID: 4180 Comm: insmod Tainted: P           OE   3.19.0-51-generic #58~14.04.1-Ubuntu
[   59.546613] Hardware name: Hewlett-Packard HP EliteBook 840 G2/2216, BIOS M71 Ver. 01.05 03/26/2015
[   59.546648] task: ffff880241a7b110 ti: ffff880242f68000 task.ti: ffff880242f68000
[   59.546678] RIP: 0010:[<ffffffffc011907e>]  [<ffffffffc011907e>] mfps_driver_init+0x7e/0x1000 [MyFirstPowerState]
[   59.546720] RSP: 0018:ffff880242f6bd18  EFLAGS: 00010246
[   59.546741] RAX: 0000000000000000 RBX: ffff880245b4d000 RCX: 00000000000000ae
[   59.546772] RDX: 0000000000000000 RSI: ffff880245b4d098 RDI: 0000000000000000
[   59.546807] RBP: ffff880242f6bd28 R08: 000000000000000a R09: 0000000000000000
[   59.546839] R10: 0000000000000d53 R11: ffff880242f6b9de R12: ffffffffc06a8000
[   59.546868] R13: 0000000000000000 R14: ffffffffc0119000 R15: ffff880242f6bef8
[   59.546900] FS:  00007f8787aa6740(0000) GS:ffff88024f440000(0000) knlGS:0000000000000000
[   59.546921] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   59.546936] CR2: 0000000000000000 CR3: 0000000244393000 CR4: 00000000003407e0
[   59.546955] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   59.546978] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   59.547006] Stack:
[   59.547014]  ffffffff81c1d060 ffff880204cd3280 ffff880242f6bda8 ffffffff81002144
[   59.547046]  0000000000000001 0000000000000002 ffff8801f8ddc4c0 0000000000000001
[   59.547079]  ffff880242f6bd88 ffffffff811cef19 ffffffff810f7aac 0000000000000018
[   59.547114] Call Trace:
[   59.547131]  [<ffffffff81002144>] do_one_initcall+0xd4/0x210
[   59.547162]  [<ffffffff811cef19>] ? kmem_cache_alloc_trace+0x199/0x220
[   59.547194]  [<ffffffff810f7aac>] ? load_module+0x164c/0x1cc0
[   59.547222]  [<ffffffff810f7ae5>] load_module+0x1685/0x1cc0
[   59.547247]  [<ffffffff810f3380>] ? store_uevent+0x40/0x40
[   59.547274]  [<ffffffff810f8296>] SyS_finit_module+0x86/0xb0
[   59.547298]  [<ffffffff817b788d>] system_call_fastpath+0x16/0x1b
[   59.547314] Code: c7 80 c0 4b c0 31 c0 e8 19 14 69 c1 48 c7 c7 a8 c0  4b c0 31 c0 e8 0b 14 69 c1 31 c0 48 8d b3 98 00 00 00 b9 ae 00 00 00 48 89 c7 <f3> a5 bf 60 00 00 00 e8 26 c7 69 c1 bf 60 00 00 00 e8 ac c5 69 
[   59.547393] RIP  [<ffffffffc011907e>] mfps_driver_init+0x7e/0x1000 [MyFirstPowerState]
[   59.547416]  RSP <ffff880242f6bd18>
[   59.547425] CR2: 0000000000000000
[   59.554577] ---[ end trace 42e3b1c73677cdfa ]---

I also notice that it is therefore impossible to remove the module:

sudo rmmod MyFirstPowerState.ko 
rmmod: ERROR: Module MyFirstPowerState is in use

Any idea of what this code mean and how to correct the error ?

Fopa Léon Constantin
  • 11,863
  • 8
  • 48
  • 82
  • This doesn't seem to be the code where your debug message came from. "CALLING IT SUSPEND METHOD" I cannot find this printk anywhere in your source code. As an aside just based on it being a null pointer dereference and the fact that both driver and dev are resolving my first guess is that for some reason dev->dev == NULL. – arduic Mar 22 '16 at 11:59
  • If however you are asking how to read the debug message provided by dmesg please say so and I'll happily write something up to explain that mess. – arduic Mar 22 '16 at 12:02
  • @arduic, you're right, I don't understand what this debug message mean and how to exploit It. I've edit the question accordingly – Fopa Léon Constantin Mar 22 '16 at 12:06
  • Ah ok sorry for delay I will write an answer now had something I needed to get done. – arduic Mar 22 '16 at 12:57
  • 1
    The line `*device = dev->dev;` is wrong. I think you meant `device = &dev->dev;`. (EDITED) – Ian Abbott Mar 22 '16 at 12:57
  • Ian does have the answer actually good call I didn't even see the * haha. Anyways on to explaining debug. – arduic Mar 22 '16 at 12:58

1 Answers1

5

I will be attempting to explain the massive wall of text that is dmesg bellow. As a note the values in brackets to the left are times I forget with what exactly they are in relation to but for you they don't really matter.

[ 59.545180] MyFirstPowerState: module license 'unspecified' taints kernel. [ 59.545183] Disabling lock debugging due to kernel taint

This is because you did not declare a module license. Usually you will see people put something like this in their code in the same section as the module_init.

MODULE_LICENSE("GPL");

[ 59.546010] LEONZO: I FOUND THE DEVICE OF THE SIZE 8 [ 59.546012] LEONZO: HERE IS ITS DRIVER NAME e1000e [ 59.546013] LEONZO: CALLING IT SUSPEND METHOD

These are your printk messages nothing really special here.

[ 59.546021] BUG: unable to handle kernel NULL pointer dereference at (null)

Here is where the cause for your crash actually lives. The kernel tried to dereference a NULL pointer which causes a seg fault. For more details on what exactly that means see here. As Ian noted in the comments earlier it looks like the cause of your crash is you put *device=dev->dev instead of device=dev->dev. In the code you have you attempted to assign the value device points to to dev->dev however since device=NULL currently you attempted to dereference NULL causing a crash.

[ 59.546051] IP: [] mfps_driver_init+0x7e/0x1000 [MyFirstPowerState] [ 59.546648] task: ffff880241a7b110 ti: ffff880242f68000 task.ti: ffff880242f68000

The chunk of errors contained within those above do not have much valuable to you currently and are more for people who have deployed something and some specific user has a problem. It is listing things like the hardware installed, the module that caused the crash, and modules that is also calling all things that in your case are very well known.

[ 59.546678] RIP: 0010:[] [] mfps_driver_init+0x7e/0x1000 [MyFirstPowerState][ 59.547079] ffff880242f6bd88 ffffffff811cef19 ffffffff810f7aac 0000000000000018

Everything in this section is assembly information which if you have no assembly experience means nothing to you although I would suggest knowing the basics it does help in these cases. The top half is registers and their current values and the bottom half is the current stack frame.

> [   59.547114] Call Trace:
[   59.547131]  [<ffffffff81002144>] do_one_initcall+0xd4/0x210
[   59.547162]  [<ffffffff811cef19>] ? kmem_cache_alloc_trace+0x199/0x220
[   59.547194]  [<ffffffff810f7aac>] ? load_module+0x164c/0x1cc0

Everything within the call trace can be exceptionally helpful especially when the module becomes long and difficult to debug with things like interrupts. Basically it is listing out every single function call (or otherwise) the system has made to lead to this crash. In your case since you went from the load module straight to the crash the trace really only has your load_module along with some wrappers and some deep system calls. However, if say your load module called another function and that caused the crash you could see this call path here.

The last little bit appears to be more registers.

Hopefully that explained the wall of text that you get from dmesg when you cause a kernel issue (not sure if this is a panic someone please correct me). If there is anything that is still vague I'll try to explain although I am by no means an expert on this.

Community
  • 1
  • 1
arduic
  • 659
  • 3
  • 12
  • thanks for this detailled answer. Can you provide a good resource to `basics of assembly` which you think may help here ? – Fopa Léon Constantin Mar 22 '16 at 13:27
  • 1
    I can point you to where I learned assembly which was here. https://www.youtube.com/watch?v=H4Z0S9ZbC0g It's a little dated and written for intel X86 which it looks like you are using a different architecture I would guess AMD but not sure. However it still teaches the basics on the call stack and other really nice things. It's a bit long but if you watch the two day video and take notes as you go I would say you will have a decent grasp on assembly. – arduic Mar 22 '16 at 13:30
  • `59.545180` should be the number of seconds since boot. `dmesg -T` shows time in a human readable format: `[Sat Jul 31 19:59:16 2021]` (I suggest an alias in `.bashrc`, it's pretty helpful.) – Avio Jul 31 '21 at 20:33