Foreword
By definition, a system call is a service offered by the kernel to user space applications. Code that is already running inside the kernel should not call
a service intended for user space. Hence, doing so is not advised.
First try with a kernel space buffer
The write() system call is defined in fs/read_write.c. It calls ksys_write() which calls vfs_write():
/*
 * vfs_write() - check that a write on @file is allowed, then delegate it to
 * __vfs_write().
 *
 * Returns -EBADF or -EINVAL when the file mode lacks FMODE_WRITE or
 * FMODE_CAN_WRITE, -EFAULT when access_ok() rejects @buf, the non-zero result
 * of rw_verify_area() when that check fails, and otherwise whatever
 * __vfs_write() returns.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
/* The file must have been opened for writing and must support writes. */
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
/* Reject the request when access_ok(buf, count) fails. */
if (unlikely(!access_ok(buf, count)))
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
if (!ret) {
/* Clamp oversized requests to MAX_RW_COUNT. */
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
file_start_write(file);
ret = __vfs_write(file, buf, count, pos);
/* Only a positive result triggers fsnotify_modify() and add_wchar(). */
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
/* inc_syscw() runs for every attempt that passed rw_verify_area(). */
inc_syscw(current);
file_end_write(file);
}
return ret;
}
[...]
/*
 * ksys_write() - resolve @fd with fdget_pos() and write @count bytes from
 * @buf through vfs_write().
 *
 * Returns -EBADF when no file is attached to @fd, otherwise the value
 * returned by vfs_write().
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos, *ppos = file_ppos(f.file);
/*
 * Work on a local copy of the file position; it is stored back into
 * f.file->f_pos below only when vfs_write() did not return an error.
 */
if (ppos) {
pos = *ppos;
ppos = &pos;
}
ret = vfs_write(f.file, buf, count, ppos);
if (ret >= 0 && ppos)
f.file->f_pos = pos;
fdput_pos(f);
}
return ret;
}
/*
 * Definition of the write() system call: its three arguments (fd, buf, count)
 * are forwarded unchanged to ksys_write().
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
return ksys_write(fd, buf, count);
}
The file descriptor passed as first parameter is not a problem. The value passed from user space is used to retrieve the file structure of the output file (in ksys_write()). But the second parameter must reference a user space memory area.
In vfs_write(), a check is done on the second parameter:
if (unlikely(!access_ok(buf, count)))
return -EFAULT;
access_ok() checks that the buffer lies in user space. Hence, if you
pass an address referencing kernel space, the return code from write() will be -EFAULT (-14).
The example below is a simple module calling the write() system call with a kernel space buffer. On x86_64, the system call parameters are passed in the following registers:
RDI = arg#0
RSI = arg#1
RDX = arg#2
R10 = arg#3
R8 = arg#4
R9 = arg#5
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/ptrace.h>
#include <linux/socket.h>
#include <linux/kallsyms.h>
MODULE_LICENSE("GPL");

/*
 * Prototype of one entry of the system call table. Each entry receives its
 * arguments through a register snapshot and returns a long (a byte count, a
 * file descriptor, or a negative errno), so the pointer type must return
 * long as well rather than int.
 */
typedef long (*syscall_wrapper)(const struct pt_regs *);

/* Address of sys_call_table as resolved at load time; 0 until then. */
static unsigned long sys_call_table_addr;

/* Prefix put in front of every kernel log line emitted by this module. */
#define DEV_NAME "[DEVICE2]"
/* Text the module tries to write through the write() system call. */
#define DEV_STR DEV_NAME "String from driver"

/* Kernel space buffer handed to write() to provoke -EFAULT. */
static char buf[1024];
/*
 * Module entry point: locate the system call table, fetch the write()
 * handler and call it on file descriptor 1 with a buffer located in kernel
 * space. The call is expected to fail with -EFAULT (-14) because vfs_write()
 * rejects buffers that do not pass access_ok().
 *
 * Returns 0 on success, or -ENOENT when sys_call_table cannot be resolved.
 */
static int __init device2_init(void)
{
	syscall_wrapper write_syscall;
	int rc;
	/* Start from all-zero registers so no stack garbage reaches the handler. */
	struct pt_regs param = { 0 };

	printk(KERN_INFO DEV_NAME "module has been loaded\n");

	sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
	printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);
	/* kallsyms_lookup_name() returns 0 for an unknown symbol. */
	if (!sys_call_table_addr) {
		printk(KERN_ALERT DEV_NAME "sys_call_table not found\n");
		return -ENOENT;
	}

	write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];

	/*
	 * Call to write() system call with a kernel space buffer.
	 * x86_64 convention: RDI = fd, RSI = buffer, RDX = length.
	 */
	snprintf(buf, sizeof(buf), "%s\n", DEV_STR);
	param.di = 1;
	param.si = (unsigned long)buf;
	param.dx = strlen(buf);
	rc = (*write_syscall)(&param);
	printk(KERN_INFO DEV_NAME "write() with a kernel space buffer = %d\n", rc);

	return 0;
}
/*
 * Module exit point: this version of the module holds no resources, so the
 * only thing to do is to log that it is going away.
 */
static void __exit device2_exit(void)
{
	printk(KERN_INFO DEV_NAME "module has been unloaded\n");
}
/* Declare device2_init() and device2_exit() as the module's load and unload handlers. */
module_init(device2_init);
module_exit(device2_exit);
At module insertion time, we can verify that the system call returns -EFAULT:
$ sudo insmod ./device2.ko
$ dmesg
[15716.262977] [DEVICE2]module has been loaded
[15716.270566] [DEVICE2]sys_call_table@ffffffff926013a0
[15716.270568] [DEVICE2]write() with a kernel space buffer = -14
But the same approach works with a system call like dup(), which involves a file descriptor but no user space buffer. Let's change the previous code to:
/*
 * Module entry point: besides the failing write() with a kernel space buffer,
 * exercise dup() and close(), which take a file descriptor but no user space
 * buffer and therefore succeed when called from the kernel.
 *
 * Returns 0 on success, or -ENOENT when sys_call_table cannot be resolved.
 */
static int __init device2_init(void)
{
	syscall_wrapper write_syscall;
	syscall_wrapper dup_syscall;
	syscall_wrapper close_syscall;
	int rc;
	/* Start from all-zero registers so no stack garbage reaches the handlers. */
	struct pt_regs param = { 0 };

	printk(KERN_INFO DEV_NAME "module has been loaded\n");

	sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
	printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);
	/* kallsyms_lookup_name() returns 0 for an unknown symbol. */
	if (!sys_call_table_addr) {
		printk(KERN_ALERT DEV_NAME "sys_call_table not found\n");
		return -ENOENT;
	}

	write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];
	dup_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_dup];
	close_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_close];

	/*
	 * Call to write() system call with a kernel space buffer.
	 * x86_64 convention: RDI = fd, RSI = buffer, RDX = length.
	 */
	snprintf(buf, sizeof(buf), "%s\n", DEV_STR);
	param.di = 1;
	param.si = (unsigned long)buf;
	param.dx = strlen(buf);
	rc = (*write_syscall)(&param);
	printk(KERN_INFO DEV_NAME "write() with a kernel space buffer = %d\n", rc);

	/*
	 * Call to dup() system call
	 */
	param.di = 1;
	rc = (*dup_syscall)(&param);
	printk(KERN_INFO DEV_NAME "dup() = %d\n", rc);

	/*
	 * Call to close() system call
	 */
	param.di = 0;
	rc = (*close_syscall)(&param);
	printk(KERN_INFO DEV_NAME "close() = %d\n", rc);

	/*
	 * Call to dup() system call ==> Must return 0 as it is available
	 */
	param.di = 1;
	rc = (*dup_syscall)(&param);
	printk(KERN_INFO DEV_NAME "dup() = %d\n", rc);

	return 0;
}
The result of dup() is OK:
$ sudo insmod ./device2.ko
$ dmesg
[17444.098469] [DEVICE2]module has been loaded
[17444.106935] [DEVICE2]sys_call_table@ffffffff926013a0
[17444.106937] [DEVICE2]write() with a kernel space buffer = -14
[17444.106939] [DEVICE2]dup() = 4
[17444.106940] [DEVICE2]close() = 0
[17444.106940] [DEVICE2]dup() = 0
The first call to dup() returns 4 because the current process is insmod. The latter opened the module file and got file descriptor 3. Hence, the first available file descriptor is 4. The second call to dup() returns 0 because we closed the file descriptor 0.
Second try with a user space buffer
To use a user space buffer, let's add some file operations to the kernel module (open(), release() and write()). In the write() entry point, we echo what is passed from user space to stderr (file descriptor 2), reusing the user space buffer passed to the write() entry point:
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/ptrace.h>
#include <linux/socket.h>
#include <linux/kallsyms.h>
#include <linux/cdev.h>
MODULE_LICENSE("GPL");

/*
 * Prototype of one entry of the system call table. Each entry receives its
 * arguments through a register snapshot and returns a long (a byte count, a
 * file descriptor, or a negative errno), so the pointer type must return
 * long as well rather than int.
 */
typedef long (*syscall_wrapper)(const struct pt_regs *);

/* Address of sys_call_table as resolved at load time; 0 until then. */
static unsigned long sys_call_table_addr;

/* Prefix put in front of every kernel log line emitted by this module. */
#define DEV_NAME "[DEVICE2]"

/* write() handler taken from the system call table by device2_init(). */
static syscall_wrapper write_syscall;
/*
 * write() entry point of the device: echo the caller's data to file
 * descriptor 2 by invoking the write() system call handler directly.
 *
 * @buff is the pointer received from the caller and is forwarded untouched,
 * so it still references user space when it reaches the handler.
 *
 * Always returns @len, whatever the echo returned, so that the caller sees
 * its data as fully consumed and does not retry.
 */
static ssize_t device2_write(struct file *filp, const char *buff, size_t len, loff_t * off)
{
	/* Start from all-zero registers so no stack garbage reaches the handler. */
	struct pt_regs param = { 0 };
	int rc;

	printk(KERN_INFO DEV_NAME "write %p, %zu\n", buff, len);

	/*
	 * Call to write() system call to echo the write to stderr.
	 * x86_64 convention: RDI = fd, RSI = buffer, RDX = length.
	 */
	param.di = 2;
	param.si = (unsigned long)buff;
	param.dx = len;
	rc = (*write_syscall)(&param);
	printk(KERN_INFO DEV_NAME "write() = %d\n", rc);

	return len; // <-------------- To stop the write
}
/*
 * open() entry point of the device: nothing to set up, the call is only
 * traced in the kernel log. Always returns 0.
 */
static int device2_open(struct inode *inode, struct file *file)
{
	printk(KERN_INFO DEV_NAME "open\n");

	return 0;
}
/*
 * release() entry point of the device: nothing to tear down, the call is only
 * traced in the kernel log. Always returns 0.
 */
static int device2_release(struct inode *inode, struct file *file)
{
	printk(KERN_INFO DEV_NAME "released\n");

	return 0;
}
/* Entry points of the character device: open, write and release only. */
static const struct file_operations fops = {
	.owner   = THIS_MODULE,
	.open    = device2_open,
	.release = device2_release,
	.write   = device2_write,
};
/* Character device object allocated by device2_init(). */
static struct cdev *device_cdev;
/* Device number (major/minor) obtained from alloc_chrdev_region(). */
static dev_t deviceNumbers;
/*
 * Module entry point: reserve a dynamically chosen device number, register
 * the character device, then fetch the write() handler from the system call
 * table for later use by device2_write().
 *
 * Returns 0 on success or a negative errno; on failure everything acquired
 * so far is released again.
 */
static int __init device2_init(void)
{
	int rc;

	printk(KERN_INFO DEV_NAME "module has been loaded\n");

	/* This returns the major number chosen dynamically in deviceNumbers */
	rc = alloc_chrdev_region(&deviceNumbers, 0, 1, DEV_NAME);
	if (rc < 0) {
		printk(KERN_ALERT DEV_NAME "Error registering: %d\n", rc);
		return rc;
	}

	device_cdev = cdev_alloc();
	if (!device_cdev) {
		printk(KERN_ALERT DEV_NAME "Error allocating cdev\n");
		rc = -ENOMEM;
		goto err_region;
	}
	/*
	 * A cdev obtained from cdev_alloc() is already initialised; running
	 * cdev_init() on it would re-initialise the embedded kobject and the
	 * object would no longer be freed on release. Only the fields are set.
	 */
	device_cdev->owner = THIS_MODULE;
	device_cdev->ops = &fops;

	rc = cdev_add(device_cdev, deviceNumbers, 1);
	if (rc < 0) {
		printk(KERN_ALERT DEV_NAME "Error adding cdev: %d\n", rc);
		/* Never went live: dropping the reference frees the object. */
		kobject_put(&device_cdev->kobj);
		goto err_region;
	}
	printk(KERN_INFO DEV_NAME "initialized (major number is %d)\n", MAJOR(deviceNumbers));

	sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
	printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);
	/* kallsyms_lookup_name() returns 0 for an unknown symbol. */
	if (!sys_call_table_addr) {
		printk(KERN_ALERT DEV_NAME "sys_call_table not found\n");
		rc = -ENOENT;
		goto err_cdev;
	}

	write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];
	printk(KERN_INFO DEV_NAME "write_syscall@%p\n", write_syscall);

	return 0;

err_cdev:
	cdev_del(device_cdev);
err_region:
	unregister_chrdev_region(deviceNumbers, 1);
	return rc;
}
/*
 * Module exit point: remove the character device and give back its device
 * number. Without this, the device would stay registered after unload with
 * file operations pointing into the module's freed code.
 */
static void __exit device2_exit(void)
{
	cdev_del(device_cdev);
	unregister_chrdev_region(deviceNumbers, 1);
	printk(KERN_INFO DEV_NAME "module has been unloaded\n");
}
/* Declare device2_init() and device2_exit() as the module's load and unload handlers. */
module_init(device2_init);
module_exit(device2_exit);
The loading of the module:
$ sudo insmod device2.ko
$ dmesg
[ 2255.183196] [DEVICE2]module has been loaded
[ 2255.183202] [DEVICE2]initialized (major number is 508)
[ 2255.193255] [DEVICE2]sys_call_table@ffffffffbcc013a0
[ 2255.193256] [DEVICE2]write_syscall@0000000030394929
Make the device entry in the file system to be able to write into it:
$ sudo mknod /dev/device2 c 508 0
$ sudo chmod 666 /dev/device2
$ sudo ls -l /dev/device2
crw-rw-rw- 1 root root 508, 0 janv. 24 16:55 /dev/device2
The writing into the device triggers the expected echo on stderr:
$ echo "qwerty for test purposes" > /dev/device2
qwerty for test purposes
$ echo "another string" > /dev/device2
another string
$ dmesg
[ 2255.183196] [DEVICE2]module has been loaded
[ 2255.183202] [DEVICE2]initialized (major number is 508)
[ 2255.193255] [DEVICE2]sys_call_table@ffffffffbcc013a0
[ 2255.193256] [DEVICE2]write_syscall@0000000030394929
[ 2441.674250] [DEVICE2]open
[ 2441.674268] [DEVICE2]write 0000000032fb5249, 25
[ 2441.674281] [DEVICE2]write() = 25
[ 2441.674286] [DEVICE2]released
[ 2475.538140] [DEVICE2]open
[ 2475.538159] [DEVICE2]write 0000000032fb5249, 15
[ 2475.538171] [DEVICE2]write() = 15
[ 2475.538175] [DEVICE2]released