@@ -104,7 +104,10 @@ where
attribute - used to specify the attribute of the overridden bits.
It can be ro, rw, wc, rc to mean read-only, read-write,
- write-clear, and read-clear respectively.
+ write-clear, and read-clear respectively. A special attr-
+ ibute tc is defined to trace hw access through the trace
+ event named iohook_event. When tc is used as the attribute
+ value/mask are ignored.
domain - pci domain number
bus - pci bus number
@@ -299,3 +302,52 @@ bash # echo 1 > trigger
dmesg shows:
pcieport 0000:00:05.0: AER: Corrected error received: id=0500
+3.4 Tracing hardware access by the OS
+
+I/O Hook can be used to trace access to hw registers by the OS. When compared
+to mmio trace, this IO Hook trace has both benefits and shortcomings:
+ a) I/O Hook is SMP safe while mmio trace isn't.
+ b) mmio trace can trace arbitrary assembly instruction accessing
+ arbitrary address, while IO Hook can only trace standard
+ functions like readl(), inb(), pci_read_config_byte(), etc.
+ C) I/O Hook can trace IO ports and PCI Config space, while mmio trace
+ cannot.
+
+A trace event named 'iohook_event' is defined to report the accesses. A
+special attribute tc can be used to specify the register to be traced.
+For example in order to trace access to physical address region starting
+from 0x383ffff00000 with a length of 0x4000, the following cmd sequence is used:
+
+bash # echo "0x383ffff00000-4000[0/0]tc" > /sys/kernel/debug/tracing/iohook/mem
+bash # echo 1 > /sys/kernel/debug/iohook/trigger
+
+To trace hw access during boot, the following boot parameters can be used in
+grub:
+ iohook.mem, iohook.io, iohook.pciconf, iohook.msr
+For example
+ trace_event=iohook_event iohook.mem="383ffff00000-4000[0/0]tc" as boot
+parameters in grub can be used to trace the same physical address region
+starting from 0x383ffff00000. A typical trace event output looks like the
+following:
+
+bash $ sudo cat /sys/kernel/debug/tracing/trace
+...
+swapper/0-1 [018] .... 11.594157: iohook_event: pci_read_config(0000:00:14.00, 0x6, 0x2) <== 0x290
+
+swapper/0-1 [018] .... 11.599914: iohook_event: pci_read_config(0000:00:14.00, 0x34, 0x1) <== 0x70
+
+modprobe-613 [013] d... 12.447408: iohook_event: pci_write_config(0000:00:14.00, 0x4, 0x2) ==> 0x6
+
+modprobe-613 [013] .... 12.452565: iohook_event: pci_read_config(0000:00:14.00, 0xd, 0x1) <== 0x0
+
+modprobe-613 [013] d... 12.458302: iohook_event: pci_write_config(0000:00:14.00, 0xd, 0x1) ==> 0x40
+
+modprobe-613 [013] .... 12.477935: iohook_event: read(0x383ffff00000, 0x4) <== 0x1000080
+
+modprobe-613 [013] .... 12.477936: iohook_event: read(0x383ffff00018, 0x4) <== 0x2000
+
+modprobe-613 [013] d... 12.478001: iohook_event: write(0x383ffff02030, 0x4) ==> 0x37822000
+
+modprobe-613 [013] d... 12.478001: iohook_event: write(0x383ffff02034, 0x4) ==> 0x0
+...
+
@@ -25,6 +25,9 @@
#include <linux/smp.h>
#include "iohook.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/iohook.h>
+
MODULE_LICENSE("GPL");
static struct dentry *hook_irq_dentry;
@@ -42,6 +45,37 @@ struct static_key ovrdhw_enabled = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL(ovrdhw_enabled);
int g_ovrd_on;
+
+char *hook_func_names[] = {
+ "read",
+ "write",
+ "in",
+ "out",
+ "pci_read_config",
+ "pci_write_config",
+ "rdmsr",
+ "wrmsr"
+};
+
+const char *iohook_trace_parse_addr(struct trace_seq *p, int type, u64 address)
+{
+ const char *ret = p->buffer + p->len;
+
+ if (type == PCI_RD || type == PCI_WR)
+ trace_seq_printf(p, "%04x:%02x:%02x.%02x, 0x%x",
+ PCI_DECODE_DOMAIN(address),
+ PCI_DECODE_BUSN(address),
+ PCI_SLOT(PCI_DECODE_DEVFN(address)),
+ PCI_FUNC(PCI_DECODE_DEVFN(address)),
+ PCI_DECODE_POS(address));
+ else
+ trace_seq_printf(p, "0x%llx", address);
+ trace_seq_putc(p, 0);
+
+ return ret;
+
+}
+
/* query a Register Override given spaceid and index */
struct reg_ovrd *iohook_query_ovrd(int spaceid, int idx)
{
@@ -330,6 +364,85 @@ static u64 v2p(u64 vaddr)
return slow_virt_to_phys((void *)vaddr);
}
+static int pci_rd(struct pci_bus *pcib, u64 address, int len, void *data)
+{
+ unsigned int devfn = 0, pos = 0;
+ unsigned long flags;
+ int res;
+
+ devfn = PCI_DECODE_DEVFN(address);
+ pos = PCI_DECODE_POS(address);
+
+ raw_spin_lock_irqsave(&pci_lock, flags);
+ res = pcib->ops->read(pcib, devfn, pos, len,
+ (u32 *)data);
+ raw_spin_unlock_irqrestore(&pci_lock, flags);
+ return res;
+}
+
+static int pci_wr(struct pci_bus *pcib, u64 address, int len, u32 value)
+{
+ unsigned int devfn = 0, pos = 0;
+ unsigned long flags;
+ int res;
+
+ devfn = PCI_DECODE_DEVFN(address);
+ pos = PCI_DECODE_POS(address);
+
+ raw_spin_lock_irqsave(&pci_lock, flags);
+ res = pcib->ops->write(pcib, devfn, pos, len, value);
+ raw_spin_unlock_irqrestore(&pci_lock, flags);
+ return res;
+}
+
+static int msr_rd(void *bus, u64 address, u64 *data)
+{
+ struct msr_regs_info *rv;
+ unsigned int msr;
+ int res;
+
+ rv = (struct msr_regs_info *)bus;
+ msr = address & 0xffffffff;
+ if (rv && rv->regs) { /* rdmsr_safe_regs() */
+ u32 low, high;
+
+ rv->err = rdmsr_safe_regs(rv->regs);
+ low = rv->regs[0]; /* eax */
+ high = rv->regs[2]; /* edx */
+ *data = low | ((u64)high << 32);
+ res = rv->err;
+ } else if (rv) { /* rv->regs == NULL */
+ *data = native_do_read_msr_safe(msr,
+ &rv->err);
+ res = rv->err;
+ } else { /* rv == NULL */
+ *data = native_do_read_msr(msr);
+ res = 0;
+ }
+
+ return res;
+}
+
+static void msr_wr(void *bus, u64 address, u64 value)
+{
+ struct msr_regs_info *rv;
+ unsigned int msr, low, high;
+
+ rv = (struct msr_regs_info *)bus;
+ msr = address & 0xffffffff;
+ low = value & 0xffffffff;
+ high = value >> 32;
+
+ if (rv && rv->regs) { /* wrmsr_safe_regs() */
+ rv->err = wrmsr_safe_regs(rv->regs);
+ } else if (rv) { /* rv->regs == NULL */
+ rv->err = native_do_write_msr_safe(msr,
+ low, high);
+ } else { /* rv == NULL */
+ native_do_write_msr(msr, low, high);
+ }
+}
+
/* shift left if i>=0, otherwise shift right */
#define BYTE_SHIFT(value, i) \
((i) >= 0 ? (value) << (i)*8 : (value) >> (-i)*8)
@@ -338,11 +451,9 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
{
struct list_head *ovrd_list;
struct reg_ovrd *ovrd_reg;
- struct pci_bus *pcib;
- unsigned long lock_flags = 0, flags = 0;
+ unsigned long lock_flags = 0;
u64 faddress, vaddr = 0;
u64 data, bit_mask, attrib, val;
- unsigned int devfn = 0, pos = 0;
int i, flength, res, ret;
ret = -EINVAL;
@@ -358,8 +469,6 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
} else if (spaceid == OVRD_SPACE_IO) {
ovrd_list = &ovrd_io_reg_map;
} else if (spaceid == OVRD_SPACE_PCICONF) {
- devfn = PCI_DECODE_DEVFN(address);
- pos = PCI_DECODE_POS(address);
ovrd_list = &ovrd_pci_conf_reg_map;
} else if (spaceid == OVRD_SPACE_MSR) {
unsigned int cpuid;
@@ -398,6 +507,27 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
lock_flags);
/* at least one byte falls into the overridden range */
+
+ if (attrib == OVRD_TC) {
+ /* trace hw access */
+ if (spaceid == OVRD_SPACE_MEM) {
+ ret = mem_read(vaddr, len, &data);
+ trace_iohook_event(MM_RD, len, address, data);
+ } else if (spaceid == OVRD_SPACE_IO) {
+ ret = io_read(address, len, &data);
+ trace_iohook_event(IO_RD, len, address, data);
+ } else if (spaceid == OVRD_SPACE_PCICONF) {
+ ret = pci_rd(bus, address, len, &data);
+ trace_iohook_event(PCI_RD, len, address, data);
+ } else if (spaceid == OVRD_SPACE_MSR) {
+ ret = msr_rd(bus, address, &data);
+ trace_iohook_event(MSR_RD, len, address, data);
+ } else
+ trace_iohook_event(MSR_WR + 1, len, address,
+ data);
+ goto read_done;
+ }
+
data = 0;
ret = 0;
if (!(address >= faddress && address+len <= faddress+flength &&
@@ -409,33 +539,9 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
} else if (spaceid == OVRD_SPACE_IO) {
res = io_read(address, len, &data);
} else if (spaceid == OVRD_SPACE_PCICONF) {
- raw_spin_lock_irqsave(&pci_lock, flags);
- pcib = (struct pci_bus *)bus;
- res = pcib->ops->read(pcib, devfn, pos, len,
- (u32 *)&data);
- raw_spin_unlock_irqrestore(&pci_lock, flags);
+ res = pci_rd(bus, address, len, &data);
} else if (spaceid == OVRD_SPACE_MSR) {
- struct msr_regs_info *rv;
- unsigned int msr;
-
- rv = (struct msr_regs_info *)bus;
- msr = address & 0xffffffff;
- if (rv && rv->regs) { /* rdmsr_safe_regs() */
- u32 low, high;
-
- rv->err = rdmsr_safe_regs(rv->regs);
- low = rv->regs[0]; /* eax */
- high = rv->regs[2]; /* edx */
- data = low | ((u64)high << 32);
- res = rv->err;
- } else if (rv) { /* rv->regs == NULL */
- data = native_do_read_msr_safe(msr,
- &rv->err);
- res = rv->err;
- } else { /* rv == NULL */
- data = native_do_read_msr(msr);
- res = 0;
- }
+ res = msr_rd(bus, address, &data);
} else
goto out;
@@ -475,6 +581,7 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
}
}
+read_done:
switch (len) {
case 1:
*(u8 *)value = (u8)data;
@@ -490,11 +597,10 @@ int read_ovrd_common(int spaceid, u64 address, int len, void *value, void *bus)
break;
default:
ret = -EINVAL;
- goto out;
+ break;
}
- raw_spin_lock_irqsave(&io_hook_lock,
- lock_flags);
+ goto out;
}
raw_spin_unlock_irqrestore(&io_hook_lock, lock_flags);
@@ -507,8 +613,7 @@ int write_ovrd_common(int spaceid, u64 address, int len, void *data, void *bus)
{
struct list_head *ovrd_list;
struct reg_ovrd *ovrd_reg;
- struct pci_bus *pcib;
- unsigned long lock_flags = 0, flags = 0;
+ unsigned long lock_flags = 0;
u64 faddress, vaddr = 0;
u64 bit_mask, val, attrib;
unsigned int devfn = 0, pos = 0;
@@ -567,42 +672,28 @@ int write_ovrd_common(int spaceid, u64 address, int len, void *data, void *bus)
ret = 0;
- if (!(address >= faddress && address+len <= faddress+flength &&
- bit_mask == (u64)((1<<flength*8) - 1))) {
- /* partially overridden. write to HW for real bits */
+ if (attrib == OVRD_TC) {
+ /* trace hw access */
if (spaceid == OVRD_SPACE_MEM) {
res = mem_write(vaddr, len, data);
+ trace_iohook_event(MM_WR, len, address, value);
} else if (spaceid == OVRD_SPACE_IO) {
res = io_write(address, len, data);
+ trace_iohook_event(IO_WR, len, address, value);
} else if (spaceid == OVRD_SPACE_PCICONF) {
raw_spin_unlock_irqrestore(&io_hook_lock,
lock_flags);
- raw_spin_lock_irqsave(&pci_lock, flags);
- pcib = (struct pci_bus *)bus;
- pcib->ops->write(pcib, devfn, pos, len,
- (u32)value);
- raw_spin_unlock_irqrestore(&pci_lock, flags);
+ pci_wr(bus, address, len, (u32)value);
raw_spin_lock_irqsave(&io_hook_lock,
lock_flags);
+ trace_iohook_event(PCI_WR, len, address, value);
} else if (spaceid == OVRD_SPACE_MSR) {
- struct msr_regs_info *rv;
- unsigned int msr, low, high;
-
- rv = (struct msr_regs_info *)bus;
- msr = address & 0xffffffff;
- low = value & 0xffffffff;
- high = value >> 32;
-
- if (rv && rv->regs) { /* wrmsr_safe_regs() */
- rv->err = wrmsr_safe_regs(rv->regs);
- } else if (rv) { /* rv->regs == NULL */
- rv->err = native_do_write_msr_safe(msr,
- low, high);
- } else { /* rv == NULL */
- native_do_write_msr(msr, low, high);
- }
+ msr_wr(bus, address, value);
+ trace_iohook_event(MSR_WR, len, address, value);
} else
- break;
+ trace_iohook_event(MSR_WR + 2, len, address,
+ value);
+ break;
}
for (i = 0; i < len; i++) {
@@ -634,10 +725,30 @@ int write_ovrd_common(int spaceid, u64 address, int len, void *data, void *bus)
}
}
+ /* finished using ovrd_reg->, safe to unlock */
+ raw_spin_unlock_irqrestore(&io_hook_lock, lock_flags);
+
+ if (!(address >= faddress && address+len <= faddress+flength &&
+ bit_mask == (u64)((1<<flength*8) - 1))) {
+ /* partially overridden. write to HW for real bits */
+ if (spaceid == OVRD_SPACE_MEM)
+ res = mem_write(vaddr, len, data);
+ else if (spaceid == OVRD_SPACE_IO)
+ res = io_write(address, len, data);
+ else if (spaceid == OVRD_SPACE_PCICONF)
+ pci_wr(bus, address, len, (u32)value);
+ else if (spaceid == OVRD_SPACE_MSR)
+ msr_wr(bus, address, value);
+ else
+ break;
+ }
+
+ goto out;
}
raw_spin_unlock_irqrestore(&io_hook_lock, lock_flags);
+out:
return ret;
}
EXPORT_SYMBOL(write_ovrd_common);
@@ -808,6 +919,8 @@ char *hook_attrib(int attrib)
return "wc";
case OVRD_RC:
return "rc";
+ case OVRD_TC:
+ return "tc";
}
return "(null)";
@@ -1019,6 +1132,8 @@ mem_io:
attrib = OVRD_RC;
else if (cval == *(u16 *)"wc")
attrib = OVRD_WC;
+ else if (cval == *(u16 *)"tc")
+ attrib = OVRD_TC;
else
return -1;
@@ -1044,10 +1159,32 @@ mem_io:
}
static ssize_t
+hook_write_overrides(char *buf, size_t len, int spaceid)
+{
+ char *pstr, *entry;
+
+ /* first delete all overrides */
+ iohook_cleanup_ovrd(spaceid);
+
+ buf[len] = '\0';
+ pstr = buf;
+ while (pstr && *pstr) {
+ pstr = skip_spaces(pstr);
+ entry = strsep(&pstr, " \t\x0D\x0A");
+ pr_info("hook_write_overrides: %s\n", entry);
+ if (hook_parse_entry(entry, spaceid))
+ return -EINVAL;
+ }
+
+ return len;
+
+}
+
+static ssize_t
hook_write(struct file *file, const char __user *user_buf,
size_t user_len, loff_t *offset, int spaceid)
{
- char *buf, *pstr, *entry;
+ char *buf;
ssize_t rc;
if (*offset)
@@ -1062,22 +1199,7 @@ hook_write(struct file *file, const char __user *user_buf,
goto out_free;
}
- /* first delete all overridden registers */
- iohook_cleanup_ovrd(spaceid);
-
- buf[user_len] = '\0';
- pstr = buf;
- while (pstr && *pstr) {
- pstr = skip_spaces(pstr);
- entry = strsep(&pstr, " \t\x0D\x0A");
- pr_info("hook_write input: %s\n", entry);
- if (hook_parse_entry(entry, spaceid)) {
- rc = -EINVAL;
- goto out_free;
- }
- }
-
- rc = user_len;
+ rc = hook_write_overrides(buf, user_len, spaceid);
out_free:
vfree(buf);
@@ -1158,6 +1280,45 @@ static const struct file_operations hook_trigger_fops = {
.write = hook_trigger_write,
};
+/* These params define persistent overrides across reboot */
+static char *io = "", *mem = "", *pciconf = "", *msr = "";
+module_param(io, charp, 0);
+MODULE_PARM_DESC(io, "define overrides in io port space");
+module_param(mem, charp, 0);
+MODULE_PARM_DESC(mem, "define overrides in mem space");
+module_param(pciconf, charp, 0);
+MODULE_PARM_DESC(pciconf, "define overrides in pci config space");
+module_param(msr, charp, 0);
+MODULE_PARM_DESC(msr, "define overrides in MSR space");
+
+static void
+add_persistent_overrides(void)
+{
+ int persist = 0;
+
+ if (*io) {
+ hook_write_overrides(io, strlen(io), OVRD_SPACE_IO);
+ persist++;
+ }
+ if (*mem) {
+ hook_write_overrides(mem, strlen(mem), OVRD_SPACE_MEM);
+ persist++;
+ }
+ if (*pciconf) {
+ hook_write_overrides(pciconf, strlen(pciconf),
+ OVRD_SPACE_PCICONF);
+ persist++;
+ }
+ if (*msr) {
+ hook_write_overrides(msr, strlen(msr), OVRD_SPACE_MSR);
+ persist++;
+ }
+
+ if (persist)
+ iohook_start_ovrd();
+
+}
+
static int __init iohook_init(void)
{
struct dentry *root;
@@ -1198,6 +1359,8 @@ static int __init iohook_init(void)
if (hook_reg_dentry == NULL)
return -ENODEV;
+ add_persistent_overrides();
+
return 0;
}
new file mode 100644
@@ -0,0 +1,58 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM iohook
+
+#if !defined(_TRACE_IOHOOK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IOHOOK_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#define MM_RD 0
+#define MM_WR 1
+#define IO_RD 2
+#define IO_WR 3
+#define PCI_RD 4
+#define PCI_WR 5
+#define MSR_RD 6
+#define MSR_WR 7
+
+extern char *hook_func_names[];
+
+const char *iohook_trace_parse_addr(struct trace_seq*, int, u64);
+#define __parse_addr iohook_trace_parse_addr(p, __entry->type, \
+ __entry->address)
+
+/**
+ * @type: type of hw access (MM_RD, MM_WR, IO_RD, PCI_RD ...)
+ */
+TRACE_EVENT(iohook_event,
+ TP_PROTO(const int type,
+ const int len,
+ const u64 address,
+ const u64 value),
+
+ TP_ARGS(type, len, address, value),
+
+ TP_STRUCT__entry(
+ __field( int, type)
+ __field( int, len)
+ __field( u64, address)
+ __field( u64, value)
+ ),
+
+ TP_fast_assign(
+ __entry->type = type;
+ __entry->len = len;
+ __entry->address = address;
+ __entry->value = value;
+ ),
+
+ TP_printk("%s(%s, 0x%x) %s 0x%llx\n", __entry->type > MSR_WR ?
+ "unknown" : hook_func_names[__entry->type], __parse_addr,
+ __entry->len, __entry->type % 2 ? "==>" : "<==", /* event/odd */
+ __entry->value & ((1ull << (8 * __entry->len)) - 1))
+);
+#endif /* _TRACE_IOHOOK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
A new attribute "tc" can be defined for any Register Override so that IO Hook can report each read from (or write to) the specified area in mem, io, msr, and pciconf spaces. A trace event named 'iohook_event' is defined to report the accesses. For example echo "0x383ffff00000-4000[0/0]tc" > /sys/kernel/debug/tracing/iohook/mem or trace_event=iohook_event iohook.mem="383ffff00000-4000[0/0]tc" as boot parameters in grub can be used to trace any access to physical address region starting from 0x383ffff00000 with a length of 0x4000. A typical trace event output looks like the following: bash $ sudo cat /sys/kernel/debug/tracing/trace ... swapper/0-1 [018] .... 11.594157: iohook_event: pci_read_config(0000:00:14.00, 0x6, 0x2) <== 0x290 swapper/0-1 [018] .... 11.599914: iohook_event: pci_read_config(0000:00:14.00, 0x34, 0x1) <== 0x70 modprobe-613 [013] d... 12.447408: iohook_event: pci_write_config(0000:00:14.00, 0x4, 0x2) ==> 0x6 modprobe-613 [013] .... 12.452565: iohook_event: pci_read_config(0000:00:14.00, 0xd, 0x1) <== 0x0 modprobe-613 [013] d... 12.458302: iohook_event: pci_write_config(0000:00:14.00, 0xd, 0x1) ==> 0x40 modprobe-613 [013] .... 12.477935: iohook_event: read(0x383ffff00000, 0x4) <== 0x1000080 modprobe-613 [013] .... 12.477936: iohook_event: read(0x383ffff00018, 0x4) <== 0x2000 modprobe-613 [013] d... 12.478001: iohook_event: write(0x383ffff02030, 0x4) ==> 0x37822000 modprobe-613 [013] d... 12.478001: iohook_event: write(0x383ffff02034, 0x4) ==> 0x0 ... Signed-off-by: Rui Wang <rui.y.wang@intel.com> --- Documentation/PCI/iohook.txt | 54 +++++++- drivers/misc/iohook/iohook.c | 319 +++++++++++++++++++++++++++++++---------- include/trace/events/iohook.h | 58 ++++++++ 3 files changed, 352 insertions(+), 79 deletions(-) create mode 100644 include/trace/events/iohook.h