--- /dev/null
+
+Configurable sysfs parameters for the x86-64 machine check code.
+
+Machine checks report internal hardware error conditions detected
+by the CPU. Uncorrected errors typically cause a machine check
+(often with panic), corrected ones cause a machine check log entry.
+
+Machine checks are organized in banks (normally associated with
+a hardware subsystem) and subevents in a bank. The exact meaning
+of the banks and subevent is CPU specific.
+
+mcelog knows how to decode them.
+
+When you see the "Machine check errors logged" message in the system
+log then mcelog should run to collect and decode machine check entries
+from /dev/mcelog. Normally mcelog should be run regularly from a cronjob.
+
+Each CPU has a directory in /sys/devices/system/machinecheck/machinecheckN
+(N = CPU number)
+
+The directory contains some configurable entries:
+
+Entries:
+
+bankNctl
+(N bank number)
+ 64bit Hex bitmask enabling/disabling specific subevents for bank N
+ When a bit in the bitmask is zero then the respective
+ subevent will not be reported.
+ By default all events are enabled.
+ Note that BIOS maintain another mask to disable specific events
+ per bank. This is not visible here
+
+The following entries appear for each CPU, but they are truly shared
+between all CPUs.
+
+check_interval
+ How often to poll for corrected machine check errors, in seconds
+ (Note output is hexademical). Default 5 minutes.
+
+tolerant
+ Tolerance level. When a machine check exception occurs for a non
+ corrected machine check the kernel can take different actions.
+ Since machine check exceptions can happen any time it is sometimes
+ risky for the kernel to kill a process because it defies
+ normal kernel locking rules. The tolerance level configures
+ how hard the kernel tries to recover even at some risk of deadlock.
+
+ 0: always panic,
+ 1: panic if deadlock possible,
+ 2: try to avoid panic,
+ 3: never panic or exit (for testing only)
+
+ Default: 1
+
+ Note this only makes a difference if the CPU allows recovery
+ from a machine check exception. Current x86 CPUs generally do not.
+
+trigger
+ Program to run when a machine check event is detected.
+ This is an alternative to running mcelog regularly from cron
+ and allows to detect events faster.
+
+TBD document entries for AMD threshold interrupt configuration
+
+For more details about the x86 machine check architecture
+see the Intel and AMD architecture manuals from their developer websites.
+
+For more details about the architecture see
+see http://one.firstfloor.org/~andi/mce.pdf
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
+#include <linux/kmod.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
static int notify_user;
static int rip_msr;
static int mce_bootlog = 1;
+static atomic_t mce_events;
+
+static char trigger[128];
+static char *trigger_argv[2] = { trigger, NULL };
/*
* Lockless MCE logging infrastructure.
void mce_log(struct mce *mce)
{
unsigned next, entry;
+ atomic_inc(&mce_events);
mce->finished = 0;
wmb();
for (;;) {
}
}
+static void do_mce_trigger(void)
+{
+ static atomic_t mce_logged;
+ int events = atomic_read(&mce_events);
+ if (events != atomic_read(&mce_logged) && trigger[0]) {
+ /* Small race window, but should be harmless. */
+ atomic_set(&mce_logged, events);
+ call_usermodehelper(trigger, trigger_argv, NULL, -1);
+ }
+}
+
/*
* The actual machine check handler
*/
}
/* Never do anything final in the polling timer */
- if (!regs)
+ if (!regs) {
+ /* Normal interrupt context here. Call trigger for any new
+ events. */
+ do_mce_trigger();
goto out;
+ }
/* If we didn't find an uncorrectable error, pick
the last one (shouldn't happen, just being safe). */
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
+/* TBD should generate these dynamically based on number of available banks */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())
-static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
- &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
- &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
+
+static ssize_t show_trigger(struct sys_device *s, char *buf)
+{
+ strcpy(buf, trigger);
+ strcat(buf, "\n");
+ return strlen(trigger) + 1;
+}
+
+static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
+{
+ char *p;
+ int len;
+ strncpy(trigger, buf, sizeof(trigger));
+ trigger[sizeof(trigger)-1] = 0;
+ len = strlen(trigger);
+ p = strchr(trigger, '\n');
+ if (*p) *p = 0;
+ return len;
+}
+
+static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
+static struct sysdev_attribute *mce_attributes[] = {
+ &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
+ &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
+ &attr_tolerant, &attr_check_interval, &attr_trigger,
+ NULL
+};
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
err = sysdev_register(&per_cpu(device_mce,cpu));
if (!err) {
- for (i = 0; i < banks; i++)
+ for (i = 0; mce_attributes[i]; i++)
sysdev_create_file(&per_cpu(device_mce,cpu),
- bank_attributes[i]);
- sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
- sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+ mce_attributes[i]);
}
return err;
}
{
int i;
- for (i = 0; i < banks; i++)
+ for (i = 0; mce_attributes[i]; i++)
sysdev_remove_file(&per_cpu(device_mce,cpu),
- bank_attributes[i]);
- sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
- sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+ mce_attributes[i]);
sysdev_unregister(&per_cpu(device_mce,cpu));
memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
}
sub_info->retval = ret;
}
- complete(sub_info->complete);
+ if (sub_info->wait < 0)
+ kfree(sub_info);
+ else
+ complete(sub_info->complete);
return 0;
}
pid = kernel_thread(____call_usermodehelper, sub_info,
CLONE_VFORK | SIGCHLD);
+ if (wait < 0)
+ return;
+
if (pid < 0) {
sub_info->retval = pid;
complete(sub_info->complete);
* @envp: null-terminated environment list
* @session_keyring: session keyring for process (NULL for an empty keyring)
* @wait: wait for the application to finish and return status.
+ * when -1 don't wait at all, but you get no useful error back when
+ * the program couldn't be exec'ed. This makes it safe to call
+ * from interrupt context.
*
* Runs a user-space application. The application is started
* asynchronously if wait is not set, and runs as a child of keventd.
struct key *session_keyring, int wait)
{
DECLARE_COMPLETION_ONSTACK(done);
- struct subprocess_info sub_info = {
- .work = __WORK_INITIALIZER(sub_info.work,
- __call_usermodehelper),
- .complete = &done,
- .path = path,
- .argv = argv,
- .envp = envp,
- .ring = session_keyring,
- .wait = wait,
- .retval = 0,
- };
+ struct subprocess_info *sub_info;
+ int retval;
if (!khelper_wq)
return -EBUSY;
if (path[0] == '\0')
return 0;
- queue_work(khelper_wq, &sub_info.work);
+ sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
+ if (!sub_info)
+ return -ENOMEM;
+
+ INIT_WORK(&sub_info->work, __call_usermodehelper);
+ sub_info->complete = &done;
+ sub_info->path = path;
+ sub_info->argv = argv;
+ sub_info->envp = envp;
+ sub_info->ring = session_keyring;
+ sub_info->wait = wait;
+
+ queue_work(khelper_wq, &sub_info->work);
+ if (wait < 0) /* task has freed sub_info */
+ return 0;
wait_for_completion(&done);
- return sub_info.retval;
+ retval = sub_info->retval;
+ kfree(sub_info);
+ return retval;
}
EXPORT_SYMBOL(call_usermodehelper_keys);