+++ /dev/null
-
- Hypervisor-Assisted Dump
- ------------------------
- November 2007
-
-The goal of hypervisor-assisted dump is to enable the dump of
-a crashed system, and to do so from a fully-reset system, and
-to minimize the total elapsed time until the system is back
-in production use.
-
-As compared to kdump or other strategies, hypervisor-assisted
-dump offers several strong, practical advantages:
-
--- Unlike kdump, the system has been reset, and loaded
- with a fresh copy of the kernel. In particular,
- PCI and I/O devices have been reinitialized and are
- in a clean, consistent state.
--- As the dump is performed, the dumped memory becomes
- immediately available to the system for normal use.
--- After the dump is completed, no further reboots are
- required; the system will be fully usable, and running
- in its normal, production mode on its normal kernel.
-
-The above can only be accomplished by coordination with,
-and assistance from the hypervisor. The procedure is
-as follows:
-
--- When a system crashes, the hypervisor will save
- the low 256MB of RAM to a previously registered
- save region. It will also save system state, system
- registers, and hardware PTE's.
-
--- After the low 256MB area has been saved, the
- hypervisor will reset PCI and other hardware state.
- It will *not* clear RAM. It will then launch the
- bootloader, as normal.
-
--- The freshly booted kernel will notice that there
- is a new node (ibm,dump-kernel) in the device tree,
- indicating that there is crash data available from
- a previous boot. It will boot into only 256MB of RAM,
- reserving the rest of system memory.
-
--- Userspace tools will parse /sys/kernel/release_region
- and read /proc/vmcore to obtain the contents of memory,
- which holds the previous crashed kernel. The userspace
- tools may copy this info to disk, or network, nas, san,
- iscsi, etc. as desired.
-
- For Example: the values in /sys/kernel/release-region
- would look something like this (address-range pairs).
- CPU:0x177fee000-0x10000: HPTE:0x177ffe020-0x1000: /
- DUMP:0x177fff020-0x10000000, 0x10000000-0x16F1D370A
-
--- As the userspace tools complete saving a portion of
- dump, they echo an offset and size to
- /sys/kernel/release_region to release the reserved
- memory back to general use.
-
- An example of this is:
- "echo 0x40000000 0x10000000 > /sys/kernel/release_region"
- which will release 256MB at the 1GB boundary.
-
-Please note that the hypervisor-assisted dump feature
-is only available on Power6-based systems with recent
-firmware versions.
-
-Implementation details:
-----------------------
-
-During boot, a check is made to see if firmware supports
-this feature on this particular machine. If it does, then
-we check to see if a active dump is waiting for us. If yes
-then everything but 256 MB of RAM is reserved during early
-boot. This area is released once we collect a dump from user
-land scripts that are run. If there is dump data, then
-the /sys/kernel/release_region file is created, and
-the reserved memory is held.
-
-If there is no waiting dump data, then only the highest
-256MB of the ram is reserved as a scratch area. This area
-is *not* released: this region will be kept permanently
-reserved, so that it can act as a receptacle for a copy
-of the low 256MB in the case a crash does occur. See,
-however, "open issues" below, as to whether
-such a reserved region is really needed.
-
-Currently the dump will be copied from /proc/vmcore to a
-a new file upon user intervention. The starting address
-to be read and the range for each data point in provided
-in /sys/kernel/release_region.
-
-The tools to examine the dump will be same as the ones
-used for kdump.
-
-General notes:
---------------
-Security: please note that there are potential security issues
-with any sort of dump mechanism. In particular, plaintext
-(unencrypted) data, and possibly passwords, may be present in
-the dump data. Userspace tools must take adequate precautions to
-preserve security.
-
-Open issues/ToDo:
-------------
- o The various code paths that tell the hypervisor that a crash
- occurred, vs. it simply being a normal reboot, should be
- reviewed, and possibly clarified/fixed.
-
- o Instead of using /sys/kernel, should there be a /sys/dump
- instead? There is a dump_subsys being created by the s390 code,
- perhaps the pseries code should use a similar layout as well.
-
- o Is reserving a 256MB region really required? The goal of
- reserving a 256MB scratch area is to make sure that no
- important crash data is clobbered when the hypervisor
- save low mem to the scratch area. But, if one could assure
- that nothing important is located in some 256MB area, then
- it would not need to be reserved. Something that can be
- improved in subsequent versions.
-
- o Still working the kdump team to integrate this with kdump,
- some work remains but this would not affect the current
- patches.
-
- o Still need to write a shell script, to copy the dump away.
- Currently I am parsing it manually.
+++ /dev/null
-/*
- * Hypervisor-assisted dump
- *
- * Linas Vepstas, Manish Ahuja 2008
- * Copyright 2008 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/kobject.h>
-#include <linux/mm.h>
-#include <linux/of.h>
-#include <linux/pfn.h>
-#include <linux/swap.h>
-#include <linux/sysfs.h>
-
-#include <asm/page.h>
-#include <asm/phyp_dump.h>
-#include <asm/machdep.h>
-#include <asm/prom.h>
-#include <asm/rtas.h>
-
-/* Variables, used to communicate data between early boot and late boot */
-static struct phyp_dump phyp_dump_vars;
-struct phyp_dump *phyp_dump_info = &phyp_dump_vars;
-
-static int ibm_configure_kernel_dump;
-/* ------------------------------------------------- */
-/* RTAS interfaces to declare the dump regions */
-
-struct dump_section {
- u32 dump_flags;
- u16 source_type;
- u16 error_flags;
- u64 source_address;
- u64 source_length;
- u64 length_copied;
- u64 destination_address;
-};
-
-struct phyp_dump_header {
- u32 version;
- u16 num_of_sections;
- u16 status;
-
- u32 first_offset_section;
- u32 dump_disk_section;
- u64 block_num_dd;
- u64 num_of_blocks_dd;
- u32 offset_dd;
- u32 maxtime_to_auto;
- /* No dump disk path string used */
-
- struct dump_section cpu_data;
- struct dump_section hpte_data;
- struct dump_section kernel_data;
-};
-
-/* The dump header *must be* in low memory, so .bss it */
-static struct phyp_dump_header phdr;
-
-#define NUM_DUMP_SECTIONS 3
-#define DUMP_HEADER_VERSION 0x1
-#define DUMP_REQUEST_FLAG 0x1
-#define DUMP_SOURCE_CPU 0x0001
-#define DUMP_SOURCE_HPTE 0x0002
-#define DUMP_SOURCE_RMO 0x0011
-#define DUMP_ERROR_FLAG 0x2000
-#define DUMP_TRIGGERED 0x4000
-#define DUMP_PERFORMED 0x8000
-
-
-/**
- * init_dump_header() - initialize the header declaring a dump
- * Returns: length of dump save area.
- *
- * When the hypervisor saves crashed state, it needs to put
- * it somewhere. The dump header tells the hypervisor where
- * the data can be saved.
- */
-static unsigned long init_dump_header(struct phyp_dump_header *ph)
-{
- unsigned long addr_offset = 0;
-
- /* Set up the dump header */
- ph->version = DUMP_HEADER_VERSION;
- ph->num_of_sections = NUM_DUMP_SECTIONS;
- ph->status = 0;
-
- ph->first_offset_section =
- (u32)offsetof(struct phyp_dump_header, cpu_data);
- ph->dump_disk_section = 0;
- ph->block_num_dd = 0;
- ph->num_of_blocks_dd = 0;
- ph->offset_dd = 0;
-
- ph->maxtime_to_auto = 0; /* disabled */
-
- /* The first two sections are mandatory */
- ph->cpu_data.dump_flags = DUMP_REQUEST_FLAG;
- ph->cpu_data.source_type = DUMP_SOURCE_CPU;
- ph->cpu_data.source_address = 0;
- ph->cpu_data.source_length = phyp_dump_info->cpu_state_size;
- ph->cpu_data.destination_address = addr_offset;
- addr_offset += phyp_dump_info->cpu_state_size;
-
- ph->hpte_data.dump_flags = DUMP_REQUEST_FLAG;
- ph->hpte_data.source_type = DUMP_SOURCE_HPTE;
- ph->hpte_data.source_address = 0;
- ph->hpte_data.source_length = phyp_dump_info->hpte_region_size;
- ph->hpte_data.destination_address = addr_offset;
- addr_offset += phyp_dump_info->hpte_region_size;
-
- /* This section describes the low kernel region */
- ph->kernel_data.dump_flags = DUMP_REQUEST_FLAG;
- ph->kernel_data.source_type = DUMP_SOURCE_RMO;
- ph->kernel_data.source_address = PHYP_DUMP_RMR_START;
- ph->kernel_data.source_length = PHYP_DUMP_RMR_END;
- ph->kernel_data.destination_address = addr_offset;
- addr_offset += ph->kernel_data.source_length;
-
- return addr_offset;
-}
-
-static void print_dump_header(const struct phyp_dump_header *ph)
-{
-#ifdef DEBUG
- if (ph == NULL)
- return;
-
- printk(KERN_INFO "dump header:\n");
- /* setup some ph->sections required */
- printk(KERN_INFO "version = %d\n", ph->version);
- printk(KERN_INFO "Sections = %d\n", ph->num_of_sections);
- printk(KERN_INFO "Status = 0x%x\n", ph->status);
-
- /* No ph->disk, so all should be set to 0 */
- printk(KERN_INFO "Offset to first section 0x%x\n",
- ph->first_offset_section);
- printk(KERN_INFO "dump disk sections should be zero\n");
- printk(KERN_INFO "dump disk section = %d\n", ph->dump_disk_section);
- printk(KERN_INFO "block num = %lld\n", ph->block_num_dd);
- printk(KERN_INFO "number of blocks = %lld\n", ph->num_of_blocks_dd);
- printk(KERN_INFO "dump disk offset = %d\n", ph->offset_dd);
- printk(KERN_INFO "Max auto time= %d\n", ph->maxtime_to_auto);
-
- /*set cpu state and hpte states as well scratch pad area */
- printk(KERN_INFO " CPU AREA\n");
- printk(KERN_INFO "cpu dump_flags =%d\n", ph->cpu_data.dump_flags);
- printk(KERN_INFO "cpu source_type =%d\n", ph->cpu_data.source_type);
- printk(KERN_INFO "cpu error_flags =%d\n", ph->cpu_data.error_flags);
- printk(KERN_INFO "cpu source_address =%llx\n",
- ph->cpu_data.source_address);
- printk(KERN_INFO "cpu source_length =%llx\n",
- ph->cpu_data.source_length);
- printk(KERN_INFO "cpu length_copied =%llx\n",
- ph->cpu_data.length_copied);
-
- printk(KERN_INFO " HPTE AREA\n");
- printk(KERN_INFO "HPTE dump_flags =%d\n", ph->hpte_data.dump_flags);
- printk(KERN_INFO "HPTE source_type =%d\n", ph->hpte_data.source_type);
- printk(KERN_INFO "HPTE error_flags =%d\n", ph->hpte_data.error_flags);
- printk(KERN_INFO "HPTE source_address =%llx\n",
- ph->hpte_data.source_address);
- printk(KERN_INFO "HPTE source_length =%llx\n",
- ph->hpte_data.source_length);
- printk(KERN_INFO "HPTE length_copied =%llx\n",
- ph->hpte_data.length_copied);
-
- printk(KERN_INFO " SRSD AREA\n");
- printk(KERN_INFO "SRSD dump_flags =%d\n", ph->kernel_data.dump_flags);
- printk(KERN_INFO "SRSD source_type =%d\n", ph->kernel_data.source_type);
- printk(KERN_INFO "SRSD error_flags =%d\n", ph->kernel_data.error_flags);
- printk(KERN_INFO "SRSD source_address =%llx\n",
- ph->kernel_data.source_address);
- printk(KERN_INFO "SRSD source_length =%llx\n",
- ph->kernel_data.source_length);
- printk(KERN_INFO "SRSD length_copied =%llx\n",
- ph->kernel_data.length_copied);
-#endif
-}
-
-static ssize_t show_phyp_dump_active(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
-
- /* create filesystem entry so kdump is phyp-dump aware */
- return sprintf(buf, "%lx\n", phyp_dump_info->phyp_dump_at_boot);
-}
-
-static struct kobj_attribute pdl = __ATTR(phyp_dump_active, 0600,
- show_phyp_dump_active,
- NULL);
-
-static void register_dump_area(struct phyp_dump_header *ph, unsigned long addr)
-{
- int rc;
-
- /* Add addr value if not initialized before */
- if (ph->cpu_data.destination_address == 0) {
- ph->cpu_data.destination_address += addr;
- ph->hpte_data.destination_address += addr;
- ph->kernel_data.destination_address += addr;
- }
-
- /* ToDo Invalidate kdump and free memory range. */
-
- do {
- rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL,
- 1, ph, sizeof(struct phyp_dump_header));
- } while (rtas_busy_delay(rc));
-
- if (rc) {
- printk(KERN_ERR "phyp-dump: unexpected error (%d) on "
- "register\n", rc);
- print_dump_header(ph);
- return;
- }
-
- rc = sysfs_create_file(kernel_kobj, &pdl.attr);
- if (rc)
- printk(KERN_ERR "phyp-dump: unable to create sysfs"
- " file (%d)\n", rc);
-}
-
-static
-void invalidate_last_dump(struct phyp_dump_header *ph, unsigned long addr)
-{
- int rc;
-
- /* Add addr value if not initialized before */
- if (ph->cpu_data.destination_address == 0) {
- ph->cpu_data.destination_address += addr;
- ph->hpte_data.destination_address += addr;
- ph->kernel_data.destination_address += addr;
- }
-
- do {
- rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL,
- 2, ph, sizeof(struct phyp_dump_header));
- } while (rtas_busy_delay(rc));
-
- if (rc) {
- printk(KERN_ERR "phyp-dump: unexpected error (%d) "
- "on invalidate\n", rc);
- print_dump_header(ph);
- }
-}
-
-/* ------------------------------------------------- */
-/**
- * release_memory_range -- release memory previously memblock_reserved
- * @start_pfn: starting physical frame number
- * @nr_pages: number of pages to free.
- *
- * This routine will release memory that had been previously
- * memblock_reserved in early boot. The released memory becomes
- * available for genreal use.
- */
-static void release_memory_range(unsigned long start_pfn,
- unsigned long nr_pages)
-{
- struct page *rpage;
- unsigned long end_pfn;
- long i;
-
- end_pfn = start_pfn + nr_pages;
-
- for (i = start_pfn; i <= end_pfn; i++) {
- rpage = pfn_to_page(i);
- if (PageReserved(rpage)) {
- ClearPageReserved(rpage);
- init_page_count(rpage);
- __free_page(rpage);
- totalram_pages++;
- }
- }
-}
-
-/**
- * track_freed_range -- Counts the range being freed.
- * Once the counter goes to zero, it re-registers dump for
- * future use.
- */
-static void
-track_freed_range(unsigned long addr, unsigned long length)
-{
- static unsigned long scratch_area_size, reserved_area_size;
-
- if (addr < phyp_dump_info->init_reserve_start)
- return;
-
- if ((addr >= phyp_dump_info->init_reserve_start) &&
- (addr <= phyp_dump_info->init_reserve_start +
- phyp_dump_info->init_reserve_size))
- reserved_area_size += length;
-
- if ((addr >= phyp_dump_info->reserved_scratch_addr) &&
- (addr <= phyp_dump_info->reserved_scratch_addr +
- phyp_dump_info->reserved_scratch_size))
- scratch_area_size += length;
-
- if ((reserved_area_size == phyp_dump_info->init_reserve_size) &&
- (scratch_area_size == phyp_dump_info->reserved_scratch_size)) {
-
- invalidate_last_dump(&phdr,
- phyp_dump_info->reserved_scratch_addr);
- register_dump_area(&phdr,
- phyp_dump_info->reserved_scratch_addr);
- }
-}
-
-/* ------------------------------------------------- */
-/**
- * sysfs_release_region -- sysfs interface to release memory range.
- *
- * Usage:
- * "echo <start addr> <length> > /sys/kernel/release_region"
- *
- * Example:
- * "echo 0x40000000 0x10000000 > /sys/kernel/release_region"
- *
- * will release 256MB starting at 1GB.
- */
-static ssize_t store_release_region(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned long start_addr, length, end_addr;
- unsigned long start_pfn, nr_pages;
- ssize_t ret;
-
- ret = sscanf(buf, "%lx %lx", &start_addr, &length);
- if (ret != 2)
- return -EINVAL;
-
- track_freed_range(start_addr, length);
-
- /* Range-check - don't free any reserved memory that
- * wasn't reserved for phyp-dump */
- if (start_addr < phyp_dump_info->init_reserve_start)
- start_addr = phyp_dump_info->init_reserve_start;
-
- end_addr = phyp_dump_info->init_reserve_start +
- phyp_dump_info->init_reserve_size;
- if (start_addr+length > end_addr)
- length = end_addr - start_addr;
-
- /* Release the region of memory assed in by user */
- start_pfn = PFN_DOWN(start_addr);
- nr_pages = PFN_DOWN(length);
- release_memory_range(start_pfn, nr_pages);
-
- return count;
-}
-
-static ssize_t show_release_region(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- u64 second_addr_range;
-
- /* total reserved size - start of scratch area */
- second_addr_range = phyp_dump_info->init_reserve_size -
- phyp_dump_info->reserved_scratch_size;
- return sprintf(buf, "CPU:0x%llx-0x%llx: HPTE:0x%llx-0x%llx:"
- " DUMP:0x%llx-0x%llx, 0x%lx-0x%llx:\n",
- phdr.cpu_data.destination_address,
- phdr.cpu_data.length_copied,
- phdr.hpte_data.destination_address,
- phdr.hpte_data.length_copied,
- phdr.kernel_data.destination_address,
- phdr.kernel_data.length_copied,
- phyp_dump_info->init_reserve_start,
- second_addr_range);
-}
-
-static struct kobj_attribute rr = __ATTR(release_region, 0600,
- show_release_region,
- store_release_region);
-
-static int __init phyp_dump_setup(void)
-{
- struct device_node *rtas;
- const struct phyp_dump_header *dump_header = NULL;
- unsigned long dump_area_start;
- unsigned long dump_area_length;
- int header_len = 0;
- int rc;
-
- /* If no memory was reserved in early boot, there is nothing to do */
- if (phyp_dump_info->init_reserve_size == 0)
- return 0;
-
- /* Return if phyp dump not supported */
- if (!phyp_dump_info->phyp_dump_configured)
- return -ENOSYS;
-
- /* Is there dump data waiting for us? If there isn't,
- * then register a new dump area, and release all of
- * the rest of the reserved ram.
- *
- * The /rtas/ibm,kernel-dump rtas node is present only
- * if there is dump data waiting for us.
- */
- rtas = of_find_node_by_path("/rtas");
- if (rtas) {
- dump_header = of_get_property(rtas, "ibm,kernel-dump",
- &header_len);
- of_node_put(rtas);
- }
-
- ibm_configure_kernel_dump = rtas_token("ibm,configure-kernel-dump");
-
- print_dump_header(dump_header);
- dump_area_length = init_dump_header(&phdr);
- /* align down */
- dump_area_start = phyp_dump_info->init_reserve_start & PAGE_MASK;
-
- if (dump_header == NULL) {
- register_dump_area(&phdr, dump_area_start);
- return 0;
- }
-
- /* re-register the dump area, if old dump was invalid */
- if ((dump_header) && (dump_header->status & DUMP_ERROR_FLAG)) {
- invalidate_last_dump(&phdr, dump_area_start);
- register_dump_area(&phdr, dump_area_start);
- return 0;
- }
-
- if (dump_header) {
- phyp_dump_info->reserved_scratch_addr =
- dump_header->cpu_data.destination_address;
- phyp_dump_info->reserved_scratch_size =
- dump_header->cpu_data.source_length +
- dump_header->hpte_data.source_length +
- dump_header->kernel_data.source_length;
- }
-
- /* Should we create a dump_subsys, analogous to s390/ipl.c ? */
- rc = sysfs_create_file(kernel_kobj, &rr.attr);
- if (rc)
- printk(KERN_ERR "phyp-dump: unable to create sysfs file (%d)\n",
- rc);
-
- /* ToDo: re-register the dump area, for next time. */
- return 0;
-}
-machine_subsys_initcall(pseries, phyp_dump_setup);
-
-int __init early_init_dt_scan_phyp_dump(unsigned long node,
- const char *uname, int depth, void *data)
-{
- const unsigned int *sizes;
-
- phyp_dump_info->phyp_dump_configured = 0;
- phyp_dump_info->phyp_dump_is_active = 0;
-
- if (depth != 1 || strcmp(uname, "rtas") != 0)
- return 0;
-
- if (of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL))
- phyp_dump_info->phyp_dump_configured++;
-
- if (of_get_flat_dt_prop(node, "ibm,dump-kernel", NULL))
- phyp_dump_info->phyp_dump_is_active++;
-
- sizes = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
- NULL);
- if (!sizes)
- return 0;
-
- if (sizes[0] == 1)
- phyp_dump_info->cpu_state_size = *((unsigned long *)&sizes[1]);
-
- if (sizes[3] == 2)
- phyp_dump_info->hpte_region_size =
- *((unsigned long *)&sizes[4]);
- return 1;
-}
-
-/* Look for phyp_dump= cmdline option */
-static int __init early_phyp_dump_enabled(char *p)
-{
- phyp_dump_info->phyp_dump_at_boot = 1;
-
- if (!p)
- return 0;
-
- if (strncmp(p, "1", 1) == 0)
- phyp_dump_info->phyp_dump_at_boot = 1;
- else if (strncmp(p, "0", 1) == 0)
- phyp_dump_info->phyp_dump_at_boot = 0;
-
- return 0;
-}
-early_param("phyp_dump", early_phyp_dump_enabled);
-
-/* Look for phyp_dump_reserve_size= cmdline option */
-static int __init early_phyp_dump_reserve_size(char *p)
-{
- if (p)
- phyp_dump_info->reserve_bootvar = memparse(p, &p);
-
- return 0;
-}
-early_param("phyp_dump_reserve_size", early_phyp_dump_reserve_size);