startup.c revision 04a42e3e989d6db9fc4cbbd375eb438510055ce6
* XXX make declaration below "static" when drivers no longer use this extern caddr_t p0_va;
/* Virtual address for accessing physical page 0 */ * Declare these as initialized data so we can patch them. pgcnt_t physmem = 0;
/* memory size in pages, patch if you want less */ /* Global variables for MP support. Used in mp_startup */ * Some CPUs have holes in the middle of the 64-bit virtual address range. static int kpm_desired = 0;
/* Do we want to try to use segkpm? */ * VA range that must be preserved for boot until we release all of its * Configuration parameters set at boot time. * new memory fragmentations are possible in startup() due to BOP_ALLOCs. this * depends on number of BOP_ALLOC calls made and requested size, memory size * combination and whether boot.bin memory needs to be freed. long page_hashsz;
/* Size of page hash table (power of two) */ struct page *
pp_base;
/* Base of initial system page struct array */ struct seg ktextseg;
/* Segment used for kernel executable image */ struct seg kvalloc;
/* Segment used for "valloc" mapping */ struct seg kpseg;
/* Segment used for pageable kernel virt mem */ struct seg kmapseg;
/* Segment used for generic kernel mappings */ struct seg kdebugseg;
/* Segment used for the kernel debugger */ struct seg *
segkp = &
kpseg;
/* Pageable kernel virtual memory segment */ struct seg kpmseg;
/* Segment used for physical mapping */ * kphysm_init returns the number of pages that were processed * a couple useful roundup macros * 32-bit Kernel's Virtual memory layout. * +-----------------------+ * 0xFFC00000 -|-----------------------|- ARGSBASE * 0xFF800000 -|-----------------------|- SEGDEBUGBASE * 0xFEC00000 -|-----------------------| * 0xFE800000 -|-----------------------|- KERNEL_TEXT * 0xFE000000 -|-----------------------|- lufs_addr * --- -|-----------------------|- valloc_base + valloc_sz * | early pp structures | * --- -|-----------------------|- valloc_base (floating) * 0xFDFFE000 -|-----------------------|- ekernelheap, ptable_va * | | (segkp is an arena under the heap) * --- -|-----------------------|- kernelheap (floating) * 0xC3002000 -|-----------------------|- segkmap_start (floating) * 0xC3000000 -|-----------------------|- kernelbase / userlimit (floating) * |-----------------------| * 0x08048000 -|-----------------------| * 0x00000000 +-----------------------+ * 64-bit Kernel's Virtual memory layout. 
(assuming 64 bit app) * +-----------------------+ * 0xFFFFFFFF.FFC00000 |-----------------------|- ARGSBASE * 0xFFFFFFFF.FF800000 |-----------------------|- SEGDEBUGBASE * +-----------------------+ * 0xFFFFFFFF.FBC00000 |-----------------------| * 0xFFFFFFFF.FB800000 |-----------------------|- KERNEL_TEXT * 0xFFFFFFFF.FB000000 -|-----------------------|- lufs_addr * --- |-----------------------|- valloc_base + valloc_sz * | early pp structures | * --- |-----------------------|- valloc_base * --- |-----------------------|- ptable_va * | Core heap | (used for loadable modules) * 0xFFFFFFFF.C0000000 |-----------------------|- core_base / ekernelheap * 0xFFFFFXXX.XXX00000 |-----------------------|- kernelheap (floating) * 0xFFFFFXXX.XXX00000 |-----------------------|- segkmap_start (floating) * 0xFFFFFXXX.XXX00000 |-----------------------|- toxic_addr (floating) * --- |-----------------------|- segkp_base * 0xFFFFFE00.00000000 |-----------------------| * 0xFFFFFD80.00000000 |-----------------------|- KERNELBASE * | User stack |- User space memory * | shared objects, etc | (grows downwards) * 0xFFFF8000.00000000 |-----------------------| * 0x00008000.00000000 |-----------------------| * | user heap | (grows upwards) * |-----------------------| * 0x00000000.04000000 |-----------------------| * 0x00000000.00000000 +-----------------------+ * A 32 bit app on the 64 bit kernel sees the same layout as on the 32 bit * kernel, except that userlimit is raised to 0xfe000000 * structures. This region contains page_t structures for the lowest 4GB * of physical memory, memsegs, memlists, and the page hash. * core_base: start of the kernel's "core" heap area on 64-bit systems. * This area is intended to be used for global data as well as for module * text/data that does not fit into the nucleus pages. The core heap is * restricted to a 2GB range, allowing every address within it to be * accessed using rip-relative addressing * ekernelheap: end of kernelheap and start of segmap. 
* kernelheap: start of kernel heap. On 32-bit systems, this starts right * above a red zone that separates the user's address space from the * kernel's. On 64-bit systems, it sits above segkp and segkpm. * segkmap_start: start of segmap. The length of segmap can be modified * by changing segmapsize in /etc/system (preferred) or eeprom (deprecated). * The default length is 16MB on 32-bit systems and 64MB on 64-bit systems. * kernelbase: On a 32-bit kernel the default value of 0xd4000000 will be * decreased by 2X the size required for page_t. This allows the kernel * heap to grow in size with physical memory. With sizeof(page_t) == 80 * bytes, the following shows the values of kernelbase and kernel heap * sizes for different memory configurations (assuming default segmap and * mem size for kernelbase kernel heap * ---- --------- ---------- ----------- * 1gb 0x01400000 0xd1800000 684MB * 2gb 0x02800000 0xcf000000 704MB * 4gb 0x05000000 0xca000000 744MB * 6gb 0x07800000 0xc5000000 784MB * 8gb 0x0a000000 0xc0000000 824MB * 16gb 0x14000000 0xac000000 984MB * 32gb 0x28000000 0x84000000 1304MB * 64gb 0x50000000 0x34000000 1944MB (*) * kernelbase is less than the abi minimum of 0xc0000000 for memory * configurations above 8gb. * (*) support for memory configurations above 32gb will require manual tuning * of kernelbase to balance out the need of user applications. /* real-time-clock initialization parameters */ long gmt_lag;
/* offset in seconds of gmt to local time */ * List of bootstrap pages. We mark these as allocated in startup. * release_bootstrap() will free them when we're completely done with * Enable some debugging messages concerning memory usage... * XX64 There should only be one print routine once memlist usage between * vmx and the kernel is cleaned up and there is a single memlist structure * shared between kernel and boot. * XX64 need a comment here.. are these just default values, surely * we read the "cpuid" type information to figure this out. * on 64 bit we use a predefined VA range for mapping devices in the kernel * on 32 bit the mappings are intermixed in the heap, so we use a bit map * Simple boot time debug facilities "%s:%d: '%s' is 0x%llx\n" * This structure is used to keep track of the initial allocations * done in startup_memlist(). The value of NUM_ALLOCATIONS needs to * be >= the number of ADD_TO_ALLOCATIONS() executed in the code. panic(
"too many ADD_TO_ALLOCATIONS()"); \
panic(
"BOP_ALLOC() failed");
* Our world looks like this at startup time. * In a 32-bit OS, boot loads the kernel text at 0xfe800000 and kernel data * at 0xfec00000. On a 64-bit OS, kernel text and data are loaded at * 0xffffffff.fe800000 and 0xffffffff.fec00000 respectively. Those * addresses are fixed in the binary at link time. * Machine-dependent startup code * Make sure that nobody tries to use segkpm until we have * initialized it properly. * Complete the extraction of cpuid data * Check for prom_debug in boot environment PRM_POINT(
"prom_debug found in boot enviroment");
* Collect node, cpu and memory configuration information. * Halt if this is an unsupported processor. printf(
"\n486 processor (\"%s\") detected.\n",
halt(
"This processor is not supported by this release " * Callback for copy_memlist_filter() to filter nucleus, kadb/kmdb, (ie. * everything mapped above KERNEL_TEXT) pages from phys_avail. Note it * also filters out physical page zero. There is some reliance on the * boot loader allocating only a few contiguous physical memory chunks. * page zero is required for BIOS.. never make it available * First we trim from the front of the range. Since hat_boot_probe() * walks ranges in virtual order, but addr/size are physical, we need * to the list until no changes are seen. This deals with the case * where page "p" is mapped at v, page "p + PAGESIZE" is mapped at w * Trim pages from the end of the range. * These variables were all designed for sfmmu in which segkpm is * mapped using a single pagesize - either 8KB or 4MB. On x86, we * might use 2+ page sizes on a single machine, so none of these * variables have a single correct value. They are set up as if we * always use a 4KB pagesize, which should do no harm. In the long * run, we should get rid of KPM's assumption that only a single panic(
"cannot attach segkpm");
panic(
"segkpm_create segkpm");
* Map each of the memsegs into the kpm segment, coalescing adjacent * memsegs to allow mapping with the largest possible pages. * The purpose of startup memlist is to get the system to the * point where it can use kmem_alloc()'s that operate correctly * relying on BOP_ALLOC(). This includes allocating page_ts, * page hash table, vmem initialized, etc. * Boot's versions of physinstalled and physavail are insufficient for * the kernel's purposes. Specifically we don't know which pages that * are not in physavail can be reclaimed after boot is gone. * This code solves the problem by dividing the address space * into 3 regions as it takes over the MMU from the booter. * 1) Any (non-nucleus) pages that are mapped at addresses above KERNEL_TEXT * can not be used by the kernel. * 2) Any free page that happens to be mapped below kernelbase * is protected until the boot loader is released, but will then be reclaimed. * 3) Boot shouldn't use any address in the remaining area between kernelbase * In the case of multiple mappings to the same page, region 1 has precedence /* XX64 fix these - they should be in include files */ * Take the most current snapshot we can by calling mem-update. * For this to work properly, we first have to ask boot for its * find if the kernel is mapped on a large page panic(
"Couldn't find kernel text boot mapping");
* Use leftover large page nucleus text/data space for loadable modules. PRM_POINT(
"Kernel NOT loaded on Large Page!");
* For MP machines cr4_value must be set or the non-boot * CPUs will not be able to start. * Examine the boot loaders physical memory map to find out: * - total memory in system - physinstalled * - the max physical address - physmax * - the number of segments the intsalled memory comes in * Initialize hat's mmu parameters. * Check for enforce-prot-exec in boot environment. It's used to * The default is to enforce PROT_EXEC on processors that support NX. * Boot seems to round up the "len", but 8 seems to be big enough. * physmax is lowered if there is more memory than can be * physically addressed in 32 bit (PAE/non-PAE) modes. * We will need page_t's for every page in the system, except for * memory mapped at or above above the start of the kernel text segment. * pages above e_modtext are attributed to kernel debugger (obp_pages) * If physmem is patched to be non-zero, use it instead of * the computed value unless it is larger than the real * amount of memory on hand. * We now compute the sizes of all the initial allocations for * structures the kernel needs in order do kmem_alloc(). These * page coloring data structs * There's no real good way to know exactly how much room we'll need, * but this should be a good upper bound. * The page structure hash table size is a power of 2 * such that the average hash chain length is PAGE_HASHAVELEN. * Set aside room for the page structures themselves. Note: on * 64-bit systems we don't allocate page_t's for every page here. * We just allocate enough to map the lowest 4GB of physical * memory, minus those pages that are used for the "nucleus" kernel * text and data. The remaining pages are allocated once we can * boot_npages is used to allocate an area big enough for our * initial page_t's. kphym_init may use less than that. * determine l2 cache info and memory size for page coloring * valloc_base will be below kernel text * The extra pages are for the HAT and kmdb to map page tables. * We configure kernelbase based on: * 1. 
user specified kernelbase via eeprom command. Value cannot exceed * KERNELBASE_MAX. we large page align eprom_kernelbase * 2. Default to KERNELBASE and adjust to 2X less the size for page_t. * On large memory systems we must lower kernelbase to allow * enough room for page_t's for all of memory. * The value set here, might be changed a little later. * At this point, we can only use a portion of the kernelheap that * will be available after we boot. Both 32-bit and 64-bit systems * have this limitation, although the reasons are completely * On 64-bit systems, the booter only supports allocations in the * upper 4GB of memory, so we have to work with a reduced kernel * heap until we take over all allocations. The booter also sits * in the lower portion of that 4GB range, so we have to raise the * bottom of the heap even further. * On 32-bit systems we have to leave room to place segmap below * the heap. We don't yet know how large segmap will be, so we * have to be very conservative. * XX64: For now, we let boot have the lower 2GB of the top 4GB * address range. In the long run, that should be fixed. It's * insane for a booter to need 2 2GB address ranges. * If segmap is too large we can push the bottom of the kernel heap * higher than the base. Or worse, it could exceed the top of the * VA space entirely, causing it to wrap around. panic(
"too little memory available for kernelheap," " use a different kernelbase");
* Now that we know the real value of kernelbase, * update variables that were initialized with a value of * XXX The problem with this sort of hackery is that the * compiler just may feel like putting the const declarations * (in param.c) into the .text section. Perhaps they should * just be declared as variables there? * As one final sanity check, verify that the "red zone" between * kernel and userspace is exactly the size we expected. * do all the initial allocations * Initialize the kernel heap. Note 3rd argument must be > 1st. * Build phys_install and phys_avail in kernel memspace. * - phys_install should be all memory in the system. * - phys_avail is phys_install minus any memory mapped before this * point above KERNEL_TEXT. panic(
"physinstalled was too big!");
panic(
"physavail was too big!");
* free page list counters * Initialize the page structures from the memory lists. * Now that page_t's have been initialized, remove all the * initial allocation pages from the kernel free page lists. * Initialize kernel memory allocator. * print this out early so that we know what's going on * Initialize ten-micro second timer so that drivers will * not get short changed in their init phase. This was * not getting called until clkinit which, on fast cpu's * caused the drv_usecwait to be way too short. * Calculate default settings of system parameters based upon * maxusers, yet allow to be overridden via the /etc/system file. * Setup machine check architecture on P6 * Initialize system parameters. * maxmem is the amount of physical memory we're playing with. * Initialize the hat layer. * Initialize segment management stuff. halt(
"Can't load specfs");
halt(
"Can't load devfs");
* This is needed here to initialize hw_serial[] for cluster booting. if ((i =
modload(
"misc",
"sysinit")) != (
unsigned int)-
1)
/* Read cluster configuration data. */ * Create a kernel device tree. First, create rootnex and * then invoke bus specific code to probe devices. * Fake a prom tree such that /dev/openprom continues to work * Load all platform specific modules * Do final allocations of HAT data structures that need to * be allocated before quiescing the boot loader. * Setup MTRR (Memory type range registers) * Walk through the pagetables looking for pages mapped in by boot. If the * setaside flag is set the pages are expected to be returned to the * kernel later in boot, so we add them to the bootpages list. panic(
"0x%lx byte mapping at 0x%p exceeds boot's " "legal range.",
len, (
void *)
va);
panic(
"Unexpected mapping by boot. " * The next two loops are done in distinct steps in order * to be sure that any page that is doubly mapped (both above * KERNEL_TEXT and below kernelbase) is dealt with correctly. * Note this may never happen, but it might someday. * Protect any pages mapped above KERNEL_TEXT that somehow have * page_t's. This can only happen if something weird allocated * loader we must remove from our free page lists any boot pages that * will stay mapped until release_bootstrap(). * Copy in boot's page tables, set up extra page tables for the kernel, * and switch to the kernel's context. * It is no longer safe to call BOP_ALLOC(), so make sure we don't. * Before we call kvm_init(), we need to establish the final size * of the kernel's heap. So, we need to figure out how much space * to set aside for segkp, segkpm, and segmap. * Segkpm appears at the bottom of the kernel's address * range. To detect accidental overruns of the user * address space, we leave a "red zone" of unmapped memory * between kernelbase and the beginning of segkpm. * determine size of segkp and adjust the bottom of the "segkpsize has been reset to %ld pages",
* put the range of VA for device mappings next * Users can change segmapsize through eeprom or /etc/system. * If the variable is tuned through eeprom, there is no upper * bound on the size of segmap. If it is tuned through * /etc/system on 32-bit systems, it must be no larger than we * planned for in startup_memlist(). * 32-bit systems don't have segkpm or segkp, so segmap appears at * the bottom of the kernel's address range. Set aside space for a * red zone just below the start of segmap. * Tell kmdb that the VM system is now working * Mangle the brand string etc. * Now that we can use memory outside the top 4GB (on 64-bit * systems) and we know the size of segmap, we can set the final * size of the kernel's heap. Note: on 64-bit systems we still * can't touch anything in the bottom half of the top 4GB range * because boot still has pages mapped there. panic(
"Could not protect boot's memory");
* Now that the kernel heap may have grown significantly, we need * to make all the remaining page_t's available to back that memory. * XX64 this should probably wait till after release boot-strap too. * Create the device arena for toxic (to dtrace/kmdb) mappings. * allocate the bit map that tracks toxic pages * Now that we've got more VA, as well as the ability to allocate from * The following code installs a special page fault handler (#pf) * to work around a pentium bug. panic(
"failed to install pentium_pftrap");
* Map page pfn=0 for drivers, such as kd, that need to pick up * If the following is true, someone has patched physmem to be less * than the number of pages that the system actually has. Remove * pages until system memory is limited to the requested amount. * Since we have allocated page structures for all pages, we * correct the amount of memory we want to remove by the size of * the memory used to hold page structures for the non-used pages. panic(
"limited physmem too much!");
* disable automatic large pages for small memory systems or * when the disable flag is set. * Initialize the segkp segment type. panic(
"startup: cannot attach segkp");
* For 32 bit x86 systems, we will have segkp under the heap. * There will not be a segkp segment. We do, however, need * to fill in the seg structure. panic(
"startup: segkp_create failed");
* Now create segmap segment. panic(
"cannot attach segkmap");
* The 64 bit HAT permanently maps only segmap's page tables. * The 32 bit HAT maps the heap's page tables too. panic(
"segmap_create segkmap");
* Perform tasks that get done after most of the VM * initialization has been done but before the clock * and other devices get started. * Perform CPC initialization for this CPU. * XX64 -- include SSE, SSE2, etc. here too? * Set the isa_list string to the defined instruction sets we * We're done with bootops. We don't unmap the bootstrap yet because * we're still using bootsvcs. "softlevel1",
NULL,
NULL);
/* XXX to be moved later */ * Set the system wide, processor-specific flags to be passed * to userland via the aux vector for performance hints and * instruction set extensions. * Load the System Management BIOS into the global ksmbios handle, * if an SMBIOS is present on this system. * Startup memory scrubber. * Perform forceloading tasks for /etc/system. * ON4.0: Force /proc module in until clock interrupt handle fixed * ON4.0: This must be fixed or restated in /etc/systems. * Check for required functional Floating Point hardware, * unless FP hardware explicitly disabled. halt(
"No working FP hardware found");
* Perform the formal initialization of the boot chip, * and associate the boot cpu with it. * This must be done after the cpu node for CPU has been * added to the device tree, when the necessary probing to * know the chip type and chip "id" is performed. /* unmount boot ramdisk and release kmem usage */ * We're finished using the boot loader so free its pages. * If root isn't on ramdisk, destroy the hardcoded * ramdisk node now and release the memory. Else, * ramdisk memory is kept in rd_pages. * Find 1 page below 1 MB so that other processors can boot up. * Make sure it has a kernel VA as well as a 1:1 mapping. * We should have just free'd one up. panic(
"No page available for starting " PRM_POINT(
"Returning boot's VA space to kernel heap");
* Initialize the platform-specific parts of a page_t. * kphysm_init() initializes physical memory. * In a 32 bit kernel can't use higher memory if we're * not booting in PAE mode. This check takes care of that. * align addr and size - they may not be at page boundaries /* only process pages below or equal to physmax */ * If the caller didn't provide space for the page * structures, carve them out of the memseg they will * Compute how many of the pages we need to use for " pgs=0x%lx pfn 0x%lx-0x%lx\n",
* drop pages below ddiphysmin to simplify ddi memory * allocation with non-zero addr_lo requests. /* drop entire range below ddiphysmin */ /* adjust range to ddiphysmin */ * Build the memsegs entry * insert in memseg list in decreasing pfn range order. * Low memory is typically more fragmented such that this * ordering keeps the larger ranges at the front of the list * for code that searches memseg. /* check for continuity with start of memsegpp */ * contiguous pfn and page_t's. Merge * cur_memseg into *memsegpp. Drop * check if contiguous with the end of * contiguous pfn but not page_t's. * to prevent creation of large pages * with noncontiguous page_t's if not * aligned to largest page boundary. /* check for continuity with end of memsegpp */ * contiguous pfn and page_t's. Merge * cur_memseg into *memsegpp. Drop /* merge previously done */ * contiguous pfn but not page_t's. * to prevent creation of large pages * with noncontiguous page_t's if not * aligned to largest page boundary. * add_physmem() initializes the PSM part of the page * struct by calling the PSM back with add_physmem_cb(). * In addition it coalesces pages into larger pages as * If the caller provided the page frames to us, then * advance in that list. Otherwise, prepare to allocate * our own page frames for the next memseg. * Kernel VM initialization. * Put the kernel segments in kernel address space. * We're about to map out /boot. This is the beginning of the * system resource management transition. We can no longer * call into /boot for I/O or memory allocations. * XX64 - Is this still correct with kernelheap_extend() being called * Ensure that the red zone at kernelbase is never accessible. * Make the text writable so that it can be hot patched by DTrace. * Make data writable until end. * These are MTTR registers supported by P6 * Disable reprogramming of MTRRs by default. * and Pentium 4, and yes, they are named 0, 1, 2, 4, 3 in ascending * address order (starting from 0x400). 
The Pentium 4 only implements * 4 sets, and while they are named 0-3 in the doc, the corresponding * names for P6 are 0,1,2,4. So define these arrays in address order * so that they work for both pre-Pentium4 and Pentium 4 processors. * If status register not valid skip this bank * If mci_addr contains the address where * error occurred, display the address "addr = 0x%" PRIx64 ", model errcode = 0x%x", i,
"MCE: Bank %d: error code 0x%x, mserrcode = 0x%x",
* Sync current cpu mtrr with the incore copy of mtrr. * This function has to be invoked with interrupts disabled * Currently we do not capture other cpu's. This is invoked on cpu0 * On other cpu's its invoked from mp_startup(). * resync mtrr so that BIOS is happy. Called from mdboot * We could have changed the default mtrr definition. * Put it back to uncached which is what it is at power on * start = start of new memory segment * len = length of new memory segment in bytes * new = pointer to a new struct memlist * memlistp = memory list to which to add segment. panic(
"unexpected call to kobj_texthole_alloc()");
panic(
"unexpected call to kobj_texthole_free()");
* This is called just after configure() in startup(). * The ISALIST concept is a bit hopeless on Intel, because * there's no guarantee of an ever-more-capable processor * given that various parts of the instruction set may appear * and disappear between different implementations. * While it would be possible to correct it and even enhance * it somewhat, the explicit hardware capability bitmask allows * So, we just leave this alone. "+mmx pentium_pro " :
" ");
* The Cyrix 6x86 does not have any Pentium features * accessible while not at privilege level 0. len =
strlen(
tp) +
1;
/* account for NULL at end of string */ * returns 1st address in range that is in device arena, or NULL * if len is not NULL it returns the length of the toxic range * if called very early by kmdb, just return NULL * First check if we're completely outside the bitmap range. * Trim ends of search to look at only what the bitmap covers.