intel_iommu.c revision a77271f8607dbace3fbc9a3cda0fd24d6e7ccd68
/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each file.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */
/*
 * Portions Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2008, Intel Corporation.
 */

/*
 * Intel IOMMU implementation
 */

/* iommu_state - the list of iommu structures */
/* reserve_memory - the list of reserved regions */
/* page_num - the count of pages for iommu page tables */
/* record some frequently used dips */
/* dvma cache related variables */
/* ioapic info for interrupt remapping */
/*
 * switch to turn on/off the gfx dma remapping unit;
 * this is used when there is a dedicated drhd for the
 * gfx device
 */
/*
 * switch to disable the dmar remapping unit, even after the
 * initiation work has finished
 */
/* dmar fault reason strings */
"The present field in root-entry is Clear",
"The present field in context-entry is Clear",
"Hardware detected invalid programming of a context-entry",
"The DMA request attempted to access an address beyond max support",
"The Write field in a page-table entry is Clear when DMA write",
"The Read field in a page-table entry is Clear when DMA read",
"Access the next level page table resulted in error",
"Access the root-entry table resulted in error",
"Access the context-entry table resulted in error",
"Reserved field not initialized to zero in a present root-entry",
"Reserved field not initialized to zero in a present context-entry",
"Reserved field not initialized to zero in a present page-table entry",
"DMA blocked due to the Translation Type field in context-entry",
"Incorrect fault event reason number" * QS field of Invalidation Queue Address Register * the size of invalidation queue is 1 << (qinv_iqa_qs + 8) * the invalidate desctiptor type of queued invalidation interface "Context Cache Invalidate Descriptor",
"IOTLB Invalidate Descriptor",
"Device-IOTLB Invalidate Descriptor",
"Interrupt Entry Cache Invalidate Descriptor",
"Invalidation Wait Descriptor",
"Incorrect queue invalidation type" * S field of the Interrupt Remapping Table Address Register * the size of the interrupt remapping table is 1 << (intrr_irta_s + 1) * whether disable interrupt remapping in LOCAL_APIC mode * whether verify the source id of interrupt request /* the fault reason for interrupt remapping */ "reserved field set in IRTE",
"interrupt_index exceed the intr-remap table size",
"present field in IRTE is clear",
"hardware access intr-remap table address resulted in error",
"reserved field set in IRTE, inlcude various conditional",
"hardware blocked an interrupt request in Compatibility format",
"remappable interrupt request blocked due to verification failure" * the queued invalidation interface functions /* interrupt remapping related functions */ * flush the cpu cache line * do some init work for the iommu page allocator * get a 4k iommu page, and zero out it * free the iommu page allocated with iommu_get_page * calculate agaw from gaw * iommu_update_stats - update iommu private kstat counters * This routine will dump and reset the iommu's internal * statistics counters. The current stats dump values will * be sent to the kernel status area. * iommu_init_stats - initialize kstat data structures * This routine will create and initialize the iommu private "Could not create kernel statistics for %s",
* Initialize all the statistics * Function to provide kernel stat update on demand * Pointer into provider's raw statistics * Add kstat to systems kstat chain * the fault event handler for a single drhd /* read the fault status */ /* check if we have a pending fault for this IOMMU */ * handle all primary pending faults /* read the higher 64bits */ /* check if pending fault */ /* get the fault reason, fault type and sid */ /* read the first 64bits */ /* report the fault info */ /* dmar-remapping fault */ "%s generated a fault event when translating " "\t on address 0x%" PRIx64 " for PCI(%d, %d, %d), " (
sid >>
8) &
0xff, (
sid >>
3) &
0x1f,
sid &
0x7,
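/*
 * A minimal sketch of the source-id decoding used by the fault report
 * above: the 16-bit sid packs the PCI bus/device/function of the
 * faulting requester. The helper name is hypothetical, not part of
 * this file.
 */
static void
sid_to_pci_bdf(uint16_t sid, int *bus, int *dev, int *func)
{
	*bus = (sid >> 8) & 0xff;	/* bits 15:8 - bus number */
	*dev = (sid >> 3) & 0x1f;	/* bits 7:3  - device number */
	*func = sid & 0x7;		/* bits 2:0  - function number */
}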
/* intr-remapping fault */
"%s generated a fault event when translating "
"\t on index 0x%" PRIx64 " for PCI(%d, %d, %d), "
(sid >> 8) & 0xff, (sid >> 3) & 0x1f, sid & 0x7,
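/*
 * A hedged sketch of the agaw-from-gaw calculation referenced earlier,
 * assuming the VT-d convention of a 12-bit page offset plus 9 address
 * bits per page-table level, so a valid agaw is 12 + 9 * nlevels
 * (30, 39, 48, 57, ... bits, capped at 64). The function name is
 * illustrative only.
 */
static int
calculate_agaw(int gaw)
{
	int r, agaw;

	r = (gaw - 12) % 9;
	agaw = (r == 0) ? gaw : gaw + 9 - r;	/* round up to 12 + 9*n */
	if (agaw > 64)
		agaw = 64;
	return (agaw);
}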
/*
 * handle queued invalidation interface errors
 */
"%s generated a fault when fetching a descriptor from the\n"
"\tinvalidation queue, or detects that the fetched\n"
"\tdescriptor is invalid. The head register is "
/*
 * Hardware received an unexpected or invalid Device-IOTLB
 * invalidation completion
 */
"Hardware received an unexpected or invalid "
"Device-IOTLB invalidation completion.\n");
/*
 * Hardware detected a Device-IOTLB invalidation
 * completion time-out
 */
"Hardware detected a Device-IOTLB invalidation "
"completion time-out.\n");
/*
 * intel_iommu_intr_handler()
 *   call iommu_intr_handler for each iommu
 */
/*
 * the interface to hook the dmar interrupt handler
 */
/* wait at most 60s for the hardware completion */
"iommu wait completion time out\n"); \
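/*
 * A hedged sketch of the completion-wait macro suggested by the
 * fragment above: spin on a status register until the hardware sets
 * the completion bit, giving up after roughly 60 seconds. The macro
 * name and the iommu_get_reg32() helper are illustrative assumptions,
 * not this file's real interfaces; drv_usecwait() and cmn_err() are
 * the standard DDI routines.
 */
#define	WAIT_COMPLETION_USECS	(60 * 1000000)	/* 60s max */
#define	wait_completion(iommu, offset, mask, done)			\
{									\
	uint64_t spin = 0;						\
	while ((((done) = iommu_get_reg32((iommu), (offset)))		\
	    & (mask)) == 0) {						\
		if (++spin > WAIT_COMPLETION_USECS) {			\
			cmn_err(CE_WARN,				\
			    "iommu wait completion time out\n");	\
			break;						\
		}							\
		drv_usecwait(1);	/* busy-wait 1us per try */	\
	}								\
}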
/*
 * dmar_flush_write_buffer()
 */
/* record the statistics */
/*
 * dmar_flush_iotlb_common()
 *   if the hardware doesn't support page selective invalidation, we
 *   will use domain type. Otherwise, use global type
 */
/* verify there is no pending command */
/*
 * check the result and record the statistics
 */
/*
 * register based iotlb psi invalidation
 */
/* choose page specified invalidation */
/* MAMV is valid only if PSI is set */
/* First calculate alignment of DVMA */
/* truncate count to the nearest power of 2 */
/* choose domain invalidation */
/*
 * dmar_flush_context_cache()
 *   flush the context cache
 */
/* verify there is no pending command */
/* record the context cache statistics */
/*
 * dmar_flush_context_fsi()
 *   function based context cache flush
 */
/*
 * dmar_flush_context_dsi()
 *   domain based context cache flush
 */
/*
 * dmar_flush_context_gbl()
 *   flush the global context cache
 */
/*
 * dmar_set_root_entry_table()
 */
/*
 * the processes to bring up a dmar unit:
 *   flush the iommu write buffer,
 *   flush the context cache,
 *   and at last enable the unit
 */
/* enable queued invalidation */
/* get a dvma from the cache */
/* put a dvma to the cache after use */
/* no cache, alloc one */
/* initialize this node */
/* insert into the free list */
/* shrink the cache list */
/*
 * iommu_dvma_cache_flush()
 *   flush the dvma caches when vmem_xalloc() failed
 */
/*
 * get_dvma_cookie_array()
 *   get a dvma cookie array from the cache or allocate one
 */
/* LINTED E_EQUALITY_NOT_ASSIGNMENT */
/*
 * put_dvma_cookie_array()
 *   put a dvma cookie array back to the cache or free it
 */
/*
 * the plant wait operation for register based cache invalidation
 */
/* no cache, alloc one */
/* initialize this node */
/* insert into the pend list */
/*
 * dmar_release_dvma_cookie()
 *   release the dvma cookies
 */
for (i = 0; i < count; i++) {
/* free the cookie array */
/*
 * the reap wait operation for register based cache invalidation
 */
/* put the node into the node cache */
/* initialize the dmar operations */
/* cache related functions */
/*
 * alloc and setup the iommu state
 */
/* map the register address space */
/*
 * if the hardware access is non-coherent, we need clflush
 */
"missing clflush functionality");
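/*
 * A minimal sketch of the cache-line flush mentioned above: when the
 * IOMMU's table walks are not snooped (non-coherent), every CPU write
 * to a root-, context- or page-table entry must be pushed out of the
 * cache before the hardware can see it. Assumes x86 inline assembly
 * and a 64-byte cache line; names are illustrative.
 */
#define	CPU_CLFLUSH_SIZE	64

static void
iommu_cpu_clflush(void *addr, size_t size)
{
	uintptr_t p = (uintptr_t)addr & ~(uintptr_t)(CPU_CLFLUSH_SIZE - 1);
	uintptr_t end = (uintptr_t)addr + size;

	/* flush every cache line covering [addr, addr + size) */
	for (; p < end; p += CPU_CLFLUSH_SIZE)
		__asm__ volatile("clflush (%0)" : : "r" (p) : "memory");
	__asm__ volatile("mfence" : : : "memory");
}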
/* retrieve the maximum number of domains */
/*
 * setup the domain id allocator;
 * domain id 0 is reserved by the architecture
 */
/* the iommu is originally disabled */
/* alloc the root entry table; this should be put after the init ops */
/*
 * init the queued invalidation interface
 */
"%s init queued invalidation interface failed\n",
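/*
 * A hedged sketch of a domain-id allocator consistent with the
 * comments above: ids come from a map sized by the hardware's domain
 * count, with id 0 pre-reserved as the architecture requires. The
 * names are illustrative; the driver may well use a different
 * allocator. kmem_zalloc() is the standard kernel interface.
 */
static char *domid_map;		/* one byte per id; 0 = free */
static int domid_max;

static void
domid_init(int max_domains)
{
	domid_max = max_domains;
	domid_map = kmem_zalloc(domid_max, KM_SLEEP);
	domid_map[0] = 1;	/* domain id 0 is reserved */
}

static int
domid_alloc(void)
{
	int i;

	for (i = 1; i < domid_max; i++) {
		if (domid_map[i] == 0) {
			domid_map[i] = 1;
			return (i);
		}
	}
	return (-1);		/* exhausted */
}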
/* init the intr remapping table state pointer */
/* initialize the iotlb pending list and cache */
/* insert this iommu into the list */
/*
 * memory_region_overlap()
 *   handle the pci mmio pages overlap condition
 */
/*
 * reserve a single dev's mmio resources
 */
/* ignore the devices which have no assigned-addresses */
/* check the memory io assigned-addresses */
/*
 * refer to pci.h for the bit definitions of the
 * assigned-addresses
 */
/*
 * walk through the pci device tree, and collect the mmio resources
 */
/*
 * walk through the device tree under pdip;
 * normally, pdip should be the pci root nexus
 */
/*
 * iommu_collect_reserve_memory()
 *   collect the reserved memory regions
 */
/* reserve pages for pci memory mapped io */
/* reserve pages for ioapic */
/* walk function for get_dip_from_info() */
/* get the dev_info structure by passing a bus/dev/func */
/* get the top level bridge for a pci device */
/*
 * domain_vmem_init_reserve()
 *   dish out the reserved pages
 */
/*
 * initiate the domain vmem:
 *   create the whole available virtual address space and
 *   dish out the reserved memory regions with xalloc
 */
/* dish out the reserved pages */
/* record the domain statistics */
/* create the first level page table */
/* init the CPU available page tables */
/* init the memory cache list */
/* check to see if the device is under the scope of a p2p bridge */
/*
 * get the iommu structure for a device:
 *   walk the drhd list for a match,
 *   try to match the device scope,
 *   maybe under the scope of a p2p bridge
 */
/*
 * domain_set_root_context
 *   set the root context for a single device
 */
" %d, %d, %d has been set", bus,
/* cache mode set, flush the context cache */
/* cache mode not set, flush the write buffer */
/*
 * setup the root context entry
 */
/*
 * the walk function to set up the possible context entries
 */
/*
 * setup_possible_contexts()
 *   set up all the possible context entries for a device under a ppb
 */
/* for a pcie-pci bridge */
/* for functions under a pcie-pci bridge */
/*
 * allocate a domain for the device; the result is returned in the
 * domain parameter
 */
/* check if the domain has already been allocated */
/*
 * we have to assign a domain for this device;
 * hold the parent for modifying its children
 */
/* check to see if it is under a pci bridge */
/* OK, we have to allocate a new domain */
/* add the device to the domain's device list */
/*
 * get an iommu domain for dip; the result is returned in domain
 */
/* for isa devices attached under the lpc */
/* for gart, use the real graphic devinfo */
/*
 * if the iommu private is NULL, we share
 * the domain with the parent
 */
/* check if the domain has already been allocated */
/* allocate a domain for this device */
/*
 * helper functions to manipulate iommu ptes
 */
/* get the page table offset by specifying a dvma and a level */
/*
 * iommu_setup_level_table()
 *   setup the page table for a level;
 *   if the pte is non-present, alloc a new page
 */
/*
 * iommu_setup_page_table()
 *   setup the page tables for a dvma, walking down the levels
 */
for (i = level; i > 1; i--) {
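/*
 * A hedged sketch of the per-level offset computation behind a walk
 * like the loop above, assuming 4K pages (12 offset bits) and
 * 512-entry tables (9 bits per level): level 1 covers dvma bits 20:12,
 * level 2 bits 29:21, and so on. The helper name is illustrative.
 */
#define	IOMMU_PAGE_SHIFT	12
#define	IOMMU_LEVEL_STRIDE	9
#define	IOMMU_LEVEL_MASK	((1 << IOMMU_LEVEL_STRIDE) - 1)

static int
iommu_level_offset(uint64_t dvma, int level)
{
	return ((dvma >> (IOMMU_PAGE_SHIFT +
	    (level - 1) * IOMMU_LEVEL_STRIDE)) & IOMMU_LEVEL_MASK);
}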
/*
 * map a range of pages for iommu translation
 *
 * domain: the device domain
 * dvma: the start dvma for mapping
 * start: the start physical address
 * end: the end physical address
 */
/* flush cpu and iotlb cache */
/* cache mode set, flush iotlb */
/* cache mode not set, flush write buffer */
/*
 * build_single_rmrr_identity_map()
 *   build the identity map for a single rmrr unit
 *   (a hedged sketch of the idea follows this comment run)
 */
/*
 * build_rmrr_identity_map()
 *   build identity mappings for devices under rmrr scopes
 */
/*
 * return TRUE, if the drhd is only for gfx
 */
/* get the device number attached to this drhd */
/*
 * build_dev_identity_map()
 *   build the identity map for a device
 */
"this device may not be functional",
/*
 * record the identity map for the domain; any device
 * which uses this domain will not need any further mapping
 */
/*
 * build_isa_gfx_identity_walk()
 *   the walk function for build_isa_gfx_identity_map()
 */
/* ignore the NULL private device */
/* workaround for pci8086,10bc pci8086,11bc */
" Port LP Server Adapter applied\n");
/*
 * build_isa_gfx_identity_map()
 *   build the identity map for isa and gfx devices
 */
/*
 * walk through the device tree from pdip;
 * normally, pdip should be the pci root
 */
/*
 * dmar_check_boot_option()
 *   check the intel iommu boot option
 */
/*
 * intel_iommu_attach_dmar_nodes()
 *   attach intel iommu nodes:
 *   retrieve the dmar boot options,
 *   initiate each iommu unit,
 *   register interrupt remap ops,
 *   collect the reserved memory pages,
 *   build the identity map for devices in the rmrr scope,
 *   build the identity map for isa and gfx devices,
 *   initialize the dvma cookie cache,
 *   and register the intr add function
 */
/* free the reserve memory list */
/* free the iommu state structure */
/*
 * get the level n page table; NULL is returned if
 * the table is not present
 */
/* walk to the level n page table */
for (i = level; i > n; i--) {
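/*
 * A minimal sketch of what "identity map" means in the rmrr comments
 * above: every physical page in [start, end) is mapped at a dvma equal
 * to its own physical address, so DMA that was programmed before
 * translation was enabled keeps working afterwards. map_page() stands
 * in for the driver's real page-mapping routine and is an assumption.
 */
static void
build_identity_map(void *domain, uint64_t start, uint64_t end)
{
	uint64_t pa;

	/* walk the range page by page, mapping dvma == paddr */
	for (pa = start & ~0xfffULL; pa < end; pa += 0x1000)
		map_page(domain, /* dvma */ pa, /* paddr */ pa);
}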
/*
 * iommu_alloc_cookie_array()
 *   allocate the cookie array which is needed by map sgl
 */
/* figure out the rough estimate of the array size */
/* the preallocated buffer fits this size */
/* we need to allocate a new array */
/* convert the sleep flags */
/* allocate the dvma cookie array */
/*
 * alloc a dvma range for the caller
 */
/* handle the rollover cases */
/* get from the cache first */
/* allocate from the vmem arena */
/* if xalloc failed, we have to flush the cache and retry */
/*
 * save the dvma range in the device dvma cookie
 */
/*
 * map dvma to the physical addresses; the actual
 * number of mapped dvma pages is returned
 */
/* map each physical address */
/*
 * called from rootnex_dma_bindhdl(), to build dma
 * cookies when iommu is enabled
 */
/* get the domain for the dma request */
/* direct return if the drhd is disabled */
/*
 * allocate the cookie arrays; if the pre-allocated
 * space is not enough, we should reallocate it
 */
/* retrieve paddr, psize, offset from dmareq */
/* save the iommu page offset */
/*
 * allocate the dvma and map [paddr, paddr+psize)
 */
/*
 * setup the first cookie with the dvma of the page
 * and its size; we don't take the offset into the
 * first page into account now
 */
/* get the size for this page (i.e. partial or full page) */
/* get the paddr from the page_t */
/* index into the array of page_t's to get the paddr */
/* call into the VM to get the paddr */
/*
 * check to see if this page would put us
 * over the max cookie size
 */
/* use the next cookie */
/* allocate the dvma and map [paddr, paddr+psize) */
/* save the cookie information */
/*
 * we can add this page to the current cookie
 */
/* take the offset into the first page into account */
/* save away how many cookies we have */
/*
 * clear a single leaf pte
 */
/* retrieve the leaf page table */
/* map the leaf page and walk to the pte */
/* flush cpu and iotlb cache */
/* unmap the leaf page */
/*
 * intel_iommu_unmap_sgl()
 *   called from rootnex_dma_unbindhdl(), to unbind dma
 *   cookies when iommu is enabled
 */
/* get the device domain, no return check needed here */
/* if the drhd is disabled, nothing will be done */
/* the drhd is enabled */
/*
 * initialize the invalidation request queue structure;
 * call ddi_dma_mem_alloc to allocate physically contiguous
 * pages for the invalidation queue table
 */
/* set devi_ops in the dev info structure for ddi_dma_mem_alloc */
/*
 * set devi_bus_dma_allochdl in the dev info structure for
 * ddi_dma_mem_alloc
 */
"alloc invalidation queue table handler failed\n");
"alloc invalidation queue sync mem handler failed\n");
/* alloc physically contiguous pages for the invalidation queue */
"alloc invalidation queue table failed\n");
/* get the base physical address of the invalidation request queue */
/* alloc status memory for the invalidation wait descriptor */
"alloc invalidation queue sync mem failed\n");
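/*
 * A hedged sketch of allocating physically contiguous, DMA-visible
 * memory for the invalidation queue with the DDI interfaces named
 * above. Forcing dma_attr_sgllen to 1 is what makes the allocation
 * contiguous; the attribute values and error handling are
 * illustrative, not this file's actual settings.
 */
static caddr_t
qinv_table_alloc(dev_info_t *dip, size_t size, ddi_dma_handle_t *dmahp,
    ddi_acc_handle_t *acchp)
{
	ddi_dma_attr_t attr = {
		DMA_ATTR_V0,
		0,			/* lowest usable address */
		0xffffffffffffffffULL,	/* highest usable address */
		0xffffffffULL,		/* counter max */
		MMU_PAGESIZE,		/* alignment */
		1, 1,			/* burst sizes, minxfer */
		0xffffffffULL,		/* maxxfer */
		0xffffffffULL,		/* segment boundary */
		1,			/* sgllen: 1 => contiguous */
		1, 0			/* granularity, flags */
	};
	ddi_device_acc_attr_t acc = {
		DDI_DEVICE_ATTR_V0, DDI_NEVERSWAP_ACC, DDI_STRICTORDER_ACC
	};
	caddr_t kaddr;
	size_t real_len;

	if (ddi_dma_alloc_handle(dip, &attr, DDI_DMA_SLEEP, NULL,
	    dmahp) != DDI_SUCCESS)
		return (NULL);
	if (ddi_dma_mem_alloc(*dmahp, size, &acc, DDI_DMA_CONSISTENT,
	    DDI_DMA_SLEEP, NULL, &kaddr, &real_len, acchp) != DDI_SUCCESS) {
		ddi_dma_free_handle(dmahp);
		return (NULL);
	}
	return (kaddr);
}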
/* init the iotlb pend node for submitting iotlb invalidation requests */
/* set the invalidation queue structure */
/* destroy the invalidation queue structure */
/* enable the queued invalidation interface */
/* Initialize the Invalidation Queue Tail register to zero */
/* set the invalidation queue base address register */
/* enable the queued invalidation interface */
/* set the new queued invalidation interface */
/* submit the invalidation request descriptor to the invalidation queue */
/*
 * inv queue table exhausted, wait for the hardware to fetch
 */
/* queued invalidation interface -- invalidate context cache */
/* record the context cache statistics */
/* queued invalidation interface -- invalidate iotlb */
/*
 * check the result and record the statistics
 */
/* queued invalidation interface -- invalidate dev_iotlb */
/* queued invalidation interface -- invalidate interrupt entry cache */
/* queued invalidation interface -- global invalidate interrupt entry cache */
/* queued invalidation interface -- invalidate single interrupt entry cache */
/* queued invalidation interface -- invalidate interrupt entry caches */
/* requested interrupt count is not a power of 2 */
for (i = 0; i < cnt; i++) {
for (i = 0; i < cnt; i++) {
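/*
 * A hedged sketch of descriptor submission on the queued invalidation
 * interface described above: copy the 128-bit descriptor into the next
 * free slot, then let the hardware know by advancing the tail. The
 * structure layout and the helper are assumptions drawn from the VT-d
 * programming model, not this file's actual definitions.
 */
typedef struct qinv_dsc {
	uint64_t lo;	/* descriptor type in bits 3:0, plus type fields */
	uint64_t hi;
} qinv_dsc_t;

static void
qinv_submit(volatile qinv_dsc_t *queue, uint_t *tail, uint_t qsize,
    qinv_dsc_t *dsc)
{
	queue[*tail].lo = dsc->lo;
	queue[*tail].hi = dsc->hi;
	*tail = (*tail + 1) & (qsize - 1);	/* qsize is a power of 2 */
	/* the new *tail (scaled) then goes to the Queue Tail register */
}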
/* alloc a free entry from the sync status table */
/* should never happen */
/*
 * queued invalidation interface -- invalidation wait descriptor;
 * fence flag not set, need status data to indicate the invalidation
 * wait descriptor completion
 */
/* plant an iotlb pending node */
/*
 * sdata = QINV_SYNC_DATA_UNFENCE, fence = 0, sw = 1, if = 0;
 * indicate the invalidation wait descriptor completion by
 * performing a coherent DWORD write to the status address,
 * not by generating an invalidation completion event
 */
/*
 * queued invalidation interface -- invalidation wait descriptor;
 * fence flag set, indicating descriptors following the invalidation
 * wait descriptor must be processed by hardware only after the
 * invalidation wait descriptor completes.
 */
/* sw = 0, fence = 1, iflag = 0 */
/*
 * queued invalidation interface -- invalidation wait descriptor;
 * wait until the invalidation request has finished
 */
/*
 * sdata = QINV_SYNC_DATA_FENCE, fence = 1, sw = 1, if = 0;
 * indicate the invalidation wait descriptor completion by
 * performing a coherent DWORD write to the status address,
 * not by generating an invalidation completion event
 */
/* get the already completed invalidation wait requests */
/*
 * queued invalidation interface
 *   function based context cache invalidation
 */
/*
 * queued invalidation interface
 *   domain based context cache invalidation
 */
/*
 * queued invalidation interface
 *   global context cache invalidation
 */
/*
 * queued invalidation interface
 *   page based iotlb invalidation
 */
/* choose page specified invalidation */
/* choose domain invalidation */
/*
 * queued invalidation interface
 *   domain based iotlb invalidation
 */
/*
 * queued invalidation interface
 *   global iotlb invalidation
 */
/*
 * the plant wait operation for the queued invalidation interface
 */
/* no cache, alloc one */
/* plant an invalidation wait descriptor without waiting for its completion */
/*
 * the reap wait operation for the queued invalidation interface
 */
/* init the interrupt remapping table */
/* destroy the interrupt remapping table */
/* enable the interrupt remapping hardware unit */
/* set the interrupt remap table pointer */
/* global flush of the intr entry cache */
/* enable interrupt remapping */
/* set compatible mode */
/*
 * helper function to find a free interrupt remapping table entry
 */
for (i = 0; i < post; i++) {
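/*
 * A hedged sketch of the invalidation wait descriptor variants listed
 * above, following the published VT-d layout: type 5 in bits 3:0, the
 * IF/SW/FN flags in bits 4:6, the 32-bit status data in bits 63:32 of
 * the low word, and the status address in the high word. With sw = 1
 * the hardware signals completion by a coherent DWORD write of the
 * status data; with fence = 1 later descriptors wait for this one.
 * The field names are illustrative, not this file's own.
 */
#define	INV_WAIT_DSC_TYPE	5ULL
#define	INV_WAIT_IF		(1ULL << 4)	/* interrupt on completion */
#define	INV_WAIT_SW		(1ULL << 5)	/* status write */
#define	INV_WAIT_FN		(1ULL << 6)	/* fence */

static void
qinv_wait_dsc(qinv_dsc_t *dsc, uint32_t sdata, uint64_t saddr,
    int sw, int fence)
{
	dsc->lo = INV_WAIT_DSC_TYPE |
	    (sw ? INV_WAIT_SW : 0) |
	    (fence ? INV_WAIT_FN : 0) |
	    ((uint64_t)sdata << 32);
	dsc->hi = saddr;	/* status address, must be DWORD aligned */
}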
/*
 * helper function to find 'count' contiguous free
 * interrupt remapping table entries
 */
for (j = 0; j < count; j++) {
for (j = 0; j < count; j++) {
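/*
 * A minimal sketch of the contiguous-slot search suggested by the
 * nested loops above: scan a free map of the interrupt remapping
 * table for 'count' consecutive free entries. The free-map layout
 * and names are illustrative assumptions.
 */
static int
irt_find_contig_free(uint8_t *freemap, uint_t tbl_size, uint_t count)
{
	uint_t i, j;

	for (i = 0; i + count <= tbl_size; i++) {
		for (j = 0; j < count; j++) {
			if (freemap[i + j] != 0)
				break;		/* this slot is busy */
		}
		if (j == count)
			return ((int)i);	/* found a contiguous run */
	}
	return (-1);				/* no room */
}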
/* alloc one interrupt remapping table entry */
/* no free intr entry, use compatible format intr */
/*
 * x2apic mode does not allow compatible format interrupts
 */
/* alloc 'cnt' contiguous interrupt remapping table entries */
for (i = 0; i < cnt; i++) {
/* x2apic mode does not allow compatible format interrupts */
/* get the ioapic source id and iommu structure for ioapics */
/* initialize interrupt remapping */
/*
 * interrupt remapping is not a must in apic mode;
 * if all drhd units have intr remapping disabled,
 */
/* enable interrupt remapping */
/* get the iommu structure and interrupt source id for the ioapic */
/* alloc a remapping entry for the interrupt */
for (i = 1; i < cnt; i++) {
/* helper function to get the iommu structure */
/* for fixed interrupt */
/* helper function to get the interrupt request source id */
/* for interrupt through the I/O APIC */
/* device behind a pcie to pci bridge */
/* device behind a pci to pci bridge */
/* remapping the interrupt */
/* set interrupt remapping table entry */
for (i = 0; i < cnt; i++) {
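/*
 * A hedged sketch of an interrupt remapping table entry, following the
 * published VT-d layout rather than this file's own definitions: the
 * low word carries the present bit, the vector in bits 23:16 and the
 * destination in bits 63:32; the high word holds the source id with
 * the SQ/SVT verification fields. Names are illustrative.
 */
typedef struct irte {
	uint64_t lo;
	uint64_t hi;
} irte_t;

static void
irte_set(volatile irte_t *e, uint8_t vector, uint32_t dest, uint16_t sid)
{
	e->hi = (uint64_t)sid;		/* SID; SQ/SVT left at 0 here */
	e->lo = ((uint64_t)dest << 32) |
	    ((uint64_t)vector << 16) |
	    1ULL;			/* present bit */
	/* a real driver would then flush this entry from the iec */
}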
/* set interrupt remapping table entry */
/* free the remapping entry */
/* record the ioapic rdt entry */
/* record the msi interrupt structure */