/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _CHEETAHASM_H
#define _CHEETAHASM_H
#pragma ident "%Z%%M% %I% %E% SMI"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _ASM
/* BEGIN CSTYLED */
/*
* Macro for getting to offset from 'cpu_private' ptr. The 'cpu_private'
* ptr is in the machcpu structure.
* off_reg: Register offset from 'cpu_private' ptr.
* scr1: Scratch, ptr is returned in this register.
* scr2: Scratch
*/
nop; \
/*
* Macro version of get_dcache_dtag. We use this macro in the
* CPU logout code. Since the Dcache is virtually indexed, only
* bits [12:5] of the AFAR can be used so we need to search through
* 8 indexes (4 ways + bit 13) in order to find the tag we want.
* afar: input AFAR, not modified.
* datap: input ptr to ch_dc_data_t, at end pts to end of ch_dc_data_t.
* scr1: scratch.
* scr2: scratch, will hold tag to look for.
* scr3: used for Dcache index, loops through 4 ways.
*/
b 1f; \
.align 128; \
1: \
nop; \
2: \
blt 2b; \
\
3: \
blt 3b; \
b 5f; \
4: \
blt 1b; \
nop; \
5:
/*
* Macro version of get_icache_dtag. We use this macro in the CPU
* logout code. If the Icache is on, we don't want to capture the data.
* afar: input AFAR, not modified.
* datap: input ptr to ch_ic_data_t, at end pts to end of ch_ic_data_t.
* scr1: scratch.
* scr2: scratch, will hold tag to look for.
* scr3: used for Icache index, loops through 4 ways.
* Note: For Panther, the Icache is virtually indexed and increases in
* size to 64KB (instead of 32KB) with a line size of 64 bytes (instead
* of 32). This means the IC_addr index bits[14:7] for Panther now
* correspond to VA bits[13:6]. But since it is virtually indexed, we
* still mask out only bits[12:5] from the AFAR (we have to manually
* check bit 13). In order to make this code work for all processors,
* we end up checking twice as many indexes (8 instead of 4) as required
* for non-Panther CPUs and saving off twice as much data (16 instructions
* instead of just 8).
*/
nop; /* Panther I$ is VIPT. */ \
b 2f; \
nop; \
1: \
2: \
b 3f; \
nop; \
.align 128; \
3: \
nop; \
4: \
blt 4b; \
b 6f; \
nop; \
5: \
blt 3b; \
nop; \
6:
/*
* Macro version of get_ecache_dtag. We use this macro in the
* CPU logout code.
* afar: input AFAR, not modified
* datap: Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
* ec_way: Constant value (way number)
* scr1: Scratch
* scr2: Scratch.
* scr3: Scratch.
*/
b 1f; \
.align 64; \
1: \
2: \
3: /* loop thru 5 regs */ \
bne 3b; \
beq 2b; \
/*
* Jalapeno does not have cores so these macros are null.
*/
#if defined(JALAPENO)
/*
* Jalapeno gets primary AFSR and AFAR. All bits in the AFSR except
* the fatal error bits are cleared.
* datap: pointer to cpu logout structure.
* afar: returned primary AFAR value.
* scr1: scratch
* scr2: scratch
*/
/*
* Jalapeno has no shadow AFAR, null operation.
*/
/*
* Serrano gets primary AFSR and AFAR. All bits in the AFSR except
* the fatal error bits are cleared. For Serrano, we also save the
* AFAR2 register.
* datap: pointer to cpu logout structure.
* afar: returned primary AFAR value.
* scr1: scratch
* scr2: scratch
*/
/*
* Serrano needs to capture E$, D$ and I$ lines associated with afar2.
* afar: scratch, holds afar2.
* datap: pointer to cpu logout structure
* scr1: scratch
* scr2: scratch
* scr3: scratch
*/
#endif /* SERRANO */
#elif defined(CHEETAH_PLUS)
/*
* Macro version of get_ecache_dtag. We use this macro in the
* CPU logout code.
* afar: input AFAR, not modified.
* datap: Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
* pn_way: ecache way for panther (value = 0-3). For non-panther
* cpus, this macro will be called with pn_way = 0.
* scr1: Scratch.
* scr2: Scratch.
* scr3: Scratch.
*/
b 1f; \
.align 64; \
1: \
2: \
3: /* loop thru 5 regs */ \
bne 3b; \
beq 2b; \
/*
* If this is a panther, we need to make sure the sibling core is
* parked so that we avoid any race conditions during diagnostic
* accesses to the shared L2 and L3 caches.
* dcucr_reg: This register will be used to keep track of whether
* or not we need to unpark the core later.
* It just so happens that we also use this same register
* to keep track of our saved DCUCR value so we only touch
* bit 4 of the register (which is a "reserved" bit in the
* DCUCR) for keeping track of core parking.
* scr1: Scratch register.
* scr2: Scratch register.
*/
GET_CPU_IMPL(scr1); \
1: \
nop; \
nop; \
2:
/*
* The core running this code will unpark its sibling core if the
* sibling core had been parked by the current core earlier in this
* trap handler.
* dcucr_reg: This register is used to keep track of whether or not
* we need to unpark our sibling core.
* It just so happens that we also use this same register
* to keep track of our saved DCUCR value so we only touch
* bit 4 of the register (which is a "reserved" bit in the
* DCUCR) for keeping track of core parking.
* scr1: Scratch register.
* scr2: Scratch register.
*/
1:
/*
* Gets the primary AFSR and AFAR. All bits
* in the primary AFSR are cleared except the fatal error bits. For Panther,
* we also have to read and clear the AFSR_EXT, again leaving the fatal
* error bits alone.
* datap: pointer to cpu logout structure.
* afar: returned primary AFAR value.
* scr1: scratch
* scr2: scratch
*/
GET_CPU_IMPL(scr1); \
nop; \
1:
/*
* This macro is used in the CPU logout code to capture diagnostic
* information from the L2 cache on panther processors.
* afar: input AFAR, not modified.
* datap: Ptr to pn_l2_data_t, at end pts just past pn_l2_data_t.
* scr1: Scratch.
* scr2: Scratch.
* scr3: Scratch.
*/
b 1f; /* code to read tags and data should be ... */ \
nop; /* ...on the same cache line if possible. */ \
1: \
2: \
3: \
/*
* Cheetah+ assumes E$ is 2-way and grabs both E$ lines associated with afar.
* afar: AFAR from access.
* datap: pointer to cpu logout structure.
* scr1: scratch
* scr2: scratch
* scr3: scratch
*/
GET_CPU_IMPL(scr1); \
nop; \
b 5f; \
nop; \
4: \
5:
/*
* Cheetah+ needs to capture E$, D$ and I$ lines associated with
* shadow afar.
* afar: scratch, holds shadow afar.
* datap: pointer to cpu logout structure
* scr1: scratch
* scr2: scratch
* scr3: scratch
*/
/*
* Compute the "Way" bit for 2-way Ecache for Cheetah+.
*/
#else /* CHEETAH_PLUS */
/*
* Macro version of get_ecache_dtag. We use this macro in the
* CPU logout code.
* afar: input AFAR, not modified.
* datap: Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
* scr1: Scratch.
* scr2: Scratch.
* scr3: Scratch.
*/
b 1f; \
.align 64; \
1: \
2: \
3: /* loop thru 5 regs */ \
bne 3b; \
beq 2b; \
/*
* Cheetah does not have cores so these macros are null.
*/
/*
* Cheetah gets primary AFSR and AFAR and clears the AFSR, except for the
* fatal error bits.
* datap: pointer to cpu logout structure.
* afar: returned primary AFAR value.
* scr1: scratch
* scr2: scratch
*/
/*
* Cheetah E$ is direct-mapped, so we grab line data and skip second line.
* afar: AFAR from access.
* datap: pointer to cpu logout structure.
* scr1: scratch
* scr2: scratch
* scr3: scratch
*/
/*
* Cheetah has no shadow AFAR, null operation.
*/
#endif /* CHEETAH_PLUS */
/*
* Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
* logout data at TL>0. r_val is a register that returns the "failure count"
* to the caller, and may be used as a scratch register until the end of
* the macro. afar is used to return the primary AFAR value to the caller
* and it too can be used as a scratch register until the end. r_or_s is
* a reg or symbol that has the offset within the "cpu_private" data area
* to deposit the logout data. t_flags is a register that has the
* trap-type/trap-level/CEEN info. This t_flags register may be used after
* the GET_AFSR_AFAR macro.
*
* The CPU logout operation will fail (r_val > 0) if the logout
* structure in question is already being used. Otherwise, the CPU
* logout operation will succeed (r_val = 0). For failures, r_val
* returns the busy count (# of times we tried using this CPU logout
* structure when it was busy.)
*
* Register usage:
* %asi: Must be set to either ASI_MEM if the address in datap
* is a physical address or to ASI_N if the address in
* datap is a virtual address.
* r_val: This register is the return value which tells the
* caller whether or not the LOGOUT operation was successful.
* For failures, r_val returns the fail count (i.e. number of
* times we have tried to use this logout structure when it was
* already being used).
* afar: output: contains AFAR on exit
* t_flags: input trap type info, may be used as scratch after stored
* to cpu log out structure.
* datap: Points to log out data area.
* scr1: Scratch
* scr2: Scratch (may be r_val)
* scr3: Scratch (may be t_flags)
*/
bne 8f; \
nop; \
ba 9f; \
nop; \
8: \
9:
/*
* Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
* logout data. Uses DO_TL1_CPU_LOGOUT macro defined above, and sets
* up the expected data pointer in the scr1 register and sets the %asi
* register to ASI_N for kernel virtual addresses instead of ASI_MEM as
* is used at TL>0.
*
* The CPU logout operation will fail (r_val > 0) if the logout
* structure in question is already being used. Otherwise, the CPU
* logout operation will succeed (r_val = 0). For failures, r_val
* returns the busy count (# of times we tried using this CPU logout
* structure when it was busy.)
*
* Register usage:
* r_val: This register is the return value which tells the
* caller whether or not the LOGOUT operation was successful.
* For failures, r_val returns the fail count (i.e. number of
* times we have tried to use this logout structure when it was
* already being used).
* afar: returns AFAR, used internally as afar value.
* output: if the cpu_private struct has not been initialized,
* then we return the t_flags value listed below.
* r_or_s: input offset, either register or constant (symbol). It's
* OK for r_or_s to be a register as long as it's not scr1 or
* scr3.
* t_flags: input trap type info, may be used as scratch after stored
* to cpu log out structure.
* scr1: Scratch, points to log out data area.
* scr2: Scratch (may be r_or_s)
* scr3: Scratch (may be r_val)
* scr4: Scratch (may be t_flags)
*/
ba 6f; \
nop; \
7: \
6:
/*
* The P$ is flushed as a side effect of writing to the Primary
* or Secondary Context Register. After writing to a context
* register, every line of the P$ in the Valid state is invalidated,
* regardless of which context it belongs to.
* This routine simply touches the Primary context register by
* reading the current value and writing it back. The Primary
* context is not changed.
*/
/*
* Macro that flushes the entire Dcache.
*
* arg1 = dcache size
* arg2 = dcache linesize
*/
1: \
/*
* Macro that flushes the entire Icache.
*
* Note that we cannot access ASI 0x67 (ASI_IC_TAG) with the Icache on,
* because accesses to ASI 0x67 interfere with Icache coherency. We
* must make sure the Icache is off, then turn it back on after the entire
* cache has been invalidated. If the Icache is originally off, we'll just
* clear the tags but not turn the Icache on.
*
* arg1 = icache size
* arg2 = icache linesize
*/
1: \
/*
* ASI access to the L2 tag or L2 flush can hang the cpu when interacting
* with combinations of L2 snoops, victims and stores.
*
* A possible workaround is to surround each L2 ASI access with membars
* and make sure that the code is hitting in the Icache. This requires
* aligning code sequence at E$ boundary and forcing I$ fetch by
* jumping to selected offsets so that we don't take any I$ misses
* during ASI access to the L2 tag or L2 flush. This also requires
* making sure that we don't take any interrupts or traps (such as
* fast ECC trap, I$/D$ tag parity error) which can result in eviction
* of this code sequence from I$, thus causing a miss.
*
* Because of the complexity/risk, we have decided to do a partial fix
* of adding membar around each ASI access to the L2 tag or L2 flush.
*/
#define JP_EC_DIAG_ACCESS_MEMBAR \
/*
* Jalapeno version of macro that flushes the entire Ecache.
*
* Uses Jalapeno displacement flush feature of ASI_EC_DIAG.
*
* arg1 = ecache size
* arg2 = ecache linesize - not modified; can be an immediate constant.
*/
1: \
nop; \
#else /* JALAPENO || SERRANO */
/*
* Cheetah version of macro that flushes the entire Ecache.
*
* Need to displacement flush 2x ecache size from Ecache flush area.
*
* arg1 = ecache size
* arg2 = ecache linesize
* arg3 = ecache flush address - for cheetah only
*/
1: \
/*
* Cheetah+ version of macro that flushes the entire Ecache.
*
* Uses the displacement flush feature.
*
* arg1 = ecache size
* arg2 = ecache linesize
* impl = CPU implementation as returned from GET_CPU_IMPL()
* The value in this register is destroyed during execution
* of the macro.
*/
#if defined(CHEETAH_PLUS)
nop; \
b 2f; \
nop; \
1: \
2: \
#else /* CHEETAH_PLUS */
#endif /* CHEETAH_PLUS */
/*
* Macro that flushes the entire Ecache.
*
* arg1 = ecache size
* arg2 = ecache linesize
* arg3 = ecache flush address - for cheetah only
*/
GET_CPU_IMPL(tmp1); \
nop; \
ba 3f; \
nop; \
2: \
3:
#endif /* JALAPENO || SERRANO */
/*
* Macro that flushes the Panther L2 cache.
*/
#if defined(CHEETAH_PLUS)
GET_CPU_IMPL(scr3); \
nop; \
1: \
2:
#else /* CHEETAH_PLUS */
#endif /* CHEETAH_PLUS */
/*
* Given a VA and page size (page size as encoded in ASI_MMU_TAG_ACCESS_EXT),
* this macro returns the TLB index for that mapping based on a 512 entry
* (2-way set associative) TLB. Aside from the 16 entry fully associative
* TLBs, all TLBs in Panther are 512 entry, 2-way set associative.
*
* To find the index, we shift the VA right by 13 + (3 * pg_sz) and then
* mask out all but the lower 8 bits because:
*
* ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 0 for 8K
* ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 1 for 64K
* ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 2 for 512K
* ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 3 for 4M
* ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 4 for 32M
* ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 5 for 256M
*
* and
*
* array index for 8K pages = VA[20:13]
* array index for 64K pages = VA[23:16]
* array index for 512K pages = VA[26:19]
* array index for 4M pages = VA[29:22]
* array index for 32M pages = VA[32:25]
* array index for 256M pages = VA[35:28]
*
* Inputs:
*
* va - Register.
* Input: Virtual address in which we are interested.
* Output: TLB index value.
* pg_sz - Register. Page Size of the TLB in question as encoded
* in the ASI_[D|I]MMU_TAG_ACCESS_EXT register.
*/
#if defined(CHEETAH_PLUS)
#endif /* CHEETAH_PLUS */
/*
* The following macros are for error traps at TL>0.
* The issue with error traps at TL>0 is that there are no safely
* available global registers. So we use the trick of generating a
* software trap, then using the %tpc, %tnpc and %tstate registers to
* temporarily save the values of %g1 and %g2.
*/
/*
* Macro to generate 8-instruction trap table entry for TL>0 trap handlers.
* Does the following steps:
* 1. membar #Sync - required for USIII family errors.
* 2. Specified software trap.
* NB: Must be 8 instructions or less to fit in trap table and code must
* be relocatable.
*/
/*
* Macro to generate 8-instruction trap table entry for TL>0 software trap.
* We save the values of %g1 and %g2 in %tpc, %tnpc and %tstate (since
* the low-order two bits of %tpc/%tnpc are reserved and read as zero,
* we need to put the low-order two bits of %g1 and %g2 in %tstate).
* Note that %tstate has a reserved hole from bits 3-7, so we put the
* low-order two bits of %g1 in bits 0-1 and the low-order two bits of
* %g2 in bits 10-11 (ensuring bits 8-9 are zero for use by the D$/I$
* state bits). Note that we must do a jmp instruction, since this
* is moved into the trap table entry.
* NB: Must be 8 instructions or less to fit in trap table and code must
* be relocatable.
*/
/*
* Macro to get ptr to ch_err_tl1_data.
* reg1 will either point to a physaddr with ASI_MEM in %asi OR it
* will point to a kernel nucleus virtual address with ASI_N in %asi.
* This allows us to:
* 1. Avoid getting MMU misses. We may have gotten the original
* Fast ECC error in an MMU handler and if we get an MMU trap
* in the TL>0 handlers, we'll scribble on the MMU regs.
* 2. Allows us to use the same code in the TL>0 handlers whether
* we're accessing kernel nucleus virtual addresses or physical
* addresses.
* pseudo-code:
* reg1 <- ch_err_tl1_paddrs[CPUID];
* if (reg1 == NULL) {
* reg1 <- &ch_err_tl1_data
* %asi <- ASI_N
* } else {
* reg1 <- reg1 + offset +
* sizeof (ch_err_tl1_data) * (%tl - 3)
* %asi <- ASI_MEM
* }
*/
ba 2f; \
2:
/*
* Macro to generate entry code for TL>0 error handlers.
* At the end of this macro, %g1 will point to the ch_err_tl1_data
* structure and %g2 will have the original flags in the ch_err_tl1_data
* structure and %g5 will have the value of %tstate where the Fast ECC
* routines will save the state of the D$ in Bit2 CH_ERR_TSTATE_DC_ON.
* All %g registers except for %g1, %g2 and %g5 will be available after
* this macro.
* Does the following steps:
* 1. Get the ptr to the ch_err_tl1_data save area, using
* only %g1+%g2 (which we've saved in %tpc, %tnpc, %tstate)
* leaving address in %g1 and updating the %asi register.
* If there is no data area available, we branch to label.
* 2. Save %g3-%g7 in save area.
* 3. Save %tpc->%g3, %tnpc->%g4, %tstate->%g5, which contain
* original %g1+%g2 values (because we're going to change %tl).
* 4. set %tl <- %tl - 1. We do this ASAP to make window of
* running at %tl+1 as small as possible.
* 5. Reconstitute %g1+%g2 from %tpc (%g3), %tnpc (%g4),
* %tstate (%g5) and save in save area, carefully preserving %g5
* because it has the CH_ERR_TSTATE_DC_ON value.
* 6. Load existing ch_err_tl1_data flags in %g2
* 7. Compute the new flags
* 8. If %g2 is non-zero (the structure was busy), shift the new
* flags by CH_ERR_ME_SHIFT and or them with the old flags.
* 9. Store the updated flags into ch_err_tl1_data flags.
* 10. If %g2 is non-zero, read the %tpc and store it in
* ch_err_tl1_data.
*/
8:
/*
* Turns off D$/I$ and saves the state of DCU_DC+DCU_IC in %tstate Bits 8+9
* (CH_ERR_TSTATE_DC_ON/CH_ERR_TSTATE_IC_ON). This is invoked on Fast ECC
* at TL>0 handlers because the D$ may have corrupted data and we need to
* turn off the I$ to allow for diagnostic accesses. We then invoke
* the normal entry macro and after it is done we save the values of
* the original D$/I$ state, which is in %g5 bits CH_ERR_TSTATE_DC_ON/
* CH_ERR_TSTATE_IC_ON in ch_err_tl1_tmp.
*/
#define CH_ERR_TL1_FECC_ENTER \
/*
* Macro to generate exit code for TL>0 error handlers.
* We fall into this macro if we've successfully logged the error in
* the ch_err_tl1_data structure and want the PIL15 softint to pick
* it up and log it.
* Does the following steps:
* 1. Set pending flag for this cpu in ch_err_tl1_pending.
* 2. Write %set_softint with (1<<pil) to cause a pil level trap
* 3. Restore registers from ch_err_tl1_data, which is pointed to
* by %g1, last register to restore is %g1 since it's pointing
* to the save area.
* 4. Execute retry
*/
#define CH_ERR_TL1_EXIT \
/*
* Generates unrecoverable error label for TL>0 handlers.
* At label (Unrecoverable error routine)
* 1. Sets flags in ch_err_tl1_data and leaves in %g2 (first
* argument to cpu_tl1_err_panic).
* 2. Call cpu_tl1_err_panic via systrap at PIL 15
*/
/* END CSTYLED */
#endif /* _ASM */
#ifdef __cplusplus
}
#endif
#endif /* _CHEETAHASM_H */