/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _CHEETAHASM_H
#define _CHEETAHASM_H
#pragma ident "%Z%%M% %I% %E% SMI"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _ASM
/* BEGIN CSTYLED */
/*
* Macro for getting to offset from 'cpu_private' ptr. The 'cpu_private'
* ptr is in the machcpu structure.
* off_reg: Register offset from 'cpu_private' ptr.
* scr1: Scratch, ptr is returned in this register.
* scr2: Scratch
*/
nop; \
/*
* Macro version of get_dcache_dtag. We use this macro in the
* CPU logout code. Since the Dcache is virtually indexed, only
* bits [12:5] of the AFAR can be used so we need to search through
* 8 indexes (4 ways + bit 13) in order to find the tag we want.
* afar: input AFAR, not modified.
* datap: input ptr to ch_dc_data_t, at end pts to end of ch_dc_data_t.
* scr1: scratch.
* scr2: scratch, will hold tag to look for.
* scr3: used for Dcache index, loops through 4 ways.
*/
b 1f; \
.align 128; \
1: \
nop; \
2: \
blt 2b; \
\
3: \
blt 3b; \
b 5f; \
4: \
blt 1b; \
nop; \
5:
/*
* Macro version of get_icache_dtag. We use this macro in the CPU
* logout code. If the Icache is on, we don't want to capture the data.
* afar: input AFAR, not modified.
* datap: input ptr to ch_ic_data_t, at end pts to end of ch_ic_data_t.
* scr1: scratch.
* scr2: scratch, will hold tag to look for.
* scr3: used for Icache index, loops through 4 ways.
* Note: For Panther, the Icache is virtually indexed and increases in
* size to 64KB (instead of 32KB) with a line size of 64 bytes (instead
* of 32). This means the IC_addr index bits[14:7] for Panther now
* correspond to VA bits[13:6]. But since it is virtually indexed, we
* still mask out only bits[12:5] from the AFAR (we have to manually
* check bit 13). In order to make this code work for all processors,
* we end up checking twice as many indexes (8 instead of 4) as required
* for non-Panther CPUs and saving off twice as much data (16 instructions
* instead of just 8).
*/
nop; /* Panther I$ is VIPT. */ \
b 2f; \
nop; \
1: \
2: \
b 3f; \
nop; \
.align 128; \
3: \
nop; \
4: \
blt 4b; \
b 6f; \
nop; \
5: \
blt 3b; \
nop; \
6:
/*
* Macro version of get_ecache_dtag. We use this macro in the
* CPU logout code.
* afar: input AFAR, not modified
* datap: Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
* ec_way: Constant value (way number)
* scr1: Scratch
* scr2: Scratch.
* scr3: Scratch.
*/
b 1f; \
.align 64; \
1: \
2: \
3: /* loop thru 5 regs */ \
bne 3b; \
beq 2b; \
/*
* Jalapeno does not have cores so these macros are null.
*/
#if defined(JALAPENO)
/*
* Jalapeno gets primary AFSR and AFAR. All bits in the AFSR except
* the fatal error bits are cleared.
* datap: pointer to cpu logout structure.
* afar: returned primary AFAR value.
* scr1: scratch
* scr2: scratch
*/
/*
* Jalapeno has no shadow AFAR, null operation.
*/
/*
* Serrano gets primary AFSR and AFAR. All bits in the AFSR except
* the fatal error bits are cleared. For Serrano, we also save the
* AFAR2 register.
* datap: pointer to cpu logout structure.
* afar: returned primary AFAR value.
* scr1: scratch
* scr2: scratch
*/
/*
* Serrano needs to capture E$, D$ and I$ lines associated with afar2.
* afar: scratch, holds afar2.
* datap: pointer to cpu logout structure
* scr1: scratch
* scr2: scratch
* scr3: scratch
*/
#endif /* SERRANO */
#elif defined(CHEETAH_PLUS)
/*
* Macro version of get_ecache_dtag. We use this macro in the
* CPU logout code.
* afar: input AFAR, not modified.
* datap: Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
* pn_way: ecache way for panther (value = 0-3). For non-panther
* cpus, this macro will be called with pn_way = 0.
* scr1: Scratch.
* scr2: Scratch.
* scr3: Scratch.
*/
b 1f; \
.align 64; \
1: \
2: \
3: /* loop thru 5 regs */ \
bne 3b; \
beq 2b; \
/*
* If this is a panther, we need to make sure the sibling core is
* parked so that we avoid any race conditions during diagnostic
* accesses to the shared L2 and L3 caches.
* dcucr_reg: This register will be used to keep track of whether
* or not we need to unpark the core later.
* It just so happens that we also use this same register
* to keep track of our saved DCUCR value so we only touch
* bit 4 of the register (which is a "reserved" bit in the
* DCUCR) for keeping track of core parking.
* scr1: Scratch register.
* scr2: Scratch register.
*/
GET_CPU_IMPL(scr1); \
1: \
nop; \
nop; \
2:
/*
* The core running this code will unpark its sibling core if the
* sibling core had been parked by the current core earlier in this
* trap handler.
* dcucr_reg: This register is used to keep track of whether or not
* we need to unpark our sibling core.
* It just so happens that we also use this same register
* to keep track of our saved DCUCR value so we only touch
* bit 4 of the register (which is a "reserved" bit in the
* DCUCR) for keeping track of core parking.
* scr1: Scratch register.
* scr2: Scratch register.
*/
1:
/*
* Gets the primary AFSR and AFAR. All bits
* in the primary AFSR are cleared except the fatal error bits. For Panther,
* we also have to read and clear the AFSR_EXT, again leaving the fatal
* error bits alone.
* datap: pointer to cpu logout structure.
* afar: returned primary AFAR value.
* scr1: scratch
* scr2: scratch
*/
GET_CPU_IMPL(scr1); \
nop; \
1:
/*
* This macro is used in the CPU logout code to capture diagnostic
* information from the L2 cache on panther processors.
* afar: input AFAR, not modified.
* datap: Ptr to pn_l2_data_t, at end pts just past pn_l2_data_t.
* scr1: Scratch.
* scr2: Scratch.
* scr3: Scratch.
*/
b 1f; /* code to read tags and data should be ... */ \
nop; /* ...on the same cache line if possible. */ \
1: \
2: \
3: \
/*
* Cheetah+ assumes E$ is 2-way and grabs both E$ lines associated with afar.
* afar: AFAR from access.
* datap: pointer to cpu logout structure.
* scr1: scratch
* scr2: scratch
* scr3: scratch
*/
GET_CPU_IMPL(scr1); \
nop; \
b 5f; \
nop; \
4: \
5:
/*
* Cheetah+ needs to capture E$, D$ and I$ lines associated with
* shadow afar.
* afar: scratch, holds shadow afar.
* datap: pointer to cpu logout structure
* scr1: scratch
* scr2: scratch
* scr3: scratch
*/
/*
* Compute the "Way" bit for 2-way Ecache for Cheetah+.
*/
#else /* CHEETAH_PLUS */
/*
* Macro version of get_ecache_dtag. We use this macro in the
* CPU logout code.
* afar: input AFAR, not modified.
* datap: Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
* scr1: Scratch.
* scr2: Scratch.
* scr3: Scratch.
*/
b 1f; \
.align 64; \
1: \
2: \
3: /* loop thru 5 regs */ \
bne 3b; \
beq 2b; \
/*
* Cheetah does not have cores so these macros are null.
*/
/*
* Cheetah gets primary AFSR and AFAR and clears the AFSR, except for the
* fatal error bits.
* datap: pointer to cpu logout structure.
* afar: returned primary AFAR value.
* scr1: scratch
* scr2: scratch
*/
/*
* Cheetah E$ is direct-mapped, so we grab line data and skip second line.
* afar: AFAR from access.
* datap: pointer to cpu logout structure.
* scr1: scratch
* scr2: scratch
* scr3: scratch
*/
/*
* Cheetah has no shadow AFAR, null operation.
*/
#endif /* CHEETAH_PLUS */
/*
* Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
* logout data at TL>0. r_val is a register that returns the "failure count"
* to the caller, and may be used as a scratch register until the end of
* the macro. afar is used to return the primary AFAR value to the caller
* and it too can be used as a scratch register until the end. r_or_s is
* a reg or symbol that has the offset within the "cpu_private" data area
* to deposit the logout data. t_flags is a register that has the
* trap-type/trap-level/CEEN info. This t_flags register may be used after
* the GET_AFSR_AFAR macro.
*
* The CPU logout operation will fail (r_val > 0) if the logout
* structure in question is already being used. Otherwise, the CPU
* logout operation will succeed (r_val = 0). For failures, r_val
* returns the busy count (# of times we tried using this CPU logout
* structure when it was busy.)
*
* Register usage:
* %asi: Must be set to either ASI_MEM if the address in datap
* is a physical address or to ASI_N if the address in
* datap is a virtual address.
* r_val: This register is the return value which tells the
* caller whether or not the LOGOUT operation was successful.
* For failures, r_val returns the fail count (i.e. number of
* times we have tried to use this logout structure when it was
* already being used).
* afar: output: contains AFAR on exit
* t_flags: input trap type info, may be used as scratch after stored
* to cpu log out structure.
* datap: Points to log out data area.
* scr1: Scratch
* scr2: Scratch (may be r_val)
* scr3: Scratch (may be t_flags)
*/
bne 8f; \
nop; \
ba 9f; \
nop; \
8: \
9:
/*
* Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
* logout data. Uses DO_TL1_CPU_LOGOUT macro defined above, and sets
* up the expected data pointer in the scr1 register and sets the %asi
* register to ASI_N for kernel virtual addresses instead of ASI_MEM as
* is used at TL>0.
*
* The CPU logout operation will fail (r_val > 0) if the logout
* structure in question is already being used. Otherwise, the CPU
* logout operation will succeed (r_val = 0). For failures, r_val
* returns the busy count (# of times we tried using this CPU logout
* structure when it was busy.)
*
* Register usage:
* r_val: This register is the return value which tells the
* caller whether or not the LOGOUT operation was successful.
* For failures, r_val returns the fail count (i.e. number of
* times we have tried to use this logout structure when it was
* already being used).
* afar: returns AFAR, used internally as afar value.
* output: if the cpu_private struct has not been initialized,
* then we return the t_flags value listed below.
* r_or_s: input offset, either register or constant (symbol). It's
* OK for r_or_s to be a register as long as it's not scr1 or
* scr3.
* t_flags: input trap type info, may be used as scratch after stored
* to cpu log out structure.
* scr1: Scratch, points to log out data area.
* scr2: Scratch (may be r_or_s)
* scr3: Scratch (may be r_val)
* scr4: Scratch (may be t_flags)
*/
ba 6f; \
nop; \
7: \
6:
/*
* The P$ is flushed as a side effect of writing to the Primary
* or Secondary Context Register. After writing to a context
* register, every line of the P$ in the Valid state is invalidated,
* regardless of which context it belongs to.
* This routine simply touches the Primary context register by
* reading the current value and writing it back. The Primary
* context is not changed.
*/
/*
* Macro that flushes the entire Dcache.
*
* arg1 = dcache size
* arg2 = dcache linesize
*/
1: \
/*
* Macro that flushes the entire Icache.
*
* Note that we cannot access ASI 0x67 (ASI_IC_TAG) with the Icache on,
* because accesses to ASI 0x67 interfere with Icache coherency. We
* must make sure the Icache is off, then turn it back on after the entire
* cache has been invalidated. If the Icache is originally off, we'll just
* clear the tags but not turn the Icache on.
*
* arg1 = icache size
* arg2 = icache linesize
*/
1: \
/*
* ASI access to the L2 tag or L2 flush can hang the cpu when interacting
* with combinations of L2 snoops, victims and stores.
*
* A possible workaround is to surround each L2 ASI access with membars
* and make sure that the code is hitting in the Icache. This requires
* aligning code sequence at E$ boundary and forcing I$ fetch by
* jumping to selected offsets so that we don't take any I$ misses
* during ASI access to the L2 tag or L2 flush. This also requires
* making sure that we don't take any interrupts or traps (such as
* fast ECC trap, I$/D$ tag parity error) which can result in eviction
* of this code sequence from I$, thus causing a miss.
*
* Because of the complexity/risk, we have decided to do a partial fix
* of adding membar around each ASI access to the L2 tag or L2 flush.
*/
#define JP_EC_DIAG_ACCESS_MEMBAR \
/*
* Jalapeno version of macro that flushes the entire Ecache.
*
* Uses Jalapeno displacement flush feature of ASI_EC_DIAG.
*
* arg1 = ecache size
* arg2 = ecache linesize - not modified; can be an immediate constant.
*/
1: \
nop; \
#else /* JALAPENO || SERRANO */
/*
* Cheetah version of macro that flushes the entire Ecache.
*
* Need to displacement flush 2x ecache size from Ecache flush area.
*
* arg1 = ecache size
* arg2 = ecache linesize
* arg3 = ecache flush address - for cheetah only
*/
1: \
/*
* Cheetah+ version of macro that flushes the entire Ecache.
*
* Uses the displacement flush feature.
*
* arg1 = ecache size
* arg2 = ecache linesize
* impl = CPU implementation as returned from GET_CPU_IMPL()
* The value in this register is destroyed during execution
* of the macro.
*/
#if defined(CHEETAH_PLUS)
nop; \
b 2f; \
nop; \
1: \
2: \
#else /* CHEETAH_PLUS */
#endif /* CHEETAH_PLUS */
/*
* Macro that flushes the entire Ecache.
*
* arg1 = ecache size
* arg2 = ecache linesize
* arg3 = ecache flush address - for cheetah only
*/
GET_CPU_IMPL(tmp1); \
nop; \
ba 3f; \
nop; \
2: \
3:
#endif /* JALAPENO || SERRANO */
/*
* Macro that flushes the Panther L2 cache.
*/
#if defined(CHEETAH_PLUS)
GET_CPU_IMPL(scr3); \
nop; \
1: \
2:
#else /* CHEETAH_PLUS */
#endif /* CHEETAH_PLUS */
/*
* Given a VA and page size (page size as encoded in ASI_MMU_TAG_ACCESS_EXT),
* this macro returns the TLB index for that mapping based on a 512 entry
* (2-way set associative) TLB. Aside from the 16 entry fully associative
* TLBs, all TLBs in Panther are 512 entry, 2-way set associative.
*
* To find the index, we shift the VA right by 13 + (3 * pg_sz) and then
* mask out all but the lower 8 bits because:
*
* ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 0 for 8K
* ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 1 for 64K
* ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 2 for 512K
* ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 3 for 4M
* ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 4 for 32M
* ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 5 for 256M
*
* and
*
* array index for 8K pages = VA[20:13]
* array index for 64K pages = VA[23:16]
* array index for 512K pages = VA[26:19]
* array index for 4M pages = VA[29:22]
* array index for 32M pages = VA[32:25]
* array index for 256M pages = VA[35:28]
*
* Inputs:
*
* va - Register.
* Input: Virtual address in which we are interested.
* Output: TLB index value.
* pg_sz - Register. Page Size of the TLB in question as encoded
* in the ASI_[D|I]MMU_TAG_ACCESS_EXT register.
*/
#if defined(CHEETAH_PLUS)
#endif /* CHEETAH_PLUS */
/*
* The following macros are for error traps at TL>0.
* The issue with error traps at TL>0 is that there are no safely
* available global registers. So we use the trick of generating a
* software trap, then using the %tpc, %tnpc and %tstate registers to
* temporarily save the values of %g1 and %g2.
*/
/*
* Macro to generate 8-instruction trap table entry for TL>0 trap handlers.
* Does the following steps:
* 1. membar #Sync - required for USIII family errors.
* 2. Specified software trap.
* NB: Must be 8 instructions or less to fit in trap table and code must
* be relocatable.
*/
/*
* Macro to generate 8-instruction trap table entry for TL>0 software trap.
* We save the values of %g1 and %g2 in %tpc, %tnpc and %tstate (since
* the low-order two bits of %tpc/%tnpc are reserved and read as zero,
* we need to put the low-order two bits of %g1 and %g2 in %tstate).
* Note that %tstate has a reserved hole from bits 3-7, so we put the
* low-order two bits of %g1 in bits 0-1 and the low-order two bits of
* %g2 in bits 10-11 (ensuring bits 8-9 are zero for use by the D$/I$
* state bits). Note that we must do a jmp instruction, since this
* is moved into the trap table entry.
* NB: Must be 8 instructions or less to fit in trap table and code must
* be relocatable.
*/
/*
* Macro to get ptr to ch_err_tl1_data.
* reg1 will either point to a physaddr with ASI_MEM in %asi OR it
* will point to a kernel nucleus virtual address with ASI_N in %asi.
* This allows us to:
* 1. Avoid getting MMU misses. We may have gotten the original
* Fast ECC error in an MMU handler and if we get an MMU trap
* in the TL>0 handlers, we'll scribble on the MMU regs.
* 2. Allows us to use the same code in the TL>0 handlers whether
* we're accessing kernel nucleus virtual addresses or physical
* addresses.
* pseudo-code:
* reg1 <- ch_err_tl1_paddrs[CPUID];
* if (reg1 == NULL) {
* reg1 <- &ch_err_tl1_data
* %asi <- ASI_N
* } else {
* reg1 <- reg1 + offset +
* sizeof (ch_err_tl1_data) * (%tl - 3)
* %asi <- ASI_MEM
* }
*/
ba 2f; \
2:
/*
* Macro to generate entry code for TL>0 error handlers.
* At the end of this macro, %g1 will point to the ch_err_tl1_data
* structure and %g2 will have the original flags in the ch_err_tl1_data
* structure and %g5 will have the value of %tstate where the Fast ECC
* routines will save the state of the D$ in Bit2 CH_ERR_TSTATE_DC_ON.
* All %g registers except for %g1, %g2 and %g5 will be available after
* this macro.
* Does the following steps:
* 1. Get the ptr to the ch_err_tl1_data save area, using
* only %g1+%g2 (which we've saved in %tpc, %tnpc, %tstate)
* leaving address in %g1 and updating the %asi register.
* If there is no data area available, we branch to label.
* 2. Save %g3-%g7 in save area.
* 3. Save %tpc->%g3, %tnpc->%g4, %tstate->%g5, which contain
* original %g1+%g2 values (because we're going to change %tl).
* 4. set %tl <- %tl - 1. We do this ASAP to make window of
* running at %tl+1 as small as possible.
* 5. Reconstitute %g1+%g2 from %tpc (%g3), %tnpc (%g4),
* %tstate (%g5) and save in save area, carefully preserving %g5
* because it has the CH_ERR_TSTATE_DC_ON value.
* 6. Load existing ch_err_tl1_data flags in %g2
* 7. Compute the new flags
* 8. If %g2 is non-zero (the structure was busy), shift the new
* flags by CH_ERR_ME_SHIFT and or them with the old flags.
* 9. Store the updated flags into ch_err_tl1_data flags.
* 10. If %g2 is non-zero, read the %tpc and store it in
* ch_err_tl1_data.
*/
8:
/*
* Turns off D$/I$ and saves the state of DCU_DC+DCU_IC in %tstate Bits 8+9
* (CH_ERR_TSTATE_DC_ON/CH_ERR_TSTATE_IC_ON). This is invoked on Fast ECC
* at TL>0 handlers because the D$ may have corrupted data and we need to
* turn off the I$ to allow for diagnostic accesses. We then invoke
* the normal entry macro and after it is done we save the values of
* the original D$/I$ state, which is in %g5 bits CH_ERR_TSTATE_DC_ON/
* CH_ERR_TSTATE_IC_ON in ch_err_tl1_tmp.
*/
#define CH_ERR_TL1_FECC_ENTER \
/*
* Macro to generate exit code for TL>0 error handlers.
* We fall into this macro if we've successfully logged the error in
* the ch_err_tl1_data structure and want the PIL15 softint to pick
* it up and log it.
* Does the following steps:
* 1. Set pending flag for this cpu in ch_err_tl1_pending.
* 2. Write %set_softint with (1<<pil) to cause a pil level trap
* 3. Restore registers from ch_err_tl1_data, which is pointed to
* by %g1, last register to restore is %g1 since it's pointing
* to the save area.
* 4. Execute retry
*/
#define CH_ERR_TL1_EXIT \
/*
* Generates unrecoverable error label for TL>0 handlers.
* At label (Unrecoverable error routine)
* 1. Sets flags in ch_err_tl1_data and leaves in %g2 (first
* argument to cpu_tl1_err_panic).
* 2. Call cpu_tl1_err_panic via systrap at PIL 15
*/
/* END CSTYLED */
#endif /* _ASM */
#ifdef __cplusplus
}
#endif
#endif /* _CHEETAHASM_H */