/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/nsc_thread.h>
#include "sd_bcache.h"
#include "sd_trace.h"
#include "sd_io.h"
#include "sd_bio.h"
#include "sd_misc.h"
#include "sd_ft.h"
#include "sd_pcu.h"
/*
* dynamic memory support
*/
static void _sd_dealloc_dm(void);
extern void sdbc_requeue_head_dm_try(_sd_cctl_t *);
extern int sdbc_use_dmchain;
extern _sd_queue_t *sdbc_dm_queues;
static volatile int _sd_flush_exit;
/* secret flush toggle flag for testing */
#ifdef DEBUG
int _sdbc_flush_flag = 1;	/* 0 ==> don't flush, 1 ==> flush */
#endif
static int sdbc_flush_pageio;
/*
 * Forward declare all statics that are used before they are defined,
 * to enforce parameter checking.
 * Some (if not all) of these could be removed if the code were reordered.
*/
static void _sd_flush_thread(void);
int
_sdbc_flush_configure(void)
{
_sd_flush_exit = 1;
sdbc_flush_pageio = 0;
return (nsc_create_process(
(void (*)(void *))_sd_flush_thread, 0, TRUE));
}
void
_sdbc_flush_deconfigure(void)
{
_sd_flush_exit = 0;
}
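
/*
 * Illustrative sketch (not part of this driver): the configure/deconfigure
 * pair above starts the flush thread via nsc_create_process() and later asks
 * it to exit by clearing the volatile _sd_flush_exit flag.  The userland
 * analogue below shows the same start/stop handshake with pthreads standing
 * in for nsc_create_process(); all "example_" names are assumptions.
 */
#if 0	/* illustrative sketch only -- not compiled */
#include <pthread.h>
#include <unistd.h>

static volatile int example_flush_exit;		/* plays _sd_flush_exit */
static pthread_t example_flush_tid;

static void *
example_flush_thread(void *arg)
{
	(void) arg;
	while (example_flush_exit) {		/* cleared by deconfigure */
		/* ... flush dirty cache entries here ... */
		(void) usleep(200000);		/* periodic wakeup */
	}
	return (NULL);
}

static int
example_flush_configure(void)
{
	example_flush_exit = 1;
	return (pthread_create(&example_flush_tid, NULL,
	    example_flush_thread, NULL));
}

static void
example_flush_deconfigure(void)
{
	example_flush_exit = 0;		/* thread exits on next wakeup */
	(void) pthread_join(example_flush_tid, NULL);
}
#endif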
static int
sdbc_alloc_static_cache(int reqblks)
{
	_sd_cctl_t *centry;
	_sd_cctl_t *next_centry;

	if ((centry = sdbc_centry_alloc_blks(_CD_NOHASH, 0, reqblks,
	    ALLOC_NOWAIT)) != NULL) {
		/* release the blocks to the queue */
		while (centry) {
			next_centry = centry->cc_chain;
			_sd_centry_release(centry);
			centry = next_centry;
		}
		return (reqblks);
	}
	return (0);
}
int
_sdbc_dealloc_configure_dm(void)
{
int rc = 0;
int i;
int blks_remaining;
int blks_allocd = 0;
if (sdbc_static_cache) { /* alloc all static cache memory here */
for (i = 0; i < blk_groups; ++i) {
if (!sdbc_alloc_static_cache(reqblks))
break;
blks_allocd += reqblks;
}
int, i, int, blks_allocd);
/* if successful then allocate any remaining blocks */
if ((i == blk_groups) && blks_remaining)
int, i, int, blks_allocd);
if (blks_allocd < CBLOCKS) {
"memory.\n requested mem: %d MB; actual mem: %d MB",
}
#ifdef DEBUG
"(%d cache blocks) allocated for static cache, "
BLK_SIZE(1));
#endif /* DEBUG */
} else {
TRUE);
if (rc != 0)
}
return (rc);
}
/*
* sdbc_dealloc_dm_shutdown - deallocate cache memory.
*
* ARGUMENTS: none
*
* RETURNS: nothing
*
* USAGE:
* this function is intended for use after all i/o has stopped and all
 * other cache threads have terminated. write cache resources, if any,
 * are released, except in the case of pinned data.
*/
static void
sdbc_dealloc_dm_shutdown(void)
{
if (!cc_ent)
return;
do {
if (cc_ent->cc_alloc_size_dm) {
/* HOST or OTHER */
cc_ent->cc_alloc_size_dm = 0;
DTRACE_PROBE2(sdbc_dealloc_dm_shutdown, char *,
}
/* release safestore resource, if any. preserve pinned data */
}
}
void
_sdbc_dealloc_deconfigure_dm(void)
{
int one_sec;
if (sdbc_static_cache) {
return;
}
if (sd_dealloc_flag_dm == NO_THREAD_DM)
return; /* thread never started */
while (sd_dealloc_flag_dm != CACHE_THREAD_TERMINATED_DM)
}
/*
 * This complicated (possibly overly complicated) routine works as follows:
 * In general the routine sleeps a specified amount of time, then wakes and
 * examines the entire centry list. If an entry is available it ages it by one
 * tick, else it clears the aging flag completely. It then determines if the
 * centry has aged sufficiently to have its memory deallocated and for it to
 * be placed at the top of the lru.
*
 * There are two deallocation schemes in place, depending on whether the
 * centry is a standalone entry or a member of a host/parasite chain.
 *
 * The behavior for a standalone entry is as follows:
 * If the given centry is selected it will age normally; however, at full
 * aging it will only be placed at the head of the lru. Its memory will
 * not be deallocated until a further aging level has been reached. The
 * entries selected for this behavior are governed by counting the number
 * of these holdovers in existence on each wakeup and comparing it
 * to a specified percentage. This comparison is always one cycle out of
 * date and will float in the relative vicinity of the specified number.
 *
 * The behavior for a host/parasite chain is as follows:
 * The chain is examined. If all entries are fully aged, the entire chain
 * is removed - i.e. memory is deallocated from the host entry, all memory
 * references are removed from the parasitic entries, and each entry is
 * requeued back onto the lru.
*
 * There are three delay timeouts and two percentage levels specified. Timeout
 * level 1 is honored between 100% free and pcnt level 1. Timeout level 2 is
 * honored between pcnt level 1 and pcnt level 2. Timeout level 3 is
 * honored between pcnt level 2 and 0% free. In addition there exists an
 * accelerated aging flag which mimics hysteresis behavior. If the available
 * centrys fall between pcnt1 and pcnt2, an 8 bit counter is switched on. The
 * effect is to keep the timer value at timer level 2 for 8 cycles even if the
 * number of available cache entries drifts above pcnt1. If it falls below
 * pcnt2, an additional 8 bit counter is switched on. This causes the sleep
 * timer to remain at timer level 3 for at least 8 cycles even if it floats
 * above pcnt2 or even pcnt1. The effect of all this is to accelerate the
 * release of system resources under a heavy load.
*
* All of the footwork can be stubbed out by a judicious selection of values
* for the times, aging counts and pcnts.
*
* All of these behavior parameters are adjustable on the fly via the kstat
* mechanism. In addition there is a thread wakeup msg available through the
* same mechanism.
*/
static void
_sd_dealloc_dm(void)
{
int max_holds_ct;
int dealloc;
/* setup a one sec time var */
last_holds_ct = 0;
/*CONSTANTCONDITION*/
while (1) {
if (sd_dealloc_flag_dm == CACHE_SHUTDOWN_DM) {
/* finished. shutdown - get out */
sdbc_dealloc_dm_shutdown(); /* free all memory */
return;
}
/* has the world changed */
/*
* get num cctl entries (%) below which different sleep
* rates kick in
*/
/* get sleep rates for each level */
/* get num of cycles for full normal aging */
/* get num of cycles for full meta aging */
/* get num of cycles for full extended holdover aging */
/* get maximum holds count in % */
/ 100;
/* apply the delay */
if (sd_dealloc_flag_dm == TIME_DELAY_LVL1)
else
if (sd_dealloc_flag_dm == TIME_DELAY_LVL2)
/* check for special directives on wakeup */
if (ppvars->process_directive &
}
/* Start of deallocation loop */
write_dealloc = 0;
ppvars->candidates = 0;
while (sd_dealloc_flag_dm != CACHE_SHUTDOWN_DM &&
== FALSE) {
goto next_dealloc_entry;
}
else
else
} else {
if (last_holds_ct < max_holds_ct)
}
ppvars->candidates++;
goto next_dealloc_entry;
}
		/* bona fide aged entry - examine its chain */
while (cur_ent) {
else {
0) == TRUE) {
cur_ent->cc_aging_dm |=
if ((cur_ent->cc_aging_dm &
} else
}
}
/* chain not fully free - free inuse for all entries */
while (cur_ent) {
if (cur_ent->cc_aging_dm &
cur_ent->cc_aging_dm &=
}
}
} else { /* OK - free memory */
if (hold_candidate == TRUE &&
if (wctl) {
wctl);
}
goto next_dealloc_entry;
} /* if (hold_candidate == TRUE */
while (cur_ent) {
_sd_cctl_t *, cur_ent,
int, CENTRY_BLK(cur_ent),
if ((cur_ent->cc_aging_dm
& BAD_CHAIN_DM)) {
(void) _sd_hash_delete(
(_sd_hash_hd_t *)cur_ent,
continue;
}
if (cur_ent->cc_alloc_size_dm) {
int qidx;
_sd_queue_t *q;
/* HOST or OTHER */
/* debugging */
ppvars->dealloc_ct++;
/*
* remove from queue
* in preparation for putting
* on the 0 queue after
* memory is freed
*/
if (sdbc_use_dmchain) {
qidx =
q = &sdbc_dm_queues
[qidx];
cur_ent);
}
}
cur_ent->cc_alloc_size_dm = 0;
cur_ent->cc_aging_dm &=
~(FINAL_AGING_DM | ENTRY_FIELD_DM |
(void) _sd_hash_delete(
(_sd_hash_hd_t *)cur_ent,
if (sdbc_use_dmchain) {
_sd_queue_t *q;
q = &sdbc_dm_queues[0];
cur_ent);
} else {
}
if (wctl) {
wctl);
}
} /* while (cur_ent) */
} /* else OK - free memory */
} /* while (entries) */
"!notavl=%x, nodat=%x, cand=%x, hosts=%x,"
" pests=%x, metas=%x, holds=%x, others=%x,"
" deallo=%x",
}
}
if (sd_dealloc_flag_dm == CACHE_SHUTDOWN_DM)
continue;
/* set the history flag which will govern the sleep rate */
/* upper - lots of virgin cctls */
} else {
/* middle - not so many virgin cctls */
else
} else {
/*
* appear to be running low - accelerate the
* aging to free more
*/
else
}
}
else
} /* while (TRUE) */
}
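
/*
 * Illustrative sketch (not part of this driver): a condensed version of the
 * sleep-rate selection with the two 8-cycle hysteresis counters described in
 * the block comment above _sd_dealloc_dm().  The tunable names, thresholds
 * and helper below are assumptions, not the driver's actual variables.
 */
#if 0	/* illustrative sketch only -- not compiled */
struct dealloc_tunables {
	int pcnt1, pcnt2;			/* percent-available thresholds */
	int tick_lvl1, tick_lvl2, tick_lvl3;	/* sleep times in ticks */
};

static int
pick_sleep_ticks(int pct_avail, struct dealloc_tunables *t,
    unsigned char *accel1, unsigned char *accel2)
{
	/* falling below a threshold latches the matching 8-cycle counter */
	if (pct_avail < t->pcnt2)
		*accel2 = 0xff;
	else if (pct_avail < t->pcnt1)
		*accel1 = 0xff;

	if (*accel2) {
		*accel2 >>= 1;		/* stays nonzero for 8 wakeups */
		return (t->tick_lvl3);	/* fastest aging rate */
	}
	if (*accel1) {
		*accel1 >>= 1;
		return (t->tick_lvl2);
	}
	return (t->tick_lvl1);		/* plenty of free entries: sleep long */
}
#endif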
int
{
/*
* if using dmchaining return immediately and do not attempt
* to acquire the cc_ent if there is no memory associated with
* this cc_ent.
* this avoids conflicts for centrys on the 0 queue.
* see sdbc_get_dmchain()
*/
if (nodata)
(*nodata)++;
return (FALSE);
}
if ((SET_CENTRY_INUSE(cc_ent))) {
return (FALSE);
}
if ((SET_CENTRY_PAGEIO(cc_ent))) {
return (FALSE);
}
/*
	 * we allow the QHEAD flag as it does not affect the availability
* of memory for aging
*/
if (nodata)
(*nodata)++;
}
return (FALSE);
}
return (TRUE);
}
/*
 * function below to prohibit code movement by the compiler
 * and avoid using spinlocks for synchronization
*/
static void
_sd_cc_iostatus_initiate(_sd_cctl_t *cc_ent)
{
	cc_ent->cc_iostatus = _SD_IO_INITIATE;
	sd_serialize();
}
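
/*
 * Illustrative sketch (not part of this driver): the wrapper above publishes
 * an i/o status update and then calls sd_serialize() so the store is visible
 * in order, instead of taking a spinlock.  The userland analogue below uses
 * the Solaris membar_producer() barrier; the struct and field names are
 * assumptions.
 */
#if 0	/* illustrative sketch only -- not compiled */
#include <atomic.h>

struct example_entry {
	volatile int io_status;		/* status word read by the flusher */
};

static void
example_iostatus_publish(struct example_entry *ep, int status)
{
	ep->io_status = status;
	membar_producer();	/* make the store visible before later writes */
}
#endif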
/*
* Yet another switch!
* alloc mem and coalesce if at least this number of frags
*/
/*
* optimization for _sd_async_flclist()
* called only if not doing pageio and sdbc_coalesce_backend > 0
*
 * returns with the pageio bit set in the centrys in the list
*/
static unsigned char *
{
int fba_len;
int total_len_bytes = 0;
unsigned char *next_addr;
int num_frags = 0;
}
while (cc_ent) {
/* check for contiguity */
if (prev_addr &&
++num_frags;
/* compute length */
if (FULLY_DIRTY(cc_ent)) {
} else {
}
}
if (num_frags >= sdbc_coalesce_backend) {
/*
* TODO - determine metric for deciding
* whether to coalesce memory or do separate i/o's
*/
int, num_frags, int, total_len_bytes);
/* copy the first dirty piece */
}
/* copy the rest of data */
while (cc_ent) {
if (FULLY_DIRTY(cc_ent)) {
} else {
}
}
}
}
return (start_addr);
}
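
/*
 * Illustrative sketch (not part of this driver): the coalescing path above
 * counts how many dirty pieces do not start where the previous piece ended
 * and only pays for a bounce buffer plus copies once that fragment count
 * reaches the sdbc_coalesce_backend threshold.  The flattened types and
 * helpers below are simplified assumptions.
 */
#if 0	/* illustrative sketch only -- not compiled */
#include <stdlib.h>
#include <string.h>

struct dirty_piece {
	unsigned char	*addr;		/* start of dirty data in cache memory */
	size_t		len;		/* dirty length in bytes */
};

/* Returns a malloc'd coalesced buffer when fragmented enough, else NULL. */
static unsigned char *
example_coalesce(struct dirty_piece *dp, int npieces, int frag_threshold)
{
	unsigned char *buf, *next;
	size_t total = 0;
	int i, frags = 0;

	for (i = 0; i < npieces; i++) {
		if (i > 0 && dp[i].addr != dp[i - 1].addr + dp[i - 1].len)
			frags++;	/* not contiguous with previous piece */
		total += dp[i].len;
	}
	if (frags < frag_threshold)
		return (NULL);		/* cheap enough to do separate i/o's */

	if ((buf = malloc(total)) == NULL)
		return (NULL);
	for (next = buf, i = 0; i < npieces; i++) {
		(void) memcpy(next, dp[i].addr, dp[i].len);
		next += dp[i].len;
	}
	return (buf);			/* one contiguous buffer, one write */
}
#endif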
void
{
int len;
int toflush;
extern int sdbc_do_page;
if (SDBC_IS_FRAGMENTED(first_dirty)) {
first_dirty = 0;
}
toflush = 0;
while (cc_ent->cc_dirty_next) {
if (cc_ent->cc_iocount)
cc_ent->cc_iocount, 0);
toflush++;
}
if (SDBC_IS_FRAGMENTED(last_dirty)) {
if (cc_prev)
last_dirty = 0;
}
else
toflush++;
if (toflush == 0)
return;
last_dirty)))
/* pageio bit already set in sdbc_alloc_io_mem() above */
if (!coalesce)
/* build buffer only if it was not done above */
if (!anon_mem) {
i = SDBC_LOOKUP_STPOS(first_dirty);
int, fba_len, char *,
}
} else {
len = 0;
flushed = 0;
}
while (cc_ent) {
/* pageio bit already set in sdbc_alloc_io_mem() above */
if (!coalesce)
if (FULLY_DIRTY(cc_ent)) {
flushed++;
/* build buffer only if it was not done above */
if (!anon_mem) {
int, BLK_FBAS, char *,
}
len += CACHE_BLOCK_SIZE;
} else {
#if defined(_SD_DEBUG)
/*
* consistency check.
*/
"!_sd_err: flclist: last_dirty %x next %x",
}
#endif
flushed++;
/* build buffer only if it was not done above */
if (!anon_mem) {
int, fba_len, char *,
}
}
}
#ifdef DEBUG
if (anon_mem)
#endif
/* SDTRACE(ST_INFO|SDF_FLCLIST, cd, FBA_NUM(len), dblk, flushed, bp); */
/* increment number of bytes destaged to disk */
}
void
{
else {
}
}
void
{
int dirty;
int cd;
#if defined(_SD_DEBUG_PATTERN)
#endif
if (cc_ent->cc_iocount)
cc_ent->cc_iocount, 0);
if (_SD_BMAP_ISFULL(dirty)) {
/* increment number of bytes destaged to disk */
} else {
while (dirty) {
int, len, char *,
/* SDTRACE(ST_INFO|SDF_FLCENT, cd, len, dblk, 0, bp); */
/* increment number of bytes destaged to disk */
}
}
}
static void
{
int dirty_enq;
if (processed) {
}
return;
}
if (processed) {
}
return;
}
#if defined(_SD_DEBUG)
#endif
goto process_loop;
dirty_enq = 0;
_sd_cctl_t *, cc_ent);
processed++;
if (CENTRY_PINNED(cc_ent))
/*
* Optimize for common case where block not inuse
* Grabbing cc_inuse is faster than cc_lock.
*/
if (SET_CENTRY_INUSE(cc_ent))
goto must_lock;
if (CENTRY_DIRTY_PENDING(cc_ent)) {
if (dirty_enq)
dirty_enq++;
continue;
}
/*
* if this was a QHEAD cache block, then
* _sd_centry_release() did not requeue it as
* it was dirty. Requeue it now.
*/
if (CENTRY_QHEAD(cc_ent))
if (sdbc_use_dmchain) {
			/* attempt to queue head */
if (cc_ent->cc_alloc_size_dm) {
}
} else
continue;
/*
* Block is inuse, must take cc_lock
* if DIRTY_PENDING, must re-issue
*/
/* was FAST */
if (CENTRY_DIRTY_PENDING(cc_ent)) {
/* was FAST */
if (dirty_enq)
dirty_enq++;
continue;
}
/*
* clear dirty bits, if block no longer inuse release cc_write
*/
if (SET_CENTRY_INUSE(cc_ent) == 0) {
/* was FAST */
/*
* if this was a QHEAD cache block, then
* _sd_centry_release() did not requeue it as
* it was dirty. Requeue it now.
*/
if (CENTRY_QHEAD(cc_ent))
if (sdbc_use_dmchain) {
			/* attempt to queue head */
if (cc_ent->cc_alloc_size_dm) {
(cc_ent);
}
} else
} else {
/* was FAST */
}
}
if (dirty_enq)
goto process_loop;
}
static void
{
int cd;
if (error) {
"Disk write failed cd %d (%s): err %d",
}
}
/* was FAST */
if (--(cc_ent->cc_iocount) != 0) {
/* more io's to complete before the cc_ent is done. */
if (cc_ent->cc_iocount < 0) {
/* was FAST */
} else {
/* was FAST */
}
(unsigned long)cc_ent);
return;
}
/* was FAST */
/*
* All io's are done for this cc_ent.
* Clear the pagelist io flag.
*/
if (error)
else
}
static void
int error)
{
int cd;
if (error) {
"Disk write failed cd %d (%s): err %d",
}
}
/*
	 * Important: skip the first cc_ent in the list. Marking this one will
	 * make the writer think the io is done, even though the rest of the
	 * chain has not been processed here, so mark the first cc_ent
	 * last. Optimization, so as not to use locks.
*/
while (cc_ent) {
cc_ent->cc_iocount, 0);
cc_ent->cc_iocount = 0;
/*
* Clear the pagelist io flag.
*/
if (error)
else
if (cc_ent->cc_dirty_next) {
int, BLK_FBAS, char *,
} else {
}
}
/*
* Now process the first cc_ent in the list.
*/
cc_ent->cc_iocount = 0;
cc_ent->cc_anon_len = 0;
}
/*
* Clear the pagelist io flag.
*/
if (error)
else
}
static void
{
int cd;
while (cc_ent) {
}
}
/*
* For dual-copy, log & clear PINNED, fall thru to regular processing.
*/
int
{
}
num++;
/* was FAST */
}
if (CENTRY_PINNED(cc_ent))
}
/*
	 * In normal processing we wouldn't need a lock here, as all i/o
	 * is single threaded by cd. However, during failover, blocks can
	 * be failing from real i/o, and as soon as the disk is marked bad
	 * the failover code, which is furiously cloning safe-store into
	 * more blocks, will short circuit to here (see _sd_ft_clone),
	 * and two threads can be executing in here simultaneously.
*/
return (1); /* blocks are failed */
}
static void
{
int cd;
if (CENTRY_PINNABLE(cc_ent)) {
}
/* was FAST */
/* was FAST */
}
/*
* cd_write_thread -- flush dirty buffers.
*
* ARGUMENTS:
*
* cd - cache descriptor
*
* USAGE:
 *	called by cd's writer thread; returns when no more entries remain
*
* NOTE: if sdbc is being shutdown (for powerfail) then we will
* process pending i/o's but issue no more new ones.
*/
#endif
static void
cd_write_thread(int cd)
{
if (!FILE_OPENED(cd)) {
return;
}
if (_sdbc_shutdown_in_progress) {
cdi->cd_write_inprogress = 0;
return;
}
return;
}
#endif
#ifdef DEBUG
if (!_sdbc_flush_flag) { /* hang the flusher for testing */
cdi->cd_write_inprogress = 0;
return;
}
#endif
/* was FAST */
if (SD_LOOP_DELAY == 0 ||
last_chain = NULL;
} else
int count = 0;
count++;
/* cdi->cd_dirty_tail is unchanged */
} else
#endif
{
}
/* was FAST */
cc_list != last_chain) {
else
}
}
cdi->cd_write_inprogress = 0;
}
/*
* cd_writer -- spawn new writer if not running already
 * called after enqueueing the dirty blocks
*/
int
cd_writer(int cd)
{
nsthread_t *t;
#if defined(_SD_USE_THREADS)
#endif /* _SD_USE_THREADS */
return (0);
return (0);
}
return (0);
if (t)
return (1);
return (-1);
}
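
/*
 * Illustrative sketch (not part of this driver): cd_writer() above only
 * spawns a flusher when one is not already running for the cache descriptor.
 * The userland analogue below shows that test-and-spawn idea with a C11
 * atomic flag and pthreads in place of the nst_create() thread pool; all
 * "example_" names are assumptions.
 */
#if 0	/* illustrative sketch only -- not compiled */
#include <pthread.h>
#include <stdatomic.h>

static atomic_int example_writer_running;	/* 0 = idle, 1 = running */

static void *
example_write_thread(void *arg)
{
	(void) arg;
	/* ... drain the dirty list for this cache descriptor ... */
	atomic_store(&example_writer_running, 0);
	return (NULL);
}

/* Returns 1 if a writer was spawned, 0 if one was already active or failed. */
static int
example_cd_writer(void)
{
	pthread_t t;
	int expected = 0;

	if (!atomic_compare_exchange_strong(&example_writer_running,
	    &expected, 1))
		return (0);			/* writer already active */
	if (pthread_create(&t, NULL, example_write_thread, NULL) != 0) {
		atomic_store(&example_writer_running, 0);
		return (0);
	}
	(void) pthread_detach(t);
	return (1);
}
#endif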
/*
* _sd_ccent_rd - add appropriate parts of cc_ent to struct buf.
* optimized not to read dirty FBAs from disk.
*
* ARGUMENTS:
*
* cc_ent - single cache block
* wanted - bitlist of FBAs that need to be read
* bp - struct buf to extend
*
* USAGE:
 *	Called for each dirty cache block in a read I/O.
* The bp must be sized to allow for one entry per FBA that needs
* to be read (see _sd_doread()).
*/
void
{
continue;
size++;
else {
size = 1;
if (state) { /* dirty, don't overwrite */
} else {
}
}
}
if (state1 != -3)
}
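
/*
 * Illustrative sketch (not part of this driver): the skip-the-dirty-FBAs
 * optimization described above _sd_ccent_rd() walks the wanted bitmap one
 * FBA at a time, groups runs of clean FBAs into read segments and leaves
 * FBAs that are already dirty in cache untouched.  The bitmap layout and
 * the segment record below are simplified assumptions.
 */
#if 0	/* illustrative sketch only -- not compiled */
struct ex_read_seg {
	int	start_fba;	/* first FBA of this read segment */
	int	nfbas;		/* number of contiguous FBAs to read */
};

/*
 * Build read segments for the FBAs set in 'wanted' that are not in 'dirty'.
 * Returns the number of segments written to 'seg' (sized one per FBA).
 */
static int
example_build_read_list(unsigned int wanted, unsigned int dirty,
    struct ex_read_seg *seg)
{
	int fba, nseg = 0, run = -1;

	for (fba = 0; wanted != 0; fba++, wanted >>= 1, dirty >>= 1) {
		if ((wanted & 1) && !(dirty & 1)) {
			if (run < 0) {		/* start a new clean run */
				run = nseg++;
				seg[run].start_fba = fba;
				seg[run].nfbas = 0;
			}
			seg[run].nfbas++;
		} else {
			run = -1;	/* dirty or unwanted FBA breaks the run */
		}
	}
	return (nseg);
}
#endif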
static void
_sd_flush_thread(void)
{
int cd;
int cnt;
int short_sleep = 0;
long tics;
int waiting_for_idle = 0;
int check_count = 0;
nsthread_t *t;
#if defined(_SD_USE_THREADS)
#endif /* _SD_USE_THREADS */
/* .2 seconds */
/* .02 seconds */
/* CONSTCOND */
while (1) {
if (_sd_flush_exit == 0) {
/*
* wait until no i/o's pending (on two successive
* iterations) or we see no progress after
* GIVE_UP_WAITING total sleeps.
*/
			/* at most 5*128 ticks, about 6 seconds, of no progress */
if (waiting_for_idle) {
pending = _sd_pending_iobuf();
/*LINTED*/
if (pending == last_pending) {
if (pending != 0)
check_count++;
} else
check_count = 0;
if ((last_pending == 0 && (pending == 0)) ||
(check_count == GIVE_UP_WAITING)) {
if (check_count == GIVE_UP_WAITING)
"!_sd_flush_thread "
"exiting with %d IOs "
"pending", pending);
return;
}
} else {
waiting_for_idle = 1;
}
}
/*
* Normally wakeup every SD_LONG_SLEEP_TICS to flush.
*/
if (!short_sleep) {
int rc;
short_sleep = 1;
} else {
"!sdbc(_sd_flush_thread)"
"cannot get safestore inq");
}
}
if (short_sleep)
else
cd = 0;
cnt = short_sleep = 0;
continue;
continue;
cnt++;
continue;
if (!_SD_CD_WBLK_USED(cd)) {
cdi->cd_failover = 0;
}
continue;
}
continue;
t = NULL;
if (tset) {
t = nst_create(tset,
0);
}
if (!t)
}
}
}
#if defined(_SD_DEBUG_PATTERN)
{
int *data;
int i, dirty_bl;
while (cc_entry) {
if (dirty_bl == 0) {
}
if (dirty_bl & 1) {
}
}
dirty_bl >>= 1;
}
}
}
char *rw;
{
int *data;
len++;
}
}
bvec1++;
}
handle->bh_fba_len);
}
}
#endif
int
_sdbc_wait_pending(void)
{
	int tries, pend, last;

	tries = 0;
	last = _sd_pending_iobuf();
	while ((pend = _sd_pending_iobuf()) > 0) {
		if (pend == last) {
			if (++tries > 60) {
				return (pend);
			}
		} else {
			last = pend;
			tries = 0;
		}
		delay(HZ);
	}
	return (0);
}