/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/types.h>
#include <sys/ksynch.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/ddi.h>
#include <sys/nsc_thread.h>
#include <sys/nsctl/nsctl.h>
#include <sys/sdt.h> /* dtrace is S10 or later */
#include "sd_bcache.h"
#include "sd_trace.h"
#include "sd_io.h"
#include "sd_bio.h"
#include "sd_ft.h"
#include "sd_misc.h"
#include "sd_pcu.h"
#include <sys/unistat/spcs_s.h>
#include <sys/unistat/spcs_s_k.h>
#include <sys/unistat/spcs_errors.h>
#include <sys/nsctl/safestore.h>
#ifndef DS_DDICT
#include <sys/ddi_impldefs.h>
#endif
/*
* kstat interface
*/
static kstat_t *sdbc_global_stats_kstat;
static int sdbc_global_stats_update(kstat_t *ksp, int rw);
typedef struct {
kstat_named_t ci_sdbc_count;
kstat_named_t ci_sdbc_loc_count;
kstat_named_t ci_sdbc_rdhits;
kstat_named_t ci_sdbc_rdmiss;
kstat_named_t ci_sdbc_wrhits;
kstat_named_t ci_sdbc_wrmiss;
kstat_named_t ci_sdbc_blksize;
kstat_named_t ci_sdbc_lru_blocks;
#ifdef DEBUG
kstat_named_t ci_sdbc_lru_noreq;
kstat_named_t ci_sdbc_lru_req;
#endif
kstat_named_t ci_sdbc_wlru_inq;
kstat_named_t ci_sdbc_cachesize;
kstat_named_t ci_sdbc_numblocks;
kstat_named_t ci_sdbc_num_shared;
kstat_named_t ci_sdbc_wrcancelns;
kstat_named_t ci_sdbc_destaged;
kstat_named_t ci_sdbc_nodehints;
} sdbc_global_stats_t;
static sdbc_global_stats_t sdbc_global_stats = {
{SDBC_GKSTAT_COUNT, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_LOC_COUNT, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_RDHITS, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_RDMISS, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_WRHITS, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_WRMISS, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_BLKSIZE, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_LRU_BLOCKS, KSTAT_DATA_ULONG},
#ifdef DEBUG
{SDBC_GKSTAT_LRU_NOREQ, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_LRU_REQ, KSTAT_DATA_ULONG},
#endif
{SDBC_GKSTAT_WLRU_INQ, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_CACHESIZE, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_NUMBLOCKS, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_NUM_SHARED, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_WRCANCELNS, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_DESTAGED, KSTAT_DATA_ULONG},
{SDBC_GKSTAT_NODEHINTS, KSTAT_DATA_ULONG},
};
static kstat_t **sdbc_cd_kstats;
static kstat_t **sdbc_cd_io_kstats;
static kmutex_t *sdbc_cd_io_kstats_mutexes;
static kstat_t *sdbc_global_io_kstat;
static kmutex_t sdbc_global_io_kstat_mutex;
static int sdbc_cd_stats_update(kstat_t *ksp, int rw);
static int cd_kstat_add(int cd);
static int cd_kstat_remove(int cd);
typedef struct {
kstat_named_t ci_sdbc_vol_name;
kstat_named_t ci_sdbc_failed;
kstat_named_t ci_sdbc_cd;
kstat_named_t ci_sdbc_cache_read;
kstat_named_t ci_sdbc_cache_write;
kstat_named_t ci_sdbc_disk_read;
kstat_named_t ci_sdbc_disk_write;
kstat_named_t ci_sdbc_filesize;
kstat_named_t ci_sdbc_numdirty;
kstat_named_t ci_sdbc_numio;
kstat_named_t ci_sdbc_numfail;
kstat_named_t ci_sdbc_destaged;
kstat_named_t ci_sdbc_wrcancelns;
kstat_named_t ci_sdbc_cdhints;
} sdbc_cd_stats_t;
static sdbc_cd_stats_t sdbc_cd_stats = {
{SDBC_CDKSTAT_VOL_NAME, KSTAT_DATA_CHAR},
{SDBC_CDKSTAT_FAILED, KSTAT_DATA_ULONG},
{SDBC_CDKSTAT_CD, KSTAT_DATA_ULONG},
{SDBC_CDKSTAT_CACHE_READ, KSTAT_DATA_ULONG},
{SDBC_CDKSTAT_CACHE_WRITE, KSTAT_DATA_ULONG},
{SDBC_CDKSTAT_DISK_READ, KSTAT_DATA_ULONG},
{SDBC_CDKSTAT_DISK_WRITE, KSTAT_DATA_ULONG},
#ifdef NSC_MULTI_TERABYTE
{SDBC_CDKSTAT_FILESIZE, KSTAT_DATA_UINT64},
#else
{SDBC_CDKSTAT_FILESIZE, KSTAT_DATA_ULONG},
#endif
{SDBC_CDKSTAT_NUMDIRTY, KSTAT_DATA_ULONG},
{SDBC_CDKSTAT_NUMIO, KSTAT_DATA_ULONG},
{SDBC_CDKSTAT_NUMFAIL, KSTAT_DATA_ULONG},
{SDBC_CDKSTAT_DESTAGED, KSTAT_DATA_ULONG},
{SDBC_CDKSTAT_WRCANCELNS, KSTAT_DATA_ULONG},
{SDBC_CDKSTAT_CDHINTS, KSTAT_DATA_ULONG},
};
#ifdef DEBUG
/*
* dynmem kstat interface
*/
static kstat_t *sdbc_dynmem_kstat_dm;
static int simplect_dm;
static int sdbc_dynmem_kstat_update_dm(kstat_t *ksp, int rw);
typedef struct {
kstat_named_t ci_sdbc_monitor_dynmem;
kstat_named_t ci_sdbc_max_dyn_list;
kstat_named_t ci_sdbc_cache_aging_ct1;
kstat_named_t ci_sdbc_cache_aging_ct2;
kstat_named_t ci_sdbc_cache_aging_ct3;
kstat_named_t ci_sdbc_cache_aging_sec1;
kstat_named_t ci_sdbc_cache_aging_sec2;
kstat_named_t ci_sdbc_cache_aging_sec3;
kstat_named_t ci_sdbc_cache_aging_pcnt1;
kstat_named_t ci_sdbc_cache_aging_pcnt2;
kstat_named_t ci_sdbc_max_holds_pcnt;
kstat_named_t ci_sdbc_alloc_ct;
kstat_named_t ci_sdbc_dealloc_ct;
kstat_named_t ci_sdbc_history;
kstat_named_t ci_sdbc_nodatas;
kstat_named_t ci_sdbc_candidates;
kstat_named_t ci_sdbc_deallocs;
kstat_named_t ci_sdbc_hosts;
kstat_named_t ci_sdbc_pests;
kstat_named_t ci_sdbc_metas;
kstat_named_t ci_sdbc_holds;
kstat_named_t ci_sdbc_others;
kstat_named_t ci_sdbc_notavail;
kstat_named_t ci_sdbc_process_directive;
kstat_named_t ci_sdbc_simplect;
} sdbc_dynmem_dm_t;
static sdbc_dynmem_dm_t sdbc_dynmem_dm = {
{SDBC_DMKSTAT_MONITOR_DYNMEM, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_MAX_DYN_LIST, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_CACHE_AGING_CT1, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_CACHE_AGING_CT2, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_CACHE_AGING_CT3, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_CACHE_AGING_SEC1, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_CACHE_AGING_SEC2, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_CACHE_AGING_SEC3, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_CACHE_AGING_PCNT1, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_CACHE_AGING_PCNT2, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_MAX_HOLDS_PCNT, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_ALLOC_CNT, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_DEALLOC_CNT, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_HISTORY, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_NODATAS, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_CANDIDATES, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_DEALLOCS, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_HOSTS, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_PESTS, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_METAS, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_HOLDS, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_OTHERS, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_NOTAVAIL, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_PROCESS_DIRECTIVE, KSTAT_DATA_ULONG},
{SDBC_DMKSTAT_SIMPLECT, KSTAT_DATA_ULONG}
};
#endif
/* End of dynmem kstats */
#ifdef DEBUG
int *dmchainpull_table; /* dmchain wastage stats */
#endif
/*
* dynmem process vars
*/
extern _dm_process_vars_t dynmem_processing_dm;
/* metadata for volumes */
ss_voldata_t *_sdbc_gl_file_info;
size_t _sdbc_gl_file_info_size;
/* metadata for cache write blocks */
static ss_centry_info_t *_sdbc_gl_centry_info;
/* wblocks * sizeof(ss_centry_info_t) */
static size_t _sdbc_gl_centry_info_size;
static int _SD_DELAY_QUEUE = 1;
static int sdbc_allocb_inuse, sdbc_allocb_lost, sdbc_allocb_hit;
static int sdbc_allocb_pageio1, sdbc_allocb_pageio2;
static int sdbc_centry_hit, sdbc_centry_inuse, sdbc_centry_lost;
static int sdbc_dmchain_not_avail;
static int sdbc_allocb_deallocd;
static int sdbc_centry_deallocd;
static int sdbc_check_cot;
static int sdbc_ra_hash; /* 1-block read-ahead fails due to hash hit */
static int sdbc_ra_none; /* 1-block read-ahead fails due to "would block" */
/*
* Set the following variable to 1 to enable pagelist io mutual
* exclusion on all _sd_alloc_buf() operations.
*
* This is set to ON to prevent front end / back end races between new
* NSC_WRTHRU io operations coming in through _sd_alloc_buf(), and
* previously written data being flushed out to disk by the sdbc
* flusher at the back end.
* -- see bugtraq 4287564
* -- Simon Crosland, Mon Nov 8 16:34:09 GMT 1999
*/
static int sdbc_pageio_always = 1;
int sdbc_use_dmchain = 0; /* start-time switch for dm chaining */
int sdbc_prefetch1 = 1; /* do 1-block read-ahead */
/*
* if sdbc_static_cache is 1 allocate all cache memory at startup.
* deallocate only at shutdown.
*/
int sdbc_static_cache = 1;
#ifdef DEBUG
/*
* Pagelist io mutual exclusion debug facility.
*/
#define SDBC_PAGEIO_OFF 0 /* no debug */
#define SDBC_PAGEIO_RDEV 1 /* force NSC_PAGEIO for specified dev */
#define SDBC_PAGEIO_RAND 2 /* randomly force NSC_PAGEIO */
#define SDBC_PAGEIO_ALL 3 /* always force NSC_PAGEIO */
static int sdbc_pageio_debug = SDBC_PAGEIO_OFF;
static dev_t sdbc_pageio_rdev = (dev_t)-1;
#endif
/*
* INF SD cache global data
*/
_sd_cd_info_t *_sd_cache_files;
_sd_stats_t *_sd_cache_stats;
kmutex_t _sd_cache_lock;
_sd_hash_table_t *_sd_htable;
_sd_queue_t _sd_lru_q;
_sd_cctl_t *_sd_cctl[_SD_CCTL_GROUPS];
int _sd_cctl_groupsz;
_sd_net_t _sd_net_config;
extern krwlock_t sdbc_queue_lock;
unsigned int _sd_node_hint;
#define _SD_LRU_Q (&_sd_lru_q)
int BLK_FBAS; /* number of FBA's in a cache block */
int CACHE_BLOCK_SIZE; /* size in bytes of a cache block */
int CBLOCKS;
_sd_bitmap_t BLK_FBA_BITS;
static int sdbc_prefetch_valid_cnt;
static int sdbc_prefetch_busy_cnt;
static int sdbc_prefetch_trailing;
static int sdbc_prefetch_deallocd;
static int sdbc_prefetch_pageio1;
static int sdbc_prefetch_pageio2;
static int sdbc_prefetch_hit;
static int sdbc_prefetch_lost;
static int _sd_prefetch_opt = 1; /* 0 to disable & use _prefetch_sb_vec[] */
static nsc_vec_t _prefetch_sb_vec[_SD_MAX_BLKS + 1];
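/*
* mask table: _fba_bits[n] has the low n bits set, one bit per
* valid FBA in a cache block.
*/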
_sd_bitmap_t _fba_bits[] = {
0x0000, 0x0001, 0x0003, 0x0007,
0x000f, 0x001f, 0x003f, 0x007f,
0x00ff,
#if defined(_SD_8K_BLKSIZE)
0x01ff, 0x03ff, 0x07ff,
0x0fff, 0x1fff, 0x3fff, 0x7fff,
0xffff,
#endif
};
static int _sd_ccsync_cnt = 256;
static _sd_cctl_sync_t *_sd_ccent_sync;
nsc_io_t *sdbc_io;
#ifdef _MULTI_DATAMODEL
_sd_stats32_t *_sd_cache_stats32 = NULL;
#endif
#ifdef DEBUG
int cmn_level = CE_PANIC;
#else
int cmn_level = CE_WARN;
#endif
/*
* Forward declare all statics that are used before defined to enforce
* parameter checking.
* Some (if not all) of these could be removed if the code were reordered.
*/
static void _sdbc_stats_deconfigure(void);
static int _sdbc_stats_configure(int cblocks);
static int _sdbc_lruq_configure(_sd_queue_t *);
static void _sdbc_lruq_deconfigure(void);
static int _sdbc_mem_configure(int cblocks, spcs_s_info_t kstatus);
static void _sdbc_mem_deconfigure(int cblocks);
static void _sd_ins_queue(_sd_queue_t *, _sd_cctl_t *centry);
static int _sd_flush_cd(int cd);
static int _sd_check_buffer_alloc(int cd, nsc_off_t fba_pos, nsc_size_t fba_len,
_sd_buf_handle_t **hp);
static int _sd_doread(_sd_buf_handle_t *handle, _sd_cctl_t *cc_ent,
nsc_off_t fba_pos, nsc_size_t fba_len, int flag);
static void _sd_async_read_ea(blind_t xhandle, nsc_off_t fba_pos,
nsc_size_t fba_len, int error);
static void _sd_async_write_ea(blind_t xhandle, nsc_off_t fba_pos,
nsc_size_t fba_len, int error);
static void _sd_queue_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos,
nsc_size_t fba_len);
static int _sd_remote_store(_sd_cctl_t *cc_ent, nsc_off_t fba_pos,
nsc_size_t fba_len);
static int _sd_copy_direct(_sd_buf_handle_t *handle1, _sd_buf_handle_t *handle2,
nsc_off_t fba_pos1, nsc_off_t fba_pos2, nsc_size_t fba_len);
static int _sd_sync_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos,
nsc_size_t fba_len, int flag);
static int _sd_sync_write2(_sd_buf_handle_t *wr_handle, nsc_off_t wr_st_pos,
nsc_size_t fba_len, int flag, _sd_buf_handle_t *rd_handle,
nsc_off_t rd_st_pos);
static int sdbc_fd_attach_cd(blind_t xcd);
static int sdbc_fd_detach_cd(blind_t xcd);
static int sdbc_fd_flush_cd(blind_t xcd);
static int _sdbc_gl_centry_configure(spcs_s_info_t);
static int _sdbc_gl_file_configure(spcs_s_info_t);
static void _sdbc_gl_centry_deconfigure(void);
static void _sdbc_gl_file_deconfigure(void);
static int sdbc_doread_prefetch(_sd_cctl_t *cc_ent, nsc_off_t fba_pos,
nsc_size_t fba_len);
static _sd_bitmap_t update_dirty(_sd_cctl_t *cc_ent, sdbc_cblk_fba_t st_off,
sdbc_cblk_fba_t st_len);
static int _sd_prefetch_buf(int cd, nsc_off_t fba_pos, nsc_size_t fba_len,
int flag, _sd_buf_handle_t *handle, int locked);
/* dynmem support */
static int _sd_setup_category_on_type(_sd_cctl_t *header);
static int _sd_setup_mem_chaining(_sd_cctl_t *header, int flag);
static int sdbc_check_cctl_cot(_sd_cctl_t *);
static int sdbc_dmqueues_configure();
static void sdbc_dmqueues_deconfigure();
static _sd_cctl_t *sdbc_get_dmchain(int, int *, int);
static int sdbc_dmchain_avail(_sd_cctl_t *);
void sdbc_requeue_dmchain(_sd_queue_t *, _sd_cctl_t *, int, int);
static void sdbc_ins_dmqueue_back(_sd_queue_t *, _sd_cctl_t *);
void sdbc_ins_dmqueue_front(_sd_queue_t *, _sd_cctl_t *);
void sdbc_remq_dmchain(_sd_queue_t *, _sd_cctl_t *);
static void sdbc_clear_dmchain(_sd_cctl_t *, _sd_cctl_t *);
void sdbc_requeue_head_dm_try(_sd_cctl_t *);
static _sd_cctl_t *sdbc_alloc_dmc(int, nsc_off_t, nsc_size_t, int *,
sdbc_allocbuf_t *, int);
static _sd_cctl_t *sdbc_alloc_lru(int, nsc_off_t, int *, int);
static _sd_cctl_t *sdbc_alloc_from_dmchain(int, nsc_off_t, sdbc_allocbuf_t *,
int);
static void sdbc_centry_init_dm(_sd_cctl_t *);
static int sdbc_centry_memalloc_dm(_sd_cctl_t *, int, int);
static void sdbc_centry_alloc_end(sdbc_allocbuf_t *);
/* _SD_DEBUG */
#if defined(_SD_DEBUG) || defined(DEBUG)
static int _sd_cctl_valid(_sd_cctl_t *);
#endif
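/*
* nsctl callback table: hooks invoked when the raw device underlying
* a cache descriptor is reserved, released or flushed.
*/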
static
nsc_def_t _sdbc_fd_def[] = {
"Attach", (uintptr_t)sdbc_fd_attach_cd, 0,
"Detach", (uintptr_t)sdbc_fd_detach_cd, 0,
"Flush", (uintptr_t)sdbc_fd_flush_cd, 0,
0, 0, 0
};
/*
* _sdbc_cache_configure - initialize cache blocks, queues etc.
*
* ARGUMENTS:
* cblocks - Number of cache blocks
*
* RETURNS:
* 0 on success.
* SDBC_EENABLEFAIL or SDBC_EMEMCONFIG on failure.
*
*/
int
_sdbc_cache_configure(int cblocks, spcs_s_info_t kstatus)
{
CBLOCKS = cblocks;
_sd_cache_files = (_sd_cd_info_t *)
kmem_zalloc(sdbc_max_devs * sizeof (_sd_cd_info_t),
KM_SLEEP);
if (_sdbc_stats_configure(cblocks))
return (SDBC_EENABLEFAIL);
if (sdbc_use_dmchain) {
if (sdbc_dmqueues_configure())
return (SDBC_EENABLEFAIL);
} else {
if (_sdbc_lruq_configure(_SD_LRU_Q))
return (SDBC_EENABLEFAIL);
}
if (_sdbc_mem_configure(cblocks, kstatus))
return (SDBC_EMEMCONFIG);
CACHE_BLOCK_SIZE = BLK_SIZE(1);
BLK_FBAS = FBA_NUM(CACHE_BLOCK_SIZE);
BLK_FBA_BITS = _fba_bits[BLK_FBAS];
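/* reset allocation and prefetch statistics for this configuration */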
sdbc_allocb_pageio1 = 0;
sdbc_allocb_pageio2 = 0;
sdbc_allocb_hit = 0;
sdbc_allocb_inuse = 0;
sdbc_allocb_lost = 0;
sdbc_centry_inuse = 0;
sdbc_centry_lost = 0;
sdbc_centry_hit = 0;
sdbc_centry_deallocd = 0;
sdbc_dmchain_not_avail = 0;
sdbc_allocb_deallocd = 0;
sdbc_prefetch_valid_cnt = 0;
sdbc_prefetch_busy_cnt = 0;
sdbc_prefetch_trailing = 0;
sdbc_prefetch_deallocd = 0;
sdbc_prefetch_pageio1 = 0;
sdbc_prefetch_pageio2 = 0;
sdbc_prefetch_hit = 0;
sdbc_prefetch_lost = 0;
sdbc_check_cot = 0;
sdbc_prefetch1 = 1;
sdbc_ra_hash = 0;
sdbc_ra_none = 0;
return (0);
}
/*
* _sdbc_cache_deconfigure - cache is being deconfigured. Release any
* memory that we acquired during the configuration process and return
* to the unconfigured state.
*
* NOTE: all users of the cache should be inactive at this point,
* i.e. we are unregistered from sd and all cache daemons/threads are
* gone.
*
*/
void
_sdbc_cache_deconfigure(void)
{
/* CCIO shutdown must happen before memory is free'd */
if (_sd_cache_files) {
kmem_free(_sd_cache_files,
sdbc_max_devs * sizeof (_sd_cd_info_t));
_sd_cache_files = (_sd_cd_info_t *)NULL;
}
BLK_FBA_BITS = 0;
BLK_FBAS = 0;
CACHE_BLOCK_SIZE = 0;
_sdbc_mem_deconfigure(CBLOCKS);
_sdbc_gl_centry_deconfigure();
_sdbc_gl_file_deconfigure();
if (sdbc_use_dmchain)
sdbc_dmqueues_deconfigure();
else
_sdbc_lruq_deconfigure();
_sdbc_stats_deconfigure();
CBLOCKS = 0;
}
/*
* _sdbc_stats_deconfigure - cache is being deconfigured; turn off
* stats. This could seemingly do more but we leave most of the
* data intact until cache is configured again.
*
*/
static void
_sdbc_stats_deconfigure(void)
{
int i;
#ifdef DEBUG
if (sdbc_dynmem_kstat_dm) {
kstat_delete(sdbc_dynmem_kstat_dm);
sdbc_dynmem_kstat_dm = NULL;
}
#endif
if (sdbc_global_stats_kstat) {
kstat_delete(sdbc_global_stats_kstat);
sdbc_global_stats_kstat = NULL;
}
if (sdbc_cd_kstats) {
for (i = 0; i < sdbc_max_devs; i++) {
if (sdbc_cd_kstats[i]) {
kstat_delete(sdbc_cd_kstats[i]);
sdbc_cd_kstats[i] = NULL;
}
}
kmem_free(sdbc_cd_kstats, sizeof (kstat_t *) * sdbc_max_devs);
sdbc_cd_kstats = NULL;
}
if (sdbc_global_io_kstat) {
kstat_delete(sdbc_global_io_kstat);
mutex_destroy(&sdbc_global_io_kstat_mutex);
sdbc_global_io_kstat = NULL;
}
if (sdbc_cd_io_kstats) {
for (i = 0; i < sdbc_max_devs; i++) {
if (sdbc_cd_io_kstats[i]) {
kstat_delete(sdbc_cd_io_kstats[i]);
sdbc_cd_io_kstats[i] = NULL;
}
}
kmem_free(sdbc_cd_io_kstats, sizeof (kstat_t *) *
sdbc_max_devs);
sdbc_cd_io_kstats = NULL;
}
if (sdbc_cd_io_kstats_mutexes) {
/* mutexes are already destroyed in cd_kstat_remove() */
kmem_free(sdbc_cd_io_kstats_mutexes, sizeof (kmutex_t) *
sdbc_max_devs);
sdbc_cd_io_kstats_mutexes = NULL;
}
if (_sd_cache_stats) {
kmem_free(_sd_cache_stats,
sizeof (_sd_stats_t) +
(sdbc_max_devs - 1) * sizeof (_sd_shared_t));
_sd_cache_stats = NULL;
}
#ifdef _MULTI_DATAMODEL
if (_sd_cache_stats32) {
kmem_free(_sd_cache_stats32, sizeof (_sd_stats32_t) +
(sdbc_max_devs - 1) * sizeof (_sd_shared_t));
_sd_cache_stats32 = NULL;
}
#endif
}
static int
_sdbc_stats_configure(int cblocks)
{
_sd_cache_stats = kmem_zalloc(sizeof (_sd_stats_t) +
(sdbc_max_devs - 1) * sizeof (_sd_shared_t), KM_SLEEP);
_sd_cache_stats->st_blksize = (int)BLK_SIZE(1);
_sd_cache_stats->st_cachesize = cblocks * BLK_SIZE(1);
_sd_cache_stats->st_numblocks = cblocks;
_sd_cache_stats->st_wrcancelns = 0;
_sd_cache_stats->st_destaged = 0;
#ifdef _MULTI_DATAMODEL
_sd_cache_stats32 = kmem_zalloc(sizeof (_sd_stats32_t) +
(sdbc_max_devs - 1) * sizeof (_sd_shared_t), KM_SLEEP);
#endif
/* kstat implementation - global stats */
sdbc_global_stats_kstat = kstat_create(SDBC_KSTAT_MODULE, 0,
SDBC_KSTAT_GSTATS, SDBC_KSTAT_CLASS, KSTAT_TYPE_NAMED,
sizeof (sdbc_global_stats)/sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
if (sdbc_global_stats_kstat != NULL) {
sdbc_global_stats_kstat->ks_data = &sdbc_global_stats;
sdbc_global_stats_kstat->ks_update = sdbc_global_stats_update;
sdbc_global_stats_kstat->ks_private = _sd_cache_stats;
kstat_install(sdbc_global_stats_kstat);
} else {
cmn_err(CE_WARN, "sdbc: gstats kstat failed");
}
/* global I/O kstats */
sdbc_global_io_kstat = kstat_create(SDBC_KSTAT_MODULE, 0,
SDBC_IOKSTAT_GSTATS, "disk", KSTAT_TYPE_IO, 1, 0);
if (sdbc_global_io_kstat) {
mutex_init(&sdbc_global_io_kstat_mutex, NULL, MUTEX_DRIVER,
NULL);
sdbc_global_io_kstat->ks_lock =
&sdbc_global_io_kstat_mutex;
kstat_install(sdbc_global_io_kstat);
}
/*
* kstat implementation - cd stats
* NOTE: one kstat instance for each open cache descriptor
*/
sdbc_cd_kstats = kmem_zalloc(sizeof (kstat_t *) * sdbc_max_devs,
KM_SLEEP);
/*
* kstat implementation - i/o kstats per cache descriptor
* NOTE: one I/O kstat instance for each cd
*/
sdbc_cd_io_kstats = kmem_zalloc(sizeof (kstat_t *) * sdbc_max_devs,
KM_SLEEP);
sdbc_cd_io_kstats_mutexes = kmem_zalloc(sizeof (kmutex_t) *
sdbc_max_devs, KM_SLEEP);
#ifdef DEBUG
/* kstat implementation - dynamic memory stats */
sdbc_dynmem_kstat_dm = kstat_create(SDBC_KSTAT_MODULE, 0,
SDBC_KSTAT_DYNMEM, SDBC_KSTAT_CLASS, KSTAT_TYPE_NAMED,
sizeof (sdbc_dynmem_dm)/sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
if (sdbc_dynmem_kstat_dm != NULL) {
sdbc_dynmem_kstat_dm->ks_data = &sdbc_dynmem_dm;
sdbc_dynmem_kstat_dm->ks_update = sdbc_dynmem_kstat_update_dm;
sdbc_dynmem_kstat_dm->ks_private = &dynmem_processing_dm;
kstat_install(sdbc_dynmem_kstat_dm);
} else {
cmn_err(CE_WARN, "sdbc: dynmem kstat failed");
}
#endif
return (0);
}
/*
* sdbc_dmqueues_configure()
* initialize the queues of dynamic memory chains.
*/
_sd_queue_t *sdbc_dm_queues;
static int max_dm_queues;
static int
sdbc_dmqueues_configure()
{
int i;
/*
* CAUTION! this code depends on max_dyn_list not changing
* if it does change behavior may be incorrect, as cc_alloc_size_dm
* depends on max_dyn_list and indexes to dmqueues are derived from
* cc_alloc_size_dm.
* see _sd_setup_category_on_type() and _sd_dealloc_dm()
* TODO: prevent max_dyn_list from on-the-fly modification (easy) or
* allow for on-the-fly changes to number of dm queues (hard).
*/
max_dm_queues = dynmem_processing_dm.max_dyn_list;
++max_dm_queues; /* need a "0" queue for centrys with no memory */
sdbc_dm_queues = (_sd_queue_t *)
kmem_zalloc(max_dm_queues * sizeof (_sd_queue_t),
KM_SLEEP);
#ifdef DEBUG
dmchainpull_table = (int *)kmem_zalloc(max_dm_queues *
max_dm_queues * sizeof (int),
KM_SLEEP);
#endif
for (i = 0; i < max_dm_queues; ++i) {
(void) _sdbc_lruq_configure(&sdbc_dm_queues[i]);
sdbc_dm_queues[i].sq_dmchain_cblocks = i;
}
return (0);
}
static void
sdbc_dmqueues_deconfigure()
{
/* CAUTION! this code depends on max_dyn_list not changing */
if (sdbc_dm_queues)
kmem_free(sdbc_dm_queues, max_dm_queues * sizeof (_sd_queue_t));
sdbc_dm_queues = NULL;
max_dm_queues = 0;
}
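/* sanity check: a queue length must lie within [0, CBLOCKS] */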
#define GOOD_LRUSIZE(q) ((q->sq_inq >= 0) && (q->sq_inq <= CBLOCKS))
/*
* _sdbc_lruq_configure - initialize the lru queue
*
* ARGUMENTS: NONE
* RETURNS: 0
*
*/
static int
_sdbc_lruq_configure(_sd_queue_t *_sd_lru)
{
_sd_lru->sq_inq = 0;
mutex_init(&_sd_lru->sq_qlock, NULL, MUTEX_DRIVER, NULL);
_sd_lru->sq_qhead.cc_next = _sd_lru->sq_qhead.cc_prev
= &(_sd_lru->sq_qhead);
return (0);
}
/*
* _sdbc_lruq_deconfigure - deconfigure the lru queue
*
* ARGUMENTS: NONE
*
*/
static void
_sdbc_lruq_deconfigure(void)
{
_sd_queue_t *_sd_lru;
_sd_lru = _SD_LRU_Q;
mutex_destroy(&_sd_lru->sq_qlock);
bzero(_sd_lru, sizeof (_sd_queue_t));
}
/*
* _sdbc_mem_configure - initialize the cache memory.
* Create and initialize the hash table.
* Create cache control blocks and fill them with relevant
* information and enqueue onto the lru queue.
* Initialize the Write control blocks (blocks that contain
* information as to where the data will be mirrored)
* Initialize the Fault tolerant blocks (blocks that contain
* information about the mirror nodes dirty writes)
*
* ARGUMENTS:
* cblocks - Number of cache blocks.
* RETURNS: 0
*
*/
static int
_sdbc_mem_configure(int cblocks, spcs_s_info_t kstatus)
{
int num_blks, i, blk;
_sd_cctl_t *centry;
_sd_net_t *netc;
_sd_cctl_t *prev_entry_dm, *first_entry_dm;
if ((_sd_htable = _sdbc_hash_configure(cblocks)) == NULL) {
spcs_s_add(kstatus, SDBC_ENOHASH);
return (-1);
}
_sd_cctl_groupsz = (cblocks / _SD_CCTL_GROUPS) +
((cblocks % _SD_CCTL_GROUPS) != 0);
for (i = 0; i < _SD_CCTL_GROUPS; i++) {
_sd_cctl[i] = (_sd_cctl_t *)
nsc_kmem_zalloc(_sd_cctl_groupsz * sizeof (_sd_cctl_t),
KM_SLEEP, sdbc_cache_mem);
if (_sd_cctl[i] == NULL) {
spcs_s_add(kstatus, SDBC_ENOCB);
return (-1);
}
}
_sd_ccent_sync = (_sd_cctl_sync_t *)
nsc_kmem_zalloc(_sd_ccsync_cnt * sizeof (_sd_cctl_sync_t),
KM_SLEEP, sdbc_local_mem);
if (_sd_ccent_sync == NULL) {
spcs_s_add(kstatus, SDBC_ENOCCTL);
return (-1);
}
for (i = 0; i < _sd_ccsync_cnt; i++) {
mutex_init(&_sd_ccent_sync[i]._cc_lock, NULL, MUTEX_DRIVER,
NULL);
cv_init(&_sd_ccent_sync[i]._cc_blkcv, NULL, CV_DRIVER, NULL);
}
blk = 0;
netc = &_sd_net_config;
num_blks = (netc->sn_cpages * (int)netc->sn_psize)/BLK_SIZE(1);
prev_entry_dm = 0;
first_entry_dm = 0;
for (i = 0; i < num_blks; i++, blk++) {
centry = _sd_cctl[(blk/_sd_cctl_groupsz)] +
(blk%_sd_cctl_groupsz);
centry->cc_sync = &_sd_ccent_sync[blk % _sd_ccsync_cnt];
centry->cc_next = centry->cc_prev = NULL;
centry->cc_dirty_next = centry->cc_dirty_link = NULL;
centry->cc_await_use = centry->cc_await_page = 0;
centry->cc_inuse = centry->cc_pageio = 0;
centry->cc_flag = 0;
centry->cc_iocount = 0;
centry->cc_valid = 0;
if (!first_entry_dm)
first_entry_dm = centry;
if (prev_entry_dm)
prev_entry_dm->cc_link_list_dm = centry;
prev_entry_dm = centry;
centry->cc_link_list_dm = first_entry_dm;
centry->cc_data = 0;
centry->cc_write = NULL;
centry->cc_dirty = 0;
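/*
* queue the entry: dm chaining starts all centrys on the
* zero-length dm queue, otherwise use the global LRU.
*/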
{
_sd_queue_t *q;
if (sdbc_use_dmchain) {
q = &sdbc_dm_queues[0];
centry->cc_cblocks = 0;
} else
q = _SD_LRU_Q;
_sd_ins_queue(q, centry);
}
}
if (_sdbc_gl_centry_configure(kstatus) != 0)
return (-1);
if (_sdbc_gl_file_configure(kstatus) != 0)
return (-1);
return (0);
}
/*
* _sdbc_gl_file_configure()
* allocate and initialize space for the global filename data.
*
*/
static int
_sdbc_gl_file_configure(spcs_s_info_t kstatus)
{
ss_voldata_t *fileinfo;
ss_voldata_t tempfinfo;
ss_vdir_t vdir;
ss_vdirkey_t key;
int err = 0;
_sdbc_gl_file_info_size = safestore_config.ssc_maxfiles *
sizeof (ss_voldata_t);
if ((_sdbc_gl_file_info = kmem_zalloc(_sdbc_gl_file_info_size,
KM_NOSLEEP)) == NULL) {
spcs_s_add(kstatus, SDBC_ENOSFNV);
return (-1);
}
/* setup the key to get a directory stream of all volumes */
key.vk_type = CDIR_ALL;
fileinfo = _sdbc_gl_file_info;
/*
* if coming up after a crash, "refresh" the host
* memory copy from safestore.
*/
if (_sdbc_warm_start()) {
if (SSOP_GETVDIR(sdbc_safestore, &key, &vdir)) {
cmn_err(CE_WARN, "sdbc(_sdbc_gl_file_configure): "
"cannot read safestore");
return (-1);
}
/*
* cycle through the vdir getting volume data
* and volume tokens
*/
while ((err = SSOP_GETVDIRENT(sdbc_safestore, &vdir, fileinfo))
== SS_OK) {
++fileinfo;
}
if (err != SS_EOF) {
/*
* fail to configure since
* recovery is not possible.
*/
spcs_s_add(kstatus, SDBC_ENOREFRESH);
return (-1);
}
} else { /* normal initialization, not a warm start */
/*
* if this fails, continue: cache will start
* in writethru mode
*/
if (SSOP_GETVDIR(sdbc_safestore, &key, &vdir)) {
cmn_err(CE_WARN, "sdbc(_sdbc_gl_file_configure): "
"cannot read safestore");
return (-1);
}
/*
* cycle through the vdir getting just the volume tokens
* and initializing volume entries
*/
while ((err = SSOP_GETVDIRENT(sdbc_safestore, &vdir,
&tempfinfo)) == 0) {
/*
* initialize the host memory copy of the
* global file region. this means setting the
* _pinned and _attached fields to _SD_NO_HOST
* because the default of zero conflicts with
* the min nodeid of zero.
*/
fileinfo->sv_vol = tempfinfo.sv_vol;
fileinfo->sv_pinned = _SD_NO_HOST;
fileinfo->sv_attached = _SD_NO_HOST;
fileinfo->sv_cd = _SD_NO_CD;
/* initialize the directory entry */
if ((err = SSOP_SETVOL(sdbc_safestore, fileinfo))
== SS_ERR) {
cmn_err(CE_WARN,
"sdbc(_sdbc_gl_file_configure): "
"volume entry write failure %p",
(void *)fileinfo->sv_vol);
break;
}
++fileinfo;
}
/* coming up clean, continue in w-t mode */
if (err != SS_EOF)
cmn_err(CE_WARN, "sdbc(_sdbc_gl_file_configure) "
"unable to init safe store volinfo");
}
return (0);
}
static void
_sdbc_gl_centry_deconfigure(void)
{
if (_sdbc_gl_centry_info)
kmem_free(_sdbc_gl_centry_info, _sdbc_gl_centry_info_size);
_sdbc_gl_centry_info = NULL;
_sdbc_gl_centry_info_size = 0;
}
static int
_sdbc_gl_centry_configure(spcs_s_info_t kstatus)
{
int wblocks;
ss_centry_info_t *cinfo;
ss_cdirkey_t key;
ss_cdir_t cdir;
int err = 0;
wblocks = safestore_config.ssc_wsize / BLK_SIZE(1);
_sdbc_gl_centry_info_size = sizeof (ss_centry_info_t) * wblocks;
if ((_sdbc_gl_centry_info = kmem_zalloc(_sdbc_gl_centry_info_size,
KM_NOSLEEP)) == NULL) {
cmn_err(CE_WARN,
"sdbc(_sdbc_gl_centry_configure) "
"alloc failed for gl_centry_info region");
_sdbc_gl_centry_deconfigure();
return (-1);
}
/*
* synchronize the centry info area with safe store
*/
/* setup the key to get a directory stream of all centrys */
key.ck_type = CDIR_ALL;
cinfo = _sdbc_gl_centry_info;
if (_sdbc_warm_start()) {
if (SSOP_GETCDIR(sdbc_safestore, &key, &cdir)) {
cmn_err(CE_WARN, "sdbc(_sdbc_gl_centry_configure): "
"cannot read safestore");
return (-1);
}
/*
* cycle through the cdir getting resource
* tokens and reading centrys
*/
while ((err = SSOP_GETCDIRENT(sdbc_safestore, &cdir, cinfo))
== 0) {
++cinfo;
}
if (err != SS_EOF) {
/*
* fail to configure since
* recovery is not possible.
*/
_sdbc_gl_centry_deconfigure();
spcs_s_add(kstatus, SDBC_EGLDMAFAIL);
return (-1);
}
} else {
if (SSOP_GETCDIR(sdbc_safestore, &key, &cdir)) {
cmn_err(CE_WARN, "sdbc(_sdbc_gl_centry_configure): "
"cannot read safestore");
return (-1);
}
/*
* cycle through the cdir getting resource
* tokens and initializing centrys
*/
while ((err = SSOP_GETCDIRENT(sdbc_safestore, &cdir, cinfo))
== 0) {
cinfo->sc_cd = -1;
cinfo->sc_fpos = -1;
if ((err = SSOP_SETCENTRY(sdbc_safestore, cinfo))
== SS_ERR) {
cmn_err(CE_WARN,
"sdbc(_sdbc_gl_centry_configure): "
"cache entry write failure %p",
(void *)cinfo->sc_res);
break;
}
++cinfo;
}
/* coming up clean, continue in w-t mode */
if (err != SS_EOF) {
cmn_err(CE_WARN, "sdbc(sdbc_gl_centry_configure) "
"_sdbc_gl_centry_info initialization failed");
}
}
return (0);
}
static void
_sdbc_gl_file_deconfigure(void)
{
if (_sdbc_gl_file_info)
kmem_free(_sdbc_gl_file_info, _sdbc_gl_file_info_size);
_sdbc_gl_file_info = NULL;
_sdbc_gl_file_info_size = 0;
}
/*
* _sdbc_mem_deconfigure - deconfigure the cache memory.
* Release any memory/locks/sv's acquired during _sdbc_mem_configure.
*
* ARGUMENTS:
* cblocks - Number of cache blocks.
*
*/
/* ARGSUSED */
static void
_sdbc_mem_deconfigure(int cblocks)
{
int i;
if (_sd_ccent_sync) {
for (i = 0; i < _sd_ccsync_cnt; i++) {
mutex_destroy(&_sd_ccent_sync[i]._cc_lock);
cv_destroy(&_sd_ccent_sync[i]._cc_blkcv);
}
nsc_kmem_free(_sd_ccent_sync,
_sd_ccsync_cnt * sizeof (_sd_cctl_sync_t));
}
_sd_ccent_sync = NULL;
for (i = 0; i < _SD_CCTL_GROUPS; i++) {
if (_sd_cctl[i] != NULL) {
nsc_kmem_free(_sd_cctl[i],
_sd_cctl_groupsz * sizeof (_sd_cctl_t));
_sd_cctl[i] = NULL;
}
}
_sd_cctl_groupsz = 0;
_sdbc_hash_deconfigure(_sd_htable);
_sd_htable = NULL;
}
#if defined(_SD_DEBUG) || defined(DEBUG)
static int
_sd_cctl_valid(_sd_cctl_t *addr)
{
_sd_cctl_t *end;
int i, valid;
valid = 0;
for (i = 0; i < _SD_CCTL_GROUPS; i++) {
end = _sd_cctl[i] + _sd_cctl_groupsz;
if (addr >= _sd_cctl[i] && addr < end) {
valid = 1;
break;
}
}
return (valid);
}
#endif
/*
* _sd_ins_queue - insert centry into LRU queue
* (during initialization, locking not required)
*/
static void
_sd_ins_queue(_sd_queue_t *q, _sd_cctl_t *centry)
{
_sd_cctl_t *q_head;
ASSERT(_sd_cctl_valid(centry));
q_head = &q->sq_qhead;
centry->cc_prev = q_head;
centry->cc_next = q_head->cc_next;
q_head->cc_next->cc_prev = centry;
q_head->cc_next = centry;
q->sq_inq++;
ASSERT(GOOD_LRUSIZE(q));
}
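/*
* _sd_requeue - move a cache entry to the tail (MRU end) of the
* LRU queue.
*/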
void
_sd_requeue(_sd_cctl_t *centry)
{
_sd_queue_t *q = _SD_LRU_Q;
/* was FAST */
mutex_enter(&q->sq_qlock);
#if defined(_SD_DEBUG)
if (1) {
_sd_cctl_t *cp, *cn, *qp;
cp = centry->cc_prev;
cn = centry->cc_next;
qp = (q->sq_qhead).cc_prev;
if (!_sd_cctl_valid(centry) ||
(cp != &(q->sq_qhead) && !_sd_cctl_valid(cp)) ||
(cn != &(q->sq_qhead) && !_sd_cctl_valid(cn)) ||
!_sd_cctl_valid(qp))
cmn_err(CE_PANIC,
"_sd_requeue %x prev %x next %x qp %x",
centry, cp, cn, qp);
}
#endif
centry->cc_prev->cc_next = centry->cc_next;
centry->cc_next->cc_prev = centry->cc_prev;
centry->cc_next = &(q->sq_qhead);
centry->cc_prev = q->sq_qhead.cc_prev;
q->sq_qhead.cc_prev->cc_next = centry;
q->sq_qhead.cc_prev = centry;
centry->cc_seq = q->sq_seq++;
/* was FAST */
mutex_exit(&q->sq_qlock);
(q->sq_req_stat)++;
}
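/*
* _sd_requeue_head - move a cache entry to the head (LRU end) of the
* queue, making it an early candidate for reallocation.
*/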
void
_sd_requeue_head(_sd_cctl_t *centry)
{
_sd_queue_t *q = _SD_LRU_Q;
/* was FAST */
mutex_enter(&q->sq_qlock);
#if defined(_SD_DEBUG)
if (1) {
_sd_cctl_t *cp, *cn, *qn;
cp = centry->cc_prev;
cn = centry->cc_next;
qn = (q->sq_qhead).cc_prev;
if (!_sd_cctl_valid(centry) ||
(cp != &(q->sq_qhead) && !_sd_cctl_valid(cp)) ||
(cn != &(q->sq_qhead) && !_sd_cctl_valid(cn)) ||
!_sd_cctl_valid(qn))
cmn_err(CE_PANIC,
"_sd_requeue_head %x prev %x next %x qn %x",
centry, cp, cn, qn);
}
#endif
centry->cc_prev->cc_next = centry->cc_next;
centry->cc_next->cc_prev = centry->cc_prev;
centry->cc_prev = &(q->sq_qhead);
centry->cc_next = q->sq_qhead.cc_next;
q->sq_qhead.cc_next->cc_prev = centry;
q->sq_qhead.cc_next = centry;
centry->cc_seq = q->sq_seq++;
centry->cc_flag &= ~CC_QHEAD;
/* was FAST */
mutex_exit(&q->sq_qlock);
}
/*
* _sd_open - Open a file.
*
* ARGUMENTS:
* filename - Name of the file to be opened.
* flag - Flag associated with open.
* (currently used to determine a ckd device)
* RETURNS:
* cd - the cache descriptor.
*/
int
_sd_open(char *filename, int flag)
{
int cd;
if (!_sd_cache_initialized) {
cmn_err(CE_WARN, "sdbc(_sd_open) cache not initialized");
return (-EINVAL);
}
cd = _sd_open_cd(filename, -1, flag);
SDTRACE(SDF_OPEN, (cd < 0) ? SDT_INV_CD : cd, 0, SDT_INV_BL, 0, cd);
return (cd);
}
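/*
* _sd_open_io - open wrapper for the nsctl i/o provider interface.
* Returns 0 on success, a positive error code on failure.
*/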
static int
_sd_open_io(char *filename, int flag, blind_t *cdp, nsc_iodev_t *iodev)
{
_sd_cd_info_t *cdi;
int cd;
int rc = 0;
if ((cd = _sd_open(filename, flag)) >= 0) {
cdi = &(_sd_cache_files[cd]);
cdi->cd_iodev = iodev;
nsc_set_owner(cdi->cd_rawfd, cdi->cd_iodev);
*cdp = (blind_t)(unsigned long)cd;
} else
rc = -cd;
return (rc);
}
int
_sd_open_cd(char *filename, const int cd, const int flag)
{
int new_cd, rc = 0, alloc_cd = -1;
ss_voldata_t *cdg;
int preexists = 0;
_sd_cd_info_t *cdi;
int failover_open, open_failed;
major_t devmaj;
minor_t devmin;
if (_sdbc_shutdown_in_progress)
return (-EIO);
if (strlen(filename) > (NSC_MAXPATH-1))
return (-ENAMETOOLONG);
/*
* If the cd is >= 0, then this is a open for a specific cd.
* This happens when the mirror node crashes, and we attempt to
* reopen the files with the same cache descriptors as existed on
* the other node
*/
retry_open:
failover_open = 0;
open_failed = 0;
if (cd >= 0) {
failover_open++;
cdi = &(_sd_cache_files[cd]);
mutex_enter(&_sd_cache_lock);
if (cdi->cd_info == NULL)
cdi->cd_info = &_sd_cache_stats->st_shared[cd];
else if (cdi->cd_info->sh_alloc &&
strcmp(cdi->cd_info->sh_filename, filename)) {
cmn_err(CE_WARN, "sdbc(_sd_open_cd) cd %d mismatch",
cd);
mutex_exit(&_sd_cache_lock);
return (-EEXIST);
}
if (cdi->cd_info->sh_failed != 2) {
if (cdi->cd_info->sh_alloc != 0)
preexists = 1;
else {
cdi->cd_info->sh_alloc = CD_ALLOC_IN_PROGRESS;
(void) strcpy(cdi->cd_info->sh_filename,
filename);
if (_sd_cache_stats->st_count < sdbc_max_devs)
_sd_cache_stats->st_count++;
}
}
mutex_exit(&_sd_cache_lock);
alloc_cd = cd;
goto known_cd;
}
new_cd = 0;
mutex_enter(&_sd_cache_lock);
for (cdi = &(_sd_cache_files[new_cd]),
cdg = _sdbc_gl_file_info + new_cd;
new_cd < (sdbc_max_devs); new_cd++, cdi++, cdg++) {
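/* reuse the slot already bound to this name; an empty name marks a free slot */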
if (strlen(cdg->sv_volname) != 0)
if (strcmp(cdg->sv_volname, filename))
continue;
if (cdi->cd_info == NULL)
cdi->cd_info = &_sd_cache_stats->st_shared[new_cd];
if (cdi->cd_info->sh_failed != 2) {
if (cdi->cd_info->sh_alloc != 0)
preexists = 1;
else {
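/*
* cd == -2 is a probe-only open: report failure rather
* than allocate a new descriptor.
*/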
if (cd == -2) {
mutex_exit(&_sd_cache_lock);
return (-1);
}
cdi->cd_info->sh_alloc = CD_ALLOC_IN_PROGRESS;
(void) strcpy(cdi->cd_info->sh_filename,
filename);
(void) strcpy(cdg->sv_volname, filename);
cdg->sv_cd = new_cd;
/* update safestore */
SSOP_SETVOL(sdbc_safestore, cdg);
if (_sd_cache_stats->st_count < sdbc_max_devs)
_sd_cache_stats->st_count++;
cdi->cd_flag = 0;
}
}
alloc_cd = new_cd;
break;
}
mutex_exit(&_sd_cache_lock);
if (alloc_cd == -1)
return (-ENOSPC);
known_cd:
/*
* If preexists: someone else is attempting to open this file as
* well. Do only one open, but block everyone else here till the
* open is completed.
*/
if (preexists) {
while (cdi->cd_info->sh_alloc == CD_ALLOC_IN_PROGRESS) {
delay(drv_usectohz(20000));
}
if ((cdi->cd_info->sh_alloc != CD_ALLOCATED))
goto retry_open;
return (alloc_cd);
}
if (!(cdi->cd_rawfd =
nsc_open(filename, NSC_SDBC_ID|NSC_DEVICE, _sdbc_fd_def,
(blind_t)(unsigned long)alloc_cd, &rc)) ||
!nsc_getval(cdi->cd_rawfd, "DevMaj", (int *)&devmaj) ||
!nsc_getval(cdi->cd_rawfd, "DevMin", (int *)&devmin)) {
if (cdi->cd_rawfd) {
(void) nsc_close(cdi->cd_rawfd);
cdi->cd_rawfd = NULL;
}
/*
* take into account that there may be pinned data on a
* device that can no longer be opened
*/
open_failed++;
if (!(cdi->cd_info->sh_failed) && !failover_open) {
cdi->cd_info->sh_alloc = 0;
mutex_enter(&_sd_cache_lock);
_sd_cache_stats->st_count--;
mutex_exit(&_sd_cache_lock);
if (!rc)
rc = EIO;
return (-rc);
}
}
cdi->cd_strategy = nsc_get_strategy(devmaj);
cdi->cd_crdev = makedevice(devmaj, devmin);
cdi->cd_desc = alloc_cd;
cdi->cd_dirty_head = cdi->cd_dirty_tail = NULL;
cdi->cd_io_head = cdi->cd_io_tail = NULL;
cdi->cd_hint = 0;
#ifdef DEBUG
/* put the dev_t in the ioerr_inject_table */
_sdbc_ioj_set_dev(alloc_cd, cdi->cd_crdev);
#endif
cdi->cd_global = (_sdbc_gl_file_info + alloc_cd);
if (open_failed) {
cdi->cd_info->sh_failed = 2;
} else if (cdi->cd_info->sh_failed != 2)
if ((cdi->cd_global->sv_pinned == _SD_SELF_HOST) &&
!failover_open)
cdi->cd_info->sh_failed = 1;
else
cdi->cd_info->sh_failed = 0;
cdi->cd_flag |= flag;
mutex_init(&cdi->cd_lock, NULL, MUTEX_DRIVER, NULL);
#ifndef _SD_NOTRACE
(void) _sdbc_tr_configure(alloc_cd);
#endif
cdi->cd_info->sh_alloc = CD_ALLOCATED;
cdi->cd_global = (_sdbc_gl_file_info + alloc_cd);
cdi->cd_info->sh_cd = (unsigned short) alloc_cd;
mutex_enter(&_sd_cache_lock);
_sd_cache_stats->st_loc_count++;
mutex_exit(&_sd_cache_lock);
if (cd_kstat_add(alloc_cd) < 0) {
cmn_err(CE_WARN, "Could not create kstats for cache descriptor "
"%d", alloc_cd);
}
return (open_failed ? -EIO : alloc_cd);
}
/*
* _sd_close - Close a cache descriptor.
*
* ARGUMENTS:
* cd - the cache descriptor to be closed.
* RETURNS:
* 0 on success.
* Error otherwise.
*
* Note: Under Construction.
*/
int
_sd_close(int cd)
{
int rc;
_sd_cd_info_t *cdi = &(_sd_cache_files[cd]);
if (!FILE_OPENED(cd)) {
rc = EINVAL;
goto out;
}
SDTRACE(ST_ENTER|SDF_CLOSE, cd, 0, SDT_INV_BL, 0, 0);
mutex_enter(&_sd_cache_lock);
if ((cdi->cd_info->sh_alloc == 0) ||
(cdi->cd_info->sh_alloc & CD_CLOSE_IN_PROGRESS)) {
mutex_exit(&_sd_cache_lock);
SDTRACE(ST_EXIT|SDF_CLOSE, cd, 0, SDT_INV_BL, 0, EINVAL);
rc = EINVAL;
goto out;
}
cdi->cd_info->sh_alloc |= CD_CLOSE_IN_PROGRESS;
mutex_exit(&_sd_cache_lock);
/*
* _sd_flush_cd() will return -1 for the case where pinned
* data is present, but has been transferred to the mirror
* node. In this case it is safe to close the device as
* though _sd_flush_cd() had returned 0.
*/
rc = _sd_flush_cd(cd);
if (rc == -1)
rc = 0;
if (rc != 0) {
mutex_enter(&_sd_cache_lock);
if ((rc == EAGAIN) &&
(cdi->cd_global->sv_pinned == _SD_NO_HOST)) {
cdi->cd_global->sv_pinned = _SD_SELF_HOST;
SSOP_SETVOL(sdbc_safestore, cdi->cd_global);
}
cdi->cd_info->sh_alloc &= ~CD_CLOSE_IN_PROGRESS;
mutex_exit(&_sd_cache_lock);
SDTRACE(ST_EXIT|SDF_CLOSE, cd, 0, SDT_INV_BL,
_SD_CD_WBLK_USED(cd), rc);
goto out;
}
rc = nsc_close(cdi->cd_rawfd);
if (rc) {
mutex_enter(&_sd_cache_lock);
cdi->cd_info->sh_alloc &= ~CD_CLOSE_IN_PROGRESS;
mutex_exit(&_sd_cache_lock);
SDTRACE(ST_EXIT|SDF_CLOSE, cd, 0, SDT_INV_BL, 0, rc);
goto out;
}
mutex_enter(&_sd_cache_lock);
_sd_cache_stats->st_loc_count--;
mutex_exit(&_sd_cache_lock);
if (cd_kstat_remove(cd) < 0) {
cmn_err(CE_WARN, "Could not remove kstat for cache descriptor "
"%d", cd);
}
cdi->cd_info->sh_alloc = 0;
cdi->cd_info->sh_failed = 0;
/* cdi->cd_info = NULL; */
cdi->cd_flag = 0;
SDTRACE(ST_EXIT|SDF_CLOSE, cd, 0, SDT_INV_BL, 0, NSC_DONE);
rc = NSC_DONE;
goto out;
out:
return (rc);
}
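/*
* _sd_close_io - close wrapper for the nsctl i/o provider interface.
*/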
static int
_sd_close_io(blind_t xcd)
{
_sd_cd_info_t *cdi;
int cd = (int)(unsigned long)xcd;
int rc = 0;
if ((rc = _sd_close((int)cd)) == NSC_DONE) {
cdi = &(_sd_cache_files[cd]);
cdi->cd_iodev = NULL;
}
return (rc);
}
/*
* _sdbc_remote_store_pinned - reflect pinned/failed blocks for cd
* to our remote mirror. Returns count of blocks reflected or -1 on error.
*
*/
int
_sdbc_remote_store_pinned(int cd)
{
int cnt = 0;
_sd_cd_info_t *cdi = &(_sd_cache_files[cd]);
_sd_cctl_t *cc_ent, *cc_list;
ASSERT(cd >= 0);
if (cdi->cd_info->sh_failed) {
if (cdi->cd_global->sv_pinned == _SD_NO_HOST) {
cdi->cd_global->sv_pinned = _SD_SELF_HOST;
SSOP_SETVOL(sdbc_safestore, cdi->cd_global);
}
mutex_enter(&cdi->cd_lock);
cc_ent = cc_list = cdi->cd_fail_head;
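/*
* walk each dirty chain via cc_dirty_next, then step to the
* next chain via cc_dirty_link.
*/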
while (cc_ent) {
cnt++;
/* is this always necessary? jgk */
if (SSOP_WRITE_CBLOCK(sdbc_safestore,
cc_ent->cc_write->sc_res,
cc_ent->cc_data,
CACHE_BLOCK_SIZE, 0)) {
mutex_exit(&cdi->cd_lock);
return (-1);
}
/* update the cache block metadata */
CENTRY_SET_FTPOS(cc_ent);
cc_ent->cc_write->sc_flag = cc_ent->cc_flag;
cc_ent->cc_write->sc_dirty = CENTRY_DIRTY(cc_ent);
SSOP_SETCENTRY(sdbc_safestore, cc_ent->cc_write);
cc_ent = cc_ent->cc_dirty_next;
if (!cc_ent)
cc_ent = cc_list = cc_list->cc_dirty_link;
}
mutex_exit(&cdi->cd_lock);
}
return (cnt);
}
/*
* _sd_flush_cd()
* reflect pinned blocks to mirrored node
* wait for dirty blocks to be flushed
* returns:
* EIO I/O failure, or pinned blocks and no mirror
* EAGAIN Hang: count of outstanding writes isn't decreasing
* -1 pinned blocks, reflected to mirror
* 0 success
*/
static int
_sd_flush_cd(int cd)
{
int rc;
if ((rc = _sd_wait_for_flush(cd)) == 0)
return (0);
/*
* if we timed out, simply return; otherwise
* it must be an i/o type of error
*/
if (rc == EAGAIN)
return (rc);
if (_sd_is_mirror_down())
return (EIO); /* already failed, no mirror */
/* flush any pinned/failed blocks to mirror */
if (_sdbc_remote_store_pinned(cd) >= 0)
/*
* At this point it looks like we have blocks on the
* failed list and taking up space on this node but
* no longer have responsibility for the blocks.
* These blocks will in fact be freed from the cache
* and the failed list when the mirror picks them up
* from safe storage and then calls _sd_cd_discard_mirror
* which will issue an rpc telling us to finish up.
*
* Should the other node die before sending the rpc then
* we are safe with these blocks simply waiting on the
* failed list.
*/
return (-1);
else
return (rc);
}
/*
* _sdbc_io_attach_cd -- set up for client access to device, reserve raw device
*
* ARGUMENTS:
* cd - the cache descriptor to attach.
*
* RETURNS:
* 0 on success.
* Error otherwise.
*/
int
_sdbc_io_attach_cd(blind_t xcd)
{
int rc = 0;
_sd_cd_info_t *cdi;
int cd = (int)(unsigned long)xcd;
SDTRACE(ST_ENTER|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, 0);
if (!_sd_cache_initialized ||
_sdbc_shutdown_in_progress ||
!FILE_OPENED(cd)) {
SDTRACE(ST_EXIT|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, EINVAL);
DTRACE_PROBE(_sdbc_io_attach_cd_end1);
return (EINVAL);
}
cdi = &(_sd_cache_files[cd]);
/*
* check if disk is failed without raw device open. If it is,
* it has to be recovered using _sd_disk_online
*/
if (cdi->cd_global->sv_pinned == _SD_SELF_HOST) {
_sd_print(3,
"_sdbc_io_attach_cd: pinned data. returning EINVAL");
DTRACE_PROBE(_sdbc_io_attach_cd_end2);
return (EINVAL);
}
if ((cdi->cd_info == NULL) || (cdi->cd_info->sh_failed)) {
DTRACE_PROBE1(_sdbc_io_attach_cd_end3,
struct _sd_shared *, cdi->cd_info);
return (EINVAL);
}
#if defined(_SD_FAULT_RES)
/* wait for node recovery to finish */
if (_sd_node_recovery)
(void) _sd_recovery_wait();
#endif
/* this will provoke a sdbc_fd_attach_cd call .. */
rc = nsc_reserve(cdi->cd_rawfd, NSC_MULTI);
SDTRACE(ST_EXIT|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, rc);
return (rc);
}
/*
* sdbc_fd_attach_cd -- setup cache for access to raw device underlying cd.
* This is provoked by some piece of sdbc doing a reserve on the raw device.
*
* ARGUMENTS:
* cd - the cache descriptor to attach.
*
* RETURNS:
* 0 on success.
* Error otherwise.
*/
static int
sdbc_fd_attach_cd(blind_t xcd)
{
int rc = 0;
int cd = (int)(unsigned long)xcd;
_sd_cd_info_t *cdi;
if (!_sd_cache_initialized || !FILE_OPENED(cd)) {
SDTRACE(ST_INFO|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, EINVAL);
DTRACE_PROBE(sdbc_fd_attach_cd_end1);
return (EINVAL);
}
cdi = &(_sd_cache_files[cd]);
#if defined(_SD_FAULT_RES)
/* retrieve pinned/failed data */
if (!_sd_node_recovery) {
(void) _sd_repin_cd(cd);
}
#endif
rc = nsc_partsize(cdi->cd_rawfd, &cdi->cd_info->sh_filesize);
if (rc != 0) {
SDTRACE(ST_INFO|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, rc);
DTRACE_PROBE(sdbc_fd_attach_cd_end3);
return (rc);
}
cdi->cd_global->sv_attached = _SD_SELF_HOST;
SSOP_SETVOL(sdbc_safestore, cdi->cd_global);
mutex_enter(&_sd_cache_lock);
cdi->cd_info->sh_flag |= CD_ATTACHED;
mutex_exit(&_sd_cache_lock);
return (0);
}
/*
* _sdbc_io_detach_cd -- release raw device
* Called when a cache client is being detached from this cd.
*
* ARGUMENTS:
* cd - the cache descriptor to detach.
* RETURNS:
* 0 on success.
* Error otherwise.
*/
int
_sdbc_io_detach_cd(blind_t xcd)
{
int cd = (int)(unsigned long)xcd;
_sd_cd_info_t *cdi;
SDTRACE(ST_ENTER|SDF_DETACH, cd, 0, SDT_INV_BL, 0, 0);
if (!_sd_cache_initialized || !FILE_OPENED(cd)) {
SDTRACE(ST_EXIT|SDF_DETACH, cd, 0, SDT_INV_BL, 0, EINVAL);
DTRACE_PROBE(_sdbc_io_detach_cd_end1);
return (EINVAL);
}
#if defined(_SD_FAULT_RES)
if (_sd_node_recovery)
(void) _sd_recovery_wait();
#endif
/* relinquish responsibility for device */
cdi = &(_sd_cache_files[cd]);
if (!(cdi->cd_rawfd) || !nsc_held(cdi->cd_rawfd)) {
cmn_err(CE_WARN, "sdbc(_sdbc_detach_cd) (%d) not attached", cd);
SDTRACE(ST_EXIT|SDF_DETACH, cd, 0, SDT_INV_BL, 0, EPROTO);
DTRACE_PROBE1(_sdbc_io_detach_cd_end2,
nsc_fd_t *, cdi->cd_rawfd);
return (EPROTO);
}
/* this will provoke/allow a call to sdbc_fd_detach_cd */
nsc_release(cdi->cd_rawfd);
SDTRACE(ST_EXIT|SDF_DETACH, cd, 0, SDT_INV_BL, 0, 0);
return (0);
}
/*
* sdbc_detach_cd -- flush dirty writes to disk, release raw device
* Called when raw device is being detached from this cd.
*
* ARGUMENTS:
* cd - the cache descriptor to detach.
* rd_only - non-zero if detach is for read access (flush only; no hash invalidate).
* RETURNS:
* 0 on success.
* Error otherwise.
*/
static int
sdbc_detach_cd(blind_t xcd, int rd_only)
{
int rc;
int cd = (int)(unsigned long)xcd;
_sd_cd_info_t *cdi;
SDTRACE(ST_INFO|SDF_DETACH, cd, 0, SDT_INV_BL, 0, 0);
if (!_sd_cache_initialized || !FILE_OPENED(cd)) {
SDTRACE(ST_INFO|SDF_DETACH, cd, 0, SDT_INV_BL, 0, EINVAL);
DTRACE_PROBE(sdbc_detach_cd_end1);
return (EINVAL);
}
rc = _sd_flush_cd(cd);
if (rc > 0) {
SDTRACE(ST_INFO|SDF_DETACH, cd, 0, SDT_INV_BL, 0, rc);
DTRACE_PROBE(sdbc_detach_cd_end2);
return (rc);
}
if (!rd_only) {
_sd_hash_invalidate_cd(cd);
cdi = &(_sd_cache_files[cd]);
if (cdi->cd_global->sv_attached == _SD_SELF_HOST) {
cdi->cd_global->sv_attached = _SD_NO_HOST;
SSOP_SETVOL(sdbc_safestore, cdi->cd_global);
} else {
cmn_err(CE_WARN,
"sdbc(_sdbc_detach_cd) (%d) attached by node %d",
cd, cdi->cd_global->sv_attached);
SDTRACE(SDF_DETACH, cd, 0, SDT_INV_BL, 0, EPROTO);
DTRACE_PROBE1(sdbc_detach_cd_end3,
int, cdi->cd_global->sv_attached);
return (EPROTO);
}
mutex_enter(&_sd_cache_lock);
cdi->cd_info->sh_flag &= ~CD_ATTACHED;
mutex_exit(&_sd_cache_lock);
}
SDTRACE(ST_INFO|SDF_DETACH, cd, 0, SDT_INV_BL, 0, 0);
return (0);
}
/*
* sdbc_fd_detach_cd -- flush dirty writes to disk, release raw device
* Called when raw device is being detached from this cd.
*
* ARGUMENTS:
* xcd - the cache descriptor to detach.
* RETURNS:
* 0 on success.
* Error otherwise.
*/
static int
sdbc_fd_detach_cd(blind_t xcd)
{
return (sdbc_detach_cd(xcd, 0));
}
/*
* sdbc_fd_flush_cd - raw device "xcd" is being detached and needs
* flushing. We only need to flush; we do not need to hash invalidate
* this file.
*/
static int
sdbc_fd_flush_cd(blind_t xcd)
{
return (sdbc_detach_cd(xcd, 1));
}
/*
* _sd_get_pinned - re-issue PINNED callbacks for cache device
*
* ARGUMENTS:
* cd - the cache descriptor to reissue pinned callbacks from.
* RETURNS:
* 0 on success.
* Error otherwise.
*/
int
_sd_get_pinned(blind_t xcd)
{
_sd_cd_info_t *cdi;
_sd_cctl_t *cc_list, *cc_ent;
int cd = (int)(unsigned long)xcd;
if (cd < 0 || cd >= sdbc_max_devs) {
DTRACE_PROBE(_sd_get_pinned_end1);
return (EINVAL);
}
cdi = &_sd_cache_files[cd];
if (!FILE_OPENED(cd)) {
DTRACE_PROBE(_sd_get_pinned_end2);
return (0);
}
mutex_enter(&cdi->cd_lock);
if (!cdi->cd_info->sh_failed) {
mutex_exit(&cdi->cd_lock);
DTRACE_PROBE(_sd_get_pinned_end3);
return (0);
}
cc_ent = cc_list = cdi->cd_fail_head;
while (cc_ent) {
if (CENTRY_PINNED(cc_ent))
nsc_pinned_data(cdi->cd_iodev,
BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
BLK_FBAS);
cc_ent = cc_ent->cc_dirty_next;
if (!cc_ent)
cc_ent = cc_list = cc_list->cc_dirty_link;
}
mutex_exit(&cdi->cd_lock);
return (0);
}
/*
* _sd_allocate_buf - allocate a vector of buffers for io.
* *This call has been replaced by _sd_alloc_buf*
*/
_sd_buf_handle_t *
_sd_allocate_buf(int cd, nsc_off_t fba_pos, nsc_size_t fba_len, int flag,
int *sts)
{
_sd_buf_handle_t *handle = NULL;
*sts = _sd_alloc_buf((blind_t)(unsigned long)cd, fba_pos, fba_len,
flag, &handle);
if (*sts == NSC_HIT)
*sts = NSC_DONE;
return (handle);
}
/*
* _sd_prefetch_buf - _sd_alloc_buf w/flag = NSC_RDAHEAD|NSC_RDBUF
* no 'bufvec' (data is not read by caller)
* skip leading valid or busy entries (data available sooner)
* truncate on busy block (to avoid deadlock)
* release trailing valid entries, adjust length before starting I/O.
*/
static int
_sd_prefetch_buf(int cd, nsc_off_t fba_pos, nsc_size_t fba_len, int flag,
_sd_buf_handle_t *handle, int locked)
{
_sd_cd_info_t *cdi;
nsc_off_t cblk; /* position of temp cache block */
sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */
sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */
sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */
nsc_off_t io_pos; /* offset in FBA's */
nsc_size_t fba_orig_len;
int sts, stall;
_sd_cctl_t *centry = NULL;
_sd_cctl_t *lentry = NULL;
_sd_cctl_t *ioent = NULL;
_sd_cctl_t *last_ioent = NULL;
sdbc_allocbuf_t alloc_tok = {0};
int this_entry_type = 0;
nsc_size_t request_blocks = 0; /* number of cache blocks required */
int pageio;
handle->bh_flag |= NSC_HACTIVE;
ASSERT(cd >= 0);
cdi = &_sd_cache_files[cd];
/* prefetch: truncate if req'd */
if (fba_len > sdbc_max_fbas)
fba_len = sdbc_max_fbas;
if ((fba_pos + fba_len) > cdi->cd_info->sh_filesize) {
if (fba_pos >= cdi->cd_info->sh_filesize) {
sts = EIO;
goto done;
}
fba_len = cdi->cd_info->sh_filesize - fba_pos;
}
fba_orig_len = fba_len;
_SD_SETUP_HANDLE(handle, cd, fba_pos, fba_len, flag);
handle->bh_centry = NULL;
cblk = FBA_TO_BLK_NUM(fba_pos);
st_cblk_off = BLK_FBA_OFF(fba_pos);
st_cblk_len = BLK_FBAS - st_cblk_off;
/*
* count number of blocks on chain that is required
*/
if ((nsc_size_t)st_cblk_len >= fba_len) {
st_cblk_len = (sdbc_cblk_fba_t)fba_len;
end_cblk_len = 0;
} else {
end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
}
request_blocks = 1; /* at least one */
/* middle piece */
request_blocks += (fba_len - (st_cblk_len + end_cblk_len)) >>
BLK_FBA_SHFT;
if (end_cblk_len)
++request_blocks;
stall = 0;
do {
pageio = ((flag & NSC_PAGEIO) != 0 || sdbc_pageio_always != 0);
cget:
if (centry = (_sd_cctl_t *)
_sd_hash_search(cd, cblk, _sd_htable)) {
try:
/* prefetch: skip leading valid blocks */
if ((ioent == NULL) &&
SDBC_VALID_BITS(st_cblk_off, st_cblk_len, centry)) {
skip:
sdbc_prefetch_valid_cnt++;
--request_blocks;
lentry = centry;
centry = NULL;
cblk++;
fba_len -= st_cblk_len;
st_cblk_off = 0;
st_cblk_len = (sdbc_cblk_fba_t)
((fba_len > (nsc_size_t)BLK_FBAS) ?
BLK_FBAS : fba_len);
continue;
}
if (SET_CENTRY_INUSE(centry)) {
/*
* prefetch: skip leading busy
* or truncate at busy block
*/
if (ioent == NULL)
goto skip;
sdbc_prefetch_busy_cnt++;
fba_orig_len -= fba_len;
fba_len = 0;
centry = lentry; /* backup */
break;
}
/*
* bug 4529671
* now that we own the centry make sure that
* it is still good. it could have been processed
* by _sd_dealloc_dm() in the window between
* _sd_hash_search() and SET_CENTRY_INUSE().
*/
if ((_sd_cctl_t *)
_sd_hash_search(cd, cblk, _sd_htable) != centry) {
sdbc_prefetch_deallocd++;
#ifdef DEBUG
cmn_err(CE_WARN,
"prefetch centry %p cd %d cblk %" NSC_SZFMT
" fba_len %" NSC_SZFMT " lost to dealloc?! "
"cc_data %p",
(void *)centry, cd, cblk, fba_orig_len,
(void *)centry->cc_data);
#endif
CLEAR_CENTRY_INUSE(centry);
continue;
}
if (CC_CD_BLK_MATCH(cd, cblk, centry)) {
/*
* Do pagelist io mutual exclusion
* before messing with the centry.
*/
if (pageio && SET_CENTRY_PAGEIO(centry)) {
/* flusher not done with pageio */
/*
* prefetch: skip leading busy
* or truncate at busy block
*/
CLEAR_CENTRY_INUSE(centry);
if (ioent == NULL)
goto skip;
sdbc_prefetch_pageio1++;
fba_orig_len -= fba_len;
fba_len = 0;
centry = lentry; /* backup */
break;
}
sdbc_prefetch_hit++;
this_entry_type = HASH_ENTRY_DM;
pageio = 0;
centry->cc_toflush = 0;
centry->cc_hits++;
/* this will reset the age flag */
sdbc_centry_init_dm(centry);
DTRACE_PROBE1(_sd_prefetch_buf,
_sd_cctl_t *, centry);
} else {
/* block mismatch */
sdbc_prefetch_lost++;
CLEAR_CENTRY_INUSE(centry);
continue;
}
} else {
centry = sdbc_centry_alloc(cd, cblk, request_blocks,
&stall, &alloc_tok, ALLOC_NOWAIT);
if (centry == NULL) {
/*
* prefetch: cache is very busy. just do
* the i/o for the blocks already acquired,
* if any.
*/
fba_orig_len -= fba_len;
fba_len = 0;
/*
* if we have a chain of centry's
* then back up (set centry to lentry).
* if there is no chain (ioent == NULL)
* then centry remains NULL. this can occur
* if all previous centrys were hash hits
* on valid blocks that were processed in
* the skip logic above.
*/
if (ioent)
centry = lentry; /* backup */
break;
}
/*
* dmchaining adjustment.
* if centry was obtained from the dmchain
* then clear local pageio variable because the
* centry already has cc_pageio set.
*/
if (CENTRY_PAGEIO(centry))
pageio = 0;
DTRACE_PROBE1(_sd_alloc_buf,
_sd_cctl_t *, centry);
this_entry_type = ELIGIBLE_ENTRY_DM;
if (centry->cc_aging_dm & FOUND_IN_HASH_DM)
this_entry_type = HASH_ENTRY_DM;
else {
if (centry->cc_aging_dm & FOUND_HOLD_OVER_DM)
this_entry_type = HOLD_ENTRY_DM;
}
}
centry->cc_chain = NULL;
centry->cc_aging_dm &= ~(FOUND_IN_HASH_DM|FOUND_HOLD_OVER_DM);
/*
* Do pagelist io mutual exclusion now if we did not do
* it above.
*/
if (pageio && SET_CENTRY_PAGEIO(centry)) {
/* flusher not done with pageio */
sdbc_prefetch_pageio2++;
/*
* prefetch: skip leading busy
* or truncate at busy block
*/
CLEAR_CENTRY_INUSE(centry);
if (ioent == NULL)
goto skip;
sdbc_prefetch_busy_cnt++;
fba_orig_len -= fba_len;
fba_len = 0;
centry = lentry; /* backup */
break;
}
pageio = 0;
fba_len -= st_cblk_len;
if (ioent == NULL) {
if (!SDBC_VALID_BITS(st_cblk_off, st_cblk_len,
centry)) {
io_pos = BLK_TO_FBA_NUM(cblk) + st_cblk_off;
ioent = last_ioent = centry;
} else {
DATA_LOG(SDF_ALLOC, centry, st_cblk_off,
st_cblk_len);
DTRACE_PROBE4(_sd_prefetch_buf_data1,
uint64_t, (uint64_t)(BLK_TO_FBA_NUM(cblk) +
st_cblk_off),
int, st_cblk_len,
char *, *(int64_t *)(centry->cc_data +
FBA_SIZE(st_cblk_off)),
char *, *(int64_t *)(centry->cc_data +
FBA_SIZE(st_cblk_off + st_cblk_len)
- 8));
}
handle->bh_centry = centry;
st_cblk_off = 0;
st_cblk_len = (sdbc_cblk_fba_t)
((fba_len > (nsc_size_t)BLK_FBAS) ?
BLK_FBAS : fba_len);
} else {
if (!SDBC_VALID_BITS(st_cblk_off, st_cblk_len, centry))
last_ioent = centry;
else {
DTRACE_PROBE4(_sd_prefetch_buf_data2,
uint64_t, (uint64_t)(BLK_TO_FBA_NUM(cblk) +
st_cblk_off),
int, st_cblk_len,
char *, *(int64_t *)(centry->cc_data +
FBA_SIZE(st_cblk_off)),
char *, *(int64_t *)(centry->cc_data +
FBA_SIZE(st_cblk_off + st_cblk_len)
- 8));
}
lentry->cc_chain = centry;
if (fba_len < (nsc_size_t)BLK_FBAS)
st_cblk_len = (sdbc_cblk_fba_t)fba_len;
}
lentry = centry;
cblk++;
/* if this block has a new identity clear prefetch history */
if (this_entry_type != HASH_ENTRY_DM)
centry->cc_aging_dm &= ~(PREFETCH_BUF_I | PREFETCH_BUF_E);
centry->cc_aging_dm &= ~(ENTRY_FIELD_DM);
centry->cc_aging_dm |= this_entry_type | PREFETCH_BUF_E;
if (flag & NSC_METADATA)
centry->cc_aging_dm |= STICKY_METADATA_DM;
--request_blocks;
} while (fba_len > 0);
if (locked) {
rw_exit(&sdbc_queue_lock);
locked = 0;
}
sdbc_centry_alloc_end(&alloc_tok);
if (centry) {
centry->cc_chain = NULL;
if (sts = _sd_setup_category_on_type(handle->bh_centry)) {
(void) _sd_free_buf(handle);
goto done;
}
(void) _sd_setup_mem_chaining(handle->bh_centry, 0);
}
if (ioent) {
/* prefetch: trailing valid can be released, adjust len */
if ((centry != last_ioent)) {
centry = last_ioent->cc_chain;
last_ioent->cc_chain = NULL;
while (centry) {
lentry = centry->cc_chain;
centry->cc_aging_dm &= ~PREFETCH_BUF_E;
_sd_centry_release(centry);
centry = lentry;
sdbc_prefetch_trailing++;
}
fba_len = (CENTRY_BLK(last_ioent) -
CENTRY_BLK(ioent) + 1) * BLK_FBAS -
BLK_FBA_OFF(io_pos);
fba_orig_len = fba_len + (io_pos - fba_pos);
}
_SD_DISCONNECT_CALLBACK(handle);
sts = _sd_doread(handle, ioent, io_pos,
(fba_pos + fba_orig_len - io_pos), flag);
if (sts > 0)
(void) _sd_free_buf(handle);
} else {
CACHE_FBA_READ(cd, fba_orig_len);
CACHE_READ_HIT;
FBA_READ_IO_KSTATS(cd, FBA_SIZE(fba_orig_len));
sts = NSC_HIT;
}
done:
if (locked)
rw_exit(&sdbc_queue_lock);
return (sts);
}
/*
* _sd_cc_wait - wait for inuse cache block to become available
* Usage:
* if (SET_CENTRY_INUSE(centry)) {
* _sd_cc_wait(cd, blk, centry, CC_INUSE);
* goto try_again;
* }
* -or-
* if (SET_CENTRY_PAGEIO(centry)) {
* _sd_cc_wait(cd, blk, centry, CC_PAGEIO);
* goto try_again;
* }
*/
void
_sd_cc_wait(int cd, nsc_off_t cblk, _sd_cctl_t *centry, int flag)
{
volatile ushort_t *waiters;
volatile uchar_t *uflag;
if (flag == CC_INUSE) {
waiters = &(centry->cc_await_use);
uflag = &(CENTRY_INUSE(centry));
} else if (flag == CC_PAGEIO) {
waiters = &(centry->cc_await_page);
uflag = &(CENTRY_PAGEIO(centry));
} else {
/* Oops! */
#ifdef DEBUG
cmn_err(CE_WARN, "_sd_cc_wait: unknown flag value (%x)", flag);
#endif
return;
}
mutex_enter(&centry->cc_lock);
if (CC_CD_BLK_MATCH(cd, cblk, centry) && (*uflag) != 0) {
(*waiters)++;
sd_serialize();
if ((*uflag) != 0) {
unsigned stime = nsc_usec();
cv_wait(&centry->cc_blkcv, &centry->cc_lock);
(*waiters)--;
mutex_exit(&centry->cc_lock);
SDTRACE(ST_INFO|SDF_ENT_GET,
cd, 0, BLK_TO_FBA_NUM(cblk),
(nsc_usec()-stime), 0);
} else {
(*waiters)--;
mutex_exit(&centry->cc_lock);
}
} else
mutex_exit(&centry->cc_lock);
}
/*
* _sd_alloc_buf - Allocate a vector of buffers for io.
*
* ARGUMENTS:
* cd - Cache descriptor (from a previous open)
* fba_pos - disk position (512-byte FBAs)
* fba_len - length in disk FBAs.
* flag - allocation type. Flag is one or more of
* NSC_RDBUF, NSC_WRBUF, NSC_NOBLOCK and hints.
* NSC_RDAHEAD - prefetch for future read.
* handle_p - pointer to a handle pointer.
* If the handle pointer is non-null, it is used as a
* pre-allocated handle. Else a new handle will be allocated
* and stored in *handle_p.
*
* RETURNS:
* errno if return > 0.
* else NSC_HIT or NSC_DONE on success
* or NSC_PENDING on io in progress and NSC_NOBLOCK
* specified in the flag.
* USAGE:
* This routine allocates the cache blocks requested and creates a list
* of entries for this request.
* If NSC_NOBLOCK was not specified, this call could block on read io.
* If flag specified NSC_RDBUF and the request is not an entire
* hit, an io is initiated.
*/
int
_sd_alloc_buf(blind_t xcd, nsc_off_t fba_pos, nsc_size_t fba_len, int flag,
_sd_buf_handle_t **handle_p)
{
int cd = (int)(unsigned long)xcd;
_sd_cd_info_t *cdi;
_sd_buf_handle_t *handle;
int sts;
nsc_off_t st_cblk, cblk; /* position of start and temp cache block */
sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */
sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */
sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */
nsc_off_t io_pos; /* offset in FBA's */
_sd_bufvec_t *bufvec;
_sd_cctl_t *centry, *lentry, *ioent = NULL;
nsc_size_t fba_orig_len = fba_len; /* FBA length of orig request */
int stall, pageio;
unsigned char cc_flag;
int this_entry_type;
int locked = 0;
nsc_size_t dmchain_request_blocks; /* size of dmchain in cache blocks */
sdbc_allocbuf_t alloc_tok = {0};
int min_frag = 0; /* frag statistics */
int max_frag = 0; /* frag statistics */
int nfrags = 0; /* frag statistics */
#ifdef DEBUG
int err = 0;
#endif
ASSERT(*handle_p != NULL);
handle = *handle_p;
if (_sdbc_shutdown_in_progress)
return (EIO);
if (xcd == NSC_ANON_CD)
cd = _CD_NOHASH;
KSTAT_RUNQ_ENTER(cd);
/*
* Force large writes on nvram systems to be write-through to
* avoid the (slow) bcopy into nvram.
*/
if (flag & NSC_WRBUF) {
if (fba_len > (nsc_size_t)sdbc_wrthru_len) {
flag |= NSC_WRTHRU;
}
}
#ifdef DEBUG
if (sdbc_pageio_debug != SDBC_PAGEIO_OFF) {
switch (sdbc_pageio_debug) {
case SDBC_PAGEIO_RDEV:
if (cd != _CD_NOHASH &&
sdbc_pageio_rdev != (dev_t)-1 &&
_sd_cache_files[cd].cd_crdev == sdbc_pageio_rdev)
flag |= NSC_PAGEIO;
break;
case SDBC_PAGEIO_RAND:
if ((nsc_lbolt() % 3) == 0)
flag |= NSC_PAGEIO;
break;
case SDBC_PAGEIO_ALL:
flag |= NSC_PAGEIO;
break;
}
}
#endif /* DEBUG */
if (fba_len > (nsc_size_t)BLK_FBAS) {
rw_enter(&sdbc_queue_lock, RW_WRITER);
locked = 1;
}
/*
* _CD_NOHASH: client wants temporary (not hashed) cache memory
* not associated with a local disk. Skip local disk checks.
*/
if (cd == _CD_NOHASH) {
flag &= ~(NSC_RDBUF | NSC_WRBUF | NSC_RDAHEAD);
handle = *handle_p;
handle->bh_flag |= NSC_HACTIVE;
goto setup;
}
SDTRACE(ST_ENTER|SDF_ALLOCBUF, cd, fba_len, fba_pos, flag, 0);
if ((flag & NSC_RDAHEAD) && _sd_prefetch_opt) {
sts = _sd_prefetch_buf(cd, fba_pos, fba_len, flag, handle,
locked);
goto done;
}
#if !defined(_SD_NOCHECKS)
if (flag & NSC_RDAHEAD) { /* _sd_prefetch_opt == 0 */
nsc_size_t file_size; /* file_size in FBA's */
/* prefetch: truncate if req'd */
if (fba_len > sdbc_max_fbas)
fba_len = sdbc_max_fbas;
file_size = _sd_cache_files[(cd)].cd_info->sh_filesize;
if ((fba_pos + fba_len) > file_size) {
fba_len = file_size - fba_pos;
#ifdef NSC_MULTI_TERABYTE
if ((int64_t)fba_len <= 0) {
#else
if ((int32_t)fba_len <= 0) {
#endif
sts = EIO;
SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len,
fba_pos, flag, sts);
goto done;
}
}
} else
if (sts = _sd_check_buffer_alloc(cd, fba_pos, fba_len, handle_p)) {
SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len, fba_pos, flag, sts);
goto done;
}
#endif
if (fba_len == 0) {
SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len, fba_pos,
flag, EINVAL);
sts = EINVAL;
goto done;
}
handle->bh_flag |= NSC_HACTIVE;
cdi = &_sd_cache_files[cd];
if (cdi->cd_recovering) {
/*
* If recovering this device, then block all allocates
* for reading or writing. If we allow reads then
* this path could see old data before we recover.
* If we allow writes then new data could be overwritten
* by old data.
* This is clearly still not a complete solution, as
* the thread doing this allocate could conceivably be
* past this point already (and in _sd_write/_sd_read for
* that matter, which don't even have this protection). But
* this type of path seems to only exist in a failover
* situation where a device has failed on the other node
* and works on this node, so the problem is not a huge
* one, but it exists nevertheless.
*/
if (sts = _sd_recovery_wblk_wait(cd)) {
handle->bh_flag &= ~NSC_HACTIVE;
SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len, fba_pos,
flag, sts);
goto done;
}
}
/* write & disk failed, return error immediately */
if ((flag & NSC_WRBUF) && cdi->cd_info->sh_failed) {
handle->bh_flag &= ~NSC_HACTIVE;
SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len, fba_pos, flag, EIO);
sts = EIO;
goto done;
}
setup:
_SD_SETUP_HANDLE(handle, cd, fba_pos, fba_len, flag);
handle->bh_centry = NULL;
bufvec = handle->bh_bufvec;
if (flag & NSC_RDAHEAD) { /* _sd_prefetch_opt == 0 */
/* CKD prefetch: bufvec not req'd, use placeholder */
bufvec->bufaddr = NULL;
bufvec->bufvmeaddr = NULL;
bufvec->buflen = 0;
bufvec = _prefetch_sb_vec;
}
st_cblk = FBA_TO_BLK_NUM(fba_pos);
st_cblk_off = BLK_FBA_OFF(fba_pos);
st_cblk_len = BLK_FBAS - st_cblk_off;
if ((nsc_size_t)st_cblk_len >= fba_len) {
end_cblk_len = 0;
st_cblk_len = (sdbc_cblk_fba_t)fba_len;
} else
end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
cblk = st_cblk;
/*
* count the number of cache blocks required for the chain
*/
/* middle piece */
dmchain_request_blocks =
(fba_len - (st_cblk_len + end_cblk_len)) >> BLK_FBA_SHFT;
/* start piece */
++dmchain_request_blocks;
/* end piece */
if (end_cblk_len)
++dmchain_request_blocks;
cc_flag = 0;
if ((handle->bh_flag & NSC_PINNABLE) && (handle->bh_flag & NSC_WRBUF))
cc_flag |= CC_PINNABLE;
if (handle->bh_flag & (NSC_NOCACHE|NSC_SEQ_IO))
cc_flag |= CC_QHEAD;
lentry = NULL;
stall = 0;
do {
pageio = ((flag & NSC_PAGEIO) != 0 || sdbc_pageio_always != 0);
cget:
if ((centry = (_sd_cctl_t *)
_sd_hash_search(cd, cblk, _sd_htable)) != 0) {
if (SET_CENTRY_INUSE(centry)) {
/* already inuse: wait for block, retry */
sdbc_allocb_inuse++;
if (locked)
rw_exit(&sdbc_queue_lock);
_sd_cc_wait(cd, cblk, centry, CC_INUSE);
if (locked)
rw_enter(&sdbc_queue_lock, RW_WRITER);
goto cget;
}
/*
* bug 4529671
* now that we own the centry make sure that
* it is still good. it could have been processed
* by _sd_dealloc_dm() in the window between
* _sd_hash_search() and SET_CENTRY_INUSE().
*/
if ((_sd_cctl_t *)
_sd_hash_search(cd, cblk, _sd_htable) != centry) {
sdbc_allocb_deallocd++;
#ifdef DEBUG
cmn_err(CE_WARN,
"centry %p cd %d cblk %" NSC_SZFMT
" fba_len %" NSC_SZFMT " lost to dealloc?! "
"cc_data %p", (void *)centry, cd, cblk,
fba_orig_len, (void *)centry->cc_data);
#endif
CLEAR_CENTRY_INUSE(centry);
goto cget;
}
if (CC_CD_BLK_MATCH(cd, cblk, centry)) {
/*
* Do pagelist io mutual exclusion
* before messing with the centry.
*/
if (pageio && SET_CENTRY_PAGEIO(centry)) {
/* wait for flusher to finish pageio */
sdbc_allocb_pageio1++;
CLEAR_CENTRY_INUSE(centry);
if (locked)
rw_exit(&sdbc_queue_lock);
_sd_cc_wait(cd, cblk, centry,
CC_PAGEIO);
if (locked)
rw_enter(&sdbc_queue_lock,
RW_WRITER);
goto cget;
}
sdbc_allocb_hit++;
this_entry_type = HASH_ENTRY_DM;
pageio = 0;
centry->cc_toflush = 0;
centry->cc_hits++;
/* this will reset the age flag */
sdbc_centry_init_dm(centry);
DTRACE_PROBE1(_sd_alloc_buf1,
_sd_cctl_t *, centry);
} else {
/* block mismatch: release, alloc new block */
sdbc_allocb_lost++;
CLEAR_CENTRY_INUSE(centry);
goto cget;
}
} else {
centry = sdbc_centry_alloc(cd, cblk,
dmchain_request_blocks, &stall,
&alloc_tok, locked ? ALLOC_LOCKED : 0);
/*
* dmchaining adjustment.
* if centry was obtained from the dmchain
* then clear local pageio variable because the
* centry already has cc_pageio set.
*/
if (CENTRY_PAGEIO(centry))
pageio = 0;
DTRACE_PROBE1(_sd_alloc_buf2,
_sd_cctl_t *, centry);
this_entry_type = ELIGIBLE_ENTRY_DM;
if (centry->cc_aging_dm & FOUND_IN_HASH_DM)
this_entry_type = HASH_ENTRY_DM;
else {
if (centry->cc_aging_dm & FOUND_HOLD_OVER_DM)
this_entry_type = HOLD_ENTRY_DM;
}
}
centry->cc_aging_dm &= ~(FOUND_IN_HASH_DM|FOUND_HOLD_OVER_DM);
/*
* Do pagelist io mutual exclusion now if we did not do
* it above.
*/
if (pageio && SET_CENTRY_PAGEIO(centry)) {
/* wait for flusher to finish pageio */
sdbc_allocb_pageio2++;
CLEAR_CENTRY_INUSE(centry);
if (locked)
rw_exit(&sdbc_queue_lock);
_sd_cc_wait(cd, cblk, centry, CC_PAGEIO);
if (locked)
rw_enter(&sdbc_queue_lock, RW_WRITER);
goto cget;
}
pageio = 0;
if (CENTRY_DIRTY(centry)) {
/*
* end action might set PEND_DIRTY flag
* must lock if need to change flag bits
*/
if (centry->cc_flag != (centry->cc_flag | cc_flag)) {
/* was FAST */
mutex_enter(&centry->cc_lock);
centry->cc_flag |= cc_flag;
/* was FAST */
mutex_exit(&centry->cc_lock);
}
} else
centry->cc_flag |= cc_flag;
centry->cc_chain = NULL;
/*
* step 0: check valid bits in each cache element as
* the chain grows - set ioent/io_pos to the first
* instance of invalid data
*/
if (cblk == st_cblk) {
handle->bh_centry = centry;
fba_len -= st_cblk_len;
lentry = centry;
if (flag & NSC_RDBUF) {
if (!SDBC_VALID_BITS(st_cblk_off, st_cblk_len,
centry)) {
io_pos = fba_pos;
ioent = centry;
} else {
DATA_LOG(SDF_ALLOC, centry, st_cblk_off,
st_cblk_len);
DTRACE_PROBE4(_sd_alloc_data1,
uint64_t,
(uint64_t)(BLK_TO_FBA_NUM(cblk)
+ st_cblk_off),
int, st_cblk_len,
char *, *(int64_t *)
(centry->cc_data +
FBA_SIZE(st_cblk_off)),
char *, *(int64_t *)
(centry->cc_data +
FBA_SIZE(st_cblk_off +
st_cblk_len) - 8));
}
}
cblk++;
} else if (fba_len == (nsc_size_t)end_cblk_len) {
lentry->cc_chain = centry;
fba_len -= end_cblk_len;
if (flag & NSC_RDBUF) {
if (ioent == NULL) {
if (!SDBC_VALID_BITS(0, end_cblk_len,
centry)) {
io_pos = BLK_TO_FBA_NUM(cblk);
ioent = centry;
} else {
DATA_LOG(SDF_ALLOC, centry, 0,
end_cblk_len);
DTRACE_PROBE4(
_sd_alloc_data2,
uint64_t,
BLK_TO_FBA_NUM(cblk),
int, end_cblk_len,
char *,
*(int64_t *)
(centry->cc_data),
char *,
*(int64_t *)
(centry->cc_data +
FBA_SIZE(end_cblk_len)
- 8));
}
}
}
} else {
lentry->cc_chain = centry;
lentry = centry;
fba_len -= BLK_FBAS;
if (flag & NSC_RDBUF) {
if (ioent == NULL) {
if (!FULLY_VALID(centry)) {
io_pos = BLK_TO_FBA_NUM(cblk);
ioent = centry;
} else {
DATA_LOG(SDF_ALLOC, centry, 0,
BLK_FBAS);
DTRACE_PROBE4(
_sd_alloc_data3,
uint64_t,
(uint64_t)
BLK_TO_FBA_NUM(cblk),
int, BLK_FBAS,
char *,
*(int64_t *)
(centry->cc_data),
char *,
*(int64_t *)
(centry->cc_data +
FBA_SIZE(BLK_FBAS) -
8));
}
}
}
cblk++;
}
/* if this block has a new identity clear prefetch history */
if (this_entry_type != HASH_ENTRY_DM)
centry->cc_aging_dm &= ~(PREFETCH_BUF_I | PREFETCH_BUF_E);
centry->cc_aging_dm &= ~(ENTRY_FIELD_DM);
centry->cc_aging_dm |= this_entry_type;
if (flag & NSC_METADATA)
centry->cc_aging_dm |= STICKY_METADATA_DM;
--dmchain_request_blocks;
} while (fba_len);
if (locked) {
rw_exit(&sdbc_queue_lock);
locked = 0;
}
ASSERT(dmchain_request_blocks == 0);
/*
* do any necessary cleanup now that all the blocks are allocated.
*/
sdbc_centry_alloc_end(&alloc_tok);
/* be sure to null terminate the chain */
centry->cc_chain = NULL;
/*
* step one: establish HOST/PARASITE/OTHER relationships
* between the centry elements in the list and calc the alloc size
* (fill in CATEGORY based on TYPE and immediate neighbors)
*/
if (sts = _sd_setup_category_on_type(handle->bh_centry)) {
#ifdef DEBUG
err = _sd_free_buf(handle);
if (err) {
cmn_err(CE_WARN, "sdbc(_sd_alloc_buf): _sd_free_buf "
"failed: err:%d handle:%p", err, (void *)handle);
}
#else
(void) _sd_free_buf(handle);
#endif
goto done;
}
/*
* step two: alloc the needed mem and fill in the data and chaining
* fields (leave bufvec for step three)
*/
(void) _sd_setup_mem_chaining(handle->bh_centry, 0);
/*
* step three: do the bufvec
*/
fba_len = fba_orig_len;
centry = handle->bh_centry;
bufvec = handle->bh_bufvec;
while (centry) {
DTRACE_PROBE3(_sd_alloc_buf_centrys,
_sd_cctl_t *, centry,
int, cd,
uint64_t, (uint64_t)
BLK_TO_FBA_NUM(CENTRY_BLK(centry)));
if (fba_len == fba_orig_len) {
bufvec->bufaddr = (centry->cc_data +
FBA_SIZE(st_cblk_off));
bufvec->bufvmeaddr = 0; /* not used */
bufvec->buflen = FBA_SIZE(st_cblk_len);
bufvec++;
fba_len -= st_cblk_len;
} else if (fba_len == (nsc_size_t)end_cblk_len) {
_sd_bufvec_t *pbufvec = bufvec - 1;
if ((pbufvec->bufaddr + pbufvec->buflen) ==
centry->cc_data) {
/* contiguous */
pbufvec->buflen += FBA_SIZE(end_cblk_len);
} else {
bufvec->bufaddr = centry->cc_data;
bufvec->bufvmeaddr = 0; /* not used */
bufvec->buflen = FBA_SIZE(end_cblk_len);
bufvec++;
}
fba_len -= end_cblk_len;
} else {
_sd_bufvec_t *pbufvec = bufvec - 1;
if ((pbufvec->bufaddr + pbufvec->buflen) ==
centry->cc_data) {
/* contiguous */
pbufvec->buflen += CACHE_BLOCK_SIZE;
} else {
bufvec->bufaddr = centry->cc_data;
bufvec->bufvmeaddr = 0; /* not used */
bufvec->buflen = CACHE_BLOCK_SIZE;
bufvec++;
}
fba_len -= BLK_FBAS;
}
centry = centry->cc_chain;
}
/* be sure to null terminate the chain */
bufvec->bufaddr = NULL;
bufvec->bufvmeaddr = 0;
bufvec->buflen = 0;
/* frag statistics */
{
_sd_bufvec_t *tbufvec;
for (tbufvec = handle->bh_bufvec; tbufvec != bufvec;
++tbufvec) {
if ((min_frag > tbufvec->buflen) || (min_frag == 0))
min_frag = tbufvec->buflen;
if (max_frag < tbufvec->buflen)
max_frag = tbufvec->buflen;
}
nfrags = bufvec - handle->bh_bufvec;
min_frag = FBA_LEN(min_frag);
max_frag = FBA_LEN(max_frag);
}
/* buffer memory frag stats */
DTRACE_PROBE4(_sd_alloc_buf_frag,
uint64_t, (uint64_t)fba_orig_len,
int, nfrags, int, min_frag,
int, max_frag);
if (flag & NSC_WRBUF) {
if (_SD_IS_WRTHRU(handle))
goto alloc_done;
if (_sd_alloc_write(handle->bh_centry, &stall)) {
_sd_unblock(&_sd_flush_cv);
handle->bh_flag |= NSC_FORCED_WRTHRU;
} else {
for (centry = handle->bh_centry;
centry; centry = centry->cc_chain) {
CENTRY_SET_FTPOS(centry);
SSOP_SETCENTRY(sdbc_safestore,
centry->cc_write);
}
}
}
alloc_done:
if (locked) {
rw_exit(&sdbc_queue_lock);
locked = 0;
}
if (ioent) {
_SD_DISCONNECT_CALLBACK(handle);
sts = _sd_doread(handle, ioent, io_pos,
(fba_pos + fba_orig_len - io_pos), flag);
if (sts > 0)
(void) _sd_free_buf(handle);
} else
if (flag & NSC_RDBUF) {
CACHE_FBA_READ(cd, fba_orig_len);
CACHE_READ_HIT;
FBA_READ_IO_KSTATS(cd, FBA_SIZE(fba_orig_len));
sts = NSC_HIT;
} else
sts = (stall) ? NSC_DONE : NSC_HIT;
SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_orig_len, fba_pos, flag, sts);
done:
if (locked)
rw_exit(&sdbc_queue_lock);
KSTAT_RUNQ_EXIT(cd);
return (sts);
}
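/*
 * Illustrative caller sketch (not part of the driver): assumes "cd" is
 * a valid open cache descriptor and "handle" came from
 * _sd_alloc_handle():
 *
 *	sts = _sd_alloc_buf((blind_t)(unsigned long)cd, fba_pos,
 *	    fba_len, NSC_RDBUF, &handle);
 *	if (sts > 0)
 *		return (sts);	(errno - allocation/read failed)
 *	... read the data through handle->bh_bufvec ...
 *	(void) _sd_free_buf(handle);
 *
 * sts is NSC_HIT or NSC_DONE on success; with NSC_NOBLOCK in the flag
 * an NSC_PENDING return means the read io completes asynchronously.
 */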
/*
* consistency checking for ccents
*/
#define ELIGIBLE(p) (p & ELIGIBLE_ENTRY_DM)
#define HOLD(p) (p & HOLD_ENTRY_DM)
#define HASHE(p) (p & HASH_ENTRY_DM)
#define HOST(p) (p & HOST_ENTRY_DM)
#define PARA(p) (p & PARASITIC_ENTRY_DM)
#define OTHER(p) \
(!(p & (HOST_ENTRY_DM | PARASITIC_ENTRY_DM | ELIGIBLE_ENTRY_DM)))
#define AVAIL(p) (p & AVAIL_ENTRY_DM)
/*
* sdbc_check_cctl_cot -- consistency check for _sd_setup_category_on_type()
* may only be called on entry to state machine (when ccent is either
* ELIGIBLE_ENTRY_DM, HOLD_ENTRY_DM or HASH_ENTRY_DM).
*
* print message or panic (DEBUG) if inconsistency detected.
*/
static int
sdbc_check_cctl_cot(_sd_cctl_t *centry)
{
uint_t age;
int size;
uchar_t *data;
int host_or_other;
int para;
int ccent_ok = 1;
age = centry->cc_aging_dm;
size = centry->cc_alloc_size_dm;
data = centry->cc_data;
host_or_other = size && data;
para = !size && data;
/*
* on entry to _sd_setup_category_on_type(),
* one of three mutually exclusive entry field bits must be set
*/
switch ((age & (ELIGIBLE_ENTRY_DM | HOLD_ENTRY_DM | HASH_ENTRY_DM))) {
case ELIGIBLE_ENTRY_DM:
case HOLD_ENTRY_DM:
case HASH_ENTRY_DM:
/* ok */
break;
default:
/* zero or multiple flag bits */
ccent_ok = 0;
break;
}
/* categories are mutually exclusive */
if (HOST(age) && PARA(age))
ccent_ok = 0;
/* these bits should be cleared out (STICKY_METADATA_DM not used) */
if (age & (AVAIL_ENTRY_DM | FOUND_HOLD_OVER_DM | FOUND_IN_HASH_DM |
STICKY_METADATA_DM))
ccent_ok = 0;
/* eligible has no data and no size */
if (ELIGIBLE(age) && (size || data))
ccent_ok = 0;
/* parasite has zero size and non-zero data */
if (PARA(age) && !para)
ccent_ok = 0;
/* host has non-zero size and non-zero data */
if (HOST(age) && !host_or_other)
ccent_ok = 0;
/* "other" is just like a host */
if (OTHER(age) && !host_or_other)
ccent_ok = 0;
/* a HOLD or a HASH must have a size */
if ((size) && !(age & (HASH_ENTRY_DM | HOLD_ENTRY_DM)))
ccent_ok = 0;
if (!ccent_ok)
cmn_err(cmn_level,
"sdbc(sdbc_check_cctl_cot): inconsistent ccent %p "
"age %x size %d data %p", (void *)centry, age, size,
(void *)data);
return (ccent_ok);
}
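/*
 * Illustrative examples of the invariants checked above (not
 * exhaustive):
 *
 *	ELIGIBLE, size == 0, data == NULL		- passes
 *	HASH with non-zero size and data		- passes
 *	ELIGIBLE with size or data set			- fails
 *	HOST_ENTRY_DM and PARASITIC_ENTRY_DM both set	- fails
 *	non-zero size without HASH or HOLD set		- fails
 */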
/*
* sdbc_mark_cctl_cot -- mark cctls bad and invalidate when
* an inconsistency is found in _sd_setup_category_on_type()
* returns nothing
*
* Note: this is an error recovery path that is triggered when an
* inconsistency in a cctl is detected. _sd_centry_release() will take
* these cache entries out of circulation and place them on a separate list
* for debugging purposes.
*/
void
sdbc_mark_cctl_cot(_sd_cctl_t *header, _sd_cctl_t *centry)
{
_sd_cctl_t *cur_ent = header;
/* the entire chain is guilty by association */
while (cur_ent) {
(void) _sd_hash_delete((struct _sd_hash_hd *)cur_ent,
_sd_htable);
cur_ent->cc_aging_dm |= BAD_CHAIN_DM;
cur_ent = cur_ent->cc_chain;
}
centry->cc_aging_dm |= BAD_ENTRY_DM; /* this is the problem child */
}
/*
* _sd_setup_category_on_type(_sd_cctl_t *) - Setup the centry CATEGORY based on
* centry TYPE and immediate neighbors. Identify each eligible (i.e. not HASH)
* centry as a host/parasite. Hosts actually have memory allocated to
* them and parasites are chained to the host and point to page offsets within
* the host's memory.
*
* RETURNS:
* 0 on success, EINTR if inconsistency detected in centry
*
* Note:
* none
*/
static int
_sd_setup_category_on_type(_sd_cctl_t *header)
{
_sd_cctl_t *prev_ent, *next_ent, *centry;
_sd_cctl_t *anchor = NULL;
int current_pest_count, local_max_dyn_list;
int cl;
int ret = 0;
ASSERT(header);
if (sdbc_use_dmchain)
local_max_dyn_list = max_dm_queues - 1;
else {
/* pickup a fresh copy - has the world changed */
local_max_dyn_list = dynmem_processing_dm.max_dyn_list;
}
prev_ent = 0;
centry = header;
next_ent = centry->cc_chain;
current_pest_count = 0;
cl = 2;
/* try to recover from bad cctl */
if (sdbc_check_cot && !sdbc_check_cctl_cot(centry))
ret = EINTR;
while (cl && (ret == 0)) {
switch (cl) {
case (1): /* chain to next/monitor for completion */
prev_ent = centry;
centry = next_ent;
next_ent = 0;
cl = 0;
if (centry) {
if (sdbc_check_cot &&
!sdbc_check_cctl_cot(centry)) {
ret = EINTR;
break;
}
next_ent = centry->cc_chain;
cl = 2;
}
break;
case (2): /* vector to appropriate routine */
if (!(centry->cc_aging_dm & ELIGIBLE_ENTRY_DM))
cl = 5;
else if (prev_ent && (prev_ent->cc_aging_dm &
ELIGIBLE_ENTRY_DM))
cl = 15;
else
cl = 10;
break;
case (5): /* process NON-ELIGIBLE entries */
if (!(centry->cc_aging_dm &
(HASH_ENTRY_DM|HOLD_ENTRY_DM))) {
/* no category */
/* consistency check */
if (centry->cc_alloc_size_dm ||
centry->cc_data) {
cmn_err(cmn_level,
"sdbc(setup_cot): "
"OTHER with data/size %p",
(void *)centry);
ret = EINTR;
break;
}
centry->cc_aging_dm &=
~CATAGORY_ENTRY_DM;
centry->cc_alloc_size_dm = BLK_SIZE(1);
DTRACE_PROBE1(_sd_setup_category,
_sd_cctl_t *, centry);
}
cl = 1;
break;
/*
* no prev entry (i.e. top of list) or no prev
* ELIGIBLE entry
*/
case (10):
/*
* this is an eligible entry, does it start
* a list or is it a loner
*/
/* consistency check */
if (centry->cc_alloc_size_dm ||
centry->cc_data) {
cmn_err(cmn_level, "sdbc(setup_cot): "
"HOST with data/size %p",
(void *)centry);
ret = EINTR;
break;
}
if (next_ent && (next_ent->cc_aging_dm &
ELIGIBLE_ENTRY_DM)) {
/* it starts a list */
/* host category */
centry->cc_aging_dm |= HOST_ENTRY_DM;
/* start out with one page */
centry->cc_alloc_size_dm = BLK_SIZE(1);
anchor = centry;
DTRACE_PROBE1(_sd_setup_category,
_sd_cctl_t *, anchor);
cl = 1;
} else {
/*
* it's a loner
* drop status to no category and
* restart
*/
cl = 2;
centry->cc_aging_dm &=
~ELIGIBLE_ENTRY_DM;
}
break;
case (15): /* default to parasite category */
/* consistency check */
if (centry->cc_alloc_size_dm ||
centry->cc_data) {
cmn_err(cmn_level, "sdbc(setup_cot): "
"PARA with data/size %p",
(void *)centry);
ret = EINTR;
break;
}
if (current_pest_count < local_max_dyn_list-1) {
/* continue to grow the pest list */
current_pest_count++;
centry->cc_aging_dm |=
PARASITIC_ENTRY_DM;
/*
* offset of host entry mem this will
* point to
*/
centry->cc_alloc_size_dm =
anchor->cc_alloc_size_dm;
/*
* up the host mem req by one for
* this parasite
*/
DTRACE_PROBE1(_sd_setup_category,
_sd_cctl_t *, centry);
anchor->cc_alloc_size_dm += BLK_SIZE(1);
cl = 1;
} else {
/*
* term this pest list - restart fresh
* on this entry
*/
current_pest_count = 0;
prev_ent->cc_aging_dm &=
~(HOST_ENTRY_DM|ELIGIBLE_ENTRY_DM);
cl = 2;
}
break;
} /* switch(cl) */
} /* while (cl) */
if (ret != 0)
sdbc_mark_cctl_cot(header, centry);
return (ret);
}
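/*
 * Illustrative example of the state machine's effect (assuming the
 * consistency checks pass and local_max_dyn_list is large enough):
 * an incoming chain typed
 *
 *	ELIGIBLE - ELIGIBLE - ELIGIBLE - HASH - ELIGIBLE
 *
 * is categorized as
 *
 *	HOST - PARASITE - PARASITE - HASH - OTHER
 *
 * The HOST accumulates cc_alloc_size_dm = BLK_SIZE(3) (one block for
 * itself plus one per parasite), each PARASITE's cc_alloc_size_dm
 * temporarily holds its byte offset into the future host allocation,
 * and the trailing loner is dropped to "no category" (OTHER) with
 * cc_alloc_size_dm = BLK_SIZE(1).
 */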
/*
* _sd_setup_mem_chaining(_sd_cctl_t *) - Allocate memory, set up
* mem ptrs and host/parasite chaining. Do the actual allocation as described
* in _sd_setup_category_on_type().
*
* RETURNS:
* 0 on success
* non-zero on error
*
* Note:
* if called with ALLOC_NOWAIT, caller must check for non-zero return
*/
static int
_sd_setup_mem_chaining(_sd_cctl_t *header, int flag)
{
_sd_cctl_t *prev_ent, *next_ent, *centry;
_sd_cctl_t *anchor = NULL;
int cl, rc = 0;
ASSERT(header);
if (!header)
return (0);
prev_ent = 0;
centry = header;
next_ent = centry->cc_chain;
cl = 2;
while (cl) {
switch (cl) {
case (1): /* chain to next/monitor for completion */
centry->cc_aging_dm &= ~ELIGIBLE_ENTRY_DM;
prev_ent = centry;
centry = next_ent;
next_ent = 0;
cl = 0;
if (centry) {
next_ent = centry->cc_chain;
cl = 2;
}
break;
case (2): /* vector to appropriate routine */
if (centry->cc_aging_dm & HOST_ENTRY_DM)
cl = 10;
else if (centry->cc_aging_dm &
PARASITIC_ENTRY_DM)
cl = 15;
else
cl = 5;
break;
case (5): /* OTHER processing - alloc mem */
if (rc = sdbc_centry_memalloc_dm(centry,
centry->cc_alloc_size_dm, flag))
/* The allocation failed */
cl = 0;
else
cl = 1;
break;
/*
* HOST entry processing - save the anchor pt,
* alloc the memory,
*/
case (10): /* setup head and nxt ptrs */
anchor = centry;
if (rc = sdbc_centry_memalloc_dm(centry,
centry->cc_alloc_size_dm, flag))
/* The allocation failed */
cl = 0;
else
cl = 1;
break;
/*
* PARASITIC entry processing - setup w/no
* memory, setup head/next ptrs,
*/
case (15):
/*
* fudge the data mem ptr to an offset from
* the anchor alloc
*/
if (!(centry->cc_aging_dm &
(HASH_ENTRY_DM| HOLD_ENTRY_DM))) {
centry->cc_head_dm = anchor;
/* chain prev to this */
prev_ent->cc_next_dm = centry;
/*
* generate the actual data ptr into
* host entry memory
*/
centry->cc_data = anchor->cc_data +
centry->cc_alloc_size_dm;
centry->cc_alloc_size_dm = 0;
}
cl = 1;
break;
} /* switch(cl) */
} /* while (cl) */
return (rc);
}
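/*
 * Illustrative sketch of the resulting layout for the HOST example
 * above: the host owns a single kmem allocation and each parasite's
 * cc_data is derived as an offset into it:
 *
 *	host->cc_data	(kmem_alloc of BLK_SIZE(3) bytes)
 *	para1->cc_data = host->cc_data + BLK_SIZE(1)
 *	para2->cc_data = host->cc_data + BLK_SIZE(2)
 *
 * cc_head_dm of all three entries points at the host, each parasite's
 * cc_alloc_size_dm is zeroed once its cc_data is derived, and
 * cc_next_dm chains host -> para1 -> para2.
 */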
/*
* _sd_check_buffer_alloc - Check if buffer allocation is invalid.
*
* RETURNS:
* 0 if it is ok to continue with the allocation.
* Else errno to be returned to the user.
*
* Note:
* This routine could block if the device is not local and
* recovery is in progress.
*/
/* ARGSUSED */
static int
_sd_check_buffer_alloc(int cd, nsc_off_t fba_pos, nsc_size_t fba_len,
_sd_buf_handle_t **hp)
{
/*
* This check exists to ensure that someone will not pass in an
* arbitrary pointer and try to pass it off as a handle.
*/
if ((*hp)->bh_flag & (~_SD_VALID_FLAGS)) {
cmn_err(CE_WARN, "sdbc(_sd_check_buffer_alloc) "
"cd %d invalid handle %p flags %x",
cd, (void *)*hp, (*hp)->bh_flag);
return (EINVAL);
}
if ((_sd_cache_initialized == 0) || (FILE_OPENED(cd) == 0)) {
cmn_err(CE_WARN, "sdbc(_sd_check_buffer_alloc) "
"cd %d not open. Cache init %d",
cd, _sd_cache_initialized);
return (EINVAL);
}
ASSERT(cd >= 0);
if (!(_sd_cache_files[cd].cd_rawfd) ||
!nsc_held(_sd_cache_files[cd].cd_rawfd)) {
cmn_err(CE_WARN,
"sdbc(_sd_check_buffer_alloc) cd %d is not attached", cd);
return (EINVAL);
}
ASSERT_IO_SIZE(fba_pos, fba_len, cd);
ASSERT_LEN(fba_len);
return (0);
}
/*
* sdbc_check_handle -- check that handle is valid
* return 1 if ok, 0 otherwise (if debug then panic).
*/
static int
sdbc_check_handle(_sd_buf_handle_t *handle)
{
int ret = 1;
if (!_SD_HANDLE_ACTIVE(handle)) {
cmn_err(cmn_level, "sdbc(_sd_free_buf): invalid handle %p"
"cd %d fpos %" NSC_SZFMT " flen %" NSC_SZFMT " flag %x",
(void *)handle, HANDLE_CD(handle), handle->bh_fba_pos,
handle->bh_fba_len, handle->bh_flag);
ret = 0;
}
return (ret);
}
/*
* _sd_free_buf - Free the buffers allocated in _sd_alloc_buf.
*
* ARGUMENTS:
* handle - The handle allocated in _sd_alloc_buf.
*
* RETURNS:
* 0 on success.
* Else errno.
*
* NOTE:
* If handle was allocated through _sd_alloc_buf, the handle allocated
* flag (NSC_HALLOCATED) will be reset by _sd_alloc_buf. This indicates
* that _sd_free_buf should free up the handle as well.
* All other handles directly allocated from _sd_alloc_handle will have
* that flag set. Any handle with valid blocks will have the handle
* active flag. It is an error if the active flag is not set.
* (if free_buf were called without going through alloc_buf)
*/
int
_sd_free_buf(_sd_buf_handle_t *handle)
{
_sd_cctl_t *centry, *cc_chain;
int cd = HANDLE_CD(handle);
int flen = handle->bh_fba_len;
int fpos = handle->bh_fba_pos;
SDTRACE(ST_ENTER|SDF_FREEBUF, HANDLE_CD(handle),
handle->bh_fba_len, handle->bh_fba_pos, 0, 0);
if (sdbc_check_handle(handle) == 0)
return (EINVAL);
if (handle->bh_flag & NSC_MIXED) {
/*
* Data in this handle will be a mix of data from the
* source device and data from another device, so
* invalidate all the blocks.
*/
handle->bh_flag &= ~NSC_QUEUE;
centry = handle->bh_centry;
while (centry) {
centry->cc_valid = 0;
centry = centry->cc_chain;
}
}
if ((handle->bh_flag & NSC_QUEUE)) {
handle->bh_flag &= ~NSC_QUEUE;
_sd_queue_write(handle, handle->bh_fba_pos, handle->bh_fba_len);
}
handle->bh_flag &= ~NSC_HACTIVE;
centry = handle->bh_centry;
while (centry) {
cc_chain = centry->cc_chain;
_sd_centry_release(centry);
centry = cc_chain;
}
/*
* help prevent a duplicate call to _sd_centry_release if this handle
* is erroneously _sd_free_buf'd twice (should not happen).
*/
handle->bh_centry = NULL;
if ((handle->bh_flag & NSC_HALLOCATED) == 0) {
handle->bh_flag |= NSC_HALLOCATED;
(void) _sd_free_handle(handle);
} else {
handle->bh_flag = NSC_HALLOCATED;
}
SDTRACE(ST_EXIT|SDF_FREEBUF, cd, flen, fpos, 0, 0);
return (0);
}
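/*
 * Illustrative handle lifecycles (sketch): a handle obtained directly
 * from _sd_alloc_handle() keeps NSC_HALLOCATED set, so _sd_free_buf()
 * merely resets it to the allocated state for reuse; a handle
 * allocated internally by _sd_alloc_buf() has the flag clear, so
 * _sd_free_buf() also calls _sd_free_handle() to dispose of it.
 */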
static int _sd_lruq_srch = 0x2000;
/*
* sdbc_get_dmchain -- get a candidate centry chain pointing to
* contiguous memory
* ARGUMENTS:
* cblocks - number of cache blocks requested
* stall - pointer to stall count (no blocks avail)
* flag - ALLOC_NOWAIT flag
*
* RETURNS:
* a cache entry or possibly NULL if ALLOC_NOWAIT set
* USAGE:
* attempt to satisfy the entire request from the queue
* that has no memory allocated (queue 0).
* if this fails then attempt a partial allocation
* with a preallocated chain of the requested size, up to
* max_dyn_list.
* then look for the largest chain smaller than max_dyn_list.
*/
static _sd_cctl_t *
sdbc_get_dmchain(int cblocks, int *stall, int flag)
{
_sd_cctl_t *cc_dmchain = NULL;
_sd_queue_t *q;
_sd_cctl_t *qhead;
int num_tries;
int cblocks_orig = cblocks;
int nowait = flag & ALLOC_NOWAIT;
int i;
num_tries = _sd_lruq_srch;
ASSERT(cblocks != 0);
while (!cc_dmchain) {
/* get it from the os if possible */
q = &sdbc_dm_queues[0];
qhead = &(q->sq_qhead);
if (q->sq_inq >= cblocks) {
mutex_enter(&q->sq_qlock);
if (q->sq_inq >= cblocks) {
_sd_cctl_t *cc_ent;
cc_dmchain = qhead->cc_next;
/*
* set the inuse and pageio bits
* Note: this code expects the cc_ent to
* be available. no other thread may set the
* inuse or pageio bit for an entry on the
* 0 queue.
*/
cc_ent = qhead;
for (i = 0; i < cblocks; ++i) {
cc_ent = cc_ent->cc_next;
if (SET_CENTRY_INUSE(cc_ent)) {
cmn_err(CE_PANIC,
"centry inuse on 0 q! %p",
(void *)cc_ent);
}
if (SET_CENTRY_PAGEIO(cc_ent)) {
cmn_err(CE_PANIC,
"centry pageio on 0 q! %p",
(void *)cc_ent);
}
}
/* got a dmchain */
/* remove this chain from the 0 queue */
cc_dmchain->cc_prev->cc_next = cc_ent->cc_next;
cc_ent->cc_next->cc_prev = cc_dmchain->cc_prev;
cc_dmchain->cc_prev = NULL;
cc_ent->cc_next = NULL;
q->sq_inq -= cblocks;
ASSERT(GOOD_LRUSIZE(q));
}
mutex_exit(&q->sq_qlock);
if (cc_dmchain)
continue;
}
/* look for a pre-allocated block of the requested size */
if (cblocks > (max_dm_queues - 1))
cblocks = max_dm_queues - 1;
q = &sdbc_dm_queues[cblocks];
qhead = &(q->sq_qhead);
if (q->sq_inq != 0) {
_sd_cctl_t *tmp_dmchain;
mutex_enter(&q->sq_qlock);
for (tmp_dmchain = qhead->cc_next; tmp_dmchain != qhead;
tmp_dmchain = tmp_dmchain->cc_next) {
/*
* get a dmchain
* set the inuse and pageio bits
*/
if (sdbc_dmchain_avail(tmp_dmchain)) {
/* put on MRU end of queue */
sdbc_requeue_dmchain(q, tmp_dmchain,
1, 0);
cc_dmchain = tmp_dmchain;
break;
}
sdbc_dmchain_not_avail++;
}
mutex_exit(&q->sq_qlock);
if (cc_dmchain)
continue;
}
/*
* spin block
* nudge the deallocator, accelerate aging
*/
mutex_enter(&dynmem_processing_dm.thread_dm_lock);
cv_broadcast(&dynmem_processing_dm.thread_dm_cv);
mutex_exit(&dynmem_processing_dm.thread_dm_lock);
if (nowait)
break;
if (!(--num_tries)) {
delay(drv_usectohz(20000));
(void) (*stall)++;
num_tries = _sd_lruq_srch;
cblocks = cblocks_orig;
} else { /* see if smaller request size is available */
if (!(--cblocks))
cblocks = cblocks_orig;
}
} /* while (!cc_dmchain) */
return (cc_dmchain);
}
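/*
 * Illustrative walk of the fallback order above (assuming
 * max_dm_queues == 5, i.e. preallocated chains of 1..4 blocks) for a
 * request of 6 cache blocks:
 *
 *	pass 1:	queue 0 for 6 unattached entries, else queue 4
 *		(request clamped to largest preallocated size),
 *		then cblocks is decremented
 *	pass 2:	queue 0 for 3 entries, else queue 3
 *	pass 3:	queue 0 for 2 entries, else queue 2
 *	pass 4:	queue 0 for 1 entry, else queue 1; cblocks then
 *		resets to the original 6
 *
 * after _sd_lruq_srch passes the loop sleeps 20ms, bumps *stall and
 * starts over; with ALLOC_NOWAIT it returns NULL after the first
 * failed pass.
 */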
static int
sdbc_dmchain_avail(_sd_cctl_t *cc_ent)
{
int chain_avail = 1;
_sd_cctl_t *anchor = cc_ent;
while (cc_ent) {
ASSERT(_sd_cctl_valid(cc_ent));
if (cc_ent->cc_aging_dm & BAD_CHAIN_DM) {
chain_avail = 0;
break;
}
if (CENTRY_DIRTY(cc_ent)) {
chain_avail = 0;
break;
}
if (SET_CENTRY_INUSE(cc_ent)) {
chain_avail = 0;
break;
}
if ((SET_CENTRY_PAGEIO(cc_ent))) {
CLEAR_CENTRY_INUSE(cc_ent);
chain_avail = 0;
break;
}
if (CENTRY_DIRTY(cc_ent)) {
CLEAR_CENTRY_PAGEIO(cc_ent);
CLEAR_CENTRY_INUSE(cc_ent);
chain_avail = 0;
break;
}
cc_ent->cc_flag = 0;
cc_ent->cc_toflush = 0;
cc_ent = cc_ent->cc_next_dm;
}
if (!chain_avail)
sdbc_clear_dmchain(anchor, cc_ent);
else {
cc_ent = anchor;
/*
* prevent possible deadlocks in _sd_cc_wait():
* remove from hash and wakeup any waiters now that we
* have acquired the chain.
*/
while (cc_ent) {
(void) _sd_hash_delete((struct _sd_hash_hd *)cc_ent,
_sd_htable);
mutex_enter(&cc_ent->cc_lock);
if (cc_ent->cc_await_use) {
cv_broadcast(&cc_ent->cc_blkcv);
}
mutex_exit(&cc_ent->cc_lock);
cc_ent->cc_creat = nsc_lbolt();
cc_ent->cc_hits = 0;
cc_ent = cc_ent->cc_next_dm;
}
}
return (chain_avail);
}
static void
sdbc_clear_dmchain(_sd_cctl_t *cc_ent_start, _sd_cctl_t *cc_ent_end)
{
_sd_cctl_t *cc_ent = cc_ent_start;
_sd_cctl_t *prev_ent;
ASSERT(_sd_cctl_valid(cc_ent));
while (cc_ent != cc_ent_end) {
ASSERT(_sd_cctl_valid(cc_ent));
prev_ent = cc_ent;
cc_ent = cc_ent->cc_next_dm;
CLEAR_CENTRY_PAGEIO(prev_ent);
CLEAR_CENTRY_INUSE(prev_ent);
}
}
/*
* put a dmchain on the LRU end of a queue
*/
void
sdbc_ins_dmqueue_front(_sd_queue_t *q, _sd_cctl_t *cc_ent)
{
_sd_cctl_t *qhead = &(q->sq_qhead);
ASSERT(_sd_cctl_valid(cc_ent));
mutex_enter(&q->sq_qlock);
cc_ent->cc_next = qhead->cc_next;
cc_ent->cc_prev = qhead;
qhead->cc_next->cc_prev = cc_ent;
qhead->cc_next = cc_ent;
q->sq_inq++;
cc_ent->cc_cblocks = q->sq_dmchain_cblocks;
ASSERT(GOOD_LRUSIZE(q));
mutex_exit(&q->sq_qlock);
}
/*
* put a dmchain on the MRU end of a queue
*/
static void
sdbc_ins_dmqueue_back(_sd_queue_t *q, _sd_cctl_t *cc_ent)
{
_sd_cctl_t *qhead = &(q->sq_qhead);
ASSERT(_sd_cctl_valid(cc_ent));
mutex_enter(&q->sq_qlock);
cc_ent->cc_next = qhead;
cc_ent->cc_prev = qhead->cc_prev;
qhead->cc_prev->cc_next = cc_ent;
qhead->cc_prev = cc_ent;
cc_ent->cc_seq = q->sq_seq++;
q->sq_inq++;
cc_ent->cc_cblocks = q->sq_dmchain_cblocks;
ASSERT(GOOD_LRUSIZE(q));
mutex_exit(&q->sq_qlock);
}
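/*
 * Illustrative picture of a dm queue: circular, with sq_qhead as a
 * sentinel; the LRU end is qhead->cc_next and the MRU end is
 * qhead->cc_prev:
 *
 *	qhead <-> A (LRU) <-> B <-> C (MRU) <-> qhead
 */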
/*
* remove dmchain from a queue
*/
void
sdbc_remq_dmchain(_sd_queue_t *q, _sd_cctl_t *cc_ent)
{
ASSERT(_sd_cctl_valid(cc_ent));
mutex_enter(&q->sq_qlock);
cc_ent->cc_prev->cc_next = cc_ent->cc_next;
cc_ent->cc_next->cc_prev = cc_ent->cc_prev;
cc_ent->cc_next = cc_ent->cc_prev = NULL; /* defensive programming */
cc_ent->cc_cblocks = -1; /* indicate not on any queue */
q->sq_inq--;
ASSERT(GOOD_LRUSIZE(q));
mutex_exit(&q->sq_qlock);
}
/*
* requeue a dmchain to the MRU end of its queue.
* if getlock is 0 on entry the queue lock (sq_qlock) must be held
*/
void
sdbc_requeue_dmchain(_sd_queue_t *q, _sd_cctl_t *cc_ent, int mru,
int getlock)
{
_sd_cctl_t *qhead = &(q->sq_qhead);
ASSERT(_sd_cctl_valid(cc_ent));
if (getlock)
mutex_enter(&q->sq_qlock);
/* inline of sdbc_remq_dmchain() */
cc_ent->cc_prev->cc_next = cc_ent->cc_next;
cc_ent->cc_next->cc_prev = cc_ent->cc_prev;
if (mru) { /* put on MRU end of queue */
/* inline of sdbc_ins_dmqueue_back */
cc_ent->cc_next = qhead;
cc_ent->cc_prev = qhead->cc_prev;
qhead->cc_prev->cc_next = cc_ent;
qhead->cc_prev = cc_ent;
cc_ent->cc_seq = q->sq_seq++;
(q->sq_req_stat)++;
} else { /* put on LRU end of queue i.e. requeue to head */
/* inline of sdbc_ins_dmqueue_front */
cc_ent->cc_next = qhead->cc_next;
cc_ent->cc_prev = qhead;
qhead->cc_next->cc_prev = cc_ent;
qhead->cc_next = cc_ent;
cc_ent->cc_seq = q->sq_seq++;
/*
* clear the CC_QHEAD bit on all members of the chain
*/
{
_sd_cctl_t *tcent;
for (tcent = cc_ent; tcent; tcent = tcent->cc_next_dm)
tcent->cc_flag &= ~CC_QHEAD;
}
}
if (getlock)
mutex_exit(&q->sq_qlock);
}
/*
* sdbc_dmchain_dirty(cc_ent)
* return first dirty cc_ent in dmchain, NULL if chain is not dirty
*/
static _sd_cctl_t *
sdbc_dmchain_dirty(_sd_cctl_t *cc_ent)
{
for (/* CSTYLED */; cc_ent; cc_ent = cc_ent->cc_next_dm)
if (CENTRY_DIRTY(cc_ent))
break;
return (cc_ent);
}
/*
* sdbc_requeue_head_dm_try()
* attempt to requeue a dmchain to the head of the queue
*/
void
sdbc_requeue_head_dm_try(_sd_cctl_t *cc_ent)
{
int qidx;
_sd_queue_t *q;
if (!sdbc_dmchain_dirty(cc_ent)) {
qidx = cc_ent->cc_cblocks;
q = &sdbc_dm_queues[qidx];
sdbc_requeue_dmchain(q, cc_ent, 0, 1); /* requeue head */
}
}
/*
* sdbc_centry_alloc_blks -- allocate cache entries with memory
*
* ARGUMENTS:
* cd - Cache descriptor (from a previous open)
* cblk - cache block number.
* reqblks - number of cache blocks to be allocated
* flag - can be ALLOC_NOWAIT
* RETURNS:
* A cache block chain or NULL if ALLOC_NOWAIT and request fails
*
* Note: caller must check for null return if called with
* ALLOC_NOWAIT set.
*/
_sd_cctl_t *
sdbc_centry_alloc_blks(int cd, nsc_off_t cblk, nsc_size_t reqblks, int flag)
{
sdbc_allocbuf_t alloc_tok = {0}; /* must be 0 */
int stall = 0;
_sd_cctl_t *centry = NULL;
_sd_cctl_t *lentry = NULL;
_sd_cctl_t *anchor = NULL;
_sd_cctl_t *next_centry;
ASSERT(reqblks);
while (reqblks) {
centry = sdbc_centry_alloc(cd, cblk, reqblks, &stall,
&alloc_tok, flag);
if (!centry)
break;
centry->cc_chain = NULL;
if (lentry == NULL)
anchor = centry;
else
lentry->cc_chain = centry;
lentry = centry;
centry->cc_aging_dm &= ~(ENTRY_FIELD_DM);
if (centry->cc_aging_dm & FOUND_IN_HASH_DM)
centry->cc_aging_dm |= HASH_ENTRY_DM;
else
if (centry->cc_aging_dm & FOUND_HOLD_OVER_DM)
centry->cc_aging_dm |= HOLD_ENTRY_DM;
else
centry->cc_aging_dm |= ELIGIBLE_ENTRY_DM;
centry->cc_aging_dm &= ~(FOUND_IN_HASH_DM|FOUND_HOLD_OVER_DM);
--reqblks;
}
sdbc_centry_alloc_end(&alloc_tok);
if (reqblks || (_sd_setup_category_on_type(anchor))) {
centry = anchor;
while (centry) {
next_centry = centry->cc_chain;
_sd_centry_release(centry);
centry = next_centry;
}
anchor = NULL;
} else
/* This is where the memory is actually allocated */
if (_sd_setup_mem_chaining(anchor, flag))
anchor = NULL;
return (anchor);
}
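/*
 * Illustrative caller sketch (cf. sdbc_doread_prefetch()): allocate a
 * single fully backed block without blocking, checking for failure:
 *
 *	_sd_cctl_t *cc;
 *
 *	if ((cc = sdbc_centry_alloc_blks(cd, cblk, 1,
 *	    ALLOC_NOWAIT)) != NULL) {
 *		... use cc->cc_data ...
 *		_sd_centry_release(cc);
 *	}
 */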
/*
* sdbc_centry_alloc - sdbc internal function to allocate a new cache block.
*
* ARGUMENTS:
* cd - Cache descriptor (from a previous open)
* cblk - cache block number.
* stall - pointer to stall count (no blocks avail)
* req_blocks - number of cache blocks remaining in caller's i/o request
* alloc_tok - pointer to token initialized to 0 on first call to function
* flag - lock status of sdbc_queue_lock or ALLOC_NOWAIT flag
* RETURNS:
* A cache block, or possibly NULL if ALLOC_NOWAIT set.
*
* USAGE:
* switch to the appropriate allocation function.
* this function is used when callers need more than one cache block.
* it is called repeatedly until the entire request is satisfied,
* at which time the caller will then do the memory allocation.
* if only one cache block is needed callers may use
* sdbc_centry_alloc_blks() which also allocates memory.
*
* Note: caller must check for null return if called with
* ALLOC_NOWAIT set.
*/
_sd_cctl_t *
sdbc_centry_alloc(int cd, nsc_off_t cblk, nsc_size_t req_blocks, int *stall,
sdbc_allocbuf_t *alloc_tok, int flag)
{
_sd_cctl_t *centry;
if (sdbc_use_dmchain)
centry = sdbc_alloc_dmc(cd, cblk, req_blocks, stall, alloc_tok,
flag);
else
centry = sdbc_alloc_lru(cd, cblk, stall, flag);
return (centry);
}
/*
* sdbc_alloc_dmc -- allocate a centry from a dmchain
*
* ARGUMENTS:
* cd - Cache descriptor (from a previous open)
* cblk - cache block number.
* stall - pointer to stall count (no blocks avail)
* req_blocks - number of cache blocks in client's i/o request
* alloc_tok - pointer to token initialized to 0 on first call to function
* flag - lock status of sdbc_queue_lock, or ALLOC_NOWAIT flag
* RETURNS:
* A cache block or possibly NULL if ALLOC_NOWAIT set
*
* USAGE:
* if dmchain is empty, allocate one.
*/
static _sd_cctl_t *
sdbc_alloc_dmc(int cd, nsc_off_t cblk, nsc_size_t req_blocks, int *stall,
sdbc_allocbuf_t *alloc_tok, int flag)
{
sdbc_allocbuf_impl_t *dmc = (sdbc_allocbuf_impl_t *)alloc_tok;
_sd_cctl_t *centry = NULL;
if (!dmc->sab_dmchain) {
/*
* Note - sdbc_get_dmchain() returns
* with cc_inuse and cc_pageio set
* for all members of dmchain.
*/
if (dmc->sab_dmchain =
sdbc_get_dmchain(req_blocks, stall, flag)) {
/* remember q it came from */
if (dmc->sab_dmchain->cc_alloc_size_dm)
dmc->sab_q = dmc->sab_dmchain->cc_cblocks;
}
}
/*
* Note: dmchain pointer is advanced in sdbc_alloc_from_dmchain()
*/
if (dmc->sab_dmchain) /* could be NULL if ALLOC_NOWAIT set */
centry = sdbc_alloc_from_dmchain(cd, cblk, alloc_tok, flag);
return (centry);
}
/*
* sdbc_alloc_from_dmchain -- allocate centry from a dmchain of centrys
*
* ARGUMENTS:
* cd - Cache descriptor (from a previous open)
* cblk - cache block number.
* alloc_tok - pointer to token
* flag - lock status of sdbc_queue_lock or ALLOC_NOWAIT
*
* RETURNS:
* A cache block or possibly NULL if ALLOC_NOWAIT set.
*
* USAGE:
* This routine allocates a new cache block from the supplied dmchain.
* Assumes that dmchain is non-NULL and that all cache entries in
* the dmchain have been removed from hash and have their cc_inuse and
* cc_pageio bits set.
*/
static _sd_cctl_t *
sdbc_alloc_from_dmchain(int cd, nsc_off_t cblk, sdbc_allocbuf_t *alloc_tok,
int flag)
{
_sd_cctl_t *cc_ent, *old_ent;
int categorize_centry;
int locked = flag & ALLOC_LOCKED;
int nowait = flag & ALLOC_NOWAIT;
sdbc_allocbuf_impl_t *dmc = (sdbc_allocbuf_impl_t *)alloc_tok;
SDTRACE(ST_ENTER|SDF_ENT_ALLOC, cd, 0, BLK_TO_FBA_NUM(cblk), 0, 0);
ASSERT(dmc->sab_dmchain);
cc_ent = dmc->sab_dmchain;
ASSERT(_sd_cctl_valid(cc_ent));
cc_ent->cc_valid = 0;
categorize_centry = 0;
if (cc_ent->cc_data)
categorize_centry = FOUND_HOLD_OVER_DM;
alloc_try:
if (cd == _CD_NOHASH)
CENTRY_BLK(cc_ent) = cblk;
else if ((old_ent = (_sd_cctl_t *)
_sd_hash_insert(cd, cblk, (struct _sd_hash_hd *)cc_ent,
_sd_htable)) != cc_ent) {
if (SET_CENTRY_INUSE(old_ent)) {
sdbc_centry_inuse++;
if (nowait) {
cc_ent = NULL;
goto out;
}
if (locked)
rw_exit(&sdbc_queue_lock);
_sd_cc_wait(cd, cblk, old_ent, CC_INUSE);
if (locked)
rw_enter(&sdbc_queue_lock, RW_WRITER);
goto alloc_try;
}
/*
* bug 4529671
* now that we own the centry make sure that
* it is still good. it could have been processed
* by _sd_dealloc_dm() in the window between
* _sd_hash_insert() and SET_CENTRY_INUSE().
*/
if ((_sd_cctl_t *)_sd_hash_search(cd, cblk, _sd_htable)
!= old_ent) {
sdbc_centry_deallocd++;
#ifdef DEBUG
cmn_err(CE_WARN, "cc_ent %p cd %d cblk %" NSC_SZFMT
" lost to dealloc?! cc_data %p", (void *)old_ent,
cd, cblk, (void *)old_ent->cc_data);
#endif
CLEAR_CENTRY_INUSE(old_ent);
if (nowait) {
cc_ent = NULL;
goto out;
}
goto alloc_try;
}
if (CC_CD_BLK_MATCH(cd, cblk, old_ent)) {
sdbc_centry_hit++;
old_ent->cc_toflush = 0;
/* _sd_centry_release(cc_ent); */
cc_ent = old_ent;
categorize_centry = FOUND_IN_HASH_DM;
} else {
sdbc_centry_lost++;
CLEAR_CENTRY_INUSE(old_ent);
if (nowait) {
cc_ent = NULL;
goto out;
}
goto alloc_try;
}
}
/*
* advance the dmchain pointer, but only if we got the
* cc_ent from the dmchain
*/
if (categorize_centry != FOUND_IN_HASH_DM) {
if (cc_ent->cc_data)
dmc->sab_dmchain = dmc->sab_dmchain->cc_next_dm;
else
dmc->sab_dmchain = dmc->sab_dmchain->cc_next;
}
SDTRACE(ST_EXIT|SDF_ENT_ALLOC, cd, 0,
BLK_TO_FBA_NUM(cblk), 0, 0);
mutex_enter(&cc_ent->cc_lock);
if (cc_ent->cc_await_use) {
cv_broadcast(&cc_ent->cc_blkcv);
}
mutex_exit(&cc_ent->cc_lock);
sdbc_centry_init_dm(cc_ent);
cc_ent->cc_aging_dm |= categorize_centry;
out:
SDTRACE(ST_INFO|SDF_ENT_ALLOC, cd, 0, BLK_TO_FBA_NUM(cblk), 0, 0);
return (cc_ent);
}
/*
* sdbc_centry_alloc_end -- tidy up after all cache blocks have been
* allocated for a request
* ARGUMENTS:
* alloc_tok - pointer to allocation token
* RETURNS
* nothing
* USAGE:
* at this time only useful when sdbc_use_dmchain is true.
* if there are cache blocks remaining on the chain then the inuse and
* pageio bits must be cleared (they were set in sdbc_get_dmchain()).
*
*/
static void
sdbc_centry_alloc_end(sdbc_allocbuf_t *alloc_tok)
{
_sd_cctl_t *next_centry;
_sd_cctl_t *prev_centry;
_sd_queue_t *q;
sdbc_allocbuf_impl_t *dmc = (sdbc_allocbuf_impl_t *)alloc_tok;
#ifdef DEBUG
int chainpull = 0;
#endif
if (!sdbc_use_dmchain)
return;
next_centry = dmc->sab_dmchain;
while (next_centry != NULL) {
CLEAR_CENTRY_PAGEIO(next_centry);
prev_centry = next_centry;
if (next_centry->cc_data) {
#ifdef DEBUG
++chainpull;
#endif
next_centry = next_centry->cc_next_dm;
/* clear bit after final reference */
CLEAR_CENTRY_INUSE(prev_centry);
} else {
next_centry = next_centry->cc_next;
/*
* a floater from the 0 queue, insert on q.
*
* since this centry is not on any queue
* the inuse bit can be cleared before
* inserting on the q. this is also required
* since sdbc_get_dmchain() does not expect
* inuse bits to be set on 0 queue entries.
*/
CLEAR_CENTRY_INUSE(prev_centry);
q = &sdbc_dm_queues[0];
sdbc_ins_dmqueue_front(q, prev_centry);
}
}
#ifdef DEBUG
/* compute wastage stats */
ASSERT((chainpull >= 0) && (chainpull < max_dm_queues));
if (chainpull)
(*(dmchainpull_table + (dmc->sab_q *
max_dm_queues + chainpull)))++;
#endif
}
/*
* sdbc_alloc_lru - allocate a new cache block from the lru queue
*
* ARGUMENTS:
* cd - Cache descriptor (from a previous open)
* cblk - cache block number.
* stall - pointer to stall count (no blocks avail)
* flag - lock status of sdbc_queue_lock or ALLOC_NOWAIT
*
* RETURNS:
* A cache block or NULL if ALLOC_NOWAIT specified
*
* USAGE:
* This routine allocates a new cache block from the lru.
* If an allocation cannot be done, we block, unless ALLOC_NOWAIT is set.
*/
static _sd_cctl_t *
sdbc_alloc_lru(int cd, nsc_off_t cblk, int *stall, int flag)
{
_sd_cctl_t *cc_ent, *old_ent, *ccnext;
_sd_queue_t *q = _SD_LRU_Q;
_sd_cctl_t *qhead = &(q->sq_qhead);
int tries = 0, num_tries;
int categorize_centry;
int locked = flag & ALLOC_LOCKED;
int nowait = flag & ALLOC_NOWAIT;
if (nowait) {
num_tries = q->sq_inq / 100; /* only search 1% of q */
if (num_tries <= 0) /* ensure num_tries is non-zero */
num_tries = q->sq_inq;
} else
num_tries = _sd_lruq_srch;
SDTRACE(ST_ENTER|SDF_ENT_ALLOC, cd, 0, BLK_TO_FBA_NUM(cblk), 0, 0);
retry_alloc_centry:
for (cc_ent = (qhead->cc_next); cc_ent != qhead; cc_ent = ccnext) {
if (--num_tries <= 0)
if (nowait) {
cc_ent = NULL;
goto out;
} else
break;
ccnext = cc_ent->cc_next;
if (cc_ent->cc_aging_dm & BAD_CHAIN_DM)
continue;
if (CENTRY_DIRTY(cc_ent))
continue;
if (SET_CENTRY_INUSE(cc_ent))
continue;
if (CENTRY_DIRTY(cc_ent)) {
sdbc_centry_lost++;
CLEAR_CENTRY_INUSE(cc_ent);
continue;
}
cc_ent->cc_flag = 0; /* CC_INUSE */
cc_ent->cc_toflush = 0;
/*
* Inlined requeue of the LRU. (should match _sd_requeue)
*/
/* was FAST */
mutex_enter(&q->sq_qlock);
#if defined(_SD_DEBUG)
if (1) {
_sd_cctl_t *cp, *cn, *qp;
cp = cc_ent->cc_prev;
cn = cc_ent->cc_next;
qp = (q->sq_qhead).cc_prev;
if (!_sd_cctl_valid(cc_ent) ||
(cp != &(q->sq_qhead) && !_sd_cctl_valid(cp)) ||
(cn != &(q->sq_qhead) && !_sd_cctl_valid(cn)) ||
!_sd_cctl_valid(qp))
cmn_err(CE_PANIC,
"_sd_centry_alloc %x prev %x next %x qp %x",
cc_ent, cp, cn, qp);
}
#endif
cc_ent->cc_prev->cc_next = cc_ent->cc_next;
cc_ent->cc_next->cc_prev = cc_ent->cc_prev;
cc_ent->cc_next = qhead;
cc_ent->cc_prev = qhead->cc_prev;
qhead->cc_prev->cc_next = cc_ent;
qhead->cc_prev = cc_ent;
cc_ent->cc_seq = q->sq_seq++;
/* was FAST */
mutex_exit(&q->sq_qlock);
/*
* End inlined requeue.
*/
#if defined(_SD_STATS)
if (_sd_hash_delete(cc_ent, _sd_htable) == 0)
SDTRACE(SDF_REPLACE,
CENTRY_CD(cc_ent), cc_ent->cc_hits,
BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
nsc_lbolt(), cc_ent->cc_creat);
cc_ent->cc_creat = nsc_lbolt();
cc_ent->cc_hits = 0;
#else
#if defined(_SD_DEBUG)
if (_sd_hash_delete(cc_ent, _sd_htable) == 0) {
SDTRACE(SDF_REPLACE|ST_DL,
CENTRY_CD(cc_ent),
cc_ent->cc_valid,
BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
cd, BLK_TO_FBA_NUM(cblk));
if (cc_ent->cc_await_use ||
((cd == CENTRY_CD(cc_ent)) &&
(cblk == CENTRY_BLK(cc_ent))))
DATA_LOG(SDF_REPLACE|ST_DL, cc_ent, 0,
BLK_FBAS);
}
#else
(void) _sd_hash_delete((struct _sd_hash_hd *)cc_ent,
_sd_htable);
#endif
#endif
cc_ent->cc_creat = nsc_lbolt();
cc_ent->cc_hits = 0;
cc_ent->cc_valid = 0;
categorize_centry = 0;
if (cc_ent->cc_data)
categorize_centry = FOUND_HOLD_OVER_DM;
alloc_try:
if (cd == _CD_NOHASH)
CENTRY_BLK(cc_ent) = cblk;
else if ((old_ent = (_sd_cctl_t *)
_sd_hash_insert(cd, cblk, (struct _sd_hash_hd *)cc_ent,
_sd_htable)) != cc_ent) {
if (SET_CENTRY_INUSE(old_ent)) {
sdbc_centry_inuse++;
if (nowait) {
_sd_centry_release(cc_ent);
cc_ent = NULL;
goto out;
}
if (locked)
rw_exit(&sdbc_queue_lock);
_sd_cc_wait(cd, cblk, old_ent, CC_INUSE);
if (locked)
rw_enter(&sdbc_queue_lock, RW_WRITER);
goto alloc_try;
}
/*
* bug 4529671
* now that we own the centry make sure that
* it is still good. it could have been processed
* by _sd_dealloc_dm() in the window between
* _sd_hash_insert() and SET_CENTRY_INUSE().
*/
if ((_sd_cctl_t *)
_sd_hash_search(cd, cblk, _sd_htable) != old_ent) {
sdbc_centry_deallocd++;
#ifdef DEBUG
cmn_err(CE_WARN, "cc_ent %p cd %d cblk %"
NSC_SZFMT " lost to dealloc?! cc_data %p",
(void *)old_ent, cd, cblk,
(void *)old_ent->cc_data);
#endif
CLEAR_CENTRY_INUSE(old_ent);
if (nowait) {
_sd_centry_release(cc_ent);
cc_ent = NULL;
goto out;
}
goto alloc_try;
}
if (CC_CD_BLK_MATCH(cd, cblk, old_ent)) {
sdbc_centry_hit++;
old_ent->cc_toflush = 0;
_sd_centry_release(cc_ent);
cc_ent = old_ent;
categorize_centry = FOUND_IN_HASH_DM;
} else {
sdbc_centry_lost++;
CLEAR_CENTRY_INUSE(old_ent);
if (nowait) {
_sd_centry_release(cc_ent);
cc_ent = NULL;
goto out;
}
goto alloc_try;
}
}
SDTRACE(ST_EXIT|SDF_ENT_ALLOC, cd, tries,
BLK_TO_FBA_NUM(cblk), 0, 0);
if (cc_ent->cc_await_use) {
mutex_enter(&cc_ent->cc_lock);
cv_broadcast(&cc_ent->cc_blkcv);
mutex_exit(&cc_ent->cc_lock);
}
sdbc_centry_init_dm(cc_ent);
cc_ent->cc_aging_dm |= categorize_centry;
out:
return (cc_ent);
}
SDTRACE(ST_INFO|SDF_ENT_ALLOC, cd, ++tries, BLK_TO_FBA_NUM(cblk), 0, 0);
delay(drv_usectohz(20000));
(void) (*stall)++;
num_tries = _sd_lruq_srch;
goto retry_alloc_centry;
}
/*
* sdbc_centry_init_dm - setup the cache block for dynamic memory allocation
*
* ARGUMENTS:
* centry - Cache block.
*
* RETURNS:
* NONE
*
* USAGE:
* This routine is the central point at which cache entry blocks are set up
*/
static void
sdbc_centry_init_dm(_sd_cctl_t *centry)
{
/* an entry already set up - don't touch, simply refresh the age */
if (centry->cc_data) {
centry->cc_aging_dm &= ~(FINAL_AGING_DM);
DTRACE_PROBE1(sdbc_centry_init_dm_end,
char *, centry->cc_data);
return;
}
centry->cc_aging_dm &= ~(FINAL_AGING_DM | CATAGORY_ENTRY_DM);
if (centry->cc_head_dm || centry->cc_next_dm)
cmn_err(cmn_level, "sdbc(sdbc_centry_init_dm): "
"non-zero mem chain in ccent %p", (void *)centry);
centry->cc_head_dm = 0;
if (!sdbc_use_dmchain)
centry->cc_next_dm = 0;
centry->cc_data = 0;
}
/*
* sdbc_centry_memalloc_dm
*
* Actually allocate the cache memory, storing it in the cc_data field for
* the cctl
*
* ARGS:
* centry: cache control block for which to allocate the memory
* alloc_request: number of bytes to allocate
* flag: if called with ALLOC_NOWAIT, caller must check for non-zero return
*
* RETURNS:
* 0 on success
* non-zero on error
*/
static int
sdbc_centry_memalloc_dm(_sd_cctl_t *centry, int alloc_request, int flag)
{
int cblocks;
_sd_queue_t *newq;
int sleep;
sleep = (flag & ALLOC_NOWAIT) ? KM_NOSLEEP : KM_SLEEP;
if (!centry->cc_data && (alloc_request > 0)) {
/* host or other */
dynmem_processing_dm.alloc_ct++;
centry->cc_data = (unsigned char *)
kmem_alloc((size_t)centry->cc_alloc_size_dm, sleep);
if (sdbc_use_dmchain) {
cblocks = centry->cc_alloc_size_dm >> _sd_cblock_shift;
newq = &sdbc_dm_queues[cblocks];
/* set the dmqueue index */
centry->cc_cblocks = cblocks;
/* put on appropriate queue */
sdbc_ins_dmqueue_back(newq, centry);
}
/*
* for KM_NOSLEEP (should never happen with KM_SLEEP)
*/
if (!centry->cc_data)
return (LOW_RESOURCES_DM);
centry->cc_head_dm = centry;
centry->cc_alloc_ct_dm++;
}
return (0);
}
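/*
 * Illustrative arithmetic for the dmqueue indexing above (assuming an
 * 8k cache block, i.e. _sd_cblock_shift == 13): a host sized at
 * BLK_SIZE(3) lands on sdbc_dm_queues[3]:
 *
 *	cblocks = (3 * 8192) >> 13 == 3
 */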
/*
* _sd_centry_release - release a cache block
*
* ARGUMENTS:
* centry - Cache block.
*
* RETURNS:
* NONE
*
* USAGE:
* This routine frees up a cache block. It also frees up a write
* block if one is allocated and it is valid to release it.
*/
void
_sd_centry_release(_sd_cctl_t *centry)
{
ss_centry_info_t *wctl;
SDTRACE(ST_ENTER|SDF_ENT_FREE, CENTRY_CD(centry), 0,
BLK_TO_FBA_NUM(CENTRY_BLK(centry)), 0, 0);
CLEAR_CENTRY_PAGEIO(centry);
if ((wctl = centry->cc_write) != 0) {
/* was FAST */
mutex_enter(&centry->cc_lock);
if (CENTRY_DIRTY(centry))
wctl = NULL;
else {
centry->cc_write = NULL;
centry->cc_flag &= ~(CC_PINNABLE);
}
/* was FAST */
mutex_exit(&centry->cc_lock);
if (wctl) {
wctl->sc_dirty = 0;
SSOP_SETCENTRY(sdbc_safestore, wctl);
SSOP_DEALLOCRESOURCE(sdbc_safestore, wctl->sc_res);
}
}
if (!(centry->cc_aging_dm & BAD_CHAIN_DM)) {
if (sdbc_use_dmchain) {
if (centry->cc_alloc_size_dm) {
/* see if this can be queued to head */
if (CENTRY_QHEAD(centry)) {
sdbc_requeue_head_dm_try(centry);
} else {
int qidx;
_sd_queue_t *q;
qidx = centry->cc_cblocks;
q = &sdbc_dm_queues[qidx];
if (_sd_lru_reinsert(q, centry)) {
sdbc_requeue_dmchain(q,
centry, 1, 1);
}
}
} else {
/*
* Fix for bug 4949134:
* If an internal block is marked with CC_QHEAD
* but the HOST block is not, the chain will
* never age properly, and will never be made
* available. Only the HOST of the dmchain is
* checked for CC_QHEAD, so clearing an internal
* block indiscriminately (as is being done
* here) does no damage.
*
* The same result could instead be achieved by
* not setting the CC_QHEAD flag in the first
* place, if the block is an internal dmchain
* block, and if it is found in the hash table.
* The current solution was chosen since it is
* the least intrusive.
*/
centry->cc_flag &= ~CC_QHEAD;
}
} else {
if (CENTRY_QHEAD(centry)) {
if (!CENTRY_DIRTY(centry))
_sd_requeue_head(centry);
} else if (_sd_lru_reinsert(_SD_LRU_Q, centry))
_sd_requeue(centry);
}
}
SDTRACE(ST_EXIT|SDF_ENT_FREE, CENTRY_CD(centry), 0,
BLK_TO_FBA_NUM(CENTRY_BLK(centry)), 0, 0);
/* only clear inuse after final reference to centry */
CLEAR_CENTRY_INUSE(centry);
}
/*
* lookup to centry info associated with safestore resource
* return pointer to the centry info structure
*/
ss_centry_info_t *
sdbc_get_cinfo_byres(ss_resource_t *res)
{
ss_centry_info_t *cinfo;
ss_centry_info_t *cend;
int found = 0;
ASSERT(res != NULL);
if (res == NULL)
return (NULL);
cinfo = _sdbc_gl_centry_info;
cend = _sdbc_gl_centry_info + (_sdbc_gl_centry_info_size /
sizeof (ss_centry_info_t)) - 1;
for (; cinfo <= cend; ++cinfo)
if (cinfo->sc_res == res) {
++found;
break;
}
if (!found)
cinfo = NULL; /* bad */
return (cinfo);
}
/*
* _sd_alloc_write - Allocate a write block (for remote mirroring)
* and set centry->cc_write
*
* ARGUMENTS:
* centry - Head of Cache chain
* stall - pointer to stall count (no blocks avail)
*
* RETURNS:
* 0 - and sets cc_write for all entries when write contl block obtained.
* -1 - if a write control block could not be obtained.
*/
int
_sd_alloc_write(_sd_cctl_t *centry, int *stall)
{
ss_resourcelist_t *reslist;
ss_resourcelist_t *savereslist;
ss_resource_t *res;
_sd_cctl_t *ce;
int err;
int need;
need = 0;
for (ce = centry; ce; ce = ce->cc_chain) {
if (!(ce->cc_write))
need++;
}
if (!need)
return (0);
if ((SSOP_ALLOCRESOURCE(sdbc_safestore, need, stall, &reslist))
== SS_OK) {
savereslist = reslist;
for (ce = centry; ce; ce = ce->cc_chain) {
if (ce->cc_write)
continue;
err = SSOP_GETRESOURCE(sdbc_safestore, &reslist, &res);
if (err == SS_OK)
ce->cc_write = sdbc_get_cinfo_byres(res);
ASSERT(err == SS_OK); /* panic if DEBUG on */
ASSERT(ce->cc_write != NULL);
/*
* this is bad and should not happen.
* we use the saved reslist to cleanup
* and return.
*/
if ((err != SS_OK) || !ce->cc_write) {
cmn_err(CE_WARN, "_sd_alloc_write: "
"bad resource list 0x%p"
"changing to forced write thru mode",
(void *)savereslist);
(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
while (SSOP_GETRESOURCE(sdbc_safestore,
&savereslist, &res) == SS_OK) {
SSOP_DEALLOCRESOURCE(sdbc_safestore,
res);
}
return (-1);
}
}
return (0);
}
/* no safestore resources available. do sync write */
_sd_unblock(&_sd_flush_cv);
return (-1);
}
/*
* _sd_read - Interface call to do read.
*
* ARGUMENTS:
* handle - handle allocated earlier on.
* fba_pos - disk block number to read from.
* fba_len - length in fbas.
* flag - flag: (NSC_NOBLOCK for async io)
*
* RETURNS:
* errno if return > 0
* NSC_DONE or NSC_PENDING otherwise.
*
* USAGE:
* This routine checks if the request is valid and calls the underlying
* doread routine (also called by alloc_buf)
*/
int
_sd_read(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len,
int flag)
{
sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */
sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */
sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */
_sd_cctl_t *cc_ent = NULL;
nsc_size_t fba_orig_len = fba_len;
int ret;
int cd = HANDLE_CD(handle);
if (_sdbc_shutdown_in_progress || (handle->bh_flag & NSC_ABUF)) {
ret = EIO;
goto out;
}
#if !defined(_SD_NOCHECKS)
if (!_SD_HANDLE_ACTIVE(handle)) {
cmn_err(CE_WARN, "sdbc(_sd_read) handle %p not active",
(void *)handle);
ret = EINVAL;
goto out;
}
ASSERT_HANDLE_LIMITS(handle, fba_pos, fba_len);
#endif
if (fba_len == 0) {
ret = NSC_DONE;
goto out;
}
KSTAT_RUNQ_ENTER(cd);
st_cblk_off = BLK_FBA_OFF(fba_pos);
st_cblk_len = BLK_FBAS - st_cblk_off;
if ((nsc_size_t)st_cblk_len >= fba_len) {
end_cblk_len = 0;
st_cblk_len = (sdbc_cblk_fba_t)fba_len;
} else {
end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
}
cc_ent = handle->bh_centry;
while (CENTRY_BLK(cc_ent) != FBA_TO_BLK_NUM(fba_pos))
cc_ent = cc_ent->cc_chain;
if (!SDBC_VALID_BITS(st_cblk_off, st_cblk_len, cc_ent))
goto need_io;
DATA_LOG(SDF_RD, cc_ent, st_cblk_off, st_cblk_len);
DTRACE_PROBE4(_sd_read_data1,
uint64_t,
(uint64_t)(BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)) +
st_cblk_off),
uint64_t, (uint64_t)st_cblk_len,
char *,
*(int64_t *)(cc_ent->cc_data + FBA_SIZE(st_cblk_off)),
char *,
*(int64_t *)(cc_ent->cc_data +
FBA_SIZE(st_cblk_off + st_cblk_len) - 8));
fba_pos += st_cblk_len;
fba_len -= st_cblk_len;
cc_ent = cc_ent->cc_chain;
while (fba_len > (nsc_size_t)end_cblk_len) {
if (!FULLY_VALID(cc_ent))
goto need_io;
DATA_LOG(SDF_RD, cc_ent, 0, BLK_FBAS);
DTRACE_PROBE4(_sd_read_data2,
uint64_t,
(uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
uint64_t, (uint64_t)BLK_FBAS,
char *, *(int64_t *)(cc_ent->cc_data),
char *, *(int64_t *)(cc_ent->cc_data +
FBA_SIZE(BLK_FBAS) - 8));
fba_pos += BLK_FBAS;
fba_len -= BLK_FBAS;
cc_ent = cc_ent->cc_chain;
}
if (fba_len) {
if (!SDBC_VALID_BITS(0, end_cblk_len, cc_ent))
goto need_io;
DATA_LOG(SDF_RD, cc_ent, 0, end_cblk_len);
DTRACE_PROBE4(_sd_read_data3,
uint64_t,
(uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
uint64_t, (uint64_t)end_cblk_len,
char *, *(int64_t *)(cc_ent->cc_data),
char *, *(int64_t *)(cc_ent->cc_data +
FBA_SIZE(end_cblk_len) - 8));
}
CACHE_FBA_READ(handle->bh_cd, fba_orig_len);
CACHE_READ_HIT;
FBA_READ_IO_KSTATS(handle->bh_cd, FBA_SIZE(fba_orig_len));
ret = NSC_HIT;
goto stats_exit;
need_io:
_SD_DISCONNECT_CALLBACK(handle);
ret = _sd_doread(handle, cc_ent, fba_pos, fba_len, flag);
stats_exit:
KSTAT_RUNQ_EXIT(cd);
out:
return (ret);
}
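/*
* Sketch of the return-value contract (illustrative only; callers of
* this entry point live outside this file):
*
*	ret = _sd_read(handle, fba_pos, fba_len, NSC_NOBLOCK);
*	if (ret > 0)
*		// errno-style failure (EIO, EINVAL, ...)
*	else if (ret == NSC_PENDING)
*		// async io started; the read callback fires on completion
*	else
*		// NSC_HIT or NSC_DONE: data is now valid in the handle
*/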
/*
* sdbc_doread_prefetch - read ahead one cache block
*
* ARGUMENTS:
* cc_ent - cache entry
* fba_pos - disk block number to read from
* fba_len - length in fbas.
*
* RETURNS:
* number of fbas, if any, that are to be read beyond (fba_pos + fba_len)
*
* USAGE:
* if readahead is to be done, allocate a cache block and place
* it on the cc_chain of cc_ent
*/
static int
sdbc_doread_prefetch(_sd_cctl_t *cc_ent, nsc_off_t fba_pos, nsc_size_t fba_len)
{
nsc_off_t st_cblk = FBA_TO_BLK_NUM(fba_pos);
nsc_off_t next_cblk = FBA_TO_BLK_NUM(fba_pos + BLK_FBAS);
nsc_size_t filesize;
int fba_count = 0; /* number of fbas to prefetch */
_sd_cctl_t *cc_ra; /* the read ahead cache entry */
int cd = CENTRY_CD(cc_ent);
nsc_size_t vol_fill;
filesize = _sd_cache_files[cd].cd_info->sh_filesize;
vol_fill = filesize - (fba_pos + fba_len);
/* readahead only for small reads */
if ((fba_len <= FBA_LEN(CACHE_BLOCK_SIZE)) && (fba_pos != 0) &&
(vol_fill > 0)) {
/*
* if prev block is in cache and next block is not,
* then read ahead one block
*/
if (_sd_hash_search(cd, st_cblk - 1, _sd_htable)) {
if (!_sd_hash_search(cd, next_cblk, _sd_htable)) {
cc_ra = sdbc_centry_alloc_blks(cd, next_cblk, 1,
ALLOC_NOWAIT);
if (cc_ra) {
/* if in cache don't readahead */
if (cc_ra->cc_aging_dm & HASH_ENTRY_DM) {
++sdbc_ra_hash;
_sd_centry_release(cc_ra);
} else {
cc_ent->cc_chain = cc_ra;
cc_ra->cc_chain = 0;
fba_count =
(vol_fill > (nsc_size_t)BLK_FBAS) ?
BLK_FBAS : (int)vol_fill;
/*
* indicate implicit prefetch and
* mark for release in
* _sd_read_complete()
*/
cc_ra->cc_aging_dm |= (PREFETCH_BUF_I |
PREFETCH_BUF_IR);
}
} else
++sdbc_ra_none;
}
}
}
return (fba_count);
}
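/*
* Worked example of the heuristic (assuming BLK_FBAS == 8 for
* illustration): a block-aligned read of fba_pos 24, fba_len 4 gives
* st_cblk == 3 and next_cblk == 4. If block 2 is already hashed (a
* sequential reader just passed through it) and block 4 is not, block 4
* is allocated, chained after cc_ent, marked PREFETCH_BUF_I |
* PREFETCH_BUF_IR, and up to BLK_FBAS extra FBAs (capped at the end of
* the volume) are returned for the caller to fold into its io.
*/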
/*
* _sd_doread - Check if the blocks are in cache. If not completely valid, do io.
*
* ARGUMENTS:
* handle - handle allocated earlier on.
* fba_pos - disk block number to read from.
* fba_len - length in fbas.
* flag - flag: (NSC_NOBLOCK for async io)
*
* RETURNS:
* errno if return > 0
* NSC_DONE(from disk), or NSC_PENDING otherwise.
*
* Comments:
* It initiates an io and either blocks waiting for the completion
* or returns NSC_PENDING, depending on whether the flag bit
* NSC_NOBLOCK is reset or set.
*
*/
static int
_sd_doread(_sd_buf_handle_t *handle, _sd_cctl_t *cc_ent, nsc_off_t fba_pos,
nsc_size_t fba_len, int flag)
{
int cd, err;
nsc_size_t fba_orig_len; /* length in FBA's of the original request */
nsc_size_t file_len; /* length in bytes of io to be done */
sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */
sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */
sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */
int num_bdl;
_sd_cctl_t *cc_temp;
struct buf *bp;
unsigned int want_bits;
void (*fn)(blind_t, nsc_off_t, nsc_size_t, int);
sdbc_cblk_fba_t end_cblk_fill; /* FBA's to fill to end of last block */
nsc_size_t vol_end_fill; /* # of FBA's to fill to end of the volume */
cd = HANDLE_CD(handle);
SDTRACE(ST_ENTER|SDF_READ, cd, fba_len, fba_pos, flag, 0);
ASSERT(cd >= 0);
if (_sd_cache_files[cd].cd_info->sh_failed) {
SDTRACE(ST_EXIT|SDF_READ, cd, fba_len, fba_pos, flag, EIO);
return (EIO);
}
/*
* adjust the position and length so that the entire cache
* block is read in
*/
/* first, adjust to beginning of cache block */
fba_len += BLK_FBA_OFF(fba_pos); /* add start offset to length */
fba_pos &= ~BLK_FBA_MASK; /* move position back to start of block */
/* compute fill to end of cache block */
end_cblk_fill = (BLK_FBAS - 1) - ((fba_len - 1) % BLK_FBAS);
vol_end_fill = _sd_cache_files[(cd)].cd_info->sh_filesize -
(fba_pos + fba_len);
/* fill to lesser of cache block or end of volume */
fba_len += ((nsc_size_t)end_cblk_fill < vol_end_fill) ? end_cblk_fill :
vol_end_fill;
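/*
* Worked example of the alignment above (assuming BLK_FBAS == 8,
* BLK_FBA_MASK == 7, and a large volume): a request for FBAs 21..25
* (fba_pos 21, fba_len 5) becomes fba_pos 16, fba_len 10, and
* end_cblk_fill = 7 - ((10 - 1) % 8) = 6, so fba_len grows to 16 and
* cache blocks 2 and 3 are read in full.
*/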
DTRACE_PROBE2(_sd_doread_rfill,
nsc_off_t, fba_pos,
nsc_size_t, fba_len);
/* for small reads do 1-block readahead if previous block is in cache */
if (sdbc_prefetch1)
fba_len += sdbc_doread_prefetch(cc_ent, fba_pos, fba_len);
fba_orig_len = fba_len;
st_cblk_off = BLK_FBA_OFF(fba_pos);
st_cblk_len = BLK_FBAS - st_cblk_off;
if ((nsc_size_t)st_cblk_len >= fba_len) {
end_cblk_len = 0;
st_cblk_len = (sdbc_cblk_fba_t)fba_len;
} else {
end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
}
cc_temp = cc_ent;
num_bdl = 0;
while (cc_temp) {
num_bdl += (SDBC_LOOKUP_IOCOUNT(CENTRY_DIRTY(cc_temp)));
cc_temp = cc_temp->cc_chain;
}
bp = sd_alloc_iob(_sd_cache_files[cd].cd_crdev,
fba_pos, num_bdl, B_READ);
if (bp == NULL) {
SDTRACE(ST_EXIT|SDF_READ, cd, fba_len, fba_pos, flag, E2BIG);
return (E2BIG);
}
want_bits = SDBC_GET_BITS(st_cblk_off, st_cblk_len);
if (want_bits & CENTRY_DIRTY(cc_ent))
_sd_ccent_rd(cc_ent, want_bits, bp);
else {
sd_add_fba(bp, &cc_ent->cc_addr, st_cblk_off, st_cblk_len);
}
file_len = FBA_SIZE(st_cblk_len);
cc_ent = cc_ent->cc_chain;
fba_len -= st_cblk_len;
while (fba_len > (nsc_size_t)end_cblk_len) {
if (CENTRY_DIRTY(cc_ent))
_sd_ccent_rd(cc_ent, (uint_t)BLK_FBA_BITS, bp);
else {
sd_add_fba(bp, &cc_ent->cc_addr, 0, BLK_FBAS);
}
file_len += CACHE_BLOCK_SIZE;
cc_ent = cc_ent->cc_chain;
fba_len -= BLK_FBAS;
}
if (fba_len) {
want_bits = SDBC_GET_BITS(0, end_cblk_len);
if (want_bits & CENTRY_DIRTY(cc_ent))
_sd_ccent_rd(cc_ent, want_bits, bp);
else {
sd_add_fba(bp, &cc_ent->cc_addr, 0, end_cblk_len);
}
file_len += FBA_SIZE(end_cblk_len);
}
CACHE_READ_MISS;
FBA_READ_IO_KSTATS(cd, file_len);
DISK_FBA_READ(cd, FBA_NUM(file_len));
fn = (handle->bh_flag & NSC_NOBLOCK) ? _sd_async_read_ea : NULL;
err = sd_start_io(bp, _sd_cache_files[cd].cd_strategy, fn, handle);
if (err != NSC_PENDING) {
_sd_read_complete(handle, fba_pos, fba_orig_len, err);
}
SDTRACE(ST_EXIT|SDF_READ, cd, fba_orig_len, fba_pos, flag, err);
return (err);
}
/*
* _sd_read_complete - Do whatever is necessary after a read io is done.
*
* ARGUMENTS:
* handle - handle allocated earlier on.
* fba_pos - disk block number to read from.
* fba_len - length in fbas.
* error - error from io if any.
*
* RETURNS:
* NONE.
*
* Comments:
* This routine marks the cache blocks valid if the io completed
* successfully. Called from the async end action as well as after
* a synchronous read completes.
*/
void
_sd_read_complete(_sd_buf_handle_t *handle, nsc_off_t fba_pos,
nsc_size_t fba_len, int error)
{
sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */
sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */
sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */
nsc_size_t cur_fba_len; /* length in FBA's */
_sd_cctl_t *cc_iocent;
_sd_cctl_t *first_iocent; /* first buffer when processing prefetch */
cc_iocent = handle->bh_centry;
if ((handle->bh_error = error) == 0) {
while (CENTRY_BLK(cc_iocent) != FBA_TO_BLK_NUM(fba_pos))
cc_iocent = cc_iocent->cc_chain;
cur_fba_len = fba_len;
st_cblk_off = BLK_FBA_OFF(fba_pos);
st_cblk_len = BLK_FBAS - st_cblk_off;
if ((nsc_size_t)st_cblk_len >= fba_len) {
end_cblk_len = 0;
st_cblk_len = (sdbc_cblk_fba_t)fba_len;
} else {
end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
}
SDBC_SET_VALID_BITS(st_cblk_off, st_cblk_len, cc_iocent);
DATA_LOG(SDF_RDIO, cc_iocent, st_cblk_off, st_cblk_len);
DTRACE_PROBE4(_sd_read_complete_data1,
uint64_t, (uint64_t)
BLK_TO_FBA_NUM(CENTRY_BLK(cc_iocent)) + st_cblk_off,
int, st_cblk_len,
char *, *(int64_t *)
(cc_iocent->cc_data + FBA_SIZE(st_cblk_off)),
char *, *(int64_t *)
(cc_iocent->cc_data +
FBA_SIZE(st_cblk_off + st_cblk_len) - 8));
first_iocent = cc_iocent;
cc_iocent = cc_iocent->cc_chain;
cur_fba_len -= st_cblk_len;
while (cur_fba_len > (nsc_size_t)end_cblk_len) {
SET_FULLY_VALID(cc_iocent);
DATA_LOG(SDF_RDIO, cc_iocent, 0, BLK_FBAS);
DTRACE_PROBE4(_sd_read_complete_data2,
uint64_t, (uint64_t)
BLK_TO_FBA_NUM(CENTRY_BLK(cc_iocent)),
int, BLK_FBAS,
char *,
*(int64_t *)(cc_iocent->cc_data),
char *,
*(int64_t *)(cc_iocent->cc_data +
FBA_SIZE(BLK_FBAS) - 8));
/*
* 4755485 release implicit prefetch buffers
*
* the cc_chain of the first buffer must be NULL'd
* else _sd_free_buf() will do a double free when
* it traverses the chain.
*
* if a buffer has been marked PREFETCH_BUF_IR then
* it is guaranteed that
* 1. it is the second in a chain of two.
* 2. cur_fba_len is BLK_FBAS.
* 3. end_cblk_len is zero.
*
* because of 1 (and 2) above, we can safely exit the
* while loop via the break statement without
* executing the last two statements. the break
* statement is necessary because it would be unsafe
* to access cc_iocent which could be reallocated
* immediately after the _sd_centry_release().
*/
if (cc_iocent->cc_aging_dm & PREFETCH_BUF_IR) {
cc_iocent->cc_aging_dm &= ~(PREFETCH_BUF_IR);
_sd_centry_release(cc_iocent);
first_iocent->cc_chain = NULL;
break;
}
cc_iocent = cc_iocent->cc_chain;
cur_fba_len -= BLK_FBAS;
}
if (end_cblk_len) {
SDBC_SET_VALID_BITS(0, end_cblk_len, cc_iocent);
DATA_LOG(SDF_RDIO, cc_iocent, 0, end_cblk_len);
DTRACE_PROBE4(_sd_read_complete_data3,
uint64_t, (uint64_t)
BLK_TO_FBA_NUM(CENTRY_BLK(cc_iocent)),
int, end_cblk_len,
char *,
*(int64_t *)(cc_iocent->cc_data),
char *,
*(int64_t *)(cc_iocent->cc_data +
FBA_SIZE(end_cblk_len) - 8));
}
}
}
/*
* _sd_async_read_ea - End action for async reads.
*
* ARGUMENTS:
* xhandle - handle allocated earlier on (cast to blind_t).
* fba_pos - disk block number read from.
* fba_len - length in fbas.
* error - error from io if any.
*
* RETURNS:
* NONE.
*
* Comments:
* This routine is called at interrupt level when the io is done.
* This is called only when read is asynchronous (NSC_NOBLOCK)
*/
static void
_sd_async_read_ea(blind_t xhandle, nsc_off_t fba_pos, nsc_size_t fba_len,
int error)
{
_sd_buf_handle_t *handle = xhandle;
int cd;
if (error) {
cd = HANDLE_CD(handle);
ASSERT(cd >= 0);
_sd_cache_files[cd].cd_info->sh_failed = 1;
}
SDTRACE(ST_ENTER|SDF_READ_EA, HANDLE_CD(handle),
handle->bh_fba_len, handle->bh_fba_pos, 0, error);
_sd_read_complete(handle, fba_pos, fba_len, error);
#if defined(_SD_DEBUG_PATTERN)
check_buf_consistency(handle, "rd");
#endif
SDTRACE(ST_EXIT|SDF_READ_EA, HANDLE_CD(handle),
handle->bh_fba_len, handle->bh_fba_pos, 0, 0);
_SD_READ_CALLBACK(handle);
}
/*
* _sd_async_write_ea - End action for async writes.
*
* ARGUMENTS:
* xhandle - handle allocated earlier on. (cast to blind_t)
* fba_pos - disk block number written to.
* fba_len - length in fbas.
* error - error from io if any.
*
* RETURNS:
* NONE.
*
* Comments:
* This routine is called at interrupt level when the write io is done.
* This is called only when we are in write-through mode and the write
* call indicated asynchronous callback. (NSC_NOBLOCK)
*/
/* ARGSUSED */
static void
_sd_async_write_ea(blind_t xhandle, nsc_off_t fba_pos, nsc_size_t fba_len,
int error)
{
_sd_buf_handle_t *handle = xhandle;
handle->bh_error = error;
if (error)
_sd_cache_files[HANDLE_CD(handle)].cd_info->sh_failed = 1;
_SD_WRITE_CALLBACK(handle);
}
/*
* update_dirty - set dirty bits in cache block which is already dirty
* cc_inuse is held, need cc_lock to avoid race with _sd_process_pending
* must check for I/O in-progress and set PEND_DIRTY.
* return previous dirty bits
* [if set _sd_process_pending will re-issue]
*/
static _sd_bitmap_t
update_dirty(_sd_cctl_t *cc_ent, sdbc_cblk_fba_t st_off, sdbc_cblk_fba_t st_len)
{
_sd_bitmap_t old;
/* was FAST */
mutex_enter(&cc_ent->cc_lock);
old = CENTRY_DIRTY(cc_ent);
if (old) {
/*
* If we are writing to an FBA that is still marked dirty,
* record a write cancellation.
*/
if (old & SDBC_GET_BITS(st_off, st_len)) {
CACHE_WRITE_CANCELLATION(CENTRY_CD(cc_ent));
}
/* This is a write to a block that was already dirty */
SDBC_SET_DIRTY(st_off, st_len, cc_ent);
sd_serialize();
if (CENTRY_IO_INPROGRESS(cc_ent))
cc_ent->cc_flag |= CC_PEND_DIRTY;
}
/* was FAST */
mutex_exit(&cc_ent->cc_lock);
return (old);
}
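/*
* Caller pattern and a worked bitmap example (illustrative; the bit
* values assume an 8-bit dirty map). _sd_write() does:
*
*	if (CENTRY_DIRTY(cc_ent) &&
*	    update_dirty(cc_ent, st_cblk_off, st_cblk_len))
*		goto loop1;	// already dirty, already queued
*
* i.e. a non-zero return means the block is already on (or pending for)
* the dirty queue and must not be enqueued again. If old == 0x0f and
* the new write covers bits 0x1c, the overlap 0x0c records a write
* cancellation before the new bits are or-ed in.
*/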
/*
* _sd_write - Interface call to commit part of handle.
*
* ARGUMENTS:
* handle - handle allocated earlier on.
* fba_pos - disk block number to write to.
* fba_len - length in fbas.
* flag - (NSC_NOBLOCK | NSC_WRTHRU)
*
* RETURNS:
* errno if return > 0
* NSC_HIT (in cache), NSC_DONE (to disk) or NSC_PENDING otherwise.
*
* Comments:
* This routine checks validity of the handle and then calls the
* sync-write function if this write is determined to be write-through.
* Else, it reflects the data to the write blocks on the mirror node,
* (allocated in alloc_buf). If the cache block is not dirty, it is
* marked dirty and queued up for io processing later on.
* If parts are already dirty but io is not in progress yet, it is
* marked dirty and left alone (it is already in the queue)
* If parts are already dirty but io is in progress, it is marked
* dirty and also a flag is set indicating that this buffer should
* be reprocessed after the io-end-action.
* Attempt is made to coalesce multiple writes into a single list
* for io processing later on.
*
* Issuing of writes may be delayed until the handle is released;
* _sd_queue_write() sets NSC_QUEUE, indicating that dirty bits
* and reflection to mirror have already been done, just queue I/O.
*/
int
_sd_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len,
int flag)
{
int cd = HANDLE_CD(handle);
int num_queued, ret, queue_only, store_only;
sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */
sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */
sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */
nsc_size_t cur_fba_len; /* position in disk blocks */
_sd_cctl_t *cc_ent = NULL;
_sd_cctl_t *cur_chain = NULL, *dirty_next = NULL;
if (_sdbc_shutdown_in_progress) {
ret = EIO;
goto out;
}
if (!_SD_HANDLE_ACTIVE(handle)) {
SDALERT(SDF_WRITE,
SDT_INV_CD, 0, SDT_INV_BL, handle->bh_flag, 0);
ret = EINVAL;
goto out;
}
#if !defined(_SD_NOCHECKS)
ASSERT_HANDLE_LIMITS(handle, fba_pos, fba_len);
if ((handle->bh_flag & NSC_WRBUF) == 0) {
ret = EINVAL;
goto out;
}
#endif
if (fba_len == 0) {
ret = NSC_DONE;
goto out;
}
/*
* store_only: don't queue this I/O yet
* queue_only: queue I/O to disk, don't store in mirror node
*/
if (flag & NSC_QUEUE)
queue_only = 1, store_only = 0;
else
if (_SD_DELAY_QUEUE && (fba_len != handle->bh_fba_len))
queue_only = 0, store_only = 1;
else
queue_only = store_only = 0;
if (!queue_only && _SD_FORCE_DISCONNECT(fba_len))
_SD_DISCONNECT_CALLBACK(handle);
if (_sd_cache_files[cd].cd_info->sh_failed) {
ret = EIO;
goto out;
}
KSTAT_RUNQ_ENTER(cd);
SDTRACE(ST_ENTER|SDF_WRITE, cd, fba_len, fba_pos, flag, 0);
#if defined(_SD_DEBUG_PATTERN)
check_buf_consistency(handle, "wr");
#endif
cc_ent = handle->bh_centry;
while (CENTRY_BLK(cc_ent) != FBA_TO_BLK_NUM(fba_pos))
cc_ent = cc_ent->cc_chain;
if (((handle->bh_flag | flag) & _SD_WRTHRU_MASK) ||
(!queue_only && _sd_remote_store(cc_ent, fba_pos, fba_len))) {
flag |= NSC_WRTHRU;
ret = _sd_sync_write(handle, fba_pos, fba_len, flag);
goto stats_exit;
}
if (store_only) /* enqueue in _sd_free_buf() */
handle->bh_flag |= NSC_QUEUE;
cur_fba_len = fba_len;
st_cblk_off = BLK_FBA_OFF(fba_pos);
st_cblk_len = BLK_FBAS - st_cblk_off;
if ((nsc_size_t)st_cblk_len >= fba_len) {
end_cblk_len = 0;
st_cblk_len = (sdbc_cblk_fba_t)fba_len;
} else {
end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
}
if (CENTRY_DIRTY(cc_ent) && update_dirty(cc_ent, st_cblk_off,
st_cblk_len))
goto loop1;
if (store_only) {
SDBC_SET_TOFLUSH(st_cblk_off, st_cblk_len, cc_ent);
goto loop1;
}
SDBC_SET_DIRTY(st_cblk_off, st_cblk_len, cc_ent);
cur_chain = dirty_next = cc_ent;
num_queued = 1;
loop1:
DATA_LOG(SDF_WR, cc_ent, st_cblk_off, st_cblk_len);
DTRACE_PROBE4(_sd_write_data1,
uint64_t, (uint64_t)
(BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)) + st_cblk_off),
int, st_cblk_len,
char *,
*(int64_t *)(cc_ent->cc_data + FBA_SIZE(st_cblk_off)),
char *, *(int64_t *)(cc_ent->cc_data +
FBA_SIZE(st_cblk_off + st_cblk_len) - 8));
cur_fba_len -= st_cblk_len;
cc_ent = cc_ent->cc_chain;
while (cur_fba_len > (nsc_size_t)end_cblk_len) {
if (CENTRY_DIRTY(cc_ent) && update_dirty(cc_ent, 0, BLK_FBAS)) {
if (cur_chain) {
_sd_enqueue_dirty(cd, cur_chain, dirty_next,
num_queued);
cur_chain = dirty_next = NULL;
}
goto loop2;
}
if (store_only) {
SDBC_SET_TOFLUSH(0, BLK_FBAS, cc_ent);
goto loop2;
}
SDBC_SET_DIRTY(0, BLK_FBAS, cc_ent);
if (dirty_next) {
dirty_next->cc_dirty_next = cc_ent;
dirty_next = cc_ent;
num_queued++;
} else {
cur_chain = dirty_next = cc_ent;
num_queued = 1;
}
loop2:
DATA_LOG(SDF_WR, cc_ent, 0, BLK_FBAS);
DTRACE_PROBE4(_sd_write_data2,
uint64_t,
(uint64_t)(BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent))),
int, BLK_FBAS,
char *, *(int64_t *)(cc_ent->cc_data),
char *, *(int64_t *)(cc_ent->cc_data +
FBA_SIZE(BLK_FBAS) - 8));
cc_ent = cc_ent->cc_chain;
cur_fba_len -= BLK_FBAS;
}
#if defined(_SD_DEBUG)
if (cur_fba_len != end_cblk_len)
cmn_err(CE_WARN, "fba_len %" NSC_SZFMT " end_cblk_len %d in "
"_sd_write", cur_fba_len, end_cblk_len);
#endif
if (cur_fba_len) {
if (CENTRY_DIRTY(cc_ent) && update_dirty(cc_ent, 0,
end_cblk_len)) {
if (cur_chain) {
_sd_enqueue_dirty(cd, cur_chain, dirty_next,
num_queued);
cur_chain = dirty_next = NULL;
}
goto loop3;
}
if (store_only) {
SDBC_SET_TOFLUSH(0, end_cblk_len, cc_ent);
goto loop3;
}
SDBC_SET_DIRTY(0, end_cblk_len, cc_ent);
if (dirty_next) {
dirty_next->cc_dirty_next = cc_ent;
dirty_next = cc_ent;
num_queued++;
} else {
cur_chain = dirty_next = cc_ent;
num_queued = 1;
}
}
loop3:
if (cur_fba_len) {
DATA_LOG(SDF_WR, cc_ent, 0, end_cblk_len);
DTRACE_PROBE4(_sd_write_data3,
uint64_t,
(uint64_t)(BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent))),
int, end_cblk_len,
char *, *(int64_t *)(cc_ent->cc_data),
char *, *(int64_t *)(cc_ent->cc_data +
FBA_SIZE(end_cblk_len) - 8));
}
if (!store_only && cur_chain) {
_sd_enqueue_dirty(cd, cur_chain, dirty_next, num_queued);
}
if (!queue_only) {
CACHE_FBA_WRITE(cd, fba_len);
CACHE_WRITE_HIT;
FBA_WRITE_IO_KSTATS(cd, FBA_SIZE(fba_len));
}
ret = NSC_HIT;
stats_exit:
SDTRACE(ST_EXIT|SDF_WRITE, cd, fba_len, fba_pos, flag, ret);
KSTAT_RUNQ_EXIT(cd);
out:
return (ret);
}
/*
* _sd_queue_write(handle, fba_pos, fba_len): Queues delayed writes for
* flushing
*
* ARGUMENTS: handle - handle allocated with NSC_WRBUF
* fba_pos - starting fba pos from _sd_alloc_buf()
* fba_len - fba len from _sd_alloc_buf()
*
* USAGE : Called if _SD_DELAY_QUEUE is set. Finds all blocks in the
* handle marked for flushing and queues them to be written in
* optimized (i.e. sequential) order
*/
static void
_sd_queue_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len)
{
nsc_off_t fba_end;
sdbc_cblk_fba_t sblk, len, dirty;
_sd_cctl_t *cc_ent;
nsc_off_t flush_pos;
int flush_pos_valid = 0;
nsc_size_t flush_len = 0;
cc_ent = handle->bh_centry;
fba_end = fba_pos + fba_len;
fba_pos = BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)); /* 1st block */
while (fba_pos < fba_end) {
dirty = cc_ent->cc_toflush;
cc_ent->cc_toflush = 0;
/*
* Full block
*/
if (_SD_BMAP_ISFULL(dirty)) {
if (flush_pos_valid == 0) {
flush_pos_valid = 1;
flush_pos = fba_pos;
}
flush_len += BLK_FBAS;
}
/*
* Partial block
*/
else while (dirty) {
sblk = SDBC_LOOKUP_STPOS(dirty);
len = SDBC_LOOKUP_LEN(dirty);
SDBC_LOOKUP_MODIFY(dirty);
if (sblk && flush_pos_valid) {
(void) _sd_write(handle, flush_pos, flush_len,
NSC_QUEUE);
flush_pos_valid = 0;
flush_len = 0;
}
if (flush_pos_valid == 0) {
flush_pos_valid = 1;
flush_pos = fba_pos + sblk;
}
flush_len += len;
}
fba_pos += BLK_FBAS;
cc_ent = cc_ent->cc_chain;
/*
* If we find a gap, write out what we've got
*/
if (flush_pos_valid && (flush_pos + flush_len) != fba_pos) {
(void) _sd_write(handle, flush_pos, flush_len,
NSC_QUEUE);
flush_pos_valid = 0;
flush_len = 0;
}
}
if (flush_pos_valid)
(void) _sd_write(handle, flush_pos, flush_len, NSC_QUEUE);
}
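/*
* Worked example of the coalescing (assuming BLK_FBAS == 8 and the
* handle starting at fba 0): if block 0 has a full toflush map and
* block 1 has toflush bits 0x03, the full block sets flush_pos = 0,
* flush_len = 8, the partial run (sblk 0, len 2) extends flush_len to
* 10, and the gap test at fba_pos 16 then issues a single
* _sd_write(handle, 0, 10, NSC_QUEUE). A partial run with sblk != 0
* flushes any pending run first, since the two cannot be contiguous
* on disk.
*/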
static int
_sd_remote_store(_sd_cctl_t *cc_ent, nsc_off_t fba_pos, nsc_size_t fba_len)
{
sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */
sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */
sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */
ss_resource_t *ss_res;
if (_sd_nodes_configured <= 2 && _sd_is_mirror_down())
return (0);
st_cblk_off = BLK_FBA_OFF(fba_pos);
st_cblk_len = BLK_FBAS - st_cblk_off;
if ((nsc_size_t)st_cblk_len >= fba_len) {
end_cblk_len = 0;
st_cblk_len = (sdbc_cblk_fba_t)fba_len;
} else {
end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
}
fba_len -= st_cblk_len;
ss_res = cc_ent->cc_write->sc_res;
if (SSOP_WRITE_CBLOCK(sdbc_safestore, ss_res,
cc_ent->cc_data + FBA_SIZE(st_cblk_off),
FBA_SIZE(st_cblk_len),
FBA_SIZE(st_cblk_off))) {
cmn_err(CE_WARN,
"sdbc(_sd_write) safe store failed. Going synchronous");
SDTRACE(SDF_REFLECT, CENTRY_CD(cc_ent), fba_len,
fba_pos, 0, -1);
return (-1);
}
cc_ent = cc_ent->cc_chain;
while (fba_len > (nsc_size_t)end_cblk_len) {
fba_len -= BLK_FBAS;
if (SSOP_WRITE_CBLOCK(sdbc_safestore, ss_res,
cc_ent->cc_data,
CACHE_BLOCK_SIZE, 0)) {
cmn_err(CE_WARN,
"sdbc(_sd_write) safe store failed. "
"Going synchronous");
SDTRACE(SDF_REFLECT, CENTRY_CD(cc_ent), fba_len,
fba_pos, 0, -1);
return (-1);
}
cc_ent = cc_ent->cc_chain;
} /* end while */
if (fba_len) {
if (SSOP_WRITE_CBLOCK(sdbc_safestore, ss_res,
cc_ent->cc_data,
FBA_SIZE(end_cblk_len), 0)) {
cmn_err(CE_WARN,
"sdbc(_sd_write) nvmem dma failed. "
"Going synchronous");
SDTRACE(SDF_REFLECT, CENTRY_CD(cc_ent), fba_len,
fba_pos, 0, -1);
return (-1);
}
}
return (0);
}
/*
* _sd_sync_write2 - Write-through function.
*
* ARGUMENTS:
* wr_handle - handle into which to write the data.
* wr_st_pos - starting FBA position in wr_handle.
* fba_len - length in fbas.
* flag - NSC_NOBLOCK for async io.
* rd_handle - handle from which to read the data, or NULL.
* rd_st_pos - starting FBA position in rd_handle.
*
* RETURNS:
* errno if return > 0
* NSC_DONE or NSC_PENDING otherwise.
*
* Comments:
* This routine initiates io of the indicated portion. It returns
* synchronously after io is completed if NSC_NOBLOCK is not set.
* Else NSC_PENDING is returned with a subsequent write callback on
* io completion.
*
* See _sd_copy_direct() for usage when
* (wr_handle != rd_handle && rd_handle != NULL)
*/
static int
_sd_sync_write2(_sd_buf_handle_t *wr_handle, nsc_off_t wr_st_pos,
nsc_size_t fba_len, int flag, _sd_buf_handle_t *rd_handle,
nsc_off_t rd_st_pos)
{
void (*fn)(blind_t, nsc_off_t, nsc_size_t, int);
_sd_cctl_t *wr_ent, *rd_ent;
nsc_size_t this_len;
nsc_off_t rd_pos, wr_pos;
nsc_size_t log_bytes;
int cd = HANDLE_CD(wr_handle);
int err;
uint_t dirty;
struct buf *bp;
LINTUSED(flag);
_SD_DISCONNECT_CALLBACK(wr_handle);
if (rd_handle == NULL) {
rd_handle = wr_handle;
rd_st_pos = wr_st_pos;
}
wr_ent = wr_handle->bh_centry;
while (CENTRY_BLK(wr_ent) != FBA_TO_BLK_NUM(wr_st_pos))
wr_ent = wr_ent->cc_chain;
rd_ent = rd_handle->bh_centry;
while (CENTRY_BLK(rd_ent) != FBA_TO_BLK_NUM(rd_st_pos))
rd_ent = rd_ent->cc_chain;
bp = sd_alloc_iob(_sd_cache_files[cd].cd_crdev,
wr_st_pos, FBA_TO_BLK_LEN(fba_len) + 2, B_WRITE);
if (bp == NULL)
return (E2BIG);
wr_pos = BLK_FBA_OFF(wr_st_pos);
rd_pos = BLK_FBA_OFF(rd_st_pos);
log_bytes = 0;
do {
this_len = min((BLK_FBAS - rd_pos), (BLK_FBAS - wr_pos));
if (this_len > fba_len)
this_len = fba_len;
/*
* clear dirty bits in the write handle.
*/
if (CENTRY_DIRTY(wr_ent)) {
mutex_enter(&wr_ent->cc_lock);
if (CENTRY_DIRTY(wr_ent)) {
if (this_len == (nsc_size_t)BLK_FBAS ||
rd_handle != wr_handle) {
/*
* optimization for when we have a
* full cache block, or are doing
* copy_direct (see below).
*/
wr_ent->cc_write->sc_dirty = 0;
} else {
dirty = wr_ent->cc_write->sc_dirty;
dirty &= ~(SDBC_GET_BITS(
wr_pos, this_len));
wr_ent->cc_write->sc_dirty = dirty;
}
SSOP_SETCENTRY(sdbc_safestore,
wr_ent->cc_write);
}
mutex_exit(&wr_ent->cc_lock);
}
/*
* update valid bits in the write handle.
*/
if (rd_handle == wr_handle) {
if (this_len == (nsc_size_t)BLK_FBAS) {
SET_FULLY_VALID(wr_ent);
} else {
SDBC_SET_VALID_BITS(wr_pos, this_len, wr_ent);
}
} else {
/*
* doing copy_direct, so mark the write handle
* as invalid since the data is on disk, but not
* in cache.
*/
wr_ent->cc_valid = 0;
}
DATA_LOG(SDF_WRSYNC, rd_ent, rd_pos, this_len);
DTRACE_PROBE4(_sd_sync_write2_data,
uint64_t,
(uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(rd_ent)) + rd_pos,
uint64_t, (uint64_t)this_len,
char *,
*(int64_t *)(rd_ent->cc_data + FBA_SIZE(rd_pos)),
char *,
*(int64_t *)(rd_ent->cc_data +
FBA_SIZE(rd_pos + this_len) - 8));
sd_add_fba(bp, &rd_ent->cc_addr, rd_pos, this_len);
log_bytes += FBA_SIZE(this_len);
fba_len -= this_len;
wr_pos += this_len;
if (wr_pos >= (nsc_size_t)BLK_FBAS) {
wr_ent = wr_ent->cc_chain;
wr_pos = 0;
}
rd_pos += this_len;
if (rd_pos >= (nsc_size_t)BLK_FBAS) {
rd_ent = rd_ent->cc_chain;
rd_pos = 0;
}
} while (fba_len > 0);
DISK_FBA_WRITE(cd, FBA_NUM(log_bytes));
CACHE_WRITE_MISS;
FBA_WRITE_IO_KSTATS(cd, log_bytes);
fn = (wr_handle->bh_flag & NSC_NOBLOCK) ? _sd_async_write_ea : NULL;
err = sd_start_io(bp, _sd_cache_files[cd].cd_strategy, fn, wr_handle);
if (err != NSC_PENDING) {
DATA_LOG_CHAIN(SDF_WRSYEA, wr_handle->bh_centry,
wr_st_pos, FBA_NUM(log_bytes));
}
return (err);
}
static int
_sd_sync_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len,
int flag)
{
return (_sd_sync_write2(handle, fba_pos, fba_len, flag, NULL, 0));
}
/*
* _sd_zero - Interface call to zero out a portion of cache blocks.
*
* ARGUMENTS:
* handle - handle allocated earlier on.
* fba_pos - disk block number to zero from.
* fba_len - length in fbas.
* flag - NSC_NOBLOCK for async io.
*
* RETURNS:
* errno if return > 0
* NSC_DONE or NSC_PENDING otherwise.
*
* Comments:
* This routine zeroes out the indicated portion of the cache blocks
* and commits the data to disk.
* (See write for more details on the commit)
*/
int
_sd_zero(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len,
int flag)
{
int cd;
sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */
sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */
sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */
nsc_size_t cur_fba_len; /* position in disk blocks */
int ret;
_sd_cctl_t *cc_ent;
if (_sdbc_shutdown_in_progress) {
DTRACE_PROBE(shutdown);
return (EIO);
}
if (!_SD_HANDLE_ACTIVE(handle)) {
cmn_err(CE_WARN, "sdbc(_sd_zero) handle %p not active",
(void *)handle);
DTRACE_PROBE1(handle_active, int, handle->bh_flag);
return (EINVAL);
}
ASSERT_HANDLE_LIMITS(handle, fba_pos, fba_len);
if ((handle->bh_flag & NSC_WRBUF) == 0) {
DTRACE_PROBE1(handle_write, int, handle->bh_flag);
return (EINVAL);
}
if (fba_len == 0) {
DTRACE_PROBE(zero_len);
return (NSC_DONE);
}
if (_SD_FORCE_DISCONNECT(fba_len))
_SD_DISCONNECT_CALLBACK(handle);
cd = HANDLE_CD(handle);
SDTRACE(ST_ENTER|SDF_ZERO, cd, fba_len, fba_pos, flag, 0);
cc_ent = handle->bh_centry;
while (CENTRY_BLK(cc_ent) != FBA_TO_BLK_NUM(fba_pos))
cc_ent = cc_ent->cc_chain;
cur_fba_len = fba_len;
st_cblk_off = BLK_FBA_OFF(fba_pos);
st_cblk_len = BLK_FBAS - st_cblk_off;
if ((nsc_size_t)st_cblk_len >= fba_len) {
end_cblk_len = 0;
st_cblk_len = (sdbc_cblk_fba_t)fba_len;
} else {
end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
}
cur_fba_len -= st_cblk_len;
bzero(cc_ent->cc_data + FBA_SIZE(st_cblk_off), FBA_SIZE(st_cblk_len));
cc_ent = cc_ent->cc_chain;
while (cur_fba_len > (nsc_size_t)end_cblk_len) {
cur_fba_len -= BLK_FBAS;
bzero(cc_ent->cc_data, CACHE_BLOCK_SIZE);
cc_ent = cc_ent->cc_chain;
}
if (cur_fba_len) {
bzero(cc_ent->cc_data, FBA_SIZE(cur_fba_len));
}
ret = _sd_write(handle, fba_pos, fba_len, flag);
SDTRACE(ST_EXIT|SDF_ZERO, cd, fba_len, fba_pos, flag, ret);
return (ret);
}
/*
* _sd_copy - Copies portions of 2 handles.
*
* ARGUMENTS:
* handle1 - handle allocated earlier on.
* handle2 - handle allocated earlier on.
* fba_pos1 - disk block number to read from.
* fba_pos2 - disk block number to write to.
* fba_len - length in fbas.
*
* RETURNS:
* errno if return > 0
* NSC_DONE otherwise.
*
* Comments:
* This routine copies the 2 handles.
* WARNING: this could put the cache blocks in the destination handle
* in an inconsistent state. (the blocks could be valid in cache,
* but the copy makes the cache different from disk)
*
*/
int
_sd_copy(_sd_buf_handle_t *handle1, _sd_buf_handle_t *handle2,
nsc_off_t fba_pos1, nsc_off_t fba_pos2, nsc_size_t fba_len)
{
sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */
sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */
sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */
nsc_off_t off1, off2; /* offsets in FBA's into the disk */
nsc_size_t cur_fba_len; /* position in disk blocks */
_sd_cctl_t *cc_ent1, *cc_ent2;
if (_sdbc_shutdown_in_progress) {
DTRACE_PROBE(shutdown);
return (EIO);
}
if (!_SD_HANDLE_ACTIVE(handle1) || !_SD_HANDLE_ACTIVE(handle2)) {
cmn_err(CE_WARN, "sdbc(_sd_copy) handle %p or %p not active",
(void *)handle1, (void *)handle2);
DTRACE_PROBE2(handle_active1,
int, handle1->bh_flag,
int, handle2->bh_flag);
return (EINVAL);
}
ASSERT_HANDLE_LIMITS(handle1, fba_pos1, fba_len);
ASSERT_HANDLE_LIMITS(handle2, fba_pos2, fba_len);
cc_ent1 = handle1->bh_centry;
while (CENTRY_BLK(cc_ent1) != FBA_TO_BLK_NUM(fba_pos1))
cc_ent1 = cc_ent1->cc_chain;
cc_ent2 = handle2->bh_centry;
while (CENTRY_BLK(cc_ent2) != FBA_TO_BLK_NUM(fba_pos2))
cc_ent2 = cc_ent2->cc_chain;
if (BLK_FBA_OFF(fba_pos1) != BLK_FBA_OFF(fba_pos2)) {
/* Different offsets, do it slowly (per fba) */
while (fba_len) {
off1 = FBA_SIZE(BLK_FBA_OFF(fba_pos1));
off2 = FBA_SIZE(BLK_FBA_OFF(fba_pos2));
bcopy(cc_ent1->cc_data+off1, cc_ent2->cc_data+off2,
FBA_SIZE(1));
fba_pos1++;
fba_pos2++;
fba_len--;
if (FBA_TO_BLK_NUM(fba_pos1) != CENTRY_BLK(cc_ent1))
cc_ent1 = cc_ent1->cc_chain;
if (FBA_TO_BLK_NUM(fba_pos2) != CENTRY_BLK(cc_ent2))
cc_ent2 = cc_ent2->cc_chain;
}
DTRACE_PROBE(_sd_copy_end);
return (NSC_DONE);
}
cur_fba_len = fba_len;
st_cblk_off = BLK_FBA_OFF(fba_pos1);
st_cblk_len = BLK_FBAS - st_cblk_off;
if ((nsc_size_t)st_cblk_len >= fba_len) {
end_cblk_len = 0;
st_cblk_len = (sdbc_cblk_fba_t)fba_len;
} else {
end_cblk_len = BLK_FBA_OFF(fba_pos1 + fba_len);
}
bcopy(cc_ent1->cc_data + FBA_SIZE(st_cblk_off),
cc_ent2->cc_data + FBA_SIZE(st_cblk_off), FBA_SIZE(st_cblk_len));
cur_fba_len -= st_cblk_len;
cc_ent1 = cc_ent1->cc_chain;
cc_ent2 = cc_ent2->cc_chain;
while (cur_fba_len > (nsc_size_t)end_cblk_len) {
bcopy(cc_ent1->cc_data, cc_ent2->cc_data, CACHE_BLOCK_SIZE);
cc_ent1 = cc_ent1->cc_chain;
cc_ent2 = cc_ent2->cc_chain;
cur_fba_len -= BLK_FBAS;
}
if (cur_fba_len) {
bcopy(cc_ent1->cc_data, cc_ent2->cc_data,
FBA_SIZE(end_cblk_len));
}
return (NSC_DONE);
}
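/*
* Example of the fast/slow path split (assuming BLK_FBAS == 8): copying
* from fba_pos1 5 to fba_pos2 13 finds equal in-block offsets (5 and 5)
* and takes the block-at-a-time bcopy path above; copying to fba_pos2
* 12 (offset 4) must use the one-FBA-at-a-time loop, since source and
* destination straddle cache block boundaries differently.
*/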
/*
* _sd_copy_direct - Copies data from one handle direct to another disk.
*
* ARGUMENTS:
* handle1 - handle to read from
* handle2 - handle to write to
* fba_pos1 - disk block number to read from.
* fba_pos2 - disk block number to write to.
* fba_len - length in fbas.
*
* RETURNS:
* errno if return > 0
* NSC_DONE otherwise.
*
* Comments:
* This routine copies data from handle1 directly (sync write)
* onto the disk pointed to by handle2. The handle2 is then
* invalidated since the data it contains is now stale compared to
* the disk.
*/
static int
_sd_copy_direct(_sd_buf_handle_t *handle1, _sd_buf_handle_t *handle2,
nsc_off_t fba_pos1, nsc_off_t fba_pos2, nsc_size_t fba_len)
{
int rc;
if (_sdbc_shutdown_in_progress) {
DTRACE_PROBE(shutdown);
return (EIO);
}
if (!_SD_HANDLE_ACTIVE(handle1) || !_SD_HANDLE_ACTIVE(handle2)) {
cmn_err(CE_WARN,
"sdbc(_sd_copy_direct) handle %p or %p not active",
(void *)handle1, (void *)handle2);
DTRACE_PROBE2(handle_active2,
int, handle1->bh_flag,
int, handle2->bh_flag);
return (EINVAL);
}
ASSERT_HANDLE_LIMITS(handle1, fba_pos1, fba_len);
ASSERT_HANDLE_LIMITS(handle2, fba_pos2, fba_len);
if ((handle2->bh_flag & NSC_WRITE) == 0) {
cmn_err(CE_WARN,
"sdbc(_sd_copy_direct) handle2 %p is not writeable",
(void *)handle2);
DTRACE_PROBE1(handle2_write, int, handle2->bh_flag);
return (EINVAL);
}
rc = _sd_sync_write2(handle2, fba_pos2, fba_len, 0, handle1, fba_pos1);
return (rc);
}
/*
* _sd_enqueue_dirty - Enqueue a list of dirty buffers.
*
* ARGUMENTS:
* cd - cache descriptor.
* chain - pointer to list.
* cc_last - last entry in the chain.
* numq - number of entries in the list.
*
* RETURNS:
* NONE.
*
* Comments:
* This routine queues up the dirty blocks for io processing.
* It uses the cc_last to try to coalesce multiple lists into a
* single list, if consecutive writes are sequential in nature.
*/
void
_sd_enqueue_dirty(int cd, _sd_cctl_t *chain, _sd_cctl_t *cc_last, int numq)
{
_sd_cd_info_t *cdi;
_sd_cctl_t *last_ent;
int start_write = 0, maxq = SGIO_MAX;
ASSERT(cd >= 0);
cdi = &(_sd_cache_files[cd]);
#if defined(_SD_DEBUG)
if (chain->cc_dirty_link)
cmn_err(CE_WARN, "dirty_link set in enq %x fl %x",
chain->cc_dirty_link, chain->cc_flag);
#endif
/* was FAST */
mutex_enter(&(cdi->cd_lock));
cdi->cd_info->sh_numdirty += numq;
if (cc_last == NULL)
numq = 0;
if (cdi->cd_dirty_head == NULL) {
cdi->cd_dirty_head = cdi->cd_dirty_tail = chain;
cdi->cd_last_ent = cc_last;
cdi->cd_lastchain_ptr = chain;
cdi->cd_lastchain = numq;
} else {
if ((cc_last) && (last_ent = cdi->cd_last_ent) &&
(CENTRY_BLK(chain) == (CENTRY_BLK(last_ent)+1)) &&
(SDBC_DIRTY_NEIGHBORS(last_ent, chain)) &&
(cdi->cd_lastchain + numq < maxq)) {
cdi->cd_last_ent->cc_dirty_next = chain;
cdi->cd_last_ent = cc_last;
cdi->cd_lastchain += numq;
} else {
cdi->cd_dirty_tail->cc_dirty_link = chain;
cdi->cd_dirty_tail = chain;
cdi->cd_last_ent = cc_last;
cdi->cd_lastchain_ptr = chain;
cdi->cd_lastchain = numq;
start_write = 1;
}
}
/* was FAST */
mutex_exit(&(cdi->cd_lock));
if (start_write)
(void) _SD_CD_WRITER(cd);
}
/*
* _sd_enqueue_dirty_chain - Enqueue a chain of a list of dirty buffers.
*
* ARGUMENTS:
* cd - cache descriptor.
* chain_first - first list in this chain.
* chain_last - last list in this chain.
* numq - number of entries being queue (total of all lists)
*
* RETURNS:
* NONE.
*
* Comments:
* This routine is called from the processing after io completions.
* If the buffers are still dirty, they are queued up in one shot.
*/
void
_sd_enqueue_dirty_chain(int cd,
_sd_cctl_t *chain_first,
_sd_cctl_t *chain_last,
int numq)
{
_sd_cd_info_t *cdi;
ASSERT(cd >= 0);
cdi = &(_sd_cache_files[cd]);
if (chain_last->cc_dirty_link)
cmn_err(CE_PANIC,
"_sd_enqueue_dirty_chain: chain_last %p dirty_link %p",
(void *)chain_last, (void *)chain_last->cc_dirty_link);
/* was FAST */
mutex_enter(&(cdi->cd_lock));
cdi->cd_last_ent = NULL;
cdi->cd_lastchain_ptr = NULL;
cdi->cd_lastchain = 0;
cdi->cd_info->sh_numdirty += numq;
if (cdi->cd_dirty_head == NULL) {
cdi->cd_dirty_head = chain_first;
cdi->cd_dirty_tail = chain_last;
} else {
cdi->cd_dirty_tail->cc_dirty_link = chain_first;
cdi->cd_dirty_tail = chain_last;
}
/* was FAST */
mutex_exit(&(cdi->cd_lock));
}
#ifndef _MULTI_DATAMODEL
/* ARGSUSED */
#endif
static int
convert_stats(_sd_stats32_t *uptr)
/*
* Convert the 64 bit statistic structure to 32bit version.
* Possibly losing information when cache is > 4gb. Ha!
*
* NOTE: this code isn't really MT ready since the copied-to struct
* is static. However the race is pretty benign and isn't a whole
* lot worse than the vanilla version which copies data to user
* space from kernel structures that can be changing under it too.
* We can't use a local stack structure since the data size is
* 70k or so and kernel stacks are tiny (8k).
*/
{
#ifndef _MULTI_DATAMODEL
return (SDBC_EMODELCONVERT);
#else
int rc = 0;
/*
* This could be done in less code with bcopy type operations
* but this is simpler to follow and easier to change if
* the structures change.
*/
_sd_cache_stats32->net_dirty = _sd_cache_stats->net_dirty;
_sd_cache_stats32->net_pending = _sd_cache_stats->net_pending;
_sd_cache_stats32->net_free = _sd_cache_stats->net_free;
_sd_cache_stats32->st_count = _sd_cache_stats->st_count;
_sd_cache_stats32->st_loc_count = _sd_cache_stats->st_loc_count;
_sd_cache_stats32->st_rdhits = _sd_cache_stats->st_rdhits;
_sd_cache_stats32->st_rdmiss = _sd_cache_stats->st_rdmiss;
_sd_cache_stats32->st_wrhits = _sd_cache_stats->st_wrhits;
_sd_cache_stats32->st_wrmiss = _sd_cache_stats->st_wrmiss;
_sd_cache_stats32->st_blksize = _sd_cache_stats->st_blksize;
_sd_cache_stats32->st_lru_blocks = _sd_cache_stats->st_lru_blocks;
_sd_cache_stats32->st_lru_noreq = _sd_cache_stats->st_lru_noreq;
_sd_cache_stats32->st_lru_req = _sd_cache_stats->st_lru_req;
_sd_cache_stats32->st_wlru_inq = _sd_cache_stats->st_wlru_inq;
_sd_cache_stats32->st_cachesize = _sd_cache_stats->st_cachesize;
_sd_cache_stats32->st_numblocks = _sd_cache_stats->st_numblocks;
_sd_cache_stats32->st_wrcancelns = _sd_cache_stats->st_wrcancelns;
_sd_cache_stats32->st_destaged = _sd_cache_stats->st_destaged;
/*
* bcopy the shared stats which has nothing that needs conversion
* in them
*/
bcopy(_sd_cache_stats->st_shared, _sd_cache_stats32->st_shared,
sizeof (_sd_shared_t) * sdbc_max_devs);
if (copyout(_sd_cache_stats32, uptr, sizeof (_sd_stats32_t) +
(sdbc_max_devs - 1) * sizeof (_sd_shared_t)))
rc = EFAULT;
return (rc);
#endif /* _MULTI_DATAMODEL */
}
int
_sd_get_stats(_sd_stats_t *uptr, int convert_32)
{
int rc = 0;
if (_sd_cache_stats == NULL) {
static _sd_stats_t dummy;
#ifdef _MULTI_DATAMODEL
static _sd_stats32_t dummy32;
#endif
if (convert_32) {
#ifdef _MULTI_DATAMODEL
if (copyout(&dummy32, uptr, sizeof (_sd_stats32_t)))
rc = EFAULT;
#else
rc = SDBC_EMODELCONVERT;
#endif
} else if (copyout(&dummy, uptr, sizeof (_sd_stats_t)))
rc = EFAULT;
return (rc);
}
_sd_cache_stats->st_lru_blocks = _sd_lru_q.sq_inq;
_sd_cache_stats->st_lru_noreq = _sd_lru_q.sq_noreq_stat;
_sd_cache_stats->st_lru_req = _sd_lru_q.sq_req_stat;
if (sdbc_safestore) {
ssioc_stats_t ss_stats;
if (SSOP_CTL(sdbc_safestore, SSIOC_STATS,
(uintptr_t)&ss_stats) == 0)
_sd_cache_stats->st_wlru_inq = ss_stats.wq_inq;
else
_sd_cache_stats->st_wlru_inq = 0;
}
if (convert_32)
rc = convert_stats((_sd_stats32_t *)uptr);
else if (copyout(_sd_cache_stats, uptr,
sizeof (_sd_stats_t) + (sdbc_max_devs - 1) * sizeof (_sd_shared_t)))
rc = EFAULT;
return (rc);
}
int
_sd_set_hint(int cd, uint_t hint)
{
int ret = 0;
if (FILE_OPENED(cd)) {
SDTRACE(ST_ENTER|SDF_HINT, cd, 1, SDT_INV_BL, hint, 0);
_sd_cache_files[cd].cd_hint |= (hint & _SD_HINT_MASK);
SDTRACE(ST_EXIT|SDF_HINT, cd, 1, SDT_INV_BL, hint, ret);
} else
ret = EINVAL;
return (ret);
}
int
_sd_clear_hint(int cd, uint_t hint)
{
int ret = 0;
if (FILE_OPENED(cd)) {
SDTRACE(ST_ENTER|SDF_HINT, cd, 2, SDT_INV_BL, hint, 0);
_sd_cache_files[cd].cd_hint &= ~(hint & _SD_HINT_MASK);
SDTRACE(ST_EXIT|SDF_HINT, cd, 2, SDT_INV_BL, hint, ret);
} else
ret = EINVAL;
return (ret);
}
int
_sd_get_cd_hint(int cd, uint_t *hint)
{
*hint = 0;
if (FILE_OPENED(cd)) {
*hint = _sd_cache_files[cd].cd_hint;
return (0);
} else
return (EINVAL);
}
static int
_sd_node_hint_caller(blind_t hint, int hint_action)
{
int rc;
switch (hint_action) {
case NSC_GET_NODE_HINT:
rc = _sd_get_node_hint((uint_t *)hint);
break;
case NSC_SET_NODE_HINT:
rc = _sd_set_node_hint((uint_t)(unsigned long)hint);
break;
case NSC_CLEAR_NODE_HINT:
rc = _sd_clear_node_hint((uint_t)(unsigned long)hint);
break;
default:
rc = EINVAL;
break;
}
return (rc);
}
int
_sd_set_node_hint(uint_t hint)
{
SDTRACE(ST_ENTER|SDF_HINT, SDT_INV_CD, 3, SDT_INV_BL, hint, 0);
if ((_sd_node_hint & NSC_NO_FORCED_WRTHRU) &&
(hint & NSC_FORCED_WRTHRU))
return (EINVAL);
_sd_node_hint |= (hint & _SD_HINT_MASK);
SDTRACE(ST_EXIT|SDF_HINT, SDT_INV_CD, 3, SDT_INV_BL, hint, 0);
return (0);
}
int
_sd_clear_node_hint(uint_t hint)
{
SDTRACE(ST_ENTER|SDF_HINT, SDT_INV_CD, 4, SDT_INV_BL, hint, 0);
_sd_node_hint &= ~(hint & _SD_HINT_MASK);
SDTRACE(ST_EXIT|SDF_HINT, SDT_INV_CD, 4, SDT_INV_BL, hint, 0);
return (0);
}
int
_sd_get_node_hint(uint_t *hint)
{
*hint = _sd_node_hint;
return (0);
}
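/*
* Usage sketch (illustrative): the cache toggles forced write-through
* this way itself when safestore allocation fails (see
* _sd_alloc_write):
*
*	(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
*	...
*	(void) _sd_clear_node_hint(NSC_FORCED_WRTHRU);
*
* Setting NSC_FORCED_WRTHRU fails with EINVAL while
* NSC_NO_FORCED_WRTHRU is in effect.
*/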
int
_sd_get_partsize(blind_t xcd, nsc_size_t *ptr)
{
int cd = (int)(unsigned long)xcd;
if (FILE_OPENED(cd)) {
*ptr = _sd_cache_files[cd].cd_info->sh_filesize;
return (0);
} else
return (EINVAL);
}
int
_sd_get_maxfbas(blind_t xcd, int flag, nsc_size_t *ptr)
{
int cd = (int)(unsigned long)xcd;
if (!FILE_OPENED(cd))
return (EINVAL);
if (flag & NSC_CACHEBLK)
*ptr = BLK_FBAS;
else
*ptr = sdbc_max_fbas;
return (0);
}
int
_sd_control(blind_t xcd, int cmd, void *ptr, int len)
{
_sd_cd_info_t *cdi;
int cd = (int)(unsigned long)xcd;
cdi = &(_sd_cache_files[cd]);
return (nsc_control(cdi->cd_rawfd, cmd, ptr, len));
}
int
_sd_discard_pinned(blind_t xcd, nsc_off_t fba_pos, nsc_size_t fba_len)
{
int cd = (int)(unsigned long)xcd;
_sd_cctl_t *cc_ent, **cc_lst, **cc_tmp, *nxt;
ss_centry_info_t *wctl;
int found = 0;
nsc_off_t cblk;
_sd_cd_info_t *cdi = &_sd_cache_files[cd];
int rc;
if ((!FILE_OPENED(cd)) || (!cdi->cd_info->sh_failed)) {
return (EINVAL);
}
for (cblk = FBA_TO_BLK_NUM(fba_pos);
cblk < FBA_TO_BLK_LEN(fba_pos + fba_len); cblk++) {
if (cc_ent =
(_sd_cctl_t *)_sd_hash_search(cd, cblk, _sd_htable)) {
if (!CENTRY_PINNED(cc_ent))
continue;
/*
* remove cc_ent from failed links
* cc_lst - pointer to "cc_dirty_link" pointer
* starts at &cd_failed_head.
* cc_tmp - pointer to "cc_dirty_next"
* except when equal to cc_lst.
*/
mutex_enter(&cdi->cd_lock);
cc_tmp = cc_lst = &(cdi->cd_fail_head);
while (*cc_tmp != cc_ent) {
cc_tmp = &((*cc_tmp)->cc_dirty_next);
if (!*cc_tmp)
cc_lst = &((*cc_lst)->cc_dirty_link),
cc_tmp = cc_lst;
}
if (*cc_tmp) {
found++;
if (cc_lst != cc_tmp) /* break chain */
*cc_tmp = NULL;
nxt = cc_ent->cc_dirty_next;
if (nxt) {
nxt->cc_dirty_link =
(*cc_lst)->cc_dirty_link;
*cc_lst = nxt;
} else {
*cc_lst = (*cc_lst)->cc_dirty_link;
}
cdi->cd_info->sh_numfail--;
nsc_unpinned_data(cdi->cd_iodev,
BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
BLK_FBAS);
}
mutex_exit(&cdi->cd_lock);
/* clear dirty bits */
/* was FAST */
mutex_enter(&cc_ent->cc_lock);
cc_ent->cc_valid = cc_ent->cc_dirty = 0;
cc_ent->cc_flag &= ~(CC_QHEAD|CC_PEND_DIRTY|CC_PINNED);
cc_ent->cc_dirty_link = NULL;
wctl = cc_ent->cc_write;
cc_ent->cc_write = NULL;
/* was FAST */
mutex_exit(&cc_ent->cc_lock);
/* release cache block to head of LRU */
if (wctl) {
wctl->sc_flag = 0;
wctl->sc_dirty = 0;
SSOP_SETCENTRY(sdbc_safestore, wctl);
SSOP_DEALLOCRESOURCE(sdbc_safestore,
wctl->sc_res);
}
if (!sdbc_use_dmchain)
_sd_requeue_head(cc_ent);
}
}
rc = found ? NSC_DONE : EINVAL;
return (rc);
}
/*
* Handle allocation
*/
_sd_buf_hlist_t _sd_handle_list;
/*
* _sdbc_handles_unload - cache is being unloaded.
*/
void
_sdbc_handles_unload(void)
{
mutex_destroy(&_sd_handle_list.hl_lock);
}
/*
* _sdbc_handles_load - cache is being loaded.
*/
int
_sdbc_handles_load(void)
{
mutex_init(&_sd_handle_list.hl_lock, NULL, MUTEX_DRIVER, NULL);
return (0);
}
int
_sdbc_handles_configure(void)
{
_sd_handle_list.hl_count = 0;
_sd_handle_list.hl_top.bh_next = &_sd_handle_list.hl_top;
_sd_handle_list.hl_top.bh_prev = &_sd_handle_list.hl_top;
return (0);
}
/*
* _sdbc_handles_deconfigure - cache is being deconfigured
*/
void
_sdbc_handles_deconfigure(void)
{
_sd_handle_list.hl_count = 0;
}
_sd_buf_handle_t *
_sd_alloc_handle(sdbc_callback_fn_t d_cb, sdbc_callback_fn_t r_cb,
sdbc_callback_fn_t w_cb)
{
_sd_buf_handle_t *handle;
handle = (_sd_buf_handle_t *)kmem_zalloc(sizeof (_sd_buf_handle_t),
KM_SLEEP);
/* maintain list and count for debugging */
mutex_enter(&_sd_handle_list.hl_lock);
handle->bh_prev = &_sd_handle_list.hl_top;
handle->bh_next = _sd_handle_list.hl_top.bh_next;
_sd_handle_list.hl_top.bh_next->bh_prev = handle;
_sd_handle_list.hl_top.bh_next = handle;
++_sd_handle_list.hl_count;
mutex_exit(&_sd_handle_list.hl_lock);
#if !defined(_SD_NOCHECKS)
ASSERT(!(handle->bh_flag & (NSC_HALLOCATED | NSC_HACTIVE)));
#endif
handle->bh_disconnect_cb = d_cb;
handle->bh_read_cb = r_cb;
handle->bh_write_cb = w_cb;
handle->bh_flag |= NSC_HALLOCATED;
handle->bh_alloc_thread = nsc_threadp();
return (handle);
}
int
_sd_free_handle(_sd_buf_handle_t *handle)
{
if ((handle->bh_flag & NSC_HALLOCATED) == 0) {
cmn_err(CE_WARN, "sdbc(_sd_free_handle) handle %p not valid",
(void *)handle);
DTRACE_PROBE(_sd_free_handle_end);
return (EINVAL);
}
if (_SD_HANDLE_ACTIVE(handle)) {
cmn_err(CE_WARN,
"sdbc(_sd_free_handle) attempt to free active handle %p",
(void *)handle);
DTRACE_PROBE1(free_handle_active, int, handle->bh_flag);
return (EINVAL);
}
/* remove from queue before free */
mutex_enter(&_sd_handle_list.hl_lock);
handle->bh_prev->bh_next = handle->bh_next;
handle->bh_next->bh_prev = handle->bh_prev;
--_sd_handle_list.hl_count;
mutex_exit(&_sd_handle_list.hl_lock);
kmem_free(handle, sizeof (_sd_buf_handle_t));
return (0);
}
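/*
* Handle lifecycle sketch (illustrative; the buffer calls live
* elsewhere in this module):
*
*	h = _sd_alloc_handle(d_cb, r_cb, w_cb);	// NSC_HALLOCATED set
*	// ... _sd_alloc_buf() makes it active; io; _sd_free_buf() ...
*	if (_sd_free_handle(h) != 0)
*		// EINVAL: handle not allocated, or still active
*/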
#if !defined (_SD_8K_BLKSIZE)
#define _SD_MAX_MAP 0x100
#else /* !(_SD_8K_BLKSIZE) */
#define _SD_MAX_MAP 0x10000
#endif /* !(_SD_8K_BLKSIZE) */
char _sd_contig_bmap[_SD_MAX_MAP];
_sd_map_info_t _sd_lookup_map[_SD_MAX_MAP];
void
_sd_init_contig_bmap(void)
{
int i, j;
for (i = 1; i < _SD_MAX_MAP; i = ((i << 1) | 1))
for (j = i; j < _SD_MAX_MAP; j <<= 1)
_sd_contig_bmap[j] = 1;
}
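/*
* _sd_contig_bmap[m] ends up 1 exactly when m is a single contiguous
* run of set bits: the outer loop generates the runs anchored at bit 0
* (0x1, 0x3, 0x7, ...) and the inner loop shifts each run across the
* map. E.g. 0x38 (binary 00111000) is marked contiguous; 0x05 (0101)
* is not.
*/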
void
_sd_init_lookup_map(void)
{
unsigned int i, j, k;
int stpos, len;
_sd_bitmap_t mask;
for (i = 0; i < _SD_MAX_MAP; i++) {
for (j = i, k = 0; j && ((j & 1) == 0); j >>= 1, k++)
;
stpos = k;
_sd_lookup_map[i].mi_stpos = (unsigned char)k;
for (k = 0; j & 1; j >>= 1, k++)
;
len = k;
_sd_lookup_map[i].mi_len = (unsigned char)k;
_sd_lookup_map[i].mi_mask = SDBC_GET_BITS(stpos, len);
}
for (i = 0; i < _SD_MAX_MAP; i++) {
mask = (_sd_bitmap_t)i;
for (j = 0; mask; j++)
SDBC_LOOKUP_MODIFY(mask);
_sd_lookup_map[i].mi_dirty_count = (unsigned char)j;
}
for (i = 0; i < _SD_MAX_MAP; i++) {
_sd_lookup_map[i].mi_io_count = SDBC_LOOKUP_DTCOUNT(i);
mask = ~i;
_sd_lookup_map[i].mi_io_count += SDBC_LOOKUP_DTCOUNT(mask);
}
}
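/*
* Worked example of the lookup map (illustrative): for the bitmap
* i == 0x06 (binary 0110), mi_stpos is 1 (first set bit), mi_len is 2
* (run length), mi_mask is 0x06, and mi_dirty_count is 1, since
* SDBC_LOOKUP_MODIFY clears the single run in one pass; i == 0x05
* (0101) has two runs, so its mi_dirty_count is 2.
*/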
nsc_def_t _sd_sdbc_def[] = {
"Open", (uintptr_t)_sd_open_io, 0,
"Close", (uintptr_t)_sd_close_io, 0,
"Attach", (uintptr_t)_sdbc_io_attach_cd, 0,
"Detach", (uintptr_t)_sdbc_io_detach_cd, 0,
"AllocBuf", (uintptr_t)_sd_alloc_buf, 0,
"FreeBuf", (uintptr_t)_sd_free_buf, 0,
"Read", (uintptr_t)_sd_read, 0,
"Write", (uintptr_t)_sd_write, 0,
"Zero", (uintptr_t)_sd_zero, 0,
"Copy", (uintptr_t)_sd_copy, 0,
"CopyDirect", (uintptr_t)_sd_copy_direct, 0,
"Uncommit", (uintptr_t)_sd_uncommit, 0,
"AllocHandle", (uintptr_t)_sd_alloc_handle, 0,
"FreeHandle", (uintptr_t)_sd_free_handle, 0,
"Discard", (uintptr_t)_sd_discard_pinned, 0,
"Sizes", (uintptr_t)_sd_cache_sizes, 0,
"GetPinned", (uintptr_t)_sd_get_pinned, 0,
"NodeHints", (uintptr_t)_sd_node_hint_caller, 0,
"PartSize", (uintptr_t)_sd_get_partsize, 0,
"MaxFbas", (uintptr_t)_sd_get_maxfbas, 0,
"Control", (uintptr_t)_sd_control, 0,
"Provide", NSC_CACHE, 0,
0, 0, 0
};
/*
* do the SD_GET_CD_CLUSTER_DATA ioctl (get the global filename data)
*/
/* ARGSUSED */
int
sd_get_file_info_data(char *uaddrp)
{
return (ENOTTY);
}
/*
* do the SD_GET_CD_CLUSTER_SIZE ioctl (get size of global filename area)
*/
int
sd_get_file_info_size(void *uaddrp)
{
if (copyout(&_sdbc_gl_file_info_size, uaddrp,
sizeof (_sdbc_gl_file_info_size))) {
return (EFAULT);
}
return (0);
}
/*
* SD_GET_GLMUL_SIZES ioctl
* get sizes of the global info regions (for this node only)
*/
/* ARGSUSED */
int
sd_get_glmul_sizes(int *uaddrp)
{
return (ENOTTY);
}
/*
* SD_GET_GLMUL_INFO ioctl
* get the global metadata for write blocks (for this node only)
*/
/* ARGSUSED */
int
sd_get_glmul_info(char *uaddrp)
{
return (ENOTTY);
}
int
sdbc_global_stats_update(kstat_t *ksp, int rw)
{
sdbc_global_stats_t *sdbc_gstats;
_sd_stats_t *gstats_vars;
uint_t hint;
sdbc_gstats = (sdbc_global_stats_t *)(ksp->ks_data);
gstats_vars = _sd_cache_stats;
if (rw == KSTAT_WRITE) {
return (EACCES);
}
/* default to READ */
sdbc_gstats->ci_sdbc_count.value.ul = gstats_vars->st_count;
sdbc_gstats->ci_sdbc_loc_count.value.ul = gstats_vars->st_loc_count;
sdbc_gstats->ci_sdbc_rdhits.value.ul = (ulong_t)gstats_vars->st_rdhits;
sdbc_gstats->ci_sdbc_rdmiss.value.ul = (ulong_t)gstats_vars->st_rdmiss;
sdbc_gstats->ci_sdbc_wrhits.value.ul = (ulong_t)gstats_vars->st_wrhits;
sdbc_gstats->ci_sdbc_wrmiss.value.ul = (ulong_t)gstats_vars->st_wrmiss;
sdbc_gstats->ci_sdbc_blksize.value.ul =
(ulong_t)gstats_vars->st_blksize;
sdbc_gstats->ci_sdbc_lru_blocks.value.ul = (ulong_t)_sd_lru_q.sq_inq;
#ifdef DEBUG
sdbc_gstats->ci_sdbc_lru_noreq.value.ul =
(ulong_t)_sd_lru_q.sq_noreq_stat;
sdbc_gstats->ci_sdbc_lru_req.value.ul = (ulong_t)_sd_lru_q.sq_req_stat;
#endif
sdbc_gstats->ci_sdbc_wlru_inq.value.ul =
(ulong_t)gstats_vars->st_wlru_inq;
sdbc_gstats->ci_sdbc_cachesize.value.ul =
(ulong_t)gstats_vars->st_cachesize;
sdbc_gstats->ci_sdbc_numblocks.value.ul =
(ulong_t)gstats_vars->st_numblocks;
sdbc_gstats->ci_sdbc_wrcancelns.value.ul =
(ulong_t)gstats_vars->st_wrcancelns;
sdbc_gstats->ci_sdbc_destaged.value.ul =
(ulong_t)gstats_vars->st_destaged;
sdbc_gstats->ci_sdbc_num_shared.value.ul = (ulong_t)sdbc_max_devs;
(void) _sd_get_node_hint(&hint);
sdbc_gstats->ci_sdbc_nodehints.value.ul = (ulong_t)hint;
return (0);
}
int
sdbc_cd_stats_update(kstat_t *ksp, int rw)
{
sdbc_cd_stats_t *sdbc_shstats;
_sd_shared_t *shstats_vars;
int name_len;
uint_t hint;
sdbc_shstats = (sdbc_cd_stats_t *)(ksp->ks_data);
shstats_vars = (_sd_shared_t *)(ksp->ks_private);
if (rw == KSTAT_WRITE) {
return (EACCES);
}
/* copy tail of filename to kstat. leave 1 byte for null char */
if (shstats_vars->sh_filename != NULL) {
name_len = (int)strlen(shstats_vars->sh_filename);
name_len -= (KSTAT_DATA_CHAR_LEN - 1);
if (name_len < 0) {
name_len = 0;
}
(void) strlcpy(sdbc_shstats->ci_sdbc_vol_name.value.c,
shstats_vars->sh_filename + name_len, KSTAT_DATA_CHAR_LEN);
} else {
cmn_err(CE_WARN, "Kstat error: no volume name associated "
"with cache descriptor");
}
sdbc_shstats->ci_sdbc_failed.value.ul =
(ulong_t)shstats_vars->sh_failed;
sdbc_shstats->ci_sdbc_cd.value.ul = (ulong_t)shstats_vars->sh_cd;
sdbc_shstats->ci_sdbc_cache_read.value.ul =
(ulong_t)shstats_vars->sh_cache_read;
sdbc_shstats->ci_sdbc_cache_write.value.ul =
(ulong_t)shstats_vars->sh_cache_write;
sdbc_shstats->ci_sdbc_disk_read.value.ul =
(ulong_t)shstats_vars->sh_disk_read;
sdbc_shstats->ci_sdbc_disk_write.value.ul =
(ulong_t)shstats_vars->sh_disk_write;
#ifdef NSC_MULTI_TERABYTE
sdbc_shstats->ci_sdbc_filesize.value.ui64 =
(uint64_t)shstats_vars->sh_filesize;
#else
sdbc_shstats->ci_sdbc_filesize.value.ul =
(ulong_t)shstats_vars->sh_filesize;
#endif
sdbc_shstats->ci_sdbc_numdirty.value.ul =
(ulong_t)shstats_vars->sh_numdirty;
sdbc_shstats->ci_sdbc_numio.value.ul = (ulong_t)shstats_vars->sh_numio;
sdbc_shstats->ci_sdbc_numfail.value.ul =
(ulong_t)shstats_vars->sh_numfail;
sdbc_shstats->ci_sdbc_destaged.value.ul =
(ulong_t)shstats_vars->sh_destaged;
sdbc_shstats->ci_sdbc_wrcancelns.value.ul =
(ulong_t)shstats_vars->sh_wrcancelns;
(void) _sd_get_cd_hint(shstats_vars->sh_cd, &hint);
sdbc_shstats->ci_sdbc_cdhints.value.ul = (ulong_t)hint;
return (0);
}
/*
* cd_kstat_add
*
* Installs all kstats and associated infrastructure (mutex, buffer),
* associated with a particular cache descriptor. This function is called
* when the cache descriptor is opened in _sd_open().
* "cd" -- cache descriptor number whose kstats we wish to add
* returns: 0 on success, -1 on failure
*/
static int
cd_kstat_add(int cd)
{
char name[KSTAT_STRLEN];
if (cd < 0 || cd >= sdbc_max_devs) {
cmn_err(CE_WARN, "invalid cache descriptor: %d", cd);
return (-1);
}
/* create a regular kstat for this cache descriptor */
if (!sdbc_cd_kstats) {
cmn_err(CE_WARN, "sdbc_cd_kstats not allocated");
return (-1);
}
(void) snprintf(name, KSTAT_STRLEN, "%s%d", SDBC_KSTAT_CDSTATS, cd);
sdbc_cd_kstats[cd] = kstat_create(SDBC_KSTAT_MODULE,
cd, name, SDBC_KSTAT_CLASS, KSTAT_TYPE_NAMED,
sizeof (sdbc_cd_stats)/sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
if (sdbc_cd_kstats[cd] != NULL) {
sdbc_cd_kstats[cd]->ks_data = &sdbc_cd_stats;
sdbc_cd_kstats[cd]->ks_update = sdbc_cd_stats_update;
sdbc_cd_kstats[cd]->ks_private =
&_sd_cache_stats->st_shared[cd];
kstat_install(sdbc_cd_kstats[cd]);
} else {
cmn_err(CE_WARN, "cdstats %d kstat allocation failed", cd);
}
/* create an I/O kstat for this cache descriptor */
if (!sdbc_cd_io_kstats) {
cmn_err(CE_WARN, "sdbc_cd_io_kstats not allocated");
return (-1);
}
(void) snprintf(name, KSTAT_STRLEN, "%s%d", SDBC_IOKSTAT_CDSTATS, cd);
sdbc_cd_io_kstats[cd] = kstat_create(
SDBC_KSTAT_MODULE, cd, name, "disk", KSTAT_TYPE_IO, 1, 0);
if (sdbc_cd_io_kstats[cd]) {
if (!sdbc_cd_io_kstats_mutexes) {
cmn_err(CE_WARN, "sdbc_cd_io_kstats_mutexes not "
"allocated");
return (-1);
}
mutex_init(&sdbc_cd_io_kstats_mutexes[cd], NULL,
MUTEX_DRIVER, NULL);
sdbc_cd_io_kstats[cd]->ks_lock = &sdbc_cd_io_kstats_mutexes[cd];
kstat_install(sdbc_cd_io_kstats[cd]);
} else {
cmn_err(CE_WARN, "sdbc cd %d io kstat allocation failed", cd);
}
return (0);
}
/*
* cd_kstat_remove
*
* Uninstalls all kstats and associated infrastructure (mutex, buffer),
* associated with a particular cache descriptor. This function is called
* when the cache descriptor is closed in _sd_close().
* "cd" -- cache descriptor number whose kstats we wish to remove
* returns: 0 on success, -1 on failure
*/
static int
cd_kstat_remove(int cd)
{
if (cd < 0 || cd >= sdbc_max_devs) {
cmn_err(CE_WARN, "invalid cache descriptor: %d", cd);
return (-1);
}
/* delete the regular kstat corresponding to this cache descriptor */
if (sdbc_cd_kstats && sdbc_cd_kstats[cd]) {
kstat_delete(sdbc_cd_kstats[cd]);
sdbc_cd_kstats[cd] = NULL;
}
/* delete the I/O kstat corresponding to this cache descriptor */
if (sdbc_cd_io_kstats && sdbc_cd_io_kstats[cd]) {
kstat_delete(sdbc_cd_io_kstats[cd]);
sdbc_cd_io_kstats[cd] = NULL;
if (sdbc_cd_io_kstats_mutexes) {
/* destroy the mutex associated with this I/O kstat */
mutex_destroy(&sdbc_cd_io_kstats_mutexes[cd]);
}
}
return (0);
}
#ifdef DEBUG
/*
* kstat update
*/
int
sdbc_dynmem_kstat_update_dm(kstat_t *ksp, int rw)
{
sdbc_dynmem_dm_t *sdbc_dynmem;
_dm_process_vars_t *process_vars;
_dm_process_vars_t local_dm_process_vars;
simplect_dm++;
sdbc_dynmem = (sdbc_dynmem_dm_t *)(ksp->ks_data);
/* global dynmem_processing_dm */
process_vars = (_dm_process_vars_t *)(ksp->ks_private);
if (rw == KSTAT_WRITE) {
simplect_dm = sdbc_dynmem->ci_sdbc_simplect.value.ul;
local_dm_process_vars.monitor_dynmem_process =
sdbc_dynmem->ci_sdbc_monitor_dynmem.value.ul;
local_dm_process_vars.max_dyn_list =
sdbc_dynmem->ci_sdbc_max_dyn_list.value.ul;
local_dm_process_vars.cache_aging_ct1 =
sdbc_dynmem->ci_sdbc_cache_aging_ct1.value.ul;
local_dm_process_vars.cache_aging_ct2 =
sdbc_dynmem->ci_sdbc_cache_aging_ct2.value.ul;
local_dm_process_vars.cache_aging_ct3 =
sdbc_dynmem->ci_sdbc_cache_aging_ct3.value.ul;
local_dm_process_vars.cache_aging_sec1 =
sdbc_dynmem->ci_sdbc_cache_aging_sec1.value.ul;
local_dm_process_vars.cache_aging_sec2 =
sdbc_dynmem->ci_sdbc_cache_aging_sec2.value.ul;
local_dm_process_vars.cache_aging_sec3 =
sdbc_dynmem->ci_sdbc_cache_aging_sec3.value.ul;
local_dm_process_vars.cache_aging_pcnt1 =
sdbc_dynmem->ci_sdbc_cache_aging_pcnt1.value.ul;
local_dm_process_vars.cache_aging_pcnt2 =
sdbc_dynmem->ci_sdbc_cache_aging_pcnt2.value.ul;
local_dm_process_vars.max_holds_pcnt =
sdbc_dynmem->ci_sdbc_max_holds_pcnt.value.ul;
local_dm_process_vars.process_directive =
sdbc_dynmem->ci_sdbc_process_directive.value.ul;
(void) sdbc_edit_xfer_process_vars_dm(&local_dm_process_vars);
if (process_vars->process_directive & WAKE_DEALLOC_THREAD_DM) {
process_vars->process_directive &=
~WAKE_DEALLOC_THREAD_DM;
mutex_enter(&dynmem_processing_dm.thread_dm_lock);
cv_broadcast(&dynmem_processing_dm.thread_dm_cv);
mutex_exit(&dynmem_processing_dm.thread_dm_lock);
}
return (0);
}
/* default to READ */
sdbc_dynmem->ci_sdbc_simplect.value.ul = simplect_dm;
sdbc_dynmem->ci_sdbc_monitor_dynmem.value.ul =
process_vars->monitor_dynmem_process;
sdbc_dynmem->ci_sdbc_max_dyn_list.value.ul =
process_vars->max_dyn_list;
sdbc_dynmem->ci_sdbc_cache_aging_ct1.value.ul =
process_vars->cache_aging_ct1;
sdbc_dynmem->ci_sdbc_cache_aging_ct2.value.ul =
process_vars->cache_aging_ct2;
sdbc_dynmem->ci_sdbc_cache_aging_ct3.value.ul =
process_vars->cache_aging_ct3;
sdbc_dynmem->ci_sdbc_cache_aging_sec1.value.ul =
process_vars->cache_aging_sec1;
sdbc_dynmem->ci_sdbc_cache_aging_sec2.value.ul =
process_vars->cache_aging_sec2;
sdbc_dynmem->ci_sdbc_cache_aging_sec3.value.ul =
process_vars->cache_aging_sec3;
sdbc_dynmem->ci_sdbc_cache_aging_pcnt1.value.ul =
process_vars->cache_aging_pcnt1;
sdbc_dynmem->ci_sdbc_cache_aging_pcnt2.value.ul =
process_vars->cache_aging_pcnt2;
sdbc_dynmem->ci_sdbc_max_holds_pcnt.value.ul =
process_vars->max_holds_pcnt;
sdbc_dynmem->ci_sdbc_process_directive.value.ul =
process_vars->process_directive;
sdbc_dynmem->ci_sdbc_alloc_ct.value.ul = process_vars->alloc_ct;
sdbc_dynmem->ci_sdbc_dealloc_ct.value.ul = process_vars->dealloc_ct;
sdbc_dynmem->ci_sdbc_history.value.ul = process_vars->history;
sdbc_dynmem->ci_sdbc_nodatas.value.ul = process_vars->nodatas;
sdbc_dynmem->ci_sdbc_candidates.value.ul = process_vars->candidates;
sdbc_dynmem->ci_sdbc_deallocs.value.ul = process_vars->deallocs;
sdbc_dynmem->ci_sdbc_hosts.value.ul = process_vars->hosts;
sdbc_dynmem->ci_sdbc_pests.value.ul = process_vars->pests;
sdbc_dynmem->ci_sdbc_metas.value.ul = process_vars->metas;
sdbc_dynmem->ci_sdbc_holds.value.ul = process_vars->holds;
sdbc_dynmem->ci_sdbc_others.value.ul = process_vars->others;
sdbc_dynmem->ci_sdbc_notavail.value.ul = process_vars->notavail;
return (0);
}
#endif