ldom.c revision e4b86885570d77af552e9cf94f142f4d744fb8c8
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <pthread.h>
#include <errno.h>
#include <libnvpair.h>
#include <dlfcn.h>
#include <link.h>
#include <assert.h>
#include <sys/processor.h>
#include <sys/stat.h>
#include <sys/mdesc.h>
#include <sys/param.h>
#include <sys/systeminfo.h>
#include <sys/mem.h>
#include <sys/bl.h>
#include <sys/fm/protocol.h>
#include <fm/fmd_fmri.h>
#include <fm/fmd_agent.h>
#include <sys/pri.h>
#include "ldom.h"
#include "ldmsvcs_utils.h"
#define MD_STR_PLATFORM "platform"
#define MD_STR_DOM_CAPABLE "domaining-enabled"
static int ldom_ldmd_is_up = 0; /* assume stays up if ever seen up */
static void *ldom_dl_hp = (void *)NULL;
static const char *ldom_dl_path = "libpri.so.1";
static int ldom_dl_mode = (RTLD_NOW | RTLD_LOCAL);
static pthread_mutex_t ldom_pri_lock = PTHREAD_MUTEX_INITIALIZER;
static int ldom_pri_ref_cnt = 0; /* num of outstanding ldom_pri_init()s */
static int ldom_pri_init_done = 0; /* bool for real pri_init() done */
static int (*ldom_pri_fp_init)(void) = (int (*)(void))NULL;
static void (*ldom_pri_fp_fini)(void) = (void (*)(void))NULL;
static ssize_t (*ldom_pri_fp_get)(uint8_t wait, uint64_t *token, uint64_t **buf,
void *(*allocp)(size_t), void (*freep)(void *, size_t)) =
(ssize_t (*)(uint8_t wait, uint64_t *token, uint64_t **buf,
void *(*allocp)(size_t), void (*freep)(void *, size_t)))NULL;
static void
ldom_pri_config(void)
{
char isa[MAXNAMELEN]; /* used to see if machine is sun4v */
if (sysinfo(SI_MACHINE, isa, MAXNAMELEN) < 0)
return;
if (strcmp(isa, "sun4v") != 0)
return;
if ((ldom_dl_hp = dlopen(ldom_dl_path, ldom_dl_mode)) == NULL)
return;
ldom_pri_fp_init = (int (*)(void))dlsym(ldom_dl_hp, "pri_init");
ldom_pri_fp_fini = (void (*)(void))dlsym(ldom_dl_hp, "pri_fini");
ldom_pri_fp_get = (ssize_t (*)(uint8_t wait, uint64_t *token,
uint64_t **buf, void *(*allocp)(size_t),
void (*freep)(void *, size_t)))dlsym(ldom_dl_hp, "pri_get");
}
static void
ldom_pri_unconfig(void)
{
if (ldom_dl_hp == NULL)
return;
ldom_pri_fp_init = (int (*)(void))NULL;
ldom_pri_fp_fini = (void (*)(void))NULL;
ldom_pri_fp_get = (ssize_t (*)(uint8_t wait, uint64_t *token,
uint64_t **buf, void *(*allocp)(size_t),
void (*freep)(void *, size_t)))NULL;
(void) dlclose(ldom_dl_hp);
ldom_dl_hp = (void *)NULL;
}
/*
* ldom_pri_lock is assumed already held by anyone accessing ldom_pri_ref_cnt
*/
static int
ldom_pri_init(void)
{
if (ldom_pri_ref_cnt == 0) {
ldom_pri_config();
/*
* ldom_pri_init() is called before we know whether we
* have LDOMS FW or not; defer calling pri_init() via
* ldom_pri_fp_init until the first time we try to
* actually get a PRI
*/
}
ldom_pri_ref_cnt++;
assert(ldom_pri_ref_cnt > 0);
return (0);
}
static void
ldom_pri_fini(void)
{
assert(ldom_pri_ref_cnt > 0);
ldom_pri_ref_cnt--;
if (ldom_pri_ref_cnt == 0) {
if (ldom_pri_init_done && (ldom_pri_fp_fini != NULL)) {
(*ldom_pri_fp_fini)();
ldom_pri_init_done = 0;
}
ldom_pri_unconfig();
}
}
static ssize_t
ldom_pri_get(uint8_t wait, uint64_t *token, uint64_t **buf,
void *(*allocp)(size_t), void (*freep)(void *, size_t))
{
assert(ldom_pri_ref_cnt > 0);
if ((!ldom_pri_init_done) && (ldom_pri_fp_init != NULL)) {
if ((*ldom_pri_fp_init)() < 0)
return (-1);
ldom_pri_init_done = 1;
}
if (ldom_pri_fp_get != NULL)
return ((*ldom_pri_fp_get)(wait, token, buf, allocp, freep));
else
return (-1);
}
static ssize_t
get_local_core_md(ldom_hdl_t *lhp, uint64_t **buf)
{
int fh;
size_t size;
uint64_t *bufp;
if ((fh = open("/devices/pseudo/mdesc@0:mdesc", O_RDONLY, 0)) < 0)
return (-1);
if (ioctl(fh, MDESCIOCGSZ, &size) < 0) {
(void) close(fh);
return (-1);
}
bufp = (uint64_t *)lhp->allocp(size);
if (read(fh, bufp, size) < 0) {
lhp->freep(bufp, size);
(void) close(fh);
return (-1);
}
(void) close(fh);
*buf = bufp;
return ((ssize_t)size);
}
static int
get_local_md_prop_value(ldom_hdl_t *lhp, char *node, char *prop, uint64_t *val)
{
int rc = 1;
uint64_t *bufp;
ssize_t bufsiz;
if ((bufsiz = get_local_core_md(lhp, &bufp)) > 0) {
md_t *mdp;
if (mdp = md_init_intern(bufp, lhp->allocp, lhp->freep)) {
int num_nodes;
mde_cookie_t *listp;
num_nodes = md_node_count(mdp);
listp = lhp->allocp(sizeof (mde_cookie_t) * num_nodes);
if (md_scan_dag(mdp, MDE_INVAL_ELEM_COOKIE,
md_find_name(mdp, node),
md_find_name(mdp, "fwd"), listp) > 0 &&
md_get_prop_val(mdp, listp[0], prop, val) >= 0) {
/* found the property */
rc = 0;
}
lhp->freep(listp, sizeof (mde_cookie_t) * num_nodes);
(void) md_fini(mdp);
}
lhp->freep(bufp, bufsiz);
}
return (rc);
}
static int
ldom_getinfo(struct ldom_hdl *lhp)
{
static pthread_mutex_t mt = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int major_version = -1;
static int service_ldom = -1;
static int busy_init = 0;
int ier, rc = 0;
uint64_t domain_capable;
(void) pthread_mutex_lock(&mt);
while (busy_init == 1)
(void) pthread_cond_wait(&cv, &mt);
if (major_version != -1 && service_ldom != -1) {
lhp->major_version = major_version;
lhp->service_ldom = service_ldom;
(void) pthread_mutex_unlock(&mt);
return (0);
}
/*
* get to this point if major_version and service_ldom have not yet
* been determined
*/
busy_init = 1;
(void) pthread_mutex_unlock(&mt);
/*
* set defaults which correspond to the case of "LDOMS not
* available". note that these can (and will) also apply to
* non-sun4v machines.
*/
major_version = 0;
service_ldom = 0;
if (get_local_md_prop_value(lhp, MD_STR_PLATFORM, MD_STR_DOM_CAPABLE,
&domain_capable) == 0) {
/*
* LDOMS capable FW is installed; it should be ok to
* try to communicate with ldmd and if that fails/timesout
* then use libpri
*/
major_version = 1;
if ((ier = ldmsvcs_check_channel()) == 0) {
/*
* control ldom
* ldmfma channel between FMA and ldmd only exists
* on the control domain.
*/
service_ldom = 1;
} else if (ier == 1) {
/*
* guest ldom
* non-control ldom such as guest and io service ldom
*/
service_ldom = 0;
}
}
(void) pthread_mutex_lock(&mt);
lhp->major_version = major_version;
lhp->service_ldom = service_ldom;
busy_init = 0;
(void) pthread_mutex_unlock(&mt);
(void) pthread_cond_broadcast(&cv);
return (rc);
}
/*
* search the machine description for a "pid" entry (physical cpuid) and
* return the corresponding "id" entry (virtual cpuid).
* return -1 if not found.
* if the pid property does not exist in a cpu node, assume pid = id.
*/
static processorid_t
cpu_phys2virt(ldom_hdl_t *lhp, uint32_t cpuid)
{
char isa[MAXNAMELEN];
md_t *mdp;
mde_cookie_t *listp;
ssize_t bufsize;
processorid_t vid;
uint64_t *bufp;
uint64_t pval, pid, id;
int num_nodes, ncpus, i;
(void) sysinfo(SI_MACHINE, isa, MAXNAMELEN);
if (strcmp(isa, "sun4v") != 0)
return ((processorid_t)cpuid);
/*
* convert the physical cpuid to a virtual cpuid
*/
if ((bufsize = get_local_core_md(lhp, &bufp)) < 1)
return (-1);
if ((mdp = md_init_intern(bufp, lhp->allocp, lhp->freep)) == NULL ||
(num_nodes = md_node_count(mdp)) < 1) {
lhp->freep(bufp, bufsize);
return (-1);
}
listp = (mde_cookie_t *)lhp->allocp(sizeof (mde_cookie_t) * num_nodes);
ncpus = md_scan_dag(mdp, MDE_INVAL_ELEM_COOKIE,
md_find_name(mdp, "cpu"), md_find_name(mdp, "fwd"), listp);
vid = -1;
for (i = 0; i < ncpus; i++) {
if (md_get_prop_val(mdp, listp[i], "id", &pval) < 0)
pval = (uint64_t)-1;
id = pval;
/* if pid does not exist, assume pid=id */
if (md_get_prop_val(mdp, listp[i], "pid", &pval) < 0)
pval = id;
pid = pval;
if (pid == (uint64_t)cpuid) {
/* Found the entry */
vid = (processorid_t)id;
break;
}
}
lhp->freep(listp, sizeof (mde_cookie_t) * num_nodes);
(void) md_fini(mdp);
lhp->freep(bufp, bufsize);
return (vid);
}
int
ldom_fmri_status(ldom_hdl_t *lhp, nvlist_t *nvl)
{
char *name;
int ret = ENOTSUP;
if (nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &name) != 0)
return (EINVAL);
/*
* ldom_ldmd_is_up can only be true if ldom_major_version()
* returned 1 earlier; the major version is constant for the
* life of the client process
*/
if (!ldom_ldmd_is_up) {
/* Zeus is unavail; use local routines for status/retire */
if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) {
processorid_t vid;
uint32_t cpuid;
if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid)
== 0 && (vid = cpu_phys2virt(lhp, cpuid)) != -1)
return (p_online(vid, P_STATUS));
} else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) {
fmd_agent_hdl_t *hdl;
int err;
if ((hdl = fmd_agent_open(FMD_AGENT_VERSION)) == NULL) {
err = errno;
} else {
err = fmd_agent_page_isretired(hdl, nvl);
if (err == FMD_AGENT_RETIRE_DONE)
err = 0;
else
err = fmd_agent_errno(hdl);
fmd_agent_close(hdl);
}
return (err);
}
return (EINVAL);
} else {
/* Zeus is avail; use Zeus for status/retire */
if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) {
uint32_t cpuid;
if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID,
&cpuid) == 0)
ret = ldmsvcs_cpu_req_status(lhp, cpuid);
} else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) {
uint64_t pa;
if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR,
&pa) == 0)
ret = ldmsvcs_mem_req_status(lhp, pa);
else
ret = EINVAL;
}
return (ret);
}
}
int
ldom_fmri_retire(ldom_hdl_t *lhp, nvlist_t *nvl)
{
char *name;
int ret = ENOTSUP;
if (nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &name) != 0)
return (EINVAL);
/*
* ldom_ldmd_is_up can only be true if ldom_major_version()
* returned 1 earlier; the major version is constant for the
* life of the client process
*/
if (!ldom_ldmd_is_up) {
/* Zeus is unavail; use local routines for status/retire */
if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) {
processorid_t vid;
uint32_t cpuid;
if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid)
== 0 && (vid = cpu_phys2virt(lhp, cpuid)) != -1)
return (p_online(vid, P_FAULTED));
} else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) {
fmd_agent_hdl_t *hdl;
int err;
if ((hdl = fmd_agent_open(FMD_AGENT_VERSION)) == NULL) {
err = errno;
} else {
err = fmd_agent_page_retire(hdl, nvl);
if (err == FMD_AGENT_RETIRE_DONE)
err = 0;
else
err = fmd_agent_errno(hdl);
fmd_agent_close(hdl);
}
return (err);
}
return (EINVAL);
} else {
/* Zeus is avail; use Zeus for status/retire */
if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) {
uint32_t cpuid;
if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID,
&cpuid) == 0)
ret = ldmsvcs_cpu_req_offline(lhp, cpuid);
} else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) {
uint64_t pa;
if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR,
&pa) == 0)
ret = ldmsvcs_mem_req_retire(lhp, pa);
else
ret = EINVAL;
}
return (ret);
}
}
int
ldom_fmri_unretire(ldom_hdl_t *lhp, nvlist_t *nvl)
{
char *name;
int ret = ENOTSUP;
if (nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &name) != 0)
return (EINVAL);
/*
* ldom_ldmd_is_up can only be true if ldom_major_version()
* returned 1 earlier; the major version is constant for the
* life of the client process
*/
if (!ldom_ldmd_is_up) {
/* Zeus is unavail; use local routines for status/retire */
if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) {
processorid_t vid;
uint32_t cpuid;
if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid)
== 0 && (vid = cpu_phys2virt(lhp, cpuid)) != -1)
return (p_online(vid, P_ONLINE));
} else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) {
fmd_agent_hdl_t *hdl;
int err;
if ((hdl = fmd_agent_open(FMD_AGENT_VERSION)) == NULL) {
err = errno;
} else {
err = fmd_agent_page_unretire(hdl, nvl);
if (err == FMD_AGENT_RETIRE_DONE)
err = 0;
else
err = fmd_agent_errno(hdl);
fmd_agent_close(hdl);
}
return (err);
}
return (EINVAL);
} else {
/* Zeus is avail; use Zeus for status/retire */
if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) {
uint32_t cpuid;
if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID,
&cpuid) == 0)
ret = ldmsvcs_cpu_req_online(lhp, cpuid);
} else if (strcmp(name, FM_FMRI_SCHEME_MEM) == 0) {
uint64_t pa;
if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR,
&pa) == 0)
ret = ldmsvcs_mem_req_unretire(lhp, pa);
else
ret = EINVAL;
}
return (ret);
}
}
static int
fmri_blacklist(ldom_hdl_t *lhp, nvlist_t *nvl, int cmd)
{
char *name;
if (ldom_major_version(lhp) != 0)
return (0);
if (nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &name) != 0)
return (EINVAL);
if (strcmp(name, FM_FMRI_SCHEME_CPU) == 0) {
bl_req_t blr;
char *class;
int fd, rc, err;
if ((nvlist_lookup_string(nvl, FM_CLASS, &class) != 0) ||
(class == NULL) || (*class == '\0'))
return (EINVAL);
if ((fd = open("/dev/bl", O_RDONLY)) < 0)
return (EIO);
if (nvlist_size(nvl, &blr.bl_fmrisz, NV_ENCODE_NATIVE) != 0 ||
blr.bl_fmrisz == 0 ||
(blr.bl_fmri = (caddr_t)lhp->allocp(blr.bl_fmrisz)) ==
NULL) {
(void) close(fd);
return (EINVAL);
}
blr.bl_class = class;
rc = ioctl(fd, cmd, &blr);
err = errno;
lhp->freep((void *)&blr.bl_fmri, blr.bl_fmrisz);
(void) close(fd);
if (rc < 0 && err != ENOTSUP) {
errno = err;
return (-1);
}
}
return (0);
}
/*
* blacklist cpus in a non-LDOMS environment
*/
int
ldom_fmri_blacklist(ldom_hdl_t *lhp, nvlist_t *nvl)
{
return (fmri_blacklist(lhp, nvl, BLIOC_INSERT));
}
/*
* unblacklist cpus
*/
int
ldom_fmri_unblacklist(ldom_hdl_t *lhp, nvlist_t *nvl)
{
return (fmri_blacklist(lhp, nvl, BLIOC_DELETE));
}
ssize_t
ldom_get_core_md(ldom_hdl_t *lhp, uint64_t **buf)
{
ssize_t rv; /* return value */
uint64_t tok; /* opaque PRI token */
switch (ldom_major_version(lhp)) {
case 0:
/* pre LDOMS */
rv = get_local_core_md(lhp, buf);
break;
case 1:
/* LDOMS 1.0 - Zeus and libpri usable only on service dom */
if (ldom_on_service(lhp) == 1) {
if ((rv = ldmsvcs_get_core_md(lhp, buf)) < 1) {
(void) pthread_mutex_lock(&ldom_pri_lock);
rv = ldom_pri_get(PRI_GET, &tok,
buf, lhp->allocp, lhp->freep);
(void) pthread_mutex_unlock(&ldom_pri_lock);
} else {
ldom_ldmd_is_up = 1;
}
} else {
rv = get_local_core_md(lhp, buf);
}
break;
default:
rv = -1;
break;
}
return (rv);
}
/*
* version 0 means no LDOMS
*/
int
ldom_major_version(ldom_hdl_t *lhp)
{
if (lhp == NULL)
return (-1);
if (ldom_getinfo(lhp) == 0)
return (lhp->major_version);
else
return (0);
}
/*
* in the absence of ldoms we are on a single OS instance which is the
* equivalent of the service ldom
*/
int
ldom_on_service(ldom_hdl_t *lhp)
{
if (lhp == NULL)
return (-1);
if (ldom_getinfo(lhp) == 0)
return (lhp->service_ldom);
else
return (1);
}
ldom_hdl_t *
ldom_init(void *(*allocp)(size_t size),
void (*freep)(void *addr, size_t size))
{
struct ldom_hdl *lhp;
(void) pthread_mutex_lock(&ldom_pri_lock);
if (ldom_pri_init() < 0) {
(void) pthread_mutex_unlock(&ldom_pri_lock);
return (NULL);
}
if ((lhp = allocp(sizeof (struct ldom_hdl))) == NULL) {
ldom_pri_fini();
(void) pthread_mutex_unlock(&ldom_pri_lock);
return (NULL);
}
(void) pthread_mutex_unlock(&ldom_pri_lock);
lhp->major_version = -1; /* version not yet determined */
lhp->allocp = allocp;
lhp->freep = freep;
ldmsvcs_init(lhp);
return (lhp);
}
void
ldom_fini(ldom_hdl_t *lhp)
{
if (lhp == NULL)
return;
ldmsvcs_fini(lhp);
lhp->freep(lhp, sizeof (struct ldom_hdl));
(void) pthread_mutex_lock(&ldom_pri_lock);
ldom_pri_fini();
(void) pthread_mutex_unlock(&ldom_pri_lock);
}
/* end file */