dhcp_network.c revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2000 by Sun Microsystems, Inc.
* All rights reserved.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* This file contains public functions for managing DHCP network
* containers. For the semantics of these functions, please see the
* Enterprise DHCP Architecture Document.
*
* This module uses synchronization guarantees provided by dsvclockd(1M);
* please see $SRC/lib/libdhcpsvc/private/README.synch for details.
*
* Big Theory Statement for the SUNWbinfiles DHCP Network Module
* =============================================================
*
* 1. On-disk Structure
*
* Each container consists of two basic pieces on-disk: a header and an
* array of records. In order to provide fast client IP lookup, the array
* of records is directly indexed by client IP address (using a simple
* mapping function). In order to provide fast client id lookup, each
* in-use record is also on exactly one doubly-linked client id hash chain
* (the hash chain heads are contained in the header). For all other
* lookups, we can restrict our search to only the in-use records by merely
* walking all of the hash chains. Here's a crude illustration of what
* this looks like on-disk (note that hash chains 2 and 3 are empty):
*
*            _______________________________________________
*           | container info   | hash chain heads (buckets) |
* header    |                  | 1 | 2 | 3 |  [ .... ]  | N |
*           |                  | | |   |   |            | | |
*           |__________________|_|________________________|_|
*           | rec1      | rec2   |  | rec3      | rec4    | |
*           |           |        +--->          |         | |
*           | unused    | unused    | hash1     | unused  | |
*           |___________|___________|________^|_|_________|_|
*           | rec5      | rec6      | rec7   |v | rec8    | |
*           |           |           |          ->         | |
* records   | unused    | hashN     | hash1    <- hash1   | |
*           |___________|________^|_|___________|_________|_|
*           |           :        :: :           :         : |
*           |           :        :: : [ more records... ] : |
*           |           :        :: :           :         : |
*           |___________:________::_:___________:_________:_|
*           | recN-3    | recN-2 || | recN-1    | recN    v |
*           |           |        |+-->         ->           |
*           | unused    | unused +--- hashN    <- hashN     |
*           |___________|___________|___________|___________|
*
* Note that the actual on-disk format is a bit more complicated than this
* due to robustness issues; see section 3 below for details.
*
* 2. Robustness Requirements
*
* This module has been designed to be as efficient as possible while still
* retaining the robustness minimally required for an enterprise-level
* environment. In particular, it is designed to handle the following
* failure situations:
*
* 1. An update operation (add, modify, delete) on a container is
* unable to complete due to an unexpected internal error at
* any point in the update code.
*
* 2. An update operation (add, modify, delete) on a container is
* unable to complete due to unexpected program termination while
* at any point in the update code.
*
* If either of these situations occurs, the container in question must be
* left in a consistent (and viable) state. In addition, only the pending
* transaction (at most) may be lost.
*
* 3. Robustness Techniques
*
* This module uses a few different techniques to meet our robustness goals
* while maintaining high performance. The biggest problem we encounter
* when trying to achieve robustness is updating the client id hash chain.
* In particular, it is not possible to atomically add, move, or delete an
* item from a doubly linked list, thus creating a window where a crash
* could leave our hash chains in an inconsistent state.
*
* To address this problem, we actually maintain two images (copies) of all
* the hash chains in the container. At any point in time, exactly one of
* the two images is active (and thus considered authoritative), as
* indicated by a byte in the container header. When performing an update
* operation, all hash chain modifications are done on the *inactive*
* image, then, once the inactive image has completed the hash chain
* operations required by the update, the active and inactive images are
* atomically switched, making the formerly-inactive image authoritative.
* After the image switch, the update code then updates the formerly-active
* image's hash chains to match the active image's hash chains.
*
* This approach has the nice property that internal container consistency
* can always be restored after a crash by just resynchronizing the
* inactive image's hash chains with the active image's chains. Note that
* the atomic image switch serves as the "commit point" for the operation:
* if we crash before this point, we roll back the operation upon recovery
* and it appears as though the operation never happened; if we crash after
* this point, we roll forward the rest of the operation upon recovery as
* if the crash had not happened.
*
* This technique is enough to robustly implement our add and delete
* operations, but modify has an additional complication due to our direct
* mapping of client IP addresses to records. In particular, unless the
* record modification includes changing the client IP address, the
* modified record must be written at the same location as the original
* record -- however, if the modify operation fails part way through
* writing out the new client record, the record will be corrupt and we
* will have no way to return the record to a consistent state. To address
* this issue, we allocate a spare record in the container header called
* the "temporary" record. Upon a modification of this type, we first
* write the modified record to the temporary record and indicate that the
* temporary record is currently proxying for the actual record. We then
* copy the temporary record to the actual record and make the temporary
* record available again for future use. If a crash occurs before the
* copy to the temporary record is complete, then we just roll back as if
* the modify never happened (since we have not modified the actual
* record). If a crash occurs after copying the temporary record, we roll
* forward and complete the copy operation as if the crash never happened.
* Note that there are some additional subtle complications here; see the
* comments in the code for details.
*/
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <netinet/in.h>
#include <dhcp_svc_public.h>
#include <stdlib.h>
#include <dirent.h>
#include <string.h>
#include <libgen.h>
#include <errno.h>
#include <stddef.h>
#include <assert.h>
#include "dhcp_network.h"
#include "util.h"
/*
* Prototypes for the file-local (static) helper routines used by the
* public container functions below.
*/
static uint16_t cidhash(const uchar_t *, size_t);
static void net2path(char *, size_t, const char *, ipaddr_t);
static int check_dn(dn_handle_t *);
static int getabyte(int, off_t, uchar_t *);
static int setabyte(int, off_t, uchar_t);
static int read_rec(int, dn_filerec_t *, dn_recid_t);
static int write_rec(int, dn_filerec_t *, dn_recid_t);
static int read_header(int, dn_header_t *, boolean_t);
static int write_header(int, dn_header_t *);
static int read_hashhead(int, dn_recid_t *, uint16_t, uchar_t);
static int write_hashhead(int, dn_recid_t, uint16_t, uchar_t);
static boolean_t record_match(const dn_rec_t *, const dn_rec_t *, uint_t);
/*
 * Open the DHCP network container for network `netp' that lives under
 * `dir'.  When DSVC_CREATE is set in `flags' the container file has just
 * been created by open_file(), so its header is initialized (using the
 * netmask `maskp') and room for all records is reserved; otherwise the
 * on-disk header is validated against `netp'.  On success a freshly
 * allocated handle is stored through `handlep' and DSVC_SUCCESS is
 * returned.  On failure a DSVC_* error is returned, `*handlep' is left
 * untouched, and a partially created container is removed again.
 */
int
open_dn(void **handlep, const char *dir, uint_t flags,
    const struct in_addr *netp, const struct in_addr *maskp)
{
	dn_header_t	hdr = { 0 };
	char		path[MAXPATHLEN];
	dn_handle_t	*dhp;
	off_t		size;
	int		bucket, error;

	dhp = malloc(sizeof (dn_handle_t));
	if (dhp == NULL)
		return (DSVC_NO_MEMORY);

	/*
	 * A dn_header_t (which embeds a dn_rec_t) must keep its on-disk
	 * size.  A mismatch is a programming error that would corrupt the
	 * container if we carried on, hence assert() rather than an error
	 * return.
	 */
	/* CONSTCOND */
	assert(sizeof (hdr) == 32768);

	net2path(path, MAXPATHLEN, dir, netp->s_addr);
	error = open_file(path, flags, &dhp->dh_fd);
	if (error != DSVC_SUCCESS) {
		free(dhp);
		return (error);
	}

	if ((flags & DSVC_CREATE) == 0) {
		/*
		 * Pre-existing container: the header on disk must agree
		 * with what we were asked to open.  A mismatch means that
		 * the file was scribbled on or that its creation was cut
		 * short.
		 */
		if (read_header(dhp->dh_fd, &hdr, B_FALSE) == -1) {
			error = syserr_to_dsvcerr(errno);
			goto fail_close;
		}

		if (!(hdr.dnh_network == netp->s_addr &&
		    hdr.dnh_version == DSVC_CONVER &&
		    hdr.dnh_magic == DN_MAGIC)) {
			error = DSVC_INTERNAL;
			goto fail_close;
		}
	} else {
		/*
		 * Brand new container: build the header and write it out.
		 * `dnh_version' deliberately stays zero until everything
		 * else is on disk so that a partial creation is detectable.
		 */
		hdr.dnh_version = 0;
		hdr.dnh_network = netp->s_addr;
		hdr.dnh_netmask = maskp->s_addr;
		hdr.dnh_magic = DN_MAGIC;
		hdr.dnh_tempimage = DN_NOIMAGE;
		hdr.dnh_image = 0;
		hdr.dnh_errors = 0;
		hdr.dnh_checks = 0;

		/* Both images of every hash chain start out empty. */
		for (bucket = 0; bucket < DN_CIDHASHSZ; bucket++) {
			hdr.dnh_cidhash[bucket][0] = DN_NOREC;
			hdr.dnh_cidhash[bucket][1] = DN_NOREC;
		}

		if (write_header(dhp->dh_fd, &hdr) == -1) {
			error = syserr_to_dsvcerr(errno);
			goto fail_remove;
		}

		/*
		 * Reserve the space for every possible record up front by
		 * touching the last byte of the file.  The resulting holes
		 * read back as zeros, and a record whose rec_prev is
		 * DN_NOREC (zero) is by definition unused.
		 */
		size = RECID2OFFSET(RECID(~0, hdr.dnh_netmask) + 1);
		error = setabyte(dhp->dh_fd, size - 1, 0);
		if (error != DSVC_SUCCESS)
			goto fail_remove;

		/*
		 * Stamping the version is what makes the container usable.
		 */
		error = setabyte(dhp->dh_fd,
		    offsetof(dn_header_t, dnh_version), DSVC_CONVER);
		if (error != DSVC_SUCCESS)
			goto fail_remove;
	}

	dhp->dh_netmask = hdr.dnh_netmask;
	dhp->dh_oflags = flags;
	*handlep = dhp;
	return (DSVC_SUCCESS);

fail_remove:
	(void) remove_dn(dir, netp);
fail_close:
	(void) close_dn((void **)&dhp);
	return (error);
}
/*
 * Close the container referenced by `*handlep' and free its handle.
 * Returns DSVC_SUCCESS, or DSVC_INTERNAL if close(2) fails -- in which
 * case the handle is NOT freed.
 */
int
close_dn(void **handlep)
{
	dn_handle_t	*dhp = *handlep;

	if (close(dhp->dh_fd) != -1) {
		free(dhp);
		return (DSVC_SUCCESS);
	}

	return (DSVC_INTERNAL);
}
/*
 * Remove the container file for network `netp' from directory `dir'.
 * Returns DSVC_SUCCESS, or the DSVC_* translation of errno if unlink(2)
 * fails.
 */
int
remove_dn(const char *dir, const struct in_addr *netp)
{
	char	path[MAXPATHLEN];

	net2path(path, sizeof (path), dir, netp->s_addr);

	return (unlink(path) == -1 ? syserr_to_dsvcerr(errno) : DSVC_SUCCESS);
}
/*
 * Look up records in the container referenced by `handle' that match
 * `targetp' on the fields selected by `query'.  At most `count' matches
 * are gathered (no limit if `count' is negative).  The number of matches
 * is stored through `nrecordsp'; if `recordsp' is non-NULL the matching
 * records are also returned as a list through it, otherwise they are only
 * counted.  If memory runs out while building the list and `partial' is
 * set, the matches gathered so far are returned with DSVC_SUCCESS;
 * otherwise DSVC_NO_MEMORY is returned.  The handle must have been opened
 * with DSVC_READ or DSVC_ACCESS is returned.
 */
int
lookup_dn(void *handle, boolean_t partial, uint_t query, int count,
    const dn_rec_t *targetp, dn_rec_list_t **recordsp, uint_t *nrecordsp)
{
	enum { BY_CIP, BY_CID, BY_SCAN } mode;
	dn_handle_t	*dhp = handle;
	int		fd = dhp->dh_fd;
	int		error = DSVC_SUCCESS;
	uint_t		found, pass;
	uint16_t	hash;
	uchar_t		image;
	dn_recid_t	recid, proxied = DN_NOREC;
	dn_filerec_t	frec;
	dn_header_t	header;
	dn_rec_t	*copy;
	dn_rec_list_t	*list = NULL, *grown;

	if (!(dhp->dh_oflags & DSVC_READ))
		return (DSVC_ACCESS);

	if (read_header(fd, &header, B_FALSE) == -1)
		return (syserr_to_dsvcerr(errno));

	/*
	 * An earlier update may have died part-way through.  The active
	 * image's hash chains are always trustworthy, but (see modify_dn())
	 * one record may have been left half-written, with its real
	 * contents parked in the header's temporary record.  Remember which
	 * record that is so we can substitute the temporary record for it.
	 * We do not repair the container here because lookups are MT-hot.
	 */
	if (header.dnh_dirty && header.dnh_tempimage == header.dnh_image) {
		proxied = RECID(header.dnh_temp.rec_dn.dn_cip.s_addr,
		    header.dnh_netmask);
	}

	image = header.dnh_image;

	/*
	 * Pick the traversal strategy once:
	 *   BY_CIP  - DN_QCIP equality: exactly one candidate record.
	 *   BY_CID  - DN_QCID equality: walk one client id hash chain.
	 *   BY_SCAN - anything else: walk every client id hash chain.
	 */
	if (DSVC_QISEQ(query, DN_QCIP))
		mode = BY_CIP;
	else if (DSVC_QISEQ(query, DN_QCID))
		mode = BY_CID;
	else
		mode = BY_SCAN;

	for (pass = 0, found = 0; count < 0 || found < count; pass++) {
		if (mode == BY_CIP) {
			/* Only one possible entry (which may be unused). */
			if (pass != 0)
				break;
			recid = RECID(targetp->dn_cip.s_addr, dhp->dh_netmask);
		} else if (pass != 0) {
			/*
			 * Step to the next record on the current chain; a
			 * record that names itself as successor ends the
			 * walk (sanity check).
			 */
			if (recid == frec.rec_next[image])
				break;
			recid = frec.rec_next[image];
		} else if (mode == BY_CID) {
			/*
			 * Client id 00 may legitimately appear more than
			 * once (and other duplicates are not impossible),
			 * so the whole chain is walked rather than stopping
			 * at the first hit.
			 */
			hash = cidhash(targetp->dn_cid, targetp->dn_cid_len);
			if (read_hashhead(fd, &recid, hash, image) == -1)
				return (syserr_to_dsvcerr(errno));
		} else {
			/* Full scan: pull in the header with all buckets. */
			hash = 0;
			if (read_header(fd, &header, B_TRUE) == -1)
				return (syserr_to_dsvcerr(errno));
			recid = header.dnh_cidhash[hash][image];
		}

		if (mode == BY_SCAN) {
			/* Chain exhausted: move on to the next non-empty one */
			while (recid == DN_NOREC && ++hash < DN_CIDHASHSZ)
				recid = header.dnh_cidhash[hash][image];
		}

		/* Nothing left to examine. */
		if (recid == DN_NOREC)
			break;

		/* The temporary record is authoritative for this one. */
		if (recid == proxied)
			recid = DN_TEMPREC;

		if (read_rec(fd, &frec, recid) == -1) {
			error = syserr_to_dsvcerr(errno);
			break;
		}

		/* Skip records that are unused or that do not match. */
		if (frec.rec_prev[image] == DN_NOREC)
			continue;

		if (!record_match(&frec.rec_dn, targetp, query))
			continue;

		/* Caller only wants a count of the matching records. */
		if (recordsp == NULL) {
			found++;
			continue;
		}

		/* Copy the record out and append it to the result list. */
		copy = malloc(sizeof (dn_rec_t));
		if (copy == NULL) {
			if (!partial)
				error = DSVC_NO_MEMORY;
			break;
		}
		*copy = frec.rec_dn;

		grown = add_dnrec_to_list(copy, list);
		if (grown == NULL) {
			free(copy);
			if (!partial)
				error = DSVC_NO_MEMORY;
			break;
		}

		list = grown;
		found++;
	}

	if (error != DSVC_SUCCESS) {
		if (list != NULL)
			free_dnrec_list(list);
		return (error);
	}

	*nrecordsp = found;
	if (recordsp != NULL)
		*recordsp = list;
	return (DSVC_SUCCESS);
}
/*
 * Decide whether record `dnp' satisfies the query `query' against the
 * target `targetp'.  For each field, an "equal" query requires the field
 * to be the same in both records and a "not equal" query requires it to
 * differ; fields not named in the query are never examined.  Returns
 * B_TRUE on a match and B_FALSE otherwise.
 */

/*
 * True if `query' asks for equality on `field' but `same' is false, or
 * asks for inequality on `field' but `same' is true.  `same' is evaluated
 * only when the query actually involves `field'.
 */
#define	DN_QUERY_REJECTS(query, field, same)		\
	((DSVC_QISEQ((query), (field)) && !(same)) ||	\
	(DSVC_QISNEQ((query), (field)) && (same)))

static boolean_t
record_match(const dn_rec_t *dnp, const dn_rec_t *targetp, uint_t query)
{
	unsigned int	qbits[] = { DN_QFDYNAMIC, DN_QFAUTOMATIC, DN_QFMANUAL,
			    DN_QFUNUSABLE, DN_QFBOOTP_ONLY };
	unsigned int	fbits[] = { DN_FDYNAMIC, DN_FAUTOMATIC, DN_FMANUAL,
			    DN_FUNUSABLE, DN_FBOOTP_ONLY };
	unsigned int	empty;
	unsigned int	k;

	/* An empty query matches everything; skip all the checks. */
	DSVC_QINIT(empty);
	if (query == empty)
		return (B_TRUE);

	if (DN_QUERY_REJECTS(query, DN_QLEASE,
	    targetp->dn_lease == dnp->dn_lease))
		return (B_FALSE);

	if (DN_QUERY_REJECTS(query, DN_QCIP,
	    dnp->dn_cip.s_addr == targetp->dn_cip.s_addr))
		return (B_FALSE);

	/* Client ids are equal when both length and contents agree. */
	if (DN_QUERY_REJECTS(query, DN_QCID,
	    dnp->dn_cid_len == targetp->dn_cid_len &&
	    memcmp(dnp->dn_cid, targetp->dn_cid, dnp->dn_cid_len) == 0))
		return (B_FALSE);

	if (DN_QUERY_REJECTS(query, DN_QSIP,
	    dnp->dn_sip.s_addr == targetp->dn_sip.s_addr))
		return (B_FALSE);

	if (DN_QUERY_REJECTS(query, DN_QMACRO,
	    strcmp(targetp->dn_macro, dnp->dn_macro) == 0))
		return (B_FALSE);

	/* Each flag bit can be queried independently of the others. */
	for (k = 0; k < sizeof (qbits) / sizeof (qbits[0]); k++) {
		if (DN_QUERY_REJECTS(query, qbits[k],
		    (dnp->dn_flags & fbits[k]) ==
		    (targetp->dn_flags & fbits[k])))
			return (B_FALSE);
	}

	return (B_TRUE);
}

#undef	DN_QUERY_REJECTS
/*
 * Add the record `addp' to the container referenced by `handle'.  The
 * handle must have been opened with DSVC_WRITE (else DSVC_ACCESS).
 * Returns DSVC_EXISTS if the slot for the record's client IP address is
 * already in use.  On success, the signature generated for the new record
 * is handed back in `addp->dn_sig'.
 *
 * The record is first linked onto the head of its client id hash chain in
 * the inactive image only; switching the active image is the commit
 * point, after which the formerly active image is brought up to date.
 * Throughout, `cur' indexes the active image and `alt' the inactive one.
 */
int
add_dn(void *handle, dn_rec_t *addp)
{
	dn_handle_t	*dhp = handle;
	int		fd = dhp->dh_fd;
	dn_filerec_t	self, succ;
	dn_recid_t	self_id, head_id;
	uint16_t	bucket;
	uchar_t		cur, alt;
	int		error;

	if (!(dhp->dh_oflags & DSVC_WRITE))
		return (DSVC_ACCESS);

	if ((error = check_dn(dhp)) != DSVC_SUCCESS)
		return (error);

	bucket = cidhash(addp->dn_cid, addp->dn_cid_len);

	/* Find out which image is currently active. */
	error = getabyte(fd, offsetof(dn_header_t, dnh_image), &cur);
	if (error != DSVC_SUCCESS)
		return (error);
	alt = !cur;

	/* The slot for this client IP address must not be in use. */
	self_id = RECID(addp->dn_cip.s_addr, dhp->dh_netmask);
	if (read_rec(fd, &self, self_id) == -1)
		return (syserr_to_dsvcerr(errno));

	if (self.rec_prev[cur] != DN_NOREC)
		return (DSVC_EXISTS);

	/*
	 * Prepare the record for insertion at the head of its hash chain.
	 * Only the inactive image's links are filled in, so the record
	 * becomes in-use atomically when the images are switched.
	 */
	if (read_hashhead(fd, &head_id, bucket, cur) == -1)
		return (syserr_to_dsvcerr(errno));

	self.rec_dn = *addp;
	self.rec_dn.dn_sig = gensig();
	self.rec_prev[alt] = DN_HASHHEAD;
	self.rec_next[alt] = head_id;

	/* If the chain is not empty, load the record we will precede. */
	if (self.rec_next[alt] != DN_NOREC) {
		if (read_rec(fd, &succ, self.rec_next[alt]) == -1)
			return (syserr_to_dsvcerr(errno));
	}

	/*
	 * Flag the container dirty before touching anything on disk, so
	 * an inconsistency can never go unnoticed.
	 */
	error = setabyte(fd, offsetof(dn_header_t, dnh_dirty), 1);
	if (error != DSVC_SUCCESS)
		return (error);

	/* Write the new record; it is not reachable via hash yet. */
	if (write_rec(fd, &self, self_id) == -1)
		return (syserr_to_dsvcerr(errno));

	/*
	 * Link the record into the inactive image's chain.  This has to
	 * happen before the switch so that we can never abort with an
	 * inconsistent active image.
	 */
	if (self.rec_next[alt] != DN_NOREC) {
		succ.rec_prev[alt] = self_id;
		if (write_rec(fd, &succ, self.rec_next[alt]) == -1)
			return (syserr_to_dsvcerr(errno));
	}
	if (write_hashhead(fd, self_id, bucket, alt) == -1)
		return (syserr_to_dsvcerr(errno));

	/*
	 * Commit point: make the inactive image the active one.  A failure
	 * beyond here is rolled forward on recovery.
	 */
	cur = alt;
	alt = !cur;
	error = setabyte(fd, offsetof(dn_header_t, dnh_image), cur);
	if (error != DSVC_SUCCESS)
		return (error);

	/* Bring the formerly active image's links into line. */
	self.rec_prev[alt] = self.rec_prev[cur];
	self.rec_next[alt] = self.rec_next[cur];
	if (write_rec(fd, &self, self_id) == -1)
		return (syserr_to_dsvcerr(errno));

	if (self.rec_next[alt] != DN_NOREC) {
		succ.rec_prev[alt] = self_id;
		if (write_rec(fd, &succ, self.rec_next[alt]) == -1)
			return (syserr_to_dsvcerr(errno));
	}
	if (write_hashhead(fd, self_id, bucket, alt) == -1)
		return (syserr_to_dsvcerr(errno));

	/* Hand the new signature back to the caller. */
	addp->dn_sig = self.rec_dn.dn_sig;

	/* All done; the container is clean again. */
	return (setabyte(fd, offsetof(dn_header_t, dnh_dirty), 0));
}
int
delete_dn(void *handle, const dn_rec_t *delp)
{
dn_filerec_t rec, rec_prev, rec_next;
dn_recid_t recid;
uint16_t hash;
uchar_t image;
int retval;
dn_handle_t *dhp = (dn_handle_t *)handle;
int fd = dhp->dh_fd;
if ((dhp->dh_oflags & DSVC_WRITE) == 0)
return (DSVC_ACCESS);
retval = check_dn(dhp);
if (retval != DSVC_SUCCESS)
return (retval);
/*
* Get the active image.
*/
retval = getabyte(fd, offsetof(dn_header_t, dnh_image), &image);
if (retval != DSVC_SUCCESS)
return (retval);
/*
* Find the original entry in the network table, make sure the
* record is in-use, and check the signature field (to guard
* against collisions).
*/
recid = RECID(delp->dn_cip.s_addr, dhp->dh_netmask);
if (read_rec(fd, &rec, recid) == -1)
return (syserr_to_dsvcerr(errno));
if (rec.rec_prev[image] == DN_NOREC)
return (DSVC_NOENT);
hash = cidhash(rec.rec_dn.dn_cid, rec.rec_dn.dn_cid_len);
/*
* The signatures must match to delete a record, *except* when
* delp->dn_sig == 0. This is so records can be deleted that
* weren't retrieved via lookup_dn()
*/
if (delp->dn_sig != 0 && rec.rec_dn.dn_sig != delp->dn_sig)
return (DSVC_COLLISION);
/*
* Read our neighboring records.
*/
if (rec.rec_next[image] != DN_NOREC) {
if (read_rec(fd, &rec_next, rec.rec_next[image]) == -1)
return (syserr_to_dsvcerr(errno));
}
if (rec.rec_prev[image] != DN_HASHHEAD) {
if (read_rec(fd, &rec_prev, rec.rec_prev[image]) == -1)
return (syserr_to_dsvcerr(errno));
}
/*
* Before we update the alternate image's on-disk hash pointers,
* mark the container as dirty so that there's no chance the
* container is inconsistent without us knowing about it.
*/
retval = setabyte(fd, offsetof(dn_header_t, dnh_dirty), 1);
if (retval != DSVC_SUCCESS)
return (retval);
/*
* Update the alternate image's on-disk hash pointers. We need to
* do this before we switch to the alternate image so we do not
* abort with an inconsistent active image. Also reset the
* record's alternate image record id pointers, so that the old
* record will not be in-use when we switch to the alternate image.
*/
if (rec.rec_next[image] != DN_NOREC) {
rec_next.rec_prev[!image] = rec.rec_prev[image];
if (write_rec(fd, &rec_next, rec.rec_next[image]) == -1)
return (syserr_to_dsvcerr(errno));
}
if (rec.rec_prev[image] != DN_HASHHEAD) {
rec_prev.rec_next[!image] = rec.rec_next[image];
if (write_rec(fd, &rec_prev, rec.rec_prev[image]) == -1)
return (syserr_to_dsvcerr(errno));
} else {
if (write_hashhead(fd, rec.rec_next[image], hash, !image) == -1)
return (syserr_to_dsvcerr(errno));
}
rec.rec_next[!image] = DN_NOREC;
rec.rec_prev[!image] = DN_NOREC;
if (write_rec(fd, &rec, recid) == -1)
return (syserr_to_dsvcerr(errno));
/*
* Activate the alternate image. This is our commit point -- if we
* fail after this point, we will roll forward on recovery.
*/
image = !image;
retval = setabyte(fd, offsetof(dn_header_t, dnh_image), image);
if (retval != DSVC_SUCCESS)
return (retval);
/*
* Update the old record id pointers to match.
*/
if (rec.rec_next[!image] != DN_NOREC) {
rec_next.rec_prev[!image] = rec.rec_prev[!image];
if (write_rec(fd, &rec_next, rec.rec_next[!image]) == -1)
return (syserr_to_dsvcerr(errno));
}
if (rec.rec_prev[!image] != DN_HASHHEAD) {
rec_prev.rec_next[!image] = rec.rec_next[!image];
if (write_rec(fd, &rec_prev, rec.rec_prev[!image]) == -1)
return (syserr_to_dsvcerr(errno));
} else {
if (write_hashhead(fd, rec.rec_next[!image], hash, !image)
== -1)
return (syserr_to_dsvcerr(errno));
}
rec.rec_next[!image] = DN_NOREC;
rec.rec_prev[!image] = DN_NOREC;
if (write_rec(fd, &rec, recid) == -1)
return (syserr_to_dsvcerr(errno));
/*
* Finally, mark the container as clean.
*/
return (setabyte(fd, offsetof(dn_header_t, dnh_dirty), 0));
}
/*
* Replace the record identified by `origp' (via its client IP address)
* with the contents of `newp' in the container referenced by `handle'.
* The handle must have been opened with DSVC_WRITE (else DSVC_ACCESS).
* Returns DSVC_NOENT if the original record is not in use,
* DSVC_COLLISION if `origp->dn_sig' does not match the signature on disk,
* and DSVC_EXISTS if the modification changes the client IP address to
* one whose record is already in use. On success, the new signature
* (the original signature plus one) is handed back in `newp->dn_sig'.
*
* All hash chain changes are first applied to the inactive image; the
* image switch is the commit point, after which the formerly active image
* is updated to match. When the record stays at the same record id, the
* new contents are staged in the temporary record (DN_TEMPREC) before the
* actual record is overwritten.
*/
int
modify_dn(void *handle, const dn_rec_t *origp, dn_rec_t *newp)
{
dn_filerec_t rec, new_rec, rec_head, rec_next, rec_prev;
dn_recid_t recid, new_recid, recid_head;
uint16_t hash, new_hash;
uchar_t image;
int retval;
dn_handle_t *dhp = (dn_handle_t *)handle;
int fd = dhp->dh_fd;
if ((dhp->dh_oflags & DSVC_WRITE) == 0)
return (DSVC_ACCESS);
/*
* Make sure the container is consistent before we start.
*/
retval = check_dn(dhp);
if (retval != DSVC_SUCCESS)
return (retval);
/*
* Get the active image
*/
retval = getabyte(fd, offsetof(dn_header_t, dnh_image), &image);
if (retval != DSVC_SUCCESS)
return (retval);
/*
* Find the original entry in the network table, make sure the
* entry is in-use, and check the signature field (to guard against
* collisions).
*/
recid = RECID(origp->dn_cip.s_addr, dhp->dh_netmask);
if (read_rec(fd, &rec, recid) == -1)
return (syserr_to_dsvcerr(errno));
if (rec.rec_prev[image] == DN_NOREC)
return (DSVC_NOENT);
if (rec.rec_dn.dn_sig != origp->dn_sig)
return (DSVC_COLLISION);
/*
* Check if the record id is changing (as a result of modifying the
* IP address). If it is, then make sure the new one is available
* (if not, fail with DSVC_EXISTS).
*
* NOTE(review): when the record id is NOT changing, `new_rec' is never
* read from disk, so its active-image (`[image]') record id pointers
* have not been assigned when it is first written out below -- confirm
* that nothing depends on them before they are set after the switch.
*/
new_recid = RECID(newp->dn_cip.s_addr, dhp->dh_netmask);
if (recid != new_recid) {
if (read_rec(fd, &new_rec, new_recid) == -1)
return (syserr_to_dsvcerr(errno));
if (new_rec.rec_prev[image] != DN_NOREC)
return (DSVC_EXISTS);
}
/*
* Update the record with the new information; the signature is
* bumped by one relative to the caller's original signature.
*/
new_rec.rec_dn = *newp;
new_rec.rec_dn.dn_sig = origp->dn_sig + 1;
/*
* Find out if our hash chain is changing. If so, then update the
* new record's record id pointers to be on the new chain;
* otherwise just take the original record's pointers. Note that
* in either case, only update the alternate image pointers, so
* that the new record becomes in-use when we switch to the
* alternate image.
*/
hash = cidhash(rec.rec_dn.dn_cid, rec.rec_dn.dn_cid_len);
new_hash = cidhash(newp->dn_cid, newp->dn_cid_len);
if (hash == new_hash) {
new_rec.rec_prev[!image] = rec.rec_prev[image];
new_rec.rec_next[!image] = rec.rec_next[image];
} else {
if (read_hashhead(fd, &recid_head, new_hash, image) == -1)
return (syserr_to_dsvcerr(errno));
new_rec.rec_prev[!image] = DN_HASHHEAD;
new_rec.rec_next[!image] = recid_head;
}
/*
* Write the record out; if this means overwriting the old record,
* then write to a temporary record instead.
*/
if (write_rec(fd, &new_rec, new_recid == recid ? DN_TEMPREC : new_recid)
== -1)
return (syserr_to_dsvcerr(errno));
/*
* Mark the container as dirty so that there's no chance the
* container is inconsistent without us knowing about it.
*/
retval = setabyte(fd, offsetof(dn_header_t, dnh_dirty), 1);
if (retval != DSVC_SUCCESS)
return (retval);
/*
* If we've changed either the hash chain or the record id, then
* update our neighboring records' record id pointers. If we're
* changing hash chains, then remove ourselves from the old
* hash chain and insert ourselves on the new one -- otherwise, if
* we're changing record id's, then update our neighbors with our
* new record id. Note that we only apply these changes to the
* alternate image for now so that we can recover upon failure.
*/
if (hash != new_hash || recid != new_recid) {
if (rec.rec_next[image] != DN_NOREC) {
if (read_rec(fd, &rec_next, rec.rec_next[image]) == -1)
return (syserr_to_dsvcerr(errno));
}
if (rec.rec_prev[image] != DN_HASHHEAD) {
if (read_rec(fd, &rec_prev, rec.rec_prev[image]) == -1)
return (syserr_to_dsvcerr(errno));
}
/*
* Both neighbor structures are assigned here even if they were
* not read above; they are only written back when the neighbor
* exists. When there is no previous record, the value stored in
* rec_prev.rec_next[!image] is what gets written to the hash head
* below.
*/
if (hash != new_hash) {
rec_next.rec_prev[!image] = rec.rec_prev[!image];
rec_prev.rec_next[!image] = rec.rec_next[!image];
} else {
rec_next.rec_prev[!image] = new_recid;
rec_prev.rec_next[!image] = new_recid;
}
if (rec.rec_next[image] != DN_NOREC) {
if (write_rec(fd, &rec_next, rec.rec_next[image]) == -1)
return (syserr_to_dsvcerr(errno));
}
if (rec.rec_prev[image] != DN_HASHHEAD) {
if (write_rec(fd, &rec_prev, rec.rec_prev[image]) == -1)
return (syserr_to_dsvcerr(errno));
} else {
if (write_hashhead(fd, rec_prev.rec_next[!image], hash,
!image) == -1)
return (syserr_to_dsvcerr(errno));
}
/*
* If our hash is changing, update the alternate image
* record id pointers to point to our moved record.
*/
if (hash != new_hash) {
if (recid_head != DN_NOREC) {
if (read_rec(fd, &rec_head, recid_head) == -1)
return (syserr_to_dsvcerr(errno));
rec_head.rec_prev[!image] = new_recid;
if (write_rec(fd, &rec_head, recid_head) == -1)
return (syserr_to_dsvcerr(errno));
}
if (write_hashhead(fd, new_recid, new_hash, !image)
== -1)
return (syserr_to_dsvcerr(errno));
}
/*
* If our record id is changing, reset the old record's
* alternate image record id pointers, so that the old
* record will not be in-use once we switch over to the
* alternate image.
*/
if (recid != new_recid) {
rec.rec_prev[!image] = DN_NOREC;
rec.rec_next[!image] = DN_NOREC;
if (write_rec(fd, &rec, recid) == -1)
return (syserr_to_dsvcerr(errno));
}
}
/*
* If we're using the temporary record, then set `dnh_tempimage' to
* the image that will be active when we're done. This piece of
* state is critical in the case of failure, since it indicates
* both that the temporary record is valid, and tells us whether we
* failed before or after activating the alternate image (below).
* If we failed before activating the alternate image, then the
* failure code can just reset `dnh_tempimage' to DN_NOIMAGE and
* resynchronize the pointers. Otherwise, we failed somewhere
* after making the alternate image active but before we completed
* copying the temporary record over to the actual record, which
* the recovery code will then complete on our behalf before
* resynchronizing the pointers.
*/
if (recid == new_recid) {
retval = setabyte(fd, offsetof(dn_header_t, dnh_tempimage),
!image);
if (retval != DSVC_SUCCESS)
return (retval);
}
/*
* Activate the alternate image. This is our commit point -- if we
* fail after this point, we will roll forward on recovery. Note
* that from here on `image' names the newly active image and
* `!image' the formerly active one.
*/
image = !image;
retval = setabyte(fd, offsetof(dn_header_t, dnh_image), image);
if (retval != DSVC_SUCCESS)
return (retval);
/*
* If we used the temporary record, copy the data into the actual
* record. Once finished, reset `dnh_tempimage' to DN_NOIMAGE
* since the temporary record no longer needs to be used.
*/
if (recid == new_recid) {
if (write_rec(fd, &new_rec, new_recid) == -1)
return (syserr_to_dsvcerr(errno));
retval = setabyte(fd, offsetof(dn_header_t, dnh_tempimage),
DN_NOIMAGE);
if (retval != DSVC_SUCCESS)
return (retval);
}
/*
* Update the old record id pointers to match.
*/
new_rec.rec_prev[!image] = new_rec.rec_prev[image];
new_rec.rec_next[!image] = new_rec.rec_next[image];
if (write_rec(fd, &new_rec, new_recid) == -1)
return (syserr_to_dsvcerr(errno));
/*
* Repeat the neighbor updates on the formerly active image.
*
* NOTE(review): `image' was flipped above, so rec.rec_next[image] and
* rec.rec_prev[image] now index the newly active image. When
* recid != new_recid, those entries of the in-memory `rec' were set
* to DN_NOREC before the switch, so the tests below see DN_NOREC
* rather than the original neighbors -- confirm this is intended.
*/
if (hash != new_hash || recid != new_recid) {
if (rec.rec_next[image] != DN_NOREC) {
rec_next.rec_prev[!image] = rec.rec_prev[image];
if (write_rec(fd, &rec_next, rec.rec_next[image]) == -1)
return (syserr_to_dsvcerr(errno));
}
if (rec.rec_prev[image] != DN_HASHHEAD) {
rec_prev.rec_next[!image] = rec.rec_next[image];
if (write_rec(fd, &rec_prev, rec.rec_prev[image]) == -1)
return (syserr_to_dsvcerr(errno));
} else {
if (write_hashhead(fd, rec.rec_next[image], hash,
!image) == -1)
return (syserr_to_dsvcerr(errno));
}
/*
* If our hash changed, update the alternate image record
* id pointers to point to our moved record.
*/
if (hash != new_hash) {
if (recid_head != DN_NOREC) {
rec_head.rec_prev[!image] =
rec_head.rec_prev[image];
if (write_rec(fd, &rec_head, recid_head) == -1)
return (syserr_to_dsvcerr(errno));
}
if (write_hashhead(fd, new_recid, new_hash, !image)
== -1)
return (syserr_to_dsvcerr(errno));
}
/*
* If our record id changed, then finish marking the old
* record as "not in use".
*/
if (recid != new_recid) {
rec.rec_prev[!image] = DN_NOREC;
rec.rec_next[!image] = DN_NOREC;
if (write_rec(fd, &rec, recid) == -1)
return (syserr_to_dsvcerr(errno));
}
}
/*
* Update the signature on the new record handed back to the caller.
*/
newp->dn_sig = new_rec.rec_dn.dn_sig;
/*
* Finally, mark the container as clean.
*/
return (setabyte(fd, offsetof(dn_header_t, dnh_dirty), 0));
}
/*
 * List the DHCP network containers found in directory `location'.  Every
 * directory entry named "SUNWbinfilesX_A_B_C_D" whose version X equals
 * DSVC_CONVER contributes the dotted string "A.B.C.D" to the result.  On
 * success, `*listppp' is set to a malloc()ed array of strdup()ed strings
 * (NULL if there are none), `*countp' to the number of entries, and
 * DSVC_SUCCESS is returned.  On failure nothing is stored through the
 * output pointers, everything allocated here is released, and one of
 * DSVC_ACCESS, DSVC_NO_LOCATION, DSVC_NO_MEMORY or DSVC_INTERNAL is
 * returned.
 */
int
list_dn(const char *location, char ***listppp, uint_t *countp)
{
	char		ipaddr[INET_ADDRSTRLEN];
	uint64_t	direntbuf[(MAXPATHLEN + sizeof (struct dirent)) / 8];
	struct dirent	*result, *dirent = (struct dirent *)&direntbuf;
	DIR		*dirp;
	unsigned int	i, count = 0;
	char		*re = NULL, **new_listpp, **listpp = NULL;
	char		conver[4];
	int		error;

	dirp = opendir(location);
	if (dirp == NULL) {
		switch (errno) {
		case EACCES:
		case EPERM:
			return (DSVC_ACCESS);
		case ENOENT:
			return (DSVC_NO_LOCATION);
		default:
			break;
		}
		return (DSVC_INTERNAL);
	}

	/*
	 * Compile a regular expression matching "SUNWbinfilesX_" (where X
	 * is a container version number) followed by an IP address
	 * (roughly speaking).  Note that the $N constructions allow us to
	 * get the container version and IP address when calling regex(3C).
	 */
	re = regcmp("^SUNWbinfiles([0-9]{1,3})$0_"
	    "(([0-9]{1,3}_){3}[0-9]{1,3})$1$", (char *)0);
	if (re == NULL) {
		/*
		 * Leave through the common failure path so that the
		 * directory stream opened above is closed rather than
		 * leaked.
		 */
		error = DSVC_NO_MEMORY;
		goto fail;
	}

	for (;;) {
		/*
		 * readdir_r() is very broken; see 4329196 -- in the
		 * meantime, workaround as best we can.  Note that a
		 * readdir_r() error simply ends the scan.
		 */
		error = readdir_r(dirp, dirent, &result);
		if (error != 0 || result == NULL)
			break;

		if (regex(re, result->d_name, conver, ipaddr) != NULL) {
			/* Ignore containers of other versions. */
			if (atoi(conver) != DSVC_CONVER)
				continue;

			/* The file name uses '_' where the address has '.' */
			for (i = 0; ipaddr[i] != '\0'; i++)
				if (ipaddr[i] == '_')
					ipaddr[i] = '.';

			/*
			 * Grow the array by one slot; keep the old pointer
			 * until realloc() succeeds so that the failure path
			 * can still free what we have.
			 */
			new_listpp = realloc(listpp,
			    (sizeof (char *)) * (count + 1));
			if (new_listpp == NULL) {
				error = DSVC_NO_MEMORY;
				goto fail;
			}
			listpp = new_listpp;
			listpp[count] = strdup(ipaddr);
			if (listpp[count] == NULL) {
				error = DSVC_NO_MEMORY;
				goto fail;
			}
			count++;
		}
	}
	free(re);
	(void) closedir(dirp);

	*countp = count;
	*listppp = listpp;
	return (DSVC_SUCCESS);

fail:
	free(re);
	(void) closedir(dirp);

	for (i = 0; i < count; i++)
		free(listpp[i]);
	free(listpp);
	return (error);
}
/*
 * fsck-style check of the DHCP network container open on `dhp'.  When the
 * container's dirty byte is clear nothing else is touched.  Otherwise the
 * alternate image is brought back in line with the working image, which
 * should always be possible unless the container has been externally
 * corrupted.  Returns a DSVC_* code.
 */
static int
check_dn(dn_handle_t *dhp)
{
	const int	fd = dhp->dh_fd;
	dn_header_t	hdr;
	dn_filerec_t	filerec;
	dn_recid_t	id, last_id;
	uchar_t		cur, alt, is_dirty;
	uint16_t	bucket;
	int		err;

	/*
	 * Peek at just the dirty byte first.  Pulling in the whole header
	 * is costly, so it is deferred until we know a repair is needed
	 * (on an E4500 this trims about 20 percent off the wall-clock time
	 * of creating a 5000-record datastore).
	 */
	err = getabyte(fd, offsetof(dn_header_t, dnh_dirty), &is_dirty);
	if (err != DSVC_SUCCESS)
		return (err);
	if (is_dirty == 0)
		return (DSVC_SUCCESS);

	if (read_header(fd, &hdr, B_TRUE) == -1)
		return (syserr_to_dsvcerr(errno));

	/*
	 * A temporary image equal to the working image means a modify_dn()
	 * was interrupted part way through.  Flush the temporary record to
	 * its slot before doing anything else, then retire it.  Kludgy, but
	 * no better approach is apparent.
	 */
	if (hdr.dnh_tempimage == hdr.dnh_image) {
		id = RECID(hdr.dnh_temp.rec_dn.dn_cip.s_addr, hdr.dnh_netmask);
		if (write_rec(fd, &hdr.dnh_temp, id) == -1)
			return (syserr_to_dsvcerr(errno));
		hdr.dnh_tempimage = DN_NOIMAGE;
	}

	cur = hdr.dnh_image;
	alt = !cur;

	/*
	 * The header gets rewritten below regardless, so copy every hash
	 * head from the working image to the alternate one unconditionally.
	 */
	for (bucket = 0; bucket < DN_CIDHASHSZ; bucket++)
		hdr.dnh_cidhash[bucket][alt] = hdr.dnh_cidhash[bucket][cur];

	/*
	 * Walk every record slot rather than the hash chains: a dirty
	 * record is not necessarily hashed (an add_dn() may have died
	 * half way, for example).
	 */
	last_id = RECID(~0, hdr.dnh_netmask);
	for (id = RECID(0, hdr.dnh_netmask); id <= last_id; id++) {
		if (read_rec(fd, &filerec, id) == -1)
			return (syserr_to_dsvcerr(errno));

		/* Both images already agree: nothing to repair here. */
		if (filerec.rec_next[cur] == filerec.rec_next[alt] &&
		    filerec.rec_prev[cur] == filerec.rec_prev[alt])
			continue;

		/* Count the mismatch, fix the alternate links, write back. */
		hdr.dnh_errors++;
		filerec.rec_prev[alt] = filerec.rec_prev[cur];
		filerec.rec_next[alt] = filerec.rec_next[cur];
		if (write_rec(fd, &filerec, id) == -1)
			return (syserr_to_dsvcerr(errno));
	}

	hdr.dnh_checks++;
	if (write_header(fd, &hdr) == -1)
		return (syserr_to_dsvcerr(errno));

	/*
	 * Everything is consistent again; clear the dirty byte last.
	 */
	return (setabyte(fd, offsetof(dn_header_t, dnh_dirty), 0));
}
/*
 * Format into `path' (a buffer of `pathlen' bytes) the name of the DHCP
 * network table for IP network `ip' inside directory `dir'.  The name
 * carries the container version DSVC_CONVER followed by the four octets of
 * `ip', most significant first, separated by underscores.  The result of
 * snprintf() is deliberately discarded.
 */
static void
net2path(char *path, size_t pathlen, const char *dir, ipaddr_t ip)
{
	const ipaddr_t	octet_a = ip >> 24;
	const ipaddr_t	octet_b = (ip >> 16) & 0xff;
	const ipaddr_t	octet_c = (ip >> 8) & 0xff;
	const ipaddr_t	octet_d = ip & 0xff;

	(void) snprintf(path, pathlen, "%s/SUNWbinfiles%u_%d_%d_%d_%d", dir,
	    DSVC_CONVER, octet_a, octet_b, octet_c, octet_d);
}
/*
 * Hash the `cidlen'-byte client identifier `cid' to a bucket number in the
 * range 0 .. DN_CIDHASHSZ - 1.  The hash is a CRC16, chosen because it is
 * known to be very evenly distributed.
 */
static uint16_t
cidhash(const uchar_t *cid, size_t cidlen)
{
	const uint16_t	poly = 0x8408;	/* mutated CRC-CCITT polynomial */
	uint16_t	crc = 0xffff;
	uint16_t	low_bit;
	size_t		pos;
	uchar_t		step;

	for (pos = 0; pos < cidlen; pos++) {
		/* Fold the next byte into the low bits of the running CRC. */
		crc ^= cid[pos];

		/* Eight shift steps; xor in the polynomial on carry-out. */
		for (step = 0; step < 8; step++) {
			low_bit = crc & 1;
			crc >>= 1;
			if (low_bit != 0)
				crc ^= poly;
		}
	}

	return (crc % DN_CIDHASHSZ);
}
/*
 * Flip the multi-byte fields of the dn_filerec_t `rec' in place between
 * native (host) and network byte order; the same routine serves both
 * directions.  Without _LITTLE_ENDIAN defined the body is empty and `rec'
 * is left alone.
 */
/* ARGSUSED */
static void
nhconvert_rec(dn_filerec_t *rec)
{
#ifdef	_LITTLE_ENDIAN
	dn_rec_t	*dn = &rec->rec_dn;
	unsigned int	img;

	/* Hash chain links, both images: back pointers, then forward. */
	for (img = 0; img < 2; img++) {
		nhconvert(&rec->rec_prev[img], &rec->rec_prev[img],
		    sizeof (dn_recid_t));
	}
	for (img = 0; img < 2; img++) {
		nhconvert(&rec->rec_next[img], &rec->rec_next[img],
		    sizeof (dn_recid_t));
	}

	/* Multi-byte members of the embedded public record. */
	nhconvert(&dn->dn_cip.s_addr, &dn->dn_cip.s_addr, sizeof (ipaddr_t));
	nhconvert(&dn->dn_sip.s_addr, &dn->dn_sip.s_addr, sizeof (ipaddr_t));
	nhconvert(&dn->dn_lease, &dn->dn_lease, sizeof (lease_t));
	nhconvert(&dn->dn_sig, &dn->dn_sig, sizeof (uint64_t));
#endif
}
/*
 * Flip the multi-byte fields of the header `hdrp' in place between native
 * (host) and network byte order; the same routine serves both directions.
 * The hash chain heads are only converted when `hash' is true.  Without
 * _LITTLE_ENDIAN defined the body is empty.
 */
/* ARGSUSED */
static void
nhconvert_header(dn_header_t *hdrp, boolean_t hash)
{
#ifdef	_LITTLE_ENDIAN
	unsigned int	bucket, img;

	nhconvert(&hdrp->dnh_network, &hdrp->dnh_network, sizeof (ipaddr_t));
	nhconvert(&hdrp->dnh_netmask, &hdrp->dnh_netmask, sizeof (ipaddr_t));
	nhconvert(&hdrp->dnh_magic, &hdrp->dnh_magic, sizeof (uint32_t));
	nhconvert_rec(&hdrp->dnh_temp);

	/* Caller is not interested in the hash chains: we are done. */
	if (!hash)
		return;

	/* Every bucket carries one chain head per image. */
	for (bucket = 0; bucket < DN_CIDHASHSZ; bucket++) {
		for (img = 0; img < 2; img++) {
			nhconvert(&hdrp->dnh_cidhash[bucket][img],
			    &hdrp->dnh_cidhash[bucket][img],
			    sizeof (dn_recid_t));
		}
	}
#endif
}
/*
 * Fetch the dn_filerec_t numbered `recid' from open container `fd' into
 * `rec' and convert it with nhconvert_rec().  Returns 0 on success, -1 on
 * failure (errno is set); on failure `rec' is not converted.
 */
static int
read_rec(int fd, dn_filerec_t *rec, dn_recid_t recid)
{
	if (pnread(fd, rec, sizeof (dn_filerec_t), RECID2OFFSET(recid)) != -1) {
		nhconvert_rec(rec);
		return (0);
	}

	return (-1);
}
/*
 * Store the dn_filerec_t `rec' in slot `recid' of open container `fd'.
 * The record is converted in place for the write and converted back
 * afterwards, whether or not the write succeeded, so the caller's copy
 * ends up as it was handed in.  Returns 0 on success, -1 on failure
 * (errno is set).
 */
static int
write_rec(int fd, dn_filerec_t *rec, dn_recid_t recid)
{
	int	status;

	nhconvert_rec(rec);		/* to on-disk byte order */
	status = pnwrite(fd, rec, sizeof (dn_filerec_t), RECID2OFFSET(recid));
	nhconvert_rec(rec);		/* and back again for the caller */

	return (status);
}
/*
 * Load the container header from open container `fd' into `hdrp' and
 * convert it with nhconvert_header().  With `hash' set the whole
 * dn_header_t is read; otherwise the read stops where the hash chains
 * (dnh_cidhash) begin and that part of `hdrp' is left untouched.  Returns
 * 0 on success, -1 on failure (errno is set).
 */
static int
read_header(int fd, dn_header_t *hdrp, boolean_t hash)
{
	size_t	nbytes;

	if (hash)
		nbytes = sizeof (dn_header_t);
	else
		nbytes = offsetof(dn_header_t, dnh_cidhash);

	if (pnread(fd, hdrp, nbytes, 0) != -1) {
		nhconvert_header(hdrp, hash);
		return (0);
	}

	return (-1);
}
/*
 * Store the complete dn_header_t `hdrp' (hash chains included) at the
 * start of open container `fd'.  The header is converted in place for the
 * write and converted back afterwards, whether or not the write succeeded.
 * Returns 0 on success, -1 on failure (errno is set).
 */
static int
write_header(int fd, dn_header_t *hdrp)
{
	int	status;

	nhconvert_header(hdrp, B_TRUE);		/* to on-disk byte order */
	status = pnwrite(fd, hdrp, sizeof (*hdrp), 0);
	nhconvert_header(hdrp, B_TRUE);		/* and back for the caller */

	return (status);
}
/*
 * Fetch into `recid_headp' the head of hash chain number `cidhash' for
 * image `image' from the header of open container `fd', converting it to
 * host byte order.  Returns 0 on success, -1 on failure (errno is set);
 * on failure no conversion is applied.
 */
static int
read_hashhead(int fd, dn_recid_t *recid_headp, uint16_t cidhash, uchar_t image)
{
	/* File position of the requested chain head within the header. */
	const size_t	where = offsetof(dn_header_t,
	    dnh_cidhash[cidhash][image]);

	if (pnread(fd, recid_headp, sizeof (*recid_headp), where) != -1) {
		nhconvert(recid_headp, recid_headp, sizeof (dn_recid_t));
		return (0);
	}

	return (-1);
}
/*
 * Store `recid_head' as the head of hash chain number `cidhash' for image
 * `image' in the header of open container `fd'.  Only the by-value copy of
 * `recid_head' is byte-swapped, so the caller's value is unaffected.
 * Returns 0 on success, -1 on failure (errno is set).
 */
static int
write_hashhead(int fd, dn_recid_t recid_head, uint16_t cidhash, uchar_t image)
{
	/* File position of the targeted chain head within the header. */
	const size_t	where = offsetof(dn_header_t,
	    dnh_cidhash[cidhash][image]);

	nhconvert(&recid_head, &recid_head, sizeof (dn_recid_t));

	return (pnwrite(fd, &recid_head, sizeof (recid_head), where));
}
/*
 * Read the single byte located `offset' bytes into open file `fd' and
 * store it in `bytep'.  Returns DSVC_SUCCESS when exactly one byte was
 * read, the DSVC_* translation of errno when pread() fails, and
 * DSVC_INTERNAL for any other byte count.
 */
static int
getabyte(int fd, off_t offset, uchar_t *bytep)
{
	ssize_t	nread = pread(fd, bytep, 1, offset);

	if (nread == 1)
		return (DSVC_SUCCESS);
	if (nread == -1)
		return (syserr_to_dsvcerr(errno));

	return (DSVC_INTERNAL);
}
/*
 * Overwrite the single byte located `offset' bytes into open file `fd'
 * with `byte'.  Returns DSVC_SUCCESS when exactly one byte was written,
 * the DSVC_* translation of errno when pwrite() fails, and DSVC_INTERNAL
 * for any other byte count.
 */
static int
setabyte(int fd, off_t offset, uchar_t byte)
{
	ssize_t	nwritten = pwrite(fd, &byte, 1, offset);

	if (nwritten == 1)
		return (DSVC_SUCCESS);
	if (nwritten == -1)
		return (syserr_to_dsvcerr(errno));

	return (DSVC_INTERNAL);
}