arp.c revision e11c3f44f531fdff80941ce57c065d2ae861cefc
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
/* AR - Address Resolution Protocol */
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/strlog.h>
#include <sys/dlpi.h>
#include <sys/sockio.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/socket.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/sdt.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/policy.h>
#include <sys/zone.h>
#include <sys/ethernet.h>
#include <sys/zone.h>
#include <sys/random.h>
#include <sys/sdt.h>
#include <sys/hook_event.h>
#include <inet/common.h>
#include <inet/optcom.h>
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/snmpcom.h>
#include <net/if.h>
#include <inet/arp.h>
#include <netinet/ip6.h>
#include <netinet/arp.h>
#include <inet/ip.h>
#include <inet/ip_ire.h>
#include <inet/ip_ndp.h>
#include <inet/mib2.h>
#include <inet/arp_impl.h>
/*
* ARP entry life time and design notes
* ------------------------------------
*
* ARP entries (ACEs) must last at least as long as IP knows about a given
* MAC-IP translation (i.e., as long as the IRE cache entry exists). It's ok
* if the ARP entry lasts longer, but not ok if it is removed before the IP
* entry. The reason for this is that if ARP doesn't have an entry, we will be
* unable to detect the difference between an ARP broadcast that represents no
* change (same, known address of sender) and one that represents a change (new
* address for existing entry). In the former case, we must not notify IP, or
* we can suffer hurricane attack. In the latter case, we must notify IP, or
* IP will drift out of sync with the network.
*
* Note that IP controls the lifetime of entries, not ARP.
*
* We don't attempt to reconfirm aging entries. If the system is no longer
* talking to a given peer, then it doesn't matter if we have the right mapping
* for that peer. It would be possible to send queries on aging entries that
* are active, but this isn't done.
*
* IPMP Notes
* ----------
*
* ARP is aware of IPMP. In particular, IP notifies ARP about all "active"
* (able to transmit data packets) interfaces in a given group via
* AR_IPMP_ACTIVATE and AR_IPMP_DEACTIVATE messages. These messages, combined
* with the "IPMP arl_t" that ARP creates over the IPMP DLPI stub driver,
* enable ARP to track all the arl_t's that are in the same group and thus
* ensure that ACEs are shared across each group and the arl_t that ARP
* chooses to transmit on for a given ACE is optimal.
*
* ARP relies on IP for hardware address updates. In particular, if the
* hardware address of an interface changes (DL_NOTE_PHYS_ADDR), then IP will
* bring the interface down and back up -- and as part of bringing it back
* up, will send messages to ARP that allow it to update the affected arl's
* with new hardware addresses.
*
* N.B.: One side-effect of this approach is that when an interface fails and
* then starts to repair, it will temporarily populate the ARP cache with
* addresses that are owned by it rather than the group's arl_t. To address
* this, we could add more messages (e.g., AR_IPMP_JOIN and AR_IPMP_LEAVE),
* but as the issue appears to be only cosmetic (redundant entries in the ARP
* cache during interface repair), we've kept things simple for now.
*/
/*
 * This is used when scanning for "old" (least recently broadcast) ACEs. We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.
 *
 * ACE_RESCHED_LIST_LEN is the capacity of one batch (the size of art_aces[]).
 */
#define ACE_RESCHED_LIST_LEN 8
typedef struct {
/* NOTE(review): presumably the arl whose ACEs are being collected -- confirm */
arl_t *art_arl;
/* NOTE(review): presumably the number of valid slots in art_aces -- confirm */
uint_t art_naces;
/* One batch of collected ACEs; holds at most ACE_RESCHED_LIST_LEN entries. */
ace_t *art_aces[ACE_RESCHED_LIST_LEN];
} ace_resched_t;
/* Non-zero when the ACE has ACE_F_RESOLVED set. */
#define ACE_RESOLVED(ace) ((ace)->ace_flags & ACE_F_RESOLVED)
/* True when the ACE is resolved and is NOT marked ACE_F_PERMANENT. */
#define ACE_NONPERM(ace) \
(((ace)->ace_flags & (ACE_F_RESOLVED | ACE_F_PERMANENT)) == \
ACE_F_RESOLVED)
#define AR_DEF_XMIT_INTERVAL 500 /* time in milliseconds */
#define AR_LL_HDR_SLACK 32 /* Leave the lower layer some room */
/* Command code under which SNMP requests are listed in ar_cmd_tbl. */
#define AR_SNMP_MSG T_OPTMGMT_ACK
/*
 * Sentinel compared against mp->b_prev by CMD_NEEDS_QUEUEING(); it marks a
 * command that was internally dequeued and is being re-dispatched.
 */
#define AR_DRAINING (void *)0x11
/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 *
 * Evaluates to non-zero when the first two bytes at `ptr' are 169 and 254
 * (i.e. the address lies in 169.254.0.0/16).  `ptr' is parenthesized so that
 * the cast applies to the whole argument expression; note that it is still
 * evaluated twice, so it must not have side effects.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
	((uchar_t *)(ptr))[1] == 254)
/*
 * Check if the command needs to be enqueued by seeing if there are other
 * commands ahead of us or if some DLPI response is being awaited. Usually
 * there would be an enqueued command in the latter case, however if the
 * stream that originated the command has closed, the close would have
 * cleaned up the enqueued command. AR_DRAINING signifies that the command
 * at the head of the arl_queue has been internally dequeued on completion
 * of the previous command and is being called from ar_dlpi_done.
 *
 * Both arguments are parenthesized so that arbitrary pointer expressions can
 * be passed safely; `arl' is evaluated more than once.
 */
#define	CMD_NEEDS_QUEUEING(mp, arl) \
	((mp)->b_prev != AR_DRAINING && ((arl)->arl_queue != NULL || \
	(arl)->arl_dlpi_pending != DL_PRIM_INVAL))

/* NOTE(review): presumably the size of the fixed part of an ARP header. */
#define	ARH_FIXED_LEN	8
/*
 * Macro used when creating ACEs to determine the arl that should own it:
 * the arl's arl_ipmp_arl when that is set, otherwise the arl itself.
 *
 * Every use of the argument is parenthesized so that any expression may be
 * passed; the argument is evaluated more than once.
 */
#define	OWNING_ARL(arl) \
	((arl)->arl_ipmp_arl != NULL ? (arl)->arl_ipmp_arl : (arl))
/*
 * MAC-specific intelligence. Shouldn't be needed, but the DL_INFO_ACK
 * doesn't quite do it for us.
 *
 * One ar_m_t describes one DLPI MAC type; the instances live in ar_m_tbl.
 */
typedef struct ar_m_s {
t_uscalar_t ar_mac_type; /* DLPI MAC type (DL_ETHER, DL_IB, ...) */
uint32_t ar_mac_arp_hw_type; /* ARP hardware type (ARPHRD_*) */
t_scalar_t ar_mac_sap_length; /* SAP length; negative in ar_m_tbl */
uint32_t ar_mac_hw_addr_length; /* hardware address length in bytes */
} ar_m_t;
/*
 * Pair of message pointers.
 * NOTE(review): presumably the argument bundle handed to ar_snmp_msg2()
 * (head and tail of the data being built) -- confirm against that routine.
 */
typedef struct msg2_args {
mblk_t *m2a_mpdata;
mblk_t *m2a_mptail;
} msg2_args_t;
/*
 * Forward declarations for the file-local routines of this module.
 * Routines whose names start with ar_ce_ operate on ARP cache entries
 * (ace_t); the ar_ll_ routines operate on arl_t link-layer instances.
 */
static mblk_t *ar_alloc(uint32_t cmd, int);
static int ar_ce_create(arl_t *arl, uint32_t proto, uchar_t *hw_addr,
uint32_t hw_addr_len, uchar_t *proto_addr,
uint32_t proto_addr_len, uchar_t *proto_mask,
uchar_t *proto_extract_mask, uint32_t hw_extract_start,
uchar_t *sender_addr, uint32_t flags);
static void ar_ce_delete(ace_t *ace);
static void ar_ce_delete_per_arl(ace_t *ace, void *arg);
static ace_t **ar_ce_hash(arp_stack_t *as, uint32_t proto,
const uchar_t *proto_addr, uint32_t proto_addr_length);
static ace_t *ar_ce_lookup(arl_t *arl, uint32_t proto,
const uchar_t *proto_addr, uint32_t proto_addr_length);
static ace_t *ar_ce_lookup_entry(arl_t *arl, uint32_t proto,
const uchar_t *proto_addr, uint32_t proto_addr_length);
static ace_t *ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp,
ace_t *matchfn());
static ace_t *ar_ce_lookup_mapping(arl_t *arl, uint32_t proto,
const uchar_t *proto_addr, uint32_t proto_addr_length);
static ace_t *ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto,
uchar_t *proto_addr, uint32_t proto_addr_length);
static boolean_t ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr,
uint32_t hw_addr_length);
static void ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *),
void *arg1);
static void ar_client_notify(const arl_t *arl, mblk_t *mp, int code);
static int ar_close(queue_t *q);
static int ar_cmd_dispatch(queue_t *q, mblk_t *mp, boolean_t from_wput);
static void ar_cmd_done(arl_t *arl);
static mblk_t *ar_dlpi_comm(t_uscalar_t prim, size_t size);
static void ar_dlpi_send(arl_t *, mblk_t *);
static void ar_dlpi_done(arl_t *, t_uscalar_t);
static int ar_entry_add(queue_t *q, mblk_t *mp);
static int ar_entry_delete(queue_t *q, mblk_t *mp);
static int ar_entry_query(queue_t *q, mblk_t *mp);
static int ar_entry_squery(queue_t *q, mblk_t *mp);
static int ar_interface_up(queue_t *q, mblk_t *mp);
static int ar_interface_down(queue_t *q, mblk_t *mp);
static int ar_interface_on(queue_t *q, mblk_t *mp);
static int ar_interface_off(queue_t *q, mblk_t *mp);
static int ar_ipmp_activate(queue_t *q, mblk_t *mp);
static int ar_ipmp_deactivate(queue_t *q, mblk_t *mp);
static void ar_ll_cleanup_arl_queue(queue_t *q);
static void ar_ll_down(arl_t *arl);
static arl_t *ar_ll_lookup_by_name(arp_stack_t *as, const char *name);
static arl_t *ar_ll_lookup_from_mp(arp_stack_t *as, mblk_t *mp);
static void ar_ll_init(arp_stack_t *, ar_t *, mblk_t *mp);
static void ar_ll_set_defaults(arl_t *, mblk_t *mp);
static void ar_ll_clear_defaults(arl_t *);
static int ar_ll_up(arl_t *arl);
static int ar_mapping_add(queue_t *q, mblk_t *mp);
static boolean_t ar_mask_all_ones(uchar_t *mask, uint32_t mask_len);
static ar_m_t *ar_m_lookup(t_uscalar_t mac_type);
static int ar_nd_ioctl(queue_t *q, mblk_t *mp);
static int ar_open(queue_t *q, dev_t *devp, int flag, int sflag,
cred_t *credp);
static int ar_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t ar_param_register(IDP *ndp, arpparam_t *arppa, int cnt);
static int ar_param_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static void ar_query_delete(ace_t *ace, void *ar);
static void ar_query_reply(ace_t *ace, int ret_val,
uchar_t *proto_addr, uint32_t proto_addr_len);
static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace);
static void ar_rput(queue_t *q, mblk_t *mp_orig);
static void ar_rput_dlpi(queue_t *q, mblk_t *mp);
static void ar_set_address(ace_t *ace, uchar_t *addrpos,
uchar_t *proto_addr, uint32_t proto_addr_len);
static int ar_slifname(queue_t *q, mblk_t *mp);
static int ar_set_ppa(queue_t *q, mblk_t *mp);
static int ar_snmp_msg(queue_t *q, mblk_t *mp_orig);
static void ar_snmp_msg2(ace_t *, void *);
static void ar_wput(queue_t *q, mblk_t *mp);
static void ar_wsrv(queue_t *q);
static void ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto,
uint32_t plen, const uchar_t *haddr1, const uchar_t *paddr1,
const uchar_t *haddr2, const uchar_t *paddr2, const uchar_t *dstaddr,
arp_stack_t *as);
static void ar_cmd_enqueue(arl_t *arl, mblk_t *mp, queue_t *q,
ushort_t cmd, boolean_t);
static mblk_t *ar_cmd_dequeue(arl_t *arl);
static void *arp_stack_init(netstackid_t stackid, netstack_t *ns);
static void arp_stack_fini(netstackid_t stackid, void *arg);
static void arp_stack_shutdown(netstackid_t stackid, void *arg);
/*
 * Tunable parameters.  All of these are alterable at run time, within the
 * min/max values given.  arp_publish_interval and arp_publish_count default
 * to 2 seconds and 5 respectively; this is useful during FAILOVER/FAILBACK
 * to make sure that the ARP packets are not lost, and is assumed not to
 * affect normal operation.
 *
 * Each row is { minimum, maximum, default value, name }.  The row order is
 * significant: the as_* accessor macros that follow this table index the
 * parameter array by position.
 */
static arpparam_t arp_param_arr[] = {
/* min max value name */
{ 30000, 3600000, 300000, "arp_cleanup_interval"},
{ 1000, 20000, 2000, "arp_publish_interval"},
{ 1, 20, 5, "arp_publish_count"},
{ 0, 20000, 1000, "arp_probe_delay"},
{ 10, 20000, 1500, "arp_probe_interval"},
{ 0, 20, 3, "arp_probe_count"},
{ 0, 20000, 100, "arp_fastprobe_delay"},
{ 10, 20000, 150, "arp_fastprobe_interval"},
{ 0, 20, 3, "arp_fastprobe_count"},
{ 0, 3600000, 300000, "arp_defend_interval"},
{ 0, 20000, 100, "arp_defend_rate"},
{ 0, 3600000, 15000, "arp_broadcast_interval"},
{ 5, 86400, 3600, "arp_defend_period"}
};
/*
 * Shorthand for the current value of each tunable, used as a member name on
 * an arp_stack_t (e.g. as->as_probe_count).  The indices follow the row
 * order of arp_param_arr and must be kept in sync with it.
 * NOTE(review): as_param_arr is presumably the per-stack copy of
 * arp_param_arr -- confirm against arp_stack_init().
 */
#define as_cleanup_interval as_param_arr[0].arp_param_value
#define as_publish_interval as_param_arr[1].arp_param_value
#define as_publish_count as_param_arr[2].arp_param_value
#define as_probe_delay as_param_arr[3].arp_param_value
#define as_probe_interval as_param_arr[4].arp_param_value
#define as_probe_count as_param_arr[5].arp_param_value
#define as_fastprobe_delay as_param_arr[6].arp_param_value
#define as_fastprobe_interval as_param_arr[7].arp_param_value
#define as_fastprobe_count as_param_arr[8].arp_param_value
#define as_defend_interval as_param_arr[9].arp_param_value
#define as_defend_rate as_param_arr[10].arp_param_value
#define as_broadcast_interval as_param_arr[11].arp_param_value
#define as_defend_period as_param_arr[12].arp_param_value
/*
 * STREAMS glue for the "arp" module.
 * NOTE(review): per module_info(9S) the initializer order is presumably
 * id, name, min packet size, max packet size, high water, low water.
 */
static struct module_info arp_mod_info = {
0, "arp", 0, INFPSZ, 512, 128
};
/* Read side: ar_rput is the put procedure; there is no service procedure. */
static struct qinit arprinit = {
(pfi_t)ar_rput, NULL, ar_open, ar_close, NULL, &arp_mod_info
};
/* Write side: ar_wput is the put procedure, ar_wsrv the service procedure. */
static struct qinit arpwinit = {
(pfi_t)ar_wput, (pfi_t)ar_wsrv, ar_open, ar_close, NULL, &arp_mod_info
};
/* Exported stream table tying the read and write qinit structures together. */
struct streamtab arpinfo = {
&arprinit, &arpwinit
};
/*
 * Table of known MAC types.  Each row is
 * { DLPI MAC type, ARP hardware type, SAP length, hardware address length },
 * matching the field order of ar_m_t.
 *
 * TODO: we need a better mechanism to set the ARP hardware type since
 * the DLPI mac type does not include enough predefined values.
 */
static ar_m_t ar_m_tbl[] = {
{ DL_CSMACD, ARPHRD_ETHER, -2, 6}, /* 802.3 */
{ DL_TPB, ARPHRD_IEEE802, -2, 6}, /* 802.4 */
{ DL_TPR, ARPHRD_IEEE802, -2, 6}, /* 802.5 */
{ DL_METRO, ARPHRD_IEEE802, -2, 6}, /* 802.6 */
{ DL_ETHER, ARPHRD_ETHER, -2, 6}, /* Ethernet */
{ DL_FDDI, ARPHRD_ETHER, -2, 6}, /* FDDI */
{ DL_IB, ARPHRD_IB, -2, 20}, /* Infiniband */
{ DL_OTHER, ARPHRD_ETHER, -2, 6}, /* unknown */
};
/*
 * Note that all routines which need to queue the message for later
 * processing have to be ioctl_aware to be able to queue the complete message.
 * Following are command entry flags in arct_flags.
 */
#define ARF_IOCTL_AWARE 0x1 /* Arp command can come down as M_IOCTL */
#define ARF_ONLY_CMD 0x2 /* Command is exclusive to ARP */
#define ARF_WPUT_OK 0x4 /* Command is allowed from ar_wput */
/* ARP Cmd Table entry */
typedef struct arct_s {
int (*arct_pfi)(queue_t *, mblk_t *); /* handler for the command */
uint32_t arct_cmd; /* command code this entry matches */
/* NOTE(review): presumably the minimum acceptable message length */
int arct_min_len;
uint32_t arct_flags; /* bitwise OR of ARF_* values */
int arct_priv_req; /* Privilege required for this cmd */
const char *arct_txt; /* command name as a string */
} arct_t;
/*
 * Command table.  Each row is
 * { handler, command code, minimum length, ARF_* flags, required privilege,
 * name }, matching the field order of arct_t.
 *
 * AR_ENTRY_ADD, QUERY and SQUERY are used by sdp, hence they need to
 * have ARF_WPUT_OK set.
 */
static arct_t ar_cmd_tbl[] = {
{ ar_entry_add, AR_ENTRY_ADD, sizeof (area_t),
ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_CONFIG,
"AR_ENTRY_ADD" },
{ ar_entry_delete, AR_ENTRY_DELETE, sizeof (ared_t),
ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_ENTRY_DELETE" },
{ ar_entry_query, AR_ENTRY_QUERY, sizeof (areq_t),
ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_NP,
"AR_ENTRY_QUERY" },
{ ar_entry_squery, AR_ENTRY_SQUERY, sizeof (area_t),
ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_NP,
"AR_ENTRY_SQUERY" },
{ ar_mapping_add, AR_MAPPING_ADD, sizeof (arma_t),
ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_MAPPING_ADD" },
{ ar_interface_up, AR_INTERFACE_UP, sizeof (arc_t),
ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_UP" },
{ ar_interface_down, AR_INTERFACE_DOWN, sizeof (arc_t),
ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_DOWN" },
{ ar_interface_on, AR_INTERFACE_ON, sizeof (arc_t),
ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_ON" },
{ ar_interface_off, AR_INTERFACE_OFF, sizeof (arc_t),
ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_OFF" },
{ ar_ipmp_activate, AR_IPMP_ACTIVATE, sizeof (arie_t),
ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_ACTIVATE" },
{ ar_ipmp_deactivate, AR_IPMP_DEACTIVATE, sizeof (arie_t),
ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_DEACTIVATE" },
{ ar_set_ppa, (uint32_t)IF_UNITSEL, sizeof (int),
ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "IF_UNITSEL" },
{ ar_nd_ioctl, ND_GET, 1,
ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_NP, "ND_GET" },
{ ar_nd_ioctl, ND_SET, 1,
ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "ND_SET" },
{ ar_snmp_msg, AR_SNMP_MSG, sizeof (struct T_optmgmt_ack),
ARF_IOCTL_AWARE | ARF_WPUT_OK | ARF_ONLY_CMD, OP_NP,
"AR_SNMP_MSG" },
{ ar_slifname, (uint32_t)SIOCSLIFNAME, sizeof (struct lifreq),
ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "SIOCSLIFNAME" }
};
/*
 * Pick the arl to transmit on for the IPMP group whose group arl is `arl'.
 * Candidates are tried in this order:
 *
 *   1. a group member whose hardware address equals `hw_addr';
 *   2. the transmit arl latched on a permanent ACE for `ip_addr';
 *   3. any group member other than `arl' itself.
 *
 * Returns NULL when none of the steps yields an arl.
 */
static arl_t *
ar_ipmp_lookup_xmit_arl(arl_t *arl, uchar_t *hw_addr, uint_t hw_addrlen,
    uchar_t *ip_addr)
{
	arp_stack_t	*as = ARL_TO_ARPSTACK(arl);
	arl_t		*xmit_arl = NULL;
	arlphy_t	*phy;
	ace_t		*perm_ace;

	ASSERT(arl->arl_flags & ARL_F_IPMP);

	/* Step 1: match on the source hardware address, if one was given. */
	if (hw_addr != NULL && hw_addrlen != 0) {
		for (xmit_arl = as->as_arl_head; xmit_arl != NULL;
		    xmit_arl = xmit_arl->arl_next) {
			/*
			 * Arls outside our IPMP group may carry the same
			 * hardware address; they are not candidates.
			 */
			if (xmit_arl->arl_ipmp_arl != arl)
				continue;

			phy = xmit_arl->arl_phy;
			if (phy == NULL || phy->ap_hw_addrlen != hw_addrlen)
				continue;
			if (bcmp(phy->ap_hw_addr, hw_addr, hw_addrlen) == 0)
				break;
		}
		DTRACE_PROBE4(xmit_arl_hwsrc, arl_t *, arl, arl_t *,
		    xmit_arl, uchar_t *, hw_addr, uint_t, hw_addrlen);
	}

	/* Step 2: fall back to the permanent ACE for the source IP address. */
	if (xmit_arl == NULL && ip_addr != NULL) {
		perm_ace = ar_ce_lookup_permanent(as, IP_ARP_PROTO_TYPE,
		    ip_addr, IP_ADDR_LEN);
		if (perm_ace != NULL)
			xmit_arl = perm_ace->ace_xmit_arl;
		DTRACE_PROBE4(xmit_arl_ipsrc, arl_t *, arl, arl_t *,
		    xmit_arl, uchar_t *, ip_addr, uint_t, IP_ADDR_LEN);
	}

	/* Step 3: settle for any member of the group except `arl' itself. */
	if (xmit_arl == NULL) {
		xmit_arl = as->as_arl_head;
		while (xmit_arl != NULL) {
			if (xmit_arl != arl && xmit_arl->arl_ipmp_arl == arl)
				break;
			xmit_arl = xmit_arl->arl_next;
		}
		DTRACE_PROBE2(xmit_arl_any, arl_t *, arl, arl_t *, xmit_arl);
	}

	return (xmit_arl);
}
/*
 * ARP Cache Entry creation routine.
 * Cache entries are allocated within timer messages and inserted into
 * the global hash list based on protocol and protocol address.
 *
 * The timer block holds the ace_t followed, in order, by the protocol
 * address, the protocol mask, the (optional) protocol extract mask and the
 * hardware address.  A NULL `proto_mask' means "all ones".  Entries whose
 * mask is all ones go on the hash chain selected by ar_ce_hash(); all others
 * go on as_ce_mask_entries.  The new entry is linked at the head of its list.
 *
 * Returns 0 on success, EINVAL when the arguments are inconsistent or no
 * usable transmit arl exists, and ENOMEM when the timer block cannot be
 * allocated.
 */
static int
ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len,
    uchar_t *proto_addr, uint_t proto_addr_len, uchar_t *proto_mask,
    uchar_t *proto_extract_mask, uint_t hw_extract_start, uchar_t *sender_addr,
    uint_t flags)
{
	static ace_t	ace_null;
	ace_t		*ace;
	ace_t		**acep;
	uchar_t		*dst;
	mblk_t		*mp;
	arp_stack_t	*as;
	arl_t		*xmit_arl;
	arlphy_t	*ap;
	uint_t		i;

	if ((flags & ~ACE_EXTERNAL_FLAGS_MASK) || arl == NULL)
		return (EINVAL);

	/* Only look at the arl once it is known to be non-NULL. */
	as = ARL_TO_ARPSTACK(arl);

	if (proto_addr == NULL || proto_addr_len == 0 ||
	    (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN))
		return (EINVAL);

	if (flags & ACE_F_MYADDR)
		flags |= ACE_F_PUBLISH | ACE_F_AUTHORITY;

	/*
	 * Latch a transmit arl for this ace.
	 */
	if (arl->arl_flags & ARL_F_IPMP) {
		ASSERT(proto == IP_ARP_PROTO_TYPE);
		xmit_arl = ar_ipmp_lookup_xmit_arl(arl, hw_addr, hw_addr_len,
		    sender_addr);
	} else {
		xmit_arl = arl;
	}

	if (xmit_arl == NULL || xmit_arl->arl_phy == NULL)
		return (EINVAL);

	ap = xmit_arl->arl_phy;

	if (!hw_addr && hw_addr_len == 0) {
		if (flags == ACE_F_PERMANENT) {	/* Not publish */
			/* 224.0.0.0 to zero length address */
			flags |= ACE_F_RESOLVED;
		} else {	/* local address and unresolved case */
			hw_addr = ap->ap_hw_addr;
			hw_addr_len = ap->ap_hw_addrlen;
			if (flags & ACE_F_PUBLISH)
				flags |= ACE_F_RESOLVED;
		}
	} else {
		flags |= ACE_F_RESOLVED;
	}

	/* Handle hw_addr_len == 0 for DL_ENABMULTI_REQ etc. */
	if (hw_addr_len != 0 && hw_addr == NULL)
		return (EINVAL);
	if (hw_addr_len < ap->ap_hw_addrlen && hw_addr_len != 0)
		return (EINVAL);
	if (!proto_extract_mask && (flags & ACE_F_MAPPING))
		return (EINVAL);

	/*
	 * If the underlying link doesn't have reliable up/down notification or
	 * if we're working with the IPv4 169.254.0.0/16 Link Local Address
	 * space, then don't use the fast timers.  Otherwise, use them.
	 */
	if (ap->ap_notifies &&
	    !(proto == IP_ARP_PROTO_TYPE && IS_IPV4_LL_SPACE(proto_addr))) {
		flags |= ACE_F_FAST;
	}

	/*
	 * Allocate the timer block to hold the ace.
	 * (ace + proto_addr + proto_addr_mask + proto_extract_mask + hw_addr)
	 */
	mp = mi_timer_alloc(sizeof (ace_t) + proto_addr_len + proto_addr_len +
	    proto_addr_len + hw_addr_len);
	if (!mp)
		return (ENOMEM);

	ace = (ace_t *)mp->b_rptr;
	*ace = ace_null;
	ace->ace_proto = proto;
	ace->ace_mp = mp;
	ace->ace_arl = arl;
	ace->ace_xmit_arl = xmit_arl;

	/* The variable-length data starts right behind the ace_t. */
	dst = (uchar_t *)&ace[1];

	ace->ace_proto_addr = dst;
	ace->ace_proto_addr_length = proto_addr_len;
	bcopy(proto_addr, dst, proto_addr_len);
	dst += proto_addr_len;

	/*
	 * The proto_mask allows us to add entries which will let us respond
	 * to requests for a group of addresses.  This makes it easy to provide
	 * proxy ARP service for machines that don't understand about the local
	 * subnet structure, if, for example, there are BSD4.2 systems lurking.
	 */
	ace->ace_proto_mask = dst;
	if (proto_mask != NULL) {
		bcopy(proto_mask, dst, proto_addr_len);
		dst += proto_addr_len;
	} else {
		/* No mask supplied: default to all ones (exact match). */
		for (i = 0; i < proto_addr_len; i++)
			*dst++ = (uchar_t)~0;
	}

	if (proto_extract_mask != NULL) {
		ace->ace_proto_extract_mask = dst;
		bcopy(proto_extract_mask, dst, ace->ace_proto_addr_length);
		dst += ace->ace_proto_addr_length;
	} else {
		ace->ace_proto_extract_mask = NULL;
	}
	ace->ace_hw_extract_start = hw_extract_start;

	ace->ace_hw_addr_length = hw_addr_len;
	ace->ace_hw_addr = dst;
	if (hw_addr != NULL) {
		bcopy(hw_addr, dst, hw_addr_len);
		dst += hw_addr_len;
	}

	ace->ace_flags = flags;

	/* Exact-match entries are hashed; masked entries share one list. */
	if (ar_mask_all_ones(ace->ace_proto_mask,
	    ace->ace_proto_addr_length)) {
		acep = ar_ce_hash(as, ace->ace_proto, ace->ace_proto_addr,
		    ace->ace_proto_addr_length);
	} else {
		acep = &as->as_ce_mask_entries;
	}

	/* Link the new entry in at the head of the chosen list. */
	if ((ace->ace_next = *acep) != NULL)
		ace->ace_next->ace_ptpn = &ace->ace_next;
	*acep = ace;
	ace->ace_ptpn = acep;
	return (0);
}
/*
 * Delete a cache entry: unlink it from its list, flag it as dying, fail any
 * pending queries with ENXIO and release its timer block.
 */
static void
ar_ce_delete(ace_t *ace)
{
	ace_t	*succ = ace->ace_next;
	ace_t	**linkp = ace->ace_ptpn;

	/* Splice the entry out of the list it is on. */
	if (succ != NULL)
		succ->ace_ptpn = linkp;
	*linkp = succ;

	/* A timer may be about to fire; let it see the entry is going away. */
	ace->ace_flags |= ACE_F_DYING;

	/* Anyone still waiting on this entry gets an immediate ENXIO. */
	ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);

	/* The timer block is freed now, or when the timer fires. */
	mi_timer_free(ace->ace_mp);
}
/*
 * ar_ce_walk callback used when an arl is going away: an ace that either
 * belongs to `arl' or transmits through it loses its permanent flag and is
 * deleted.  Every other ace is left untouched.
 */
static void
ar_ce_delete_per_arl(ace_t *ace, void *arl)
{
	if (ace->ace_arl != arl && ace->ace_xmit_arl != arl)
		return;

	ace->ace_flags &= ~ACE_F_PERMANENT;
	ar_ce_delete(ace);
}
/*
 * ar_ce_walk callback run while an arl (`arl_arg') is being deactivated in
 * its IPMP group.  An ACE owned by the group that was transmitting through
 * that arl is deleted; everything else is left alone.
 */
static void
ar_ce_ipmp_deactivate(ace_t *ace, void *arl_arg)
{
	arl_t	*leaving = arl_arg;

	ASSERT(!(leaving->arl_flags & ARL_F_IPMP));

	if (ace->ace_arl == leaving) {
		/*
		 * The ACE belongs to the departing arl itself (for instance
		 * an ACE_F_PERMANENT entry for a test address); the group
		 * does not use it, so it stays.
		 */
		ASSERT(ace->ace_xmit_arl == leaving);
		return;
	}

	/* Not transmitting through the departing arl: nothing to do. */
	if (ace->ace_xmit_arl != leaving)
		return;

	ASSERT(ace->ace_arl == leaving->arl_ipmp_arl);

	/*
	 * By now IP should have asked us to move every ACE_F_MYADDR entry to
	 * a different arl.  There are two ways that may not have happened:
	 *
	 * 1. The group is misconfigured with interfaces sharing a hardware
	 *    address, and in.mpathd could not offline the duplicates.
	 *
	 * 2. IP's messages were lost or never built (e.g. under memory
	 *    pressure).
	 *
	 * Case 1 is handled by silently deleting the ACE.  Case 2 cannot be
	 * told apart from a more serious IPMP framework bug, so DEBUG kernels
	 * ASSERT() that it does not occur while production kernels delete the
	 * ACE quietly (which leaves the IP address unreachable).
	 */
	if (ace->ace_flags & ACE_F_MYADDR) {
		arlphy_t	*ap = leaving->arl_phy;
		uint_t		hw_addrlen = ap->ap_hw_addrlen;

		ASSERT(hw_addrlen == ace->ace_hw_addr_length &&
		    bcmp(ap->ap_hw_addr, ace->ace_hw_addr, hw_addrlen) == 0);
	}

	/*
	 * NOTE: the departing arl may have been chosen as ace_xmit_arl for an
	 * ACE_F_PERMANENT ACE created through an SIOCS*ARP ioctl on an IPMP
	 * IP interface.  Deleting it is still fine:
	 * ipmp_illgrp_refresh_arpent() will ask for it to be recreated and a
	 * different arl gets picked at that point.
	 */
	ar_ce_delete(ace);
}
/*
 * Cache entry hash routine.  Folds the protocol number and every byte of the
 * protocol address together with XOR and returns the address of the matching
 * bucket in the stack's hash table.
 */
static ace_t **
ar_ce_hash(arp_stack_t *as, uint32_t proto, const uchar_t *proto_addr,
    uint32_t proto_addr_length)
{
	unsigned int	bucket = proto;
	int		nbytes = proto_addr_length;
	int		i;

	for (i = 0; i < nbytes; i++)
		bucket ^= proto_addr[i];

	return (&as->as_ce_hash_tbl[bucket % ARP_HASH_SIZE]);
}
/*
 * Cache entry lookup.  Try to find an ace matching the parameters passed:
 * exact entries are consulted first (ar_ce_lookup_entry), and only when
 * none matches are the mapping entries searched (ar_ce_lookup_mapping).
 * Returns the matching ace, or NULL when neither search finds one.
 *
 * The definition carries `static' to agree with the forward declaration
 * of this routine earlier in the file.
 */
static ace_t *
ar_ce_lookup(arl_t *arl, uint32_t proto, const uchar_t *proto_addr,
    uint32_t proto_addr_length)
{
	ace_t	*ace;

	ace = ar_ce_lookup_entry(arl, proto, proto_addr, proto_addr_length);
	if (ace == NULL) {
		ace = ar_ce_lookup_mapping(arl, proto, proto_addr,
		    proto_addr_length);
	}
	return (ace);
}
/*
 * Exact-entry lookup (mappings are not consulted).  Walks the hash chain for
 * `proto_addr' and returns the first ace that belongs to `arl' or to its
 * arl_ipmp_arl, has the same protocol and address length, and whose stored
 * address equals `proto_addr' after the ace's mask has been applied.
 * Returns NULL when `proto_addr' is NULL or nothing matches.
 */
static ace_t *
ar_ce_lookup_entry(arl_t *arl, uint32_t proto, const uchar_t *proto_addr,
    uint32_t proto_addr_length)
{
	arp_stack_t	*as = ARL_TO_ARPSTACK(arl);
	ace_t		*cand;
	int		idx;

	if (proto_addr == NULL)
		return (NULL);

	for (cand = *ar_ce_hash(as, proto, proto_addr, proto_addr_length);
	    cand != NULL; cand = cand->ace_next) {
		const uchar_t	*stored;
		const uchar_t	*mask;

		if (cand->ace_arl != arl && cand->ace_arl != arl->arl_ipmp_arl)
			continue;
		if (cand->ace_proto_addr_length != proto_addr_length ||
		    cand->ace_proto != proto)
			continue;

		/*
		 * The ace's mask is applied to the caller's address before
		 * it is compared with the stored one, last byte first.
		 */
		stored = cand->ace_proto_addr;
		mask = cand->ace_proto_mask;
		idx = proto_addr_length;
		while (--idx >= 0) {
			if ((proto_addr[idx] & mask[idx]) != stored[idx])
				break;
		}
		if (idx < 0)
			return (cand);
	}
	return (NULL);
}
/*
 * Extract cache entry lookup parameters from an external command message, then
 * call the supplied match function.
 *
 * `mp' is interpreted as an area_t; the protocol address is located through
 * the offset/length stored in it, and the arl is looked up from the same
 * message.  Returns NULL when the address cannot be extracted or when no arl
 * is found for the message; otherwise returns whatever `matchfn' returns.
 */
static ace_t *
ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp, ace_t *matchfn())
{
	area_t	*area = (area_t *)mp->b_rptr;
	uchar_t	*proto_addr;
	arl_t	*arl;

	proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset,
	    area->area_proto_addr_length);
	if (proto_addr == NULL)
		return (NULL);

	/*
	 * The match functions in this file dereference the arl they are
	 * given, so a failed arl lookup is reported as "no matching entry"
	 * rather than being passed on.
	 */
	arl = ar_ll_lookup_from_mp(as, mp);
	if (arl == NULL)
		return (NULL);

	return ((*matchfn)(arl, area->area_proto, proto_addr,
	    area->area_proto_addr_length));
}
/*
 * Mapping lookup.  Searches only the stack's list of masked entries and
 * returns the first ace that belongs to `arl', has the same protocol and
 * address length, and whose stored address equals `proto_addr' after the
 * ace's mask has been applied.  Returns NULL when `proto_addr' is NULL or
 * nothing matches.
 */
static ace_t *
ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, const uchar_t *proto_addr,
    uint32_t proto_addr_length)
{
	arp_stack_t	*as = ARL_TO_ARPSTACK(arl);
	ace_t		*cand;
	int		idx;

	if (proto_addr == NULL)
		return (NULL);

	for (cand = as->as_ce_mask_entries; cand != NULL;
	    cand = cand->ace_next) {
		const uchar_t	*stored;
		const uchar_t	*mask;

		if (cand->ace_arl != arl ||
		    cand->ace_proto_addr_length != proto_addr_length ||
		    cand->ace_proto != proto)
			continue;

		/*
		 * The ace's mask is applied to the caller's address before
		 * it is compared with the stored one, last byte first.
		 */
		stored = cand->ace_proto_addr;
		mask = cand->ace_proto_mask;
		idx = proto_addr_length;
		while (--idx >= 0) {
			if ((proto_addr[idx] & mask[idx]) != stored[idx])
				break;
		}
		if (idx < 0)
			return (cand);
	}
	return (NULL);
}
/*
 * Look for a permanent entry for proto_addr across all interfaces.  Only
 * the hash chain for `proto_addr' is searched, and the owning arl is not
 * considered.  Returns the first ACE_F_PERMANENT ace with matching protocol,
 * address length and (masked) address, or NULL when there is none.
 */
static ace_t *
ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr,
    uint32_t proto_addr_length)
{
	ace_t	*cand;
	int	idx;

	for (cand = *ar_ce_hash(as, proto, proto_addr, proto_addr_length);
	    cand != NULL; cand = cand->ace_next) {
		const uchar_t	*stored;
		const uchar_t	*mask;

		if (!(cand->ace_flags & ACE_F_PERMANENT))
			continue;
		if (cand->ace_proto_addr_length != proto_addr_length ||
		    cand->ace_proto != proto)
			continue;

		/*
		 * The ace's mask is applied to the caller's address before
		 * it is compared with the stored one, last byte first.
		 */
		stored = cand->ace_proto_addr;
		mask = cand->ace_proto_mask;
		idx = proto_addr_length;
		while (--idx >= 0) {
			if ((proto_addr[idx] & mask[idx]) != stored[idx])
				break;
		}
		if (idx < 0)
			return (cand);
	}
	return (NULL);
}
/*
 * ar_ce_resolve is called when a response comes in to an outstanding request.
 * When `hw_addr_length' differs from the length recorded in the ace, nothing
 * is touched and B_FALSE is returned.  Otherwise the hardware address is
 * stored if it differs, the ace is marked resolved and pending queries are
 * answered.  Returns B_TRUE only when the stored hardware address changed,
 * i.e. when the client needs to be told.
 */
static boolean_t
ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length)
{
	boolean_t	addr_differs;

	if (hw_addr_length != ace->ace_hw_addr_length)
		return (B_FALSE);

	ASSERT(ace->ace_hw_addr != NULL);
	addr_differs = bcmp(hw_addr, ace->ace_hw_addr, hw_addr_length) != 0;
	if (addr_differs)
		bcopy(hw_addr, ace->ace_hw_addr, hw_addr_length);

	ace->ace_flags |= ACE_F_RESOLVED;

	/* ar_query_reply is only worth calling when a query is waiting. */
	if (ace->ace_query_mp != NULL)
		ar_query_reply(ace, 0, NULL, (uint32_t)0);

	return (addr_differs ? B_TRUE : B_FALSE);
}
/*
* There are 2 functions performed by this function.
* 1. Resolution of unresolved entries and update of resolved entries.
* 2. Detection of nodes with our own IP address (duplicates).
*
* If the resolving ARL is in the same group as a matching ACE's ARL, then
* update the ACE. Otherwise, make no updates.
*
* For all entries, we first check to see if this is a duplicate (probable
* loopback) message. If so, then just ignore it.
*
* Next, check to see if the entry has completed DAD. If not, then we've
* failed, because someone is already using the address. Notify IP of the DAD
* failure and remove the broken ace.
*
* Next, we check if we're the authority for this address. If so, then it's
* time to defend it, because the other node is a duplicate. Report it as a
* 'bogon' and let IP decide how to defend.
*
* Finally, if it's unresolved or if the arls match, we just update the MAC
* address. This allows a published 'static' entry to be updated by an ARP
* request from the node for which we're a proxy ARP server.
*
* Note that this logic does not update published ARP entries for mismatched
* arls, as for example when we proxy arp across 2 subnets with differing
* subnet masks.
*
* Return Values below
*/
#define AR_NOTFOUND 1 /* No matching ace found in cache */
#define AR_MERGED 2 /* Matching ace updated (RFC 826 Merge_flag) */
#define AR_LOOPBACK 3 /* Our own arp packet was received */
#define AR_BOGON 4 /* Another host has our IP addr. */
#define AR_FAILED 5 /* Duplicate Address Detection has failed */
#define AR_CHANGED 6 /* Address has changed; tell IP (and merged) */
/*
 * Walk the hash chain for `src_paddr' and process every ace whose protocol,
 * address length and masked address match.  `arl' is the arl the packet is
 * being resolved on; `src_haddr'/`hlen' is the sender's hardware address.
 *
 * Returns one of the AR_* codes above.  AR_LOOPBACK, AR_FAILED and AR_BOGON
 * end the walk immediately.  *ace_arlp is set to the ace_arl of the most
 * recently matched ace, and is reset to NULL when the result is AR_NOTFOUND
 * (which includes the case where entries matched but none was on the same
 * arl or IPMP group as `arl').
 */
static int
ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr,
uint32_t hlen, const uchar_t *src_paddr, uint32_t plen, arl_t **ace_arlp)
{
ace_t *ace;
ace_t *ace_next;
int i1;
const uchar_t *paddr;
uchar_t *ace_addr;
uchar_t *mask;
int retv = AR_NOTFOUND;
arp_stack_t *as = ARL_TO_ARPSTACK(arl);
ace = *ar_ce_hash(as, proto, src_paddr, plen);
for (; ace != NULL; ace = ace_next) {
/* ar_ce_resolve may delete the ace; fetch next pointer now */
ace_next = ace->ace_next;
if (ace->ace_proto_addr_length != plen ||
ace->ace_proto != proto) {
continue;
}
/*
 * Note that the ace_proto_mask is applied to the proto_addr
 * before comparing to the ace_addr.
 */
paddr = src_paddr;
i1 = plen;
ace_addr = ace->ace_proto_addr;
mask = ace->ace_proto_mask;
while (--i1 >= 0) {
if ((*paddr++ & *mask++) != *ace_addr++)
break;
}
/* i1 is still non-negative only if a byte failed to match */
if (i1 >= 0)
continue;
*ace_arlp = ace->ace_arl;
/*
 * If the IP address is ours, and the hardware address matches
 * one of our own arls, then this is a broadcast packet
 * emitted by one of our interfaces, reflected by the switch
 * and received on another interface. We return AR_LOOPBACK.
 */
if (ace->ace_flags & ACE_F_MYADDR) {
arl_t *hw_arl = as->as_arl_head;
arlphy_t *ap;
/* every arl on this stack is checked, not only arl's group */
for (; hw_arl != NULL; hw_arl = hw_arl->arl_next) {
ap = hw_arl->arl_phy;
if (ap != NULL && ap->ap_hw_addrlen == hlen &&
bcmp(ap->ap_hw_addr, src_haddr, hlen) == 0)
return (AR_LOOPBACK);
}
}
/*
 * If the entry is unverified, then we've just verified that
 * someone else already owns this address, because this is a
 * message with the same protocol address but different
 * hardware address. NOTE: the ace_xmit_arl check ensures we
 * don't send duplicate AR_FAILEDs if arl is in an IPMP group.
 */
if ((ace->ace_flags & ACE_F_UNVERIFIED) &&
arl == ace->ace_xmit_arl) {
/* the ace is gone after this call; *ace_arlp was saved above */
ar_ce_delete(ace);
return (AR_FAILED);
}
/*
 * If the IP address matches ours and we're authoritative for
 * this entry, then some other node is using our IP addr, so
 * return AR_BOGON. Also reset the transmit count to zero so
 * that, if we're currently in initial announcement mode, we
 * switch back to the lazier defense mode. Knowing that
 * there's at least one duplicate out there, we ought not
 * blindly announce. NOTE: the ace_xmit_arl check ensures we
 * don't send duplicate AR_BOGONs if arl is in an IPMP group.
 */
if ((ace->ace_flags & ACE_F_AUTHORITY) &&
arl == ace->ace_xmit_arl) {
ace->ace_xmit_count = 0;
return (AR_BOGON);
}
/*
 * Only update this ACE if it's on the same network -- i.e.,
 * it's for our ARL or another ARL in the same IPMP group.
 */
if (ace->ace_arl == arl || ace->ace_arl == arl->arl_ipmp_arl) {
/* AR_CHANGED takes precedence over an earlier AR_MERGED */
if (ar_ce_resolve(ace, src_haddr, hlen))
retv = AR_CHANGED;
else if (retv == AR_NOTFOUND)
retv = AR_MERGED;
}
}
if (retv == AR_NOTFOUND)
*ace_arlp = NULL;
return (retv);
}
/*
 * Call `pfi' once for every ace in existence, passing `arg1' along: first
 * all hash buckets in index order, then the list of masked entries.  The
 * successor is read before each callback so that the callback is free to
 * delete the ace it is handed.
 */
static void
ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *), void *arg1)
{
	ace_t	*cur;
	ace_t	*succ;
	int	bucket;

	for (bucket = 0; bucket < ARP_HASH_SIZE; bucket++) {
		cur = as->as_ce_hash_tbl[bucket];
		while (cur != NULL) {
			succ = cur->ace_next;
			(*pfi)(cur, arg1);
			cur = succ;
		}
	}

	cur = as->as_ce_mask_entries;
	while (cur != NULL) {
		succ = cur->ace_next;
		(*pfi)(cur, arg1);
		cur = succ;
	}
}
/*
 * Pass a copy of an interesting packet to the IP instance associated with
 * this ARL (the ARP-IP-DEV instance for this DEV).
 *
 * An M_CTL block carrying an arcn_t followed by the NUL-terminated ARL name
 * is placed in front of `mp' and the chain is sent with putnext().  `mp' is
 * consumed on every path: it is freed when the association has gone away or
 * when the header block cannot be allocated.
 */
static void
ar_client_notify(const arl_t *arl, mblk_t *mp, int code)
{
	ar_t	*ar_ip = ((ar_t *)arl->arl_rq->q_ptr)->ar_arl_ip_assoc;
	int	namelen = strlen(arl->arl_name) + 1;	/* includes the NUL */
	arcn_t	*arcn;
	mblk_t	*hdr_mp;

	/* The association can vanish underneath us; nothing to notify then. */
	if (ar_ip == NULL) {
		freemsg(mp);
		return;
	}

	/* ar_ip is the ARP-IP instance that corresponds to this ARL. */
	ASSERT(ar_ip->ar_arl == NULL && ar_ip->ar_wq->q_next != NULL);

	hdr_mp = allocb(sizeof (arcn_t) + namelen, BPRI_MED);
	if (hdr_mp == NULL) {
		freemsg(mp);
		return;
	}

	DB_TYPE(hdr_mp) = M_CTL;
	hdr_mp->b_cont = mp;

	/* Fixed header first, with the interface name right behind it. */
	arcn = (arcn_t *)hdr_mp->b_rptr;
	arcn->arcn_cmd = AR_CLIENT_NOTIFY;
	arcn->arcn_code = code;
	arcn->arcn_name_offset = sizeof (arcn_t);
	arcn->arcn_name_length = namelen;
	bcopy(arl->arl_name, arcn + 1, namelen);
	hdr_mp->b_wptr = (uchar_t *)(arcn + 1) + namelen;

	putnext(ar_ip->ar_wq, hdr_mp);
}
/*
* Send a delete-notify message down to IP. We've determined that IP doesn't
* have a cache entry for the IP address itself, but it may have other cache
* entries with the same hardware address, and we don't want to see those grow
* stale. (The alternative is sending down updates for every ARP message we
* get that doesn't match an existing ace. That's much more expensive than an
* occasional delete and reload.)
*/
static void
ar_delete_notify(const ace_t *ace)
{
const arl_t *arl = ace->ace_arl;
const arlphy_t *ap = ace->ace_xmit_arl->arl_phy;
mblk_t *mp;
size_t len;
arh_t *arh;
len = sizeof (*arh) + 2 * ace->ace_proto_addr_length;
mp = allocb(len, BPRI_MED);
if (mp == NULL)
return;
arh = (arh_t *)mp->b_rptr;
mp->b_wptr = (uchar_t *)arh + len;
U16_TO_BE16(ap->ap_arp_hw_type, arh->arh_hardware);
U16_TO_BE16(ace->ace_proto, arh->arh_proto);
arh->arh_hlen = 0;
arh->arh_plen = ace->ace_proto_addr_length;
U16_TO_BE16(ARP_RESPONSE, arh->arh_operation);
bcopy(ace->ace_proto_addr, arh + 1, ace->ace_proto_addr_length);
bcopy(ace->ace_proto_addr, (uchar_t *)(arh + 1) +
ace->ace_proto_addr_length, ace->ace_proto_addr_length);
ar_client_notify(arl, mp, AR_CN_ANNOUNCE);
}
/*
* ARP module close routine.
*
* Two kinds of stream are handled, told apart by ar->ar_arl:
*
* - ar_arl == NULL: no arl hangs off this instance. If ar_on_ill_stream is
* set, an AR_ARP_CLOSING M_CTL is sent downstream and we wait for
* ar_ip_acked_close (or a signal). All queries pending for this instance
* and any of its commands still sitting on an arl_queue are then discarded.
*
* - ar_arl != NULL: this instance owns an arl. The arl is marked closing,
* its command queue is allowed to drain, the link is brought down, IPMP
* members are forced out of the group if this is an IPMP arl, all aces for
* the arl are deleted, and the arl is unlinked from the stack's list and
* freed.
*
* In both cases the ARL<->IP association is then broken, the instance data
* is released via mi_close_comm(), and, when an arl was torn down, an
* NE_UNPLUMB hook event is raised for it. Always returns 0.
*/
static int
ar_close(queue_t *q)
{
ar_t *ar = (ar_t *)q->q_ptr;
char name[LIFNAMSIZ];
arl_t *arl, *xarl;
arl_t **arlp;
cred_t *cr;
arc_t *arc;
mblk_t *mp1;
int index;
arp_stack_t *as = ar->ar_as;
TRACE_1(TR_FAC_ARP, TR_ARP_CLOSE,
"arp_close: q %p", q);
arl = ar->ar_arl;
if (arl == NULL) {
/* index == 0 suppresses the NE_UNPLUMB event below; `name' stays unset. */
index = 0;
/*
* If this is the <ARP-IP-Driver> stream send down
* a closing message to IP and wait for IP to send
* an ack. This helps to make sure that messages
* that are currently being sent up by IP are not lost.
*/
if (ar->ar_on_ill_stream) {
mp1 = allocb(sizeof (arc_t), BPRI_MED);
/* On allocation failure the handshake is skipped entirely. */
if (mp1 != NULL) {
DB_TYPE(mp1) = M_CTL;
arc = (arc_t *)mp1->b_rptr;
mp1->b_wptr = mp1->b_rptr + sizeof (arc_t);
arc->arc_cmd = AR_ARP_CLOSING;
putnext(WR(q), mp1);
while (!ar->ar_ip_acked_close)
/* If we are interrupted break out */
if (qwait_sig(q) == 0)
break;
}
}
/* Delete all our pending queries, 'arl' is not dereferenced */
ar_ce_walk(as, ar_query_delete, ar);
/*
* The request could be pending on some arl_queue also. This
* happens if the arl is not yet bound, and bind is pending.
*/
ar_ll_cleanup_arl_queue(q);
} else {
/* Save index and name now; the arl is freed before the hook runs. */
index = arl->arl_index;
(void) strcpy(name, arl->arl_name);
arl->arl_closing = 1;
/* Wait until every queued command for this arl has been processed. */
while (arl->arl_queue != NULL)
qwait(arl->arl_rq);
if (arl->arl_state == ARL_S_UP)
ar_ll_down(arl);
/* Wait for the arl to reach the down state. */
while (arl->arl_state != ARL_S_DOWN)
qwait(arl->arl_rq);
if (arl->arl_flags & ARL_F_IPMP) {
/*
* Though rude, someone could force the IPMP arl
* closed without removing the underlying interfaces.
* In that case, force the ARLs out of the group.
*/
xarl = as->as_arl_head;
for (; xarl != NULL; xarl = xarl->arl_next) {
if (xarl->arl_ipmp_arl != arl || xarl == arl)
continue;
ar_ce_walk(as, ar_ce_ipmp_deactivate, xarl);
xarl->arl_ipmp_arl = NULL;
}
}
ar_ll_clear_defaults(arl);
/*
* If this is the control stream for an arl, delete anything
* hanging off our arl.
*/
ar_ce_walk(as, ar_ce_delete_per_arl, arl);
/*
* NOTE(review): a comment here used to read "Free any messages
* waiting for a bind_ack", but no code at this point does so.
*/
/* Get the arl out of the chain. */
rw_enter(&as->as_arl_lock, RW_WRITER);
for (arlp = &as->as_arl_head; *arlp;
arlp = &(*arlp)->arl_next) {
if (*arlp == arl) {
*arlp = arl->arl_next;
break;
}
}
ASSERT(arl->arl_dlpi_deferred == NULL);
ar->ar_arl = NULL;
rw_exit(&as->as_arl_lock);
mi_free((char *)arl);
}
/* Let's break the association between an ARL and IP instance */
if (ar->ar_arl_ip_assoc != NULL) {
ASSERT(ar->ar_arl_ip_assoc->ar_arl_ip_assoc != NULL &&
ar->ar_arl_ip_assoc->ar_arl_ip_assoc == ar);
ar->ar_arl_ip_assoc->ar_arl_ip_assoc = NULL;
ar->ar_arl_ip_assoc = NULL;
}
/* Grab the cred first: `ar' is gone once mi_close_comm() returns. */
cr = ar->ar_credp;
/* mi_close_comm frees the instance data. */
(void) mi_close_comm(&as->as_head, q);
qprocsoff(q);
crfree(cr);
/* Only a stream that owned an arl announces the unplumb. */
if (index != 0) {
hook_nic_event_t info;
info.hne_nic = index;
info.hne_lif = 0;
info.hne_event = NE_UNPLUMB;
info.hne_data = name;
info.hne_datalen = strlen(name);
(void) hook_run(as->as_net_data->netd_hooks,
as->as_arpnicevents, (hook_data_t)&info);
}
netstack_rele(as->as_netstack);
return (0);
}
/*
 * Dispatch routine for ARP commands.  This routine can be called out of
 * either ar_wput or ar_rput, in response to IOCTLs or M_PROTO messages.
 *
 * The command code comes from the iocblk for M_IOCTL messages and from the
 * first 32-bit word of the data block otherwise.  It is looked up in
 * ar_cmd_tbl and, after length, privilege and direction checks, the table's
 * handler is called and its result returned.
 *
 * Returns ENOENT when the message is missing, too short, misaligned or
 * names a command that is not in the table (the caller is expected to pass
 * such messages along), EINVAL when the command is recognized but not
 * acceptable, the secpolicy_ip() error when the privilege check fails, and
 * otherwise whatever the handler returns.
 */
/* TODO: error reporting for M_PROTO case */
static int
ar_cmd_dispatch(queue_t *q, mblk_t *mp_orig, boolean_t from_wput)
{
	arct_t		*entry;
	uint32_t	cmd;
	ssize_t		datalen;
	mblk_t		*mp = mp_orig;
	cred_t		*cr = NULL;
	struct iocblk	*ioc;
	int		error;

	if (mp == NULL)
		return (ENOENT);

	/* We get both M_PROTO and M_IOCTL messages, so watch out! */
	if (DB_TYPE(mp) == M_IOCTL) {
		ioc = (struct iocblk *)mp->b_rptr;
		cmd = ioc->ioc_cmd;
		cr = ioc->ioc_cr;
		mp = mp->b_cont;
		if (mp == NULL)
			return (ENOENT);
	}

	datalen = MBLKL(mp);
	if (datalen < sizeof (uint32_t) || !OK_32PTR(mp->b_rptr))
		return (ENOENT);

	/* Not an ioctl: the command is the first word of the message. */
	if (mp_orig == mp)
		cmd = *(uint32_t *)mp->b_rptr;

	/* Linear search of the command table. */
	for (entry = ar_cmd_tbl; entry < A_END(ar_cmd_tbl); entry++) {
		if (entry->arct_cmd == cmd)
			break;
	}
	if (entry >= A_END(ar_cmd_tbl))
		return (ENOENT);

	if (datalen < entry->arct_min_len) {
		/*
		 * A command that is exclusive to ARP is simply malformed
		 * (EINVAL); anything else has to be passed downstream, which
		 * the caller does on ENOENT.
		 */
		if (entry->arct_flags & ARF_ONLY_CMD)
			return (EINVAL);
		return (ENOENT);
	}

	if (entry->arct_priv_req != OP_NP) {
		/* No ioctl cred: fall back to the message's or stream's. */
		if (cr == NULL)
			cr = DB_CREDDEF(mp_orig, ((ar_t *)q->q_ptr)->ar_credp);
		error = secpolicy_ip(cr, entry->arct_priv_req, B_FALSE);
		if (error != 0)
			return (error);
	}

	/* Disallow many commands except if from rput i.e. from IP */
	if (from_wput && !(entry->arct_flags & ARF_WPUT_OK))
		return (EINVAL);

	/* Handlers that understand ioctls get the whole original message. */
	if (entry->arct_flags & ARF_IOCTL_AWARE)
		mp = mp_orig;

	DTRACE_PROBE3(cmd_dispatch, queue_t *, q, mblk_t *, mp,
	    arct_t *, entry);
	return ((*entry->arct_pfi)(q, mp));
}
/*
 * Allocate a DLPI message of `size' bytes for primitive `prim' and perform
 * the initialization common to all of them: the payload is zeroed, the
 * write pointer is advanced over it and dl_primitive is filled in.
 * Returns NULL if the allocation fails.
 */
static mblk_t *
ar_dlpi_comm(t_uscalar_t prim, size_t size)
{
	mblk_t			*mp;
	union DL_primitives	*dlp;

	mp = allocb(size, BPRI_HI);
	if (mp == NULL)
		return (NULL);

	/*
	 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter
	 * of which we don't seem to use) are sent with M_PCPROTO, and
	 * that other DLPI are M_PROTO.
	 */
	if (prim == DL_INFO_REQ)
		DB_TYPE(mp) = M_PCPROTO;
	else
		DB_TYPE(mp) = M_PROTO;

	bzero(mp->b_rptr, size);
	mp->b_wptr = mp->b_rptr + size;

	dlp = (union DL_primitives *)mp->b_rptr;
	dlp->dl_primitive = prim;
	return (mp);
}
/*
 * ar_dlpi_send() and ar_dlpi_done() serialize DLPI messages to the driver,
 * much along the lines of ill_dlpi_send and ill_dlpi_done in IP: a DLPI
 * message sent downstream must be acked before the next one goes out.
 *
 * If no primitive is outstanding on `arl', `mp' is sent right away and its
 * primitive is recorded in arl_dlpi_pending.  Otherwise `mp' is appended to
 * the tail of the arl_dlpi_deferred list (threaded through b_next).
 */
static void
ar_dlpi_send(arl_t *arl, mblk_t *mp)
{
	mblk_t	*tail;

	ASSERT(arl != NULL);
	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

	if (arl->arl_dlpi_pending == DL_PRIM_INVAL) {
		/* Nothing outstanding: this message can go now. */
		arl->arl_dlpi_pending =
		    ((union DL_primitives *)mp->b_rptr)->dl_primitive;
		DTRACE_PROBE2(dlpi_send, arl_t *, arl, mblk_t *, mp);
		putnext(arl->arl_wq, mp);
		return;
	}

	/* Must queue message.  Tail insertion */
	if (arl->arl_dlpi_deferred == NULL) {
		arl->arl_dlpi_deferred = mp;
	} else {
		tail = arl->arl_dlpi_deferred;
		while (tail->b_next != NULL)
			tail = tail->b_next;
		tail->b_next = mp;
	}
	DTRACE_PROBE2(dlpi_defer, arl_t *, arl, mblk_t *, mp);
}
/*
 * Called when a DLPI control message has been acked; send down the next
 * queued message (if any).
 * The DLPI messages of interest are bind, attach, unbind and detach, since
 * these are the only ones sent by ARP via ar_dlpi_send.
 *
 * An ack for a primitive other than the pending one is ignored.  When the
 * deferred list is empty the pending marker is reset and ar_cmd_done() is
 * called for the arl.
 */
static void
ar_dlpi_done(arl_t *arl, t_uscalar_t prim)
{
	mblk_t	*next_mp;

	if (arl->arl_dlpi_pending != prim) {
		DTRACE_PROBE2(dlpi_done_unexpected, arl_t *, arl,
		    t_uscalar_t, prim);
		return;
	}

	next_mp = arl->arl_dlpi_deferred;
	if (next_mp != NULL) {
		/* Pop the head of the deferred list and send it down. */
		arl->arl_dlpi_deferred = next_mp->b_next;
		next_mp->b_next = NULL;
		ASSERT(DB_TYPE(next_mp) == M_PROTO ||
		    DB_TYPE(next_mp) == M_PCPROTO);
		arl->arl_dlpi_pending =
		    ((union DL_primitives *)next_mp->b_rptr)->dl_primitive;
		DTRACE_PROBE2(dlpi_done_next, arl_t *, arl, mblk_t *, next_mp);
		putnext(arl->arl_wq, next_mp);
		return;
	}

	/* Nothing left to send: the DLPI exchange is finished. */
	DTRACE_PROBE2(dlpi_done_idle, arl_t *, arl, t_uscalar_t, prim);
	arl->arl_dlpi_pending = DL_PRIM_INVAL;
	ar_cmd_done(arl);
}
/*
 * Complete the command at the head of arl_queue, if it is marked
 * CMD_IN_PROGRESS, and then drain the commands that were queued behind it.
 *
 * For a finished AR_INTERFACE_UP an AR_DLPIOP_DONE reply is prepared
 * (error 0 if the arl is up, EINVAL otherwise) and, on success, the ARL and
 * IP instances are associated with each other.  The reply is sent only
 * after the drain loop has finished.
 */
static void
ar_cmd_done(arl_t *arl)
{
	mblk_t	*mp;
	mblk_t	*dequeued;
	mblk_t	*done_mp = NULL;
	queue_t	*done_q;
	queue_t	*orig_q;
	ar_t	*ar_arl;
	ar_t	*ar_ip;
	int	cmd;
	int	err;

	ASSERT(arl->arl_state == ARL_S_UP || arl->arl_state == ARL_S_DOWN);

	/*
	 * If the current operation was initiated by IP there must be
	 * an op enqueued in arl_queue.  But if ar_close has sent down
	 * a detach/unbind, there is no command enqueued.  Also if the IP-ARP
	 * stream has closed the cleanup would be done and there won't be
	 * any mp.
	 */
	mp = arl->arl_queue;
	if (mp == NULL)
		return;

	cmd = (uintptr_t)mp->b_prev;
	if (cmd & CMD_IN_PROGRESS) {
		dequeued = ar_cmd_dequeue(arl);
		ASSERT(mp == dequeued);
		cmd &= ~CMD_IN_PROGRESS;

		if (cmd == AR_INTERFACE_UP) {
			/* There is an ioctl waiting for us... */
			err = (arl->arl_state == ARL_S_UP) ? 0 : EINVAL;

			/*
			 * Better performance if we send the response
			 * after the potential MAPPING_ADDs command
			 * that are likely to follow, so only remember the
			 * queue here and putnext after the drain loop.
			 */
			done_mp = ar_alloc(AR_DLPIOP_DONE, err);
			if (done_mp != NULL)
				done_q = WR(mp->b_queue);

			if (err == 0) {
				/*
				 * Now that we have the ARL instance
				 * corresponding to the IP instance let's make
				 * the association here.
				 */
				ar_ip = (ar_t *)mp->b_queue->q_ptr;
				ar_arl = (ar_t *)arl->arl_rq->q_ptr;
				ar_arl->ar_arl_ip_assoc = ar_ip;
				ar_ip->ar_arl_ip_assoc = ar_arl;
			}
		}
		inet_freemsg(mp);
	}

	/*
	 * Run the commands that have been enqueued while we were waiting
	 * for the last command (AR_INTERFACE_UP or AR_INTERFACE_DOWN)
	 * to complete.
	 */
	for (;;) {
		mp = ar_cmd_dequeue(arl);
		if (mp == NULL)
			break;

		mp->b_prev = AR_DRAINING;
		orig_q = mp->b_queue;
		mp->b_queue = NULL;

		/*
		 * Don't call put(q, mp) since it can lead to reorder of
		 * messages by sending the current messages to the end of
		 * arp's syncq
		 */
		if (orig_q->q_flag & QREADR)
			ar_rput(orig_q, mp);
		else
			ar_wput(orig_q, mp);

		mp = arl->arl_queue;
		if (mp == NULL)
			break;		/* no work to do */

		/*
		 * If the new head is an AR_INTERFACE_UP or AR_INTERFACE_DOWN
		 * that is waiting for a DLPI ack from the driver, we can't
		 * make progress now.
		 */
		cmd = (uintptr_t)mp->b_prev;
		if (cmd & CMD_IN_PROGRESS)
			break;
	}

	if (done_mp != NULL) {
		DTRACE_PROBE3(cmd_done_next, arl_t *, arl,
		    queue_t *, done_q, mblk_t *, done_mp);
		putnext(done_q, done_mp);
	}
}
/*
 * Queue all arp commands coming from clients.  Typically these commands
 * come from IP, but could also come from other clients.  The commands
 * are serviced in FIFO order.  Some commands need to wait and restart
 * after the DLPI response from the driver is received, typically
 * AR_INTERFACE_UP and AR_INTERFACE_DOWN.  ar_dlpi_done restarts
 * the command and then dequeues the queue at arl_queue and calls ar_rput
 * or ar_wput for each enqueued command.  AR_DRAINING is used to signify
 * that the command is being executed thru a drain from ar_dlpi_done.
 * Functions handling the individual commands such as ar_entry_add
 * check for this flag in b_prev to determine whether the command has
 * to be enqueued for later processing or must be processed now.
 *
 * b_next	used to thread the enqueued command mblks
 * b_queue	used to identify the queue of the originating request (client)
 * b_prev	used to store the command itself for easy parsing.
 *
 * A command placed on an empty queue, or inserted at the head, is tagged
 * CMD_IN_PROGRESS; a command appended to a non-empty queue is not.
 */
static void
ar_cmd_enqueue(arl_t *arl, mblk_t *mp, queue_t *q, ushort_t cmd,
    boolean_t tail_insert)
{
	mp->b_queue = q;

	/* Empty queue: mp becomes both head and tail. */
	if (arl->arl_queue == NULL) {
		ASSERT(arl->arl_queue_tail == NULL);
		mp->b_prev = (void *)((uintptr_t)(cmd | CMD_IN_PROGRESS));
		mp->b_next = NULL;
		arl->arl_queue = mp;
		arl->arl_queue_tail = mp;
		return;
	}

	if (!tail_insert) {
		/* head insert */
		mp->b_prev = (void *)((uintptr_t)cmd | CMD_IN_PROGRESS);
		mp->b_next = arl->arl_queue;
		arl->arl_queue = mp;
		return;
	}

	/* Append behind the current tail. */
	mp->b_prev = (void *)((uintptr_t)cmd);
	mp->b_next = NULL;
	arl->arl_queue_tail->b_next = mp;
	arl->arl_queue_tail = mp;
}
/*
 * Remove and return the command at the head of arl_queue, or NULL if the
 * queue is empty.  The tail pointer is cleared when the last command is
 * removed, and the returned mblk has its b_next link cleared.
 */
static mblk_t *
ar_cmd_dequeue(arl_t *arl)
{
	mblk_t	*head = arl->arl_queue;

	if (head == NULL) {
		ASSERT(arl->arl_queue_tail == NULL);
		return (NULL);
	}

	arl->arl_queue = head->b_next;
	if (head->b_next == NULL)
		arl->arl_queue_tail = NULL;
	head->b_next = NULL;
	return (head);
}
/*
 * Standard ACE timer handling: compute 'fuzz' around a central value or from
 * 0 up to a value, and then set the timer.  The randomization is necessary
 * to prevent groups of systems from falling into synchronization on the
 * network and producing ARP packet storms.
 *
 * With `initial_time' set the delay is picked from [1 .. ace_xmit_interval];
 * otherwise it is picked from ace_xmit_interval +/- 20%.  The delay handed
 * to mi_timer() is never less than 1.
 */
static void
ace_set_timer(ace_t *ace, boolean_t initial_time)
{
	clock_t	rnd;
	clock_t	delay;
	clock_t	frac;

	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
	/* Note that clock_t is signed; must chop off bits */
	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;

	delay = ace->ace_xmit_interval;
	if (!initial_time) {
		/* Compute 'frac' as 20% of the configured interval */
		frac = delay / 5;
		if (frac <= 1)
			frac = 2;
		/* Pick randomly in the range [delay-frac .. delay+frac] */
		delay = delay - frac + rnd % (2 * frac + 1);
		if (delay <= 0)
			delay = 1;
	} else if (delay > 0) {
		/* Anywhere in the [1 .. delay] range */
		delay = (rnd % delay) + 1;
	} else {
		delay = 1;
	}

	mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, delay);
}
/*
* Process entry add requests from external messages.
* It is also called by ip_rput_dlpi_writer() through
* ipif_resolver_up() to change hardware address when
* an asynchronous hardware address change notification
* arrives from the driver.
*
* Any existing ace for the address is deleted and a new one is created from
* the area_t in the message. For published entries the function then either
* starts duplicate address detection (probing) or sends an announcement,
* and arms the ace timer as needed.
*
* Returns EINVAL if the arl cannot be found or the protocol mask is
* missing, EINPROGRESS if the command was queued on the arl for later
* processing, the ar_ce_create() error if entry creation fails, and 0 on
* success.
*/
static int
ar_entry_add(queue_t *q, mblk_t *mp_orig)
{
area_t *area;
ace_t *ace;
uchar_t *hw_addr;
uint32_t hw_addr_len;
uchar_t *proto_addr;
uint32_t proto_addr_len;
uchar_t *proto_mask;
arl_t *arl;
mblk_t *mp = mp_orig;
int err;
uint_t aflags;
boolean_t unverified;
arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
/* We handle both M_IOCTL and M_PROTO messages. */
if (DB_TYPE(mp) == M_IOCTL)
mp = mp->b_cont;
arl = ar_ll_lookup_from_mp(as, mp);
if (arl == NULL)
return (EINVAL);
/*
* Newly received commands from clients go to the tail of the queue.
*/
if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
DTRACE_PROBE3(eadd_enqueued, queue_t *, q, mblk_t *, mp_orig,
arl_t *, arl);
ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_ADD, B_TRUE);
return (EINPROGRESS);
}
/* We are processing the command now; drop any queueing marker. */
mp_orig->b_prev = NULL;
area = (area_t *)mp->b_rptr;
aflags = area->area_flags;
/*
* If the previous entry wasn't published and we are now going
* to publish, then we need to do address verification. The previous
* entry may have been a local unpublished address or even an external
* address. If the entry we find was in an unverified state we retain
* this.
* If it's a new published entry, then we're obligated to do
* duplicate address detection now.
*/
ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup_entry);
if (ace != NULL) {
unverified = !(ace->ace_flags & ACE_F_PUBLISH) &&
(aflags & ACE_F_PUBLISH);
if (ace->ace_flags & ACE_F_UNVERIFIED)
unverified = B_TRUE;
/*
* NOTE(review): the old entry is deleted here, before the request's
* parameters are validated below, so a request that later fails
* with EINVAL still removes the existing entry -- confirm intended.
*/
ar_ce_delete(ace);
} else {
unverified = (aflags & ACE_F_PUBLISH) != 0;
}
/* Allow client to request DAD restart */
if (aflags & ACE_F_UNVERIFIED)
unverified = B_TRUE;
/* Extract parameters from the message. */
hw_addr_len = area->area_hw_addr_length;
hw_addr = mi_offset_paramc(mp, area->area_hw_addr_offset, hw_addr_len);
proto_addr_len = area->area_proto_addr_length;
proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset,
proto_addr_len);
/* The mask is taken to be the same length as the protocol address. */
proto_mask = mi_offset_paramc(mp, area->area_proto_mask_offset,
proto_addr_len);
if (proto_mask == NULL) {
DTRACE_PROBE2(eadd_bad_mask, arl_t *, arl, area_t *, area);
return (EINVAL);
}
/*
* Create the entry with the MAPPING, UNVERIFIED and DEFEND bits stripped
* from the caller's flags; UNVERIFIED is set again below if DAD is needed.
*/
err = ar_ce_create(
arl,
area->area_proto,
hw_addr,
hw_addr_len,
proto_addr,
proto_addr_len,
proto_mask,
NULL,
(uint32_t)0,
NULL,
aflags & ~ACE_F_MAPPING & ~ACE_F_UNVERIFIED & ~ACE_F_DEFEND);
if (err != 0) {
DTRACE_PROBE3(eadd_create_failed, arl_t *, arl, area_t *, area,
int, err);
return (err);
}
/* Everything below applies only to entries we publish. */
if (aflags & ACE_F_PUBLISH) {
arlphy_t *ap;
/* Find the ace that was just created. */
ace = ar_ce_lookup(arl, area->area_proto, proto_addr,
proto_addr_len);
ASSERT(ace != NULL);
ap = ace->ace_xmit_arl->arl_phy;
/* No hardware address supplied: use the transmitting arl's own. */
if (hw_addr == NULL || hw_addr_len == 0) {
hw_addr = ap->ap_hw_addr;
} else if (aflags & ACE_F_MYADDR) {
/*
* If hardware address changes, then make sure
* that the hardware address and hardware
* address length fields in arlphy_t get updated
* too. Otherwise, they will continue carrying
* the old hardware address information.
*/
ASSERT((hw_addr != NULL) && (hw_addr_len != 0));
bcopy(hw_addr, ap->ap_hw_addr, hw_addr_len);
ap->ap_hw_addrlen = hw_addr_len;
}
/* Pick the probe schedule: fast or regular, per ACE_F_FAST. */
if (ace->ace_flags & ACE_F_FAST) {
ace->ace_xmit_count = as->as_fastprobe_count;
ace->ace_xmit_interval = as->as_fastprobe_delay;
} else {
ace->ace_xmit_count = as->as_probe_count;
ace->ace_xmit_interval = as->as_probe_delay;
}
/*
* If the user has disabled duplicate address detection for
* this kind of interface (fast or slow) by setting the probe
* count to zero, then pretend as if we've verified the
* address, and go right to address defense mode.
*/
if (ace->ace_xmit_count == 0)
unverified = B_FALSE;
/*
* If we need to do duplicate address detection, then kick that
* off. Otherwise, send out a gratuitous ARP message in order
* to update everyone's caches with the new hardware address.
*/
if (unverified) {
ace->ace_flags |= ACE_F_UNVERIFIED;
if (ace->ace_xmit_interval == 0) {
/*
* User has configured us to send the first
* probe right away. Do so, and set up for
* the subsequent probes.
*/
DTRACE_PROBE2(eadd_probe, ace_t *, ace,
area_t *, area);
ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
area->area_proto, proto_addr_len,
hw_addr, NULL, NULL, proto_addr, NULL, as);
ace->ace_xmit_count--;
ace->ace_xmit_interval =
(ace->ace_flags & ACE_F_FAST) ?
as->as_fastprobe_interval :
as->as_probe_interval;
ace_set_timer(ace, B_FALSE);
} else {
DTRACE_PROBE2(eadd_delay, ace_t *, ace,
area_t *, area);
/* Regular delay before initial probe */
ace_set_timer(ace, B_TRUE);
}
} else {
DTRACE_PROBE2(eadd_announce, ace_t *, ace,
area_t *, area);
ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
area->area_proto, proto_addr_len, hw_addr,
proto_addr, ap->ap_arp_addr, proto_addr, NULL, as);
/* Remember when this announcement went out. */
ace->ace_last_bcast = ddi_get_lbolt();
/*
* If AUTHORITY is set, it is not just a proxy arp
* entry; we believe we're the authority for this
* entry. In that case, and if we're not just doing
* one-off defense of the address, we send more than
* one copy, so we'll still have a good chance of
* updating everyone even when there's a packet loss
* or two.
*/
if ((aflags & ACE_F_AUTHORITY) &&
!(aflags & ACE_F_DEFEND) &&
as->as_publish_count > 0) {
/* Account for the xmit we just did */
ace->ace_xmit_count = as->as_publish_count - 1;
ace->ace_xmit_interval =
as->as_publish_interval;
if (ace->ace_xmit_count > 0)
ace_set_timer(ace, B_FALSE);
}
}
}
return (0);
}
/*
 * Process entry delete requests from external messages.
 *
 * Returns EINVAL if the arl named in the message cannot be found,
 * EINPROGRESS if the command was queued on the arl, ENXIO if no exactly
 * matching ace exists, and 0 otherwise (whether or not the entry was
 * actually removed).
 */
static int
ar_entry_delete(queue_t *q, mblk_t *mp_orig)
{
	arp_stack_t	*as = ((ar_t *)q->q_ptr)->ar_as;
	mblk_t		*mp = mp_orig;
	arl_t		*arl;
	ace_t		*ace;
	ared_t		*ared;

	/* We handle both M_IOCTL and M_PROTO messages. */
	if (DB_TYPE(mp) == M_IOCTL)
		mp = mp->b_cont;

	arl = ar_ll_lookup_from_mp(as, mp);
	if (arl == NULL)
		return (EINVAL);

	/*
	 * Newly received commands from clients go to the tail of the queue.
	 */
	if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
		DTRACE_PROBE3(edel_enqueued, queue_t *, q, mblk_t *, mp_orig,
		    arl_t *, arl);
		ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_DELETE, B_TRUE);
		return (EINPROGRESS);
	}
	mp_orig->b_prev = NULL;

	/*
	 * Need to know if it is a mapping or an exact match.  Check exact
	 * match first.
	 */
	ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup);
	if (ace == NULL)
		return (ENXIO);

	ared = (ared_t *)mp->b_rptr;

	/*
	 * If it's a permanent entry, then the client is the one who
	 * told us to delete it, so there's no reason to notify.
	 */
	if (ACE_NONPERM(ace))
		ar_delete_notify(ace);

	/*
	 * A permanent entry survives when the caller asked for that with
	 * ARED_F_PRESERVE_PERM; everything else is deleted.
	 */
	if (!ACE_NONPERM(ace) && (ared->ared_flags & ARED_F_PRESERVE_PERM))
		return (0);

	ar_ce_delete(ace);
	return (0);
}
/*
 * Process entry query requests from external messages.
 *
 * Returns EINPROGRESS when the request has been taken over by ARP: it was
 * queued on the arl's command queue, linked onto an ace's query list, or
 * handed to ar_query_reply().  Any other return value is an error.
 *
 * Bump up the ire_stats_freed for all errors except EINPROGRESS, which
 * means the packet has been queued.  For all other errors the packet is
 * going to be freed and hence we account for the ire being freed if it
 * is an M_PROTO message.
 *
 * While a request is owned by this function, mp->b_prev carries the reply
 * queue.  Every error return taken after that pointer has been stashed
 * clears it again, so the message is not passed on with a queue pointer
 * left in b_prev.
 */
static int
ar_entry_query(queue_t *q, mblk_t *mp_orig)
{
	ace_t		*ace;
	areq_t		*areq;
	arl_t		*arl;
	int		err;
	mblk_t		*mp = mp_orig;
	uchar_t		*proto_addr;
	uchar_t		*sender_addr;
	uint32_t	proto_addr_len;
	clock_t		ms;
	boolean_t	is_mproto = B_TRUE;
	arp_stack_t	*as = ((ar_t *)q->q_ptr)->ar_as;

	/* We handle both M_IOCTL and M_PROTO messages. */
	if (DB_TYPE(mp) == M_IOCTL) {
		is_mproto = B_FALSE;
		mp = mp->b_cont;
	}
	arl = ar_ll_lookup_from_mp(as, mp);
	if (arl == NULL) {
		DTRACE_PROBE2(query_no_arl, queue_t *, q, mblk_t *, mp);
		err = EINVAL;
		goto err_ret;
	}
	/*
	 * Newly received commands from clients go to the tail of the queue.
	 */
	if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
		DTRACE_PROBE3(query_enqueued, queue_t *, q, mblk_t *, mp_orig,
		    arl_t *, arl);
		ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_QUERY, B_TRUE);
		return (EINPROGRESS);
	}
	mp_orig->b_prev = NULL;

	areq = (areq_t *)mp->b_rptr;
	proto_addr_len = areq->areq_target_addr_length;
	proto_addr = mi_offset_paramc(mp, areq->areq_target_addr_offset,
	    proto_addr_len);
	if (proto_addr == NULL) {
		DTRACE_PROBE1(query_illegal_address, areq_t *, areq);
		err = EINVAL;
		goto err_ret;
	}
	/* Stash the reply queue pointer for later use. */
	mp->b_prev = (mblk_t *)OTHERQ(q);
	mp->b_next = NULL;
	if (areq->areq_xmit_interval == 0)
		areq->areq_xmit_interval = AR_DEF_XMIT_INTERVAL;
	ace = ar_ce_lookup(arl, areq->areq_proto, proto_addr, proto_addr_len);
	if (ace != NULL && (ace->ace_flags & ACE_F_OLD)) {
		/*
		 * This is a potentially stale entry that IP's asking about.
		 * Since IP is asking, it must not have an answer anymore,
		 * either due to periodic ARP flush or due to SO_DONTROUTE.
		 * Rather than go forward with what we've got, restart
		 * resolution.
		 */
		DTRACE_PROBE2(query_stale_ace, ace_t *, ace, areq_t *, areq);
		ar_ce_delete(ace);
		ace = NULL;
	}
	if (ace != NULL) {
		mblk_t **mpp;
		uint32_t count = 0;

		/*
		 * There is already a cache entry.  This means there is either
		 * a permanent entry, or address resolution is in progress.
		 * If the latter, there should be one or more queries queued
		 * up.  We link the current one in at the end, if there aren't
		 * too many outstanding.
		 */
		for (mpp = &ace->ace_query_mp; mpp[0]; mpp = &mpp[0]->b_next) {
			if (++count > areq->areq_max_buffered) {
				DTRACE_PROBE2(query_overflow, ace_t *, ace,
				    areq_t *, areq);
				mp->b_prev = NULL;
				err = EALREADY;
				goto err_ret;
			}
		}
		/* Put us on the list. */
		mpp[0] = mp;
		if (count != 0) {
			/*
			 * If a query was already queued up, then we must not
			 * have an answer yet.
			 */
			DTRACE_PROBE2(query_in_progress, ace_t *, ace,
			    areq_t *, areq);
			return (EINPROGRESS);
		}
		if (ACE_RESOLVED(ace)) {
			/*
			 * We have an answer already.
			 * Keep a dup of mp since proto_addr points to it
			 * and mp has been placed on the ace_query_mp list.
			 */
			mblk_t *mp1;

			DTRACE_PROBE2(query_resolved, ace_t *, ace,
			    areq_t *, areq);
			mp1 = dupmsg(mp);
			ar_query_reply(ace, 0, proto_addr, proto_addr_len);
			freemsg(mp1);
			return (EINPROGRESS);
		}
		if (ace->ace_flags & ACE_F_MAPPING) {
			/* Should never happen */
			DTRACE_PROBE2(query_unresolved_mapping, ace_t *, ace,
			    areq_t *, areq);
			/*
			 * Take mp back off the query list and, as on every
			 * other error path past the stash above, clear the
			 * reply-queue pointer before giving mp back.
			 */
			mpp[0] = mp->b_next;
			mp->b_prev = NULL;
			err = ENXIO;
			goto err_ret;
		}
		DTRACE_PROBE2(query_unresolved, ace_t *, ace, areq_t *, areq);
	} else {
		/* No ace yet.  Make one now.  (This is the common case.) */
		if (areq->areq_xmit_count == 0) {
			DTRACE_PROBE2(query_template, arl_t *, arl,
			    areq_t *, areq);
			mp->b_prev = NULL;
			err = ENXIO;
			goto err_ret;
		}
		/*
		 * Check for sender addr being NULL or not before
		 * we create the ace.  It is easy to cleanup later.
		 */
		sender_addr = mi_offset_paramc(mp,
		    areq->areq_sender_addr_offset,
		    areq->areq_sender_addr_length);
		if (sender_addr == NULL) {
			DTRACE_PROBE2(query_no_sender, arl_t *, arl,
			    areq_t *, areq);
			mp->b_prev = NULL;
			err = EINVAL;
			goto err_ret;
		}
		err = ar_ce_create(OWNING_ARL(arl), areq->areq_proto, NULL, 0,
		    proto_addr, proto_addr_len, NULL,
		    NULL, (uint32_t)0, sender_addr,
		    areq->areq_flags);
		if (err != 0) {
			DTRACE_PROBE3(query_create_failed, arl_t *, arl,
			    areq_t *, areq, int, err);
			mp->b_prev = NULL;
			goto err_ret;
		}
		ace = ar_ce_lookup(arl, areq->areq_proto, proto_addr,
		    proto_addr_len);
		if (ace == NULL || ace->ace_query_mp != NULL) {
			/* Shouldn't happen! */
			DTRACE_PROBE3(query_lookup_failed, arl_t *, arl,
			    areq_t *, areq, ace_t *, ace);
			mp->b_prev = NULL;
			err = ENXIO;
			goto err_ret;
		}
		ace->ace_query_mp = mp;
	}
	/* Send the query; a zero return asks for an immediate reply. */
	ms = ar_query_xmit(as, ace);
	if (ms == 0) {
		/* Immediate reply requested. */
		ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);
	} else {
		mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, ms);
	}
	return (EINPROGRESS);
err_ret:
	if (is_mproto) {
		ip_stack_t *ipst = as->as_netstack->netstack_ip;

		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
	}
	return (err);
}
/*
 * Handle simple query requests: look up the ace for the protocol address in
 * the request and report its flags, and its addresses if it is resolved,
 * back in the request message itself.
 *
 * Returns EINVAL for an unknown arl, bad address parameters or a hardware
 * address buffer that is too small, ENXIO if there is no matching ace,
 * EINPROGRESS if the command was queued or (for the non-ioctl case) the
 * message was already sent back with qreply(), and 0 for the ioctl case.
 */
static int
ar_entry_squery(queue_t *q, mblk_t *mp_orig)
{
	arp_stack_t	*as = ((ar_t *)q->q_ptr)->ar_as;
	mblk_t		*mp = mp_orig;
	area_t		*area;
	arl_t		*arl;
	ace_t		*ace;
	uchar_t		*paddr;
	uchar_t		*haddr;
	int		plen;
	uint32_t	hlen;

	if (DB_TYPE(mp) == M_IOCTL)
		mp = mp->b_cont;

	arl = ar_ll_lookup_from_mp(as, mp);
	if (arl == NULL)
		return (EINVAL);

	/*
	 * Newly received commands from clients go to the tail of the queue.
	 */
	if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
		DTRACE_PROBE3(squery_enqueued, queue_t *, q, mblk_t *, mp_orig,
		    arl_t *, arl);
		ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_SQUERY, B_TRUE);
		return (EINPROGRESS);
	}
	mp_orig->b_prev = NULL;

	/* Extract parameters from the request message. */
	area = (area_t *)mp->b_rptr;
	plen = area->area_proto_addr_length;
	paddr = mi_offset_paramc(mp, area->area_proto_addr_offset, plen);
	hlen = area->area_hw_addr_length;
	haddr = mi_offset_paramc(mp, area->area_hw_addr_offset, hlen);
	if (paddr == NULL || haddr == NULL) {
		DTRACE_PROBE1(squery_illegal_address, area_t *, area);
		return (EINVAL);
	}

	ace = ar_ce_lookup(arl, area->area_proto, paddr, plen);
	if (ace == NULL)
		return (ENXIO);

	/* The caller's buffer must be able to hold the hardware address. */
	if (hlen < ace->ace_hw_addr_length)
		return (EINVAL);

	if (!ACE_RESOLVED(ace)) {
		/*
		 * We have an incomplete entry.  Set the length to zero and
		 * just return out the flags.
		 */
		area->area_hw_addr_length = 0;
	} else {
		/* Got it, prepare the response. */
		ASSERT(area->area_hw_addr_length == ace->ace_hw_addr_length);
		ar_set_address(ace, haddr, paddr, plen);
	}
	area->area_flags = ace->ace_flags;

	/* For an ioctl the caller completes the request. */
	if (mp != mp_orig)
		return (0);

	/* Non-ioctl case */
	/* TODO: change message type? */
	DB_TYPE(mp) = M_CTL;		/* Caught by ip_wput */
	DTRACE_PROBE3(squery_reply, queue_t *, q, mblk_t *, mp,
	    arl_t *, arl);
	qreply(q, mp);
	return (EINPROGRESS);
}
/*
 * Process an interface down causing us to detach and unbind.
 *
 * Returns EINVAL if the arl is unknown or closing, 0 if the arl is already
 * down (ar_rput then frees `mp'), and EINPROGRESS if the command was queued
 * behind others or has been started and now awaits the driver's ack.
 */
/* ARGSUSED */
static int
ar_interface_down(queue_t *q, mblk_t *mp)
{
	arp_stack_t	*as = ((ar_t *)q->q_ptr)->ar_as;
	arl_t		*arl = ar_ll_lookup_from_mp(as, mp);

	if (arl == NULL || arl->arl_closing) {
		DTRACE_PROBE2(down_no_arl, queue_t *, q, mblk_t *, mp);
		return (EINVAL);
	}

	/*
	 * Newly received commands from clients go to the tail of the queue.
	 */
	if (CMD_NEEDS_QUEUEING(mp, arl)) {
		DTRACE_PROBE3(down_enqueued, queue_t *, q, mblk_t *, mp,
		    arl_t *, arl);
		ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_DOWN, B_TRUE);
		return (EINPROGRESS);
	}
	mp->b_prev = NULL;

	if (arl->arl_state != ARL_S_DOWN) {
		/*
		 * This command cannot complete in a single shot now itself.
		 * It has to be restarted after the receipt of the ack from
		 * the driver.  So we need to enqueue the command (at the
		 * head).
		 */
		ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_DOWN, B_FALSE);
		ASSERT(arl->arl_state == ARL_S_UP);

		/* Free all arp entries for this interface */
		ar_ce_walk(as, ar_ce_delete_per_arl, arl);
		ar_ll_down(arl);

		/* EINPROGRESS keeps ar_rput from freeing the 'mp' */
		return (EINPROGRESS);
	}

	/* The arl is already down, no work to do; ar_rput frees the mp. */
	return (0);
}
/*
 * Process an interface up causing the info req sequence to start.
 *
 * When the request can be answered on the spot (unknown or closing arl, or
 * an arl that is already up) an AR_DLPIOP_DONE message carrying the result
 * is sent on the write side of `q' and that result (EINVAL or 0) is
 * returned; the caller frees `mp'.  Otherwise EINPROGRESS is returned: the
 * command has either been queued behind others or has been started and now
 * waits for the driver.
 */
/* ARGSUSED */
static int
ar_interface_up(queue_t *q, mblk_t *mp)
{
	arp_stack_t	*as = ((ar_t *)q->q_ptr)->ar_as;
	arl_t		*arl;
	mblk_t		*ack_mp;
	queue_t		*wq;
	int		err;

	arl = ar_ll_lookup_from_mp(as, mp);
	if (arl == NULL || arl->arl_closing) {
		DTRACE_PROBE2(up_no_arl, queue_t *, q, mblk_t *, mp);
		err = EINVAL;
	} else {
		/*
		 * Newly received commands from clients go to the tail of
		 * the queue.
		 */
		if (CMD_NEEDS_QUEUEING(mp, arl)) {
			DTRACE_PROBE3(up_enqueued, queue_t *, q, mblk_t *, mp,
			    arl_t *, arl);
			ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_UP, B_TRUE);
			return (EINPROGRESS);
		}
		mp->b_prev = NULL;

		if (arl->arl_state != ARL_S_UP) {
			/*
			 * This command cannot complete in a single shot now
			 * itself.  It has to be restarted after the receipt
			 * of the ack from the driver.  So we need to enqueue
			 * the command (at the head).
			 *
			 * NOTE(review): the result of ar_ll_up() is not acted
			 * on; EINPROGRESS is returned either way -- confirm
			 * that a failure there is recovered elsewhere.
			 */
			ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_UP, B_FALSE);
			(void) ar_ll_up(arl);

			/* EINPROGRESS keeps ar_rput from freeing the 'mp' */
			return (EINPROGRESS);
		}

		/* The arl is already up.  No work to do. */
		err = 0;
	}

	/* Report the immediate result; the caller frees 'mp'. */
	ack_mp = ar_alloc(AR_DLPIOP_DONE, err);
	if (ack_mp != NULL) {
		wq = WR(q);
		DTRACE_PROBE3(up_send_err, queue_t *, wq, mblk_t *, ack_mp,
		    int, err);
		putnext(wq, ack_mp);
	}
	return (err);
}
/*
 * Given an arie_t `mp', find the arl_t's that it names and return them
 * in `*arlp' and `*ipmp_arlp'.  If either cannot be found, return B_FALSE.
 * The group interface name inside the message is forcibly NUL-terminated
 * before it is used for the lookup.
 */
static boolean_t
ar_ipmp_lookup(arp_stack_t *as, mblk_t *mp, arl_t **arlp, arl_t **ipmp_arlp)
{
	arie_t	*arie = (arie_t *)mp->b_rptr;
	arl_t	*member;
	arl_t	*group;

	member = ar_ll_lookup_from_mp(as, mp);
	*arlp = member;
	if (member == NULL) {
		DTRACE_PROBE1(ipmp_lookup_no_arl, mblk_t *, mp);
		return (B_FALSE);
	}

	/* Never trust the sender to have terminated the name. */
	arie->arie_grifname[LIFNAMSIZ - 1] = '\0';
	group = ar_ll_lookup_by_name(as, arie->arie_grifname);
	*ipmp_arlp = group;
	if (group == NULL) {
		DTRACE_PROBE1(ipmp_lookup_no_ipmp_arl, mblk_t *, mp);
		return (B_FALSE);
	}

	DTRACE_PROBE2(ipmp_lookup, arl_t *, *arlp, arl_t *, *ipmp_arlp);
	return (B_TRUE);
}
/*
 * Bind an arl_t to an IPMP group arl_t.
 *
 * Returns EINVAL if either arl named in the message cannot be found,
 * EALREADY if the arl is already bound to a group, and 0 on success.
 */
static int
ar_ipmp_activate(queue_t *q, mblk_t *mp)
{
	arp_stack_t	*as = ((ar_t *)q->q_ptr)->ar_as;
	arl_t		*member;
	arl_t		*group;

	if (!ar_ipmp_lookup(as, mp, &member, &group))
		return (EINVAL);

	if (member->arl_ipmp_arl == NULL) {
		DTRACE_PROBE2(ipmp_activate, arl_t *, member, arl_t *, group);
		member->arl_ipmp_arl = group;
		return (0);
	}

	DTRACE_PROBE1(ipmp_activated_already, arl_t *, member);
	return (EALREADY);
}
/*
 * Unbind an arl_t from an IPMP group arl_t and update the ace_t's so
 * that it is no longer part of the group.
 *
 * Returns EINVAL if either arl cannot be found or if the arl is not
 * currently bound to the named group, and 0 on success.
 */
static int
ar_ipmp_deactivate(queue_t *q, mblk_t *mp)
{
	arp_stack_t	*as = ((ar_t *)q->q_ptr)->ar_as;
	arl_t		*member;
	arl_t		*group;

	if (!ar_ipmp_lookup(as, mp, &member, &group))
		return (EINVAL);

	/* The arl must currently belong to the group that was named. */
	if (group != member->arl_ipmp_arl) {
		DTRACE_PROBE2(ipmp_deactivate_notactive, arl_t *, member,
		    arl_t *, group);
		return (EINVAL);
	}

	DTRACE_PROBE2(ipmp_deactivate, arl_t *, member, arl_t *,
	    member->arl_ipmp_arl);

	/* Fix up the aces first, then drop the group linkage. */
	ar_ce_walk(as, ar_ce_ipmp_deactivate, member);
	member->arl_ipmp_arl = NULL;
	return (0);
}
/*
 * Allow the interface named in `mp' to process ARP_REQUEST and ARP_RESPONSE
 * messages again by clearing ARL_F_NOARP.  Returns EINVAL if no such
 * interface exists.
 */
/* ARGSUSED */
static int
ar_interface_on(queue_t *q, mblk_t *mp)
{
	ar_t		*ar = (ar_t *)q->q_ptr;
	arp_stack_t	*as = ar->ar_as;
	arl_t		*target = ar_ll_lookup_from_mp(as, mp);

	if (target != NULL) {
		DTRACE_PROBE3(on_intf, queue_t *, q, mblk_t *, mp, arl_t *,
		    target);
		target->arl_flags &= ~ARL_F_NOARP;
		return (0);
	}

	DTRACE_PROBE2(on_no_arl, queue_t *, q, mblk_t *, mp);
	return (EINVAL);
}
/*
 * Stop the interface named in `mp' from processing ARP_REQUEST and
 * ARP_RESPONSE messages by setting ARL_F_NOARP.  Returns EINVAL if no such
 * interface exists.
 */
/* ARGSUSED */
static int
ar_interface_off(queue_t *q, mblk_t *mp)
{
	ar_t		*ar = (ar_t *)q->q_ptr;
	arp_stack_t	*as = ar->ar_as;
	arl_t		*target = ar_ll_lookup_from_mp(as, mp);

	if (target != NULL) {
		DTRACE_PROBE3(off_intf, queue_t *, q, mblk_t *, mp, arl_t *,
		    target);
		target->arl_flags |= ARL_F_NOARP;
		return (0);
	}

	DTRACE_PROBE2(off_no_arl, queue_t *, q, mblk_t *, mp);
	return (EINVAL);
}
/*
 * The queue 'q' is closing.  Visit every arl and discard any message
 * pending on its arl_queue that originated from either half of the closing
 * queue pair, keeping arl_queue and arl_queue_tail consistent as entries
 * are unlinked.  Discarded M_PROTO AR_ENTRY_QUERY messages are accounted
 * for in the IPv4 ire_stats_freed counter.
 */
static void
ar_ll_cleanup_arl_queue(queue_t *q)
{
	ar_t		*ar = (ar_t *)q->q_ptr;
	arp_stack_t	*as = ar->ar_as;
	ip_stack_t	*ipst = as->as_netstack->netstack_ip;
	arl_t		*arl;
	mblk_t		*cur;
	mblk_t		*following;
	mblk_t		*last_kept;

	for (arl = as->as_arl_head; arl != NULL; arl = arl->arl_next) {
		last_kept = NULL;
		cur = arl->arl_queue;
		while (cur != NULL) {
			following = cur->b_next;

			/* Messages from other streams stay on the queue. */
			if ((void *)cur->b_queue != (void *)q &&
			    (void *)cur->b_queue != (void *)OTHERQ(q)) {
				last_kept = cur;
				cur = following;
				continue;
			}

			/* Unlink `cur' from the singly-linked list. */
			if (last_kept != NULL)
				last_kept->b_next = cur->b_next;
			else
				arl->arl_queue = cur->b_next;
			if (arl->arl_queue_tail == cur)
				arl->arl_queue_tail = last_kept;

			if (DB_TYPE(cur) == M_PROTO &&
			    *(uint32_t *)cur->b_rptr == AR_ENTRY_QUERY) {
				BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
				    ire_stats_freed);
			}
			inet_freemsg(cur);
			cur = following;
		}
	}
}
/*
 * Search the stack's arl list for the lower level tap whose arl_name
 * matches `name' exactly.  Returns the arl, or NULL if there is none.
 */
static arl_t *
ar_ll_lookup_by_name(arp_stack_t *as, const char *name)
{
	arl_t	*candidate = as->as_arl_head;

	while (candidate != NULL) {
		if (strcmp(candidate->arl_name, name) == 0)
			break;
		candidate = candidate->arl_next;
	}
	return (candidate);
}
/*
 * Look up a lower level tap using the interface name carried in the common
 * (arc_t) portion of the ARP command in `mp'.  The name must lie within the
 * message and its last byte must be a NUL; otherwise NULL is returned.
 */
static arl_t *
ar_ll_lookup_from_mp(arp_stack_t *as, mblk_t *mp)
{
	arc_t	*cmd = (arc_t *)mp->b_rptr;
	size_t	len = cmd->arc_name_length;
	uint8_t	*ifname = mi_offset_param(mp, cmd->arc_name_offset, len);

	if (ifname == NULL)
		return (NULL);
	/* Refuse names that are not terminated within their stated length. */
	if (ifname[len - 1] != '\0')
		return (NULL);
	return (ar_ll_lookup_by_name(as, (char *)ifname));
}
/*
 * Allocate the arl_t for the ARP instance `ar' and seed it from the
 * DL_INFO_ACK in `mp': the provider style, the instance's read/write
 * queues, and (for SUNW_DL_IPMP links) the IPMP flag and self-referencing
 * arl_ipmp_arl.  The new arl is attached to `ar' and given an index that is
 * unique among the arl's currently on the stack's list.  On allocation
 * failure the function returns silently with ar->ar_arl still NULL.
 */
static void
ar_ll_init(arp_stack_t *as, ar_t *ar, mblk_t *mp)
{
	arl_t		*arl;
	dl_info_ack_t	*dlia = (dl_info_ack_t *)mp->b_rptr;

	ASSERT(ar->ar_arl == NULL);

	if ((arl = (arl_t *)mi_zalloc(sizeof (arl_t))) == NULL)
		return;

	if (dlia->dl_mac_type == SUNW_DL_IPMP) {
		arl->arl_flags |= ARL_F_IPMP;
		arl->arl_ipmp_arl = arl;
	}

	arl->arl_provider_style = dlia->dl_provider_style;
	arl->arl_rq = ar->ar_rq;
	arl->arl_wq = ar->ar_wq;
	arl->arl_dlpi_pending = DL_PRIM_INVAL;

	ar->ar_arl = arl;

	/*
	 * If/when ARP gets pushed into the IP module then this code to make
	 * a number uniquely identify an ARP instance can be removed and the
	 * ifindex from IP used.  Rather than try and reinvent or copy the
	 * code used by IP for the purpose of allocating an index number
	 * (and trying to keep the number small), just allocate it in an
	 * ever increasing manner.  This index number isn't ever exposed to
	 * users directly, its only use is for providing the pfhooks interface
	 * with a number it can use to uniquely identify an interface in time.
	 *
	 * Using a 32bit counter, over 136 plumbs would need to be done every
	 * second of every day (non-leap year) for it to wrap around and the
	 * for() loop below to kick in as a performance concern.
	 */
	if (as->as_arp_counter_wrapped) {
		arl_t	*arl1;

		/*
		 * The counter has wrapped at least once, so its current
		 * value may already be in use.  Advance it until no arl on
		 * the list carries the same index; each collision restarts
		 * the scan from the head of the list.
		 */
		do {
			for (arl1 = as->as_arl_head; arl1 != NULL;
			    arl1 = arl1->arl_next)
				if (arl1->arl_index ==
				    as->as_arp_index_counter) {
					as->as_arp_index_counter++;
					if (as->as_arp_index_counter == 0) {
						as->as_arp_counter_wrapped++;
						as->as_arp_index_counter = 1;
					}
					break;
				}
		} while (arl1 != NULL);
	}

	/*
	 * Whether or not a search was needed, the counter now holds the
	 * index for this arl.  (Previously the index was only assigned when
	 * the counter had never wrapped, leaving arl_index at zero after a
	 * wrap.)
	 */
	arl->arl_index = as->as_arp_index_counter;

	/* Step past the value just handed out, skipping zero on wrap. */
	as->as_arp_index_counter++;
	if (as->as_arp_index_counter == 0) {
		as->as_arp_counter_wrapped++;
		as->as_arp_index_counter = 1;
	}
}
/*
 * This routine is called during module initialization when the DL_INFO_ACK
 * comes back from the device. We set up defaults for all the device dependent
 * doo-dads we are going to need. This will leave us ready to roll if we are
 * attempting auto-configuration. Alternatively, these defaults can be
 * overridden by initialization procedures possessing higher intelligence.
 *
 * `mp' holds the DL_INFO_ACK. On success arl->arl_phy points at a freshly
 * allocated arlphy_t holding the SAP/hardware address lengths, the ARP
 * hardware type, the hardware and broadcast ("ARP") address buffers, and a
 * template DL_UNITDATA_REQ. On any failure (allocation failure or a
 * broadcast address that does not fit in the message) everything is torn
 * down again via ar_ll_clear_defaults(), leaving arl->arl_phy NULL. For an
 * IPMP arl nothing is allocated at all.
 */
static void
ar_ll_set_defaults(arl_t *arl, mblk_t *mp)
{
ar_m_t *arm;
dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr;
dl_unitdata_req_t *dlur;
uchar_t *up;
arlphy_t *ap;
ASSERT(arl != NULL);
/*
 * Clear any stale defaults that might exist.
 */
ar_ll_clear_defaults(arl);
if (arl->arl_flags & ARL_F_IPMP) {
/*
 * If this is an IPMP arl_t, we have nothing to do,
 * since we will never transmit or receive.
 */
return;
}
ap = kmem_zalloc(sizeof (arlphy_t), KM_NOSLEEP);
if (ap == NULL)
goto bad;
/*
 * Publish `ap' immediately so that the `bad' path, which works from
 * arl->arl_phy, can release whatever has been allocated so far.
 */
arl->arl_phy = ap;
/* Fall back to the DL_OTHER table entry for unknown MAC types. */
if ((arm = ar_m_lookup(dlia->dl_mac_type)) == NULL)
arm = ar_m_lookup(DL_OTHER);
ASSERT(arm != NULL);
/*
 * We initialize based on parameters in the (currently) not too
 * exhaustive ar_m_tbl.
 */
if (dlia->dl_version == DL_VERSION_2) {
/* XXX DLPI spec allows dl_sap_length of 0 before binding. */
ap->ap_saplen = dlia->dl_sap_length;
ap->ap_hw_addrlen = dlia->dl_brdcst_addr_length;
} else {
ap->ap_saplen = arm->ar_mac_sap_length;
ap->ap_hw_addrlen = arm->ar_mac_hw_addr_length;
}
ap->ap_arp_hw_type = arm->ar_mac_arp_hw_type;
/*
 * Allocate the hardware and ARP addresses; note that the hardware
 * address cannot be filled in until we see the DL_BIND_ACK.
 */
ap->ap_hw_addr = kmem_zalloc(ap->ap_hw_addrlen, KM_NOSLEEP);
ap->ap_arp_addr = kmem_alloc(ap->ap_hw_addrlen, KM_NOSLEEP);
if (ap->ap_hw_addr == NULL || ap->ap_arp_addr == NULL)
goto bad;
if (dlia->dl_version == DL_VERSION_2) {
/* Take the broadcast address straight from the DL_INFO_ACK. */
if ((up = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
ap->ap_hw_addrlen)) == NULL)
goto bad;
bcopy(up, ap->ap_arp_addr, ap->ap_hw_addrlen);
} else {
/*
 * No choice but to assume a broadcast address of all ones,
 * known to work on some popular networks.
 */
(void) memset(ap->ap_arp_addr, ~0, ap->ap_hw_addrlen);
}
/*
 * Make us a template DL_UNITDATA_REQ message which we will use for
 * broadcasting resolution requests, and which we will clone to hand
 * back as responses to the protocols.
 */
ap->ap_xmit_mp = ar_dlpi_comm(DL_UNITDATA_REQ, ap->ap_hw_addrlen +
ABS(ap->ap_saplen) + sizeof (dl_unitdata_req_t));
if (ap->ap_xmit_mp == NULL)
goto bad;
dlur = (dl_unitdata_req_t *)ap->ap_xmit_mp->b_rptr;
dlur->dl_priority.dl_min = 0;
dlur->dl_priority.dl_max = 0;
dlur->dl_dest_addr_length = ap->ap_hw_addrlen + ABS(ap->ap_saplen);
dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
/* NOTE: the destination address and sap offsets are permanently set */
ap->ap_xmit_sapoff = dlur->dl_dest_addr_offset;
ap->ap_xmit_addroff = dlur->dl_dest_addr_offset;
/*
 * The sign of ap_saplen selects the layout: negative places the SAP
 * after the hardware address, otherwise the SAP comes first.
 */
if (ap->ap_saplen < 0)
ap->ap_xmit_sapoff += ap->ap_hw_addrlen; /* sap last */
else
ap->ap_xmit_addroff += ap->ap_saplen; /* addr last */
/*
 * NOTE(review): this stores a 16-bit SAP regardless of ABS(ap_saplen);
 * with a zero dl_sap_length (see the XXX above) the store lands in the
 * address portion of the template -- confirm this is acceptable.
 */
*(uint16_t *)((caddr_t)dlur + ap->ap_xmit_sapoff) = ETHERTYPE_ARP;
return;
bad:
ar_ll_clear_defaults(arl);
}
/*
 * Release the physical-layer defaults hanging off `arl', if any: the
 * hardware and ARP address buffers, the transmit template, and the arlphy_t
 * itself.  arl->arl_phy is cleared before anything is freed.
 */
static void
ar_ll_clear_defaults(arl_t *arl)
{
	arlphy_t	*phy = arl->arl_phy;

	if (phy == NULL)
		return;

	arl->arl_phy = NULL;

	/* Both address buffers were sized by ap_hw_addrlen. */
	if (phy->ap_hw_addr != NULL)
		kmem_free(phy->ap_hw_addr, phy->ap_hw_addrlen);
	if (phy->ap_arp_addr != NULL)
		kmem_free(phy->ap_arp_addr, phy->ap_hw_addrlen);

	freemsg(phy->ap_xmit_mp);
	kmem_free(phy, sizeof (arlphy_t));
}
/*
 * Begin taking an ARL_S_UP arl down: sever the pairing with its associated
 * IP instance (if any), mark the arl ARL_S_PENDING, and hand the
 * preallocated DL_UNBIND_REQ (and, for style 2 providers, DL_DETACH_REQ)
 * to ar_dlpi_send().
 */
static void
ar_ll_down(arl_t *arl)
{
	ar_t	*ar;
	ar_t	*peer;
	mblk_t	*req;

	ASSERT(arl->arl_state == ARL_S_UP);

	/* Let's break the association between an ARL and IP instance */
	ar = (ar_t *)arl->arl_rq->q_ptr;
	peer = ar->ar_arl_ip_assoc;
	if (peer != NULL) {
		ASSERT(peer->ar_arl_ip_assoc != NULL &&
		    peer->ar_arl_ip_assoc == ar);
		peer->ar_arl_ip_assoc = NULL;
		ar->ar_arl_ip_assoc = NULL;
	}

	arl->arl_state = ARL_S_PENDING;

	/* The unbind request was stashed on the arl when it came up. */
	req = arl->arl_unbind_mp;
	ASSERT(req != NULL);
	ar_dlpi_send(arl, req);
	arl->arl_unbind_mp = NULL;

	if (arl->arl_provider_style != DL_STYLE2)
		return;

	/* Style 2 providers must also be detached. */
	req = arl->arl_detach_mp;
	ASSERT(req != NULL);
	ar_dlpi_send(arl, req);
	arl->arl_detach_mp = NULL;
}
/*
 * Begin bringing an ARL_S_DOWN arl up.  Every DLPI message that will be
 * needed, both to come up (attach for style 2, info, bind, notify) and to
 * go down again later (unbind, plus detach for style 2), is allocated up
 * front so that nothing is sent unless everything can be.  On success the
 * arl is marked ARL_S_PENDING, the "up" messages are passed to
 * ar_dlpi_send(), the "down" messages are stashed on the arl, and 0 is
 * returned.  If any allocation fails, all messages are freed and ENOMEM is
 * returned with the arl untouched.
 */
static int
ar_ll_up(arl_t *arl)
{
	boolean_t	style2;
	dl_bind_req_t	*bind;
	mblk_t		*attach_req = NULL;
	mblk_t		*detach_req = NULL;
	mblk_t		*info_req = NULL;
	mblk_t		*bind_req = NULL;
	mblk_t		*unbind_req = NULL;
	mblk_t		*notify_req = NULL;

	ASSERT(arl->arl_state == ARL_S_DOWN);

	style2 = (arl->arl_provider_style == DL_STYLE2);

	/* Phase 1: allocate everything. */
	if (style2) {
		attach_req = ar_dlpi_comm(DL_ATTACH_REQ,
		    sizeof (dl_attach_req_t));
		if (attach_req == NULL)
			goto nomem;
		detach_req = ar_dlpi_comm(DL_DETACH_REQ,
		    sizeof (dl_detach_req_t));
		if (detach_req == NULL)
			goto nomem;
	}
	if ((info_req = ar_dlpi_comm(DL_INFO_REQ,
	    sizeof (dl_info_req_t))) == NULL ||
	    (bind_req = ar_dlpi_comm(DL_BIND_REQ,
	    sizeof (dl_bind_req_t))) == NULL ||
	    (unbind_req = ar_dlpi_comm(DL_UNBIND_REQ,
	    sizeof (dl_unbind_req_t))) == NULL ||
	    (notify_req = ar_dlpi_comm(DL_NOTIFY_REQ,
	    sizeof (dl_notify_req_t))) == NULL)
		goto nomem;

	/* Phase 2: fill in the request-specific fields. */
	if (style2) {
		((dl_attach_req_t *)attach_req->b_rptr)->dl_ppa =
		    arl->arl_ppa;
	}
	bind = (dl_bind_req_t *)bind_req->b_rptr;
	bind->dl_sap = ETHERTYPE_ARP;
	bind->dl_service_mode = DL_CLDLS;
	((dl_notify_req_t *)notify_req->b_rptr)->dl_notifications =
	    DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN;

	/* Phase 3: commit. */
	arl->arl_state = ARL_S_PENDING;
	if (style2) {
		ar_dlpi_send(arl, attach_req);
		ASSERT(detach_req != NULL);
		arl->arl_detach_mp = detach_req;
	}
	ar_dlpi_send(arl, info_req);
	ar_dlpi_send(arl, bind_req);
	arl->arl_unbind_mp = unbind_req;
	ar_dlpi_send(arl, notify_req);
	return (0);

nomem:
	/* freemsg() is given NULL for anything not yet allocated. */
	freemsg(attach_req);
	freemsg(bind_req);
	freemsg(detach_req);
	freemsg(unbind_req);
	freemsg(info_req);
	freemsg(notify_req);
	return (ENOMEM);
}
/*
 * Process mapping add requests from external messages.
 *
 * `mp_orig' is either an M_IOCTL (whose b_cont carries the arma_t) or an
 * M_PROTO holding the arma_t directly.  Returns:
 *   EINVAL       if the named interface does not exist, or if the protocol
 *                mask or extract mask does not lie within the message;
 *   EINPROGRESS  if the command had to be queued behind other commands on
 *                the arl (the message is then owned by the queue);
 *   otherwise    the result of ar_ce_create().
 *
 * Any existing mapping entry matching the request is replaced.  The request
 * is validated before that entry is deleted, so a request rejected with
 * EINVAL leaves the cache unchanged.
 */
static int
ar_mapping_add(queue_t *q, mblk_t *mp_orig)
{
	arma_t		*arma;
	mblk_t		*mp = mp_orig;
	ace_t		*ace;
	uchar_t		*hw_addr;
	uint32_t	hw_addr_len;
	uchar_t		*proto_addr;
	uint32_t	proto_addr_len;
	uchar_t		*proto_mask;
	uchar_t		*proto_extract_mask;
	uint32_t	hw_extract_start;
	arl_t		*arl;
	arp_stack_t	*as = ((ar_t *)q->q_ptr)->ar_as;

	/* We handle both M_IOCTL and M_PROTO messages. */
	if (DB_TYPE(mp) == M_IOCTL)
		mp = mp->b_cont;
	arl = ar_ll_lookup_from_mp(as, mp);
	if (arl == NULL)
		return (EINVAL);
	/*
	 * Newly received commands from clients go to the tail of the queue.
	 */
	if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
		DTRACE_PROBE3(madd_enqueued, queue_t *, q, mblk_t *, mp_orig,
		    arl_t *, arl);
		ar_cmd_enqueue(arl, mp_orig, q, AR_MAPPING_ADD, B_TRUE);
		return (EINPROGRESS);
	}
	mp_orig->b_prev = NULL;

	/* Pull the request parameters out of the message. */
	arma = (arma_t *)mp->b_rptr;
	hw_addr_len = arma->arma_hw_addr_length;
	hw_addr = mi_offset_paramc(mp, arma->arma_hw_addr_offset, hw_addr_len);
	proto_addr_len = arma->arma_proto_addr_length;
	proto_addr = mi_offset_paramc(mp, arma->arma_proto_addr_offset,
	    proto_addr_len);
	proto_mask = mi_offset_paramc(mp, arma->arma_proto_mask_offset,
	    proto_addr_len);
	proto_extract_mask = mi_offset_paramc(mp,
	    arma->arma_proto_extract_mask_offset, proto_addr_len);
	hw_extract_start = arma->arma_hw_mapping_start;

	/* Reject malformed requests before touching the cache. */
	if (proto_mask == NULL || proto_extract_mask == NULL) {
		DTRACE_PROBE2(madd_illegal_mask, arl_t *, arl, arma_t *, arma);
		return (EINVAL);
	}

	/* A new mapping supersedes any existing one for the same address. */
	ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup_mapping);
	if (ace != NULL)
		ar_ce_delete(ace);

	return (ar_ce_create(
	    arl,
	    arma->arma_proto,
	    hw_addr,
	    hw_addr_len,
	    proto_addr,
	    proto_addr_len,
	    proto_mask,
	    proto_extract_mask,
	    hw_extract_start,
	    NULL,
	    arma->arma_flags | ACE_F_MAPPING));
}
/*
 * Report whether every one of the `mask_len' bytes of `mask' is 0xFF.
 * A NULL mask, like a zero-length one, is treated as all ones.
 */
static boolean_t
ar_mask_all_ones(uchar_t *mask, uint32_t mask_len)
{
	uint32_t	i;

	if (mask == NULL)
		return (B_TRUE);

	for (i = 0; i < mask_len; i++) {
		if (mask[i] != 0xFF)
			return (B_FALSE);
	}
	return (B_TRUE);
}
/*
 * Find the ar_m_tbl entry describing MAC type `mac_type'.  Returns NULL if
 * the table has no entry for that type.
 */
static ar_m_t *
ar_m_lookup(t_uscalar_t mac_type)
{
	ar_m_t	*entry = ar_m_tbl;

	while (entry < A_END(ar_m_tbl)) {
		if (entry->ar_mac_type == mac_type)
			return (entry);
		entry++;
	}
	return (NULL);
}
/*
 * Respond to Named Dispatch requests.  Returns 0 when `mp' is an M_IOCTL
 * that nd_getset() handled against this stack's parameter table, and
 * ENOENT otherwise.
 */
static int
ar_nd_ioctl(queue_t *q, mblk_t *mp)
{
	ar_t		*ar = (ar_t *)q->q_ptr;
	arp_stack_t	*as = ar->ar_as;

	if (DB_TYPE(mp) != M_IOCTL)
		return (ENOENT);
	if (!nd_getset(q, as->as_nd, mp))
		return (ENOENT);
	return (0);
}
/*
 * ARP module open routine.
 *
 * A reopen (q_ptr already set) succeeds immediately. Otherwise the netstack
 * for the caller's credentials is looked up, per-instance state is allocated
 * through mi_open_comm(), and the instance's queues, credentials and ARP
 * stack pointer are recorded. What happens next depends on what ARP has been
 * pushed on:
 *   - directly above "ip" or "udp": no DLPI probe is needed; if there is
 *     something below that module as well (an ill stream), an AR_ARP_EXTEND
 *     M_CTL is sent down.
 *   - anywhere else on a stream whose driver is "ip" or "udp": rejected
 *     with ENOTSUP.
 *   - on a device stream: a DL_INFO_REQ is sent and the open blocks until
 *     ar_arl has been set up, or fails with EINTR if the wait is
 *     interrupted by a signal.
 * Once the instance has been set up, failures are unwound through
 * ar_close().
 */
static int
ar_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
ar_t *ar;
int err;
queue_t *tmp_q;
mblk_t *mp;
netstack_t *ns;
arp_stack_t *as;
TRACE_1(TR_FAC_ARP, TR_ARP_OPEN,
"arp_open: q %p", q);
/* Allow a reopen. */
if (q->q_ptr != NULL) {
return (0);
}
ns = netstack_find_by_cred(credp);
ASSERT(ns != NULL);
as = ns->netstack_arp;
ASSERT(as != NULL);
/* mi_open_comm allocates the instance data structure, etc. */
err = mi_open_comm(&as->as_head, sizeof (ar_t), q, devp, flag, sflag,
credp);
if (err) {
/* Drop the netstack reference; no instance exists to own it. */
netstack_rele(as->as_netstack);
return (err);
}
/*
 * We are D_MTPERMOD so it is safe to do qprocson before
 * the instance data has been initialized.
 */
qprocson(q);
ar = (ar_t *)q->q_ptr;
ar->ar_rq = q;
/* From here on `q' refers to the write side of the queue pair. */
q = WR(q);
ar->ar_wq = q;
crhold(credp);
ar->ar_credp = credp;
ar->ar_as = as;
/*
 * Probe for the DLPI info if we are not pushed on IP or UDP. Wait for
 * the reply. In case of error call ar_close() which will take
 * care of doing everything required to close this instance, such
 * as freeing the arl, restarting the timer on a different queue etc.
 */
if (strcmp(q->q_next->q_qinfo->qi_minfo->mi_idname, "ip") == 0 ||
strcmp(q->q_next->q_qinfo->qi_minfo->mi_idname, "udp") == 0) {
arc_t *arc;
/*
 * We are pushed directly on top of IP or UDP. There is no need
 * to send down a DL_INFO_REQ. Return success. This could
 * either be an ill stream (i.e. <arp-IP-Driver> stream)
 * or a stream corresponding to an open of /dev/arp
 * (i.e. <arp-IP> stream). Note that we don't support
 * pushing some module in between arp and IP.
 *
 * Tell IP, though, that we're an extended implementation, so
 * it knows to expect a DAD response after bringing an
 * interface up. Old ATM drivers won't do this, and IP will
 * just bring the interface up immediately.
 */
ar->ar_on_ill_stream = (q->q_next->q_next != NULL);
if (!ar->ar_on_ill_stream)
return (0);
mp = allocb(sizeof (arc_t), BPRI_MED);
if (mp == NULL) {
(void) ar_close(RD(q));
return (ENOMEM);
}
/* Only arc_cmd is filled in; the rest of the arc_t is left as is. */
DB_TYPE(mp) = M_CTL;
arc = (arc_t *)mp->b_rptr;
mp->b_wptr = mp->b_rptr + sizeof (arc_t);
arc->arc_cmd = AR_ARP_EXTEND;
putnext(q, mp);
return (0);
}
tmp_q = q;
/* Get the driver's queue */
while (tmp_q->q_next != NULL)
tmp_q = tmp_q->q_next;
ASSERT(tmp_q->q_qinfo->qi_minfo != NULL);
if (strcmp(tmp_q->q_qinfo->qi_minfo->mi_idname, "ip") == 0 ||
strcmp(tmp_q->q_qinfo->qi_minfo->mi_idname, "udp") == 0) {
/*
 * We don't support pushing ARP arbitrarily on an IP or UDP
 * driver stream. ARP has to be pushed directly above IP or
 * UDP.
 */
(void) ar_close(RD(q));
return (ENOTSUP);
} else {
/*
 * Send down a DL_INFO_REQ so we can find out what we are
 * talking to.
 */
mp = ar_dlpi_comm(DL_INFO_REQ, sizeof (dl_info_req_t));
if (mp == NULL) {
(void) ar_close(RD(q));
return (ENOMEM);
}
putnext(ar->ar_wq, mp);
/*
 * Block until ar_arl has been set; qwait_sig() returning zero
 * means the wait was interrupted by a signal.
 */
while (ar->ar_arl == NULL) {
if (!qwait_sig(ar->ar_rq)) {
(void) ar_close(RD(q));
return (EINTR);
}
}
}
return (0);
}
/*
 * Named Dispatch "get" handler: format the current value of the arpparam_t
 * passed in `cp' as a decimal number into the reply message `mp'.
 */
/* ARGSUSED */
static int
ar_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
{
	arpparam_t	*param = (arpparam_t *)cp;

	(void) mi_mpprintf(mp, "%d", param->arp_param_value);

	return (0);
}
/*
 * Register each of the `cnt' elements of the `arppa' array that has a
 * non-empty name with the named dispatch handler `ndp'.  If any
 * registration fails, the whole table is released with nd_free() and
 * B_FALSE is returned.
 */
static boolean_t
ar_param_register(IDP *ndp, arpparam_t *arppa, int cnt)
{
	int	i;

	for (i = 0; i < cnt; i++) {
		arpparam_t	*param = &arppa[i];

		/* Unnamed slots are not exposed. */
		if (!param->arp_param_name || !param->arp_param_name[0])
			continue;

		if (nd_load(ndp, param->arp_param_name,
		    ar_param_get, ar_param_set, (caddr_t)param))
			continue;

		nd_free(ndp);
		return (B_FALSE);
	}
	return (B_TRUE);
}
/*
 * Named Dispatch "set" handler: parse `value' as a base-10 integer and,
 * if it lies within the parameter's [arp_param_min, arp_param_max] range,
 * store it.  Returns EINVAL for unparsable or out-of-range input.
 */
/* ARGSUSED */
static int
ar_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
{
	arpparam_t	*param = (arpparam_t *)cp;
	long		parsed;

	if (ddi_strtol(value, NULL, 10, &parsed) != 0)
		return (EINVAL);
	if (parsed < param->arp_param_min)
		return (EINVAL);
	if (parsed > param->arp_param_max)
		return (EINVAL);

	param->arp_param_value = parsed;
	return (0);
}
/*
 * Process an I_PLINK ioctl.  If the lower stream is an arp device stream,
 * append another mblk to the chain, that will carry the device name,
 * and the muxid.  IP uses this info to look up the corresponding ill, and
 * set the ill_arp_muxid atomically, as part of the I_PLINK, instead of
 * waiting for the SIOCSLIFMUXID (which may never happen if ifconfig is
 * killed, and this has the bad effect of not being able to unplumb
 * subsequently).
 *
 * `mp' is the M_IOCTL, whose single continuation block holds the linkblk.
 * Returns ENOMEM if the extra mblk cannot be allocated, in which case the
 * message has not been modified or forwarded.  Otherwise the augmented
 * message is passed on with putnext() and 0 is returned.
 */
static int
ar_plink_send(queue_t *q, mblk_t *mp)
{
	char		*name;
	mblk_t		*muxmp;
	mblk_t		*mp1;
	ar_t		*ar = (ar_t *)q->q_ptr;
	arp_stack_t	*as = ar->ar_as;
	struct linkblk	*li;
	struct ipmx_s	*ipmxp;
	queue_t		*arpwq;

	mp1 = mp->b_cont;
	ASSERT((mp1 != NULL) && (mp1->b_cont == NULL));
	li = (struct linkblk *)mp1->b_rptr;
	arpwq = li->l_qbot;

	/*
	 * Allocate a new mblk which will hold an ipmx_s and chain it to
	 * the M_IOCTL chain.  The final chain will consist of 3 mblks,
	 * namely the M_IOCTL, followed by the linkblk, followed by the ipmx_s
	 */
	muxmp = allocb(sizeof (struct ipmx_s), BPRI_MED);
	if (muxmp == NULL)
		return (ENOMEM);
	ipmxp = (struct ipmx_s *)muxmp->b_wptr;
	ipmxp->ipmx_arpdev_stream = 0;
	muxmp->b_wptr += sizeof (struct ipmx_s);
	mp1->b_cont = muxmp;

	/*
	 * The l_qbot represents the uppermost write queue of the
	 * lower stream.  Walk down this stream till we hit ARP.
	 * We can safely walk, since STREAMS has made sure the stream
	 * cannot close till the IOCACK goes up, and is not interruptible.
	 */
	while (arpwq != NULL) {
		/*
		 * Beware of broken modules like logsubr.c that
		 * may not have a q_qinfo or qi_minfo.  The check must be
		 * made on the queue being walked (arpwq), since that is the
		 * one dereferenced below.
		 */
		if ((arpwq->q_qinfo != NULL) &&
		    (arpwq->q_qinfo->qi_minfo != NULL)) {
			name = arpwq->q_qinfo->qi_minfo->mi_idname;
			if (name != NULL && name[0] != '\0' &&
			    (strcmp(name, arp_mod_info.mi_idname) == 0))
				break;
		}
		arpwq = arpwq->q_next;
	}

	/*
	 * Check if arpwq corresponds to an arp device stream, by walking
	 * the mi list.  If it does, then add the muxid and device name info
	 * for use by IP.  IP will send the M_IOCACK.
	 */
	if (arpwq != NULL) {
		for (ar = (ar_t *)mi_first_ptr(&as->as_head); ar != NULL;
		    ar = (ar_t *)mi_next_ptr(&as->as_head, (void *)ar)) {
			if ((ar->ar_wq == arpwq) && (ar->ar_arl != NULL)) {
				ipmxp->ipmx_arpdev_stream = 1;
				/*
				 * NOTE(review): unbounded copy; presumably
				 * ipmx_name is at least as large as
				 * arl_name -- confirm against the struct
				 * definitions.
				 */
				(void) strcpy((char *)ipmxp->ipmx_name,
				    ar->ar_arl->arl_name);
				break;
			}
		}
	}

	putnext(q, mp);
	return (0);
}
/*
 * ar_ce_walk routine to delete any outstanding queries for an ar that is
 * going away.  `arg' is the departing ar_t; every query on the ace whose
 * saved response queue is either of that instance's queues is unlinked and
 * freed.  Freed M_PROTO AR_ENTRY_QUERY messages are counted in the IPv4
 * ire_stats_freed statistic.
 */
static void
ar_query_delete(ace_t *ace, void *arg)
{
	ar_t		*ar = arg;
	arp_stack_t	*as = ar->ar_as;
	ip_stack_t	*ipst = as->as_netstack->netstack_ip;
	mblk_t		**link = &ace->ace_query_mp;
	mblk_t		*query;
	queue_t		*reply_q;

	for (;;) {
		query = *link;
		if (query == NULL)
			break;

		/* The response queue was stored in the query b_prev. */
		reply_q = (queue_t *)query->b_prev;
		if (reply_q != ar->ar_wq && reply_q != ar->ar_rq) {
			/* Someone else's query; step over it. */
			link = &query->b_next;
			continue;
		}

		*link = query->b_next;
		if (DB_TYPE(query) == M_PROTO &&
		    *(uint32_t *)query->b_rptr == AR_ENTRY_QUERY) {
			BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
			    ire_stats_freed);
		}
		inet_freemsg(query);
	}
}
/*
 * This routine is called either when an address resolution has just been
 * found, or when it is time to give up, or in some other error situation.
 * If a non-zero ret_val is provided, any outstanding queries for the
 * specified ace will be completed using that error value. Otherwise,
 * the completion status will depend on whether the address has been
 * resolved.
 *
 * `proto_addr' and `proto_addr_len' are handed to ar_set_address() when
 * filling in the hardware address of each successful reply.
 *
 * Note that if copying the transmit template fails for one query, ret_val
 * becomes ENOMEM and is not reset, so every remaining queued query is
 * completed with that error as well.
 */
static void
ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr,
uint32_t proto_addr_len)
{
mblk_t *areq_mp;
mblk_t *mp;
mblk_t *xmit_mp;
queue_t *arl_wq = ace->ace_arl->arl_wq;
arp_stack_t *as = ARL_TO_ARPSTACK(ace->ace_arl);
ip_stack_t *ipst = as->as_netstack->netstack_ip;
arlphy_t *ap = ace->ace_xmit_arl->arl_phy;
/*
 * On error or completion for a query, we need to shut down the timer.
 * However, the timer must not be stopped for an interface doing
 * Duplicate Address Detection, or it will never finish that phase.
 */
if (!(ace->ace_flags & (ACE_F_UNVERIFIED | ACE_F_AUTHORITY)))
mi_timer(arl_wq, ace->ace_mp, -1L);
/* Establish the return value appropriate. */
if (ret_val == 0) {
if (!ACE_RESOLVED(ace) || ap == NULL)
ret_val = ENXIO;
}
/* Terminate all outstanding queries. */
while ((mp = ace->ace_query_mp) != 0) {
/* The response queue was saved in b_prev. */
queue_t *q = (queue_t *)mp->b_prev;
/* Detach this query from the ace's list before replying. */
mp->b_prev = NULL;
ace->ace_query_mp = mp->b_next;
mp->b_next = NULL;
/*
 * If we have the answer, attempt to get a copy of the xmit
 * template to prepare for the client.
 */
if (ret_val == 0 &&
(xmit_mp = copyb(ap->ap_xmit_mp)) == NULL) {
/* Too bad, buy more memory. */
ret_val = ENOMEM;
}
/* Complete the response based on how the request arrived. */
if (DB_TYPE(mp) == M_IOCTL) {
struct iocblk *ioc = (struct iocblk *)mp->b_rptr;
ioc->ioc_error = ret_val;
if (ret_val != 0) {
DB_TYPE(mp) = M_IOCNAK;
ioc->ioc_count = 0;
putnext(q, mp);
continue;
}
/*
 * Return the xmit mp out with the successful IOCTL.
 */
DB_TYPE(mp) = M_IOCACK;
ioc->ioc_count = MBLKL(xmit_mp);
/* Remove the areq mblk from the IOCTL. */
areq_mp = mp->b_cont;
mp->b_cont = areq_mp->b_cont;
} else {
if (ret_val != 0) {
/* TODO: find some way to let the guy know? */
inet_freemsg(mp);
BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
ire_stats_freed);
continue;
}
/*
 * In the M_PROTO case, the areq message is followed by
 * a message chain to be returned to the protocol. ARP
 * doesn't know (or care) what is in this chain, but in
 * the event that the reader is pondering the
 * relationship between ARP and IP (for example), the
 * areq is followed by an incipient IRE, and then the
 * original outbound packet. Here we detach the areq.
 */
areq_mp = mp;
mp = mp->b_cont;
}
/* Only successful replies get this far; xmit_mp is valid here. */
ASSERT(ret_val == 0 && ap != NULL);
if (ap->ap_saplen != 0) {
/*
 * Copy the SAP type specified in the request into
 * the xmit mp.
 */
areq_t *areq = (areq_t *)areq_mp->b_rptr;
bcopy(areq->areq_sap, xmit_mp->b_rptr +
ap->ap_xmit_sapoff, ABS(ap->ap_saplen));
}
/* Done with the areq message. */
freeb(areq_mp);
/*
 * Copy the resolved hardware address into the xmit mp
 * or perform the mapping operation.
 */
ar_set_address(ace, xmit_mp->b_rptr + ap->ap_xmit_addroff,
proto_addr, proto_addr_len);
/*
 * Now insert the xmit mp after the response message. In
 * the M_IOCTL case, it will be the returned data block. In
 * the M_PROTO case, (again using IP as an example) it will
 * appear after the IRE and before the outbound packet.
 */
xmit_mp->b_cont = mp->b_cont;
mp->b_cont = xmit_mp;
putnext(q, mp);
}
/*
 * Unless we are responding from a permanent cache entry, start the
 * cleanup timer or (on error) delete the entry.
 */
if (!(ace->ace_flags & (ACE_F_PERMANENT | ACE_F_DYING))) {
if (!ACE_RESOLVED(ace) || ap == NULL) {
/*
 * No need to notify IP here, because the entry was
 * never resolved, so IP can't have any cached copies
 * of the address.
 */
ar_ce_delete(ace);
} else {
mi_timer(arl_wq, ace->ace_mp, as->as_cleanup_interval);
}
}
}
/*
 * Transmit (or retransmit) an ARP request for the first query pending on
 * `ace', consuming one unit of that query's areq_xmit_count.
 *
 * Returns number of milliseconds after which we should either rexmit or abort.
 * Return of zero means we should abort. Zero is returned when there is no
 * pending query, when the transmit count is exhausted, or when no cache
 * entry exists for the request's sender address on the transmit arl.
 */
static clock_t
ar_query_xmit(arp_stack_t *as, ace_t *ace)
{
areq_t *areq;
mblk_t *mp;
uchar_t *proto_addr;
uchar_t *sender_addr;
ace_t *src_ace;
arl_t *xmit_arl = ace->ace_xmit_arl;
mp = ace->ace_query_mp;
/*
 * ar_query_delete may have just blown off the outstanding
 * ace_query_mp entries because the client who sent the query
 * went away. If this happens just before the ace_mp timer
 * goes off, we'd find a null ace_query_mp which is not an error.
 * The unresolved ace itself, and the timer, will be removed
 * when the arl stream goes away.
 */
if (!mp)
return (0);
/* For an ioctl the areq lives in the continuation block. */
if (DB_TYPE(mp) == M_IOCTL)
mp = mp->b_cont;
areq = (areq_t *)mp->b_rptr;
if (areq->areq_xmit_count == 0)
return (0);
areq->areq_xmit_count--;
proto_addr = mi_offset_paramc(mp, areq->areq_target_addr_offset,
areq->areq_target_addr_length);
sender_addr = mi_offset_paramc(mp, areq->areq_sender_addr_offset,
areq->areq_sender_addr_length);
/*
 * Get the ace for the sender address, so that we can verify that
 * we have one and that DAD has completed.
 */
src_ace = ar_ce_lookup(xmit_arl, areq->areq_proto, sender_addr,
areq->areq_sender_addr_length);
if (src_ace == NULL) {
DTRACE_PROBE3(xmit_no_source, ace_t *, ace, areq_t *, areq,
uchar_t *, sender_addr);
return (0);
}
/*
 * If we haven't yet finished duplicate address checking on this source
 * address, then do *not* use it on the wire. Doing so will corrupt
 * the world's caches. Just allow the timer to restart. Note that
 * duplicate address checking will eventually complete one way or the
 * other, so this cannot go on "forever."
 */
if (src_ace->ace_flags & ACE_F_UNVERIFIED) {
DTRACE_PROBE2(xmit_source_unverified, ace_t *, ace,
ace_t *, src_ace);
/* Nothing was sent, so give back the attempt consumed above. */
areq->areq_xmit_count++;
return (areq->areq_xmit_interval);
}
DTRACE_PROBE3(xmit_send, ace_t *, ace, ace_t *, src_ace,
areq_t *, areq);
ar_xmit(xmit_arl, ARP_REQUEST, areq->areq_proto,
areq->areq_sender_addr_length, xmit_arl->arl_phy->ap_hw_addr,
sender_addr, xmit_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as);
/* Record the send time on the source entry's ace_last_bcast. */
src_ace->ace_last_bcast = ddi_get_lbolt();
return (areq->areq_xmit_interval);
}
/* Our read side put procedure. */
static void
ar_rput(queue_t *q, mblk_t *mp)
{
arh_t *arh;
arl_t *arl;
arl_t *client_arl;
ace_t *dst_ace;
uchar_t *dst_paddr;
int err;
uint32_t hlen;
struct iocblk *ioc;
mblk_t *mp1;
int op;
uint32_t plen;
uint32_t proto;
uchar_t *src_haddr;
uchar_t *src_paddr;
uchar_t *dst_haddr;
boolean_t is_probe;
boolean_t is_unicast = B_FALSE;
dl_unitdata_ind_t *dlindp;
int i;
arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
TRACE_1(TR_FAC_ARP, TR_ARP_RPUT_START,
"arp_rput_start: q %p", q);
/*
* We handle ARP commands from below both in M_IOCTL and M_PROTO
* messages. Actual ARP requests and responses will show up as
* M_PROTO messages containing DL_UNITDATA_IND blocks.
*/
switch (DB_TYPE(mp)) {
case M_IOCTL:
err = ar_cmd_dispatch(q, mp, B_FALSE);
switch (err) {
case ENOENT:
DB_TYPE(mp) = M_IOCNAK;
if ((mp1 = mp->b_cont) != 0) {
/*
* Collapse the data as a note to the
* originator.
*/
mp1->b_wptr = mp1->b_rptr;
}
break;
case EINPROGRESS:
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "ioctl/inprogress");
return;
default:
DB_TYPE(mp) = M_IOCACK;
break;
}
ioc = (struct iocblk *)mp->b_rptr;
ioc->ioc_error = err;
if ((mp1 = mp->b_cont) != 0)
ioc->ioc_count = MBLKL(mp1);
else
ioc->ioc_count = 0;
qreply(q, mp);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "ioctl");
return;
case M_CTL:
/*
* IP is acking the AR_ARP_CLOSING message that we sent
* in ar_close.
*/
if (MBLKL(mp) == sizeof (arc_t)) {
if (((arc_t *)mp->b_rptr)->arc_cmd == AR_ARP_CLOSING)
((ar_t *)q->q_ptr)->ar_ip_acked_close = 1;
}
freemsg(mp);
return;
case M_PCPROTO:
case M_PROTO:
dlindp = (dl_unitdata_ind_t *)mp->b_rptr;
if (MBLKL(mp) >= sizeof (dl_unitdata_ind_t) &&
dlindp->dl_primitive == DL_UNITDATA_IND) {
is_unicast = (dlindp->dl_group_address == 0);
arl = ((ar_t *)q->q_ptr)->ar_arl;
if (arl != NULL && arl->arl_phy != NULL) {
/* Real messages from the wire! */
break;
}
putnext(q, mp);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "default");
return;
}
err = ar_cmd_dispatch(q, mp, B_FALSE);
switch (err) {
case ENOENT:
/* Miscellaneous DLPI messages get shuffled off. */
ar_rput_dlpi(q, mp);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "proto/dlpi");
break;
case EINPROGRESS:
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "proto");
break;
default:
inet_freemsg(mp);
break;
}
return;
default:
putnext(q, mp);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "default");
return;
}
/*
* If the IFF_NOARP flag is on, then do not process any
* incoming ARP_REQUEST or incoming ARP_RESPONSE.
*/
if (arl->arl_flags & ARL_F_NOARP) {
freemsg(mp);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "interface has IFF_NOARP set");
return;
}
/*
* What we should have at this point is a DL_UNITDATA_IND message
* followed by an ARP packet. We do some initial checks and then
* get to work.
*/
mp1 = mp->b_cont;
if (mp1 == NULL) {
freemsg(mp);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "baddlpi");
return;
}
if (mp1->b_cont != NULL) {
/* No fooling around with funny messages. */
if (!pullupmsg(mp1, -1)) {
freemsg(mp);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "pullupmsgfail");
return;
}
}
arh = (arh_t *)mp1->b_rptr;
hlen = arh->arh_hlen;
plen = arh->arh_plen;
if (MBLKL(mp1) < ARH_FIXED_LEN + 2 * hlen + 2 * plen) {
freemsg(mp);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "short");
return;
}
/*
* hlen 0 is used for RFC 1868 UnARP.
*
* Note that the rest of the code checks that hlen is what we expect
* for this hardware address type, so might as well discard packets
* here that don't match.
*/
if ((hlen > 0 && hlen != arl->arl_phy->ap_hw_addrlen) || plen == 0) {
DTRACE_PROBE2(rput_bogus, arl_t *, arl, mblk_t *, mp1);
freemsg(mp);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "hlenzero/plenzero");
return;
}
/*
* Historically, Solaris has been lenient about hardware type numbers.
* We should check here, but don't.
*/
DTRACE_PROBE2(rput_normal, arl_t *, arl, arh_t *, arh);
DTRACE_PROBE3(arp__physical__in__start,
arl_t *, arl, arh_t *, arh, mblk_t *, mp);
ARP_HOOK_IN(as->as_arp_physical_in_event, as->as_arp_physical_in,
arl->arl_index, arh, mp, mp1, as);
DTRACE_PROBE1(arp__physical__in__end, mblk_t *, mp);
if (mp == NULL)
return;
proto = (uint32_t)BE16_TO_U16(arh->arh_proto);
src_haddr = (uchar_t *)arh;
src_haddr = &src_haddr[ARH_FIXED_LEN];
src_paddr = &src_haddr[hlen];
dst_haddr = &src_haddr[hlen + plen];
dst_paddr = &src_haddr[hlen + plen + hlen];
op = BE16_TO_U16(arh->arh_operation);
/* Determine if this is just a probe */
for (i = 0; i < plen; i++)
if (src_paddr[i] != 0)
break;
is_probe = i >= plen;
/*
* RFC 826: first check if the <protocol, sender protocol address> is
* in the cache, if there is a sender protocol address. Note that this
* step also handles resolutions based on source.
*
* Note that IP expects that each notification it receives will be
* tied to the ill it received it on. Thus, we must talk to it over
* the arl tied to the resolved IP address (if any), hence client_arl.
*/
if (is_probe)
err = AR_NOTFOUND;
else
err = ar_ce_resolve_all(arl, proto, src_haddr, hlen, src_paddr,
plen, &client_arl);
switch (err) {
case AR_BOGON:
ar_client_notify(client_arl, mp1, AR_CN_BOGON);
mp1 = NULL;
break;
case AR_FAILED:
ar_client_notify(client_arl, mp1, AR_CN_FAILED);
mp1 = NULL;
break;
case AR_LOOPBACK:
DTRACE_PROBE2(rput_loopback, arl_t *, arl, arh_t *, arh);
freemsg(mp1);
mp1 = NULL;
break;
}
if (mp1 == NULL) {
freeb(mp);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "unneeded");
return;
}
/*
* Now look up the destination address. By RFC 826, we ignore the
* packet at this step if the target isn't one of our addresses. This
* is true even if the target is something we're trying to resolve and
* the packet is a response. To avoid duplicate responses, we also
* ignore the packet if it was multicast/broadcast to an arl that's in
* an IPMP group but was not the designated xmit_arl for the ACE.
*
* Note that in order to do this correctly, we need to know when to
* notify IP of a change implied by the source address of the ARP
* message. That implies that the local ARP table has entries for all
* of the resolved entries cached in the client. This is why we must
* notify IP when we delete a resolved entry and we know that IP may
* have cached answers.
*/
dst_ace = ar_ce_lookup_entry(arl, proto, dst_paddr, plen);
if (dst_ace == NULL || !ACE_RESOLVED(dst_ace) ||
(dst_ace->ace_xmit_arl != arl && !is_unicast) ||
!(dst_ace->ace_flags & ACE_F_PUBLISH)) {
/*
* Let the client know if the source mapping has changed, even
* if the destination provides no useful information for the
* client.
*/
if (err == AR_CHANGED)
ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE);
else
freemsg(mp1);
freeb(mp);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "nottarget");
return;
}
/*
* If the target is unverified by DAD, then one of two things is true:
* either it's someone else claiming this address (on a probe or an
* announcement) or it's just a regular request. The former is
* failure, but a regular request is not.
*/
if (dst_ace->ace_flags & ACE_F_UNVERIFIED) {
/*
* Check for a reflection. Some misbehaving bridges will
* reflect our own transmitted packets back to us.
*/
if (hlen == dst_ace->ace_hw_addr_length &&
bcmp(src_haddr, dst_ace->ace_hw_addr, hlen) == 0) {
DTRACE_PROBE3(rput_probe_reflected, arl_t *, arl,
arh_t *, arh, ace_t *, dst_ace);
freeb(mp);
freemsg(mp1);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "reflection");
return;
}
/*
* Conflicts seen via the wrong interface may be bogus.
* Multiple interfaces on the same segment imply any conflict
* will also be seen via the correct interface, so we can ignore
* anything not matching the arl from the ace.
*/
if (arl != dst_ace->ace_arl) {
DTRACE_PROBE3(rput_probe_misdirect, arl_t *, arl,
arh_t *, arh, ace_t *, dst_ace);
freeb(mp);
freemsg(mp1);
return;
}
/*
* Responses targeting our HW address that are not responses to
* our DAD probe must be ignored as they are related to requests
* sent before DAD was restarted. Note: response to our DAD
* probe will have been handled by ar_ce_resolve_all() above.
*/
if (op == ARP_RESPONSE &&
(bcmp(dst_haddr, dst_ace->ace_hw_addr, hlen) == 0)) {
DTRACE_PROBE3(rput_probe_stale, arl_t *, arl,
arh_t *, arh, ace_t *, dst_ace);
freeb(mp);
freemsg(mp1);
return;
}
/*
* Responses targeted to HW addresses which are not ours but
* sent to our unverified proto address are also conflicts.
* These may be reported by a proxy rather than the interface
* with the conflicting address, dst_paddr is in conflict
* rather than src_paddr. To ensure IP can locate the correct
* ipif to take down, it is necessary to copy dst_paddr to
* the src_paddr field before sending it to IP. The same is
* required for probes, where src_paddr will be INADDR_ANY.
*/
if (is_probe) {
/*
* In this case, client_arl will be invalid (e.g.,
* since probes don't have a valid sender address).
* But dst_ace has the appropriate arl.
*/
bcopy(dst_paddr, src_paddr, plen);
ar_client_notify(dst_ace->ace_arl, mp1, AR_CN_FAILED);
ar_ce_delete(dst_ace);
} else if (op == ARP_RESPONSE) {
bcopy(dst_paddr, src_paddr, plen);
ar_client_notify(client_arl, mp1, AR_CN_FAILED);
ar_ce_delete(dst_ace);
} else if (err == AR_CHANGED) {
ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE);
} else {
DTRACE_PROBE3(rput_request_unverified, arl_t *, arl,
arh_t *, arh, ace_t *, dst_ace);
freemsg(mp1);
}
freeb(mp);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "unverified");
return;
}
/*
* If it's a request, then we reply to this, and if we think the
* sender's unknown, then we create an entry to avoid unnecessary ARPs.
* The design assumption is that someone ARPing us is likely to send us
* a packet soon, and that we'll want to reply to it.
*/
if (op == ARP_REQUEST) {
const uchar_t *dstaddr = src_haddr;
clock_t now;
/*
* This implements periodic address defense based on a modified
* version of the RFC 3927 requirements. Instead of sending a
* broadcasted reply every time, as demanded by the RFC, we
* send at most one broadcast reply per arp_broadcast_interval.
*/
now = ddi_get_lbolt();
if ((now - dst_ace->ace_last_bcast) >
MSEC_TO_TICK(as->as_broadcast_interval)) {
DTRACE_PROBE3(rput_bcast_reply, arl_t *, arl,
arh_t *, arh, ace_t *, dst_ace);
dst_ace->ace_last_bcast = now;
dstaddr = arl->arl_phy->ap_arp_addr;
/*
* If this is one of the long-suffering entries, then
* pull it out now. It no longer needs separate
* defense, because we're doing that now with this
* broadcasted reply.
*/
dst_ace->ace_flags &= ~ACE_F_DELAYED;
}
ar_xmit(arl, ARP_RESPONSE, dst_ace->ace_proto, plen,
dst_ace->ace_hw_addr, dst_ace->ace_proto_addr,
src_haddr, src_paddr, dstaddr, as);
if (!is_probe && err == AR_NOTFOUND &&
ar_ce_create(OWNING_ARL(arl), proto, src_haddr, hlen,
src_paddr, plen, NULL, NULL, 0, NULL, 0) == 0) {
ace_t *ace;
ace = ar_ce_lookup(arl, proto, src_paddr, plen);
ASSERT(ace != NULL);
mi_timer(ace->ace_arl->arl_wq, ace->ace_mp,
as->as_cleanup_interval);
}
}
if (err == AR_CHANGED) {
freeb(mp);
ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "reqchange");
} else {
freemsg(mp);
TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
"arp_rput_end: q %p (%S)", q, "end");
}
}
/*
 * ar_ce_walk() callback run when a DL_NOTE_LINK_UP notification arrives.
 *
 * ace is the cache entry being visited and arl_arg is the arl_t whose link
 * came up.  Entries that transmit on that arl and that are both unverified
 * and marked as having had duplicate address detection aborted get their
 * probe count and interval reloaded and their timer restarted.
 */
static void
ar_ce_restart_dad(ace_t *ace, void *arl_arg)
{
	arl_t		*arl = arl_arg;
	arp_stack_t	*as = ARL_TO_ARPSTACK(arl);

	/* Not transmitted on the link that just came up. */
	if (ace->ace_xmit_arl != arl)
		return;

	/* Only entries whose DAD was cut short are of interest. */
	if ((ace->ace_flags & (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) !=
	    (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED))
		return;

	/*
	 * Slight cheat here: we don't use the initial probe delay
	 * in this obscure case.
	 */
	if (!(ace->ace_flags & ACE_F_FAST)) {
		ace->ace_xmit_count = as->as_probe_count;
		ace->ace_xmit_interval = as->as_probe_interval;
	} else {
		ace->ace_xmit_count = as->as_fastprobe_count;
		ace->ace_xmit_interval = as->as_fastprobe_interval;
	}

	ace->ace_flags &= ~ACE_F_DAD_ABORTED;
	ace_set_timer(ace, B_FALSE);
}
/*
* DLPI messages, other than DL_UNITDATA_IND are handled here.
*
* q is the queue the message arrived on and mp is the message itself.
* Messages too short to hold a DLPI primitive, and primitives this routine
* does not act on, are handed on with putnext().  Every message that is
* consumed here is released by the freemsg() at the bottom of the function
* (or by an explicit freemsg() on the early-return paths).
*
* NOTE(review): arl is NULL-checked only when loading ap and in the
* DL_INFO_ACK case.  The DL_ERROR_ACK, DL_OK_ACK and DL_BIND_ACK cases
* dereference arl unconditionally; presumably those acks cannot arrive
* before the arl exists -- confirm.
* NOTE(review): only sizeof (dlp->dl_primitive) bytes are length-checked;
* the per-primitive fields read below are not -- confirm the provider is
* trusted to send well-formed messages.
*/
static void
ar_rput_dlpi(queue_t *q, mblk_t *mp)
{
ar_t *ar = q->q_ptr;
arl_t *arl = ar->ar_arl;
arlphy_t *ap = NULL;
union DL_primitives *dlp;
const char *err_str;
arp_stack_t *as = ar->ar_as;
/* ap stays NULL when no arl has been attached to this stream yet. */
if (arl != NULL)
ap = arl->arl_phy;
/* Too short to even identify the primitive: not ours, pass it along. */
if (MBLKL(mp) < sizeof (dlp->dl_primitive)) {
putnext(q, mp);
return;
}
dlp = (union DL_primitives *)mp->b_rptr;
switch (dlp->dl_primitive) {
case DL_ERROR_ACK:
/*
* ce is confused about how DLPI works, so we have to interpret
* an "error" on DL_NOTIFY_ACK (which we never could have sent)
* as really meaning an error on DL_NOTIFY_REQ.
*
* Note that supporting DL_NOTIFY_REQ is optional, so printing
* out an error message on the console isn't warranted except
* for debug.
*/
if (dlp->error_ack.dl_error_primitive == DL_NOTIFY_ACK ||
dlp->error_ack.dl_error_primitive == DL_NOTIFY_REQ) {
ar_dlpi_done(arl, DL_NOTIFY_REQ);
freemsg(mp);
return;
}
err_str = dl_primstr(dlp->error_ack.dl_error_primitive);
DTRACE_PROBE2(rput_dl_error, arl_t *, arl,
dl_error_ack_t *, &dlp->error_ack);
/*
* Update the arl state for the failed request.  Only the four
* request types listed here are treated as ours.
*/
switch (dlp->error_ack.dl_error_primitive) {
case DL_UNBIND_REQ:
if (arl->arl_provider_style == DL_STYLE1)
arl->arl_state = ARL_S_DOWN;
break;
case DL_DETACH_REQ:
case DL_BIND_REQ:
arl->arl_state = ARL_S_DOWN;
break;
case DL_ATTACH_REQ:
break;
default:
/* If it's anything else, we didn't send it. */
putnext(q, mp);
return;
}
/* Mark the request complete, then log the failure. */
ar_dlpi_done(arl, dlp->error_ack.dl_error_primitive);
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
"ar_rput_dlpi: %s failed, dl_errno %d, dl_unix_errno %d",
err_str, dlp->error_ack.dl_errno,
dlp->error_ack.dl_unix_errno);
break;
case DL_INFO_ACK:
DTRACE_PROBE2(rput_dl_info, arl_t *, arl,
dl_info_ack_t *, &dlp->info_ack);
/*
* Two uses of the info ack: answering a DL_INFO_REQ pending on an
* existing arl, or (when there is no arl yet) initializing one via
* ar_ll_init().  An info ack for an existing arl with no
* DL_INFO_REQ pending takes neither branch.
*/
if (arl != NULL && arl->arl_dlpi_pending == DL_INFO_REQ) {
/*
* We have a response back from the driver. Go set up
* transmit defaults.
*/
ar_ll_set_defaults(arl, mp);
ar_dlpi_done(arl, DL_INFO_REQ);
} else if (arl == NULL) {
ar_ll_init(as, ar, mp);
}
/* Kick off any awaiting messages */
qenable(WR(q));
break;
case DL_OK_ACK:
DTRACE_PROBE2(rput_dl_ok, arl_t *, arl,
dl_ok_ack_t *, &dlp->ok_ack);
/* Same state handling as the error case, minus DL_BIND_REQ. */
switch (dlp->ok_ack.dl_correct_primitive) {
case DL_UNBIND_REQ:
if (arl->arl_provider_style == DL_STYLE1)
arl->arl_state = ARL_S_DOWN;
break;
case DL_DETACH_REQ:
arl->arl_state = ARL_S_DOWN;
break;
case DL_ATTACH_REQ:
break;
default:
/* Acknowledges something other than the requests above: pass on. */
putnext(q, mp);
return;
}
ar_dlpi_done(arl, dlp->ok_ack.dl_correct_primitive);
break;
case DL_NOTIFY_ACK:
DTRACE_PROBE2(rput_dl_notify, arl_t *, arl,
dl_notify_ack_t *, &dlp->notify_ack);
/*
* We mostly care about interface-up transitions, as this is
* when we need to redo duplicate address detection.
*/
if (ap != NULL) {
/* Record whether DL_NOTE_LINK_UP is among the acked notifications. */
ap->ap_notifies = (dlp->notify_ack.dl_notifications &
DL_NOTE_LINK_UP) != 0;
}
ar_dlpi_done(arl, DL_NOTIFY_REQ);
break;
case DL_BIND_ACK:
DTRACE_PROBE2(rput_dl_bind, arl_t *, arl,
dl_bind_ack_t *, &dlp->bind_ack);
if (ap != NULL) {
caddr_t hw_addr;
/*
* Locate the address in the ack; when ap_saplen is positive
* that many bytes are skipped before the hardware address.
*/
hw_addr = (caddr_t)dlp + dlp->bind_ack.dl_addr_offset;
if (ap->ap_saplen > 0)
hw_addr += ap->ap_saplen;
bcopy(hw_addr, ap->ap_hw_addr, ap->ap_hw_addrlen);
}
arl->arl_state = ARL_S_UP;
ar_dlpi_done(arl, DL_BIND_REQ);
break;
case DL_NOTIFY_IND:
DTRACE_PROBE2(rput_dl_notify_ind, arl_t *, arl,
dl_notify_ind_t *, &dlp->notify_ind);
if (ap != NULL) {
switch (dlp->notify_ind.dl_notification) {
case DL_NOTE_LINK_UP:
/* Link is back: restart any DAD that was aborted on this arl. */
ap->ap_link_down = B_FALSE;
ar_ce_walk(as, ar_ce_restart_dad, arl);
break;
case DL_NOTE_LINK_DOWN:
ap->ap_link_down = B_TRUE;
break;
}
}
break;
case DL_UDERROR_IND:
DTRACE_PROBE2(rput_dl_uderror, arl_t *, arl,
dl_uderror_ind_t *, &dlp->uderror_ind);
/* Log the error, then pass the indication along unchanged. */
(void) mi_strlog(q, 1, SL_ERROR | SL_TRACE,
"ar_rput_dlpi: "
"DL_UDERROR_IND, dl_dest_addr_length %d dl_errno %d",
dlp->uderror_ind.dl_dest_addr_length,
dlp->uderror_ind.dl_errno);
putnext(q, mp);
return;
default:
DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl,
union DL_primitives *, dlp);
putnext(q, mp);
return;
}
/* Every case that breaks out of the switch has consumed the message. */
freemsg(mp);
}
/*
 * Write the hardware address of an ACE into the buffer at addrpos.
 *
 * ace_hw_addr_length bytes of ace_hw_addr are copied first.  For a mapping
 * entry (ACE_F_MAPPING) that has an extract mask, and when the caller
 * supplied proto_addr, bytes of the protocol address are then masked with
 * ace_proto_extract_mask and OR-ed into the copy starting at offset
 * ace_hw_extract_start.
 */
static void
ar_set_address(ace_t *ace, uchar_t *addrpos, uchar_t *proto_addr,
    uint32_t proto_addr_len)
{
	uchar_t	*maskp;
	uchar_t	*dst;
	int	nbytes;

	ASSERT(ace->ace_hw_addr != NULL);
	bcopy(ace->ace_hw_addr, addrpos, ace->ace_hw_addr_length);

	/* Nothing more to do unless this is a usable mapping entry. */
	if (!(ace->ace_flags & ACE_F_MAPPING) || proto_addr == NULL ||
	    !ace->ace_proto_extract_mask)	/* careful */
		return;

	/* Bounded by the space left after the start offset and by the input. */
	nbytes = MIN((int)ace->ace_hw_addr_length
	    - ace->ace_hw_extract_start,
	    proto_addr_len);
	maskp = ace->ace_proto_extract_mask;
	dst = addrpos + ace->ace_hw_extract_start;
	for (; nbytes > 0; nbytes--)
		*dst++ |= *maskp++ & *proto_addr++;
}
/*
 * Handle a request to name the ARP link attached to this stream
 * (SIOCSLIFNAME is the command placed in the copy sent downstream).
 *
 * q is the queue the request arrived on; mp_orig is either an M_IOCTL
 * (payload in b_cont) or an M_PROTO message carrying a struct lifreq.
 *
 * Returns:
 *	0		name and ppa recorded, NE_PLUMB hook run, arl chained
 *			into the stack's list and an M_CTL copy sent downstream
 *	ENOENT		the stream is an IP (ill) stream; caller forwards it
 *	EINPROGRESS	no arl yet; mp_orig was re-queued with putq()
 *	EINVAL		the message is too short to hold a struct lifreq
 *	EALREADY	the arl already has a name
 *	ENXIO		the supplied name is not shorter than LIFNAMSIZ
 *	EEXIST		another arl already uses the name
 *	ENOMEM		the downstream copy could not be allocated
 */
static int
ar_slifname(queue_t *q, mblk_t *mp_orig)
{
	ar_t		*ar = q->q_ptr;
	arl_t		*arl = ar->ar_arl;
	struct lifreq	*lifr;
	mblk_t		*mp = mp_orig;
	arl_t		*old_arl;
	mblk_t		*ioccpy;
	struct iocblk	*iocp;
	hook_nic_event_t info;
	arp_stack_t	*as = ar->ar_as;

	if (ar->ar_on_ill_stream) {
		/*
		 * This command is for IP, since it is coming down
		 * the <arp-IP-driver> stream.  Return ENOENT so that
		 * it will be sent downstream by the caller
		 */
		return (ENOENT);
	}

	/* We handle both M_IOCTL and M_PROTO messages */
	if (DB_TYPE(mp) == M_IOCTL)
		mp = mp->b_cont;

	if (q->q_next == NULL || arl == NULL) {
		/*
		 * If the interface was just opened and
		 * the info ack has not yet come back from the driver
		 */
		DTRACE_PROBE2(slifname_no_arl, queue_t *, q,
		    mblk_t *, mp_orig);
		(void) putq(q, mp_orig);
		return (EINPROGRESS);
	}

	/*
	 * Reject short messages outright.  Previously the probe fired but
	 * processing continued, so the lifreq fields below were read from
	 * beyond the end of the message data.
	 */
	if (MBLKL(mp) < sizeof (struct lifreq)) {
		DTRACE_PROBE2(slifname_malformed, queue_t *, q,
		    mblk_t *, mp);
		return (EINVAL);
	}

	/* A name can be set only once. */
	if (arl->arl_name[0] != '\0') {
		DTRACE_PROBE1(slifname_already, arl_t *, arl);
		return (EALREADY);
	}

	lifr = (struct lifreq *)mp->b_rptr;

	if (strlen(lifr->lifr_name) >= LIFNAMSIZ) {
		DTRACE_PROBE2(slifname_bad_name, arl_t *, arl,
		    struct lifreq *, lifr);
		return (ENXIO);
	}

	/* Check whether the name is already in use. */
	old_arl = ar_ll_lookup_by_name(as, lifr->lifr_name);
	if (old_arl != NULL) {
		DTRACE_PROBE2(slifname_exists, arl_t *, arl, arl_t *, old_arl);
		return (EEXIST);
	}

	/*
	 * Make a copy of the message so we can send it downstream.  This is
	 * done before any state is changed so that failure leaves the arl
	 * untouched.
	 */
	if ((ioccpy = allocb(sizeof (struct iocblk), BPRI_MED)) == NULL ||
	    (ioccpy->b_cont = copymsg(mp)) == NULL) {
		if (ioccpy != NULL)
			freeb(ioccpy);
		return (ENOMEM);
	}

	(void) strlcpy(arl->arl_name, lifr->lifr_name, sizeof (arl->arl_name));

	/* The ppa is sent down by ifconfig */
	arl->arl_ppa = lifr->lifr_ppa;

	/*
	 * A network device is not considered to be fully plumb'd until
	 * its name has been set using SIOCSLIFNAME.  Once it has
	 * been set, it cannot be set again (see code above), so there
	 * is currently no danger in this function causing two NE_PLUMB
	 * events without an intervening NE_UNPLUMB.
	 */
	info.hne_nic = arl->arl_index;
	info.hne_lif = 0;
	info.hne_event = NE_PLUMB;
	info.hne_data = arl->arl_name;
	info.hne_datalen = strlen(arl->arl_name);
	(void) hook_run(as->as_net_data->netd_hooks, as->as_arpnicevents,
	    (hook_data_t)&info);

	/* Chain in the new arl. */
	rw_enter(&as->as_arl_lock, RW_WRITER);
	arl->arl_next = as->as_arl_head;
	as->as_arl_head = arl;
	rw_exit(&as->as_arl_lock);
	DTRACE_PROBE1(slifname_set, arl_t *, arl);

	/*
	 * Send along a copy of the ioctl; this is just for hitbox.  Use
	 * M_CTL to avoid confusing anyone else who might be listening.
	 */
	DB_TYPE(ioccpy) = M_CTL;
	iocp = (struct iocblk *)ioccpy->b_rptr;
	bzero(iocp, sizeof (*iocp));
	iocp->ioc_cmd = SIOCSLIFNAME;
	iocp->ioc_count = msgsize(ioccpy->b_cont);
	ioccpy->b_wptr = (uchar_t *)(iocp + 1);
	putnext(arl->arl_wq, ioccpy);
	return (0);
}
/*
 * Name the ARP link on this stream from a bare ppa number.
 *
 * q is the queue the request arrived on; mp_orig is an M_IOCTL (payload in
 * b_cont) or M_PROTO message whose data begins with an int ppa.  The name is
 * built from the mi_idname of the last queue below q followed by the ppa.
 *
 * Returns 0 on success, ENOENT for an IP (ill) stream, EINPROGRESS when the
 * message was re-queued because no arl exists yet, EALREADY when the arl is
 * already named, and EBUSY when the generated name is already in use.
 */
static int
ar_set_ppa(queue_t *q, mblk_t *mp_orig)
{
	ar_t		*ar = (ar_t *)q->q_ptr;
	arl_t		*arl = ar->ar_arl;
	arp_stack_t	*as = ar->ar_as;
	mblk_t		*datamp = mp_orig;
	queue_t		*bottomq;
	arl_t		*dup_arl;
	char		*idname;
	int		ppa;

	if (ar->ar_on_ill_stream) {
		/*
		 * This command is for IP, since it is coming down
		 * the <arp-IP-driver> stream.  Return ENOENT so that
		 * it will be sent downstream by the caller
		 */
		return (ENOENT);
	}

	/* We handle both M_IOCTL and M_PROTO messages. */
	if (DB_TYPE(datamp) == M_IOCTL)
		datamp = datamp->b_cont;

	if (q->q_next == NULL || arl == NULL) {
		/*
		 * If the interface was just opened and
		 * the info ack has not yet come back from the driver.
		 */
		DTRACE_PROBE2(setppa_no_arl, queue_t *, q,
		    mblk_t *, mp_orig);
		(void) putq(q, mp_orig);
		return (EINPROGRESS);
	}

	if (arl->arl_name[0] != '\0') {
		DTRACE_PROBE1(setppa_already, arl_t *, arl);
		return (EALREADY);
	}

	/* Walk to the last queue below us; its module name is the prefix. */
	for (bottomq = q->q_next; bottomq->q_next != NULL;
	    bottomq = bottomq->q_next)
		;
	idname = bottomq->q_qinfo->qi_minfo->mi_idname;

	ppa = *(int *)(datamp->b_rptr);
	(void) snprintf(arl->arl_name, sizeof (arl->arl_name), "%s%d",
	    idname, ppa);

	dup_arl = ar_ll_lookup_by_name(as, arl->arl_name);
	if (dup_arl != NULL) {
		DTRACE_PROBE2(setppa_exists, arl_t *, arl, arl_t *, dup_arl);
		/* Make it a null string again */
		arl->arl_name[0] = '\0';
		return (EBUSY);
	}

	arl->arl_ppa = ppa;
	DTRACE_PROBE1(setppa_done, arl_t *, arl);

	/* Chain in the new arl. */
	rw_enter(&as->as_arl_lock, RW_WRITER);
	arl->arl_next = as->as_arl_head;
	as->as_arl_head = arl;
	rw_exit(&as->as_arl_lock);

	return (0);
}
/*
 * Add ARP's contribution to a MIB request travelling through this module.
 *
 * q is the queue the message arrived on and mp_orig is the request (M_IOCTL
 * with the payload in b_cont, or the payload itself).  When the option
 * header that follows the T_optmgmt_ack names MIB2_IP / MIB2_IP_MEDIA, every
 * cache entry is appended to the attached data block and the option length
 * is updated.  The original message is then passed on with putnext().
 *
 * Returns 0 for a NULL message, EINVAL when the media request has no empty
 * data block attached, and EINPROGRESS once the message has been passed on.
 */
static int
ar_snmp_msg(queue_t *q, mblk_t *mp_orig)
{
	arp_stack_t	*as = ((ar_t *)q->q_ptr)->ar_as;
	mblk_t		*ctlmp = mp_orig;
	mblk_t		*mpdata;
	struct opthdr	*optp;
	msg2_args_t	walk_args;

	if (ctlmp == NULL)
		return (0);

	/*
	 * ar_cmd_dispatch() already checked for us that "mp->b_cont" is valid
	 * in case of an M_IOCTL message.
	 */
	if (DB_TYPE(ctlmp) == M_IOCTL)
		ctlmp = ctlmp->b_cont;

	optp = (struct opthdr *)(&ctlmp->b_rptr[sizeof (struct T_optmgmt_ack)]);
	if (optp->level == MIB2_IP && optp->name == MIB2_IP_MEDIA) {
		/*
		 * Put our ARP cache entries in the ipNetToMediaTable mp from
		 * IP.  Due to a historical side effect of IP's MIB code, it
		 * always passes us a b_cont, but the b_cont should be empty.
		 */
		mpdata = ctlmp->b_cont;
		if (mpdata == NULL || MBLKL(mpdata) != 0)
			return (EINVAL);

		walk_args.m2a_mpdata = mpdata;
		walk_args.m2a_mptail = NULL;
		ar_ce_walk(as, ar_snmp_msg2, &walk_args);
		optp->len = msgdsize(mpdata);
	}

	putnext(q, mp_orig);
	return (EINPROGRESS);	/* so that rput() exits doing nothing... */
}
/*
 * ar_ce_walk() callback that turns one ARP cache entry into an
 * ipNetToMediaTable row and appends it to the MIB reply under construction.
 *
 * ace is the cache entry; arg is the msg2_args_t holding the reply data
 * block (m2a_mpdata) and the running tail pointer (m2a_mptail).
 */
static void
ar_snmp_msg2(ace_t *ace, void *arg)
{
	const char *name = "unknown";
	mib2_ipNetToMediaEntry_t ntme;
	msg2_args_t *m2ap = arg;

	ASSERT(ace != NULL && ace->ace_arl != NULL);
	if (ace->ace_arl != NULL)
		name = ace->ace_arl->arl_name;

	/*
	 * The whole structure is appended to the reply below, but the code
	 * that follows only writes the used prefix of each octet string and
	 * at most sizeof (uint32_t) bytes of the network address.  Zero the
	 * entry first so that no uninitialized stack contents are copied out
	 * with it.
	 */
	bzero(&ntme, sizeof (ntme));

	/*
	 * Fill in ntme using the information in the ACE.
	 */
	ntme.ipNetToMediaType = (ace->ace_flags & ACE_F_PERMANENT) ? 4 : 3;
	ntme.ipNetToMediaIfIndex.o_length = MIN(OCTET_LENGTH, strlen(name));
	bcopy(name, ntme.ipNetToMediaIfIndex.o_bytes,
	    ntme.ipNetToMediaIfIndex.o_length);

	bcopy(ace->ace_proto_addr, &ntme.ipNetToMediaNetAddress,
	    MIN(sizeof (uint32_t), ace->ace_proto_addr_length));

	ntme.ipNetToMediaInfo.ntm_mask.o_length =
	    MIN(OCTET_LENGTH, ace->ace_proto_addr_length);
	bcopy(ace->ace_proto_mask, ntme.ipNetToMediaInfo.ntm_mask.o_bytes,
	    ntme.ipNetToMediaInfo.ntm_mask.o_length);
	ntme.ipNetToMediaInfo.ntm_flags = ace->ace_flags;

	/* An unresolved entry reports a zero-length physical address. */
	ntme.ipNetToMediaPhysAddress.o_length =
	    MIN(OCTET_LENGTH, ace->ace_hw_addr_length);
	if ((ace->ace_flags & ACE_F_RESOLVED) == 0)
		ntme.ipNetToMediaPhysAddress.o_length = 0;
	bcopy(ace->ace_hw_addr, ntme.ipNetToMediaPhysAddress.o_bytes,
	    ntme.ipNetToMediaPhysAddress.o_length);

	/*
	 * All entries within the ARP cache are unique, and there are no
	 * preexisting entries in the ipNetToMediaTable mp, so just add 'em.
	 */
	(void) snmp_append_data2(m2ap->m2a_mpdata, &m2ap->m2a_mptail,
	    (char *)&ntme, sizeof (ntme));
}
/*
* Write side put procedure.
*
* q is the write queue and mp the message being sent down.  M_IOCTL and
* M_PROTO/M_PCPROTO messages are offered to ar_cmd_dispatch(); M_FLUSH is
* handled locally and never forwarded; M_IOCDATA and M_CTL are passed
* downstream untouched; any other message type is freed.
*/
static void
ar_wput(queue_t *q, mblk_t *mp)
{
int err;
struct iocblk *ioc;
mblk_t *mp1;
TRACE_1(TR_FAC_ARP, TR_ARP_WPUT_START,
"arp_wput_start: q %p", q);
/*
* Here we handle ARP commands coming from controlling processes
* either in the form of M_IOCTL messages, or M_PROTO messages.
*/
switch (DB_TYPE(mp)) {
case M_IOCTL:
switch (err = ar_cmd_dispatch(q, mp, B_TRUE)) {
case ENOENT:
/*
* If it is an I_PLINK, process it. Otherwise
* we don't recognize it, so pass it down.
* Since ARP is a module there is always someone
* below.
*/
ASSERT(q->q_next != NULL);
ioc = (struct iocblk *)mp->b_rptr;
/* Note that I_PUNLINK is kept here as well as I_PLINK. */
if ((ioc->ioc_cmd != I_PLINK) &&
(ioc->ioc_cmd != I_PUNLINK)) {
putnext(q, mp);
TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
"arp_wput_end: q %p (%S)",
q, "ioctl/enoent");
return;
}
err = ar_plink_send(q, mp);
if (err == 0) {
return;
}
/*
* ar_plink_send() failed: empty the first data block so the
* reply below carries no payload from it.
*/
if ((mp1 = mp->b_cont) != 0)
mp1->b_wptr = mp1->b_rptr;
break;
case EINPROGRESS:
/*
* If the request resulted in an attempt to resolve
* an address, we return out here. The IOCTL will
* be completed in ar_rput if something comes back,
* or as a result of the timer expiring.
*/
TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
"arp_wput_end: q %p (%S)", q, "inprog");
return;
default:
/* Assume success; switched to M_IOCNAK below if an error is set. */
DB_TYPE(mp) = M_IOCACK;
break;
}
/*
* Complete the ioctl.  A non-zero err overrides ioc_error; an
* ioc_error that was already non-zero also produces a NAK.
*/
ioc = (struct iocblk *)mp->b_rptr;
if (err != 0)
ioc->ioc_error = err;
if (ioc->ioc_error != 0) {
/*
* Don't free b_cont as IP/IB needs
* it to identify the request.
*/
DB_TYPE(mp) = M_IOCNAK;
}
ioc->ioc_count = msgdsize(mp->b_cont);
qreply(q, mp);
TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
"arp_wput_end: q %p (%S)", q, "ioctl");
return;
case M_FLUSH:
if (*mp->b_rptr & FLUSHW)
flushq(q, FLUSHDATA);
if (*mp->b_rptr & FLUSHR) {
/* Flush the read side, clear FLUSHW and turn the message around. */
flushq(RD(q), FLUSHDATA);
*mp->b_rptr &= ~FLUSHW;
qreply(q, mp);
TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
"arp_wput_end: q %p (%S)", q, "flush");
return;
}
/*
* The normal behavior of a STREAMS module should be
* to pass down M_FLUSH messages. However there is a
* complex sequence of events during plumb/unplumb that
* can cause DLPI messages in the driver's queue to be
* flushed. So we don't send down M_FLUSH. This has been
* reported for some drivers (Eg. le) that send up an M_FLUSH
* in response to unbind request which will eventually be
* looped back at the mux head and sent down. Since IP
* does not queue messages in a module instance queue
* of IP, nothing is lost by not sending down the flush.
*/
freemsg(mp);
return;
case M_PROTO:
case M_PCPROTO:
/*
* Commands in the form of PROTO messages are handled very
* much the same as IOCTLs, but no response is returned.
*/
switch (err = ar_cmd_dispatch(q, mp, B_TRUE)) {
case ENOENT:
/* Not an ARP command: forward it if anyone is below us. */
if (q->q_next) {
putnext(q, mp);
TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
"arp_wput_end: q %p (%S)", q,
"proto/enoent");
return;
}
break;
case EINPROGRESS:
TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
"arp_wput_end: q %p (%S)", q, "proto/einprog");
return;
default:
break;
}
/* Falls out to the freemsg() at the bottom of the function. */
break;
case M_IOCDATA:
/*
* We pass M_IOCDATA downstream because it could be as a
* result of a previous M_COPYIN/M_COPYOUT message sent
* upstream.
*/
/* FALLTHRU */
case M_CTL:
/*
* We also send any M_CTL downstream as it could
* contain control information for a module downstream.
*/
putnext(q, mp);
return;
default:
break;
}
/* Free any message we don't understand */
freemsg(mp);
TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
"arp_wput_end: q %p (%S)", q, "end");
}
/*
 * Notify the client (AR_CN_READY) that the address held by ace is usable.
 *
 * The notification carries an ARP request header followed by the entry's
 * hardware and protocol addresses, written twice (sender and target).
 *
 * Returns B_TRUE once the notification has been handed to
 * ar_client_notify().  On allocation failure the entry's transmit count is
 * set to 1, its timer is restarted and B_FALSE is returned.
 */
static boolean_t
arp_say_ready(ace_t *ace)
{
	arl_t		*arl = ace->ace_arl;
	arlphy_t	*ap = ace->ace_xmit_arl->arl_phy;
	mblk_t		*mp;
	arh_t		*arh;
	uchar_t		*cp;
	int		pass;

	mp = allocb(sizeof (*arh) + 2 * (ace->ace_hw_addr_length +
	    ace->ace_proto_addr_length), BPRI_MED);
	if (mp == NULL) {
		/* skip a beat on allocation trouble */
		ace->ace_xmit_count = 1;
		ace_set_timer(ace, B_FALSE);
		return (B_FALSE);
	}

	/* Tell IP address is now usable */
	arh = (arh_t *)mp->b_rptr;
	U16_TO_BE16(ap->ap_arp_hw_type, arh->arh_hardware);
	U16_TO_BE16(ace->ace_proto, arh->arh_proto);
	arh->arh_hlen = ace->ace_hw_addr_length;
	arh->arh_plen = ace->ace_proto_addr_length;
	U16_TO_BE16(ARP_REQUEST, arh->arh_operation);

	/* Sender and target are both our own hardware/protocol pair. */
	cp = (uchar_t *)(arh + 1);
	for (pass = 0; pass < 2; pass++) {
		bcopy(ace->ace_hw_addr, cp, ace->ace_hw_addr_length);
		cp += ace->ace_hw_addr_length;
		bcopy(ace->ace_proto_addr, cp, ace->ace_proto_addr_length);
		cp += ace->ace_proto_addr_length;
	}
	mp->b_wptr = cp;

	ar_client_notify(arl, mp, AR_CN_READY);
	DTRACE_PROBE1(ready, ace_t *, ace);
	return (B_TRUE);
}
/*
 * ar_ce_walk() callback that collects the longest-waiting aces for defense.
 *
 * ace is the entry being visited; arg is the ace_resched_t holding the arl
 * of interest and the candidate list.  While the list has room the entry is
 * simply appended.  Once it is full, the entry is swapped with any listed
 * entry whose ace_last_bcast is later, carrying the displaced entry on
 * through the rest of the list.
 */
static void
ace_reschedule(ace_t *ace, void *arg)
{
	ace_resched_t	*art = arg;
	ace_t		*displaced;
	int		slot;

	if (ace->ace_xmit_arl != art->art_arl)
		return;

	/*
	 * Only published entries that are ready for announcement are eligible.
	 */
	if ((ace->ace_flags & (ACE_F_PUBLISH | ACE_F_UNVERIFIED | ACE_F_DYING |
	    ACE_F_DELAYED)) != ACE_F_PUBLISH)
		return;

	/* Still room: take the entry as-is. */
	if (art->art_naces < ACE_RESCHED_LIST_LEN) {
		art->art_aces[art->art_naces++] = ace;
		return;
	}

	/* List is full: keep whichever entries broadcast least recently. */
	for (slot = 0; slot < ACE_RESCHED_LIST_LEN; slot++) {
		if (art->art_aces[slot]->ace_last_bcast >
		    ace->ace_last_bcast) {
			displaced = art->art_aces[slot];
			art->art_aces[slot] = ace;
			ace = displaced;
		}
	}
}
/*
 * Reschedule the ARP defense of any long-waiting ACEs on arl.  It's assumed
 * that this doesn't happen very often (if at all), and thus it needn't be
 * highly optimized.  (Note, though, that it's actually O(N) complexity,
 * because the outer loop is bounded by a constant rather than by the length
 * of the list.)
 *
 * The link's defense counter is reset first.  If the previous count never
 * reached the configured defend rate, nothing can be waiting and the routine
 * returns.  Otherwise batches of candidates are gathered with
 * ace_reschedule(), marked ACE_F_DELAYED and have their timers restarted,
 * until the defend rate is reached or a batch comes back short.
 */
static void
arl_reschedule(arl_t *arl)
{
	arlphy_t	*ap = arl->arl_phy;
	arp_stack_t	*as = ARL_TO_ARPSTACK(arl);
	ace_resched_t	art;
	ace_t		*ace;
	int		prev_count;
	int		idx;

	prev_count = ap->ap_defend_count;
	ap->ap_defend_count = 0;

	/* If none could be sitting around, then don't reschedule */
	if (prev_count < as->as_defend_rate) {
		DTRACE_PROBE1(reschedule_none, arl_t *, arl);
		return;
	}

	art.art_arl = arl;
	while (ap->ap_defend_count < as->as_defend_rate) {
		art.art_naces = 0;
		ar_ce_walk(as, ace_reschedule, &art);

		for (idx = 0; idx < art.art_naces; idx++) {
			ace = art.art_aces[idx];
			ace->ace_flags |= ACE_F_DELAYED;
			ace_set_timer(ace, B_FALSE);
			ap->ap_defend_count++;
			if (ap->ap_defend_count >= as->as_defend_rate)
				break;
		}

		/* A short batch means there are no more candidates. */
		if (art.art_naces < ACE_RESCHED_LIST_LEN)
			break;
	}
	DTRACE_PROBE1(reschedule, arl_t *, arl);
}
/*
* Write side service routine. The only action here is delivery of transmit
* timer events and delayed messages while waiting for the info_ack (ar_arl
* not yet set).
*
* Each queued message is taken in turn.  M_PCSIG messages are timer events
* whose data is the ace_t they belong to; anything else is handed back to
* the queue's put procedure with put().
*/
static void
ar_wsrv(queue_t *q)
{
ace_t *ace;
arlphy_t *ap;
mblk_t *mp;
clock_t ms;
arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
TRACE_1(TR_FAC_ARP, TR_ARP_WSRV_START,
"arp_wsrv_start: q %p", q);
while ((mp = getq(q)) != NULL) {
switch (DB_TYPE(mp)) {
case M_PCSIG:
/* Skip timer events that mi_timer_valid() rejects. */
if (!mi_timer_valid(mp))
continue;
ace = (ace_t *)mp->b_rptr;
/* Entries marked ACE_F_DYING get no further timer processing. */
if (ace->ace_flags & ACE_F_DYING)
continue;
ap = ace->ace_xmit_arl->arl_phy;
/*
* Stage 1: an unverified published entry is still probing for
* duplicates.
*/
if (ace->ace_flags & ACE_F_UNVERIFIED) {
ASSERT(ace->ace_flags & ACE_F_PUBLISH);
ASSERT(ace->ace_query_mp == NULL);
/*
* If the link is down, give up for now. IP
* will give us the go-ahead to try again when
* the link restarts.
*/
if (ap->ap_link_down) {
DTRACE_PROBE1(timer_link_down,
ace_t *, ace);
ace->ace_flags |= ACE_F_DAD_ABORTED;
continue;
}
/* Probes remaining: send one (no sender protocol address). */
if (ace->ace_xmit_count > 0) {
DTRACE_PROBE1(timer_probe,
ace_t *, ace);
ace->ace_xmit_count--;
ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
ace->ace_proto,
ace->ace_proto_addr_length,
ace->ace_hw_addr, NULL, NULL,
ace->ace_proto_addr, NULL, as);
ace_set_timer(ace, B_FALSE);
continue;
}
/*
* Probing is finished.  arp_say_ready() restarts the timer
* itself when it cannot allocate, so just move on.
*/
if (!arp_say_ready(ace))
continue;
DTRACE_PROBE1(timer_ready, ace_t *, ace);
/* Switch to announcing; at least one announcement is sent. */
ace->ace_xmit_interval =
as->as_publish_interval;
ace->ace_xmit_count = as->as_publish_count;
if (ace->ace_xmit_count == 0)
ace->ace_xmit_count++;
ace->ace_flags &= ~ACE_F_UNVERIFIED;
}
/* Stage 2: announcement and ongoing defense of published entries. */
if (ace->ace_flags & ACE_F_PUBLISH) {
clock_t now;
/*
* If an hour has passed, then free up the
* entries that need defense by rescheduling
* them.
*/
/* The period compared against is as_defend_period seconds. */
now = ddi_get_lbolt();
if (as->as_defend_rate > 0 &&
now - ap->ap_defend_start >
SEC_TO_TICK(as->as_defend_period)) {
ap->ap_defend_start = now;
arl_reschedule(ace->ace_xmit_arl);
}
/*
* Finish the job that we started in
* ar_entry_add. When we get to zero
* announcement retransmits left, switch to
* address defense.
*/
ASSERT(ace->ace_query_mp == NULL);
if (ace->ace_xmit_count > 0) {
ace->ace_xmit_count--;
DTRACE_PROBE1(timer_announce,
ace_t *, ace);
} else if (ace->ace_flags & ACE_F_DELAYED) {
/*
* This guy was rescheduled as one of
* the really old entries needing
* on-going defense. Let him through
* now.
*/
DTRACE_PROBE1(timer_send_delayed,
ace_t *, ace);
ace->ace_flags &= ~ACE_F_DELAYED;
} else if (as->as_defend_rate > 0 &&
(ap->ap_defend_count >=
as->as_defend_rate ||
++ap->ap_defend_count >=
as->as_defend_rate)) {
/*
* If we're no longer allowed to send
* unbidden defense messages, then just
* wait for rescheduling.
*/
/*
* The counter is only incremented while it is below the
* rate; the defense that brings it up to the rate is
* itself suppressed.
*/
DTRACE_PROBE1(timer_excess_defense,
ace_t *, ace);
ace_set_timer(ace, B_FALSE);
continue;
} else {
DTRACE_PROBE1(timer_defend,
ace_t *, ace);
}
/*
* Send the announcement/defense: our own address as both
* sender and target protocol address, with the link's
* ap_arp_addr in the target hardware field.
*/
ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
ace->ace_proto,
ace->ace_proto_addr_length,
ace->ace_hw_addr,
ace->ace_proto_addr,
ace->ace_xmit_arl->arl_phy->ap_arp_addr,
ace->ace_proto_addr, NULL, as);
ace->ace_last_bcast = now;
/* Announcements done: use the defense interval from here on. */
if (ace->ace_xmit_count == 0)
ace->ace_xmit_interval =
as->as_defend_interval;
/* A zero interval means the timer is not restarted. */
if (ace->ace_xmit_interval != 0)
ace_set_timer(ace, B_FALSE);
continue;
}
/*
* If this is a non-permanent (regular) resolved ARP
* entry, then it's now time to check if it can be
* retired. As an optimization, we check with IP
* first, and just restart the timer if the address is
* still in use.
*/
if (ACE_NONPERM(ace)) {
if (ace->ace_proto == IP_ARP_PROTO_TYPE &&
ndp_lookup_ipaddr(*(ipaddr_t *)
ace->ace_proto_addr, as->as_netstack)) {
ace->ace_flags |= ACE_F_OLD;
mi_timer(ace->ace_arl->arl_wq,
ace->ace_mp,
as->as_cleanup_interval);
} else {
ar_delete_notify(ace);
ar_ce_delete(ace);
}
continue;
}
/*
* ar_query_xmit returns the number of milliseconds to
* wait following this transmit. If the number of
* allowed transmissions has been exhausted, it will
* return zero without transmitting. If that happens
* we complete the operation with a failure indication.
* Otherwise, we restart the timer.
*/
ms = ar_query_xmit(as, ace);
if (ms == 0)
ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);
else
mi_timer(q, mp, ms);
continue;
default:
/* Not a timer event: run it through the put procedure again. */
put(q, mp);
continue;
}
}
TRACE_1(TR_FAC_ARP, TR_ARP_WSRV_END,
"arp_wsrv_end: q %p", q);
}
/*
 * ar_xmit is called to transmit an ARP Request or Response.
 *
 * arl		link to transmit on
 * operation	ARP operation code placed in the header
 * proto	protocol type placed in the header
 * plen		protocol address length
 * haddr1	sender hardware address
 * paddr1	sender protocol address, or NULL to send zeros
 * haddr2	target hardware address, or NULL to send zeros
 * paddr2	target protocol address
 * dstaddr	link-layer destination, or NULL to choose one (see below)
 * as		ARP stack instance, used for the outbound hook
 *
 * Nothing is sent when the arl has no physical information, when
 * ARL_F_NOARP is set, when the link is down, when allocation fails, when
 * the outbound hook consumes the message, or when the write queue below is
 * flow-controlled.
 */
static void
ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, uint32_t plen,
    const uchar_t *haddr1, const uchar_t *paddr1, const uchar_t *haddr2,
    const uchar_t *paddr2, const uchar_t *dstaddr, arp_stack_t *as)
{
	arlphy_t	*ap = arl->arl_phy;
	arh_t		*arh;
	mblk_t		*mp;
	uint8_t		*wp;
	uint_t		hlen;

	ASSERT(!(arl->arl_flags & ARL_F_IPMP));

	if (ap == NULL) {
		DTRACE_PROBE1(xmit_no_arl_phy, arl_t *, arl);
		return;
	}

	/* IFF_NOARP flag is set or link down: do not send arp messages */
	if ((arl->arl_flags & ARL_F_NOARP) || ap->ap_link_down)
		return;

	hlen = ap->ap_hw_addrlen;

	/* Start from a copy of the link's transmit template. */
	mp = copyb(ap->ap_xmit_mp);
	if (mp == NULL)
		return;

	mp->b_cont = allocb(AR_LL_HDR_SLACK + ARH_FIXED_LEN + (hlen * 4) +
	    plen + plen, BPRI_MED);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return;
	}

	/*
	 * Get the L2 destination address for the message: the link's
	 * ap_arp_addr when there is no target hardware address, otherwise
	 * the caller's choice, defaulting to the target hardware address.
	 */
	dstaddr = (haddr2 == NULL) ? ap->ap_arp_addr :
	    ((dstaddr != NULL) ? dstaddr : haddr2);

	/*
	 * Figure out where the target hardware address goes in the
	 * DL_UNITDATA_REQ header, and copy it in.
	 */
	wp = mi_offset_param(mp, ap->ap_xmit_addroff, hlen);
	ASSERT(wp != NULL);
	if (wp == NULL) {
		freemsg(mp);
		return;
	}
	bcopy(dstaddr, wp, hlen);

	/* Fill in the ARP header, leaving slack in front of it. */
	mp->b_cont->b_rptr += (AR_LL_HDR_SLACK + hlen + hlen);
	arh = (arh_t *)mp->b_cont->b_rptr;
	U16_TO_BE16(ap->ap_arp_hw_type, arh->arh_hardware);
	U16_TO_BE16(proto, arh->arh_proto);
	arh->arh_hlen = (uint8_t)hlen;
	arh->arh_plen = (uint8_t)plen;
	U16_TO_BE16(operation, arh->arh_operation);

	/* Sender hardware address. */
	wp = (uint8_t *)arh + ARH_FIXED_LEN;
	bcopy(haddr1, wp, hlen);
	wp += hlen;

	/* Sender protocol address (zeros when absent). */
	if (paddr1 != NULL)
		bcopy(paddr1, wp, plen);
	else
		bzero(wp, plen);
	wp += plen;

	/* Target hardware address (zeros when absent). */
	if (haddr2 != NULL)
		bcopy(haddr2, wp, hlen);
	else
		bzero(wp, hlen);
	wp += hlen;

	/* Target protocol address. */
	bcopy(paddr2, wp, plen);
	wp += plen;
	mp->b_cont->b_wptr = wp;

	DTRACE_PROBE3(arp__physical__out__start,
	    arl_t *, arl, arh_t *, arh, mblk_t *, mp);

	ARP_HOOK_OUT(as->as_arp_physical_out_event, as->as_arp_physical_out,
	    arl->arl_index, arh, mp, mp->b_cont, as);

	DTRACE_PROBE1(arp__physical__out__end, mblk_t *, mp);

	/* The hook may have taken the message away. */
	if (mp == NULL)
		return;

	/* Ship it out, unless the queue below cannot take it. */
	if (!canputnext(arl->arl_wq)) {
		freemsg(mp);
		return;
	}
	putnext(arl->arl_wq, mp);
}
/*
 * Allocate a two-block M_CTL message for an ARP command.
 *
 * cmd must be AR_DLPIOP_DONE; any other value yields NULL.  err is the
 * error code to carry.  The first block holds an arc_t whose arc_cmd is set
 * to cmd; the second block, linked after it, holds err as an int.
 *
 * Returns the message, or NULL if cmd is not accepted or either allocation
 * fails.
 *
 * NOTE(review): only arc_cmd is initialized; the remaining bytes of the
 * arc_t are left as returned by allocb().
 */
static mblk_t *
ar_alloc(uint32_t cmd, int err)
{
	mblk_t	*mp;
	mblk_t	*mp1;
	arc_t	*arc;

	/* For now only one type of command is accepted */
	if (cmd != AR_DLPIOP_DONE)
		return (NULL);

	mp = allocb(sizeof (arc_t), BPRI_HI);
	if (mp == NULL)
		return (NULL);

	DB_TYPE(mp) = M_CTL;
	arc = (arc_t *)mp->b_rptr;
	arc->arc_cmd = cmd;
	mp->b_wptr = mp->b_rptr + sizeof (arc_t);

	mp1 = allocb(sizeof (int), BPRI_HI);
	if (mp1 == NULL) {
		freeb(mp);
		return (NULL);
	}

	/* Initialize the error code */
	*((int *)mp1->b_rptr) = err;
	/*
	 * The write pointer must be derived from this block's own read
	 * pointer.  It was previously computed from mp->b_rptr, which left
	 * mp1->b_wptr pointing into the first block's buffer.
	 */
	mp1->b_wptr = mp1->b_rptr + sizeof (int);

	linkb(mp, mp1);
	return (mp);
}
/*
 * Module-wide initialization: register ARP's per-stack callbacks.
 */
void
arp_ddi_init(void)
{
	/*
	 * Registering with the netstack framework lets us hear about every
	 * stack the kernel creates or destroys, which is how the set of
	 * arp_stack_t's is kept up to date.
	 */
	netstack_register(NS_ARP,
	    arp_stack_init,
	    arp_stack_shutdown,
	    arp_stack_fini);
}
/*
 * Module-wide teardown: withdraw the registration made by arp_ddi_init().
 */
void
arp_ddi_destroy(void)
{
	netstack_unregister(NS_ARP);
}
/*
 * Initialize the ARP stack instance.
 *
 * Allocates a zeroed arp_stack_t for netstack ns, gives it a private copy of
 * the tunable parameter table, registers those parameters, seeds the index
 * counter, and sets up the arl list lock, net and hook state.  Returns the
 * new instance.
 */
/* ARGSUSED */
static void *
arp_stack_init(netstackid_t stackid, netstack_t *ns)
{
	arp_stack_t	*as;
	arpparam_t	*pa;

	as = kmem_zalloc(sizeof (*as), KM_SLEEP);
	as->as_netstack = ns;

	/* Each stack tunes its own copy of the parameter array. */
	pa = kmem_alloc(sizeof (arp_param_arr), KM_SLEEP);
	as->as_param_arr = pa;
	bcopy(arp_param_arr, pa, sizeof (arp_param_arr));
	(void) ar_param_register(&as->as_nd, pa, A_CNT(arp_param_arr));

	as->as_arp_index_counter = 1;
	as->as_arp_counter_wrapped = 0;

	rw_init(&as->as_arl_lock, NULL, RW_DRIVER, NULL);
	arp_net_init(as, stackid);
	arp_hook_init(as);

	return (as);
}
/*
 * Shutdown callback for a stack instance: arg is the arp_stack_t returned
 * by arp_stack_init().  Only the net state is shut down here.
 */
/* ARGSUSED */
static void
arp_stack_shutdown(netstackid_t stackid, void *arg)
{
	arp_net_shutdown((arp_stack_t *)arg);
}
/*
 * Free the ARP stack instance.
 *
 * arg is the arp_stack_t returned by arp_stack_init().  Hook and net state
 * are destroyed, then the arl lock, the registered parameters, the private
 * parameter array and finally the instance itself.
 */
/* ARGSUSED */
static void
arp_stack_fini(netstackid_t stackid, void *arg)
{
	arp_stack_t	*as = arg;

	arp_hook_destroy(as);
	arp_net_destroy(as);
	rw_destroy(&as->as_arl_lock);

	nd_free(&as->as_nd);

	kmem_free(as->as_param_arr, sizeof (arp_param_arr));
	as->as_param_arr = NULL;

	kmem_free(as, sizeof (*as));
}