xref/libmicro/cachetocache.c

0N/A/*
0N/A * CDDL HEADER START
0N/A *
0N/A * The contents of this file are subject to the terms
0N/A * of the Common Development and Distribution License
0N/A * (the "License").  You may not use this file except
0N/A * in compliance with the License.
0N/A *
0N/A * You can obtain a copy of the license at
0N/A * src/OPENSOLARIS.LICENSE
0N/A * or http://www.opensolaris.org/os/licensing.
0N/A * See the License for the specific language governing
0N/A * permissions and limitations under the License.
0N/A *
0N/A * When distributing Covered Code, include this CDDL
0N/A * HEADER in each file and include the License file at
0N/A * usr/src/OPENSOLARIS.LICENSE.  If applicable,
0N/A * add the following below this CDDL HEADER, with the
0N/A * fields enclosed by brackets "[]" replaced with your
0N/A * own identifying information: Portions Copyright [yyyy]
0N/A * [name of copyright owner]
0N/A *
0N/A * CDDL HEADER END
0N/A */
0N/A
0N/A/*
9N/A * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
0N/A * Use is subject to license terms.
0N/A */
0N/A
/*
 * Routine to benchmark cache-to-cache transfer times.  It uses
 * Solaris features to find and bind to the CPUs in the current
 * processor set, so it is not likely to work elsewhere.
 */
0N/A
0N/A
0N/A#include <unistd.h>
0N/A#include <stdlib.h>
0N/A#include <stdio.h>
0N/A#include <fcntl.h>
0N/A#include <string.h>
0N/A#include <sys/processor.h>
0N/A#include <sys/types.h>
0N/A#include <stdio.h>
0N/A#include <errno.h>
0N/A#include <sys/pset.h>
0N/A
0N/A#include "libmicro.h"
0N/A
/* size of the access area in bytes; default 512 KB, changed with -s */
static long opts = 512 * 1024;
0N/A
/*
 * Per-thread benchmark state.
 */
typedef struct tsd {
    /* access area of opts bytes, linked into a chain of pointers */
    long **ts_data;

    /* running sum of the values returned by traverse_ptrchain() */
    long ts_result;

    /* held by whichever thread is currently walking ts_data */
    pthread_mutex_t ts_lock;
} tsd_t;
0N/A
0N/Astatic unsigned int ncpu = 1024;
0N/A
0N/Astatic tsd_t *thread_data[1024];
0N/Astatic processorid_t cpus[1024];
0N/A
0N/Aint traverse_ptrchain(long **, int, int);
0N/A
0N/Aint
0N/Abenchmark_init()
0N/A{
0N/A    lm_tsdsize = sizeof (tsd_t);
0N/A
0N/A    (void) sprintf(lm_optstr, "s:");
0N/A
0N/A    (void) sprintf(lm_usage,
0N/A        "       [-s size] size of access area in bytes"
0N/A        " (default %ld)\n"
0N/A        "notes: measures cache to cache transfer times on Solaris\n",
0N/A        opts);
0N/A
0N/A    (void) sprintf(lm_header, "%8s", "size");
0N/A
0N/A    return (0);
0N/A}
0N/A
0N/Aint
0N/Abenchmark_optswitch(int opt, char *optarg)
0N/A{
0N/A    switch (opt) {
0N/A    case 's':
0N/A        opts = sizetoint(optarg);
0N/A        break;
0N/A    default:
0N/A        return (-1);
0N/A    }
0N/A
0N/A    return (0);
0N/A}
0N/A
0N/Aint
0N/Abenchmark_initrun()
0N/A{
0N/A    if (pset_info(PS_MYID, NULL, &ncpu, cpus) < 0) {
0N/A        perror("pset_info");
0N/A        return (1);
0N/A    }
0N/A
0N/A    return (0);
0N/A}
0N/A
0N/Aint
0N/Abenchmark_initworker(void *tsd)
0N/A{
0N/A    tsd_t           *ts = (tsd_t *)tsd;
0N/A    int i, j;
0N/A    processorid_t cpu;
0N/A
0N/A    ts->ts_data = malloc(opts);
0N/A
0N/A    if (ts->ts_data == NULL) {
0N/A        return (1);
0N/A    }
0N/A
0N/A    (void) pthread_mutex_init(&ts->ts_lock, NULL);
0N/A
0N/A
0N/A    if (processor_bind(P_LWPID, P_MYID,
0N/A        cpu = cpus[(pthread_self() - 1) % ncpu],
0N/A        NULL) < 0) {
0N/A        perror("processor_bind:");
0N/A        return (1);
0N/A    }
0N/A
0N/A    (void) printf("# thread %d using processor %d\n", pthread_self(), cpu);
0N/A
0N/A    /*
0N/A     * use lmbench style backwards stride
0N/A     */
0N/A
0N/A    for (i = 0; i < opts / sizeof (long); i++) {
0N/A        j = i - 128;
0N/A        if (j < 0)
0N/A            j = j + opts / sizeof (long);
0N/A        ts->ts_data[i] = (long *)&(ts->ts_data[j]);
0N/A    }
0N/A
0N/A    thread_data[pthread_self() - 1] = ts;
0N/A
0N/A    return (0);
0N/A}
0N/A
/*
 * Here we go through the threads in order, which causes inherent
 * serialization.  This is normally not a good idea, but in this case
 * we're trying to measure cache-to-cache transfer times, and if we ran
 * the threads in parallel we would likely see saturation effects rather
 * than cache-to-cache transfers, especially on platforms with weak
 * memory systems such as the P4.
 */
0N/A
0N/A
0N/A/*ARGSUSED*/
0N/Aint
0N/Abenchmark(void *tsd, result_t *res)
0N/A{
0N/A    tsd_t           *ts;
0N/A    int         i, j;
0N/A    int             count = opts / 128 / sizeof (long);
0N/A
0N/A    for (j = 0; j < lm_optB; j++)
0N/A        for (i = 0; i < lm_optT; i++) {
0N/A            ts = thread_data[i];
0N/A            (void) pthread_mutex_lock(&ts->ts_lock);
0N/A            ts->ts_result += traverse_ptrchain(
0N/A                (long **)ts->ts_data, count, 0);
0N/A            (void) pthread_mutex_unlock(&ts->ts_lock);
0N/A        }
0N/A
0N/A    res->re_count = lm_optB * lm_optT * count;
0N/A
0N/A    return (0);
0N/A}
0N/A
/*
 * Walk the pointer chain that starts at ptr.
 *
 * ptr    first slot of the chain; each slot holds the address of the
 *        next slot to visit
 * count  number of links to follow; the walk is done in passes of ten
 *        links, so it is effectively rounded up to a multiple of ten
 *        (no pass is made when count <= 0)
 * value  number of longs each visited slot's contents are advanced by
 *        before being followed; with 0 the slot is rewritten with the
 *        pointer it already holds
 *
 * Each pass also rewrites the slot it ends on.  The return value is the
 * contents of the last slot reached, cast to int; it carries no meaning
 * beyond giving the walk a result.
 */
int
traverse_ptrchain(long **ptr, int count, int value)
{
    int left;

    /* unrolled by hand: ten store-then-follow steps per pass */
    for (left = count; left > 0; left -= 10) {
        *ptr += value;
        ptr = (long **)*ptr;
        *ptr += value;
        ptr = (long **)*ptr;
        *ptr += value;
        ptr = (long **)*ptr;
        *ptr += value;
        ptr = (long **)*ptr;
        *ptr += value;
        ptr = (long **)*ptr;
        *ptr += value;
        ptr = (long **)*ptr;
        *ptr += value;
        ptr = (long **)*ptr;
        *ptr += value;
        ptr = (long **)*ptr;
        *ptr += value;
        ptr = (long **)*ptr;
        *ptr += value;
        ptr = (long **)*ptr;
        /* final store to the slot this pass ended on */
        *ptr += value;
    }

    return ((int)*ptr); /* bogus return */
}
0N/A
0N/A
0N/Achar *
0N/Abenchmark_result()
0N/A{
0N/A    static char  result[256];
0N/A
0N/A    (void) sprintf(result, "%8ld ", opts);
0N/A
0N/A
0N/A    return (result);
0N/A}