Beowulf vs Dual

Mark Hahn hahn at coffee.psychology.mcmaster.ca
Sat Apr 21 00:15:18 PDT 2001


> There is a significant difference for many applications.  A dual has
> no "latency" in communication, while a beowulf pair would have to
> deal with typical ethernet latency.

a dual very definitely has measurable, even *significant* latency.
I just hacked up the attached program, which does a very simple-minded
ping-pong latency test between two CPUs.  on my (admittedly cheesy)
dual celeron/366, it takes around 300 clock ticks (.82 us) for one
thread to notice that the other has changed a flag, and respond.
(that's a ping-pong, so latency is half that.)  sure, this is a hack, 
and my code sucks, but that's still nontrivial.

note that there are cluster interconnects that claim latencies 
in the 2-4 us range.  a simple ping-pong with small UDP packets
over cheap-o 100bT shows around 120 us latency.

anyway, it's a mistake to think of IPC on SMP as being so low-latency
that you can ignore it.

regards, mark hahn
-------------- next part --------------
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
#include <myio.H>

// 64-bit unsigned integer, wide enough to hold a full time-stamp counter value.
typedef unsigned long long u64;
// Calibration results written by calibrate(): time-stamp-counter ticks per
// second of wall-clock time, and the reciprocal (seconds per tick).
double ticksPerSecond, secondsPerTick;

// Read the x86 time-stamp counter and return it as a 64-bit tick count.
//
// RDTSC delivers the counter split across EDX (high half) and EAX (low half).
// The two halves are requested as separate outputs and recombined by hand:
// the "A" constraint only means "the edx:eax pair" when targeting 32-bit x86;
// on x86-64 it selects a single register, which would silently drop the high
// 32 bits and leave the other register clobbered without the compiler knowing.
static inline u64 rdtsc(void) {
    unsigned lo, hi;
    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
    return ((u64) hi << 32) | lo;
}
// Return the current wall-clock time, as reported by gettimeofday, expressed
// in seconds as a double (whole seconds plus the microsecond part scaled down).
double second() {
    struct timeval now;
    gettimeofday(&now, NULL);

    const double whole = now.tv_sec;
    const double fraction = 1e-6 * now.tv_usec;
    return whole + fraction;
}
// Sleep for roughly `us` microseconds by calling select() with no file
// descriptors and only a timeout.
//
// The request is split into whole seconds and leftover microseconds so that
// tv_usec always stays below one million; a non-normalized timeval may be
// rejected with EINVAL on some systems. For us < 1000000 the timeout is
// exactly what a plain tv_usec = us would give.
//
// The return value of select() is deliberately ignored: this is a best-effort
// delay, and callers measure the real elapsed time themselves.
void selectsleep(unsigned us) {
    struct timeval tv;
    tv.tv_sec = us / 1000000;
    tv.tv_usec = us % 1000000;
    select(0, NULL, NULL, NULL, &tv);
}
void calibrate() {
    double sumx = 0;
    double sumy = 0;
    double sumxx = 0;
    double sumxy = 0;
    double slope;

    // least squares linear regression of ticks onto real time
    // as returned by gettimeofday.

    const unsigned n = 30;
    unsigned i;

    for (i=0; i<n; i++) {
        double breal,real,ticks;
        u64 bticks,aticks;
        
        breal = second();
        bticks = rdtsc();

        selectsleep((unsigned)(10000 + drand48() * 100000));

        aticks = rdtsc();
        real = second() - breal;
        ticks = aticks - bticks;

        sumx += real;
        sumxx += real * real;
        sumxy += real * ticks;
        sumy += ticks;
    }
    slope = ((sumxy - (sumx*sumy) / n) /
             (sumxx - (sumx*sumx) / n));
    ticksPerSecond = slope;
    secondsPerTick = 1.0 / slope;
    printf("%3.3f MHz\n",ticksPerSecond*1e-6);
}

// Ping-pong latency test between two processes sharing one word of memory.
//
// After calibrating the tick rate, the program maps a shared anonymous page
// and forks. The child forever writes 1 to the shared word and spins until it
// is no longer 1. The parent, 1000 times, writes 0, spins until the word is
// no longer 0, and times that round trip with rdtsc(). The smallest round trip
// is reported in ticks and microseconds.
int main() {
    calibrate();

    // A single page mapped MAP_SHARED|MAP_ANONYMOUS before fork(), so parent
    // and child both see the same word through p. volatile forces every spin
    // iteration to re-read memory.
    volatile unsigned *p;
    p = (volatile unsigned *) mmap(0,
				   4096, 
				   PROT_READ|PROT_WRITE, 
				   MAP_SHARED|MAP_ANONYMOUS, 
				   -1, 0);
    if (p == MAP_FAILED) {
	// NOTE(review): `perr`/`fatal` come from <myio.H>; presumably `fatal`
	// terminates the program. The explicit return guards against the case
	// where it does not, rather than dereferencing MAP_FAILED below.
	cerr << "mmap failed" << perr << fatal;
	return 1;
    }

    pid_t pid = fork();
    switch(pid) {
    case -1:
	// Return explicitly so a non-terminating `fatal` cannot fall through
	// into the child's infinite loop.
	cerr << "fork failed" << perr << fatal;
	return 1;
    case 0:
	// Child: answer every 0 written by the parent with a 1. Never returns;
	// the parent kills it when the measurement is done.
	while (1) {
	    *p = 1;
	    while (*p == 1);
	}
	break;
    default:
	break;
    }
    const unsigned times = 1000;
    // -1 converts to the largest u64, so the first sample always replaces it.
    u64 min = -1;

    for (unsigned i=0; i<times; i++) {
	u64 before = rdtsc();
	*p = 0;
	// Spin until the child has noticed the 0 and written its 1 back.
	while (*p == 0);
	u64 ticks = rdtsc() - before;
	if (ticks < min) min = ticks;
    }
    kill(pid,SIGKILL);
    cout << "minimum of " << min << " ticks (" 
	 << 1e6 * min * secondsPerTick << " microseconds)\n";
    return 0;
}


More information about the Beowulf mailing list