Hello,<br><br>I'm new to parallel programming and MPI.  I've developed a simulator in C++ for which I would<br>like to decrease the running time by using a Beowulf cluster.  I'm not trying to achieve optimal <br>

performance; I'm just looking for a quick and easy way to significantly improve the speed over running<br>the program on a single machine.<br>Basically, I need to learn how to parallelize a C++ function.  The following functions in particular

<br>take the longest to run in my simulator.  The first performs LU decomposition on a large matrix, and <br>the second uses back-substitution to solve the resulting linear system (the "matrix division" step).<br><br>void NR::ludcmp(Mat_IO_DP &a, Vec_O_INT &indx, DP &d)

<br>{<br>    const DP TINY=1.0e-20;<br>    int i,imax,j,k;<br>    DP big,dum,sum,temp;<br><br>    int n=a.nrows();<br>    Vec_DP vv(n);<br>    d=1.0;<br>    for (i=0;i<n;i++) {<br>        big=0.0;<br>        for (j=0;j<n;j++)

<br>            if ((temp=fabs(a[i][j])) > big) big=temp;<br>        if (big == 0.0) nrerror("Singular matrix in routine ludcmp");<br>        vv[i]=1.0/big;<br>    }<br>    for (j=0;j<n;j++) {<br>        for (i=0;i<j;i++) {

<br>            sum=a[i][j];<br>            for (k=0;k<i;k++) sum -= a[i][k]*a[k][j];<br>            a[i][j]=sum;<br>        }<br>        big=0.0;<br>        for (i=j;i<n;i++) {<br>            sum=a[i][j];<br>            for (k=0;k<j;k++) sum -= a[i][k]*a[k][j];

<br>            a[i][j]=sum;<br>            if ((dum=vv[i]*fabs(sum)) >= big) {<br>                big=dum;<br>                imax=i;<br>            }<br>        }<br>        if (j != imax) {<br>            for (k=0;k<n;k++) {

<br>                dum=a[imax][k];<br>                a[imax][k]=a[j][k];<br>                a[j][k]=dum;<br>            }<br>            d = -d;<br>            vv[imax]=vv[j];<br>        }<br>        indx[j]=imax;<br>        if (a[j][j] == 

0.0) a[j][j]=TINY;<br>        if (j != n-1) {<br>            dum=1.0/(a[j][j]);<br>            for (i=j+1;i<n;i++) a[i][j] *= dum;<br>        }<br>    }<br>}<br><br><br><br>and...<br><br><br><br>void NR::lubksb(Mat_I_DP &a, Vec_I_INT &indx, Vec_IO_DP &b)

<br>{<br>    int i,ii=0,ip,j;<br>    DP sum;<br><br>    int n=a.nrows();<br>    for (i=0;i<n;i++) {<br>        ip=indx[i];<br>        sum=b[ip];<br>        b[ip]=b[i];<br>        if (ii != 0)<br>            for (j=ii-1;j<i;j++) sum -= a[i][j]*b[j];

<br>        else if (sum != 0.0)<br>            ii=i+1;<br>        b[i]=sum;<br>    }<br>    for (i=n-1;i>=0;i--) {<br>        sum=b[i];<br>        for (j=i+1;j<n;j++) sum -= a[i][j]*b[j];<br>        b[i]=sum/a[i][i];

<br>    }<br>}<br><br>(The functions are borrowed from the library provided by "Numerical Recipes in C++")<br>I'm currently calling these functions from the main loop with the lines:<br><br>NR::ludcmp(c,indx,d);

<br><br>and<br><br>NR::lubksb(c,indx,xv);<br><br>where the variable 'c' is a large matrix (holding image pixel values) and 'xv' is a vector <br>used in back-substitution.  <br>All of the variables passed into these functions are of types defined in "nr.h":

<br>'c' is a Mat_DP  (double-precision matrix)<br>'indx' is a Vec_INT  (integer vector)<br>'d' is a DP    (double-precision scalar)<br>'xv' is a Vec_DP  (double-precision vector)<br><br>

Is there a simple way to call these functions so that the cluster distributes the <br>computational load?  Currently, when I run the program with a 50x50 array on a single machine, <br>it takes about 5 minutes to complete a single iteration of the matrix division.

<br><br>Any help would be greatly appreciated. <br>