並列化,高速化の例


エラトステネスのふるい

prim.c
n2=n*n;
/* initialize the table. */
for(i=0; i < n2; i++) tab[i] = 0;
/* main loop */
for(i=2; i < n; i++)
  if(tab[i] == 0)
    for(k=i*i; k < n2; k+=i)
      tab[k] = 1;

並列化・高速化の流れの例

prim-p.c - 高速化 → prim-p2.cprim2-p.c
        /                                       /
 パイプライン型並列化(1)                 パイプライン型並列化(2)
    /                                       /
 prim.c ------ 高速化(1)---------------→ prim2.c
   |                                         |
独立部分を増やす変形                      独立部分を増やす変形
   |                                         |
   V                                         V
 prim-i.c -----高速化------→ prim-i2.c    prim2-i.c
   |                            |            |
分割型並列化                分割型並列化  分割型並列化
   |                            |            |
   V                            V            V
prim-ip-1.c                   prim-i2p.c   prim2-ip.c (同期過多)
prim-ip.c -高速化(2)prim-ip2.c

高速化(1)

prim.c prim2.c
n2=n*n;
/* initialize the table. */
for(i=0; i < n2; i++) tab[i] = 0;
/* main loop */
for(i=2; i < n; i++)
  if(tab[i] == 0)
    for(k=i*i; k < n2; k+=i)
      tab[k] = 1;
n2=n*n;
tab[2] = tab[3] = 0;
for(k0=4,k1=MIN(k0+BSIZE,n2); k0 < n2; k0=k1,k1=MIN(k0+BSIZE,n2)){
  for(i=k0; i < k1; i++)
    tab[i] = 0;
  for(i=2; i < n; i++)
    if(tab[i] == 0)
      if(i*i > k1)
        break;
      else
        for(k=MAX(i*i,k0),k+=i-1,k-=k%i; k < k1; k+=i)
          tab[k] = 1;
}

独立部分を増やす変形

prim.c prim-i.c
n2=n*n;
/* initialize the table. */
for(i=0; i < n2; i++) tab[i] = 0;
/* main loop */
for(i=2; i < n; i++)
  if(tab[i] == 0)
    for(k=i*i; k < n2; k+=i)
      tab[k] = 1;
n2=n*n
/* initialize the table. */
for(i=0; i < n2; i++) tab[i] = 0;
/* main loop
   {i=2,i2=4}→{i=4,i2=16}→{i=16,i2=256}→... */
for(i=2,i2=MIN(4,n); i < n; i=i2,i2=MIN(i*i,n))
  /* sieve (write) tab[i*i .. (n2-1)]
     by reading tab[i .. (i*i-1)]. */
  for(j=i; j < i2; j++)
    if(tab[j] == 0)
      for(k=j*j; k < n2; k+=j)
        tab[k] = 1;

分割型並列化

prim-i.c prim-ip.c
n2=n*n;
/* initialize the table. */
for(i=0; i < n2; i++) tab[i] = 0;
/* main loop
   {i=2,i2=4}→{i=4,i2=16}→{i=16,i2=256}→... */
for(i=2,i2=MIN(4,n); i < n; i=i2,i2=MIN(i*i,n))
  /* sieve (write) tab[i*i .. (n2-1)]
     by reading tab[i .. (i*i-1)]. */
  for(j=i; j < i2; j++)
    if(tab[j] == 0)
      for(k=j*j; k < n2; k+=j)
        tab[k] = 1;
n2=n*n;
/* initialize the table. */
i0=n2*myid/P; i1=n2*(myid+1)/P;
for(i=i0; i < i1; i++) tab[i] = 0;
barrier_sync(P, myid);
/* main loop
   {i=2,i2=4}→{i=4,i2=16}→{i=16,i2=256}→... */
for(i=2,i2=MIN(4,n); i < n; i=i2,i2=MIN(i*i,n))
/* sieve (write) tab[i*i .. (n2-1)]
   by reading tab[i .. (i*i-1)]. */
{
  k0 = i*i + (n2-i*i)*myid/P;
  k1 = i*i + (n2-i*i)*(myid+1)/P;
  for(j=i; j < i2; j++)
    if(tab[j] == 0)
      if(j*j > k1)
        break;
      else
        for(k=MAX(j*j,k0),k+=j-1,k-=k%j; k < k1; k+=j)
          tab[k] = 1;
  barrier_sync(P, myid);
}

高速化(2)

prim-ip.c prim-ip2.c
n2=n*n;
/* initialize the table. */
i0=n2*myid/P; i1=n2*(myid+1)/P;
for(i=i0; i < i1; i++) tab[i] = 0;
barrier_sync(P, myid);
/* main loop
   {i=2,i2=4}→{i=4,i2=16}→{i=16,i2=256}→... */
for(i=2,i2=MIN(4,n); i < n; i=i2,i2=MIN(i*i,n))
  /* sieve (write) tab[i*i .. (n2-1)]
     by reading tab[i .. (i*i-1)]. */
  {
    k0 = i*i + (n2-i*i)*myid/P;
    k1 = i*i + (n2-i*i)*(myid+1)/P;
    for(j=i; j < i2; j++)
      if(tab[j] == 0)
        if(j*j > k1)
          break;
        else
          for(k=MAX(j*j,k0),k+=j-1,k-=k%j; k < k1; k+=j)
            tab[k] = 1;
    barrier_sync(P, myid);
  }
n2=n*n;
/* initialize the table. */
i0=n2*myid/P; i1=n2*(myid+1)/P;
for(i=i0; i < i1; i++) tab[i] = 0;
barrier_sync(P, myid);
/* main loop
   {i=2,i2=4}→{i=4,i2=16}→{i=16,i2=256}→... */
for(i=2,i2=MIN(4,n); i < n; i=i2,i2=MIN(i*i,n))
  /* sieve (write) tab[i*i .. (n2-1)]
     by reading tab[i .. (i*i-1)]. */
  {
    k0 = i*i + (n2-i*i)*myid/P;
    k1 = i*i + (n2-i*i)*(myid+1)/P;
    for(k0a=k0,k1a=MIN(k0a+BSIZE,k1); k0a < k1;
        k0a=k1a,k1a=MIN(k0a+BSIZE,k1))
      for(j=i; j < i2; j++)
        if(tab[j] == 0)
          if(j*j > k1a)
            break;
          else
            for(k=MAX(j*j,k0a),k+=j-1,k-=k%j; k < k1a; k+=j)
              tab[k] = 1;
    barrier_sync(P, myid);
  }

パイプライン型並列化(1)

prim.c prim-p.c
n2=n*n;
/* initialize the table. */
for(i=0; i < n2; i++) tab[i] = 0;
/* main loop */
for(i=2; i < n; i++)
  if(tab[i] == 0)
    for(k=i*i; k < n2; k+=i)
      tab[k] = 1;
int n2=n*n;
i0=n2*myid/P; i1=n2*(myid+1)/P;
/* initialize the table. */
for(i=i0; i < i1; i++) tab[i] = 0;
maxnewp = MIN(i1, n); prg = 4;
/* main loop */
for(i=2; i < n; i++){
  if(! (i < prg))
    prg = wait_progress2(i);
  if(tab[i] == 0){
    i2=i*i;
    newp = MIN(i2, maxnewp);
    if(newp > i0)
      prg = wait_and_signal_progress2(newp, i0);
    if(i2 >= i1) break;
    for(k=MAX(i2,i0),k+=i-1,k-=k%i; k < i1; k+=i)
      tab[k] = 1;
  }
}
if(maxnewp > i0)
  prg = wait_and_signal_progress2(maxnewp, i0);
barrier_sync(P, myid);

パイプライン型並列化(2)

prim2.c prim2-p.c
n2=n*n;
tab[2] = tab[3] = 0;
for(k0=4,k1=MIN(k0+BSIZE,n2); k0 < n2;
    k0=k1,k1=MIN(k0+BSIZE,n2)){
  for(i=k0; i < k1; i++)
    tab[i] = 0;
  for(i=2; i < n; i++)
    if(tab[i] == 0)
      if(i*i > k1)
        break;
      else
        for(k=MAX(i*i,k0),k+=i-1,k-=k%i; k < k1; k+=i)
          tab[k] = 1;
}
n2=n*n;
tab[2] = tab[3] = 0;
prg = 4;
for(k0=4,k1=MIN(k0+BSIZE*P,n2); k0 < n2;
    k0=k1,k1=MIN(k0+BSIZE*P,n2)){
  k0a = k0 + (k1-k0)*myid/P;
  k1a = k0 + (k1-k0)*(myid+1)/P;
  /* initialize the table. */
  for(i=k0a; i < k1a; i++)
    tab[i] = 0;
  maxnewp = MIN(k1a, n); 
  /* main loop */
  for(i=2; i < n; i++){
    if(! (i < prg))
      prg = wait_progress2(i);
    if(tab[i] == 0){
      i2 = i*i;
      newp = MIN(i2, maxnewp);
      if(newp > k0a && newp > prg)
        prg = wait_and_signal_progress2(newp, k0a);
      if(i2 > k1a) break;
      for(k=MAX(i2,k0a),k+=i-1,k-=k%i; k < k1a; k+=i)
        tab[k] = 1;
    }
  }
  if(maxnewp > k0a)
    prg = wait_and_signal_progress2(maxnewp, k0a);
}
barrier_sync(P, myid);

並列プログラミング, 先頭ページへ
Masahiro Yasugi: yasugi@kuis.kyoto-u.ac.jp