/*
224/290 03 Jun 90  22:09:20
From:   Ratko Tomic
To:     All
Subj:   SIEVE 5X8 + PICTURE
Attr:   
------------------------------------------------
*/

 /********** PART 1/3: Macros, Globals and count_all() ******************/
 
 /************************************************************************
           SIEVE5x8  (Primes to 1.8 Million in < 60K)
                   Author: Ratko V. Tomic
 *************************************************************************
  This program computes primes using Sieve with bits used for flags and
  by removing multiples of 2,3 and 5 ahead of time. This allows computation
  of primes up to 30*(Number of bytes available). Thus even on a PC with
  520K array (in huge model) you can compute primes up to 15.97 Million,
  good for factoring of numbers up 2.55e14.
 *************************************************************************/
 #include <stdio.h>

unsigned _stack = 1024;

 typedef unsigned char byte;
 typedef unsigned word;
 
 #define S  60000    /* S*30 = 1,800,000 primes, Small model & 4K stack */
                     /* To increase the limit S, reduce stack size.     */
 
 #define M  (S*8)    /* Maximum bits available                          */
 #define T  (S*30)   /* Upper limit for #'s                             */
 byte a[S+1];        /* Bit map holding S*30 primes, must be all 0's    */
                     /* If iterated, re-initialization is NOT needed.   */
 
 byte x[8]={1,7,11,13,17,19,23,29};   /* Numbers left from 1-30    */
 byte y[8]={12,18,21,27,30,36,47,49}; /* Used to calculate steps   */
 
 #define M6(x)  (((x)+(x)+(x))<<1)              /* Multiply by 6   */
 #define L3(x)  ((unsigned long)(x)<<1<<1<<1)   /* Shift left by 3 */
 #define R3(x)  ((unsigned long)(x)>>1>>1>>1)
 #define R2(x)  ((unsigned long)(x)>>1>>1)
 
 #define cvstep(zz,z,bz) z=((word)R3(zz)), bz=((word)zz&7)
 #define set(j,bj) a[j]|=1<<bj
 #define adv(j,bj,k,bk) j+=k; bj+=bk; if (bj>7) j++, bj-=8
 
 #define bn(n) (n&15?(n&3?(n&1?0:1):(n&4?2:3)):(n&48?(n&16?4:5):(n&64?6:7)))
 
 /*** Count primes after sieve5x8() has finished, called from main()   ***/
 
 long count_all(void)            /* Compute total number of primes found */
 { int c;                        /* which is: 3 + Num_of_0_bits_in_a[]   */
   word w,i;
   long t=3;                       /* Account for primes 2,3,5       */
      for(i=0; i<S/2; i++)         /* Process a[] in word chunks     */
         {                         /* Byte order doesn't matter      */
         w=~*((word*)a+i);         /* There are less 0 bits -> use ~ */
         for (c=0; w; ++c) w&=w-1; /* Count bits set to 1 in w       */
         t+=c;                     /* Accumm. total 0 bits from a[]  */
         }
      return t;
 }
 

 /*** PART 2/3: sieve5x8() and utility gen_up2() ****/
 
 sieve5x8()                 /** COMPUTE PRIMES < 30*S **/
 { long P,n,d;
   register word i,bi;
   word j,bm,sav_i,sav_bi;
   static long ss[8];       /* 8 Steps used to cross out multiples of P  */
   static word s[8],b[8];   /* Steps in byte offsets and bit shifts      */
      for (i=0,a[i--]=1;;)
         {
         do i++; while(a[i]==0xFF);       /* Find next 0 bit in a[] */
         if (i>=S) return;                /* End of array reached   */
         bi=~a[i]; sav_i=i;
         bm=-bi&bi;                       /* Bit mask=lowest bit set in bi */
         n=L3(i)+(bi=bn(bi));             /* Get index n for prime found   */
         do {                             /* Process bits in their own loop */
            d=(n-1)&~7L;                  /* == ((n-1)/8)*8;               */
            ss[0]=ss[6]=M6(d)+y[(bi-1)&7];/* == ((n-1)/8)*48+y[(n-1)%8];   */
            if ((d=ss[0]+n)>M) return;    /* Done - no more flags to be set */
            P=(long)i*30+x[bi];;          /* Compute prime P from the index */
            ss[7]=n+n+1;                  /* Compute remaining 7 steps     */
            ss[3]=(R2(n+4)&~1L)+P;        /* == ((n+4)/8)*2+P;             */
            ss[2]=ss[4]=i+((1+P)>>1);     /* == (n/8)+(1+P)/2;             */
            ss[1]=ss[5]=R2(n+2)+P;        /* == (n+2)/4+P;                 */
            for (j=3;j<8;j++)             /* Convert steps to byte step s[] */
              cvstep(ss[j],s[j],b[j]);    /* and bit step b[]              */
            s[0]=s[6];s[1]=s[5];s[2]=s[4];/* Clone remaining 3 byte offsets */
            b[0]=b[6];b[1]=b[5];b[2]=b[4];/* same for bit steps            */
            sav_bi=bi; j=i; i=0;
            do {                          /* Tag mult of P in 8 steps ss[j] */
               adv(j,bi,s[i],b[i]);       /* Advance to next postion in a[] */
               set(j,bi);                 /* Set bit for that position     */
               ++i; i&=7;                 /* Advance to next step (i%8)    */
               }
            while((d+=ss[i])<M);
            i=sav_i; bi=sav_bi;
            do n++, bi++, bm<<=1;
              while(bi<8 && a[i]&bm);     /* Get next 0 bit in this byte */
            }
         while(bi<8);
         }
 }
 
 long gen_up2(long h)  /*** Generate primes<=h, where: 6 < h < T ) ***/
 { word i=0,k=1;       /* This func. is not called for benchmark     */
   long c=3,n,b=0;     /* It assumes sieve5x8() was already called   */
   byte m=2;
      while(1)
         {
         n=b+x[k++];             /* n=Next number to check */
         if (n>h) return(c);
         if (!(a[i]&m)) c++;     /* Prime found (its value is n) */
         if (!(m+=m))            /* Advance to next bit */
           {
           m=1; i++; k=0; b+=30;
           if (i>=S) return(c);
           }
         }
 } 

 /** PART 3/3: main(), ITER=5 **/
 
 /****** PC Time calculations ******/
 
 #define pc_time (*(long far*)0x46C)     /* Get IBM tick count */
 time_diff(long t,int *s,int *hs)        /* Compute seconds & 1/100's */
 { long dt;
     if ((dt=pc_time-t)<0) dt+=0x1800B0L; /* Adjust for midnight */
     dt*=1000; *s=(int)(dt/18207);
     *hs=(int)(dt%18207)/182;
     if (*hs>100) *hs=99;
 }
 
 /*** TEST SHELL  ***/
 
 #define ITER 5      /* Iterations for benchmark (on 4.77 PC set to 1) */
 
 #define d3(n) (int)((long)(n)/1000000L)        /* Millions            */
 #define d2(n) (int)(((long)(n)%1000000L)/1000) /* Thousands           */
 #define d1(n) (int)((long)(n)%1000)            /* Ones                */
 long primes;                                   /* Total primes found  */
 
 main()
 { long t0;
   int i,sec,s100;
      printf("Computing primes up to %d,%03d,%03d... ",d3(T),d2(T),d1(T));
      t0=pc_time;                /* Timer ticks used for accuracy */
      for (i=0; i<ITER; i++)
        sieve5x8(),              /* Construct primes < T=1,800,000  */
        primes=count_all();      /* Count number of primes found    */
      time_diff(t0,&sec,&s100);  /* Result should be 135,072 primes */
      printf("Found %d,%03d primes.\n",d2(primes),d1(primes));
      printf("TIME: %d.%02d seconds.\n",sec,s100);
 }
/*
 ---------------------- END OF C CODE -----------------------------------
 
 This is new version for the benchmark I posted recently. The main new
 addon is the new algorithm which eliminates primes 2,3,5 (instead of
 2,3 only). This boosts the packing density from 3 to 3.75 numbers/bit,
 or from 24 to 30 numbers per byte. The new algorithm is in the sieve5x8().
 Some macros were rewritten and some code improvements were done, as result
 it takes roughly same time as sieve3x8() per iteration, even though it's
 more complex and generates more primes (1,800,000 for 60,000 byte array).
 For more accuracy (on a 386/25Mhz), 5 ITERATIONS were used:
 
   Compiler          TIME (sec)   CODE (bytes)    EXE SIZE
  -------------------------------------------------------------- 
   TopSpeed 2.02      27.02          1105           8197  (winner again)
   Watcom 7.0         34.16          1164           6950
   MSC 5.0            35.09          1266           8827
   Turbo C 2.0        35.75          1246           8056  (last again)
  --------------------------------------------------------------
 
 THE BIG PICTURE - If you have VGA or MCGA video adapter try this: Take 
 the content of a[S] word at a time (2 bytes) and use it as an offset
 into video RAM, (video mode 0x13: 320x200, 256 colors, flat 2-D address).
 At that offset increment a byte to new (brighter) color. Do that for all
 30,000 words in a[].  Quite surprising ?!  It also works with sieve3x8()
 and other video cards. Also, the byte ordering in a word doesn't matter.
 
--- TBBS v2.1/NM
 * Origin: MSI S/W BBS-Fram. MA-(508) 875-8009-CodeRunneR-HandiWARE (322/327)
*/

