Difference for texmap/scanline.c from version 1.4 to 1.5


version 1.4 version 1.5
Line 19
 
Line 19
  * Routines to draw the texture mapped scanlines.   * Routines to draw the texture mapped scanlines.
  *    *
  * $Log$   * $Log$
    * Revision 1.5  1999/12/08 01:08:10  donut
    * Falk Hueffner's updated fp_tmap, plus my own code to allow runtime selection of tmap routines
    *
  * Revision 1.4  1999/10/18 00:31:01  donut   * Revision 1.4  1999/10/18 00:31:01  donut
  * more alpha fixes from Falk Hueffner   * more alpha fixes from Falk Hueffner
  *   *
Line 58
 
Line 61
 #include "texmap.h"  #include "texmap.h"
 #include "texmapl.h"  #include "texmapl.h"
 #include "scanline.h"  #include "scanline.h"
   #include "strutil.h"
   
 void c_tmap_scanline_flat()  void c_tmap_scanline_flat()
 {  {
Line 263
 
Line 267
 }  }
 #endif  #endif
   
 #ifdef FP_TMAP  // Used for energy centers. See comments for c_tmap_scanline_per().
 void c_tmap_scanline_per_nolight()  void c_fp_tmap_scanline_per_nolight()
 {  {
  ubyte        *dest;   ubyte        *dest;
  uint c;   ubyte           c;
  int x, j;   int             x;
  double u, v, z, dudx, dvdx, dzdx, rec_z;   double u, v, z, dudx, dvdx, dzdx, rec_z;
  u_int64_t destlong;   double          ubyz, vbyz, ubyz0, vbyz0, ubyz8, vbyz8, du1, dv1;
    double          dudx8, dvdx8, dzdx8;
    u_int64_t       destlong;//, destmask;
   
    ubyte          *texmap = pixptr;//, *fadetable = gr_fade_table;
   
  u = f2db(fx_u);   u = f2db(fx_u);
  v = f2db(fx_v) * 64.0;   v = f2db(fx_v) * 64.0;
  z = f2db(fx_z);   z = f2db(fx_z);
   
  dudx = f2db(fx_du_dx);   dudx = f2db(fx_du_dx);
  dvdx = f2db(fx_dv_dx) * 64.0;   dvdx = f2db(fx_dv_dx) * 64.0;
  dzdx = f2db(fx_dz_dx);   dzdx = f2db(fx_dz_dx);
   
    dudx8 = dudx * 8.0;
    dvdx8 = dvdx * 8.0;
    dzdx8 = dzdx * 8.0;
   
  rec_z = 1.0 / z;   rec_z = 1.0 / z;
   
  dest = (ubyte *) (write_buffer + fx_xleft + (bytes_per_row * fx_y));   dest = (ubyte *) (write_buffer + fx_xleft + (bytes_per_row * fx_y));
   
  x = fx_xright - fx_xleft + 1;   x = fx_xright - fx_xleft + 1;
  if (!Transparency_on) {  
  if (x >= 8) {  
  if ((j = (size_t) dest & 7) != 0) {  
  j = 8 - j;  
   
  while (j > 0) {   if (!Transparency_on) { // I'm not sure this is ever used (energy texture is transparent)
  *dest++ =   if (x >= 8) {
      (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +   for ( ; (size_t) dest & 7; --x) {
    *dest++ = (uint) texmap[(((int) (v * rec_z)) & (64 * 63)) +
  (((int) (u * rec_z)) & 63)];   (((int) (u * rec_z)) & 63)];
  u += dudx;   u += dudx;
  v += dvdx;   v += dvdx;
  z += dzdx;   z += dzdx;
  rec_z = 1.0 / z;   rec_z = 1.0 / z;
  x--;  
  j--;  
  }  
  }   }
   
  while (j >= 8) {   ubyz0 = u * rec_z;
  destlong =   vbyz0 = v * rec_z;
      (u_int64_t) pixptr[(((int) (v * rec_z)) & (64 * 63)) +  
         (((int) (u * rec_z)) & 63)];   u += dudx8;
  u += dudx;   v += dvdx8;
  v += dvdx;   z += dzdx8;
  z += dzdx;  
  rec_z = 1.0 / z;  
  destlong |=  
      (u_int64_t) pixptr[(((int) (v * rec_z)) & (64 * 63)) +  
         (((int) (u * rec_z)) & 63)] << 8;  
  u += dudx;  
  v += dvdx;  
  z += dzdx;  
  rec_z = 1.0 / z;  
  destlong |=  
      (u_int64_t) pixptr[(((int) (v * rec_z)) & (64 * 63)) +  
         (((int) (u * rec_z)) & 63)] << 16;  
  u += dudx;  
  v += dvdx;  
  z += dzdx;  
  rec_z = 1.0 / z;  
  destlong |=  
      (u_int64_t) pixptr[(((int) (v * rec_z)) & (64 * 63)) +  
         (((int) (u * rec_z)) & 63)] << 24;  
  u += dudx;  
  v += dvdx;  
  z += dzdx;  
  rec_z = 1.0 / z;  
  destlong |=  
      (u_int64_t) pixptr[(((int) (v * rec_z)) & (64 * 63)) +  
         (((int) (u * rec_z)) & 63)] << 32;  
  u += dudx;  
  v += dvdx;  
  z += dzdx;  
  rec_z = 1.0 / z;  
  destlong |=  
      (u_int64_t) pixptr[(((int) (v * rec_z)) & (64 * 63)) +  
         (((int) (u * rec_z)) & 63)] << 40;  
  u += dudx;  
  v += dvdx;  
  z += dzdx;  
  rec_z = 1.0 / z;  
  destlong |=  
      (u_int64_t) pixptr[(((int) (v * rec_z)) & (64 * 63)) +  
         (((int) (u * rec_z)) & 63)] << 48;  
  u += dudx;  
  v += dvdx;  
  z += dzdx;  
  rec_z = 1.0 / z;   rec_z = 1.0 / z;
  destlong |=  
      (u_int64_t) pixptr[(((int) (v * rec_z)) & (64 * 63)) +   ubyz8 = u * rec_z;
         (((int) (u * rec_z)) & 63)] << 56;   vbyz8 = v * rec_z;
  u += dudx;  
  v += dvdx;   du1 = (ubyz8 - ubyz0) / 8.0;
  z += dzdx;   dv1 = (vbyz8 - vbyz0) / 8.0;
    ubyz = ubyz0;
    vbyz = vbyz0;
   
    for ( ; x >= 8; x -= 8) {
    destlong = (u_int64_t) texmap[(((int) vbyz) & (64 * 63)) +
         (((int) ubyz) & 63)];
    ubyz += du1;
    vbyz += dv1;
   
    destlong |= (u_int64_t) texmap[(((int) vbyz) & (64 * 63)) +
          (((int) ubyz) & 63)] << 8;
    ubyz += du1;
    vbyz += dv1;
   
    destlong |= (u_int64_t) texmap[(((int) vbyz) & (64 * 63)) +
          (((int) ubyz) & 63)] << 16;
    ubyz += du1;
    vbyz += dv1;
   
    destlong |= (u_int64_t) texmap[(((int) vbyz) & (64 * 63)) +
          (((int) ubyz) & 63)] << 24;
    ubyz += du1;
    vbyz += dv1;
   
    destlong |= (u_int64_t) texmap[(((int) vbyz) & (64 * 63)) +
          (((int) ubyz) & 63)] << 32;
    ubyz += du1;
    vbyz += dv1;
   
    destlong |= (u_int64_t) texmap[(((int) vbyz) & (64 * 63)) +
          (((int) ubyz) & 63)] << 40;
    ubyz += du1;
    vbyz += dv1;
   
    destlong |= (u_int64_t) texmap[(((int) vbyz) & (64 * 63)) +
          (((int) ubyz) & 63)] << 48;
    ubyz += du1;
    vbyz += dv1;
   
    destlong |= (u_int64_t) texmap[(((int) vbyz) & (64 * 63)) +
          (((int) ubyz) & 63)] << 56;
   
    ubyz0 = ubyz8;
    vbyz0 = vbyz8;
   
    u += dudx8;
    v += dvdx8;
    z += dzdx8;
   
  rec_z = 1.0 / z;   rec_z = 1.0 / z;
   
    ubyz8 = u * rec_z;
    vbyz8 = v * rec_z;
   
    du1 = (ubyz8 - ubyz0) / 8.0;
    dv1 = (vbyz8 - vbyz0) / 8.0;
    ubyz = ubyz0;
    vbyz = vbyz0;
   
  *((u_int64_t *) dest) = destlong;   *((u_int64_t *) dest) = destlong;
  dest += 8;   dest += 8;
  x -= 8;  
  j -= 8;  
  }   }
    u -= dudx8;
    v -= dvdx8;
    z -= dzdx8;
  }   }
  while (x-- > 0) {  
  *dest++ =   rec_z = 1.0 / z;
      (u_int64_t) pixptr[(((int) (v * rec_z)) & (64 * 63)) +   for ( ; x > 0; x--) {
         (((int) (u * rec_z)) & 63)];   *dest++ = (uint) texmap[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];
  u += dudx;   u += dudx;
  v += dvdx;   v += dvdx;
  z += dzdx;   z += dzdx;
  rec_z = 1.0 / z;   rec_z = 1.0 / z;
  }   }
  } else {   } else { // Transparency_on
  x = fx_xright - fx_xleft + 1;  
   
  if (x >= 8) {   if (x >= 8) {
  if ((j = (size_t) dest & 7) != 0) {   for ( ; (size_t) dest & 7; --x) {
  j = 8 - j;   c = (uint) texmap[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];
    if (c != TRANSPARENCY_COLOR)
  while (j > 0) {  
  c =  
      (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +  
  (((int) (u * rec_z)) & 63)];  
  if (c != 255)  
  *dest = c;   *dest = c;
  dest++;   dest++;
  u += dudx;   u += dudx;
  v += dvdx;   v += dvdx;
  z += dzdx;   z += dzdx;
  rec_z = 1.0 / z;   rec_z = 1.0 / z;
  x--;  
  j--;  
  }  
  }   }
   
  j = x;   ubyz0 = u * rec_z;
  while (j >= 8) {   vbyz0 = v * rec_z;
   
    u += dudx8;
    v += dvdx8;
    z += dzdx8;
    rec_z = 1.0 / z;
    ubyz8 = u * rec_z;
    vbyz8 = v * rec_z;
    du1 = (ubyz8 - ubyz0) / 8.0;
    dv1 = (vbyz8 - vbyz0) / 8.0;
    ubyz = ubyz0;
    vbyz = vbyz0;
    for ( ; x >= 8; x -= 8) {
  destlong = *((u_int64_t *) dest);   destlong = *((u_int64_t *) dest);
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +  
    (((int) (u * rec_z)) & 63)];   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  if (c != 255) {   if (c != TRANSPARENCY_COLOR) {
  destlong &= ~(u_int64_t)0xFF;   destlong &= ~((u_int64_t) 0xFF);
  destlong |= (u_int64_t) c;   destlong |= (u_int64_t) c;
  }   }
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +   if (c != TRANSPARENCY_COLOR) {
    (((int) (u * rec_z)) & 63)];  
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 8);   destlong &= ~((u_int64_t)0xFF << 8);
  destlong |= (u_int64_t) c << 8;   destlong |= (u_int64_t) c << 8;
  }   }
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +   if (c != TRANSPARENCY_COLOR) {
    (((int) (u * rec_z)) & 63)];  
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 16);   destlong &= ~((u_int64_t)0xFF << 16);
  destlong |= (u_int64_t) c << 16;   destlong |= (u_int64_t) c << 16;
  }   }
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +   if (c != TRANSPARENCY_COLOR) {
    (((int) (u * rec_z)) & 63)];  
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 24);   destlong &= ~((u_int64_t)0xFF << 24);
  destlong |= (u_int64_t) c << 24;   destlong |= (u_int64_t) c << 24;
  }   }
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +   if (c != TRANSPARENCY_COLOR) {
    (((int) (u * rec_z)) & 63)];  
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 32);   destlong &= ~((u_int64_t)0xFF << 32);
  destlong |= (u_int64_t) c << 32;   destlong |= (u_int64_t) c << 32;
  }   }
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +   if (c != TRANSPARENCY_COLOR) {
    (((int) (u * rec_z)) & 63)];  
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 40);   destlong &= ~((u_int64_t)0xFF << 40);
  destlong |= (u_int64_t) c << 40;   destlong |= (u_int64_t) c << 40;
  }   }
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +   if (c != TRANSPARENCY_COLOR) {
    (((int) (u * rec_z)) & 63)];  
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 48);   destlong &= ~((u_int64_t)0xFF << 48);
  destlong |= (u_int64_t) c << 48;   destlong |= (u_int64_t) c << 48;
  }   }
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +   if (c != TRANSPARENCY_COLOR) {
    (((int) (u * rec_z)) & 63)];  
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 56);   destlong &= ~((u_int64_t)0xFF << 56);
  destlong |= (u_int64_t) c << 56;   destlong |= (u_int64_t) c << 56;
  }   }
  u += dudx;  
  v += dvdx;  
  z += dzdx;  
  rec_z = 1.0 / z;  
   
  *((u_int64_t *) dest) = destlong;   *((u_int64_t *) dest) = destlong;
  dest += 8;   dest += 8;
  x -= 8;  
  j -= 8;   ubyz0 = ubyz8;
    vbyz0 = vbyz8;
   
    u += dudx8;
    v += dvdx8;
    z += dzdx8;
    rec_z = 1.0 / z;
    ubyz8 = u * rec_z;
    vbyz8 = v * rec_z;
    du1 = (ubyz8 - ubyz0) / 8.0;
    dv1 = (vbyz8 - vbyz0) / 8.0;
    ubyz = ubyz0;
    vbyz = vbyz0;
   
  }   }
    u -= dudx8;
    v -= dvdx8;
    z -= dzdx8;
  }   }
  while (x-- > 0) {   rec_z = 1.0 / z;
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +   for ( ; x > 0; x--) {
    (((int) (u * rec_z)) & 63)];   c = (uint) texmap[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];
  if (c != 255)   if (c != TRANSPARENCY_COLOR)
  *dest = c;   *dest = c;
  dest++;   dest++;
  u += dudx;   u += dudx;
Line 501
 
Line 523
  }   }
  }   }
 }  }
 #else  
 void c_tmap_scanline_per_nolight()  void c_tmap_scanline_per_nolight()
 {  {
  ubyte *dest;   ubyte *dest;
Line 537
 
Line 559
  }   }
  }   }
 }  }
 #endif  
   
 #ifdef FP_TMAP  // This texture mapper uses floating point extensively and writes 8 pixels at once, so it likely works
 void c_tmap_scanline_per()  // best on 64 bit RISC processors.
   // WARNING: it is not endian clean. For big endian, reverse the shift counts in the unrolled loops. I
   // have no means to test that, so I didn't try it. Please tell me if you get this to work on a big
   // endian machine.
   // If you're using an Alpha, use the Compaq compiler for this file for quite some fps more.
   // Unfortunately, it won't compile the whole source, so simply compile everything, change the
   // compiler to ccc, remove scanline.o and compile again.
   // Please send comments/suggestions to falk.hueffner@student.uni-tuebingen.de.
   void c_fp_tmap_scanline_per()
 {  {
  ubyte          *dest;   ubyte          *dest;
  uint            c;   ubyte c;
  int             x, j;   int x;
  double          u, v, z, l, dudx, dvdx, dzdx, dldx, rec_z;   double u, v, z, dudx, dvdx, dzdx, rec_z;
  u_int64_t       destlong;   double ubyz, vbyz, ubyz0, vbyz0, ubyz8, vbyz8, du1, dv1;
    double dudx8, dvdx8, dzdx8;
    fix l, dldx;
    u_int64_t destlong;//, destmask;
   
    // give dumb compilers a chance to put these global pointers into registers or at least have
    // nicer names :)
    ubyte        *texmap = pixptr, *fadetable = gr_fade_table;
   
   #ifdef CYCLECOUNT
    unsigned long start, stop, time;
    static unsigned long sum, count;
   #endif
   
    // v is pre-scaled by 64 to avoid the multiplication when accessing the 64x64 texture array
  u = f2db(fx_u);   u = f2db(fx_u);
  v = f2db(fx_v) * 64.0;   v = f2db(fx_v) * 64.0;
  z = f2db(fx_z);   z = f2db(fx_z);
  l = f2db(fx_l);   l = fx_l >> 8;
   
  dudx = f2db(fx_du_dx);   dudx = f2db(fx_du_dx);
  dvdx = f2db(fx_dv_dx) * 64.0;   dvdx = f2db(fx_dv_dx) * 64.0;
  dzdx = f2db(fx_dz_dx);   dzdx = f2db(fx_dz_dx);
  dldx = f2db(fx_dl_dx);   dldx = fx_dl_dx >> 8;
   
    dudx8 = dudx * 8.0;
    dvdx8 = dvdx * 8.0;
    dzdx8 = dzdx * 8.0;
   
  rec_z = 1.0 / z; // gcc 2.95.2 is won't do this optimization itself   rec_z = 1.0 / z; // multiplication is often faster than division
   
  dest = (ubyte *) (write_buffer + fx_xleft + (bytes_per_row * fx_y));   dest = (ubyte *) (write_buffer + fx_xleft + (bytes_per_row * fx_y));
  x = fx_xright - fx_xleft + 1;   x = fx_xright - fx_xleft + 1;
   
  if (!Transparency_on) {   if (!Transparency_on) {
  if (x >= 8) {   if (x >= 8) {
  if ((j = (size_t) dest & 7) != 0) {   // draw till we are on a 8-byte aligned address
  j = 8 - j;   for ( ; (size_t) dest & 7; --x) {
    *dest++ = fadetable[(l & 0x7f00) +
  while (j > 0) {       (uint) texmap[(((int) (v * rec_z)) & (64 * 63)) +
  *dest++ =  
      gr_fade_table[((int) fabs(l)) * 256 +  
    (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +  
  (((int) (u * rec_z)) & 63)]];   (((int) (u * rec_z)) & 63)]];
  l += dldx;   l += dldx;
  u += dudx;   u += dudx;
  v += dvdx;   v += dvdx;
  z += dzdx;   z += dzdx;
  rec_z = 1.0 / z;   rec_z = 1.0 / z;
  x--;  
  j--;  
  }  
  }   }
   
  j = x;   // Now draw 8 pixels at once, interpolating 1/z linearly. Artifacts of the
  while (j >= 8) {   // interpolation aren't really noticeable; many games even interpolate over 16
    // pixels.
   
    // We do these calculations once before and then at the end of the loop instead
    // of simply at the start of the loop, because the scheduler can then interleave
    // them with the texture accesses. Silly, but gains a few fps.
    ubyz0 = u * rec_z;
    vbyz0 = v * rec_z;
   
    u += dudx8;
    v += dvdx8;
    z += dzdx8;
   
    rec_z = 1.0 / z;
   
    ubyz8 = u * rec_z;
    vbyz8 = v * rec_z;
   
    du1 = (ubyz8 - ubyz0) / 8.0;
    dv1 = (vbyz8 - vbyz0) / 8.0;
    ubyz = ubyz0;
    vbyz = vbyz0;
   
    // This loop is the "hot spot" of the game; it takes about 70% of the time. The
    // major weak points are the many integer casts, which have to go through memory
    // on processors < 21264. But when using integers, one needs to compensate for
    // inexactness, and the code ends up being not really faster.
    for ( ; x >= 8; x -= 8) {
   #ifdef CYCLECOUNT
    start = virtcc();
   #endif
  destlong =   destlong =
      (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 +   (u_int64_t) fadetable[(l & 0x7f00) +
        (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +        (uint) texmap[(((int) vbyz) & (64 * 63)) +
      (((int) (u * rec_z)) & 63)]];     (((int) ubyz) & 63)]];
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;  
  destlong |=   destlong |=
      (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 +   (u_int64_t) fadetable[(l & 0x7f00) +
        (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +        (uint) texmap[(((int) vbyz) & (64 * 63)) +
      (((int) (u * rec_z)) & 63)]] << 8;     (((int) ubyz) & 63)]] << 8;
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;  
  destlong |=   destlong |=
      (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 +   (u_int64_t) fadetable[(l & 0x7f00) +
        (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +        (uint) texmap[(((int) vbyz) & (64 * 63)) +
      (((int) (u * rec_z)) & 63)]] << 16;     (((int) ubyz) & 63)]] << 16;
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;  
  destlong |=   destlong |=
      (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 +   (u_int64_t) fadetable[(l & 0x7f00) +
        (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +        (uint) texmap[(((int) vbyz) & (64 * 63)) +
      (((int) (u * rec_z)) & 63)]] << 24;     (((int) ubyz) & 63)]] << 24;
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;  
  destlong |=   destlong |=
      (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 +   (u_int64_t) fadetable[(l & 0x7f00) +
        (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +        (uint) texmap[(((int) vbyz) & (64 * 63)) +
      (((int) (u * rec_z)) & 63)]] << 32;     (((int) ubyz) & 63)]] << 32;
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;  
  destlong |=   destlong |=
      (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 +   (u_int64_t) fadetable[(l & 0x7f00) +
        (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +        (uint) texmap[(((int) vbyz) & (64 * 63)) +
      (((int) (u * rec_z)) & 63)]] << 40;     (((int) ubyz) & 63)]] << 40;
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;  
  destlong |=   destlong |=
      (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 +   (u_int64_t) fadetable[(l & 0x7f00) +
        (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +        (uint) texmap[(((int) vbyz) & (64 * 63)) +
      (((int) (u * rec_z)) & 63)]] << 48;     (((int) ubyz) & 63)]] << 48;
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;  
  destlong |=   destlong |=
      (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 +   (u_int64_t) fadetable[(l & 0x7f00) +
        (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) +        (uint) texmap[(((int) vbyz) & (64 * 63)) +
      (((int) (u * rec_z)) & 63)]] << 56;     (((int) ubyz) & 63)]] << 56;
  l += dldx;   l += dldx;
  u += dudx;  
  v += dvdx;   ubyz0 = ubyz8;
  z += dzdx;   vbyz0 = vbyz8;
   
    u += dudx8;
    v += dvdx8;
    z += dzdx8;
   
  rec_z = 1.0 / z;   rec_z = 1.0 / z;
   
    ubyz8 = u * rec_z;
    vbyz8 = v * rec_z;
   
    du1 = (ubyz8 - ubyz0) / 8.0;
    dv1 = (vbyz8 - vbyz0) / 8.0;
    ubyz = ubyz0;
    vbyz = vbyz0;
   
  *((u_int64_t *) dest) = destlong;   *((u_int64_t *) dest) = destlong;
  dest += 8;   dest += 8;
  x -= 8;  #ifdef CYCLECOUNT
  j -= 8;   stop = virtcc();
   #endif
  }   }
    // compensate for being calculated once too often
    u -= dudx8;
    v -= dvdx8;
    z -= dzdx8;
   #ifdef CYCLECOUNT
    time = stop - start;
    if (time > 10 && time < 900) {
    sum += time;
    ++count;
    if (count % 10000 == 1)
    printf("%f %d\n", (double) sum / (double) count, time);
  }   }
  while (x-- > 0) {  #endif
    }
   
    // Draw the last few (<8) pixels.
    rec_z = 1.0 / z;
    for ( ; x > 0; x--) {
  *dest++ =   *dest++ =
      gr_fade_table[((int) fabs(l)) * 256 +   fadetable[(l & 0x7f00) +
    (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)]];   (uint) texmap[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)]];
  l += dldx;   l += dldx;
  u += dudx;   u += dudx;
  v += dvdx;   v += dvdx;
  z += dzdx;   z += dzdx;
  rec_z = 1.0 / z;   rec_z = 1.0 / z;
  }   }
  } else {   } else { // Transparency_on
  if (x >= 8) {   if (x >= 8) {
  if ((j = (size_t) dest & 7) != 0) {   for ( ; (size_t) dest & 7; --x) {
  j = 8 - j;   c = (uint) texmap[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];
    if (c != TRANSPARENCY_COLOR)
  while (j > 0) {   *dest = fadetable[(l & 0x7f00) + c];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];  
  if (c != 255)  
  *dest = gr_fade_table[((int) fabs(l)) * 256 + c];  
  dest++;   dest++;
  l += dldx;   l += dldx;
  u += dudx;   u += dudx;
  v += dvdx;   v += dvdx;
  z += dzdx;   z += dzdx;
  rec_z = 1.0 / z;   rec_z = 1.0 / z;
  x--;  
  j--;  
  }  
  }   }
   
  j = x;   ubyz0 = u * rec_z;
  while (j >= 8) {   vbyz0 = v * rec_z;
   
    u += dudx8;
    v += dvdx8;
    z += dzdx8;
    rec_z = 1.0 / z;
    ubyz8 = u * rec_z;
    vbyz8 = v * rec_z;
    du1 = (ubyz8 - ubyz0) / 8.0;
    dv1 = (vbyz8 - vbyz0) / 8.0;
    ubyz = ubyz0;
    vbyz = vbyz0;
    for ( ; x >= 8; x -= 8) {
  destlong = *((u_int64_t *) dest);   destlong = *((u_int64_t *) dest);
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];  
  if (c != 255) {   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  destlong &= ~(u_int64_t)0xFF;   if (c != TRANSPARENCY_COLOR) {
  destlong |= (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 + c];   destlong &= ~((u_int64_t) 0xFF);
    destlong |= (u_int64_t) fadetable[(l & 0x7f00) + c];
  }   }
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];   if (c != TRANSPARENCY_COLOR) {
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 8);   destlong &= ~((u_int64_t)0xFF << 8);
  destlong |= (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 + c] << 8;   destlong |= (u_int64_t) fadetable[(l & 0x7f00) + c] << 8;
  }   }
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];   if (c != TRANSPARENCY_COLOR) {
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 16);   destlong &= ~((u_int64_t)0xFF << 16);
  destlong |= (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 + c] << 16;   destlong |= (u_int64_t) fadetable[(l & 0x7f00) + c] << 16;
  }   }
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];   if (c != TRANSPARENCY_COLOR) {
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 24);   destlong &= ~((u_int64_t)0xFF << 24);
  destlong |= (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 + c] << 24;   destlong |= (u_int64_t) fadetable[(l & 0x7f00) + c] << 24;
  }   }
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];   if (c != TRANSPARENCY_COLOR) {
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 32);   destlong &= ~((u_int64_t)0xFF << 32);
  destlong |= (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 + c] << 32;   destlong |= (u_int64_t) fadetable[(l & 0x7f00) + c] << 32;
  }   }
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];   if (c != TRANSPARENCY_COLOR) {
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 40);   destlong &= ~((u_int64_t)0xFF << 40);
  destlong |= (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 + c] << 40;   destlong |= (u_int64_t) fadetable[(l & 0x7f00) + c] << 40;
  }   }
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];   if (c != TRANSPARENCY_COLOR) {
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 48);   destlong &= ~((u_int64_t)0xFF << 48);
  destlong |= (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 + c] << 48;   destlong |= (u_int64_t) fadetable[(l & 0x7f00) + c] << 48;
  }   }
  l += dldx;   l += dldx;
  u += dudx;   ubyz += du1;
  v += dvdx;   vbyz += dv1;
  z += dzdx;  
  rec_z = 1.0 / z;   c = texmap[(((int) vbyz) & (64 * 63)) + (((int) ubyz) & 63)];
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];   if (c != TRANSPARENCY_COLOR) {
  if (c != 255) {  
  destlong &= ~((u_int64_t)0xFF << 56);   destlong &= ~((u_int64_t)0xFF << 56);
  destlong |= (u_int64_t) gr_fade_table[((int) fabs(l)) * 256 + c] << 56;   destlong |= (u_int64_t) fadetable[(l & 0x7f00) + c] << 56;
  }   }
  l += dldx;   l += dldx;
  u += dudx;  
  v += dvdx;  
  z += dzdx;  
  rec_z = 1.0 / z;  
   
  *((u_int64_t *) dest) = destlong;   *((u_int64_t *) dest) = destlong;
  dest += 8;   dest += 8;
  x -= 8;  
  j -= 8;   ubyz0 = ubyz8;
    vbyz0 = vbyz8;
   
    u += dudx8;
    v += dvdx8;
    z += dzdx8;
    rec_z = 1.0 / z;
    ubyz8 = u * rec_z;
    vbyz8 = v * rec_z;
    du1 = (ubyz8 - ubyz0) / 8.0;
    dv1 = (vbyz8 - vbyz0) / 8.0;
    ubyz = ubyz0;
    vbyz = vbyz0;
   
  }   }
    u -= dudx8;
    v -= dvdx8;
    z -= dzdx8;
  }   }
  while (x-- > 0) {   rec_z = 1.0 / z;
  c = (uint) pixptr[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];   for ( ; x > 0; x--) {
  if (c != 255)   c = (uint) texmap[(((int) (v * rec_z)) & (64 * 63)) + (((int) (u * rec_z)) & 63)];
  *dest = gr_fade_table[((int) fabs(l)) * 256 + c];   if (c != TRANSPARENCY_COLOR)
    *dest = fadetable[(l & 0x7f00) + c];
  dest++;   dest++;
  l += dldx;   l += dldx;
  u += dudx;   u += dudx;
Line 797
 
Line 904
  }   }
 }  }
   
 #elif 1  #if 1
 // note the unrolling loop is broken. It is never called, and uses big endian. -- FH  // note the unrolling loop is broken. It is never called, and uses big endian. -- FH
 void c_tmap_scanline_per()  void c_tmap_scanline_per()
 {  {
Line 954
 
Line 1061
  }   }
  }   }
 }  }
   #endif
   
   void (*cur_tmap_scanline_per)(void);
   void (*cur_tmap_scanline_per_nolight)(void);
   void (*cur_tmap_scanline_lin)(void);
   void (*cur_tmap_scanline_lin_nolight)(void);
   void (*cur_tmap_scanline_flat)(void);
   void (*cur_tmap_scanline_shaded)(void);
   
   //runtime selection of optimized tmappers.  12/07/99  Matthew Mueller
   //the reason I did it this way rather than having a *tmap_funcs that then points to a c_tmap or fp_tmap struct that's already filled in, is to avoid a second pointer dereference.
   void select_tmap(char *type){
    if (!type){
   #ifndef NO_ASM
    select_tmap("i386");
   #else
    select_tmap("c");
   #endif
    return;
    }
   #ifndef NO_ASM
    if (stricmp(type,"i386")==0){
    cur_tmap_scanline_per=asm_tmap_scanline_per;
    cur_tmap_scanline_per_nolight=asm_tmap_scanline_per;
    cur_tmap_scanline_lin=asm_tmap_scanline_lin_lighted;
    cur_tmap_scanline_lin_nolight=asm_tmap_scanline_lin;
    cur_tmap_scanline_flat=asm_tmap_scanline_flat;
    cur_tmap_scanline_shaded=asm_tmap_scanline_shaded;
    }
    else if (stricmp(type,"pent")==0){
    cur_tmap_scanline_per=asm_pent_tmap_scanline_per;
    cur_tmap_scanline_per_nolight=asm_pent_tmap_scanline_per;
    cur_tmap_scanline_lin=asm_tmap_scanline_lin_lighted;
    cur_tmap_scanline_lin_nolight=asm_tmap_scanline_lin;
    cur_tmap_scanline_flat=asm_tmap_scanline_flat;
    cur_tmap_scanline_shaded=asm_tmap_scanline_shaded;
    }
    else if (stricmp(type,"ppro")==0){
    cur_tmap_scanline_per=asm_ppro_tmap_scanline_per;
    cur_tmap_scanline_per_nolight=asm_ppro_tmap_scanline_per;
    cur_tmap_scanline_lin=asm_tmap_scanline_lin_lighted;
    cur_tmap_scanline_lin_nolight=asm_tmap_scanline_lin;
    cur_tmap_scanline_flat=asm_tmap_scanline_flat;
    cur_tmap_scanline_shaded=asm_tmap_scanline_shaded;
    }
    else
 #endif  #endif
    if (stricmp(type,"fp")==0){
    cur_tmap_scanline_per=c_fp_tmap_scanline_per;
    cur_tmap_scanline_per_nolight=c_fp_tmap_scanline_per_nolight;
    cur_tmap_scanline_lin=c_tmap_scanline_lin;
    cur_tmap_scanline_lin_nolight=c_tmap_scanline_lin_nolight;
    cur_tmap_scanline_flat=c_tmap_scanline_flat;
    cur_tmap_scanline_shaded=c_tmap_scanline_shaded;
    }
    else {
    if (stricmp(type,"c")!=0)
    printf("unknown tmap requested, using c tmap\n");
    cur_tmap_scanline_per=c_tmap_scanline_per;
    cur_tmap_scanline_per_nolight=c_tmap_scanline_per_nolight;
    cur_tmap_scanline_lin=c_tmap_scanline_lin;
    cur_tmap_scanline_lin_nolight=c_tmap_scanline_lin_nolight;
    cur_tmap_scanline_flat=c_tmap_scanline_flat;
    cur_tmap_scanline_shaded=c_tmap_scanline_shaded;
    }
   }
   

Legend:
line(s) removed in v.1.4 
line(s) changed
 line(s) added in v.1.5