#include <stdio.h>
#include <stdlib.h>
#include <string.h>

//#define HEAVYTM
#include "tm.h"

#ifdef RADUSETM3
tm_api * g_tm_api;
//#define PROFILE_MODE
#endif

#include <math.h>

#ifdef _MSC_VER
#define stop() __debugbreak()
#include <windows.h>
#define int64 __int64
#define uint64 unsigned __int64
#else
#define stop() __builtin_trap()
#define int64 long long
#define uint64 unsigned long long
#endif

#ifdef _MSC_VER
#pragma warning(disable:4127)
#endif

//#define NOCOMP


//#define PROFILE_NEW_ONLY
//#define PROFILE_MODE


#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ )

#ifdef _MSC_VER

  uint64 __rdtsc();
  #define __cycles() __rdtsc()

#else // non msvc

  static inline uint64 __cycles() 
  {
    unsigned int lo, hi;
    asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) );
    return ( ( (uint64) hi ) << 32 ) | ( (uint64) lo );
  }

#endif  // msvc

#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__) 

#ifdef _MSC_VER

  #define __cycles() _ReadStatusReg(ARM64_CNTVCT)

#else

  static inline uint64 __cycles()
  {
    uint64 tsc;
    asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
    return tsc;
  }

#endif

#else // x64, arm

#error Unknown platform for timing.

#endif  //x64 and   


#ifdef PROFILE_MODE

#define STBIR_ASSERT(cond)

#endif

#ifdef _DEBUG
#undef STBIR_ASSERT
#define STBIR_ASSERT(cond) { if (!(cond)) stop(); }
#endif


#define SHRINKBYW 2
#define ZOOMBYW 2
#define SHRINKBYH 2
#define ZOOMBYH 2


int mem_count = 0;

#ifdef TEST_WITH_VALLOC

#define STBIR__SEPARATE_ALLOCATIONS

#if TEST_WITH_LIMIT_AT_FRONT

  void * wmalloc(SIZE_T size)
  {
     static unsigned int pagesize=0;
     void* p;
     SIZE_T s;

     // get the page size, if we haven't yet
     if (pagesize==0)
     {
       SYSTEM_INFO si;
       GetSystemInfo(&si);
       pagesize=si.dwPageSize;
     }

     // we need room for the size, 8 bytes to hide the original pointer and a
     //   validation dword, and enough data to completely fill one page
     s=(size+(pagesize-1))&~(pagesize-1);

     // allocate the size plus a page (for the guard)
     p=VirtualAlloc(0,(SIZE_T)s,MEM_RESERVE|MEM_COMMIT,PAGE_READWRITE);
     
     return p;
   }

  void wfree(void * ptr)
  {
    if (ptr)
    {
      if ( ((ptrdiff_t)ptr) & 4095 ) stop();
      if ( VirtualFree(ptr,0,MEM_RELEASE) == 0 ) stop();
    }
  }

#else

  void * wmalloc(SIZE_T size)
  {
     static unsigned int pagesize=0;
     void* p;
     SIZE_T s;

     // get the page size, if we haven't yet
     if (pagesize==0)
     {
       SYSTEM_INFO si;
       GetSystemInfo(&si);
       pagesize=si.dwPageSize;
     }

     // we need room for the size, 8 bytes to hide the original pointer and a
     //   validation dword, and enough data to completely fill one page
     s=(size+16+(pagesize-1))&~(pagesize-1);

     // allocate the size plus a page (for the guard)
     p=VirtualAlloc(0,(SIZE_T)(s+pagesize+pagesize),MEM_RESERVE|MEM_COMMIT,PAGE_READWRITE);

     if (p)
     {
       DWORD oldprot;
       void* orig=p;

       // protect the first page
       VirtualProtect(((char*)p),pagesize,PAGE_NOACCESS,&oldprot);

       // protect the final page
       VirtualProtect(((char*)p)+s+pagesize,pagesize,PAGE_NOACCESS,&oldprot);

       // now move the returned pointer so that it bumps right up against the
       //   the next (protected) page (this may result in unaligned return
       //   addresses - pre-align the sizes if you always want aligned ptrs)
//#define ERROR_ON_FRONT
#ifdef ERROR_ON_FRONT
       p=((char*)p)+pagesize+16;
#else
       p=((char*)p)+(s-size)+pagesize;
#endif

       // hide the validation value and the original pointer (which we'll
       //   need used for freeing) right behind the returned pointer
       ((unsigned int*)p)[-1]=0x98765432;
       ((void**)p)[-2]=orig;
       ++mem_count;
//printf("aloc: %p bytes: %d\n",p,(int)size);
       return(p);
     }

     return 0;
  }

  void wfree(void * ptr)
  {
    if (ptr)
    {
      int err=0;

      // is this one of our allocations?
      if (((((unsigned int*)ptr)[-1])!=0x98765432) || ((((void**)ptr)[-2])==0))
      {
        err=1;
      }

      if (err)
      {
        __debugbreak();
      }
      else
      {

        // back up to find the original pointer
        void* p=((void**)ptr)[-2];

        // clear the validation value and the original pointer
        ((unsigned int*)ptr)[-1]=0;
        ((void**)ptr)[-2]=0;

//printf("free: %p\n",ptr);

        --mem_count;

        // now free the pages
        if (p)
          VirtualFree(p,0,MEM_RELEASE);

      }
    }
  }

#endif

#define STBIR_MALLOC(size,user_data) ((void)(user_data), wmalloc(size))
#define STBIR_FREE(ptr,user_data)    ((void)(user_data), wfree(ptr))

#endif

#define STBIR_PROFILE
//#define STBIR_NO_SIMD
//#define STBIR_AVX
//#define STBIR_AVX2
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#include "stb_image_resize2.h"  // new one!

#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"

int tsizes[5] =   { 1, 1, 2, 4, 2 };
int ttypes[5] =   { STBIR_TYPE_UINT8, STBIR_TYPE_UINT8_SRGB, STBIR_TYPE_UINT16, STBIR_TYPE_FLOAT, STBIR_TYPE_HALF_FLOAT };

int cedges[4] =   { STBIR_EDGE_CLAMP, STBIR_EDGE_REFLECT, STBIR_EDGE_ZERO, STBIR_EDGE_WRAP };
int flts[5] =     { STBIR_FILTER_BOX, STBIR_FILTER_TRIANGLE, STBIR_FILTER_CUBICBSPLINE, STBIR_FILTER_CATMULLROM, STBIR_FILTER_MITCHELL };
int buffers[20] = { STBIR_1CHANNEL, STBIR_2CHANNEL, STBIR_RGB, STBIR_4CHANNEL, 
                    STBIR_BGRA, STBIR_ARGB, STBIR_RA, STBIR_AR,
                    STBIR_RGBA_PM, STBIR_ARGB_PM, STBIR_RA_PM, STBIR_AR_PM,
                    STBIR_RGBA, STBIR_ARGB, STBIR_RA, STBIR_AR,
                    STBIR_RGBA_PM, STBIR_ARGB_PM, STBIR_RA_PM, STBIR_AR_PM,
                  };
int obuffers[20] = { STBIR_1CHANNEL, STBIR_2CHANNEL, STBIR_RGB, STBIR_4CHANNEL, 
                     STBIR_BGRA, STBIR_ARGB, STBIR_RA, STBIR_AR,
                     STBIR_RGBA_PM, STBIR_ARGB_PM, STBIR_RA_PM, STBIR_AR_PM,
                     STBIR_RGBA_PM, STBIR_ARGB_PM, STBIR_RA_PM, STBIR_AR_PM,
                     STBIR_RGBA, STBIR_ARGB, STBIR_RA, STBIR_AR,
                   };

int bchannels[20] = { 1, 2, 3, 4,  4,4, 2,2,  4,4, 2,2,  4,4, 2,2,  4,4, 2,2  }; 
int alphapos[20] = { -1, -1, -1, -1,  3,0,  1,0,   3,0,  1,0,   3,0,  1,0,3,0,  1,0 }; 


char const * buffstrs[20] = { "1ch", "2ch", "3ch", "4ch",  "RGBA", "ARGB", "RA", "AR",  "RGBA_both_pre", "ARGB_both_pre", "RA_both_pre", "AR_both_pre",  "RGBA_out_pre", "ARGB_out_pre", "RA_out_pre", "AR_out_pre",  "RGBA_in_pre", "ARGB_in_pre", "RA_in_pre", "AR_in_pre" };
char const * typestrs[5] =  { "Bytes", "BytesSRGB", "Shorts", "Floats", "Half Floats"};
char const * edgestrs[4] =  { "Clamp", "Reflect", "Zero", "Wrap" };
char const * fltstrs[5] =   { "Box", "Triangle", "Cubic", "Catmullrom", "Mitchell" };

#ifdef STBIR_PROFILE
  static void do_acc_zones( STBIR_PROFILE_INFO * profile )
  {
    stbir_uint32 j;
    stbir_uint64 start = tmGetAccumulationStart( tm_mask ); start=start;

    for( j = 0 ; j < profile->count ; j++ )
    {
      if ( profile->clocks[j] )
        tmEmitAccumulationZone( 0, 0, (tm_uint64*)&start, 0, profile->clocks[j], profile->descriptions[j] );
    }
  }
#else
  #define do_acc_zones(...)
#endif

int64 vert;

//#define WINTHREADTEST
#ifdef WINTHREADTEST

static STBIR_RESIZE * thread_resize;
static LONG which;
static int threads_started = 0;
static HANDLE threads[32];
static HANDLE starts,stops;
  
static DWORD resize_shim( LPVOID p )
{
  for(;;)
  {
    LONG wh;

    WaitForSingleObject( starts, INFINITE );

    wh = InterlockedAdd( &which, 1 ) - 1;

    ENTER( "Split %d", wh );
    stbir_resize_split( thread_resize, wh, 1 );
  #ifdef STBIR_PROFILE
    { STBIR_PROFILE_INFO profile; stbir_resize_split_profile_info( &profile, thread_resize, wh, 1 ); do_acc_zones( &profile ); vert = profile.clocks[1]; }
  #endif
    LEAVE();

    ReleaseSemaphore( stops, 1, 0 );
  }
}

#endif

void nresize( void * o, int ox, int oy, int op, void * i, int ix, int iy, int ip, int buf, int type, int edg, int flt )
{
  STBIR_RESIZE resize;
 
  stbir_resize_init( &resize, i, ix, iy, ip, o, ox, oy, op, buffers[buf], ttypes[type] );
  stbir_set_pixel_layouts( &resize, buffers[buf], obuffers[buf] );
  stbir_set_edgemodes( &resize, cedges[edg], cedges[edg] );
  stbir_set_filters( &resize, flts[flt], /*STBIR_FILTER_POINT_SAMPLE */ flts[flt] );
  //stbir_set_input_subrect( &resize, 0.55f,0.333f,0.75f,0.50f);
  //stbir_set_output_pixel_subrect( &resize, 00, 00, ox/2,oy/2);
  //stbir_set_pixel_subrect(&resize, 1430,1361,30,30);
  
  ENTER( "Resize" );
 
  #ifndef WINTHREADTEST

  ENTER( "Filters" );
  stbir_build_samplers_with_splits( &resize, 1 );
  #ifdef STBIR_PROFILE
    { STBIR_PROFILE_INFO profile; stbir_resize_build_profile_info( &profile, &resize ); do_acc_zones( &profile ); }
  #endif
  LEAVE();
 
  ENTER( "Resize" );
  if(!stbir_resize_extended( &resize ) )
    stop();
  #ifdef STBIR_PROFILE
    { STBIR_PROFILE_INFO profile; stbir_resize_extended_profile_info( &profile, &resize ); do_acc_zones( &profile ); vert = profile.clocks[1]; }
  #endif
  LEAVE();

  #else
  {
    int c, cnt;
 
    ENTER( "Filters" );
    cnt =  stbir_build_samplers_with_splits( &resize, 4 );
    #ifdef STBIR_PROFILE
      { STBIR_PROFILE_INFO profile; stbir_resize_build_profile_info( &profile, &resize ); do_acc_zones( &profile ); }
    #endif
    LEAVE();
    
    ENTER( "Thread start" );
    if ( threads_started == 0 )
    {
      starts = CreateSemaphore( 0, 0, 32, 0 );
      stops = CreateSemaphore( 0, 0, 32, 0 );
    }
    for( c = threads_started ; c < cnt ; c++ )
      threads[ c ] = CreateThread( 0, 2048*1024, resize_shim, 0, 0, 0 );
    
    threads_started = cnt;
    thread_resize = &resize;
    which = 0;
    LEAVE();

    // starts the threads
    ReleaseSemaphore( starts, cnt, 0 );

    ENTER( "Wait" );
    for( c = 0 ; c < cnt; c++ )
      WaitForSingleObject( stops, INFINITE );
    LEAVE();
  }
  #endif

  ENTER( "Free" );
  stbir_free_samplers( &resize );
  LEAVE();
  LEAVE();
}


#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

extern void oresize( void * o, int ox, int oy, int op, void * i, int ix, int iy, int ip, int buf, int type, int edg, int flt );



#define TYPESTART 0
#define TYPEEND   4

#define LAYOUTSTART  0
#define LAYOUTEND   19

#define SIZEWSTART 0
#define SIZEWEND   2

#define SIZEHSTART 0
#define SIZEHEND   2

#define EDGESTART 0
#define EDGEEND   3

#define FILTERSTART 0
#define FILTEREND   4

#define HEIGHTSTART 0
#define HEIGHTEND   2

#define WIDTHSTART 0
#define WIDTHEND   2




static void * convert8to16( unsigned char * i, int w, int h, int c )
{
  unsigned short * ret;
  int p;

  ret = malloc( w*h*c*sizeof(short) );
  for(p = 0 ; p < (w*h*c) ; p++ )
  {
    ret[p]=(short)((((int)i[p])<<8)+i[p]);
  }

  return ret;
}

static void * convert8tof( unsigned char * i, int w, int h, int c )
{
  float * ret;
  int p;

  ret = malloc( w*h*c*sizeof(float) );
  for(p = 0 ; p < (w*h*c) ; p++ )
  {
    ret[p]=((float)i[p])*(1.0f/255.0f);
  }

  return ret;
}

static void * convert8tohf( unsigned char * i, int w, int h, int c )
{
  stbir__FP16 * ret;
  int p;

  ret = malloc( w*h*c*sizeof(stbir__FP16) );
  for(p = 0 ; p < (w*h*c) ; p++ )
  {
    ret[p]=stbir__float_to_half(((float)i[p])*(1.0f/255.0f));
  }

  return ret;
}

static void * convert8tohff( unsigned char * i, int w, int h, int c )
{
  float * ret;
  int p;

  ret = malloc( w*h*c*sizeof(float) );
  for(p = 0 ; p < (w*h*c) ; p++ )
  {
    ret[p]=stbir__half_to_float(stbir__float_to_half(((float)i[p])*(1.0f/255.0f)));
  }

  return ret;
}

static int isprime( int v )
{
  int i;

  if ( v <= 3 )
    return ( v > 1 );
  if ( ( v & 1 ) == 0 )
    return 0;
  if ( ( v % 3 ) == 0 )
    return 0;
  i = 5;
  while ( (i*i) <= v )
  {
    if ( ( v % i ) == 0 )
      return 0;
    if ( ( v % ( i + 2 ) ) == 0 )
      return 0;
    i += 6;
  }

  return 1;
}

static int getprime( int v )
{
  int i;
  i = 0;
  for(;;)
  {
    if ( i >= v )
      return v;  // can't find any, just return orig
    if (isprime(v - i))
      return v - i;
    if (isprime(v + i))
      return v + i;
    ++i;
  }
}


int main( int argc, char ** argv )
{
  int ix, iy, ic;
  unsigned char * input[6];
  char * ir1;
  char * ir2;
  int szhs[3];
  int szws[3];
  int aw, ah, ac;
  unsigned char * correctalpha;
  int layouts, types, heights, widths, edges, filters;

  if ( argc != 2 )
  {
    printf("command: stbirtest [imagefile]\n");
    exit(1);
  }

  SetupTM( "127.0.0.1" );

  correctalpha = stbi_load( "correctalpha.png", &aw, &ah, &ac, 0 );

  input[0] = stbi_load( argv[1], &ix, &iy, &ic, 0 );
  input[1] = input[0];
  input[2] = convert8to16( input[0], ix, iy, ic );
  input[3] = convert8tof( input[0], ix, iy, ic );
  input[4] = convert8tohf( input[0], ix, iy, ic );
  input[5] = convert8tohff( input[0], ix, iy, ic );

  printf("Input %dx%d (%d channels)\n",ix,iy,ic);

  ir1 = malloc( 4 * 4 * 3000 * 3000ULL );
  ir2 = malloc( 4 * 4 * 3000 * 3000ULL );

  szhs[0] = getprime( iy/SHRINKBYH );
  szhs[1] = iy;
  szhs[2] = getprime( iy*ZOOMBYH );

  szws[0] = getprime( ix/SHRINKBYW );
  szws[1] = ix;
  szws[2] = getprime( ix*ZOOMBYW );

  #if 1
  for( types = TYPESTART ; types <= TYPEEND ; types++ )
  #else
  for( types = 1 ; types <= 1 ; types++ )
  #endif
  {
    ENTER( "Test type: %s",typestrs[types]);
    #if 1
    for( layouts = LAYOUTSTART ; layouts <= LAYOUTEND ; layouts++ )
    #else
    for( layouts = 16; layouts <= 16 ; layouts++ )
    #endif
    {
      ENTER( "Test layout: %s",buffstrs[layouts]);
      
      #if 0
      for( heights = HEIGHTSTART ; heights <= HEIGHTEND ; heights++ )
      {
        int w, h = szhs[heights];
      #else
      for( heights = 0 ; heights <= 11 ; heights++ )
      {
        static int szhsz[12]={32, 200, 350, 400, 450, 509, 532, 624, 700, 824, 1023, 2053 };
        int w, h = szhsz[heights];
      #endif

        ENTER( "Test height: %d %s %d",iy,(h<iy)?"Down":((h>iy)?"Up":"Same"),h);
  
        #if 0
        for( widths = WIDTHSTART ; widths <= WIDTHEND ; widths++ )
        {
          w = szws[widths];
        #else
        for( widths = 0 ; widths <= 12 ; widths++ )
        {
          static int szwsz[13]={2, 32, 200, 350, 400, 450, 509, 532, 624, 700, 824, 1023, 2053 };
          w = szwsz[widths];
        #endif

          ENTER( "Test width: %d %s %d",ix, (w<ix)?"Down":((w>ix)?"Up":"Same"), w);

          #if 0
          for( edges = EDGESTART ; edges <= EDGEEND ; edges++ )
          #else
          for( edges = 0 ; edges <= 0 ; edges++ )
          #endif
          {
            ENTER( "Test edge: %s",edgestrs[edges]);
            #if 0
            for( filters = FILTERSTART ; filters <= FILTEREND ; filters++ )
            #else
            for( filters = 3 ; filters <= 3 ; filters++ )
            #endif
            {
              int op, opw, np,npw, c, a;
              #ifdef COMPARE_SAME                        
              int oldtypes = types;
              #else
              int oldtypes = (types==4)?3:types;
              #endif
            
              ENTER( "Test filter: %s",fltstrs[filters]);
              {
                c = bchannels[layouts];
                a = alphapos[layouts];

                op = w*tsizes[oldtypes]*c + 60;
                opw = w*tsizes[oldtypes]*c;

                np = w*tsizes[types]*c + 60;
                npw = w*tsizes[types]*c;

                printf( "%s:layout: %s  w: %d h: %d edge: %s filt: %s\n", typestrs[types],buffstrs[layouts], w, h, edgestrs[edges], fltstrs[filters] );


                // clear pixel area to different, right edge to zero
                #ifndef NOCLEAR
                ENTER( "Test clear padding" );
                {
                  int d;
                  for( d = 0 ; d < h ; d++ )
                  {
                    int oofs = d * op;
                    int nofs = d * np;
                    memset( ir1 + oofs, 192, opw );
                    memset( ir1 + oofs+opw, 79, op-opw );
                    memset( ir2 + nofs, 255, npw );
                    memset( ir2 + nofs+npw, 79, np-npw );
                  }
                }
                LEAVE();

                #endif

                #ifdef COMPARE_SAME                        
                #define TIMINGS 1
                #else 
                #define TIMINGS 1
                #endif
                ENTER( "Test both" );
                {
                  #ifndef PROFILE_NEW_ONLY
                    {
                      int ttt, max = 0x7fffffff;
                      ENTER( "Test old" );
                      for( ttt = 0 ; ttt < TIMINGS ; ttt++ )
                      {
                        int64 m = __cycles();

                        oresize( ir1, w, h, op, 
                        #ifdef COMPARE_SAME                        
                        input[types], 
                        #else
                        input[(types==4)?5:types], 
                        #endif
                        ix, iy, ix*ic*tsizes[oldtypes], layouts, oldtypes, edges, filters );

                        m = __cycles() - m;
                        if ( ( (int)m ) < max )
                        max = (int) m;
                      }
                      LEAVE();
                      printf("old: %d\n", max );
                    }
                  #endif
     
                  {
                    int ttt, max = 0x7fffffff, maxv = 0x7fffffff;
                    ENTER( "Test new" );
                    for( ttt = 0 ; ttt < TIMINGS ; ttt++ )
                    {
                      int64 m = __cycles();

                      nresize( ir2, w, h, np, input[types], ix, iy, ix*ic*tsizes[types], layouts, types, edges, filters );
        
                      m = __cycles() - m;
                      if ( ( (int)m ) < max )
                        max = (int) m;
                      if ( ( (int)vert ) < maxv )
                        maxv = (int) vert;
                    }
                    LEAVE(); // test new
                    printf("new: %d (v: %d)\n", max, maxv );
                  }
                }
                LEAVE();  // test both

                if ( mem_count!= 0 )
                  stop();

              #ifndef NOCOMP
                ENTER( "Test compare" );
                {
                  int x,y,ch;
                  int nums = 0;
                  for( y = 0 ; y < h ; y++ )
                  {
                    for( x = 0 ; x < w ; x++ )
                    {
                      switch(types)
                      {
                        case 0:
                        case 1: //SRGB 
                        {
                          unsigned char * p1 = (unsigned char *)&ir1[y*op+x*c];
                          unsigned char * p2 = (unsigned char *)&ir2[y*np+x*c];
                          for( ch = 0 ; ch < c ; ch++ )
                          {
                            float pp1,pp2,d;
                            float av = (a==-1)?1.0f:((float)p1[a]/255.0f);

                            pp1 = p1[ch];
                            pp2 = p2[ch];

                            // compare in premult space
                            #ifndef COMPARE_SAME                        
                            if ( ( ( layouts >=4 ) && ( layouts <= 7 ) ) || ( ( layouts >=16 ) && ( layouts <= 19 ) ) )
                            {
                              pp1 *= av;
                              pp2 *= av;
                            }
                            #endif

                            d = pp1 - pp2;
                            if ( d < 0 ) d = -d;

                            #ifdef COMPARE_SAME                        
                            if ( d > 0 ) 
                            #else 
                            if ( d > 1 )
                            #endif
                            {
                              printf("Error at %d x %d (chan %d) (d: %g a: %g) [%d %d %d %d] [%d %d %d %d]\n",x,y,ch, d,av, p1[0],p1[1],p1[2],p1[3], p2[0],p2[1],p2[2],p2[3]);
                              ++nums;
                              if ( nums > 16 ) goto ex;
                              //if (d) exit(1);
                              //goto ex;
                            }
                          }  
                        }
                        break;

                        case 2:
                        {
                          unsigned short * p1 = (unsigned short *)&ir1[y*op+x*c*sizeof(short)];
                          unsigned short * p2 = (unsigned short *)&ir2[y*np+x*c*sizeof(short)];
                          for( ch = 0 ; ch < c ; ch++ )
                          {
                            float thres,pp1,pp2,d;
                            float av = (a==-1)?1.0f:((float)p1[a]/65535.0f);

                            pp1 = p1[ch];
                            pp2 = p2[ch];

                            // compare in premult space
                            #ifndef COMPARE_SAME                        
                            if ( ( ( layouts >=4 ) && ( layouts <= 7 ) ) || ( ( layouts >= 16 ) && ( layouts <= 19 ) ) )
                            {
                              pp1 *= av;
                              pp2 *= av;
                            }
                            #endif

                            d = pp1 - pp2;
                            if ( d < 0 ) d = -d;

                            thres=((float)p1[ch]*0.007f)+2.0f;
                            if (thres<4) thres = 4;
  
                            #ifdef COMPARE_SAME                        
                            if ( d > 0 ) 
                            #else 
                            if ( d > thres)
                            #endif
                            {
                              printf("Error at %d x %d (chan %d) %d %d [df: %g th: %g al: %g] (%d %d %d %d) (%d %d %d %d)\n",x,y,ch, p1[ch],p2[ch],d,thres,av,p1[0],p1[1],p1[2],p1[3],p2[0],p2[1],p2[2],p2[3]);
                              ++nums;
                              if ( nums > 16 ) goto ex;
                              //if (d) exit(1);
                              //goto ex;
                            }
                          }
                        }
                        break;

                        case 3:
                        {
                          float * p1 = (float *)&ir1[y*op+x*c*sizeof(float)];
                          float * p2 = (float *)&ir2[y*np+x*c*sizeof(float)];
                          for( ch = 0 ; ch < c ; ch++ )
                          {
                            float pp1 = p1[ch], pp2 = p2[ch];
                            float av = (a==-1)?1.0f:p1[a];
                            float thres, d;

                            // clamp
                            if (pp1<=0.0f) pp1 = 0;
                            if (pp2<=0.0f) pp2 = 0;
                            if (av<=0.0f) av = 0;
                            if (pp1>1.0f) pp1 = 1.0f;
                            if (pp2>1.0f) pp2 = 1.0f;
                            if (av>1.0f) av = 1.0f;

                            // compare in premult space
                            #ifndef COMPARE_SAME                        
                            if ( ( ( layouts >=4 ) && ( layouts <= 7 ) ) || ( ( layouts >= 16 ) && ( layouts <= 19 ) ) )
                            {
                              pp1 *= av;
                              pp2 *= av;
                            }
                            #endif

                            d = pp1 - pp2;
                            if ( d < 0 ) d = -d;

                            thres=(p1[ch]*0.002f)+0.0002f;
                            if ( thres < 0 ) thres = -thres;

                            #ifdef COMPARE_SAME                        
                            if ( d != 0.0f ) 
                            #else 
                            if ( d > thres )
                            #endif
                            {
                              printf("Error at %d x %d (chan %d) %g %g [df: %g th: %g al: %g] (%g %g %g %g) (%g %g %g %g)\n",x,y,ch, p1[ch],p2[ch],d,thres,av,p1[0],p1[1],p1[2],p1[3],p2[0],p2[1],p2[2],p2[3]);
                              ++nums;
                              if ( nums > 16 ) goto ex;
                              //if (d) exit(1);
                              //goto ex;
                            }
                          }
                        }
                        break;

                        case 4:
                        {
                          #ifdef COMPARE_SAME                        
                          stbir__FP16 * p1 = (stbir__FP16 *)&ir1[y*op+x*c*sizeof(stbir__FP16)];
                          #else
                          float * p1 = (float *)&ir1[y*op+x*c*sizeof(float)];
                          #endif
                          stbir__FP16 * p2 = (stbir__FP16 *)&ir2[y*np+x*c*sizeof(stbir__FP16)];
                          for( ch = 0 ; ch < c ; ch++ )
                          {
                            #ifdef COMPARE_SAME                        
                            float pp1  = stbir__half_to_float(p1[ch]);
                            float av = (a==-1)?1.0f:stbir__half_to_float(p1[a]);
                            #else
                            float pp1 = stbir__half_to_float(stbir__float_to_half(p1[ch]));
                            float av = (a==-1)?1.0f:stbir__half_to_float(stbir__float_to_half(p1[a]));
                            #endif
                            float pp2 = stbir__half_to_float(p2[ch]);
                            float d, thres;

                            // clamp
                            if (pp1<=0.0f) pp1 = 0;
                            if (pp2<=0.0f) pp2 = 0;
                            if (av<=0.0f) av = 0;
                            if (pp1>1.0f) pp1 = 1.0f;
                            if (pp2>1.0f) pp2 = 1.0f;
                            if (av>1.0f) av = 1.0f;

                            thres=(pp1*0.002f)+0.0002f;

                            // compare in premult space
                            #ifndef COMPARE_SAME                        
                            if ( ( ( layouts >=4 ) && ( layouts <= 7 ) ) || ( ( layouts >= 16 ) && ( layouts <= 19 ) ) )
                            {
                              pp1 *= av;
                              pp2 *= av;
                            }
                            #endif

                            d = pp1 - pp2;
                            if ( d < 0 ) d = -d;


                            #ifdef COMPARE_SAME                        
                            if ( d != 0.0f ) 
                            #else 
                            if ( d > thres )
                            #endif
                            {
                              printf("Error at %d x %d (chan %d) %g %g [df: %g th: %g al: %g] (%g %g %g %g) (%g %g %g %g)\n",x,y,ch, 
                              #ifdef COMPARE_SAME                        
                              stbir__half_to_float(p1[ch]), 
                              #else
                              p1[ch],
                              #endif
                              stbir__half_to_float(p2[ch]), 
                              d,thres,av,
                              #ifdef COMPARE_SAME                        
                              stbir__half_to_float(p1[0]),stbir__half_to_float(p1[1]),stbir__half_to_float(p1[2]),stbir__half_to_float(p1[3]),
                              #else
                              p1[0],p1[1],p1[2],p1[3],
                              #endif
                              stbir__half_to_float(p2[0]),stbir__half_to_float(p2[1]),stbir__half_to_float(p2[2]),stbir__half_to_float(p2[3]) );
                              ++nums;
                              if ( nums > 16 ) goto ex;
                              //if (d) exit(1);
                              //goto ex;
                            }
                          }
                        }
                        break;
                      }
                    }

                    for( x = (w*c)*tsizes[oldtypes]; x < op; x++ )
                    {
                      if ( ir1[y*op+x] != 79 )
                      {
                        printf("Margin error at %d x %d %d (should be 79) OLD!\n",x,y,(unsigned char)ir1[y*op+x]);
                        goto ex;
                      }
                    }

                    for( x = (w*c)*tsizes[types]; x < np; x++ )
                    {
                      if ( ir2[y*np+x] != 79 )
                      {
                        printf("Margin error at %d x %d %d (should be 79) NEW\n",x,y,(unsigned char)ir2[y*np+x]);
                        goto ex;
                      }
                    }
                  }
                
                  ex:
                  ENTER( "OUTPUT IMAGES" );
                  printf("  tot pix: %d, errs: %d\n", w*h*c,nums );

                  if (nums)
                  {
                    stbi_write_png("old.png", w, h, c, ir1, op);
                    stbi_write_png("new.png", w, h, c, ir2, np);
                    exit(1);
                  }

                  LEAVE(); // output images
                }
                LEAVE(); //test compare
              #endif



              }
              LEAVE(); // test filter
            }
            LEAVE(); // test edge
          }
          LEAVE(); // test width
        }
        LEAVE(); // test height
      }
      LEAVE(); // test type
    }
    LEAVE();  // test layout
  }

  CloseTM();
  return 0;
}