/** COMPILE: ( you will need to change the paths probably )

nvcc --host-compilation c++ --compiler-options "-I../include -I$HOME/NVIDIA_CUDA_SDK/common/inc" --linker-options "-L../ -lopenvidia -lglut -lGLEW -lstdc++ -lccext2 -lccgnu2" V4L2CUDA.cu

about this example:  

   This example shows how to process camera input using NVIDIA CUDA.
   It requires CUDA capable hardware (GeForce 8xxx and up), and the 
     CUDA Toolkit and SDK installed (available from developer.nvidia.com)

   This example does YUYV to RGB32 conversion on an
   image from a V4L2 camera source. This was tested with a Logitech Quickcam Pro
   using the uvcvideo driver.
  
   The example shows how to put the image into a CUDA array, then access it using 
   the CUDA texture 'path', which converts the unsigned char data to floats when 
   accessing it.

   To send the results back to OpenGL, we use a OpenGL Pixel Buffer Object.
   We create a PBO as usual in OpenGL, then retrieve a pointer to it for CUDA to use. 
   the CUDA kernel then reads from the texture, and writes to this pointer. 
   Afterwards we draw on screen in OpenGL, using the PBO to texture a quad.
  
   See also the imageDenoising example from the NVIDIA CUDA SDK.
*/
#define USE_CAMERA 

#include <GL/glew.h> 

#include <stdio.h> 
#include <assert.h> 
#include <iostream> 

#include <cutil.h> 
#include <cutil_gl_error.h> 
#include <cuda_gl_interop.h> 

#ifdef USE_CAMERA 
#include <openvidia/openvidia32.h> 
#else 
#include <GL/glut.h> 
#endif 

using namespace std;
#define BUFFER_OFFSET(i) ((char *)NULL + (i))   

int width = 640;
int height = 480;
int texSize;

GLuint bufferObj;  // pixel buffer object to use for texturing
GLuint texObj;
unsigned int  *GPU_PBOPtr = NULL;
unsigned char *Img = NULL;

cudaArray *a_Src;


// create a CUDA texture, note a CUDA texture is different from an OpenGL texture
// section 4.3.4.1: CUDA textures must be 1,2,4 component vectors
// here, we specify the storage format of the texture
// we'll do a memcpy into this texture later

// components are  uchar2's:  [ Y U | Y V | Y U | Y V ... ]
// 2 dimensional
// when read, return normalized floats, this is 
//  how the unsigned char to floating point conversion is done
texture<uchar2, 2, cudaReadModeNormalizedFloat> texImage;

// create a descriptor, we'll use it to specify what type of
// memory to malloc
cudaChannelFormatDesc uchar2tex = cudaCreateChannelDesc<uchar2>();



#ifdef USE_CAMERA 
V4L2 *CamSource;
#endif 

void showstats() {
  static int lasttime;
  static int fpscounter;
  static int fps;
  int curtime;
  curtime = time(NULL);
  if( lasttime != curtime) {
    fps=fpscounter;
    fpscounter=1;
    lasttime = curtime;
    fprintf(stderr, "fps = %d,  %f msecs\n",fps,1.0/((float)fps)*1000);

  } else {
    fpscounter++;
  }
}

static GLenum errCode;
const GLubyte *errString;

void errcheck() {

  if ((errCode = glGetError()) != GL_NO_ERROR) {
    errString = gluErrorString(errCode);
    fprintf (stderr, "OpenGL Error: %s\n", errString);
    exit(1);
  }

}

void reshape(int w, int h)
{
  glClearColor (0.0, 0.0, 0.0, 0.0);
  glViewport(0, 0, (GLsizei) w, (GLsizei) h);
  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();

  glFrustum(0.0, 1.0,  0.0, 1.0,   1.0,   100.0);

  gluLookAt(0.0,0.0,0.0,  0.0, 0.0,  -1.0,   0.0, 1.0, 0.0);

  glMatrixMode(GL_MODELVIEW);

  glLoadIdentity();
  glutPostRedisplay();
}

void myIdle(){
  glutPostRedisplay();
}

void keyboard (unsigned char key, int x, int y)
{
   switch (key) {
      case 27:
         cudaGLUnregisterBufferObject( bufferObj );
         glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0);
         glDeleteBuffers(1, &bufferObj );
         glDeleteTextures(1, &texObj );
         cudaFreeArray( a_Src );
         exit(0);
         break;
      default:
         break;
   }
}

void MouseFunc( int button, int state, int x, int y)
{
  switch(button) {
    case GLUT_LEFT_BUTTON :
      break;
    case GLUT_RIGHT_BUTTON :
      break;
  }
}

#define BLOCKDIM_X 8 
#define BLOCKDIM_Y 8 


/* duplicates a single component (this function not used) */
__device__ unsigned int make_color(float r, float g, float b, float a){
    return
        ((int)(r * 255.0f) << 24) |
        ((int)(r * 255.0f) << 16) |
        ((int)(r * 255.0f) <<  8) |
        ((int)(r * 255.0f) <<  0);
}

/* testing function: duplicate a luminance value into a greyscale RGB32 pixel */
/* (this function not used) */
__device__ unsigned int make_luminance(float y){
    return
        ((int)(y * 255.0f) << 24) |
        ((int)(y * 255.0f) << 16) |
        ((int)(y * 255.0f) <<  8) |
        ((int)(y * 255.0f) <<  0);
}

/*
YUV to RGB Conversion
    B = 1.164(Y - 16)                   + 2.018(U - 128)
    G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
    R = 1.164(Y - 16) + 1.596(V - 128)

   convert a YUYV pixel to a RGB32 pixel. 
   not necessarily the fastest implementation but fast enough
   given the h/w and easy to read 
 */
__device__ unsigned int make_rgb32(float y, float u, float v){

    float l = 1.164 * ( y - 16.0f/255.0f );
    float m = v - 0.5;
    float n = u - 0.5;

    float b = l             + 2.018 * n;
    float g = l - 0.813 * m - 0.391 * n;
    float r = l + 1.596 * m;
    
    r = min( r, 1.0);
    g = min( g, 1.0);
    b = min( b, 1.0);

    r = max( r, 0.0);
    g = max( g, 0.0);
    b = max( b, 0.0);

    return
        ((int)(1.0 * 255.0f) << 24) |
        ((int)(b * 255.0f) << 16) |
        ((int)(g * 255.0f) <<  8) |
        ((int)(r * 255.0f) <<  0);
}


// convert a YUYV input texture into a RGB32 floating point array 
// recall YUYV texture components are  uchar2's:  [ Y U | Y V | Y U | Y V ... ]
// so on even numbered columns, use yuv as:  [ Y U |    V | ... ]
// and on odd numbered columns, use yuv as:  [   U | Y  V | ... ]
__global__ void AddConst( unsigned int *dst, int imageW, int imageH )
{
    const int ix = blockDim.x * blockIdx.x + threadIdx.x;
    const int iy = blockDim.y * blockIdx.y + threadIdx.y;
    //Add half of a texel to always address exact texel centers
    const float x = (float)ix + 0.5f;
    const float y = (float)iy + 0.5f;

    if(ix < imageW && iy < imageH){
        float2 fresultYU;
        float2 fresultYV;

        if( ix % 2  == 0  ) {
            fresultYU = tex2D(texImage, x, y);
            fresultYV = tex2D(texImage, x+1, y);
            dst[imageW * iy + ix] = make_rgb32( fresultYU.x, 
                                                    fresultYU.y,
                                                    fresultYV.y );
        } else {
            fresultYU = tex2D(texImage, x-1, y);
            fresultYV = tex2D(texImage, x, y);
            dst[imageW * iy + ix] = make_rgb32( fresultYV.x, 
                                                    fresultYU.y,
                                                    fresultYV.y );
        }

    }
}

int iDivUp(int a, int b){
    return ((a % b) != 0) ? (a / b + 1) : (a / b);
}


void render_redirect() {
  float d =  -1.0;

   // 1. copy the image to the array
   //    this does the PCI-E transfer to the card
   cudaMemcpyToArray(
                a_Src, 0, 0,
                CamSource->ptr(), width * height * sizeof(uchar2),
                cudaMemcpyHostToDevice
            );

   // 2. get a pointer to the GL Pixel Buffer
   //    this is where the resuls are written
   cudaGLMapBufferObject( (void **)&GPU_PBOPtr, bufferObj );
 
   // 3. Bind our texture reference to the array
   //    This lets the CUDA kernel access the array using tex2D() 
   cudaBindTextureToArray( texImage, a_Src );

   // 4. Set up the kernal launch dimensions 
   dim3 threads(BLOCKDIM_X, BLOCKDIM_Y);
   dim3 grid(iDivUp(width, BLOCKDIM_X), iDivUp(height, BLOCKDIM_Y));

   // 5. Launch the kernel.
   //    This reads from the array, and writes to GPU_PBOPtr: a pointer
   //      to the Pixel Buffer Object
   AddConst<<<grid,threads>>>( GPU_PBOPtr, width, height );

   // 6. Release resources from CUDA
   cudaUnbindTexture(texImage);
   cudaGLUnmapBufferObject( bufferObj );


   // 7. Now, back in OpenGL, recall we have the texture texObj
   //    active and the pixel buffer bufferObj active, so this 
   //    texsubImage writes to the pixel buffer bound to the 
   //    texture texObj

  glTexSubImage2D( GL_TEXTURE_RECTANGLE_NV, 0, 0, 0, width, height, 
                   GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_OFFSET(0) );

  // 8. draw the 'tex' texture containing the PBO data.
  glBegin(GL_QUADS); 

    glTexCoord2f(0, height);
    glVertex3f(0.0, 0.0,d );

    glTexCoord2f(0, 0);
    glVertex3f(0.0, 1.0, d);

    glTexCoord2f(width, 0);
    glVertex3f(1.0,1.0, d);

    glTexCoord2f(width,height);
    glVertex3f(1.0, 0.0, d );
  glEnd();

  glutSwapBuffers();
  showstats();
}

#ifdef USE_CAMERA 
void init_camera() {
   CamSource = new V4L2( 640, 480 );
//   width = CamSource->width();
//   height = CamSource->height();
}

#endif 


unsigned char *makeImage( int w, int h, int n ) 
{

   unsigned char *d;
   //allocate pinned memory for this buffer for faster bus transfer
   cudaMallocHost((void **)&d, w*h*n);
   unsigned char *p = d;
   for( int i=0; i<h ; i++ ) {
     for( int j=0; j<w ; j++ ) {
       *p++ = 128;
       *p++ = 228;
       *p++ = 228;
       *p++ = 228;
     }
   }
   return d;
}

int main(int argc, char *argv[] )  {

   cerr<<"Actual window created : "<<width<<"x"<<height<<endl;
   
   glutInit(&argc, argv);
   glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH | GLUT_ALPHA );
   glutInitWindowSize(width, height );
   glutCreateWindow(argv[0]);
   glewInit();
#ifdef USE_CAMERA 
   init_camera(); 
#endif 
   glutReshapeWindow( width, height );

   // make a test image
   Img = makeImage( width, height, 4 );

   // make a texture
   cerr<<" Making texture "<<endl;
   glEnable(GL_TEXTURE_RECTANGLE_NV);
   glGenTextures(1, &texObj);
   glBindTexture(GL_TEXTURE_RECTANGLE_NV, texObj);
   glTexEnvi( GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE,  GL_REPLACE );
   glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA8, width,height, 0,
           GL_RGBA, GL_UNSIGNED_BYTE,Img);
   glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
   glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);


   //make a pixel buffer object (PBO) 
   cerr<<" PBO "<<endl;
   texSize = width*height*4;
   glGenBuffers( 1, &bufferObj ); errcheck();
   glBindBuffer(GL_PIXEL_UNPACK_BUFFER_EXT, bufferObj); errcheck();
   glBufferData(GL_PIXEL_UNPACK_BUFFER_EXT, width*height*4, NULL,  GL_STREAM_COPY); errcheck();


   // make the pixel buffer accessible to CUDA, so CUDA can write to it
   cerr<<"Registering buffer"<<endl;
   cudaGLRegisterBufferObject( bufferObj );
   cerr<<" PBO OK "<<endl;

   // make a cuda array, used to load up a texture with data
   cerr<<" CUDA array "<<endl;
   cudaMallocArray(&a_Src, &uchar2tex, width, height) ;
   cerr<<" malloc ok "<<endl;

   // transfer initial data to array
   cudaMemcpyToArray(
                a_Src, 0, 0,
                Img, width * height * sizeof(uchar2),
                cudaMemcpyHostToDevice );
   cerr<<"memcpy ok"<<endl;
   cerr<<"array OK "<<endl;


   // quick error check.
   fprintf(stderr, "any errors?: %s\n", cudaGetErrorString( cudaGetLastError() )) ;
   cerr<<"register ok "<<endl;

   errcheck();
    
   glutDisplayFunc(render_redirect);
   glutIdleFunc(myIdle);
   glutReshapeFunc(reshape);
   glutKeyboardFunc(keyboard);
   glutMouseFunc(MouseFunc);

   cerr<<" init ok"<<endl;
   glutMainLoop();
   return 0;
}