Requirements
- Basic knowledge about C programming
- CUDA supported device (CUDA GPU list)
- Installed CUDA SDK
- Eclipse/nvidia nsight (nsight recommended)
Goal
The aim of this article is to get acquainted with bilinear interpolation technique and implement demo program which is going to scale given input image using nvidia CUDA SDK. The concept of interpolation is not explained here.
Optimus (hybrid graphics card)
What if I have CUDA compatibile device as a secondary graphics card (optimus)?
First of all (if you haven’t done it yet) install Bumblebee deamon.
It’s available on many popular distros, for instance:
# Archlinux
pacman -S bumblebee
# Ubuntu 12.04
sudo add-apt-repository ppa:bumblebee/stable
sudo apt-get update
sudo apt-get install bumblebee virtualgl linux-headers-generic
After reboot perform:
sudo tee /proc/acpi/bbswitch <<< ON
modprobe nvidia
Turned on nvidia device will shorten your baterry life 🙂 Be aware of that and plug in.
Bit operations in brief
Let's consider the situation that we have ARGB pixel located in memory and we want to get out the blue channel. How to do that?
// Declare ARGB variable
Uint32 color = 0xffd5b2c4;
Ok! What is that 0xff....??? This is a ARGB color written in hexadecimal form. Hmm... So how to read this?
/*
Each of channels (alpha, red, green, blue) has value from 0 to 255. 1 byte is sufficeient to store the number, so on
one octet (8 bits) you can write one channel.
After decomposition:
*/
Uint8 a = 0xff; // 255
Uint8 r = 0xd5; // 213
Uint8 g = 0xb2; // 178
Uint8 b = 0xc4; // 196
/*
How many bits do we need to store all four channels? 4*8 = 32 bits. Now it's perfectly clear the idea of Uint32
"color" variable.
*/
Let's pull out channels from "color" variable
printf("Blue: 0x%02x", (color && 0xff));
printf("Green: 0x%02x", ((color >> 8) && 0xff));
printf("Red: 0x%02x", ((color >> 16) && 0xff));
printf("Alpha: 0x%02x", ((color >> 24) && 0xff));
&& 0xff" is a binary AND mask which extracts only last 2 octets.
Simple DirectMedia Layer (SDL)
We're gonna use SDL to read/write image from/to file. SDL is natively written in C, so I'm convinced that is good choice.
In order to have SDL working, give linker a clue where to look for libs
# Library search path
-L /usr/lib/
# Library
-l SDL
-l SDL_image
Read image
With the knowlege about bit operations, we'll make an attempt to read an image from disk using SDL library.
SDL_Surface *image = IMG_Load ("test.jpg");
if (!image){
printf ( "IMG_Load: %s\n", IMG_GetError () );
return 1;
}
Trivial isn't it?
Prepare new image
Ok, we have successfully read the image. Let's create the "output" sufrace where new image will be stored.
// Bit masks
Uint32 amask = 0xff000000;
Uint32 rmask = 0x00ff0000;
Uint32 gmask = 0x0000ff00;
Uint32 bmask = 0x000000ff;
// New width of image
int rWidth = 3000;
int newWidth = image->w + (rWidth-image->w);
int newHeight = image->h + (rWidth-image->w);
// Create scaled image surface
SDL_Surface *newImage = SDL_CreateRGBSurface(SDL_SWSURFACE, newWidth, newHeight, 32, rmask, gmask, bmask, amask);
Why BitsPerPixel == 32? In this case im 100% percent sure that the output will use an ARGB pallete of colours.
CUDA error handler
Before we start writting CUDA kernel function it is very convinient (and saves a lot of time for debugging) to create an error handler method for CUDA kernels.
void cudasafe(int error, char* message, char* file, int line) {
if (error != cudaSuccess) {
fprintf(stderr, "CUDA Error: %s : %i. In %s line %d\n", message, error, file, line);
exit(-1);
}
}
Memory allocation (CUDA)
It's time to copy and preserve some memory on CUDA device.
// Get output image size
int newImageByteLength = newImage->w * newImage->h * sizeof(Uint8)*newImage->format->BytesPerPixel;
// Create pointer to device and host pixels
Uint8 *pixels = (Uint8*)image->pixels;
Uint8 *pixels_dyn;
// Copy original image
cudasafe(cudaMalloc((void **) &pixels_dyn, imageByteLength),"Original image allocation ",__FILE__,__LINE__);
cudasafe(cudaMemcpy(pixels_dyn, pixels, imageByteLength, cudaMemcpyHostToDevice),"Copy original image to device ",__FILE__,__LINE__);
// Allocate new image on DEVICE
Uint8 *newPixels_dyn;
Uint8 *newPixels = (Uint8*)malloc(newImageByteLength);
cudasafe(cudaMalloc((void **) &newPixels_dyn, newImageByteLength),"New image allocation ",__FILE__,__LINE__);
"Transform" the kernel function
Eventually, we will touch the "heart" of the program. Transform kernel function will take four corresponding pixels and based on that do specific calculations. If you want to know more about interpolation I recommend you this article
__global__ void cudaTransform(Uint8 *output, Uint8 *input, Uint16 pitchOutput, Uint16 pitchInput, Uint8 bytesPerPixelInput, Uint8 bytesPerPixelOutput, float xRatio, float yRatio){
int x = (int) (xRatio * blockIdx.x);
int y = (int) (yRatio * blockIdx.y);
Uint8 *a; Uint8 *b; Uint8 *c; Uint8 *d;
float xDist, yDist, blue, red, green;
// X and Y distance difference
xDist = (xRatio * blockIdx.x) - x;
yDist = (yRatio * blockIdx.y) - y;
// Points
a = input + y * pitchInput + x * bytesPerPixelInput;
b = input + y * pitchInput + (x+1) * bytesPerPixelInput;
c = input + (y+1) * pitchInput + x * bytesPerPixelInput;
d = input + (y+1) * pitchInput + (x+1) * bytesPerPixelInput;
// blue
blue = (a[2])*(1 - xDist)*(1 - yDist) + (b[2])*(xDist)*(1 - yDist) + (c[2])*(yDist)*(1 - xDist) + (d[2])*(xDist * yDist);
// green
green = ((a[1]))*(1 - xDist)*(1 - yDist) + (b[1])*(xDist)*(1 - yDist) + (c[1])*(yDist)*(1 - xDist) + (d[1])*(xDist * yDist);
// red
red = (a[0])*(1 - xDist)*(1 - yDist) + (b[0])*(xDist)*(1 - yDist) + (c[0])*(yDist)*(1 - xDist) + (d[0])*(xDist * yDist);
Uint8 *p = output + blockIdx.y * pitchOutput + blockIdx.x * bytesPerPixelOutput;
*(Uint32*)p = 0xff000000 | ((((int)red) << 16)) | ((((int)green) << 8)) | ((int)blue);
}
Save image
After saving an image, we would like to keep environment clean, so deallocate memory by cudaFree function.
// Copy scaled image to host
cudasafe(cudaMemcpy(newPixels, newPixels_dyn, newImageByteLength, cudaMemcpyDeviceToHost),"from device to host", __FILE__, __LINE__);
newImage->pixels = newPixels;
// Free memory
cudaFree(pixels_dyn);
cudaFree(newPixels_dyn);
//Save image
SDL_SaveBMP(newImage, "test2.bmp");
// Free surfaces
SDL_FreeSurface (image);
SDL_FreeSurface (newImage);
SDL_Quit();
Summary
All in one.
void cudasafe(int error, char* message, char* file, int line) {
if (error != cudaSuccess) {
fprintf(stderr, "CUDA Error: %s : %i. In %s line %d\n", message, error, file, line);
exit(-1);
}
}
__global__ void cudaTransform(Uint8 *output, Uint8 *input, Uint16 pitchOutput, Uint16 pitchInput, Uint8 bytesPerPixelInput, Uint8 bytesPerPixelOutput, float xRatio, float yRatio){
int x = (int) (xRatio * blockIdx.x);
int y = (int) (yRatio * blockIdx.y);
Uint8 *a; Uint8 *b; Uint8 *c; Uint8 *d;
float xDist, yDist, blue, red, green;
// X and Y distance difference
xDist = (xRatio * blockIdx.x) - x;
yDist = (yRatio * blockIdx.y) - y;
// Points
a = input + y * pitchInput + x * bytesPerPixelInput;
b = input + y * pitchInput + (x+1) * bytesPerPixelInput;
c = input + (y+1) * pitchInput + x * bytesPerPixelInput;
d = input + (y+1) * pitchInput + (x+1) * bytesPerPixelInput;
// blue
blue = (a[2])*(1 - xDist)*(1 - yDist) + (b[2])*(xDist)*(1 - yDist) + (c[2])*(yDist)*(1 - xDist) + (d[2])*(xDist * yDist);
// green
green = ((a[1]))*(1 - xDist)*(1 - yDist) + (b[1])*(xDist)*(1 - yDist) + (c[1])*(yDist)*(1 - xDist) + (d[1])*(xDist * yDist);
// red
red = (a[0])*(1 - xDist)*(1 - yDist) + (b[0])*(xDist)*(1 - yDist) + (c[0])*(yDist)*(1 - xDist) + (d[0])*(xDist * yDist);
Uint8 *p = output + blockIdx.y * pitchOutput + blockIdx.x * bytesPerPixelOutput;
*(Uint32*)p = 0xff000000 | ((((int)red) << 16)) | ((((int)green) << 8)) | ((int)blue);
}
int main(void) {
Uint32 amask = 0xff000000;
Uint32 rmask = 0x00ff0000;
Uint32 gmask = 0x0000ff00;
Uint32 bmask = 0x000000ff;
SDL_Surface *image = IMG_Load ("test.jpg");
int imageByteLength = image->w * image->h * sizeof(Uint8)*image->format->BytesPerPixel;
if (!image){
printf ( "IMG_Load: %s\n", IMG_GetError () );
return 1;
}
// New width of image
int rWidth = 3000;
int newWidth = image->w + (rWidth-image->w);
int newHeight = image->h + (rWidth-image->w);
dim3 grid(newWidth,newHeight);
// Create scaled image surface
SDL_Surface *newImage = SDL_CreateRGBSurface(SDL_SWSURFACE, newWidth, newHeight, 32, rmask, gmask, bmask, amask);
int newImageByteLength = newImage->w * newImage->h * sizeof(Uint8)*newImage->format->BytesPerPixel;
float xRatio = ((float)(image->w-1))/newImage->w;
float yRatio = ((float)(image->h-1))/newImage->h;
// Create pointer to device and host pixels
Uint8 *pixels = (Uint8*)image->pixels;
Uint8 *pixels_dyn;
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Copy original image
cudasafe(cudaMalloc((void **) &pixels_dyn, imageByteLength),"Original image allocation ",__FILE__,__LINE__);
cudasafe(cudaMemcpy(pixels_dyn, pixels, imageByteLength, cudaMemcpyHostToDevice),"Copy original image to device ",__FILE__,__LINE__);
// Allocate new image on DEVICE
Uint8 *newPixels_dyn;
Uint8 *newPixels = (Uint8*)malloc(newImageByteLength);
cudasafe(cudaMalloc((void **) &newPixels_dyn, newImageByteLength),"New image allocation ",__FILE__,__LINE__);
// Start measuring time
cudaEventRecord(start, 0);
// Do the bilinear transform on CUDA device
cudaTransform<<< grid,1 >>>(newPixels_dyn, pixels_dyn, newImage->pitch, image->pitch, image->format->BytesPerPixel, newImage->format->BytesPerPixel, xRatio, yRatio);
// Stop the timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
// Copy scaled image to host
cudasafe(cudaMemcpy(newPixels, newPixels_dyn, newImageByteLength, cudaMemcpyDeviceToHost),"from device to host", __FILE__, __LINE__);
newImage->pixels = newPixels;
// Free memory
cudaFree(pixels_dyn);
cudaFree(newPixels_dyn);
cudaEventElapsedTime(&time, start, stop);
printf ("Time for the kernel: %f ms\n", time);
//Save image
SDL_SaveBMP(newImage, "test2.bmp");
// Free surfaces
SDL_FreeSurface (image);
SDL_FreeSurface (newImage);
SDL_Quit();
}
Demo
Sources: Project files