【CUDA】C++实现warpaffine仿射变换及其逆变换
目录
- 仿射变换矩阵工具类
- 进行前向仿射变换:i->d
- 进行仿射变换逆向变换d->i
仿射变换矩阵工具类
假设有图片i,要将其仿射变换至图片d,使用下面的类计算仿射变换矩阵i2d及d2i:
调用compute函数并传入i及d的尺寸后,即可自动得到仿射变换矩阵i2d及d2i。
// Pair of 2x3 affine matrices, each stored as six floats
// {m00, m01, m02, m10, m11, m12}:
//   i2d maps coordinates of the source image i into the destination image d,
//   d2i is its inverse (d -> i).
struct AffineMatrix {
    float i2d[6];
    float d2i[6];

    // Writes the inverse of the 2x3 affine matrix `imat` into `omat`.
    // A zero determinant yields an all-zero result instead of a division by zero.
    void invertAffineTransform(float imat[6], float omat[6]) {
        // Read every input element before writing any output element.
        const float m00 = imat[0], m01 = imat[1], m02 = imat[2];
        const float m10 = imat[3], m11 = imat[4], m12 = imat[5];

        // Reciprocal of the determinant of the 2x2 linear part.
        const float det = m00 * m11 - m01 * m10;
        float inv_det;
        if (det != 0) {
            inv_det = 1.0 / det;
        } else {
            inv_det = 0;
        }

        // Adjugate of the linear part scaled by 1/det.
        const float r00 = m11 * inv_det;
        const float r11 = m00 * inv_det;
        const float r01 = -m01 * inv_det;
        const float r10 = -m10 * inv_det;

        // Inverse translation: -(R * t).
        const float t0 = -r00 * m02 - r01 * m12;
        const float t1 = -r10 * m02 - r11 * m12;

        omat[0] = r00;
        omat[1] = r01;
        omat[2] = t0;
        omat[3] = r10;
        omat[4] = r11;
        omat[5] = t1;
    }

    // Fills i2d with a uniform scale that fits `from` inside `to` and aligns the
    // two image centres, then derives d2i by inverting i2d.
    void compute(const Size& from, const Size& to) {
        const float ratio_w = to.width / (float)from.width;
        const float ratio_h = to.height / (float)from.height;

        // The smaller ratio keeps the aspect ratio and fits the whole image.
        const float scale = ratio_w < ratio_h ? ratio_w : ratio_h;

        // Offsets: centre alignment plus the (scale * 0.5 - 0.5) pixel-centre term.
        i2d[0] = scale;
        i2d[1] = 0;
        i2d[2] = -scale * from.width * 0.5 + to.width * 0.5 + scale * 0.5 - 0.5;

        i2d[3] = 0;
        i2d[4] = scale;
        i2d[5] = -scale * from.height * 0.5 + to.height * 0.5 + scale * 0.5 - 0.5;

        invertAffineTransform(i2d, d2i);
    }
};
进行前向仿射变换:i->d
将i变换到d时,用CUDA线程遍历目标图d中的每个像素,通过d2i把d上的坐标映射回i上的坐标,再在i上做双线性插值取值,因此这里应使用d2i进行变换,相关代码如下:
调用处代码:
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <stdio.h>

using namespace cv;

#define min(a, b) ((a) < (b) ? (a) : (b))
#define checkRuntime(op) __check_cuda_runtime((op), #op, __FILE__, __LINE__)

// Reports a failed CUDA runtime call.
//   code       - status returned by the call
//   op         - source text of the call (stringified by checkRuntime)
//   file, line - location of the call
// Returns true when `code` is cudaSuccess; otherwise prints the error name and
// message and returns false.
bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line) {
    if (code != cudaSuccess) {
        const char* err_name = cudaGetErrorName(code);
        const char* err_message = cudaGetErrorString(code);
        printf("runtime error %s:%d %s failed. \n code = %s, message = %s\n", file, line, op, err_name, err_message);
        return false;
    }
    return true;
}

// Declaration; the definition lives in the kernel source file.
void warp_affine_bilinear(
    uint8_t* src, int src_line_size, int src_width, int src_height,
    uint8_t* dst, int dst_line_size, int dst_width, int dst_height,
    uint8_t fill_value
);

// Warps `image` into a `size`-sized 3-channel 8-bit image on the GPU.
//   image - non-empty CV_8UC3 image
//   size  - output size, both dimensions must be positive
// Returns the warped image, or an empty Mat when the input is rejected or any
// CUDA call fails (the failure is printed by checkRuntime).
Mat warpaffine_to_center_align(const Mat& image, const Size& size) {
    if (image.empty() || image.type() != CV_8UC3) {
        printf("warpaffine_to_center_align: input must be a non-empty CV_8UC3 image\n");
        return Mat();
    }
    if (size.width <= 0 || size.height <= 0) {
        printf("warpaffine_to_center_align: invalid output size %dx%d\n", size.width, size.height);
        return Mat();
    }

    // The upload below is one flat copy, so the rows must be packed back to back.
    Mat packed = image.isContinuous() ? image : image.clone();
    Mat output(size, CV_8UC3);

    uint8_t* psrc_device = nullptr;
    uint8_t* pdst_device = nullptr;
    // Widen before multiplying so large images do not overflow int.
    size_t src_size = (size_t)packed.cols * packed.rows * 3;
    size_t dst_size = (size_t)size.width * size.height * 3;

    // Allocate both device buffers and upload the source image.
    // Each step runs only if every previous one succeeded.
    bool ok = checkRuntime(cudaMalloc(&psrc_device, src_size));
    ok = ok && checkRuntime(cudaMalloc(&pdst_device, dst_size));
    ok = ok && checkRuntime(cudaMemcpy(psrc_device, packed.data, src_size, cudaMemcpyHostToDevice));

    if (ok) {
        warp_affine_bilinear(
            psrc_device, packed.cols * 3, packed.cols, packed.rows,
            pdst_device, size.width * 3, size.width, size.height,
            114
        );
        // Check whether the kernel launch reported an error.
        ok = checkRuntime(cudaPeekAtLastError());
    }
    ok = ok && checkRuntime(cudaMemcpy(output.data, pdst_device, dst_size, cudaMemcpyDeviceToHost));

    // Always release the buffers; cudaFree on a null pointer does nothing.
    checkRuntime(cudaFree(psrc_device));
    checkRuntime(cudaFree(pdst_device));

    return ok ? output : Mat();
}

// Forward demo: reads i.jpg, warps it to 640x640 and writes the result to d.jpg.
int main() {
    Mat image = imread("i.jpg");
    if (image.empty()) {
        printf("failed to read i.jpg\n");
        return -1;
    }

    Mat output = warpaffine_to_center_align(image, Size(640, 640));
    if (output.empty() || !imwrite("d.jpg", output)) {
        printf("failed to produce d.jpg\n");
        return -1;
    }
    return 0;
}
核函数代码:
#include <cuda_runtime.h>
#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))
#define num_threads 512

typedef unsigned char uint8_t;

// Plain width/height pair used to describe image dimensions.
struct Size {
    int width = 0, height = 0;

    Size() = default;
    Size(int w, int h) : width(w), height(h) {}
};

// Pair of 2x3 affine matrices, each stored as six floats
// {m00, m01, m02, m10, m11, m12}:
//   i2d maps coordinates of the source image i into the destination image d,
//   d2i is its inverse (d -> i).
struct AffineMatrix {
    float i2d[6];
    float d2i[6];

    // Writes the inverse of the 2x3 affine matrix `imat` into `omat`.
    // A zero determinant yields an all-zero result instead of a division by zero.
    void invertAffineTransform(float imat[6], float omat[6]) {
        float i00 = imat[0]; float i01 = imat[1]; float i02 = imat[2];
        float i10 = imat[3]; float i11 = imat[4]; float i12 = imat[5];

        // Determinant of the 2x2 linear part, replaced by its reciprocal.
        float D = i00 * i11 - i01 * i10;
        D = D != 0 ? 1.0 / D : 0;

        // Adjugate of the linear part divided by the determinant.
        float A11 = i11 * D;
        float A22 = i00 * D;
        float A12 = -i01 * D;
        float A21 = -i10 * D;

        // Inverse translation: -(A * t).
        float b1 = -A11 * i02 - A12 * i12;
        float b2 = -A21 * i02 - A22 * i12;

        omat[0] = A11; omat[1] = A12; omat[2] = b1;
        omat[3] = A21; omat[4] = A22; omat[5] = b2;
    }

    // Fills i2d with a uniform scale that fits `from` inside `to` and aligns the
    // two image centres, then derives d2i by inverting i2d.
    // Both sizes must have positive dimensions.
    void compute(const Size& from, const Size& to) {
        float scale_x = to.width / (float)from.width;
        float scale_y = to.height / (float)from.height;

        // Scale factor: the smaller ratio keeps the aspect ratio and fits the
        // whole source image inside the destination.
        float scale = min(scale_x, scale_y);

        // Offsets: centre alignment plus the (scale * 0.5 - 0.5) pixel-centre term.
        i2d[0] = scale; i2d[1] = 0; i2d[2] = -scale * from.width * 0.5 + to.width * 0.5 + scale * 0.5 - 0.5;
        i2d[3] = 0; i2d[4] = scale; i2d[5] = -scale * from.height * 0.5 + to.height * 0.5 + scale * 0.5 - 0.5;

        invertAffineTransform(i2d, d2i);
    }
};

// Applies the 2x3 affine `matrix` to the integer point (x, y) and stores the
// result in *proj_x / *proj_y.
__device__ void affine_project(float* matrix, int x, int y, float* proj_x, float* proj_y) {
    *proj_x = matrix[0] * x + matrix[1] * y + matrix[2];
    *proj_y = matrix[3] * x + matrix[4] * y + matrix[5];
}

// Bilinear warp kernel: each thread writes one destination pixel.
//   Launch: 2D grid of 2D blocks covering dst_width x dst_height; threads
//   outside the destination return immediately.
//   src/dst: 3 interleaved 8-bit channels per pixel; *_line_size is the row
//   stride in bytes.
//   fill_value: written to all three channels where the source has no data.
__global__ void warp_affine_bilinear_kernel(
    uint8_t* src, int src_line_size, int src_width, int src_height,
    uint8_t* dst, int dst_line_size, int dst_width, int dst_height,
    uint8_t fill_value, AffineMatrix matrix
) {
    int dx = blockDim.x * blockIdx.x + threadIdx.x;
    int dy = blockDim.y * blockIdx.y + threadIdx.y;
    if (dx >= dst_width || dy >= dst_height) return;

    float c0 = fill_value, c1 = fill_value, c2 = fill_value;
    float src_x = 0;
    float src_y = 0;

    // d2i is used here: map the destination pixel back to source coordinates.
    affine_project(matrix.d2i, dx, dy, &src_x, &src_y);

    const bool outside = src_x < -1 || src_x >= src_width || src_y < -1 || src_y >= src_height;
    if (!outside) {
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        // Stand-in pixel for neighbours that fall outside the source image.
        uint8_t const_values[] = {fill_value, fill_value, fill_value};

        // Fractional distances to the low neighbours and their complements.
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

        const uint8_t* v1 = const_values;
        const uint8_t* v2 = const_values;
        const uint8_t* v3 = const_values;
        const uint8_t* v4 = const_values;

        // size_t offsets so large images cannot overflow int arithmetic.
        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + (size_t)y_low * src_line_size + (size_t)x_low * 3;
            if (x_high < src_width)
                v2 = src + (size_t)y_low * src_line_size + (size_t)x_high * 3;
        }
        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + (size_t)y_high * src_line_size + (size_t)x_low * 3;
            if (x_high < src_width)
                v4 = src + (size_t)y_high * src_line_size + (size_t)x_high * 3;
        }

        // Weighted sum of the four neighbours, rounded to nearest.
        c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
        c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
        c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
    }

    uint8_t* pdst = dst + (size_t)dy * dst_line_size + (size_t)dx * 3;
    pdst[0] = c0; pdst[1] = c1; pdst[2] = c2;
}

// Host entry point: builds the affine matrices for src -> dst and launches the
// warp kernel on the default stream.
//   src, dst       - device pointers, 3 interleaved 8-bit channels per pixel
//   *_line_size    - row stride in bytes
//   fill_value     - value for destination pixels with no source data
// The launch is asynchronous and its status is not consumed here, so the
// caller can query it (e.g. with cudaPeekAtLastError) afterwards.
void warp_affine_bilinear(
    uint8_t* src, int src_line_size, int src_width, int src_height,
    uint8_t* dst, int dst_line_size, int dst_width, int dst_height,
    uint8_t fill_value
) {
    // Non-positive dimensions would divide by zero in compute() or request an
    // empty grid; null pointers would fault inside the kernel.
    if (src == nullptr || dst == nullptr ||
        src_width <= 0 || src_height <= 0 || dst_width <= 0 || dst_height <= 0) {
        fprintf(stderr, "warp_affine_bilinear: invalid arguments, nothing launched\n");
        return;
    }

    dim3 block_size(32, 32);
    // Ceil-divide so partial tiles at the right/bottom edges are covered.
    dim3 grid_size((dst_width + 31) / 32, (dst_height + 31) / 32);

    AffineMatrix affine;
    affine.compute(Size(src_width, src_height), Size(dst_width, dst_height));

    warp_affine_bilinear_kernel<<<grid_size, block_size, 0, nullptr>>>(
        src, src_line_size, src_width, src_height,
        dst, dst_line_size, dst_width, dst_height,
        fill_value, affine
    );
}
进行仿射变换逆向变换d->i
此时的输入图片为d,待求i:遍历待求图i的每一个像素,根据i至d的映射关系(即前向计算得到的矩阵i2d)找出它在d中对应的坐标,再在d上插值取值;调用处为:
这里d2i意味着变换后的图像坐标(d)经该变换后,可以被映射为原图像上的坐标(i)。相应地,i2d表示把原图像上的坐标(i)映射为变换后图像上的坐标(d),逆向变换时使用的正是i2d。
// Inverse demo: reads the warped image d.jpg, maps it back to the 550x676
// size of i and writes the result to i.jpg.
// Returns 0 on success and -1 when the image cannot be read, warped or written.
int main() {
    Mat image = imread("d.jpg");
    if (image.empty()) {
        // imread yields an empty Mat when the file is missing or unreadable.
        printf("failed to read d.jpg\n");
        return -1;
    }

    Mat output = warpaffine_to_center_align(image, Size(550, 676));
    if (output.empty() || !imwrite("i.jpg", output)) {
        printf("failed to produce i.jpg\n");
        return -1;
    }
    return 0;
}
这个例子中i尺寸为Size(550, 676),d的尺寸为(640,640)
核函数只需改一处代码即可:
// i2d is used here (instead of d2i): the pixel being written now belongs to i,
// so (dx, dy) is projected through i2d to find where to sample in d.
affine_project(matrix.i2d, dx, dy, &src_x, &src_y);
注意,这里的矩阵matrix仍是前向计算时得出的矩阵,即compute的参数顺序仍为(i的尺寸, d的尺寸),而不是按本次调用的(输入d, 输出i)顺序重新计算:
// Same arguments as the forward pass: from i's size (550, 676) to d's size (640, 640).
AffineMatrix affine;
affine.compute(Size(550, 676), Size(640, 640));
变换前后效果如下图:
【CUDA】C++实现warpaffine仿射变换及其逆变换
目录
- 仿射变换矩阵工具类
- 进行前向仿射变换:i->d
- 进行仿射变换逆向变换d->i
仿射变换矩阵工具类
假设有图片i,要将其仿射变换至图片d,使用下面的类计算仿射变换矩阵i2d及d2i:
调用compute函数并传入i及d的尺寸后,即可自动得到仿射变换矩阵i2d及d2i。
// Pair of 2x3 affine matrices, each stored as six floats
// {m00, m01, m02, m10, m11, m12}:
//   i2d maps coordinates of the source image i into the destination image d,
//   d2i is its inverse (d -> i).
struct AffineMatrix {
    float i2d[6];
    float d2i[6];

    // Writes the inverse of the 2x3 affine matrix `imat` into `omat`.
    // A zero determinant yields an all-zero result instead of a division by zero.
    void invertAffineTransform(float imat[6], float omat[6]) {
        // Read every input element before writing any output element.
        const float m00 = imat[0], m01 = imat[1], m02 = imat[2];
        const float m10 = imat[3], m11 = imat[4], m12 = imat[5];

        // Reciprocal of the determinant of the 2x2 linear part.
        const float det = m00 * m11 - m01 * m10;
        float inv_det;
        if (det != 0) {
            inv_det = 1.0 / det;
        } else {
            inv_det = 0;
        }

        // Adjugate of the linear part scaled by 1/det.
        const float r00 = m11 * inv_det;
        const float r11 = m00 * inv_det;
        const float r01 = -m01 * inv_det;
        const float r10 = -m10 * inv_det;

        // Inverse translation: -(R * t).
        const float t0 = -r00 * m02 - r01 * m12;
        const float t1 = -r10 * m02 - r11 * m12;

        omat[0] = r00;
        omat[1] = r01;
        omat[2] = t0;
        omat[3] = r10;
        omat[4] = r11;
        omat[5] = t1;
    }

    // Fills i2d with a uniform scale that fits `from` inside `to` and aligns the
    // two image centres, then derives d2i by inverting i2d.
    void compute(const Size& from, const Size& to) {
        const float ratio_w = to.width / (float)from.width;
        const float ratio_h = to.height / (float)from.height;

        // The smaller ratio keeps the aspect ratio and fits the whole image.
        const float scale = ratio_w < ratio_h ? ratio_w : ratio_h;

        // Offsets: centre alignment plus the (scale * 0.5 - 0.5) pixel-centre term.
        i2d[0] = scale;
        i2d[1] = 0;
        i2d[2] = -scale * from.width * 0.5 + to.width * 0.5 + scale * 0.5 - 0.5;

        i2d[3] = 0;
        i2d[4] = scale;
        i2d[5] = -scale * from.height * 0.5 + to.height * 0.5 + scale * 0.5 - 0.5;

        invertAffineTransform(i2d, d2i);
    }
};
进行前向仿射变换:i->d
将i变换到d时,用CUDA线程遍历目标图d中的每个像素,通过d2i把d上的坐标映射回i上的坐标,再在i上做双线性插值取值,因此这里应使用d2i进行变换,相关代码如下:
调用处代码:
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <stdio.h>

using namespace cv;

#define min(a, b) ((a) < (b) ? (a) : (b))
#define checkRuntime(op) __check_cuda_runtime((op), #op, __FILE__, __LINE__)

// Reports a failed CUDA runtime call.
//   code       - status returned by the call
//   op         - source text of the call (stringified by checkRuntime)
//   file, line - location of the call
// Returns true when `code` is cudaSuccess; otherwise prints the error name and
// message and returns false.
bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line) {
    if (code != cudaSuccess) {
        const char* err_name = cudaGetErrorName(code);
        const char* err_message = cudaGetErrorString(code);
        printf("runtime error %s:%d %s failed. \n code = %s, message = %s\n", file, line, op, err_name, err_message);
        return false;
    }
    return true;
}

// Declaration; the definition lives in the kernel source file.
void warp_affine_bilinear(
    uint8_t* src, int src_line_size, int src_width, int src_height,
    uint8_t* dst, int dst_line_size, int dst_width, int dst_height,
    uint8_t fill_value
);

// Warps `image` into a `size`-sized 3-channel 8-bit image on the GPU.
//   image - non-empty CV_8UC3 image
//   size  - output size, both dimensions must be positive
// Returns the warped image, or an empty Mat when the input is rejected or any
// CUDA call fails (the failure is printed by checkRuntime).
Mat warpaffine_to_center_align(const Mat& image, const Size& size) {
    if (image.empty() || image.type() != CV_8UC3) {
        printf("warpaffine_to_center_align: input must be a non-empty CV_8UC3 image\n");
        return Mat();
    }
    if (size.width <= 0 || size.height <= 0) {
        printf("warpaffine_to_center_align: invalid output size %dx%d\n", size.width, size.height);
        return Mat();
    }

    // The upload below is one flat copy, so the rows must be packed back to back.
    Mat packed = image.isContinuous() ? image : image.clone();
    Mat output(size, CV_8UC3);

    uint8_t* psrc_device = nullptr;
    uint8_t* pdst_device = nullptr;
    // Widen before multiplying so large images do not overflow int.
    size_t src_size = (size_t)packed.cols * packed.rows * 3;
    size_t dst_size = (size_t)size.width * size.height * 3;

    // Allocate both device buffers and upload the source image.
    // Each step runs only if every previous one succeeded.
    bool ok = checkRuntime(cudaMalloc(&psrc_device, src_size));
    ok = ok && checkRuntime(cudaMalloc(&pdst_device, dst_size));
    ok = ok && checkRuntime(cudaMemcpy(psrc_device, packed.data, src_size, cudaMemcpyHostToDevice));

    if (ok) {
        warp_affine_bilinear(
            psrc_device, packed.cols * 3, packed.cols, packed.rows,
            pdst_device, size.width * 3, size.width, size.height,
            114
        );
        // Check whether the kernel launch reported an error.
        ok = checkRuntime(cudaPeekAtLastError());
    }
    ok = ok && checkRuntime(cudaMemcpy(output.data, pdst_device, dst_size, cudaMemcpyDeviceToHost));

    // Always release the buffers; cudaFree on a null pointer does nothing.
    checkRuntime(cudaFree(psrc_device));
    checkRuntime(cudaFree(pdst_device));

    return ok ? output : Mat();
}

// Forward demo: reads i.jpg, warps it to 640x640 and writes the result to d.jpg.
int main() {
    Mat image = imread("i.jpg");
    if (image.empty()) {
        printf("failed to read i.jpg\n");
        return -1;
    }

    Mat output = warpaffine_to_center_align(image, Size(640, 640));
    if (output.empty() || !imwrite("d.jpg", output)) {
        printf("failed to produce d.jpg\n");
        return -1;
    }
    return 0;
}
核函数代码:
#include <cuda_runtime.h>
#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))
#define num_threads 512

typedef unsigned char uint8_t;

// Plain width/height pair used to describe image dimensions.
struct Size {
    int width = 0, height = 0;

    Size() = default;
    Size(int w, int h) : width(w), height(h) {}
};

// Pair of 2x3 affine matrices, each stored as six floats
// {m00, m01, m02, m10, m11, m12}:
//   i2d maps coordinates of the source image i into the destination image d,
//   d2i is its inverse (d -> i).
struct AffineMatrix {
    float i2d[6];
    float d2i[6];

    // Writes the inverse of the 2x3 affine matrix `imat` into `omat`.
    // A zero determinant yields an all-zero result instead of a division by zero.
    void invertAffineTransform(float imat[6], float omat[6]) {
        float i00 = imat[0]; float i01 = imat[1]; float i02 = imat[2];
        float i10 = imat[3]; float i11 = imat[4]; float i12 = imat[5];

        // Determinant of the 2x2 linear part, replaced by its reciprocal.
        float D = i00 * i11 - i01 * i10;
        D = D != 0 ? 1.0 / D : 0;

        // Adjugate of the linear part divided by the determinant.
        float A11 = i11 * D;
        float A22 = i00 * D;
        float A12 = -i01 * D;
        float A21 = -i10 * D;

        // Inverse translation: -(A * t).
        float b1 = -A11 * i02 - A12 * i12;
        float b2 = -A21 * i02 - A22 * i12;

        omat[0] = A11; omat[1] = A12; omat[2] = b1;
        omat[3] = A21; omat[4] = A22; omat[5] = b2;
    }

    // Fills i2d with a uniform scale that fits `from` inside `to` and aligns the
    // two image centres, then derives d2i by inverting i2d.
    // Both sizes must have positive dimensions.
    void compute(const Size& from, const Size& to) {
        float scale_x = to.width / (float)from.width;
        float scale_y = to.height / (float)from.height;

        // Scale factor: the smaller ratio keeps the aspect ratio and fits the
        // whole source image inside the destination.
        float scale = min(scale_x, scale_y);

        // Offsets: centre alignment plus the (scale * 0.5 - 0.5) pixel-centre term.
        i2d[0] = scale; i2d[1] = 0; i2d[2] = -scale * from.width * 0.5 + to.width * 0.5 + scale * 0.5 - 0.5;
        i2d[3] = 0; i2d[4] = scale; i2d[5] = -scale * from.height * 0.5 + to.height * 0.5 + scale * 0.5 - 0.5;

        invertAffineTransform(i2d, d2i);
    }
};

// Applies the 2x3 affine `matrix` to the integer point (x, y) and stores the
// result in *proj_x / *proj_y.
__device__ void affine_project(float* matrix, int x, int y, float* proj_x, float* proj_y) {
    *proj_x = matrix[0] * x + matrix[1] * y + matrix[2];
    *proj_y = matrix[3] * x + matrix[4] * y + matrix[5];
}

// Bilinear warp kernel: each thread writes one destination pixel.
//   Launch: 2D grid of 2D blocks covering dst_width x dst_height; threads
//   outside the destination return immediately.
//   src/dst: 3 interleaved 8-bit channels per pixel; *_line_size is the row
//   stride in bytes.
//   fill_value: written to all three channels where the source has no data.
__global__ void warp_affine_bilinear_kernel(
    uint8_t* src, int src_line_size, int src_width, int src_height,
    uint8_t* dst, int dst_line_size, int dst_width, int dst_height,
    uint8_t fill_value, AffineMatrix matrix
) {
    int dx = blockDim.x * blockIdx.x + threadIdx.x;
    int dy = blockDim.y * blockIdx.y + threadIdx.y;
    if (dx >= dst_width || dy >= dst_height) return;

    float c0 = fill_value, c1 = fill_value, c2 = fill_value;
    float src_x = 0;
    float src_y = 0;

    // d2i is used here: map the destination pixel back to source coordinates.
    affine_project(matrix.d2i, dx, dy, &src_x, &src_y);

    const bool outside = src_x < -1 || src_x >= src_width || src_y < -1 || src_y >= src_height;
    if (!outside) {
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        // Stand-in pixel for neighbours that fall outside the source image.
        uint8_t const_values[] = {fill_value, fill_value, fill_value};

        // Fractional distances to the low neighbours and their complements.
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

        const uint8_t* v1 = const_values;
        const uint8_t* v2 = const_values;
        const uint8_t* v3 = const_values;
        const uint8_t* v4 = const_values;

        // size_t offsets so large images cannot overflow int arithmetic.
        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + (size_t)y_low * src_line_size + (size_t)x_low * 3;
            if (x_high < src_width)
                v2 = src + (size_t)y_low * src_line_size + (size_t)x_high * 3;
        }
        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + (size_t)y_high * src_line_size + (size_t)x_low * 3;
            if (x_high < src_width)
                v4 = src + (size_t)y_high * src_line_size + (size_t)x_high * 3;
        }

        // Weighted sum of the four neighbours, rounded to nearest.
        c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
        c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
        c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
    }

    uint8_t* pdst = dst + (size_t)dy * dst_line_size + (size_t)dx * 3;
    pdst[0] = c0; pdst[1] = c1; pdst[2] = c2;
}

// Host entry point: builds the affine matrices for src -> dst and launches the
// warp kernel on the default stream.
//   src, dst       - device pointers, 3 interleaved 8-bit channels per pixel
//   *_line_size    - row stride in bytes
//   fill_value     - value for destination pixels with no source data
// The launch is asynchronous and its status is not consumed here, so the
// caller can query it (e.g. with cudaPeekAtLastError) afterwards.
void warp_affine_bilinear(
    uint8_t* src, int src_line_size, int src_width, int src_height,
    uint8_t* dst, int dst_line_size, int dst_width, int dst_height,
    uint8_t fill_value
) {
    // Non-positive dimensions would divide by zero in compute() or request an
    // empty grid; null pointers would fault inside the kernel.
    if (src == nullptr || dst == nullptr ||
        src_width <= 0 || src_height <= 0 || dst_width <= 0 || dst_height <= 0) {
        fprintf(stderr, "warp_affine_bilinear: invalid arguments, nothing launched\n");
        return;
    }

    dim3 block_size(32, 32);
    // Ceil-divide so partial tiles at the right/bottom edges are covered.
    dim3 grid_size((dst_width + 31) / 32, (dst_height + 31) / 32);

    AffineMatrix affine;
    affine.compute(Size(src_width, src_height), Size(dst_width, dst_height));

    warp_affine_bilinear_kernel<<<grid_size, block_size, 0, nullptr>>>(
        src, src_line_size, src_width, src_height,
        dst, dst_line_size, dst_width, dst_height,
        fill_value, affine
    );
}
进行仿射变换逆向变换d->i
此时的输入图片为d,待求i:遍历待求图i的每一个像素,根据i至d的映射关系(即前向计算得到的矩阵i2d)找出它在d中对应的坐标,再在d上插值取值;调用处为:
这里d2i意味着变换后的图像坐标(d)经该变换后,可以被映射为原图像上的坐标(i)。相应地,i2d表示把原图像上的坐标(i)映射为变换后图像上的坐标(d),逆向变换时使用的正是i2d。
// Inverse demo: reads the warped image d.jpg, maps it back to the 550x676
// size of i and writes the result to i.jpg.
// Returns 0 on success and -1 when the image cannot be read, warped or written.
int main() {
    Mat image = imread("d.jpg");
    if (image.empty()) {
        // imread yields an empty Mat when the file is missing or unreadable.
        printf("failed to read d.jpg\n");
        return -1;
    }

    Mat output = warpaffine_to_center_align(image, Size(550, 676));
    if (output.empty() || !imwrite("i.jpg", output)) {
        printf("failed to produce i.jpg\n");
        return -1;
    }
    return 0;
}
这个例子中i尺寸为Size(550, 676),d的尺寸为(640,640)
核函数只需改一处代码即可:
// i2d is used here (instead of d2i): the pixel being written now belongs to i,
// so (dx, dy) is projected through i2d to find where to sample in d.
affine_project(matrix.i2d, dx, dy, &src_x, &src_y);
注意,这里的矩阵matrix仍是前向计算时得出的矩阵,即compute的参数顺序仍为(i的尺寸, d的尺寸),而不是按本次调用的(输入d, 输出i)顺序重新计算:
// Same arguments as the forward pass: from i's size (550, 676) to d's size (640, 640).
AffineMatrix affine;
affine.compute(Size(550, 676), Size(640, 640));
变换前后效果如下图: