1 簡介
基於機器視覺opencv的手勢檢測 手勢識別 算法
2 傳統機器視覺的手勢檢測
2.1 輪廓檢測法
使用 void convexityDefects(InputArray contour, InputArray convexhull, OutputArray convexityDefects) 方法
// Checks whether this matrix can be viewed as a 1-D vector of elements that
// each have `_elemChannels` channels.
//
// _elemChannels      channels per logical element (e.g. 2 for 2-D points)
// _depth             required depth; a value <= 0 disables the depth check
// _requireContinuous when true, the matrix must also be continuous
//
// Returns total()*channels()/_elemChannels when the layout matches,
// otherwise -1.
int Mat::checkVector(int _elemChannels, int _depth, bool _requireContinuous) const
{
    // Depth is only enforced when the caller asked for a specific one.
    if (_depth > 0 && depth() != _depth)
        return -1;

    if (_requireContinuous && !isContinuous())
        return -1;

    bool layoutMatches = false;
    if (dims == 2)
    {
        // Either a single row/column whose channel count equals the element
        // size, or a matrix with exactly `_elemChannels` columns.
        const bool isSingleLine = (rows == 1 || cols == 1);
        layoutMatches = (isSingleLine && channels() == _elemChannels)
                        || (cols == _elemChannels);
    }
    else if (dims == 3)
    {
        // Single-channel 3-D case: the last dimension holds the element
        // components and one of the first two dimensions must be 1.
        const bool isSingleLine = (size.p[0] == 1 || size.p[1] == 1);
        const bool packedRows = isContinuous() || step.p[1] == step.p[2]*size.p[2];
        layoutMatches = channels() == 1
                        && size.p[2] == _elemChannels
                        && isSingleLine
                        && packedRows;
    }

    if (!layoutMatches)
        return -1;

    return (int)(total()*channels()/_elemChannels);
}
該函數源碼的大概意思是:如果對應的Mat矩陣的深度、連續性、通道數和行列數滿足一定條件,就返回 total()*channels()/_elemChannels,即把 _elemChannels 個通道視為一個元素時的元素個數,否則返回-1。convexityDefects()內部要求該返回值(ptnum)大於3,又得知此處輸入的多邊形曲線(即參數1)是通道數為2的點集,調用 checkVector(2, CV_32S) 時 total()*channels()/2 正好等於曲線上點的個數,所以 ptnum > 3 相當於要求曲線至少有4個點。簡單地說,用convexityDefects()函數對多邊形曲線進行凹陷檢測時,參數1曲線本身必須至少有4個點。因此本人在本次程序的convexityDefects()函數前加入了if(Mat(approx_poly_curve).checkVector(2, CV_32S) > 3)來判斷,只有滿足該if條件,才會進行後面的凹陷檢測。這樣程序就不會再出現類似的bug了。
第2個參數一般是由opencv中的函數convexHull()獲得的,一般情況下該參數裡面存的是凸包集合中的點在多邊形曲線點中的位置索引,且該參數以vector的形式存在,因此參數convexhull中其元素的類型為int(對應代碼中的vector<int> hull)。在本次凹陷點檢測函數convexityDefects()裡面根據文檔,要求該參數為Mat型。因此在使用convexityDefects()的參數2時,一般將vector直接轉換為Mat型。
2.2 算法結果
2.3 整體代碼實現
2.3.1 算法流程
1. 求出手部的掩膜
2. 求出掩膜的輪廓
3. 求出輪廓的多邊形擬合曲線
4. 求出多邊形擬合曲線的凸包集,找出凸點
5. 求出多邊形擬合曲線的凹陷集,找出凹點
6. 利用上面的凸凹點和手部中心點的幾何關系來做簡單的數字手勢識別
#include <iostream> #include "opencv2/highgui/highgui.hpp" #include "opencv2/imgproc/imgproc.hpp" #include <opencv2/core/core.hpp> #include "copenni.cpp" #include <iostream> #define DEPTH_SCALE_FACTOR 255./4096. #define ROI_HAND_WIDTH 140 #define ROI_HAND_HEIGHT 140 #define MEDIAN_BLUR_K 5 #define XRES 640 #define YRES 480 #define DEPTH_SEGMENT_THRESH 5 #define MAX_HANDS_COLOR 10 #define MAX_HANDS_NUMBER 10 #define HAND_LIKELY_AREA 2000 #define DELTA_POINT_DISTENCE 25 //手部中心點1和中心點2距離的閾值 #define SEGMENT_POINT1_DISTANCE 27 //凸點與手部中心點1遠近距離的閾值 #define SEGMENT_POINT2_DISTANCE 30 //凸點與手部中心點2遠近距離的閾值 using namespace cv; using namespace xn; using namespace std; int main (int argc, char **argv) { unsigned int convex_number_above_point1 = 0; unsigned int concave_number_above_point1 = 0; unsigned int convex_number_above_point2 = 0; unsigned int concave_number_above_point2 = 0; unsigned int convex_assist_above_point1 = 0; unsigned int convex_assist_above_point2 = 0; unsigned int point_y1 = 0; unsigned int point_y2 = 0; int number_result = -1; bool recognition_flag = false; //開始手部數字識別的標志 vector<Scalar> color_array;//采用默認的10種顏色 { color_array.push_back(Scalar(255, 0, 0)); color_array.push_back(Scalar(0, 255, 0)); color_array.push_back(Scalar(0, 0, 255)); color_array.push_back(Scalar(255, 0, 255)); color_array.push_back(Scalar(255, 255, 0)); color_array.push_back(Scalar(0, 255, 255)); color_array.push_back(Scalar(128, 255, 0)); color_array.push_back(Scalar(0, 128, 255)); color_array.push_back(Scalar(255, 0, 128)); color_array.push_back(Scalar(255, 128, 255)); } vector<unsigned int> hand_depth(MAX_HANDS_NUMBER, 0); vector<Rect> hands_roi(MAX_HANDS_NUMBER, Rect(XRES/2, YRES/2, ROI_HAND_WIDTH, ROI_HAND_HEIGHT)); namedWindow("color image", CV_WINDOW_AUTOSIZE); namedWindow("depth image", CV_WINDOW_AUTOSIZE); namedWindow("hand_segment", CV_WINDOW_AUTOSIZE); //顯示分割出來的手的區域 namedWindow("handrecognition", CV_WINDOW_AUTOSIZE); //顯示0~5數字識別的圖像 COpenNI openni; if(!openni.Initial()) return 1; 
if(!openni.Start()) return 1; while(1) { if(!openni.UpdateData()) { return 1; } /*獲取並顯示色彩圖像*/ Mat color_image_src(openni.image_metadata_.YRes(), openni.image_metadata_.XRes(), CV_8UC3, (char *)openni.image_metadata_.Data()); Mat color_image; cvtColor(color_image_src, color_image, CV_RGB2BGR); Mat hand_segment_mask(color_image.size(), CV_8UC1, Scalar::all(0)); for(auto itUser = openni.hand_points_.cbegin(); itUser != openni.hand_points_.cend(); ++itUser) { point_y1 = itUser->second.Y; point_y2 = itUser->second.Y + DELTA_POINT_DISTENCE; circle(color_image, Point(itUser->second.X, itUser->second.Y), 5,>first % color_array.size()), 3, 8); /*設置不同手部的深度*/>first % MAX_HANDS_COLOR) = (unsigned int)(itUser->second.Z* DEPTH_SCALE_FACTOR);//itUser->first會導致程序出現bug /*設置不同手部的不同感興趣區域*/>first % MAX_HANDS_NUMBER) = Rect(itUser->second.X - ROI_HAND_WIDTH/2, itUser->second.Y - ROI_HAND_HEIGHT/2, ROI_HAND_WIDTH, ROI_HAND_HEIGHT);>first % MAX_HANDS_NUMBER).x = itUser->second.X - ROI_HAND_WIDTH/2;>first % MAX_HANDS_NUMBER).y = itUser->second.Y - ROI_HAND_HEIGHT/2;>first % MAX_HANDS_NUMBER).width = ROI_HAND_WIDTH;>first % MAX_HANDS_NUMBER).height = ROI_HAND_HEIGHT; if(>first % MAX_HANDS_NUMBER).x <= 0)>first % MAX_HANDS_NUMBER).x = 0; if(>first % MAX_HANDS_NUMBER).x > XRES)>first % MAX_HANDS_NUMBER).x = XRES; if(>first % MAX_HANDS_NUMBER).y <= 0)>first % MAX_HANDS_NUMBER).y = 0; if(>first % MAX_HANDS_NUMBER).y > YRES)>first % MAX_HANDS_NUMBER).y = YRES; } imshow("color image", color_image); /*獲取並顯示深度圖像*/ Mat depth_image_src(openni.depth_metadata_.YRes(), openni.depth_metadata_.XRes(), CV_16UC1, (char *)openni.depth_metadata_.Data());//因為kinect獲取到的深度圖像實際上是無符號的16位數據 Mat depth_image; depth_image_src.convertTo(depth_image, CV_8U, DEPTH_SCALE_FACTOR); imshow("depth image", depth_image); //取出手的mask部分 //不管原圖像時多少通道的,mask矩陣聲明為單通道就ok for(auto itUser = openni.hand_points_.cbegin(); itUser != openni.hand_points_.cend(); ++itUser) { for(int i =>first % MAX_HANDS_NUMBER).x; i < std::min(>first % 
MAX_HANDS_NUMBER)>first % MAX_HANDS_NUMBER).width, XRES); i++) for(int j =>first % MAX_HANDS_NUMBER).y; j < std::min(>first % MAX_HANDS_NUMBER)>first % MAX_HANDS_NUMBER).height, YRES); j++) {<unsigned char>(j, i) = ((>first % MAX_HANDS_NUMBER)-DEPTH_SEGMENT_THRESH) <<unsigned char>(j, i)) & ((>first % MAX_HANDS_NUMBER)+DEPTH_SEGMENT_THRESH) ><unsigned char>(j,i)); } } medianBlur(hand_segment_mask, hand_segment_mask, MEDIAN_BLUR_K); Mat hand_segment(color_image.size(), CV_8UC3); color_image.copyTo(hand_segment, hand_segment_mask); /*對mask圖像進行輪廓提取,並在手勢識別圖像中畫出來*/ std::vector< std::vector<Point> > contours; findContours(hand_segment_mask, contours, CV_RETR_LIST, CV_CHAIN_APPROX_SIMPLE);//找出mask圖像的輪廓 Mat hand_recognition_image = Mat::zeros(color_image.rows, color_image.cols, CV_8UC3); for(int i = 0; i < contours.size(); i++) { //隻有在檢測到輪廓時才會去求它的多邊形,凸包集,凹陷集 recognition_flag = true; /*找出輪廓圖像多邊形擬合曲線*/ Mat contour_mat = Mat(contours[i]); if(contourArea(contour_mat) > HAND_LIKELY_AREA) { //比較有可能像手的區域 std::vector<Point> approx_poly_curve; approxPolyDP(contour_mat, approx_poly_curve, 10, true);//找出輪廓的多邊形擬合曲線 std::vector< std::vector<Point> > approx_poly_curve_debug; approx_poly_curve_debug.push_back(approx_poly_curve); drawContours(hand_recognition_image, contours, i, Scalar(255, 0, 0), 1, 8); //畫出輪廓 // drawContours(hand_recognition_image, approx_poly_curve_debug, 0, Scalar(256, 128, 128), 1, 8); //畫出多邊形擬合曲線 /*對求出的多邊形擬合曲線求出其凸包集*/ vector<int> hull; convexHull(Mat(approx_poly_curve), hull, true); for(int i = 0; i < hull.size(); i++) { circle(hand_recognition_image, approx_poly_curve[hull[i]], 2, Scalar(0, 255, 0), 2, 8); /*統計在中心點1以上凸點的個數*/ if(approx_poly_curve[hull[i]].y <= point_y1) { /*統計凸點與中心點1的y軸距離*/ long dis_point1 = abs(long(point_y1 - approx_poly_curve[hull[i]].y)); int dis1 = point_y1 - approx_poly_curve[hull[i]].y; if(dis_point1 > SEGMENT_POINT1_DISTANCE && dis1 >= 0) { convex_assist_above_point1++; } convex_number_above_point1++; } /*統計在中心點2以上凸點的個數*/ 
if(approx_poly_curve[hull[i]].y <= point_y2) { /*統計凸點與中心點1的y軸距離*/ long dis_point2 = abs(long(point_y2 - approx_poly_curve[hull[i]].y)); int dis2 = point_y2 - approx_poly_curve[hull[i]].y; if(dis_point2 > SEGMENT_POINT2_DISTANCE && dis2 >= 0) { convex_assist_above_point2++; } convex_number_above_point2++; } } // /*對求出的多邊形擬合曲線求出凹陷集*/ std::vector<Vec4i> convexity_defects; if(Mat(approx_poly_curve).checkVector(2, CV_32S) > 3) convexityDefects(approx_poly_curve, Mat(hull), convexity_defects); for(int i = 0; i < convexity_defects.size(); i++) { circle(hand_recognition_image, approx_poly_curve[convexity_defects[i][2]] , 2, Scalar(0, 0, 255), 2, 8); /*統計在中心點1以上凹陷點的個數*/ if(approx_poly_curve[convexity_defects[i][2]].y <= point_y1) concave_number_above_point1++; /*統計在中心點2以上凹陷點的個數*/ if(approx_poly_curve[convexity_defects[i][2]].y <= point_y2) concave_number_above_point2++; } } } /**畫出手勢的中心點**/ for(auto itUser = openni.hand_points_.cbegin(); itUser != openni.hand_points_.cend(); ++itUser) { circle(hand_recognition_image, Point(itUser->second.X, itUser->second.Y), 3, Scalar(0, 255, 255), 3, 8); circle(hand_recognition_image, Point(itUser->second.X, itUser->second.Y + 25), 3, Scalar(255, 0, 255), 3, 8); } /*手勢數字0~5的識別*/ //"0"的識別 if((convex_assist_above_point1 ==0 && convex_number_above_point2 >= 2 && convex_number_above_point2 <= 3 && concave_number_above_point2 <= 1 && concave_number_above_point1 <= 1) || (concave_number_above_point1 ==0 || concave_number_above_point2 == 0) && recognition_flag == true) number_result = 0; //"1"的識別 if(convex_assist_above_point1 ==1 && convex_number_above_point1 >=1 && convex_number_above_point1 <=2 && convex_number_above_point2 >=2 && convex_assist_above_point2 == 1) number_result = 1; //"2"的識別 if(convex_number_above_point1 == 2 && concave_number_above_point1 == 1 && convex_assist_above_point2 == 2 /*convex_assist_above_point1 <=1*/ && concave_number_above_point2 == 1) number_result = 2; //"3"的識別 if(convex_number_above_point1 == 3 && 
concave_number_above_point1 <= 3 && concave_number_above_point1 >=1 && convex_number_above_point2 >= 3 && convex_number_above_point2 <= 4 && convex_assist_above_point2 == 3) number_result = 3; //"4"的識別 if(convex_number_above_point1 == 4 && concave_number_above_point1 <=3 && concave_number_above_point1 >=2 && convex_number_above_point2 == 4) number_result = 4; //"5"的識別 if(convex_number_above_point1 >=4 && convex_number_above_point2 == 5 && concave_number_above_point2 >= 3 && convex_number_above_point2 >= 4) number_result = 5; if(number_result !=0 && number_result != 1 && number_result != 2 && number_result != 3 && number_result != 4 && number_result != 5) number_result == -1; /*在手勢識別圖上顯示匹配的數字*/ std::stringstream number_str; number_str << number_result; putText(hand_recognition_image, "Match: ", Point(0, 60), 4, 1, Scalar(0, 255, 0), 2, 0 ); if(number_result == -1) putText(hand_recognition_image, " ", Point(120, 60), 4, 2, Scalar(255, 0 ,0), 2, 0); else putText(hand_recognition_image, number_str.str(), Point(150, 60), 4, 2, Scalar(255, 0 ,0), 2, 0); imshow("handrecognition", hand_recognition_image); imshow("hand_segment", hand_segment); /*一個循環中對有些變量進行初始化操作*/ convex_number_above_point1 = 0; convex_number_above_point2 = 0; concave_number_above_point1 = 0; concave_number_above_point2 = 0; convex_assist_above_point1 = 0; convex_assist_above_point2 = 0; number_result = -1; recognition_flag = false; number_str.clear(); waitKey(20); } }
3 深度學習方法做手勢識別
3.1 經典的卷積神經網絡
卷積神經網絡的優勢就在於它能夠從常見的視覺任務中自動學習目標數據的特征, 然後將這些特征用於某種特定任務的模型。 隨著時代的發展, 深度學習也形成了一些經典的卷積神經網絡。
3.2 YOLO系列
YOLO 系列的網絡模型最早源於 2016 年, 之後幾年經過不斷改進相繼推出YOLOv2、 YOLOv3 等網絡,直到今日YOLOv5也誕生了,不得不感慨一句,darknet是真的肝。
3.3 SSD
SSD 作為典型的一階段網絡模型, 具有更高的操作性, 端到端的學習模式同樣受到眾多研究者的喜愛
3.4 實現步驟
3.4.1 數據集
- 圖像大小:100*100
- 像素顏色空間:RGB種類
- 圖片種類:6 種(0,1,2,3,4,5)
- 每種圖片數量:200 張
3.4.2 圖像預處理
3.4.3 構建卷積神經網絡結構
Dropout: 增加魯棒性幫助正則化和避免過擬合
一個相關的早期使用這種技術的論文(ImageNet Classification with Deep Convolutional Neural Networks, by Alex Krizhevsky, Ilya Sutskever, and Geoffrey Hinton (2012))中啟發性的dropout解釋是:
3.4.4 實驗訓練過程及結果
3.5 關鍵代碼
# Author: Dancheng Xuezhang (QQ 746876041); contact the author for the complete code.
import tensorflow as tf

IMAGE_SIZE = 100          # input images are IMAGE_SIZE x IMAGE_SIZE
NUM_CHANNELS = 1          # single-channel input
CONV1_SIZE = 4            # first convolution kernel is CONV1_SIZE x CONV1_SIZE
CONV1_KERNEL_NUM = 8      # number of kernels in the first convolution
CONV2_SIZE = 2            # second convolution kernel is CONV2_SIZE x CONV2_SIZE
CONV2_KERNEL_NUM = 16     # number of kernels in the second convolution
FC_SIZE = 512             # width of the first fully connected layer
OUTPUT_NODE = 6           # one logit per gesture class


def get_weight(shape, regularizer):
    """Create a weight variable drawn from a truncated normal (stddev 0.1).

    When ``regularizer`` is not None it is used as the L2 scale and the
    resulting penalty for this variable is added to the 'losses' collection.
    """
    w = tf.Variable(tf.truncated_normal(shape, stddev=0.1))
    if regularizer is not None:
        tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(regularizer)(w))
    return w


def get_bias(shape):
    """Create a zero-initialised bias variable of the given shape."""
    b = tf.Variable(tf.zeros(shape))
    return b


def conv2d(x, w):
    """2-D convolution of ``x`` with kernel ``w``: stride 1, 'SAME' padding."""
    return tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')


def max_pool_8x8(x):
    """Max pooling with an 8x8 window, stride 4 and 'SAME' padding."""
    return tf.nn.max_pool(x, ksize=[1, 8, 8, 1], strides=[1, 4, 4, 1], padding='SAME')


def max_pool_4x4(x):
    """Max pooling with a 4x4 window, stride 2 and 'SAME' padding."""
    return tf.nn.max_pool(x, ksize=[1, 4, 4, 1], strides=[1, 2, 2, 1], padding='SAME')


def forward(x, train, regularizer):
    """Build the forward pass: conv-relu-pool twice, then two dense layers.

    Args:
        x: input tensor; the first convolution expects NUM_CHANNELS channels
            in its last dimension.
        train: when truthy, dropout is applied after the first dense layer.
        regularizer: L2 scale passed to ``get_weight``; None disables it.

    Returns:
        The un-normalised logits, with OUTPUT_NODE values per example.
    """
    conv1_w = get_weight([CONV1_SIZE, CONV1_SIZE, NUM_CHANNELS, CONV1_KERNEL_NUM], regularizer)
    conv1_b = get_bias([CONV1_KERNEL_NUM])
    conv1 = conv2d(x, conv1_w)
    relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_b))
    pool1 = max_pool_8x8(relu1)

    conv2_w = get_weight([CONV2_SIZE, CONV2_SIZE, CONV1_KERNEL_NUM, CONV2_KERNEL_NUM], regularizer)
    conv2_b = get_bias([CONV2_KERNEL_NUM])
    conv2 = conv2d(pool1, conv2_w)
    relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_b))
    pool2 = max_pool_4x4(relu2)

    # Flatten everything except the batch dimension. Using -1 for the batch
    # keeps this working when the placeholder's batch size is None; for a
    # static batch size the result is the same as using that size.
    pool_shape = pool2.get_shape().as_list()
    nodes = pool_shape[1] * pool_shape[2] * pool_shape[3]
    reshaped = tf.reshape(pool2, [-1, nodes])

    fc1_w = get_weight([nodes, FC_SIZE], regularizer)
    fc1_b = get_bias([FC_SIZE])
    fc1 = tf.nn.relu(tf.matmul(reshaped, fc1_w) + fc1_b)
    if train:
        # Second positional argument is keep_prob in the TF1 API used here.
        fc1 = tf.nn.dropout(fc1, 0.5)

    fc2_w = get_weight([FC_SIZE, OUTPUT_NODE], regularizer)
    fc2_b = get_bias([OUTPUT_NODE])
    y = tf.matmul(fc1, fc2_w) + fc2_b
    return y
# Author: Dancheng Xuezhang (QQ 746876041); contact the author for the complete code.
import tensorflow as tf
import numpy as np
import gesture_forward
import gesture_backward
from image_processing import func5,func6
import cv2


def _to_network_input(img):
    """Reshape a preprocessed image to the network input and scale it by 1/255.

    ``img`` must hold exactly IMAGE_SIZE * IMAGE_SIZE * NUM_CHANNELS values.
    """
    img = img.reshape([1, gesture_forward.IMAGE_SIZE, gesture_forward.IMAGE_SIZE,
                       gesture_forward.NUM_CHANNELS])
    img = img.astype(np.float32)
    return np.multiply(img, 1.0/255.0)


def restore_model(testPicArr):
    """Restore the latest checkpoint and predict the class of one image.

    Args:
        testPicArr: float array shaped
            [1, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS].

    Returns:
        The result of ``argmax`` over the logits (an array with one class
        index), or -1 when no checkpoint is found.

    NOTE(review): the graph is rebuilt and the checkpoint re-read on every
    call, which is expensive when called once per video frame.
    """
    with tf.Graph().as_default():
        x = tf.placeholder(tf.float32, [
            1,
            gesture_forward.IMAGE_SIZE,
            gesture_forward.IMAGE_SIZE,
            gesture_forward.NUM_CHANNELS])
        y = gesture_forward.forward(x, False, None)
        preValue = tf.argmax(y, 1)

        # Restore the moving-average (shadow) values of the variables.
        variable_averages = tf.train.ExponentialMovingAverage(gesture_backward.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(gesture_backward.MODEL_SAVE_PATH)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                preValue = sess.run(preValue, feed_dict={x: testPicArr})
                return preValue
            else:
                print("No checkpoint file found")
                return -1


def application01():
    """Classify a user-chosen number of image files entered on the console."""
    testNum = input("input the number of test pictures:")
    testNum = int(testNum)
    for i in range(testNum):
        testPic = input("the path of test picture:")
        # func5 comes from image_processing; presumably it loads and
        # preprocesses the file into a 100x100 single-channel image (confirm).
        img = func5(testPic)
        cv2.imwrite(str(i)+'ttt.jpg', img)
        img = _to_network_input(img)
        preValue = restore_model(img)
        print ("The prediction number is:", preValue)


def application02():
    """Classify frames from the default camera until 'q' is pressed."""
    # vc = cv2.VideoCapture('testVideo.mp4')
    vc = cv2.VideoCapture(0)

    # Optional recording of the annotated frames:
    #   fps = vc.get(cv2.CAP_PROP_FPS)
    #   size = (int(vc.get(cv2.CAP_PROP_FRAME_WIDTH)), int(vc.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    #   vw = cv2.VideoWriter('ges_pro.avi', cv2.VideoWriter_fourcc('X','V','I','D'), fps, size)
    # Codec choices for cv2.VideoWriter_fourcc:
    #   ('X','V','I','D')  MPEG-4
    #   ('I','4','2','0')  YUV
    #   ('P','I','M','I')  MPEG-1
    #   ('T','H','E','O')  Ogg Vorbis, file extension .ogv
    #   ('F','L','V','1')  Flash video, file extension .flv
    #
    # Optional 90-degree rotation plus centre crop for rotated input:
    #   rows, cols = frame.shape[0], frame.shape[1]
    #   t1 = int((cols-rows)/2); t2 = int(cols-t1)
    #   M = cv2.getRotationMatrix2D((cols/2, rows/2), 90, 1)
    #   frame = cv2.warpAffine(frame, M, (cols, rows))[0:rows, t1:t2]

    try:
        # Read the first frame.
        success, frame = vc.read()
        while success:
            # func6 comes from image_processing; presumably it preprocesses
            # a frame into a 100x100 single-channel image (confirm).
            img = func6(frame)
            img = _to_network_input(img)
            preValue = restore_model(img)

            # Overlay the prediction on the frame.
            cv2.putText(frame, "Gesture:"+str(preValue), (50, 50), cv2.FONT_HERSHEY_PLAIN, 2.0, (0, 0, 255), 1)
            # vw.write(frame)
            cv2.imshow('gesture', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

            # Read the next frame.
            success, frame = vc.read()
    finally:
        # Free the camera and close the windows even if a frame fails.
        vc.release()
        cv2.destroyAllWindows()
    print('viedo app over!')


def main():
    #application01()
    application02()


if __name__ == '__main__':
    main()
4 實現手勢交互
