object_detection.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. import cv2 as cv
  2. import argparse
  3. import numpy as np
  4. import sys
  5. import copy
  6. import time
  7. from threading import Thread
  8. if sys.version_info[0] == 2:
  9. import Queue as queue
  10. else:
  11. import queue
  12. from common import *
  13. from tf_text_graph_common import readTextMessage
  14. from tf_text_graph_ssd import createSSDGraph
  15. from tf_text_graph_faster_rcnn import createFasterRCNNGraph
  # Computation backends accepted by --backend. The tuple is also used to fill
  # the %d placeholders of the --backend help text, so the order matters.
  backends = (
      cv.dnn.DNN_BACKEND_DEFAULT,
      cv.dnn.DNN_BACKEND_HALIDE,
      cv.dnn.DNN_BACKEND_INFERENCE_ENGINE,
      cv.dnn.DNN_BACKEND_OPENCV,
      cv.dnn.DNN_BACKEND_VKCOM,
      cv.dnn.DNN_BACKEND_CUDA,
  )

  # Target devices accepted by --target. The tuple is also used to fill the %d
  # placeholders of the --target help text, so the order matters.
  targets = (
      cv.dnn.DNN_TARGET_CPU,
      cv.dnn.DNN_TARGET_OPENCL,
      cv.dnn.DNN_TARGET_OPENCL_FP16,
      cv.dnn.DNN_TARGET_MYRIAD,
      cv.dnn.DNN_TARGET_HDDL,
      cv.dnn.DNN_TARGET_VULKAN,
      cv.dnn.DNN_TARGET_CUDA,
      cv.dnn.DNN_TARGET_CUDA_FP16,
  )
  # Stage 1: a help-less parser with the generic options. It is parsed leniently
  # (parse_known_args) so that --zoo is available before the remaining options
  # are registered by add_preproc_args().
  parser = argparse.ArgumentParser(add_help=False)

  parser.add_argument(
      '--zoo',
      default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
      help='An optional path to file with preprocessing parameters.')

  parser.add_argument(
      '--input',
      help='Path to input image or video file. Skip this argument to capture frames from a camera.')

  parser.add_argument(
      '--out_tf_graph',
      default='graph.pbtxt',
      help='For models from TensorFlow Object Detection API, you may '
           'pass a .config file which was used for training through --config '
           'argument. This way an additional .pbtxt file with TensorFlow graph will be created.')

  parser.add_argument(
      '--framework',
      choices=['caffe', 'tensorflow', 'torch', 'darknet', 'dldt', 'onnx'],
      help='Optional name of an origin framework of the model. '
           'Detect it automatically if it does not set.')

  parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
  parser.add_argument('--nms', type=float, default=0.4, help='Non-maximum suppression threshold')

  # The adjacent literals are concatenated first, then the whole text is
  # formatted with the `backends` tuple (one %d per backend id).
  parser.add_argument(
      '--backend',
      type=int,
      choices=backends,
      default=cv.dnn.DNN_BACKEND_DEFAULT,
      help="Choose one of computation backends: "
           "%d: automatically (by default), "
           "%d: Halide language (http://halide-lang.org/), "
           "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
           "%d: OpenCV implementation, "
           "%d: VKCOM, "
           "%d: CUDA" % backends)

  # Same scheme as --backend: one %d per entry of `targets`.
  parser.add_argument(
      '--target',
      type=int,
      choices=targets,
      default=cv.dnn.DNN_TARGET_CPU,
      help='Choose one of target computation devices: '
           '%d: CPU target (by default), '
           '%d: OpenCL, '
           '%d: OpenCL fp16 (half-float precision), '
           '%d: NCS2 VPU, '
           '%d: HDDL VPU, '
           '%d: Vulkan, '
           '%d: CUDA, '
           '%d: CUDA fp16 (half-float preprocess)' % targets)

  # Stored as args.asyncN ('--async' itself is not usable as an attribute name).
  parser.add_argument(
      '--async',
      dest='asyncN',
      type=int,
      default=0,
      help='Number of asynchronous forwards at the same time. '
           'Choose 0 for synchronous mode')

  # Lenient first pass: unknown options are ignored here and validated below.
  args, _ = parser.parse_known_args()

  # Registers the remaining options on `parser` based on the zoo file. The rest
  # of the script reads args.model, args.config, args.classes, args.width,
  # args.height, args.rgb, args.scale, args.mean, args.postprocessing and
  # args.background_label_id, none of which are declared above.
  add_preproc_args(args.zoo, parser, 'object_detection')

  # Stage 2: the user-facing parser inherits every option from stage 1 and adds
  # --help plus default values in the help output.
  parser = argparse.ArgumentParser(
      parents=[parser],
      description='Use this script to run object detection deep learning networks using OpenCV.',
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  args = parser.parse_args()
  # Resolve the user-supplied paths through findFile().
  args.model = findFile(args.model)
  args.config = findFile(args.config)
  args.classes = findFile(args.classes)

  # If config specified, try to load it as TensorFlow Object Detection API's pipeline.
  config = readTextMessage(args.config)
  if 'model' in config:
      print('TensorFlow Object Detection API config detected')

      # Pick the text-graph generator matching the detector architecture.
      createGraph = None
      if 'ssd' in config['model'][0]:
          print('Preparing text graph representation for SSD model: ' + args.out_tf_graph)
          createGraph = createSSDGraph
      elif 'faster_rcnn' in config['model'][0]:
          print('Preparing text graph representation for Faster-RCNN model: ' + args.out_tf_graph)
          createGraph = createFasterRCNNGraph

      if createGraph is not None:
          # Write the .pbtxt graph and use it as the network config from now on.
          createGraph(args.model, args.config, args.out_tf_graph)
          args.config = args.out_tf_graph
  # Load names of classes: one name per line, trailing newlines ignored.
  # `classes` stays None when no class file was given.
  classes = None
  if args.classes:
      with open(args.classes, 'rt') as f:
          classText = f.read()
      classes = classText.rstrip('\n').split('\n')

  # Load a network and select where it runs.
  net = cv.dnn.readNet(args.model, args.config, args.framework)
  net.setPreferableBackend(args.backend)
  net.setPreferableTarget(args.target)

  # Names of the output layers requested from net.forward().
  outNames = net.getUnconnectedOutLayersNames()

  # Thresholds used by postprocess(); confThreshold is later updated by the
  # trackbar callback.
  confThreshold = args.thr
  nmsThreshold = args.nms
  def postprocess(frame, outs):
      """Decode network outputs and draw the resulting detections on `frame`.

      Reads the module-level `net`, `args`, `classes`, `outNames`,
      `confThreshold` and `nmsThreshold`.

      Args:
          frame: image array; shape[0] is used as height and shape[1] as width.
              It is modified in place (boxes and labels are drawn on it).
          outs: iterable of output blobs produced by the network for `frame`.

      The decoding scheme is chosen from the type of the network's last layer
      ('DetectionOutput' or 'Region') or from args.postprocessing == 'yolov8'.
      For any other layer type a message is printed and exit() is called.
      """
      frameHeight = frame.shape[0]
      frameWidth = frame.shape[1]

      def drawPred(classId, conf, left, top, right, bottom):
          """Draw one bounding box with its '<class>: <conf>' label on `frame`."""
          # Draw a bounding box.
          cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0))

          label = '%.2f' % conf

          # Print a label of class.
          if classes:
              assert(classId < len(classes))
              label = '%s: %s' % (classes[classId], label)

          labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
          # Keep the label inside the image when the box touches the top edge.
          top = max(top, labelSize[1])
          cv.rectangle(frame, (left, top - labelSize[1]), (left + labelSize[0], top + baseLine),
                       (255, 255, 255), cv.FILLED)
          cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

      layerNames = net.getLayerNames()
      lastLayerId = net.getLayerId(layerNames[-1])
      lastLayer = net.getLayer(lastLayerId)

      isRegionLike = lastLayer.type == 'Region' or args.postprocessing == 'yolov8'

      classIds = []
      confidences = []
      boxes = []
      if lastLayer.type == 'DetectionOutput':
          # Network produces output blob with a shape 1x1xNx7 where N is a number of
          # detections and an every detection is a vector of values
          # [batchId, classId, confidence, left, top, right, bottom]
          for out in outs:
              for detection in out[0, 0]:
                  confidence = detection[2]
                  if confidence > confThreshold:
                      left = int(detection[3])
                      top = int(detection[4])
                      right = int(detection[5])
                      bottom = int(detection[6])
                      width = right - left + 1
                      height = bottom - top + 1
                      if width <= 2 or height <= 2:
                          # A box this small after truncation is treated as
                          # normalized coordinates and scaled to the frame size.
                          left = int(detection[3] * frameWidth)
                          top = int(detection[4] * frameHeight)
                          right = int(detection[5] * frameWidth)
                          bottom = int(detection[6] * frameHeight)
                          width = right - left + 1
                          height = bottom - top + 1
                      classIds.append(int(detection[1]) - 1)  # Skip background label
                      confidences.append(float(confidence))
                      boxes.append([left, top, width, height])
      elif isRegionLike:
          # Network produces output blob with a shape NxC where N is a number of
          # detected objects and C is a number of classes + 4 where the first 4
          # numbers are [center_x, center_y, width, height]
          if args.postprocessing == 'yolov8':
              box_scale_w = frameWidth / args.width
              box_scale_h = frameHeight / args.height
          else:
              box_scale_w = frameWidth
              box_scale_h = frameHeight

          for out in outs:
              if args.postprocessing == 'yolov8':
                  out = out[0].transpose(1, 0)

              for detection in out:
                  scores = detection[4:]
                  if args.background_label_id >= 0:
                      scores = np.delete(scores, args.background_label_id)
                  classId = np.argmax(scores)
                  confidence = scores[classId]
                  if confidence > confThreshold:
                      center_x = int(detection[0] * box_scale_w)
                      center_y = int(detection[1] * box_scale_h)
                      width = int(detection[2] * box_scale_w)
                      height = int(detection[3] * box_scale_h)
                      left = int(center_x - width / 2)
                      top = int(center_y - height / 2)
                      classIds.append(classId)
                      confidences.append(float(confidence))
                      boxes.append([left, top, width, height])
      else:
          print('Unknown output layer type: ' + lastLayer.type)
          exit()

      # NMS is used inside Region layer only on DNN_BACKEND_OPENCV for another backends we need NMS in sample
      # or NMS is required if number of outputs > 1
      if len(outNames) > 1 or (isRegionLike and args.backend != cv.dnn.DNN_BACKEND_OPENCV):
          indices = []
          classIds = np.array(classIds)
          boxes = np.array(boxes)
          confidences = np.array(confidences)
          # Run NMS separately for every class that produced a detection.
          for cl in set(classIds):
              class_indices = np.where(classIds == cl)[0]
              conf = confidences[class_indices]
              box = boxes[class_indices].tolist()
              nms_indices = cv.dnn.NMSBoxes(box, conf, confThreshold, nmsThreshold)
              # Normalise the result to a flat integer array before using it as
              # an index: an Nx1 array would yield nested index arrays and an
              # empty tuple would select *every* element of class_indices.
              nms_indices = np.asarray(nms_indices, dtype=int).reshape(-1)
              indices.extend(class_indices[nms_indices])
      else:
          indices = np.arange(0, len(classIds))

      for i in indices:
          left, top, width, height = boxes[i][0], boxes[i][1], boxes[i][2], boxes[i][3]
          drawPred(classIds[i], confidences[i], left, top, left + width, top + height)
  # Process inputs
  winName = 'Deep learning object detection in OpenCV'
  cv.namedWindow(winName, cv.WINDOW_NORMAL)


  def callback(pos):
      """Trackbar handler: convert the slider position (percent) to confThreshold."""
      global confThreshold
      confThreshold = pos / 100.0


  # Slider range is 0..99 and starts at the threshold given on the command line.
  cv.createTrackbar('Confidence threshold, %', winName, int(confThreshold * 100), 99, callback)

  # Open the given image/video file, or capture device 0 when --input is omitted.
  if args.input:
      cap = cv.VideoCapture(cv.samples.findFileOrKeep(args.input))
  else:
      cap = cv.VideoCapture(0)
  class QueueFPS(queue.Queue):
      """Unbounded FIFO queue that counts put() calls to estimate a rate."""

      def __init__(self):
          queue.Queue.__init__(self)
          # Wall-clock time of the first put(); 0 until something is queued.
          self.startTime = 0
          # Number of items ever put into the queue (never decremented by get()).
          self.counter = 0

      def put(self, v, block=True, timeout=None):
          """Queue `v`, count it and start the clock on the first item.

          `block` and `timeout` are forwarded to queue.Queue.put(). Accepting
          them keeps the inherited put_nowait() working, since it calls
          put() with a `block` argument.
          """
          queue.Queue.put(self, v, block, timeout)
          self.counter += 1
          if self.counter == 1:
              self.startTime = time.time()

      def getFPS(self):
          """Return items put per second since the first put().

          Returns 0.0 when nothing has been queued yet or when no time has
          elapsed, instead of dividing by zero.
          """
          if self.counter == 0:
              return 0.0
          elapsed = time.time() - self.startTime
          if elapsed <= 0:
              return 0.0
          return self.counter / elapsed
  # Shared run flag: the worker loops keep going while it is True.
  process = True

  #
  # Frames capturing thread
  #
  framesQueue = QueueFPS()


  def framesThreadBody():
      """Read frames from `cap` into `framesQueue` until stopped or exhausted.

      The loop ends when the module-level `process` flag becomes false or when
      cap.read() reports that no frame is available.
      """
      while process:
          grabbed, image = cap.read()
          if grabbed:
              framesQueue.put(image)
          else:
              break
  #
  # Frames processing thread
  #
  processedFramesQueue = queue.Queue()
  predictionsQueue = QueueFPS()


  def processingThreadBody():
      """Run the network on captured frames until `process` becomes false.

      For every frame taken from `framesQueue` the frame itself is pushed to
      `processedFramesQueue` and, once available, the matching network output
      is pushed to `predictionsQueue`. Frames are always queued before their
      predictions.

      With args.asyncN == 0 inference is synchronous and all frames that piled
      up in `framesQueue` meanwhile are dropped. Otherwise up to args.asyncN
      asynchronous forwards are kept in flight and a frame is skipped while
      that limit is reached.
      """
      futureOutputs = []
      while process:
          # Get a next frame
          frame = None
          try:
              frame = framesQueue.get_nowait()
          except queue.Empty:
              pass
          else:
              if args.asyncN:
                  if len(futureOutputs) == args.asyncN:
                      frame = None  # Skip the frame
              else:
                  framesQueue.queue.clear()  # Skip the rest of frames

          if frame is not None:
              frameHeight = frame.shape[0]
              frameWidth = frame.shape[1]

              # Create a 4D blob from a frame.
              inpWidth = args.width if args.width else frameWidth
              inpHeight = args.height if args.height else frameHeight
              blob = cv.dnn.blobFromImage(frame, size=(inpWidth, inpHeight), swapRB=args.rgb, ddepth=cv.CV_8U)
              processedFramesQueue.put(frame)

              # Run a model
              net.setInput(blob, scalefactor=args.scale, mean=args.mean)
              if net.getLayer(0).outputNameToIndex('im_info') != -1:  # Faster-RCNN or R-FCN
                  # Only the 'im_info' input is needed here; the frame was
                  # already queued, so resizing the local copy had no effect.
                  net.setInput(np.array([[inpHeight, inpWidth, 1.6]], dtype=np.float32), 'im_info')

              if args.asyncN:
                  futureOutputs.append(net.forwardAsync())
              else:
                  outs = net.forward(outNames)
                  predictionsQueue.put(copy.deepcopy(outs))

          # Collect finished asynchronous forwards, oldest first.
          collected = False
          while futureOutputs and futureOutputs[0].wait_for(0):
              out = futureOutputs[0].get()
              predictionsQueue.put(copy.deepcopy([out]))
              del futureOutputs[0]
              collected = True

          if frame is None and not collected:
              # Nothing to do in this iteration: yield briefly instead of
              # spinning at full speed and starving the other threads.
              time.sleep(0.001)
  framesThread = Thread(target=framesThreadBody)
  framesThread.start()

  processingThread = Thread(target=processingThreadBody)
  processingThread.start()

  #
  # Postprocessing and rendering loop
  #
  # The loop runs until any key is pressed. It is wrapped in try/finally so the
  # worker threads are always told to stop and joined, even when the loop is
  # left through an exception (e.g. exit() inside postprocess or Ctrl+C).
  # Otherwise the non-daemon threads would keep the interpreter alive.
  try:
      while cv.waitKey(1) < 0:
          try:
              # Request prediction first because they put after frames
              outs = predictionsQueue.get_nowait()
              frame = processedFramesQueue.get_nowait()
          except queue.Empty:
              # No finished prediction yet; poll the keyboard again.
              continue

          postprocess(frame, outs)

          # Put efficiency information.
          if predictionsQueue.counter > 1:
              label = 'Camera: %.2f FPS' % (framesQueue.getFPS())
              cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

              label = 'Network: %.2f FPS' % (predictionsQueue.getFPS())
              cv.putText(frame, label, (0, 30), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

              label = 'Skipped frames: %d' % (framesQueue.counter - predictionsQueue.counter)
              cv.putText(frame, label, (0, 45), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

          cv.imshow(winName, frame)
  finally:
      # Signal both worker loops to finish and wait for them.
      process = False
      framesThread.join()
      processingThread.join()
  288. processingThread.join()