loftr_template_match.py 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789
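  """
  Match a small template image inside a full screenshot with LoFTR to obtain the
  geometric relation between them (a homography, or a fallback point set).

  With a small template and a large image, LoFTR outliers can corrupt the
  homography, and a local NCC centered on the wrong cluster boxes the wrong region
  (e.g. mistaking the refresh button for the search icon).

  A multi-scale global NCC is therefore run at full resolution and the
  highest-scoring location over the whole image is preferred as the final box;
  LoFTR is still used as the geometric fallback.
  """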
  6. from __future__ import annotations
  7. import math
  8. import ssl
  9. import sys
  10. import urllib.request
  11. from abc import ABC, abstractmethod
  12. from dataclasses import dataclass
  13. from pathlib import Path
  14. import cv2
  15. import numpy as np
  # Same source as the Kornia LoFTR "outdoor" weights; the checkpoint is fetched
  # from here when no local weight file is present.
  LOFTR_OUTDOOR_WEIGHT_DOWNLOAD_URL = "http://cmp.felk.cvut.cz/~mishkdmy/models/loftr_outdoor.ckpt"

  # Directory two levels above this file.
  _REPOSITORY_ROOT_DIRECTORY = Path(__file__).resolve().parent.parent
  @dataclass(frozen=True)
  class TemplateMatchBoundingBoxAndCenterRoundedToIntegerScreenPixels:
      """Immutable template-match result expressed in whole pixels.

      Stores the bounding box as left / top / width / height and the center
      point as x / y, all as integers.
      """

      bounding_box_left_integer: int
      bounding_box_top_integer: int
      bounding_box_width_integer: int
      bounding_box_height_integer: int
      center_screen_x_integer: int
      center_screen_y_integer: int

      def bounding_box_left_top_width_height_xywh_as_tuple_of_four_integers(
          self,
      ) -> tuple[int, int, int, int]:
          """Return the bounding box as a ``(left, top, width, height)`` tuple."""
          left = self.bounding_box_left_integer
          top = self.bounding_box_top_integer
          width = self.bounding_box_width_integer
          height = self.bounding_box_height_integer
          return left, top, width, height
  @dataclass(frozen=True)
  class TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
      """Immutable template-match result with floating-point coordinates.

      Holds the bounding box as an ``(x, y, width, height)`` tuple of floats
      together with the center point.
      """

      bounding_box_left_top_width_height_xywh_float_tuple: tuple[float, float, float, float]
      center_screen_x_float: float
      center_screen_y_float: float

      def as_rounded_to_integer_screen_coordinates(
          self,
      ) -> TemplateMatchBoundingBoxAndCenterRoundedToIntegerScreenPixels:
          """Return a copy of this result with every value rounded to an int."""
          left, top, width, height = self.bounding_box_left_top_width_height_xywh_float_tuple
          rounded_center_x = int(round(self.center_screen_x_float))
          rounded_center_y = int(round(self.center_screen_y_float))
          return TemplateMatchBoundingBoxAndCenterRoundedToIntegerScreenPixels(
              bounding_box_left_integer=int(round(left)),
              bounding_box_top_integer=int(round(top)),
              bounding_box_width_integer=int(round(width)),
              bounding_box_height_integer=int(round(height)),
              center_screen_x_integer=rounded_center_x,
              center_screen_y_integer=rounded_center_y,
          )
  @dataclass(frozen=True)
  class LoFTRTemplateAgainstScreenshotMatch:
      """Intermediate LoFTR match data for a template located in a screenshot.

      Used to resolve the full-resolution bounding box and center point.
      """

      # Screenshot at its original resolution, BGR.
      screenshot_bgr_full_size: np.ndarray
      # 3x3 homography mapping template inference-resolution coordinates to
      # screenshot inference-resolution coordinates; None when it was judged
      # unreliable.
      homography_template_inference_to_screenshot_inference: np.ndarray | None
      template_width_pixels_at_inference: int
      template_height_pixels_at_inference: int
      # Full-size coordinate = inference coordinate / divisor (per axis).
      divisor_inference_screenshot_x_to_fullsize_x: float
      divisor_inference_screenshot_y_to_fullsize_y: float
      # High-confidence match points in screenshot inference coordinates; kept
      # for a fallback bounding box.
      high_confidence_match_points_on_screenshot_inference: np.ndarray
      template_original_width_pixels: int
      template_original_height_pixels: int
      # (x, y, width, height) in full-image coordinates when NCC refinement
      # succeeded, otherwise None.
      refined_template_bbox_xywh_full_size: tuple[float, float, float, float] | None
      # RANSAC inlier points in screenshot inference resolution; used for the
      # fallback rectangle and as the refinement search center.
      ransac_inlier_points_screenshot_inference: np.ndarray
  class TemplateAgainstScreenshotMatcher(ABC):
      """Abstract interface for locating a template image inside a screenshot."""

      @abstractmethod
      def match_template_center_in_screenshot(
          self,
          template_image_file_path: Path | str,
          screenshot_image_file_path: Path | str | None = None,
          *,
          screenshot_bgr_full_size_numpy: np.ndarray | None = None,
      ) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
          """Return the bounding box and center of the template in the screenshot.

          The screenshot may be supplied as a file path or as an in-memory
          array; concrete subclasses define how the two are combined.
          """
  86. def _template_match_result_center_xy_in_screenshot_pixels(
  87. match: LoFTRTemplateAgainstScreenshotMatch,
  88. ) -> tuple[float, float]:
  89. refined_bbox_xywh = match.refined_template_bbox_xywh_full_size
  90. if refined_bbox_xywh is not None:
  91. bbox_x, bbox_y, bbox_width, bbox_height = refined_bbox_xywh
  92. return (
  93. bbox_x + bbox_width / 2.0,
  94. bbox_y + bbox_height / 2.0,
  95. )
  96. homography_matrix = match.homography_template_inference_to_screenshot_inference
  97. if homography_matrix is not None:
  98. template_width_inference = match.template_width_pixels_at_inference
  99. template_height_inference = match.template_height_pixels_at_inference
  100. corners_template_inference_xy = np.array(
  101. [
  102. [0, 0],
  103. [template_width_inference - 1, 0],
  104. [template_width_inference - 1, template_height_inference - 1],
  105. [0, template_height_inference - 1],
  106. ],
  107. dtype=np.float32,
  108. ).reshape(1, 4, 2)
  109. corners_screenshot_inference_xy = cv2.perspectiveTransform(
  110. corners_template_inference_xy, homography_matrix
  111. )[0]
  112. divisor_x = match.divisor_inference_screenshot_x_to_fullsize_x
  113. divisor_y = match.divisor_inference_screenshot_y_to_fullsize_y
  114. corners_screenshot_full_xy = corners_screenshot_inference_xy.copy()
  115. corners_screenshot_full_xy[:, 0] /= divisor_x
  116. corners_screenshot_full_xy[:, 1] /= divisor_y
  117. mean_corner_xy = np.mean(corners_screenshot_full_xy, axis=0)
  118. return (float(mean_corner_xy[0]), float(mean_corner_xy[1]))
  119. inlier_points_inference_xy = match.ransac_inlier_points_screenshot_inference
  120. median_inference_xy = np.median(inlier_points_inference_xy, axis=0)
  121. divisor_x = match.divisor_inference_screenshot_x_to_fullsize_x
  122. divisor_y = match.divisor_inference_screenshot_y_to_fullsize_y
  123. return (
  124. float(median_inference_xy[0] / divisor_x),
  125. float(median_inference_xy[1] / divisor_y),
  126. )
  127. def _template_match_bounding_box_xywh_full_size_float_tuple_from_loftr_template_against_screenshot_match_object(
  128. match: LoFTRTemplateAgainstScreenshotMatch,
  129. ) -> tuple[float, float, float, float]:
  130. refined_bbox_xywh = match.refined_template_bbox_xywh_full_size
  131. if refined_bbox_xywh is not None:
  132. return refined_bbox_xywh
  133. homography_matrix = match.homography_template_inference_to_screenshot_inference
  134. if homography_matrix is not None:
  135. template_width_inference = match.template_width_pixels_at_inference
  136. template_height_inference = match.template_height_pixels_at_inference
  137. corners_template_inference_xy = np.array(
  138. [
  139. [0, 0],
  140. [template_width_inference - 1, 0],
  141. [
  142. template_width_inference - 1,
  143. template_height_inference - 1,
  144. ],
  145. [0, template_height_inference - 1],
  146. ],
  147. dtype=np.float32,
  148. ).reshape(1, 4, 2)
  149. corners_screenshot_inference_xy = cv2.perspectiveTransform(
  150. corners_template_inference_xy,
  151. homography_matrix,
  152. )[0]
  153. divisor_x = match.divisor_inference_screenshot_x_to_fullsize_x
  154. divisor_y = match.divisor_inference_screenshot_y_to_fullsize_y
  155. corners_screenshot_full_xy = corners_screenshot_inference_xy.copy()
  156. corners_screenshot_full_xy[:, 0] /= divisor_x
  157. corners_screenshot_full_xy[:, 1] /= divisor_y
  158. min_x = float(np.min(corners_screenshot_full_xy[:, 0]))
  159. min_y = float(np.min(corners_screenshot_full_xy[:, 1]))
  160. max_x = float(np.max(corners_screenshot_full_xy[:, 0]))
  161. max_y = float(np.max(corners_screenshot_full_xy[:, 1]))
  162. return (min_x, min_y, max_x - min_x, max_y - min_y)
  163. inlier_points_inference_xy = match.ransac_inlier_points_screenshot_inference
  164. divisor_x = match.divisor_inference_screenshot_x_to_fullsize_x
  165. divisor_y = match.divisor_inference_screenshot_y_to_fullsize_y
  166. inlier_full_xy = inlier_points_inference_xy.astype(np.float64).copy()
  167. inlier_full_xy[:, 0] /= divisor_x
  168. inlier_full_xy[:, 1] /= divisor_y
  169. min_x = float(np.min(inlier_full_xy[:, 0]))
  170. min_y = float(np.min(inlier_full_xy[:, 1]))
  171. max_x = float(np.max(inlier_full_xy[:, 0]))
  172. max_y = float(np.max(inlier_full_xy[:, 1]))
  173. return (min_x, min_y, max_x - min_x, max_y - min_y)
  def _template_match_bounding_box_and_center_in_screenshot_pixels_from_loftr_template_against_screenshot_match_object(
      match: LoFTRTemplateAgainstScreenshotMatch,
  ) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
      """Bundle the full-size bounding box and center of ``match`` into one result."""
      # The box is resolved before the center, as two independent lookups.
      box_xywh = _template_match_bounding_box_xywh_full_size_float_tuple_from_loftr_template_against_screenshot_match_object(
          match,
      )
      center_xy = _template_match_result_center_xy_in_screenshot_pixels(match)
      center_x, center_y = center_xy
      return TemplateMatchBoundingBoxAndCenterInScreenshotPixels(
          bounding_box_left_top_width_height_xywh_float_tuple=box_xywh,
          center_screen_x_float=center_x,
          center_screen_y_float=center_y,
      )
  class LoFTRTemplateAgainstScreenshotMatcher(TemplateAgainstScreenshotMatcher):
      """Matcher that forwards to ``run_loftr_template_match`` with stored settings."""

      def __init__(
          self,
          loftr_repository_directory: Path,
          loftr_weight_checkpoint_file_path: Path,
          *,
          template_long_edge_max_pixels: int = 640,
          screenshot_long_edge_max_pixels: int = 1280,
          ransac_reprojection_threshold: float = 3.0,
          max_matches_for_homography: int = 800,
      ) -> None:
          """Remember the LoFTR locations and tuning values for later matching."""
          # Where the LoFTR sources and checkpoint live.
          self._loftr_repository_directory = loftr_repository_directory
          self._loftr_weight_checkpoint_file_path = loftr_weight_checkpoint_file_path
          # Longest-edge caps applied before inference.
          self._template_long_edge_max_pixels = template_long_edge_max_pixels
          self._screenshot_long_edge_max_pixels = screenshot_long_edge_max_pixels
          # Homography estimation settings.
          self._ransac_reprojection_threshold = ransac_reprojection_threshold
          self._max_matches_for_homography = max_matches_for_homography

      def match_template_center_in_screenshot(
          self,
          template_image_file_path: Path | str,
          screenshot_image_file_path: Path | str | None = None,
          *,
          screenshot_bgr_full_size_numpy: np.ndarray | None = None,
      ) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
          """Match the template against the screenshot and return box plus center.

          Both the screenshot path (converted to ``Path`` when given) and the
          in-memory screenshot array are passed through unchanged in meaning to
          ``run_loftr_template_match``.
          """
          screenshot_path_for_loftr = (
              None
              if screenshot_image_file_path is None
              else Path(screenshot_image_file_path)
          )
          return run_loftr_template_match(
              loftr_repository_directory=self._loftr_repository_directory,
              loftr_weight_checkpoint_path=self._loftr_weight_checkpoint_file_path,
              screenshot_image_path=screenshot_path_for_loftr,
              screenshot_bgr_full_size_numpy=screenshot_bgr_full_size_numpy,
              template_image_path=Path(template_image_file_path),
              template_long_edge_max_pixels=self._template_long_edge_max_pixels,
              screenshot_long_edge_max_pixels=self._screenshot_long_edge_max_pixels,
              ransac_reprojection_threshold=self._ransac_reprojection_threshold,
              max_matches_for_homography=self._max_matches_for_homography,
          )
  def match_template_center_xy_for_screenshot_file_and_template_file(
      source_screen_screenshot_image_file_path: Path | str,
      template_image_file_path: Path | str,
  ) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
      """
      Match ``template_image_file_path`` inside the full image at
      ``source_screen_screenshot_image_file_path`` (e.g. a full-screen PNG) using
      LoFTR + NCC, and return the full-resolution bounding box
      ``(x, y, width, height)`` and template center ``(x, y)``.

      The LoFTR directory and weights use the repository default paths:
      ``<repository root>/python/LoFTR`` and ``weights/loftr_outdoor.ckpt`` inside it.
      """
      loftr_directory = _REPOSITORY_ROOT_DIRECTORY / "python" / "LoFTR"
      checkpoint_file = loftr_directory / "weights" / "loftr_outdoor.ckpt"
      matcher = LoFTRTemplateAgainstScreenshotMatcher(loftr_directory, checkpoint_file)
      screenshot_path = Path(source_screen_screenshot_image_file_path)
      return matcher.match_template_center_in_screenshot(
          template_image_file_path,
          screenshot_path,
      )
  253. def _download_file_with_ssl_fallbacks(download_url: str, destination_file_path: Path) -> None:
  254. destination_file_path.parent.mkdir(parents=True, exist_ok=True)
  255. ssl_context_candidates: list[ssl.SSLContext] = []
  256. try:
  257. import certifi
  258. ssl_context_candidates.append(
  259. ssl.create_default_context(cafile=certifi.where())
  260. )
  261. except Exception:
  262. pass
  263. ssl_context_candidates.append(ssl.create_default_context())
  264. ssl_context_candidates.append(ssl._create_unverified_context())
  265. last_error: BaseException | None = None
  266. for ssl_context in ssl_context_candidates:
  267. try:
  268. with urllib.request.urlopen(
  269. download_url, context=ssl_context, timeout=300
  270. ) as response:
  271. destination_file_path.write_bytes(response.read())
  272. return
  273. except Exception as exc:
  274. last_error = exc
  275. raise RuntimeError(
  276. f"无法下载 LoFTR 权重:{download_url}\n请手动保存到:{destination_file_path}"
  277. ) from last_error
  def _resize_grayscale_divisible_by_eight(
      grayscale_image: np.ndarray,
      longest_edge_max_pixels: int,
  ) -> tuple[np.ndarray, float, float]:
      """Resize an image so both sides are multiples of eight (at least 8).

      The image is first shrunk, if necessary, so that its longest edge does
      not exceed ``longest_edge_max_pixels``; each side is then rounded down to
      a multiple of eight.

      Returns ``(resized_image, divisor_x, divisor_y)`` where a full-size
      coordinate equals the inference coordinate divided by the divisor.
      """
      source_height, source_width = grayscale_image.shape[:2]
      longest_edge = max(source_height, source_width)

      if longest_edge > longest_edge_max_pixels:
          shrink_factor = longest_edge_max_pixels / longest_edge
          scaled_width = int(round(source_width * shrink_factor))
          scaled_height = int(round(source_height * shrink_factor))
      else:
          scaled_width, scaled_height = source_width, source_height

      # Round each side down to a multiple of 8, never below 8.
      snapped_width = max(scaled_width - scaled_width % 8, 8)
      snapped_height = max(scaled_height - scaled_height % 8, 8)

      resized_image = cv2.resize(
          grayscale_image,
          (snapped_width, snapped_height),
          interpolation=cv2.INTER_AREA,
      )
      return (
          resized_image,
          snapped_width / source_width,
          snapped_height / source_height,
      )
  299. def _filter_matches_near_confident_median(
  300. template_points: np.ndarray,
  301. screenshot_points: np.ndarray,
  302. confidence: np.ndarray,
  303. template_width_inf: int,
  304. template_height_inf: int,
  305. min_points: int = 12,
  306. ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
  307. """保留置信度高且在几何中位数附近成簇的匹配,抑制随机外点。"""
  308. order = np.argsort(-confidence)
  309. top_n = min(200, len(order))
  310. sp = screenshot_points[order[:top_n]]
  311. tp = template_points[order[:top_n]]
  312. cf = confidence[order[:top_n]]
  313. median_screen = np.median(sp[: min(40, len(sp))], axis=0)
  314. diag = float(np.hypot(template_width_inf, template_height_inf))
  315. radius = max(diag * 2.5, 48.0)
  316. for _ in range(8):
  317. dist = np.linalg.norm(sp - median_screen, axis=1)
  318. keep = dist < radius
  319. if int(np.sum(keep)) >= min_points:
  320. return tp[keep], sp[keep], cf[keep]
  321. radius *= 1.35
  322. median_screen = np.median(sp[keep] if np.any(keep) else sp, axis=0)
  323. return tp, sp, cf
  324. def _homography_quad_plausible_on_full_image(
  325. corners_full_xy: np.ndarray,
  326. image_width: int,
  327. image_height: int,
  328. template_orig_w: int,
  329. template_orig_h: int,
  330. ) -> bool:
  331. """检查单应投影四边形是否落在图内且尺度/长宽比与模板大致一致。"""
  332. if corners_full_xy.shape != (4, 2) or not np.all(np.isfinite(corners_full_xy)):
  333. return False
  334. margin_x = 0.08 * image_width
  335. margin_y = 0.08 * image_height
  336. xs, ys = corners_full_xy[:, 0], corners_full_xy[:, 1]
  337. if xs.min() < -margin_x or xs.max() > image_width + margin_x:
  338. return False
  339. if ys.min() < -margin_y or ys.max() > image_height + margin_y:
  340. return False
  341. bbox_w = float(xs.max() - xs.min())
  342. bbox_h = float(ys.max() - ys.min())
  343. if bbox_w < 4 or bbox_h < 4:
  344. return False
  345. tpl_ar = template_orig_w / max(template_orig_h, 1)
  346. box_ar = bbox_w / max(bbox_h, 1e-6)
  347. ratio = box_ar / max(tpl_ar, 1e-6)
  348. if ratio > 3.5 or ratio < (1.0 / 3.5):
  349. return False
  350. area = float(cv2.contourArea(corners_full_xy.astype(np.float32)))
  351. expected = float(template_orig_w * template_orig_h)
  352. if area < 0.12 * expected or area > 30.0 * expected:
  353. return False
  354. return True
  355. def _ncc_global_multiscale_best_match(
  356. screenshot_gray_full: np.ndarray,
  357. template_gray_full: np.ndarray,
  358. template_scale_factors: tuple[float, ...] = (
  359. 0.88,
  360. 0.92,
  361. 0.96,
  362. 1.0,
  363. 1.04,
  364. 1.08,
  365. 1.12,
  366. ),
  367. minimum_acceptable_score: float = 0.34,
  368. ) -> tuple[tuple[float, float, float, float], float] | None:
  369. """
  370. 在全图上对多种缩放的模板做 TM_CCOEFF_NORMED,取全局最大响应。
  371. 适用于 UI 小图标与截图同尺度、需避免「只在 LoFTR 簇附近找」而找错位置的情况。
  372. 返回 ((x, y, w, h), 最佳分数);分数低于阈值则视为不可靠。
  373. """
  374. h_img, w_img = screenshot_gray_full.shape[:2]
  375. h_tpl0, w_tpl0 = template_gray_full.shape[:2]
  376. if h_tpl0 >= h_img or w_tpl0 >= w_img:
  377. return None
  378. best_score = -1.0
  379. best_bbox_xywh: tuple[float, float, float, float] | None = None
  380. for scale in template_scale_factors:
  381. tw = max(3, int(round(w_tpl0 * scale)))
  382. th = max(3, int(round(h_tpl0 * scale)))
  383. if tw >= w_img or th >= h_img:
  384. continue
  385. template_scaled = cv2.resize(
  386. template_gray_full, (tw, th), interpolation=cv2.INTER_AREA
  387. )
  388. response_map = cv2.matchTemplate(
  389. screenshot_gray_full, template_scaled, cv2.TM_CCOEFF_NORMED
  390. )
  391. _, max_val, _, max_loc = cv2.minMaxLoc(response_map)
  392. if max_val > best_score:
  393. best_score = float(max_val)
  394. best_bbox_xywh = (
  395. float(max_loc[0]),
  396. float(max_loc[1]),
  397. float(tw),
  398. float(th),
  399. )
  400. if best_bbox_xywh is None or best_score < minimum_acceptable_score:
  401. return None
  402. return best_bbox_xywh, best_score
  def list_template_match_centers_ncc_multiscale(
      template_image_file_path: Path | str,
      *,
      screenshot_image_file_path: Path | str | None = None,
      screenshot_bgr_numpy: np.ndarray | None = None,
      template_scale_factors: tuple[float, ...] = (
          0.88,
          0.92,
          0.96,
          1.0,
          1.04,
          1.08,
          1.12,
      ),
      min_score: float = 0.28,
      max_peaks_per_scale: int = 8,
      dedupe_distance_pixels: float = 28.0,
  ) -> list[tuple[float, float]]:
      """
      Run multi-scale ``matchTemplate`` over the whole screenshot, collect the
      strongest response peaks per scale with non-maximum suppression, then
      de-duplicate them from highest to lowest score.

      Returns the template center points (full-resolution coordinates), meant
      for picking the match nearest to an OCR anchor. The list is empty when
      the template is not strictly smaller than the screenshot.

      Raises:
          FileNotFoundError: when the screenshot or the template cannot be read.
      """
      if screenshot_bgr_numpy is None:
          screenshot_bgr = cv2.imread(str(screenshot_image_file_path), cv2.IMREAD_COLOR)
      else:
          screenshot_bgr = screenshot_bgr_numpy
      template_gray = cv2.imread(str(template_image_file_path), cv2.IMREAD_GRAYSCALE)
      if screenshot_bgr is None or template_gray is None:
          raise FileNotFoundError(
              f"无法读取截图或模板:screenshot_image_file_path={screenshot_image_file_path!s} template_image_file_path={template_image_file_path!s}"
          )

      screenshot_gray = cv2.cvtColor(screenshot_bgr, cv2.COLOR_BGR2GRAY)
      image_height, image_width = screenshot_gray.shape[:2]
      base_height, base_width = template_gray.shape[:2]
      if base_height >= image_height or base_width >= image_width:
          return []

      # (center_x, center_y, score) for every accepted peak at every scale.
      scored_centers: list[tuple[float, float, float]] = []
      for factor in template_scale_factors:
          scaled_width = max(3, int(round(base_width * factor)))
          scaled_height = max(3, int(round(base_height * factor)))
          if scaled_width >= image_width or scaled_height >= image_height:
              continue
          scaled_template = cv2.resize(
              template_gray, (scaled_width, scaled_height), interpolation=cv2.INTER_AREA
          )
          response_map = cv2.matchTemplate(
              screenshot_gray, scaled_template, cv2.TM_CCOEFF_NORMED
          )
          suppress_radius = int(max(4, max(scaled_width, scaled_height) * 0.55))
          remaining = response_map.copy()
          map_rows, map_columns = remaining.shape[0], remaining.shape[1]
          for _ in range(max_peaks_per_scale):
              extrema = cv2.minMaxLoc(remaining)
              peak_score, peak_location = extrema[1], extrema[3]
              if peak_score < min_score:
                  break
              peak_x, peak_y = peak_location
              scored_centers.append(
                  (
                      float(peak_x) + float(scaled_width) / 2.0,
                      float(peak_y) + float(scaled_height) / 2.0,
                      float(peak_score),
                  )
              )
              # Blank out the neighbourhood so the next pass finds a new peak.
              row_start = max(0, peak_y - suppress_radius)
              row_stop = min(map_rows, peak_y + suppress_radius + 1)
              column_start = max(0, peak_x - suppress_radius)
              column_stop = min(map_columns, peak_x + suppress_radius + 1)
              remaining[row_start:row_stop, column_start:column_stop] = -1.0

      # Highest score first, then drop centers too close to a kept one.
      unique_centers: list[tuple[float, float]] = []
      for center_x, center_y, _score in sorted(
          scored_centers, key=lambda entry: -entry[2]
      ):
          too_close = any(
              math.hypot(center_x - kept_x, center_y - kept_y) < dedupe_distance_pixels
              for kept_x, kept_y in unique_centers
          )
          if not too_close:
              unique_centers.append((center_x, center_y))
      return unique_centers
  478. def _ncc_refine_template_bbox(
  479. screenshot_gray_full: np.ndarray,
  480. template_gray_full: np.ndarray,
  481. center_x_full: float,
  482. center_y_full: float,
  483. search_margin_full: float,
  484. min_ncc_score: float = 0.25,
  485. ) -> tuple[float, float, float, float] | None:
  486. """在以 LoFTR 粗中心为邻域内做归一化互相关,返回最佳 (x, y, w, h)。"""
  487. h_img, w_img = screenshot_gray_full.shape[:2]
  488. h_t, w_t = template_gray_full.shape[:2]
  489. if h_t >= h_img or w_t >= w_img:
  490. return None
  491. half = int(max(search_margin_full, max(w_t, h_t) * 2))
  492. x0 = int(np.clip(center_x_full - half, 0, w_img - 1))
  493. y0 = int(np.clip(center_y_full - half, 0, h_img - 1))
  494. x1 = int(np.clip(center_x_full + half, 0, w_img))
  495. y1 = int(np.clip(center_y_full + half, 0, h_img))
  496. if x1 - x0 <= w_t or y1 - y0 <= h_t:
  497. return None
  498. roi = screenshot_gray_full[y0:y1, x0:x1]
  499. result = cv2.matchTemplate(roi, template_gray_full, cv2.TM_CCOEFF_NORMED)
  500. _, max_val, _, max_loc = cv2.minMaxLoc(result)
  501. if max_val < min_ncc_score:
  502. return None
  503. x = float(x0 + max_loc[0])
  504. y = float(y0 + max_loc[1])
  505. return (x, y, float(w_t), float(h_t))
  def run_loftr_template_match(
      *,
      loftr_repository_directory: Path,
      loftr_weight_checkpoint_path: Path,
      screenshot_image_path: Path | None = None,
      screenshot_bgr_full_size_numpy: np.ndarray | None = None,
      template_image_path: Path,
      template_long_edge_max_pixels: int = 640,
      screenshot_long_edge_max_pixels: int = 1280,
      ransac_reprojection_threshold: float = 3.0,
      max_matches_for_homography: int = 800,
  ) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
      """
      Read the template and the screenshot, run LoFTR at inference scale, filter
      outliers, estimate a homography, and refine the position with NCC at full
      resolution. Returns the full-resolution bounding box and center; nothing
      is written to disk except a missing checkpoint, which is downloaded.

      The screenshot comes from ``screenshot_bgr_full_size_numpy`` when given,
      otherwise it is read from ``screenshot_image_path``. The network is built
      and its checkpoint loaded on every call, on CUDA when available, else CPU.

      Position preference: the global multi-scale NCC result; if that fails, the
      local NCC around the LoFTR center; if that fails too, the global NCC again
      with a relaxed score threshold of 0.18. When all NCC attempts fail, the
      result falls back to the homography (if judged plausible) or to the RANSAC
      inlier points.

      Inference runs under ``torch.no_grad()``, so the caller's global autograd
      mode is left untouched.

      Raises:
          FileNotFoundError: when the LoFTR directory does not exist, or the
              screenshot or template image cannot be read.
      """
      if not loftr_repository_directory.is_dir():
          raise FileNotFoundError(f"未找到 LoFTR 源码目录:{loftr_repository_directory}")
      # The LoFTR sources are imported as the top-level package ``src``, so the
      # repository directory must be on ``sys.path`` before the import below.
      repository_path_string = str(loftr_repository_directory)
      if repository_path_string not in sys.path:
          sys.path.insert(0, repository_path_string)
      import torch  # noqa: E402
      from src.loftr import LoFTR, default_cfg  # noqa: E402

      if not loftr_weight_checkpoint_path.is_file():
          _download_file_with_ssl_fallbacks(
              LOFTR_OUTDOOR_WEIGHT_DOWNLOAD_URL, loftr_weight_checkpoint_path
          )

      if screenshot_bgr_full_size_numpy is not None:
          screenshot_bgr_full_size = screenshot_bgr_full_size_numpy
      else:
          screenshot_bgr_full_size = cv2.imread(
              str(screenshot_image_path), cv2.IMREAD_COLOR
          )
      template_grayscale = cv2.imread(
          str(template_image_path), cv2.IMREAD_GRAYSCALE
      )
      if screenshot_bgr_full_size is None or template_grayscale is None:
          raise FileNotFoundError(
              f"无法读取图片:screenshot_image_path={screenshot_image_path!s} template_image_path={template_image_path!s}"
          )

      template_orig_h, template_orig_w = template_grayscale.shape[:2]
      screenshot_grayscale = cv2.cvtColor(
          screenshot_bgr_full_size, cv2.COLOR_BGR2GRAY
      )
      full_h, full_w = screenshot_grayscale.shape[:2]

      # Inference-scale copies; only the screenshot divisors are needed later
      # to map screenshot inference coordinates back to full size.
      template_at_inference, _, _ = _resize_grayscale_divisible_by_eight(
          template_grayscale, template_long_edge_max_pixels
      )
      screenshot_at_inference, divisor_screen_x, divisor_screen_y = (
          _resize_grayscale_divisible_by_eight(
              screenshot_grayscale, screenshot_long_edge_max_pixels
          )
      )
      template_height_at_inference, template_width_at_inference = (
          template_at_inference.shape[:2]
      )

      compute_device = torch.device(
          "cuda" if torch.cuda.is_available() else "cpu"
      )
      # Shape (1, 1, H, W), values scaled to [0, 1].
      template_batch = (
          torch.from_numpy(template_at_inference).float()[None, None].to(compute_device)
          / 255.0
      )
      screenshot_batch = (
          torch.from_numpy(screenshot_at_inference).float()[None, None].to(compute_device)
          / 255.0
      )

      matcher_network = LoFTR(config=default_cfg)
      # NOTE(security): torch.load unpickles the checkpoint file. The file may
      # have been downloaded automatically, so it should come from a trusted
      # source.
      checkpoint = torch.load(
          str(loftr_weight_checkpoint_path), map_location=compute_device
      )
      matcher_network.load_state_dict(checkpoint["state_dict"])
      matcher_network = matcher_network.eval().to(compute_device)

      # LoFTR writes its outputs into the batch dictionary it is given.
      forward_batch = {"image0": template_batch, "image1": screenshot_batch}
      # Scoped no-grad instead of torch.set_grad_enabled(False): the latter
      # switched autograd off for the whole process and never restored it.
      with torch.no_grad():
          matcher_network(forward_batch)
      template_match_points = forward_batch["mkpts0_f"].detach().cpu().numpy()
      screenshot_match_points = forward_batch["mkpts1_f"].detach().cpu().numpy()
      match_confidence = forward_batch["mconf"].detach().cpu().numpy()

      trusted_homography: np.ndarray | None
      screenshot_points_for_ransac: np.ndarray
      inlier_screen: np.ndarray
      if len(template_match_points) < 4:
          # Too few matches for a homography: use the center of the inference
          # screenshot as the only "inlier".
          inference_screenshot_height, inference_screenshot_width = (
              screenshot_at_inference.shape[:2]
          )
          inlier_screen = np.array(
              [
                  [
                      float(inference_screenshot_width) * 0.5,
                      float(inference_screenshot_height) * 0.5,
                  ]
              ],
              dtype=np.float32,
          )
          screenshot_points_for_ransac = inlier_screen.copy()
          trusted_homography = None
      else:
          tp_filt, sp_filt, cf_filt = _filter_matches_near_confident_median(
              template_match_points,
              screenshot_match_points,
              match_confidence,
              template_width_at_inference,
              template_height_at_inference,
          )
          confidence_sorted_indices = np.argsort(-cf_filt)[
              : min(max_matches_for_homography, len(cf_filt))
          ]
          template_points_for_ransac = tp_filt[confidence_sorted_indices].astype(
              np.float32
          )
          screenshot_points_for_ransac = sp_filt[confidence_sorted_indices].astype(
              np.float32
          )
          homography_matrix, homography_mask = cv2.findHomography(
              template_points_for_ransac,
              screenshot_points_for_ransac,
              cv2.RANSAC,
              ransac_reprojection_threshold,
              maxIters=5000,
              confidence=0.995,
          )
          if homography_mask is not None:
              inlier_flat = homography_mask.ravel().astype(bool)
              inlier_screen = screenshot_points_for_ransac[inlier_flat]
          else:
              inlier_screen = screenshot_points_for_ransac
          if len(inlier_screen) < 4:
              inlier_screen = sp_filt
          trusted_homography = homography_matrix
          if homography_matrix is not None:
              # Project the template corners and discard the homography when
              # the resulting quad is implausible on the full image.
              corners_tpl = np.array(
                  [
                      [0, 0],
                      [template_width_at_inference - 1, 0],
                      [
                          template_width_at_inference - 1,
                          template_height_at_inference - 1,
                      ],
                      [0, template_height_at_inference - 1],
                  ],
                  dtype=np.float32,
              ).reshape(1, 4, 2)
              corners_inf = cv2.perspectiveTransform(corners_tpl, homography_matrix)[0]
              corners_full = corners_inf.copy()
              corners_full[:, 0] /= divisor_screen_x
              corners_full[:, 1] /= divisor_screen_y
              if not _homography_quad_plausible_on_full_image(
                  corners_full, full_w, full_h, template_orig_w, template_orig_h
              ):
                  trusted_homography = None

      # Coarse center for the local search: median inlier, in full-size pixels.
      center_inf = np.median(inlier_screen, axis=0)
      center_full_x = float(center_inf[0] / divisor_screen_x)
      center_full_y = float(center_inf[1] / divisor_screen_y)
      search_margin = float(
          max(
              template_orig_w,
              template_orig_h,
              template_width_at_inference / divisor_screen_x,
          )
          * 2.5
      )

      refined_bbox: tuple[float, float, float, float] | None
      global_ncc = _ncc_global_multiscale_best_match(
          screenshot_grayscale, template_grayscale
      )
      if global_ncc is not None:
          refined_bbox = global_ncc[0]
      else:
          # The local search is only needed when the global one failed.
          refined_bbox = _ncc_refine_template_bbox(
              screenshot_grayscale,
              template_grayscale,
              center_full_x,
              center_full_y,
              search_margin,
          )
      if refined_bbox is None:
          relaxed_global_ncc = _ncc_global_multiscale_best_match(
              screenshot_grayscale,
              template_grayscale,
              minimum_acceptable_score=0.18,
          )
          if relaxed_global_ncc is not None:
              refined_bbox = relaxed_global_ncc[0]

      template_against_screenshot_match = LoFTRTemplateAgainstScreenshotMatch(
          screenshot_bgr_full_size=screenshot_bgr_full_size,
          homography_template_inference_to_screenshot_inference=trusted_homography,
          template_width_pixels_at_inference=template_width_at_inference,
          template_height_pixels_at_inference=template_height_at_inference,
          divisor_inference_screenshot_x_to_fullsize_x=divisor_screen_x,
          divisor_inference_screenshot_y_to_fullsize_y=divisor_screen_y,
          high_confidence_match_points_on_screenshot_inference=screenshot_points_for_ransac,
          template_original_width_pixels=template_orig_w,
          template_original_height_pixels=template_orig_h,
          refined_template_bbox_xywh_full_size=refined_bbox,
          ransac_inlier_points_screenshot_inference=inlier_screen.astype(np.float32),
      )
      return _template_match_bounding_box_and_center_in_screenshot_pixels_from_loftr_template_against_screenshot_match_object(
          template_against_screenshot_match,
      )
  706. )