""" 使用 LoFTR 在整张屏幕截图中匹配小模板图,得到几何关系(单应矩阵或备用点集)。 小模板 + 大图时 LoFTR 外点会破坏单应;局部 NCC 若以错误簇为中心会框错(如把刷新钮当成搜索图标)。 因此在全分辨率上做「多尺度全局 NCC」,优先取全图最高分作为最终框;LoFTR 仍用于几何备份。 """ from __future__ import annotations import math import ssl import sys import urllib.request from abc import ABC, abstractmethod from dataclasses import dataclass from pathlib import Path import cv2 import numpy as np # 与 Kornia LoFTR outdoor 相同来源;本地无权重文件时从此处下载 LOFTR_OUTDOOR_WEIGHT_DOWNLOAD_URL = ( "http://cmp.felk.cvut.cz/~mishkdmy/models/loftr_outdoor.ckpt" ) _REPOSITORY_ROOT_DIRECTORY = Path(__file__).resolve().parent.parent @dataclass(frozen=True) class TemplateMatchBoundingBoxAndCenterRoundedToIntegerScreenPixels: bounding_box_left_integer: int bounding_box_top_integer: int bounding_box_width_integer: int bounding_box_height_integer: int center_screen_x_integer: int center_screen_y_integer: int def bounding_box_left_top_width_height_xywh_as_tuple_of_four_integers( self, ) -> tuple[int, int, int, int]: return ( self.bounding_box_left_integer, self.bounding_box_top_integer, self.bounding_box_width_integer, self.bounding_box_height_integer, ) @dataclass(frozen=True) class TemplateMatchBoundingBoxAndCenterInScreenshotPixels: bounding_box_left_top_width_height_xywh_float_tuple: tuple[float, float, float, float] center_screen_x_float: float center_screen_y_float: float def as_rounded_to_integer_screen_coordinates( self, ) -> TemplateMatchBoundingBoxAndCenterRoundedToIntegerScreenPixels: bounding_box_left_float, bounding_box_top_float, bounding_box_width_float, bounding_box_height_float = ( self.bounding_box_left_top_width_height_xywh_float_tuple ) return TemplateMatchBoundingBoxAndCenterRoundedToIntegerScreenPixels( bounding_box_left_integer=int(round(bounding_box_left_float)), bounding_box_top_integer=int(round(bounding_box_top_float)), bounding_box_width_integer=int(round(bounding_box_width_float)), bounding_box_height_integer=int(round(bounding_box_height_float)), 
center_screen_x_integer=int(round(self.center_screen_x_float)), center_screen_y_integer=int(round(self.center_screen_y_float)), ) @dataclass(frozen=True) class LoFTRTemplateAgainstScreenshotMatch: """模板在截图上的 LoFTR 匹配中间量,用于解析全分辨率包围盒与中心点。""" screenshot_bgr_full_size: np.ndarray """原始分辨率屏幕截图,BGR。""" homography_template_inference_to_screenshot_inference: np.ndarray | None """3×3 单应:模板推理分辨率坐标 → 截图推理分辨率坐标;可能因不可靠而为 None。""" template_width_pixels_at_inference: int template_height_pixels_at_inference: int divisor_inference_screenshot_x_to_fullsize_x: float divisor_inference_screenshot_y_to_fullsize_y: float high_confidence_match_points_on_screenshot_inference: np.ndarray """高置信度匹配点在截图推理坐标下;作回退包围盒用。""" template_original_width_pixels: int template_original_height_pixels: int refined_template_bbox_xywh_full_size: tuple[float, float, float, float] | None """NCC 精修成功时为全图坐标系下的 (x, y, width, height);否则为 None。""" ransac_inlier_points_screenshot_inference: np.ndarray """RANSAC 内点在截图推理分辨率下的坐标;用于回退矩形与精修搜索中心。""" class TemplateAgainstScreenshotMatcher(ABC): @abstractmethod def match_template_center_in_screenshot( self, template_image_file_path: Path | str, screenshot_image_file_path: Path | str | None = None, *, screenshot_bgr_full_size_numpy: np.ndarray | None = None, ) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels: ... 
def _convert_image_to_grayscale(image: np.ndarray) -> np.ndarray:
    """Return a single-channel version of ``image``.

    Accepts an already-grayscale 2-D array, a single-channel 3-D array, a
    3-channel BGR array, or a 4-channel BGRA array (a layout some screen
    capture libraries produce). Calling ``cv2.cvtColor`` with
    ``COLOR_BGR2GRAY`` directly raises on anything but 3-channel input.
    """
    if image.ndim == 2:
        return image
    channel_count = image.shape[2]
    if channel_count == 1:
        return np.ascontiguousarray(image[:, :, 0])
    if channel_count == 4:
        return cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)


def _project_template_corners_to_full_size_xy(
    homography_matrix: np.ndarray,
    template_width_inference: int,
    template_height_inference: int,
    divisor_x: float,
    divisor_y: float,
) -> np.ndarray:
    """Project the template's four corners into full-size screenshot pixels.

    The corners are taken at template inference resolution, mapped through
    ``homography_matrix`` into screenshot inference coordinates, then divided
    by the per-axis divisors to reach full-size coordinates. Returns a
    ``(4, 2)`` float32 array ordered top-left, top-right, bottom-right,
    bottom-left.
    """
    corners_template_inference_xy = np.array(
        [
            [0, 0],
            [template_width_inference - 1, 0],
            [template_width_inference - 1, template_height_inference - 1],
            [0, template_height_inference - 1],
        ],
        dtype=np.float32,
    ).reshape(1, 4, 2)
    corners_screenshot_inference_xy = cv2.perspectiveTransform(
        corners_template_inference_xy, homography_matrix
    )[0]
    corners_screenshot_full_xy = corners_screenshot_inference_xy.copy()
    corners_screenshot_full_xy[:, 0] /= divisor_x
    corners_screenshot_full_xy[:, 1] /= divisor_y
    return corners_screenshot_full_xy


def _template_match_result_center_xy_in_screenshot_pixels(
    match: LoFTRTemplateAgainstScreenshotMatch,
) -> tuple[float, float]:
    """Return the template centre ``(x, y)`` in full-size screenshot pixels.

    Sources are tried in order of preference: the NCC-refined bounding box,
    the mean of the homography-projected template corners, and finally the
    median of the RANSAC inlier points.
    """
    refined_bbox_xywh = match.refined_template_bbox_xywh_full_size
    if refined_bbox_xywh is not None:
        bbox_x, bbox_y, bbox_width, bbox_height = refined_bbox_xywh
        return (
            bbox_x + bbox_width / 2.0,
            bbox_y + bbox_height / 2.0,
        )

    divisor_x = match.divisor_inference_screenshot_x_to_fullsize_x
    divisor_y = match.divisor_inference_screenshot_y_to_fullsize_y

    homography_matrix = match.homography_template_inference_to_screenshot_inference
    if homography_matrix is not None:
        corners_screenshot_full_xy = _project_template_corners_to_full_size_xy(
            homography_matrix,
            match.template_width_pixels_at_inference,
            match.template_height_pixels_at_inference,
            divisor_x,
            divisor_y,
        )
        mean_corner_xy = np.mean(corners_screenshot_full_xy, axis=0)
        return (float(mean_corner_xy[0]), float(mean_corner_xy[1]))

    inlier_points_inference_xy = match.ransac_inlier_points_screenshot_inference
    median_inference_xy = np.median(inlier_points_inference_xy, axis=0)
    return (
        float(median_inference_xy[0] / divisor_x),
        float(median_inference_xy[1] / divisor_y),
    )


def _template_match_bounding_box_xywh_full_size_float_tuple_from_loftr_template_against_screenshot_match_object(
    match: LoFTRTemplateAgainstScreenshotMatch,
) -> tuple[float, float, float, float]:
    """Return the match bounding box ``(x, y, width, height)`` at full size.

    Sources are tried in order of preference: the NCC-refined box, the
    axis-aligned extent of the homography-projected template corners, and
    finally the axis-aligned extent of the RANSAC inlier points.
    """
    refined_bbox_xywh = match.refined_template_bbox_xywh_full_size
    if refined_bbox_xywh is not None:
        return refined_bbox_xywh

    divisor_x = match.divisor_inference_screenshot_x_to_fullsize_x
    divisor_y = match.divisor_inference_screenshot_y_to_fullsize_y

    homography_matrix = match.homography_template_inference_to_screenshot_inference
    if homography_matrix is not None:
        points_full_xy = _project_template_corners_to_full_size_xy(
            homography_matrix,
            match.template_width_pixels_at_inference,
            match.template_height_pixels_at_inference,
            divisor_x,
            divisor_y,
        )
    else:
        inlier_points_inference_xy = match.ransac_inlier_points_screenshot_inference
        points_full_xy = inlier_points_inference_xy.astype(np.float64).copy()
        points_full_xy[:, 0] /= divisor_x
        points_full_xy[:, 1] /= divisor_y

    min_x = float(np.min(points_full_xy[:, 0]))
    min_y = float(np.min(points_full_xy[:, 1]))
    max_x = float(np.max(points_full_xy[:, 0]))
    max_y = float(np.max(points_full_xy[:, 1]))
    return (min_x, min_y, max_x - min_x, max_y - min_y)


def _template_match_bounding_box_and_center_in_screenshot_pixels_from_loftr_template_against_screenshot_match_object(
    match: LoFTRTemplateAgainstScreenshotMatch,
) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
    """Bundle the full-size bounding box and centre of ``match``."""
    bounding_box_left_top_width_height_xywh_float_tuple = (
        _template_match_bounding_box_xywh_full_size_float_tuple_from_loftr_template_against_screenshot_match_object(
            match,
        )
    )
    center_screen_x_float, center_screen_y_float = (
        _template_match_result_center_xy_in_screenshot_pixels(match)
    )
    return TemplateMatchBoundingBoxAndCenterInScreenshotPixels(
        bounding_box_left_top_width_height_xywh_float_tuple=(
            bounding_box_left_top_width_height_xywh_float_tuple
        ),
        center_screen_x_float=center_screen_x_float,
        center_screen_y_float=center_screen_y_float,
    )


class LoFTRTemplateAgainstScreenshotMatcher(TemplateAgainstScreenshotMatcher):
    """Template matcher backed by ``run_loftr_template_match``.

    Stores the LoFTR repository location, the checkpoint path and the tuning
    parameters, and forwards them on every match call.
    """

    def __init__(
        self,
        loftr_repository_directory: Path,
        loftr_weight_checkpoint_file_path: Path,
        *,
        template_long_edge_max_pixels: int = 640,
        screenshot_long_edge_max_pixels: int = 1280,
        ransac_reprojection_threshold: float = 3.0,
        max_matches_for_homography: int = 800,
    ) -> None:
        self._loftr_repository_directory = loftr_repository_directory
        self._loftr_weight_checkpoint_file_path = loftr_weight_checkpoint_file_path
        self._template_long_edge_max_pixels = template_long_edge_max_pixels
        self._screenshot_long_edge_max_pixels = screenshot_long_edge_max_pixels
        self._ransac_reprojection_threshold = ransac_reprojection_threshold
        self._max_matches_for_homography = max_matches_for_homography

    def match_template_center_in_screenshot(
        self,
        template_image_file_path: Path | str,
        screenshot_image_file_path: Path | str | None = None,
        *,
        screenshot_bgr_full_size_numpy: np.ndarray | None = None,
    ) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
        """Locate the template in the screenshot given by path or array."""
        screenshot_path_for_loftr: Path | None = (
            Path(screenshot_image_file_path)
            if screenshot_image_file_path is not None
            else None
        )
        return run_loftr_template_match(
            loftr_repository_directory=self._loftr_repository_directory,
            loftr_weight_checkpoint_path=self._loftr_weight_checkpoint_file_path,
            screenshot_image_path=screenshot_path_for_loftr,
            screenshot_bgr_full_size_numpy=screenshot_bgr_full_size_numpy,
            template_image_path=Path(template_image_file_path),
            template_long_edge_max_pixels=self._template_long_edge_max_pixels,
            screenshot_long_edge_max_pixels=self._screenshot_long_edge_max_pixels,
            ransac_reprojection_threshold=self._ransac_reprojection_threshold,
            max_matches_for_homography=self._max_matches_for_homography,
        )


def match_template_center_xy_for_screenshot_file_and_template_file(
    source_screen_screenshot_image_file_path: Path | str,
    template_image_file_path: Path | str,
) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
    """
    Match ``template_image_file_path`` with LoFTR + NCC inside the full image
    at ``source_screen_screenshot_image_file_path`` (e.g. a full-screen PNG).

    Returns the full-resolution bounding box ``(x, y, width, height)`` and the
    template centre ``(x, y)``. The LoFTR directory and weights use the
    repository default paths.
    """
    loftr_repository_directory_path = _REPOSITORY_ROOT_DIRECTORY / "python" / "LoFTR"
    loftr_weight_checkpoint_file_path = (
        loftr_repository_directory_path / "weights" / "loftr_outdoor.ckpt"
    )
    loftr_template_against_screenshot_matcher = LoFTRTemplateAgainstScreenshotMatcher(
        loftr_repository_directory_path,
        loftr_weight_checkpoint_file_path,
    )
    return loftr_template_against_screenshot_matcher.match_template_center_in_screenshot(
        template_image_file_path,
        Path(source_screen_screenshot_image_file_path),
    )


def _download_file_with_ssl_fallbacks(download_url: str, destination_file_path: Path) -> None:
    """Download ``download_url`` to ``destination_file_path``.

    Several SSL contexts are tried in turn: certifi's CA bundle when certifi
    is available, the system default, and finally an unverified context.
    The payload is written to a sibling ``.part`` file and renamed into place
    only after it was written completely, so an interrupted write never
    leaves a truncated file at the destination.

    Raises:
        RuntimeError: when every attempt failed; chained to the last error.
    """
    destination_file_path.parent.mkdir(parents=True, exist_ok=True)

    ssl_context_candidates: list[ssl.SSLContext] = []
    try:
        import certifi

        ssl_context_candidates.append(
            ssl.create_default_context(cafile=certifi.where())
        )
    except (ImportError, OSError):
        # certifi is optional; a missing package or an unreadable CA bundle
        # simply means this candidate is skipped.
        pass
    ssl_context_candidates.append(ssl.create_default_context())
    # NOTE(security): last-resort fallback that disables certificate
    # verification. The downloaded checkpoint is later unpickled by
    # torch.load, so this deserves review.
    ssl_context_candidates.append(ssl._create_unverified_context())

    partial_file_path = destination_file_path.with_name(
        destination_file_path.name + ".part"
    )
    last_error: BaseException | None = None
    for ssl_context in ssl_context_candidates:
        try:
            with urllib.request.urlopen(
                download_url, context=ssl_context, timeout=300
            ) as response:
                partial_file_path.write_bytes(response.read())
            partial_file_path.replace(destination_file_path)
            return
        except Exception as exc:
            last_error = exc

    # Best-effort cleanup of a leftover partial file.
    try:
        partial_file_path.unlink()
    except OSError:
        pass
    raise RuntimeError(
        f"无法下载 LoFTR 权重:{download_url}\n请手动保存到:{destination_file_path}"
    ) from last_error


def _resize_grayscale_divisible_by_eight(
    grayscale_image: np.ndarray,
    longest_edge_max_pixels: int,
) -> tuple[np.ndarray, float, float]:
    """Resize so the long edge fits the limit and both sides divide by 8.

    Returns ``(resized image, divisor_x, divisor_y)`` where
    full-size coordinate = inference coordinate / divisor.
    """
    original_height, original_width = grayscale_image.shape[:2]
    target_width, target_height = original_width, original_height
    if max(target_height, target_width) > longest_edge_max_pixels:
        shrink_scale = longest_edge_max_pixels / max(target_height, target_width)
        target_width = int(round(original_width * shrink_scale))
        target_height = int(round(original_height * shrink_scale))
    # Round each side down to a multiple of 8, but never below 8.
    inference_width = max((target_width // 8) * 8, 8)
    inference_height = max((target_height // 8) * 8, 8)
    resized = cv2.resize(
        grayscale_image,
        (inference_width, inference_height),
        interpolation=cv2.INTER_AREA,
    )
    divisor_x = inference_width / original_width
    divisor_y = inference_height / original_height
    return resized, divisor_x, divisor_y


def _filter_matches_near_confident_median(
    template_points: np.ndarray,
    screenshot_points: np.ndarray,
    confidence: np.ndarray,
    template_width_inf: int,
    template_height_inf: int,
    min_points: int = 12,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Keep confident matches clustered near the median; drop stray outliers.

    Only the 200 most confident matches are considered. The cluster centre
    starts as the median of the 40 most confident screenshot points. The
    search radius grows by a factor of 1.35 for up to 8 rounds until at least
    ``min_points`` matches fall inside; if that never happens, all considered
    matches are returned.

    Returns ``(template_points, screenshot_points, confidence)`` for the
    retained matches.
    """
    order = np.argsort(-confidence)
    top_n = min(200, len(order))
    top_indices = order[:top_n]
    sp = screenshot_points[top_indices]
    tp = template_points[top_indices]
    cf = confidence[top_indices]

    median_screen = np.median(sp[: min(40, len(sp))], axis=0)
    diag = float(np.hypot(template_width_inf, template_height_inf))
    radius = max(diag * 2.5, 48.0)
    for _ in range(8):
        dist = np.linalg.norm(sp - median_screen, axis=1)
        keep = dist < radius
        if int(np.sum(keep)) >= min_points:
            return tp[keep], sp[keep], cf[keep]
        radius *= 1.35
        # Re-centre on what was kept so far, or on everything if nothing was.
        median_screen = np.median(sp[keep] if np.any(keep) else sp, axis=0)
    return tp, sp, cf


def _homography_quad_plausible_on_full_image(
    corners_full_xy: np.ndarray,
    image_width: int,
    image_height: int,
    template_orig_w: int,
    template_orig_h: int,
) -> bool:
    """Check that the projected quad lies in the image and resembles the template.

    The quad is rejected when it is not a finite ``(4, 2)`` array, extends
    more than 8 % of the image size beyond an edge, is under 4 px on a side,
    has an aspect ratio off by more than 3.5x from the template's, or has an
    area outside 0.12x to 30x of the template's area.
    """
    if corners_full_xy.shape != (4, 2) or not np.all(np.isfinite(corners_full_xy)):
        return False
    margin_x = 0.08 * image_width
    margin_y = 0.08 * image_height
    xs, ys = corners_full_xy[:, 0], corners_full_xy[:, 1]
    if xs.min() < -margin_x or xs.max() > image_width + margin_x:
        return False
    if ys.min() < -margin_y or ys.max() > image_height + margin_y:
        return False
    bbox_w = float(xs.max() - xs.min())
    bbox_h = float(ys.max() - ys.min())
    if bbox_w < 4 or bbox_h < 4:
        return False
    tpl_ar = template_orig_w / max(template_orig_h, 1)
    box_ar = bbox_w / max(bbox_h, 1e-6)
    ratio = box_ar / max(tpl_ar, 1e-6)
    if ratio > 3.5 or ratio < (1.0 / 3.5):
        return False
    area = float(cv2.contourArea(corners_full_xy.astype(np.float32)))
    expected = float(template_orig_w * template_orig_h)
    if area < 0.12 * expected or area > 30.0 * expected:
        return False
    return True


def _ncc_global_multiscale_best_match(
    screenshot_gray_full: np.ndarray,
    template_gray_full: np.ndarray,
    template_scale_factors: tuple[float, ...] = (
        0.88,
        0.92,
        0.96,
        1.0,
        1.04,
        1.08,
        1.12,
    ),
    minimum_acceptable_score: float = 0.34,
) -> tuple[tuple[float, float, float, float], float] | None:
    """
    Run ``TM_CCOEFF_NORMED`` over the whole image for several template
    scales and keep the global maximum response.

    Intended for small UI icons at roughly the screenshot's scale, where
    searching only near the LoFTR cluster could pick the wrong location.

    Returns ``((x, y, w, h), best score)``, or None when the template does
    not fit inside the image or the best score is below
    ``minimum_acceptable_score``.
    """
    h_img, w_img = screenshot_gray_full.shape[:2]
    h_tpl0, w_tpl0 = template_gray_full.shape[:2]
    if h_tpl0 >= h_img or w_tpl0 >= w_img:
        return None

    best_score = -1.0
    best_bbox_xywh: tuple[float, float, float, float] | None = None
    for scale in template_scale_factors:
        tw = max(3, int(round(w_tpl0 * scale)))
        th = max(3, int(round(h_tpl0 * scale)))
        if tw >= w_img or th >= h_img:
            continue
        template_scaled = cv2.resize(
            template_gray_full, (tw, th), interpolation=cv2.INTER_AREA
        )
        response_map = cv2.matchTemplate(
            screenshot_gray_full, template_scaled, cv2.TM_CCOEFF_NORMED
        )
        _, max_val, _, max_loc = cv2.minMaxLoc(response_map)
        if max_val > best_score:
            best_score = float(max_val)
            best_bbox_xywh = (
                float(max_loc[0]),
                float(max_loc[1]),
                float(tw),
                float(th),
            )

    if best_bbox_xywh is None or best_score < minimum_acceptable_score:
        return None
    return best_bbox_xywh, best_score


def list_template_match_centers_ncc_multiscale(
    template_image_file_path: Path | str,
    *,
    screenshot_image_file_path: Path | str | None = None,
    screenshot_bgr_numpy: np.ndarray | None = None,
    template_scale_factors: tuple[float, ...] = (
        0.88,
        0.92,
        0.96,
        1.0,
        1.04,
        1.08,
        1.12,
    ),
    min_score: float = 0.28,
    max_peaks_per_scale: int = 8,
    dedupe_distance_pixels: float = 28.0,
) -> list[tuple[float, float]]:
    """
    Run multi-scale ``matchTemplate`` over the whole image, collect local
    response maxima with non-maximum suppression, and de-duplicate them in
    descending score order.

    Returns template centre points in full-resolution coordinates, intended
    for picking the one nearest to an OCR anchor. The list is empty when the
    template is not smaller than the screenshot.

    Raises:
        FileNotFoundError: when the screenshot or the template cannot be read.
    """
    if screenshot_bgr_numpy is not None:
        screenshot_bgr = screenshot_bgr_numpy
    elif screenshot_image_file_path is not None:
        screenshot_bgr = cv2.imread(
            str(screenshot_image_file_path), cv2.IMREAD_COLOR
        )
    else:
        # Neither source given: do not probe for a file literally named "None".
        screenshot_bgr = None
    template_gray = cv2.imread(str(template_image_file_path), cv2.IMREAD_GRAYSCALE)
    if screenshot_bgr is None or template_gray is None:
        raise FileNotFoundError(
            f"无法读取截图或模板:screenshot_image_file_path={screenshot_image_file_path!s} template_image_file_path={template_image_file_path!s}"
        )

    screenshot_gray = _convert_image_to_grayscale(screenshot_bgr)
    h_img, w_img = screenshot_gray.shape[:2]
    h_tpl0, w_tpl0 = template_gray.shape[:2]
    if h_tpl0 >= h_img or w_tpl0 >= w_img:
        return []

    # (centre_x, centre_y, score) candidates gathered across all scales.
    raw: list[tuple[float, float, float]] = []
    for scale in template_scale_factors:
        tw = max(3, int(round(w_tpl0 * scale)))
        th = max(3, int(round(h_tpl0 * scale)))
        if tw >= w_img or th >= h_img:
            continue
        template_scaled = cv2.resize(
            template_gray, (tw, th), interpolation=cv2.INTER_AREA
        )
        response_map = cv2.matchTemplate(
            screenshot_gray, template_scaled, cv2.TM_CCOEFF_NORMED
        )
        nms_radius = int(max(4, max(tw, th) * 0.55))
        work = response_map.copy()
        for _ in range(max_peaks_per_scale):
            _, max_val, _, max_loc = cv2.minMaxLoc(work)
            if max_val < min_score:
                break
            mx, my = max_loc
            cx = float(mx) + float(tw) / 2.0
            cy = float(my) + float(th) / 2.0
            raw.append((cx, cy, float(max_val)))
            # Blank the neighbourhood so the next pass finds a different peak.
            x0 = max(0, mx - nms_radius)
            y0 = max(0, my - nms_radius)
            x1 = min(work.shape[1], mx + nms_radius + 1)
            y1 = min(work.shape[0], my + nms_radius + 1)
            work[y0:y1, x0:x1] = -1.0

    raw.sort(key=lambda row: -row[2])
    merged: list[tuple[float, float]] = []
    for cx, cy, _sc in raw:
        if any(
            math.hypot(cx - ox, cy - oy) < dedupe_distance_pixels
            for ox, oy in merged
        ):
            continue
        merged.append((cx, cy))
    return merged


def _ncc_refine_template_bbox(
    screenshot_gray_full: np.ndarray,
    template_gray_full: np.ndarray,
    center_x_full: float,
    center_y_full: float,
    search_margin_full: float,
    min_ncc_score: float = 0.25,
) -> tuple[float, float, float, float] | None:
    """Run normalized cross-correlation in a window around a coarse centre.

    The window is centred on ``(center_x_full, center_y_full)`` (the coarse
    LoFTR centre) and clipped to the image. Returns the best
    ``(x, y, w, h)`` in full-image coordinates, or None when the template
    does not fit or the best score is below ``min_ncc_score``.
    """
    h_img, w_img = screenshot_gray_full.shape[:2]
    h_t, w_t = template_gray_full.shape[:2]
    if h_t >= h_img or w_t >= w_img:
        return None
    half = int(max(search_margin_full, max(w_t, h_t) * 2))
    x0 = int(np.clip(center_x_full - half, 0, w_img - 1))
    y0 = int(np.clip(center_y_full - half, 0, h_img - 1))
    x1 = int(np.clip(center_x_full + half, 0, w_img))
    y1 = int(np.clip(center_y_full + half, 0, h_img))
    if x1 - x0 <= w_t or y1 - y0 <= h_t:
        return None
    roi = screenshot_gray_full[y0:y1, x0:x1]
    result = cv2.matchTemplate(roi, template_gray_full, cv2.TM_CCOEFF_NORMED)
    _, max_val, _, max_loc = cv2.minMaxLoc(result)
    if max_val < min_ncc_score:
        return None
    x = float(x0 + max_loc[0])
    y = float(y0 + max_loc[1])
    return (x, y, float(w_t), float(h_t))


def run_loftr_template_match(
    *,
    loftr_repository_directory: Path,
    loftr_weight_checkpoint_path: Path,
    screenshot_image_path: Path | None = None,
    screenshot_bgr_full_size_numpy: np.ndarray | None = None,
    template_image_path: Path,
    template_long_edge_max_pixels: int = 640,
    screenshot_long_edge_max_pixels: int = 1280,
    ransac_reprojection_threshold: float = 3.0,
    max_matches_for_homography: int = 800,
) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
    """
    Read the template and the screenshot, run LoFTR at inference scale,
    filter outliers, estimate a homography, and refine the position with NCC
    at full resolution.

    Returns the full-resolution bounding box and centre point. The missing
    weight file is downloaded first when needed.

    Raises:
        FileNotFoundError: when the LoFTR source directory is missing, or the
            screenshot or template cannot be read.
        RuntimeError: when the weights are missing and cannot be downloaded.
    """
    if not loftr_repository_directory.is_dir():
        raise FileNotFoundError(f"未找到 LoFTR 源码目录:{loftr_repository_directory}")
    # The LoFTR repository is imported as a source tree via sys.path.
    repository_path_string = str(loftr_repository_directory)
    if repository_path_string not in sys.path:
        sys.path.insert(0, repository_path_string)

    import torch  # noqa: E402
    from src.loftr import LoFTR, default_cfg  # noqa: E402

    if not loftr_weight_checkpoint_path.is_file():
        _download_file_with_ssl_fallbacks(
            LOFTR_OUTDOOR_WEIGHT_DOWNLOAD_URL, loftr_weight_checkpoint_path
        )

    if screenshot_bgr_full_size_numpy is not None:
        screenshot_bgr_full_size = screenshot_bgr_full_size_numpy
    elif screenshot_image_path is not None:
        screenshot_bgr_full_size = cv2.imread(
            str(screenshot_image_path), cv2.IMREAD_COLOR
        )
    else:
        # Neither source given: do not probe for a file literally named "None".
        screenshot_bgr_full_size = None
    template_grayscale = cv2.imread(
        str(template_image_path), cv2.IMREAD_GRAYSCALE
    )
    if screenshot_bgr_full_size is None or template_grayscale is None:
        raise FileNotFoundError(
            f"无法读取图片:screenshot_image_path={screenshot_image_path!s} template_image_path={template_image_path!s}"
        )

    template_orig_h, template_orig_w = template_grayscale.shape[:2]
    screenshot_grayscale = _convert_image_to_grayscale(screenshot_bgr_full_size)
    full_h, full_w = screenshot_grayscale.shape[:2]

    template_at_inference, _, _ = _resize_grayscale_divisible_by_eight(
        template_grayscale, template_long_edge_max_pixels
    )
    screenshot_at_inference, divisor_screen_x, divisor_screen_y = (
        _resize_grayscale_divisible_by_eight(
            screenshot_grayscale, screenshot_long_edge_max_pixels
        )
    )
    template_height_at_inference, template_width_at_inference = (
        template_at_inference.shape[:2]
    )

    compute_device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu"
    )
    template_batch = (
        torch.from_numpy(template_at_inference).float()[None, None].to(compute_device)
        / 255.0
    )
    screenshot_batch = (
        torch.from_numpy(screenshot_at_inference).float()[None, None].to(compute_device)
        / 255.0
    )

    # NOTE(review): the network is rebuilt and the checkpoint reloaded on
    # every call; consider caching if this is called repeatedly.
    matcher_network = LoFTR(config=default_cfg)
    # NOTE(security): torch.load unpickles the checkpoint, which may have
    # been downloaded over plain HTTP; only use trusted weight files.
    checkpoint = torch.load(
        str(loftr_weight_checkpoint_path), map_location=compute_device
    )
    matcher_network.load_state_dict(checkpoint["state_dict"])
    matcher_network = matcher_network.eval().to(compute_device)

    forward_batch = {"image0": template_batch, "image1": screenshot_batch}
    # Scoped no_grad instead of torch.set_grad_enabled(False), which would
    # leave autograd disabled for the caller after this function returns.
    with torch.no_grad():
        matcher_network(forward_batch)

    template_match_points = forward_batch["mkpts0_f"].detach().cpu().numpy()
    screenshot_match_points = forward_batch["mkpts1_f"].detach().cpu().numpy()
    match_confidence = forward_batch["mconf"].detach().cpu().numpy()

    trusted_homography: np.ndarray | None
    screenshot_points_for_ransac: np.ndarray
    inlier_screen: np.ndarray

    if len(template_match_points) < 4:
        # Too few matches for a homography: fall back to the centre of the
        # inference-resolution screenshot as the single "inlier".
        inference_screenshot_height, inference_screenshot_width = (
            screenshot_at_inference.shape[:2]
        )
        inlier_screen = np.array(
            [
                [
                    float(inference_screenshot_width) * 0.5,
                    float(inference_screenshot_height) * 0.5,
                ]
            ],
            dtype=np.float32,
        )
        screenshot_points_for_ransac = inlier_screen.copy()
        trusted_homography = None
    else:
        tp_filt, sp_filt, cf_filt = _filter_matches_near_confident_median(
            template_match_points,
            screenshot_match_points,
            match_confidence,
            template_width_at_inference,
            template_height_at_inference,
        )
        confidence_sorted_indices = np.argsort(-cf_filt)[
            : min(max_matches_for_homography, len(cf_filt))
        ]
        template_points_for_ransac = tp_filt[confidence_sorted_indices].astype(
            np.float32
        )
        screenshot_points_for_ransac = sp_filt[confidence_sorted_indices].astype(
            np.float32
        )
        homography_matrix, homography_mask = cv2.findHomography(
            template_points_for_ransac,
            screenshot_points_for_ransac,
            cv2.RANSAC,
            ransac_reprojection_threshold,
            maxIters=5000,
            confidence=0.995,
        )
        if homography_mask is not None:
            inlier_flat = homography_mask.ravel().astype(bool)
            inlier_screen = screenshot_points_for_ransac[inlier_flat]
        else:
            inlier_screen = screenshot_points_for_ransac
        if len(inlier_screen) < 4:
            inlier_screen = sp_filt

        trusted_homography = homography_matrix
        if homography_matrix is not None:
            corners_full = _project_template_corners_to_full_size_xy(
                homography_matrix,
                template_width_at_inference,
                template_height_at_inference,
                divisor_screen_x,
                divisor_screen_y,
            )
            # Discard a homography whose projected quad is implausible.
            if not _homography_quad_plausible_on_full_image(
                corners_full, full_w, full_h, template_orig_w, template_orig_h
            ):
                trusted_homography = None

    # The global NCC result takes priority; the local NCC around the LoFTR
    # centre is only computed when the global search found nothing reliable.
    refined_bbox: tuple[float, float, float, float] | None
    global_ncc = _ncc_global_multiscale_best_match(
        screenshot_grayscale, template_grayscale
    )
    if global_ncc is not None:
        refined_bbox = global_ncc[0]
    else:
        center_inf = np.median(inlier_screen, axis=0)
        center_full_x = float(center_inf[0] / divisor_screen_x)
        center_full_y = float(center_inf[1] / divisor_screen_y)
        search_margin = float(
            max(
                template_orig_w,
                template_orig_h,
                template_width_at_inference / divisor_screen_x,
            )
            * 2.5
        )
        refined_bbox = _ncc_refine_template_bbox(
            screenshot_grayscale,
            template_grayscale,
            center_full_x,
            center_full_y,
            search_margin,
        )
    if refined_bbox is None:
        # Last attempt: global search again with a relaxed score threshold.
        relaxed_global_ncc = _ncc_global_multiscale_best_match(
            screenshot_grayscale,
            template_grayscale,
            minimum_acceptable_score=0.18,
        )
        if relaxed_global_ncc is not None:
            refined_bbox = relaxed_global_ncc[0]

    template_against_screenshot_match = LoFTRTemplateAgainstScreenshotMatch(
        screenshot_bgr_full_size=screenshot_bgr_full_size,
        homography_template_inference_to_screenshot_inference=trusted_homography,
        template_width_pixels_at_inference=template_width_at_inference,
        template_height_pixels_at_inference=template_height_at_inference,
        divisor_inference_screenshot_x_to_fullsize_x=divisor_screen_x,
        divisor_inference_screenshot_y_to_fullsize_y=divisor_screen_y,
        high_confidence_match_points_on_screenshot_inference=screenshot_points_for_ransac,
        template_original_width_pixels=template_orig_w,
        template_original_height_pixels=template_orig_h,
        refined_template_bbox_xywh_full_size=refined_bbox,
        ransac_inlier_points_screenshot_inference=inlier_screen.astype(np.float32),
    )
    return _template_match_bounding_box_and_center_in_screenshot_pixels_from_loftr_template_against_screenshot_match_object(
        template_against_screenshot_match,
    )