| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789 |
- """
- 使用 LoFTR 在整张屏幕截图中匹配小模板图,得到几何关系(单应矩阵或备用点集)。
- 小模板 + 大图时 LoFTR 外点会破坏单应;局部 NCC 若以错误簇为中心会框错(如把刷新钮当成搜索图标)。
- 因此在全分辨率上做「多尺度全局 NCC」,优先取全图最高分作为最终框;LoFTR 仍用于几何备份。
- """
- from __future__ import annotations
- import math
- import ssl
- import sys
- import urllib.request
- from abc import ABC, abstractmethod
- from dataclasses import dataclass
- from pathlib import Path
- import cv2
- import numpy as np
# Same source as the Kornia LoFTR "outdoor" weights; the checkpoint is fetched
# from this URL when no local weight file is present.
LOFTR_OUTDOOR_WEIGHT_DOWNLOAD_URL = "http://cmp.felk.cvut.cz/~mishkdmy/models/loftr_outdoor.ckpt"

# Directory two levels above this file.
_REPOSITORY_ROOT_DIRECTORY = Path(__file__).resolve().parents[1]
@dataclass(frozen=True)
class TemplateMatchBoundingBoxAndCenterRoundedToIntegerScreenPixels:
    """Template match location expressed in whole pixels: a bounding box plus a center point."""

    # Bounding box: left/top corner followed by width/height.
    bounding_box_left_integer: int
    bounding_box_top_integer: int
    bounding_box_width_integer: int
    bounding_box_height_integer: int
    # Center point of the match.
    center_screen_x_integer: int
    center_screen_y_integer: int

    def bounding_box_left_top_width_height_xywh_as_tuple_of_four_integers(
        self,
    ) -> tuple[int, int, int, int]:
        """Return the bounding box as a ``(left, top, width, height)`` tuple."""
        left = self.bounding_box_left_integer
        top = self.bounding_box_top_integer
        width = self.bounding_box_width_integer
        height = self.bounding_box_height_integer
        return (left, top, width, height)
@dataclass(frozen=True)
class TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
    """Template match location in screenshot pixels, kept as floats."""

    # Bounding box as (left, top, width, height).
    bounding_box_left_top_width_height_xywh_float_tuple: tuple[float, float, float, float]
    # Center point of the match.
    center_screen_x_float: float
    center_screen_y_float: float

    def as_rounded_to_integer_screen_coordinates(
        self,
    ) -> TemplateMatchBoundingBoxAndCenterRoundedToIntegerScreenPixels:
        """Return the same box and center with every value passed through ``int(round(...))``."""
        left, top, width, height = self.bounding_box_left_top_width_height_xywh_float_tuple
        center_x = self.center_screen_x_float
        center_y = self.center_screen_y_float
        return TemplateMatchBoundingBoxAndCenterRoundedToIntegerScreenPixels(
            bounding_box_left_integer=int(round(left)),
            bounding_box_top_integer=int(round(top)),
            bounding_box_width_integer=int(round(width)),
            bounding_box_height_integer=int(round(height)),
            center_screen_x_integer=int(round(center_x)),
            center_screen_y_integer=int(round(center_y)),
        )
@dataclass(frozen=True)
class LoFTRTemplateAgainstScreenshotMatch:
    """Intermediate LoFTR match data for a template against a screenshot.

    Used to resolve the full-resolution bounding box and center point.
    """

    # Screenshot at its original resolution, BGR.
    screenshot_bgr_full_size: np.ndarray

    # 3x3 homography: template inference-resolution coordinates -> screenshot
    # inference-resolution coordinates; None when it was judged unreliable.
    homography_template_inference_to_screenshot_inference: np.ndarray | None

    template_width_pixels_at_inference: int
    template_height_pixels_at_inference: int

    # Full-size coordinate = inference coordinate / divisor (per axis).
    divisor_inference_screenshot_x_to_fullsize_x: float
    divisor_inference_screenshot_y_to_fullsize_y: float

    # High-confidence match points in screenshot inference coordinates;
    # serves as a fallback bounding box source.
    high_confidence_match_points_on_screenshot_inference: np.ndarray

    template_original_width_pixels: int
    template_original_height_pixels: int

    # (x, y, width, height) in full-image coordinates when NCC refinement
    # succeeded; otherwise None.
    refined_template_bbox_xywh_full_size: tuple[float, float, float, float] | None

    # RANSAC inlier points in screenshot inference resolution; used for the
    # fallback rectangle and as the refinement search center.
    ransac_inlier_points_screenshot_inference: np.ndarray
class TemplateAgainstScreenshotMatcher(ABC):
    """Abstract interface for locating a template image inside a screenshot."""

    @abstractmethod
    def match_template_center_in_screenshot(
        self,
        template_image_file_path: Path | str,
        screenshot_image_file_path: Path | str | None = None,
        *,
        screenshot_bgr_full_size_numpy: np.ndarray | None = None,
    ) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
        """Locate the template and return its bounding box and center.

        The screenshot may be given as a file path or as an in-memory array
        (keyword-only ``screenshot_bgr_full_size_numpy``).
        """
def _template_match_result_center_xy_in_screenshot_pixels(
    match: LoFTRTemplateAgainstScreenshotMatch,
) -> tuple[float, float]:
    """Return the match center ``(x, y)`` in full-resolution screenshot pixels.

    Sources are tried in order:
    1. the refined bounding box (its geometric center);
    2. the homography (mean of the four projected template corners);
    3. the median of the RANSAC inlier points.
    """
    refined = match.refined_template_bbox_xywh_full_size
    if refined is not None:
        left, top, width, height = refined
        return (left + width / 2.0, top + height / 2.0)

    homography = match.homography_template_inference_to_screenshot_inference
    scale_x = match.divisor_inference_screenshot_x_to_fullsize_x
    scale_y = match.divisor_inference_screenshot_y_to_fullsize_y

    if homography is None:
        median_xy = np.median(match.ransac_inlier_points_screenshot_inference, axis=0)
        return (float(median_xy[0] / scale_x), float(median_xy[1] / scale_y))

    # Template corners use inclusive pixel indices (0 .. size - 1).
    last_column = match.template_width_pixels_at_inference - 1
    last_row = match.template_height_pixels_at_inference - 1
    template_corners = np.array(
        [[0, 0], [last_column, 0], [last_column, last_row], [0, last_row]],
        dtype=np.float32,
    ).reshape(1, 4, 2)
    projected = cv2.perspectiveTransform(template_corners, homography)[0]

    # Inference coordinates -> full-size coordinates, per axis.
    full_size_corners = projected.copy()
    full_size_corners[:, 0] /= scale_x
    full_size_corners[:, 1] /= scale_y

    center = np.mean(full_size_corners, axis=0)
    return (float(center[0]), float(center[1]))
def _template_match_bounding_box_xywh_full_size_float_tuple_from_loftr_template_against_screenshot_match_object(
    match: LoFTRTemplateAgainstScreenshotMatch,
) -> tuple[float, float, float, float]:
    """Return the match bounding box ``(x, y, width, height)`` in full-resolution pixels.

    Sources are tried in order:
    1. the refined bounding box, returned as-is;
    2. the axis-aligned bounds of the template corners projected by the homography;
    3. the axis-aligned bounds of the RANSAC inlier points.
    """

    def axis_aligned_bounds(points_xy: np.ndarray) -> tuple[float, float, float, float]:
        # Extents are converted to Python floats before subtracting.
        left = float(np.min(points_xy[:, 0]))
        top = float(np.min(points_xy[:, 1]))
        right = float(np.max(points_xy[:, 0]))
        bottom = float(np.max(points_xy[:, 1]))
        return (left, top, right - left, bottom - top)

    refined = match.refined_template_bbox_xywh_full_size
    if refined is not None:
        return refined

    homography = match.homography_template_inference_to_screenshot_inference
    scale_x = match.divisor_inference_screenshot_x_to_fullsize_x
    scale_y = match.divisor_inference_screenshot_y_to_fullsize_y

    if homography is None:
        # Fallback: bounds of the inlier points, computed in float64.
        inlier_full_xy = match.ransac_inlier_points_screenshot_inference.astype(np.float64)
        inlier_full_xy[:, 0] /= scale_x
        inlier_full_xy[:, 1] /= scale_y
        return axis_aligned_bounds(inlier_full_xy)

    # Template corners use inclusive pixel indices (0 .. size - 1).
    last_column = match.template_width_pixels_at_inference - 1
    last_row = match.template_height_pixels_at_inference - 1
    template_corners = np.array(
        [[0, 0], [last_column, 0], [last_column, last_row], [0, last_row]],
        dtype=np.float32,
    ).reshape(1, 4, 2)
    projected = cv2.perspectiveTransform(template_corners, homography)[0]

    full_size_corners = projected.copy()
    full_size_corners[:, 0] /= scale_x
    full_size_corners[:, 1] /= scale_y
    return axis_aligned_bounds(full_size_corners)
def _template_match_bounding_box_and_center_in_screenshot_pixels_from_loftr_template_against_screenshot_match_object(
    match: LoFTRTemplateAgainstScreenshotMatch,
) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
    """Bundle the full-resolution bounding box and center of ``match`` into one result object."""
    bbox_xywh = _template_match_bounding_box_xywh_full_size_float_tuple_from_loftr_template_against_screenshot_match_object(
        match
    )
    center_x, center_y = _template_match_result_center_xy_in_screenshot_pixels(match)
    return TemplateMatchBoundingBoxAndCenterInScreenshotPixels(
        bounding_box_left_top_width_height_xywh_float_tuple=bbox_xywh,
        center_screen_x_float=center_x,
        center_screen_y_float=center_y,
    )
class LoFTRTemplateAgainstScreenshotMatcher(TemplateAgainstScreenshotMatcher):
    """Matcher that stores LoFTR settings and delegates to ``run_loftr_template_match``."""

    def __init__(
        self,
        loftr_repository_directory: Path,
        loftr_weight_checkpoint_file_path: Path,
        *,
        template_long_edge_max_pixels: int = 640,
        screenshot_long_edge_max_pixels: int = 1280,
        ransac_reprojection_threshold: float = 3.0,
        max_matches_for_homography: int = 800,
    ) -> None:
        """Store the LoFTR source/weight locations and the matching parameters."""
        # Where the LoFTR sources and checkpoint live.
        self._loftr_repository_directory = loftr_repository_directory
        self._loftr_weight_checkpoint_file_path = loftr_weight_checkpoint_file_path
        # Inference-size limits.
        self._template_long_edge_max_pixels = template_long_edge_max_pixels
        self._screenshot_long_edge_max_pixels = screenshot_long_edge_max_pixels
        # Homography estimation settings.
        self._ransac_reprojection_threshold = ransac_reprojection_threshold
        self._max_matches_for_homography = max_matches_for_homography

    def match_template_center_in_screenshot(
        self,
        template_image_file_path: Path | str,
        screenshot_image_file_path: Path | str | None = None,
        *,
        screenshot_bgr_full_size_numpy: np.ndarray | None = None,
    ) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
        """Run the LoFTR match with the stored settings and return box and center."""
        screenshot_path_for_loftr: Path | None = (
            None
            if screenshot_image_file_path is None
            else Path(screenshot_image_file_path)
        )
        return run_loftr_template_match(
            loftr_repository_directory=self._loftr_repository_directory,
            loftr_weight_checkpoint_path=self._loftr_weight_checkpoint_file_path,
            screenshot_image_path=screenshot_path_for_loftr,
            screenshot_bgr_full_size_numpy=screenshot_bgr_full_size_numpy,
            template_image_path=Path(template_image_file_path),
            template_long_edge_max_pixels=self._template_long_edge_max_pixels,
            screenshot_long_edge_max_pixels=self._screenshot_long_edge_max_pixels,
            ransac_reprojection_threshold=self._ransac_reprojection_threshold,
            max_matches_for_homography=self._max_matches_for_homography,
        )
def match_template_center_xy_for_screenshot_file_and_template_file(
    source_screen_screenshot_image_file_path: Path | str,
    template_image_file_path: Path | str,
) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
    """
    Match ``template_image_file_path`` inside the whole image at
    ``source_screen_screenshot_image_file_path`` (e.g. a full-screen PNG) using LoFTR + NCC.

    Returns the full-resolution bounding box ``(x, y, width, height)`` and the template
    center ``(x, y)``. The LoFTR directory and weights use the repository default paths.
    """
    loftr_directory = _REPOSITORY_ROOT_DIRECTORY / "python" / "LoFTR"
    matcher = LoFTRTemplateAgainstScreenshotMatcher(
        loftr_directory,
        loftr_directory / "weights" / "loftr_outdoor.ckpt",
    )
    screenshot_path = Path(source_screen_screenshot_image_file_path)
    return matcher.match_template_center_in_screenshot(
        template_image_file_path,
        screenshot_path,
    )
def _download_file_with_ssl_fallbacks(download_url: str, destination_file_path: Path) -> None:
    """Download ``download_url`` to ``destination_file_path``, retrying with laxer TLS settings.

    TLS contexts are tried in order: certifi's CA bundle (when certifi is
    usable), the default context, and finally an unverified context.

    The payload is first written to a sibling ``<name>.part`` file and moved
    over the destination only once it is complete, so a failed attempt never
    leaves a truncated file at ``destination_file_path``.

    Raises:
        RuntimeError: when every attempt fails; chained from the last error.
    """
    destination_file_path.parent.mkdir(parents=True, exist_ok=True)

    ssl_context_candidates: list[ssl.SSLContext] = []
    try:
        import certifi

        ssl_context_candidates.append(
            ssl.create_default_context(cafile=certifi.where())
        )
    except (ImportError, OSError):
        # certifi is optional; a missing package or unreadable bundle just
        # means this candidate is skipped.
        pass
    ssl_context_candidates.append(ssl.create_default_context())
    # NOTE(security): last-resort fallback with certificate verification
    # disabled. Kept for compatibility with broken trust stores, but the
    # downloaded file cannot be authenticated on this path.
    ssl_context_candidates.append(ssl._create_unverified_context())

    partial_file_path = destination_file_path.with_name(
        destination_file_path.name + ".part"
    )
    last_error: BaseException | None = None
    for ssl_context in ssl_context_candidates:
        try:
            with urllib.request.urlopen(
                download_url, context=ssl_context, timeout=300
            ) as response:
                payload = response.read()
            partial_file_path.write_bytes(payload)
            # Publish the finished file in one step.
            partial_file_path.replace(destination_file_path)
            return
        except Exception as exc:
            last_error = exc
            try:
                partial_file_path.unlink()
            except OSError:
                pass  # nothing was written, or it cannot be removed
    raise RuntimeError(
        f"无法下载 LoFTR 权重:{download_url}\n请手动保存到:{destination_file_path}"
    ) from last_error
def _resize_grayscale_divisible_by_eight(
    grayscale_image: np.ndarray,
    longest_edge_max_pixels: int,
) -> tuple[np.ndarray, float, float]:
    """Resize so the longest edge fits the limit and both edges are multiples of 8.

    Returns ``(resized_image, divisor_x, divisor_y)`` where
    full-size coordinate = inference coordinate / divisor.
    """
    original_height, original_width = grayscale_image.shape[:2]
    longest_edge = max(original_height, original_width)

    if longest_edge > longest_edge_max_pixels:
        shrink_scale = longest_edge_max_pixels / longest_edge
        target_width = int(round(original_width * shrink_scale))
        target_height = int(round(original_height * shrink_scale))
    else:
        target_width, target_height = original_width, original_height

    # Round each edge down to a multiple of 8, but never below 8.
    inference_width = max(target_width - target_width % 8, 8)
    inference_height = max(target_height - target_height % 8, 8)

    resized = cv2.resize(
        grayscale_image,
        (inference_width, inference_height),
        interpolation=cv2.INTER_AREA,
    )
    return (
        resized,
        inference_width / original_width,
        inference_height / original_height,
    )
def _filter_matches_near_confident_median(
    template_points: np.ndarray,
    screenshot_points: np.ndarray,
    confidence: np.ndarray,
    template_width_inf: int,
    template_height_inf: int,
    min_points: int = 12,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Keep confident matches that cluster around a median screenshot location.

    Only the 200 most confident matches are considered. Starting from the
    median of the 40 most confident screenshot points, matches within a radius
    (at least 48 px, otherwise 2.5x the template diagonal) are kept. If fewer
    than ``min_points`` survive, the radius grows by 1.35x and the median is
    re-estimated, for up to 8 rounds.

    Returns:
        ``(template_points, screenshot_points, confidence)`` restricted to the
        kept matches. If no round reaches ``min_points`` the confidence-sorted
        top matches are returned unfiltered.
    """
    order = np.argsort(-confidence)[:200]
    tp = template_points[order]
    sp = screenshot_points[order]
    cf = confidence[order]

    if len(sp) < min_points:
        # No subset can reach min_points, so the loop below would only fall
        # through to the unfiltered result. Returning now also avoids NaN
        # medians (and NumPy RuntimeWarnings) on empty input.
        return tp, sp, cf

    median_screen = np.median(sp[:40], axis=0)
    diag = float(np.hypot(template_width_inf, template_height_inf))
    radius = max(diag * 2.5, 48.0)
    for _ in range(8):
        keep = np.linalg.norm(sp - median_screen, axis=1) < radius
        if int(np.count_nonzero(keep)) >= min_points:
            return tp[keep], sp[keep], cf[keep]
        radius *= 1.35
        # Re-center on what was kept so far, or on everything if nothing was.
        median_screen = np.median(sp[keep] if keep.any() else sp, axis=0)
    return tp, sp, cf
def _homography_quad_plausible_on_full_image(
    corners_full_xy: np.ndarray,
    image_width: int,
    image_height: int,
    template_orig_w: int,
    template_orig_h: int,
) -> bool:
    """Check that a projected quad lies in the image with template-like scale and aspect ratio.

    The quad is rejected when it is not a finite 4x2 array, extends more than
    8% of the image size beyond any border, has a bounding box under 4 px on
    either axis, has an aspect ratio off by more than 3.5x from the template,
    or has an area outside 0.12x to 30x the template area.
    """
    if corners_full_xy.shape != (4, 2):
        return False
    if not np.isfinite(corners_full_xy).all():
        return False

    xs = corners_full_xy[:, 0]
    ys = corners_full_xy[:, 1]
    left, right = xs.min(), xs.max()
    top, bottom = ys.min(), ys.max()

    # Allow a small overshoot past each image border.
    slack_x = 0.08 * image_width
    slack_y = 0.08 * image_height
    if left < -slack_x or right > image_width + slack_x:
        return False
    if top < -slack_y or bottom > image_height + slack_y:
        return False

    bbox_w = float(right - left)
    bbox_h = float(bottom - top)
    if min(bbox_w, bbox_h) < 4:
        return False

    template_aspect = template_orig_w / max(template_orig_h, 1)
    quad_aspect = bbox_w / max(bbox_h, 1e-6)
    aspect_ratio = quad_aspect / max(template_aspect, 1e-6)
    if not ((1.0 / 3.5) <= aspect_ratio <= 3.5):
        return False

    quad_area = float(cv2.contourArea(corners_full_xy.astype(np.float32)))
    template_area = float(template_orig_w * template_orig_h)
    if quad_area < 0.12 * template_area:
        return False
    if quad_area > 30.0 * template_area:
        return False
    return True
def _ncc_global_multiscale_best_match(
    screenshot_gray_full: np.ndarray,
    template_gray_full: np.ndarray,
    template_scale_factors: tuple[float, ...] = (
        0.88,
        0.92,
        0.96,
        1.0,
        1.04,
        1.08,
        1.12,
    ),
    minimum_acceptable_score: float = 0.34,
) -> tuple[tuple[float, float, float, float], float] | None:
    """
    Run TM_CCOEFF_NORMED over the whole image for several template scales and keep
    the single highest response.

    Intended for small UI icons at roughly screenshot scale, where searching only
    near a LoFTR cluster could lock onto the wrong location.

    Returns ``((x, y, w, h), best_score)``, or ``None`` when the template does not
    fit inside the image or the best score is below ``minimum_acceptable_score``.
    """
    image_height, image_width = screenshot_gray_full.shape[:2]
    base_height, base_width = template_gray_full.shape[:2]
    if base_height >= image_height or base_width >= image_width:
        return None

    top_score = -1.0
    top_bbox: tuple[float, float, float, float] | None = None
    for factor in template_scale_factors:
        scaled_width = max(3, int(round(base_width * factor)))
        scaled_height = max(3, int(round(base_height * factor)))
        # Skip scales at which the template no longer fits strictly inside the image.
        if scaled_width >= image_width or scaled_height >= image_height:
            continue
        scaled_template = cv2.resize(
            template_gray_full,
            (scaled_width, scaled_height),
            interpolation=cv2.INTER_AREA,
        )
        scores = cv2.matchTemplate(
            screenshot_gray_full, scaled_template, cv2.TM_CCOEFF_NORMED
        )
        _, peak_score, _, peak_xy = cv2.minMaxLoc(scores)
        if peak_score > top_score:
            top_score = float(peak_score)
            top_bbox = (
                float(peak_xy[0]),
                float(peak_xy[1]),
                float(scaled_width),
                float(scaled_height),
            )

    if top_bbox is None:
        return None
    if top_score < minimum_acceptable_score:
        return None
    return top_bbox, top_score
def list_template_match_centers_ncc_multiscale(
    template_image_file_path: Path | str,
    *,
    screenshot_image_file_path: Path | str | None = None,
    screenshot_bgr_numpy: np.ndarray | None = None,
    template_scale_factors: tuple[float, ...] = (
        0.88,
        0.92,
        0.96,
        1.0,
        1.04,
        1.08,
        1.12,
    ),
    min_score: float = 0.28,
    max_peaks_per_scale: int = 8,
    dedupe_distance_pixels: float = 28.0,
) -> list[tuple[float, float]]:
    """
    Collect template-match centers over the whole image at several template scales.

    For each scale, up to ``max_peaks_per_scale`` response peaks with a score of at
    least ``min_score`` are taken from ``matchTemplate``, suppressing a window around
    each accepted peak. Candidates from all scales are then visited from highest to
    lowest score, and any candidate closer than ``dedupe_distance_pixels`` to an
    already kept center is dropped.

    Returns template center points in full-resolution coordinates, intended for
    picking the match nearest to an OCR anchor.

    Raises:
        FileNotFoundError: if the screenshot or the template cannot be read.
    """
    screenshot_bgr = (
        screenshot_bgr_numpy
        if screenshot_bgr_numpy is not None
        else cv2.imread(str(screenshot_image_file_path), cv2.IMREAD_COLOR)
    )
    template_gray = cv2.imread(str(template_image_file_path), cv2.IMREAD_GRAYSCALE)
    if screenshot_bgr is None or template_gray is None:
        raise FileNotFoundError(
            f"无法读取截图或模板:screenshot_image_file_path={screenshot_image_file_path!s} template_image_file_path={template_image_file_path!s}"
        )

    screenshot_gray = cv2.cvtColor(screenshot_bgr, cv2.COLOR_BGR2GRAY)
    image_height, image_width = screenshot_gray.shape[:2]
    base_height, base_width = template_gray.shape[:2]
    if base_height >= image_height or base_width >= image_width:
        return []

    # (center_x, center_y, score) for every accepted peak at every scale.
    scored_centers: list[tuple[float, float, float]] = []
    for factor in template_scale_factors:
        scaled_width = max(3, int(round(base_width * factor)))
        scaled_height = max(3, int(round(base_height * factor)))
        if scaled_width >= image_width or scaled_height >= image_height:
            continue
        scaled_template = cv2.resize(
            template_gray,
            (scaled_width, scaled_height),
            interpolation=cv2.INTER_AREA,
        )
        remaining = cv2.matchTemplate(
            screenshot_gray, scaled_template, cv2.TM_CCOEFF_NORMED
        ).copy()
        suppression_radius = int(max(4, max(scaled_width, scaled_height) * 0.55))
        map_height, map_width = remaining.shape[:2]

        for _ in range(max_peaks_per_scale):
            _, peak_score, _, (peak_x, peak_y) = cv2.minMaxLoc(remaining)
            if peak_score < min_score:
                break
            scored_centers.append(
                (
                    float(peak_x) + float(scaled_width) / 2.0,
                    float(peak_y) + float(scaled_height) / 2.0,
                    float(peak_score),
                )
            )
            # Blank out the neighbourhood so the next peak is a different one.
            row_start = max(0, peak_y - suppression_radius)
            row_stop = min(map_height, peak_y + suppression_radius + 1)
            column_start = max(0, peak_x - suppression_radius)
            column_stop = min(map_width, peak_x + suppression_radius + 1)
            remaining[row_start:row_stop, column_start:column_stop] = -1.0

    kept_centers: list[tuple[float, float]] = []
    for center_x, center_y, _score in sorted(scored_centers, key=lambda entry: -entry[2]):
        is_duplicate = any(
            math.hypot(center_x - kept_x, center_y - kept_y) < dedupe_distance_pixels
            for kept_x, kept_y in kept_centers
        )
        if not is_duplicate:
            kept_centers.append((center_x, center_y))
    return kept_centers
def _ncc_refine_template_bbox(
    screenshot_gray_full: np.ndarray,
    template_gray_full: np.ndarray,
    center_x_full: float,
    center_y_full: float,
    search_margin_full: float,
    min_ncc_score: float = 0.25,
) -> tuple[float, float, float, float] | None:
    """Run normalized cross-correlation in a window around a coarse center.

    The window extends ``max(search_margin_full, 2 * longest template edge)``
    pixels from the center on each side, clipped to the image.

    Returns the best ``(x, y, w, h)`` in full-image coordinates, or ``None``
    when the template does not fit or the best score is below ``min_ncc_score``.
    """
    image_height, image_width = screenshot_gray_full.shape[:2]
    template_height, template_width = template_gray_full.shape[:2]
    if template_height >= image_height or template_width >= image_width:
        return None

    half_window = int(max(search_margin_full, max(template_width, template_height) * 2))
    left = int(np.clip(center_x_full - half_window, 0, image_width - 1))
    top = int(np.clip(center_y_full - half_window, 0, image_height - 1))
    right = int(np.clip(center_x_full + half_window, 0, image_width))
    bottom = int(np.clip(center_y_full + half_window, 0, image_height))

    # The window must be strictly larger than the template on both axes.
    if right - left <= template_width or bottom - top <= template_height:
        return None

    search_window = screenshot_gray_full[top:bottom, left:right]
    scores = cv2.matchTemplate(search_window, template_gray_full, cv2.TM_CCOEFF_NORMED)
    _, best_score, _, (offset_x, offset_y) = cv2.minMaxLoc(scores)
    if best_score < min_ncc_score:
        return None

    # Translate the window-relative peak back to full-image coordinates.
    return (
        float(left + offset_x),
        float(top + offset_y),
        float(template_width),
        float(template_height),
    )
def run_loftr_template_match(
    *,
    loftr_repository_directory: Path,
    loftr_weight_checkpoint_path: Path,
    screenshot_image_path: Path | None = None,
    screenshot_bgr_full_size_numpy: np.ndarray | None = None,
    template_image_path: Path,
    template_long_edge_max_pixels: int = 640,
    screenshot_long_edge_max_pixels: int = 1280,
    ransac_reprojection_threshold: float = 3.0,
    max_matches_for_homography: int = 800,
) -> TemplateMatchBoundingBoxAndCenterInScreenshotPixels:
    """
    Locate a template image inside a screenshot with LoFTR plus NCC refinement.

    Steps:
    1. Load the template (grayscale) and the screenshot (array or file).
    2. Run LoFTR on downscaled copies of both images.
    3. Filter the matches and estimate a RANSAC homography, discarding it when
       the projected quad is implausible.
    4. Refine the position with NCC at full resolution: a confident global
       multi-scale match first, then a local search around the LoFTR center,
       then a low-confidence global match.

    Nothing is written to disk apart from the weight download that happens when
    ``loftr_weight_checkpoint_path`` does not exist.

    Args:
        loftr_repository_directory: LoFTR source checkout; prepended to
            ``sys.path`` so that ``src.loftr`` can be imported.
        loftr_weight_checkpoint_path: Checkpoint file; downloaded if missing.
        screenshot_image_path: Screenshot file, read when no array is given.
        screenshot_bgr_full_size_numpy: Screenshot array; takes precedence over
            the file path.
        template_image_path: Template image file.
        template_long_edge_max_pixels: Longest-edge limit for the template at inference.
        screenshot_long_edge_max_pixels: Longest-edge limit for the screenshot at inference.
        ransac_reprojection_threshold: Passed to ``cv2.findHomography``.
        max_matches_for_homography: Cap on the matches fed to RANSAC.

    Returns:
        Bounding box and center in full-resolution screenshot pixels.

    Raises:
        FileNotFoundError: if the LoFTR directory is missing or an image cannot be read.
    """
    if not loftr_repository_directory.is_dir():
        raise FileNotFoundError(f"未找到 LoFTR 源码目录:{loftr_repository_directory}")
    repository_path_string = str(loftr_repository_directory)
    if repository_path_string not in sys.path:
        sys.path.insert(0, repository_path_string)

    # Imported late: torch is heavy and src.loftr is only importable after the
    # sys.path tweak above.
    import torch  # noqa: E402
    from src.loftr import LoFTR, default_cfg  # noqa: E402

    if not loftr_weight_checkpoint_path.is_file():
        _download_file_with_ssl_fallbacks(
            LOFTR_OUTDOOR_WEIGHT_DOWNLOAD_URL, loftr_weight_checkpoint_path
        )

    if screenshot_bgr_full_size_numpy is not None:
        screenshot_bgr_full_size = screenshot_bgr_full_size_numpy
    else:
        screenshot_bgr_full_size = cv2.imread(
            str(screenshot_image_path), cv2.IMREAD_COLOR
        )
    template_grayscale = cv2.imread(str(template_image_path), cv2.IMREAD_GRAYSCALE)
    if screenshot_bgr_full_size is None or template_grayscale is None:
        raise FileNotFoundError(
            f"无法读取图片:screenshot_image_path={screenshot_image_path!s} template_image_path={template_image_path!s}"
        )

    template_orig_h, template_orig_w = template_grayscale.shape[:2]
    screenshot_grayscale = cv2.cvtColor(screenshot_bgr_full_size, cv2.COLOR_BGR2GRAY)
    full_h, full_w = screenshot_grayscale.shape[:2]

    template_at_inference, _, _ = _resize_grayscale_divisible_by_eight(
        template_grayscale, template_long_edge_max_pixels
    )
    screenshot_at_inference, divisor_screen_x, divisor_screen_y = (
        _resize_grayscale_divisible_by_eight(
            screenshot_grayscale, screenshot_long_edge_max_pixels
        )
    )
    template_height_at_inference, template_width_at_inference = (
        template_at_inference.shape[:2]
    )

    compute_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    template_batch = (
        torch.from_numpy(template_at_inference).float()[None, None].to(compute_device)
        / 255.0
    )
    screenshot_batch = (
        torch.from_numpy(screenshot_at_inference).float()[None, None].to(compute_device)
        / 255.0
    )

    matcher_network = LoFTR(config=default_cfg)
    # NOTE(review): torch.load unpickles the checkpoint, so the file must come
    # from a trusted source.
    checkpoint = torch.load(
        str(loftr_weight_checkpoint_path), map_location=compute_device
    )
    matcher_network.load_state_dict(checkpoint["state_dict"])
    matcher_network = matcher_network.eval().to(compute_device)

    forward_batch = {"image0": template_batch, "image1": screenshot_batch}
    # Scoped no_grad: calling torch.set_grad_enabled(False) as a plain function
    # would leave autograd switched off for the caller after this returns.
    with torch.no_grad():
        matcher_network(forward_batch)
    template_match_points = forward_batch["mkpts0_f"].detach().cpu().numpy()
    screenshot_match_points = forward_batch["mkpts1_f"].detach().cpu().numpy()
    match_confidence = forward_batch["mconf"].detach().cpu().numpy()

    trusted_homography: np.ndarray | None
    screenshot_points_for_ransac: np.ndarray
    inlier_screen: np.ndarray
    if len(template_match_points) < 4:
        # Too few matches for a homography: use the inference-image center as
        # the only "inlier" so the later steps still have a search center.
        inference_screenshot_height, inference_screenshot_width = (
            screenshot_at_inference.shape[:2]
        )
        inlier_screen = np.array(
            [
                [
                    float(inference_screenshot_width) * 0.5,
                    float(inference_screenshot_height) * 0.5,
                ]
            ],
            dtype=np.float32,
        )
        screenshot_points_for_ransac = inlier_screen.copy()
        trusted_homography = None
    else:
        tp_filt, sp_filt, cf_filt = _filter_matches_near_confident_median(
            template_match_points,
            screenshot_match_points,
            match_confidence,
            template_width_at_inference,
            template_height_at_inference,
        )
        confidence_sorted_indices = np.argsort(-cf_filt)[
            : min(max_matches_for_homography, len(cf_filt))
        ]
        template_points_for_ransac = tp_filt[confidence_sorted_indices].astype(
            np.float32
        )
        screenshot_points_for_ransac = sp_filt[confidence_sorted_indices].astype(
            np.float32
        )
        homography_matrix, homography_mask = cv2.findHomography(
            template_points_for_ransac,
            screenshot_points_for_ransac,
            cv2.RANSAC,
            ransac_reprojection_threshold,
            maxIters=5000,
            confidence=0.995,
        )
        if homography_mask is not None:
            inlier_flat = homography_mask.ravel().astype(bool)
            inlier_screen = screenshot_points_for_ransac[inlier_flat]
        else:
            inlier_screen = screenshot_points_for_ransac
        if len(inlier_screen) < 4:
            inlier_screen = sp_filt

        trusted_homography = homography_matrix
        if homography_matrix is not None:
            # Project the template corners and keep the homography only if the
            # resulting quad is plausible on the full-size image.
            corners_tpl = np.array(
                [
                    [0, 0],
                    [template_width_at_inference - 1, 0],
                    [
                        template_width_at_inference - 1,
                        template_height_at_inference - 1,
                    ],
                    [0, template_height_at_inference - 1],
                ],
                dtype=np.float32,
            ).reshape(1, 4, 2)
            corners_inf = cv2.perspectiveTransform(corners_tpl, homography_matrix)[0]
            corners_full = corners_inf.copy()
            corners_full[:, 0] /= divisor_screen_x
            corners_full[:, 1] /= divisor_screen_y
            if not _homography_quad_plausible_on_full_image(
                corners_full, full_w, full_h, template_orig_w, template_orig_h
            ):
                trusted_homography = None

    center_inf = np.median(inlier_screen, axis=0)
    center_full_x = float(center_inf[0] / divisor_screen_x)
    center_full_y = float(center_inf[1] / divisor_screen_y)
    search_margin = float(
        max(
            template_orig_w,
            template_orig_h,
            template_width_at_inference / divisor_screen_x,
        )
        * 2.5
    )

    # One global multi-scale pass serves both thresholds: the best location is
    # the same, only the acceptance score differs.
    strict_global_ncc_minimum_score = 0.34
    relaxed_global_ncc_minimum_score = 0.18
    global_ncc = _ncc_global_multiscale_best_match(
        screenshot_grayscale,
        template_grayscale,
        minimum_acceptable_score=relaxed_global_ncc_minimum_score,
    )

    refined_bbox: tuple[float, float, float, float] | None
    if global_ncc is not None and global_ncc[1] >= strict_global_ncc_minimum_score:
        # Confident global match: preferred over anything near the LoFTR cluster.
        refined_bbox = global_ncc[0]
    else:
        # The local search runs only when the confident global match is missing.
        refined_bbox = _ncc_refine_template_bbox(
            screenshot_grayscale,
            template_grayscale,
            center_full_x,
            center_full_y,
            search_margin,
        )
        if refined_bbox is None and global_ncc is not None:
            # Last resort: the low-confidence global match.
            refined_bbox = global_ncc[0]

    template_against_screenshot_match = LoFTRTemplateAgainstScreenshotMatch(
        screenshot_bgr_full_size=screenshot_bgr_full_size,
        homography_template_inference_to_screenshot_inference=trusted_homography,
        template_width_pixels_at_inference=template_width_at_inference,
        template_height_pixels_at_inference=template_height_at_inference,
        divisor_inference_screenshot_x_to_fullsize_x=divisor_screen_x,
        divisor_inference_screenshot_y_to_fullsize_y=divisor_screen_y,
        high_confidence_match_points_on_screenshot_inference=screenshot_points_for_ransac,
        template_original_width_pixels=template_orig_w,
        template_original_height_pixels=template_orig_h,
        refined_template_bbox_xywh_full_size=refined_bbox,
        ransac_inlier_points_screenshot_inference=inlier_screen.astype(np.float32),
    )
    return _template_match_bounding_box_and_center_in_screenshot_pixels_from_loftr_template_against_screenshot_match_object(
        template_against_screenshot_match,
    )
|