#!/usr/bin/env python3
# yichael/xhs-note-crawling
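"""
Run OCR on **images** with the ``python/RapidOCR`` copy bundled in this repository.

``ocr_find_text_center``: find the box whose single recognized line fully contains the
target substring (same rule as the CLI).

``ocr_find_text_center_allowing_note_title_wrapped_across_two_ocr_lines`` (note-title anchor):
only the **first logical line** of the user string (the text before the first newline) is
used; the second line onward takes **no part** in matching or in the geometry.
The first line is **NFKC**-normalized, then characters that are hard to OCR are cut out,
which leaves one or more recognizable text segments. OCR runs **once** on the whole image;
each segment takes the center of its smallest-area matching box, and when the logical line
has several segments the representative point is the midpoint of the first and last segment
centers (multiple logical lines are no longer stitched together).

Independent of Playwright / the browser DOM; the only input is an image path or image bytes.

Usage: ``python workplace/ocr-pos.py <target text> <image path>``
On success stdout carries one line: ``cx cy`` (integers).

Tunable module-level constants: ``OCR_RAPIDOCR_GLOBAL_MAX_SIDE_LEN``, ``OCR_RAPIDOCR_INFERENCE_ENGINE_NAME``,
``OCR_RAPIDOCR_ONNX_RUNTIME_USE_CUDA`` (maps to ``EngineConfig.onnxruntime.use_cuda``; only applies to the ``onnxruntime`` engine).
For optional inference dependencies see ``python/RapidOCR/requirements-inference-*.txt``.
"""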

from __future__ import annotations

import sys
import unicodedata
from pathlib import Path
from typing import Any

import cv2
import numpy as np

# Repository root: the directory two levels above this file's resolved location.
_REPO_ROOT = Path(__file__).resolve().parent.parent
# Directory expected to hold the bundled ``rapidocr`` package; it is put at the front of
# ``sys.path`` (once) so the function-level ``import rapidocr`` statements below can find it.
_RAPIDOCR_PKG_ROOT = _REPO_ROOT / "python" / "RapidOCR" / "python"
if str(_RAPIDOCR_PKG_ROOT) not in sys.path:
    sys.path.insert(0, str(_RAPIDOCR_PKG_ROOT))

# Public alias of the repository root (exported through ``__all__``).
REPO_ROOT: Path = _REPO_ROOT

# Tunable settings. Earlier author notes, kept for reference:
#   - max side length: 1280 or 960 is faster and often enough for top-bar OCR, but values
#     that are too small tend to lose small text;
#   - engine name: "openvino" is often faster on Intel CPUs, "tensorrt" needs an NVIDIA setup;
#   - CUDA flag: only meaningful with onnxruntime-gpu installed.

OCR_RAPIDOCR_GLOBAL_MAX_SIDE_LEN: int = 2000  # try 1280 or 960 for speed; too small loses small text
OCR_RAPIDOCR_INFERENCE_ENGINE_NAME: str = "onnxruntime"  # matches the onnxruntime requirement; switch to "openvino" if it is installed
OCR_RAPIDOCR_ONNX_RUNTIME_USE_CUDA: bool = False  # set True when onnxruntime-gpu is installed and the GPU should be used


# Cache slot for the RapidOCR engine singleton (do not edit this variable). Only the three
# constants above are meant to be changed; the engine is built once per process, so a change
# takes effect only after the process is restarted.
_ocr_engine_singleton = None


def _build_rapid_ocr_params_dictionary() -> dict[str, Any]:
    """Assemble the ``params`` mapping handed to the ``RapidOCR`` constructor.

    The engine is chosen from ``OCR_RAPIDOCR_INFERENCE_ENGINE_NAME`` (trimmed,
    lower-cased): ``"openvino"`` and ``"tensorrt"`` select those engines, and
    every other value falls back to ONNX Runtime. The same engine is applied
    to the Det, Cls and Rec stages. The CUDA switch is only written for the
    ONNX Runtime fallback.
    """
    from rapidocr.utils.typings import EngineType

    engine_key = OCR_RAPIDOCR_INFERENCE_ENGINE_NAME.strip().lower()
    params: dict[str, Any] = {
        "Global.max_side_len": int(OCR_RAPIDOCR_GLOBAL_MAX_SIDE_LEN),
        "Global.use_cls": False,
    }

    # Enum members are looked up lazily so only the selected one is touched.
    falls_back_to_onnxruntime = False
    if engine_key == "openvino":
        selected_engine_type = EngineType.OPENVINO
    elif engine_key == "tensorrt":
        selected_engine_type = EngineType.TENSORRT
    else:
        selected_engine_type = EngineType.ONNXRUNTIME
        falls_back_to_onnxruntime = True

    for stage_engine_key in ("Det.engine_type", "Cls.engine_type", "Rec.engine_type"):
        params[stage_engine_key] = selected_engine_type

    if falls_back_to_onnxruntime:
        params["EngineConfig.onnxruntime.use_cuda"] = bool(
            OCR_RAPIDOCR_ONNX_RUNTIME_USE_CUDA
        )
    return params


def _get_rapid_ocr():
    """Return the cached RapidOCR engine, building it on the first call.

    The engine is stored in the module-level ``_ocr_engine_singleton`` and is
    configured from ``_build_rapid_ocr_params_dictionary()`` at creation time.
    """
    global _ocr_engine_singleton
    if _ocr_engine_singleton is not None:
        return _ocr_engine_singleton

    # Imported here rather than at module top, so importing this module does not load rapidocr.
    from rapidocr import RapidOCR

    engine_params = _build_rapid_ocr_params_dictionary()
    _ocr_engine_singleton = RapidOCR(params=engine_params)
    return _ocr_engine_singleton


def _rapidocr_input_from_path_bytes_or_bgr_numpy(
    image: str | Path | bytes | np.ndarray,
):
    if isinstance(image, np.ndarray):
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return image


def _quad_center_xy(box) -> tuple[float, float]:
    """Return the centroid of the corner points in ``box``.

    ``box`` is a sequence of ``(x, y)`` corner points (four of them, shape
    (4, 2), for an OCR quadrilateral). The result is the arithmetic mean of
    the x values and of the y values.
    """
    corner_points = [(float(corner[0]), float(corner[1])) for corner in box]
    corner_count = len(corner_points)
    mean_x = sum(x for x, _ in corner_points) / corner_count
    mean_y = sum(y for _, y in corner_points) / corner_count
    return mean_x, mean_y


# Maps the full-width punctuation marks below onto their ASCII counterparts.
_FULLWIDTH_TO_ASCII_PUNCTUATION_TRANSLATION_TABLE = str.maketrans(
    "，？（）：；！",
    ",?():;!",
)


def _normalize_ocr_match_string(s: str) -> str:
    """Canonicalize text so that expected strings and OCR output compare equal.

    All whitespace (leading, trailing and interior) is removed, seven
    full-width punctuation marks are replaced by their ASCII forms, and the
    result is case-folded. A falsy ``s`` (``None`` or ``""``) yields ``""``.
    """
    # ``split()`` without arguments already discards leading/trailing whitespace.
    text_without_whitespace = "".join((s or "").split())
    ascii_punctuation_text = text_without_whitespace.translate(
        _FULLWIDTH_TO_ASCII_PUNCTUATION_TRANSLATION_TABLE
    )
    return ascii_punctuation_text.casefold()


def _ocr_line_contains_full_target(ocr_line: str, target_text: str) -> bool:
    """Tell whether the **whole** target appears contiguously in one OCR line.

    Both strings are normalized first. The reverse direction is deliberately
    not accepted: a recognized line that is merely a fragment of the target
    does not match, so the resulting center never lands on an incomplete
    piece of text. An empty normalized target never matches.
    """
    normalized_line = _normalize_ocr_match_string(ocr_line)
    normalized_target = _normalize_ocr_match_string(target_text)
    return bool(normalized_target) and normalized_target in normalized_line


def _ocr_dom_anchor_line_matches_normalized_text(
    ocr_line: str,
    anchor_norm: str,
) -> bool:
    """Match an OCR line against an already-normalized anchor string.

    Anchors of two or more characters match as a substring of the normalized
    line; a single-character anchor must equal the whole normalized line. An
    empty anchor never matches. (The original note says this mirrors the
    ``clear-input`` rule, which is not visible in this file.)
    """
    normalized_line = _normalize_ocr_match_string(ocr_line)
    if not anchor_norm:
        return False
    if len(anchor_norm) < 2:
        return normalized_line == anchor_norm
    return anchor_norm in normalized_line


def ocr_smallest_line_center_for_dom_text(
    image: str | Path | bytes | np.ndarray,
    anchor_dom_text: str,
) -> tuple[int, int] | None:
    """Locate ``anchor_dom_text`` in the image and return the center of its tightest box.

    Every OCR line that matches the normalized anchor is a candidate; among
    them the one whose axis-aligned bounding rectangle has the smallest area
    wins, and the center of its quadrilateral is returned as rounded integer
    pixel coordinates. Returns ``None`` when the anchor normalizes to an empty
    string, OCR yields nothing, or no line matches.
    """
    anchor_norm = _normalize_ocr_match_string(anchor_dom_text)
    if not anchor_norm:
        return None

    engine = _get_rapid_ocr()
    result = engine(_rapidocr_input_from_path_bytes_or_bgr_numpy(image))
    if result is None or not result.txts or result.boxes is None:
        return None

    matching_line_indices = [
        line_index
        for line_index, line_text in enumerate(result.txts)
        if _ocr_dom_anchor_line_matches_normalized_text(line_text, anchor_norm)
    ]
    if not matching_line_indices:
        return None

    def bounding_rectangle_area(line_index: int) -> float:
        # Area of the axis-aligned rectangle that encloses the line's corner points.
        corners = result.boxes[line_index]
        x_values = [float(corner[0]) for corner in corners]
        y_values = [float(corner[1]) for corner in corners]
        width = max(x_values) - min(x_values)
        height = max(y_values) - min(y_values)
        return width * height

    tightest_line_index = min(matching_line_indices, key=bounding_rectangle_area)
    center_x, center_y = _quad_center_xy(result.boxes[tightest_line_index])
    return int(round(center_x)), int(round(center_y))


# Punctuation characters that the title-anchor character predicate below accepts as
# "recognizable" in addition to ASCII and CJK ideographs.
# NOTE: the argument is two adjacent string literals that Python concatenates; the first
# literal ends right after "！". The resulting set therefore contains the ASCII apostrophe
# but no double-quote character. Characters that repeat in the text ("—", "…") appear
# once in the set.
# NOTE(review): curly quotation marks may have been intended where the straight quotes
# sit; confirm against the upstream file before changing the literal.
_CJK_AND_COMMON_TITLE_PUNCTUATION_CHARACTER_FROZEN_SET = frozenset(
    "，。、：；？！""''（）《》·—…－—「」『』【】〈〉＆％＃＠…—",
)


def _is_single_character_treated_as_recognizable_by_rapidocr_line_text_anchor(
    single_unicode_character: str,
) -> bool:
    """Decide whether one character may stay inside a title-anchor segment.

    Accepted characters are:

    * ASCII letters and digits, plus space, tab and ``. - _ / : & ! ?``;
    * code points in U+4E00-U+9FFF, U+3400-U+4DBF or U+20000-U+2CEAF;
    * members of ``_CJK_AND_COMMON_TITLE_PUNCTUATION_CHARACTER_FROZEN_SET``.

    Anything that is not exactly one character long is rejected.
    """
    if len(single_unicode_character) != 1:
        return False

    # ASCII is decided entirely here; other ASCII punctuation is rejected.
    if single_unicode_character.isascii():
        return (
            single_unicode_character.isalnum()
            or single_unicode_character in " \t.-_/:&!?"
        )

    code_point = ord(single_unicode_character)
    accepted_code_point_ranges = (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2CEAF),
    )
    if any(low <= code_point <= high for low, high in accepted_code_point_ranges):
        return True
    return (
        single_unicode_character
        in _CJK_AND_COMMON_TITLE_PUNCTUATION_CHARACTER_FROZEN_SET
    )


def _split_user_anchor_string_into_recognizable_text_segment_string_list(
    raw_anchor_string_from_user: str,
) -> list[str]:
    """Cut an anchor string into maximal runs of recognizable characters.

    The input (``None`` is treated as ``""``) is NFKC-normalized first. Each
    character is then classified by the recognizability predicate; characters
    that fail act as separators and are dropped, and every non-empty run
    between them becomes one segment, in original order.
    """
    normalized_text = unicodedata.normalize(
        "NFKC",
        raw_anchor_string_from_user or "",
    )
    segments: list[str] = []
    run_start: int | None = None  # index where the current recognizable run began
    for position, character in enumerate(normalized_text):
        if _is_single_character_treated_as_recognizable_by_rapidocr_line_text_anchor(
            character,
        ):
            if run_start is None:
                run_start = position
        elif run_start is not None:
            segments.append(normalized_text[run_start:position])
            run_start = None
    if run_start is not None:
        # The string ended while a run was still open.
        segments.append(normalized_text[run_start:])
    return segments


def _split_anchor_raw_string_into_logical_line_then_recognizable_segment_nested_string_list(
    raw_anchor_string_from_user: str,
) -> list[list[str]]:
    """Split an anchor into logical lines, then each line into recognizable segments.

    ``\\r\\n`` and bare ``\\r`` are treated as ``\\n``. Lines that are blank
    after stripping, and lines that produce no recognizable segment, are left
    out, so every inner list in the result is non-empty.
    """
    text_with_unified_newlines = (
        (raw_anchor_string_from_user or "").replace("\r\n", "\n").replace("\r", "\n")
    )
    stripped_lines = (line.strip() for line in text_with_unified_newlines.split("\n"))
    segment_lists = (
        _split_user_anchor_string_into_recognizable_text_segment_string_list(line)
        for line in stripped_lines
        if line
    )
    return [segments for segments in segment_lists if segments]


def _midpoint_float_xy_pair_from_first_and_last_center_in_ordered_float_pair_list(
    ordered_center_xy_float_pair_list: list[tuple[float, float]],
) -> tuple[float, float] | None:
    if not ordered_center_xy_float_pair_list:
        return None
    if len(ordered_center_xy_float_pair_list) == 1:
        lone_center_x, lone_center_y = ordered_center_xy_float_pair_list[0]
        return lone_center_x, lone_center_y
    first_center_x, first_center_y = ordered_center_xy_float_pair_list[0]
    last_center_x, last_center_y = ordered_center_xy_float_pair_list[-1]
    return (
        (first_center_x + last_center_x) / 2.0,
        (first_center_y + last_center_y) / 2.0,
    )


def _midpoint_float_xy_pair_from_closest_pair_of_points_in_float_pair_list(
    center_xy_float_pair_list: list[tuple[float, float]],
) -> tuple[float, float] | None:
    point_count = len(center_xy_float_pair_list)
    if point_count == 0:
        return None
    if point_count == 1:
        lone_x, lone_y = center_xy_float_pair_list[0]
        return lone_x, lone_y
    best_squared_euclidean_distance_between_pair: float | None = None
    midpoint_x_between_closest_pair = 0.0
    midpoint_y_between_closest_pair = 0.0
    for first_point_index in range(point_count):
        first_x, first_y = center_xy_float_pair_list[first_point_index]
        for second_point_index in range(first_point_index + 1, point_count):
            second_x, second_y = center_xy_float_pair_list[second_point_index]
            delta_x = first_x - second_x
            delta_y = first_y - second_y
            squared_distance = delta_x * delta_x + delta_y * delta_y
            if (
                best_squared_euclidean_distance_between_pair is None
                or squared_distance < best_squared_euclidean_distance_between_pair
            ):
                best_squared_euclidean_distance_between_pair = squared_distance
                midpoint_x_between_closest_pair = (first_x + second_x) / 2.0
                midpoint_y_between_closest_pair = (first_y + second_y) / 2.0
    return midpoint_x_between_closest_pair, midpoint_y_between_closest_pair


def _ocr_smallest_area_line_index_where_normalized_line_contains_substring(
    ocr_result: Any,
    substring_normalized: str,
) -> int | None:
    if not substring_normalized:
        return None
    best_line_index: int | None = None
    best_area_pixels = float("inf")
    for line_index, line_text in enumerate(ocr_result.txts):
        line_normalized = _normalize_ocr_match_string(line_text)
        if substring_normalized not in line_normalized:
            continue
        box = ocr_result.boxes[line_index]
        xs = [float(p[0]) for p in box]
        ys = [float(p[1]) for p in box]
        area_pixels = (max(xs) - min(xs)) * (max(ys) - min(ys))
        if area_pixels < best_area_pixels:
            best_area_pixels = area_pixels
            best_line_index = line_index
    return best_line_index


def _first_logical_line_only_from_multiline_anchor_string(raw_anchor_string: str) -> str:
    """Return only the first logical line of the anchor, stripped.

    The first line is everything before the first line break (``\\n``,
    ``\\r\\n`` or bare ``\\r``); later lines are discarded. ``None`` is
    treated as an empty string.
    """
    # Turning every "\r" into "\n" leaves the text before the first break unchanged.
    text_with_only_line_feeds = (raw_anchor_string or "").replace("\r", "\n")
    first_line, _, _ = text_with_only_line_feeds.partition("\n")
    return first_line.strip()


def ocr_find_text_center_allowing_note_title_wrapped_across_two_ocr_lines(
    image: str | Path | bytes | np.ndarray,
    target_text: str,
) -> tuple[int, int] | None:
    """Locate a note title in the image by matching its recognizable segments.

    Only the first logical line of ``target_text`` is used. That line is split
    into recognizable segments, OCR runs once, and every segment must be found
    in some OCR line (the smallest-area matching line is taken); if any
    segment is missing the result is ``None``. The representative point of the
    line is the midpoint of its first and last segment centers, returned as
    rounded integer pixel coordinates.

    Returns ``None`` as well when the target yields no recognizable segment or
    OCR produces no text.
    """
    first_logical_line = _first_logical_line_only_from_multiline_anchor_string(
        target_text,
    )
    segments_per_logical_line = (
        _split_anchor_raw_string_into_logical_line_then_recognizable_segment_nested_string_list(
            first_logical_line,
        )
    )
    if not segments_per_logical_line:
        return None

    engine = _get_rapid_ocr()
    result = engine(_rapidocr_input_from_path_bytes_or_bgr_numpy(image))
    if result is None or not result.txts or result.boxes is None:
        return None

    representative_points: list[tuple[float, float]] = []
    for segments in segments_per_logical_line:
        segment_centers: list[tuple[float, float]] = []
        for segment in segments:
            needle = _normalize_ocr_match_string(segment)
            if needle:
                hit_index = _ocr_smallest_area_line_index_where_normalized_line_contains_substring(
                    result,
                    needle,
                )
                if hit_index is None:
                    # Every non-empty segment is mandatory.
                    return None
                segment_centers.append(_quad_center_xy(result.boxes[hit_index]))
        representative_point = (
            _midpoint_float_xy_pair_from_first_and_last_center_in_ordered_float_pair_list(
                segment_centers,
            )
            if segment_centers
            else None
        )
        if representative_point is not None:
            representative_points.append(representative_point)

    if not representative_points:
        return None
    anchor_point = _midpoint_float_xy_pair_from_closest_pair_of_points_in_float_pair_list(
        representative_points,
    )
    if anchor_point is None:
        return None
    anchor_x, anchor_y = anchor_point
    return int(round(anchor_x)), int(round(anchor_y))


def ocr_find_text_center(
    image: str | Path | bytes | np.ndarray,
    target_text: str,
    *,
    prefer_highest_score: bool = True,
) -> tuple[int, int] | None:
    """
    对 ``image``（路径、``Path``、图像字节或 OpenCV BGR ``numpy`` 数组）跑 RapidOCR，找到与 ``target_text`` 最匹配的一行，
    返回该行四边形框的**中心点** ``(cx, cy)``（与图像像素坐标一致）。

    匹配规则：识别行经空白归一后，必须 **完整包含** ``target_text``（连续子串），
    不要求逐字与目标一致，但必须整段关键词都在同一 OCR 框对应的文字里。

    多行命中时：优先与目标 **完全一致** 的行，否则取识别串 **更短** 的行（更接近整块占位符），
    再按 ``prefer_highest_score`` 用置信度打破平局。
    """
    ocr = _get_rapid_ocr()
    result = ocr(_rapidocr_input_from_path_bytes_or_bgr_numpy(image))
    if result is None or not result.txts or result.boxes is None:
        return None

    target_norm = _normalize_ocr_match_string(target_text)
    if not target_norm:
        return None

    matches: list[tuple[int, float, int, int]] = []
    for i, txt in enumerate(result.txts):
        if not _ocr_line_contains_full_target(txt, target_text):
            continue
        line_norm = _normalize_ocr_match_string(txt)
        exact = 0 if line_norm == target_norm else 1
        line_len = len(line_norm)
        score = float(result.scores[i]) if result.scores and i < len(result.scores) else 0.0
        matches.append((i, score, exact, line_len))

    if not matches:
        return None

    if len(matches) > 1:
        if prefer_highest_score:
            matches.sort(key=lambda row: (row[2], row[3], -row[1]))
        else:
            matches.sort(key=lambda row: (row[2], row[3], row[1]))

    idx = matches[0][0]
    box = result.boxes[idx]
    cx, cy = _quad_center_xy(box)
    return int(round(cx)), int(round(cy))


def ocr_find_text_center_by_shortening_prefix_of_anchor_until_match(
    image: str | Path | bytes | np.ndarray,
    anchor_text: str,
    *,
    minimum_raw_character_count_inclusive: int = 6,
    prefer_highest_score: bool = True,
) -> tuple[int, int] | None:
    """
    在整段 ``anchor_text`` 无法命中时，从去掉末尾一字的前缀起，逐步缩短，
    直到某一前缀在单行 OCR 中可子串匹配（仍用 ``ocr_find_text_center`` 规则）。
    用于识别结果漏掉标题末尾一两字、或个别字与 DOM 不一致时仍能落到标题行。
    """
    stripped_anchor_text = (anchor_text or "").strip()
    character_count = len(stripped_anchor_text)
    if character_count <= minimum_raw_character_count_inclusive:
        return None
    for end_exclusive in range(
        character_count - 1,
        minimum_raw_character_count_inclusive - 1,
        -1,
    ):
        prefix_only_anchor_text = stripped_anchor_text[:end_exclusive]
        found_center_xy_integer_pair = ocr_find_text_center(
            image,
            prefix_only_anchor_text,
            prefer_highest_score=prefer_highest_score,
        )
        if found_center_xy_integer_pair is not None:
            return found_center_xy_integer_pair
    return None


def start(argv: list[str] | None = None) -> int:
    rest = argv if argv is not None else sys.argv[1:]
    if len(rest) < 2:
        print("用法: ocr-pos.py <目标文字> <图片路径>", file=sys.stderr)
        return 2

    target = (rest[0] or "").strip()
    image_path = Path(rest[1]).expanduser()
    if not target:
        print("目标文字不能为空。", file=sys.stderr)
        return 2
    if not image_path.is_file():
        print(f"图片不存在: {image_path}", file=sys.stderr)
        return 2

    pt = ocr_find_text_center(image_path, target)
    if pt is None:
        print(f"未在图中找到与「{target}」匹配的文字。", file=sys.stderr)
        return 1

    print(f"{pt[0]} {pt[1]}")
    return 0


def main() -> int:
    """Run ``start`` with its default arguments and return its exit code."""
    exit_code = start()
    return exit_code


# Names exported by ``from <module> import *``: the repository root, the three tunable
# OCR settings, the public lookup functions and the CLI entry points.
__all__ = [
    "REPO_ROOT",
    "OCR_RAPIDOCR_GLOBAL_MAX_SIDE_LEN",
    "OCR_RAPIDOCR_INFERENCE_ENGINE_NAME",
    "OCR_RAPIDOCR_ONNX_RUNTIME_USE_CUDA",
    "ocr_find_text_center",
    "ocr_find_text_center_allowing_note_title_wrapped_across_two_ocr_lines",
    "ocr_find_text_center_by_shortening_prefix_of_anchor_until_match",
    "ocr_smallest_line_center_for_dom_text",
    "start",
    "main",
]


# Script mode: the integer returned by ``start()`` becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(start())