#!/usr/bin/env python3 """本文件是串联入口:附着浏览器进首页 → 顶栏搜索 → 再按命令行做筛选与排序。""" from __future__ import annotations import importlib.util import sys from collections.abc import Callable from pathlib import Path from typing import Any # 以脚本路径直接运行时把仓库根加入 ``sys.path``,保证能导入 ``workplace``。 _main_source_file_repository_root = Path(__file__).resolve().parent.parent if str(_main_source_file_repository_root) not in sys.path: sys.path.insert(0, str(_main_source_file_repository_root)) # 从仓库 ini 单例读出首页 URL、根目录与 Python 路径等全局常量。 from workplace.singleton import ( # noqa: E402 HOME_PAGE_URL, PYAUTOGUI_SCREEN_CALIBRATION, PYTHON_EXECUTABLE_ABSOLUTE_PATH, PYTHON_EXECUTABLE_RELATIVE_PATH, REPOSITORY_CONFIG_INI_SNAPSHOT, REPOSITORY_ROOT_DIRECTORY, PyAutoGUIScreenCalibration, RepositoryConfigIniSnapshot, ) # 为 ``from workplace.main import …`` 提供简短别名,方便外部脚本引用。 REPOSITORY_ROOT_DIRECTORY_SINGLETON = REPOSITORY_ROOT_DIRECTORY REPOSITORY_CONFIG_INI_SINGLETON = REPOSITORY_CONFIG_INI_SNAPSHOT REPO_ROOT = REPOSITORY_ROOT_DIRECTORY PYTHON_RELATIVE = PYTHON_EXECUTABLE_RELATIVE_PATH HOME_URL = HOME_PAGE_URL PYTHON_EXE = PYTHON_EXECUTABLE_ABSOLUTE_PATH PYAUTOGUI_SCREEN_CALIBRATION_FROM_REPOSITORY_CONFIG_INI = PYAUTOGUI_SCREEN_CALIBRATION # 本入口统一使用仓库封装的 Playwright 与 PyAutoGUI 单例。 from workplace import playwright as workplace_playwright # noqa: E402 from workplace import pyautogui as workplace_pyautogui # noqa: E402 from workplace.note_pipeline_forbidden_standalone_url_image_fetch import ( XHS_NOTE_PIPELINE_ABSOLUTE_PROHIBITION_STANDALONE_HTTP_GET_FOR_IMAGE_BYTES, ) def _load_python_module_from_file( python_file_path: Path, importlib_logical_module_name: str, ): """从磁盘路径载入一段业务 ``.py`` 为模块(用于文件名含 ``-`` 的脚本)。""" importlib_module_spec = importlib.util.spec_from_file_location( importlib_logical_module_name, python_file_path, ) if importlib_module_spec is None or importlib_module_spec.loader is None: raise ImportError(f"Cannot load {python_file_path}") loaded_python_module = importlib.util.module_from_spec(importlib_module_spec) importlib_module_spec.loader.exec_module(loaded_python_module) return loaded_python_module def _load_input_keyword_module(): # 载入顶栏「搜索小红书」定位、输入与点搜索整条串联脚本。 return _load_python_module_from_file( REPO_ROOT / "workplace" / "input-keyword" / "input-keyword.py", "workplace_input_keyword", ) def _load_open_browser_module(): # 载入通过 CDP 附着 Chrome 并按需打开配置首页的脚本。 return _load_python_module_from_file( REPO_ROOT / "workplace" / "open-browser.py", "workplace_open_browser", ) def _load_find_high_score_note_module(): # 载入搜索结果页上「筛选」与「最多点赞/评论/收藏」排序的键鼠脚本(与 bat 第二参联动)。 return _load_python_module_from_file( REPO_ROOT / "workplace" / "find-hight-score-note" / "find-hight-score-note.py", "workplace_find_high_score_note", ) def _load_download_page_sources_module(): # 载入当前结果页笔记流封面与元数据的缓存脚本。 return _load_python_module_from_file( REPO_ROOT / "workplace" / "download-page-sources" / "download-page-sources.py", "workplace_download_page_sources", ) def _load_memory_cache_image_name_module(): return _load_python_module_from_file( REPO_ROOT / "workplace" / "memory-cache-image-name.py", "workplace_memory_cache_image_name", ) def _load_download_note_module(): return _load_python_module_from_file( REPO_ROOT / "workplace" / "download-note" / "download-note.py", "workplace_download_note", ) def start(command_line_argument_strings: list[str] | None = None) -> int: """跑通一次:附着首页 → 顶栏搜索 → 筛选排序;进程返回码与最内层 ``then`` 一致。""" _xhs_note_pipeline_image_bytes_must_not_use_standalone_http_get = ( XHS_NOTE_PIPELINE_ABSOLUTE_PROHIBITION_STANDALONE_HTTP_GET_FOR_IMAGE_BYTES ) _ = _xhs_note_pipeline_image_bytes_must_not_use_standalone_http_get # 初始化全局键鼠节奏与 Playwright 单例,后续子脚本共用同一套行为。 workplace_pyautogui.init_singleton() workplace_playwright.init_singleton() # 解析入参:首段为搜索词,第二段为筛选/排序(与 ``start.bat`` 一致;交互下回车不筛选为 ``0``); # 第三段为 ``output/note/<序号>/`` 的 **从 0 起** 文件夹名(``start.bat`` 交互里问「第几条」为 **从 1 起**,会减 1 再传入)。 command_line_arguments = ( command_line_argument_strings if command_line_argument_strings is not None else sys.argv[1:] ) search_keyword_text = command_line_arguments[0] if command_line_arguments else "" filter_keyword_from_command_line = ( command_line_arguments[1].strip() if len(command_line_arguments) > 1 else "" ) note_list_zero_based_index_for_detail_download = ( command_line_arguments[2].strip() if len(command_line_arguments) > 2 else "0" ) # 预加载本流程要串起来的业务脚本模块,避免在回调里重复读盘。 open_browser_script_module = _load_open_browser_module() input_keyword_script_module = _load_input_keyword_module() find_high_score_note_script_module = _load_find_high_score_note_module() download_page_sources_script_module = _load_download_page_sources_module() memory_cache_image_name_script_module = _load_memory_cache_image_name_module() download_note_script_module = _load_download_note_module() # 与 ``config.ini`` 一并缓存在 ini 快照里的 ``save/project-config`` 可变字典,全流程共用同一引用、不重复读盘。 project_config = REPOSITORY_CONFIG_INI_SNAPSHOT.project_config cached_image_body_by_request_url: dict[str, bytes] = {} stop_listening_page_image_responses_cell: list[Callable[[], None] | None] = [None] # 持有 Playwright 同步驱动生命周期,块结束时释放与浏览器相关的同步资源。 with workplace_playwright.sync_playwright_singleton() as playwright_sync_driver: # 第一步:附着 CDP 与首页;须在当前页 20s 内达到 load(超时由 Playwright 抛错,见 open-browser.prepare_page_at_home_url)。 first_step_exit_code, playwright_page = ( open_browser_script_module.prepare_page_at_home_url( playwright_sync_driver, HOME_URL, cdp=None, same_site_preserves_current_page=False, skip_home_navigation_when_top_search_input_has_text=True, ) ) if first_step_exit_code != 0: return first_step_exit_code if playwright_page is None: return 1 find_high_score_then_callback = find_high_score_note_script_module.build_then_callback_that_runs_find_high_score_note_after_keyword_search( playwright_page, keyword=search_keyword_text, filter_keyword=filter_keyword_from_command_line, project_config=project_config, ) # 第二步:顶栏搜索; return input_keyword_script_module.start( playwright_page, keyword=search_keyword_text, project_config=project_config, then=lambda pointer_final_screen_xy: _execute_third_fourth_fifth_step_linear_after_top_bar_keyword_search( pointer_final_screen_xy, playwright_page, find_high_score_then_callback, memory_cache_image_name_script_module, download_page_sources_script_module, download_note_script_module, note_list_zero_based_index_for_detail_download, cached_image_body_by_request_url, stop_listening_page_image_responses_cell, ), ) def _execute_third_fourth_fifth_step_linear_after_top_bar_keyword_search( pointer_final_screen_xy: tuple[int, int], playwright_page: Any, find_high_score_then_callback: Callable[[tuple[int, int]], int], memory_cache_image_name_script_module: Any, download_page_sources_script_module: Any, download_note_script_module: Any, note_list_zero_based_index_for_detail_download: str, cached_image_body_by_request_url: dict[str, bytes], stop_listening_page_image_responses_cell: list[Callable[[], None] | None], ) -> int: # 第三步:先注册图片响应缓存。须在「筛选/排序」加载笔记流之前启动,否则首屏封面/头像请求往往已完成, # ``cached_image_body_by_request_url`` 缺条目,落盘会跳过(监听器晚于请求是常见原因)。 stop_listening_page_image_responses_cell[0] = memory_cache_image_name_script_module.start( playwright_page, cached_image_body_by_request_url=cached_image_body_by_request_url, ) # 第四步:筛选与排序(笔记流与图片在此阶段请求,可被上一步监听器捕获)。 find_high_score_step_exit_code = find_high_score_then_callback( pointer_final_screen_xy, ) if find_high_score_step_exit_code != 0: if stop_listening_page_image_responses_cell[0] is not None: stop_listening_page_image_responses_cell[0]() stop_listening_page_image_responses_cell[0] = None return find_high_score_step_exit_code # 第五步:缓存当前页笔记流到 ``output/note/``。 download_page_sources_script_module.start( playwright_page, cached_image_body_by_request_url=cached_image_body_by_request_url, ) # 第六步: 定位笔记卡片并打开详情 → 保存详情 DOM 与图片 → 退出(业务合作)并生成预览(见 ``workplace/download-note/`` 下三个脚本,或由 ``download-note.py`` 串联)。 download_note_exit_code = download_note_script_module.start( [note_list_zero_based_index_for_detail_download], existing_playwright_page=playwright_page, cached_image_body_by_request_url=cached_image_body_by_request_url, ) if stop_listening_page_image_responses_cell[0] is not None: stop_listening_page_image_responses_cell[0]() return download_note_exit_code def main() -> int: # 命令行默认入口:把整段自动化流程的返回码原样交给调用方。 return start() if __name__ == "__main__": # 直接执行本文件时,用流程返回码作为进程退出码结束解释器。 raise SystemExit(main()) # 声明 ``from workplace.main import *`` 时对外可见的符号集合。 __all__ = [ "REPOSITORY_ROOT_DIRECTORY", "REPOSITORY_ROOT_DIRECTORY_SINGLETON", "REPOSITORY_CONFIG_INI_SNAPSHOT", "REPOSITORY_CONFIG_INI_SINGLETON", "RepositoryConfigIniSnapshot", "PyAutoGUIScreenCalibration", "REPO_ROOT", "PYTHON_RELATIVE", "HOME_URL", "HOME_PAGE_URL", "PYTHON_EXE", "PYTHON_EXECUTABLE_ABSOLUTE_PATH", "PYTHON_EXECUTABLE_RELATIVE_PATH", "PYAUTOGUI_SCREEN_CALIBRATION", "PYAUTOGUI_SCREEN_CALIBRATION_FROM_REPOSITORY_CONFIG_INI", "workplace_playwright", "workplace_pyautogui", "main", "start", ]