| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269 |
- #!/usr/bin/env python3
- """本文件是串联入口:附着浏览器进首页 → 顶栏搜索 → 再按命令行做筛选与排序。"""
- from __future__ import annotations
- import importlib.util
- import sys
- from collections.abc import Callable
- from pathlib import Path
- from typing import Any
- # 以脚本路径直接运行时把仓库根加入 ``sys.path``,保证能导入 ``workplace``。
- _main_source_file_repository_root = Path(__file__).resolve().parent.parent
- if str(_main_source_file_repository_root) not in sys.path:
- sys.path.insert(0, str(_main_source_file_repository_root))
- # 从仓库 ini 单例读出首页 URL、根目录与 Python 路径等全局常量。
- from workplace.singleton import ( # noqa: E402
- HOME_PAGE_URL,
- PYAUTOGUI_SCREEN_CALIBRATION,
- PYTHON_EXECUTABLE_ABSOLUTE_PATH,
- PYTHON_EXECUTABLE_RELATIVE_PATH,
- REPOSITORY_CONFIG_INI_SNAPSHOT,
- REPOSITORY_ROOT_DIRECTORY,
- PyAutoGUIScreenCalibration,
- RepositoryConfigIniSnapshot,
- )
# Short aliases so that external scripts can write
# ``from workplace.main import ...`` with concise names.

# Repository root directory.
REPO_ROOT = REPOSITORY_ROOT_DIRECTORY_SINGLETON = REPOSITORY_ROOT_DIRECTORY

# Snapshot of the repository ini configuration.
REPOSITORY_CONFIG_INI_SINGLETON = REPOSITORY_CONFIG_INI_SNAPSHOT

# Python interpreter paths (absolute and relative) and the configured home URL.
PYTHON_EXE = PYTHON_EXECUTABLE_ABSOLUTE_PATH
PYTHON_RELATIVE = PYTHON_EXECUTABLE_RELATIVE_PATH
HOME_URL = HOME_PAGE_URL

# PyAutoGUI screen calibration taken from the repository configuration.
PYAUTOGUI_SCREEN_CALIBRATION_FROM_REPOSITORY_CONFIG_INI = PYAUTOGUI_SCREEN_CALIBRATION
- # 本入口统一使用仓库封装的 Playwright 与 PyAutoGUI 单例。
- from workplace import playwright as workplace_playwright # noqa: E402
- from workplace import pyautogui as workplace_pyautogui # noqa: E402
- from workplace.note_pipeline_forbidden_standalone_url_image_fetch import (
- XHS_NOTE_PIPELINE_ABSOLUTE_PROHIBITION_STANDALONE_HTTP_GET_FOR_IMAGE_BYTES,
- )
def _load_python_module_from_file(
    python_file_path: Path,
    importlib_logical_module_name: str,
):
    """Load a business ``.py`` file from disk as a module and return it.

    Used for scripts whose file names contain ``-`` and therefore cannot be
    imported with a regular ``import`` statement.

    The new module is registered in ``sys.modules`` under
    ``importlib_logical_module_name`` *before* its code runs, following the
    ``importlib`` recipe for importing a source file directly. Without that
    registration, code in the loaded script that resolves its own module via
    ``sys.modules[__name__]`` (the standard library's ``dataclasses`` and
    ``pickle`` do this) fails. If executing the script raises, the previous
    ``sys.modules`` entry (or its absence) is restored so that no
    half-initialised module stays visible.

    Every call executes the file again; nothing is cached by this helper.

    Args:
        python_file_path: Location of the ``.py`` file to execute.
        importlib_logical_module_name: Name given to the module, used both
            as its ``__name__`` and as its ``sys.modules`` key.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be built for the path.
        Exception: Whatever the script itself raises while executing.
    """
    module_spec = importlib.util.spec_from_file_location(
        importlib_logical_module_name,
        python_file_path,
    )
    if module_spec is None or module_spec.loader is None:
        raise ImportError(f"Cannot load {python_file_path}")
    loaded_python_module = importlib.util.module_from_spec(module_spec)

    # Remember what was registered under this name so that a failed load can
    # be rolled back without clobbering an earlier, working module.
    not_registered = object()
    previously_registered_module = sys.modules.get(
        importlib_logical_module_name,
        not_registered,
    )
    sys.modules[importlib_logical_module_name] = loaded_python_module
    try:
        module_spec.loader.exec_module(loaded_python_module)
    except BaseException:
        if previously_registered_module is not_registered:
            sys.modules.pop(importlib_logical_module_name, None)
        else:
            sys.modules[importlib_logical_module_name] = previously_registered_module
        raise
    return loaded_python_module
def _load_input_keyword_module():
    """Load ``workplace/input-keyword/input-keyword.py`` as a module.

    Per the original note, that script chains the whole top-bar search flow:
    locating the "搜索小红书" search box, typing into it and clicking search.
    """
    input_keyword_script_path = REPO_ROOT.joinpath(
        "workplace",
        "input-keyword",
        "input-keyword.py",
    )
    return _load_python_module_from_file(
        python_file_path=input_keyword_script_path,
        importlib_logical_module_name="workplace_input_keyword",
    )
def _load_open_browser_module():
    """Load ``workplace/open-browser.py`` as a module.

    Per the original note, that script attaches to Chrome over CDP and opens
    the configured home page when needed.
    """
    open_browser_script_path = REPO_ROOT.joinpath("workplace", "open-browser.py")
    return _load_python_module_from_file(
        python_file_path=open_browser_script_path,
        importlib_logical_module_name="workplace_open_browser",
    )
def _load_find_high_score_note_module():
    """Load ``workplace/find-hight-score-note/find-hight-score-note.py`` as a module.

    Per the original note, that script drives keyboard and mouse on the
    search-result page to apply "filter" and the "most likes / comments /
    favourites" sort order, tied to the second argument passed by the bat
    launcher.
    """
    # NOTE: the directory and file are spelled "hight" on disk; keep as-is.
    find_high_score_note_script_path = REPO_ROOT.joinpath(
        "workplace",
        "find-hight-score-note",
        "find-hight-score-note.py",
    )
    return _load_python_module_from_file(
        python_file_path=find_high_score_note_script_path,
        importlib_logical_module_name="workplace_find_high_score_note",
    )
def _load_download_page_sources_module():
    """Load ``workplace/download-page-sources/download-page-sources.py`` as a module.

    Per the original note, that script caches the covers and metadata of the
    note feed on the current result page.
    """
    download_page_sources_script_path = REPO_ROOT.joinpath(
        "workplace",
        "download-page-sources",
        "download-page-sources.py",
    )
    return _load_python_module_from_file(
        python_file_path=download_page_sources_script_path,
        importlib_logical_module_name="workplace_download_page_sources",
    )
def _load_memory_cache_image_name_module():
    """Load ``workplace/memory-cache-image-name.py`` as a module.

    The module is registered under the logical name
    ``workplace_memory_cache_image_name``.
    """
    memory_cache_image_name_script_path = REPO_ROOT.joinpath(
        "workplace",
        "memory-cache-image-name.py",
    )
    return _load_python_module_from_file(
        python_file_path=memory_cache_image_name_script_path,
        importlib_logical_module_name="workplace_memory_cache_image_name",
    )
def _load_download_note_module():
    """Load ``workplace/download-note/download-note.py`` as a module.

    The module is registered under the logical name
    ``workplace_download_note``.
    """
    download_note_script_path = REPO_ROOT.joinpath(
        "workplace",
        "download-note",
        "download-note.py",
    )
    return _load_python_module_from_file(
        python_file_path=download_note_script_path,
        importlib_logical_module_name="workplace_download_note",
    )
def start(command_line_argument_strings: list[str] | None = None) -> int:
    """Run the pipeline once and return its exit code.

    Flow: attach to the browser and reach the home page, run the top-bar
    keyword search, then (inside the search's ``then`` callback) apply the
    filter/sort and run the download steps.

    Args:
        command_line_argument_strings: Positional arguments; ``sys.argv[1:]``
            is used when this is ``None``. Element 0 is the search keyword
            (default ``""``), element 1 the filter/sort keyword (stripped,
            default ``""``), element 2 the zero-based note folder index as
            text (stripped, default ``"0"``).

    Returns:
        The non-zero exit code of the browser-attach step if it fails, ``1``
        if that step yields no page, otherwise whatever the input-keyword
        script's ``start`` returns.
    """
    # Touch the marker constant so the "no standalone HTTP GET for image
    # bytes" rule stays referenced from this entry point.
    _ = XHS_NOTE_PIPELINE_ABSOLUTE_PROHIBITION_STANDALONE_HTTP_GET_FOR_IMAGE_BYTES

    # Initialise the shared PyAutoGUI and Playwright singletons; the business
    # scripts loaded below share them.
    workplace_pyautogui.init_singleton()
    workplace_playwright.init_singleton()

    # Argument layout matches ``start.bat``: keyword, filter/sort, note index.
    # Per the original note, the bat prompt asks for a 1-based position and
    # subtracts 1 before passing it on, so the value here is the 0-based
    # folder name under ``output/note/<index>/``.
    if command_line_argument_strings is None:
        argument_strings = sys.argv[1:]
    else:
        argument_strings = command_line_argument_strings

    search_keyword = ""
    if argument_strings:
        search_keyword = argument_strings[0]

    filter_keyword = ""
    if len(argument_strings) > 1:
        filter_keyword = argument_strings[1].strip()

    note_index_text = "0"
    if len(argument_strings) > 2:
        note_index_text = argument_strings[2].strip()

    # Load every business script up front so the callback does not hit the
    # disk again. The load order is kept as-is because loading executes them.
    open_browser = _load_open_browser_module()
    input_keyword = _load_input_keyword_module()
    find_high_score_note = _load_find_high_score_note_module()
    download_page_sources = _load_download_page_sources_module()
    memory_cache_image_name = _load_memory_cache_image_name_module()
    download_note = _load_download_note_module()

    # Mutable project configuration held by the ini snapshot; the same
    # reference is shared by the whole run.
    shared_project_config = REPOSITORY_CONFIG_INI_SNAPSHOT.project_config
    image_body_by_request_url: dict[str, bytes] = {}
    stop_listening_cell: list[Callable[[], None] | None] = [None]

    # The ``with`` block owns the synchronous Playwright driver's lifetime.
    with workplace_playwright.sync_playwright_singleton() as sync_driver:
        # Step one: attach over CDP and reach the home page. Per the original
        # note, the page must reach "load" within 20 s or Playwright raises
        # (see ``prepare_page_at_home_url`` in ``open-browser.py``).
        attach_exit_code, page = open_browser.prepare_page_at_home_url(
            sync_driver,
            HOME_URL,
            cdp=None,
            same_site_preserves_current_page=False,
            skip_home_navigation_when_top_search_input_has_text=True,
        )
        if attach_exit_code != 0:
            return attach_exit_code
        if page is None:
            return 1

        build_filter_callback = (
            find_high_score_note.build_then_callback_that_runs_find_high_score_note_after_keyword_search
        )
        filter_and_sort_callback = build_filter_callback(
            page,
            keyword=search_keyword,
            filter_keyword=filter_keyword,
            project_config=shared_project_config,
        )

        def run_remaining_steps(pointer_final_screen_xy):
            # Invoked by the search script once the keyword search is done.
            return _execute_third_fourth_fifth_step_linear_after_top_bar_keyword_search(
                pointer_final_screen_xy,
                page,
                filter_and_sort_callback,
                memory_cache_image_name,
                download_page_sources,
                download_note,
                note_index_text,
                image_body_by_request_url,
                stop_listening_cell,
            )

        # Step two: top-bar search; the remaining steps run in its callback.
        return input_keyword.start(
            page,
            keyword=search_keyword,
            project_config=shared_project_config,
            then=run_remaining_steps,
        )
def _execute_third_fourth_fifth_step_linear_after_top_bar_keyword_search(
    pointer_final_screen_xy: tuple[int, int],
    playwright_page: Any,
    find_high_score_then_callback: Callable[[tuple[int, int]], int],
    memory_cache_image_name_script_module: Any,
    download_page_sources_script_module: Any,
    download_note_script_module: Any,
    note_list_zero_based_index_for_detail_download: str,
    cached_image_body_by_request_url: dict[str, bytes],
    stop_listening_page_image_responses_cell: list[Callable[[], None] | None],
) -> int:
    """Run, in order, the steps that follow the top-bar keyword search.

    Steps three to six: start the image-response listener, apply the
    filter/sort, cache the note feed of the current page, then download the
    selected note's detail.

    The listener is always stopped and the cell cleared before this function
    returns or raises, so a failure in a later step cannot leave the listener
    attached and the stopper cannot be invoked twice through the cell.

    Args:
        pointer_final_screen_xy: Value forwarded unchanged to
            ``find_high_score_then_callback``.
        playwright_page: Page object handed to every script.
        find_high_score_then_callback: Filter/sort step; returns an exit code.
        memory_cache_image_name_script_module: Module whose ``start`` begins
            listening and returns the value stored in the cell (called with
            no arguments to stop listening when it is not ``None``).
        download_page_sources_script_module: Module whose ``start`` caches the
            current page's note feed.
        download_note_script_module: Module whose ``start`` downloads one
            note's detail and returns an exit code.
        note_list_zero_based_index_for_detail_download: Note index, as text,
            passed to the note download as its single argument.
        cached_image_body_by_request_url: Shared dict handed to the listener
            and to both download steps.
        stop_listening_page_image_responses_cell: One-element list holding
            the listener's stop callable while it is active.

    Returns:
        The filter/sort exit code if it is non-zero, otherwise the exit code
        of the note download.
    """
    # Step three: register the image-response cache first. Per the original
    # note, it must be running before the filter/sort loads the note feed;
    # otherwise the first-screen cover and avatar requests have usually
    # finished already and are missing from the cache when saving to disk.
    stop_listening_page_image_responses_cell[0] = memory_cache_image_name_script_module.start(
        playwright_page,
        cached_image_body_by_request_url=cached_image_body_by_request_url,
    )
    try:
        # Step four: filter and sort; the feed and its images are requested
        # during this step and can be captured by the listener above.
        find_high_score_step_exit_code = find_high_score_then_callback(
            pointer_final_screen_xy,
        )
        if find_high_score_step_exit_code != 0:
            return find_high_score_step_exit_code

        # Step five: cache the current page's note feed (``output/note/``
        # according to the original note).
        download_page_sources_script_module.start(
            playwright_page,
            cached_image_body_by_request_url=cached_image_body_by_request_url,
        )

        # Step six: download the selected note's detail. Per the original
        # note this locates the card, opens the detail, saves its DOM and
        # images, then exits and builds a preview (see the scripts under
        # ``workplace/download-note/``).
        return download_note_script_module.start(
            [note_list_zero_based_index_for_detail_download],
            existing_playwright_page=playwright_page,
            cached_image_body_by_request_url=cached_image_body_by_request_url,
        )
    finally:
        # Clear the cell before calling the stopper so it can never be
        # invoked a second time through the cell, even if stopping raises.
        stop_listening = stop_listening_page_image_responses_cell[0]
        stop_listening_page_image_responses_cell[0] = None
        if stop_listening is not None:
            stop_listening()
def main() -> int:
    """Default command-line entry point.

    Runs ``start()`` with its default arguments and hands the pipeline's
    return code back to the caller unchanged.
    """
    pipeline_exit_code = start()
    return pipeline_exit_code
if __name__ == "__main__":
    # When this file is executed directly, end the interpreter with the
    # pipeline's return code as the process exit status.
    sys.exit(main())
# Symbols exported by ``from workplace.main import *``.
__all__ = [
    # Repository root and ini snapshot, with their ``*_SINGLETON`` aliases.
    "REPOSITORY_ROOT_DIRECTORY",
    "REPOSITORY_ROOT_DIRECTORY_SINGLETON",
    "REPOSITORY_CONFIG_INI_SNAPSHOT",
    "REPOSITORY_CONFIG_INI_SINGLETON",
    # Configuration types re-exported from ``workplace.singleton``.
    "RepositoryConfigIniSnapshot",
    "PyAutoGUIScreenCalibration",
    # Paths and URLs, under both short and full names.
    "REPO_ROOT",
    "PYTHON_RELATIVE",
    "HOME_URL",
    "HOME_PAGE_URL",
    "PYTHON_EXE",
    "PYTHON_EXECUTABLE_ABSOLUTE_PATH",
    "PYTHON_EXECUTABLE_RELATIVE_PATH",
    # PyAutoGUI screen calibration.
    "PYAUTOGUI_SCREEN_CALIBRATION",
    "PYAUTOGUI_SCREEN_CALIBRATION_FROM_REPOSITORY_CONFIG_INI",
    # Repository wrappers around Playwright and PyAutoGUI.
    "workplace_playwright",
    "workplace_pyautogui",
    # Entry points.
    "main",
    "start",
]
|