main.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
#!/usr/bin/env python3
"""Pipeline entry point: attach to the browser at the home page, search from the top bar, then filter and sort per the command line."""
from __future__ import annotations
import importlib.util
import sys
from collections.abc import Callable
from pathlib import Path
from typing import Any
# When this file is run directly by script path, put the repository root (the
# parent of this file's directory) on ``sys.path`` so ``workplace`` is importable.
_main_source_file_repository_root = Path(__file__).resolve().parent.parent
if str(_main_source_file_repository_root) not in sys.path:
    sys.path.insert(0, str(_main_source_file_repository_root))
# Pull the global constants (home page URL, repository root, Python paths, ...)
# from the repository ini singleton.
from workplace.singleton import (  # noqa: E402
    HOME_PAGE_URL,
    PYAUTOGUI_SCREEN_CALIBRATION,
    PYTHON_EXECUTABLE_ABSOLUTE_PATH,
    PYTHON_EXECUTABLE_RELATIVE_PATH,
    REPOSITORY_CONFIG_INI_SNAPSHOT,
    REPOSITORY_ROOT_DIRECTORY,
    PyAutoGUIScreenCalibration,
    RepositoryConfigIniSnapshot,
)
# Short aliases so external scripts can write ``from workplace.main import …``.
REPOSITORY_ROOT_DIRECTORY_SINGLETON = REPOSITORY_ROOT_DIRECTORY
REPOSITORY_CONFIG_INI_SINGLETON = REPOSITORY_CONFIG_INI_SNAPSHOT
REPO_ROOT = REPOSITORY_ROOT_DIRECTORY
PYTHON_RELATIVE = PYTHON_EXECUTABLE_RELATIVE_PATH
HOME_URL = HOME_PAGE_URL
PYTHON_EXE = PYTHON_EXECUTABLE_ABSOLUTE_PATH
PYAUTOGUI_SCREEN_CALIBRATION_FROM_REPOSITORY_CONFIG_INI = PYAUTOGUI_SCREEN_CALIBRATION
# This entry point uses the repository's wrapped Playwright and PyAutoGUI singletons throughout.
from workplace import playwright as workplace_playwright  # noqa: E402
from workplace import pyautogui as workplace_pyautogui  # noqa: E402
from workplace.note_pipeline_forbidden_standalone_url_image_fetch import (  # noqa: E402
    XHS_NOTE_PIPELINE_ABSOLUTE_PROHIBITION_STANDALONE_HTTP_GET_FOR_IMAGE_BYTES,
)
  38. def _load_python_module_from_file(
  39. python_file_path: Path,
  40. importlib_logical_module_name: str,
  41. ):
  42. """从磁盘路径载入一段业务 ``.py`` 为模块(用于文件名含 ``-`` 的脚本)。"""
  43. importlib_module_spec = importlib.util.spec_from_file_location(
  44. importlib_logical_module_name,
  45. python_file_path,
  46. )
  47. if importlib_module_spec is None or importlib_module_spec.loader is None:
  48. raise ImportError(f"Cannot load {python_file_path}")
  49. loaded_python_module = importlib.util.module_from_spec(importlib_module_spec)
  50. importlib_module_spec.loader.exec_module(loaded_python_module)
  51. return loaded_python_module
  52. def _load_input_keyword_module():
  53. # 载入顶栏「搜索小红书」定位、输入与点搜索整条串联脚本。
  54. return _load_python_module_from_file(
  55. REPO_ROOT / "workplace" / "input-keyword" / "input-keyword.py",
  56. "workplace_input_keyword",
  57. )
  58. def _load_open_browser_module():
  59. # 载入通过 CDP 附着 Chrome 并按需打开配置首页的脚本。
  60. return _load_python_module_from_file(
  61. REPO_ROOT / "workplace" / "open-browser.py",
  62. "workplace_open_browser",
  63. )
  64. def _load_find_high_score_note_module():
  65. # 载入搜索结果页上「筛选」与「最多点赞/评论/收藏」排序的键鼠脚本(与 bat 第二参联动)。
  66. return _load_python_module_from_file(
  67. REPO_ROOT / "workplace" / "find-hight-score-note" / "find-hight-score-note.py",
  68. "workplace_find_high_score_note",
  69. )
  70. def _load_download_page_sources_module():
  71. # 载入当前结果页笔记流封面与元数据的缓存脚本。
  72. return _load_python_module_from_file(
  73. REPO_ROOT / "workplace" / "download-page-sources" / "download-page-sources.py",
  74. "workplace_download_page_sources",
  75. )
  76. def _load_memory_cache_image_name_module():
  77. return _load_python_module_from_file(
  78. REPO_ROOT / "workplace" / "memory-cache-image-name.py",
  79. "workplace_memory_cache_image_name",
  80. )
  81. def _load_download_note_module():
  82. return _load_python_module_from_file(
  83. REPO_ROOT / "workplace" / "download-note" / "download-note.py",
  84. "workplace_download_note",
  85. )
  86. def start(command_line_argument_strings: list[str] | None = None) -> int:
  87. """跑通一次:附着首页 → 顶栏搜索 → 筛选排序;进程返回码与最内层 ``then`` 一致。"""
  88. _xhs_note_pipeline_image_bytes_must_not_use_standalone_http_get = (
  89. XHS_NOTE_PIPELINE_ABSOLUTE_PROHIBITION_STANDALONE_HTTP_GET_FOR_IMAGE_BYTES
  90. )
  91. _ = _xhs_note_pipeline_image_bytes_must_not_use_standalone_http_get
  92. # 初始化全局键鼠节奏与 Playwright 单例,后续子脚本共用同一套行为。
  93. workplace_pyautogui.init_singleton()
  94. workplace_playwright.init_singleton()
  95. # 解析入参:首段为搜索词,第二段为筛选/排序(与 ``start.bat`` 一致;交互下回车不筛选为 ``0``);
  96. # 第三段为 ``output/note/<序号>/`` 的 **从 0 起** 文件夹名(``start.bat`` 交互里问「第几条」为 **从 1 起**,会减 1 再传入)。
  97. command_line_arguments = (
  98. command_line_argument_strings
  99. if command_line_argument_strings is not None
  100. else sys.argv[1:]
  101. )
  102. search_keyword_text = command_line_arguments[0] if command_line_arguments else ""
  103. filter_keyword_from_command_line = (
  104. command_line_arguments[1].strip()
  105. if len(command_line_arguments) > 1
  106. else ""
  107. )
  108. note_list_zero_based_index_for_detail_download = (
  109. command_line_arguments[2].strip()
  110. if len(command_line_arguments) > 2
  111. else "0"
  112. )
  113. # 预加载本流程要串起来的业务脚本模块,避免在回调里重复读盘。
  114. open_browser_script_module = _load_open_browser_module()
  115. input_keyword_script_module = _load_input_keyword_module()
  116. find_high_score_note_script_module = _load_find_high_score_note_module()
  117. download_page_sources_script_module = _load_download_page_sources_module()
  118. memory_cache_image_name_script_module = _load_memory_cache_image_name_module()
  119. download_note_script_module = _load_download_note_module()
  120. # 与 ``config.ini`` 一并缓存在 ini 快照里的 ``save/project-config`` 可变字典,全流程共用同一引用、不重复读盘。
  121. project_config = REPOSITORY_CONFIG_INI_SNAPSHOT.project_config
  122. cached_image_body_by_request_url: dict[str, bytes] = {}
  123. stop_listening_page_image_responses_cell: list[Callable[[], None] | None] = [None]
  124. # 持有 Playwright 同步驱动生命周期,块结束时释放与浏览器相关的同步资源。
  125. with workplace_playwright.sync_playwright_singleton() as playwright_sync_driver:
  126. # 第一步:附着 CDP 与首页;须在当前页 20s 内达到 load(超时由 Playwright 抛错,见 open-browser.prepare_page_at_home_url)。
  127. first_step_exit_code, playwright_page = (
  128. open_browser_script_module.prepare_page_at_home_url(
  129. playwright_sync_driver,
  130. HOME_URL,
  131. cdp=None,
  132. same_site_preserves_current_page=False,
  133. skip_home_navigation_when_top_search_input_has_text=True,
  134. )
  135. )
  136. if first_step_exit_code != 0:
  137. return first_step_exit_code
  138. if playwright_page is None:
  139. return 1
  140. find_high_score_then_callback = find_high_score_note_script_module.build_then_callback_that_runs_find_high_score_note_after_keyword_search(
  141. playwright_page,
  142. keyword=search_keyword_text,
  143. filter_keyword=filter_keyword_from_command_line,
  144. project_config=project_config,
  145. )
  146. # 第二步:顶栏搜索;
  147. return input_keyword_script_module.start(
  148. playwright_page,
  149. keyword=search_keyword_text,
  150. project_config=project_config,
  151. then=lambda pointer_final_screen_xy: _execute_third_fourth_fifth_step_linear_after_top_bar_keyword_search(
  152. pointer_final_screen_xy,
  153. playwright_page,
  154. find_high_score_then_callback,
  155. memory_cache_image_name_script_module,
  156. download_page_sources_script_module,
  157. download_note_script_module,
  158. note_list_zero_based_index_for_detail_download,
  159. cached_image_body_by_request_url,
  160. stop_listening_page_image_responses_cell,
  161. ),
  162. )
  163. def _execute_third_fourth_fifth_step_linear_after_top_bar_keyword_search(
  164. pointer_final_screen_xy: tuple[int, int],
  165. playwright_page: Any,
  166. find_high_score_then_callback: Callable[[tuple[int, int]], int],
  167. memory_cache_image_name_script_module: Any,
  168. download_page_sources_script_module: Any,
  169. download_note_script_module: Any,
  170. note_list_zero_based_index_for_detail_download: str,
  171. cached_image_body_by_request_url: dict[str, bytes],
  172. stop_listening_page_image_responses_cell: list[Callable[[], None] | None],
  173. ) -> int:
  174. # 第三步:先注册图片响应缓存。须在「筛选/排序」加载笔记流之前启动,否则首屏封面/头像请求往往已完成,
  175. # ``cached_image_body_by_request_url`` 缺条目,落盘会跳过(监听器晚于请求是常见原因)。
  176. stop_listening_page_image_responses_cell[0] = memory_cache_image_name_script_module.start(
  177. playwright_page,
  178. cached_image_body_by_request_url=cached_image_body_by_request_url,
  179. )
  180. # 第四步:筛选与排序(笔记流与图片在此阶段请求,可被上一步监听器捕获)。
  181. find_high_score_step_exit_code = find_high_score_then_callback(
  182. pointer_final_screen_xy,
  183. )
  184. if find_high_score_step_exit_code != 0:
  185. if stop_listening_page_image_responses_cell[0] is not None:
  186. stop_listening_page_image_responses_cell[0]()
  187. stop_listening_page_image_responses_cell[0] = None
  188. return find_high_score_step_exit_code
  189. # 第五步:缓存当前页笔记流到 ``output/note/``。
  190. download_page_sources_script_module.start(
  191. playwright_page,
  192. cached_image_body_by_request_url=cached_image_body_by_request_url,
  193. )
  194. # 第六步: 定位笔记卡片并打开详情 → 保存详情 DOM 与图片 → 退出(业务合作)并生成预览(见 ``workplace/download-note/`` 下三个脚本,或由 ``download-note.py`` 串联)。
  195. download_note_exit_code = download_note_script_module.start(
  196. [note_list_zero_based_index_for_detail_download],
  197. existing_playwright_page=playwright_page,
  198. cached_image_body_by_request_url=cached_image_body_by_request_url,
  199. )
  200. if stop_listening_page_image_responses_cell[0] is not None:
  201. stop_listening_page_image_responses_cell[0]()
  202. return download_note_exit_code
  203. def main() -> int:
  204. # 命令行默认入口:把整段自动化流程的返回码原样交给调用方。
  205. return start()
if __name__ == "__main__":
    # When this file is executed directly, end the interpreter with the
    # pipeline's return code as the process exit code.
    raise SystemExit(main())
# Symbols made visible by ``from workplace.main import *``.
__all__ = [
    "REPOSITORY_ROOT_DIRECTORY",
    "REPOSITORY_ROOT_DIRECTORY_SINGLETON",
    "REPOSITORY_CONFIG_INI_SNAPSHOT",
    "REPOSITORY_CONFIG_INI_SINGLETON",
    "RepositoryConfigIniSnapshot",
    "PyAutoGUIScreenCalibration",
    "REPO_ROOT",
    "PYTHON_RELATIVE",
    "HOME_URL",
    "HOME_PAGE_URL",
    "PYTHON_EXE",
    "PYTHON_EXECUTABLE_ABSOLUTE_PATH",
    "PYTHON_EXECUTABLE_RELATIVE_PATH",
    "PYAUTOGUI_SCREEN_CALIBRATION",
    "PYAUTOGUI_SCREEN_CALIBRATION_FROM_REPOSITORY_CONFIG_INI",
    "workplace_playwright",
    "workplace_pyautogui",
    "main",
    "start",
]