main.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. #!/usr/bin/env python3
  2. """本文件是串联入口:附着浏览器进首页 → 顶栏搜索 → 再按命令行做筛选与排序。"""
  3. from __future__ import annotations
  4. import importlib.util
  5. import sys
  6. from collections.abc import Callable
  7. from pathlib import Path
  8. from typing import Any
# When this file is run directly by its script path, put the repository root
# (the parent of the directory that contains this file) on ``sys.path`` so that
# the ``workplace`` package can be imported by the statements that follow.
_main_source_file_repository_root = Path(__file__).resolve().parent.parent
# Only insert when absent, so repeated imports do not pile up duplicate entries.
if str(_main_source_file_repository_root) not in sys.path:
    sys.path.insert(0, str(_main_source_file_repository_root))
  13. # 从仓库 ini 单例读出首页 URL、根目录与 Python 路径等全局常量。
  14. from workplace.singleton import ( # noqa: E402
  15. HOME_PAGE_URL,
  16. PYAUTOGUI_SCREEN_CALIBRATION,
  17. PYTHON_EXECUTABLE_ABSOLUTE_PATH,
  18. PYTHON_EXECUTABLE_RELATIVE_PATH,
  19. REPOSITORY_CONFIG_INI_SNAPSHOT,
  20. REPOSITORY_ROOT_DIRECTORY,
  21. PyAutoGUIScreenCalibration,
  22. RepositoryConfigIniSnapshot,
  23. )
# Short aliases of the imported singleton constants, so that external scripts
# can use ``from workplace.main import …`` with brief names.
REPOSITORY_ROOT_DIRECTORY_SINGLETON = REPOSITORY_ROOT_DIRECTORY
REPOSITORY_CONFIG_INI_SINGLETON = REPOSITORY_CONFIG_INI_SNAPSHOT
REPO_ROOT = REPOSITORY_ROOT_DIRECTORY
PYTHON_RELATIVE = PYTHON_EXECUTABLE_RELATIVE_PATH
HOME_URL = HOME_PAGE_URL
PYTHON_EXE = PYTHON_EXECUTABLE_ABSOLUTE_PATH
PYAUTOGUI_SCREEN_CALIBRATION_FROM_REPOSITORY_CONFIG_INI = PYAUTOGUI_SCREEN_CALIBRATION
  32. # 本入口统一使用仓库封装的 Playwright 与 PyAutoGUI 单例。
  33. from workplace import playwright as workplace_playwright # noqa: E402
  34. from workplace import pyautogui as workplace_pyautogui # noqa: E402
  35. from workplace.note_pipeline_forbidden_standalone_url_image_fetch import (
  36. XHS_NOTE_PIPELINE_ABSOLUTE_PROHIBITION_STANDALONE_HTTP_GET_FOR_IMAGE_BYTES,
  37. )
  38. def _load_python_module_from_file(
  39. python_file_path: Path,
  40. importlib_logical_module_name: str,
  41. ):
  42. """从磁盘路径载入一段业务 ``.py`` 为模块(用于文件名含 ``-`` 的脚本)。"""
  43. importlib_module_spec = importlib.util.spec_from_file_location(
  44. importlib_logical_module_name,
  45. python_file_path,
  46. )
  47. if importlib_module_spec is None or importlib_module_spec.loader is None:
  48. raise ImportError(f"Cannot load {python_file_path}")
  49. loaded_python_module = importlib.util.module_from_spec(importlib_module_spec)
  50. importlib_module_spec.loader.exec_module(loaded_python_module)
  51. return loaded_python_module
  52. def _load_input_keyword_module():
  53. # 载入顶栏「搜索小红书」定位、输入与点搜索整条串联脚本。
  54. return _load_python_module_from_file(
  55. REPO_ROOT / "workplace" / "input-keyword" / "input-keyword.py",
  56. "workplace_input_keyword",
  57. )
  58. def _load_open_browser_module():
  59. # 载入通过 CDP 附着 Chrome 并按需打开配置首页的脚本。
  60. return _load_python_module_from_file(
  61. REPO_ROOT / "workplace" / "open-browser.py",
  62. "workplace_open_browser",
  63. )
  64. def _load_find_high_score_note_module():
  65. # 载入搜索结果页上「筛选」与「最多点赞/评论/收藏」排序的键鼠脚本(与 bat 第二参联动)。
  66. return _load_python_module_from_file(
  67. REPO_ROOT / "workplace" / "find-hight-score-note" / "find-hight-score-note.py",
  68. "workplace_find_high_score_note",
  69. )
  70. def _load_download_page_sources_module():
  71. # 载入当前结果页笔记流封面与元数据的缓存脚本。
  72. return _load_python_module_from_file(
  73. REPO_ROOT / "workplace" / "download-page-sources" / "download-page-sources.py",
  74. "workplace_download_page_sources",
  75. )
  76. def _load_memory_cache_image_name_module():
  77. return _load_python_module_from_file(
  78. REPO_ROOT / "workplace" / "memory-cache-image-name.py",
  79. "workplace_memory_cache_image_name",
  80. )
  81. def _load_download_note_module():
  82. return _load_python_module_from_file(
  83. REPO_ROOT / "workplace" / "download-note" / "download-note.py",
  84. "workplace_download_note",
  85. )
  86. def _load_preview_note_module():
  87. return _load_python_module_from_file(
  88. REPO_ROOT / "workplace" / "preview-note" / "preview-note.py",
  89. "workplace_preview_note",
  90. )
  91. def start(command_line_argument_strings: list[str] | None = None) -> int:
  92. """跑通一次:附着首页 → 顶栏搜索 → 筛选排序;进程返回码与最内层 ``then`` 一致。"""
  93. _xhs_note_pipeline_image_bytes_must_not_use_standalone_http_get = (
  94. XHS_NOTE_PIPELINE_ABSOLUTE_PROHIBITION_STANDALONE_HTTP_GET_FOR_IMAGE_BYTES
  95. )
  96. _ = _xhs_note_pipeline_image_bytes_must_not_use_standalone_http_get
  97. # 初始化全局键鼠节奏与 Playwright 单例,后续子脚本共用同一套行为。
  98. workplace_pyautogui.init_singleton()
  99. workplace_playwright.init_singleton()
  100. # 解析入参:首段为搜索词,第二段为是否跳过筛选及排序偏好(与 ``start.bat`` 一致)。
  101. command_line_arguments = (
  102. command_line_argument_strings
  103. if command_line_argument_strings is not None
  104. else sys.argv[1:]
  105. )
  106. search_keyword_text = command_line_arguments[0] if command_line_arguments else ""
  107. filter_keyword_from_command_line = (
  108. command_line_arguments[1].strip()
  109. if len(command_line_arguments) > 1
  110. else ""
  111. )
  112. note_list_zero_based_index_for_detail_download = (
  113. command_line_arguments[2].strip()
  114. if len(command_line_arguments) > 2
  115. else "0"
  116. )
  117. # 预加载本流程要串起来的业务脚本模块,避免在回调里重复读盘。
  118. open_browser_script_module = _load_open_browser_module()
  119. input_keyword_script_module = _load_input_keyword_module()
  120. find_high_score_note_script_module = _load_find_high_score_note_module()
  121. download_page_sources_script_module = _load_download_page_sources_module()
  122. memory_cache_image_name_script_module = _load_memory_cache_image_name_module()
  123. download_note_script_module = _load_download_note_module()
  124. # 与 ``config.ini`` 一并缓存在 ini 快照里的 ``save/project-config`` 可变字典,全流程共用同一引用、不重复读盘。
  125. project_config = REPOSITORY_CONFIG_INI_SNAPSHOT.project_config
  126. cached_image_body_by_request_url: dict[str, bytes] = {}
  127. stop_listening_page_image_responses_cell: list[Callable[[], None] | None] = [None]
  128. # 持有 Playwright 同步驱动生命周期,块结束时释放与浏览器相关的同步资源。
  129. with workplace_playwright.sync_playwright_singleton() as playwright_sync_driver:
  130. # 第一步:附着 CDP 与首页;
  131. first_step_exit_code, playwright_page = (
  132. open_browser_script_module.prepare_page_at_home_url(
  133. playwright_sync_driver,
  134. HOME_URL,
  135. cdp=None,
  136. )
  137. )
  138. if first_step_exit_code != 0:
  139. return first_step_exit_code
  140. if playwright_page is None:
  141. return 1
  142. find_high_score_then_callback = find_high_score_note_script_module.build_then_callback_that_runs_find_high_score_note_after_keyword_search(
  143. playwright_page,
  144. keyword=search_keyword_text,
  145. filter_keyword=filter_keyword_from_command_line,
  146. project_config=project_config,
  147. )
  148. # 第二步:顶栏搜索;
  149. return input_keyword_script_module.start(
  150. playwright_page,
  151. keyword=search_keyword_text,
  152. project_config=project_config,
  153. then=lambda pointer_final_screen_xy: _execute_third_fourth_fifth_step_linear_after_top_bar_keyword_search(
  154. pointer_final_screen_xy,
  155. playwright_page,
  156. find_high_score_then_callback,
  157. memory_cache_image_name_script_module,
  158. download_page_sources_script_module,
  159. download_note_script_module,
  160. note_list_zero_based_index_for_detail_download,
  161. cached_image_body_by_request_url,
  162. stop_listening_page_image_responses_cell,
  163. ),
  164. )
  165. def _execute_third_fourth_fifth_step_linear_after_top_bar_keyword_search(
  166. pointer_final_screen_xy: tuple[int, int],
  167. playwright_page: Any,
  168. find_high_score_then_callback: Callable[[tuple[int, int]], int],
  169. memory_cache_image_name_script_module: Any,
  170. download_page_sources_script_module: Any,
  171. download_note_script_module: Any,
  172. note_list_zero_based_index_for_detail_download: str,
  173. cached_image_body_by_request_url: dict[str, bytes],
  174. stop_listening_page_image_responses_cell: list[Callable[[], None] | None],
  175. ) -> int:
  176. # 第三步:筛选与排序。
  177. find_high_score_step_exit_code = find_high_score_then_callback(
  178. pointer_final_screen_xy,
  179. )
  180. if find_high_score_step_exit_code != 0:
  181. return find_high_score_step_exit_code
  182. # 第四步:启动图片缓存监听;``memory-cache-image-name.start`` 注册 ``page.on`` 返回后监听已开启。
  183. stop_listening_page_image_responses_cell[0] = memory_cache_image_name_script_module.start(
  184. playwright_page,
  185. cached_image_body_by_request_url=cached_image_body_by_request_url,
  186. )
  187. # 第五步:缓存当前页笔记流到 ``output/note/``。
  188. download_page_sources_script_module.start(
  189. playwright_page,
  190. cached_image_body_by_request_url=cached_image_body_by_request_url,
  191. )
  192. # 第六步: 缓存笔记详情(图片仅从 ``cached_image_body_by_request_url`` 落盘,不发起独立 HTTP)。
  193. download_note_exit_code = download_note_script_module.start(
  194. [note_list_zero_based_index_for_detail_download],
  195. existing_playwright_page=playwright_page,
  196. cached_image_body_by_request_url=cached_image_body_by_request_url,
  197. )
  198. if stop_listening_page_image_responses_cell[0] is not None:
  199. stop_listening_page_image_responses_cell[0]()
  200. # 第七步: 预览笔记详情。
  201. preview_note_script_module = _load_preview_note_module()
  202. preview_note_exit_code = preview_note_script_module.start(
  203. [note_list_zero_based_index_for_detail_download],
  204. )
  205. if preview_note_exit_code != 0:
  206. return preview_note_exit_code
  207. return download_note_exit_code
  208. def main() -> int:
  209. # 命令行默认入口:把整段自动化流程的返回码原样交给调用方。
  210. return start()
  211. if __name__ == "__main__":
  212. # 直接执行本文件时,用流程返回码作为进程退出码结束解释器。
  213. raise SystemExit(main())
  214. # 声明 ``from workplace.main import *`` 时对外可见的符号集合。
  215. __all__ = [
  216. "REPOSITORY_ROOT_DIRECTORY",
  217. "REPOSITORY_ROOT_DIRECTORY_SINGLETON",
  218. "REPOSITORY_CONFIG_INI_SNAPSHOT",
  219. "REPOSITORY_CONFIG_INI_SINGLETON",
  220. "RepositoryConfigIniSnapshot",
  221. "PyAutoGUIScreenCalibration",
  222. "REPO_ROOT",
  223. "PYTHON_RELATIVE",
  224. "HOME_URL",
  225. "HOME_PAGE_URL",
  226. "PYTHON_EXE",
  227. "PYTHON_EXECUTABLE_ABSOLUTE_PATH",
  228. "PYTHON_EXECUTABLE_RELATIVE_PATH",
  229. "PYAUTOGUI_SCREEN_CALIBRATION",
  230. "PYAUTOGUI_SCREEN_CALIBRATION_FROM_REPOSITORY_CONFIG_INI",
  231. "workplace_playwright",
  232. "workplace_pyautogui",
  233. "main",
  234. "start",
  235. ]