open-browser.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369
  1. #!/usr/bin/env python3
  2. """
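Step one: check whether a browser is already listening on the CDP remote-debugging port (``cdp_endpoint_open`` + ``/json/version``);
if it is, only attach and do not launch a new Chrome; otherwise connect to, or spawn, a debugging instance following the browser_core logic.
If the page is already at HOME_URL, **no goto** is issued. By default a same-site, non-blank page is also left untouched (the current sub-path, e.g. a search-results page, is kept); the main flow can pass ``same_site_preserves_current_page=False`` to always open ``home_url``; if ``skip_home_navigation_when_top_search_input_has_text=True`` is set as well, the home page is **not** refreshed first when the top-bar search box already contains text, and clearing it is left to ``clear-input``. Navigation to ``home_url`` happens only for a blank page or (when allowed) a cross-site page.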
  6. """
  7. from __future__ import annotations
  8. import sys
  9. import urllib.request
  10. from collections.abc import Callable
  11. from pathlib import Path
  12. from typing import Any, TypeVar
  13. from urllib.parse import urlparse
  14. _ROOT = Path(__file__).resolve().parent.parent
  15. if str(_ROOT) not in sys.path:
  16. sys.path.insert(0, str(_ROOT))
  17. from workplace.cdp_chrome_browser_core import cdp_endpoint_open # noqa: E402
  18. from workplace.playwright import ( # noqa: E402
  19. DEFAULT_CDP_URL,
  20. _is_blank_url,
  21. connect_browser_via_cdp,
  22. pick_target_page,
  23. sync_playwright,
  24. )
# Generic result type of the ``then`` callbacks accepted by the helpers in this module.
_T = TypeVar("_T")
# Timeout, in milliseconds, handed to ``page.goto`` / ``page.wait_for_load_state``
# while waiting for the document "load" state.
_HOME_PAGE_DOCUMENT_LOAD_WAIT_TIMEOUT_MILLISECONDS = 20_000
# Same top-bar search box selector as in ``workplace/input-keyword/find_search_input.py``
# (used to tell whether the box already holds text, so that a goto does not wipe it out).
_TOP_BAR_SEARCH_INPUT_SELECTOR = "#search-input"
  29. def _top_search_input_has_non_empty_text(page: Any) -> bool:
  30. """当前页存在顶栏 ``#search-input`` 且 ``input_value`` 去空白后非空时返回 True;否则 False。"""
  31. try:
  32. locator = page.locator(_TOP_BAR_SEARCH_INPUT_SELECTOR).first
  33. if locator.count() == 0:
  34. return False
  35. raw = str(locator.input_value(timeout=3_000)).strip()
  36. return bool(raw)
  37. except Exception:
  38. return False
  39. def _wait_current_page_document_load_completed_or_raise_error(page) -> None:
  40. page.wait_for_load_state(
  41. "load",
  42. timeout=_HOME_PAGE_DOCUMENT_LOAD_WAIT_TIMEOUT_MILLISECONDS,
  43. )
  44. def _path_key(path: str | None) -> str:
  45. p = (path or "").strip() or "/"
  46. p = p.rstrip("/")
  47. return "" if p in ("", "/") else p
  48. def _maximize_chrome_main_window_and_bring_to_foreground_win32(page) -> None:
  49. """
  50. Windows:把 Chrome 主窗口**最大化**(工作区铺满,非 F11 全屏)并置于前台。
  51. 若已最小化则一并拉起。Playwright 的 ``bring_to_front`` 只作用于页签层。
  52. """
  53. if sys.platform != "win32":
  54. return
  55. try:
  56. tab_title = (page.title() or "").strip()
  57. except Exception:
  58. tab_title = ""
  59. import ctypes
  60. from ctypes import wintypes
  61. user32 = ctypes.windll.user32
  62. SW_MAXIMIZE = 3
  63. candidates: list[tuple[int, str, int]] = []
  64. @ctypes.WINFUNCTYPE(ctypes.c_bool, wintypes.HWND, wintypes.LPARAM)
  65. def enum_proc(hwnd, _lparam):
  66. if not user32.IsWindowVisible(hwnd):
  67. return True
  68. cls = ctypes.create_unicode_buffer(256)
  69. user32.GetClassNameW(hwnd, cls, 256)
  70. if cls.value != "Chrome_WidgetWin_1":
  71. return True
  72. ln = user32.GetWindowTextLengthW(hwnd)
  73. if ln <= 0:
  74. return True
  75. buf = ctypes.create_unicode_buffer(ln + 1)
  76. user32.GetWindowTextW(hwnd, buf, ln + 1)
  77. title = buf.value
  78. if not title:
  79. return True
  80. rect = wintypes.RECT()
  81. if not user32.GetWindowRect(hwnd, ctypes.byref(rect)):
  82. return True
  83. w = max(0, rect.right - rect.left)
  84. h = max(0, rect.bottom - rect.top)
  85. if w * h < 80_000:
  86. return True
  87. candidates.append((int(hwnd), title, w * h))
  88. return True
  89. user32.EnumWindows(enum_proc, 0)
  90. if not candidates:
  91. return
  92. def score(item: tuple[int, str, int]) -> int:
  93. _hwnd, title, area = item
  94. s = min(area // 500, 8000)
  95. if tab_title:
  96. if tab_title in title:
  97. s += 50_000
  98. elif len(tab_title) >= 4 and tab_title[:12] in title:
  99. s += 25_000
  100. if "Chrome" in title or "chrome" in title.lower():
  101. s += 100
  102. return s
  103. hwnd = max(candidates, key=score)[0]
  104. try:
  105. user32.ShowWindow(hwnd, SW_MAXIMIZE)
  106. user32.SetForegroundWindow(hwnd)
  107. except Exception:
  108. pass
  109. def _chrome_remote_debugging_http_endpoint_ready(cdp_browser_http_url: str) -> bool:
  110. """
  111. 若 Chrome 已在该地址开启远程调试,``GET …/json/version`` 通常返回 200。
  112. 作为 ``cdp_endpoint_open`` 的补充,避免误判后再次 ``auto_chrome`` 拉起第二个实例。
  113. """
  114. raw = (cdp_browser_http_url or "").strip()
  115. if not raw:
  116. return False
  117. parsed = urlparse(raw)
  118. if not parsed.hostname:
  119. return False
  120. scheme = (parsed.scheme or "http").lower()
  121. port = parsed.port
  122. if port is None:
  123. port = 443 if scheme == "https" else 80
  124. version_probe_url = f"{scheme}://{parsed.hostname}:{port}/json/version"
  125. try:
  126. with urllib.request.urlopen(version_probe_url, timeout=2.0) as http_response:
  127. return http_response.status == 200
  128. except Exception:
  129. return False
  130. def _same_page_as_home(current: str, home: str) -> bool:
  131. h = (home or "").strip()
  132. if not h:
  133. return False
  134. c = (current or "").strip()
  135. if not c.startswith("http"):
  136. return False
  137. a = urlparse(c)
  138. b = urlparse(h)
  139. return (
  140. a.scheme.lower() == b.scheme.lower()
  141. and a.netloc.lower() == b.netloc.lower()
  142. and _path_key(a.path) == _path_key(b.path)
  143. and (a.query or "") == (b.query or "")
  144. )
  145. def _same_site_as_home_not_blank(current: str, home: str) -> bool:
  146. """与 ``home_url`` 同 host、且当前不是空白页 — 候选「可跳过 goto」的前置条件。"""
  147. h = (home or "").strip()
  148. if not h.startswith("http"):
  149. return False
  150. c = (current or "").strip()
  151. if not c.startswith("http") or _is_blank_url(c):
  152. return False
  153. return urlparse(c).netloc.lower() == urlparse(h).netloc.lower()
  154. def prepare_page_at_home_url(
  155. playwright_inst,
  156. home_url: str,
  157. *,
  158. cdp: str | None = None,
  159. same_site_preserves_current_page: bool = True,
  160. skip_home_navigation_when_top_search_input_has_text: bool = False,
  161. ):
  162. """
  163. 在调用方已有的 ``with sync_playwright() as p:`` 里执行:附着 CDP、选页;按规则 ``goto`` 到 ``home_url``。
  164. ``same_site_preserves_current_page``:为 ``True``(默认)时,若当前已是小红书同站且非空白,则 **不** 导航,以免冲掉已有搜索结果等;主流程入口宜传 ``False``,保证打开配置中的主页 URL。
  165. ``skip_home_navigation_when_top_search_input_has_text``:为 ``True`` 时,若已附着到小红书同站页且顶栏 ``#search-input`` 内已有文案,则 **不** 先 ``goto`` 主页,保留当前页,由 ``input-keyword`` 里 ``clear-input`` 清空后再输入(供 ``start.bat`` 第二轮关键词等场景)。
  166. 返回 ``(退出码, page | None)``。成功时 ``page`` 仍绑定当前浏览器上下文,勿在 ``with`` 块外长期使用。
  167. """
  168. home_url = (home_url or "").strip()
  169. if not home_url.startswith("http"):
  170. print("Invalid HOME_URL.", file=sys.stderr)
  171. return 1, None
  172. url = (cdp or "").strip() or DEFAULT_CDP_URL
  173. debugging_endpoint_already_listening = cdp_endpoint_open(url) or (
  174. _chrome_remote_debugging_http_endpoint_ready(url)
  175. )
  176. if debugging_endpoint_already_listening:
  177. print("检测到远程调试端口已有浏览器,仅附着、不新启 Chrome:", url, file=sys.stderr)
  178. else:
  179. print(
  180. f"未检测到调试端口监听,将连接并在需要时启动调试 Chrome:{url}",
  181. file=sys.stderr,
  182. )
  183. try:
  184. browser = connect_browser_via_cdp(
  185. playwright_inst,
  186. cdp=url,
  187. auto_chrome=not debugging_endpoint_already_listening,
  188. )
  189. except Exception as e:
  190. print("CDP connect failed:", e, file=sys.stderr)
  191. return 1, None
  192. page = pick_target_page(browser)
  193. if not page:
  194. print("No browser page available.", file=sys.stderr)
  195. return 1, None
  196. try:
  197. page.bring_to_front()
  198. except Exception:
  199. pass
  200. _maximize_chrome_main_window_and_bring_to_foreground_win32(page)
  201. url_now = (page.url or "").strip()
  202. if _is_blank_url(url_now):
  203. print("Blank tab — opening HOME_URL:", home_url, file=sys.stderr)
  204. page.goto(
  205. home_url,
  206. wait_until="load",
  207. timeout=_HOME_PAGE_DOCUMENT_LOAD_WAIT_TIMEOUT_MILLISECONDS,
  208. )
  209. return 0, page
  210. if _same_page_as_home(url_now, home_url):
  211. print("已在 HOME_URL,跳过导航。", file=sys.stderr)
  212. _wait_current_page_document_load_completed_or_raise_error(page)
  213. return 0, page
  214. if (
  215. same_site_preserves_current_page
  216. and _same_site_as_home_not_blank(url_now, home_url)
  217. ):
  218. print("同站已打开,跳过导航与刷新:", url_now, file=sys.stderr)
  219. _wait_current_page_document_load_completed_or_raise_error(page)
  220. return 0, page
  221. if (
  222. skip_home_navigation_when_top_search_input_has_text
  223. and _same_site_as_home_not_blank(url_now, home_url)
  224. and _top_search_input_has_non_empty_text(page)
  225. ):
  226. print(
  227. "顶栏搜索框已有文案,保留当前页(不先刷新主页),随后由 clear-input 清空。",
  228. file=sys.stderr,
  229. )
  230. _wait_current_page_document_load_completed_or_raise_error(page)
  231. return 0, page
  232. print("导航到 HOME_URL:", home_url, file=sys.stderr)
  233. page.goto(
  234. home_url,
  235. wait_until="load",
  236. timeout=_HOME_PAGE_DOCUMENT_LOAD_WAIT_TIMEOUT_MILLISECONDS,
  237. )
  238. return 0, page
  239. def prepare_page_at_home_url_then(
  240. playwright_inst,
  241. home_url: str,
  242. then: Callable[[Any], _T],
  243. *,
  244. cdp: str | None = None,
  245. same_site_preserves_current_page: bool = True,
  246. skip_home_navigation_when_top_search_input_has_text: bool = False,
  247. ) -> _T:
  248. """
  249. ``prepare_page_at_home_url`` 成功后把 ``page`` 交给 ``then``;失败则抛错(不返回 (code, page))。
  250. ``then`` 签名为 ``(page) -> T``。
  251. """
  252. code, page = prepare_page_at_home_url(
  253. playwright_inst,
  254. home_url,
  255. cdp=cdp,
  256. same_site_preserves_current_page=same_site_preserves_current_page,
  257. skip_home_navigation_when_top_search_input_has_text=skip_home_navigation_when_top_search_input_has_text,
  258. )
  259. if code != 0 or page is None:
  260. raise RuntimeError(f"prepare_page_at_home_url failed: code={code}")
  261. return then(page)
  262. def start(
  263. playwright_inst,
  264. home_url: str,
  265. then: Callable[[Any], _T],
  266. *,
  267. cdp: str | None = None,
  268. same_site_preserves_current_page: bool = True,
  269. skip_home_navigation_when_top_search_input_has_text: bool = False,
  270. ) -> _T:
  271. """
  272. 附着 CDP;仅在需要时 ``goto``(见 ``prepare_page_at_home_url``)。
  273. 不在此重复 ``wait_for_load_state``,避免同页再次等待或体感上的刷新。
  274. ``then`` 签名为 ``(page) -> T``;失败在 ``prepare`` 阶段抛 ``RuntimeError``。
  275. """
  276. def _after_browser_and_navigation_ready(page: Any) -> _T:
  277. return then(page)
  278. return prepare_page_at_home_url_then(
  279. playwright_inst,
  280. home_url,
  281. _after_browser_and_navigation_ready,
  282. cdp=cdp,
  283. same_site_preserves_current_page=same_site_preserves_current_page,
  284. skip_home_navigation_when_top_search_input_has_text=skip_home_navigation_when_top_search_input_has_text,
  285. )
  286. def ensure_browser_on_home_url(
  287. home_url: str,
  288. *,
  289. cdp: str | None = None,
  290. same_site_preserves_current_page: bool = True,
  291. skip_home_navigation_when_top_search_input_has_text: bool = False,
  292. ) -> tuple[int, str | None]:
  293. """
  294. 附着 Chrome CDP;选取目标页。
  295. - 空白页:goto home_url
  296. - 已是 HOME_URL:跳过导航
  297. - 与 home 同站且非空白:跳过导航(含搜索结果页等,不强制回首页)
  298. - 其它(跨站等):goto home_url
  299. 若 CDP 端口已在监听,视为浏览器已开启调试,仅附着且 **不再** 自动拉起第二个 Chrome;
  300. 未监听时再 ``connect_browser_via_cdp(..., auto_chrome=True)`` 尝试启动调试实例。
  301. 成功时返回 ``(0, html)``,``html`` 为当前页 ``page.content()``;失败时 ``(非0, None)``。
  302. Windows 下会在附着后把 **Chrome 主窗口最大化** 并置于前台(便于后续 PyAutoGUI 等)。
  303. """
  304. with sync_playwright() as p:
  305. code, page = prepare_page_at_home_url(
  306. p,
  307. home_url,
  308. cdp=cdp,
  309. same_site_preserves_current_page=same_site_preserves_current_page,
  310. skip_home_navigation_when_top_search_input_has_text=skip_home_navigation_when_top_search_input_has_text,
  311. )
  312. if code != 0 or page is None:
  313. return code, None
  314. try:
  315. return 0, page.content()
  316. except Exception:
  317. return 1, None