download_note_shared.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
  1. #!/usr/bin/env python3
  2. """download-note 流水线共用:路径、封面高度、OCR 锚点清洗、详情页 DOM 提取 JS、轮播翻页、选详情页 Playwright Page。"""
  3. from __future__ import annotations
  4. import importlib.util
  5. from pathlib import Path
  6. from typing import Any
  7. from PIL import Image
  8. from playwright.sync_api import Page
# Directory two levels above this file's resolved path (the parent of the
# folder that contains this module), exposed as the repository root.
REPOSITORY_ROOT_DIRECTORY = Path(__file__).resolve().parent.parent
  10. def cover_image_height_px_from_note_output_sequence_folder(
  11. note_output_sequence_folder_path: Path,
  12. ) -> int:
  13. cover_image_path = sorted(note_output_sequence_folder_path.glob("cover_image.*"))[0]
  14. with Image.open(cover_image_path) as cover_image:
  15. return int(cover_image.height)
  16. def half_cover_image_height_pixels_from_disk_cover_file_under_note_output_folder(
  17. note_output_sequence_folder_path: Path,
  18. ) -> float:
  19. return cover_image_height_px_from_note_output_sequence_folder(
  20. note_output_sequence_folder_path,
  21. ) / 2.0
  22. def normalize_raw_note_card_metadata_string_to_plain_text_for_ocr_anchor(
  23. raw_string: str,
  24. ) -> str:
  25. allowed_chinese_punctuation_character_set = frozenset(
  26. ",。!?、:;()「」『』【】《》…—·\u201c\u201d\u2018\u2019"
  27. )
  28. result_character_list: list[str] = []
  29. for single_character in raw_string:
  30. code_point_integer = ord(single_character)
  31. if single_character.isascii():
  32. if single_character.isalnum() or single_character in " \t.-":
  33. result_character_list.append(single_character)
  34. continue
  35. if (
  36. 0x4E00 <= code_point_integer <= 0x9FFF
  37. or 0x3400 <= code_point_integer <= 0x4DBF
  38. or 0x20000 <= code_point_integer <= 0x2CEAF
  39. ):
  40. result_character_list.append(single_character)
  41. continue
  42. if single_character in allowed_chinese_punctuation_character_set:
  43. result_character_list.append(single_character)
  44. return "".join(result_character_list).strip()
  45. def clip_note_title_plain_text_for_feed_ocr_anchor(
  46. normalized_title_plain_text: str,
  47. *,
  48. maximum_anchor_character_count_inclusive: int = 11,
  49. ) -> str:
  50. """
  51. 仅用 ``note_title`` 做 feed 卡片 OCR 锚点:规范化后**最多**保留前 ``maximum`` 个 Unicode 码点
  52. (汉字与中文标点各算 1);超长则截断头部,与列表区标题展示一致。
  53. """
  54. s = (normalized_title_plain_text or "").strip()
  55. if not s:
  56. return ""
  57. if len(s) <= maximum_anchor_character_count_inclusive:
  58. return s
  59. return s[:maximum_anchor_character_count_inclusive]
# JavaScript arrow function (source text) that reads a note detail page's DOM
# and returns one object with these keys:
#   detailTitleText              trimmed innerText of "#detail-title" ("" if absent/blank)
#   detailBodyPlainText          trimmed innerText of "#detail-desc"
#   detailBodyHashtagObjectList  {display_text, relative_href} for each
#                                "a.tag" / "a[id='hash-tag']" inside "#detail-desc"
#   detailBodyEmojiImageUrlList  de-duplicated http(s) URLs of "img.note-content-emoji"
#                                inside "#detail-desc" ("//" URLs are prefixed with "https:")
#   noteDetailImageUrlList       de-duplicated http(s) URLs of note images whose
#                                src/currentSrc contains "sns-webpic"
# Note images are looked up inside the first match of ".swiper", ".note-slider"
# or "[class*='swiper']" under the layout root ("#noteContainer",
# ".interaction-container", "[class*='note-detail']", else document.body).
# If that scope has ".swiper-slide" elements that are not
# ".swiper-slide-duplicate", the first matching <img> of each slide is taken;
# otherwise every "img[src*='sns-webpic']" in scope is taken, skipping images
# inside "#detail-desc", inside an ancestor whose class contains "comment", or
# inside ".note-content-emoji". Besides exact-URL de-duplication, images are
# de-duplicated by the last URL path segment up to the first "!".
EXTRACT_NOTE_DETAIL_DOM_FOR_DISK_CACHE_JS = r"""
() => {
const extraction_result = {
detailTitleText: "",
detailBodyPlainText: "",
detailBodyHashtagObjectList: [],
detailBodyEmojiImageUrlList: [],
noteDetailImageUrlList: [],
};
const detailTitleElement = document.querySelector("#detail-title");
if (detailTitleElement && detailTitleElement.innerText.trim()) {
extraction_result.detailTitleText = detailTitleElement.innerText.trim();
}
const detailDescElement = document.querySelector("#detail-desc");
if (detailDescElement) {
extraction_result.detailBodyPlainText = detailDescElement.innerText.trim();
detailDescElement.querySelectorAll("a.tag, a[id='hash-tag']").forEach((anchorElement) => {
extraction_result.detailBodyHashtagObjectList.push({
display_text: (anchorElement.innerText || "").trim(),
relative_href: (anchorElement.getAttribute("href") || "").trim(),
});
});
detailDescElement.querySelectorAll("img.note-content-emoji").forEach((imgElement) => {
let emojiImageUrl = (
imgElement.getAttribute("src") ||
imgElement.currentSrc ||
""
).trim();
if (emojiImageUrl.startsWith("//")) emojiImageUrl = "https:" + emojiImageUrl;
if (
emojiImageUrl.startsWith("http") &&
!extraction_result.detailBodyEmojiImageUrlList.includes(emojiImageUrl)
) {
extraction_result.detailBodyEmojiImageUrlList.push(emojiImageUrl);
}
});
}
const deduplicatedAbsoluteImageUrlSet = new Set();
const deduplicatedSnsWebpicNoteImageStemSet = new Set();
const noteImageCanonicalStemFromSnsWebpicUrl = (absoluteImageUrl) => {
let urlPathname = "";
try {
urlPathname = new URL(absoluteImageUrl).pathname;
} catch {
return "";
}
const pathSegmentList = urlPathname.split("/").filter(Boolean);
if (!pathSegmentList.length) return "";
const lastPathSegment = pathSegmentList[pathSegmentList.length - 1];
const stemBeforeBang = lastPathSegment.split("!")[0];
return stemBeforeBang || "";
};
const pushNormalizedImageUrlForDetailCache = (rawUrl, applySnsWebpicStemDedup) => {
let normalizedUrl = (rawUrl || "").trim();
if (!normalizedUrl) return;
if (normalizedUrl.startsWith("//")) normalizedUrl = "https:" + normalizedUrl;
if (!normalizedUrl.startsWith("http")) return;
if (deduplicatedAbsoluteImageUrlSet.has(normalizedUrl)) return;
if (
applySnsWebpicStemDedup &&
normalizedUrl.includes("sns-webpic")
) {
const canonicalStem = noteImageCanonicalStemFromSnsWebpicUrl(normalizedUrl);
if (
canonicalStem &&
deduplicatedSnsWebpicNoteImageStemSet.has(canonicalStem)
) {
return;
}
if (canonicalStem) {
deduplicatedSnsWebpicNoteImageStemSet.add(canonicalStem);
}
}
deduplicatedAbsoluteImageUrlSet.add(normalizedUrl);
extraction_result.noteDetailImageUrlList.push(normalizedUrl);
};
const pickSnsWebpicUrlFromCarouselImgElement = (imgElement) => {
const fromSrc = (imgElement.getAttribute("src") || "").trim();
const fromCurrent = (imgElement.currentSrc || "").trim();
if (fromCurrent.includes("sns-webpic")) return fromCurrent;
if (fromSrc.includes("sns-webpic")) return fromSrc;
return "";
};
const carouselImgIsOutsideDetailDescAndCommentAreas = (imgElement) => {
if (detailDescElement && detailDescElement.contains(imgElement)) return false;
if (imgElement.closest("[class*='comment']")) return false;
if (imgElement.closest(".note-content-emoji")) return false;
return true;
};
const noteDetailLayoutRootElement =
document.querySelector("#noteContainer") ||
document.querySelector(".interaction-container") ||
document.querySelector("[class*='note-detail']") ||
document.body;
const noteImageCarouselScopeElement =
noteDetailLayoutRootElement.querySelector(".swiper") ||
noteDetailLayoutRootElement.querySelector(".note-slider") ||
noteDetailLayoutRootElement.querySelector("[class*='swiper']") ||
noteDetailLayoutRootElement;
const swiperSlideElementList = Array.from(
noteImageCarouselScopeElement.querySelectorAll(".swiper-slide"),
).filter(
(slideElement) => !slideElement.classList.contains("swiper-slide-duplicate"),
);
if (swiperSlideElementList.length > 0) {
swiperSlideElementList.forEach((slideElement) => {
const imgNodeListInSlide = slideElement.querySelectorAll("img");
let chosenCarouselRawUrl = "";
imgNodeListInSlide.forEach((imgElement) => {
if (chosenCarouselRawUrl) return;
chosenCarouselRawUrl = pickSnsWebpicUrlFromCarouselImgElement(imgElement);
});
if (chosenCarouselRawUrl) {
pushNormalizedImageUrlForDetailCache(chosenCarouselRawUrl, true);
}
});
} else {
noteImageCarouselScopeElement.querySelectorAll("img[src*='sns-webpic']").forEach(
(imgElement) => {
if (!carouselImgIsOutsideDetailDescAndCommentAreas(imgElement)) return;
const fallbackRawUrl = pickSnsWebpicUrlFromCarouselImgElement(imgElement);
if (fallbackRawUrl) {
pushNormalizedImageUrlForDetailCache(fallbackRawUrl, true);
}
},
);
}
return extraction_result;
}
"""
# JavaScript arrow function (source text) that returns how many
# ".swiper-slide" elements without the "swiper-slide-duplicate" class exist in
# the note image carousel scope. The scope is the first match of ".swiper",
# ".note-slider" or "[class*='swiper']" under the layout root
# ("#noteContainer", ".interaction-container", "[class*='note-detail']", else
# document.body), falling back to the layout root itself.
COUNT_NOTE_DETAIL_CAROUSEL_NON_DUPLICATE_SLIDE_ELEMENT_JS = r"""
() => {
const noteDetailLayoutRootElement =
document.querySelector("#noteContainer") ||
document.querySelector(".interaction-container") ||
document.querySelector("[class*='note-detail']") ||
document.body;
const noteImageCarouselScopeElement =
noteDetailLayoutRootElement.querySelector(".swiper") ||
noteDetailLayoutRootElement.querySelector(".note-slider") ||
noteDetailLayoutRootElement.querySelector("[class*='swiper']") ||
noteDetailLayoutRootElement;
return noteImageCarouselScopeElement.querySelectorAll(
".swiper-slide:not(.swiper-slide-duplicate)",
).length;
}
"""
# Upper bound applied to the counted slide total before the carousel walk
# computes how many key presses to send.
MAX_NOTE_DETAIL_CAROUSEL_SLIDE_ITERATION_COUNT = 48
# Pause, in milliseconds, passed to ``wait_for_timeout`` after each ArrowRight
# press (and once when there is at most one slide).
MILLISECONDS_TO_PAUSE_AFTER_EACH_NOTE_DETAIL_CAROUSEL_SLIDE_NAVIGATION = 600
# Divisor for the key-press count: the carousel walk sends
# ceil((slides - 1) / this value) ArrowRight presses. By its name this is an
# estimate of how many new slide images load per press - TODO confirm.
NOTE_DETAIL_CAROUSEL_ESTIMATED_NEW_SLIDE_IMAGE_COUNT_LOADED_EACH_ARROW_RIGHT_KEY_PRESS = 3
  210. def iterate_note_detail_carousel_each_slide_so_playwright_image_response_cache_populates(
  211. note_detail_playwright_page: Page,
  212. ) -> None:
  213. carousel_non_duplicate_slide_element_count = int(
  214. note_detail_playwright_page.evaluate(
  215. COUNT_NOTE_DETAIL_CAROUSEL_NON_DUPLICATE_SLIDE_ELEMENT_JS,
  216. )
  217. )
  218. iteration_upper_bound = min(
  219. carousel_non_duplicate_slide_element_count,
  220. MAX_NOTE_DETAIL_CAROUSEL_SLIDE_ITERATION_COUNT,
  221. )
  222. if iteration_upper_bound <= 1:
  223. note_detail_playwright_page.wait_for_timeout(
  224. MILLISECONDS_TO_PAUSE_AFTER_EACH_NOTE_DETAIL_CAROUSEL_SLIDE_NAVIGATION,
  225. )
  226. return
  227. note_detail_playwright_page.bring_to_front()
  228. carousel_edge_transition_count = iteration_upper_bound - 1
  229. batch_load_estimate = max(
  230. 1,
  231. int(NOTE_DETAIL_CAROUSEL_ESTIMATED_NEW_SLIDE_IMAGE_COUNT_LOADED_EACH_ARROW_RIGHT_KEY_PRESS),
  232. )
  233. arrow_right_key_press_count = (
  234. (carousel_edge_transition_count + batch_load_estimate - 1) // batch_load_estimate
  235. )
  236. for _arrow_right_key_press_index in range(arrow_right_key_press_count):
  237. note_detail_playwright_page.keyboard.press("ArrowRight")
  238. note_detail_playwright_page.wait_for_timeout(
  239. MILLISECONDS_TO_PAUSE_AFTER_EACH_NOTE_DETAIL_CAROUSEL_SLIDE_NAVIGATION,
  240. )
  241. def playwright_page_for_opened_note_detail_extraction(playwright_page: Page) -> Page:
  242. ordered_candidate_page_list: list[Page] = [playwright_page]
  243. for context_page in playwright_page.context.pages:
  244. if context_page is not playwright_page:
  245. ordered_candidate_page_list.append(context_page)
  246. for candidate_playwright_page in ordered_candidate_page_list:
  247. if candidate_playwright_page.is_closed():
  248. continue
  249. dom_has_note_detail_root = candidate_playwright_page.evaluate(
  250. "() => !!(document.querySelector('#detail-title') || document.querySelector('#detail-desc'))",
  251. )
  252. if dom_has_note_detail_root:
  253. return candidate_playwright_page
  254. return playwright_page
  255. def load_python_module_from_file(
  256. python_file_path: Path,
  257. importlib_logical_module_name: str,
  258. ):
  259. importlib_module_spec = importlib.util.spec_from_file_location(
  260. importlib_logical_module_name,
  261. python_file_path,
  262. )
  263. if importlib_module_spec is None or importlib_module_spec.loader is None:
  264. raise ImportError(f"Cannot load {python_file_path}")
  265. loaded_python_module = importlib.util.module_from_spec(importlib_module_spec)
  266. importlib_module_spec.loader.exec_module(loaded_python_module)
  267. return loaded_python_module
# Config key string; by its name, the project-config entry that holds the
# screen (x, y) pair of the business-cooperation button. Not used in the
# visible part of this file - confirm against callers.
PROJECT_CONFIG_BUSINESS_COOPERATION_BTN_SCREEN_XY_PAIR_CONFIG_KEY = "business-cooperation-btn-pos"
# Button label text; by its name, intended as an OCR anchor. Not used in the
# visible part of this file - confirm against callers.
BUSINESS_COOPERATION_BUTTON_VISIBLE_TEXT_FOR_OCR_ANCHOR = "业务合作"
# Explicit public API of this module: the names exported by
# ``from <module> import *``.
__all__ = [
    "REPOSITORY_ROOT_DIRECTORY",
    "BUSINESS_COOPERATION_BUTTON_VISIBLE_TEXT_FOR_OCR_ANCHOR",
    "COUNT_NOTE_DETAIL_CAROUSEL_NON_DUPLICATE_SLIDE_ELEMENT_JS",
    "EXTRACT_NOTE_DETAIL_DOM_FOR_DISK_CACHE_JS",
    "MAX_NOTE_DETAIL_CAROUSEL_SLIDE_ITERATION_COUNT",
    "MILLISECONDS_TO_PAUSE_AFTER_EACH_NOTE_DETAIL_CAROUSEL_SLIDE_NAVIGATION",
    "NOTE_DETAIL_CAROUSEL_ESTIMATED_NEW_SLIDE_IMAGE_COUNT_LOADED_EACH_ARROW_RIGHT_KEY_PRESS",
    "PROJECT_CONFIG_BUSINESS_COOPERATION_BTN_SCREEN_XY_PAIR_CONFIG_KEY",
    "cover_image_height_px_from_note_output_sequence_folder",
    "half_cover_image_height_pixels_from_disk_cover_file_under_note_output_folder",
    "iterate_note_detail_carousel_each_slide_so_playwright_image_response_cache_populates",
    "load_python_module_from_file",
    "normalize_raw_note_card_metadata_string_to_plain_text_for_ocr_anchor",
    "clip_note_title_plain_text_for_feed_ocr_anchor",
    "playwright_page_for_opened_note_detail_extraction",
]