ocr_images_pdfs.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. # logic.py
  2. import sys
  3. import os
  4. # 添加父目录到sys.path,便于导入onnxocr包
  5. sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
  6. from onnxocr.onnx_paddleocr import ONNXPaddleOcr, sav2Img
  7. import cv2
  8. from typing import List, Callable
  9. from pathlib import Path
  10. import time
  11. import numpy as np
  12. # 尝试导入pdf2image用于PDF转图片
  13. try:
  14. from pdf2image import convert_from_path
  15. except ImportError:
  16. convert_from_path = None
  17. # 尝试导入pymupdf用于PDF转图片
  18. try:
  19. import fitz # pymupdf
  20. def pdf_to_images(pdf_path, dpi=200):
  21. """
  22. 使用pymupdf将PDF每一页转为图片(numpy数组)
  23. """
  24. doc = fitz.open(pdf_path)
  25. images = []
  26. for page in doc:
  27. pix = page.get_pixmap(dpi=dpi)
  28. img = np.frombuffer(pix.samples, dtype=np.uint8)
  29. img = img.reshape((pix.height, pix.width, pix.n))
  30. if pix.n == 4:
  31. img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
  32. images.append(img)
  33. return images
  34. except ImportError:
  35. pdf_to_images = None
  36. class OCRLogic:
  37. """
  38. OCR 业务逻辑主类,支持批量图片/PDF识别,多线程加速,模型热切换等
  39. """
  40. def __init__(self, status_callback: Callable[[str], None]):
  41. """
  42. 初始化,传入状态回调函数用于UI进度提示
  43. """
  44. self.status_callback = status_callback
  45. # 默认初始化OCR模型
  46. self.model = ONNXPaddleOcr(use_angle_cls=True, use_gpu=False)
  47. def run(self, files: List[str], save_txt: bool, merge_txt: bool, output_img: bool = False, file_time_callback=None, pdf_progress_callback=None, max_workers: int = 4):
  48. """
  49. 批量图片/PDF识别主入口,支持多线程加速
  50. files: 待识别文件路径列表
  51. save_txt: 是否保存txt
  52. merge_txt: 是否合并为一个txt
  53. output_img: 是否输出带框图片
  54. file_time_callback: 单文件识别耗时回调
  55. pdf_progress_callback: PDF页进度回调
  56. max_workers: 最大线程数,默认4
  57. """
  58. import concurrent.futures
  59. start_time = time.time()
  60. all_text = [None] * len(files) # 用于顺序合并结果
  61. def process_one(idx_file):
  62. idx, file = idx_file
  63. ext = os.path.splitext(file)[1].lower()
  64. self.status_callback(f"正在处理: {os.path.basename(file)} ({idx+1}/{len(files)})")
  65. t0 = time.time()
  66. text = ""
  67. if ext == ".pdf":
  68. # PDF转图片后识别
  69. if pdf_to_images is None:
  70. raise RuntimeError("未安装pymupdf库,无法处理PDF文件。请先安装pymupdf。")
  71. images = pdf_to_images(file, dpi=300)
  72. text = self._ocr_images(images, file, save_txt, merge_txt, output_img=output_img, is_pdf=True, pdf_progress_callback=pdf_progress_callback, max_workers=max_workers)
  73. else:
  74. # 普通图片识别,兼容中文路径
  75. try:
  76. if file.lower().endswith('.bmp'):
  77. img = cv2.imdecode(np.fromfile(file, dtype=np.uint8), cv2.IMREAD_COLOR)
  78. else:
  79. with open(file, 'rb') as fimg:
  80. img_array = np.frombuffer(fimg.read(), np.uint8)
  81. img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
  82. except Exception as e:
  83. self.status_callback(f"图片读取失败: {file},错误: {e}")
  84. if file_time_callback:
  85. file_time_callback(idx, 0)
  86. return (idx, "")
  87. if img is None:
  88. self.status_callback(f"文件无法读取或不是有效图片: {file}")
  89. if file_time_callback:
  90. file_time_callback(idx, 0)
  91. return (idx, "")
  92. text = self._ocr_image(img, file, save_txt, output_img=output_img)
  93. t1 = time.time()
  94. if file_time_callback:
  95. file_time_callback(idx, t1-t0)
  96. self.status_callback(f"{os.path.basename(file)} 识别用时: {t1-t0:.2f} 秒")
  97. if len(files) > 1:
  98. avg = (t1 - start_time) / (idx + 1)
  99. self.status_callback(f"已完成 {idx+1}/{len(files)},平均单张用时: {avg:.2f} 秒")
  100. return (idx, text)
  101. # 多线程处理所有文件,结果按索引回填,保证顺序
  102. with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
  103. futures = [executor.submit(process_one, (idx, file)) for idx, file in enumerate(files)]
  104. for future in concurrent.futures.as_completed(futures):
  105. idx, text = future.result()
  106. all_text[idx] = text
  107. # 合并写入txt
  108. if save_txt and merge_txt and len(files) > 1:
  109. out_dir = self._get_output_dir(files[0])
  110. timestamp = time.strftime("%Y%m%d_%H%M%S")
  111. out_txt = os.path.join(out_dir, f"merged_ocr_{timestamp}.txt")
  112. with open(out_txt, "w", encoding="utf-8") as f:
  113. for text in all_text:
  114. if text:
  115. f.write(text)
  116. f.write("\n\n")
  117. elapsed = time.time() - start_time
  118. if files:
  119. out_dir = self._get_output_dir(files[0])
  120. self.status_callback(f"识别完成,总耗时:{elapsed:.2f}秒,文件保存在:{out_dir}")
  121. else:
  122. self.status_callback(f"识别完成,总耗时:{elapsed:.2f}秒")
  123. def _ocr_images(self, images, pdf_path, save_txt, merge_txt, output_img=False, is_pdf=False, pdf_progress_callback=None, max_workers: int = 4):
  124. """
  125. PDF转图片后,批量图片识别,支持多线程加速
  126. images: PDF每页图片(numpy数组)
  127. pdf_path: 原PDF路径
  128. save_txt: 是否保存txt
  129. merge_txt: 是否合并txt(未用)
  130. output_img: 是否输出带框图片
  131. pdf_progress_callback: 页进度回调
  132. max_workers: 最大线程数,默认4
  133. """
  134. import concurrent.futures
  135. out_dir = self._get_output_dir(pdf_path)
  136. pdf_text = [None] * len(images)
  137. timestamp = time.strftime("%Y%m%d_%H%M%S")
  138. total = len(images)
  139. def process_page(i_img):
  140. i, img = i_img
  141. img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
  142. result = self.model.ocr(img_cv)
  143. if output_img:
  144. out_img_path = os.path.join(out_dir, f"{Path(pdf_path).stem}_page{i+1}_ocr.jpg")
  145. sav2Img(img_cv, result, name=out_img_path)
  146. page_text = self._result_to_text(result)
  147. return (i, page_text)
  148. # 多线程识别每一页,结果按页码顺序合并
  149. with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
  150. futures = [executor.submit(process_page, (i, img)) for i, img in enumerate(images)]
  151. for future in concurrent.futures.as_completed(futures):
  152. i, page_text = future.result()
  153. pdf_text[i] = page_text
  154. if pdf_progress_callback:
  155. pdf_progress_callback(i + 1, total)
  156. if save_txt:
  157. txt_path = os.path.join(out_dir, f"{Path(pdf_path).stem}_ocr_{timestamp}.txt")
  158. with open(txt_path, "w", encoding="utf-8") as f:
  159. f.write("\n\n".join(pdf_text))
  160. return "\n\n".join(pdf_text)
  161. def _ocr_image(self, img, img_path, save_txt, output_img=False):
  162. """
  163. 单张图片OCR识别,支持保存txt和输出带框图片
  164. """
  165. out_dir = self._get_output_dir(img_path)
  166. result = self.model.ocr(img)
  167. if output_img:
  168. out_img_path = os.path.join(out_dir, f"{Path(img_path).stem}_ocr.jpg")
  169. sav2Img(img, result, name=out_img_path)
  170. text = self._result_to_text(result)
  171. if save_txt:
  172. timestamp = time.strftime("%Y%m%d_%H%M%S")
  173. txt_path = os.path.join(out_dir, f"{Path(img_path).stem}_ocr_{timestamp}.txt")
  174. with open(txt_path, "w", encoding="utf-8") as f:
  175. f.write(text)
  176. return text
  177. def _result_to_text(self, result):
  178. """
  179. 将OCR识别结果结构化为纯文本,兼容只检测无识别内容的情况
  180. """
  181. # 健壮性检查,防止result为空或结构异常
  182. if not result or not isinstance(result, list) or not result[0] or not isinstance(result[0], list):
  183. return "[未检测到内容]"
  184. lines = []
  185. for box in result[0]:
  186. # 兼容只检测无识别内容的情况
  187. if isinstance(box, list) and len(box) == 2 and isinstance(box[1], (list, tuple)) and len(box[1]) >= 1:
  188. lines.append(str(box[1][0]))
  189. elif isinstance(box, list) and (isinstance(box[0], (list, tuple)) or isinstance(box[0], float)):
  190. # 只有检测框,无识别内容
  191. lines.append("[未识别] " + str(box))
  192. else:
  193. lines.append(str(box))
  194. return "\n".join(lines)
  195. def _get_output_dir(self, file_path):
  196. """
  197. 获取输出目录,自动创建
  198. """
  199. base_dir = os.path.dirname(file_path)
  200. out_dir = os.path.join(base_dir, "Output_OCR")
  201. os.makedirs(out_dir, exist_ok=True)
  202. return out_dir
  203. def set_model(self, model_name, use_gpu=False):
  204. """
  205. 切换OCR模型,支持多模型热切换,所有模型统一用ppocrv5字典
  206. use_gpu: 是否启用GPU
  207. """
  208. import os
  209. import tkinter.messagebox as messagebox
  210. base_model_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "onnxocr", "models"))
  211. model_map = {
  212. "PP-OCRv5": "ppocrv5",
  213. "PP-OCRv4": "ppocrv4",
  214. "ch_ppocr_server_v2.0": "ch_ppocr_server_v2.0"
  215. }
  216. model_dir = model_map.get(model_name, "ppocrv5")
  217. model_path = os.path.join(base_model_dir, model_dir)
  218. det_model_dir = os.path.join(model_path, "det", "det.onnx")
  219. cls_model_dir = os.path.join(model_path, "cls", "cls.onnx")
  220. rec_char_dict_path = os.path.join(base_model_dir, "ppocrv5", "ppocrv5_dict.txt")
  221. rec_model_dir = os.path.join(model_path, "rec", "rec.onnx") if os.path.exists(os.path.join(model_path, "rec", "rec.onnx")) else None
  222. ocr_kwargs = dict(
  223. use_angle_cls=True,
  224. use_gpu=use_gpu, # 关键:传递GPU参数
  225. det_model_dir=det_model_dir,
  226. cls_model_dir=cls_model_dir,
  227. rec_char_dict_path=rec_char_dict_path
  228. )
  229. if rec_model_dir and os.path.exists(rec_model_dir):
  230. ocr_kwargs["rec_model_dir"] = rec_model_dir
  231. try:
  232. self.model = ONNXPaddleOcr(**ocr_kwargs)
  233. if use_gpu:
  234. try:
  235. import onnxruntime as ort
  236. providers = self.model.session.get_providers() if hasattr(self.model, 'session') else []
  237. if not any('CUDA' in p for p in providers):
  238. msg = ("未检测到可用GPU,已自动切换为CPU推理。请检查CUDA/cuDNN环境配置。")
  239. if hasattr(self, 'ui_ref') and hasattr(self.ui_ref, 'update_gpu_status'):
  240. self.ui_ref.update_gpu_status(msg)
  241. if hasattr(self, 'status_callback'):
  242. self.status_callback("[警告] 未检测到可用GPU,已切换为CPU推理。请检查CUDA/cuDNN环境配置。")
  243. except Exception:
  244. msg = ("检测GPU状态时发生异常,可能未正确安装CUDA/cuDNN或onnxruntime-gpu。已自动切换为CPU推理。")
  245. if hasattr(self, 'ui_ref') and hasattr(self.ui_ref, 'update_gpu_status'):
  246. self.ui_ref.update_gpu_status(msg)
  247. if hasattr(self, 'status_callback'):
  248. self.status_callback("[警告] GPU检测异常,已切换为CPU推理。请检查CUDA/cuDNN环境配置。")
  249. except Exception as e:
  250. if use_gpu:
  251. msg = f"GPU初始化失败,已自动切换为CPU。请检查CUDA/cuDNN环境配置。错误信息: {e}"
  252. if hasattr(self, 'ui_ref') and hasattr(self.ui_ref, 'update_gpu_status'):
  253. self.ui_ref.update_gpu_status(msg)
  254. if hasattr(self, 'status_callback'):
  255. self.status_callback("[警告] GPU初始化失败,已切换为CPU推理。请检查CUDA/cuDNN环境配置。")
  256. ocr_kwargs["use_gpu"] = False
  257. self.model = ONNXPaddleOcr(**ocr_kwargs)
  258. else:
  259. raise