__init__.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. from __future__ import annotations
  2. import re
  3. import time
  4. from pathlib import Path
  5. from urllib.error import HTTPError
  6. from urllib.request import urlopen
  7. __all__ = ["DOWNLOAD_DIR", "retrieve_file", "output_file", "urls_from_file"]
  8. NAME_REMOVE = ("http://", "https://", "github.com/", "/raw/")
  9. DOWNLOAD_DIR = Path(__file__).parent
  10. # ----------------------------------------------------------------------
  11. # Please update ./preload.py accordingly when modifying this file
  12. # ----------------------------------------------------------------------
  13. def output_file(url: str, download_dir: Path = DOWNLOAD_DIR) -> Path:
  14. file_name = url.strip()
  15. for part in NAME_REMOVE:
  16. file_name = file_name.replace(part, '').strip().strip('/:').strip()
  17. return Path(download_dir, re.sub(r"[^\-_\.\w\d]+", "_", file_name))
  18. def retrieve_file(url: str, download_dir: Path = DOWNLOAD_DIR, wait: float = 5) -> Path:
  19. path = output_file(url, download_dir)
  20. if path.exists():
  21. print(f"Skipping {url} (already exists: {path})")
  22. else:
  23. download_dir.mkdir(exist_ok=True, parents=True)
  24. print(f"Downloading {url} to {path}")
  25. try:
  26. download(url, path)
  27. except HTTPError:
  28. time.sleep(wait) # wait a few seconds and try again.
  29. download(url, path)
  30. return path
  31. def urls_from_file(list_file: Path) -> list[str]:
  32. """``list_file`` should be a text file where each line corresponds to a URL to
  33. download.
  34. """
  35. print(f"file: {list_file}")
  36. content = list_file.read_text(encoding="utf-8")
  37. return [url for url in content.splitlines() if not url.startswith("#")]
  38. def download(url: str, dest: Path):
  39. with urlopen(url) as f:
  40. data = f.read()
  41. with open(dest, "wb") as f:
  42. f.write(data)
  43. assert Path(dest).exists()