| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859 |
- from __future__ import annotations
- import re
- import time
- from pathlib import Path
- from urllib.error import HTTPError
- from urllib.request import urlopen
- __all__ = ["DOWNLOAD_DIR", "retrieve_file", "output_file", "urls_from_file"]
- NAME_REMOVE = ("http://", "https://", "github.com/", "/raw/")
- DOWNLOAD_DIR = Path(__file__).parent
- # ----------------------------------------------------------------------
- # Please update ./preload.py accordingly when modifying this file
- # ----------------------------------------------------------------------
def output_file(url: str, download_dir: Path = DOWNLOAD_DIR) -> Path:
    """Return the local path under ``download_dir`` where ``url`` is stored.

    The file name is derived from the URL: every fragment listed in
    ``NAME_REMOVE`` is dropped (in order), surrounding whitespace and
    ``/``/``:`` characters are trimmed after each removal, and finally any
    run of characters other than word characters, ``-``, ``_`` and ``.`` is
    collapsed into a single ``_``.
    """
    name = url.strip()
    for fragment in NAME_REMOVE:
        name = name.replace(fragment, '')
        # Trim again after every removal: dropping a fragment can expose new
        # leading/trailing separators.
        name = name.strip().strip('/:').strip()
    safe_name = re.sub(r"[^\-_\.\w\d]+", "_", name)
    return Path(download_dir, safe_name)
def retrieve_file(url: str, download_dir: Path = DOWNLOAD_DIR, wait: float = 5) -> Path:
    """Download ``url`` into ``download_dir`` unless it is already there.

    The destination is computed with ``output_file``. If that file already
    exists, nothing is fetched. Otherwise ``download_dir`` is created when
    missing and the download is attempted; on an ``HTTPError`` the function
    sleeps for ``wait`` seconds and retries exactly once (a second failure
    propagates to the caller).

    Returns the destination path in either case.
    """
    path = output_file(url, download_dir)
    if path.exists():
        print(f"Skipping {url} (already exists: {path})")
        return path

    download_dir.mkdir(exist_ok=True, parents=True)
    print(f"Downloading {url} to {path}")
    try:
        download(url, path)
    except HTTPError:
        # wait a few seconds and try again.
        time.sleep(wait)
        download(url, path)
    return path
def urls_from_file(list_file: Path) -> list[str]:
    """Return the URLs listed in ``list_file``.

    ``list_file`` should be a UTF-8 text file where each line corresponds to
    a URL to download. Surrounding whitespace is stripped from every line;
    blank lines and lines whose first non-blank character is ``#`` are
    ignored, so they are never handed to the downloader as URLs.
    """
    print(f"file: {list_file}")
    content = list_file.read_text(encoding="utf-8")
    stripped_lines = (line.strip() for line in content.splitlines())
    # An empty string is not a URL; skipping it here avoids a confusing
    # failure later when the entry is opened.
    return [url for url in stripped_lines if url and not url.startswith("#")]
def download(url: str, dest: Path):
    """Fetch ``url`` and store the response body at ``dest``.

    The body is read fully into memory, written to a sibling ``<name>.part``
    file and then renamed onto ``dest``. ``dest`` therefore only ever appears
    once the whole payload is on disk, so an interrupted write cannot leave a
    truncated file that would later be mistaken for a finished download.

    Errors raised by ``urlopen`` or by the file system propagate unchanged.
    """
    with urlopen(url) as response:
        data = response.read()

    target = Path(dest)
    partial = target.with_name(target.name + ".part")
    try:
        partial.write_bytes(data)
        # Rename within the same directory to swap the finished file in.
        partial.replace(target)
    finally:
        # Only still present when writing or renaming failed.
        if partial.exists():
            partial.unlink()
    assert target.exists()
|